3 Commits

Author SHA1 Message Date
5ce292d1c2 chore: modularity & good code 2024-10-18 19:26:11 +02:00
e62d2cc186 feat: add arg parsing 2024-10-18 18:50:40 +02:00
4acaf0308c chore: define rust channel (+ fmt) 2024-10-18 18:43:18 +02:00
7 changed files with 168 additions and 21 deletions

120
Cargo.lock generated
View File

@@ -26,6 +26,55 @@ dependencies = [
"memchr",
]
[[package]]
name = "anstream"
version = "0.6.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526"
dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon",
"colorchoice",
"is_terminal_polyfill",
"utf8parse",
]
[[package]]
name = "anstyle"
version = "1.0.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1"
[[package]]
name = "anstyle-parse"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb47de1e80c2b463c735db5b217a0ddc39d612e7ac9e2e96a5aed1f57616c1cb"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
version = "1.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d36fc52c7f6c869915e99412912f22093507da8d9e942ceaf66fe4b7c14422a"
dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "anstyle-wincon"
version = "3.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8"
dependencies = [
"anstyle",
"windows-sys 0.52.0",
]
[[package]]
name = "atomic-waker"
version = "1.1.2"
@@ -92,6 +141,52 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "clap"
version = "4.5.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8"
dependencies = [
"clap_builder",
"clap_derive",
]
[[package]]
name = "clap_builder"
version = "4.5.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54"
dependencies = [
"anstream",
"anstyle",
"clap_lex",
"strsim",
]
[[package]]
name = "clap_derive"
version = "4.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "clap_lex"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
[[package]]
name = "colorchoice"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3fd119d74b830634cea2a0f58bbd0d54540518a14397557951e79340abc28c0"
[[package]]
name = "core-foundation"
version = "0.9.4"
@@ -261,6 +356,12 @@ version = "0.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb"
[[package]]
name = "heck"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "hermit-abi"
version = "0.3.9"
@@ -405,6 +506,12 @@ version = "2.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708"
[[package]]
name = "is_terminal_polyfill"
version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "itoa"
version = "1.0.11"
@@ -626,6 +733,7 @@ dependencies = [
name = "r34-scraper"
version = "0.1.0"
dependencies = [
"clap",
"regex",
"reqwest",
"tokio",
@@ -920,6 +1028,12 @@ version = "0.9.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "subtle"
version = "2.6.1"
@@ -1127,6 +1241,12 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "utf8parse"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "vcpkg"
version = "0.2.15"

View File

@@ -4,6 +4,7 @@ version = "0.1.0"
edition = "2021"
[dependencies]
clap = { version = "4.5.20", features = ["derive"] }
regex = "1.11.0"
reqwest = { version = "0.12.8", features = ["blocking"] }
tokio = { version = "1", features = ["full"] }

View File

@@ -5,6 +5,3 @@ a scraper that well scrapes r34
## note
this thing is still not complete: it only gathers links and doesn't download anything yet
## example usage image
![example image](./image.png)

BIN
image.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 79 KiB

2
rust-toolchain.toml Normal file
View File

@@ -0,0 +1,2 @@
[toolchain]
channel = "nightly"

16
src/args/mod.rs Normal file
View File

@@ -0,0 +1,16 @@
use clap::Parser;
// Command-line arguments, parsed by clap's derive API.
//
// NOTE: clap turns `///` doc comments on fields into the text shown by
// `--help`, so every field description below must be a doc comment; a plain
// `//` comment is invisible to clap and leaves the option undocumented.
// (This header is deliberately a plain comment so it does not become the
// command's `about` text.)
#[derive(Debug, Parser)]
#[command(version)]
pub struct Args {
    /// User Agent to use for requests
    #[arg(
        short,
        default_value = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    )]
    pub user_agent: String,

    /// Tags to search for
    #[arg(short, long)]
    pub tags: Option<Vec<String>>,
}

View File

@@ -1,33 +1,44 @@
#![feature(async_closure)]
#![feature(async_closure, iter_intersperse)]
pub mod args;
use clap::Parser;
use regex::Regex;
use reqwest::Client;
use std::process::ExitCode;
use tokio::time::{sleep, Duration};
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
#[tokio::main]
async fn main() -> ExitCode {
println!("which tags do you want to scrape? ex: 1girls+1boys+yomama");
let tags = std::io::stdin()
.lines()
.next()
.unwrap()
.unwrap()
.trim()
.to_string();
let args = args::Args::parse();
let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
let mut page = 0;
let tags = args.tags.unwrap_or_else(|| {
println!("which tags do you want to scrape? ex: 1girls 1boys yomama");
let tags_binding = std::io::stdin().lines().next().unwrap().unwrap();
tags_binding
.split(' ')
.filter(|item| !item.is_empty())
.map(|item| item.to_owned())
.collect()
});
let uri_tags: String = tags.into_iter().intersperse(String::from("+")).collect();
loop {
let client = Client::builder()
.user_agent(args.user_agent)
.build()
.unwrap();
for page in 0.. {
println!("now scraping page {}", page + 1);
println!(
"https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}",
page * 42
);
let post_html = async || {
extract_urls(
&client
.get(format!(
"https://rule34.xxx/index.php?page=post&s=list&tags={tags}&pid={}",
"https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}",
page * 42
))
.send()
@@ -73,9 +84,9 @@ async fn main() -> ExitCode {
println!("found image url: {img_url}");
}
}
page += 1;
}
return ExitCode::SUCCESS;
}
fn extract_urls(html: &str) -> Vec<String> {
@@ -88,7 +99,7 @@ fn extract_urls(html: &str) -> Vec<String> {
fn extract_img_url(html: &str) -> String {
if let Some(img_url) =
Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
Regex::new(r"https://us\.rule34\.xxx//([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
.unwrap()
.find(html)
{