From 6dd45a17fac8f14a2c6f60288c4e9ad69d1396cb Mon Sep 17 00:00:00 2001
From: danmax
Date: Mon, 14 Oct 2024 22:09:05 -0400
Subject: [PATCH] done with basics

---
 Cargo.toml  |  9 +++++++
 src/main.rs | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 Cargo.toml
 create mode 100644 src/main.rs

diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..bea413b
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,9 @@
+[package]
+name = "r34-scraper"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+regex = "1.11.0"
+reqwest = { version = "0.12.8", features = ["blocking"] }
+tokio = { version = "1", features = ["full"] }
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,68 @@
+use std::sync::OnceLock;
+
+use regex::Regex;
+use reqwest::Client;
+
+/// Browser-like User-Agent header value sent with every request.
+const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
+
+/// Listing page that is scanned for links to individual posts.
+const LIST_URL: &str = "https://rule34.xxx/index.php?page=post&s=list&tags=1girls&pid=0";
+
+/// Downloads the listing page, visits every post it links to and prints the
+/// image URL found on each post page, or a notice when none is found.
+///
+/// # Errors
+///
+/// Returns the first error raised while building the client, sending a request
+/// or reading a response body.
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // One client for the whole run, so connections are reused across requests.
+    let client = Client::builder().user_agent(USER_AGENT).build()?;
+
+    let body = fetch_text(&client, LIST_URL).await?;
+
+    for url in extract_urls(&body) {
+        println!("found post: {}", url);
+
+        let post = fetch_text(&client, &url).await?;
+
+        match extract_img_url(&post) {
+            Some(img_url) => println!("found image url: {}", img_url),
+            None => println!("image url not found"),
+        }
+    }
+
+    Ok(())
+}
+
+/// Sends a GET request to `url` and returns the response body as text.
+async fn fetch_text(client: &Client, url: &str) -> Result<String, reqwest::Error> {
+    client.get(url).send().await?.text().await
+}
+
+/// Returns the absolute URL of every post link found in `html`, in match order.
+fn extract_urls(html: &str) -> Vec<String> {
+    // Compiled on first use only; the pattern is a constant, so it cannot fail.
+    static POST_LINK: OnceLock<Regex> = OnceLock::new();
+    let re = POST_LINK.get_or_init(|| {
+        Regex::new(r"/index\.php\?page=post&s=view&id=\d+").expect("post link pattern is valid")
+    });
+
+    re.find_iter(html)
+        .map(|mat| format!("https://rule34.xxx{}", mat.as_str()))
+        .collect()
+}
+
+/// Returns the first image URL found in `html`, or `None` when there is none.
+fn extract_img_url(html: &str) -> Option<String> {
+    // Compiled on first use only; the pattern is a constant, so it cannot fail.
+    static IMG_URL: OnceLock<Regex> = OnceLock::new();
+    let re = IMG_URL.get_or_init(|| {
+        Regex::new(r"https://us\.rule34\.xxx//([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z]+\?[0-9]+")
+            .expect("image URL pattern is valid")
+    });
+
+    re.find(html).map(|img_url| img_url.as_str().to_string())
+}