From 7b73c942b85635f4ed051f212059aa2806b64459 Mon Sep 17 00:00:00 2001 From: danmax Date: Mon, 14 Oct 2024 23:59:06 -0400 Subject: [PATCH] made scraper scrape all the pages --- src/main.rs | 50 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/src/main.rs b/src/main.rs index 5c19c3c..3797704 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,42 +3,56 @@ use reqwest::Client; #[tokio::main] async fn main() -> Result<(), Box> { + let mut r34pid = -42; - let body = Client::new() - .get("https://rule34.xxx/index.php?page=post&s=list&tags=1girls&pid=0") + loop { + r34pid += 42; + + let r34_url = format!( + "https://rule34.xxx/index.php?page=post&s=list&tags=1girls&pid={}", + r34pid + ); + + let body = Client::new() + .get(r34_url) .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36").send().await?.text().await?; - let urls = extract_urls(&body); - for url in urls { - println!("found post: {}", url); + println!("-------------------------------"); + println!(" now scraping page {}", r34pid / 42 + 1); + println!("-------------------------------"); - let post = Client::new() + let urls = extract_urls(&body); + for url in urls { + println!("found post: {}", url); + + let post = Client::new() .get(url) .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36").send().await?.text().await?; - - let img_url = extract_img_url(&post); - match !img_url.is_empty() { - true => println!("found image url: {}", img_url), - false => println!("image url not found"), + + let img_url = extract_img_url(&post); + match !img_url.is_empty() { + true => println!("found image url: {}", img_url), + false => println!("image url not found"), + } } } - - Ok(()) } - fn extract_urls(html: &str) -> Vec { let re = Regex::new(r"/index\.php\?page=post&s=view&id=\d+").unwrap(); - - let urls: Vec = re.find_iter(html) + + let urls: Vec = re + .find_iter(html) .map(|mat| format!("https://rule34.xxx{}", mat.as_str().to_string())) .collect(); - + urls } fn extract_img_url(html: &str) -> String { - let re = Regex::new(r"https://us\.rule34\.xxx//([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z]+\?[0-9]+").unwrap(); + let re = + Regex::new(r"https://us\.rule34\.xxx//([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z]+\?[0-9]+") + .unwrap(); match re.find(html) { Some(img_url) => img_url.as_str().to_string(), None => String::new(),