diff --git a/src/main.rs b/src/main.rs index 26fc107..65d01d5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,68 +1,78 @@ +use std::process::ExitCode; + use regex::Regex; use reqwest::Client; +const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"; + #[tokio::main] -async fn main() -> Result<(), Box> { - let mut r34_tags = String::new(); +async fn main() -> ExitCode { println!("which tags do you want to scrape? ex: 1girls+1boys+yomama"); - std::io::stdin().read_line(&mut r34_tags).unwrap(); - r34_tags.trim().to_string(); + let tags = std::io::stdin() + .lines() + .next() + .unwrap() + .unwrap() + .trim() + .to_string(); + + let client = Client::builder().user_agent(USER_AGENT).build().unwrap(); + let mut page = 0; - let mut r34pid = -42; loop { - r34pid += 42; + println!("now scraping page {page}"); - let r34_url = format!( - "https://rule34.xxx/index.php?page=post&s=list&tags={}&pid={}", r34_tags, r34pid); - - let body = Client::new() - .get(r34_url) - .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36").send().await?.text().await?; - - let urls = extract_urls(&body); - - if !urls.is_empty() { - } else { + let urls = extract_urls( + &client + .get(format!( + "https://rule34.xxx/index.php?page=post&s=list&tags={tags}&pid={}", + page * 42 + )) + .send() + .await + .unwrap() + .text() + .await + .unwrap(), + ); + if urls.is_empty() { println!("no urls found, exiting..."); - std::process::exit(1); + return ExitCode::FAILURE; } - println!("-------------------------------"); - println!(" now scraping page {}", r34pid / 42 + 1); - println!("-------------------------------"); - - for url in urls { - println!("found post: {}", url); + println!("found post: {url}"); - let post = Client::new() - .get(url) - .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36").send().await?.text().await?; - - let img_url = extract_img_url(&post); - match !img_url.is_empty() { - true => println!("found image url: {}", img_url), - false => println!("image url not found"), + let img_url = + extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap()); + if img_url.is_empty() { + println!("image url not found"); + } else { + println!("found image url: {img_url}"); } } + + page += 1; } } fn extract_urls(html: &str) -> Vec { - let re = Regex::new(r"/index\.php\?page=post&s=view&id=\d+").unwrap(); - - let urls: Vec = re + Regex::new(r"/index\.php\?page=post&s=view&id=\d+") + .unwrap() .find_iter(html) - .map(|mat| format!("https://rule34.xxx{}", mat.as_str().to_string())) - .collect(); - - urls + .map(|mat| format!("https://rule34.xxx{}", mat.as_str())) + .collect() } fn extract_img_url(html: &str) -> String { - let re = Regex::new(r"https://us\.rule34\.xxx//([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+\?[0-9]+").unwrap(); - match re.find(html) { - Some(img_url) => img_url.as_str().to_string(), - None => String::new(), + if let Some(img_url) = + Regex::new(r"https://us\.rule34\.xxx//([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+\?[0-9]+") + .unwrap() + .find(html) + { + img_url.as_str().to_string() + } else { + String::new() } } +