use regex::Regex;
use reqwest::Client;
use std::process::ExitCode;
use tokio::time::{sleep, Duration};

// Browser-like user agent so the site does not reject the requests outright.
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";

#[tokio::main]
async fn main() -> ExitCode {
    println!("which tags do you want to scrape? ex: 1girls+1boys+yomama");
    let tags = std::io::stdin()
        .lines()
        .next()
        .unwrap()
        .unwrap()
        .trim()
        .to_string();

    let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
    let mut page = 0;

    loop {
        println!("now scraping page {}", page + 1);

        // Fetch the listing page and pull the post URLs out of it.
        // `async ||` is stable since Rust 1.85; on older nightlies this
        // needed `#![feature(async_closure)]`.
        let post_html = async || {
            extract_urls(
                &client
                    .get(format!(
                        "https://rule34.xxx/index.php?page=post&s=list&tags={tags}&pid={}",
                        page * 42 // the site paginates 42 posts per page
                    ))
                    .send()
                    .await
                    .unwrap()
                    .text()
                    .await
                    .unwrap(),
            )
        };

        let mut urls = post_html().await;

        // Retry with a linearly growing delay before giving up.
        let mut wait_time = 5000;
        if urls.is_empty() {
            for reconnection_attempts in 0..4 {
                println!("no urls found, retrying in {} seconds...", wait_time / 1000);
                sleep(Duration::from_millis(wait_time)).await;
                urls = post_html().await;
                if !urls.is_empty() {
                    println!("urls found! continuing...");
                    break;
                }
                if reconnection_attempts == 3 {
                    println!("no urls found in 4 attempts, exiting...");
                    return ExitCode::FAILURE;
                }
                wait_time += 5000;
            }
        }

        // Visit each post page and extract the direct image URL.
        for url in urls {
            let img_url =
                extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap());
            if img_url.is_empty() {
                println!("image url not found");
            } else {
                println!("found image url: {img_url}");
            }
        }

        page += 1;
    }
}

/// Collects the post-view links from a listing page and turns them into
/// absolute URLs.
fn extract_urls(html: &str) -> Vec<String> {
    Regex::new(r"/index\.php\?page=post&s=view&id=\d+")
        .unwrap()
        .find_iter(html)
        .map(|mat| format!("https://rule34.xxx{}", mat.as_str()))
        .collect()
}

/// Returns the first direct image URL found in a post page, or an empty
/// string if none matched.
fn extract_img_url(html: &str) -> String {
    if let Some(img_url) =
        Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
            .unwrap()
            .find(html)
    {
        img_url.as_str().to_string()
    } else {
        String::new()
    }
}
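
// Build note: the original file does not pin its dependencies, so the
// manifest below is a plausible sketch, not the project's actual
// `Cargo.toml`; the version numbers are assumptions:
//
// [dependencies]
// regex = "1"
// reqwest = "0.12"
// tokio = { version = "1", features = ["macros", "rt-multi-thread", "time"] }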