#![feature(async_closure, iter_intersperse)]

pub mod args;

use clap::Parser;
use futures::{stream, StreamExt};
use regex::Regex;
use reqwest::Client;
use std::process::ExitCode;
use tokio::sync::Mutex;
use tokio::time::{sleep, Duration};

#[tokio::main]
async fn main() -> ExitCode {
    let args = args::Args::parse();

    // Use the tags passed on the command line, or prompt for them interactively.
    let tags = args.tags.unwrap_or_else(|| {
        println!("which tags do you want to scrape? ex: 1girls 1boys yomama");
        let tags_binding = std::io::stdin().lines().next().unwrap().unwrap();
        tags_binding
            .split(' ')
            .filter(|item| !item.is_empty())
            .map(|item| item.to_owned())
            .collect()
    });

    // Join the tags with '+' for use in the query string.
    let uri_tags: String = tags.into_iter().intersperse(String::from("+")).collect();

    let client = Client::builder()
        .user_agent(&args.user_agent)
        .build()
        .unwrap();

    for page in 0.. {
        println!("now scraping page {}", page + 1);
        println!(
            "https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}",
            page * 42
        );

        // Fetch one listing page (42 posts per page) and collect the post URLs on it.
        let post_html = async |client: &Client| {
            extract_urls(
                &client
                    .get(format!(
                        "https://rule34.xxx/index.php?page=post&s=list&pid={}&tags={uri_tags}",
                        page * 42
                    ))
                    .send()
                    .await
                    .unwrap()
                    .text()
                    .await
                    .unwrap(),
            )
        };

        let mut urls = post_html(&client).await;

        // An empty listing page usually means we were ratelimited (or ran out of
        // results), so retry a few times with an increasing delay before giving up.
        let mut wait_time = 5000;
        if urls.is_empty() {
            for reconnection_attempts in 0..4 {
                println!("no urls found, retrying in {} seconds...", wait_time / 1000);
                sleep(Duration::from_millis(wait_time)).await;
                urls = post_html(&client).await;
                if !urls.is_empty() {
                    println!("urls found! continuing...");
                    break;
                }
                if reconnection_attempts == 3 {
                    println!("no urls found in 4 attempts, exiting...");
                    return ExitCode::FAILURE;
                }
                wait_time += 5000;
            }
        }

        // While a task is backing off after a 503, it holds this lock; every other
        // task briefly takes it before sending a request, so they all wait too.
        let ratelimit_lock = &Mutex::new(());

        let responses = stream::iter(urls.into_iter().enumerate())
            .map(|(i, url)| {
                let client = &client;
                async move {
                    // Label log lines with a per-"thread" id derived from the post index.
                    let thread_id = format!("[{: >4}]", i % 9999);
                    println!("{thread_id} scraping {url:?}");
                    loop {
                        // Wait here if another task is currently backing off.
                        let lock = ratelimit_lock.lock().await;
                        drop(lock);
                        let resp = client.get(&url).send().await.unwrap();
                        match extract_img_url(&resp.text().await.unwrap()) {
                            Ok(img_url) => {
                                if img_url.is_empty() {
                                    println!("{thread_id} image url not found");
                                } else {
                                    println!("{thread_id} found image url: {img_url}");
                                }
                                break img_url;
                            }
                            Err(_) => {
                                // Hold the lock for the whole backoff so no other
                                // task hits the server while we are ratelimited.
                                let lock = ratelimit_lock.lock().await;
                                println!("{thread_id} ratelimited, retrying after 1 second");
                                sleep(Duration::from_millis(1000)).await;
                                drop(lock);
                                continue;
                            }
                        }
                    };
                }
            })
            .buffered(args.jobs);

        // Drive all the post-scraping futures to completion, args.jobs at a time.
        responses.for_each(|_| async {}).await;
    }

    ExitCode::SUCCESS
}

/// Extract the post-view URLs from a listing page.
fn extract_urls(html: &str) -> Vec<String> {
    Regex::new(r"/index\.php\?page=post&s=view&id=\d+")
        .unwrap()
        .find_iter(html)
        .map(|mat| format!("https://rule34.xxx{}", mat.as_str()))
        .collect()
}

/// Extract the image URL from a post page, distinguishing "no image found"
/// (`Ok` with an empty string) from being ratelimited (`Err`).
fn extract_img_url(html: &str) -> Result<String, &'static str> {
    if let Some(img_url) =
        Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
            .unwrap()
            .find(html)
    {
        Ok(img_url.as_str().to_string())
    } else if html.contains("503 Rate limiting") {
        Err("ratelimited")
    } else {
        Ok(String::new())
    }
}