#![feature(async_closure)] pub mod args; use clap::Parser; use futures::{stream, StreamExt}; use indicatif::ProgressBar; use regex::Regex; use reqwest::Client; use tokio::time::{sleep, Duration}; use std::io::Write; use std::process::ExitCode; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; const BAR_LENGTH: u64 = 8; #[tokio::main] async fn main() -> ExitCode { let args = args::Args::parse(); let uri_tags = &args.tags.join("+"); let _ = std::fs::create_dir(uri_tags); let running = Arc::new(AtomicBool::new(true)); let running_t = running.clone(); ctrlc::set_handler(move || { running_t.store(false, Ordering::SeqCst); }) .unwrap(); let client = Client::builder() .user_agent(&args.user_agent) .build() .unwrap(); for page in args.page - 1.. { if !running.load(Ordering::SeqCst) { return ExitCode::FAILURE; } println!("now scraping page {} (https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={})", page + 1, page * 42); let post_html = async |client: &Client| { extract_urls( &client .get(format!( "https://rule34.xxx/index.php?page=post&s=list&pid={}&tags={uri_tags}", page * 42 )) .send() .await .unwrap() .text() .await .unwrap(), ) }; let mut urls = post_html(&client).await; if urls.is_empty() { let mut reconnection_attempts = 0; loop { println!("no urls found, retrying in 5 seconds..."); sleep(Duration::from_millis(5000)).await; if !running.load(Ordering::SeqCst) { return ExitCode::FAILURE; } urls = post_html(&client).await; if !urls.is_empty() { println!("urls found! continuing..."); break; } reconnection_attempts += 1; if reconnection_attempts == 12 { println!("no urls found in 1 minute, exiting..."); return ExitCode::FAILURE; } } } let multi_prog = indicatif::MultiProgress::new(); let urls_amount = urls.len(); let responses = stream::iter(urls.into_iter().enumerate()) .map(|(i, url)| { let i = i + 1; let client = &client; let running_t = running.clone(); let this_bar = indicatif::ProgressBar::new(BAR_LENGTH); this_bar.set_style(indicatif::ProgressStyle::with_template("[{bar}] {msg}").unwrap().progress_chars("=> ")); let this_bar = multi_prog.insert(i, this_bar); async move { // "thread" loop { if !running_t.load(Ordering::SeqCst) { return; } this_bar.set_message(format!("\x1b[37m[{i: >4}/{urls_amount}] \x1b[36mscraping {url}\x1b[0m")); let resp = client.get(&url).send().await.unwrap(); if let Ok(img_url) = extract_img_url(&resp.text().await.unwrap()) { if img_url.is_empty() { this_bar.abandon_with_message(format!( "\x1b[37m[{i: >4}/{urls_amount}] \x1b[1;31mimage url not found\x1b[0m" )); } else { download_file(running_t, &img_url, this_bar, i, urls_amount, uri_tags).await; } break; } this_bar .set_message(format!( "\x1b[37m[{i: >4}/{urls_amount}] \x1b[31mratelimited, retrying after {}ms\x1b[0m", args.delay.as_millis()) ); tokio::time::sleep(args.delay).await; } } }) .buffered(args.jobs); let _ = responses.for_each(|()| async {}).await; } ExitCode::SUCCESS } fn extract_urls(html: &str) -> Vec { Regex::new(r"/index\.php\?page=post&s=view&id=\d+") .unwrap() .find_iter(html) .map(|mat| format!("https://rule34.xxx{}", mat.as_str())) .collect() } fn extract_img_url(html: &str) -> Result { if let Some(img_url) = Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+") .unwrap() .find(html) { Ok(img_url.as_str().to_string()) } else if html.contains("503 Rate limiting") { Err("ratelimited") } else { Ok(String::new()) } } async fn download_file( running: Arc, img_url: &str, this_bar: ProgressBar, i: usize, urls_amount: usize, uri_tags: &str, ) { let args = args::Args::parse(); let file_name = Regex::new(r"[^/]+$") .unwrap() .find(img_url) .map(|m| m.as_str()) .unwrap(); let file_path = uri_tags.to_owned() + "/" + file_name; let mut file = if std::fs::File::open(&file_path).is_ok() { this_bar.finish_with_message(format!( "\x1b[37m[{i: >4}/{urls_amount}] \x1b[33m{file_name} exists, skipping...\x1b[0m" )); return; } else { std::fs::File::create(&file_path).unwrap() }; let mut res = Client::new() .get(img_url) .header("User-Agent", &args.user_agent) .send() .await .unwrap(); let file_length = res.content_length().unwrap(); let mut written = 0; while let Some(img_chunk) = res.chunk().await.unwrap() { if !running.load(Ordering::SeqCst) { this_bar.abandon_with_message(format!( "\x1b[37m[{i: >4}/{urls_amount}] \x1b[33mcancelling {img_url}\x1b[0m" )); drop(file); std::fs::remove_file(&file_path).unwrap(); return; } file.write_all(&img_chunk).unwrap(); written += img_chunk.len(); this_bar.set_position((written as f64 / file_length as f64 * BAR_LENGTH as f64) as u64); } this_bar.finish_with_message(format!( "\x1b[37m[{i: >4}/{urls_amount}] \x1b[32mdownloaded {img_url}\x1b[0m" )); }