not-r34/src/main.rs

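//! Scrapes rule34.xxx listing pages for a set of tags and resolves each post
//! to its full-resolution image URL, with retries when the site ratelimits.
//!
//! Builds on nightly: the `async_closure` and `iter_intersperse` features are
//! enabled below.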
#![feature(async_closure, iter_intersperse)]
pub mod args;

use clap::Parser;
use futures::{stream, StreamExt};
use regex::Regex;
use reqwest::Client;
use std::process::ExitCode;
use tokio::time::{sleep, Duration};
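
// `args::Args` is defined in src/args.rs (not shown here). Judging from how it
// is used below, the clap derive struct looks roughly like this sketch (the
// field names are real, the attributes and defaults are guesses):
//
//     #[derive(clap::Parser)]
//     pub struct Args {
//         pub tags: Option<Vec<String>>,   // positional: tags to scrape
//         pub user_agent: String,          // User-Agent header for requests
//         pub delay: std::time::Duration,  // wait between ratelimited retries
//         pub jobs: usize,                 // max concurrent requests
//     }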

#[tokio::main]
async fn main() -> ExitCode {
    let args = args::Args::parse();
    let tags = args.tags.unwrap_or_else(|| {
        println!("which tags do you want to scrape? ex: 1girls 1boys yomama");
        let tags_binding = std::io::stdin().lines().next().unwrap().unwrap();
        tags_binding
            .split(' ')
            .filter(|item| !item.is_empty())
            .map(|item| item.to_owned())
            .collect()
    });
    let uri_tags: String = tags.into_iter().intersperse(String::from("+")).collect();
    let client = Client::builder()
        .user_agent(&args.user_agent)
        .build()
        .unwrap();
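    // rule34 paginates listings with `pid`, an offset counted in posts: each
    // page holds 42 results, so page N starts at pid = N * 42.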
    for page in 0.. {
        println!("now scraping page {}", page + 1);
        println!(
            "https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}",
            page * 42
        );
        let post_html = async |client: &Client| {
            extract_urls(
                &client
                    .get(format!(
                        "https://rule34.xxx/index.php?page=post&s=list&pid={}&tags={uri_tags}",
                        page * 42
                    ))
                    .send()
                    .await
                    .unwrap()
                    .text()
                    .await
                    .unwrap(),
            )
        };
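        // an empty listing page either means we were throttled or the tags ran
        // out of results; retry up to four times with a linearly growing wait
        // (5s, 10s, 15s, 20s) before giving up on the run.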
        let mut urls = post_html(&client).await;
        let mut wait_time = 5000;
        if urls.is_empty() {
            for reconnection_attempts in 0..4 {
                println!("no urls found, retrying in {} seconds...", wait_time / 1000);
                sleep(Duration::from_millis(wait_time)).await;
                urls = post_html(&client).await;
                if !urls.is_empty() {
                    println!("urls found! continuing...");
                    break;
                }
                if reconnection_attempts == 3 {
                    println!("no urls found in 4 attempts, exiting...");
                    return ExitCode::FAILURE;
                }
                wait_time += 5000;
            }
        }
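        // resolve every post URL into an image URL, keeping `args.jobs`
        // requests in flight at once, each with its own spinner line.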
        let multi_prog = indicatif::MultiProgress::new();
        let urls_amount = urls.len();
        let responses = stream::iter(urls.into_iter().enumerate())
            .map(|(i, url)| {
                let i = i + 1;
                let client = &client;
                let this_bar = indicatif::ProgressBar::new_spinner();
                this_bar.enable_steady_tick(Duration::from_millis(50));
                let this_prog = multi_prog.insert(i, this_bar);
                async move {
                    // each task keeps re-requesting its post until the
                    // response is no longer ratelimited
                    loop {
                        this_prog.set_message(format!(
                            "\x1b[30m[{i: >4}/{urls_amount}] \x1b[36mscraping {url:?}\x1b[0m"
                        ));
                        let resp = client.get(&url).send().await.unwrap();
                        match extract_img_url(&resp.text().await.unwrap()) {
                            Ok(img_url) => {
                                if img_url.is_empty() {
                                    this_prog.abandon_with_message(format!(
                                        "\x1b[30m[{i: >4}/{urls_amount}] \x1b[1;31mimage url not found\x1b[0m"
                                    ));
                                } else {
                                    this_prog.finish_with_message(format!(
                                        "\x1b[30m[{i: >4}/{urls_amount}] \x1b[32mfound image url: {img_url}\x1b[0m"
                                    ));
                                }
                                break img_url;
                            }
                            Err(_) => {
                                this_prog.set_message(format!(
                                    "\x1b[30m[{i: >4}/{urls_amount}] \x1b[31mratelimited, retrying after {}ms\x1b[0m",
                                    args.delay.as_millis()
                                ));
                                tokio::time::sleep(args.delay).await;
                                continue;
                            }
                        }
                    }
                }
            })
            .buffered(args.jobs);
        // drain the stream; the progress bars above carry all the output
        responses.for_each(|_| async {}).await;
    }
    ExitCode::SUCCESS
}
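
/// Collects every post-view link (`/index.php?page=post&s=view&id=<id>`) from
/// a listing page and expands it into an absolute rule34.xxx URL.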
fn extract_urls(html: &str) -> Vec<String> {
    Regex::new(r"/index\.php\?page=post&s=view&id=\d+")
        .unwrap()
        .find_iter(html)
        .map(|mat| format!("https://rule34.xxx{}", mat.as_str()))
        .collect()
}
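
/// Extracts the full-resolution image URL from a post page.
///
/// Returns `Ok(url)` when an image link is found, `Ok("")` when the page has
/// no matching link, and `Err("ratelimited")` when the page is a
/// "503 Rate limiting" response.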
fn extract_img_url(html: &str) -> Result<String, &'static str> {
    if let Some(img_url) =
        Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
            .unwrap()
            .find(html)
    {
        Ok(img_url.as_str().to_string())
    } else if html.contains("503 Rate limiting") {
        Err("ratelimited")
    } else {
        Ok(String::new())
    }
}