// not-r34/src/main.rs
//
// (GitHub page header captured along with the source — "205 lines,
// 6.6 KiB, Rust" — commented out so the file compiles.)
#![feature(async_closure)]
pub mod args;
use clap::Parser;
use futures::{stream, StreamExt};
use indicatif::ProgressBar;
use regex::Regex;
use reqwest::Client;
use tokio::time::{sleep, Duration};
use std::io::Write;
use std::process::ExitCode;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
const BAR_LENGTH: u64 = 8;
#[tokio::main]
async fn main() -> ExitCode {
    let args = args::Args::parse();
    // The '+'-joined tag list is both the URL query fragment and the name
    // of the directory downloads are written into.
    let uri_tags = &args.tags.join("+");
    // Best-effort create; an AlreadyExists error is deliberately ignored.
    let _ = std::fs::create_dir(uri_tags);
    // Cooperative-shutdown flag: the Ctrl-C handler clears it, and every
    // loop below polls it so scraping and downloads stop promptly.
    let running = Arc::new(AtomicBool::new(true));
    let running_t = running.clone();
    ctrlc::set_handler(move || {
        running_t.store(false, Ordering::SeqCst);
    })
    .unwrap();
    let client = Client::builder()
        .user_agent(&args.user_agent)
        .build()
        .unwrap();
    // rule34 list pages are addressed by `pid` = 42 * zero-based page index.
    // NOTE(review): `args.page - 1` underflows if page can be 0 —
    // presumably clap enforces page >= 1; confirm in args.rs.
    for page in args.page - 1.. {
        if !running.load(Ordering::SeqCst) {
            return ExitCode::FAILURE;
        }
        println!("now scraping page {} (https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={})", page + 1, page * 42);
        // Fetches one list page and extracts the per-post view URLs.
        // A (nightly) async closure so it can be re-invoked on retry.
        let post_html = async |client: &Client| {
            extract_urls(
                &client
                    .get(format!(
                        "https://rule34.xxx/index.php?page=post&s=list&pid={}&tags={uri_tags}",
                        page * 42
                    ))
                    .send()
                    .await
                    .unwrap()
                    .text()
                    .await
                    .unwrap(),
            )
        };
        let mut urls = post_html(&client).await;
        if urls.is_empty() {
            // Either the last page was passed or the request came back
            // empty; retry every 5 s for up to a minute before giving up.
            let mut reconnection_attempts = 0;
            loop {
                println!("no urls found, retrying in 5 seconds...");
                sleep(Duration::from_millis(5000)).await;
                if !running.load(Ordering::SeqCst) {
                    return ExitCode::FAILURE;
                }
                urls = post_html(&client).await;
                if !urls.is_empty() {
                    println!("urls found! continuing...");
                    break;
                }
                reconnection_attempts += 1;
                if reconnection_attempts == 12 {
                    println!("no urls found in 1 minute, exiting...");
                    return ExitCode::FAILURE;
                }
            }
        }
        let multi_prog = indicatif::MultiProgress::new();
        let urls_amount = urls.len();
        // Scrape/download every post concurrently — at most `args.jobs` in
        // flight at once — each with its own bar in the MultiProgress stack.
        let responses = stream::iter(urls.into_iter().enumerate())
            .map(|(i, url)| {
                let i = i + 1; // 1-based index for the "[   i/total]" display
                let client = &client;
                let running_t = running.clone();
                let this_bar = indicatif::ProgressBar::new(BAR_LENGTH);
                this_bar.set_style(indicatif::ProgressStyle::with_template("[{bar}] {msg}").unwrap().progress_chars("=> "));
                let this_bar = multi_prog.insert(i, this_bar);
                async move {
                    // "thread"
                    loop {
                        if !running_t.load(Ordering::SeqCst) {
                            return;
                        }
                        this_bar.set_message(format!("\x1b[37m[{i: >4}/{urls_amount}] \x1b[36mscraping {url}\x1b[0m"));
                        let resp = client.get(&url).send().await.unwrap();
                        // Ok(url) => post page fetched (empty string means
                        // no image URL found on it); Err(..) => the site
                        // rate-limited us, so wait `args.delay` and retry.
                        if let Ok(img_url) = extract_img_url(&resp.text().await.unwrap()) {
                            if img_url.is_empty() {
                                this_bar.abandon_with_message(format!(
                                    "\x1b[37m[{i: >4}/{urls_amount}] \x1b[1;31mimage url not found\x1b[0m"
                                ));
                            } else {
                                download_file(running_t, &img_url, this_bar, i, urls_amount, uri_tags).await;
                            }
                            break;
                        }
                        this_bar
                            .set_message(format!(
                                "\x1b[37m[{i: >4}/{urls_amount}] \x1b[31mratelimited, retrying after {}ms\x1b[0m",
                                args.delay.as_millis())
                            );
                        tokio::time::sleep(args.delay).await;
                    }
                }
            })
            .buffered(args.jobs);
        // Drive the buffered stream to completion; the futures themselves
        // produce only side effects (downloads + progress output).
        let _ = responses.for_each(|()| async {}).await;
    }
    ExitCode::SUCCESS
}
fn extract_urls(html: &str) -> Vec<String> {
Regex::new(r"/index\.php\?page=post&s=view&id=\d+")
.unwrap()
.find_iter(html)
.map(|mat| format!("https://rule34.xxx{}", mat.as_str()))
.collect()
}
fn extract_img_url(html: &str) -> Result<String, &'static str> {
if let Some(img_url) =
Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
.unwrap()
.find(html)
{
Ok(img_url.as_str().to_string())
} else if html.contains("503 Rate limiting") {
Err("ratelimited")
} else {
Ok(String::new())
}
}
/// Streams `img_url` to disk as `uri_tags/<file name>`, updating `this_bar`
/// as chunks arrive; `i` and `urls_amount` are only used for the
/// "[   i/total]" display prefix.
///
/// Skips the download when the target file already exists, and deletes the
/// partial file if `running` is cleared (Ctrl-C) mid-transfer.
async fn download_file(
    running: Arc<AtomicBool>,
    img_url: &str,
    this_bar: ProgressBar,
    i: usize,
    urls_amount: usize,
    uri_tags: &str,
) {
    // Re-parsed only to recover the user agent; the signature is shared
    // with callers, so the parsed Args can't be threaded through without
    // breaking them.
    let args = args::Args::parse();
    // Everything after the last '/' is the file name. The URL always ends
    // in "<name>.<ext>" (guaranteed by extract_img_url's pattern), so this
    // split never yields an empty name.
    let file_name = img_url.rsplit('/').next().unwrap();
    let file_path = uri_tags.to_owned() + "/" + file_name;
    // Never re-download: an existing file is assumed to be complete.
    let mut file = if std::path::Path::new(&file_path).exists() {
        this_bar.finish_with_message(format!(
            "\x1b[37m[{i: >4}/{urls_amount}] \x1b[33m{file_name} exists, skipping...\x1b[0m"
        ));
        return;
    } else {
        std::fs::File::create(&file_path).unwrap()
    };
    let mut res = Client::new()
        .get(img_url)
        .header("User-Agent", &args.user_agent)
        .send()
        .await
        .unwrap();
    // Some responses omit Content-Length; fall back to 0 instead of
    // panicking, and simply leave the bar at 0 in that case.
    let file_length = res.content_length().unwrap_or(0);
    let mut written = 0;
    while let Some(img_chunk) = res.chunk().await.unwrap() {
        if !running.load(Ordering::SeqCst) {
            // Ctrl-C mid-transfer: close the handle, then remove the
            // partial file so a later run re-downloads it from scratch.
            this_bar.abandon_with_message(format!(
                "\x1b[37m[{i: >4}/{urls_amount}] \x1b[33mcancelling {img_url}\x1b[0m"
            ));
            drop(file);
            std::fs::remove_file(&file_path).unwrap();
            return;
        }
        file.write_all(&img_chunk).unwrap();
        written += img_chunk.len();
        // Scale bytes written to bar ticks; skip when the total length is
        // unknown (avoids a 0/0 division).
        if file_length > 0 {
            this_bar.set_position((written as f64 / file_length as f64 * BAR_LENGTH as f64) as u64);
        }
    }
    this_bar.finish_with_message(format!(
        "\x1b[37m[{i: >4}/{urls_amount}] \x1b[32mdownloaded {img_url}\x1b[0m"
    ));
}