forked from danmax/r34-scraper
added chunked downloads, added progress bars, cleaned up javalsai's bad code
Co-authored-by: ErrorNoInternet <errornointernet@envs.net>
This commit is contained in:
139
src/main.rs
139
src/main.rs
@@ -8,8 +8,10 @@ use regex::Regex;
|
||||
use reqwest::Client;
|
||||
use tokio::time::{sleep, Duration};
|
||||
|
||||
use std::process::ExitCode;
|
||||
use std::io::Write;
|
||||
use std::process::ExitCode;
|
||||
|
||||
const BAR_LENGTH: u64 = 8;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> ExitCode {
|
||||
@@ -21,10 +23,16 @@ async fn main() -> ExitCode {
|
||||
tags_binding
|
||||
.split(' ')
|
||||
.filter(|item| !item.is_empty())
|
||||
.map(|item| item.to_owned())
|
||||
.map(std::borrow::ToOwned::to_owned)
|
||||
.collect()
|
||||
});
|
||||
let uri_tags: String = tags.into_iter().intersperse(String::from("+")).collect();
|
||||
let tags_folder = &tags.join("+");
|
||||
let uri_tags = tags
|
||||
.into_iter()
|
||||
.intersperse(String::from("+"))
|
||||
.collect::<String>();
|
||||
|
||||
let _ = std::fs::create_dir(tags_folder);
|
||||
|
||||
let client = Client::builder()
|
||||
.user_agent(&args.user_agent)
|
||||
@@ -55,74 +63,65 @@ async fn main() -> ExitCode {
|
||||
};
|
||||
|
||||
let mut urls = post_html(&client).await;
|
||||
|
||||
let mut wait_time = 5000;
|
||||
|
||||
if urls.is_empty() {
|
||||
for reconnection_attempts in 0..4 {
|
||||
println!("no urls found, retrying in {} seconds...", wait_time / 1000);
|
||||
sleep(Duration::from_millis(wait_time)).await;
|
||||
let mut reconnection_attempts = 0;
|
||||
loop {
|
||||
println!("no urls found, retrying in 5 seconds...");
|
||||
sleep(Duration::from_millis(5000)).await;
|
||||
|
||||
urls = post_html(&client).await;
|
||||
|
||||
if !urls.is_empty() {
|
||||
println!("urls found! continuing...");
|
||||
break;
|
||||
}
|
||||
|
||||
if reconnection_attempts == 3 {
|
||||
println!("no urls found in 4 attempts, exiting...");
|
||||
reconnection_attempts += 1;
|
||||
if reconnection_attempts == 12 {
|
||||
println!("no urls found in 1 minute, exiting...");
|
||||
return ExitCode::FAILURE;
|
||||
}
|
||||
|
||||
wait_time += 5000;
|
||||
}
|
||||
}
|
||||
|
||||
let multi_prog = indicatif::MultiProgress::new();
|
||||
let urls_ammount = urls.len();
|
||||
let urls_amount = urls.len();
|
||||
let responses = stream::iter(urls.into_iter().enumerate())
|
||||
.map(|(i, url)| {
|
||||
let i = i + 1;
|
||||
let client = &client;
|
||||
let this_bar = indicatif::ProgressBar::new_spinner();
|
||||
this_bar.enable_steady_tick(Duration::from_millis(50));
|
||||
let this_prog = multi_prog.insert(i, this_bar);
|
||||
let this_bar = indicatif::ProgressBar::new(BAR_LENGTH);
|
||||
this_bar.set_style(indicatif::ProgressStyle::with_template("[{bar}] {msg}").unwrap().progress_chars("=> "));
|
||||
let this_bar = multi_prog.insert(i, this_bar);
|
||||
async move {
|
||||
// "thread"
|
||||
loop {
|
||||
this_prog.set_message(format!("\x1b[30m[{i: >4}/{urls_ammount}] \x1b[36mscraping {url:?}\x1b[0m"));
|
||||
this_bar.set_message(format!("\x1b[37m[{i: >4}/{urls_amount}] \x1b[36mscraping {url:?}\x1b[0m"));
|
||||
let resp = client.get(&url).send().await.unwrap();
|
||||
match extract_img_url(&resp.text().await.unwrap()) {
|
||||
Ok(img_url) => {
|
||||
if img_url.is_empty() {
|
||||
this_prog.abandon_with_message(format!(
|
||||
"\x1b[30m[{i: >4}/{urls_ammount}] \x1b[1;31mimage url not found\x1b[0m"
|
||||
));
|
||||
} else {
|
||||
download_file(&img_url, this_prog, i, urls_ammount).await;
|
||||
|
||||
}
|
||||
break;
|
||||
}
|
||||
Err(_) => {
|
||||
this_prog
|
||||
.set_message(format!(
|
||||
"\x1b[30m[{i: >4}/{urls_ammount}] \x1b[31mratelimited, retrying after {}ms\x1b[0m",
|
||||
args.delay.as_millis())
|
||||
);
|
||||
tokio::time::sleep(args.delay).await;
|
||||
continue;
|
||||
if let Ok(img_url) = extract_img_url(&resp.text().await.unwrap()) {
|
||||
if img_url.is_empty() {
|
||||
this_bar.abandon_with_message(format!(
|
||||
"\x1b[37m[{i: >4}/{urls_amount}] \x1b[1;31mimage url not found\x1b[0m"
|
||||
));
|
||||
} else {
|
||||
download_file(&img_url, this_bar, i, urls_amount, tags_folder).await;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
this_bar
|
||||
.set_message(format!(
|
||||
"\x1b[37m[{i: >4}/{urls_amount}] \x1b[31mratelimited, retrying after {}ms\x1b[0m",
|
||||
args.delay.as_millis())
|
||||
);
|
||||
tokio::time::sleep(args.delay).await;
|
||||
}
|
||||
}
|
||||
})
|
||||
.buffered(args.jobs);
|
||||
let _ = responses.for_each(|_| async {}).await;
|
||||
let _ = responses.for_each(|()| async {}).await;
|
||||
}
|
||||
|
||||
return ExitCode::SUCCESS;
|
||||
ExitCode::SUCCESS
|
||||
}
|
||||
|
||||
fn extract_urls(html: &str) -> Vec<String> {
|
||||
@@ -140,47 +139,53 @@ fn extract_img_url(html: &str) -> Result<String, &'static str> {
|
||||
.find(html)
|
||||
{
|
||||
Ok(img_url.as_str().to_string())
|
||||
} else if html.contains("503 Rate limiting") {
|
||||
Err("ratelimited")
|
||||
} else {
|
||||
if html.contains("503 Rate limiting") {
|
||||
Err("ratelimited")
|
||||
} else {
|
||||
Ok(String::new())
|
||||
}
|
||||
Ok(String::new())
|
||||
}
|
||||
}
|
||||
|
||||
async fn download_file(img_url: &str, this_prog: ProgressBar, i: usize, urls_ammount: usize) {
|
||||
async fn download_file(
|
||||
img_url: &str,
|
||||
this_bar: ProgressBar,
|
||||
i: usize,
|
||||
urls_amount: usize,
|
||||
tags_folder: &str,
|
||||
) {
|
||||
let args = args::Args::parse();
|
||||
|
||||
let file_name = Regex::new(r"[^/]+$")
|
||||
.unwrap()
|
||||
.find(img_url)
|
||||
.find(img_url)
|
||||
.map(|m| m.as_str())
|
||||
.unwrap();
|
||||
|
||||
let downl_img = Client::new()
|
||||
let file_path = tags_folder.to_owned() + "/" + file_name;
|
||||
|
||||
let mut file = if std::fs::File::open(&file_path).is_ok() {
|
||||
this_bar.finish_with_message(format!(
|
||||
"\x1b[37m[{i: >4}/{urls_amount}] \x1b[33m{file_name} exists, skipping...\x1b[0m"
|
||||
));
|
||||
return;
|
||||
} else {
|
||||
std::fs::File::create(file_path).unwrap()
|
||||
};
|
||||
|
||||
let mut res = Client::new()
|
||||
.get(img_url)
|
||||
.header("User-Agent", &args.user_agent)
|
||||
.send()
|
||||
.await
|
||||
.unwrap()
|
||||
.bytes()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
match std::fs::File::open(file_name) {
|
||||
Ok(_) => {
|
||||
this_prog.finish_with_message(format!(
|
||||
"\x1b[30m[{i: >4}/{urls_ammount}] \x1b[32mfile exists, skipping...\x1b[0m"
|
||||
));
|
||||
}
|
||||
Err(_) => {
|
||||
|
||||
this_prog.finish_with_message(format!(
|
||||
"\x1b[30m[{i: >4}/{urls_ammount}] \x1b[32mdownloaded image: {img_url}\x1b[0m"
|
||||
));
|
||||
let mut file = std::fs::File::create(file_name).unwrap();
|
||||
file.write_all(&downl_img).unwrap();
|
||||
}
|
||||
let file_length = res.content_length().unwrap();
|
||||
let mut written = 0;
|
||||
while let Some(img_chunk) = res.chunk().await.unwrap() {
|
||||
file.write_all(&img_chunk).unwrap();
|
||||
written += img_chunk.len();
|
||||
this_bar.set_position((written as f64 / file_length as f64 * BAR_LENGTH as f64) as u64);
|
||||
}
|
||||
this_bar.finish_with_message(format!(
|
||||
"\x1b[37m[{i: >4}/{urls_amount}] \x1b[32mdownloaded {img_url}\x1b[0m"
|
||||
));
|
||||
}
|
||||
|
Reference in New Issue
Block a user