// forked from danmax/r34-scraper
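// NOTE: `async_closure` and `iter_intersperse` are unstable features, so this
// crate only builds on a nightly toolchain.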
#![feature(async_closure, iter_intersperse)]

pub mod args;

use clap::Parser;
use futures::{stream, StreamExt};
use regex::Regex;
use reqwest::Client;
use tokio::time::{sleep, Duration};

use std::process::ExitCode;

#[tokio::main]
async fn main() -> ExitCode {
    let args = args::Args::parse();

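    // fall back to asking on stdin when no tags were passed on the command line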
    let tags = args.tags.unwrap_or_else(|| {
        println!("which tags do you want to scrape? ex: 1girls 1boys yomama");
        let tags_binding = std::io::stdin().lines().next().unwrap().unwrap();
        tags_binding
            .split(' ')
            .filter(|item| !item.is_empty())
            .map(|item| item.to_owned())
            .collect()
    });
    let uri_tags: String = tags.into_iter().intersperse(String::from("+")).collect();

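    // one shared HTTP client, reused for every request, configured with the
    // user agent supplied on the command line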
    let client = Client::builder()
        .user_agent(&args.user_agent)
        .build()
        .unwrap();

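    // rule34.xxx paginates by post offset (`pid`), 42 posts per listing page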
    for page in 0.. {
        println!("now scraping page {}", page + 1);
        println!(
            "https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}",
            page * 42
        );

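        // nightly async closure: fetch the current listing page and pull the
        // post URLs out of the returned HTML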
        let post_html = async |client: &Client| {
            extract_urls(
                &client
                    .get(format!(
                        "https://rule34.xxx/index.php?page=post&s=list&pid={}&tags={uri_tags}",
                        page * 42
                    ))
                    .send()
                    .await
                    .unwrap()
                    .text()
                    .await
                    .unwrap(),
            )
        };

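        // fetch the page once; `wait_time` is the retry delay in milliseconds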
        let mut urls = post_html(&client).await;

        let mut wait_time = 5000;

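        // an empty result may mean the request was rejected, so retry up to
        // four times with a delay that grows by 5 seconds per attempt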
        if urls.is_empty() {
            for reconnection_attempts in 0..4 {
                println!("no urls found, retrying in {} seconds...", wait_time / 1000);
                sleep(Duration::from_millis(wait_time)).await;

                urls = post_html(&client).await;

                if !urls.is_empty() {
                    println!("urls found! continuing...");
                    break;
                }

                if reconnection_attempts == 3 {
                    println!("no urls found in 4 attempts, exiting...");
                    return ExitCode::FAILURE;
                }

                wait_time += 5000;
            }
        }

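        // scrape every post URL concurrently, one progress spinner per task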
        let multi_prog = indicatif::MultiProgress::new();
        let urls_amount = urls.len();
        let responses = stream::iter(urls.into_iter().enumerate())
            .map(|(i, url)| {
                let i = i + 1;
                let client = &client;
                let this_bar = indicatif::ProgressBar::new_spinner();
                this_bar.enable_steady_tick(Duration::from_millis(50));
                let this_prog = multi_prog.insert(i, this_bar);
                async move {
                    // one concurrent scrape task per post URL
                    loop {
                        this_prog.set_message(format!(
                            "\x1b[30m[{i}/{urls_amount}] \x1b[36mscraping {url:?}\x1b[0m"
                        ));
                        let resp = client.get(&url).send().await.unwrap();
                        match extract_img_url(&resp.text().await.unwrap()) {
                            Ok(img_url) => {
                                if img_url.is_empty() {
                                    this_prog.abandon_with_message(format!(
                                        "\x1b[30m[{i}/{urls_amount}] \x1b[1;31mimage url not found\x1b[0m"
                                    ));
                                } else {
                                    this_prog.finish_with_message(format!(
                                        "\x1b[30m[{i}/{urls_amount}] \x1b[32mfound image url: {img_url}\x1b[0m"
                                    ));
                                }
                                break img_url;
                            }
                            Err(_) => {
                                this_prog.set_message(format!(
                                    "\x1b[30m[{i}/{urls_amount}] \x1b[31mratelimited, retrying after 1 second\x1b[0m"
                                ));
                                sleep(Duration::from_millis(1000)).await;
                                continue;
                            }
                        }
                    }
                }
            })
            .buffered(args.jobs);
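        // drain the stream; `buffered(args.jobs)` caps how many scrapes run at once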
        responses.for_each(|_| async {}).await;
    }

    ExitCode::SUCCESS
}

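/// Collects every post-view link on a listing page and expands each match
/// into an absolute `https://rule34.xxx` URL.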
fn extract_urls(html: &str) -> Vec<String> {
    Regex::new(r"/index\.php\?page=post&s=view&id=\d+")
        .unwrap()
        .find_iter(html)
        .map(|mat| format!("https://rule34.xxx{}", mat.as_str()))
        .collect()
}
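/// Pulls the direct image URL out of a post page. `Err("ratelimited")` means
/// the server answered with its 503 rate-limit page; `Ok(String::new())`
/// means the page simply contained no matching image URL.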
fn extract_img_url(html: &str) -> Result<String, &'static str> {
    if let Some(img_url) =
        Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
            .unwrap()
            .find(html)
    {
        Ok(img_url.as_str().to_string())
    } else if html.contains("503 Rate limiting") {
        Err("ratelimited")
    } else {
        Ok(String::new())
    }
}