// forked from danmax/r34-scraper
#![feature(async_closure)]

use std::process::ExitCode;
use std::sync::LazyLock;

use regex::Regex;
use reqwest::Client;
use taap::Argument;
use tokio::time::{sleep, Duration};
|
|
|
|
// Browser-style user agent sent with every HTTP request made by this program
// (presumably so the site serves its normal HTML pages — TODO confirm).
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
|
|
|
|
#[tokio::main]
|
|
async fn main() -> ExitCode {
|
|
// Taap setup
|
|
let mut arguments = Argument::new(
|
|
"r34-scrape",
|
|
"A scraper for r34.xxx",
|
|
"Users love this tool! Hear our reviews down below:\n\"It has never been easier to find what I love!\" - penguinlover\n\"This has made my life way easier!\" - FurryUser69\n\"Best tool I've ever used\" - Sean Combs\n",
|
|
"Danmax and authors 2024",
|
|
);
|
|
|
|
arguments.add_arg("TAGS", "+", Some("the tags you want to search for"));
|
|
let parsed_arguments = arguments.parse_args(None);
|
|
|
|
let tags = parsed_arguments.get("TAGS").unwrap();
|
|
|
|
// End of taap setup
|
|
// Check if empty and warn
|
|
// Can't use tags.0 because taap is not buggy at all :3
|
|
if tags.1.is_empty() {
|
|
println!("[warning] No tags were used, use --help for help")
|
|
}
|
|
|
|
let mut thread_counter = 0;
|
|
let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
|
|
let mut page = 0;
|
|
|
|
loop {
|
|
println!("now scraping page {}", page + 1);
|
|
|
|
let post_html = async || {
|
|
extract_urls(
|
|
&client
|
|
.get(format!(
|
|
"https://rule34.xxx/index.php?page=post&s=list{}&pid={}",
|
|
if tags.0 {
|
|
format!("&tags={}", tags.1.join("+"))
|
|
} else {
|
|
"".to_owned()
|
|
},
|
|
page * 42
|
|
))
|
|
.send()
|
|
.await
|
|
.unwrap()
|
|
.text()
|
|
.await
|
|
.unwrap(),
|
|
)
|
|
};
|
|
|
|
let mut urls = post_html().await;
|
|
|
|
let mut wait_time = 5000;
|
|
|
|
if urls.is_empty() {
|
|
for reconnection_attempts in 0..4 {
|
|
println!("no urls found, retrying in {} seconds...", wait_time / 1000);
|
|
sleep(Duration::from_millis(wait_time)).await;
|
|
|
|
urls = post_html().await;
|
|
|
|
if !urls.is_empty() {
|
|
println!("urls found! continuing...");
|
|
break;
|
|
}
|
|
|
|
if reconnection_attempts == 3 {
|
|
println!("no urls found in 4 attempts, exiting...");
|
|
return ExitCode::FAILURE;
|
|
}
|
|
|
|
wait_time += 5000;
|
|
}
|
|
}
|
|
|
|
for url in urls {
|
|
tokio::spawn(async move {
|
|
let thread_id = format!("[{thread_counter: >4}]");
|
|
loop {
|
|
let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
|
|
match extract_img_url(
|
|
&client
|
|
.get(url.clone())
|
|
.send()
|
|
.await
|
|
.unwrap()
|
|
.text()
|
|
.await
|
|
.unwrap(),
|
|
) {
|
|
Ok(img_url) => {
|
|
if img_url.is_empty() {
|
|
println!("{thread_id} image url not found");
|
|
} else {
|
|
println!("{thread_id} found image url: {img_url}");
|
|
}
|
|
}
|
|
Err(_) => {
|
|
println!("{thread_id} ratelimited, retrying after 1 second");
|
|
std::thread::sleep(std::time::Duration::from_millis(1000));
|
|
continue;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
});
|
|
thread_counter += 1;
|
|
if thread_counter > 9999 {
|
|
thread_counter = 0;
|
|
}
|
|
while tokio::runtime::Handle::current()
|
|
.metrics()
|
|
.num_alive_tasks()
|
|
> 4
|
|
{
|
|
std::thread::sleep(std::time::Duration::from_millis(100));
|
|
}
|
|
}
|
|
|
|
page += 1;
|
|
}
|
|
}
|
|
|
|
fn extract_urls(html: &str) -> Vec<String> {
|
|
Regex::new(r"/index\.php\?page=post&s=view&id=\d+")
|
|
.unwrap()
|
|
.find_iter(html)
|
|
.map(|mat| format!("https://rule34.xxx{}", mat.as_str()))
|
|
.collect()
|
|
}
|
|
|
|
fn extract_img_url(html: &str) -> Result<String, &'static str> {
|
|
if let Some(img_url) =
|
|
Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
|
|
.unwrap()
|
|
.find(html)
|
|
{
|
|
Ok(img_url.as_str().to_string())
|
|
} else {
|
|
if html.contains("503 Rate limiting") {
|
|
Err("ratelimited")
|
|
} else {
|
|
Ok(String::new())
|
|
}
|
|
}
|
|
}
|