merge master & misc feats

This commit is contained in:
2024-10-19 20:22:49 +02:00
6 changed files with 356 additions and 19 deletions

View File

@@ -10,7 +10,11 @@ pub struct Args {
)]
pub user_agent: String,
// Tags to search for
/// Tags to search for
#[arg(short, long)]
pub tags: Option<Vec<String>>,
/// Async jobs to use for fetching
#[arg(short, long, default_value = "4")]
pub jobs: usize
}

View File

@@ -2,10 +2,13 @@
pub mod args;
use clap::Parser;
use futures::{stream, StreamExt};
use regex::Regex;
use reqwest::Client;
use std::process::ExitCode;
use tokio::time::{sleep, Duration};
use async_std::sync::Mutex;
use std::{process::ExitCode, sync::Arc};
#[tokio::main]
async fn main() -> ExitCode {
@@ -23,7 +26,7 @@ async fn main() -> ExitCode {
let uri_tags: String = tags.into_iter().intersperse(String::from("+")).collect();
let client = Client::builder()
.user_agent(args.user_agent)
.user_agent(&args.user_agent)
.build()
.unwrap();
@@ -34,11 +37,11 @@ async fn main() -> ExitCode {
page * 42
);
let post_html = async || {
let post_html = async |client: &Client| {
extract_urls(
&client
.get(format!(
"https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}",
"https://rule34.xxx/index.php?page=post&s=list&pid={}&tags={uri_tags}",
page * 42
))
.send()
@@ -50,7 +53,7 @@ async fn main() -> ExitCode {
)
};
let mut urls = post_html().await;
let mut urls = post_html(&client).await;
let mut wait_time = 5000;
@@ -59,7 +62,7 @@ async fn main() -> ExitCode {
println!("no urls found, retrying in {} seconds...", wait_time / 1000);
sleep(Duration::from_millis(wait_time)).await;
urls = post_html().await;
urls = post_html(&client).await;
if !urls.is_empty() {
println!("urls found! continuing...");
@@ -75,15 +78,38 @@ async fn main() -> ExitCode {
}
}
for url in urls {
let img_url =
extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap());
if img_url.is_empty() {
println!("image url not found");
} else {
println!("found image url: {img_url}");
let ratelimit_lock = &Arc::new(Mutex::new(()));
let responses = stream::iter(urls.into_iter().enumerate()).map(|(i, url)| {
let client = &client;
async move {
// "thread"
let thread_id = format!("[{: >4}]", i % 9999);
println!("{thread_id} scraping {url:?}");
loop {
let lock = ratelimit_lock.lock().await;
drop(lock);
let resp = client.get(&url).send().await.unwrap();
match extract_img_url(&resp.text().await.unwrap()) {
Ok(img_url) => {
if img_url.is_empty() {
println!("{thread_id} image url not found");
} else {
println!("{thread_id} found image url: {img_url}");
}
break img_url;
}
Err(_) => {
let lock = ratelimit_lock.lock().await;
println!("{thread_id} ratelimited, retrying after 1 second");
tokio::time::sleep(std::time::Duration::from_millis(1000)).await;
drop(lock);
continue;
}
}
};
}
}
}).buffered(args.jobs);
let _ = responses.for_each(|_| async {}).await;
}
return ExitCode::SUCCESS;
@@ -97,14 +123,18 @@ fn extract_urls(html: &str) -> Vec<String> {
.collect()
}
fn extract_img_url(html: &str) -> String {
fn extract_img_url(html: &str) -> Result<String, &'static str> {
if let Some(img_url) =
Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
.unwrap()
.find(html)
{
img_url.as_str().to_string()
Ok(img_url.as_str().to_string())
} else {
String::new()
if html.contains("503 Rate limiting") {
Err("ratelimited")
} else {
Ok(String::new())
}
}
}