forked from danmax/r34-scraper
merge master & misc feats
This commit is contained in:
@@ -10,7 +10,11 @@ pub struct Args {
|
||||
)]
|
||||
pub user_agent: String,
|
||||
|
||||
// Tags to search for
|
||||
/// Tags to search for
|
||||
#[arg(short, long)]
|
||||
pub tags: Option<Vec<String>>,
|
||||
|
||||
/// Async jobs to use for fetching
|
||||
#[arg(short, long, default_value = "4")]
|
||||
pub jobs: usize
|
||||
}
|
||||
|
64
src/main.rs
64
src/main.rs
@@ -2,10 +2,13 @@
|
||||
pub mod args;
|
||||
|
||||
use clap::Parser;
|
||||
use futures::{stream, StreamExt};
|
||||
use regex::Regex;
|
||||
use reqwest::Client;
|
||||
use std::process::ExitCode;
|
||||
use tokio::time::{sleep, Duration};
|
||||
use async_std::sync::Mutex;
|
||||
|
||||
use std::{process::ExitCode, sync::Arc};
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> ExitCode {
|
||||
@@ -23,7 +26,7 @@ async fn main() -> ExitCode {
|
||||
let uri_tags: String = tags.into_iter().intersperse(String::from("+")).collect();
|
||||
|
||||
let client = Client::builder()
|
||||
.user_agent(args.user_agent)
|
||||
.user_agent(&args.user_agent)
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
@@ -34,11 +37,11 @@ async fn main() -> ExitCode {
|
||||
page * 42
|
||||
);
|
||||
|
||||
let post_html = async || {
|
||||
let post_html = async |client: &Client| {
|
||||
extract_urls(
|
||||
&client
|
||||
.get(format!(
|
||||
"https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}",
|
||||
"https://rule34.xxx/index.php?page=post&s=list&pid={}&tags={uri_tags}",
|
||||
page * 42
|
||||
))
|
||||
.send()
|
||||
@@ -50,7 +53,7 @@ async fn main() -> ExitCode {
|
||||
)
|
||||
};
|
||||
|
||||
let mut urls = post_html().await;
|
||||
let mut urls = post_html(&client).await;
|
||||
|
||||
let mut wait_time = 5000;
|
||||
|
||||
@@ -59,7 +62,7 @@ async fn main() -> ExitCode {
|
||||
println!("no urls found, retrying in {} seconds...", wait_time / 1000);
|
||||
sleep(Duration::from_millis(wait_time)).await;
|
||||
|
||||
urls = post_html().await;
|
||||
urls = post_html(&client).await;
|
||||
|
||||
if !urls.is_empty() {
|
||||
println!("urls found! continuing...");
|
||||
@@ -75,15 +78,38 @@ async fn main() -> ExitCode {
|
||||
}
|
||||
}
|
||||
|
||||
for url in urls {
|
||||
let img_url =
|
||||
extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap());
|
||||
if img_url.is_empty() {
|
||||
println!("image url not found");
|
||||
} else {
|
||||
println!("found image url: {img_url}");
|
||||
let ratelimit_lock = &Arc::new(Mutex::new(()));
|
||||
let responses = stream::iter(urls.into_iter().enumerate()).map(|(i, url)| {
|
||||
let client = &client;
|
||||
async move {
|
||||
// "thread"
|
||||
let thread_id = format!("[{: >4}]", i % 9999);
|
||||
println!("{thread_id} scraping {url:?}");
|
||||
loop {
|
||||
let lock = ratelimit_lock.lock().await;
|
||||
drop(lock);
|
||||
let resp = client.get(&url).send().await.unwrap();
|
||||
match extract_img_url(&resp.text().await.unwrap()) {
|
||||
Ok(img_url) => {
|
||||
if img_url.is_empty() {
|
||||
println!("{thread_id} image url not found");
|
||||
} else {
|
||||
println!("{thread_id} found image url: {img_url}");
|
||||
}
|
||||
break img_url;
|
||||
}
|
||||
Err(_) => {
|
||||
let lock = ratelimit_lock.lock().await;
|
||||
println!("{thread_id} ratelimited, retrying after 1 second");
|
||||
tokio::time::sleep(std::time::Duration::from_millis(1000)).await;
|
||||
drop(lock);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}).buffered(args.jobs);
|
||||
let _ = responses.for_each(|_| async {}).await;
|
||||
}
|
||||
|
||||
return ExitCode::SUCCESS;
|
||||
@@ -97,14 +123,18 @@ fn extract_urls(html: &str) -> Vec<String> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn extract_img_url(html: &str) -> String {
|
||||
fn extract_img_url(html: &str) -> Result<String, &'static str> {
|
||||
if let Some(img_url) =
|
||||
Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
|
||||
.unwrap()
|
||||
.find(html)
|
||||
{
|
||||
img_url.as_str().to_string()
|
||||
Ok(img_url.as_str().to_string())
|
||||
} else {
|
||||
String::new()
|
||||
if html.contains("503 Rate limiting") {
|
||||
Err("ratelimited")
|
||||
} else {
|
||||
Ok(String::new())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user