diff --git a/Cargo.lock b/Cargo.lock index e62339a..05bab33 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -75,6 +75,119 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "async-channel" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35" +dependencies = [ + "concurrent-queue", + "event-listener 2.5.3", + "futures-core", +] + +[[package]] +name = "async-channel" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89b47800b0be77592da0afd425cc03468052844aff33b84e33cc696f64e77b6a" +dependencies = [ + "concurrent-queue", + "event-listener-strategy", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-executor" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30ca9a001c1e8ba5149f91a74362376cc6bc5b919d92d988668657bd570bdcec" +dependencies = [ + "async-task", + "concurrent-queue", + "fastrand", + "futures-lite", + "slab", +] + +[[package]] +name = "async-global-executor" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05b1b633a2115cd122d73b955eadd9916c18c8f510ec9cd1686404c60ad1c29c" +dependencies = [ + "async-channel 2.3.1", + "async-executor", + "async-io", + "async-lock", + "blocking", + "futures-lite", + "once_cell", +] + +[[package]] +name = "async-io" +version = "2.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "444b0228950ee6501b3568d3c93bf1176a1fdbc3b758dcd9475046d30f4dc7e8" +dependencies = [ + "async-lock", + "cfg-if", + "concurrent-queue", + "futures-io", + "futures-lite", + "parking", + "polling", + "rustix", + "slab", + "tracing", + "windows-sys 0.59.0", +] + +[[package]] +name = "async-lock" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff6e472cdea888a4bd64f342f09b3f50e1886d32afe8df3d663c01140b811b18" +dependencies = [ + "event-listener 5.3.1", + "event-listener-strategy", + "pin-project-lite", +] + +[[package]] +name = "async-std" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c634475f29802fde2b8f0b505b1bd00dfe4df7d4a000f0b36f7671197d5c3615" +dependencies = [ + "async-channel 1.9.0", + "async-global-executor", + "async-io", + "async-lock", + "crossbeam-utils", + "futures-channel", + "futures-core", + "futures-io", + "futures-lite", + "gloo-timers", + "kv-log-macro", + "log", + "memchr", + "once_cell", + "pin-project-lite", + "pin-utils", + "slab", + "wasm-bindgen-futures", +] + +[[package]] +name = "async-task" +version = "4.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b75356056920673b02621b35afd0f7dda9306d03c79a30f5c56c44cf256e3de" + [[package]] name = "atomic-waker" version = "1.1.2" @@ -114,6 +227,19 @@ version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" +[[package]] +name = "blocking" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "703f41c54fc768e63e091340b424302bb1c29ef4aa0c7f10fe849dfb114d29ea" +dependencies = [ + "async-channel 2.3.1", + "async-task", + "futures-io", + "futures-lite", + "piper", +] + [[package]] name = "bumpalo" version = "3.16.0" @@ -187,6 +313,15 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3fd119d74b830634cea2a0f58bbd0d54540518a14397557951e79340abc28c0" +[[package]] +name = "concurrent-queue" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "core-foundation" version = "0.9.4" @@ -203,6 +338,12 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "crossbeam-utils" +version = "0.8.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" + [[package]] name = "encoding_rs" version = "0.8.34" @@ -228,6 +369,33 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + +[[package]] +name = "event-listener" +version = "5.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6032be9bd27023a771701cc49f9f053c751055f71efb2e0ae5c15809093675ba" +dependencies = [ + "concurrent-queue", + "parking", + "pin-project-lite", +] + +[[package]] +name = "event-listener-strategy" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f214dc438f977e6d4e3500aaa277f5ad94ca83fbbd9b1a15713ce2344ccc5a1" +dependencies = [ + "event-listener 5.3.1", + "pin-project-lite", +] + [[package]] name = "fastrand" version = "2.1.1" @@ -264,6 +432,21 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "futures" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + [[package]] name = "futures-channel" version = "0.3.31" @@ -280,12 +463,47 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" +[[package]] +name = "futures-executor" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + [[package]] name = "futures-io" version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" +[[package]] +name = "futures-lite" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52527eb5074e35e9339c6b4e8d12600c7128b68fb25dcb9fa9dec18f7c25f3a5" +dependencies = [ + "fastrand", + "futures-core", + "futures-io", + "parking", + "pin-project-lite", +] + +[[package]] +name = "futures-macro" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "futures-sink" version = "0.3.31" @@ -304,8 +522,10 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ + "futures-channel", "futures-core", "futures-io", + "futures-macro", "futures-sink", "futures-task", "memchr", @@ -331,6 +551,18 @@ version = "0.31.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" +[[package]] +name = "gloo-timers" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbb143cf96099802033e0d4f4963b19fd2e0b728bcf076cd9cf7f6634f092994" +dependencies = [ + "futures-channel", + "futures-core", + "js-sys", + "wasm-bindgen", +] + [[package]] name = "h2" version = "0.4.6" @@ -368,6 +600,12 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" +[[package]] +name = "hermit-abi" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc" + [[package]] name = "http" version = "1.1.0" @@ -527,6 +765,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "kv-log-macro" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f" +dependencies = [ + "log", +] + [[package]] name = "libc" version = "0.2.159" @@ -554,6 +801,9 @@ name = "log" version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" +dependencies = [ + "value-bag", +] [[package]] name = "memchr" @@ -582,7 +832,7 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" dependencies = [ - "hermit-abi", + "hermit-abi 0.3.9", "libc", "wasi", "windows-sys 0.52.0", @@ -664,6 +914,12 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "parking" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" + [[package]] name = "parking_lot" version = "0.12.3" @@ -705,12 +961,38 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "piper" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96c8c490f422ef9a4efd2cb5b42b76c8613d7e7dfc1caf667b8a3350a5acc066" +dependencies = [ + "atomic-waker", + "fastrand", + "futures-io", +] + [[package]] name = "pkg-config" version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" +[[package]] +name = "polling" +version = "3.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc2790cd301dec6cd3b7a025e4815cf825724a51c98dccfe6a3e55f05ffb6511" +dependencies = [ + "cfg-if", + "concurrent-queue", + "hermit-abi 0.4.0", + "pin-project-lite", + "rustix", + "tracing", + "windows-sys 0.59.0", +] + [[package]] name = "proc-macro2" version = "1.0.87" @@ -733,9 +1015,12 @@ dependencies = [ name = "r34-scraper" version = "0.1.0" dependencies = [ + "async-std", "clap", + "futures", "regex", "reqwest", + "taap", "tokio", ] @@ -1081,6 +1366,12 @@ dependencies = [ "libc", ] +[[package]] +name = "taap" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ac904d7c1c1da5a57cf33092db7bd8ab2e4f75ff424f5686d0d71114901d253" + [[package]] name = "tempfile" version = "3.13.0" @@ -1247,6 +1538,12 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "value-bag" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a84c137d37ab0142f0f2ddfe332651fdbf252e7b7dbb4e67b6c1f1b2e925101" + [[package]] name = "vcpkg" version = "0.2.15" diff --git a/Cargo.toml b/Cargo.toml index 13de98d..7b5e8d1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,10 @@ version = "0.1.0" edition = "2021" [dependencies] +async-std = "1.13.0" clap = { version = "4.5.20", features = ["derive"] } +futures = "0.3.31" regex = "1.11.0" reqwest = { version = "0.12.8", features = ["blocking"] } +taap = "0.1.4" tokio = { version = "1", features = ["full"] } diff --git a/README.md b/README.md index 07564f7..ebe935a 100644 --- a/README.md +++ b/README.md @@ -5,3 +5,6 @@ a scraper that well scrapes r34 ## note this thing is still not completed, it only gathers links, it doesnt download things yet + +## example usage image +![example image](./image.png) diff --git a/image.png b/image.png new file mode 100644 index 0000000..3bb0e7d Binary files /dev/null and b/image.png differ diff --git a/src/args/mod.rs b/src/args/mod.rs index f982d82..22c3479 100644 --- a/src/args/mod.rs +++ b/src/args/mod.rs @@ -10,7 +10,11 @@ pub struct Args { )] pub user_agent: String, - // Tags to search for + /// Tags to search for #[arg(short, long)] pub tags: Option>, + + /// Async jobs to use for fetching + #[arg(short, long, default_value = "4")] + pub jobs: usize } diff --git a/src/main.rs b/src/main.rs index ab25cba..9ea9b16 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,10 +2,13 @@ pub mod args; use clap::Parser; +use futures::{stream, StreamExt}; use regex::Regex; use reqwest::Client; -use std::process::ExitCode; use tokio::time::{sleep, Duration}; +use async_std::sync::Mutex; + +use std::{process::ExitCode, sync::Arc}; #[tokio::main] async fn main() -> ExitCode { @@ -23,7 +26,7 @@ async fn main() -> ExitCode { let uri_tags: String = tags.into_iter().intersperse(String::from("+")).collect(); let client = Client::builder() - .user_agent(args.user_agent) + .user_agent(&args.user_agent) .build() .unwrap(); @@ -34,11 +37,11 @@ async fn main() -> ExitCode { page * 42 ); - let post_html = async || { + let post_html = async |client: &Client| { extract_urls( &client .get(format!( - "https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}", + "https://rule34.xxx/index.php?page=post&s=list&pid={}&tags={uri_tags}", page * 42 )) .send() @@ -50,7 +53,7 @@ async fn main() -> ExitCode { ) }; - let mut urls = post_html().await; + let mut urls = post_html(&client).await; let mut wait_time = 5000; @@ -59,7 +62,7 @@ async fn main() -> ExitCode { println!("no urls found, retrying in {} seconds...", wait_time / 1000); sleep(Duration::from_millis(wait_time)).await; - urls = post_html().await; + urls = post_html(&client).await; if !urls.is_empty() { println!("urls found! continuing..."); @@ -75,15 +78,38 @@ async fn main() -> ExitCode { } } - for url in urls { - let img_url = - extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap()); - if img_url.is_empty() { - println!("image url not found"); - } else { - println!("found image url: {img_url}"); + let ratelimit_lock = &Arc::new(Mutex::new(())); + let responses = stream::iter(urls.into_iter().enumerate()).map(|(i, url)| { + let client = &client; + async move { + // "thread" + let thread_id = format!("[{: >4}]", i % 9999); + println!("{thread_id} scraping {url:?}"); + loop { + let lock = ratelimit_lock.lock().await; + drop(lock); + let resp = client.get(&url).send().await.unwrap(); + match extract_img_url(&resp.text().await.unwrap()) { + Ok(img_url) => { + if img_url.is_empty() { + println!("{thread_id} image url not found"); + } else { + println!("{thread_id} found image url: {img_url}"); + } + break img_url; + } + Err(_) => { + let lock = ratelimit_lock.lock().await; + println!("{thread_id} ratelimited, retrying after 1 second"); + tokio::time::sleep(std::time::Duration::from_millis(1000)).await; + drop(lock); + continue; + } + } + }; } - } + }).buffered(args.jobs); + let _ = responses.for_each(|_| async {}).await; } return ExitCode::SUCCESS; @@ -97,14 +123,18 @@ fn extract_urls(html: &str) -> Vec { .collect() } -fn extract_img_url(html: &str) -> String { +fn extract_img_url(html: &str) -> Result { if let Some(img_url) = Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+") .unwrap() .find(html) { - img_url.as_str().to_string() + Ok(img_url.as_str().to_string()) } else { - String::new() + if html.contains("503 Rate limiting") { + Err("ratelimited") + } else { + Ok(String::new()) + } } }