forked from danmax/r34-scraper
		
	merge master & misc feats
This commit is contained in:
		
							
								
								
									
										299
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										299
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -75,6 +75,119 @@ dependencies = [ | ||||
|  "windows-sys 0.52.0", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "async-channel" | ||||
| version = "1.9.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35" | ||||
| dependencies = [ | ||||
|  "concurrent-queue", | ||||
|  "event-listener 2.5.3", | ||||
|  "futures-core", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "async-channel" | ||||
| version = "2.3.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "89b47800b0be77592da0afd425cc03468052844aff33b84e33cc696f64e77b6a" | ||||
| dependencies = [ | ||||
|  "concurrent-queue", | ||||
|  "event-listener-strategy", | ||||
|  "futures-core", | ||||
|  "pin-project-lite", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "async-executor" | ||||
| version = "1.13.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "30ca9a001c1e8ba5149f91a74362376cc6bc5b919d92d988668657bd570bdcec" | ||||
| dependencies = [ | ||||
|  "async-task", | ||||
|  "concurrent-queue", | ||||
|  "fastrand", | ||||
|  "futures-lite", | ||||
|  "slab", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "async-global-executor" | ||||
| version = "2.4.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "05b1b633a2115cd122d73b955eadd9916c18c8f510ec9cd1686404c60ad1c29c" | ||||
| dependencies = [ | ||||
|  "async-channel 2.3.1", | ||||
|  "async-executor", | ||||
|  "async-io", | ||||
|  "async-lock", | ||||
|  "blocking", | ||||
|  "futures-lite", | ||||
|  "once_cell", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "async-io" | ||||
| version = "2.3.4" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "444b0228950ee6501b3568d3c93bf1176a1fdbc3b758dcd9475046d30f4dc7e8" | ||||
| dependencies = [ | ||||
|  "async-lock", | ||||
|  "cfg-if", | ||||
|  "concurrent-queue", | ||||
|  "futures-io", | ||||
|  "futures-lite", | ||||
|  "parking", | ||||
|  "polling", | ||||
|  "rustix", | ||||
|  "slab", | ||||
|  "tracing", | ||||
|  "windows-sys 0.59.0", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "async-lock" | ||||
| version = "3.4.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "ff6e472cdea888a4bd64f342f09b3f50e1886d32afe8df3d663c01140b811b18" | ||||
| dependencies = [ | ||||
|  "event-listener 5.3.1", | ||||
|  "event-listener-strategy", | ||||
|  "pin-project-lite", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "async-std" | ||||
| version = "1.13.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "c634475f29802fde2b8f0b505b1bd00dfe4df7d4a000f0b36f7671197d5c3615" | ||||
| dependencies = [ | ||||
|  "async-channel 1.9.0", | ||||
|  "async-global-executor", | ||||
|  "async-io", | ||||
|  "async-lock", | ||||
|  "crossbeam-utils", | ||||
|  "futures-channel", | ||||
|  "futures-core", | ||||
|  "futures-io", | ||||
|  "futures-lite", | ||||
|  "gloo-timers", | ||||
|  "kv-log-macro", | ||||
|  "log", | ||||
|  "memchr", | ||||
|  "once_cell", | ||||
|  "pin-project-lite", | ||||
|  "pin-utils", | ||||
|  "slab", | ||||
|  "wasm-bindgen-futures", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "async-task" | ||||
| version = "4.7.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "8b75356056920673b02621b35afd0f7dda9306d03c79a30f5c56c44cf256e3de" | ||||
|  | ||||
| [[package]] | ||||
| name = "atomic-waker" | ||||
| version = "1.1.2" | ||||
| @@ -114,6 +227,19 @@ version = "2.6.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" | ||||
|  | ||||
| [[package]] | ||||
| name = "blocking" | ||||
| version = "1.6.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "703f41c54fc768e63e091340b424302bb1c29ef4aa0c7f10fe849dfb114d29ea" | ||||
| dependencies = [ | ||||
|  "async-channel 2.3.1", | ||||
|  "async-task", | ||||
|  "futures-io", | ||||
|  "futures-lite", | ||||
|  "piper", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "bumpalo" | ||||
| version = "3.16.0" | ||||
| @@ -187,6 +313,15 @@ version = "1.0.2" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "d3fd119d74b830634cea2a0f58bbd0d54540518a14397557951e79340abc28c0" | ||||
|  | ||||
| [[package]] | ||||
| name = "concurrent-queue" | ||||
| version = "2.5.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973" | ||||
| dependencies = [ | ||||
|  "crossbeam-utils", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "core-foundation" | ||||
| version = "0.9.4" | ||||
| @@ -203,6 +338,12 @@ version = "0.8.7" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" | ||||
|  | ||||
| [[package]] | ||||
| name = "crossbeam-utils" | ||||
| version = "0.8.20" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" | ||||
|  | ||||
| [[package]] | ||||
| name = "encoding_rs" | ||||
| version = "0.8.34" | ||||
| @@ -228,6 +369,33 @@ dependencies = [ | ||||
|  "windows-sys 0.52.0", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "event-listener" | ||||
| version = "2.5.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" | ||||
|  | ||||
| [[package]] | ||||
| name = "event-listener" | ||||
| version = "5.3.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "6032be9bd27023a771701cc49f9f053c751055f71efb2e0ae5c15809093675ba" | ||||
| dependencies = [ | ||||
|  "concurrent-queue", | ||||
|  "parking", | ||||
|  "pin-project-lite", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "event-listener-strategy" | ||||
| version = "0.5.2" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "0f214dc438f977e6d4e3500aaa277f5ad94ca83fbbd9b1a15713ce2344ccc5a1" | ||||
| dependencies = [ | ||||
|  "event-listener 5.3.1", | ||||
|  "pin-project-lite", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "fastrand" | ||||
| version = "2.1.1" | ||||
| @@ -264,6 +432,21 @@ dependencies = [ | ||||
|  "percent-encoding", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "futures" | ||||
| version = "0.3.31" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" | ||||
| dependencies = [ | ||||
|  "futures-channel", | ||||
|  "futures-core", | ||||
|  "futures-executor", | ||||
|  "futures-io", | ||||
|  "futures-sink", | ||||
|  "futures-task", | ||||
|  "futures-util", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "futures-channel" | ||||
| version = "0.3.31" | ||||
| @@ -280,12 +463,47 @@ version = "0.3.31" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" | ||||
|  | ||||
| [[package]] | ||||
| name = "futures-executor" | ||||
| version = "0.3.31" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" | ||||
| dependencies = [ | ||||
|  "futures-core", | ||||
|  "futures-task", | ||||
|  "futures-util", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "futures-io" | ||||
| version = "0.3.31" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" | ||||
|  | ||||
| [[package]] | ||||
| name = "futures-lite" | ||||
| version = "2.3.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "52527eb5074e35e9339c6b4e8d12600c7128b68fb25dcb9fa9dec18f7c25f3a5" | ||||
| dependencies = [ | ||||
|  "fastrand", | ||||
|  "futures-core", | ||||
|  "futures-io", | ||||
|  "parking", | ||||
|  "pin-project-lite", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "futures-macro" | ||||
| version = "0.3.31" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" | ||||
| dependencies = [ | ||||
|  "proc-macro2", | ||||
|  "quote", | ||||
|  "syn", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "futures-sink" | ||||
| version = "0.3.31" | ||||
| @@ -304,8 +522,10 @@ version = "0.3.31" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" | ||||
| dependencies = [ | ||||
|  "futures-channel", | ||||
|  "futures-core", | ||||
|  "futures-io", | ||||
|  "futures-macro", | ||||
|  "futures-sink", | ||||
|  "futures-task", | ||||
|  "memchr", | ||||
| @@ -331,6 +551,18 @@ version = "0.31.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" | ||||
|  | ||||
| [[package]] | ||||
| name = "gloo-timers" | ||||
| version = "0.3.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "bbb143cf96099802033e0d4f4963b19fd2e0b728bcf076cd9cf7f6634f092994" | ||||
| dependencies = [ | ||||
|  "futures-channel", | ||||
|  "futures-core", | ||||
|  "js-sys", | ||||
|  "wasm-bindgen", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "h2" | ||||
| version = "0.4.6" | ||||
| @@ -368,6 +600,12 @@ version = "0.3.9" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" | ||||
|  | ||||
| [[package]] | ||||
| name = "hermit-abi" | ||||
| version = "0.4.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc" | ||||
|  | ||||
| [[package]] | ||||
| name = "http" | ||||
| version = "1.1.0" | ||||
| @@ -527,6 +765,15 @@ dependencies = [ | ||||
|  "wasm-bindgen", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "kv-log-macro" | ||||
| version = "1.0.7" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f" | ||||
| dependencies = [ | ||||
|  "log", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "libc" | ||||
| version = "0.2.159" | ||||
| @@ -554,6 +801,9 @@ name = "log" | ||||
| version = "0.4.22" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" | ||||
| dependencies = [ | ||||
|  "value-bag", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "memchr" | ||||
| @@ -582,7 +832,7 @@ version = "1.0.2" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" | ||||
| dependencies = [ | ||||
|  "hermit-abi", | ||||
|  "hermit-abi 0.3.9", | ||||
|  "libc", | ||||
|  "wasi", | ||||
|  "windows-sys 0.52.0", | ||||
| @@ -664,6 +914,12 @@ dependencies = [ | ||||
|  "vcpkg", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "parking" | ||||
| version = "2.2.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" | ||||
|  | ||||
| [[package]] | ||||
| name = "parking_lot" | ||||
| version = "0.12.3" | ||||
| @@ -705,12 +961,38 @@ version = "0.1.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" | ||||
|  | ||||
| [[package]] | ||||
| name = "piper" | ||||
| version = "0.2.4" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "96c8c490f422ef9a4efd2cb5b42b76c8613d7e7dfc1caf667b8a3350a5acc066" | ||||
| dependencies = [ | ||||
|  "atomic-waker", | ||||
|  "fastrand", | ||||
|  "futures-io", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "pkg-config" | ||||
| version = "0.3.31" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" | ||||
|  | ||||
| [[package]] | ||||
| name = "polling" | ||||
| version = "3.7.3" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "cc2790cd301dec6cd3b7a025e4815cf825724a51c98dccfe6a3e55f05ffb6511" | ||||
| dependencies = [ | ||||
|  "cfg-if", | ||||
|  "concurrent-queue", | ||||
|  "hermit-abi 0.4.0", | ||||
|  "pin-project-lite", | ||||
|  "rustix", | ||||
|  "tracing", | ||||
|  "windows-sys 0.59.0", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "proc-macro2" | ||||
| version = "1.0.87" | ||||
| @@ -733,9 +1015,12 @@ dependencies = [ | ||||
| name = "r34-scraper" | ||||
| version = "0.1.0" | ||||
| dependencies = [ | ||||
|  "async-std", | ||||
|  "clap", | ||||
|  "futures", | ||||
|  "regex", | ||||
|  "reqwest", | ||||
|  "taap", | ||||
|  "tokio", | ||||
| ] | ||||
|  | ||||
| @@ -1081,6 +1366,12 @@ dependencies = [ | ||||
|  "libc", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "taap" | ||||
| version = "0.1.4" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "4ac904d7c1c1da5a57cf33092db7bd8ab2e4f75ff424f5686d0d71114901d253" | ||||
|  | ||||
| [[package]] | ||||
| name = "tempfile" | ||||
| version = "3.13.0" | ||||
| @@ -1247,6 +1538,12 @@ version = "0.2.2" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" | ||||
|  | ||||
| [[package]] | ||||
| name = "value-bag" | ||||
| version = "1.9.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "5a84c137d37ab0142f0f2ddfe332651fdbf252e7b7dbb4e67b6c1f1b2e925101" | ||||
|  | ||||
| [[package]] | ||||
| name = "vcpkg" | ||||
| version = "0.2.15" | ||||
|   | ||||
| @@ -4,7 +4,10 @@ version = "0.1.0" | ||||
| edition = "2021" | ||||
|  | ||||
| [dependencies] | ||||
| async-std = "1.13.0" | ||||
| clap = { version = "4.5.20", features = ["derive"] } | ||||
| futures = "0.3.31" | ||||
| regex = "1.11.0" | ||||
| reqwest = { version = "0.12.8", features = ["blocking"] } | ||||
| taap = "0.1.4" | ||||
| tokio = { version = "1", features = ["full"] } | ||||
|   | ||||
| @@ -5,3 +5,6 @@ a scraper that well scrapes r34 | ||||
| ## note  | ||||
|  | ||||
| this thing is not finished yet: it only gathers links, it doesn't download anything yet | ||||
|  | ||||
| ## example usage image | ||||
|  | ||||
|   | ||||
| @@ -10,7 +10,11 @@ pub struct Args { | ||||
|     )] | ||||
|     pub user_agent: String, | ||||
|  | ||||
|     // Tags to search for | ||||
|     /// Tags to search for | ||||
|     #[arg(short, long)] | ||||
|     pub tags: Option<Vec<String>>, | ||||
|  | ||||
|     /// Async jobs to use for fetching | ||||
|     #[arg(short, long, default_value = "4")] | ||||
|     pub jobs: usize | ||||
| } | ||||
|   | ||||
							
								
								
									
										64
									
								
								src/main.rs
									
									
									
									
									
								
							
							
						
						
									
										64
									
								
								src/main.rs
									
									
									
									
									
								
							| @@ -2,10 +2,13 @@ | ||||
| pub mod args; | ||||
|  | ||||
| use clap::Parser; | ||||
| use futures::{stream, StreamExt}; | ||||
| use regex::Regex; | ||||
| use reqwest::Client; | ||||
| use std::process::ExitCode; | ||||
| use tokio::time::{sleep, Duration}; | ||||
| use async_std::sync::Mutex; | ||||
|  | ||||
| use std::{process::ExitCode, sync::Arc}; | ||||
|  | ||||
| #[tokio::main] | ||||
| async fn main() -> ExitCode { | ||||
| @@ -23,7 +26,7 @@ async fn main() -> ExitCode { | ||||
|     let uri_tags: String = tags.into_iter().intersperse(String::from("+")).collect(); | ||||
|  | ||||
|     let client = Client::builder() | ||||
|         .user_agent(args.user_agent) | ||||
|         .user_agent(&args.user_agent) | ||||
|         .build() | ||||
|         .unwrap(); | ||||
|  | ||||
| @@ -34,11 +37,11 @@ async fn main() -> ExitCode { | ||||
|             page * 42 | ||||
|         ); | ||||
|  | ||||
|         let post_html = async || { | ||||
|         let post_html = async |client: &Client| { | ||||
|             extract_urls( | ||||
|                 &client | ||||
|                     .get(format!( | ||||
|                         "https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}", | ||||
|                         "https://rule34.xxx/index.php?page=post&s=list&pid={}&tags={uri_tags}", | ||||
|                         page * 42 | ||||
|                     )) | ||||
|                     .send() | ||||
| @@ -50,7 +53,7 @@ async fn main() -> ExitCode { | ||||
|             ) | ||||
|         }; | ||||
|  | ||||
|         let mut urls = post_html().await; | ||||
|         let mut urls = post_html(&client).await; | ||||
|  | ||||
|         let mut wait_time = 5000; | ||||
|  | ||||
| @@ -59,7 +62,7 @@ async fn main() -> ExitCode { | ||||
|                 println!("no urls found, retrying in {} seconds...", wait_time / 1000); | ||||
|                 sleep(Duration::from_millis(wait_time)).await; | ||||
|  | ||||
|                 urls = post_html().await; | ||||
|                 urls = post_html(&client).await; | ||||
|  | ||||
|                 if !urls.is_empty() { | ||||
|                     println!("urls found! continuing..."); | ||||
| @@ -75,15 +78,38 @@ async fn main() -> ExitCode { | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         for url in urls { | ||||
|             let img_url = | ||||
|                 extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap()); | ||||
|             if img_url.is_empty() { | ||||
|                 println!("image url not found"); | ||||
|             } else { | ||||
|                 println!("found image url: {img_url}"); | ||||
|         let ratelimit_lock = &Arc::new(Mutex::new(())); | ||||
|         let responses = stream::iter(urls.into_iter().enumerate()).map(|(i, url)| { | ||||
|             let client = &client; | ||||
|             async move { | ||||
|                 // "thread" | ||||
|                 let thread_id = format!("[{: >4}]", i % 9999); | ||||
|                 println!("{thread_id} scraping {url:?}"); | ||||
|                 loop { | ||||
|                     let lock = ratelimit_lock.lock().await; | ||||
|                     drop(lock); | ||||
|                     let resp = client.get(&url).send().await.unwrap(); | ||||
|                     match extract_img_url(&resp.text().await.unwrap()) { | ||||
|                         Ok(img_url) => { | ||||
|                             if img_url.is_empty() { | ||||
|                                 println!("{thread_id} image url not found"); | ||||
|                             } else { | ||||
|                                 println!("{thread_id} found image url: {img_url}"); | ||||
|                             } | ||||
|                             break img_url; | ||||
|                         } | ||||
|                         Err(_) => { | ||||
|                             let lock = ratelimit_lock.lock().await; | ||||
|                             println!("{thread_id} ratelimited, retrying after 1 second"); | ||||
|                             tokio::time::sleep(std::time::Duration::from_millis(1000)).await; | ||||
|                             drop(lock); | ||||
|                             continue; | ||||
|                         } | ||||
|                     } | ||||
|                 }; | ||||
|             } | ||||
|         } | ||||
|         }).buffered(args.jobs); | ||||
|         let _ = responses.for_each(|_| async {}).await; | ||||
|     } | ||||
|  | ||||
|     return ExitCode::SUCCESS; | ||||
| @@ -97,14 +123,18 @@ fn extract_urls(html: &str) -> Vec<String> { | ||||
|         .collect() | ||||
| } | ||||
|  | ||||
| fn extract_img_url(html: &str) -> String { | ||||
| fn extract_img_url(html: &str) -> Result<String, &'static str> { | ||||
|     if let Some(img_url) = | ||||
|         Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+") | ||||
|             .unwrap() | ||||
|             .find(html) | ||||
|     { | ||||
|         img_url.as_str().to_string() | ||||
|         Ok(img_url.as_str().to_string()) | ||||
|     } else { | ||||
|         String::new() | ||||
|         if html.contains("503 Rate limiting") { | ||||
|             Err("ratelimited") | ||||
|         } else { | ||||
|             Ok(String::new()) | ||||
|         } | ||||
|     } | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user