merge master & misc feats

This commit is contained in:
javalsai 2024-10-19 20:22:49 +02:00
commit 063acc97db
Signed by: javalsai
SSH Key Fingerprint: SHA256:3G83yKhBUWVABVX/vPWH88xnK4+ptMtHkZGCRXD4Mk8
6 changed files with 356 additions and 19 deletions

299
Cargo.lock generated
View File

@ -75,6 +75,119 @@ dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "async-channel"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35"
dependencies = [
"concurrent-queue",
"event-listener 2.5.3",
"futures-core",
]
[[package]]
name = "async-channel"
version = "2.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89b47800b0be77592da0afd425cc03468052844aff33b84e33cc696f64e77b6a"
dependencies = [
"concurrent-queue",
"event-listener-strategy",
"futures-core",
"pin-project-lite",
]
[[package]]
name = "async-executor"
version = "1.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30ca9a001c1e8ba5149f91a74362376cc6bc5b919d92d988668657bd570bdcec"
dependencies = [
"async-task",
"concurrent-queue",
"fastrand",
"futures-lite",
"slab",
]
[[package]]
name = "async-global-executor"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05b1b633a2115cd122d73b955eadd9916c18c8f510ec9cd1686404c60ad1c29c"
dependencies = [
"async-channel 2.3.1",
"async-executor",
"async-io",
"async-lock",
"blocking",
"futures-lite",
"once_cell",
]
[[package]]
name = "async-io"
version = "2.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "444b0228950ee6501b3568d3c93bf1176a1fdbc3b758dcd9475046d30f4dc7e8"
dependencies = [
"async-lock",
"cfg-if",
"concurrent-queue",
"futures-io",
"futures-lite",
"parking",
"polling",
"rustix",
"slab",
"tracing",
"windows-sys 0.59.0",
]
[[package]]
name = "async-lock"
version = "3.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff6e472cdea888a4bd64f342f09b3f50e1886d32afe8df3d663c01140b811b18"
dependencies = [
"event-listener 5.3.1",
"event-listener-strategy",
"pin-project-lite",
]
[[package]]
name = "async-std"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c634475f29802fde2b8f0b505b1bd00dfe4df7d4a000f0b36f7671197d5c3615"
dependencies = [
"async-channel 1.9.0",
"async-global-executor",
"async-io",
"async-lock",
"crossbeam-utils",
"futures-channel",
"futures-core",
"futures-io",
"futures-lite",
"gloo-timers",
"kv-log-macro",
"log",
"memchr",
"once_cell",
"pin-project-lite",
"pin-utils",
"slab",
"wasm-bindgen-futures",
]
[[package]]
name = "async-task"
version = "4.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b75356056920673b02621b35afd0f7dda9306d03c79a30f5c56c44cf256e3de"
[[package]]
name = "atomic-waker"
version = "1.1.2"
@ -114,6 +227,19 @@ version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
[[package]]
name = "blocking"
version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "703f41c54fc768e63e091340b424302bb1c29ef4aa0c7f10fe849dfb114d29ea"
dependencies = [
"async-channel 2.3.1",
"async-task",
"futures-io",
"futures-lite",
"piper",
]
[[package]]
name = "bumpalo"
version = "3.16.0"
@ -187,6 +313,15 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3fd119d74b830634cea2a0f58bbd0d54540518a14397557951e79340abc28c0"
[[package]]
name = "concurrent-queue"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973"
dependencies = [
"crossbeam-utils",
]
[[package]]
name = "core-foundation"
version = "0.9.4"
@ -203,6 +338,12 @@ version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
[[package]]
name = "crossbeam-utils"
version = "0.8.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
[[package]]
name = "encoding_rs"
version = "0.8.34"
@ -228,6 +369,33 @@ dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "event-listener"
version = "2.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0"
[[package]]
name = "event-listener"
version = "5.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6032be9bd27023a771701cc49f9f053c751055f71efb2e0ae5c15809093675ba"
dependencies = [
"concurrent-queue",
"parking",
"pin-project-lite",
]
[[package]]
name = "event-listener-strategy"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f214dc438f977e6d4e3500aaa277f5ad94ca83fbbd9b1a15713ce2344ccc5a1"
dependencies = [
"event-listener 5.3.1",
"pin-project-lite",
]
[[package]]
name = "fastrand"
version = "2.1.1"
@ -264,6 +432,21 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "futures"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
dependencies = [
"futures-channel",
"futures-core",
"futures-executor",
"futures-io",
"futures-sink",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-channel"
version = "0.3.31"
@ -280,12 +463,47 @@ version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
[[package]]
name = "futures-executor"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f"
dependencies = [
"futures-core",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-io"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
[[package]]
name = "futures-lite"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52527eb5074e35e9339c6b4e8d12600c7128b68fb25dcb9fa9dec18f7c25f3a5"
dependencies = [
"fastrand",
"futures-core",
"futures-io",
"parking",
"pin-project-lite",
]
[[package]]
name = "futures-macro"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "futures-sink"
version = "0.3.31"
@ -304,8 +522,10 @@ version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
dependencies = [
"futures-channel",
"futures-core",
"futures-io",
"futures-macro",
"futures-sink",
"futures-task",
"memchr",
@ -331,6 +551,18 @@ version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
[[package]]
name = "gloo-timers"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbb143cf96099802033e0d4f4963b19fd2e0b728bcf076cd9cf7f6634f092994"
dependencies = [
"futures-channel",
"futures-core",
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "h2"
version = "0.4.6"
@ -368,6 +600,12 @@ version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024"
[[package]]
name = "hermit-abi"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc"
[[package]]
name = "http"
version = "1.1.0"
@ -527,6 +765,15 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "kv-log-macro"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0de8b303297635ad57c9f5059fd9cee7a47f8e8daa09df0fcd07dd39fb22977f"
dependencies = [
"log",
]
[[package]]
name = "libc"
version = "0.2.159"
@ -554,6 +801,9 @@ name = "log"
version = "0.4.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
dependencies = [
"value-bag",
]
[[package]]
name = "memchr"
@ -582,7 +832,7 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec"
dependencies = [
"hermit-abi",
"hermit-abi 0.3.9",
"libc",
"wasi",
"windows-sys 0.52.0",
@ -664,6 +914,12 @@ dependencies = [
"vcpkg",
]
[[package]]
name = "parking"
version = "2.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba"
[[package]]
name = "parking_lot"
version = "0.12.3"
@ -705,12 +961,38 @@ version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "piper"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96c8c490f422ef9a4efd2cb5b42b76c8613d7e7dfc1caf667b8a3350a5acc066"
dependencies = [
"atomic-waker",
"fastrand",
"futures-io",
]
[[package]]
name = "pkg-config"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2"
[[package]]
name = "polling"
version = "3.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cc2790cd301dec6cd3b7a025e4815cf825724a51c98dccfe6a3e55f05ffb6511"
dependencies = [
"cfg-if",
"concurrent-queue",
"hermit-abi 0.4.0",
"pin-project-lite",
"rustix",
"tracing",
"windows-sys 0.59.0",
]
[[package]]
name = "proc-macro2"
version = "1.0.87"
@ -733,9 +1015,12 @@ dependencies = [
name = "r34-scraper"
version = "0.1.0"
dependencies = [
"async-std",
"clap",
"futures",
"regex",
"reqwest",
"taap",
"tokio",
]
@ -1081,6 +1366,12 @@ dependencies = [
"libc",
]
[[package]]
name = "taap"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ac904d7c1c1da5a57cf33092db7bd8ab2e4f75ff424f5686d0d71114901d253"
[[package]]
name = "tempfile"
version = "3.13.0"
@ -1247,6 +1538,12 @@ version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "value-bag"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a84c137d37ab0142f0f2ddfe332651fdbf252e7b7dbb4e67b6c1f1b2e925101"
[[package]]
name = "vcpkg"
version = "0.2.15"

View File

@ -4,7 +4,10 @@ version = "0.1.0"
edition = "2021"
[dependencies]
async-std = "1.13.0"
clap = { version = "4.5.20", features = ["derive"] }
futures = "0.3.31"
regex = "1.11.0"
reqwest = { version = "0.12.8", features = ["blocking"] }
taap = "0.1.4"
tokio = { version = "1", features = ["full"] }

View File

@ -5,3 +5,6 @@ a scraper that well scrapes r34
## note
this thing is still not completed, it only gathers links, it doesnt download things yet
## example usage image
![example image](./image.png)

BIN
image.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 79 KiB

View File

@ -10,7 +10,11 @@ pub struct Args {
)]
pub user_agent: String,
// Tags to search for
/// Tags to search for
#[arg(short, long)]
pub tags: Option<Vec<String>>,
/// Async jobs to use for fetching
#[arg(short, long, default_value = "4")]
pub jobs: usize
}

View File

@ -2,10 +2,13 @@
pub mod args;
use clap::Parser;
use futures::{stream, StreamExt};
use regex::Regex;
use reqwest::Client;
use std::process::ExitCode;
use tokio::time::{sleep, Duration};
use async_std::sync::Mutex;
use std::{process::ExitCode, sync::Arc};
#[tokio::main]
async fn main() -> ExitCode {
@ -23,7 +26,7 @@ async fn main() -> ExitCode {
let uri_tags: String = tags.into_iter().intersperse(String::from("+")).collect();
let client = Client::builder()
.user_agent(args.user_agent)
.user_agent(&args.user_agent)
.build()
.unwrap();
@ -34,11 +37,11 @@ async fn main() -> ExitCode {
page * 42
);
let post_html = async || {
let post_html = async |client: &Client| {
extract_urls(
&client
.get(format!(
"https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}",
"https://rule34.xxx/index.php?page=post&s=list&pid={}&tags={uri_tags}",
page * 42
))
.send()
@ -50,7 +53,7 @@ async fn main() -> ExitCode {
)
};
let mut urls = post_html().await;
let mut urls = post_html(&client).await;
let mut wait_time = 5000;
@ -59,7 +62,7 @@ async fn main() -> ExitCode {
println!("no urls found, retrying in {} seconds...", wait_time / 1000);
sleep(Duration::from_millis(wait_time)).await;
urls = post_html().await;
urls = post_html(&client).await;
if !urls.is_empty() {
println!("urls found! continuing...");
@ -75,15 +78,38 @@ async fn main() -> ExitCode {
}
}
for url in urls {
let img_url =
extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap());
if img_url.is_empty() {
println!("image url not found");
} else {
println!("found image url: {img_url}");
let ratelimit_lock = &Arc::new(Mutex::new(()));
let responses = stream::iter(urls.into_iter().enumerate()).map(|(i, url)| {
let client = &client;
async move {
// "thread"
let thread_id = format!("[{: >4}]", i % 9999);
println!("{thread_id} scraping {url:?}");
loop {
let lock = ratelimit_lock.lock().await;
drop(lock);
let resp = client.get(&url).send().await.unwrap();
match extract_img_url(&resp.text().await.unwrap()) {
Ok(img_url) => {
if img_url.is_empty() {
println!("{thread_id} image url not found");
} else {
println!("{thread_id} found image url: {img_url}");
}
break img_url;
}
Err(_) => {
let lock = ratelimit_lock.lock().await;
println!("{thread_id} ratelimited, retrying after 1 second");
tokio::time::sleep(std::time::Duration::from_millis(1000)).await;
drop(lock);
continue;
}
}
};
}
}
}).buffered(args.jobs);
let _ = responses.for_each(|_| async {}).await;
}
return ExitCode::SUCCESS;
@ -97,14 +123,18 @@ fn extract_urls(html: &str) -> Vec<String> {
.collect()
}
fn extract_img_url(html: &str) -> String {
fn extract_img_url(html: &str) -> Result<String, &'static str> {
if let Some(img_url) =
Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
.unwrap()
.find(html)
{
img_url.as_str().to_string()
Ok(img_url.as_str().to_string())
} else {
String::new()
if html.contains("503 Rate limiting") {
Err("ratelimited")
} else {
Ok(String::new())
}
}
}