Compare commits

..

No commits in common. "a852c8bcc51b26a317b6ff5eebf6837cd854c781" and "bcd349e36f8b8f09293abfb7a5d5c554cce243de" have entirely different histories.

8 changed files with 122 additions and 100 deletions

1
.gitignore vendored
View File

@ -1,2 +1 @@
/target /target
/downloads

59
Cargo.lock generated
View File

@ -833,6 +833,16 @@ version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
[[package]]
name = "lock_api"
version = "0.4.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17"
dependencies = [
"autocfg",
"scopeguard",
]
[[package]] [[package]]
name = "log" name = "log"
version = "0.4.22" version = "0.4.22"
@ -963,6 +973,29 @@ version = "2.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba"
[[package]]
name = "parking_lot"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27"
dependencies = [
"lock_api",
"parking_lot_core",
]
[[package]]
name = "parking_lot_core"
version = "0.9.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8"
dependencies = [
"cfg-if",
"libc",
"redox_syscall",
"smallvec",
"windows-targets",
]
[[package]] [[package]]
name = "percent-encoding" name = "percent-encoding"
version = "2.3.1" version = "2.3.1"
@ -1050,6 +1083,15 @@ dependencies = [
"tokio", "tokio",
] ]
[[package]]
name = "redox_syscall"
version = "0.5.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f"
dependencies = [
"bitflags",
]
[[package]] [[package]]
name = "regex" name = "regex"
version = "1.11.0" version = "1.11.0"
@ -1211,6 +1253,12 @@ dependencies = [
"windows-sys 0.59.0", "windows-sys 0.59.0",
] ]
[[package]]
name = "scopeguard"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]] [[package]]
name = "security-framework" name = "security-framework"
version = "2.11.1" version = "2.11.1"
@ -1284,6 +1332,15 @@ version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
[[package]]
name = "signal-hook-registry"
version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a9e9e0b4211b72e7b8b6e85c807d36c212bdb33ea8587f7569562a84df5465b1"
dependencies = [
"libc",
]
[[package]] [[package]]
name = "slab" name = "slab"
version = "0.4.9" version = "0.4.9"
@ -1406,7 +1463,9 @@ dependencies = [
"bytes", "bytes",
"libc", "libc",
"mio", "mio",
"parking_lot",
"pin-project-lite", "pin-project-lite",
"signal-hook-registry",
"socket2", "socket2",
"tokio-macros", "tokio-macros",
"windows-sys 0.52.0", "windows-sys 0.52.0",

View File

@ -1,13 +1,13 @@
[package] [package]
name = "r34-scraper" name = "r34-scraper"
version = "1.0.0" version = "0.1.0"
edition = "2021" edition = "2021"
[dependencies] [dependencies]
async-std = "1" async-std = "1.13.0"
clap = { version = "4", features = ["derive"] } clap = { version = "4.5.20", features = ["derive"] }
futures = "0" futures = "0.3.31"
indicatif = "0" indicatif = "0.17.8"
regex = "1" regex = "1.11.0"
reqwest = { version = "0", features = ["blocking"] } reqwest = { version = "0.12.8", features = ["blocking"] }
tokio = { version = "1", features = ["macros", "rt-multi-thread"] } tokio = { version = "1", features = ["full"] }

View File

@ -4,7 +4,7 @@ a scraper that well scrapes r34
## note ## note
this program is pretty much complete, although i am planning to add a few extra features. this thing is still not completed, it only gathers links, it doesnt download things yet
## example usage image ## example usage image
![example image](./image.png) ![example image](./image.png)

BIN
image.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 122 KiB

After

Width:  |  Height:  |  Size: 79 KiB

View File

@ -1,2 +0,0 @@
[toolchain]
channel = "nightly"

View File

@ -11,15 +11,11 @@ pub struct Args {
pub user_agent: String, pub user_agent: String,
/// Tags to search for /// Tags to search for
#[arg(short, long, required = true)] #[arg(short, long)]
pub tags: Vec<String>, pub tags: Option<Vec<String>>,
/// Page to start scraping from
#[arg(short, long, default_value_t = 1)]
pub page: usize,
/// Async jobs to use for fetching /// Async jobs to use for fetching
#[arg(short, long, default_value_t = 4)] #[arg(short, long, default_value = "4")]
pub jobs: usize, pub jobs: usize,
/// Delay for rate-limits (ms) /// Delay for rate-limits (ms)

View File

@ -1,31 +1,35 @@
#![feature(async_closure)] #![feature(async_closure, iter_intersperse)]
pub mod args; pub mod args;
use clap::Parser; use clap::Parser;
use futures::{stream, StreamExt}; use futures::{stream, StreamExt};
use indicatif::ProgressBar;
use regex::Regex; use regex::Regex;
use reqwest::Client; use reqwest::Client;
use tokio::time::{sleep, Duration}; use tokio::time::{sleep, Duration};
use std::io::Write;
use std::process::ExitCode; use std::process::ExitCode;
const BAR_LENGTH: u64 = 8;
#[tokio::main] #[tokio::main]
async fn main() -> ExitCode { async fn main() -> ExitCode {
let args = args::Args::parse(); let args = args::Args::parse();
let uri_tags = &args.tags.join("+"); let tags = args.tags.unwrap_or_else(|| {
let _ = std::fs::create_dir(uri_tags); println!("which tags do you want to scrape? ex: 1girls 1boys yomama");
let tags_binding = std::io::stdin().lines().next().unwrap().unwrap();
tags_binding
.split(' ')
.filter(|item| !item.is_empty())
.map(|item| item.to_owned())
.collect()
});
let uri_tags: String = tags.into_iter().intersperse(String::from("+")).collect();
let client = Client::builder() let client = Client::builder()
.user_agent(&args.user_agent) .user_agent(&args.user_agent)
.build() .build()
.unwrap(); .unwrap();
for page in args.page - 1.. { for page in 0.. {
println!("now scraping page {}", page + 1); println!("now scraping page {}", page + 1);
println!( println!(
"https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}", "https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}",
@ -49,65 +53,75 @@ async fn main() -> ExitCode {
}; };
let mut urls = post_html(&client).await; let mut urls = post_html(&client).await;
let mut wait_time = 5000;
if urls.is_empty() { if urls.is_empty() {
let mut reconnection_attempts = 0; for reconnection_attempts in 0..4 {
loop { println!("no urls found, retrying in {} seconds...", wait_time / 1000);
println!("no urls found, retrying in 5 seconds..."); sleep(Duration::from_millis(wait_time)).await;
sleep(Duration::from_millis(5000)).await;
urls = post_html(&client).await; urls = post_html(&client).await;
if !urls.is_empty() { if !urls.is_empty() {
println!("urls found! continuing..."); println!("urls found! continuing...");
break; break;
} }
reconnection_attempts += 1; if reconnection_attempts == 3 {
if reconnection_attempts == 12 { println!("no urls found in 4 attempts, exiting...");
println!("no urls found in 1 minute, exiting...");
return ExitCode::FAILURE; return ExitCode::FAILURE;
} }
wait_time += 5000;
} }
} }
let multi_prog = indicatif::MultiProgress::new(); let multi_prog = indicatif::MultiProgress::new();
let urls_amount = urls.len(); let urls_ammount = urls.len();
let responses = stream::iter(urls.into_iter().enumerate()) let responses = stream::iter(urls.into_iter().enumerate())
.map(|(i, url)| { .map(|(i, url)| {
let i = i + 1; let i = i + 1;
let client = &client; let client = &client;
let this_bar = indicatif::ProgressBar::new(BAR_LENGTH); let this_bar = indicatif::ProgressBar::new_spinner();
this_bar.set_style(indicatif::ProgressStyle::with_template("[{bar}] {msg}").unwrap().progress_chars("=> ")); this_bar.enable_steady_tick(Duration::from_millis(50));
let this_bar = multi_prog.insert(i, this_bar); let this_prog = multi_prog.insert(i, this_bar);
async move { async move {
// "thread" // "thread"
loop { loop {
this_bar.set_message(format!("\x1b[37m[{i: >4}/{urls_amount}] \x1b[36mscraping {url:?}\x1b[0m")); this_prog.set_message(format!("\x1b[30m[{i: >4}/{urls_ammount}] \x1b[36mscraping {url:?}\x1b[0m"));
let resp = client.get(&url).send().await.unwrap(); let resp = client.get(&url).send().await.unwrap();
if let Ok(img_url) = extract_img_url(&resp.text().await.unwrap()) { match extract_img_url(&resp.text().await.unwrap()) {
Ok(img_url) => {
if img_url.is_empty() { if img_url.is_empty() {
this_bar.abandon_with_message(format!( this_prog.abandon_with_message(format!(
"\x1b[37m[{i: >4}/{urls_amount}] \x1b[1;31mimage url not found\x1b[0m" "\x1b[30m[{i: >4}/{urls_ammount}] \x1b[1;31mimage url not found\x1b[0m"
)); ));
} else { } else {
download_file(&img_url, this_bar, i, urls_amount, uri_tags).await; this_prog.finish_with_message(format!(
"\x1b[30m[{i: >4}/{urls_ammount}] \x1b[32mfound image url: {img_url}\x1b[0m"
));
} }
break; break img_url;
} }
Err(_) => {
this_bar this_prog
.set_message(format!( .set_message(format!(
"\x1b[37m[{i: >4}/{urls_amount}] \x1b[31mratelimited, retrying after {}ms\x1b[0m", "\x1b[30m[{i: >4}/{urls_ammount}] \x1b[31mratelimited, retrying after {}ms\x1b[0m",
args.delay.as_millis()) args.delay.as_millis())
); );
tokio::time::sleep(args.delay).await; tokio::time::sleep(args.delay).await;
continue;
}
}
} }
} }
}) })
.buffered(args.jobs); .buffered(args.jobs);
let _ = responses.for_each(|()| async {}).await; let _ = responses.for_each(|_| async {}).await;
} }
ExitCode::SUCCESS return ExitCode::SUCCESS;
} }
fn extract_urls(html: &str) -> Vec<String> { fn extract_urls(html: &str) -> Vec<String> {
@ -131,47 +145,3 @@ fn extract_img_url(html: &str) -> Result<String, &'static str> {
Ok(String::new()) Ok(String::new())
} }
} }
async fn download_file(
img_url: &str,
this_bar: ProgressBar,
i: usize,
urls_amount: usize,
uri_tags: &str,
) {
let args = args::Args::parse();
let file_name = Regex::new(r"[^/]+$")
.unwrap()
.find(img_url)
.map(|m| m.as_str())
.unwrap();
let file_path = uri_tags.to_owned() + "/" + file_name;
let mut file = if std::fs::File::open(&file_path).is_ok() {
this_bar.finish_with_message(format!(
"\x1b[37m[{i: >4}/{urls_amount}] \x1b[33m{file_name} exists, skipping...\x1b[0m"
));
return;
} else {
std::fs::File::create(file_path).unwrap()
};
let mut res = Client::new()
.get(img_url)
.header("User-Agent", &args.user_agent)
.send()
.await
.unwrap();
let file_length = res.content_length().unwrap();
let mut written = 0;
while let Some(img_chunk) = res.chunk().await.unwrap() {
file.write_all(&img_chunk).unwrap();
written += img_chunk.len();
this_bar.set_position((written as f64 / file_length as f64 * BAR_LENGTH as f64) as u64);
}
this_bar.finish_with_message(format!(
"\x1b[37m[{i: >4}/{urls_amount}] \x1b[32mdownloaded {img_url}\x1b[0m"
));
}