13 Commits

Author SHA1 Message Date
094b0b7412 feat: capture SIGINT to avoid corrupting downloads 2024-10-20 18:43:05 -04:00
351439034e Merge pull request 'refactor: minor nitpicks' (#11) from ErrorNoInternet/r34-scraper:main into main
Reviewed-on: https://git.javalsai.dynv6.net/danmax/r34-scraper/pulls/11
2024-10-21 00:00:43 +02:00
fe3c399c1d refactor: minor nitpicks 2024-10-20 17:59:10 -04:00
danmax
89830d6e1e update lock file 2024-10-20 17:15:59 -04:00
1b5608b014 Merge pull request 'bring back toolchain' (#10) from javalsai-changes into main
Reviewed-on: https://git.javalsai.dynv6.net/danmax/r34-scraper/pulls/10
2024-10-20 21:32:37 +02:00
a852c8bcc5 chore: bring back rust-toolchain.toml 2024-10-20 21:31:27 +02:00
235e13230b Merge branch 'main' into javalsai-changes 2024-10-20 21:31:03 +02:00
danmax
bd517ed0b5 chore: bump version to 1.0.0 2024-10-20 01:48:21 -04:00
5f848be434 Merge pull request 'refactor: accept cli args only' (#9) from ErrorNoInternet/r34-scraper:cli-args-only into main
Reviewed-on: https://git.javalsai.dynv6.net/danmax/r34-scraper/pulls/9
2024-10-20 06:25:37 +02:00
8723769429 refactor: accept cli args only 2024-10-20 00:24:35 -04:00
08ed5e51f2 Merge pull request 'feat(cli): add page argument to start scraping from a specific page' (#8) from ErrorNoInternet/r34-scraper:add-page-arg into main
Reviewed-on: https://git.javalsai.dynv6.net/danmax/r34-scraper/pulls/8
2024-10-20 06:17:35 +02:00
3573f6ff5a feat(cli): add page argument to start scraping from a specific page 2024-10-20 00:16:22 -04:00
bcd349e36f cargo clippy 👍 2024-10-19 21:50:54 +02:00
5 changed files with 83 additions and 34 deletions

35
Cargo.lock generated
View File

@@ -267,6 +267,12 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "cfg_aliases"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
[[package]] [[package]]
name = "clap" name = "clap"
version = "4.5.20" version = "4.5.20"
@@ -357,6 +363,16 @@ version = "0.8.20"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
[[package]]
name = "ctrlc"
version = "3.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90eeab0aa92f3f9b4e87f258c72b139c207d251f9cbc1080a0086b86a8870dd3"
dependencies = [
"nix",
"windows-sys 0.59.0",
]
[[package]] [[package]]
name = "encode_unicode" name = "encode_unicode"
version = "0.3.6" version = "0.3.6"
@@ -892,6 +908,18 @@ dependencies = [
"tempfile", "tempfile",
] ]
[[package]]
name = "nix"
version = "0.29.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "71e2746dc3a24dd78b3cfcb7be93368c6de9963d30f43a6a73998a9cf4b17b46"
dependencies = [
"bitflags",
"cfg-if",
"cfg_aliases",
"libc",
]
[[package]] [[package]]
name = "number_prefix" name = "number_prefix"
version = "0.4.0" version = "0.4.0"
@@ -1039,10 +1067,11 @@ dependencies = [
[[package]] [[package]]
name = "r34-scraper" name = "r34-scraper"
version = "0.1.0" version = "1.0.0"
dependencies = [ dependencies = [
"async-std", "async-std",
"clap", "clap",
"ctrlc",
"futures", "futures",
"indicatif", "indicatif",
"regex", "regex",
@@ -1329,9 +1358,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
[[package]] [[package]]
name = "syn" name = "syn"
version = "2.0.79" version = "2.0.82"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590" checksum = "83540f837a8afc019423a8edb95b52a8effe46957ee402287f4292fae35be021"
dependencies = [ dependencies = [
"proc-macro2", "proc-macro2",
"quote", "quote",

View File

@@ -1,11 +1,12 @@
[package] [package]
name = "r34-scraper" name = "r34-scraper"
version = "0.1.0" version = "1.0.0"
edition = "2021" edition = "2021"
[dependencies] [dependencies]
async-std = "1" async-std = "1"
clap = { version = "4", features = ["derive"] } clap = { version = "4", features = ["derive"] }
ctrlc = "3"
futures = "0" futures = "0"
indicatif = "0" indicatif = "0"
regex = "1" regex = "1"

2
rust-toolchain.toml Normal file
View File

@@ -0,0 +1,2 @@
[toolchain]
channel = "nightly"

View File

@@ -11,11 +11,15 @@ pub struct Args {
pub user_agent: String, pub user_agent: String,
/// Tags to search for /// Tags to search for
#[arg(short, long)] #[arg(short, long, required = true)]
pub tags: Option<Vec<String>>, pub tags: Vec<String>,
/// Page to start scraping from
#[arg(short, long, default_value_t = 1)]
pub page: usize,
/// Async jobs to use for fetching /// Async jobs to use for fetching
#[arg(short, long, default_value = "4")] #[arg(short, long, default_value_t = 4)]
pub jobs: usize, pub jobs: usize,
/// Delay for rate-limits (ms) /// Delay for rate-limits (ms)

View File

@@ -1,4 +1,4 @@
#![feature(async_closure, iter_intersperse)] #![feature(async_closure)]
pub mod args; pub mod args;
use clap::Parser; use clap::Parser;
@@ -10,6 +10,8 @@ use tokio::time::{sleep, Duration};
use std::io::Write; use std::io::Write;
use std::process::ExitCode; use std::process::ExitCode;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
const BAR_LENGTH: u64 = 8; const BAR_LENGTH: u64 = 8;
@@ -17,34 +19,27 @@ const BAR_LENGTH: u64 = 8;
async fn main() -> ExitCode { async fn main() -> ExitCode {
let args = args::Args::parse(); let args = args::Args::parse();
let tags = args.tags.unwrap_or_else(|| { let uri_tags = &args.tags.join("+");
println!("which tags do you want to scrape? ex: 1girls 1boys yomama"); let _ = std::fs::create_dir(uri_tags);
let tags_binding = std::io::stdin().lines().next().unwrap().unwrap();
tags_binding
.split(' ')
.filter(|item| !item.is_empty())
.map(std::borrow::ToOwned::to_owned)
.collect()
});
let tags_folder = &tags.join("+");
let uri_tags = tags
.into_iter()
.intersperse(String::from("+"))
.collect::<String>();
let _ = std::fs::create_dir(tags_folder); let running = Arc::new(AtomicBool::new(true));
let running_t = running.clone();
ctrlc::set_handler(move || {
running_t.store(false, Ordering::SeqCst);
})
.unwrap();
let client = Client::builder() let client = Client::builder()
.user_agent(&args.user_agent) .user_agent(&args.user_agent)
.build() .build()
.unwrap(); .unwrap();
for page in 0.. { for page in args.page - 1.. {
println!("now scraping page {}", page + 1); if !running.load(Ordering::SeqCst) {
println!( return ExitCode::FAILURE;
"https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}", }
page * 42
); println!("now scraping page {} (https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={})", page + 1, page * 42);
let post_html = async |client: &Client| { let post_html = async |client: &Client| {
extract_urls( extract_urls(
@@ -69,6 +64,10 @@ async fn main() -> ExitCode {
println!("no urls found, retrying in 5 seconds..."); println!("no urls found, retrying in 5 seconds...");
sleep(Duration::from_millis(5000)).await; sleep(Duration::from_millis(5000)).await;
if !running.load(Ordering::SeqCst) {
return ExitCode::FAILURE;
}
urls = post_html(&client).await; urls = post_html(&client).await;
if !urls.is_empty() { if !urls.is_empty() {
println!("urls found! continuing..."); println!("urls found! continuing...");
@@ -89,13 +88,18 @@ async fn main() -> ExitCode {
.map(|(i, url)| { .map(|(i, url)| {
let i = i + 1; let i = i + 1;
let client = &client; let client = &client;
let running_t = running.clone();
let this_bar = indicatif::ProgressBar::new(BAR_LENGTH); let this_bar = indicatif::ProgressBar::new(BAR_LENGTH);
this_bar.set_style(indicatif::ProgressStyle::with_template("[{bar}] {msg}").unwrap().progress_chars("=> ")); this_bar.set_style(indicatif::ProgressStyle::with_template("[{bar}] {msg}").unwrap().progress_chars("=> "));
let this_bar = multi_prog.insert(i, this_bar); let this_bar = multi_prog.insert(i, this_bar);
async move { async move {
// "thread" // "thread"
loop { loop {
this_bar.set_message(format!("\x1b[37m[{i: >4}/{urls_amount}] \x1b[36mscraping {url:?}\x1b[0m")); if !running_t.load(Ordering::SeqCst) {
return;
}
this_bar.set_message(format!("\x1b[37m[{i: >4}/{urls_amount}] \x1b[36mscraping {url}\x1b[0m"));
let resp = client.get(&url).send().await.unwrap(); let resp = client.get(&url).send().await.unwrap();
if let Ok(img_url) = extract_img_url(&resp.text().await.unwrap()) { if let Ok(img_url) = extract_img_url(&resp.text().await.unwrap()) {
if img_url.is_empty() { if img_url.is_empty() {
@@ -103,7 +107,7 @@ async fn main() -> ExitCode {
"\x1b[37m[{i: >4}/{urls_amount}] \x1b[1;31mimage url not found\x1b[0m" "\x1b[37m[{i: >4}/{urls_amount}] \x1b[1;31mimage url not found\x1b[0m"
)); ));
} else { } else {
download_file(&img_url, this_bar, i, urls_amount, tags_folder).await; download_file(running_t, &img_url, this_bar, i, urls_amount, uri_tags).await;
} }
break; break;
} }
@@ -147,11 +151,12 @@ fn extract_img_url(html: &str) -> Result<String, &'static str> {
} }
async fn download_file( async fn download_file(
running: Arc<AtomicBool>,
img_url: &str, img_url: &str,
this_bar: ProgressBar, this_bar: ProgressBar,
i: usize, i: usize,
urls_amount: usize, urls_amount: usize,
tags_folder: &str, uri_tags: &str,
) { ) {
let args = args::Args::parse(); let args = args::Args::parse();
@@ -161,7 +166,7 @@ async fn download_file(
.map(|m| m.as_str()) .map(|m| m.as_str())
.unwrap(); .unwrap();
let file_path = tags_folder.to_owned() + "/" + file_name; let file_path = uri_tags.to_owned() + "/" + file_name;
let mut file = if std::fs::File::open(&file_path).is_ok() { let mut file = if std::fs::File::open(&file_path).is_ok() {
this_bar.finish_with_message(format!( this_bar.finish_with_message(format!(
@@ -169,7 +174,7 @@ async fn download_file(
)); ));
return; return;
} else { } else {
std::fs::File::create(file_path).unwrap() std::fs::File::create(&file_path).unwrap()
}; };
let mut res = Client::new() let mut res = Client::new()
@@ -181,6 +186,14 @@ async fn download_file(
let file_length = res.content_length().unwrap(); let file_length = res.content_length().unwrap();
let mut written = 0; let mut written = 0;
while let Some(img_chunk) = res.chunk().await.unwrap() { while let Some(img_chunk) = res.chunk().await.unwrap() {
if !running.load(Ordering::SeqCst) {
this_bar.abandon_with_message(format!(
"\x1b[37m[{i: >4}/{urls_amount}] \x1b[33mcancelling {img_url}\x1b[0m"
));
drop(file);
std::fs::remove_file(&file_path).unwrap();
return;
}
file.write_all(&img_chunk).unwrap(); file.write_all(&img_chunk).unwrap();
written += img_chunk.len(); written += img_chunk.len();
this_bar.set_position((written as f64 / file_length as f64 * BAR_LENGTH as f64) as u64); this_bar.set_position((written as f64 / file_length as f64 * BAR_LENGTH as f64) as u64);