Compare commits

..

19 Commits

Author SHA1 Message Date
54ccc84719 add database 2025-02-01 21:37:41 -05:00
5eaff063f3 Update README.md
made readme more accurate
2024-11-25 03:31:16 +01:00
ef1e502af1 Merge pull request 'fix' (#14) from ErrorNoInternet/r34-scraper:fix into main
👍
2024-11-25 03:29:35 +01:00
1878807461
fix 2024-11-24 21:28:26 -05:00
20a3a8c4c6 Merge pull request 'update dependencies (plox sent me patch)' (#13) from ErrorNoInternet/r34-scraper:main into main
Reviewed-on: https://git.javalsai.dynv6.net/danmax/r34-scraper/pulls/13
2024-10-25 03:01:46 +02:00
52fe7d5187 update dependencies 2024-10-24 20:58:18 -04:00
2648b9c20e Merge pull request 'feat: capture SIGINT to avoid corrupting downloads' (#12) from ErrorNoInternet/r34-scraper:ctrlc into main
bro got no bitches
2024-10-21 00:44:42 +02:00
094b0b7412
feat: capture SIGINT to avoid corrupting downloads 2024-10-20 18:43:05 -04:00
351439034e Merge pull request 'refactor: minor nitpicks' (#11) from ErrorNoInternet/r34-scraper:main into main
Reviewed-on: https://git.javalsai.dynv6.net/danmax/r34-scraper/pulls/11
2024-10-21 00:00:43 +02:00
fe3c399c1d
refactor: minor nitpicks 2024-10-20 17:59:10 -04:00
danmax
89830d6e1e update lock file 2024-10-20 17:15:59 -04:00
1b5608b014 Merge pull request 'bring back toolchain' (#10) from javalsai-changes into main
Reviewed-on: https://git.javalsai.dynv6.net/danmax/r34-scraper/pulls/10
2024-10-20 21:32:37 +02:00
a852c8bcc5
chore: bring back rust-toolchain.toml 2024-10-20 21:31:27 +02:00
235e13230b
Merge branch 'main' into javalsai-changes 2024-10-20 21:31:03 +02:00
danmax
bd517ed0b5 chore: bump version to 1.0.0 2024-10-20 01:48:21 -04:00
5f848be434 Merge pull request 'refactor: accept cli args only' (#9) from ErrorNoInternet/r34-scraper:cli-args-only into main
Reviewed-on: https://git.javalsai.dynv6.net/danmax/r34-scraper/pulls/9
2024-10-20 06:25:37 +02:00
08ed5e51f2 Merge pull request 'feat(cli): add page argument to start scraping from a specific page' (#8) from ErrorNoInternet/r34-scraper:add-page-arg into main
Reviewed-on: https://git.javalsai.dynv6.net/danmax/r34-scraper/pulls/8
2024-10-20 06:17:35 +02:00
3573f6ff5a
feat(cli): add page argument to start scraping from a specific page 2024-10-20 00:16:22 -04:00
bcd349e36f
cargo clippy 👍 2024-10-19 21:50:54 +02:00
6 changed files with 140 additions and 55 deletions

99
Cargo.lock generated
View File

@ -28,9 +28,9 @@ dependencies = [
[[package]]
name = "anstream"
version = "0.6.15"
version = "0.6.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526"
checksum = "23a1e53f0f5d86382dafe1cf314783b2044280f406e7e1506368220ad11b1338"
dependencies = [
"anstyle",
"anstyle-parse",
@ -43,36 +43,36 @@ dependencies = [
[[package]]
name = "anstyle"
version = "1.0.8"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1"
checksum = "8365de52b16c035ff4fcafe0092ba9390540e3e352870ac09933bebcaa2c8c56"
[[package]]
name = "anstyle-parse"
version = "0.2.5"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb47de1e80c2b463c735db5b217a0ddc39d612e7ac9e2e96a5aed1f57616c1cb"
checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
version = "1.1.1"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d36fc52c7f6c869915e99412912f22093507da8d9e942ceaf66fe4b7c14422a"
checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
dependencies = [
"windows-sys 0.52.0",
"windows-sys 0.59.0",
]
[[package]]
name = "anstyle-wincon"
version = "3.0.4"
version = "3.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8"
checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125"
dependencies = [
"anstyle",
"windows-sys 0.52.0",
"windows-sys 0.59.0",
]
[[package]]
@ -248,9 +248,9 @@ checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
[[package]]
name = "bytes"
version = "1.7.2"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "428d9aa8fbc0670b7b8d6030a7fadd0f86151cae55e4dbbece15f3780a3dfaf3"
checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da"
[[package]]
name = "cc"
@ -267,6 +267,12 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "cfg_aliases"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
[[package]]
name = "clap"
version = "4.5.20"
@ -309,9 +315,9 @@ checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
[[package]]
name = "colorchoice"
version = "1.0.2"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3fd119d74b830634cea2a0f58bbd0d54540518a14397557951e79340abc28c0"
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
[[package]]
name = "concurrent-queue"
@ -357,6 +363,16 @@ version = "0.8.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
[[package]]
name = "ctrlc"
version = "3.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90eeab0aa92f3f9b4e87f258c72b139c207d251f9cbc1080a0086b86a8870dd3"
dependencies = [
"nix",
"windows-sys 0.59.0",
]
[[package]]
name = "encode_unicode"
version = "0.3.6"
@ -365,9 +381,9 @@ checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f"
[[package]]
name = "encoding_rs"
version = "0.8.34"
version = "0.8.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59"
checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
dependencies = [
"cfg-if",
]
@ -892,6 +908,18 @@ dependencies = [
"tempfile",
]
[[package]]
name = "nix"
version = "0.29.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "71e2746dc3a24dd78b3cfcb7be93368c6de9963d30f43a6a73998a9cf4b17b46"
dependencies = [
"bitflags",
"cfg-if",
"cfg_aliases",
"libc",
]
[[package]]
name = "number_prefix"
version = "0.4.0"
@ -971,9 +999,9 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
[[package]]
name = "pin-project-lite"
version = "0.2.14"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02"
checksum = "915a1e146535de9163f3987b8944ed8cf49a18bb0056bcebcdcece385cece4ff"
[[package]]
name = "pin-utils"
@ -1021,9 +1049,9 @@ checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2"
[[package]]
name = "proc-macro2"
version = "1.0.88"
version = "1.0.89"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c3a7fc5db1e57d5a779a352c8cdb57b29aa4c40cc69c3a68a7fedc815fbf2f9"
checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e"
dependencies = [
"unicode-ident",
]
@ -1039,10 +1067,11 @@ dependencies = [
[[package]]
name = "r34-scraper"
version = "0.1.0"
version = "1.0.0"
dependencies = [
"async-std",
"clap",
"ctrlc",
"futures",
"indicatif",
"regex",
@ -1052,9 +1081,9 @@ dependencies = [
[[package]]
name = "regex"
version = "1.11.0"
version = "1.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8"
checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
dependencies = [
"aho-corasick",
"memchr",
@ -1236,18 +1265,18 @@ dependencies = [
[[package]]
name = "serde"
version = "1.0.210"
version = "1.0.213"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a"
checksum = "3ea7893ff5e2466df8d720bb615088341b295f849602c6956047f8f80f0e9bc1"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.210"
version = "1.0.213"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f"
checksum = "7e85ad2009c50b58e87caa8cd6dac16bdf511bbfb7af6c33df902396aa480fa5"
dependencies = [
"proc-macro2",
"quote",
@ -1329,9 +1358,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
[[package]]
name = "syn"
version = "2.0.79"
version = "2.0.85"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89132cd0bf050864e1d38dc3bbc07a0eb8e7530af26344d3d2bbbef83499f590"
checksum = "5023162dfcd14ef8f32034d8bcd4cc5ddc61ef7a247c024a33e24e1f24d21b56"
dependencies = [
"proc-macro2",
"quote",
@ -1398,9 +1427,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tokio"
version = "1.40.0"
version = "1.41.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2b070231665d27ad9ec9b8df639893f46727666c6767db40317fbe920a5d998"
checksum = "145f3413504347a2be84393cc8a7d2fb4d863b375909ea59f2158261aa258bbb"
dependencies = [
"backtrace",
"bytes",
@ -1540,9 +1569,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "value-bag"
version = "1.9.0"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a84c137d37ab0142f0f2ddfe332651fdbf252e7b7dbb4e67b6c1f1b2e925101"
checksum = "3ef4c4aa54d5d05a279399bfa921ec387b7aba77caf7a682ae8d86785b8fdad2"
[[package]]
name = "vcpkg"

View File

@ -1,11 +1,12 @@
[package]
name = "r34-scraper"
version = "0.1.0"
version = "1.0.0"
edition = "2021"
[dependencies]
async-std = "1"
clap = { version = "4", features = ["derive"] }
ctrlc = "3"
futures = "0"
indicatif = "0"
regex = "1"

View File

@ -4,7 +4,7 @@ a scraper that well scrapes r34
## note
this program is pretty much complete, although i am planning to add a few extra features.
this program is pretty much complete, although i might add a few extra features.
## example usage image
![example image](./image.png)

2
rust-toolchain.toml Normal file
View File

@ -0,0 +1,2 @@
[toolchain]
channel = "nightly"

View File

@ -14,10 +14,17 @@ pub struct Args {
#[arg(short, long, required = true)]
pub tags: Vec<String>,
/// Page to start scraping from
#[arg(short, long, default_value_t = 1)]
pub page: usize,
/// Async jobs to use for fetching
#[arg(short, long, default_value = "4")]
#[arg(short, long, default_value_t = 4)]
pub jobs: usize,
#[arg(long, default_value = "downloads")]
pub dir: String,
/// Delay for rate-limits (ms)
#[arg(short, long, default_value = "1000", value_parser = parse_duration)]
pub delay: std::time::Duration,

View File

@ -1,4 +1,3 @@
#![feature(async_closure)]
pub mod args;
use clap::Parser;
@ -10,6 +9,8 @@ use tokio::time::{sleep, Duration};
use std::io::Write;
use std::process::ExitCode;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
const BAR_LENGTH: u64 = 8;
@ -18,19 +19,27 @@ async fn main() -> ExitCode {
let args = args::Args::parse();
let uri_tags = &args.tags.join("+");
let _ = std::fs::create_dir(uri_tags);
let dir = &args.dir;
let _ = std::fs::create_dir(dir);
let running = Arc::new(AtomicBool::new(true));
let running_t = running.clone();
ctrlc::set_handler(move || {
running_t.store(false, Ordering::SeqCst);
})
.unwrap();
let client = Client::builder()
.user_agent(&args.user_agent)
.build()
.unwrap();
for page in 0.. {
println!("now scraping page {}", page + 1);
println!(
"https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}",
page * 42
);
for page in args.page - 1.. {
if !running.load(Ordering::SeqCst) {
return ExitCode::FAILURE;
}
println!("now scraping page {} (https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={})", page + 1, page * 42);
let post_html = async |client: &Client| {
extract_urls(
@ -55,6 +64,10 @@ async fn main() -> ExitCode {
println!("no urls found, retrying in 5 seconds...");
sleep(Duration::from_millis(5000)).await;
if !running.load(Ordering::SeqCst) {
return ExitCode::FAILURE;
}
urls = post_html(&client).await;
if !urls.is_empty() {
println!("urls found! continuing...");
@ -75,21 +88,26 @@ async fn main() -> ExitCode {
.map(|(i, url)| {
let i = i + 1;
let client = &client;
let running_t = running.clone();
let this_bar = indicatif::ProgressBar::new(BAR_LENGTH);
this_bar.set_style(indicatif::ProgressStyle::with_template("[{bar}] {msg}").unwrap().progress_chars("=> "));
let this_bar = multi_prog.insert(i, this_bar);
async move {
// "thread"
loop {
this_bar.set_message(format!("\x1b[37m[{i: >4}/{urls_amount}] \x1b[36mscraping {url:?}\x1b[0m"));
let resp = client.get(&url).send().await.unwrap();
if let Ok(img_url) = extract_img_url(&resp.text().await.unwrap()) {
if !running_t.load(Ordering::SeqCst) {
return;
}
this_bar.set_message(format!("\x1b[37m[{i: >4}/{urls_amount}] \x1b[36mscraping {url}\x1b[0m"));
let resp = client.get(&url).send().await.unwrap().text().await.unwrap();
if let Ok(img_url) = extract_img_url(&resp) {
if img_url.is_empty() {
this_bar.abandon_with_message(format!(
"\x1b[37m[{i: >4}/{urls_amount}] \x1b[1;31mimage url not found\x1b[0m"
));
} else {
download_file(&img_url, this_bar, i, urls_amount, uri_tags).await;
download_file(running_t, &img_url, this_bar, i, urls_amount, &resp, dir.as_str()).await;
}
break;
}
@ -104,7 +122,7 @@ async fn main() -> ExitCode {
}
})
.buffered(args.jobs);
let _ = responses.for_each(|()| async {}).await;
let () = responses.for_each(|()| async {}).await;
}
ExitCode::SUCCESS
@ -118,9 +136,26 @@ fn extract_urls(html: &str) -> Vec<String> {
.collect()
}
/// Appends a `"{file_path}: {title line}"` record to the `database` file.
///
/// Scans `html` line by line for the first line containing `<title>`, strips
/// that line's leading whitespace, and appends it (prefixed with `file_path`)
/// to a file named `database` in the current working directory. The file is
/// created if it does not exist. Nothing is written, and the file is not
/// touched, when no line of `html` contains `<title>`.
///
/// # Panics
///
/// Panics if the `database` file cannot be opened or written to.
fn write_to_database(html: &str, file_path: &str) {
    // `lines()` (unlike `split('\n')`) also strips the `\r` of CRLF line
    // endings, so a stray carriage return never leaks into the stored record.
    let Some(title_line) = html.lines().find(|line| line.contains("<title>")) else {
        return;
    };

    let mut file = std::fs::OpenOptions::new()
        .create(true)
        .append(true)
        .open("database")
        .expect("failed to open the `database` file for appending");
    writeln!(file, "{file_path}: {}", title_line.trim_start())
        .expect("failed to append a record to the `database` file");
}
fn extract_img_url(html: &str) -> Result<String, &'static str> {
if let Some(img_url) =
Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
Regex::new(r"https://us\.rule34\.xxx//images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
.unwrap()
.find(html)
{
@ -133,11 +168,13 @@ fn extract_img_url(html: &str) -> Result<String, &'static str> {
}
async fn download_file(
running: Arc<AtomicBool>,
img_url: &str,
this_bar: ProgressBar,
i: usize,
urls_amount: usize,
uri_tags: &str,
html: &str,
dir: &str,
) {
let args = args::Args::parse();
@ -147,7 +184,7 @@ async fn download_file(
.map(|m| m.as_str())
.unwrap();
let file_path = uri_tags.to_owned() + "/" + file_name;
let file_path = dir.to_owned() + "/" + file_name;
let mut file = if std::fs::File::open(&file_path).is_ok() {
this_bar.finish_with_message(format!(
@ -155,7 +192,7 @@ async fn download_file(
));
return;
} else {
std::fs::File::create(file_path).unwrap()
std::fs::File::create(&file_path).unwrap()
};
let mut res = Client::new()
@ -167,10 +204,19 @@ async fn download_file(
let file_length = res.content_length().unwrap();
let mut written = 0;
while let Some(img_chunk) = res.chunk().await.unwrap() {
if !running.load(Ordering::SeqCst) {
this_bar.abandon_with_message(format!(
"\x1b[37m[{i: >4}/{urls_amount}] \x1b[33mcancelling {img_url}\x1b[0m"
));
drop(file);
std::fs::remove_file(&file_path).unwrap();
return;
}
file.write_all(&img_chunk).unwrap();
written += img_chunk.len();
this_bar.set_position((written as f64 / file_length as f64 * BAR_LENGTH as f64) as u64);
}
write_to_database(html, &file_path);
this_bar.finish_with_message(format!(
"\x1b[37m[{i: >4}/{urls_amount}] \x1b[32mdownloaded {img_url}\x1b[0m"
));