forked from danmax/r34-scraper
		
	Compare commits
	
		
			13 Commits
		
	
	
		
			javalsai-c
			...
			javalsai-c
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| a852c8bcc5 | |||
| 235e13230b | |||
|  | bd517ed0b5 | ||
| 5f848be434 | |||
| 8723769429 | |||
| 08ed5e51f2 | |||
| 3573f6ff5a | |||
| ee0e938782 | |||
| ca6df90460 | |||
| 325730bd37 | |||
|  | 188b714741 | ||
|  | b5a70e3426 | ||
| 137378beb3 | 
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -1 +1,2 @@ | |||||||
| /target | /target | ||||||
|  | /downloads | ||||||
|   | |||||||
							
								
								
									
										59
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										59
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -833,16 +833,6 @@ version = "0.4.14" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" | checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "lock_api" |  | ||||||
| version = "0.4.12" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" |  | ||||||
| dependencies = [ |  | ||||||
|  "autocfg", |  | ||||||
|  "scopeguard", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "log" | name = "log" | ||||||
| version = "0.4.22" | version = "0.4.22" | ||||||
| @@ -973,29 +963,6 @@ version = "2.2.1" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" | checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "parking_lot" |  | ||||||
| version = "0.12.3" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" |  | ||||||
| dependencies = [ |  | ||||||
|  "lock_api", |  | ||||||
|  "parking_lot_core", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "parking_lot_core" |  | ||||||
| version = "0.9.10" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" |  | ||||||
| dependencies = [ |  | ||||||
|  "cfg-if", |  | ||||||
|  "libc", |  | ||||||
|  "redox_syscall", |  | ||||||
|  "smallvec", |  | ||||||
|  "windows-targets", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "percent-encoding" | name = "percent-encoding" | ||||||
| version = "2.3.1" | version = "2.3.1" | ||||||
| @@ -1083,15 +1050,6 @@ dependencies = [ | |||||||
|  "tokio", |  "tokio", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "redox_syscall" |  | ||||||
| version = "0.5.7" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" |  | ||||||
| dependencies = [ |  | ||||||
|  "bitflags", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "regex" | name = "regex" | ||||||
| version = "1.11.0" | version = "1.11.0" | ||||||
| @@ -1253,12 +1211,6 @@ dependencies = [ | |||||||
|  "windows-sys 0.59.0", |  "windows-sys 0.59.0", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "scopeguard" |  | ||||||
| version = "1.2.0" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "security-framework" | name = "security-framework" | ||||||
| version = "2.11.1" | version = "2.11.1" | ||||||
| @@ -1332,15 +1284,6 @@ version = "1.3.0" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" | checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "signal-hook-registry" |  | ||||||
| version = "1.4.2" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "a9e9e0b4211b72e7b8b6e85c807d36c212bdb33ea8587f7569562a84df5465b1" |  | ||||||
| dependencies = [ |  | ||||||
|  "libc", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "slab" | name = "slab" | ||||||
| version = "0.4.9" | version = "0.4.9" | ||||||
| @@ -1463,9 +1406,7 @@ dependencies = [ | |||||||
|  "bytes", |  "bytes", | ||||||
|  "libc", |  "libc", | ||||||
|  "mio", |  "mio", | ||||||
|  "parking_lot", |  | ||||||
|  "pin-project-lite", |  "pin-project-lite", | ||||||
|  "signal-hook-registry", |  | ||||||
|  "socket2", |  "socket2", | ||||||
|  "tokio-macros", |  "tokio-macros", | ||||||
|  "windows-sys 0.52.0", |  "windows-sys 0.52.0", | ||||||
|   | |||||||
							
								
								
									
										16
									
								
								Cargo.toml
									
									
									
									
									
								
							
							
						
						
									
										16
									
								
								Cargo.toml
									
									
									
									
									
								
							| @@ -1,13 +1,13 @@ | |||||||
| [package] | [package] | ||||||
| name = "r34-scraper" | name = "r34-scraper" | ||||||
| version = "0.1.0" | version = "1.0.0" | ||||||
| edition = "2021" | edition = "2021" | ||||||
|  |  | ||||||
| [dependencies] | [dependencies] | ||||||
| async-std = "1.13.0" | async-std = "1" | ||||||
| clap = { version = "4.5.20", features = ["derive"] } | clap = { version = "4", features = ["derive"] } | ||||||
| futures = "0.3.31" | futures = "0" | ||||||
| indicatif = "0.17.8" | indicatif = "0" | ||||||
| regex = "1.11.0" | regex = "1" | ||||||
| reqwest = { version = "0.12.8", features = ["blocking"] } | reqwest = { version = "0", features = ["blocking"] } | ||||||
| tokio = { version = "1", features = ["full"] } | tokio = { version = "1", features = ["macros", "rt-multi-thread"] } | ||||||
|   | |||||||
| @@ -4,7 +4,7 @@ a scraper that well scrapes r34 | |||||||
|  |  | ||||||
| ## note  | ## note  | ||||||
|  |  | ||||||
| this thing is still not completed, it only gathers links, it doesnt download things yet | this program is pretty much complete, although i am planning to add a few extra features. | ||||||
|  |  | ||||||
| ## example usage image | ## example usage image | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										
											BIN
										
									
								
								image.png
									
									
									
									
									
								
							
							
						
						
									
										
											BIN
										
									
								
								image.png
									
									
									
									
									
								
							
										
											Binary file not shown.
										
									
								
							| Before Width: | Height: | Size: 79 KiB After Width: | Height: | Size: 122 KiB | 
							
								
								
									
										2
									
								
								rust-toolchain.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2
									
								
								rust-toolchain.toml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,2 @@ | |||||||
|  | [toolchain] | ||||||
|  | channel = "nightly" | ||||||
| @@ -11,11 +11,15 @@ pub struct Args { | |||||||
|     pub user_agent: String, |     pub user_agent: String, | ||||||
|  |  | ||||||
|     /// Tags to search for |     /// Tags to search for | ||||||
|     #[arg(short, long)] |     #[arg(short, long, required = true)] | ||||||
|     pub tags: Option<Vec<String>>, |     pub tags: Vec<String>, | ||||||
|  |  | ||||||
|  |     /// Page to start scraping from | ||||||
|  |     #[arg(short, long, default_value_t = 1)] | ||||||
|  |     pub page: usize, | ||||||
|  |  | ||||||
|     /// Async jobs to use for fetching |     /// Async jobs to use for fetching | ||||||
|     #[arg(short, long, default_value = "4")] |     #[arg(short, long, default_value_t = 4)] | ||||||
|     pub jobs: usize, |     pub jobs: usize, | ||||||
|  |  | ||||||
|     /// Delay for rate-limits (ms) |     /// Delay for rate-limits (ms) | ||||||
|   | |||||||
							
								
								
									
										118
									
								
								src/main.rs
									
									
									
									
									
								
							
							
						
						
									
										118
									
								
								src/main.rs
									
									
									
									
									
								
							| @@ -1,35 +1,31 @@ | |||||||
| #![feature(async_closure, iter_intersperse)] | #![feature(async_closure)] | ||||||
| pub mod args; | pub mod args; | ||||||
|  |  | ||||||
| use clap::Parser; | use clap::Parser; | ||||||
| use futures::{stream, StreamExt}; | use futures::{stream, StreamExt}; | ||||||
|  | use indicatif::ProgressBar; | ||||||
| use regex::Regex; | use regex::Regex; | ||||||
| use reqwest::Client; | use reqwest::Client; | ||||||
| use tokio::time::{sleep, Duration}; | use tokio::time::{sleep, Duration}; | ||||||
|  |  | ||||||
|  | use std::io::Write; | ||||||
| use std::process::ExitCode; | use std::process::ExitCode; | ||||||
|  |  | ||||||
|  | const BAR_LENGTH: u64 = 8; | ||||||
|  |  | ||||||
| #[tokio::main] | #[tokio::main] | ||||||
| async fn main() -> ExitCode { | async fn main() -> ExitCode { | ||||||
|     let args = args::Args::parse(); |     let args = args::Args::parse(); | ||||||
|  |  | ||||||
|     let tags = args.tags.unwrap_or_else(|| { |     let uri_tags = &args.tags.join("+"); | ||||||
|         println!("which tags do you want to scrape? ex: 1girls 1boys yomama"); |     let _ = std::fs::create_dir(uri_tags); | ||||||
|         let tags_binding = std::io::stdin().lines().next().unwrap().unwrap(); |  | ||||||
|         tags_binding |  | ||||||
|             .split(' ') |  | ||||||
|             .filter(|item| !item.is_empty()) |  | ||||||
|             .map(|item| item.to_owned()) |  | ||||||
|             .collect() |  | ||||||
|     }); |  | ||||||
|     let uri_tags: String = tags.into_iter().intersperse(String::from("+")).collect(); |  | ||||||
|  |  | ||||||
|     let client = Client::builder() |     let client = Client::builder() | ||||||
|         .user_agent(&args.user_agent) |         .user_agent(&args.user_agent) | ||||||
|         .build() |         .build() | ||||||
|         .unwrap(); |         .unwrap(); | ||||||
|  |  | ||||||
|     for page in 0.. { |     for page in args.page - 1.. { | ||||||
|         println!("now scraping page {}", page + 1); |         println!("now scraping page {}", page + 1); | ||||||
|         println!( |         println!( | ||||||
|             "https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}", |             "https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}", | ||||||
| @@ -53,75 +49,65 @@ async fn main() -> ExitCode { | |||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         let mut urls = post_html(&client).await; |         let mut urls = post_html(&client).await; | ||||||
|  |  | ||||||
|         let mut wait_time = 5000; |  | ||||||
|  |  | ||||||
|         if urls.is_empty() { |         if urls.is_empty() { | ||||||
|             for reconnection_attempts in 0..4 { |             let mut reconnection_attempts = 0; | ||||||
|                 println!("no urls found, retrying in {} seconds...", wait_time / 1000); |             loop { | ||||||
|                 sleep(Duration::from_millis(wait_time)).await; |                 println!("no urls found, retrying in 5 seconds..."); | ||||||
|  |                 sleep(Duration::from_millis(5000)).await; | ||||||
|  |  | ||||||
|                 urls = post_html(&client).await; |                 urls = post_html(&client).await; | ||||||
|  |  | ||||||
|                 if !urls.is_empty() { |                 if !urls.is_empty() { | ||||||
|                     println!("urls found! continuing..."); |                     println!("urls found! continuing..."); | ||||||
|                     break; |                     break; | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|                 if reconnection_attempts == 3 { |                 reconnection_attempts += 1; | ||||||
|                     println!("no urls found in 4 attempts, exiting..."); |                 if reconnection_attempts == 12 { | ||||||
|  |                     println!("no urls found in 1 minute, exiting..."); | ||||||
|                     return ExitCode::FAILURE; |                     return ExitCode::FAILURE; | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|                 wait_time += 5000; |  | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         let multi_prog = indicatif::MultiProgress::new(); |         let multi_prog = indicatif::MultiProgress::new(); | ||||||
|         let urls_ammount = urls.len(); |         let urls_amount = urls.len(); | ||||||
|         let responses = stream::iter(urls.into_iter().enumerate()) |         let responses = stream::iter(urls.into_iter().enumerate()) | ||||||
|             .map(|(i, url)| { |             .map(|(i, url)| { | ||||||
|                 let i = i + 1; |                 let i = i + 1; | ||||||
|                 let client = &client; |                 let client = &client; | ||||||
|                 let this_bar = indicatif::ProgressBar::new_spinner(); |                 let this_bar = indicatif::ProgressBar::new(BAR_LENGTH); | ||||||
|                 this_bar.enable_steady_tick(Duration::from_millis(50)); |                 this_bar.set_style(indicatif::ProgressStyle::with_template("[{bar}] {msg}").unwrap().progress_chars("=> ")); | ||||||
|                 let this_prog = multi_prog.insert(i, this_bar); |                 let this_bar = multi_prog.insert(i, this_bar); | ||||||
|                 async move { |                 async move { | ||||||
|                     // "thread" |                     // "thread" | ||||||
|                     loop { |                     loop { | ||||||
|                         this_prog.set_message(format!("\x1b[30m[{i: >4}/{urls_ammount}] \x1b[36mscraping {url:?}\x1b[0m")); |                         this_bar.set_message(format!("\x1b[37m[{i: >4}/{urls_amount}] \x1b[36mscraping {url:?}\x1b[0m")); | ||||||
|                         let resp = client.get(&url).send().await.unwrap(); |                         let resp = client.get(&url).send().await.unwrap(); | ||||||
|                         match extract_img_url(&resp.text().await.unwrap()) { |                         if let Ok(img_url) = extract_img_url(&resp.text().await.unwrap()) { | ||||||
|                             Ok(img_url) => { |  | ||||||
|                             if img_url.is_empty() { |                             if img_url.is_empty() { | ||||||
|                                     this_prog.abandon_with_message(format!( |                                 this_bar.abandon_with_message(format!( | ||||||
|                                         "\x1b[30m[{i: >4}/{urls_ammount}] \x1b[1;31mimage url not found\x1b[0m" |                                     "\x1b[37m[{i: >4}/{urls_amount}] \x1b[1;31mimage url not found\x1b[0m" | ||||||
|                                 )); |                                 )); | ||||||
|                             } else { |                             } else { | ||||||
|                                     this_prog.finish_with_message(format!( |                                 download_file(&img_url, this_bar, i, urls_amount, uri_tags).await; | ||||||
|                                         "\x1b[30m[{i: >4}/{urls_ammount}] \x1b[32mfound image url: {img_url}\x1b[0m" |  | ||||||
|                                     )); |  | ||||||
|                             } |                             } | ||||||
|                                 break img_url; |                             break; | ||||||
|                         } |                         } | ||||||
|                             Err(_) => { |  | ||||||
|                                 this_prog |                         this_bar | ||||||
|                             .set_message(format!( |                             .set_message(format!( | ||||||
|                                             "\x1b[30m[{i: >4}/{urls_ammount}] \x1b[31mratelimited, retrying after {}ms\x1b[0m", |                                     "\x1b[37m[{i: >4}/{urls_amount}] \x1b[31mratelimited, retrying after {}ms\x1b[0m", | ||||||
|                                     args.delay.as_millis()) |                                     args.delay.as_millis()) | ||||||
|                                 ); |                                 ); | ||||||
|                         tokio::time::sleep(args.delay).await; |                         tokio::time::sleep(args.delay).await; | ||||||
|                                 continue; |  | ||||||
|                             } |  | ||||||
|                         } |  | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|             }) |             }) | ||||||
|             .buffered(args.jobs); |             .buffered(args.jobs); | ||||||
|         let _ = responses.for_each(|_| async {}).await; |         let _ = responses.for_each(|()| async {}).await; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     return ExitCode::SUCCESS; |     ExitCode::SUCCESS | ||||||
| } | } | ||||||
|  |  | ||||||
| fn extract_urls(html: &str) -> Vec<String> { | fn extract_urls(html: &str) -> Vec<String> { | ||||||
| @@ -145,3 +131,47 @@ fn extract_img_url(html: &str) -> Result<String, &'static str> { | |||||||
|         Ok(String::new()) |         Ok(String::new()) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | async fn download_file( | ||||||
|  |     img_url: &str, | ||||||
|  |     this_bar: ProgressBar, | ||||||
|  |     i: usize, | ||||||
|  |     urls_amount: usize, | ||||||
|  |     uri_tags: &str, | ||||||
|  | ) { | ||||||
|  |     let args = args::Args::parse(); | ||||||
|  |  | ||||||
|  |     let file_name = Regex::new(r"[^/]+$") | ||||||
|  |         .unwrap() | ||||||
|  |         .find(img_url) | ||||||
|  |         .map(|m| m.as_str()) | ||||||
|  |         .unwrap(); | ||||||
|  |  | ||||||
|  |     let file_path = uri_tags.to_owned() + "/" + file_name; | ||||||
|  |  | ||||||
|  |     let mut file = if std::fs::File::open(&file_path).is_ok() { | ||||||
|  |         this_bar.finish_with_message(format!( | ||||||
|  |             "\x1b[37m[{i: >4}/{urls_amount}] \x1b[33m{file_name} exists, skipping...\x1b[0m" | ||||||
|  |         )); | ||||||
|  |         return; | ||||||
|  |     } else { | ||||||
|  |         std::fs::File::create(file_path).unwrap() | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |     let mut res = Client::new() | ||||||
|  |         .get(img_url) | ||||||
|  |         .header("User-Agent", &args.user_agent) | ||||||
|  |         .send() | ||||||
|  |         .await | ||||||
|  |         .unwrap(); | ||||||
|  |     let file_length = res.content_length().unwrap(); | ||||||
|  |     let mut written = 0; | ||||||
|  |     while let Some(img_chunk) = res.chunk().await.unwrap() { | ||||||
|  |         file.write_all(&img_chunk).unwrap(); | ||||||
|  |         written += img_chunk.len(); | ||||||
|  |         this_bar.set_position((written as f64 / file_length as f64 * BAR_LENGTH as f64) as u64); | ||||||
|  |     } | ||||||
|  |     this_bar.finish_with_message(format!( | ||||||
|  |         "\x1b[37m[{i: >4}/{urls_amount}] \x1b[32mdownloaded {img_url}\x1b[0m" | ||||||
|  |     )); | ||||||
|  | } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user