forked from danmax/r34-scraper
		
	Compare commits
	
		
			7 Commits
		
	
	
		
	
| Author | SHA1 | Date | |
|---|---|---|---|
| | 54ccc84719 | | |
| | 5eaff063f3 | | |
| | ef1e502af1 | | |
| | 1878807461 | | |
| | 20a3a8c4c6 | | |
| | 52fe7d5187 | | |
| | 2648b9c20e | | |
							
								
								
									
										68
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										68
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -28,9 +28,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "anstream" | name = "anstream" | ||||||
| version = "0.6.15" | version = "0.6.17" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526" | checksum = "23a1e53f0f5d86382dafe1cf314783b2044280f406e7e1506368220ad11b1338" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "anstyle", |  "anstyle", | ||||||
|  "anstyle-parse", |  "anstyle-parse", | ||||||
| @@ -43,36 +43,36 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "anstyle" | name = "anstyle" | ||||||
| version = "1.0.8" | version = "1.0.9" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1" | checksum = "8365de52b16c035ff4fcafe0092ba9390540e3e352870ac09933bebcaa2c8c56" | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "anstyle-parse" | name = "anstyle-parse" | ||||||
| version = "0.2.5" | version = "0.2.6" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "eb47de1e80c2b463c735db5b217a0ddc39d612e7ac9e2e96a5aed1f57616c1cb" | checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "utf8parse", |  "utf8parse", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "anstyle-query" | name = "anstyle-query" | ||||||
| version = "1.1.1" | version = "1.1.2" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "6d36fc52c7f6c869915e99412912f22093507da8d9e942ceaf66fe4b7c14422a" | checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "windows-sys 0.52.0", |  "windows-sys 0.59.0", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "anstyle-wincon" | name = "anstyle-wincon" | ||||||
| version = "3.0.4" | version = "3.0.6" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8" | checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "anstyle", |  "anstyle", | ||||||
|  "windows-sys 0.52.0", |  "windows-sys 0.59.0", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| @@ -248,9 +248,9 @@ checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "bytes" | name = "bytes" | ||||||
| version = "1.7.2" | version = "1.8.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "428d9aa8fbc0670b7b8d6030a7fadd0f86151cae55e4dbbece15f3780a3dfaf3" | checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "cc" | name = "cc" | ||||||
| @@ -315,9 +315,9 @@ checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "colorchoice" | name = "colorchoice" | ||||||
| version = "1.0.2" | version = "1.0.3" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "d3fd119d74b830634cea2a0f58bbd0d54540518a14397557951e79340abc28c0" | checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "concurrent-queue" | name = "concurrent-queue" | ||||||
| @@ -381,9 +381,9 @@ checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "encoding_rs" | name = "encoding_rs" | ||||||
| version = "0.8.34" | version = "0.8.35" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59" | checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "cfg-if", |  "cfg-if", | ||||||
| ] | ] | ||||||
| @@ -999,9 +999,9 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "pin-project-lite" | name = "pin-project-lite" | ||||||
| version = "0.2.14" | version = "0.2.15" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" | checksum = "915a1e146535de9163f3987b8944ed8cf49a18bb0056bcebcdcece385cece4ff" | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "pin-utils" | name = "pin-utils" | ||||||
| @@ -1049,9 +1049,9 @@ checksum = "cc9c68a3f6da06753e9335d63e27f6b9754dd1920d941135b7ea8224f141adb2" | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "proc-macro2" | name = "proc-macro2" | ||||||
| version = "1.0.88" | version = "1.0.89" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "7c3a7fc5db1e57d5a779a352c8cdb57b29aa4c40cc69c3a68a7fedc815fbf2f9" | checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "unicode-ident", |  "unicode-ident", | ||||||
| ] | ] | ||||||
| @@ -1081,9 +1081,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "regex" | name = "regex" | ||||||
| version = "1.11.0" | version = "1.11.1" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8" | checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "aho-corasick", |  "aho-corasick", | ||||||
|  "memchr", |  "memchr", | ||||||
| @@ -1265,18 +1265,18 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "serde" | name = "serde" | ||||||
| version = "1.0.210" | version = "1.0.213" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" | checksum = "3ea7893ff5e2466df8d720bb615088341b295f849602c6956047f8f80f0e9bc1" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "serde_derive", |  "serde_derive", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "serde_derive" | name = "serde_derive" | ||||||
| version = "1.0.210" | version = "1.0.213" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" | checksum = "7e85ad2009c50b58e87caa8cd6dac16bdf511bbfb7af6c33df902396aa480fa5" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "proc-macro2", |  "proc-macro2", | ||||||
|  "quote", |  "quote", | ||||||
| @@ -1358,9 +1358,9 @@ checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "syn" | name = "syn" | ||||||
| version = "2.0.82" | version = "2.0.85" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "83540f837a8afc019423a8edb95b52a8effe46957ee402287f4292fae35be021" | checksum = "5023162dfcd14ef8f32034d8bcd4cc5ddc61ef7a247c024a33e24e1f24d21b56" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "proc-macro2", |  "proc-macro2", | ||||||
|  "quote", |  "quote", | ||||||
| @@ -1427,9 +1427,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "tokio" | name = "tokio" | ||||||
| version = "1.40.0" | version = "1.41.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "e2b070231665d27ad9ec9b8df639893f46727666c6767db40317fbe920a5d998" | checksum = "145f3413504347a2be84393cc8a7d2fb4d863b375909ea59f2158261aa258bbb" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "backtrace", |  "backtrace", | ||||||
|  "bytes", |  "bytes", | ||||||
| @@ -1569,9 +1569,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "value-bag" | name = "value-bag" | ||||||
| version = "1.9.0" | version = "1.10.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "5a84c137d37ab0142f0f2ddfe332651fdbf252e7b7dbb4e67b6c1f1b2e925101" | checksum = "3ef4c4aa54d5d05a279399bfa921ec387b7aba77caf7a682ae8d86785b8fdad2" | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "vcpkg" | name = "vcpkg" | ||||||
|   | |||||||
| @@ -4,7 +4,7 @@ a scraper that well scrapes r34 | |||||||
|  |  | ||||||
| ## note  | ## note  | ||||||
|  |  | ||||||
| this program is pretty much complete, although i am planning to add a few extra features. | this program is pretty much complete, although i might add a few extra features. | ||||||
|  |  | ||||||
| ## example usage image | ## example usage image | ||||||
|  |  | ||||||
|   | |||||||
| @@ -22,6 +22,9 @@ pub struct Args { | |||||||
|     #[arg(short, long, default_value_t = 4)] |     #[arg(short, long, default_value_t = 4)] | ||||||
|     pub jobs: usize, |     pub jobs: usize, | ||||||
|  |  | ||||||
|  |     #[arg(long, default_value = "downloads")] | ||||||
|  |     pub dir: String, | ||||||
|  |  | ||||||
|     /// Delay for rate-limits (ms) |     /// Delay for rate-limits (ms) | ||||||
|     #[arg(short, long, default_value = "1000", value_parser = parse_duration)] |     #[arg(short, long, default_value = "1000", value_parser = parse_duration)] | ||||||
|     pub delay: std::time::Duration, |     pub delay: std::time::Duration, | ||||||
|   | |||||||
							
								
								
									
										37
									
								
								src/main.rs
									
									
									
									
									
								
							
							
						
						
									
										37
									
								
								src/main.rs
									
									
									
									
									
								
							| @@ -1,4 +1,3 @@ | |||||||
| #![feature(async_closure)] |  | ||||||
| pub mod args; | pub mod args; | ||||||
|  |  | ||||||
| use clap::Parser; | use clap::Parser; | ||||||
| @@ -20,7 +19,8 @@ async fn main() -> ExitCode { | |||||||
|     let args = args::Args::parse(); |     let args = args::Args::parse(); | ||||||
|  |  | ||||||
|     let uri_tags = &args.tags.join("+"); |     let uri_tags = &args.tags.join("+"); | ||||||
|     let _ = std::fs::create_dir(uri_tags); |     let dir = &args.dir; | ||||||
|  |     let _ = std::fs::create_dir(dir); | ||||||
|  |  | ||||||
|     let running = Arc::new(AtomicBool::new(true)); |     let running = Arc::new(AtomicBool::new(true)); | ||||||
|     let running_t = running.clone(); |     let running_t = running.clone(); | ||||||
| @@ -100,14 +100,14 @@ async fn main() -> ExitCode { | |||||||
|                         } |                         } | ||||||
|  |  | ||||||
|                         this_bar.set_message(format!("\x1b[37m[{i: >4}/{urls_amount}] \x1b[36mscraping {url}\x1b[0m")); |                         this_bar.set_message(format!("\x1b[37m[{i: >4}/{urls_amount}] \x1b[36mscraping {url}\x1b[0m")); | ||||||
|                         let resp = client.get(&url).send().await.unwrap(); |                         let resp = client.get(&url).send().await.unwrap().text().await.unwrap(); | ||||||
|                         if let Ok(img_url) = extract_img_url(&resp.text().await.unwrap()) { |                         if let Ok(img_url) = extract_img_url(&resp) { | ||||||
|                             if img_url.is_empty() { |                             if img_url.is_empty() { | ||||||
|                                 this_bar.abandon_with_message(format!( |                                 this_bar.abandon_with_message(format!( | ||||||
|                                     "\x1b[37m[{i: >4}/{urls_amount}] \x1b[1;31mimage url not found\x1b[0m" |                                     "\x1b[37m[{i: >4}/{urls_amount}] \x1b[1;31mimage url not found\x1b[0m" | ||||||
|                                 )); |                                 )); | ||||||
|                             } else { |                             } else { | ||||||
|                                 download_file(running_t, &img_url, this_bar, i, urls_amount, uri_tags).await; |                                 download_file(running_t, &img_url, this_bar, i, urls_amount, &resp, dir.as_str()).await; | ||||||
|                             } |                             } | ||||||
|                             break; |                             break; | ||||||
|                         } |                         } | ||||||
| @@ -122,7 +122,7 @@ async fn main() -> ExitCode { | |||||||
|                 } |                 } | ||||||
|             }) |             }) | ||||||
|             .buffered(args.jobs); |             .buffered(args.jobs); | ||||||
|         let _ = responses.for_each(|()| async {}).await; |         let () = responses.for_each(|()| async {}).await; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     ExitCode::SUCCESS |     ExitCode::SUCCESS | ||||||
| @@ -136,9 +136,26 @@ fn extract_urls(html: &str) -> Vec<String> { | |||||||
|         .collect() |         .collect() | ||||||
| } | } | ||||||
|  |  | ||||||
|  | fn write_to_database(html: &str, file_path: &str) { | ||||||
|  |     let strings = html.split('\n'); | ||||||
|  |     for line in strings { | ||||||
|  |         if line.contains("<title>") { | ||||||
|  |             let line = line.trim_start(); | ||||||
|  |             let mut file = std::fs::OpenOptions::new() | ||||||
|  |                 .create(true)                 | ||||||
|  |                 .append(true) | ||||||
|  |                 .open("database") | ||||||
|  |                 .unwrap(); | ||||||
|  |  | ||||||
|  |             writeln!(file, "{file_path}: {line}").unwrap(); | ||||||
|  |             break; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| fn extract_img_url(html: &str) -> Result<String, &'static str> { | fn extract_img_url(html: &str) -> Result<String, &'static str> { | ||||||
|     if let Some(img_url) = |     if let Some(img_url) = | ||||||
|         Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+") |         Regex::new(r"https://us\.rule34\.xxx//images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+") | ||||||
|             .unwrap() |             .unwrap() | ||||||
|             .find(html) |             .find(html) | ||||||
|     { |     { | ||||||
| @@ -156,7 +173,8 @@ async fn download_file( | |||||||
|     this_bar: ProgressBar, |     this_bar: ProgressBar, | ||||||
|     i: usize, |     i: usize, | ||||||
|     urls_amount: usize, |     urls_amount: usize, | ||||||
|     uri_tags: &str, |     html: &str, | ||||||
|  |     dir: &str, | ||||||
| ) { | ) { | ||||||
|     let args = args::Args::parse(); |     let args = args::Args::parse(); | ||||||
|  |  | ||||||
| @@ -166,7 +184,7 @@ async fn download_file( | |||||||
|         .map(|m| m.as_str()) |         .map(|m| m.as_str()) | ||||||
|         .unwrap(); |         .unwrap(); | ||||||
|  |  | ||||||
|     let file_path = uri_tags.to_owned() + "/" + file_name; |     let file_path = dir.to_owned() + "/" + file_name; | ||||||
|  |  | ||||||
|     let mut file = if std::fs::File::open(&file_path).is_ok() { |     let mut file = if std::fs::File::open(&file_path).is_ok() { | ||||||
|         this_bar.finish_with_message(format!( |         this_bar.finish_with_message(format!( | ||||||
| @@ -198,6 +216,7 @@ async fn download_file( | |||||||
|         written += img_chunk.len(); |         written += img_chunk.len(); | ||||||
|         this_bar.set_position((written as f64 / file_length as f64 * BAR_LENGTH as f64) as u64); |         this_bar.set_position((written as f64 / file_length as f64 * BAR_LENGTH as f64) as u64); | ||||||
|     } |     } | ||||||
|  |     write_to_database(html, &file_path); | ||||||
|     this_bar.finish_with_message(format!( |     this_bar.finish_with_message(format!( | ||||||
|         "\x1b[37m[{i: >4}/{urls_amount}] \x1b[32mdownloaded {img_url}\x1b[0m" |         "\x1b[37m[{i: >4}/{urls_amount}] \x1b[32mdownloaded {img_url}\x1b[0m" | ||||||
|     )); |     )); | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user