forked from danmax/r34-scraper
		
	Compare commits
	
		
			2 Commits
		
	
	
		
			javalsai-c
			...
			main
		
	
| Author | SHA1 | Date | |
|---|---|---|---|
|  | 1a5fc75162 | | |
|  | 91eff584cb | | |
							
								
								
									
										120
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										120
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -26,55 +26,6 @@ dependencies = [ | ||||
|  "memchr", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "anstream" | ||||
| version = "0.6.15" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526" | ||||
| dependencies = [ | ||||
|  "anstyle", | ||||
|  "anstyle-parse", | ||||
|  "anstyle-query", | ||||
|  "anstyle-wincon", | ||||
|  "colorchoice", | ||||
|  "is_terminal_polyfill", | ||||
|  "utf8parse", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "anstyle" | ||||
| version = "1.0.8" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1" | ||||
|  | ||||
| [[package]] | ||||
| name = "anstyle-parse" | ||||
| version = "0.2.5" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "eb47de1e80c2b463c735db5b217a0ddc39d612e7ac9e2e96a5aed1f57616c1cb" | ||||
| dependencies = [ | ||||
|  "utf8parse", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "anstyle-query" | ||||
| version = "1.1.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "6d36fc52c7f6c869915e99412912f22093507da8d9e942ceaf66fe4b7c14422a" | ||||
| dependencies = [ | ||||
|  "windows-sys 0.52.0", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "anstyle-wincon" | ||||
| version = "3.0.4" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8" | ||||
| dependencies = [ | ||||
|  "anstyle", | ||||
|  "windows-sys 0.52.0", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "atomic-waker" | ||||
| version = "1.1.2" | ||||
| @@ -141,52 +92,6 @@ version = "1.0.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" | ||||
|  | ||||
| [[package]] | ||||
| name = "clap" | ||||
| version = "4.5.20" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8" | ||||
| dependencies = [ | ||||
|  "clap_builder", | ||||
|  "clap_derive", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "clap_builder" | ||||
| version = "4.5.20" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54" | ||||
| dependencies = [ | ||||
|  "anstream", | ||||
|  "anstyle", | ||||
|  "clap_lex", | ||||
|  "strsim", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "clap_derive" | ||||
| version = "4.5.18" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" | ||||
| dependencies = [ | ||||
|  "heck", | ||||
|  "proc-macro2", | ||||
|  "quote", | ||||
|  "syn", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "clap_lex" | ||||
| version = "0.7.2" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" | ||||
|  | ||||
| [[package]] | ||||
| name = "colorchoice" | ||||
| version = "1.0.2" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "d3fd119d74b830634cea2a0f58bbd0d54540518a14397557951e79340abc28c0" | ||||
|  | ||||
| [[package]] | ||||
| name = "core-foundation" | ||||
| version = "0.9.4" | ||||
| @@ -356,12 +261,6 @@ version = "0.15.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" | ||||
|  | ||||
| [[package]] | ||||
| name = "heck" | ||||
| version = "0.5.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" | ||||
|  | ||||
| [[package]] | ||||
| name = "hermit-abi" | ||||
| version = "0.3.9" | ||||
| @@ -506,12 +405,6 @@ version = "2.10.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708" | ||||
|  | ||||
| [[package]] | ||||
| name = "is_terminal_polyfill" | ||||
| version = "1.70.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" | ||||
|  | ||||
| [[package]] | ||||
| name = "itoa" | ||||
| version = "1.0.11" | ||||
| @@ -733,7 +626,6 @@ dependencies = [ | ||||
| name = "r34-scraper" | ||||
| version = "0.1.0" | ||||
| dependencies = [ | ||||
|  "clap", | ||||
|  "regex", | ||||
|  "reqwest", | ||||
|  "tokio", | ||||
| @@ -1028,12 +920,6 @@ version = "0.9.8" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" | ||||
|  | ||||
| [[package]] | ||||
| name = "strsim" | ||||
| version = "0.11.1" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" | ||||
|  | ||||
| [[package]] | ||||
| name = "subtle" | ||||
| version = "2.6.1" | ||||
| @@ -1241,12 +1127,6 @@ dependencies = [ | ||||
|  "percent-encoding", | ||||
| ] | ||||
|  | ||||
| [[package]] | ||||
| name = "utf8parse" | ||||
| version = "0.2.2" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" | ||||
|  | ||||
| [[package]] | ||||
| name = "vcpkg" | ||||
| version = "0.2.15" | ||||
|   | ||||
| @@ -4,7 +4,6 @@ version = "0.1.0" | ||||
| edition = "2021" | ||||
|  | ||||
| [dependencies] | ||||
| clap = { version = "4.5.20", features = ["derive"] } | ||||
| regex = "1.11.0" | ||||
| reqwest = { version = "0.12.8", features = ["blocking"] } | ||||
| tokio = { version = "1", features = ["full"] } | ||||
|   | ||||
| @@ -5,3 +5,6 @@ a scraper that well scrapes r34 | ||||
| ## note  | ||||
|  | ||||
| this thing is still not complete: it only gathers links, it doesn't download things yet | ||||
|  | ||||
| ## example usage image | ||||
|  | ||||
|   | ||||
| @@ -1,2 +0,0 @@ | ||||
| [toolchain] | ||||
| channel = "nightly" | ||||
| @@ -1,16 +0,0 @@ | ||||
| use clap::Parser; | ||||
|  | ||||
| #[derive(Parser)] | ||||
| #[command(version)] | ||||
| pub struct Args { | ||||
|     /// User Agent to use for requests | ||||
|     #[arg( | ||||
|         short, | ||||
|         default_value = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36" | ||||
|     )] | ||||
|     pub user_agent: String, | ||||
|  | ||||
|     // Tags to search for | ||||
|     #[arg(short, long)] | ||||
|     pub tags: Option<Vec<String>>, | ||||
| } | ||||
							
								
								
									
										47
									
								
								src/main.rs
									
									
									
									
									
								
							
							
						
						
									
										47
									
								
								src/main.rs
									
									
									
									
									
								
							| @@ -1,44 +1,33 @@ | ||||
| #![feature(async_closure, iter_intersperse)] | ||||
| pub mod args; | ||||
|  | ||||
| use clap::Parser; | ||||
| #![feature(async_closure)] | ||||
| use regex::Regex; | ||||
| use reqwest::Client; | ||||
| use std::process::ExitCode; | ||||
| use tokio::time::{sleep, Duration}; | ||||
|  | ||||
| const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"; | ||||
|  | ||||
| #[tokio::main] | ||||
| async fn main() -> ExitCode { | ||||
|     let args = args::Args::parse(); | ||||
|     println!("which tags do you want to scrape? ex: 1girls+1boys+yomama"); | ||||
|     let tags = std::io::stdin() | ||||
|         .lines() | ||||
|         .next() | ||||
|         .unwrap() | ||||
|         .unwrap() | ||||
|         .trim() | ||||
|         .to_string(); | ||||
|  | ||||
|     let tags = args.tags.unwrap_or_else(|| { | ||||
|         println!("which tags do you want to scrape? ex: 1girls 1boys yomama"); | ||||
|         let tags_binding = std::io::stdin().lines().next().unwrap().unwrap(); | ||||
|         tags_binding | ||||
|             .split(' ') | ||||
|             .filter(|item| !item.is_empty()) | ||||
|             .map(|item| item.to_owned()) | ||||
|             .collect() | ||||
|     }); | ||||
|     let uri_tags: String = tags.into_iter().intersperse(String::from("+")).collect(); | ||||
|     let client = Client::builder().user_agent(USER_AGENT).build().unwrap(); | ||||
|     let mut page = 0; | ||||
|  | ||||
|     let client = Client::builder() | ||||
|         .user_agent(args.user_agent) | ||||
|         .build() | ||||
|         .unwrap(); | ||||
|  | ||||
|     for page in 0.. { | ||||
|     loop { | ||||
|         println!("now scraping page {}", page + 1); | ||||
|         println!( | ||||
|             "https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}", | ||||
|             page * 42 | ||||
|         ); | ||||
|  | ||||
|         let post_html = async || { | ||||
|             extract_urls( | ||||
|                 &client | ||||
|                     .get(format!( | ||||
|                         "https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}", | ||||
|                         "https://rule34.xxx/index.php?page=post&s=list&tags={tags}&pid={}", | ||||
|                         page * 42 | ||||
|                     )) | ||||
|                     .send() | ||||
| @@ -84,9 +73,9 @@ async fn main() -> ExitCode { | ||||
|                 println!("found image url: {img_url}"); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     return ExitCode::SUCCESS; | ||||
|         page += 1; | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn extract_urls(html: &str) -> Vec<String> { | ||||
| @@ -99,7 +88,7 @@ fn extract_urls(html: &str) -> Vec<String> { | ||||
|  | ||||
| fn extract_img_url(html: &str) -> String { | ||||
|     if let Some(img_url) = | ||||
|         Regex::new(r"https://us\.rule34\.xxx//([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+") | ||||
|         Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+") | ||||
|             .unwrap() | ||||
|             .find(html) | ||||
|     { | ||||
|   | ||||
		Reference in New Issue
	
	Block a user