Compare commits

...

5 Commits

Author SHA1 Message Date
GayLord
c1d67c9d84 Update epilog cause it didnt offend anyone 2024-10-19 17:50:05 +02:00
GayLord
89fca9d0a3 Update gitignore, add taap, luv gaylord 2024-10-19 14:05:30 +02:00
danmax
eeee4f50b3 added multithreading
Co-authored-by: ErrorNoInternet <errornointernet@envs.net>
2024-10-18 19:36:25 -04:00
abd91a6e95 Merge pull request 'chore(README): add example usage image' (#4) from grialion/r34-scraper:main into main
Reviewed-on: https://git.javalsai.dynv6.net/danmax/r34-scraper/pulls/4
2024-10-18 23:03:57 +02:00
grialion
1a5fc75162 chore(README): add example usage image 2024-10-18 23:00:26 +02:00
7 changed files with 82 additions and 1363 deletions

1
.gitignore vendored
View File

@@ -1 +1,2 @@
/target /target
/Cargo.lock

1344
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -6,4 +6,5 @@ edition = "2021"
[dependencies] [dependencies]
regex = "1.11.0" regex = "1.11.0"
reqwest = { version = "0.12.8", features = ["blocking"] } reqwest = { version = "0.12.8", features = ["blocking"] }
taap = "0.1.4"
tokio = { version = "1", features = ["full"] } tokio = { version = "1", features = ["full"] }

View File

@@ -5,3 +5,6 @@ a scraper that well scrapes r34
## note ## note
this thing is still not completed, it only gathers links, it doesnt download things yet this thing is still not completed, it only gathers links, it doesnt download things yet
## example usage image
![example image](./image.png)

BIN
image.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 79 KiB

2
rust-toolchain.toml Normal file
View File

@@ -0,0 +1,2 @@
[toolchain]
channel = "nightly"

View File

@@ -2,21 +2,34 @@
use regex::Regex; use regex::Regex;
use reqwest::Client; use reqwest::Client;
use std::process::ExitCode; use std::process::ExitCode;
use taap::Argument;
use tokio::time::{sleep, Duration}; use tokio::time::{sleep, Duration};
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"; const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
#[tokio::main] #[tokio::main]
async fn main() -> ExitCode { async fn main() -> ExitCode {
println!("which tags do you want to scrape? ex: 1girls+1boys+yomama"); // Taap setup
let tags = std::io::stdin() let mut arguments = Argument::new(
.lines() "r34-scrape",
.next() "A scraper for r34.xxx",
.unwrap() "Users love this tool! Hear our reviews down below:\n\"It has never been easier to find what I love!\" - penguinlover\n\"This has made my life way easier!\" - FurryUser69\n\"Best tool I've ever used\" - Sean Combs\n",
.unwrap() "Danmax and authors 2024",
.trim() );
.to_string();
arguments.add_arg("TAGS", "+", Some("the tags you want to search for"));
let parsed_arguments = arguments.parse_args(None);
let tags = parsed_arguments.get("TAGS").unwrap();
// End of taap setup
// Check if empty and warn
// Can't use tags.0 because taap is not buggy at all :3
if tags.1.is_empty() {
println!("[warning] No tags were used, use --help for help")
}
let mut thread_counter = 0;
let client = Client::builder().user_agent(USER_AGENT).build().unwrap(); let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
let mut page = 0; let mut page = 0;
@@ -27,7 +40,12 @@ async fn main() -> ExitCode {
extract_urls( extract_urls(
&client &client
.get(format!( .get(format!(
"https://rule34.xxx/index.php?page=post&s=list&tags={tags}&pid={}", "https://rule34.xxx/index.php?page=post&s=list{}&pid={}",
if tags.0 {
format!("&tags={}", tags.1.join("+"))
} else {
"".to_owned()
},
page * 42 page * 42
)) ))
.send() .send()
@@ -65,12 +83,46 @@
} }
for url in urls { for url in urls {
let img_url = tokio::spawn(async move {
extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap()); let thread_id = format!("[{thread_counter: >4}]");
if img_url.is_empty() { loop {
println!("image url not found"); let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
} else { match extract_img_url(
println!("found image url: {img_url}"); &client
.get(url.clone())
.send()
.await
.unwrap()
.text()
.await
.unwrap(),
) {
Ok(img_url) => {
if img_url.is_empty() {
println!("{thread_id} image url not found");
} else {
println!("{thread_id} found image url: {img_url}");
}
}
Err(_) => {
println!("{thread_id} ratelimited, retrying after 1 second");
std::thread::sleep(std::time::Duration::from_millis(1000));
continue;
}
}
break;
}
});
thread_counter += 1;
if thread_counter > 9999 {
thread_counter = 0;
}
while tokio::runtime::Handle::current()
.metrics()
.num_alive_tasks()
> 4
{
std::thread::sleep(std::time::Duration::from_millis(100));
} }
} }
@@ -86,14 +138,18 @@ fn extract_urls(html: &str) -> Vec<String> {
.collect() .collect()
} }
fn extract_img_url(html: &str) -> String { fn extract_img_url(html: &str) -> Result<String, &'static str> {
if let Some(img_url) = if let Some(img_url) =
Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+") Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
.unwrap() .unwrap()
.find(html) .find(html)
{ {
img_url.as_str().to_string() Ok(img_url.as_str().to_string())
} else { } else {
String::new() if html.contains("503 Rate limiting") {
Err("ratelimited")
} else {
Ok(String::new())
}
} }
} }