Compare commits

..

5 Commits

Author SHA1 Message Date
GayLord
c1d67c9d84 Update epilog cause it didnt offend anyone 2024-10-19 17:50:05 +02:00
GayLord
89fca9d0a3 Update gitignore, add taap, luv gaylord 2024-10-19 14:05:30 +02:00
danmax
eeee4f50b3 added multithreading
Co-authored-by: ErrorNoInternet <errornointernet@envs.net>
2024-10-18 19:36:25 -04:00
abd91a6e95 Merge pull request 'chore(README): add example usage image' (#4) from grialion/r34-scraper:main into main
Reviewed-on: https://git.javalsai.dynv6.net/danmax/r34-scraper/pulls/4
2024-10-18 23:03:57 +02:00
grialion
1a5fc75162 chore(README): add example usage image 2024-10-18 23:00:26 +02:00
7 changed files with 88 additions and 1519 deletions

1
.gitignore vendored
View File

@@ -1 +1,2 @@
/target /target
/Cargo.lock

1464
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -4,7 +4,7 @@ version = "0.1.0"
edition = "2021" edition = "2021"
[dependencies] [dependencies]
clap = { version = "4.5.20", features = ["derive"] }
regex = "1.11.0" regex = "1.11.0"
reqwest = { version = "0.12.8", features = ["blocking"] } reqwest = { version = "0.12.8", features = ["blocking"] }
taap = "0.1.4"
tokio = { version = "1", features = ["full"] } tokio = { version = "1", features = ["full"] }

View File

@@ -5,3 +5,6 @@ a scraper that well scrapes r34
## note ## note
this thing is still not completed, it only gathers links, it doesnt download things yet this thing is still not completed, it only gathers links, it doesnt download things yet
## example usage image
![example image](./image.png)

BIN
image.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 79 KiB

View File

@@ -1,16 +0,0 @@
use clap::Parser;
/// Command-line arguments, parsed via clap's derive API.
///
/// Note: clap turns `///` doc comments on fields into `--help` text;
/// a plain `//` comment is ignored, so `tags` previously had no help
/// entry. Fixed by promoting it to a doc comment.
#[derive(Parser)]
#[command(version)]
pub struct Args {
    /// User Agent to use for requests
    #[arg(
        short,
        default_value = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    )]
    pub user_agent: String,
    /// Tags to search for
    #[arg(short, long)]
    pub tags: Option<Vec<String>>,
}

View File

@@ -1,44 +1,51 @@
#![feature(async_closure, iter_intersperse)] #![feature(async_closure)]
pub mod args;
use clap::Parser;
use regex::Regex; use regex::Regex;
use reqwest::Client; use reqwest::Client;
use std::process::ExitCode; use std::process::ExitCode;
use taap::Argument;
use tokio::time::{sleep, Duration}; use tokio::time::{sleep, Duration};
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
#[tokio::main] #[tokio::main]
async fn main() -> ExitCode { async fn main() -> ExitCode {
let args = args::Args::parse(); // Taap setup
let mut arguments = Argument::new(
let tags = args.tags.unwrap_or_else(|| { "r34-scrape",
println!("which tags do you want to scrape? ex: 1girls 1boys yomama"); "A scraper for r34.xxx",
let tags_binding = std::io::stdin().lines().next().unwrap().unwrap(); "Users love this tool! Hear our reviews down below:\n\"It has never been easier to find what I love!\" - penguinlover\n\"This has made my life way easier!\" - FurryUser69\n\"Best tool I've ever used\" - Sean Combs\n",
tags_binding "Danmax and authors 2024",
.split(' ')
.filter(|item| !item.is_empty())
.map(|item| item.to_owned())
.collect()
});
let uri_tags: String = tags.into_iter().intersperse(String::from("+")).collect();
let client = Client::builder()
.user_agent(args.user_agent)
.build()
.unwrap();
for page in 0.. {
println!("now scraping page {}", page + 1);
println!(
"https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}",
page * 42
); );
arguments.add_arg("TAGS", "+", Some("the tags you want to search for"));
let parsed_arguments = arguments.parse_args(None);
let tags = parsed_arguments.get("TAGS").unwrap();
// End of taap setup
// Check if empty and warn
// Can't use tags.0 because taap is not buggy at all :3
if tags.1.is_empty() {
println!("[warning] No tags were used, use --help for help")
}
let mut thread_counter = 0;
let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
let mut page = 0;
loop {
println!("now scraping page {}", page + 1);
let post_html = async || { let post_html = async || {
extract_urls( extract_urls(
&client &client
.get(format!( .get(format!(
"https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}", "https://rule34.xxx/index.php?page=post&s=list{}&pid={}",
if tags.0 {
format!("&tags={}", tags.1.join("+"))
} else {
"".to_owned()
},
page * 42 page * 42
)) ))
.send() .send()
@@ -76,17 +83,51 @@ async fn main() -> ExitCode {
} }
for url in urls { for url in urls {
let img_url = tokio::spawn(async move {
extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap()); let thread_id = format!("[{thread_counter: >4}]");
loop {
let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
match extract_img_url(
&client
.get(url.clone())
.send()
.await
.unwrap()
.text()
.await
.unwrap(),
) {
Ok(img_url) => {
if img_url.is_empty() { if img_url.is_empty() {
println!("image url not found"); println!("{thread_id} image url not found");
} else { } else {
println!("found image url: {img_url}"); println!("{thread_id} found image url: {img_url}");
} }
} }
Err(_) => {
println!("{thread_id} ratelimited, retrying after 1 second");
std::thread::sleep(std::time::Duration::from_millis(1000));
continue;
}
}
break;
}
});
thread_counter += 1;
if thread_counter > 9999 {
thread_counter = 0;
}
while tokio::runtime::Handle::current()
.metrics()
.num_alive_tasks()
> 4
{
std::thread::sleep(std::time::Duration::from_millis(100));
}
} }
return ExitCode::SUCCESS; page += 1;
}
} }
fn extract_urls(html: &str) -> Vec<String> { fn extract_urls(html: &str) -> Vec<String> {
@@ -97,14 +138,18 @@ fn extract_urls(html: &str) -> Vec<String> {
.collect() .collect()
} }
fn extract_img_url(html: &str) -> String { fn extract_img_url(html: &str) -> Result<String, &'static str> {
if let Some(img_url) = if let Some(img_url) =
Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+") Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
.unwrap() .unwrap()
.find(html) .find(html)
{ {
img_url.as_str().to_string() Ok(img_url.as_str().to_string())
} else { } else {
String::new() if html.contains("503 Rate limiting") {
Err("ratelimited")
} else {
Ok(String::new())
}
} }
} }