Compare commits


4 Commits

SHA1        Message                                    Date
e5e586ca2a  Merge branch 'main' into javalsai-changes  2024-10-18 23:00:05 +02:00
5ce292d1c2  chore: modularity & good code              2024-10-18 19:26:11 +02:00
e62d2cc186  feat: add arg parsing                      2024-10-18 18:50:40 +02:00
4acaf0308c  chore: define rust channel (+ fmt)         2024-10-18 18:43:18 +02:00
7 changed files with 1519 additions and 88 deletions

.gitignore (vendored, 1 deletion)

@@ -1,2 +1 @@
 /target
-/Cargo.lock

Cargo.lock (generated, new file, 1464 additions)

File diff suppressed because it is too large.

Cargo.toml

@@ -4,7 +4,7 @@ version = "0.1.0"
 edition = "2021"
 
 [dependencies]
+clap = { version = "4.5.20", features = ["derive"] }
 regex = "1.11.0"
 reqwest = { version = "0.12.8", features = ["blocking"] }
-taap = "0.1.4"
 tokio = { version = "1", features = ["full"] }

README.md

@@ -5,6 +5,3 @@ a scraper that well scrapes r34
 ## note
 this thing is still not completed, it only gathers links, it doesnt download things yet
-## example usage image
-![example image](./image.png)

image.png (binary file, deleted)

Binary file not shown. (Before: 79 KiB)

src/args/mod.rs (new file, 16 additions)

@@ -0,0 +1,16 @@
+use clap::Parser;
+
+#[derive(Parser)]
+#[command(version)]
+pub struct Args {
+    /// User Agent to use for requests
+    #[arg(
+        short,
+        default_value = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
+    )]
+    pub user_agent: String,
+
+    /// Tags to search for
+    #[arg(short, long)]
+    pub tags: Option<Vec<String>>,
+}
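Review note: a self-contained sketch (not part of the diff) of how this derive parses, assuming clap 4 with the "derive" feature as added in Cargo.toml above. The struct mirrors the new src/args/mod.rs; the simulated argv and the "r34-scrape" binary name are illustrative.

// Sketch only: `parse_from` feeds a fake argv, so this runs without a real CLI.
use clap::Parser;

#[derive(Parser)]
#[command(version)]
struct Args {
    /// User Agent to use for requests
    #[arg(
        short,
        default_value = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    )]
    user_agent: String,

    /// Tags to search for; each -t/--tags occurrence appends one value
    #[arg(short, long)]
    tags: Option<Vec<String>>,
}

fn main() {
    // Equivalent to running: r34-scrape -t 1girls -t 1boys
    let args = Args::parse_from(["r34-scrape", "-t", "1girls", "-t", "1boys"]);
    println!("user_agent = {}", args.user_agent); // default UA, since -u was not given
    println!("tags = {:?}", args.tags); // Some(["1girls", "1boys"])
}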

src/main.rs

@@ -1,51 +1,44 @@
-#![feature(async_closure)]
+#![feature(async_closure, iter_intersperse)]
+
+pub mod args;
+
+use clap::Parser;
 use regex::Regex;
 use reqwest::Client;
 use std::process::ExitCode;
-use taap::Argument;
 use tokio::time::{sleep, Duration};
 
-const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
-
 #[tokio::main]
 async fn main() -> ExitCode {
-    // Taap setup
-    let mut arguments = Argument::new(
-        "r34-scrape",
-        "A scraper for r34.xxx",
-        "Users love this tool! Hear our reviews down below:\n\"It has never been easier to find what I love!\" - penguinlover\n\"This has made my life way easier!\" - FurryUser69\n\"Best tool I've ever used\" - Sean Combs\n",
-        "Danmax and authors 2024",
-    );
-    arguments.add_arg("TAGS", "+", Some("the tags you want to search for"));
-    let parsed_arguments = arguments.parse_args(None);
-
-    let tags = parsed_arguments.get("TAGS").unwrap();
-
-    // End of taap setup
-    // Check if empty and warn
-    // Can't use tags.0 because taap is not buggy at all :3
-    if tags.1.is_empty() {
-        println!("[warning] No tags were used, use --help for help")
-    }
-    let mut thread_counter = 0;
-    let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
-    let mut page = 0;
-    loop {
+    let args = args::Args::parse();
+
+    let tags = args.tags.unwrap_or_else(|| {
+        println!("which tags do you want to scrape? ex: 1girls 1boys yomama");
+        let tags_binding = std::io::stdin().lines().next().unwrap().unwrap();
+        tags_binding
+            .split(' ')
+            .filter(|item| !item.is_empty())
+            .map(|item| item.to_owned())
+            .collect()
+    });
+    let uri_tags: String = tags.into_iter().intersperse(String::from("+")).collect();
+
+    let client = Client::builder()
+        .user_agent(args.user_agent)
+        .build()
+        .unwrap();
+
+    for page in 0.. {
         println!("now scraping page {}", page + 1);
+        println!(
+            "https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}",
+            page * 42
+        );
         let post_html = async || {
             extract_urls(
                 &client
                     .get(format!(
-                        "https://rule34.xxx/index.php?page=post&s=list{}&pid={}",
-                        if tags.0 {
-                            format!("&tags={}", tags.1.join("+"))
-                        } else {
-                            "".to_owned()
-                        },
+                        "https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}",
                         page * 42
                     ))
                     .send()
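Review note on the tag joining above: `Iterator::intersperse` is still a nightly-only adapter, which is why the hunk widens the feature gate to `#![feature(iter_intersperse)]`. A minimal stable-Rust sketch of the same "+"-joining (not part of the diff; the example tags are taken from the prompt string above):

// Stable equivalent of the nightly intersperse-based join.
fn main() {
    let tags = vec![
        "1girls".to_string(),
        "1boys".to_string(),
        "yomama".to_string(),
    ];

    // Nightly: tags.into_iter().intersperse(String::from("+")).collect::<String>()
    // Stable: slices of String provide join()
    let uri_tags: String = tags.join("+");
    assert_eq!(uri_tags, "1girls+1boys+yomama");
    println!("{uri_tags}");
}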
@@ -83,51 +76,17 @@ async fn main() -> ExitCode {
         }
         for url in urls {
-            tokio::spawn(async move {
-                let thread_id = format!("[{thread_counter: >4}]");
-                loop {
-                    let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
-                    match extract_img_url(
-                        &client
-                            .get(url.clone())
-                            .send()
-                            .await
-                            .unwrap()
-                            .text()
-                            .await
-                            .unwrap(),
-                    ) {
-                        Ok(img_url) => {
-                            if img_url.is_empty() {
-                                println!("{thread_id} image url not found");
-                            } else {
-                                println!("{thread_id} found image url: {img_url}");
-                            }
-                        }
-                        Err(_) => {
-                            println!("{thread_id} ratelimited, retrying after 1 second");
-                            std::thread::sleep(std::time::Duration::from_millis(1000));
-                            continue;
-                        }
-                    }
-                    break;
-                }
-            });
-            thread_counter += 1;
-            if thread_counter > 9999 {
-                thread_counter = 0;
-            }
-            while tokio::runtime::Handle::current()
-                .metrics()
-                .num_alive_tasks()
-                > 4
-            {
-                std::thread::sleep(std::time::Duration::from_millis(100));
-            }
+            let img_url =
+                extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap());
+            if img_url.is_empty() {
+                println!("image url not found");
+            } else {
+                println!("found image url: {img_url}");
+            }
         }
-        page += 1;
     }
-    return ExitCode::SUCCESS;
 }
@@ -138,18 +97,14 @@ fn extract_urls(html: &str) -> Vec<String> {
         .collect()
 }
 
-fn extract_img_url(html: &str) -> Result<String, &'static str> {
+fn extract_img_url(html: &str) -> String {
     if let Some(img_url) =
         Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
             .unwrap()
             .find(html)
     {
-        Ok(img_url.as_str().to_string())
+        img_url.as_str().to_string()
     } else {
-        if html.contains("503 Rate limiting") {
-            Err("ratelimited")
-        } else {
-            Ok(String::new())
-        }
+        String::new()
     }
 }
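Review note: the rewritten extract_img_url drops the 503 rate-limit branch and now returns an empty String when nothing matches. A quick self-contained check of the regex it relies on (sketch only; the HTML fragment below is made up for illustration, not taken from a real page):

use regex::Regex;

fn main() {
    // Hypothetical page fragment embedding a URL of the shape the regex targets.
    let html = r#"<img src="https://us.rule34.xxx/images/ab/cd/abcdef123.jpeg">"#;

    let re = Regex::new(
        r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+",
    )
    .unwrap();

    // find() yields the first match, mirroring the new extract_img_url.
    match re.find(html) {
        Some(m) => println!("found image url: {}", m.as_str()),
        None => println!("image url not found"),
    }
}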