Compare commits


5 Commits

Author SHA1 Message Date

GayLord
c1d67c9d84 Update epilog cause it didnt offend anyone 2024-10-19 17:50:05 +02:00

GayLord
89fca9d0a3 Update gitignore, add taap, luv gaylord 2024-10-19 14:05:30 +02:00

danmax
eeee4f50b3 added multithreading 2024-10-18 19:36:25 -04:00
    Co-authored-by: ErrorNoInternet <errornointernet@envs.net>

abd91a6e95 Merge pull request 'chore(README): add example usage image' (#4) from grialion/r34-scraper:main into main 2024-10-18 23:03:57 +02:00
    Reviewed-on: https://git.javalsai.dynv6.net/danmax/r34-scraper/pulls/4

grialion
1a5fc75162 chore(README): add example usage image 2024-10-18 23:00:26 +02:00
7 changed files with 88 additions and 1519 deletions

.gitignore vendored (1 change)

@@ -1 +1,2 @@
 /target
+/Cargo.lock

Cargo.lock generated (1464 changes)
File diff suppressed because it is too large.

Cargo.toml

@@ -4,7 +4,7 @@ version = "0.1.0"
 edition = "2021"
 
 [dependencies]
-clap = { version = "4.5.20", features = ["derive"] }
 regex = "1.11.0"
 reqwest = { version = "0.12.8", features = ["blocking"] }
+taap = "0.1.4"
 tokio = { version = "1", features = ["full"] }

README.md

@@ -5,3 +5,6 @@ a scraper that well scrapes r34
 
 ## note
 this thing is still not completed, it only gathers links, it doesnt download things yet
+
+## example usage image
+![example image](./image.png)

BIN image.png (new file, 79 KiB)
Binary file not shown.

src/args.rs (deleted)

@@ -1,16 +0,0 @@
-use clap::Parser;
-
-#[derive(Parser)]
-#[command(version)]
-pub struct Args {
-    /// User Agent to use for requests
-    #[arg(
-        short,
-        default_value = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
-    )]
-    pub user_agent: String,
-
-    // Tags to search for
-    #[arg(short, long)]
-    pub tags: Option<Vec<String>>,
-}
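The clap derive struct deleted here is superseded by taap's builder-style setup in src/main.rs below (the third string to Argument::new is the epilog that the top commit message refers to). For comparison, a minimal standalone sketch of the new flow, using only the taap 0.1 calls this diff itself shows; the epilog and credits strings here are placeholders, and the (bool, Vec<String>) shape of the parsed value is inferred from how the new code reads tags.0 and tags.1:

    use taap::Argument;

    fn main() {
        // builder-style setup, mirroring the new src/main.rs below
        let mut arguments = Argument::new(
            "r34-scrape",            // program name
            "A scraper for r34.xxx", // description shown by --help
            "placeholder epilog",    // epilog (assumed to print after the help text)
            "placeholder credits",   // credits line
        );
        // "+" declares a variadic positional argument named TAGS
        arguments.add_arg("TAGS", "+", Some("the tags you want to search for"));
        let parsed_arguments = arguments.parse_args(None);
        // inferred shape: .0 = whether TAGS was passed, .1 = its values
        let tags = parsed_arguments.get("TAGS").unwrap();
        if tags.1.is_empty() {
            println!("[warning] No tags were used, use --help for help");
        } else {
            println!("searching for: {}", tags.1.join("+"));
        }
    }

Note that the -u user-agent flag from the deleted struct has no taap equivalent here; the new code hardcodes the same string as the USER_AGENT const instead.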

src/main.rs

@@ -1,44 +1,51 @@
-#![feature(async_closure, iter_intersperse)]
+#![feature(async_closure)]
 
-pub mod args;
-
-use clap::Parser;
 use regex::Regex;
 use reqwest::Client;
 use std::process::ExitCode;
+use taap::Argument;
 use tokio::time::{sleep, Duration};
 
+const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
+
 #[tokio::main]
 async fn main() -> ExitCode {
-    let args = args::Args::parse();
-    let tags = args.tags.unwrap_or_else(|| {
-        println!("which tags do you want to scrape? ex: 1girls 1boys yomama");
-        let tags_binding = std::io::stdin().lines().next().unwrap().unwrap();
-        tags_binding
-            .split(' ')
-            .filter(|item| !item.is_empty())
-            .map(|item| item.to_owned())
-            .collect()
-    });
-    let uri_tags: String = tags.into_iter().intersperse(String::from("+")).collect();
-    let client = Client::builder()
-        .user_agent(args.user_agent)
-        .build()
-        .unwrap();
-    for page in 0.. {
-        println!("now scraping page {}", page + 1);
-        println!(
-            "https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}",
-            page * 42
+    // Taap setup
+    let mut arguments = Argument::new(
+        "r34-scrape",
+        "A scraper for r34.xxx",
+        "Users love this tool! Hear our reviews down below:\n\"It has never been easier to find what I love!\" - penguinlover\n\"This has made my life way easier!\" - FurryUser69\n\"Best tool I've ever used\" - Sean Combs\n",
+        "Danmax and authors 2024",
     );
+    arguments.add_arg("TAGS", "+", Some("the tags you want to search for"));
+    let parsed_arguments = arguments.parse_args(None);
+    let tags = parsed_arguments.get("TAGS").unwrap();
+    // End of taap setup
+
+    // Check if empty and warn
+    // Can't use tags.0 because taap is not buggy at all :3
+    if tags.1.is_empty() {
+        println!("[warning] No tags were used, use --help for help")
+    }
+
+    let mut thread_counter = 0;
+    let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
+    let mut page = 0;
+    loop {
+        println!("now scraping page {}", page + 1);
         let post_html = async || {
             extract_urls(
                 &client
                     .get(format!(
-                        "https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}",
+                        "https://rule34.xxx/index.php?page=post&s=list{}&pid={}",
+                        if tags.0 {
+                            format!("&tags={}", tags.1.join("+"))
+                        } else {
+                            "".to_owned()
+                        },
                         page * 42
                     ))
                     .send()
@@ -76,17 +83,51 @@ async fn main() -> ExitCode {
         }
 
         for url in urls {
-            let img_url =
-                extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap());
+            tokio::spawn(async move {
+                let thread_id = format!("[{thread_counter: >4}]");
+                loop {
+                    let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
+                    match extract_img_url(
+                        &client
+                            .get(url.clone())
+                            .send()
+                            .await
+                            .unwrap()
+                            .text()
+                            .await
+                            .unwrap(),
+                    ) {
+                        Ok(img_url) => {
                             if img_url.is_empty() {
-                println!("image url not found");
+                                println!("{thread_id} image url not found");
                             } else {
-                println!("found image url: {img_url}");
+                                println!("{thread_id} found image url: {img_url}");
                             }
+                        }
+                        Err(_) => {
+                            println!("{thread_id} ratelimited, retrying after 1 second");
+                            std::thread::sleep(std::time::Duration::from_millis(1000));
+                            continue;
+                        }
+                    }
+                    break;
+                }
+            });
+            thread_counter += 1;
+            if thread_counter > 9999 {
+                thread_counter = 0;
+            }
+
+            while tokio::runtime::Handle::current()
+                .metrics()
+                .num_alive_tasks()
+                > 4
+            {
+                std::thread::sleep(std::time::Duration::from_millis(100));
+            }
         }
-        return ExitCode::SUCCESS;
+        page += 1;
     }
 }
 
 fn extract_urls(html: &str) -> Vec<String> {
@@ -97,14 +138,18 @@ fn extract_urls(html: &str) -> Vec<String> {
         .collect()
 }
 
-fn extract_img_url(html: &str) -> String {
+fn extract_img_url(html: &str) -> Result<String, &'static str> {
     if let Some(img_url) =
         Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
             .unwrap()
             .find(html)
     {
-        img_url.as_str().to_string()
+        Ok(img_url.as_str().to_string())
     } else {
-        String::new()
+        if html.contains("503 Rate limiting") {
+            Err("ratelimited")
+        } else {
+            Ok(String::new())
+        }
     }
 }
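
extract_img_url above detects rate limiting by searching the response body for the text "503 Rate limiting". If the server also sets an actual HTTP 503 status on those responses (an assumption this diff neither confirms nor denies), the same classification could come from the status line instead; a sketch with a hypothetical fetch_page helper, not part of this PR:

    use reqwest::{Client, StatusCode};

    // Sketch: classify rate limiting by status code rather than body text.
    async fn fetch_page(client: &Client, url: &str) -> Result<String, &'static str> {
        let response = client.get(url).send().await.map_err(|_| "request failed")?;
        if response.status() == StatusCode::SERVICE_UNAVAILABLE {
            return Err("ratelimited");
        }
        response.text().await.map_err(|_| "could not read body")
    }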
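Separately, the task cap added in this PR spawns a task and then busy-waits with std::thread::sleep until num_alive_tasks drops back to 4, which blocks a runtime worker thread while it polls. A tokio Semaphore expresses the same bound without blocking; a minimal sketch under the same at-most-five-tasks assumption, with placeholder URLs:

    use std::sync::Arc;
    use tokio::sync::Semaphore;
    use tokio::time::{sleep, Duration};

    #[tokio::main]
    async fn main() {
        // five permits, matching the "> 4" ceiling in the diff above
        let semaphore = Arc::new(Semaphore::new(5));
        // placeholder work items; the real code would pass post URLs here
        let urls: Vec<String> = (0..20).map(|i| format!("https://example.com/{i}")).collect();
        let mut handles = Vec::new();
        for url in urls {
            // suspends until a permit frees up instead of polling in a loop
            let permit = Arc::clone(&semaphore).acquire_owned().await.unwrap();
            handles.push(tokio::spawn(async move {
                // tokio::time::sleep yields to the runtime; std::thread::sleep
                // inside a task would stall one of its worker threads
                sleep(Duration::from_millis(250)).await;
                println!("processed {url}");
                drop(permit); // hand the slot to the next task
            }));
        }
        for handle in handles {
            handle.await.unwrap(); // wait for all spawned tasks to finish
        }
    }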