forked from danmax/r34-scraper

Compare commits: main...javalsai-c (4 commits)

| Author | SHA1 | Date |
| --- | --- | --- |
|  | e5e586ca2a |  |
|  | 5ce292d1c2 |  |
|  | e62d2cc186 |  |
|  | 4acaf0308c |  |
.gitignore (vendored) · 1 line changed

```diff
@@ -1,2 +1 @@
 /target
-/Cargo.lock
```
Cargo.lock (generated, new file) · 1464 lines

File diff suppressed because it is too large.
Cargo.toml

```diff
@@ -4,7 +4,7 @@ version = "0.1.0"
 edition = "2021"
 
 [dependencies]
+clap = { version = "4.5.20", features = ["derive"] }
 regex = "1.11.0"
 reqwest = { version = "0.12.8", features = ["blocking"] }
-taap = "0.1.4"
 tokio = { version = "1", features = ["full"] }
```
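This dependency swap is what `cargo remove taap` followed by `cargo add clap@4.5.20 --features derive` would produce; committing the generated Cargo.lock (and dropping it from .gitignore above) pins the newly resolved dependency graph.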
README.md

```diff
@@ -5,6 +5,3 @@ a scraper that well scrapes r34
 ## note
 
 this thing is still not completed, it only gathers links, it doesnt download things yet
-
-## example usage image
-
```
src/args/mod.rs (new file) · 16 lines

```diff
@@ -0,0 +1,16 @@
+use clap::Parser;
+
+#[derive(Parser)]
+#[command(version)]
+pub struct Args {
+    /// User Agent to use for requests
+    #[arg(
+        short,
+        default_value = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
+    )]
+    pub user_agent: String,
+
+    // Tags to search for
+    #[arg(short, long)]
+    pub tags: Option<Vec<String>>,
+}
```
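For orientation, a minimal self-contained sketch (not part of the diff) of how this derive-style definition behaves; the shortened `default_value` and the `try_parse_from` argv are illustrative stand-ins:

```rust
use clap::Parser;

#[derive(Parser)]
#[command(version)]
struct Args {
    /// User Agent to use for requests (shortened default for the example)
    #[arg(short, default_value = "example-agent")]
    user_agent: String,

    /// Tags to search for; repeating -t appends values
    #[arg(short, long)]
    tags: Option<Vec<String>>,
}

fn main() {
    // Mirrors `args::Args::parse()` in src/main.rs, but feeds argv explicitly.
    let args = Args::try_parse_from(["r34-scraper", "-t", "1girls", "-t", "yomama"]).unwrap();
    assert_eq!(args.tags, Some(vec!["1girls".into(), "yomama".into()]));
    // With -u omitted, the declared default_value is used.
    assert_eq!(args.user_agent, "example-agent");
}
```

With no `-t` at all, `tags` stays `None`, which is exactly what the stdin fallback added to src/main.rs below relies on.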
src/main.rs · 111 lines changed

```diff
@@ -1,51 +1,44 @@
-#![feature(async_closure)]
+#![feature(async_closure, iter_intersperse)]
+pub mod args;
+
+use clap::Parser;
 use regex::Regex;
 use reqwest::Client;
 use std::process::ExitCode;
-use taap::Argument;
 use tokio::time::{sleep, Duration};
 
-const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
-
 #[tokio::main]
 async fn main() -> ExitCode {
-    // Taap setup
-    let mut arguments = Argument::new(
-        "r34-scrape",
-        "A scraper for r34.xxx",
-        "Users love this tool! Hear our reviews down below:\n\"It has never been easier to find what I love!\" - penguinlover\n\"This has made my life way easier!\" - FurryUser69\n\"Best tool I've ever used\" - Sean Combs\n",
-        "Danmax and authors 2024",
-    );
-
-    arguments.add_arg("TAGS", "+", Some("the tags you want to search for"));
-    let parsed_arguments = arguments.parse_args(None);
-
-    let tags = parsed_arguments.get("TAGS").unwrap();
-
-    // End of taap setup
-    // Check if empty and warn
-    // Can't use tags.0 because taap is not buggy at all :3
-    if tags.1.is_empty() {
-        println!("[warning] No tags were used, use --help for help")
-    }
-
-    let mut thread_counter = 0;
-    let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
-    let mut page = 0;
-
-    loop {
+    let args = args::Args::parse();
+
+    let tags = args.tags.unwrap_or_else(|| {
+        println!("which tags do you want to scrape? ex: 1girls 1boys yomama");
+        let tags_binding = std::io::stdin().lines().next().unwrap().unwrap();
+        tags_binding
+            .split(' ')
+            .filter(|item| !item.is_empty())
+            .map(|item| item.to_owned())
+            .collect()
+    });
+    let uri_tags: String = tags.into_iter().intersperse(String::from("+")).collect();
+
+    let client = Client::builder()
+        .user_agent(args.user_agent)
+        .build()
+        .unwrap();
+
+    for page in 0.. {
         println!("now scraping page {}", page + 1);
+        println!(
+            "https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}",
+            page * 42
+        );
 
         let post_html = async || {
             extract_urls(
                 &client
                     .get(format!(
-                        "https://rule34.xxx/index.php?page=post&s=list{}&pid={}",
-                        if tags.0 {
-                            format!("&tags={}", tags.1.join("+"))
-                        } else {
-                            "".to_owned()
-                        },
+                        "https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}",
                         page * 42
                     ))
                     .send()
```
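Two details in this hunk are easy to miss: `pid` advances in steps of 42 (one listing page of posts per iteration), and the new `iter_intersperse` feature gate exists solely for the `uri_tags` line, since `Iterator::intersperse` is still nightly-only. A small sketch of what that call produces, with illustrative tag values:

```rust
// Requires a nightly toolchain: Iterator::intersperse is unstable.
#![feature(iter_intersperse)]

fn main() {
    let tags = vec![String::from("1girls"), String::from("yomama")];
    // Interleave "+" between the tags, exactly as `uri_tags` is built above.
    let uri_tags: String = tags.into_iter().intersperse(String::from("+")).collect();
    assert_eq!(uri_tags, "1girls+yomama");
    // On stable Rust, the taap-era `tags.1.join("+")` produced the same string.
}
```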
src/main.rs, continued:

```diff
@@ -83,51 +76,17 @@ async fn main() -> ExitCode {
         }
 
         for url in urls {
-            tokio::spawn(async move {
-                let thread_id = format!("[{thread_counter: >4}]");
-                loop {
-                    let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
-                    match extract_img_url(
-                        &client
-                            .get(url.clone())
-                            .send()
-                            .await
-                            .unwrap()
-                            .text()
-                            .await
-                            .unwrap(),
-                    ) {
-                        Ok(img_url) => {
-                            if img_url.is_empty() {
-                                println!("{thread_id} image url not found");
-                            } else {
-                                println!("{thread_id} found image url: {img_url}");
-                            }
-                        }
-                        Err(_) => {
-                            println!("{thread_id} ratelimited, retrying after 1 second");
-                            std::thread::sleep(std::time::Duration::from_millis(1000));
-                            continue;
-                        }
-                    }
-                    break;
-                }
-            });
-            thread_counter += 1;
-            if thread_counter > 9999 {
-                thread_counter = 0;
-            }
-            while tokio::runtime::Handle::current()
-                .metrics()
-                .num_alive_tasks()
-                > 4
-            {
-                std::thread::sleep(std::time::Duration::from_millis(100));
-            }
+            let img_url =
+                extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap());
+            if img_url.is_empty() {
+                println!("image url not found");
+            } else {
+                println!("found image url: {img_url}");
+            }
         }
 
-        page += 1;
+        return ExitCode::SUCCESS;
     }
 }
 
 fn extract_urls(html: &str) -> Vec<String> {
```
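The removed side is the more involved one: it fanned each post URL out to a tokio task, stamped a four-digit id into log lines, retried on rate limits, and capped concurrency by polling the runtime's task count. That throttling idiom is worth isolating; a standalone sketch, assuming a tokio release recent enough that `RuntimeMetrics::num_alive_tasks` is stable (the diff pins only `tokio = "1"`):

```rust
use tokio::time::{sleep, Duration};

#[tokio::main]
async fn main() {
    for i in 0..20 {
        tokio::spawn(async move {
            // Stand-in for the per-URL fetch the scraper performed.
            sleep(Duration::from_millis(50)).await;
            println!("task {i} done");
        });
        // Throttle as the removed code did: wait until at most 4 tasks
        // are alive before spawning the next one.
        while tokio::runtime::Handle::current().metrics().num_alive_tasks() > 4 {
            // The original used std::thread::sleep here, which blocks a worker
            // thread; an async sleep yields to the very tasks being awaited.
            sleep(Duration::from_millis(10)).await;
        }
    }
    // Give stragglers a moment to finish before the runtime shuts down.
    sleep(Duration::from_millis(200)).await;
}
```

A `tokio::sync::Semaphore` is the usual way to express such a cap without polling, but the sketch stays close to what the diff removes.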
src/main.rs, continued:

```diff
@@ -138,18 +97,14 @@ fn extract_urls(html: &str) -> Vec<String> {
         .collect()
 }
 
-fn extract_img_url(html: &str) -> Result<String, &'static str> {
+fn extract_img_url(html: &str) -> String {
     if let Some(img_url) =
         Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
             .unwrap()
             .find(html)
     {
-        Ok(img_url.as_str().to_string())
+        img_url.as_str().to_string()
     } else {
-        if html.contains("503 Rate limiting") {
-            Err("ratelimited")
-        } else {
-            Ok(String::new())
-        }
+        String::new()
     }
 }
```
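To make the behavioral change concrete, here is a self-contained check of the old contract (the head branch collapses the `Err` case into an empty string). The HTML snippets are fabricated stand-ins, and the nested `else { if ... }` is flattened to `else if`:

```rust
use regex::Regex;

// The Result-returning variant from the main side of this compare.
fn extract_img_url(html: &str) -> Result<String, &'static str> {
    if let Some(img_url) =
        Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
            .unwrap()
            .find(html)
    {
        Ok(img_url.as_str().to_string())
    } else if html.contains("503 Rate limiting") {
        Err("ratelimited")
    } else {
        Ok(String::new())
    }
}

fn main() {
    // A page containing an image URL: the first match is returned whole.
    let hit = r#"<img src="https://us.rule34.xxx/images/ab/cd.png">"#;
    assert_eq!(
        extract_img_url(hit),
        Ok("https://us.rule34.xxx/images/ab/cd.png".to_string())
    );
    // A rate-limit page surfaces as Err, which the old caller retried on.
    assert_eq!(extract_img_url("503 Rate limiting"), Err("ratelimited"));
    // Anything else is Ok(""), i.e. the "image url not found" path.
    assert!(extract_img_url("<html></html>").unwrap().is_empty());
}
```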