forked from danmax/r34-scraper
Compare commits
5 Commits
javalsai-c ... main
| Author | SHA1 | Date |
| --- | --- | --- |
| | c1d67c9d84 | |
| | 89fca9d0a3 | |
| | eeee4f50b3 | |
| | abd91a6e95 | |
| | 1a5fc75162 | |
.gitignore (vendored, 1 change)
@@ -1 +1,2 @@
 /target
+/Cargo.lock
Cargo.lock (generated, 1344 changes)
File diff suppressed because it is too large
Cargo.toml

@@ -6,4 +6,5 @@ edition = "2021"
 [dependencies]
 regex = "1.11.0"
 reqwest = { version = "0.12.8", features = ["blocking"] }
+taap = "0.1.4"
 tokio = { version = "1", features = ["full"] }
README.md

@@ -5,3 +5,6 @@ a scraper that well scrapes r34
 ## note
 
 this thing is still not completed, it only gathers links, it doesnt download things yet
+
+## example usage image
+
rust-toolchain.toml (new file, 2 changes)
@@ -0,0 +1,2 @@
+[toolchain]
+channel = "nightly"
src/main.rs (94 changes)
@@ -2,21 +2,34 @@
 use regex::Regex;
 use reqwest::Client;
 use std::process::ExitCode;
+use taap::Argument;
 use tokio::time::{sleep, Duration};
 
 const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
 
 #[tokio::main]
 async fn main() -> ExitCode {
-    println!("which tags do you want to scrape? ex: 1girls+1boys+yomama");
-    let tags = std::io::stdin()
-        .lines()
-        .next()
-        .unwrap()
-        .unwrap()
-        .trim()
-        .to_string();
+    // Taap setup
+    let mut arguments = Argument::new(
+        "r34-scrape",
+        "A scraper for r34.xxx",
+        "Users love this tool! Hear our reviews down below:\n\"It has never been easier to find what I love!\" - penguinlover\n\"This has made my life way easier!\" - FurryUser69\n\"Best tool I've ever used\" - Sean Combs\n",
+        "Danmax and authors 2024",
+    );
+
+    arguments.add_arg("TAGS", "+", Some("the tags you want to search for"));
+    let parsed_arguments = arguments.parse_args(None);
+
+    let tags = parsed_arguments.get("TAGS").unwrap();
+
+    // End of taap setup
+
+    // Check if empty and warn
+    // Can't use tags.0 because taap is not buggy at all :3
+    if tags.1.is_empty() {
+        println!("[warning] No tags were used, use --help for help")
+    }
 
     let mut thread_counter = 0;
     let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
     let mut page = 0;
@@ -27,7 +40,12 @@ async fn main() -> ExitCode {
         extract_urls(
             &client
                 .get(format!(
-                    "https://rule34.xxx/index.php?page=post&s=list&tags={tags}&pid={}",
+                    "https://rule34.xxx/index.php?page=post&s=list{}&pid={}",
+                    if tags.0 {
+                        format!("&tags={}", tags.1.join("+"))
+                    } else {
+                        "".to_owned()
+                    },
                     page * 42
                 ))
                 .send()
@@ -65,12 +83,46 @@ async fn main() -> ExitCode {
     }
 
     for url in urls {
-        let img_url =
-            extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap());
-        if img_url.is_empty() {
-            println!("image url not found");
-        } else {
-            println!("found image url: {img_url}");
-        }
+        tokio::spawn(async move {
+            let thread_id = format!("[{thread_counter: >4}]");
+            loop {
+                let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
+                match extract_img_url(
+                    &client
+                        .get(url.clone())
+                        .send()
+                        .await
+                        .unwrap()
+                        .text()
+                        .await
+                        .unwrap(),
+                ) {
+                    Ok(img_url) => {
+                        if img_url.is_empty() {
+                            println!("{thread_id} image url not found");
+                        } else {
+                            println!("{thread_id} found image url: {img_url}");
+                        }
+                    }
+                    Err(_) => {
+                        println!("{thread_id} ratelimited, retrying after 1 second");
+                        std::thread::sleep(std::time::Duration::from_millis(1000));
+                        continue;
+                    }
+                }
+                break;
+            }
+        });
+        thread_counter += 1;
+        if thread_counter > 9999 {
+            thread_counter = 0;
+        }
+        while tokio::runtime::Handle::current()
+            .metrics()
+            .num_alive_tasks()
+            > 4
+        {
+            std::thread::sleep(std::time::Duration::from_millis(100));
+        }
     }
 
@@ -86,14 +138,18 @@ fn extract_urls(html: &str) -> Vec<String> {
         .collect()
 }
 
-fn extract_img_url(html: &str) -> String {
+fn extract_img_url(html: &str) -> Result<String, &'static str> {
     if let Some(img_url) =
         Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
             .unwrap()
             .find(html)
     {
-        img_url.as_str().to_string()
+        Ok(img_url.as_str().to_string())
     } else {
-        String::new()
+        if html.contains("503 Rate limiting") {
+            Err("ratelimited")
+        } else {
+            Ok(String::new())
+        }
     }
 }
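The `-27,7 +40,12` hunk above changes how the listing URL is built: the `&tags=` segment is now only appended when taap reports that the TAGS argument was actually given (`tags.0`), instead of always interpolating `{tags}`. A minimal standalone sketch of that construction; the `listing_url` helper name and the `(bool, Vec<String>)` tuple are assumptions made for illustration, mirroring how the diff reads `tags.0` and `tags.1`:

```rust
// Sketch only: `listing_url` is a hypothetical helper, not part of the patch;
// the (bool, Vec<String>) tuple mirrors how the diff reads tags.0 / tags.1.
fn listing_url(tags: &(bool, Vec<String>), page: u64) -> String {
    format!(
        "https://rule34.xxx/index.php?page=post&s=list{}&pid={}",
        if tags.0 {
            // TAGS was passed: join the values with '+', as the site expects
            format!("&tags={}", tags.1.join("+"))
        } else {
            // no tags given: omit the &tags= parameter entirely
            "".to_owned()
        },
        page * 42 // same 42-posts-per-page offset used in the diff
    )
}

fn main() {
    let with_tags = (true, vec!["1girls".to_string(), "yomama".to_string()]);
    let no_tags: (bool, Vec<String>) = (false, Vec::new());
    println!("{}", listing_url(&with_tags, 0));
    println!("{}", listing_url(&no_tags, 2));
}
```

For the two calls above this prints `...&s=list&tags=1girls+yomama&pid=0` and `...&s=list&pid=84`, matching the two branches introduced by the hunk.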
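The last two hunks change `extract_img_url` to return `Result<String, &'static str>`, so a page body containing "503 Rate limiting" is reported as an error rather than an empty string, and each spawned task sleeps one second and retries on that error. A self-contained sketch of that error-signalling shape, reusing the regex and error string from the diff; the sample HTML strings are made up purely to exercise the three branches, and only the `regex` crate from Cargo.toml is needed:

```rust
use regex::Regex;

// Same shape as the patched extract_img_url: Ok(url) when an image URL is
// found, Ok("") when the page simply has none, Err(..) on a rate-limit page.
fn extract_img_url(html: &str) -> Result<String, &'static str> {
    let re = Regex::new(
        r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+",
    )
    .unwrap();
    if let Some(m) = re.find(html) {
        Ok(m.as_str().to_string())
    } else if html.contains("503 Rate limiting") {
        Err("ratelimited")
    } else {
        Ok(String::new())
    }
}

fn main() {
    // made-up page bodies, one per branch
    let pages = [
        r#"<a href="https://us.rule34.xxx/images/ab/cd12.png">post</a>"#,
        "<html>503 Rate limiting</html>",
        "<html>no image links here</html>",
    ];
    for (i, html) in pages.iter().enumerate() {
        // the diff's spawned task performs this match inside a loop,
        // sleeping one second and retrying whenever it hits Err
        match extract_img_url(html) {
            Ok(url) if url.is_empty() => println!("[{i}] image url not found"),
            Ok(url) => println!("[{i}] found image url: {url}"),
            Err(_) => println!("[{i}] ratelimited, would retry after 1 second"),
        }
    }
}
```

Returning `Ok(String::new())` for a page with no match keeps the old "image url not found" path intact while still distinguishing it from a rate-limit response.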