r34-scraper/src/main.rs

79 lines
2.0 KiB
Rust

use std::process::ExitCode;
use regex::Regex;
use reqwest::Client;
/// User-Agent header value configured on the HTTP client built in `main`.
/// The string identifies the client as a desktop Chrome browser on Windows.
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
#[tokio::main]
async fn main() -> ExitCode {
println!("which tags do you want to scrape? ex: 1girls+1boys+yomama");
let tags = std::io::stdin()
.lines()
.next()
.unwrap()
.unwrap()
.trim()
.to_string();
let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
let mut page = 0;
loop {
println!("now scraping page {page}");
let urls = extract_urls(
&client
.get(format!(
"https://rule34.xxx/index.php?page=post&s=list&tags={tags}&pid={}",
page * 42
))
.send()
.await
.unwrap()
.text()
.await
.unwrap(),
);
if urls.is_empty() {
println!("no urls found, exiting...");
return ExitCode::FAILURE;
}
for url in urls {
println!("found post: {url}");
let img_url =
extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap());
if img_url.is_empty() {
println!("image url not found");
} else {
println!("found image url: {img_url}");
}
}
page += 1;
}
}
/// Collects the absolute URL of every post-view link found in `html`.
///
/// Each match of the relative `/index.php?page=post&s=view&id=<digits>` path
/// is prefixed with `https://rule34.xxx`. Matches are returned in document
/// order; the result is empty when nothing matches.
fn extract_urls(html: &str) -> Vec<String> {
    let post_link = Regex::new(r"/index\.php\?page=post&s=view&id=\d+").unwrap();

    let mut post_urls = Vec::new();
    for hit in post_link.find_iter(html) {
        post_urls.push(format!("https://rule34.xxx{}", hit.as_str()));
    }
    post_urls
}
/// Returns the first image URL on the `us.rule34.xxx` host found in `html`.
///
/// The match covers the path, the file extension and the numeric query
/// suffix. An empty `String` is returned when no such URL is present.
fn extract_img_url(html: &str) -> String {
    let image_link =
        Regex::new(r"https://us\.rule34\.xxx//([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+\?[0-9]+")
            .unwrap();

    image_link
        .find(html)
        .map(|hit| hit.as_str().to_string())
        .unwrap_or_default()
}