made scraper scrape all the pages

This commit is contained in:
danmax 2024-10-14 23:59:06 -04:00
parent 91568abea5
commit 7b73c942b8

View File

@ -3,42 +3,56 @@ use reqwest::Client;
#[tokio::main] #[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> { async fn main() -> Result<(), Box<dyn std::error::Error>> {
let mut r34pid = -42;
let body = Client::new() loop {
.get("https://rule34.xxx/index.php?page=post&s=list&tags=1girls&pid=0") r34pid += 42;
let r34_url = format!(
"https://rule34.xxx/index.php?page=post&s=list&tags=1girls&pid={}",
r34pid
);
let body = Client::new()
.get(r34_url)
.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36").send().await?.text().await?; .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36").send().await?.text().await?;
let urls = extract_urls(&body); println!("-------------------------------");
for url in urls { println!(" now scraping page {}", r34pid / 42 + 1);
println!("found post: {}", url); println!("-------------------------------");
let post = Client::new() let urls = extract_urls(&body);
for url in urls {
println!("found post: {}", url);
let post = Client::new()
.get(url) .get(url)
.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36").send().await?.text().await?; .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36").send().await?.text().await?;
let img_url = extract_img_url(&post); let img_url = extract_img_url(&post);
match !img_url.is_empty() { match !img_url.is_empty() {
true => println!("found image url: {}", img_url), true => println!("found image url: {}", img_url),
false => println!("image url not found"), false => println!("image url not found"),
}
} }
} }
Ok(())
} }
fn extract_urls(html: &str) -> Vec<String> { fn extract_urls(html: &str) -> Vec<String> {
let re = Regex::new(r"/index\.php\?page=post&s=view&id=\d+").unwrap(); let re = Regex::new(r"/index\.php\?page=post&s=view&id=\d+").unwrap();
let urls: Vec<String> = re.find_iter(html) let urls: Vec<String> = re
.find_iter(html)
.map(|mat| format!("https://rule34.xxx{}", mat.as_str().to_string())) .map(|mat| format!("https://rule34.xxx{}", mat.as_str().to_string()))
.collect(); .collect();
urls urls
} }
fn extract_img_url(html: &str) -> String { fn extract_img_url(html: &str) -> String {
let re = Regex::new(r"https://us\.rule34\.xxx//([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z]+\?[0-9]+").unwrap(); let re =
Regex::new(r"https://us\.rule34\.xxx//([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z]+\?[0-9]+")
.unwrap();
match re.find(html) { match re.find(html) {
Some(img_url) => img_url.as_str().to_string(), Some(img_url) => img_url.as_str().to_string(),
None => String::new(), None => String::new(),