// r34-scraper/src/main.rs

use std::process::ExitCode;

use regex::Regex;
use reqwest::Client;

/// Browser-style `User-Agent` string configured on the HTTP client built in `main`.
// NOTE(review): presumably a desktop-Chrome string is used so the site does not
// reject the default client user agent — confirm.
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";

#[tokio::main]
async fn main() -> ExitCode {
    println!("which tags do you want to scrape? ex: 1girls+1boys+yomama");
    let tags = std::io::stdin()
        .lines()
        .next()
        .unwrap()
        .unwrap()
        .trim()
        .to_string();

    let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
    let mut page = 0;

    loop {
        println!("now scraping page {page}");

        let urls = extract_urls(
            &client
                .get(format!(
                    "https://rule34.xxx/index.php?page=post&s=list&tags={tags}&pid={}",
                    page * 42
                ))
                .send()
                .await
                .unwrap()
                .text()
                .await
                .unwrap(),
        );
        if urls.is_empty() {
            println!("no urls found, exiting...");
            return ExitCode::FAILURE;
        }

        for url in urls {
            println!("found post: {url}");

            let img_url =
                extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap());
            if img_url.is_empty() {
                println!("image url not found");
            } else {
                println!("found image url: {img_url}");
            }
        }

        page += 1;
    }
}

fn extract_urls(html: &str) -> Vec<String> {
    Regex::new(r"/index\.php\?page=post&s=view&id=\d+")
        .unwrap()
        .find_iter(html)
        .map(|mat| format!("https://rule34.xxx{}", mat.as_str()))
        .collect()
}

fn extract_img_url(html: &str) -> String {
    if let Some(img_url) =
        Regex::new(r"https://us\.rule34\.xxx//([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+\?[0-9]+")
            .unwrap()
            .find(html)
    {
        img_url.as_str().to_string()
    } else {
        String::new()
    }
}