r34-scraper/src/main.rs
2024-10-18 22:53:39 +02:00

100 lines
2.7 KiB
Rust

#![feature(async_closure)]
use regex::Regex;
use reqwest::Client;
use std::process::ExitCode;
use std::sync::LazyLock;
use tokio::time::{sleep, Duration};
// Desktop-Chrome user agent sent with every request — presumably so the
// site does not reject the default reqwest UA as a bot; verify if requests
// start failing. NOTE(review): Chrome 58 is ancient; consider refreshing.
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
/// Interactive entry point: reads a `+`-separated tag query from the first
/// line of stdin, then pages through the rule34.xxx post listing forever,
/// printing the direct image URL of every post found on each page.
///
/// Exits with `ExitCode::FAILURE` after an empty page survives 4 retries.
/// NOTE(review): reaching the natural end of the results also looks like
/// an empty page, so a fully-scraped query exits FAILURE rather than
/// SUCCESS — confirm whether that is intended.
#[tokio::main]
async fn main() -> ExitCode {
    println!("which tags do you want to scrape? ex: 1girls+1boys+yomama");
    // First stdin line is the raw tag string, inserted verbatim into the
    // query URL below (no percent-escaping — assumes the user types valid
    // tags). Outer unwrap: stdin had a line; inner unwrap: it read OK.
    let tags = std::io::stdin()
        .lines()
        .next()
        .unwrap()
        .unwrap()
        .trim()
        .to_string();
    let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
    // Zero-based page counter; user-facing messages print `page + 1`.
    let mut page = 0;
    loop {
        println!("now scraping page {}", page + 1);
        // Nightly `async ||` closure (enabled by `#![feature(async_closure)]`
        // at the top of the file) so the fetch can be re-run on retry.
        // Panics (unwrap) on request or body-decode failure.
        let post_html = async || {
            extract_urls(
                &client
                    .get(format!(
                        // The site paginates by post offset: `pid` is
                        // 42 posts per listing page.
                        "https://rule34.xxx/index.php?page=post&s=list&tags={tags}&pid={}",
                        page * 42
                    ))
                    .send()
                    .await
                    .unwrap()
                    .text()
                    .await
                    .unwrap(),
            )
        };
        let mut urls = post_html().await;
        // Linear backoff: 5 s, then 10 s, 15 s, 20 s between retries.
        let mut wait_time = 5000;
        if urls.is_empty() {
            for reconnection_attempts in 0..4 {
                println!("no urls found, retrying in {} seconds...", wait_time / 1000);
                sleep(Duration::from_millis(wait_time)).await;
                urls = post_html().await;
                if !urls.is_empty() {
                    println!("urls found! continuing...");
                    break;
                }
                // Last attempt (index 3) exhausted — give up entirely.
                if reconnection_attempts == 3 {
                    println!("no urls found in 4 attempts, exiting...");
                    return ExitCode::FAILURE;
                }
                wait_time += 5000;
            }
        }
        // Fetch each post page and print its direct image URL.
        // `extract_img_url` signals "not found" with an empty string.
        for url in urls {
            let img_url =
                extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap());
            if img_url.is_empty() {
                println!("image url not found");
            } else {
                println!("found image url: {img_url}");
            }
        }
        page += 1;
    }
}
fn extract_urls(html: &str) -> Vec<String> {
Regex::new(r"/index\.php\?page=post&s=view&id=\d+")
.unwrap()
.find_iter(html)
.map(|mat| format!("https://rule34.xxx{}", mat.as_str()))
.collect()
}
fn extract_img_url(html: &str) -> String {
if let Some(img_url) =
Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
.unwrap()
.find(html)
{
img_url.as_str().to_string()
} else {
String::new()
}
}