// forked from danmax/r34-scraper
#![feature(async_closure)]

use std::process::ExitCode;
use std::sync::LazyLock;

use regex::Regex;
use reqwest::Client;
use taap::Argument;
use tokio::time::{sleep, Duration};
|
|
|
|
// Browser-style user agent sent with every HTTP request made by this program
// (presumably so the site serves its normal HTML pages — TODO confirm).
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
|
|
|
|
#[tokio::main]
|
|
async fn main() -> ExitCode {
|
|
// Taap setup
|
|
let mut arguments = Argument::new(
|
|
"r34-scrape",
|
|
"A scraper for r34.xxx",
|
|
"Users love this tool! Hear our reviews down below:\n\"It has never been easier to find what I love!\" - penguinlover\n\"This has made my life way easier!\" - FurryUser69\n\"Best tool I've ever used\" - Sean Combs\n",
|
|
"Danmax and authors 2024",
|
|
);
|
|
|
|
arguments.add_arg("TAGS", "+", Some("the tags you want to search for"));
|
|
let parsed_arguments = arguments.parse_args(None);
|
|
|
|
let tags = parsed_arguments.get("TAGS").unwrap();
|
|
|
|
// End of taap setup
|
|
// Check if empty and warn
|
|
// Can't use tags.0 because taap is not buggy at all :3
|
|
if tags.1.is_empty() {
|
|
println!("[warning] No tags were used, use --help for help")
|
|
}
|
|
|
|
let mut thread_counter = 0;
|
|
let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
|
|
let mut page = 0;
|
|
|
|
loop {
|
|
println!("now scraping page {}", page + 1);
|
|
|
|
let post_html = async || {
|
|
extract_urls(
|
|
&client
|
|
.get(format!(
|
|
"https://rule34.xxx/index.php?page=post&s=list{}&pid={}",
|
|
if tags.0 {
|
|
format!("&tags={}", tags.1.join("+"))
|
|
} else {
|
|
"".to_owned()
|
|
},
|
|
page * 42
|
|
))
|
|
.send()
|
|
.await
|
|
.unwrap()
|
|
.text()
|
|
.await
|
|
.unwrap(),
|
|
)
|
|
};
|
|
|
|
let mut urls = post_html().await;
|
|
|
|
let mut wait_time = 5000;
|
|
|
|
if urls.is_empty() {
|
|
for reconnection_attempts in 0..4 {
|
|
println!("no urls found, retrying in {} seconds...", wait_time / 1000);
|
|
sleep(Duration::from_millis(wait_time)).await;
|
|
|
|
urls = post_html().await;
|
|
|
|
if !urls.is_empty() {
|
|
println!("urls found! continuing...");
|
|
break;
|
|
}
|
|
|
|
if reconnection_attempts == 3 {
|
|
println!("no urls found in 4 attempts, exiting...");
|
|
return ExitCode::FAILURE;
|
|
}
|
|
|
|
wait_time += 5000;
|
|
}
|
|
}
|
|
|
|
for url in urls {
|
|
tokio::spawn(async move {
|
|
let thread_id = format!("[{thread_counter: >4}]");
|
|
loop {
|
|
let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
|
|
match extract_img_url(
|
|
&client
|
|
.get(url.clone())
|
|
.send()
|
|
.await
|
|
.unwrap()
|
|
.text()
|
|
.await
|
|
.unwrap(),
|
|
) {
|
|
Ok(img_url) => {
|
|
if img_url.is_empty() {
|
|
println!("{thread_id} image url not found");
|
|
} else {
|
|
println!("{thread_id} found image url: {img_url}");
|
|
}
|
|
}
|
|
Err(_) => {
|
|
println!("{thread_id} ratelimited, retrying after 1 second");
|
|
std::thread::sleep(std::time::Duration::from_millis(1000));
|
|
continue;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
});
|
|
thread_counter += 1;
|
|
if thread_counter > 9999 {
|
|
thread_counter = 0;
|
|
}
|
|
while tokio::runtime::Handle::current()
|
|
.metrics()
|
|
.num_alive_tasks()
|
|
> 4
|
|
{
|
|
std::thread::sleep(std::time::Duration::from_millis(100));
|
|
}
|
|
}
|
|
|
|
page += 1;
|
|
}
|
|
}
|
|
|
|
fn extract_urls(html: &str) -> Vec<String> {
|
|
Regex::new(r"/index\.php\?page=post&s=view&id=\d+")
|
|
.unwrap()
|
|
.find_iter(html)
|
|
.map(|mat| format!("https://rule34.xxx{}", mat.as_str()))
|
|
.collect()
|
|
}
|
|
|
|
fn extract_img_url(html: &str) -> Result<String, &'static str> {
|
|
if let Some(img_url) =
|
|
Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
|
|
.unwrap()
|
|
.find(html)
|
|
{
|
|
Ok(img_url.as_str().to_string())
|
|
} else {
|
|
if html.contains("503 Rate limiting") {
|
|
Err("ratelimited")
|
|
} else {
|
|
Ok(String::new())
|
|
}
|
|
}
|
|
}
|