Compare commits

..

5 Commits

Author SHA1 Message Date
GayLord
c1d67c9d84 Update epilog cause it didnt offend anyone 2024-10-19 17:50:05 +02:00
GayLord
89fca9d0a3 Update gitignore, add taap, luv gaylord 2024-10-19 14:05:30 +02:00
danmax
eeee4f50b3 added multithreading
Co-authored-by: ErrorNoInternet <errornointernet@envs.net>
2024-10-18 19:36:25 -04:00
abd91a6e95 Merge pull request 'chore(README): add example usage image' (#4) from grialion/r34-scraper:main into main
Reviewed-on: https://git.javalsai.dynv6.net/danmax/r34-scraper/pulls/4
2024-10-18 23:03:57 +02:00
grialion
1a5fc75162 chore(README): add example usage image 2024-10-18 23:00:26 +02:00
7 changed files with 88 additions and 1519 deletions

1
.gitignore vendored
View File

@@ -1 +1,2 @@
/target /target
/Cargo.lock

1464
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -4,7 +4,7 @@ version = "0.1.0"
edition = "2021" edition = "2021"
[dependencies] [dependencies]
clap = { version = "4.5.20", features = ["derive"] }
regex = "1.11.0" regex = "1.11.0"
reqwest = { version = "0.12.8", features = ["blocking"] } reqwest = { version = "0.12.8", features = ["blocking"] }
taap = "0.1.4"
tokio = { version = "1", features = ["full"] } tokio = { version = "1", features = ["full"] }

View File

@@ -5,3 +5,6 @@ a scraper that well scrapes r34
## note ## note
this thing is still not completed, it only gathers links, it doesnt download things yet this thing is still not completed, it only gathers links, it doesnt download things yet
## example usage image
![example image](./image.png)

BIN
image.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 79 KiB

View File

@@ -1,16 +0,0 @@
use clap::Parser;
/// Command-line arguments, parsed via clap's derive API.
///
/// Note: clap turns `///` doc comments on fields into `--help` text;
/// a plain `//` comment is ignored, so `tags` previously had no help
/// entry. Fixed by promoting it to a doc comment.
#[derive(Parser)]
#[command(version)]
pub struct Args {
    /// User Agent to use for requests
    #[arg(
        short,
        default_value = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    )]
    pub user_agent: String,
    /// Tags to search for
    #[arg(short, long)]
    pub tags: Option<Vec<String>>,
}

View File

@@ -1,44 +1,51 @@
#![feature(async_closure, iter_intersperse)] #![feature(async_closure)]
pub mod args;
use clap::Parser;
use regex::Regex; use regex::Regex;
use reqwest::Client; use reqwest::Client;
use std::process::ExitCode; use std::process::ExitCode;
use taap::Argument;
use tokio::time::{sleep, Duration}; use tokio::time::{sleep, Duration};
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
#[tokio::main] #[tokio::main]
async fn main() -> ExitCode { async fn main() -> ExitCode {
let args = args::Args::parse(); // Taap setup
let mut arguments = Argument::new(
let tags = args.tags.unwrap_or_else(|| { "r34-scrape",
println!("which tags do you want to scrape? ex: 1girls 1boys yomama"); "A scraper for r34.xxx",
let tags_binding = std::io::stdin().lines().next().unwrap().unwrap(); "Users love this tool! Hear our reviews down below:\n\"It has never been easier to find what I love!\" - penguinlover\n\"This has made my life way easier!\" - FurryUser69\n\"Best tool I've ever used\" - Sean Combs\n",
tags_binding "Danmax and authors 2024",
.split(' ')
.filter(|item| !item.is_empty())
.map(|item| item.to_owned())
.collect()
});
let uri_tags: String = tags.into_iter().intersperse(String::from("+")).collect();
let client = Client::builder()
.user_agent(args.user_agent)
.build()
.unwrap();
for page in 0.. {
println!("now scraping page {}", page + 1);
println!(
"https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}",
page * 42
); );
arguments.add_arg("TAGS", "+", Some("the tags you want to search for"));
let parsed_arguments = arguments.parse_args(None);
let tags = parsed_arguments.get("TAGS").unwrap();
// End of taap setup
// Check if empty and warn
// Can't use tags.0 because taap is not buggy at all :3
if tags.1.is_empty() {
println!("[warning] No tags were used, use --help for help")
}
let mut thread_counter = 0;
let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
let mut page = 0;
loop {
println!("now scraping page {}", page + 1);
let post_html = async || { let post_html = async || {
extract_urls( extract_urls(
&client &client
.get(format!( .get(format!(
"https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}", "https://rule34.xxx/index.php?page=post&s=list{}&pid={}",
if tags.0 {
format!("&tags={}", tags.1.join("+"))
} else {
"".to_owned()
},
page * 42 page * 42
)) ))
.send() .send()
@@ -76,17 +83,51 @@ async fn main() -> ExitCode {
} }
for url in urls { for url in urls {
let img_url = tokio::spawn(async move {
extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap()); let thread_id = format!("[{thread_counter: >4}]");
loop {
let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
match extract_img_url(
&client
.get(url.clone())
.send()
.await
.unwrap()
.text()
.await
.unwrap(),
) {
Ok(img_url) => {
if img_url.is_empty() { if img_url.is_empty() {
println!("image url not found"); println!("{thread_id} image url not found");
} else { } else {
println!("found image url: {img_url}"); println!("{thread_id} found image url: {img_url}");
} }
} }
Err(_) => {
println!("{thread_id} ratelimited, retrying after 1 second");
std::thread::sleep(std::time::Duration::from_millis(1000));
continue;
}
}
break;
}
});
thread_counter += 1;
if thread_counter > 9999 {
thread_counter = 0;
}
while tokio::runtime::Handle::current()
.metrics()
.num_alive_tasks()
> 4
{
std::thread::sleep(std::time::Duration::from_millis(100));
}
} }
return ExitCode::SUCCESS; page += 1;
}
} }
fn extract_urls(html: &str) -> Vec<String> { fn extract_urls(html: &str) -> Vec<String> {
@@ -97,14 +138,18 @@ fn extract_urls(html: &str) -> Vec<String> {
.collect() .collect()
} }
fn extract_img_url(html: &str) -> String { fn extract_img_url(html: &str) -> Result<String, &'static str> {
if let Some(img_url) = if let Some(img_url) =
Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+") Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
.unwrap() .unwrap()
.find(html) .find(html)
{ {
img_url.as_str().to_string() Ok(img_url.as_str().to_string())
} else { } else {
String::new() if html.contains("503 Rate limiting") {
Err("ratelimited")
} else {
Ok(String::new())
}
} }
} }