Compare commits


5 Commits

Author SHA1 Message Date

GayLord
c1d67c9d84 Update epilog cause it didnt offend anyone 2024-10-19 17:50:05 +02:00

GayLord
89fca9d0a3 Update gitignore, add taap, luv gaylord 2024-10-19 14:05:30 +02:00

danmax
eeee4f50b3 added multithreading 2024-10-18 19:36:25 -04:00
    Co-authored-by: ErrorNoInternet <errornointernet@envs.net>

abd91a6e95 Merge pull request 'chore(README): add example usage image' (#4) from grialion/r34-scraper:main into main 2024-10-18 23:03:57 +02:00
    Reviewed-on: https://git.javalsai.dynv6.net/danmax/r34-scraper/pulls/4

grialion
1a5fc75162 chore(README): add example usage image 2024-10-18 23:00:26 +02:00
7 changed files with 88 additions and 1519 deletions

.gitignore vendored (1 change)

@@ -1 +1,2 @@
 /target
+/Cargo.lock

Cargo.lock generated (1464 changes)
File diff suppressed because it is too large.

Cargo.toml

@@ -4,7 +4,7 @@ version = "0.1.0"
 edition = "2021"
 
 [dependencies]
-clap = { version = "4.5.20", features = ["derive"] }
 regex = "1.11.0"
 reqwest = { version = "0.12.8", features = ["blocking"] }
+taap = "0.1.4"
 tokio = { version = "1", features = ["full"] }

README.md

@@ -5,3 +5,6 @@ a scraper that well scrapes r34
 
 ## note
 this thing is still not completed, it only gathers links, it doesnt download things yet
+
+## example usage image
+![example image](./image.png)

BIN image.png (new file, 79 KiB)
Binary file not shown.

src/args.rs (deleted)

@@ -1,16 +0,0 @@
-use clap::Parser;
-
-#[derive(Parser)]
-#[command(version)]
-pub struct Args {
-    /// User Agent to use for requests
-    #[arg(
-        short,
-        default_value = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
-    )]
-    pub user_agent: String,
-
-    // Tags to search for
-    #[arg(short, long)]
-    pub tags: Option<Vec<String>>,
-}
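The clap derive struct deleted here is superseded by taap's builder-style setup in src/main.rs below (the third string to Argument::new is the epilog that the top commit message refers to). For comparison, a minimal standalone sketch of the new flow, using only the taap 0.1 calls this diff itself shows; the epilog and credits strings here are placeholders, and the (bool, Vec<String>) shape of the parsed value is inferred from how the new code reads tags.0 and tags.1:

    use taap::Argument;

    fn main() {
        // builder-style setup, mirroring the new src/main.rs below
        let mut arguments = Argument::new(
            "r34-scrape",            // program name
            "A scraper for r34.xxx", // description shown by --help
            "placeholder epilog",    // epilog (assumed to print after the help text)
            "placeholder credits",   // credits line
        );
        // "+" declares a variadic positional argument named TAGS
        arguments.add_arg("TAGS", "+", Some("the tags you want to search for"));
        let parsed_arguments = arguments.parse_args(None);
        // inferred shape: .0 = whether TAGS was passed, .1 = its values
        let tags = parsed_arguments.get("TAGS").unwrap();
        if tags.1.is_empty() {
            println!("[warning] No tags were used, use --help for help");
        } else {
            println!("searching for: {}", tags.1.join("+"));
        }
    }

Note that the -u user-agent flag from the deleted struct has no taap equivalent here; the new code hardcodes the same string as the USER_AGENT const instead.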

src/main.rs

@@ -1,44 +1,51 @@
-#![feature(async_closure, iter_intersperse)]
+#![feature(async_closure)]
 
-pub mod args;
-
-use clap::Parser;
 use regex::Regex;
 use reqwest::Client;
 use std::process::ExitCode;
+use taap::Argument;
 use tokio::time::{sleep, Duration};
 
+const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
+
 #[tokio::main]
 async fn main() -> ExitCode {
-    let args = args::Args::parse();
-    let tags = args.tags.unwrap_or_else(|| {
-        println!("which tags do you want to scrape? ex: 1girls 1boys yomama");
-        let tags_binding = std::io::stdin().lines().next().unwrap().unwrap();
-        tags_binding
-            .split(' ')
-            .filter(|item| !item.is_empty())
-            .map(|item| item.to_owned())
-            .collect()
-    });
-    let uri_tags: String = tags.into_iter().intersperse(String::from("+")).collect();
-    let client = Client::builder()
-        .user_agent(args.user_agent)
-        .build()
-        .unwrap();
-    for page in 0.. {
-        println!("now scraping page {}", page + 1);
-        println!(
-            "https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}",
-            page * 42
+    // Taap setup
+    let mut arguments = Argument::new(
+        "r34-scrape",
+        "A scraper for r34.xxx",
+        "Users love this tool! Hear our reviews down below:\n\"It has never been easier to find what I love!\" - penguinlover\n\"This has made my life way easier!\" - FurryUser69\n\"Best tool I've ever used\" - Sean Combs\n",
+        "Danmax and authors 2024",
     );
+    arguments.add_arg("TAGS", "+", Some("the tags you want to search for"));
+    let parsed_arguments = arguments.parse_args(None);
+    let tags = parsed_arguments.get("TAGS").unwrap();
+    // End of taap setup
+
+    // Check if empty and warn
+    // Can't use tags.0 because taap is not buggy at all :3
+    if tags.1.is_empty() {
+        println!("[warning] No tags were used, use --help for help")
+    }
+
+    let mut thread_counter = 0;
+    let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
+    let mut page = 0;
+    loop {
+        println!("now scraping page {}", page + 1);
         let post_html = async || {
             extract_urls(
                 &client
                     .get(format!(
-                        "https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}",
+                        "https://rule34.xxx/index.php?page=post&s=list{}&pid={}",
+                        if tags.0 {
+                            format!("&tags={}", tags.1.join("+"))
+                        } else {
+                            "".to_owned()
+                        },
                         page * 42
                     ))
                     .send()
@@ -76,17 +83,51 @@ async fn main() -> ExitCode {
         }
 
         for url in urls {
-            let img_url =
-                extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap());
+            tokio::spawn(async move {
+                let thread_id = format!("[{thread_counter: >4}]");
+                loop {
+                    let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
+                    match extract_img_url(
+                        &client
+                            .get(url.clone())
+                            .send()
+                            .await
+                            .unwrap()
+                            .text()
+                            .await
+                            .unwrap(),
+                    ) {
+                        Ok(img_url) => {
                             if img_url.is_empty() {
-                println!("image url not found");
+                                println!("{thread_id} image url not found");
                             } else {
-                println!("found image url: {img_url}");
+                                println!("{thread_id} found image url: {img_url}");
                             }
+                        }
+                        Err(_) => {
+                            println!("{thread_id} ratelimited, retrying after 1 second");
+                            std::thread::sleep(std::time::Duration::from_millis(1000));
+                            continue;
+                        }
+                    }
+                    break;
+                }
+            });
+            thread_counter += 1;
+            if thread_counter > 9999 {
+                thread_counter = 0;
+            }
+
+            while tokio::runtime::Handle::current()
+                .metrics()
+                .num_alive_tasks()
+                > 4
+            {
+                std::thread::sleep(std::time::Duration::from_millis(100));
+            }
         }
-        return ExitCode::SUCCESS;
+        page += 1;
     }
 }
 
 fn extract_urls(html: &str) -> Vec<String> {
@@ -97,14 +138,18 @@ fn extract_urls(html: &str) -> Vec<String> {
         .collect()
 }
 
-fn extract_img_url(html: &str) -> String {
+fn extract_img_url(html: &str) -> Result<String, &'static str> {
     if let Some(img_url) =
         Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
             .unwrap()
             .find(html)
     {
-        img_url.as_str().to_string()
+        Ok(img_url.as_str().to_string())
     } else {
-        String::new()
+        if html.contains("503 Rate limiting") {
+            Err("ratelimited")
+        } else {
+            Ok(String::new())
+        }
     }
 }
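
extract_img_url above detects rate limiting by searching the response body for the text "503 Rate limiting". If the server also sets an actual HTTP 503 status on those responses (an assumption this diff neither confirms nor denies), the same classification could come from the status line instead; a sketch with a hypothetical fetch_page helper, not part of this PR:

    use reqwest::{Client, StatusCode};

    // Sketch: classify rate limiting by status code rather than body text.
    async fn fetch_page(client: &Client, url: &str) -> Result<String, &'static str> {
        let response = client.get(url).send().await.map_err(|_| "request failed")?;
        if response.status() == StatusCode::SERVICE_UNAVAILABLE {
            return Err("ratelimited");
        }
        response.text().await.map_err(|_| "could not read body")
    }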
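Separately, the task cap added in this PR spawns a task and then busy-waits with std::thread::sleep until num_alive_tasks drops back to 4, which blocks a runtime worker thread while it polls. A tokio Semaphore expresses the same bound without blocking; a minimal sketch under the same at-most-five-tasks assumption, with placeholder URLs:

    use std::sync::Arc;
    use tokio::sync::Semaphore;
    use tokio::time::{sleep, Duration};

    #[tokio::main]
    async fn main() {
        // five permits, matching the "> 4" ceiling in the diff above
        let semaphore = Arc::new(Semaphore::new(5));
        // placeholder work items; the real code would pass post URLs here
        let urls: Vec<String> = (0..20).map(|i| format!("https://example.com/{i}")).collect();
        let mut handles = Vec::new();
        for url in urls {
            // suspends until a permit frees up instead of polling in a loop
            let permit = Arc::clone(&semaphore).acquire_owned().await.unwrap();
            handles.push(tokio::spawn(async move {
                // tokio::time::sleep yields to the runtime; std::thread::sleep
                // inside a task would stall one of its worker threads
                sleep(Duration::from_millis(250)).await;
                println!("processed {url}");
                drop(permit); // hand the slot to the next task
            }));
        }
        for handle in handles {
            handle.await.unwrap(); // wait for all spawned tasks to finish
        }
    }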