Compare commits

...

5 Commits

Author SHA1 Message Date
GayLord
c1d67c9d84 Update epilog cause it didnt offend anyone 2024-10-19 17:50:05 +02:00
GayLord
89fca9d0a3 Update gitignore, add taap, luv gaylord 2024-10-19 14:05:30 +02:00
danmax
eeee4f50b3 added multithreading
Co-authored-by: ErrorNoInternet <errornointernet@envs.net>
2024-10-18 19:36:25 -04:00
abd91a6e95 Merge pull request 'chore(README): add example usage image' (#4) from grialion/r34-scraper:main into main
Reviewed-on: https://git.javalsai.dynv6.net/danmax/r34-scraper/pulls/4
2024-10-18 23:03:57 +02:00
grialion
1a5fc75162 chore(README): add example usage image 2024-10-18 23:00:26 +02:00
7 changed files with 82 additions and 1363 deletions

1
.gitignore vendored
View File

@@ -1 +1,2 @@
/target /target
/Cargo.lock

1344
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -6,4 +6,5 @@ edition = "2021"
[dependencies] [dependencies]
regex = "1.11.0" regex = "1.11.0"
reqwest = { version = "0.12.8", features = ["blocking"] } reqwest = { version = "0.12.8", features = ["blocking"] }
taap = "0.1.4"
tokio = { version = "1", features = ["full"] } tokio = { version = "1", features = ["full"] }

View File

@@ -5,3 +5,6 @@ a scraper that well scrapes r34
## note ## note
this thing is still not completed, it only gathers links, it doesnt download things yet this thing is still not completed, it only gathers links, it doesnt download things yet
## example usage image
![example image](./image.png)

BIN
image.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 79 KiB

2
rust-toolchain.toml Normal file
View File

@@ -0,0 +1,2 @@
[toolchain]
channel = "nightly"

View File

@@ -2,21 +2,34 @@
use regex::Regex; use regex::Regex;
use reqwest::Client; use reqwest::Client;
use std::process::ExitCode; use std::process::ExitCode;
use taap::Argument;
use tokio::time::{sleep, Duration}; use tokio::time::{sleep, Duration};
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"; const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
#[tokio::main] #[tokio::main]
async fn main() -> ExitCode { async fn main() -> ExitCode {
println!("which tags do you want to scrape? ex: 1girls+1boys+yomama"); // Taap setup
let tags = std::io::stdin() let mut arguments = Argument::new(
.lines() "r34-scrape",
.next() "A scraper for r34.xxx",
.unwrap() "Users love this tool! Hear our reviews down below:\n\"It has never been easier to find what I love!\" - penguinlover\n\"This has made my life way easier!\" - FurryUser69\n\"Best tool I've ever used\" - Sean Combs\n",
.unwrap() "Danmax and authors 2024",
.trim() );
.to_string();
arguments.add_arg("TAGS", "+", Some("the tags you want to search for"));
let parsed_arguments = arguments.parse_args(None);
let tags = parsed_arguments.get("TAGS").unwrap();
// End of taap setup
// Check if empty and warn
// Can't use tags.0 because taap is not buggy at all :3
if tags.1.is_empty() {
println!("[warning] No tags were used, use --help for help")
}
let mut thread_counter = 0;
let client = Client::builder().user_agent(USER_AGENT).build().unwrap(); let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
let mut page = 0; let mut page = 0;
@@ -27,7 +40,12 @@ async fn main() -> ExitCode {
extract_urls( extract_urls(
&client &client
.get(format!( .get(format!(
"https://rule34.xxx/index.php?page=post&s=list&tags={tags}&pid={}", "https://rule34.xxx/index.php?page=post&s=list{}&pid={}",
if tags.0 {
format!("&tags={}", tags.1.join("+"))
} else {
"".to_owned()
},
page * 42 page * 42
)) ))
.send() .send()
@@ -65,12 +83,46 @@
} }
for url in urls { for url in urls {
let img_url = tokio::spawn(async move {
extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap()); let thread_id = format!("[{thread_counter: >4}]");
if img_url.is_empty() { loop {
println!("image url not found"); let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
} else { match extract_img_url(
println!("found image url: {img_url}"); &client
.get(url.clone())
.send()
.await
.unwrap()
.text()
.await
.unwrap(),
) {
Ok(img_url) => {
if img_url.is_empty() {
println!("{thread_id} image url not found");
} else {
println!("{thread_id} found image url: {img_url}");
}
}
Err(_) => {
println!("{thread_id} ratelimited, retrying after 1 second");
std::thread::sleep(std::time::Duration::from_millis(1000));
continue;
}
}
break;
}
});
thread_counter += 1;
if thread_counter > 9999 {
thread_counter = 0;
}
while tokio::runtime::Handle::current()
.metrics()
.num_alive_tasks()
> 4
{
std::thread::sleep(std::time::Duration::from_millis(100));
} }
} }
@@ -86,14 +138,18 @@ fn extract_urls(html: &str) -> Vec<String> {
.collect() .collect()
} }
fn extract_img_url(html: &str) -> String { fn extract_img_url(html: &str) -> Result<String, &'static str> {
if let Some(img_url) = if let Some(img_url) =
Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+") Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
.unwrap() .unwrap()
.find(html) .find(html)
{ {
img_url.as_str().to_string() Ok(img_url.as_str().to_string())
} else { } else {
String::new() if html.contains("503 Rate limiting") {
Err("ratelimited")
} else {
Ok(String::new())
}
} }
} }