forked from danmax/r34-scraper

Compare commits: main...javalsai-c (4 commits)

| Author | SHA1 | Date |
| --- | --- | --- |
|  | e5e586ca2a |  |
|  | 5ce292d1c2 |  |
|  | e62d2cc186 |  |
|  | 4acaf0308c |  |
.gitignore (vendored) · 1 line changed

```diff
@@ -1,2 +1 @@
 /target
-/Cargo.lock
```
Cargo.lock (generated, new file) · 1464 lines

File diff suppressed because it is too large.
Cargo.toml

```diff
@@ -4,7 +4,7 @@ version = "0.1.0"
 edition = "2021"
 
 [dependencies]
+clap = { version = "4.5.20", features = ["derive"] }
 regex = "1.11.0"
 reqwest = { version = "0.12.8", features = ["blocking"] }
-taap = "0.1.4"
 tokio = { version = "1", features = ["full"] }
```
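This dependency swap is what `cargo remove taap` followed by `cargo add clap@4.5.20 --features derive` would produce; committing the generated Cargo.lock (and dropping it from .gitignore above) pins the newly resolved dependency graph.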
README.md

```diff
@@ -5,6 +5,3 @@ a scraper that well scrapes r34
 ## note
 
 this thing is still not completed, it only gathers links, it doesnt download things yet
-
-## example usage image
-
```
src/args/mod.rs (new file) · 16 lines

```diff
@@ -0,0 +1,16 @@
+use clap::Parser;
+
+#[derive(Parser)]
+#[command(version)]
+pub struct Args {
+    /// User Agent to use for requests
+    #[arg(
+        short,
+        default_value = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
+    )]
+    pub user_agent: String,
+
+    // Tags to search for
+    #[arg(short, long)]
+    pub tags: Option<Vec<String>>,
+}
```
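For orientation, a minimal self-contained sketch (not part of the diff) of how this derive-style definition behaves; the shortened `default_value` and the `try_parse_from` argv are illustrative stand-ins:

```rust
use clap::Parser;

#[derive(Parser)]
#[command(version)]
struct Args {
    /// User Agent to use for requests (shortened default for the example)
    #[arg(short, default_value = "example-agent")]
    user_agent: String,

    /// Tags to search for; repeating -t appends values
    #[arg(short, long)]
    tags: Option<Vec<String>>,
}

fn main() {
    // Mirrors `args::Args::parse()` in src/main.rs, but feeds argv explicitly.
    let args = Args::try_parse_from(["r34-scraper", "-t", "1girls", "-t", "yomama"]).unwrap();
    assert_eq!(args.tags, Some(vec!["1girls".into(), "yomama".into()]));
    // With -u omitted, the declared default_value is used.
    assert_eq!(args.user_agent, "example-agent");
}
```

With no `-t` at all, `tags` stays `None`, which is exactly what the stdin fallback added to src/main.rs below relies on.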
src/main.rs · 111 lines changed

```diff
@@ -1,51 +1,44 @@
-#![feature(async_closure)]
+#![feature(async_closure, iter_intersperse)]
+pub mod args;
+
+use clap::Parser;
 use regex::Regex;
 use reqwest::Client;
 use std::process::ExitCode;
-use taap::Argument;
 use tokio::time::{sleep, Duration};
 
-const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
-
 #[tokio::main]
 async fn main() -> ExitCode {
-    // Taap setup
-    let mut arguments = Argument::new(
-        "r34-scrape",
-        "A scraper for r34.xxx",
-        "Users love this tool! Hear our reviews down below:\n\"It has never been easier to find what I love!\" - penguinlover\n\"This has made my life way easier!\" - FurryUser69\n\"Best tool I've ever used\" - Sean Combs\n",
-        "Danmax and authors 2024",
-    );
-
-    arguments.add_arg("TAGS", "+", Some("the tags you want to search for"));
-    let parsed_arguments = arguments.parse_args(None);
-
-    let tags = parsed_arguments.get("TAGS").unwrap();
-
-    // End of taap setup
-    // Check if empty and warn
-    // Can't use tags.0 because taap is not buggy at all :3
-    if tags.1.is_empty() {
-        println!("[warning] No tags were used, use --help for help")
-    }
-
-    let mut thread_counter = 0;
-    let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
-    let mut page = 0;
-
-    loop {
+    let args = args::Args::parse();
+
+    let tags = args.tags.unwrap_or_else(|| {
+        println!("which tags do you want to scrape? ex: 1girls 1boys yomama");
+        let tags_binding = std::io::stdin().lines().next().unwrap().unwrap();
+        tags_binding
+            .split(' ')
+            .filter(|item| !item.is_empty())
+            .map(|item| item.to_owned())
+            .collect()
+    });
+    let uri_tags: String = tags.into_iter().intersperse(String::from("+")).collect();
+
+    let client = Client::builder()
+        .user_agent(args.user_agent)
+        .build()
+        .unwrap();
+
+    for page in 0.. {
         println!("now scraping page {}", page + 1);
+        println!(
+            "https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}",
+            page * 42
+        );
 
         let post_html = async || {
             extract_urls(
                 &client
                     .get(format!(
-                        "https://rule34.xxx/index.php?page=post&s=list{}&pid={}",
-                        if tags.0 {
-                            format!("&tags={}", tags.1.join("+"))
-                        } else {
-                            "".to_owned()
-                        },
+                        "https://rule34.xxx/index.php?page=post&s=list&tags={uri_tags}&pid={}",
                         page * 42
                     ))
                     .send()
```
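Two details in this hunk are easy to miss: `pid` advances in steps of 42 (one listing page of posts per iteration), and the new `iter_intersperse` feature gate exists solely for the `uri_tags` line, since `Iterator::intersperse` is still nightly-only. A small sketch of what that call produces, with illustrative tag values:

```rust
// Requires a nightly toolchain: Iterator::intersperse is unstable.
#![feature(iter_intersperse)]

fn main() {
    let tags = vec![String::from("1girls"), String::from("yomama")];
    // Interleave "+" between the tags, exactly as `uri_tags` is built above.
    let uri_tags: String = tags.into_iter().intersperse(String::from("+")).collect();
    assert_eq!(uri_tags, "1girls+yomama");
    // On stable Rust, the taap-era `tags.1.join("+")` produced the same string.
}
```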
src/main.rs, continued:

```diff
@@ -83,51 +76,17 @@ async fn main() -> ExitCode {
         }
 
         for url in urls {
-            tokio::spawn(async move {
-                let thread_id = format!("[{thread_counter: >4}]");
-                loop {
-                    let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
-                    match extract_img_url(
-                        &client
-                            .get(url.clone())
-                            .send()
-                            .await
-                            .unwrap()
-                            .text()
-                            .await
-                            .unwrap(),
-                    ) {
-                        Ok(img_url) => {
-                            if img_url.is_empty() {
-                                println!("{thread_id} image url not found");
-                            } else {
-                                println!("{thread_id} found image url: {img_url}");
-                            }
-                        }
-                        Err(_) => {
-                            println!("{thread_id} ratelimited, retrying after 1 second");
-                            std::thread::sleep(std::time::Duration::from_millis(1000));
-                            continue;
-                        }
-                    }
-                    break;
-                }
-            });
-            thread_counter += 1;
-            if thread_counter > 9999 {
-                thread_counter = 0;
-            }
-            while tokio::runtime::Handle::current()
-                .metrics()
-                .num_alive_tasks()
-                > 4
-            {
-                std::thread::sleep(std::time::Duration::from_millis(100));
-            }
+            let img_url =
+                extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap());
+            if img_url.is_empty() {
+                println!("image url not found");
+            } else {
+                println!("found image url: {img_url}");
+            }
         }
 
-        page += 1;
+        return ExitCode::SUCCESS;
     }
 }
 
 fn extract_urls(html: &str) -> Vec<String> {
```
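The removed side is the more involved one: it fanned each post URL out to a tokio task, stamped a four-digit id into log lines, retried on rate limits, and capped concurrency by polling the runtime's task count. That throttling idiom is worth isolating; a standalone sketch, assuming a tokio release recent enough that `RuntimeMetrics::num_alive_tasks` is stable (the diff pins only `tokio = "1"`):

```rust
use tokio::time::{sleep, Duration};

#[tokio::main]
async fn main() {
    for i in 0..20 {
        tokio::spawn(async move {
            // Stand-in for the per-URL fetch the scraper performed.
            sleep(Duration::from_millis(50)).await;
            println!("task {i} done");
        });
        // Throttle as the removed code did: wait until at most 4 tasks
        // are alive before spawning the next one.
        while tokio::runtime::Handle::current().metrics().num_alive_tasks() > 4 {
            // The original used std::thread::sleep here, which blocks a worker
            // thread; an async sleep yields to the very tasks being awaited.
            sleep(Duration::from_millis(10)).await;
        }
    }
    // Give stragglers a moment to finish before the runtime shuts down.
    sleep(Duration::from_millis(200)).await;
}
```

A `tokio::sync::Semaphore` is the usual way to express such a cap without polling, but the sketch stays close to what the diff removes.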
src/main.rs, continued:

```diff
@@ -138,18 +97,14 @@ fn extract_urls(html: &str) -> Vec<String> {
         .collect()
 }
 
-fn extract_img_url(html: &str) -> Result<String, &'static str> {
+fn extract_img_url(html: &str) -> String {
     if let Some(img_url) =
         Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
             .unwrap()
             .find(html)
     {
-        Ok(img_url.as_str().to_string())
+        img_url.as_str().to_string()
     } else {
-        if html.contains("503 Rate limiting") {
-            Err("ratelimited")
-        } else {
-            Ok(String::new())
-        }
+        String::new()
     }
 }
```
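To make the behavioral change concrete, here is a self-contained check of the old contract (the head branch collapses the `Err` case into an empty string). The HTML snippets are fabricated stand-ins, and the nested `else { if ... }` is flattened to `else if`:

```rust
use regex::Regex;

// The Result-returning variant from the main side of this compare.
fn extract_img_url(html: &str) -> Result<String, &'static str> {
    if let Some(img_url) =
        Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
            .unwrap()
            .find(html)
    {
        Ok(img_url.as_str().to_string())
    } else if html.contains("503 Rate limiting") {
        Err("ratelimited")
    } else {
        Ok(String::new())
    }
}

fn main() {
    // A page containing an image URL: the first match is returned whole.
    let hit = r#"<img src="https://us.rule34.xxx/images/ab/cd.png">"#;
    assert_eq!(
        extract_img_url(hit),
        Ok("https://us.rule34.xxx/images/ab/cd.png".to_string())
    );
    // A rate-limit page surfaces as Err, which the old caller retried on.
    assert_eq!(extract_img_url("503 Rate limiting"), Err("ratelimited"));
    // Anything else is Ok(""), i.e. the "image url not found" path.
    assert!(extract_img_url("<html></html>").unwrap().is_empty());
}
```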