made scraper stop once there are no post urls

2024-10-15 00:20:24 -04:00
parent 7b73c942b8
commit 36c0bfb0fe
1 changed files with 11 additions and 5 deletions
--- a/src/main.rs
+++ b/src/main.rs
@@ -9,7 +9,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
        r34pid += 42;

        let r34_url = format!(
-            "https://rule34.xxx/index.php?page=post&s=list&tags=1girls&pid={}",
+            "https://rule34.xxx/index.php?page=post&s=list&tags=kaguya_jinguu&pid={}",
            r34pid
        );

@@ -17,11 +17,19 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
        .get(r34_url)
        .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36").send().await?.text().await?;

+        let urls = extract_urls(&body);
+
+        if !urls.is_empty() {
+        } else {
+            println!("no urls found, exiting...");
+            std::process::exit(1);
+        }
+
        println!("-------------------------------");
        println!("      now scraping page {}", r34pid / 42 + 1);
        println!("-------------------------------");

-        let urls = extract_urls(&body);
+
        for url in urls {
            println!("found post: {}", url);

@@ -50,9 +58,7 @@ fn extract_urls(html: &str) -> Vec<String> {
 }

 fn extract_img_url(html: &str) -> String {
-    let re =
-        Regex::new(r"https://us\.rule34\.xxx//([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z]+\?[0-9]+")
-            .unwrap();
+    let re = Regex::new(r"https://us\.rule34\.xxx//([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z]+\?[0-9]+").unwrap();
    match re.find(html) {
        Some(img_url) => img_url.as_str().to_string(),
        None => String::new(),