forked from danmax/r34-scraper
		
	Compare commits
	
		
			3 Commits
		
	
	
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 54ccc84719 | |||
| 5eaff063f3 | |||
| ef1e502af1 | 
@@ -4,7 +4,7 @@ a scraper that well scrapes r34
 | 
			
		||||
 | 
			
		||||
## note 
 | 
			
		||||
 | 
			
		||||
this program is pretty much complete, although i am planning to add a few extra features.
 | 
			
		||||
this program is pretty much complete, although i might add a few extra features.
 | 
			
		||||
 | 
			
		||||
## example usage image
 | 
			
		||||

 | 
			
		||||
 
 | 
			
		||||
@@ -22,6 +22,9 @@ pub struct Args {
 | 
			
		||||
    #[arg(short, long, default_value_t = 4)]
 | 
			
		||||
    pub jobs: usize,
 | 
			
		||||
 | 
			
		||||
    #[arg(long, default_value = "downloads")]
 | 
			
		||||
    pub dir: String,
 | 
			
		||||
 | 
			
		||||
    /// Delay for rate-limits (ms)
 | 
			
		||||
    #[arg(short, long, default_value = "1000", value_parser = parse_duration)]
 | 
			
		||||
    pub delay: std::time::Duration,
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										35
									
								
								src/main.rs
									
									
									
									
									
								
							
							
						
						
									
										35
									
								
								src/main.rs
									
									
									
									
									
								
							@@ -1,4 +1,3 @@
 | 
			
		||||
#![feature(async_closure)]
 | 
			
		||||
pub mod args;
 | 
			
		||||
 | 
			
		||||
use clap::Parser;
 | 
			
		||||
@@ -20,7 +19,8 @@ async fn main() -> ExitCode {
 | 
			
		||||
    let args = args::Args::parse();
 | 
			
		||||
 | 
			
		||||
    let uri_tags = &args.tags.join("+");
 | 
			
		||||
    let _ = std::fs::create_dir(uri_tags);
 | 
			
		||||
    let dir = &args.dir;
 | 
			
		||||
    let _ = std::fs::create_dir(dir);
 | 
			
		||||
 | 
			
		||||
    let running = Arc::new(AtomicBool::new(true));
 | 
			
		||||
    let running_t = running.clone();
 | 
			
		||||
@@ -100,14 +100,14 @@ async fn main() -> ExitCode {
 | 
			
		||||
                        }
 | 
			
		||||
 | 
			
		||||
                        this_bar.set_message(format!("\x1b[37m[{i: >4}/{urls_amount}] \x1b[36mscraping {url}\x1b[0m"));
 | 
			
		||||
                        let resp = client.get(&url).send().await.unwrap();
 | 
			
		||||
                        if let Ok(img_url) = extract_img_url(&resp.text().await.unwrap()) {
 | 
			
		||||
                        let resp = client.get(&url).send().await.unwrap().text().await.unwrap();
 | 
			
		||||
                        if let Ok(img_url) = extract_img_url(&resp) {
 | 
			
		||||
                            if img_url.is_empty() {
 | 
			
		||||
                                this_bar.abandon_with_message(format!(
 | 
			
		||||
                                    "\x1b[37m[{i: >4}/{urls_amount}] \x1b[1;31mimage url not found\x1b[0m"
 | 
			
		||||
                                ));
 | 
			
		||||
                            } else {
 | 
			
		||||
                                download_file(running_t, &img_url, this_bar, i, urls_amount, uri_tags).await;
 | 
			
		||||
                                download_file(running_t, &img_url, this_bar, i, urls_amount, &resp, dir.as_str()).await;
 | 
			
		||||
                            }
 | 
			
		||||
                            break;
 | 
			
		||||
                        }
 | 
			
		||||
@@ -122,7 +122,7 @@ async fn main() -> ExitCode {
 | 
			
		||||
                }
 | 
			
		||||
            })
 | 
			
		||||
            .buffered(args.jobs);
 | 
			
		||||
        let _ = responses.for_each(|()| async {}).await;
 | 
			
		||||
        let () = responses.for_each(|()| async {}).await;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    ExitCode::SUCCESS
 | 
			
		||||
@@ -136,6 +136,23 @@ fn extract_urls(html: &str) -> Vec<String> {
 | 
			
		||||
        .collect()
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/// Appends a `<file_path>: <title line>` record to the `database` file.
///
/// The record is built from the first line of `html` that contains
/// `<title>`, with its leading whitespace removed. When no line contains
/// `<title>`, nothing is written and the `database` file is not touched.
///
/// * `html` - page source that is searched for a `<title>` line.
/// * `file_path` - path text written as the record's prefix.
///
/// The `database` file is opened relative to the current working directory,
/// created if missing, and always appended to.
fn write_to_database(html: &str, file_path: &str) {
    if let Some(title_line) = find_title_line(html) {
        let outcome = std::fs::OpenOptions::new()
            .create(true)
            .append(true)
            .open("database")
            .and_then(|mut file| writeln!(file, "{file_path}: {title_line}"));

        // A failed bookkeeping write is reported instead of panicking, so one
        // unwritable record does not abort the rest of the run.
        if let Err(err) = outcome {
            eprintln!("failed to record {file_path} in database: {err}");
        }
    }
}

/// Returns the first line of `html` containing `<title>`, without leading whitespace.
///
/// `str::lines` is used so that a `\r\n` line ending does not leave a stray
/// `\r` at the end of the returned line.
fn find_title_line(html: &str) -> Option<&str> {
    html.lines()
        .find(|line| line.contains("<title>"))
        .map(str::trim_start)
}
 | 
			
		||||
 | 
			
		||||
fn extract_img_url(html: &str) -> Result<String, &'static str> {
 | 
			
		||||
    if let Some(img_url) =
 | 
			
		||||
        Regex::new(r"https://us\.rule34\.xxx//images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
 | 
			
		||||
@@ -156,7 +173,8 @@ async fn download_file(
 | 
			
		||||
    this_bar: ProgressBar,
 | 
			
		||||
    i: usize,
 | 
			
		||||
    urls_amount: usize,
 | 
			
		||||
    uri_tags: &str,
 | 
			
		||||
    html: &str,
 | 
			
		||||
    dir: &str,
 | 
			
		||||
) {
 | 
			
		||||
    let args = args::Args::parse();
 | 
			
		||||
 | 
			
		||||
@@ -166,7 +184,7 @@ async fn download_file(
 | 
			
		||||
        .map(|m| m.as_str())
 | 
			
		||||
        .unwrap();
 | 
			
		||||
 | 
			
		||||
    let file_path = uri_tags.to_owned() + "/" + file_name;
 | 
			
		||||
    let file_path = dir.to_owned() + "/" + file_name;
 | 
			
		||||
 | 
			
		||||
    let mut file = if std::fs::File::open(&file_path).is_ok() {
 | 
			
		||||
        this_bar.finish_with_message(format!(
 | 
			
		||||
@@ -198,6 +216,7 @@ async fn download_file(
 | 
			
		||||
        written += img_chunk.len();
 | 
			
		||||
        this_bar.set_position((written as f64 / file_length as f64 * BAR_LENGTH as f64) as u64);
 | 
			
		||||
    }
 | 
			
		||||
    write_to_database(html, &file_path);
 | 
			
		||||
    this_bar.finish_with_message(format!(
 | 
			
		||||
        "\x1b[37m[{i: >4}/{urls_amount}] \x1b[32mdownloaded {img_url}\x1b[0m"
 | 
			
		||||
    ));
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user