Compare commits

...

3 Commits
fix ... main

Author SHA1 Message Date
54ccc84719 add database 2025-02-01 21:37:41 -05:00
5eaff063f3 Update README.md
made readme more accurate
2024-11-25 03:31:16 +01:00
ef1e502af1 Merge pull request 'fix' (#14) from ErrorNoInternet/r34-scraper:fix into main
👍
2024-11-25 03:29:35 +01:00
3 changed files with 31 additions and 9 deletions

View File

@ -4,7 +4,7 @@ a scraper that well scrapes r34
## note ## note
this program is pretty much complete, although i am planning to add a few extra features. this program is pretty much complete, although i might add a few extra features.
## example usage image ## example usage image
![example image](./image.png) ![example image](./image.png)

View File

@ -22,6 +22,9 @@ pub struct Args {
#[arg(short, long, default_value_t = 4)] #[arg(short, long, default_value_t = 4)]
pub jobs: usize, pub jobs: usize,
#[arg(long, default_value = "downloads")]
pub dir: String,
/// Delay for rate-limits (ms) /// Delay for rate-limits (ms)
#[arg(short, long, default_value = "1000", value_parser = parse_duration)] #[arg(short, long, default_value = "1000", value_parser = parse_duration)]
pub delay: std::time::Duration, pub delay: std::time::Duration,

View File

@ -1,4 +1,3 @@
#![feature(async_closure)]
pub mod args; pub mod args;
use clap::Parser; use clap::Parser;
@ -20,7 +19,8 @@ async fn main() -> ExitCode {
let args = args::Args::parse(); let args = args::Args::parse();
let uri_tags = &args.tags.join("+"); let uri_tags = &args.tags.join("+");
let _ = std::fs::create_dir(uri_tags); let dir = &args.dir;
let _ = std::fs::create_dir(dir);
let running = Arc::new(AtomicBool::new(true)); let running = Arc::new(AtomicBool::new(true));
let running_t = running.clone(); let running_t = running.clone();
@ -100,14 +100,14 @@ async fn main() -> ExitCode {
} }
this_bar.set_message(format!("\x1b[37m[{i: >4}/{urls_amount}] \x1b[36mscraping {url}\x1b[0m")); this_bar.set_message(format!("\x1b[37m[{i: >4}/{urls_amount}] \x1b[36mscraping {url}\x1b[0m"));
let resp = client.get(&url).send().await.unwrap(); let resp = client.get(&url).send().await.unwrap().text().await.unwrap();
if let Ok(img_url) = extract_img_url(&resp.text().await.unwrap()) { if let Ok(img_url) = extract_img_url(&resp) {
if img_url.is_empty() { if img_url.is_empty() {
this_bar.abandon_with_message(format!( this_bar.abandon_with_message(format!(
"\x1b[37m[{i: >4}/{urls_amount}] \x1b[1;31mimage url not found\x1b[0m" "\x1b[37m[{i: >4}/{urls_amount}] \x1b[1;31mimage url not found\x1b[0m"
)); ));
} else { } else {
download_file(running_t, &img_url, this_bar, i, urls_amount, uri_tags).await; download_file(running_t, &img_url, this_bar, i, urls_amount, &resp, dir.as_str()).await;
} }
break; break;
} }
@ -122,7 +122,7 @@ async fn main() -> ExitCode {
} }
}) })
.buffered(args.jobs); .buffered(args.jobs);
let _ = responses.for_each(|()| async {}).await; let () = responses.for_each(|()| async {}).await;
} }
ExitCode::SUCCESS ExitCode::SUCCESS
@ -136,6 +136,23 @@ fn extract_urls(html: &str) -> Vec<String> {
.collect() .collect()
} }
/// Records the `<title>` line of a scraped page next to the path of the file
/// that was downloaded from it.
///
/// Looks for the first line of `html` that contains `<title>`, strips its
/// leading whitespace and appends `"{file_path}: {line}"` as one line to a
/// file named `database` (relative to the current working directory; created
/// if it does not exist). If no line contains `<title>`, nothing is written
/// and the file is not touched.
///
/// # Panics
///
/// Panics if the `database` file cannot be opened or written to.
fn write_to_database(html: &str, file_path: &str) {
    // `lines()` splits on both "\n" and "\r\n", so pages served with CRLF line
    // endings do not leave a stray '\r' at the end of the stored entry.
    if let Some(title_line) = html.lines().find(|line| line.contains("<title>")) {
        let mut file = std::fs::OpenOptions::new()
            .create(true)
            .append(true)
            .open("database")
            .expect("failed to open the `database` file for appending");
        writeln!(file, "{file_path}: {}", title_line.trim_start())
            .expect("failed to append an entry to the `database` file");
    }
}
fn extract_img_url(html: &str) -> Result<String, &'static str> { fn extract_img_url(html: &str) -> Result<String, &'static str> {
if let Some(img_url) = if let Some(img_url) =
Regex::new(r"https://us\.rule34\.xxx//images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+") Regex::new(r"https://us\.rule34\.xxx//images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
@ -156,7 +173,8 @@ async fn download_file(
this_bar: ProgressBar, this_bar: ProgressBar,
i: usize, i: usize,
urls_amount: usize, urls_amount: usize,
uri_tags: &str, html: &str,
dir: &str,
) { ) {
let args = args::Args::parse(); let args = args::Args::parse();
@ -166,7 +184,7 @@ async fn download_file(
.map(|m| m.as_str()) .map(|m| m.as_str())
.unwrap(); .unwrap();
let file_path = uri_tags.to_owned() + "/" + file_name; let file_path = dir.to_owned() + "/" + file_name;
let mut file = if std::fs::File::open(&file_path).is_ok() { let mut file = if std::fs::File::open(&file_path).is_ok() {
this_bar.finish_with_message(format!( this_bar.finish_with_message(format!(
@ -198,6 +216,7 @@ async fn download_file(
written += img_chunk.len(); written += img_chunk.len();
this_bar.set_position((written as f64 / file_length as f64 * BAR_LENGTH as f64) as u64); this_bar.set_position((written as f64 / file_length as f64 * BAR_LENGTH as f64) as u64);
} }
write_to_database(html, &file_path);
this_bar.finish_with_message(format!( this_bar.finish_with_message(format!(
"\x1b[37m[{i: >4}/{urls_amount}] \x1b[32mdownloaded {img_url}\x1b[0m" "\x1b[37m[{i: >4}/{urls_amount}] \x1b[32mdownloaded {img_url}\x1b[0m"
)); ));