fix: img regex #3

Merged
danmax merged 1 commit from grialion/r34-scraper:main into main 2024-10-18 22:56:08 +02:00

View File

@@ -21,7 +21,6 @@ async fn main() -> ExitCode {
let mut page = 0; let mut page = 0;
loop { loop {
println!("now scraping page {}", page + 1); println!("now scraping page {}", page + 1);
let post_html = async || { let post_html = async || {
@@ -45,11 +44,10 @@ async fn main() -> ExitCode {
let mut wait_time = 5000; let mut wait_time = 5000;
if urls.is_empty() { if urls.is_empty() {
for reconnection_attempts in 0..4 { for reconnection_attempts in 0..4 {
println!("no urls found, retrying in {} seconds...", wait_time / 1000); println!("no urls found, retrying in {} seconds...", wait_time / 1000);
sleep(Duration::from_millis(wait_time)).await; sleep(Duration::from_millis(wait_time)).await;
urls = post_html().await; urls = post_html().await;
if !urls.is_empty() { if !urls.is_empty() {
@@ -67,7 +65,6 @@ async fn main() -> ExitCode {
} }
for url in urls { for url in urls {
let img_url = let img_url =
extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap()); extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap());
if img_url.is_empty() { if img_url.is_empty() {
@@ -91,7 +88,7 @@ fn extract_urls(html: &str) -> Vec<String> {
fn extract_img_url(html: &str) -> String { fn extract_img_url(html: &str) -> String {
if let Some(img_url) = if let Some(img_url) =
Regex::new(r"https://us\.rule34\.xxx//([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+") Regex::new(r"https://us\.rule34\.xxx/images/([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+")
.unwrap() .unwrap()
.find(html) .find(html)
{ {