forked from danmax/r34-scraper
made scraper scrape all the pages
This commit is contained in:
parent
91568abea5
commit
7b73c942b8
26
src/main.rs
26
src/main.rs
@ -3,11 +3,24 @@ use reqwest::Client;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let mut r34pid = -42;
|
||||
|
||||
loop {
|
||||
r34pid += 42;
|
||||
|
||||
let r34_url = format!(
|
||||
"https://rule34.xxx/index.php?page=post&s=list&tags=1girls&pid={}",
|
||||
r34pid
|
||||
);
|
||||
|
||||
let body = Client::new()
|
||||
.get("https://rule34.xxx/index.php?page=post&s=list&tags=1girls&pid=0")
|
||||
.get(r34_url)
|
||||
.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36").send().await?.text().await?;
|
||||
|
||||
println!("-------------------------------");
|
||||
println!(" now scraping page {}", r34pid / 42 + 1);
|
||||
println!("-------------------------------");
|
||||
|
||||
let urls = extract_urls(&body);
|
||||
for url in urls {
|
||||
println!("found post: {}", url);
|
||||
@ -22,15 +35,14 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
false => println!("image url not found"),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn extract_urls(html: &str) -> Vec<String> {
|
||||
let re = Regex::new(r"/index\.php\?page=post&s=view&id=\d+").unwrap();
|
||||
|
||||
let urls: Vec<String> = re.find_iter(html)
|
||||
let urls: Vec<String> = re
|
||||
.find_iter(html)
|
||||
.map(|mat| format!("https://rule34.xxx{}", mat.as_str().to_string()))
|
||||
.collect();
|
||||
|
||||
@ -38,7 +50,9 @@ fn extract_urls(html: &str) -> Vec<String> {
|
||||
}
|
||||
|
||||
fn extract_img_url(html: &str) -> String {
|
||||
let re = Regex::new(r"https://us\.rule34\.xxx//([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z]+\?[0-9]+").unwrap();
|
||||
let re =
|
||||
Regex::new(r"https://us\.rule34\.xxx//([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z]+\?[0-9]+")
|
||||
.unwrap();
|
||||
match re.find(html) {
|
||||
Some(img_url) => img_url.as_str().to_string(),
|
||||
None => String::new(),
|
||||
|
Loading…
x
Reference in New Issue
Block a user