forked from danmax/r34-scraper
done with basics
This commit is contained in:
commit
6dd45a17fa
9
Cargo.toml
Normal file
9
Cargo.toml
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
[package]
|
||||||
|
name = "r34-scraper"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
regex = "1.11.0"
|
||||||
|
reqwest = { version = "0.12.8", features = ["blocking"] }
|
||||||
|
tokio = { version = "1", features = ["full"] }
|
46
src/main.rs
Normal file
46
src/main.rs
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
use regex::Regex;
|
||||||
|
use reqwest::Client;
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||||
|
|
||||||
|
let body = Client::new()
|
||||||
|
.get("https://rule34.xxx/index.php?page=post&s=list&tags=1girls&pid=0")
|
||||||
|
.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36").send().await?.text().await?;
|
||||||
|
|
||||||
|
let urls = extract_urls(&body);
|
||||||
|
for url in urls {
|
||||||
|
println!("found post: {}", url);
|
||||||
|
|
||||||
|
let post = Client::new()
|
||||||
|
.get(url)
|
||||||
|
.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36").send().await?.text().await?;
|
||||||
|
|
||||||
|
let img_url = extract_img_url(&post);
|
||||||
|
match !img_url.is_empty() {
|
||||||
|
true => println!("found image url: {}", img_url),
|
||||||
|
false => println!("image url not found"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
fn extract_urls(html: &str) -> Vec<String> {
|
||||||
|
let re = Regex::new(r"/index\.php\?page=post&s=view&id=\d+").unwrap();
|
||||||
|
|
||||||
|
let urls: Vec<String> = re.find_iter(html)
|
||||||
|
.map(|mat| format!("https://rule34.xxx{}", mat.as_str().to_string()))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
urls
|
||||||
|
}
|
||||||
|
|
||||||
|
fn extract_img_url(html: &str) -> String {
|
||||||
|
let re = Regex::new(r"https://us\.rule34\.xxx//([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z]+\?[0-9]+").unwrap();
|
||||||
|
match re.find(html) {
|
||||||
|
Some(img_url) => img_url.as_str().to_string(),
|
||||||
|
None => String::new(),
|
||||||
|
}
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user