Merge pull request 'refactor: tidy main.rs' (#1) from ErrorNoInternet/r34-scraper:main into main
Reviewed-on: https://git.javalsai.dynv6.net/danmax/r34-scraper/pulls/1
commit c954cdeaea
.gitignore (vendored, new file, 1 line)
@@ -0,0 +1 @@
+/target
Cargo.lock (generated, new file, 1344 lines)
File diff suppressed because it is too large
src/main.rs (96 lines changed)
@@ -1,68 +1,78 @@
+use std::process::ExitCode;
+
 use regex::Regex;
 use reqwest::Client;
 
+const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
+
 #[tokio::main]
-async fn main() -> Result<(), Box<dyn std::error::Error>> {
-    let mut r34_tags = String::new();
+async fn main() -> ExitCode {
     println!("which tags do you want to scrape? ex: 1girls+1boys+yomama");
-    std::io::stdin().read_line(&mut r34_tags).unwrap();
-    r34_tags.trim().to_string();
+    let tags = std::io::stdin()
+        .lines()
+        .next()
+        .unwrap()
+        .unwrap()
+        .trim()
+        .to_string();
 
-    let mut r34pid = -42;
+    let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
+    let mut page = 0;
+
     loop {
-        r34pid += 42;
+        println!("now scraping page {page}");
 
-        let r34_url = format!(
-            "https://rule34.xxx/index.php?page=post&s=list&tags={}&pid={}", r34_tags, r34pid);
-
-        let body = Client::new()
-            .get(r34_url)
-            .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36").send().await?.text().await?;
-
-        let urls = extract_urls(&body);
-
-        if !urls.is_empty() {
-        } else {
+        let urls = extract_urls(
+            &client
+                .get(format!(
+                    "https://rule34.xxx/index.php?page=post&s=list&tags={tags}&pid={}",
+                    page * 42
+                ))
+                .send()
+                .await
+                .unwrap()
+                .text()
+                .await
+                .unwrap(),
+        );
+
+        if urls.is_empty() {
             println!("no urls found, exiting...");
-            std::process::exit(1);
+            return ExitCode::FAILURE;
         }
 
-        println!("-------------------------------");
-        println!(" now scraping page {}", r34pid / 42 + 1);
-        println!("-------------------------------");
-
         for url in urls {
-            println!("found post: {}", url);
-
-            let post = Client::new()
-                .get(url)
-                .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36").send().await?.text().await?;
-
-            let img_url = extract_img_url(&post);
-            match !img_url.is_empty() {
-                true => println!("found image url: {}", img_url),
-                false => println!("image url not found"),
+            println!("found post: {url}");
+
+            let img_url =
+                extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap());
+            if img_url.is_empty() {
+                println!("image url not found");
+            } else {
+                println!("found image url: {img_url}");
             }
         }
+
+        page += 1;
     }
 }
 
 fn extract_urls(html: &str) -> Vec<String> {
-    let re = Regex::new(r"/index\.php\?page=post&s=view&id=\d+").unwrap();
-
-    let urls: Vec<String> = re
+    Regex::new(r"/index\.php\?page=post&s=view&id=\d+")
+        .unwrap()
         .find_iter(html)
-        .map(|mat| format!("https://rule34.xxx{}", mat.as_str().to_string()))
-        .collect();
-
-    urls
+        .map(|mat| format!("https://rule34.xxx{}", mat.as_str()))
+        .collect()
 }
 
 fn extract_img_url(html: &str) -> String {
-    let re = Regex::new(r"https://us\.rule34\.xxx//([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+\?[0-9]+").unwrap();
-    match re.find(html) {
-        Some(img_url) => img_url.as_str().to_string(),
-        None => String::new(),
+    if let Some(img_url) =
+        Regex::new(r"https://us\.rule34\.xxx//([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+\?[0-9]+")
+            .unwrap()
+            .find(html)
+    {
+        img_url.as_str().to_string()
+    } else {
+        String::new()
    }
 }
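A note on the main() signature change above: returning std::process::ExitCode (stable since Rust 1.61) expresses the exit status as an ordinary return value, whereas the old std::process::exit(1) terminates the process immediately without running destructors for live locals. A minimal standalone sketch of the pattern (illustrative, not taken from this repo):

    use std::process::ExitCode;

    fn main() -> ExitCode {
        // This binding's destructor runs on `return`; it would be
        // skipped entirely by std::process::exit.
        let _guard = String::from("pretend this owns a resource");
        if std::env::args().len() < 2 {
            eprintln!("no arguments given");
            return ExitCode::FAILURE; // nonzero exit status
        }
        ExitCode::SUCCESS
    }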
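The pagination rewrite is behavior-preserving: the list endpoint offsets by post index via pid, and each page holds 42 posts (an assumption both versions of the code encode), so the old r34pid counter that started at -42 and stepped by 42 at the top of the loop yields the same sequence as the new page * 42. A small self-contained sketch of the equivalence (hypothetical values):

    fn main() {
        let mut r34pid: i64 = -42; // old style: counter pre-set one step below zero
        for page in 0i64..3 {
            r34pid += 42; // first iteration lands on 0, then 42, 84, ...
            assert_eq!(r34pid, page * 42); // new style derives the same offset
            println!(
                "https://rule34.xxx/index.php?page=post&s=list&tags=example&pid={}",
                page * 42
            );
        }
    }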