Compare commits

..

No commits in common. "c954cdeaea727246418a2077979dda10db5529f2" and "13a626a9437f2fc11ba9e9ce24bf916db8909e2c" have entirely different histories.

3 changed files with 43 additions and 1398 deletions

1
.gitignore vendored
View File

@@ -1 +0,0 @@
/target

1344
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,78 +1,68 @@
use std::process::ExitCode;
use regex::Regex; use regex::Regex;
use reqwest::Client; use reqwest::Client;
// Browser-like User-Agent sent with every request; rule34.xxx rejects or
// rate-limits default library user agents, so we impersonate Chrome.
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
#[tokio::main] #[tokio::main]
async fn main() -> ExitCode { async fn main() -> Result<(), Box<dyn std::error::Error>> {
let mut r34_tags = String::new();
println!("which tags do you want to scrape? ex: 1girls+1boys+yomama"); println!("which tags do you want to scrape? ex: 1girls+1boys+yomama");
let tags = std::io::stdin() std::io::stdin().read_line(&mut r34_tags).unwrap();
.lines() r34_tags.trim().to_string();
.next()
.unwrap()
.unwrap()
.trim()
.to_string();
let client = Client::builder().user_agent(USER_AGENT).build().unwrap();
let mut page = 0;
let mut r34pid = -42;
loop { loop {
println!("now scraping page {page}"); r34pid += 42;
let urls = extract_urls( let r34_url = format!(
&client "https://rule34.xxx/index.php?page=post&s=list&tags={}&pid={}", r34_tags, r34pid);
.get(format!(
"https://rule34.xxx/index.php?page=post&s=list&tags={tags}&pid={}", let body = Client::new()
page * 42 .get(r34_url)
)) .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36").send().await?.text().await?;
.send()
.await let urls = extract_urls(&body);
.unwrap()
.text() if !urls.is_empty() {
.await } else {
.unwrap(),
);
if urls.is_empty() {
println!("no urls found, exiting..."); println!("no urls found, exiting...");
return ExitCode::FAILURE; std::process::exit(1);
} }
println!("-------------------------------");
println!(" now scraping page {}", r34pid / 42 + 1);
println!("-------------------------------");
for url in urls { for url in urls {
println!("found post: {url}"); println!("found post: {}", url);
let img_url = let post = Client::new()
extract_img_url(&client.get(url).send().await.unwrap().text().await.unwrap()); .get(url)
if img_url.is_empty() { .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36").send().await?.text().await?;
println!("image url not found");
} else { let img_url = extract_img_url(&post);
println!("found image url: {img_url}"); match !img_url.is_empty() {
true => println!("found image url: {}", img_url),
false => println!("image url not found"),
} }
} }
page += 1;
} }
} }
fn extract_urls(html: &str) -> Vec<String> { fn extract_urls(html: &str) -> Vec<String> {
Regex::new(r"/index\.php\?page=post&s=view&id=\d+") let re = Regex::new(r"/index\.php\?page=post&s=view&id=\d+").unwrap();
.unwrap()
let urls: Vec<String> = re
.find_iter(html) .find_iter(html)
.map(|mat| format!("https://rule34.xxx{}", mat.as_str())) .map(|mat| format!("https://rule34.xxx{}", mat.as_str().to_string()))
.collect() .collect();
urls
} }
fn extract_img_url(html: &str) -> String { fn extract_img_url(html: &str) -> String {
if let Some(img_url) = let re = Regex::new(r"https://us\.rule34\.xxx//([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+\?[0-9]+").unwrap();
Regex::new(r"https://us\.rule34\.xxx//([A-Za-z0-9]+(/[A-Za-z0-9]+)+)\.[A-Za-z0-9]+\?[0-9]+") match re.find(html) {
.unwrap() Some(img_url) => img_url.as_str().to_string(),
.find(html) None => String::new(),
{
img_url.as_str().to_string()
} else {
String::new()
} }
} }