diff --git a/src/api.rs b/src/api.rs index 3f1aefb..23709cd 100644 --- a/src/api.rs +++ b/src/api.rs @@ -83,6 +83,40 @@ impl Ord for ClientVersion { } } +fn client_version_from_request(req: &HttpRequest) -> ClientVersion { + match req.headers().get("User-Agent") { + Some(v) => match v.to_str() { + Ok(useragent) => ClientVersion::parse(useragent) + .unwrap_or_else(|| ClientVersion::new(999, 0, "Hot%20Tub".to_string())), + Err(_) => ClientVersion::new(999, 0, "Hot%20Tub".to_string()), + }, + _ => ClientVersion::new(999, 0, "Hot%20Tub".to_string()), + } +} + +async fn ensure_videos_table(pool: &DbPool) { + match pool.get() { + Ok(mut conn) => match db::has_table(&mut conn, "videos") { + Ok(false) => { + if let Err(e) = db::create_table( + &mut conn, + "CREATE TABLE videos (id TEXT NOT NULL, url TEXT NOT NULL);", + ) { + report_provider_error("db", "ensure_videos_table.create_table", &e.to_string()) + .await; + } + } + Ok(true) => {} + Err(e) => { + report_provider_error("db", "ensure_videos_table.has_table", &e.to_string()).await; + } + }, + Err(e) => { + report_provider_error("db", "ensure_videos_table.pool_get", &e.to_string()).await; + } + } +} + fn normalize_query(raw_query: Option<&str>) -> (Option, Option) { let Some(raw_query) = raw_query else { return (None, None); @@ -130,6 +164,55 @@ fn video_matches_literal_query(video: &VideoItem, literal_query: &str) -> bool { .is_some_and(|tags| tags.iter().any(|tag| contains_literal(tag))) } +fn normalize_uploader_name(value: &str) -> String { + value + .trim() + .trim_start_matches('#') + .split_whitespace() + .collect::>() + .join(" ") + .to_ascii_lowercase() +} + +fn video_matches_normalized_uploader(video: &VideoItem, normalized_uploader: &str) -> bool { + video + .uploader + .as_deref() + .map(normalize_uploader_name) + .is_some_and(|value| value == normalized_uploader) +} + +fn add_inline_previews(video_items: &mut [VideoItem]) { + for video in video_items.iter_mut() { + if video.duration <= 120 { + let mut preview_url = video.url.clone(); + if let Some(x) = &video.formats { + if let Some(first) = x.first() { + preview_url = first.url.clone(); + } + } + video.preview = Some(preview_url); + } + } +} + +fn slugify(value: &str) -> String { + let mut slug = String::new(); + let mut prev_dash = false; + + for ch in normalize_uploader_name(value).chars() { + if ch.is_ascii_alphanumeric() { + slug.push(ch); + prev_dash = false; + } else if !prev_dash { + slug.push('-'); + prev_dash = true; + } + } + + slug.trim_matches('-').to_string() +} + pub fn config(cfg: &mut web::ServiceConfig) { cfg.service( web::resource("/status") @@ -141,19 +224,14 @@ pub fn config(cfg: &mut web::ServiceConfig) { // .route(web::get().to(videos_get)) .route(web::post().to(videos_post)), ) + .service(web::resource("/uploader").route(web::post().to(uploader_post))) + .service(web::resource("/uploaders").route(web::post().to(uploader_post))) .service(web::resource("/test").route(web::get().to(test))) .service(web::resource("/proxies").route(web::get().to(proxies))); } async fn status(req: HttpRequest) -> Result { - let clientversion: ClientVersion = match req.headers().get("User-Agent") { - Some(v) => match v.to_str() { - Ok(useragent) => ClientVersion::parse(useragent) - .unwrap_or_else(|| ClientVersion::new(999, 0, "Hot%20Tub".to_string())), - Err(_) => ClientVersion::new(999, 0, "Hot%20Tub".to_string()), - }, - _ => ClientVersion::new(999, 0, "Hot%20Tub".to_string()), - }; + let clientversion = client_version_from_request(&req); println!( "Received status request with client version: {:?}", @@ -198,35 +276,9 @@ async fn videos_post( requester: web::types::State, req: HttpRequest, ) -> Result { - let clientversion: ClientVersion = match req.headers().get("User-Agent") { - Some(v) => match v.to_str() { - Ok(useragent) => ClientVersion::parse(useragent) - .unwrap_or_else(|| ClientVersion::new(999, 0, "Hot%20Tub".to_string())), - Err(_) => ClientVersion::new(999, 0, "Hot%20Tub".to_string()), - }, - _ => ClientVersion::new(999, 0, "Hot%20Tub".to_string()), - }; + let clientversion = client_version_from_request(&req); let requester = requester.get_ref().clone(); - // Ensure "videos" table exists with two string columns. - match pool.get() { - Ok(mut conn) => match db::has_table(&mut conn, "videos") { - Ok(false) => { - if let Err(e) = db::create_table( - &mut conn, - "CREATE TABLE videos (id TEXT NOT NULL, url TEXT NOT NULL);", - ) { - report_provider_error("db", "videos_post.create_table", &e.to_string()).await; - } - } - Ok(true) => {} - Err(e) => { - report_provider_error("db", "videos_post.has_table", &e.to_string()).await; - } - }, - Err(e) => { - report_provider_error("db", "videos_post.pool_get", &e.to_string()).await; - } - } + ensure_videos_table(pool.get_ref()).await; let mut videos = Videos { pageInfo: PageInfo { @@ -387,21 +439,182 @@ async fn videos_post( }); //### - for video in videos.items.iter_mut() { - if video.duration <= 120 { - let mut preview_url = video.url.clone(); - if let Some(x) = &video.formats { - if let Some(first) = x.first() { - preview_url = first.url.clone(); - } - } - video.preview = Some(preview_url); - } - } + add_inline_previews(&mut videos.items); Ok(web::HttpResponse::Ok().json(&videos)) } +async fn uploader_post( + uploader_request: web::types::Json, + cache: web::types::State, + pool: web::types::State, + requester: web::types::State, + req: HttpRequest, +) -> Result { + let clientversion = client_version_from_request(&req); + let requester = requester.get_ref().clone(); + ensure_videos_table(pool.get_ref()).await; + + let uploader_name = uploader_request + .uploader + .as_deref() + .or(uploader_request.title.as_deref()) + .or(uploader_request.query.as_deref()) + .map(str::trim) + .filter(|value| !value.is_empty()) + .ok_or_else(|| web::error::ErrorBadRequest("Missing uploader".to_string()))?; + let normalized_uploader = normalize_uploader_name(uploader_name); + if normalized_uploader.is_empty() { + return Err(web::error::ErrorBadRequest("Missing uploader".to_string()).into()); + } + + let page: u8 = uploader_request + .page + .as_ref() + .and_then(|value| value.to_u8()) + .unwrap_or(1); + let per_page: u8 = uploader_request + .perPage + .as_ref() + .and_then(|value| value.to_u8()) + .unwrap_or(10); + let sort = uploader_request + .sort + .as_deref() + .unwrap_or("date") + .to_string(); + let featured = uploader_request + .featured + .as_deref() + .unwrap_or("all") + .to_string(); + let category = uploader_request + .category + .as_deref() + .unwrap_or("all") + .to_string(); + let sites = uploader_request + .all_provider_sites + .as_deref() + .or(uploader_request.sites.as_deref()) + .unwrap_or("") + .to_string(); + let filter = uploader_request + .filter + .as_deref() + .unwrap_or("new") + .to_string(); + let language = uploader_request + .language + .as_deref() + .unwrap_or("en") + .to_string(); + let networks = uploader_request + .networks + .as_deref() + .unwrap_or("") + .to_string(); + let stars = uploader_request.stars.as_deref().unwrap_or("").to_string(); + let categories = uploader_request + .categories + .as_deref() + .unwrap_or("") + .to_string(); + let duration = uploader_request + .duration + .as_deref() + .unwrap_or("") + .to_string(); + let sexuality = uploader_request + .sexuality + .as_deref() + .unwrap_or("") + .to_string(); + let public_url_base = format!( + "{}://{}", + req.connection_info().scheme(), + req.connection_info().host() + ); + let options = ServerOptions { + featured: Some(featured), + category: Some(category), + sites: Some(sites), + filter: Some(filter), + language: Some(language), + public_url_base: Some(public_url_base), + requester: Some(requester), + network: Some(networks), + stars: Some(stars), + categories: Some(categories), + duration: Some(duration), + sort: Some(sort.clone()), + sexuality: Some(sexuality), + }; + + let provider = get_provider("all") + .ok_or_else(|| web::error::ErrorBadRequest("Invalid channel".to_string()))?; + let mut video_items = run_provider_guarded( + "all", + "uploader_post.get_videos", + provider.get_videos( + cache.get_ref().clone(), + pool.get_ref().clone(), + sort, + Some(uploader_name.to_string()), + page.to_string(), + per_page.to_string(), + options, + ), + ) + .await; + + if clientversion == ClientVersion::new(38, 0, "Hot%20Tub".to_string()) { + video_items = video_items + .into_iter() + .filter_map(|video| { + let last_url = video + .formats + .as_ref() + .and_then(|formats| formats.last().map(|f| f.url.clone())); + if let Some(url) = last_url { + let mut v = video; + v.url = url; + return Some(v); + } + Some(video) + }) + .collect(); + } + + video_items.retain(|video| video_matches_normalized_uploader(video, &normalized_uploader)); + add_inline_previews(&mut video_items); + + let display_name = video_items + .iter() + .find_map(|video| video.uploader.as_ref()) + .cloned() + .unwrap_or_else(|| uploader_name.to_string()); + let row = LayoutRow { + id: "videos".to_string(), + row_type: "videos".to_string(), + title: "Videos".to_string(), + subtitle: Some(format!("Results for {display_name}")), + pageInfo: PageInfo { + hasNextPage: !video_items.is_empty(), + resultsPerPage: u32::from(per_page), + }, + items: video_items, + }; + let response = UploaderResponse { + id: slugify(&display_name), + title: display_name.clone(), + uploader: display_name, + rows: vec![row], + }; + + Ok(web::HttpResponse::Ok().json(&response)) +} + pub fn get_provider(channel: &str) -> Option { ALL_PROVIDERS.get(channel).cloned() } @@ -443,3 +656,41 @@ pub async fn proxies() -> Result { } Ok(web::HttpResponse::Ok().json(&by_protocol)) } + +#[cfg(test)] +mod tests { + use super::{normalize_uploader_name, slugify, video_matches_normalized_uploader}; + use crate::videos::VideoItem; + + #[test] + fn normalize_uploader_name_collapses_spacing_and_case() { + assert_eq!( + normalize_uploader_name(" #The Pet Collective "), + "the pet collective" + ); + } + + #[test] + fn uploader_match_uses_normalized_equality() { + let video = VideoItem::new( + "id".to_string(), + "title".to_string(), + "https://example.com/video".to_string(), + "all".to_string(), + "https://example.com/thumb.jpg".to_string(), + 90, + ) + .uploader("The Pet Collective".to_string()); + + assert!(video_matches_normalized_uploader( + &video, + "the pet collective" + )); + assert!(!video_matches_normalized_uploader(&video, "pet collective")); + } + + #[test] + fn slugify_uses_normalized_name() { + assert_eq!(slugify(" #The Pet Collective "), "the-pet-collective"); + } +} diff --git a/src/providers/mod.rs b/src/providers/mod.rs index 2d7d499..6c45008 100644 --- a/src/providers/mod.rs +++ b/src/providers/mod.rs @@ -25,7 +25,6 @@ pub mod pmvhaven; pub mod pornhat; pub mod pornhub; pub mod redtube; -pub mod rule34video; pub mod spankbang; // pub mod hentaimoon; pub mod beeg; @@ -34,6 +33,7 @@ pub mod omgxxx; pub mod paradisehill; pub mod porn00; pub mod porn4fans; +pub mod porndish; pub mod pornzog; pub mod shooshtime; pub mod sxyprn; @@ -53,6 +53,7 @@ pub mod javtiful; pub mod noodlemagazine; pub mod pimpbunny; pub mod rule34gen; +pub mod rule34video; pub mod xxdbx; // pub mod tube8; @@ -78,10 +79,6 @@ pub static ALL_PROVIDERS: Lazy> = Lazy::new(| "spankbang", Arc::new(spankbang::SpankbangProvider::new()) as DynProvider, ); - m.insert( - "rule34video", - Arc::new(rule34video::Rule34videoProvider::new()) as DynProvider, - ); m.insert( "redtube", Arc::new(redtube::RedtubeProvider::new()) as DynProvider, @@ -134,6 +131,10 @@ pub static ALL_PROVIDERS: Lazy> = Lazy::new(| "porn4fans", Arc::new(porn4fans::Porn4fansProvider::new()) as DynProvider, ); + m.insert( + "porndish", + Arc::new(porndish::PorndishProvider::new()) as DynProvider, + ); m.insert( "shooshtime", Arc::new(shooshtime::ShooshtimeProvider::new()) as DynProvider, @@ -160,6 +161,10 @@ pub static ALL_PROVIDERS: Lazy> = Lazy::new(| Arc::new(viralxxxporn::ViralxxxpornProvider::new()) as DynProvider, ); // m.insert("pornxp", Arc::new(pornxp::PornxpProvider::new()) as DynProvider); + m.insert( + "rule34video", + Arc::new(rule34video::Rule34videoProvider::new()) as DynProvider, + ); m.insert( "rule34gen", Arc::new(rule34gen::Rule34genProvider::new()) as DynProvider, diff --git a/src/providers/porndish.rs b/src/providers/porndish.rs new file mode 100644 index 0000000..6941288 --- /dev/null +++ b/src/providers/porndish.rs @@ -0,0 +1,1066 @@ +use crate::DbPool; +use crate::api::ClientVersion; +use crate::providers::{Provider, report_provider_error, report_provider_error_background}; +use crate::status::*; +use crate::util::cache::VideoCache; +use crate::util::parse_abbreviated_number; +use crate::util::requester::Requester; +use crate::videos::{ServerOptions, VideoEmbed, VideoFormat, VideoItem}; +use async_trait::async_trait; +use chrono::DateTime; +use error_chain::error_chain; +use futures::stream::{self, StreamExt}; +use htmlentity::entity::{ICodedDataTrait, decode}; +use regex::Regex; +use scraper::{ElementRef, Html, Selector}; +use std::process::Command; +use std::sync::{Arc, RwLock}; +use std::thread; + +error_chain! { + foreign_links { + Io(std::io::Error); + HttpRequest(wreq::Error); + Json(serde_json::Error); + } + errors { + Parse(msg: String) { + description("parse error") + display("parse error: {}", msg) + } + } +} + +#[derive(Debug, Clone)] +pub struct PorndishProvider { + url: String, + sites: Arc>>, + tags: Arc>>, + uploaders: Arc>>, +} + +impl PorndishProvider { + pub fn new() -> Self { + let provider = Self { + url: "https://www.porndish.com".to_string(), + sites: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), + tags: Arc::new(RwLock::new(vec![])), + uploaders: Arc::new(RwLock::new(vec![])), + }; + provider.spawn_initial_load(); + provider + } + + fn spawn_initial_load(&self) { + let base_url = self.url.clone(); + let sites = Arc::clone(&self.sites); + let tags = Arc::clone(&self.tags); + let uploaders = Arc::clone(&self.uploaders); + + thread::spawn(move || { + let runtime = match tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + { + Ok(runtime) => runtime, + Err(error) => { + report_provider_error_background( + "porndish", + "spawn_initial_load.runtime_build", + &error.to_string(), + ); + return; + } + }; + + runtime.block_on(async move { + if let Err(error) = Self::load_filters(&base_url, sites, tags, uploaders).await { + report_provider_error_background( + "porndish", + "spawn_initial_load.load_filters", + &error.to_string(), + ); + } + }); + }); + } + + fn build_channel(&self, _clientversion: ClientVersion) -> Channel { + let sites = self + .sites + .read() + .map(|value| value.clone()) + .unwrap_or_default(); + + Channel { + id: "porndish".to_string(), + name: "Porndish".to_string(), + description: "Porndish archive pages, tags, and source studios.".to_string(), + premium: false, + favicon: "https://www.google.com/s2/favicons?sz=64&domain=porndish.com".to_string(), + status: "active".to_string(), + categories: vec![], + options: vec![ + ChannelOption { + id: "sort".to_string(), + title: "Sort".to_string(), + description: "Browse the default, popular, hot, or trending archives." + .to_string(), + systemImage: "list.number".to_string(), + colorName: "blue".to_string(), + options: vec![ + FilterOption { + id: "new".to_string(), + title: "Newest".to_string(), + }, + FilterOption { + id: "popular".to_string(), + title: "Popular".to_string(), + }, + FilterOption { + id: "hot".to_string(), + title: "Hot".to_string(), + }, + FilterOption { + id: "trending".to_string(), + title: "Trending".to_string(), + }, + ], + multiSelect: false, + }, + ChannelOption { + id: "sites".to_string(), + title: "Sites".to_string(), + description: "Browse a Porndish source archive directly.".to_string(), + systemImage: "network".to_string(), + colorName: "purple".to_string(), + options: sites, + multiSelect: false, + }, + ], + nsfw: true, + cacheDuration: Some(1800), + } + } + + fn selector(value: &str) -> Result { + Selector::parse(value) + .map_err(|error| Error::from(format!("selector `{value}` parse failed: {error}"))) + } + + fn regex(value: &str) -> Result { + Regex::new(value).map_err(|error| Error::from(format!("regex `{value}` failed: {error}"))) + } + + fn collapse_whitespace(text: &str) -> String { + text.split_whitespace().collect::>().join(" ") + } + + fn decode_html(text: &str) -> String { + decode(text.as_bytes()) + .to_string() + .unwrap_or_else(|_| text.to_string()) + } + + fn text_of(element: &ElementRef<'_>) -> String { + Self::decode_html(&Self::collapse_whitespace( + &element.text().collect::>().join(" "), + )) + } + + fn normalize_title(title: &str) -> String { + title + .trim() + .split_whitespace() + .collect::>() + .join(" ") + .to_ascii_lowercase() + } + + fn slug_to_title(slug: &str) -> String { + slug.split('-') + .filter(|value| !value.is_empty()) + .map(|value| { + let mut chars = value.chars(); + match chars.next() { + Some(first) => format!( + "{}{}", + first.to_ascii_uppercase(), + chars.collect::() + ), + None => String::new(), + } + }) + .collect::>() + .join(" ") + } + + fn normalize_url(&self, url: &str) -> String { + if url.is_empty() { + return String::new(); + } + if url.starts_with("http://") || url.starts_with("https://") { + return url.to_string(); + } + if url.starts_with("//") { + return format!("https:{url}"); + } + if url.starts_with('/') { + return format!("{}{}", self.url, url); + } + format!("{}/{}", self.url, url.trim_start_matches("./")) + } + + fn proxied_thumb(&self, options: &ServerOptions, thumb: &str) -> String { + if thumb.is_empty() { + return String::new(); + } + + let target = crate::providers::strip_url_scheme(thumb); + crate::providers::build_proxy_url(options, "porndish-thumb", &target) + } + + fn push_unique(target: &Arc>>, item: FilterOption) { + if item.id.is_empty() || item.title.is_empty() { + return; + } + if let Ok(mut values) = target.write() { + if !values.iter().any(|value| value.id == item.id) { + values.push(item); + } + } + } + + fn add_tag_filter(&self, slug: &str, title: &str) { + let clean_slug = slug.trim_matches('/').trim(); + let clean_title = title.trim(); + if clean_slug.is_empty() || clean_title.is_empty() { + return; + } + Self::push_unique( + &self.tags, + FilterOption { + id: format!("{}/video2/{clean_slug}/", self.url), + title: clean_title.to_string(), + }, + ); + } + + fn add_uploader_filter(&self, url: &str, title: &str) { + let clean_title = title.trim(); + if url.is_empty() || clean_title.is_empty() { + return; + } + Self::push_unique( + &self.uploaders, + FilterOption { + id: url.to_string(), + title: clean_title.to_string(), + }, + ); + } + + async fn fetch_with_curl_cffi(url: &str, referer: Option<&str>) -> Result { + let url = url.to_string(); + let referer = referer.unwrap_or("").to_string(); + + let output = tokio::task::spawn_blocking(move || { + Command::new("python3") + .arg("-c") + .arg( + r#" +import sys +from curl_cffi import requests + +url = sys.argv[1] +referer = sys.argv[2] if len(sys.argv) > 2 else "" +headers = {"Referer": referer} if referer else {} +response = requests.get( + url, + impersonate="chrome", + timeout=30, + allow_redirects=True, + headers=headers, +) +if response.status_code >= 400: + sys.stderr.write(f"status={response.status_code} url={response.url}\n") + sys.exit(1) +sys.stdout.buffer.write(response.content) +"#, + ) + .arg(url) + .arg(referer) + .output() + }) + .await + .map_err(|error| Error::from(format!("spawn_blocking failed: {error}")))? + .map_err(|error| Error::from(format!("python3 execution failed: {error}")))?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string(); + return Err(Error::from(format!("curl_cffi request failed: {stderr}"))); + } + + Ok(String::from_utf8_lossy(&output.stdout).to_string()) + } + + async fn fetch_html(url: &str) -> Result { + Self::fetch_with_curl_cffi(url, None).await + } + + async fn load_filters( + base_url: &str, + sites: Arc>>, + tags: Arc>>, + uploaders: Arc>>, + ) -> Result<()> { + let link_selector = Self::selector("a[href]")?; + let article_selector = Self::selector("article.entry-tpl-grid, article.post")?; + let pages = vec![ + format!("{base_url}/"), + format!("{base_url}/page/2/"), + format!("{base_url}/popular/"), + format!("{base_url}/hot/"), + format!("{base_url}/trending/"), + ]; + + for url in pages { + let html = match Self::fetch_html(&url).await { + Ok(html) => html, + Err(error) => { + report_provider_error_background( + "porndish", + "load_filters.fetch_html", + &format!("url={url}; error={error}"), + ); + continue; + } + }; + + let document = Html::parse_document(&html); + + for link in document.select(&link_selector) { + let Some(href) = link.value().attr("href") else { + continue; + }; + let normalized = href.trim_end_matches('/'); + let prefix = format!("{base_url}/videos2/"); + if !normalized.starts_with(&prefix) { + continue; + } + + let remainder = normalized.strip_prefix(&prefix).unwrap_or_default(); + if remainder.is_empty() || remainder.contains("/page/") { + continue; + } + + let title = Self::text_of(&link); + if title.is_empty() { + continue; + } + + let item = FilterOption { + id: format!("{normalized}/"), + title: title.clone(), + }; + Self::push_unique(&sites, item.clone()); + Self::push_unique(&uploaders, item); + } + + for article in document.select(&article_selector) { + let Some(classes) = article.value().attr("class") else { + continue; + }; + + for class_name in classes.split_whitespace() { + if let Some(slug) = class_name.strip_prefix("tag-") { + if slug.is_empty() || slug == "format-video" { + continue; + } + Self::push_unique( + &tags, + FilterOption { + id: format!("{base_url}/video2/{slug}/"), + title: Self::slug_to_title(slug), + }, + ); + } + } + } + } + + Ok(()) + } + + fn parse_duration(text: &str) -> u32 { + let parts = text + .trim() + .split(':') + .filter_map(|value| value.parse::().ok()) + .collect::>(); + + match parts.as_slice() { + [minutes, seconds] => minutes.saturating_mul(60).saturating_add(*seconds), + [hours, minutes, seconds] => hours + .saturating_mul(3600) + .saturating_add(minutes.saturating_mul(60)) + .saturating_add(*seconds), + _ => 0, + } + } + + fn parse_views(text: &str) -> Option { + parse_abbreviated_number( + &text + .replace("Views", "") + .replace("View", "") + .replace(' ', "") + .trim() + .to_string(), + ) + } + + fn parse_uploaded_at(text: &str) -> Option { + DateTime::parse_from_rfc3339(text) + .ok() + .map(|value| value.timestamp() as u64) + } + + fn encoded_query(query: &str) -> String { + let mut serializer = url::form_urlencoded::Serializer::new(String::new()); + serializer.append_pair("", query); + serializer.finish().trim_start_matches('=').to_string() + } + + fn build_top_level_url(&self, page: u8, sort: &str) -> String { + let base = match sort { + "popular" => format!("{}/popular/", self.url), + "hot" => format!("{}/hot/", self.url), + "trending" => format!("{}/trending/", self.url), + _ => format!("{}/", self.url), + }; + + if page > 1 { + format!("{}page/{page}/", base) + } else { + base + } + } + + fn build_archive_page_url(base: &str, page: u8) -> String { + if page > 1 { + format!( + "{}page/{page}/", + base.trim_end_matches('/').to_string() + "/" + ) + } else { + base.to_string() + } + } + + fn build_search_url(&self, query: &str, page: u8) -> String { + let encoded = Self::encoded_query(query); + if page > 1 { + format!("{}/search/{encoded}/page/{page}/", self.url) + } else { + format!("{}/search/{encoded}/", self.url) + } + } + + fn resolve_option_target(&self, options: &ServerOptions) -> Option { + let site = options.sites.as_deref()?; + if site.is_empty() || site == "all" { + return None; + } + Some(site.to_string()) + } + + fn match_filter(options: &[FilterOption], query: &str) -> Option { + let normalized_query = Self::normalize_title(query); + options + .iter() + .find(|value| Self::normalize_title(&value.title) == normalized_query) + .map(|value| value.id.clone()) + } + + fn resolve_query_target(&self, query: &str) -> Option { + if let Ok(uploaders) = self.uploaders.read() { + if let Some(target) = Self::match_filter(&uploaders, query) { + return Some(target); + } + } + + if let Ok(sites) = self.sites.read() { + if let Some(target) = Self::match_filter(&sites, query) { + return Some(target); + } + } + + if let Ok(tags) = self.tags.read() { + if let Some(target) = Self::match_filter(&tags, query) { + return Some(target); + } + } + + None + } + + fn parse_list_videos(&self, html: &str) -> Result> { + let document = Html::parse_document(html); + let article_selector = + Self::selector("article.entry-tpl-grid, article.entry-tpl-list-fancy")?; + let title_selector = Self::selector(".entry-title a[href]")?; + let image_selector = Self::selector(".entry-featured-media img")?; + let duration_selector = Self::selector(".mace-video-duration")?; + let source_selector = Self::selector(".entry-categories a[href]")?; + let views_selector = Self::selector(".entry-views strong")?; + let time_selector = Self::selector("time.entry-date[datetime]")?; + + let mut items = Vec::new(); + + for article in document.select(&article_selector) { + let Some(title_link) = article.select(&title_selector).next() else { + continue; + }; + + let Some(url) = title_link.value().attr("href") else { + continue; + }; + let page_url = self.normalize_url(url); + if page_url.is_empty() { + continue; + } + + let title = Self::text_of(&title_link); + if title.is_empty() { + continue; + } + + let slug = page_url + .trim_end_matches('/') + .rsplit('/') + .next() + .unwrap_or_default() + .to_string(); + + let thumb = article + .select(&image_selector) + .next() + .and_then(|image| { + image + .value() + .attr("data-src") + .or_else(|| image.value().attr("src")) + }) + .map(|value| self.normalize_url(value)) + .unwrap_or_default(); + + let duration = article + .select(&duration_selector) + .next() + .map(|value| Self::parse_duration(&Self::text_of(&value))) + .unwrap_or(0); + + let mut item = VideoItem::new( + slug, + title, + page_url.clone(), + "porndish".to_string(), + thumb, + duration, + ); + + if let Some(source_link) = article.select(&source_selector).next() { + let source_title = Self::text_of(&source_link); + if !source_title.is_empty() { + if let Some(source_url) = source_link.value().attr("href") { + let source_url = self.normalize_url(source_url); + item = item + .uploader(source_title.clone()) + .uploader_url(source_url.clone()); + self.add_uploader_filter(&source_url, &source_title); + } else { + item = item.uploader(source_title); + } + } + } + + if let Some(views) = article + .select(&views_selector) + .next() + .and_then(|value| Self::parse_views(&Self::text_of(&value))) + { + item = item.views(views); + } + + if let Some(uploaded_at) = article + .select(&time_selector) + .next() + .and_then(|time| time.value().attr("datetime")) + .and_then(Self::parse_uploaded_at) + { + item = item.uploaded_at(uploaded_at); + } + + if let Some(classes) = article.value().attr("class") { + let mut tags = Vec::new(); + for class_name in classes.split_whitespace() { + if let Some(slug) = class_name.strip_prefix("tag-") { + if slug.is_empty() || slug == "format-video" { + continue; + } + let title = Self::slug_to_title(slug); + if !title.is_empty() && !tags.iter().any(|value| value == &title) { + self.add_tag_filter(slug, &title); + tags.push(title); + } + } + } + if !tags.is_empty() { + item = item.tags(tags); + } + } + + items.push(item); + } + + Ok(items) + } + + fn extract_iframe_fragments(&self, html: &str) -> Result> { + let regex = Self::regex(r#"const\s+[A-Za-z0-9_]+Content\s*=\s*"((?:\\.|[^"\\])*)";"#)?; + let mut fragments = Vec::new(); + + for captures in regex.captures_iter(html) { + let Some(value) = captures.get(1).map(|value| value.as_str()) else { + continue; + }; + let encoded = format!("\"{value}\""); + let decoded = serde_json::from_str::(&encoded).unwrap_or_default(); + if decoded.contains(" Result { + let iframe_url = iframe_url.to_string(); + let output = tokio::task::spawn_blocking(move || { + Command::new("python3") + .arg("-c") + .arg( + r#" +import re +import sys +import time +from curl_cffi import requests + +iframe_url = sys.argv[1] +session = requests.Session(impersonate="chrome") +html = session.get(iframe_url, timeout=30).text +match = re.search(r"\$\.get\(\s*['\"](/pass_md5/[^'\"]+)['\"]", html) +if not match: + sys.stderr.write("missing pass_md5 path\n") + sys.exit(1) +path = match.group(1) +token = path.rstrip("/").split("/")[-1] +if not token: + sys.stderr.write("missing pass_md5 token\n") + sys.exit(1) +if path.startswith("http://") or path.startswith("https://"): + pass_url = path +else: + pass_url = "/".join(iframe_url.split("/")[:3]) + path +base = session.get(pass_url, headers={"Referer": iframe_url}, timeout=30).text.strip() +if not base or base == "RELOAD" or not base.startswith("http"): + sys.stderr.write(f"unusable pass_md5 response: {base[:120]}\n") + sys.exit(1) +chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" +now = int(time.time() * 1000) +suffix = "".join(chars[(now + i * 17) % len(chars)] for i in range(10)) +sys.stdout.write(f"{base}{suffix}?token={token}&expiry={now}") +"#, + ) + .arg(iframe_url) + .output() + }) + .await + .map_err(|error| Error::from(format!("spawn_blocking failed: {error}")))? + .map_err(|error| Error::from(format!("python3 execution failed: {error}")))?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string(); + return Err(Error::from(format!( + "myvidplay resolution failed: {stderr}" + ))); + } + + let resolved = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if resolved.is_empty() || !resolved.starts_with("http") { + return Err(Error::from( + "myvidplay resolution returned empty url".to_string(), + )); + } + + Ok(resolved) + } + + fn parse_embed_source(fragment: &str) -> Result> { + let iframe_regex = Self::regex(r#"(?is)]+src="([^"]+)"[^>]*>"#)?; + Ok(iframe_regex.captures(fragment).and_then(|captures| { + captures + .get(1) + .map(|value| (fragment.to_string(), value.as_str().to_string())) + })) + } + + async fn apply_detail_video( + &self, + mut item: VideoItem, + html: &str, + page_url: &str, + options: &ServerOptions, + _requester: &mut Requester, + ) -> Result { + let ( + parsed_title, + parsed_thumb, + parsed_uploader, + parsed_uploader_url, + parsed_views, + parsed_uploaded_at, + parsed_tags, + ) = { + let document = Html::parse_document(html); + let title_selector = Self::selector("h1.entry-title")?; + let thumb_selector = Self::selector("meta[property=\"og:image\"]")?; + let category_selector = + Self::selector(".entry-categories-l a[href], .entry-categories a[href]")?; + let views_selector = Self::selector(".entry-views strong")?; + let time_selector = Self::selector("time.entry-date[datetime]")?; + let tag_selector = Self::selector(".entry-tags a[href]")?; + + let parsed_title = document + .select(&title_selector) + .next() + .map(|title| Self::text_of(&title)) + .filter(|title| !title.is_empty()); + + let parsed_thumb = document + .select(&thumb_selector) + .next() + .and_then(|meta| meta.value().attr("content")) + .map(|thumb| self.normalize_url(thumb)) + .filter(|thumb| !thumb.is_empty()); + + let (parsed_uploader, parsed_uploader_url) = document + .select(&category_selector) + .next() + .map(|category| { + let title = Self::text_of(&category); + let url = category + .value() + .attr("href") + .map(|href| self.normalize_url(href)) + .filter(|href| !href.is_empty()); + (title, url) + }) + .filter(|(title, _)| !title.is_empty()) + .map(|(title, url)| (Some(title), url)) + .unwrap_or((None, None)); + + let parsed_views = document + .select(&views_selector) + .next() + .and_then(|value| Self::parse_views(&Self::text_of(&value))); + + let parsed_uploaded_at = document + .select(&time_selector) + .next() + .and_then(|time| time.value().attr("datetime")) + .and_then(Self::parse_uploaded_at); + + let mut parsed_tags = Vec::new(); + for tag_link in document.select(&tag_selector) { + let title = Self::text_of(&tag_link); + let slug = tag_link + .value() + .attr("href") + .map(|href| { + href.trim_end_matches('/') + .rsplit('/') + .next() + .unwrap_or_default() + .to_string() + }) + .unwrap_or_default(); + if !title.is_empty() { + parsed_tags.push((title, slug)); + } + } + + ( + parsed_title, + parsed_thumb, + parsed_uploader, + parsed_uploader_url, + parsed_views, + parsed_uploaded_at, + parsed_tags, + ) + }; + + if let Some(title) = parsed_title { + item.title = title; + } + + if let Some(thumb) = parsed_thumb { + item.thumb = self.proxied_thumb(options, &thumb); + } + + if let Some(uploader) = parsed_uploader { + item.uploader = Some(uploader.clone()); + if let Some(uploader_url) = parsed_uploader_url { + item.uploaderUrl = Some(uploader_url.clone()); + self.add_uploader_filter(&uploader_url, &uploader); + } + } + + if let Some(views) = parsed_views { + item.views = Some(views); + } + + if let Some(uploaded_at) = parsed_uploaded_at { + item.uploadedAt = Some(uploaded_at); + } + + if !parsed_tags.is_empty() { + let mut tags = Vec::new(); + for (title, slug) in parsed_tags { + if !slug.is_empty() { + self.add_tag_filter(&slug, &title); + } + if !tags.iter().any(|value| value == &title) { + tags.push(title); + } + } + item.tags = Some(tags); + } + + for fragment in self.extract_iframe_fragments(html)? { + let Some((embed_html, iframe_url)) = Self::parse_embed_source(&fragment)? else { + continue; + }; + let iframe_url = self.normalize_url(&iframe_url); + + item.embed = Some(VideoEmbed { + html: embed_html, + source: iframe_url.clone(), + }); + + if iframe_url.contains("myvidplay.com") { + match self.resolve_myvidplay_stream(&iframe_url).await { + Ok(stream_url) => { + item.url = stream_url.clone(); + let mut format = VideoFormat::new( + stream_url.clone(), + "sd".to_string(), + "mp4".to_string(), + ); + format.add_http_header("Referer".to_string(), iframe_url.clone()); + item.formats = Some(vec![format]); + } + Err(error) => { + report_provider_error_background( + "porndish", + "resolve_myvidplay_stream", + &format!("iframe_url={iframe_url}; error={error}"), + ); + item.url = iframe_url; + } + } + } else { + item.url = iframe_url; + } + + break; + } + + if item.formats.is_none() && item.url != page_url { + let mut format = + VideoFormat::new(item.url.clone(), "unknown".to_string(), "mp4".to_string()); + format.add_http_header("Referer".to_string(), page_url.to_string()); + item.formats = Some(vec![format]); + } + + Ok(item) + } + + async fn enrich_video(&self, item: VideoItem, options: &ServerOptions) -> VideoItem { + let mut item = item; + if !item.thumb.is_empty() { + item.thumb = self.proxied_thumb(options, &item.thumb); + } + + let page_url = if item.url.starts_with("http://") || item.url.starts_with("https://") { + item.url.clone() + } else { + return item; + }; + let original_item = item.clone(); + + let mut requester = match options.requester.clone() { + Some(requester) => requester, + None => Requester::new(), + }; + + let html = match Self::fetch_with_curl_cffi(&page_url, None).await { + Ok(html) => html, + Err(error) => { + report_provider_error_background( + "porndish", + "enrich_video.request", + &format!("url={page_url}; error={error}"), + ); + return item; + } + }; + + match self + .apply_detail_video(item, &html, &page_url, options, &mut requester) + .await + { + Ok(item) => item, + Err(error) => { + report_provider_error_background( + "porndish", + "enrich_video.parse", + &format!("url={page_url}; error={error}"), + ); + original_item + } + } + } + + async fn fetch_items_for_url( + &self, + cache: VideoCache, + url: String, + options: &ServerOptions, + ) -> Result> { + if let Some((time, items)) = cache.get(&url) { + if time.elapsed().unwrap_or_default().as_secs() < 300 { + return Ok(items.clone()); + } + } + + let _requester = + crate::providers::requester_or_default(options, "porndish", "missing_requester"); + + let html = match Self::fetch_with_curl_cffi(&url, None).await { + Ok(html) => html, + Err(error) => { + report_provider_error( + "porndish", + "fetch_items_for_url.request", + &format!("url={url}; error={error}"), + ) + .await; + return Ok(vec![]); + } + }; + + let list_videos = self.parse_list_videos(&html)?; + if list_videos.is_empty() { + return Ok(vec![]); + } + + let items = stream::iter(list_videos.into_iter().map(|video| { + let provider = self.clone(); + let options = options.clone(); + async move { provider.enrich_video(video, &options).await } + })) + .buffer_unordered(1) + .collect::>() + .await; + + if !items.is_empty() { + cache.insert(url, items.clone()); + } + + Ok(items) + } + + async fn get( + &self, + cache: VideoCache, + page: u8, + sort: &str, + options: ServerOptions, + ) -> Result> { + let url = match self.resolve_option_target(&options) { + Some(target) => Self::build_archive_page_url(&target, page), + None => self.build_top_level_url(page, sort), + }; + + self.fetch_items_for_url(cache, url, &options).await + } + + async fn query( + &self, + cache: VideoCache, + page: u8, + query: &str, + _sort: &str, + options: ServerOptions, + ) -> Result> { + let url = match self.resolve_query_target(query) { + Some(target) => Self::build_archive_page_url(&target, page), + None => self.build_search_url(query, page), + }; + + self.fetch_items_for_url(cache, url, &options).await + } +} + +#[async_trait] +impl Provider for PorndishProvider { + async fn get_videos( + &self, + cache: VideoCache, + pool: DbPool, + sort: String, + query: Option, + page: String, + per_page: String, + options: ServerOptions, + ) -> Vec { + let _ = pool; + let _ = per_page; + let page = page.parse::().unwrap_or(1); + + let result = match query { + Some(query) if !query.trim().is_empty() => { + self.query(cache, page, &query, &sort, options).await + } + _ => self.get(cache, page, &sort, options).await, + }; + + match result { + Ok(videos) => videos, + Err(error) => { + report_provider_error_background("porndish", "get_videos", &error.to_string()); + vec![] + } + } + } + + fn get_channel(&self, clientversion: ClientVersion) -> Option { + Some(self.build_channel(clientversion)) + } +} diff --git a/src/providers/rule34video.rs b/src/providers/rule34video.rs index 0d58abb..4914f2b 100644 --- a/src/providers/rule34video.rs +++ b/src/providers/rule34video.rs @@ -1,17 +1,20 @@ use crate::DbPool; use crate::api::ClientVersion; -use crate::providers::Provider; +use crate::providers::{Provider, report_provider_error, requester_or_default}; use crate::status::*; use crate::util::cache::VideoCache; -use crate::util::discord::send_discord_error_report; use crate::util::parse_abbreviated_number; +use crate::util::requester::Requester; use crate::util::time::parse_time_to_seconds; use crate::videos::{ServerOptions, VideoItem}; use async_trait::async_trait; use error_chain::error_chain; +use futures::stream::{self, StreamExt}; use htmlentity::entity::{ICodedDataTrait, decode}; +use scraper::{ElementRef, Html, Selector}; +use std::collections::HashSet; +use std::sync::{Arc, RwLock}; use std::time::{SystemTime, UNIX_EPOCH}; -use std::vec; error_chain! { foreign_links { @@ -19,9 +22,9 @@ error_chain! { HttpRequest(wreq::Error); } errors { - ParsingError(t: String) { - description("html parsing error") - display("HTML parsing error: '{}'", t) + Parse(msg: String) { + description("parse error") + display("parse error: {}", msg) } } } @@ -29,274 +32,695 @@ error_chain! { #[derive(Debug, Clone)] pub struct Rule34videoProvider { url: String, + categories: Arc>>, + artists: Arc>>, + tags: Arc>>, + uploaders: Arc>>, +} + +#[derive(Debug, Clone)] +struct QueryTarget { + url: String, } impl Rule34videoProvider { pub fn new() -> Self { - Rule34videoProvider { + Self { url: "https://rule34video.com".to_string(), + categories: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), + artists: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), + tags: Arc::new(RwLock::new(vec![])), + uploaders: Arc::new(RwLock::new(vec![FilterOption { + id: "all".to_string(), + title: "All".to_string(), + }])), } } fn build_channel(&self, _clientversion: ClientVersion) -> Channel { + let categories = self + .categories + .read() + .map(|value| value.clone()) + .unwrap_or_default(); + let artists = self + .artists + .read() + .map(|value| value.clone()) + .unwrap_or_default(); + let uploaders = self + .uploaders + .read() + .map(|value| value.clone()) + .unwrap_or_default(); + Channel { id: "rule34video".to_string(), name: "Rule34Video".to_string(), - description: "If it exists, there is porn".to_string(), + description: "Rule34Video with artist, uploader, category, and tag-aware searches." + .to_string(), premium: false, favicon: "https://www.google.com/s2/favicons?sz=64&domain=rule34video.com".to_string(), status: "active".to_string(), - categories: vec![], - options: vec![ChannelOption { - id: "sort".to_string(), - title: "Sort".to_string(), - description: "Sort the Videos".to_string(), - systemImage: "list.number".to_string(), - colorName: "blue".to_string(), - options: vec![ - FilterOption { - id: "post_date".to_string(), - title: "Newest".to_string(), - }, - FilterOption { - id: "video_viewed".to_string(), - title: "Most Viewed".to_string(), - }, - FilterOption { - id: "rating".to_string(), - title: "Top Rated".to_string(), - }, - FilterOption { - id: "duration".to_string(), - title: "Longest".to_string(), - }, - FilterOption { - id: "pseudo_random".to_string(), - title: "Random".to_string(), - }, - ], - multiSelect: false, - }], + categories: categories.iter().map(|value| value.title.clone()).collect(), + options: vec![ + ChannelOption { + id: "sort".to_string(), + title: "Sort".to_string(), + description: "Sort the videos".to_string(), + systemImage: "list.number".to_string(), + colorName: "blue".to_string(), + options: vec![ + FilterOption { + id: "post_date".to_string(), + title: "Newest".to_string(), + }, + FilterOption { + id: "video_viewed".to_string(), + title: "Most Viewed".to_string(), + }, + FilterOption { + id: "rating".to_string(), + title: "Top Rated".to_string(), + }, + FilterOption { + id: "duration".to_string(), + title: "Longest".to_string(), + }, + FilterOption { + id: "pseudo_random".to_string(), + title: "Random".to_string(), + }, + ], + multiSelect: false, + }, + ChannelOption { + id: "categories".to_string(), + title: "Categories".to_string(), + description: "Browse a Rule34Video category".to_string(), + systemImage: "square.grid.2x2".to_string(), + colorName: "orange".to_string(), + options: categories, + multiSelect: false, + }, + ChannelOption { + id: "stars".to_string(), + title: "Artists".to_string(), + description: "Browse a Rule34Video artist".to_string(), + systemImage: "paintbrush.pointed".to_string(), + colorName: "purple".to_string(), + options: artists, + multiSelect: false, + }, + ChannelOption { + id: "sites".to_string(), + title: "Uploaders".to_string(), + description: "Browse a Rule34Video uploader".to_string(), + systemImage: "person.crop.rectangle".to_string(), + colorName: "green".to_string(), + options: uploaders, + multiSelect: false, + }, + ], nsfw: true, cacheDuration: Some(1800), } } - /// Helper to safely extract a string between two delimiters - fn extract_between<'a>(content: &'a str, start_pat: &str, end_pat: &str) -> Option<&'a str> { - let start_idx = content.find(start_pat)? + start_pat.len(); - let sub = &content[start_idx..]; - let end_idx = sub.find(end_pat)?; - Some(&sub[..end_idx]) + fn expected_sort(sort: &str) -> &'static str { + match sort { + "video_viewed" => "video_viewed", + "rating" => "rating", + "duration" => "duration", + "pseudo_random" => "pseudo_random", + _ => "post_date", + } } - async fn get( - &self, - cache: VideoCache, - page: u8, - sort: &str, - options: ServerOptions, - ) -> Result> { + fn selector(pattern: &str) -> Result { + Selector::parse(pattern) + .map_err(|error| Error::from(format!("selector parse failed for {pattern}: {error}"))) + } + + fn decode_html(text: &str) -> String { + decode(text.as_bytes()) + .to_string() + .unwrap_or_else(|_| text.to_string()) + } + + fn collapse_whitespace(text: &str) -> String { + text.split_whitespace().collect::>().join(" ") + } + + fn text_of(element: &ElementRef<'_>) -> String { + Self::collapse_whitespace(&element.text().collect::>().join(" ")) + } + + fn normalize_title(title: &str) -> String { + title + .trim() + .trim_start_matches('#') + .split_whitespace() + .collect::>() + .join(" ") + .to_ascii_lowercase() + } + + fn strip_counter_suffix(text: &str) -> String { + let mut parts = text.split_whitespace().collect::>(); + + while parts + .last() + .is_some_and(|value| value.chars().all(|ch| ch.is_ascii_digit())) + { + parts.pop(); + } + + Self::collapse_whitespace(&parts.join(" ")) + } + + fn normalize_url(&self, url: &str) -> String { + if url.is_empty() { + return String::new(); + } + if url.starts_with("http://") || url.starts_with("https://") { + return url.to_string(); + } + if url.starts_with("//") { + return format!("https:{url}"); + } + if url.starts_with('/') { + return format!("{}{}", self.url, url); + } + format!("{}/{}", self.url, url.trim_start_matches("./")) + } + + fn normalize_member_url(&self, url: &str) -> String { + let absolute = self.normalize_url(url); + if absolute.is_empty() { + return absolute; + } + if absolute.ends_with("/videos/") { + return absolute; + } + let trimmed = absolute.trim_end_matches('/'); + if trimmed.ends_with("/videos") { + return format!("{trimmed}/"); + } + format!("{trimmed}/videos/") + } + + fn push_unique(target: &Arc>>, item: FilterOption) { + if item.id.is_empty() || item.title.is_empty() { + return; + } + if let Ok(mut values) = target.write() { + if !values.iter().any(|value| value.id == item.id) { + values.push(item); + } + } + } + + fn add_category_filter(&self, url: &str, title: &str) { + let title = Self::strip_counter_suffix(title); + if url.is_empty() || title.is_empty() { + return; + } + Self::push_unique( + &self.categories, + FilterOption { + id: self.normalize_url(url), + title, + }, + ); + } + + fn add_artist_filter(&self, url: &str, title: &str) { + let title = Self::strip_counter_suffix(title); + if url.is_empty() || title.is_empty() { + return; + } + Self::push_unique( + &self.artists, + FilterOption { + id: self.normalize_url(url), + title, + }, + ); + } + + fn add_tag_filter(&self, url: &str, title: &str) { + let title = Self::strip_counter_suffix(title); + if url.is_empty() || title.is_empty() { + return; + } + Self::push_unique( + &self.tags, + FilterOption { + id: self.normalize_url(url), + title, + }, + ); + } + + fn add_uploader_filter(&self, url: &str, title: &str) { + let title = Self::strip_counter_suffix(title); + if url.is_empty() || title.is_empty() { + return; + } + Self::push_unique( + &self.uploaders, + FilterOption { + id: self.normalize_member_url(url), + title, + }, + ); + } + + fn parse_views(text: &str) -> Option { + let cleaned = text + .replace("views", "") + .replace("view", "") + .replace(' ', "") + .trim() + .to_string(); + parse_abbreviated_number(&cleaned) + } + + fn video_id_from_url(url: &str) -> String { + url.split("/video/") + .nth(1) + .and_then(|value| value.split('/').next()) + .unwrap_or_default() + .to_string() + } + + fn append_query_param(url: &str, key: &str, value: &str) -> String { + let separator = if url.contains('?') { "&" } else { "?" }; + format!("{url}{separator}{key}={value}") + } + + fn build_top_level_url(&self, page: u8, sort: &str) -> String { let timestamp_millis = SystemTime::now() .duration_since(UNIX_EPOCH) - .map(|d| d.as_millis()) + .map(|value| value.as_millis()) .unwrap_or(0); + format!( + "{}/?mode=async&function=get_block&block_id=custom_list_videos_most_recent_videos&tag_ids=&sort_by={}&from={}&_={}", + self.url, sort, page, timestamp_millis + ) + } - let expected_sorts = vec![ - "post_date", - "video_viewed", - "rating", - "duration", - "pseudo_random", - ]; - let sort_val = if expected_sorts.contains(&sort) { - sort + fn build_search_url(&self, query: &str, page: u8, sort: &str) -> String { + let timestamp_millis = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|value| value.as_millis()) + .unwrap_or(0); + format!( + "{}/search/{}/?mode=async&function=get_block&block_id=custom_list_videos_videos_list_search&tag_ids=&sort_by={}&from_videos={}&from_albums={}&_={}", + self.url, + query.replace(' ', "-"), + sort, + page, + page, + timestamp_millis + ) + } + + fn build_filtered_url(&self, base: &str, page: u8, sort: &str) -> String { + let mut url = if page > 1 { + format!("{}{page}/", base.trim_end_matches('/').to_string() + "/") } else { - "post_date" + base.to_string() + }; + if sort != "post_date" { + url = Self::append_query_param(&url, "sort_by", sort); + } + url + } + + fn resolve_option_target(&self, options: &ServerOptions) -> Option { + if let Some(category) = options.categories.as_deref() { + if !category.is_empty() && category != "all" { + return Some(QueryTarget { + url: category.to_string(), + }); + } + } + + if let Some(artist) = options.stars.as_deref() { + if !artist.is_empty() && artist != "all" { + return Some(QueryTarget { + url: artist.to_string(), + }); + } + } + + if let Some(uploader) = options.sites.as_deref() { + if !uploader.is_empty() && uploader != "all" { + return Some(QueryTarget { + url: uploader.to_string(), + }); + } + } + + None + } + + fn match_filter(options: &[FilterOption], query: &str) -> Option { + let normalized_query = Self::normalize_title(query); + options + .iter() + .find(|value| Self::normalize_title(&value.title) == normalized_query) + .map(|value| QueryTarget { + url: value.id.clone(), + }) + } + + fn resolve_query_target(&self, query: &str) -> Option { + if let Ok(uploaders) = self.uploaders.read() { + if let Some(target) = Self::match_filter(&uploaders, query) { + return Some(target); + } + } + + if let Ok(artists) = self.artists.read() { + if let Some(target) = Self::match_filter(&artists, query) { + return Some(target); + } + } + + if let Ok(tags) = self.tags.read() { + if let Some(target) = Self::match_filter(&tags, query) { + return Some(target); + } + } + + if let Ok(categories) = self.categories.read() { + if let Some(target) = Self::match_filter(&categories, query) { + return Some(target); + } + } + + None + } + + async fn fetch_page_items( + &self, + cache: VideoCache, + cache_key: String, + url: String, + use_cache: bool, + options: ServerOptions, + ) -> Result> { + let old_items = if use_cache { + match cache.get(&cache_key) { + Some((time, items)) => { + if time.elapsed().unwrap_or_default().as_secs() < 60 * 5 { + return Ok(items.clone()); + } + items.clone() + } + None => vec![], + } + } else { + vec![] }; - let index = format!("rule34video:{}:{}", page, sort_val); + let mut requester = + requester_or_default(&options, module_path!(), "rule34video.fetch_page_items"); + let text = match requester.get(&url, None).await { + Ok(text) => text, + Err(error) => { + report_provider_error( + "rule34video", + "fetch_page_items.request", + &format!("url={url}; error={error}"), + ) + .await; + return Ok(old_items); + } + }; - if sort_val != "pseudo_random" { - if let Some((time, items)) = cache.get(&index) { - if time.elapsed().unwrap_or_default().as_secs() < 300 { - return Ok(items.clone()); + let items = match self.parse_list_videos(&text) { + Ok(items) => items, + Err(error) => { + report_provider_error( + "rule34video", + "fetch_page_items.parse_list_videos", + &format!("url={url}; error={error}"), + ) + .await; + return Ok(old_items); + } + }; + + if items.is_empty() { + return Ok(old_items); + } + + let enriched = self.enrich_video_items(items, requester).await; + if !enriched.is_empty() && use_cache { + cache.insert(cache_key, enriched.clone()); + } + Ok(enriched) + } + + async fn enrich_video_items( + &self, + items: Vec, + requester: Requester, + ) -> Vec { + let provider = self.clone(); + let mut enriched = stream::iter(items.into_iter().enumerate().map(move |(idx, item)| { + let provider = provider.clone(); + let mut requester = requester.clone(); + async move { + let url = item.url.clone(); + let updated = match requester.get(&url, None).await { + Ok(html) => provider.apply_detail_video(item, &html), + Err(error) => { + report_provider_error( + "rule34video", + "enrich_video_items.detail_request", + &format!("url={url}; error={error}"), + ) + .await; + item + } + }; + (idx, updated) + } + })) + .buffer_unordered(6) + .collect::>() + .await; + + enriched.sort_by_key(|(idx, _)| *idx); + enriched.into_iter().map(|(_, item)| item).collect() + } + + fn parse_list_videos(&self, html: &str) -> Result> { + if html.trim().is_empty() { + return Ok(vec![]); + } + + let document = Html::parse_document(html); + let card_selector = Self::selector("div.item.thumb")?; + let link_selector = Self::selector("a.th.js-open-popup[href], a[href*=\"/video/\"]")?; + let title_selector = Self::selector(".thumb_title")?; + let image_selector = Self::selector("img")?; + let duration_selector = Self::selector(".time")?; + let views_selector = Self::selector(".views")?; + + let mut items = Vec::new(); + + for card in document.select(&card_selector) { + let Some(link) = card.select(&link_selector).next() else { + continue; + }; + + let href = link.value().attr("href").unwrap_or_default(); + let url = self.normalize_url(href); + let id = Self::video_id_from_url(&url); + if url.is_empty() || id.is_empty() { + continue; + } + + let title = card + .select(&title_selector) + .next() + .map(|value| Self::decode_html(&Self::text_of(&value))) + .filter(|value| !value.is_empty()) + .unwrap_or_else(|| { + Self::decode_html(link.value().attr("title").unwrap_or_default()) + }); + if title.is_empty() { + continue; + } + + let image = card.select(&image_selector).next(); + let thumb = image + .and_then(|value| { + value + .value() + .attr("data-original") + .or_else(|| value.value().attr("data-src")) + .or_else(|| value.value().attr("src")) + }) + .map(|value| self.normalize_url(value)) + .unwrap_or_default(); + + let duration = card + .select(&duration_selector) + .next() + .map(|value| parse_time_to_seconds(&Self::text_of(&value)).unwrap_or(0) as u32) + .unwrap_or(0); + + let views = card + .select(&views_selector) + .next() + .and_then(|value| Self::parse_views(&Self::text_of(&value))); + + let mut item = + VideoItem::new(id, title, url, "rule34video".to_string(), thumb, duration); + if let Some(views) = views { + item = item.views(views); + } + items.push(item); + } + + Ok(items) + } + + fn collect_link_values( + &self, + document: &Html, + selector: &Selector, + normalize_url: F, + ) -> Vec<(String, String)> + where + F: Fn(&str) -> String, + { + let mut seen = HashSet::new(); + let mut values = Vec::new(); + + for link in document.select(selector) { + let Some(href) = link.value().attr("href") else { + continue; + }; + let id = normalize_url(href); + let title = Self::strip_counter_suffix(&Self::decode_html(&Self::text_of(&link))); + if id.is_empty() || title.is_empty() { + continue; + } + if seen.insert(id.clone()) { + values.push((id, title)); + } + } + + values + } + + fn dedupe_terms(values: Vec) -> Vec { + let mut seen = HashSet::new(); + let mut deduped = Vec::new(); + for value in values { + let normalized = Self::normalize_title(&value); + if normalized.is_empty() || !seen.insert(normalized) { + continue; + } + deduped.push(value); + } + deduped + } + + fn apply_detail_video(&self, mut item: VideoItem, html: &str) -> VideoItem { + if html.trim().is_empty() { + return item; + } + + let document = Html::parse_document(html); + let title_selector = Self::selector("h1, .headline h1, .headline .title").ok(); + let category_selector = Self::selector("a[href*=\"/categories/\"]").ok(); + let artist_selector = Self::selector("a[href*=\"/artists/\"]").ok(); + let uploader_selector = Self::selector("a[href*=\"/members/\"]").ok(); + let tag_selector = Self::selector("a[href*=\"/tags/\"]").ok(); + + if item.title.is_empty() { + if let Some(selector) = &title_selector { + if let Some(title) = document.select(selector).next() { + let value = Self::decode_html(&Self::text_of(&title)); + if !value.is_empty() { + item.title = value; + } } } } - let mut requester = options.requester.clone().ok_or("Requester missing")?; - let url = format!( - "{}/?mode=async&function=get_block&block_id=custom_list_videos_most_recent_videos&tag_ids=&sort_by={}&from={}&_={}", - self.url, sort_val, page, timestamp_millis - ); + let categories = category_selector + .as_ref() + .map(|selector| { + self.collect_link_values(&document, selector, |href| self.normalize_url(href)) + }) + .unwrap_or_default(); + let artists = artist_selector + .as_ref() + .map(|selector| { + self.collect_link_values(&document, selector, |href| self.normalize_url(href)) + }) + .unwrap_or_default(); + let uploaders = uploader_selector + .as_ref() + .map(|selector| { + self.collect_link_values(&document, selector, |href| { + self.normalize_member_url(href) + }) + }) + .unwrap_or_default(); + let tags = tag_selector + .as_ref() + .map(|selector| { + self.collect_link_values(&document, selector, |href| self.normalize_url(href)) + }) + .unwrap_or_default(); - let text = requester.get(&url, None).await.unwrap_or_else(|e| { - eprintln!("Error fetching rule34video URL {}: {}", url, e); - let _ = send_discord_error_report( - e.to_string(), - None, - Some(&url), - None, - file!(), - line!(), - module_path!(), - ); - "".to_string() - }); - let video_items = self.get_video_items_from_html(text); - - if !video_items.is_empty() { - cache.insert(index, video_items.clone()); - Ok(video_items) - } else { - // Return empty or old items if available - Ok(cache - .get(&index) - .map(|(_, items)| items) - .unwrap_or_default()) + for (id, title) in &categories { + self.add_category_filter(id, title); } - } - - async fn query( - &self, - cache: VideoCache, - page: u8, - query: &str, - sort: &str, - options: ServerOptions, - ) -> Result> { - let timestamp_millis = SystemTime::now() - .duration_since(UNIX_EPOCH) - .map(|d| d.as_millis()) - .unwrap_or(0); - - let expected_sorts = vec![ - "post_date", - "video_viewed", - "rating", - "duration", - "pseudo_random", - ]; - let sort_val = if expected_sorts.contains(&sort) { - sort - } else { - "post_date" - }; - - let index = format!("rule34video:{}:{}:{}", page, sort_val, query); - - if let Some((time, items)) = cache.get(&index) { - if time.elapsed().unwrap_or_default().as_secs() < 300 { - return Ok(items.clone()); - } + for (id, title) in &artists { + self.add_artist_filter(id, title); + } + for (id, title) in &uploaders { + self.add_uploader_filter(id, title); + } + for (id, title) in &tags { + self.add_tag_filter(id, title); } - let mut requester = options.requester.clone().ok_or("Requester missing")?; - let url = format!( - "{}/search/{}/?mode=async&function=get_block&block_id=custom_list_videos_videos_list_search&tag_ids=&sort_by={}&from_videos={}&from_albums={}&_={}", - self.url, - query.replace(" ", "-"), - sort_val, - page, - page, - timestamp_millis - ); - - let text = requester.get(&url, None).await.unwrap_or_else(|e| { - eprintln!("Error fetching rule34video URL {}: {}", url, e); - let _ = send_discord_error_report( - e.to_string(), - None, - Some(&url), - None, - file!(), - line!(), - module_path!(), - ); - "".to_string() - }); - let video_items = self.get_video_items_from_html(text); - - if !video_items.is_empty() { - cache.insert(index, video_items.clone()); - Ok(video_items) - } else { - Ok(cache - .get(&index) - .map(|(_, items)| items) - .unwrap_or_default()) - } - } - - fn get_video_items_from_html(&self, html: String) -> Vec { - if html.is_empty() { - return vec![]; + if let Some((uploader_url, uploader_name)) = uploaders.first() { + item = item.uploader(uploader_name.clone()); + item = item.uploader_url(uploader_url.clone()); + } else if let Some((_, artist_name)) = artists.first() { + item = item.uploader(artist_name.clone()); } - // Safely isolate the video listing section - let video_listing = match Self::extract_between( - &html, - "id=\"custom_list_videos", - "
content, - None => return vec![], - }; - - let mut items = Vec::new(); - // Skip the first split result as it's the preamble - let raw_videos = video_listing - .split("
", "<") - .unwrap_or("Unknown"); - let title = decode(title_raw.as_bytes()) - .to_string() - .unwrap_or_else(|_| title_raw.to_string()); - - // ID extraction - let id = Self::extract_between(video_segment, "https://rule34video.com/video/", "/") - .unwrap_or("0") - .to_string(); - - // Duration extraction - let raw_duration = - Self::extract_between(video_segment, "
", "<").unwrap_or("0:00"); - let duration = parse_time_to_seconds(raw_duration).unwrap_or(0) as u32; - - // Views extraction - let views_segment = Self::extract_between(video_segment, "
", "<"); - let views_count_str = views_segment - .and_then(|s| s.split("").nth(1)) - .unwrap_or("0"); - let views = parse_abbreviated_number(views_count_str.trim()).unwrap_or(0); - - // Thumbnail extraction - let thumb = Self::extract_between(video_segment, "data-original=\"", "\"") - .unwrap_or("") - .to_string(); - - // URL extraction - let url = - Self::extract_between(video_segment, " Vec { - let page_num = page.parse::().unwrap_or(1); + let page = page.parse::().unwrap_or(1); + let sort = Self::expected_sort(&sort); + let use_cache = sort != "pseudo_random"; - let result = match query { - Some(q) => self.query(cache, page_num, &q, &sort, options).await, - None => self.get(cache, page_num, &sort, options).await, + let (cache_key, url) = match ( + self.resolve_option_target(&options), + query + .as_deref() + .and_then(|value| self.resolve_query_target(value)), + query.as_deref(), + ) { + (Some(target), _, _) => ( + format!("rule34video:target:{}:{}:{}", page, sort, target.url), + self.build_filtered_url(&target.url, page, sort), + ), + (None, Some(target), Some(query)) => ( + format!( + "rule34video:query-target:{}:{}:{}", + page, + sort, + Self::normalize_title(query) + ), + self.build_filtered_url(&target.url, page, sort), + ), + (None, None, Some(query)) => ( + format!( + "rule34video:search:{}:{}:{}", + page, + sort, + Self::normalize_title(query) + ), + self.build_search_url(query, page, sort), + ), + (None, None, None) => ( + format!("rule34video:index:{}:{}", page, sort), + self.build_top_level_url(page, sort), + ), + (None, Some(target), None) => ( + format!("rule34video:target:{}:{}:{}", page, sort, target.url), + self.build_filtered_url(&target.url, page, sort), + ), }; - match result { - Ok(v) => v, - Err(e) => { - eprintln!("Error fetching videos: {}", e); + match self + .fetch_page_items(cache, cache_key, url, use_cache, options) + .await + { + Ok(items) => items, + Err(error) => { + eprintln!("Error fetching Rule34Video videos: {error}"); vec![] } } @@ -332,3 +795,93 @@ impl Provider for Rule34videoProvider { Some(self.build_channel(clientversion)) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn builds_search_url_with_async_endpoint() { + let provider = Rule34videoProvider::new(); + let url = provider.build_search_url("friendly encounter", 2, "rating"); + assert!(url.contains("/search/friendly-encounter/")); + assert!(url.contains("sort_by=rating")); + assert!(url.contains("from_videos=2")); + } + + #[test] + fn parses_listing_cards() { + let provider = Rule34videoProvider::new(); + let html = r#" +
+ + + +
Friendly Encounter Preview
+
10:05
+
1.2K
+
+ "#; + + let items = provider.parse_list_videos(html).unwrap(); + assert_eq!(items.len(), 1); + assert_eq!(items[0].id, "4288578"); + assert_eq!(items[0].title, "Friendly Encounter Preview"); + assert_eq!(items[0].duration, 605); + assert_eq!(items[0].views, Some(1200)); + } + + #[test] + fn applies_detail_metadata_and_populates_filters() { + let provider = Rule34videoProvider::new(); + let item = VideoItem::new( + "4288578".to_string(), + "Friendly Encounter Preview".to_string(), + "https://rule34video.com/video/4288578/friendly-encounter-preview/".to_string(), + "rule34video".to_string(), + "https://img.example/thumb.jpg".to_string(), + 605, + ); + let html = r#" +

Friendly Encounter Preview

+
+ Category + Pokemon + Monster + Artist + Jackerman + Uploaded by + Jackerman + Tags + 3d + animated +
+ "#; + + let item = provider.apply_detail_video(item, html); + assert_eq!(item.uploader, Some("Jackerman".to_string())); + assert_eq!( + item.uploaderUrl, + Some("https://rule34video.com/members/123/jackerman/videos/".to_string()) + ); + assert_eq!( + item.tags, + Some(vec![ + "Pokemon".to_string(), + "Monster".to_string(), + "Jackerman".to_string(), + "3d".to_string(), + "animated".to_string(), + ]) + ); + + let artists = provider.artists.read().unwrap(); + let uploaders = provider.uploaders.read().unwrap(); + let categories = provider.categories.read().unwrap(); + let tags = provider.tags.read().unwrap(); + assert!(artists.iter().any(|value| value.title == "Jackerman")); + assert!(uploaders.iter().any(|value| value.title == "Jackerman")); + assert!(categories.iter().any(|value| value.title == "Pokemon")); + assert!(tags.iter().any(|value| value.title == "animated")); + } +} diff --git a/src/proxies/mod.rs b/src/proxies/mod.rs index c9e5b55..c2af138 100644 --- a/src/proxies/mod.rs +++ b/src/proxies/mod.rs @@ -7,6 +7,7 @@ pub mod hanimecdn; pub mod hqpornerthumb; pub mod javtiful; pub mod noodlemagazine; +pub mod porndishthumb; pub mod spankbang; pub mod sxyprn; diff --git a/src/proxies/porndishthumb.rs b/src/proxies/porndishthumb.rs new file mode 100644 index 0000000..61e88d1 --- /dev/null +++ b/src/proxies/porndishthumb.rs @@ -0,0 +1,62 @@ +use ntex::http::header::CONTENT_TYPE; +use ntex::{ + http::Response, + web::{self, HttpRequest, error}, +}; +use std::process::Command; + +use crate::util::requester::Requester; + +pub async fn get_image( + req: HttpRequest, + _requester: web::types::State, +) -> Result { + let endpoint = req.match_info().query("endpoint").to_string(); + let image_url = if endpoint.starts_with("http://") || endpoint.starts_with("https://") { + endpoint + } else { + format!("https://{}", endpoint.trim_start_matches('/')) + }; + + let output = tokio::task::spawn_blocking(move || { + Command::new("python3") + .arg("-c") + .arg( + r#" +import sys +from curl_cffi import requests + +url = sys.argv[1] +response = requests.get( + url, + impersonate="chrome", + timeout=30, + allow_redirects=True, + headers={"Referer": "https://www.porndish.com/"}, +) +if response.status_code >= 400: + sys.stderr.write(f"status={response.status_code}\n") + sys.exit(1) +sys.stderr.write(response.headers.get("content-type", "application/octet-stream")) +sys.stdout.buffer.write(response.content) +"#, + ) + .arg(image_url) + .output() + }) + .await + .map_err(error::ErrorBadGateway)? + .map_err(error::ErrorBadGateway)?; + + if !output.status.success() { + return Ok(web::HttpResponse::NotFound().finish()); + } + + let content_type = String::from_utf8_lossy(&output.stderr).trim().to_string(); + let mut resp = Response::build(ntex::http::StatusCode::OK); + if !content_type.is_empty() { + resp.set_header(CONTENT_TYPE, content_type); + } + + Ok(resp.body(output.stdout)) +} diff --git a/src/proxy.rs b/src/proxy.rs index a3b0422..3794696 100644 --- a/src/proxy.rs +++ b/src/proxy.rs @@ -36,6 +36,11 @@ pub fn config(cfg: &mut web::ServiceConfig) { web::resource("/hqporner-thumb/{endpoint}*") .route(web::post().to(crate::proxies::hqpornerthumb::get_image)) .route(web::get().to(crate::proxies::hqpornerthumb::get_image)), + ) + .service( + web::resource("/porndish-thumb/{endpoint}*") + .route(web::post().to(crate::proxies::porndishthumb::get_image)) + .route(web::get().to(crate::proxies::porndishthumb::get_image)), ); } diff --git a/src/videos.rs b/src/videos.rs index e385e3c..d1c9029 100644 --- a/src/videos.rs +++ b/src/videos.rs @@ -53,6 +53,30 @@ pub struct VideosRequest { pub duration: Option, } +#[derive(serde::Serialize, serde::Deserialize, Debug)] +pub struct UploaderRequest { + pub uploader: Option, + pub title: Option, + pub uploaderUrl: Option, + pub uploaderId: Option, + pub channel: Option, + pub sort: Option, + pub query: Option, + pub page: Option, + pub perPage: Option, + pub featured: Option, + pub category: Option, + pub sites: Option, + pub all_provider_sites: Option, + pub filter: Option, + pub language: Option, + pub networks: Option, + pub stars: Option, + pub categories: Option, + pub duration: Option, + pub sexuality: Option, +} + #[derive(serde::Serialize, serde::Deserialize, Debug, Clone)] pub struct ServerOptions { pub featured: Option, // "featured", @@ -405,3 +429,23 @@ pub struct Videos { pub pageInfo: PageInfo, pub items: Vec, } + +#[derive(serde::Serialize, Debug)] +pub struct LayoutRow { + pub id: String, + #[serde(rename = "type")] + pub row_type: String, + pub title: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub subtitle: Option, + pub pageInfo: PageInfo, + pub items: Vec, +} + +#[derive(serde::Serialize, Debug)] +pub struct UploaderResponse { + pub id: String, + pub title: String, + pub uploader: String, + pub rows: Vec, +}