diff --git a/build.rs b/build.rs
index 8d0e856..6d269eb 100644
--- a/build.rs
+++ b/build.rs
@@ -204,6 +204,11 @@ const PROVIDERS: &[ProviderDef] = &[
         module: "javtiful",
         ty: "JavtifulProvider",
     },
+    ProviderDef {
+        id: "supjav",
+        module: "supjav",
+        ty: "SupjavProvider",
+    },
     ProviderDef {
         id: "hypnotube",
         module: "hypnotube",
diff --git a/src/providers/supjav.rs b/src/providers/supjav.rs
new file mode 100644
index 0000000..420e308
--- /dev/null
+++ b/src/providers/supjav.rs
@@ -0,0 +1,1908 @@
+use crate::DbPool;
+use crate::api::ClientVersion;
+use crate::providers::{Provider, report_provider_error, report_provider_error_background};
+use crate::status::*;
+use crate::util::cache::VideoCache;
+use crate::util::parse_abbreviated_number;
+use crate::videos::{ServerOptions, VideoFormat, VideoItem};
+use async_trait::async_trait;
+use chrono::NaiveDate;
+use error_chain::error_chain;
+use futures::stream::{self, StreamExt};
+use htmlentity::entity::{ICodedDataTrait, decode};
+use regex::Regex;
+use scraper::{ElementRef, Html, Selector};
+use serde::Deserialize;
+use std::collections::HashSet;
+use std::process::Command;
+use std::sync::{Arc, RwLock};
+use std::thread;
+use url::Url;
+
+pub const CHANNEL_METADATA: crate::providers::ProviderChannelMetadata =
+    crate::providers::ProviderChannelMetadata {
+        group_id: "jav",
+        tags: &["jav", "asian", "archive"],
+    };
+
+const BASE_URL: &str = "https://supjav.com";
+const CHANNEL_ID: &str = "supjav";
+const PLAYER_BASE_URL: &str = "https://lk1.supremejav.com/supjav.php";
+
+error_chain! {
+    foreign_links {
+        Io(std::io::Error);
+        Json(serde_json::Error);
+    }
+    errors {
+        Parse(msg: String) {
+            description("parse error")
+            display("parse error: {}", msg)
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct SupjavProvider {
+    url: String,
+    categories: Vec<FilterOption>,
+    tags: Arc<RwLock<Vec<FilterOption>>>,
+    uploaders: Arc<RwLock<Vec<FilterOption>>>,
+    stars: Arc<RwLock<Vec<FilterOption>>>,
+}
+
+#[derive(Debug, Clone)]
+struct ArchiveContext {
+    inherited_tags: Vec<String>,
+    uploader: Option<(String, String)>,
+}
+
+#[derive(Debug, Clone)]
+struct PlayerCandidate {
+    label: String,
+    token: String,
+}
+
+#[derive(Debug, Deserialize)]
+struct CurlFetchResponse {
+    status: u16,
+    url: String,
+    text: String,
+}
+
+impl SupjavProvider {
+    pub fn new() -> Self {
+        let provider = Self {
+            url: BASE_URL.to_string(),
+            categories: vec![
+                FilterOption {
+                    id: "all".to_string(),
+                    title: "All".to_string(),
+                },
+                FilterOption {
+                    id: format!("{BASE_URL}/category/censored-jav"),
+                    title: "Censored JAV".to_string(),
+                },
+                FilterOption {
+                    id: format!("{BASE_URL}/category/uncensored-jav"),
+                    title: "Uncensored JAV".to_string(),
+                },
+                FilterOption {
+                    id: format!("{BASE_URL}/category/amateur"),
+                    title: "Amateur".to_string(),
+                },
+                FilterOption {
+                    id: format!("{BASE_URL}/category/chinese-subtitles"),
+                    title: "Chinese Subtitles".to_string(),
+                },
+                FilterOption {
+                    id: format!("{BASE_URL}/category/english-subtitles"),
+                    title: "English Subtitles".to_string(),
+                },
+                FilterOption {
+                    id: format!("{BASE_URL}/category/reducing-mosaic"),
+                    title: "Reducing Mosaic".to_string(),
+                },
+            ],
+            tags: Arc::new(RwLock::new(vec![FilterOption {
+                id: "all".to_string(),
+                title: "All".to_string(),
+            }])),
+            uploaders: Arc::new(RwLock::new(vec![FilterOption {
+                id: "all".to_string(),
+                title: "All".to_string(),
+            }])),
+            stars: Arc::new(RwLock::new(vec![FilterOption {
+                id: "all".to_string(),
+                title: "All".to_string(),
+            }])),
+        };
+        provider.spawn_initial_load();
+        provider
+    }
+
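+    // Kicks off a one-shot background crawl of the tag/maker/cast indexes so the
+    // channel filters fill in shortly after startup without blocking `new()`.
+    // A dedicated thread with its own current-thread runtime is used because
+    // `new()` is synchronous and may be called outside any Tokio context.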
+    fn spawn_initial_load(&self) {
+        let tags = Arc::clone(&self.tags);
+        let uploaders = Arc::clone(&self.uploaders);
+        let stars = Arc::clone(&self.stars);
+
+        thread::spawn(move || {
+            let runtime = match tokio::runtime::Builder::new_current_thread()
+                .enable_all()
+                .build()
+            {
+                Ok(runtime) => runtime,
+                Err(error) => {
+                    report_provider_error_background(
+                        CHANNEL_ID,
+                        "spawn_initial_load.runtime_build",
+                        &error.to_string(),
+                    );
+                    return;
+                }
+            };
+
+            runtime.block_on(async move {
+                if let Err(error) =
+                    Self::load_filter_index(format!("{BASE_URL}/tag"), Arc::clone(&tags), "tag")
+                        .await
+                {
+                    report_provider_error_background(
+                        CHANNEL_ID,
+                        "spawn_initial_load.tags",
+                        &error.to_string(),
+                    );
+                }
+                if let Err(error) = Self::load_filter_index(
+                    format!("{BASE_URL}/maker"),
+                    Arc::clone(&uploaders),
+                    "maker",
+                )
+                .await
+                {
+                    report_provider_error_background(
+                        CHANNEL_ID,
+                        "spawn_initial_load.uploaders",
+                        &error.to_string(),
+                    );
+                }
+                if let Err(error) =
+                    Self::load_filter_index(format!("{BASE_URL}/cast"), Arc::clone(&stars), "cast")
+                        .await
+                {
+                    report_provider_error_background(
+                        CHANNEL_ID,
+                        "spawn_initial_load.stars",
+                        &error.to_string(),
+                    );
+                }
+            });
+        });
+    }
+
+    fn build_channel(&self, _clientversion: ClientVersion) -> Channel {
+        let tags = self.tags.read().map(|value| value.clone()).unwrap_or_default();
+        let uploaders = self
+            .uploaders
+            .read()
+            .map(|value| value.clone())
+            .unwrap_or_default();
+        let stars = self.stars.read().map(|value| value.clone()).unwrap_or_default();
+
+        Channel {
+            id: CHANNEL_ID.to_string(),
+            name: "SupJav".to_string(),
+            description:
+                "SupJav listings with maker, cast, genre, and direct HLS playback links."
+                    .to_string(),
+            premium: false,
+            favicon: "https://www.google.com/s2/favicons?sz=64&domain=supjav.com".to_string(),
+            status: "active".to_string(),
+            categories: self
+                .categories
+                .iter()
+                .skip(1)
+                .map(|value| value.title.clone())
+                .collect(),
+            options: vec![
+                ChannelOption {
+                    id: "sort".to_string(),
+                    title: "Sort".to_string(),
+                    description: "Browse the latest or SupJav popular charts.".to_string(),
+                    systemImage: "list.number".to_string(),
+                    colorName: "blue".to_string(),
+                    options: vec![
+                        FilterOption {
+                            id: "new".to_string(),
+                            title: "Latest".to_string(),
+                        },
+                        FilterOption {
+                            id: "popular_day".to_string(),
+                            title: "Popular Today".to_string(),
+                        },
+                        FilterOption {
+                            id: "popular_week".to_string(),
+                            title: "Popular This Week".to_string(),
+                        },
+                        FilterOption {
+                            id: "popular_month".to_string(),
+                            title: "Popular This Month".to_string(),
+                        },
+                    ],
+                    multiSelect: false,
+                },
+                ChannelOption {
+                    id: "filter".to_string(),
+                    title: "Category".to_string(),
+                    description: "Browse a SupJav category archive directly.".to_string(),
+                    systemImage: "line.horizontal.3.decrease.circle".to_string(),
+                    colorName: "green".to_string(),
+                    options: self.categories.clone(),
+                    multiSelect: false,
+                },
+                ChannelOption {
+                    id: "sites".to_string(),
+                    title: "Makers".to_string(),
+                    description: "Browse maker archives and reuse them as uploader targets."
+                        .to_string(),
+                    systemImage: "building.2".to_string(),
+                    colorName: "purple".to_string(),
+                    options: uploaders,
+                    multiSelect: false,
+                },
+                ChannelOption {
+                    id: "stars".to_string(),
+                    title: "Cast".to_string(),
+                    description: "Browse cast archives directly.".to_string(),
+                    systemImage: "person.2".to_string(),
+                    colorName: "pink".to_string(),
+                    options: stars,
+                    multiSelect: false,
+                },
+                ChannelOption {
+                    id: "categories".to_string(),
+                    title: "Genres".to_string(),
+                    description: "Browse a genre/tag archive directly.".to_string(),
+                    systemImage: "tag.fill".to_string(),
+                    colorName: "orange".to_string(),
+                    options: tags,
+                    multiSelect: false,
+                },
+            ],
+            nsfw: true,
+            cacheDuration: Some(1800),
+        }
+    }
+
+    fn selector(value: &str) -> Result<Selector> {
+        Selector::parse(value)
+            .map_err(|error| Error::from(format!("selector `{value}` parse failed: {error}")))
+    }
+
+    fn regex(value: &str) -> Result<Regex> {
+        Regex::new(value).map_err(|error| Error::from(format!("regex `{value}` failed: {error}")))
+    }
+
+    fn decode_text(value: &str) -> String {
+        decode(value.as_bytes())
+            .to_string()
+            .unwrap_or_else(|_| value.to_string())
+            .replace('\u{a0}', " ")
+            .split_whitespace()
+            .collect::<Vec<_>>()
+            .join(" ")
+            .trim()
+            .to_string()
+    }
+
+    fn text_of(element: &ElementRef<'_>) -> String {
+        Self::decode_text(&element.text().collect::<Vec<_>>().join(" "))
+    }
+
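+    // Archive links render as e.g. "DAHLIA (937)"; the trailing "(<count>)" is a
+    // post count, not part of the name, so it is stripped before matching/display.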
+    fn strip_count_suffix(value: &str) -> String {
+        let trimmed = value.trim();
+        if let Some(index) = trimmed.rfind(" (") {
+            if trimmed.ends_with(')')
+                && trimmed[index + 2..trimmed.len() - 1]
+                    .chars()
+                    .all(|ch| ch.is_ascii_digit())
+            {
+                return trimmed[..index].trim().to_string();
+            }
+        }
+        trimmed.to_string()
+    }
+
+    fn normalize_title(value: &str) -> String {
+        Self::strip_count_suffix(value)
+            .split_whitespace()
+            .collect::<Vec<_>>()
+            .join(" ")
+            .trim()
+            .to_ascii_lowercase()
+    }
+
+    fn push_unique(target: &Arc<RwLock<Vec<FilterOption>>>, item: FilterOption) {
+        if item.id.trim().is_empty() || item.title.trim().is_empty() {
+            return;
+        }
+        if let Ok(mut values) = target.write() {
+            if !values.iter().any(|value| value.id == item.id) {
+                values.push(item);
+            }
+        }
+    }
+
+    fn push_unique_string(target: &mut Vec<String>, value: &str) {
+        let clean = Self::decode_text(value);
+        if clean.is_empty() || target.iter().any(|existing| existing == &clean) {
+            return;
+        }
+        target.push(clean);
+    }
+
+    fn parse_views(value: &str) -> Option<u64> {
+        parse_abbreviated_number(
+            value
+                .replace("Views", "")
+                .replace("View", "")
+                .replace(',', "")
+                .trim(),
+        )
+    }
+
+    fn parse_uploaded_at(value: &str) -> Option<u64> {
+        NaiveDate::parse_from_str(value.trim(), "%Y/%m/%d")
+            .ok()
+            .and_then(|value| value.and_hms_opt(0, 0, 0))
+            .map(|value| value.and_utc().timestamp() as u64)
+    }
+
+    fn normalize_url(&self, value: &str) -> String {
+        if value.is_empty() {
+            return String::new();
+        }
+        if value.starts_with("http://") || value.starts_with("https://") {
+            return value.to_string();
+        }
+        if value.starts_with("//") {
+            return format!("https:{value}");
+        }
+        if value.starts_with('/') {
+            return format!("{}{}", self.url, value);
+        }
+        format!("{}/{}", self.url, value.trim_start_matches("./"))
+    }
+
+    fn normalize_archive_target(&self, value: &str) -> Option<String> {
+        let trimmed = value.trim();
+        if trimmed.is_empty() || trimmed == "all" {
+            return None;
+        }
+
+        // The generic API request reuses `filter` for sort choices like `new`.
+        // Only treat values that already look like links as archive targets.
+        let looks_like_link = trimmed.starts_with("http://")
+            || trimmed.starts_with("https://")
+            || trimmed.starts_with("//")
+            || trimmed.starts_with('/')
+            || trimmed.starts_with("./")
+            || trimmed.starts_with("../")
+            || trimmed.starts_with('?');
+        if !looks_like_link {
+            return None;
+        }
+
+        Some(self.normalize_url(trimmed))
+    }
+
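+    // SupJav's server buttons carry the player token reversed in `data-link`;
+    // reversing it back yields the `c=` query parameter the supremejav player
+    // page expects.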
+    fn make_player_url(token: &str) -> String {
+        let reversed = token.chars().rev().collect::<String>();
+        format!("{PLAYER_BASE_URL}?c={reversed}")
+    }
+
+    fn canonical_uploader_id(uploader_url: &str) -> Option<String> {
+        let url = Url::parse(uploader_url).ok()?;
+        let mut segments = url.path_segments()?;
+        let first = segments.next()?;
+        let second = segments.next()?;
+        let third = segments.next()?;
+        if first == "category" && second == "maker" && !third.is_empty() {
+            return Some(format!("{CHANNEL_ID}:maker:{third}"));
+        }
+        None
+    }
+
+    fn build_archive_page_url(base_url: &str, page: u16) -> Result<String> {
+        let mut url =
+            Url::parse(base_url).map_err(|error| Error::from(format!("invalid url: {error}")))?;
+        let path = url.path().trim_end_matches('/').to_string();
+        let next_path = if page <= 1 {
+            if path.is_empty() { "/".to_string() } else { path }
+        } else if path.is_empty() || path == "/" {
+            format!("/page/{page}")
+        } else {
+            format!("{path}/page/{page}")
+        };
+        url.set_path(&next_path);
+        Ok(url.to_string())
+    }
+
+    fn build_home_url(&self, page: u16) -> Result<String> {
+        Self::build_archive_page_url(&self.url, page)
+    }
+
+    fn build_popular_url(&self, page: u16, sort: &str) -> Result<String> {
+        let mut url = Url::parse(&format!("{}/popular", self.url))
+            .map_err(|error| Error::from(format!("invalid popular url: {error}")))?;
+        match sort {
+            "popular_week" => url.set_query(Some("sort=week")),
+            "popular_month" => url.set_query(Some("sort=month")),
+            _ => {}
+        }
+        Self::build_archive_page_url(url.as_str(), page)
+    }
+
+    fn build_search_url(&self, query: &str, page: u16) -> Result<String> {
+        let mut url = Url::parse(&self.url)
+            .map_err(|error| Error::from(format!("invalid search url: {error}")))?;
+        if page > 1 {
+            url.set_path(&format!("/page/{page}"));
+        }
+        url.query_pairs_mut().clear().append_pair("s", query);
+        Ok(url.to_string())
+    }
+
+    fn match_filter(options: &[FilterOption], query: &str) -> Option<String> {
+        let normalized_query = Self::normalize_title(query);
+        options
+            .iter()
+            .find(|value| Self::normalize_title(&value.title) == normalized_query)
+            .map(|value| value.id.clone())
+    }
+
+    fn resolve_option_target(&self, options: &ServerOptions) -> Option<String> {
+        for value in [
+            options.sites.as_deref(),
+            options.stars.as_deref(),
+            options.categories.as_deref(),
+            options.category.as_deref(),
+            options.filter.as_deref(),
+        ] {
+            let Some(value) = value else {
+                continue;
+            };
+            if let Some(target) = self.normalize_archive_target(value) {
+                return Some(target);
+            }
+        }
+        None
+    }
+
+    fn resolve_query_target(&self, query: &str) -> Option<String> {
+        if let Ok(uploaders) = self.uploaders.read() {
+            if let Some(target) = Self::match_filter(&uploaders, query) {
+                return self.normalize_archive_target(&target);
+            }
+        }
+        if let Ok(stars) = self.stars.read() {
+            if let Some(target) = Self::match_filter(&stars, query) {
+                return self.normalize_archive_target(&target);
+            }
+        }
+        if let Ok(tags) = self.tags.read() {
+            if let Some(target) = Self::match_filter(&tags, query) {
+                return self.normalize_archive_target(&target);
+            }
+        }
+        Self::match_filter(&self.categories, query)
+            .and_then(|target| self.normalize_archive_target(&target))
+    }
+
+    fn filters_need_loading(&self) -> bool {
+        let tags_len = self.tags.read().map(|value| value.len()).unwrap_or_default();
+        let uploaders_len = self
+            .uploaders
+            .read()
+            .map(|value| value.len())
+            .unwrap_or_default();
+        let stars_len = self.stars.read().map(|value| value.len()).unwrap_or_default();
+        tags_len <= 1 || uploaders_len <= 1 || stars_len <= 1
+    }
+
+    async fn ensure_filters_loaded(&self) {
+        if !self.filters_need_loading() {
+            return;
+        }
+
+        if self.tags.read().map(|value| value.len()).unwrap_or_default() <= 1 {
+            if let Err(error) =
+                Self::load_filter_index(format!("{BASE_URL}/tag"), Arc::clone(&self.tags), "tag")
+                    .await
+            {
+                report_provider_error_background(
+                    CHANNEL_ID,
+                    "ensure_filters_loaded.tags",
+                    &error.to_string(),
+                );
+            }
+        }
+
+        if self
+            .uploaders
+            .read()
+            .map(|value| value.len())
+            .unwrap_or_default()
+            <= 1
+        {
+            if let Err(error) = Self::load_filter_index(
+                format!("{BASE_URL}/maker"),
+                Arc::clone(&self.uploaders),
+                "maker",
+            )
+            .await
+            {
+                report_provider_error_background(
+                    CHANNEL_ID,
+                    "ensure_filters_loaded.uploaders",
+                    &error.to_string(),
+                );
+            }
+        }
+
+        if self.stars.read().map(|value| value.len()).unwrap_or_default() <= 1 {
+            if let Err(error) =
+                Self::load_filter_index(format!("{BASE_URL}/cast"), Arc::clone(&self.stars), "cast")
+                    .await
+            {
+                report_provider_error_background(
+                    CHANNEL_ID,
+                    "ensure_filters_loaded.stars",
+                    &error.to_string(),
+                );
+            }
+        }
+    }
+
+    async fn resolve_query_target_async(&self, query: &str) -> Option<String> {
+        if let Some(target) = self.resolve_query_target(query) {
+            return Some(target);
+        }
+
+        if self.filters_need_loading() {
+            self.ensure_filters_loaded().await;
+            return self.resolve_query_target(query);
+        }
+
+        None
+    }
+
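+    // Fetches shell out to python3 + curl_cffi with Chrome impersonation (plain
+    // HTTP clients tend to get blocked by the site's TLS fingerprinting), and the
+    // helper script returns status, final URL, and body as one JSON object on
+    // stdout.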
+    async fn fetch_with_curl_cffi(url: &str, referer: Option<&str>) -> Result<CurlFetchResponse> {
+        let url = url.to_string();
+        let referer = referer.unwrap_or("").to_string();
+
+        let output = tokio::task::spawn_blocking(move || {
+            Command::new("python3")
+                .arg("-c")
+                .arg(
+                    r#"
+import json
+import sys
+from curl_cffi import requests
+
+url = sys.argv[1]
+referer = sys.argv[2] if len(sys.argv) > 2 else ""
+headers = {}
+if referer:
+    headers["Referer"] = referer
+
+response = requests.get(
+    url,
+    impersonate="chrome",
+    timeout=30,
+    allow_redirects=True,
+    headers=headers,
+)
+
+print(json.dumps({
+    "status": response.status_code,
+    "url": response.url,
+    "text": response.text,
+}))
+"#,
+                )
+                .arg(url)
+                .arg(referer)
+                .output()
+        })
+        .await
+        .map_err(|error| Error::from(format!("spawn_blocking failed: {error}")))?
+        .map_err(|error| Error::from(format!("python3 execution failed: {error}")))?;
+
+        if !output.status.success() {
+            let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
+            return Err(Error::from(format!("curl_cffi request failed: {stderr}")));
+        }
+
+        serde_json::from_slice::<CurlFetchResponse>(&output.stdout)
+            .map_err(|error| Error::from(format!("curl_cffi json parse failed: {error}")))
+    }
+
+    async fn fetch_html(url: &str, referer: Option<&str>) -> Result<String> {
+        let response = Self::fetch_with_curl_cffi(url, referer).await?;
+        if response.status >= 400 {
+            return Err(Error::from(format!(
+                "request failed status={} url={}",
+                response.status, response.url
+            )));
+        }
+        Ok(response.text)
+    }
+
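+    // Walks a paginated index (tag/maker/cast) front to back, merging every
+    // page's entries into the shared filter list; individual page failures are
+    // reported and skipped so one bad page doesn't lose the whole index.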
+    async fn load_filter_index(
+        start_url: String,
+        target: Arc<RwLock<Vec<FilterOption>>>,
+        kind: &str,
+    ) -> Result<()> {
+        let first_html = Self::fetch_html(&start_url, Some(BASE_URL)).await?;
+        let page_count = Self::pagination_last_page(&first_html)?;
+        Self::parse_filter_page(&first_html, &target, kind)?;
+
+        for page in 2..=page_count {
+            let page_url = Self::build_archive_page_url(&start_url, page as u16)?;
+            let html = match Self::fetch_html(&page_url, Some(BASE_URL)).await {
+                Ok(html) => html,
+                Err(error) => {
+                    report_provider_error_background(
+                        CHANNEL_ID,
+                        "load_filter_index.page",
+                        &format!("kind={kind}; url={page_url}; error={error}"),
+                    );
+                    continue;
+                }
+            };
+            Self::parse_filter_page(&html, &target, kind)?;
+        }
+
+        Ok(())
+    }
+
+    fn pagination_last_page(html: &str) -> Result<usize> {
+        let document = Html::parse_document(html);
+        let link_selector =
+            Self::selector(".pagination a[href], .page-numbers[href], .wp-pagenavi a[href]")?;
+        let mut last_page = 1usize;
+        for link in document.select(&link_selector) {
+            let text = Self::text_of(&link);
+            if let Ok(page) = text.parse::<usize>() {
+                last_page = last_page.max(page);
+            }
+        }
+        Ok(last_page)
+    }
+
+    fn parse_filter_page(
+        html: &str,
+        target: &Arc<RwLock<Vec<FilterOption>>>,
+        kind: &str,
+    ) -> Result<()> {
+        let document = Html::parse_document(html);
+        let link_selector = Self::selector("a[href]")?;
+
+        for link in document.select(&link_selector) {
+            let Some(href) = link.value().attr("href") else {
+                continue;
+            };
+            let title = Self::strip_count_suffix(&Self::text_of(&link));
+            if title.is_empty() {
+                continue;
+            }
+
+            let matched = match kind {
+                "tag" => {
+                    href.contains("/tag/") && !href.ends_with("/tag") && !href.contains("?sort=")
+                }
+                "maker" => href.contains("/category/maker/"),
+                "cast" => href.contains("/category/cast/"),
+                _ => false,
+            };
+            if !matched {
+                continue;
+            }
+
+            let id = if href.starts_with("//") {
+                format!("https:{href}")
+            } else if href.starts_with("http://") || href.starts_with("https://") {
+                href.to_string()
+            } else if href.starts_with('/') {
+                format!("{BASE_URL}{href}")
+            } else if href.starts_with("./") || href.starts_with("../") {
+                format!("{BASE_URL}/{}", href.trim_start_matches("./"))
+            } else {
+                // Anything else (anchors, javascript: pseudo-links, bare names) is skipped.
+                continue;
+            };
+            Self::push_unique(target, FilterOption { id, title });
+        }
+
+        Ok(())
+    }
+
+    fn context_for_target_url(&self, url: &str, archive_title: Option<&str>) -> ArchiveContext {
+        let mut context = ArchiveContext {
+            inherited_tags: vec![],
+            uploader: None,
+        };
+        let Some(title) = archive_title.map(Self::strip_count_suffix) else {
+            return context;
+        };
+
+        if let Ok(parsed) = Url::parse(url) {
+            let path = parsed.path();
+            if path.starts_with("/category/maker/") {
+                context.uploader = Some((title, url.to_string()));
+                return context;
+            }
+            if path.starts_with("/category/cast/") || path.starts_with("/tag/") {
+                context.inherited_tags.push(title);
+                return context;
+            }
+            if path.starts_with("/category/")
+                && !path.starts_with("/category/maker/")
+                && !path.starts_with("/category/cast/")
+            {
+                context.inherited_tags.push(title);
+            }
+        }
+
+        context
+    }
+
+    fn extract_archive_title(document: &Html) -> Result<Option<String>> {
+        let title_selector = Self::selector(".archive-title h1")?;
+        Ok(document
+            .select(&title_selector)
+            .next()
+            .map(|value| Self::strip_count_suffix(&Self::text_of(&value)))
+            .filter(|value| !value.is_empty()))
+    }
+
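+    // Parses one listing card (`div.post`): link and title from `h3 a`, the
+    // lazy-loaded thumb from `img.thumb[data-original]` (falling back to `src`),
+    // upload date and view count from the `.meta` block, then folds in the
+    // archive context (maker/tags) if present.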
+    fn parse_post_element(
+        &self,
+        element: &ElementRef<'_>,
+        context: &ArchiveContext,
+    ) -> Result<Option<VideoItem>> {
+        let image_selector = Self::selector("img.thumb")?;
+        let title_selector = Self::selector("h3 a[href]")?;
+        let meta_selector = Self::selector(".meta")?;
+        let views_selector = Self::selector(".meta .date")?;
+
+        let Some(title_link) = element.select(&title_selector).next() else {
+            return Ok(None);
+        };
+        let Some(href) = title_link.value().attr("href") else {
+            return Ok(None);
+        };
+        let page_url = self.normalize_url(href);
+        if page_url.is_empty() {
+            return Ok(None);
+        }
+
+        let title = Self::text_of(&title_link);
+        if title.is_empty() {
+            return Ok(None);
+        }
+
+        let id = page_url
+            .trim_end_matches('/')
+            .rsplit('/')
+            .next()
+            .unwrap_or_default()
+            .trim_end_matches(".html")
+            .to_string();
+        if id.is_empty() {
+            return Ok(None);
+        }
+
+        let thumb = element
+            .select(&image_selector)
+            .next()
+            .and_then(|image| {
+                image
+                    .value()
+                    .attr("data-original")
+                    .or_else(|| image.value().attr("src"))
+            })
+            .map(|value| self.normalize_url(value))
+            .unwrap_or_default();
+
+        let mut item = VideoItem::new(
+            id,
+            title,
+            page_url.clone(),
+            CHANNEL_ID.to_string(),
+            thumb,
+            0,
+        );
+
+        if let Some(meta) = element.select(&meta_selector).next() {
+            let raw_meta = Self::text_of(&meta);
+            let date_text = raw_meta
+                .split_whitespace()
+                .find(|part| part.contains('/'))
+                .unwrap_or_default()
+                .to_string();
+            item.uploadedAt = Self::parse_uploaded_at(&date_text);
+        }
+
+        if let Some(views) = element
+            .select(&views_selector)
+            .next()
+            .and_then(|value| Self::parse_views(&Self::text_of(&value)))
+        {
+            item.views = Some(views);
+        }
+
+        if let Some((uploader, uploader_url)) = context.uploader.as_ref() {
+            item.uploader = Some(uploader.clone());
+            item.uploaderUrl = Some(uploader_url.clone());
+            item.uploaderId = Self::canonical_uploader_id(uploader_url);
+        }
+
+        let mut tags = context.inherited_tags.clone();
+        if let Some(uploader) = item.uploader.as_ref() {
+            if !tags.iter().any(|value| value == uploader) {
+                tags.push(uploader.clone());
+            }
+        }
+        if !tags.is_empty() {
+            item.tags = Some(tags);
+        }
+
+        Ok(Some(item))
+    }
+
+    fn parse_posts(
+        &self,
+        scope: &ElementRef<'_>,
+        context: &ArchiveContext,
+    ) -> Result<Vec<VideoItem>> {
+        let post_selector = Self::selector("div.posts.clearfix div.post")?;
+        let mut items = Vec::new();
+        let mut seen_ids = HashSet::new();
+
+        for post in scope.select(&post_selector) {
+            let Some(item) = self.parse_post_element(&post, context)? else {
+                continue;
+            };
+            if seen_ids.insert(item.id.clone()) {
+                items.push(item);
+            }
+        }
+
+        Ok(items)
+    }
+
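+    // The SupJav home page is a stack of `.contents .content` sections, each
+    // with its own archive title; that title is inherited as a tag so home items
+    // keep their section (e.g. "Censored JAV") as context.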
+    fn parse_home_sections(&self, html: &str) -> Result<Vec<VideoItem>> {
+        let document = Html::parse_document(html);
+        let content_selector = Self::selector(".contents .content")?;
+        let title_selector = Self::selector(".archive-title h1")?;
+        let mut items = Vec::new();
+        let mut seen_ids = HashSet::new();
+
+        for content in document.select(&content_selector) {
+            let category = content
+                .select(&title_selector)
+                .next()
+                .map(|value| Self::strip_count_suffix(&Self::text_of(&value)))
+                .filter(|value| !value.is_empty());
+            let context = ArchiveContext {
+                inherited_tags: category.into_iter().collect(),
+                uploader: None,
+            };
+            for item in self.parse_posts(&content, &context)? {
+                if seen_ids.insert(item.id.clone()) {
+                    items.push(item);
+                }
+            }
+        }
+
+        Ok(items)
+    }
+
+    fn parse_listing_page(&self, url: &str, html: &str, sort: &str) -> Result<Vec<VideoItem>> {
+        let document = Html::parse_document(html);
+        let body_selector = Self::selector("body")?;
+        let is_home = document
+            .select(&body_selector)
+            .next()
+            .and_then(|body| body.value().attr("class"))
+            .is_some_and(|class_name| class_name.split_whitespace().any(|value| value == "home"));
+
+        // Only use the sectioned home-page parser for plain `/` or `/page/N` URLs
+        // without a query string; archive and search URLs share the flat layout.
+        let is_plain_home_url = Url::parse(url).ok().is_some_and(|parsed| {
+            parsed.query().is_none()
+                && (parsed.path() == "/" || parsed.path().starts_with("/page/"))
+        });
+        if is_home && sort == "new" && is_plain_home_url {
+            return self.parse_home_sections(html);
+        }
+
+        let title = Self::extract_archive_title(&document)?;
+        let context = self.context_for_target_url(url, title.as_deref());
+        for scope_selector in [".main .content", ".main", ".container"] {
+            let selector = Self::selector(scope_selector)?;
+            let Some(scope) = document.select(&selector).next() else {
+                continue;
+            };
+            let items = self.parse_posts(&scope, &context)?;
+            if !items.is_empty() {
+                return Ok(items);
+            }
+        }
+
+        Ok(vec![])
+    }
+
+    fn extract_player_candidates(html: &str) -> Result<Vec<PlayerCandidate>> {
+        let document = Html::parse_document(html);
+        let selector = Self::selector(".btn-server[data-link]")?;
+        let mut players = Vec::new();
+        for link in document.select(&selector) {
+            let label = Self::text_of(&link);
+            let token = link
+                .value()
+                .attr("data-link")
+                .unwrap_or_default()
+                .trim()
+                .to_string();
+            if label.is_empty() || token.is_empty() {
+                continue;
+            }
+            players.push(PlayerCandidate { label, token });
+        }
+        Ok(players)
+    }
+
+    fn extract_m3u8_url(html: &str) -> Result<Option<String>> {
+        let data_hash_regex = Self::regex(r#"data-hash="([^"]+\.m3u8[^"]*)""#)?;
+        if let Some(captures) = data_hash_regex.captures(html) {
+            if let Some(url) = captures.get(1).map(|value| value.as_str().to_string()) {
+                return Ok(Some(url));
+            }
+        }
+
+        let url_play_regex = Self::regex(r#"var\s+urlPlay\s*=\s*'([^']+\.m3u8[^']*)'"#)?;
+        Ok(url_play_regex
+            .captures(html)
+            .and_then(|captures| captures.get(1).map(|value| value.as_str().to_string())))
+    }
+
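+    // Expands an HLS master playlist into per-quality formats: the master itself
+    // is kept as an "auto" entry, then each `#EXT-X-STREAM-INF` variant becomes
+    // a format labelled by its RESOLUTION height (and BANDWIDTH when present).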
.format_id("master".to_string()), + ]; + let resolution_regex = Self::regex(r#"RESOLUTION=(\d+)x(\d+)"#)?; + let bandwidth_regex = Self::regex(r#"BANDWIDTH=(\d+)"#)?; + let mut lines = response.text.lines(); + + while let Some(line) = lines.next() { + if !line.starts_with("#EXT-X-STREAM-INF:") { + continue; + } + + let next_url = lines + .by_ref() + .find(|value| !value.trim().is_empty() && !value.starts_with('#')) + .unwrap_or_default() + .trim() + .to_string(); + if next_url.is_empty() { + continue; + } + + let resolved_url = base_url + .join(&next_url) + .map(|value| value.to_string()) + .unwrap_or(next_url.clone()); + let height = resolution_regex + .captures(line) + .and_then(|captures| captures.get(2)) + .and_then(|value| value.as_str().parse::().ok()); + let bandwidth = bandwidth_regex + .captures(line) + .and_then(|captures| captures.get(1)) + .and_then(|value| value.as_str().parse::().ok()); + let quality = height + .map(|value| format!("{value}p")) + .unwrap_or_else(|| "hls".to_string()); + let mut format = VideoFormat::new(resolved_url, quality.clone(), "m3u8".to_string()) + .format_note( + height + .map(|value| format!("{value}p")) + .unwrap_or_else(|| "variant".to_string()), + ) + .format_id( + height + .map(|value| format!("hls-{value}p")) + .unwrap_or_else(|| "hls-variant".to_string()), + ); + if let Some(bandwidth) = bandwidth { + format = format.format_note(format!("{quality} ({bandwidth}bps)")); + } + formats.push(format); + } + + Ok(formats) + } + + async fn resolve_player( + &self, + detail_url: &str, + candidates: &[PlayerCandidate], + ) -> Result)>> { + for candidate in candidates { + let player_url = Self::make_player_url(&candidate.token); + let response = match Self::fetch_with_curl_cffi(&player_url, Some(detail_url)).await { + Ok(response) => response, + Err(error) => { + report_provider_error_background( + CHANNEL_ID, + "resolve_player.fetch", + &format!("detail_url={detail_url}; label={}; error={error}", candidate.label), + ); + continue; + } + }; + if response.status >= 400 { + continue; + } + + let Some(master_url) = Self::extract_m3u8_url(&response.text)? 
+    async fn resolve_player(
+        &self,
+        detail_url: &str,
+        candidates: &[PlayerCandidate],
+    ) -> Result<Option<(String, Vec<VideoFormat>)>> {
+        for candidate in candidates {
+            let player_url = Self::make_player_url(&candidate.token);
+            let response = match Self::fetch_with_curl_cffi(&player_url, Some(detail_url)).await {
+                Ok(response) => response,
+                Err(error) => {
+                    report_provider_error_background(
+                        CHANNEL_ID,
+                        "resolve_player.fetch",
+                        &format!("detail_url={detail_url}; label={}; error={error}", candidate.label),
+                    );
+                    continue;
+                }
+            };
+            if response.status >= 400 {
+                continue;
+            }
+
+            let Some(master_url) = Self::extract_m3u8_url(&response.text)? else {
+                continue;
+            };
+            let formats = match self.build_formats_from_master(&master_url).await {
+                Ok(formats) => formats,
+                Err(error) => {
+                    report_provider_error_background(
+                        CHANNEL_ID,
+                        "resolve_player.master",
+                        &format!(
+                            "detail_url={detail_url}; label={}; master_url={master_url}; error={error}",
+                            candidate.label
+                        ),
+                    );
+                    vec![]
+                }
+            };
+            return Ok(Some((master_url, formats)));
+        }
+
+        Ok(None)
+    }
+
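+    // Enriches a listing item from its detail page: title/thumb/views, the
+    // labelled `.cats p` rows (Maker becomes the uploader, Cast becomes stars,
+    // unlabelled rows become genres) and `.tags` links, all of which also feed
+    // the shared filter lists; finally swaps `item.url` for the resolved m3u8.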
+    async fn apply_detail_video(
+        &self,
+        mut item: VideoItem,
+        html: &str,
+        page_url: &str,
+    ) -> Result<VideoItem> {
+        let (
+            parsed_title,
+            parsed_thumb,
+            parsed_views,
+            parsed_tags,
+            parsed_maker,
+            parsed_casts,
+            parsed_genre_filters,
+            players,
+        ) = {
+            let document = Html::parse_document(html);
+            let title_selector = Self::selector(".archive-title h1, .post-meta h2, h1")?;
+            let image_selector =
+                Self::selector(".post-meta img.img, meta[property=\"og:image\"]")?;
+            let views_selector = Self::selector(".dz_view .views")?;
+            let cats_selector = Self::selector(".cats p")?;
+            let tag_selector = Self::selector(".tags a[href]")?;
+            let span_selector = Self::selector("span")?;
+            let link_selector = Self::selector("a[href]")?;
+
+            let parsed_title = document
+                .select(&title_selector)
+                .next()
+                .map(|value| Self::text_of(&value))
+                .filter(|value| !value.is_empty());
+
+            let parsed_thumb = document
+                .select(&image_selector)
+                .next()
+                .and_then(|value| {
+                    value
+                        .value()
+                        .attr("src")
+                        .or_else(|| value.value().attr("content"))
+                })
+                .map(|value| self.normalize_url(value))
+                .filter(|value| !value.is_empty());
+
+            let parsed_views = document
+                .select(&views_selector)
+                .next()
+                .and_then(|value| Self::parse_views(&Self::text_of(&value)));
+
+            let mut parsed_tags = item.tags.clone().unwrap_or_default();
+            let mut parsed_maker: Option<(String, String)> = None;
+            let mut parsed_casts = Vec::new();
+            let mut parsed_genre_filters = Vec::new();
+
+            for paragraph in document.select(&cats_selector) {
+                let label = paragraph
+                    .select(&span_selector)
+                    .next()
+                    .map(|value| Self::text_of(&value))
+                    .unwrap_or_default();
+                let links = paragraph.select(&link_selector).collect::<Vec<_>>();
+                if links.is_empty() {
+                    continue;
+                }
+
+                if label.starts_with("Maker") {
+                    if let Some(link) = links.first() {
+                        let uploader = Self::text_of(link);
+                        let uploader_url = link
+                            .value()
+                            .attr("href")
+                            .map(|value| self.normalize_url(value))
+                            .unwrap_or_default();
+                        if !uploader.is_empty() {
+                            parsed_maker = Some((uploader, uploader_url));
+                        }
+                    }
+                    continue;
+                }
+
+                for link in links {
+                    let value = Self::text_of(&link);
+                    let value_url = link
+                        .value()
+                        .attr("href")
+                        .map(|href| self.normalize_url(href))
+                        .unwrap_or_default();
+                    if value.is_empty() {
+                        continue;
+                    }
+                    Self::push_unique_string(&mut parsed_tags, &value);
+                    if label.starts_with("Cast") && !value_url.is_empty() {
+                        parsed_casts.push((value, value_url));
+                    } else if label.is_empty() && !value_url.is_empty() {
+                        parsed_genre_filters.push((value, value_url));
+                    }
+                }
+            }
+
+            for tag in document.select(&tag_selector) {
+                let title = Self::text_of(&tag);
+                let tag_url = tag
+                    .value()
+                    .attr("href")
+                    .map(|value| self.normalize_url(value))
+                    .unwrap_or_default();
+                if title.is_empty() {
+                    continue;
+                }
+                Self::push_unique_string(&mut parsed_tags, &title);
+                if !tag_url.is_empty() {
+                    parsed_genre_filters.push((title, tag_url));
+                }
+            }
+
+            (
+                parsed_title,
+                parsed_thumb,
+                parsed_views,
+                parsed_tags,
+                parsed_maker,
+                parsed_casts,
+                parsed_genre_filters,
+                Self::extract_player_candidates(html)?,
+            )
+        };
+
+        if let Some(title) = parsed_title {
+            item.title = title;
+        }
+
+        if let Some(thumb) = parsed_thumb {
+            item.thumb = thumb;
+        }
+
+        if let Some(views) = parsed_views {
+            item.views = Some(views);
+        }
+
+        if let Some((uploader, uploader_url)) = parsed_maker {
+            item.uploader = Some(uploader.clone());
+            if !uploader_url.is_empty() {
+                item.uploaderUrl = Some(uploader_url.clone());
+                item.uploaderId = Self::canonical_uploader_id(&uploader_url);
+                Self::push_unique(
+                    &self.uploaders,
+                    FilterOption {
+                        id: uploader_url,
+                        title: uploader,
+                    },
+                );
+            }
+        }
+
+        for (value, value_url) in parsed_casts {
+            Self::push_unique(
+                &self.stars,
+                FilterOption {
+                    id: value_url,
+                    title: value,
+                },
+            );
+        }
+
+        for (value, value_url) in parsed_genre_filters {
+            Self::push_unique(
+                &self.tags,
+                FilterOption {
+                    id: value_url,
+                    title: value,
+                },
+            );
+        }
+
+        if !parsed_tags.is_empty() {
+            item.tags = Some(parsed_tags);
+        }
+
+        if let Some((master_url, formats)) = self.resolve_player(page_url, &players).await? {
+            item.url = master_url;
+            if !formats.is_empty() {
+                item.formats = Some(formats);
+            }
+        }
+
+        item.aspectRatio = Some(16.0 / 9.0);
+        Ok(item)
+    }
+
+    async fn enrich_video(&self, item: VideoItem) -> VideoItem {
+        let page_url = item.url.clone();
+        let original = item.clone();
+        let html = match Self::fetch_html(&page_url, Some(BASE_URL)).await {
+            Ok(html) => html,
+            Err(error) => {
+                report_provider_error_background(
+                    CHANNEL_ID,
+                    "enrich_video.fetch",
+                    &format!("url={page_url}; error={error}"),
+                );
+                return original;
+            }
+        };
+
+        match self.apply_detail_video(item, &html, &page_url).await {
+            Ok(item) => item,
+            Err(error) => {
+                report_provider_error_background(
+                    CHANNEL_ID,
+                    "enrich_video.parse",
+                    &format!("url={page_url}; error={error}"),
+                );
+                original
+            }
+        }
+    }
+
+    async fn fetch_items_for_url(
+        &self,
+        cache: VideoCache,
+        url: String,
+        sort: &str,
+        per_page_limit: usize,
+    ) -> Result<Vec<VideoItem>> {
+        if let Some((time, items)) = cache.get(&url) {
+            if time.elapsed().unwrap_or_default().as_secs() < 300 {
+                return Ok(items.iter().take(per_page_limit).cloned().collect());
+            }
+        }
+
+        let html = Self::fetch_html(&url, Some(BASE_URL)).await?;
+        let list_items = self.parse_listing_page(&url, &html, sort)?;
+        if list_items.is_empty() {
+            return Ok(vec![]);
+        }
+
+        let items = stream::iter(
+            list_items
+                .into_iter()
+                .take(per_page_limit.max(1))
+                .map(|item| {
+                    let provider = self.clone();
+                    async move { provider.enrich_video(item).await }
+                }),
+        )
+        .buffer_unordered(4)
+        .collect::<Vec<_>>()
+        .await;
+
+        if !items.is_empty() {
+            cache.insert(url, items.clone());
+        }
+
+        Ok(items)
+    }
+
+    fn item_matches_query(item: &VideoItem, query: &str) -> bool {
+        let query = query.trim().to_ascii_lowercase();
+        if query.is_empty() {
+            return false;
+        }
+
+        item.title.to_ascii_lowercase().contains(&query)
+            || item
+                .uploader
+                .as_deref()
+                .is_some_and(|value| value.to_ascii_lowercase().contains(&query))
+            || item.tags.as_ref().is_some_and(|values| {
+                values
+                    .iter()
+                    .any(|value| value.to_ascii_lowercase().contains(&query))
+            })
+    }
+
+    fn fallback_query_sources(&self, sort: &str) -> Result<Vec<String>> {
+        let mut urls = Vec::new();
+        for page in 1..=4u16 {
+            urls.push(self.build_home_url(page)?);
+        }
+        for category in self.categories.iter().skip(1) {
+            urls.push(category.id.clone());
+            urls.push(Self::build_archive_page_url(&category.id, 2)?);
+        }
+        urls.push(self.build_popular_url(1, sort)?);
+        urls.push(self.build_popular_url(2, sort)?);
+        urls.push(self.build_popular_url(1, "popular_week")?);
+        Ok(urls)
+    }
+
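+    // When the site search returns nothing, crawl a breadth of home, category,
+    // and popular pages and filter them locally against the query, stopping once
+    // enough matches exist to fill the requested page.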
+    async fn fallback_query(
+        &self,
+        query: &str,
+        page: u16,
+        per_page_limit: usize,
+        sort: &str,
+    ) -> Result<Vec<VideoItem>> {
+        let mut matched = Vec::new();
+        let mut seen_ids = HashSet::new();
+
+        for url in self.fallback_query_sources(sort)? {
+            let html = match Self::fetch_html(&url, Some(BASE_URL)).await {
+                Ok(html) => html,
+                Err(error) => {
+                    report_provider_error_background(
+                        CHANNEL_ID,
+                        "fallback_query.fetch",
+                        &format!("query={query}; url={url}; error={error}"),
+                    );
+                    continue;
+                }
+            };
+
+            let items = match self.parse_listing_page(&url, &html, "new") {
+                Ok(items) => items,
+                Err(error) => {
+                    report_provider_error_background(
+                        CHANNEL_ID,
+                        "fallback_query.parse",
+                        &format!("query={query}; url={url}; error={error}"),
+                    );
+                    continue;
+                }
+            };
+
+            for item in items {
+                if !Self::item_matches_query(&item, query) {
+                    continue;
+                }
+                if seen_ids.insert(item.id.clone()) {
+                    matched.push(item);
+                }
+            }
+
+            if matched.len() >= page as usize * per_page_limit.max(1) {
+                break;
+            }
+        }
+
+        let start = page.saturating_sub(1) as usize * per_page_limit.max(1);
+        let selected = matched
+            .into_iter()
+            .skip(start)
+            .take(per_page_limit.max(1))
+            .collect::<Vec<_>>();
+
+        if selected.is_empty() {
+            return Ok(vec![]);
+        }
+
+        Ok(stream::iter(selected.into_iter().map(|item| {
+            let provider = self.clone();
+            async move { provider.enrich_video(item).await }
+        }))
+        .buffer_unordered(4)
+        .collect::<Vec<_>>()
+        .await)
+    }
+
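+    // The default "Latest" feed aggregates the first pages of every category
+    // archive, de-duplicates by id, and sorts newest-first; views and stable
+    // tie-breakers keep paging deterministic across calls.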
+    async fn aggregate_latest(&self, page: u16, per_page_limit: usize) -> Result<Vec<VideoItem>> {
+        let needed = page as usize * per_page_limit.max(1);
+        let pages_per_category = page.max(2);
+        let mut combined = Vec::new();
+        let mut seen_ids = HashSet::new();
+
+        for category in self.categories.iter().skip(1) {
+            for category_page in 1..=pages_per_category {
+                let url = Self::build_archive_page_url(&category.id, category_page)?;
+                let html = match Self::fetch_html(&url, Some(BASE_URL)).await {
+                    Ok(html) => html,
+                    Err(error) => {
+                        report_provider_error_background(
+                            CHANNEL_ID,
+                            "aggregate_latest.fetch",
+                            &format!("url={url}; error={error}"),
+                        );
+                        continue;
+                    }
+                };
+
+                let items = match self.parse_listing_page(&url, &html, "new") {
+                    Ok(items) => items,
+                    Err(error) => {
+                        report_provider_error_background(
+                            CHANNEL_ID,
+                            "aggregate_latest.parse",
+                            &format!("url={url}; error={error}"),
+                        );
+                        continue;
+                    }
+                };
+
+                for item in items {
+                    if seen_ids.insert(item.id.clone()) {
+                        combined.push(item);
+                    }
+                }
+            }
+        }
+
+        // Newest first, then most viewed; title length and id keep the order stable.
+        combined.sort_by(|a, b| {
+            (
+                b.uploadedAt.unwrap_or(0),
+                b.views.unwrap_or(0),
+                a.title.len(),
+                a.id.clone(),
+            )
+                .cmp(&(
+                    a.uploadedAt.unwrap_or(0),
+                    a.views.unwrap_or(0),
+                    b.title.len(),
+                    b.id.clone(),
+                ))
+        });
+
+        if combined.len() < needed {
+            report_provider_error_background(
+                CHANNEL_ID,
+                "aggregate_latest.shortfall",
+                &format!("needed={needed}; found={}", combined.len()),
+            );
+        }
+
+        let start = page.saturating_sub(1) as usize * per_page_limit.max(1);
+        let selected = combined
+            .into_iter()
+            .skip(start)
+            .take(per_page_limit.max(1))
+            .collect::<Vec<_>>();
+        if selected.is_empty() {
+            return Ok(vec![]);
+        }
+
+        Ok(stream::iter(selected.into_iter().map(|item| {
+            let provider = self.clone();
+            async move { provider.enrich_video(item).await }
+        }))
+        .buffer_unordered(4)
+        .collect::<Vec<_>>()
+        .await)
+    }
+
+    async fn get(
+        &self,
+        cache: VideoCache,
+        page: u16,
+        sort: &str,
+        per_page_limit: usize,
+        options: ServerOptions,
+    ) -> Result<Vec<VideoItem>> {
+        if self.resolve_option_target(&options).is_none() && sort == "new" {
+            return self.aggregate_latest(page, per_page_limit).await;
+        }
+
+        let url = if let Some(target) = self.resolve_option_target(&options) {
+            Self::build_archive_page_url(&target, page)?
+        } else if sort == "popular_day" || sort == "popular_week" || sort == "popular_month" {
+            self.build_popular_url(page, sort)?
+        } else {
+            self.build_home_url(page)?
+        };
+
+        self.fetch_items_for_url(cache, url, sort, per_page_limit).await
+    }
+
+    async fn query(
+        &self,
+        cache: VideoCache,
+        page: u16,
+        sort: &str,
+        query: &str,
+        per_page_limit: usize,
+    ) -> Result<Vec<VideoItem>> {
+        // Prefer an exact filter match (maker/cast/tag/category), else the site
+        // search, else the local fallback crawl.
+        let exact_target = self.resolve_query_target_async(query).await;
+        let url = if let Some(target) = exact_target.as_ref() {
+            Self::build_archive_page_url(target, page)?
+        } else {
+            self.build_search_url(query, page)?
+        };
+
+        let items = self
+            .fetch_items_for_url(cache, url, sort, per_page_limit)
+            .await?;
+        if exact_target.is_some() || !items.is_empty() {
+            return Ok(items);
+        }
+
+        let fallback = self.fallback_query(query, page, per_page_limit, sort).await?;
+        if !fallback.is_empty() {
+            return Ok(fallback);
+        }
+
+        Ok(items)
+    }
+}
+
+#[async_trait]
+impl Provider for SupjavProvider {
+    async fn get_videos(
+        &self,
+        cache: VideoCache,
+        pool: DbPool,
+        sort: String,
+        query: Option<String>,
+        page: String,
+        per_page: String,
+        options: ServerOptions,
+    ) -> Vec<VideoItem> {
+        let _ = pool;
+        let page = page.parse::<u16>().unwrap_or(1);
+        let per_page_limit = per_page.parse::<usize>().unwrap_or(24);
+
+        let result = match query {
+            Some(query) if !query.trim().is_empty() => {
+                self.query(cache, page, &sort, &query, per_page_limit)
+                    .await
+            }
+            _ => self.get(cache, page, &sort, per_page_limit, options).await,
+        };
+
+        match result {
+            Ok(videos) => videos,
+            Err(error) => {
+                report_provider_error(CHANNEL_ID, "get_videos", &error.to_string()).await;
+                vec![]
+            }
+        }
+    }
+
+    fn get_channel(&self, clientversion: ClientVersion) -> Option<Channel> {
+        Some(self.build_channel(clientversion))
+    }
+}
+
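+// Offline fixtures cover URL building and the parsing helpers; the live
+// contract test below is `#[ignore]`d and only runs on demand against supjav.com.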
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::util::cache::VideoCache;
+    use crate::util::requester::Requester;
+    use diesel::{
+        SqliteConnection,
+        r2d2::{self, ConnectionManager},
+    };
+    use std::process::Command;
+    use std::time::{Duration, SystemTime, UNIX_EPOCH};
+
+    fn test_provider() -> SupjavProvider {
+        SupjavProvider {
+            url: BASE_URL.to_string(),
+            categories: vec![FilterOption {
+                id: format!("{BASE_URL}/category/censored-jav"),
+                title: "Censored JAV".to_string(),
+            }],
+            tags: Arc::new(RwLock::new(vec![FilterOption {
+                id: "/tag/creampie".to_string(),
+                title: "Creampie".to_string(),
+            }])),
+            uploaders: Arc::new(RwLock::new(vec![])),
+            stars: Arc::new(RwLock::new(vec![])),
+        }
+    }
+
+    #[test]
+    fn strips_count_suffix() {
+        assert_eq!(SupjavProvider::strip_count_suffix("DAHLIA (937)"), "DAHLIA");
+        assert_eq!(SupjavProvider::strip_count_suffix("Censored JAV"), "Censored JAV");
+    }
+
+    #[test]
+    fn builds_archive_page_url_with_query() {
+        assert_eq!(
+            SupjavProvider::build_archive_page_url("https://supjav.com/popular?sort=week", 3)
+                .expect("archive page url should build"),
+            "https://supjav.com/popular/page/3?sort=week"
+        );
+        assert_eq!(
+            SupjavProvider::build_archive_page_url("https://supjav.com/?s=fc2", 2)
+                .expect("search page url should build"),
+            "https://supjav.com/page/2?s=fc2"
+        );
+    }
+
+    #[test]
+    fn canonical_uploader_ids_use_maker_slug() {
+        assert_eq!(
+            SupjavProvider::canonical_uploader_id("https://supjav.com/category/maker/dahlia")
+                .as_deref(),
+            Some("supjav:maker:dahlia")
+        );
+    }
+
+    #[test]
+    fn extracts_master_playlist_from_player_html() {
+        let html = r#"
+<div class="player" data-hash="https://cdn3.turboviplay.com/data1/example/master.m3u8"></div>
+        "#;
+        assert_eq!(
+            SupjavProvider::extract_m3u8_url(html)
+                .expect("m3u8 extraction should work")
+                .as_deref(),
+            Some("https://cdn3.turboviplay.com/data1/example/master.m3u8")
+        );
+    }
+
+    #[test]
+    fn ignores_sort_filter_when_resolving_archive_target() {
+        let provider = test_provider();
+        let options = ServerOptions {
+            featured: None,
+            category: Some("all".to_string()),
+            sites: None,
+            filter: Some("new".to_string()),
+            language: None,
+            public_url_base: None,
+            requester: None,
+            network: None,
+            stars: None,
+            categories: None,
+            duration: None,
+            sort: None,
+            sexuality: None,
+        };
+
+        assert_eq!(provider.resolve_option_target(&options), None);
+    }
+
+    #[test]
+    fn normalizes_relative_archive_targets() {
+        let provider = test_provider();
+        let options = ServerOptions {
+            featured: None,
+            category: Some("/category/censored-jav".to_string()),
+            sites: None,
+            filter: Some("new".to_string()),
+            language: None,
+            public_url_base: None,
+            requester: None,
+            network: None,
+            stars: None,
+            categories: None,
+            duration: None,
+            sort: None,
+            sexuality: None,
+        };
+
+        assert_eq!(
+            provider.resolve_option_target(&options).as_deref(),
+            Some("https://supjav.com/category/censored-jav")
+        );
+        assert_eq!(
+            provider.resolve_query_target("Creampie").as_deref(),
+            Some("https://supjav.com/tag/creampie")
+        );
+    }
+
+    fn test_db_pool() -> DbPool {
+        let unique = SystemTime::now()
+            .duration_since(UNIX_EPOCH)
+            .expect("system time should be after unix epoch")
+            .as_nanos();
+        let database_url = std::env::temp_dir()
+            .join(format!("hottub-supjav-test-{unique}.sqlite"))
+            .to_string_lossy()
+            .into_owned();
+        let manager = ConnectionManager::<SqliteConnection>::new(database_url);
+        r2d2::Pool::builder()
+            .max_size(8)
+            .build(manager)
+            .expect("test db pool should build")
+    }
+
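+    // End-to-end against the live site: baseline paging, m3u8 playability via
+    // yt-dlp, thumbnail reachability, and tag/maker query routing.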
+    #[ntex::test]
+    #[ignore = "live supjav provider contract checks"]
+    async fn live_provider_contract_checks() {
+        let provider = SupjavProvider::new();
+        provider.ensure_filters_loaded().await;
+
+        let cache = VideoCache::new().max_size(1_000).to_owned();
+        let pool = test_db_pool();
+        let options = ServerOptions {
+            featured: None,
+            category: None,
+            sites: None,
+            filter: None,
+            language: None,
+            public_url_base: None,
+            requester: Some(Requester::new()),
+            network: None,
+            stars: None,
+            categories: None,
+            duration: None,
+            sort: None,
+            sexuality: None,
+        };
+
+        let page1 = provider
+            .get_videos(
+                cache.clone(),
+                pool.clone(),
+                "new".to_string(),
+                None,
+                "1".to_string(),
+                "5".to_string(),
+                options.clone(),
+            )
+            .await;
+        assert!(page1.len() >= 5, "expected at least 5 baseline items");
+        let first = &page1[0];
+        let first_json = serde_json::to_value(first).expect("video item should serialize");
+        assert!(
+            first_json.get("embed").is_none(),
+            "supjav items must not serialize embed"
+        );
+        assert!(
+            first.url.contains(".m3u8"),
+            "expected direct m3u8 url, got {}",
+            first.url
+        );
+
+        let mut requester = Requester::new();
+        let thumb_response = requester
+            .get_raw_with_headers_timeout(
+                &first.thumb,
+                vec![("Range".to_string(), "bytes=0-1023".to_string())],
+                Some(Duration::from_secs(30)),
+            )
+            .await
+            .expect("thumbnail request should succeed");
+        assert!(
+            thumb_response.status().is_success(),
+            "thumbnail request returned {}",
+            thumb_response.status()
+        );
+
+        let ytdlp = Command::new("yt-dlp")
+            .args(["--no-warnings", "--simulate", "--skip-download", &first.url])
+            .output()
+            .expect("yt-dlp should run");
+        assert!(
+            ytdlp.status.success(),
+            "yt-dlp failed: {}",
+            String::from_utf8_lossy(&ytdlp.stderr)
+        );
+
+        let page2 = provider
+            .get_videos(
+                cache.clone(),
+                pool.clone(),
+                "new".to_string(),
+                None,
+                "2".to_string(),
+                "5".to_string(),
+                options.clone(),
+            )
+            .await;
+        assert!(!page2.is_empty(), "expected page 2 items");
+        assert_ne!(page1[0].id, page2[0].id, "page 2 should differ from page 1");
+
+        let tag_query = provider
+            .get_videos(
+                cache.clone(),
+                pool.clone(),
+                "new".to_string(),
+                Some("Creampie".to_string()),
+                "1".to_string(),
+                "5".to_string(),
+                options.clone(),
+            )
+            .await;
+        assert!(
+            !tag_query.is_empty(),
+            "expected tag query to return items"
+        );
+        assert!(tag_query.iter().any(|item| {
+            item.tags
+                .as_ref()
+                .is_some_and(|tags| tags.iter().any(|tag| tag.eq_ignore_ascii_case("Creampie")))
+        }));
+
+        let uploader_query = provider
+            .get_videos(
+                cache,
+                pool,
+                "new".to_string(),
+                Some("DAHLIA".to_string()),
+                "1".to_string(),
+                "5".to_string(),
+                options,
+            )
+            .await;
+        assert!(
+            !uploader_query.is_empty(),
+            "expected uploader query to return items"
+        );
+        assert!(uploader_query.iter().all(|item| {
+            item.uploader
+                .as_deref()
+                .is_some_and(|value| value.eq_ignore_ascii_case("DAHLIA"))
+                || item
+                    .uploaderId
+                    .as_deref()
+                    .is_some_and(|value| value.starts_with("supjav:maker:"))
+        }));
+    }
+}