diff --git a/src/providers/noodlemagazine.rs b/src/providers/noodlemagazine.rs index ac3802e..147bfc1 100644 --- a/src/providers/noodlemagazine.rs +++ b/src/providers/noodlemagazine.rs @@ -10,10 +10,11 @@ use crate::videos::{ServerOptions, VideoFormat, VideoItem}; use async_trait::async_trait; use error_chain::error_chain; use htmlentity::entity::{ICodedDataTrait, decode}; +use regex::Regex; use std::net::IpAddr; -use url::Url; use std::vec; use titlecase::Titlecase; +use url::Url; use wreq::Version; pub const CHANNEL_METADATA: crate::providers::ProviderChannelMetadata = @@ -53,21 +54,232 @@ impl NoodlemagazineProvider { favicon: "https://www.google.com/s2/favicons?sz=64&domain=noodlemagazine.com".into(), status: "active".into(), categories: vec![], - options: vec![], + options: vec![ + ChannelOption { + id: "category".into(), + title: "Popular Period".into(), + description: "Pick which popular feed to browse.".into(), + systemImage: "clock".into(), + colorName: "blue".into(), + options: vec![ + FilterOption { + id: "recent".into(), + title: "Recent".into(), + }, + FilterOption { + id: "week".into(), + title: "This Week".into(), + }, + FilterOption { + id: "month".into(), + title: "This Month".into(), + }, + FilterOption { + id: "all".into(), + title: "All Time".into(), + }, + ], + multiSelect: false, + }, + ChannelOption { + id: "sort".into(), + title: "Sort By".into(), + description: "Sort popular feed results.".into(), + systemImage: "arrow.up.arrow.down".into(), + colorName: "orange".into(), + options: vec![ + FilterOption { + id: "views".into(), + title: "Views".into(), + }, + FilterOption { + id: "date".into(), + title: "Newest".into(), + }, + FilterOption { + id: "duration".into(), + title: "Duration".into(), + }, + ], + multiSelect: false, + }, + ChannelOption { + id: "filter".into(), + title: "Order".into(), + description: "Ascending or descending order.".into(), + systemImage: "list.number".into(), + colorName: "green".into(), + options: vec![ + 
FilterOption { + id: "desc".into(), + title: "Descending".into(), + }, + FilterOption { + id: "asc".into(), + title: "Ascending".into(), + }, + ], + multiSelect: false, + }, + ], nsfw: true, cacheDuration: Some(1800), } } + fn resolve_popular_period(options: &ServerOptions) -> &'static str { + match options.category.as_deref() { + Some("week") => "week", + Some("month") => "month", + Some("all") => "all", + _ => "recent", + } + } + + fn resolve_sort_by(sort: &str, options: &ServerOptions) -> &'static str { + match options.sort.as_deref().unwrap_or(sort) { + "date" | "new" | "latest" => "date", + "duration" | "length" => "duration", + _ => "views", + } + } + + fn resolve_sort_order(options: &ServerOptions) -> &'static str { + match options.filter.as_deref() { + Some("asc") => "asc", + _ => "desc", + } + } + + fn mirror_url(url: &str) -> String { + let stripped = url + .strip_prefix("https://") + .or_else(|| url.strip_prefix("http://")) + .unwrap_or(url); + format!("https://r.jina.ai/http://{stripped}") + } + + fn looks_like_bot_challenge_or_block(html: &str) -> bool { + let lower = html.to_ascii_lowercase(); + lower.contains("just a moment") + || lower.contains("cf-browser-verification") + || lower.contains("cf-chl") + || lower.contains("access restricted") + || lower.contains("cloudflare") + } + + fn parse_markdown_listing_items( + &self, + markdown: &str, + options: &ServerOptions, + ) -> Vec<VideoItem> { + let Some(regex) = Regex::new( + r#"(?is)\[\!\[Image\s+\d+:\s*(?P<title>.*?)\]\((?P<thumb>https?://[^)\s]+)\)(?P<meta>.*?)\]\((?P<url>https?://noodlemagazine\.com/watch/[^)\s]+)\)"#, + ) + .ok() else { + return vec![]; + }; + let Some(duration_regex) = Regex::new(r"(?P<duration>\d{1,2}:\d{2}(?::\d{2})?)").ok() else { + return vec![]; + }; + let Some(views_regex) = Regex::new(r"(?P<views>[0-9]+(?:\.[0-9]+)?[KMB]?)\s+\d{1,2}:\d{2}(?::\d{2})?").ok() else { + return vec![]; + }; + + regex + .captures_iter(markdown) + .filter_map(|caps| { + let title_raw =
caps.name("title")?.as_str().trim(); + let thumb = caps.name("thumb")?.as_str().trim(); + let video_url = caps.name("url")?.as_str().trim(); + let meta = caps.name("meta").map(|m| m.as_str()).unwrap_or(""); + + let parsed_url = Url::parse(video_url).ok()?; + let id = parsed_url + .path_segments() + .and_then(|mut segs| segs.next_back()) + .filter(|value| !value.is_empty()) + .map(|value| value.to_string())?; + + let duration = duration_regex + .captures(meta) + .and_then(|m| m.name("duration").map(|v| v.as_str())) + .and_then(|v| parse_time_to_seconds(v)) + .unwrap_or(0) as u32; + + let views = views_regex + .captures(meta) + .and_then(|m| m.name("views").map(|v| v.as_str())) + .and_then(|v| parse_abbreviated_number(v.trim())) + .unwrap_or(0); + + let title = decode(title_raw.as_bytes()) + .to_string() + .unwrap_or_else(|_| title_raw.to_string()) + .titlecase(); + let proxy_url = self.proxy_url(options, video_url); + let proxied_thumb = self.proxied_thumb(options, thumb); + + Some( + VideoItem::new( + id, + title, + proxy_url.clone(), + "noodlemagazine".into(), + proxied_thumb, + duration, + ) + .views(views) + .formats(vec![ + VideoFormat::new(proxy_url, "auto".into(), "video/mp4".into()) + .format_id("auto".into()) + .format_note("proxied".into()) + .http_header("Referer".into(), video_url.to_string()), + ]), + ) + }) + .collect() + } + + async fn fetch_listing_items( + &self, + requester: &mut crate::util::requester::Requester, + page_url: &str, + options: &ServerOptions, + ) -> Vec<VideoItem> { + let html = requester + .get(page_url, Some(Version::HTTP_2)) + .await + .unwrap_or_default(); + let mut items = self.get_video_items_from_html(html.clone(), options); + if !items.is_empty() { + return items; + } + + if !Self::looks_like_bot_challenge_or_block(&html) { + return items; + } + + let mirror = requester + .get(&Self::mirror_url(page_url), Some(Version::HTTP_11)) + .await + .unwrap_or_default(); + items = self.parse_markdown_listing_items(&mirror, options); + 
items + } + async fn get( &self, cache: VideoCache, page: u8, - _sort: &str, + sort: &str, options: ServerOptions, ) -> Result<Vec<VideoItem>> { + let period = Self::resolve_popular_period(&options); + let sort_by = Self::resolve_sort_by(sort, &options); + let sort_order = Self::resolve_sort_order(&options); let video_url = format!( - "{}/popular/recent?sort_by=views&sort_order=desc&p={}", + "{}/popular/{period}?sort_by={sort_by}&sort_order={sort_order}&p={}", self.url, page.saturating_sub(1) ); @@ -83,12 +295,9 @@ impl NoodlemagazineProvider { None => return Ok(old_items), }; - let text = requester - .get(&video_url, Some(Version::HTTP_2)) - .await - .unwrap_or_default(); - - let items = self.get_video_items_from_html(text, &options); + let items = self + .fetch_listing_items(&mut requester, &video_url, &options) + .await; if items.is_empty() { Ok(old_items) @@ -120,12 +329,9 @@ impl NoodlemagazineProvider { None => return Ok(old_items), }; - let text = requester - .get(&video_url, Some(Version::HTTP_2)) - .await - .unwrap_or_default(); - - let items = self.get_video_items_from_html(text, &options); + let items = self + .fetch_listing_items(&mut requester, &video_url, &options) + .await; if items.is_empty() { Ok(old_items) @@ -154,7 +360,7 @@ impl NoodlemagazineProvider { None => return vec![], }; - list.split("<div class=\"item\">") + list.split("<div class=\"item") .skip(1) .filter_map(|segment| { self.get_video_item(segment.to_string(), options).ok() @@ -198,6 +404,24 @@ impl NoodlemagazineProvider { .any(|ext| path.ends_with(ext)) } + fn is_known_preview_host(host: &str) -> bool { + let host = host.to_ascii_lowercase(); + host.ends_with("pvvstream.pro") + || host.ends_with("okcdn.ru") + || host.ends_with("vkuserphoto.ru") + || host.ends_with("noodlemagazine.com") + } + + fn has_preview_signature(url: &Url) -> bool { + let path = url.path().to_ascii_lowercase(); + let query = url.query().unwrap_or("").to_ascii_lowercase(); + path.contains("/preview/") + || 
path.contains("/poster/") + || path.contains("getvideopreview") + || query.contains("type=video_thumb") + || query.contains("keep_aspect_ratio=") + } + fn is_disallowed_thumb_host(host: &str) -> bool { if host.eq_ignore_ascii_case("localhost") { return true; @@ -234,20 +458,23 @@ impl NoodlemagazineProvider { return false; }; - !Self::is_disallowed_thumb_host(host) && Self::has_allowed_image_extension(url.path()) + if Self::is_disallowed_thumb_host(host) { + return false; + } + + if Self::has_allowed_image_extension(url.path()) { + return true; + } + + Self::is_known_preview_host(host) && Self::has_preview_signature(&url) } - fn proxied_thumb(&self, options: &ServerOptions, thumb: &str) -> String { + fn proxied_thumb(&self, _options: &ServerOptions, thumb: &str) -> String { let normalized = self.normalize_thumb_url(thumb); if normalized.is_empty() || !self.is_allowed_thumb_url(&normalized) { return String::new(); } - - crate::providers::build_proxy_url( - options, - "noodlemagazine-thumb", - &crate::providers::strip_url_scheme(&normalized), - ) + normalized } fn get_video_item(&self, video_segment: String, options: &ServerOptions) -> Result<VideoItem> { @@ -279,12 +506,17 @@ impl NoodlemagazineProvider { .ok_or_else(|| Error::from("missing id"))? 
.to_string(); - let thumb = video_segment - .split("data-src=\"") - .nth(1) - .and_then(|s| s.split('"').next()) - .unwrap_or("") - .to_string(); + let thumb = Regex::new( + r#"(?i)(?:data-src|data-original|data-webp|src|poster)\s*=\s*"(?P<url>[^"]+)""#, + ) + .ok() + .and_then(|regex| { + regex + .captures_iter(&video_segment) + .filter_map(|captures| captures.name("url").map(|value| value.as_str().to_string())) + .find(|candidate| !candidate.starts_with("data:image/")) + }) + .unwrap_or_default(); let raw_duration = video_segment .split("#clock-o\"></use></svg>") @@ -414,7 +646,7 @@ mod tests { ); assert_eq!( items[0].thumb, - "https://example.com/proxy/noodlemagazine-thumb/noodlemagazine.com/thumbs/test.jpg" + "https://noodlemagazine.com/thumbs/test.jpg" ); assert_eq!(items[0].formats.as_ref().map(|f| f.len()), Some(1)); } @@ -449,8 +681,70 @@ mod tests { assert_eq!(items.len(), 2); assert_eq!( items[0].thumb, - "https://example.com/proxy/noodlemagazine-thumb/cdn.example/thumb.jpg" + "https://cdn.example/thumb.jpg" ); assert!(items[1].thumb.is_empty()); } + + #[test] + fn keeps_preview_urls_without_file_extension() { + let provider = NoodlemagazineProvider::new(); + let options = options(); + let html = r#" + <div class="list_videos" id="list_videos"> + <div class="item"> + <a href="/watch/-111_222"> + <img data-src="https://img.pvvstream.pro/preview/abc/-111_222/240/iv.okcdn.ru/getVideoPreview?id=1&type=39&fn=vid_l" /> + </a> + <div class="title">sample</div> + <svg><use></use></svg>#clock-o"></use></svg>12:34< + <svg><use></use></svg>#eye"></use></svg>1.2K< + </div> + >Show more</div> + "#; + + let items = provider.get_video_items_from_html(html.to_string(), &options); + assert_eq!(items.len(), 1); + assert_eq!( + items[0].thumb, + "https://img.pvvstream.pro/preview/abc/-111_222/240/iv.okcdn.ru/getVideoPreview?id=1&type=39&fn=vid_l" + ); + } + + #[test] + fn parses_item_variants_and_alternate_thumb_attributes() { + let provider = NoodlemagazineProvider::new(); 
+ let options = options(); + let html = r#" + <div class="list_videos" id="list_videos"> + <div class="item has-video" data-id="123"> + <a href="/watch/-333_444"> + <img data-original="https://cdn2.pvvstream.pro/videos/-333/444/preview_320.jpg" /> + </a> + <div class="title">sample alt</div> + <svg><use></use></svg>#clock-o"></use></svg>00:42< + <svg><use></use></svg>#eye"></use></svg>123< + </div> + >Show more</div> + "#; + + let items = provider.get_video_items_from_html(html.to_string(), &options); + assert_eq!(items.len(), 1); + assert_eq!( + items[0].thumb, + "https://cdn2.pvvstream.pro/videos/-333/444/preview_320.jpg" + ); + } + + #[test] + fn resolves_popular_filters_for_usability_options() { + let mut options = options(); + options.category = Some("month".to_string()); + options.sort = Some("date".to_string()); + options.filter = Some("asc".to_string()); + + assert_eq!(NoodlemagazineProvider::resolve_popular_period(&options), "month"); + assert_eq!(NoodlemagazineProvider::resolve_sort_by("views", &options), "date"); + assert_eq!(NoodlemagazineProvider::resolve_sort_order(&options), "asc"); + } } diff --git a/src/util/requester.rs b/src/util/requester.rs index bd73709..11b521b 100644 --- a/src/util/requester.rs +++ b/src/util/requester.rs @@ -136,6 +136,42 @@ impl Requester { .unwrap_or_else(|| "none".to_string()) } + fn jina_mirror_url(url: &str) -> Option<String> { + if url.trim().is_empty() { + return None; + } + if url.starts_with("https://r.jina.ai/") || url.starts_with("http://r.jina.ai/") { + return Some(url.to_string()); + } + let stripped = url + .strip_prefix("https://") + .or_else(|| url.strip_prefix("http://")) + .unwrap_or(url); + Some(format!("https://r.jina.ai/http://{stripped}")) + } + + async fn fetch_jina_mirror_body( + cookie_jar: Arc<Jar>, + user_agent: Option<String>, + proxy_enabled: bool, + url: &str, + ) -> Result<String, AnyErr> { + let mirror_url = Self::jina_mirror_url(url).ok_or("invalid mirror url")?; + let client = 
Self::build_client(cookie_jar, user_agent.as_deref()); + let mut request = client.get(&mirror_url).version(Version::HTTP_11); + if proxy_enabled { + if let Ok(proxy_url) = env::var("BURP_URL") { + let proxy = Proxy::all(&proxy_url)?; + request = request.proxy(proxy); + } + } + let response = request.send().await?; + if !response.status().is_success() { + return Err(format!("jina mirror returned status {}", response.status()).into()); + } + Ok(response.text().await?) + } + #[cfg(any(not(hottub_single_provider), hottub_provider = "hypnotube"))] fn debug_cookie_preview_from_borrowed_headers( &self, @@ -453,6 +489,14 @@ impl Requester { Some(v) => v, None => Version::HTTP_11, }; + let cookie_jar = self.cookie_jar.clone(); + let user_agent = self.user_agent.clone(); + let proxy_enabled = self.proxy; + let _trace_id = self.debug_trace_id.as_deref().unwrap_or("none").to_string(); + let url_owned = url.to_string(); + let jina_handle = tokio::spawn(async move { + Self::fetch_jina_mirror_body(cookie_jar, user_agent, proxy_enabled, &url_owned).await + }); loop { let mut request = self.client.get(url).version(http_version); for (key, value) in headers.iter() { @@ -464,7 +508,38 @@ impl Requester { request = request.proxy(proxy); } } - let response = request.send().await?; + let response = match request.send().await { + Ok(response) => response, + Err(error) => { + crate::flow_debug!( + "trace={} requester direct transport failed url={} error={} using_jina_fallback=true", + _trace_id, + crate::util::flow_debug::preview(url, 120), + crate::util::flow_debug::preview(&error.to_string(), 160) + ); + match jina_handle.await { + Ok(Ok(body)) => return Ok(body), + Ok(Err(_jina_error)) => { + crate::flow_debug!( + "trace={} requester jina fallback failed after transport error url={} error={}", + _trace_id, + crate::util::flow_debug::preview(url, 120), + crate::util::flow_debug::preview(&_jina_error.to_string(), 160) + ); + return Err(error.into()); + } + Err(_join_error) => { + 
crate::flow_debug!( + "trace={} requester jina task join failed after transport error url={} error={}", + _trace_id, + crate::util::flow_debug::preview(url, 120), + crate::util::flow_debug::preview(&_join_error.to_string(), 160) + ); + return Err(error.into()); + } + } + } + }; self.store_response_cookies(url, &response); crate::flow_debug!( "trace={} requester direct response url={} status={}", @@ -473,6 +548,7 @@ impl Requester { response.status() ); if response.status().is_success() || response.status().as_u16() == 404 { + jina_handle.abort(); return Ok(response.text().await?); } if response.status().as_u16() == 429 { @@ -484,11 +560,31 @@ impl Requester { tokio::time::sleep(std::time::Duration::from_secs(1)).await; continue; } else { - println!( - "Direct request to {} failed with status: {}", - url, + crate::flow_debug!( + "trace={} requester direct failed url={} status={} using_jina_fallback=true", + _trace_id, + crate::util::flow_debug::preview(url, 120), response.status() ); + match jina_handle.await { + Ok(Ok(body)) => return Ok(body), + Ok(Err(_error)) => { + crate::flow_debug!( + "trace={} requester jina fallback failed url={} error={}", + _trace_id, + crate::util::flow_debug::preview(url, 120), + crate::util::flow_debug::preview(&_error.to_string(), 160) + ); + } + Err(_error) => { + crate::flow_debug!( + "trace={} requester jina task join failed url={} error={}", + _trace_id, + crate::util::flow_debug::preview(url, 120), + crate::util::flow_debug::preview(&_error.to_string(), 160) + ); + } + } break; } }