noodlemagazine upgrade

This commit is contained in:
Simon
2026-03-31 13:09:51 +00:00
parent 01831c70e7
commit 80207efa73
2 changed files with 427 additions and 37 deletions

View File

@@ -10,10 +10,11 @@ use crate::videos::{ServerOptions, VideoFormat, VideoItem};
use async_trait::async_trait; use async_trait::async_trait;
use error_chain::error_chain; use error_chain::error_chain;
use htmlentity::entity::{ICodedDataTrait, decode}; use htmlentity::entity::{ICodedDataTrait, decode};
use regex::Regex;
use std::net::IpAddr; use std::net::IpAddr;
use url::Url;
use std::vec; use std::vec;
use titlecase::Titlecase; use titlecase::Titlecase;
use url::Url;
use wreq::Version; use wreq::Version;
pub const CHANNEL_METADATA: crate::providers::ProviderChannelMetadata = pub const CHANNEL_METADATA: crate::providers::ProviderChannelMetadata =
@@ -53,21 +54,232 @@ impl NoodlemagazineProvider {
favicon: "https://www.google.com/s2/favicons?sz=64&domain=noodlemagazine.com".into(), favicon: "https://www.google.com/s2/favicons?sz=64&domain=noodlemagazine.com".into(),
status: "active".into(), status: "active".into(),
categories: vec![], categories: vec![],
options: vec![], options: vec![
ChannelOption {
id: "category".into(),
title: "Popular Period".into(),
description: "Pick which popular feed to browse.".into(),
systemImage: "clock".into(),
colorName: "blue".into(),
options: vec![
FilterOption {
id: "recent".into(),
title: "Recent".into(),
},
FilterOption {
id: "week".into(),
title: "This Week".into(),
},
FilterOption {
id: "month".into(),
title: "This Month".into(),
},
FilterOption {
id: "all".into(),
title: "All Time".into(),
},
],
multiSelect: false,
},
ChannelOption {
id: "sort".into(),
title: "Sort By".into(),
description: "Sort popular feed results.".into(),
systemImage: "arrow.up.arrow.down".into(),
colorName: "orange".into(),
options: vec![
FilterOption {
id: "views".into(),
title: "Views".into(),
},
FilterOption {
id: "date".into(),
title: "Newest".into(),
},
FilterOption {
id: "duration".into(),
title: "Duration".into(),
},
],
multiSelect: false,
},
ChannelOption {
id: "filter".into(),
title: "Order".into(),
description: "Ascending or descending order.".into(),
systemImage: "list.number".into(),
colorName: "green".into(),
options: vec![
FilterOption {
id: "desc".into(),
title: "Descending".into(),
},
FilterOption {
id: "asc".into(),
title: "Ascending".into(),
},
],
multiSelect: false,
},
],
nsfw: true, nsfw: true,
cacheDuration: Some(1800), cacheDuration: Some(1800),
} }
} }
/// Maps the `category` option onto the period segment of the popular feed.
///
/// Only `"week"`, `"month"` and `"all"` are passed through; a missing or
/// unrecognised value resolves to `"recent"`.
fn resolve_popular_period(options: &ServerOptions) -> &'static str {
    const SELECTABLE_PERIODS: [&str; 3] = ["week", "month", "all"];

    options
        .category
        .as_deref()
        .and_then(|wanted| {
            SELECTABLE_PERIODS
                .iter()
                .copied()
                .find(|period| *period == wanted)
        })
        .unwrap_or("recent")
}
/// Picks the `sort_by` query value for the popular feed.
///
/// The `sort` field of `options` wins when present; otherwise the `sort`
/// argument is used. `"date"`, `"new"` and `"latest"` resolve to `"date"`,
/// `"duration"` and `"length"` resolve to `"duration"`, and anything else
/// resolves to `"views"`.
fn resolve_sort_by(sort: &str, options: &ServerOptions) -> &'static str {
    let requested = match options.sort.as_deref() {
        Some(explicit) => explicit,
        None => sort,
    };

    if matches!(requested, "date" | "new" | "latest") {
        "date"
    } else if matches!(requested, "duration" | "length") {
        "duration"
    } else {
        "views"
    }
}
/// Picks the `sort_order` query value from the `filter` option.
///
/// Returns `"asc"` only when the option is exactly `"asc"`; every other
/// value, including a missing one, resolves to `"desc"`.
fn resolve_sort_order(options: &ServerOptions) -> &'static str {
    if options.filter.as_deref() == Some("asc") {
        "asc"
    } else {
        "desc"
    }
}
/// Builds the `r.jina.ai` mirror URL for `url`.
///
/// A leading `https://` or `http://` is removed and the remainder is appended
/// to `https://r.jina.ai/http://`. Input without a scheme is appended as-is.
///
/// A URL that already points at `r.jina.ai` is returned unchanged, so calling
/// this function on its own output never produces a doubly-wrapped address.
fn mirror_url(url: &str) -> String {
    const MIRROR_PREFIXES: [&str; 2] = ["https://r.jina.ai/", "http://r.jina.ai/"];

    // Already a mirror address: wrapping it again would ask the mirror to
    // fetch itself.
    if MIRROR_PREFIXES.iter().any(|prefix| url.starts_with(*prefix)) {
        return url.to_string();
    }

    let stripped = url
        .strip_prefix("https://")
        .or_else(|| url.strip_prefix("http://"))
        .unwrap_or(url);
    format!("https://r.jina.ai/http://{stripped}")
}
/// Heuristically detects an anti-bot interstitial or block page.
///
/// Returns `true` when the ASCII-lowercased `html` contains any of a fixed
/// set of marker phrases.
fn looks_like_bot_challenge_or_block(html: &str) -> bool {
    const BLOCK_MARKERS: [&str; 5] = [
        "just a moment",
        "cf-browser-verification",
        "cf-chl",
        "access restricted",
        "cloudflare",
    ];

    // Lowercase once so every marker comparison is case-insensitive.
    let haystack = html.to_ascii_lowercase();
    BLOCK_MARKERS
        .iter()
        .any(|marker| haystack.contains(*marker))
}
/// Extracts video items from a markdown rendition of a listing page.
///
/// Every entry is expected to be a linked image whose link target is a
/// `noodlemagazine.com/watch/...` URL, with free-form text between the image
/// and the link target. That text is searched for a duration (`m:ss` or
/// `h:mm:ss`) and for a view count that directly precedes a duration.
///
/// Entries are skipped when the watch URL cannot be parsed or its last path
/// segment is empty. Duration and view count fall back to `0` when they are
/// not found or cannot be parsed. An empty vector is returned if any of the
/// regular expressions fails to compile.
fn parse_markdown_listing_items(
    &self,
    markdown: &str,
    options: &ServerOptions,
) -> Vec<VideoItem> {
    let Ok(entry_re) = Regex::new(
        r#"(?is)\[\!\[Image\s+\d+:\s*(?P<title>.*?)\]\((?P<thumb>https?://[^)\s]+)\)(?P<meta>.*?)\]\((?P<url>https?://noodlemagazine\.com/watch/[^)\s]+)\)"#,
    ) else {
        return Vec::new();
    };
    let Ok(duration_re) = Regex::new(r"(?P<duration>\d{1,2}:\d{2}(?::\d{2})?)") else {
        return Vec::new();
    };
    let Ok(views_re) =
        Regex::new(r"(?P<views>[0-9]+(?:\.[0-9]+)?[KMB]?)\s+\d{1,2}:\d{2}(?::\d{2})?")
    else {
        return Vec::new();
    };

    let mut collected = Vec::new();
    for entry in entry_re.captures_iter(markdown) {
        let (Some(title_match), Some(thumb_match), Some(url_match)) =
            (entry.name("title"), entry.name("thumb"), entry.name("url"))
        else {
            continue;
        };
        let raw_title = title_match.as_str().trim();
        let thumb_url = thumb_match.as_str().trim();
        let watch_url = url_match.as_str().trim();
        let meta_text = match entry.name("meta") {
            Some(found) => found.as_str(),
            None => "",
        };

        // The item id is the last non-empty path segment of the watch URL.
        let Ok(parsed_watch_url) = Url::parse(watch_url) else {
            continue;
        };
        let Some(id) = parsed_watch_url
            .path_segments()
            .and_then(|mut segments| segments.next_back())
            .filter(|segment| !segment.is_empty())
            .map(|segment| segment.to_string())
        else {
            continue;
        };

        let duration = duration_re
            .captures(meta_text)
            .and_then(|found| found.name("duration"))
            .and_then(|text| parse_time_to_seconds(text.as_str()))
            .unwrap_or(0) as u32;
        let views = views_re
            .captures(meta_text)
            .and_then(|found| found.name("views"))
            .and_then(|text| parse_abbreviated_number(text.as_str().trim()))
            .unwrap_or(0);

        // Fall back to the undecoded title when entity decoding fails.
        let decoded_title = decode(raw_title.as_bytes())
            .to_string()
            .unwrap_or_else(|_| raw_title.to_string());
        let title = decoded_title.titlecase();

        let proxy_url = self.proxy_url(options, watch_url);
        let proxied_thumb = self.proxied_thumb(options, thumb_url);

        let format = VideoFormat::new(proxy_url.clone(), "auto".into(), "video/mp4".into())
            .format_id("auto".into())
            .format_note("proxied".into())
            .http_header("Referer".into(), watch_url.to_string());
        let item = VideoItem::new(
            id,
            title,
            proxy_url,
            "noodlemagazine".into(),
            proxied_thumb,
            duration,
        )
        .views(views)
        .formats(vec![format]);

        collected.push(item);
    }
    collected
}
/// Loads a listing page and parses its video items, retrying through the
/// markdown mirror when the direct response looks like a bot block.
///
/// The page is first requested over HTTP/2 and parsed as HTML. The mirror is
/// consulted only when that parse yields no items *and* the body matches
/// `looks_like_bot_challenge_or_block`; in every other case the HTML result
/// (possibly empty) is returned as-is. A failed request on either path is
/// treated as the default (empty) body.
async fn fetch_listing_items(
    &self,
    requester: &mut crate::util::requester::Requester,
    page_url: &str,
    options: &ServerOptions,
) -> Vec<VideoItem> {
    let html = requester
        .get(page_url, Some(Version::HTTP_2))
        .await
        .unwrap_or_default();
    let direct_items = self.get_video_items_from_html(html.clone(), options);

    // Short-circuit keeps the block check from running when items were found.
    let needs_mirror =
        direct_items.is_empty() && Self::looks_like_bot_challenge_or_block(&html);
    if !needs_mirror {
        return direct_items;
    }

    let mirror_address = Self::mirror_url(page_url);
    let mirrored_markdown = requester
        .get(&mirror_address, Some(Version::HTTP_11))
        .await
        .unwrap_or_default();
    self.parse_markdown_listing_items(&mirrored_markdown, options)
}
async fn get( async fn get(
&self, &self,
cache: VideoCache, cache: VideoCache,
page: u8, page: u8,
_sort: &str, sort: &str,
options: ServerOptions, options: ServerOptions,
) -> Result<Vec<VideoItem>> { ) -> Result<Vec<VideoItem>> {
let period = Self::resolve_popular_period(&options);
let sort_by = Self::resolve_sort_by(sort, &options);
let sort_order = Self::resolve_sort_order(&options);
let video_url = format!( let video_url = format!(
"{}/popular/recent?sort_by=views&sort_order=desc&p={}", "{}/popular/{period}?sort_by={sort_by}&sort_order={sort_order}&p={}",
self.url, self.url,
page.saturating_sub(1) page.saturating_sub(1)
); );
@@ -83,12 +295,9 @@ impl NoodlemagazineProvider {
None => return Ok(old_items), None => return Ok(old_items),
}; };
let text = requester let items = self
.get(&video_url, Some(Version::HTTP_2)) .fetch_listing_items(&mut requester, &video_url, &options)
.await .await;
.unwrap_or_default();
let items = self.get_video_items_from_html(text, &options);
if items.is_empty() { if items.is_empty() {
Ok(old_items) Ok(old_items)
@@ -120,12 +329,9 @@ impl NoodlemagazineProvider {
None => return Ok(old_items), None => return Ok(old_items),
}; };
let text = requester let items = self
.get(&video_url, Some(Version::HTTP_2)) .fetch_listing_items(&mut requester, &video_url, &options)
.await .await;
.unwrap_or_default();
let items = self.get_video_items_from_html(text, &options);
if items.is_empty() { if items.is_empty() {
Ok(old_items) Ok(old_items)
@@ -154,7 +360,7 @@ impl NoodlemagazineProvider {
None => return vec![], None => return vec![],
}; };
list.split("<div class=\"item\">") list.split("<div class=\"item")
.skip(1) .skip(1)
.filter_map(|segment| { .filter_map(|segment| {
self.get_video_item(segment.to_string(), options).ok() self.get_video_item(segment.to_string(), options).ok()
@@ -198,6 +404,24 @@ impl NoodlemagazineProvider {
.any(|ext| path.ends_with(ext)) .any(|ext| path.ends_with(ext))
} }
/// Reports whether `host` is one of the known preview-image domains or a
/// subdomain of one.
///
/// The comparison is ASCII case-insensitive. A host matches only when it is
/// exactly a known domain or ends with `.` followed by a known domain, so a
/// look-alike such as `evilpvvstream.pro` is rejected even though it shares
/// the same textual suffix.
fn is_known_preview_host(host: &str) -> bool {
    const KNOWN_DOMAINS: [&str; 4] = [
        "pvvstream.pro",
        "okcdn.ru",
        "vkuserphoto.ru",
        "noodlemagazine.com",
    ];

    let host = host.to_ascii_lowercase();
    KNOWN_DOMAINS.iter().any(|domain| {
        // Whatever precedes the domain must be empty (exact match) or end at
        // a label boundary (subdomain).
        host.strip_suffix(*domain)
            .map_or(false, |prefix| prefix.is_empty() || prefix.ends_with('.'))
    })
}
/// Reports whether `url` looks like a preview/poster image endpoint.
///
/// The ASCII-lowercased path is checked for `/preview/`, `/poster/` or
/// `getvideopreview`; failing that, the ASCII-lowercased query string is
/// checked for `type=video_thumb` or `keep_aspect_ratio=`. A missing query is
/// treated as empty.
fn has_preview_signature(url: &Url) -> bool {
    const PATH_MARKERS: [&str; 3] = ["/preview/", "/poster/", "getvideopreview"];
    const QUERY_MARKERS: [&str; 2] = ["type=video_thumb", "keep_aspect_ratio="];

    let lowered_path = url.path().to_ascii_lowercase();
    if PATH_MARKERS
        .iter()
        .any(|marker| lowered_path.contains(*marker))
    {
        return true;
    }

    let lowered_query = url.query().unwrap_or("").to_ascii_lowercase();
    QUERY_MARKERS
        .iter()
        .any(|marker| lowered_query.contains(*marker))
}
fn is_disallowed_thumb_host(host: &str) -> bool { fn is_disallowed_thumb_host(host: &str) -> bool {
if host.eq_ignore_ascii_case("localhost") { if host.eq_ignore_ascii_case("localhost") {
return true; return true;
@@ -234,20 +458,23 @@ impl NoodlemagazineProvider {
return false; return false;
}; };
!Self::is_disallowed_thumb_host(host) && Self::has_allowed_image_extension(url.path()) if Self::is_disallowed_thumb_host(host) {
return false;
}
if Self::has_allowed_image_extension(url.path()) {
return true;
}
Self::is_known_preview_host(host) && Self::has_preview_signature(&url)
} }
fn proxied_thumb(&self, options: &ServerOptions, thumb: &str) -> String { fn proxied_thumb(&self, _options: &ServerOptions, thumb: &str) -> String {
let normalized = self.normalize_thumb_url(thumb); let normalized = self.normalize_thumb_url(thumb);
if normalized.is_empty() || !self.is_allowed_thumb_url(&normalized) { if normalized.is_empty() || !self.is_allowed_thumb_url(&normalized) {
return String::new(); return String::new();
} }
normalized
crate::providers::build_proxy_url(
options,
"noodlemagazine-thumb",
&crate::providers::strip_url_scheme(&normalized),
)
} }
fn get_video_item(&self, video_segment: String, options: &ServerOptions) -> Result<VideoItem> { fn get_video_item(&self, video_segment: String, options: &ServerOptions) -> Result<VideoItem> {
@@ -279,12 +506,17 @@ impl NoodlemagazineProvider {
.ok_or_else(|| Error::from("missing id"))? .ok_or_else(|| Error::from("missing id"))?
.to_string(); .to_string();
let thumb = video_segment let thumb = Regex::new(
.split("data-src=\"") r#"(?i)(?:data-src|data-original|data-webp|src|poster)\s*=\s*"(?P<url>[^"]+)""#,
.nth(1) )
.and_then(|s| s.split('"').next()) .ok()
.unwrap_or("") .and_then(|regex| {
.to_string(); regex
.captures_iter(&video_segment)
.filter_map(|captures| captures.name("url").map(|value| value.as_str().to_string()))
.find(|candidate| !candidate.starts_with("data:image/"))
})
.unwrap_or_default();
let raw_duration = video_segment let raw_duration = video_segment
.split("#clock-o\"></use></svg>") .split("#clock-o\"></use></svg>")
@@ -414,7 +646,7 @@ mod tests {
); );
assert_eq!( assert_eq!(
items[0].thumb, items[0].thumb,
"https://example.com/proxy/noodlemagazine-thumb/noodlemagazine.com/thumbs/test.jpg" "https://noodlemagazine.com/thumbs/test.jpg"
); );
assert_eq!(items[0].formats.as_ref().map(|f| f.len()), Some(1)); assert_eq!(items[0].formats.as_ref().map(|f| f.len()), Some(1));
} }
@@ -449,8 +681,70 @@ mod tests {
assert_eq!(items.len(), 2); assert_eq!(items.len(), 2);
assert_eq!( assert_eq!(
items[0].thumb, items[0].thumb,
"https://example.com/proxy/noodlemagazine-thumb/cdn.example/thumb.jpg" "https://cdn.example/thumb.jpg"
); );
assert!(items[1].thumb.is_empty()); assert!(items[1].thumb.is_empty());
} }
/// A thumbnail URL without an image file extension must survive parsing
/// unchanged.
#[test]
fn keeps_preview_urls_without_file_extension() {
    let expected_thumb = "https://img.pvvstream.pro/preview/abc/-111_222/240/iv.okcdn.ru/getVideoPreview?id=1&type=39&fn=vid_l";
    let provider = NoodlemagazineProvider::new();
    let server_options = options();
    let listing_html = r#"
<div class="list_videos" id="list_videos">
<div class="item">
<a href="/watch/-111_222">
<img data-src="https://img.pvvstream.pro/preview/abc/-111_222/240/iv.okcdn.ru/getVideoPreview?id=1&type=39&fn=vid_l" />
</a>
<div class="title">sample</div>
<svg><use></use></svg>#clock-o"></use></svg>12:34<
<svg><use></use></svg>#eye"></use></svg>1.2K<
</div>
>Show more</div>
"#;

    let parsed = provider.get_video_items_from_html(listing_html.to_string(), &server_options);

    assert_eq!(parsed.len(), 1);
    assert_eq!(parsed[0].thumb, expected_thumb);
}
/// An item container with extra classes/attributes and a `data-original`
/// thumbnail attribute must still produce one item with that thumbnail.
#[test]
fn parses_item_variants_and_alternate_thumb_attributes() {
    let expected_thumb = "https://cdn2.pvvstream.pro/videos/-333/444/preview_320.jpg";
    let provider = NoodlemagazineProvider::new();
    let server_options = options();
    let listing_html = r#"
<div class="list_videos" id="list_videos">
<div class="item has-video" data-id="123">
<a href="/watch/-333_444">
<img data-original="https://cdn2.pvvstream.pro/videos/-333/444/preview_320.jpg" />
</a>
<div class="title">sample alt</div>
<svg><use></use></svg>#clock-o"></use></svg>00:42<
<svg><use></use></svg>#eye"></use></svg>123<
</div>
>Show more</div>
"#;

    let parsed = provider.get_video_items_from_html(listing_html.to_string(), &server_options);

    assert_eq!(parsed.len(), 1);
    assert_eq!(parsed[0].thumb, expected_thumb);
}
/// Explicit `category`, `sort` and `filter` options must each be reflected by
/// the corresponding resolver, with the option `sort` overriding the argument.
#[test]
fn resolves_popular_filters_for_usability_options() {
    let mut selected = options();
    selected.category = Some(String::from("month"));
    selected.sort = Some(String::from("date"));
    selected.filter = Some(String::from("asc"));

    let period = NoodlemagazineProvider::resolve_popular_period(&selected);
    let sort_by = NoodlemagazineProvider::resolve_sort_by("views", &selected);
    let sort_order = NoodlemagazineProvider::resolve_sort_order(&selected);

    assert_eq!(period, "month");
    assert_eq!(sort_by, "date");
    assert_eq!(sort_order, "asc");
}
} }

View File

@@ -136,6 +136,42 @@ impl Requester {
.unwrap_or_else(|| "none".to_string()) .unwrap_or_else(|| "none".to_string())
} }
/// Builds the `r.jina.ai` mirror URL for `url`.
///
/// Surrounding whitespace is ignored. Returns `None` when nothing is left
/// after trimming. A URL that already targets `r.jina.ai` is returned as-is
/// (trimmed); otherwise a leading `https://` or `http://` is removed and the
/// remainder is appended to `https://r.jina.ai/http://`.
fn jina_mirror_url(url: &str) -> Option<String> {
    // Trim once and use the trimmed value everywhere: checking emptiness on
    // the trimmed text but building the result from the padded text would
    // embed the whitespace in the mirror URL and defeat the prefix checks.
    let url = url.trim();
    if url.is_empty() {
        return None;
    }
    if url.starts_with("https://r.jina.ai/") || url.starts_with("http://r.jina.ai/") {
        return Some(url.to_string());
    }
    let stripped = url
        .strip_prefix("https://")
        .or_else(|| url.strip_prefix("http://"))
        .unwrap_or(url);
    Some(format!("https://r.jina.ai/http://{stripped}"))
}
/// Fetches the body of `url` through the `r.jina.ai` mirror over HTTP/1.1.
///
/// A fresh client is built from `cookie_jar` and `user_agent`. When
/// `proxy_enabled` is set and the `BURP_URL` environment variable is
/// readable, the request is routed through that proxy.
///
/// # Errors
///
/// Fails when no mirror URL can be derived from `url`, when the proxy URL is
/// rejected, when sending the request or reading the body fails, or when the
/// mirror answers with a non-success status.
async fn fetch_jina_mirror_body(
    cookie_jar: Arc<Jar>,
    user_agent: Option<String>,
    proxy_enabled: bool,
    url: &str,
) -> Result<String, AnyErr> {
    let mirror_url = Self::jina_mirror_url(url).ok_or("invalid mirror url")?;
    let client = Self::build_client(cookie_jar, user_agent.as_deref());
    let direct_request = client.get(&mirror_url).version(Version::HTTP_11);

    // The environment is consulted only when proxying was requested.
    let configured_proxy = if proxy_enabled {
        env::var("BURP_URL").ok()
    } else {
        None
    };
    let request = match configured_proxy {
        Some(proxy_url) => direct_request.proxy(Proxy::all(&proxy_url)?),
        None => direct_request,
    };

    let response = request.send().await?;
    let status = response.status();
    if !status.is_success() {
        return Err(format!("jina mirror returned status {}", status).into());
    }

    let body = response.text().await?;
    Ok(body)
}
#[cfg(any(not(hottub_single_provider), hottub_provider = "hypnotube"))] #[cfg(any(not(hottub_single_provider), hottub_provider = "hypnotube"))]
fn debug_cookie_preview_from_borrowed_headers( fn debug_cookie_preview_from_borrowed_headers(
&self, &self,
@@ -453,6 +489,14 @@ impl Requester {
Some(v) => v, Some(v) => v,
None => Version::HTTP_11, None => Version::HTTP_11,
}; };
let cookie_jar = self.cookie_jar.clone();
let user_agent = self.user_agent.clone();
let proxy_enabled = self.proxy;
let _trace_id = self.debug_trace_id.as_deref().unwrap_or("none").to_string();
let url_owned = url.to_string();
let jina_handle = tokio::spawn(async move {
Self::fetch_jina_mirror_body(cookie_jar, user_agent, proxy_enabled, &url_owned).await
});
loop { loop {
let mut request = self.client.get(url).version(http_version); let mut request = self.client.get(url).version(http_version);
for (key, value) in headers.iter() { for (key, value) in headers.iter() {
@@ -464,7 +508,38 @@ impl Requester {
request = request.proxy(proxy); request = request.proxy(proxy);
} }
} }
let response = request.send().await?; let response = match request.send().await {
Ok(response) => response,
Err(error) => {
crate::flow_debug!(
"trace={} requester direct transport failed url={} error={} using_jina_fallback=true",
_trace_id,
crate::util::flow_debug::preview(url, 120),
crate::util::flow_debug::preview(&error.to_string(), 160)
);
match jina_handle.await {
Ok(Ok(body)) => return Ok(body),
Ok(Err(_jina_error)) => {
crate::flow_debug!(
"trace={} requester jina fallback failed after transport error url={} error={}",
_trace_id,
crate::util::flow_debug::preview(url, 120),
crate::util::flow_debug::preview(&_jina_error.to_string(), 160)
);
return Err(error.into());
}
Err(_join_error) => {
crate::flow_debug!(
"trace={} requester jina task join failed after transport error url={} error={}",
_trace_id,
crate::util::flow_debug::preview(url, 120),
crate::util::flow_debug::preview(&_join_error.to_string(), 160)
);
return Err(error.into());
}
}
}
};
self.store_response_cookies(url, &response); self.store_response_cookies(url, &response);
crate::flow_debug!( crate::flow_debug!(
"trace={} requester direct response url={} status={}", "trace={} requester direct response url={} status={}",
@@ -473,6 +548,7 @@ impl Requester {
response.status() response.status()
); );
if response.status().is_success() || response.status().as_u16() == 404 { if response.status().is_success() || response.status().as_u16() == 404 {
jina_handle.abort();
return Ok(response.text().await?); return Ok(response.text().await?);
} }
if response.status().as_u16() == 429 { if response.status().as_u16() == 429 {
@@ -484,11 +560,31 @@ impl Requester {
tokio::time::sleep(std::time::Duration::from_secs(1)).await; tokio::time::sleep(std::time::Duration::from_secs(1)).await;
continue; continue;
} else { } else {
println!( crate::flow_debug!(
"Direct request to {} failed with status: {}", "trace={} requester direct failed url={} status={} using_jina_fallback=true",
url, _trace_id,
crate::util::flow_debug::preview(url, 120),
response.status() response.status()
); );
match jina_handle.await {
Ok(Ok(body)) => return Ok(body),
Ok(Err(_error)) => {
crate::flow_debug!(
"trace={} requester jina fallback failed url={} error={}",
_trace_id,
crate::util::flow_debug::preview(url, 120),
crate::util::flow_debug::preview(&_error.to_string(), 160)
);
}
Err(_error) => {
crate::flow_debug!(
"trace={} requester jina task join failed url={} error={}",
_trace_id,
crate::util::flow_debug::preview(url, 120),
crate::util::flow_debug::preview(&_error.to_string(), 160)
);
}
}
break; break;
} }
} }