fixes etc

This commit is contained in:
Simon
2026-04-07 16:53:45 +00:00
parent 81e8158161
commit 6e43b3b3d0
5 changed files with 452 additions and 207 deletions

View File

@@ -10,8 +10,8 @@ use async_trait::async_trait;
use error_chain::error_chain;
use htmlentity::entity::{ICodedDataTrait, decode};
use scraper::{ElementRef, Html, Selector};
use serde_json::Value;
use std::process::Command;
use std::time::Duration;
use url::form_urlencoded::byte_serialize;
pub const CHANNEL_METADATA: crate::providers::ProviderChannelMetadata =
@@ -143,7 +143,38 @@ impl SpankbangProvider {
}
/// Returns the `(name, value)` header pairs for this provider: a fixed set of
/// browser-style headers (accept, cache, client-hint, `sec-fetch-*`, user-agent)
/// followed by a `Referer` of `"{self.url}/"`.
fn request_headers(&self) -> Vec<(String, String)> {
// NOTE(review): this span is a rendered diff without +/- markers. The single-element
// `vec!` on the next line looks like the pre-change body and the longer `vec!` after it
// the post-change body (the test assertion in this same diff expects the long list) —
// confirm against the real file.
vec![("Referer".to_string(), format!("{}/", self.url))]
vec![
(
"accept".to_string(),
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
.to_string(),
),
("accept-language".to_string(), "en-US,en;q=0.6".to_string()),
("cache-control".to_string(), "no-cache".to_string()),
("pragma".to_string(), "no-cache".to_string()),
("priority".to_string(), "u=0, i".to_string()),
// Client-hint brand list; the version (146) matches the Chrome version in `user-agent` below.
(
"sec-ch-ua".to_string(),
r#""Chromium";v="146", "Not-A.Brand";v="24", "Brave";v="146""#.to_string(),
),
("sec-ch-ua-mobile".to_string(), "?0".to_string()),
("sec-ch-ua-platform".to_string(), "\"Linux\"".to_string()),
("sec-fetch-dest".to_string(), "document".to_string()),
("sec-fetch-mode".to_string(), "navigate".to_string()),
("sec-fetch-site".to_string(), "none".to_string()),
("sec-fetch-user".to_string(), "?1".to_string()),
("sec-gpc".to_string(), "1".to_string()),
(
"upgrade-insecure-requests".to_string(),
"1".to_string(),
),
(
"user-agent".to_string(),
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36"
.to_string(),
),
// Kept as the last entry; the test in this diff asserts this exact order.
("Referer".to_string(), format!("{}/", self.url)),
]
}
fn is_cloudflare_block(text: &str) -> bool {
@@ -153,168 +184,222 @@ impl SpankbangProvider {
|| lowercase.contains("cloudflare ray id")
}
// NOTE(review): this span is a rendered diff with the +/- markers stripped. Lines of the
// yt-dlp based `fallback_items_from_ytdlp` / `fallback_items_with_working_media` and lines of
// `fetch_items_with_curl_cffi` are interleaved, so it does not compile as written. Which
// side a given line belongs to is inferred from duplicated statements and the hunk header —
// confirm against the real file before changing any code here.
//
// `fallback_items_from_ytdlp` starts a `yt-dlp -J --flat-playlist --extractor-args
// generic:impersonate=chrome` command and reads an `entries` array from its JSON output.
fn fallback_items_from_ytdlp(&self, page_url: &str, limit: usize) -> Vec<VideoItem> {
let output = match Command::new("yt-dlp")
.arg("-J")
.arg("--flat-playlist")
.arg("--extractor-args")
.arg("generic:impersonate=chrome")
/// Fetches `page_url` through an inline `python3 -c` script (curl_cffi + BeautifulSoup)
/// that prints the listing's video cards as a JSON array, then converts every entry with a
/// non-empty `id`, `href` and `title` into a `VideoItem` whose URL is
/// `self.proxy_url(proxy_base_url, &href)`.
///
/// Returns an empty vector when the process cannot be spawned, exits unsuccessfully,
/// prints nothing (or non-UTF-8 output), prints invalid JSON, prints a non-array JSON value, or
/// yields no usable entries. Every one of those cases except the non-array one is reported
/// through `crate::providers::report_provider_error_background`.
// NOTE(review): `Command::output()` waits for the child process on the calling thread, and
// the call sites visible in this diff sit next to `.await` expressions — confirm this is not
// executed directly on the async executor.
fn fetch_items_with_curl_cffi(&self, page_url: &str, proxy_base_url: &str) -> Vec<VideoItem> {
crate::flow_debug!(
"trace={} spankbang curl_cffi fetch start url={}",
"none",
crate::util::flow_debug::preview(page_url, 120)
);
// The script receives the page URL as `sys.argv[1]`, exits with status 2 on an HTTP status
// >= 400, and writes the collected cards to stdout as JSON.
// NOTE(review): the script hard-codes the Referer `https://spankbang.com/` instead of using
// `self.url`, and impersonates `chrome124` while `request_headers` advertises Chrome 146 —
// confirm both are intended.
let output = match Command::new("python3")
.arg("-c")
.arg(
r#"from curl_cffi import requests
from bs4 import BeautifulSoup
import json
import sys
url = sys.argv[1]
r = requests.get(url, impersonate='chrome124', timeout=45, headers={'Referer': 'https://spankbang.com/'})
if r.status_code >= 400:
raise SystemExit(2)
soup = BeautifulSoup(r.text, 'html.parser')
cards = soup.select('[data-testid="video-list"] [data-testid="video-item"]')
if not cards:
cards = soup.select('[data-testid="video-item"]')
items = []
for card in cards:
vid = (card.get('data-id') or '').strip()
link = card.select_one('a[href*="/video/"]')
if not vid or link is None:
continue
href = (link.get('href') or '').strip()
if not href:
continue
img = card.select_one('picture img, img')
title_anchor = card.select_one('p a[title], a[title]')
duration = card.select_one('[data-testid="video-item-length"]')
views = card.select_one('[data-testid="views"]')
uploader = card.select_one('[data-testid="video-info-with-badge"] a[data-testid="title"]')
preview = card.select_one('video source[data-src]')
items.append({
'id': vid,
'href': href,
'title': (title_anchor.get('title') if title_anchor else '') or (img.get('alt') if img else ''),
'thumb': ((img.get('src') if img else '') or (img.get('data-src') if img else '') or '').strip(),
'preview': (preview.get('data-src') if preview else '') or '',
'duration': duration.get_text(' ', strip=True) if duration else '',
'views': views.get_text(' ', strip=True) if views else '',
'uploader': uploader.get_text(' ', strip=True) if uploader else '',
'uploader_href': (uploader.get('href') if uploader else '') or '',
})
sys.stdout.write(json.dumps(items))
"#,
)
.arg(page_url)
.output()
{
// Only a successful exit status continues; the arms below bail out with an empty list.
Ok(output) if output.status.success() => output,
_ => return vec![],
// The process ran but exited unsuccessfully: report the status and a stderr preview.
Ok(output) => {
let stderr = String::from_utf8_lossy(&output.stderr);
crate::providers::report_provider_error_background(
"spankbang",
"curl_cffi.fetch.status",
&format!(
"url={page_url}; status={}; stderr={}",
output.status,
crate::util::flow_debug::preview(&stderr, 300)
),
);
return vec![];
}
// The process could not be started at all.
Err(e) => {
crate::providers::report_provider_error_background(
"spankbang",
"curl_cffi.fetch.spawn",
&format!("url={page_url}; error={e}"),
);
return vec![];
}
};
let payload: serde_json::Value = match serde_json::from_slice(&output.stdout) {
Ok(payload) => payload,
Err(_) => return vec![],
};
// Stdout that is not valid UTF-8 becomes `None` here and is then handled (and reported)
// exactly like empty output.
let payload = String::from_utf8(output.stdout).ok();
if payload.as_deref().unwrap_or("").trim().is_empty() {
crate::providers::report_provider_error_background(
"spankbang",
"curl_cffi.fetch.empty",
&format!("url={page_url}"),
);
return vec![];
}
crate::flow_debug!(
"trace={} spankbang curl_cffi fetch ok url={} bytes={}",
"none",
crate::util::flow_debug::preview(page_url, 120),
payload.as_deref().unwrap_or("").len()
);
let entries = match payload.get("entries").and_then(|value| value.as_array()) {
Some(entries) => entries,
None => return vec![],
let items_json: Value = match serde_json::from_str(payload.as_deref().unwrap_or("")) {
Ok(value) => value,
Err(e) => {
crate::providers::report_provider_error_background(
"spankbang",
"curl_cffi.parse.json",
&format!("url={page_url}; error={e}"),
);
return vec![];
}
};
// A JSON value that is not an array produces no items and is not reported.
let Some(entries) = items_json.as_array() else {
return vec![];
};
let mut items = Vec::new();
for (index, entry) in entries.iter().take(limit).enumerate() {
let Some(url) = entry.get("url").and_then(|value| value.as_str()) else {
continue;
};
if !(url.starts_with("https://") || url.starts_with("http://")) {
continue;
}
for entry in entries {
let id = entry
.get("id")
.and_then(|value| value.as_str())
.filter(|value| !value.is_empty())
.map(ToOwned::to_owned)
.unwrap_or_else(|| format!("spankbang-fallback-{}", index + 1));
.unwrap_or("")
.trim()
.to_string();
let href = entry
.get("href")
.and_then(|value| value.as_str())
.unwrap_or("")
.trim()
.to_string();
// Entries without both an id and an href are dropped.
if id.is_empty() || href.is_empty() {
continue;
}
let detail_url = self.normalize_url(&href);
let title = entry
.get("title")
.and_then(|value| value.as_str())
.filter(|value| !value.is_empty())
.map(Self::decode_html)
.unwrap_or_else(|| format!("SpankBang Video {}", index + 1));
.unwrap_or_default();
// Entries whose title is missing or empty are dropped as well.
if title.is_empty() {
continue;
}
let thumb = entry
.get("thumbnail")
.get("thumb")
.and_then(|value| value.as_str())
.unwrap_or("")
.to_string();
.map(|value| self.normalize_url(value))
.unwrap_or_default();
let preview = entry
.get("preview")
.and_then(|value| value.as_str())
.map(|value| self.normalize_url(value))
.unwrap_or_default();
// `duration` is read as a string and passed to `Self::parse_duration`; a missing value
// falls back to 0.
let duration = entry
.get("duration")
.and_then(|value| value.as_u64())
.and_then(|value| u32::try_from(value).ok())
.and_then(|value| value.as_str())
.map(Self::parse_duration)
.unwrap_or(0);
let format_kind = if url.contains(".m3u8") {
"m3u8"
} else {
"video/mp4"
};
let mut format = VideoFormat::new(url.to_string(), "auto".to_string(), format_kind.to_string());
if let Some(headers) = entry.get("http_headers").and_then(|value| value.as_object()) {
for (key, value) in headers {
if let Some(value) = value.as_str() {
format.add_http_header(key.to_string(), value.to_string());
}
}
}
if entry
.get("http_headers")
.and_then(|value| value.as_object())
.is_none()
{
format.add_http_header("Referer".to_string(), format!("{}/", self.url));
}
// `views` is only set on the item when `parse_abbreviated_number` returns a value.
let views = entry
.get("views")
.and_then(|value| value.as_str())
.and_then(parse_abbreviated_number);
let mut item = VideoItem::new(
id,
title,
url.to_string(),
self.proxy_url(proxy_base_url, &href),
"spankbang".to_string(),
thumb,
duration,
)
.formats(vec![format]);
if let Some(views) = entry
.get("view_count")
.and_then(|value| value.as_u64())
.and_then(|value| u32::try_from(value).ok())
{
);
if let Some(views) = views {
item = item.views(views);
}
if let Some(uploader) = entry
// A non-empty preview URL is attached both as the item's preview and as its only format,
// with the detail page URL as Referer.
if !preview.is_empty() {
let mut format = VideoFormat::new(
preview.clone(),
"preview".to_string(),
"video/mp4".to_string(),
);
format.add_http_header("Referer".to_string(), detail_url.clone());
item = item.preview(preview).formats(vec![format]);
}
let uploader = entry
.get("uploader")
.and_then(|value| value.as_str())
.filter(|value| !value.is_empty())
{
item = item.uploader(uploader.to_string());
.map(Self::decode_html)
.unwrap_or_default();
if !uploader.is_empty() {
item = item.uploader(uploader);
}
let uploader_href = entry
.get("uploader_href")
.and_then(|value| value.as_str())
.unwrap_or("")
.trim();
if !uploader_href.is_empty() {
let uploader_url = self.normalize_url(uploader_href);
if !uploader_url.is_empty() {
item = item.uploader_url(uploader_url);
}
}
items.push(item);
}
items
}
// `fallback_items_with_working_media` takes up to 72 yt-dlp fallback items, requests the
// first format URL of each (or `item.url`) with a `Range: bytes=0-2047` header and a
// 20-second timeout, and keeps the items whose response status is a success.
async fn fallback_items_with_working_media(
&self,
page_url: &str,
options: &ServerOptions,
) -> Vec<VideoItem> {
let fallback_items = self.fallback_items_from_ytdlp(page_url, 72);
if fallback_items.is_empty() {
// Parsed JSON that produced no usable item is reported before returning an empty list.
if items.is_empty() {
crate::providers::report_provider_error_background(
"spankbang",
"curl_cffi.parse.empty",
&format!("url={page_url}"),
);
return vec![];
}
let mut requester = requester_or_default(
options,
"spankbang",
"spankbang.fallback_items_with_working_media.missing_requester",
crate::flow_debug!(
"trace={} spankbang curl_cffi parsed url={} items={}",
"none",
crate::util::flow_debug::preview(page_url, 120),
items.len()
);
let mut working_items = Vec::new();
for item in fallback_items {
let format_headers = item
.formats
.as_ref()
.and_then(|formats| formats.first())
.map(|format| format.http_headers_pairs())
.unwrap_or_default();
let media_url = item
.formats
.as_ref()
.and_then(|formats| formats.first())
.map(|format| format.url.clone())
.unwrap_or_else(|| item.url.clone());
if media_url.is_empty() {
continue;
}
let mut headers = format_headers;
if !headers
.iter()
.any(|(key, _)| key.eq_ignore_ascii_case("range"))
{
headers.push(("Range".to_string(), "bytes=0-2047".to_string()));
}
let is_working = match requester
.get_raw_with_headers_timeout(&media_url, headers, Some(Duration::from_secs(20)))
.await
{
Ok(response) => response.status().is_success(),
Err(_) => false,
};
if is_working {
working_items.push(item);
}
}
working_items
items
}
fn build_query_url(&self, query: &str, page: u32, sort: &str) -> String {
@@ -512,8 +597,11 @@ impl SpankbangProvider {
item = item.rating(rating);
}
if let Some(preview) = preview {
let mut format =
VideoFormat::new(preview.clone(), "preview".to_string(), "video/mp4".to_string());
let mut format = VideoFormat::new(
preview.clone(),
"preview".to_string(),
"video/mp4".to_string(),
);
format.add_http_header("Referer".to_string(), detail_url.clone());
item = item.preview(preview).formats(vec![format]);
}
@@ -594,7 +682,6 @@ impl SpankbangProvider {
}
None => vec![],
};
let mut requester =
requester_or_default(&options, "spankbang", "spankbang.get.missing_requester");
let text = match requester
@@ -609,13 +696,12 @@ impl SpankbangProvider {
&format!("url={video_url}; error={e}"),
)
.await;
let fallback_items = self
.fallback_items_with_working_media(&video_url, &options)
.await;
if !fallback_items.is_empty() {
let proxy_base_url = options.public_url_base.as_deref().unwrap_or_default();
let curl_cffi_items = self.fetch_items_with_curl_cffi(&video_url, proxy_base_url);
if !curl_cffi_items.is_empty() {
cache.remove(&video_url);
cache.insert(video_url.clone(), fallback_items.clone());
return Ok(fallback_items);
cache.insert(video_url.clone(), curl_cffi_items.clone());
return Ok(curl_cffi_items);
}
return Ok(old_items);
}
@@ -628,13 +714,12 @@ impl SpankbangProvider {
&format!("url={video_url}"),
)
.await;
let fallback_items = self
.fallback_items_with_working_media(&video_url, &options)
.await;
if !fallback_items.is_empty() {
let proxy_base_url = options.public_url_base.as_deref().unwrap_or_default();
let curl_cffi_items = self.fetch_items_with_curl_cffi(&video_url, proxy_base_url);
if !curl_cffi_items.is_empty() {
cache.remove(&video_url);
cache.insert(video_url.clone(), fallback_items.clone());
return Ok(fallback_items);
cache.insert(video_url.clone(), curl_cffi_items.clone());
return Ok(curl_cffi_items);
}
return Ok(old_items);
}
@@ -646,17 +731,17 @@ impl SpankbangProvider {
&format!("url={video_url}"),
)
.await;
let fallback_items = self
.fallback_items_with_working_media(&video_url, &options)
.await;
if !fallback_items.is_empty() {
let proxy_base_url = options.public_url_base.as_deref().unwrap_or_default();
let curl_cffi_items = self.fetch_items_with_curl_cffi(&video_url, proxy_base_url);
if !curl_cffi_items.is_empty() {
cache.remove(&video_url);
cache.insert(video_url.clone(), fallback_items.clone());
return Ok(fallback_items);
cache.insert(video_url.clone(), curl_cffi_items.clone());
return Ok(curl_cffi_items);
}
return Ok(old_items);
}
let looks_like_html = text.to_ascii_lowercase().contains("<html");
let proxy_base_url = options.public_url_base.as_deref().unwrap_or_default();
let video_items = self.get_video_items_from_html(text, proxy_base_url);
if !video_items.is_empty() {
@@ -665,6 +750,18 @@ impl SpankbangProvider {
return Ok(video_items);
}
report_provider_error(
"spankbang",
"get.parse_empty",
&format!("url={video_url}; looks_like_html={looks_like_html}"),
)
.await;
let curl_cffi_items = self.fetch_items_with_curl_cffi(&video_url, proxy_base_url);
if !curl_cffi_items.is_empty() {
cache.remove(&video_url);
cache.insert(video_url.clone(), curl_cffi_items.clone());
return Ok(curl_cffi_items);
}
Ok(old_items)
}
@@ -686,7 +783,6 @@ impl SpankbangProvider {
}
None => vec![],
};
let mut requester =
requester_or_default(&options, "spankbang", "spankbang.query.missing_requester");
let text = match requester
@@ -701,13 +797,12 @@ impl SpankbangProvider {
&format!("url={video_url}; error={e}"),
)
.await;
let fallback_items = self
.fallback_items_with_working_media(&video_url, &options)
.await;
if !fallback_items.is_empty() {
let proxy_base_url = options.public_url_base.as_deref().unwrap_or_default();
let curl_cffi_items = self.fetch_items_with_curl_cffi(&video_url, proxy_base_url);
if !curl_cffi_items.is_empty() {
cache.remove(&video_url);
cache.insert(video_url.clone(), fallback_items.clone());
return Ok(fallback_items);
cache.insert(video_url.clone(), curl_cffi_items.clone());
return Ok(curl_cffi_items);
}
return Ok(old_items);
}
@@ -720,13 +815,12 @@ impl SpankbangProvider {
&format!("url={video_url}"),
)
.await;
let fallback_items = self
.fallback_items_with_working_media(&video_url, &options)
.await;
if !fallback_items.is_empty() {
let proxy_base_url = options.public_url_base.as_deref().unwrap_or_default();
let curl_cffi_items = self.fetch_items_with_curl_cffi(&video_url, proxy_base_url);
if !curl_cffi_items.is_empty() {
cache.remove(&video_url);
cache.insert(video_url.clone(), fallback_items.clone());
return Ok(fallback_items);
cache.insert(video_url.clone(), curl_cffi_items.clone());
return Ok(curl_cffi_items);
}
return Ok(old_items);
}
@@ -738,17 +832,17 @@ impl SpankbangProvider {
&format!("url={video_url}"),
)
.await;
let fallback_items = self
.fallback_items_with_working_media(&video_url, &options)
.await;
if !fallback_items.is_empty() {
let proxy_base_url = options.public_url_base.as_deref().unwrap_or_default();
let curl_cffi_items = self.fetch_items_with_curl_cffi(&video_url, proxy_base_url);
if !curl_cffi_items.is_empty() {
cache.remove(&video_url);
cache.insert(video_url.clone(), fallback_items.clone());
return Ok(fallback_items);
cache.insert(video_url.clone(), curl_cffi_items.clone());
return Ok(curl_cffi_items);
}
return Ok(old_items);
}
let looks_like_html = text.to_ascii_lowercase().contains("<html");
let proxy_base_url = options.public_url_base.as_deref().unwrap_or_default();
let video_items = self.get_video_items_from_html(text, proxy_base_url);
if !video_items.is_empty() {
@@ -757,6 +851,18 @@ impl SpankbangProvider {
return Ok(video_items);
}
report_provider_error(
"spankbang",
"query.parse_empty",
&format!("url={video_url}; looks_like_html={looks_like_html}"),
)
.await;
let curl_cffi_items = self.fetch_items_with_curl_cffi(&video_url, proxy_base_url);
if !curl_cffi_items.is_empty() {
cache.remove(&video_url);
cache.insert(video_url.clone(), curl_cffi_items.clone());
return Ok(curl_cffi_items);
}
Ok(old_items)
}
}
@@ -857,7 +963,38 @@ mod tests {
);
assert_eq!(
provider.request_headers(),
vec![("Referer".to_string(), "https://spankbang.com/".to_string())]
vec![
(
"accept".to_string(),
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
.to_string(),
),
("accept-language".to_string(), "en-US,en;q=0.6".to_string()),
("cache-control".to_string(), "no-cache".to_string()),
("pragma".to_string(), "no-cache".to_string()),
("priority".to_string(), "u=0, i".to_string()),
(
"sec-ch-ua".to_string(),
r#""Chromium";v="146", "Not-A.Brand";v="24", "Brave";v="146""#.to_string(),
),
("sec-ch-ua-mobile".to_string(), "?0".to_string()),
("sec-ch-ua-platform".to_string(), "\"Linux\"".to_string()),
("sec-fetch-dest".to_string(), "document".to_string()),
("sec-fetch-mode".to_string(), "navigate".to_string()),
("sec-fetch-site".to_string(), "none".to_string()),
("sec-fetch-user".to_string(), "?1".to_string()),
("sec-gpc".to_string(), "1".to_string()),
(
"upgrade-insecure-requests".to_string(),
"1".to_string(),
),
(
"user-agent".to_string(),
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36"
.to_string(),
),
("Referer".to_string(), "https://spankbang.com/".to_string()),
]
);
}