archivebate still needs work
This commit is contained in:
@@ -7,10 +7,11 @@ use crate::status::*;
|
|||||||
use crate::util::cache::VideoCache;
|
use crate::util::cache::VideoCache;
|
||||||
use crate::util::parse_abbreviated_number;
|
use crate::util::parse_abbreviated_number;
|
||||||
use crate::util::time::parse_time_to_seconds;
|
use crate::util::time::parse_time_to_seconds;
|
||||||
use crate::videos::{ServerOptions, VideoItem};
|
use crate::videos::{ServerOptions, VideoFormat, VideoItem};
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use chrono::{Duration as ChronoDuration, Utc};
|
use chrono::{Duration as ChronoDuration, Utc};
|
||||||
use error_chain::error_chain;
|
use error_chain::error_chain;
|
||||||
|
use futures::stream::{self, StreamExt};
|
||||||
use htmlentity::entity::{ICodedDataTrait, decode};
|
use htmlentity::entity::{ICodedDataTrait, decode};
|
||||||
use percent_encoding::{NON_ALPHANUMERIC, percent_decode_str, utf8_percent_encode};
|
use percent_encoding::{NON_ALPHANUMERIC, percent_decode_str, utf8_percent_encode};
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
@@ -20,6 +21,8 @@ use serde_json::Value;
|
|||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::sync::{Arc, RwLock};
|
use std::sync::{Arc, RwLock};
|
||||||
use std::thread;
|
use std::thread;
|
||||||
|
use std::time::Duration as StdDuration;
|
||||||
|
use tokio::time::timeout;
|
||||||
|
|
||||||
pub const CHANNEL_METADATA: crate::providers::ProviderChannelMetadata =
|
pub const CHANNEL_METADATA: crate::providers::ProviderChannelMetadata =
|
||||||
crate::providers::ProviderChannelMetadata {
|
crate::providers::ProviderChannelMetadata {
|
||||||
@@ -281,6 +284,9 @@ impl ArchivebateProvider {
|
|||||||
if value.starts_with("http://") || value.starts_with("https://") {
|
if value.starts_with("http://") || value.starts_with("https://") {
|
||||||
return value.to_string();
|
return value.to_string();
|
||||||
}
|
}
|
||||||
|
if value.starts_with("//") {
|
||||||
|
return format!("https:{value}");
|
||||||
|
}
|
||||||
format!(
|
format!(
|
||||||
"{}/{}",
|
"{}/{}",
|
||||||
self.url.trim_end_matches('/'),
|
self.url.trim_end_matches('/'),
|
||||||
@@ -711,6 +717,213 @@ impl ArchivebateProvider {
|
|||||||
Ok(items)
|
Ok(items)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn parse_mixin_packed_eval(html: &str) -> Option<String> {
|
||||||
|
let eval_regex = Regex::new(
|
||||||
|
r#"(?s)eval\(function\(p,a,c,k,e,d\)\{.*?\}\('(?P<payload>.*?)',\s*(?P<radix>[0-9]+),\s*(?P<count>[0-9]+),\s*'(?P<tokens>.*?)'\.split\('\|'\)"#,
|
||||||
|
)
|
||||||
|
.ok()?;
|
||||||
|
let captures = eval_regex.captures(html)?;
|
||||||
|
let payload_raw = captures.name("payload")?.as_str();
|
||||||
|
let radix = captures.name("radix")?.as_str().parse::<u32>().ok()?;
|
||||||
|
let count = captures.name("count")?.as_str().parse::<usize>().ok()?;
|
||||||
|
if !(2..=36).contains(&radix) {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
let payload = Self::unescape_js_single_quoted(payload_raw);
|
||||||
|
let tokens_raw = captures.name("tokens")?.as_str();
|
||||||
|
let tokens = tokens_raw.split('|').collect::<Vec<_>>();
|
||||||
|
let mut unpacked = payload;
|
||||||
|
|
||||||
|
for index in (0..count).rev() {
|
||||||
|
let Some(token) = tokens.get(index) else {
|
||||||
|
continue;
|
||||||
|
};
|
||||||
|
if token.is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let key = Self::to_radix(index, radix);
|
||||||
|
let pattern = format!(r"\b{}\b", regex::escape(&key));
|
||||||
|
let re = Regex::new(&pattern).ok()?;
|
||||||
|
unpacked = re.replace_all(&unpacked, *token).into_owned();
|
||||||
|
}
|
||||||
|
|
||||||
|
Some(unpacked)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decodes the backslash escapes of a JavaScript string-literal body.
///
/// Handles `\\`, `\'`, `\"`, `\n`, `\r`, `\t`, and additionally `\xNN` and
/// `\uNNNN` when they are followed by the required number of hex digits and
/// denote a valid Unicode scalar value. Any other escaped character is emitted
/// as-is without the backslash, and a malformed `\x` / `\u` falls back to that
/// same rule. A lone trailing backslash is dropped.
fn unescape_js_single_quoted(value: &str) -> String {
    /// Parses exactly `digits` leading hex digits of `rest`, if present.
    fn leading_hex(rest: &str, digits: usize) -> Option<u32> {
        let hex = rest.get(..digits)?;
        // `from_str_radix` would also accept a leading sign, so check first.
        if !hex.bytes().all(|byte| byte.is_ascii_hexdigit()) {
            return None;
        }
        u32::from_str_radix(hex, 16).ok()
    }

    let mut output = String::with_capacity(value.len());
    let mut chars = value.chars();
    while let Some(character) = chars.next() {
        if character != '\\' {
            output.push(character);
            continue;
        }
        let Some(next) = chars.next() else {
            break;
        };

        let hex_width = match next {
            'x' => 2,
            'u' => 4,
            _ => 0,
        };
        if hex_width > 0 {
            let rest = chars.as_str();
            if let Some(decoded) = leading_hex(rest, hex_width).and_then(char::from_u32) {
                output.push(decoded);
                // Skip the hex digits that were just consumed.
                chars = rest[hex_width..].chars();
                continue;
            }
        }

        output.push(match next {
            'n' => '\n',
            'r' => '\r',
            't' => '\t',
            other => other,
        });
    }
    output
}
|
||||||
|
|
||||||
|
/// Renders `value` as a positional number in base `radix`.
///
/// Digits are `0-9`, then `a-z`, then `A-Z`, which is the alphabet the
/// JavaScript packer uses for its dictionary keys (up to base 62).
///
/// # Panics
///
/// Panics when `radix` is outside `2..=62`. Without this check a radix of 1
/// would never terminate and a radix of 0 would divide by zero.
fn to_radix(mut value: usize, radix: u32) -> String {
    const ALPHABET: &[u8; 62] =
        b"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

    assert!(
        (2..=62).contains(&radix),
        "to_radix: radix must be in 2..=62, got {radix}"
    );
    if value == 0 {
        return "0".to_string();
    }

    let base = radix as usize;
    // Digits are produced least-significant first and reversed at the end.
    let mut digits = Vec::new();
    while value > 0 {
        digits.push(ALPHABET[value % base]);
        value /= base;
    }
    digits.iter().rev().map(|&digit| char::from(digit)).collect()
}
|
||||||
|
|
||||||
|
fn extract_mixdrop_media_url(html: &str) -> Option<String> {
|
||||||
|
let direct_regex = Regex::new(r#"MDCore\.wurl\s*=\s*"([^"]+)""#).ok()?;
|
||||||
|
if let Some(url) = direct_regex
|
||||||
|
.captures(html)
|
||||||
|
.and_then(|captures| captures.get(1).map(|value| value.as_str().to_string()))
|
||||||
|
{
|
||||||
|
return Some(Self::normalize_possible_protocol_relative(&url));
|
||||||
|
}
|
||||||
|
|
||||||
|
let unpacked = Self::parse_mixin_packed_eval(html)?;
|
||||||
|
let unpacked_regex = Regex::new(r#"MDCore\.wurl\s*=\s*"([^"]+)""#).ok()?;
|
||||||
|
unpacked_regex
|
||||||
|
.captures(&unpacked)
|
||||||
|
.and_then(|captures| captures.get(1).map(|value| value.as_str().to_string()))
|
||||||
|
.map(|value| Self::normalize_possible_protocol_relative(&value))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Trims surrounding whitespace from `value` and, when the result starts with
/// `//`, prefixes it with `https:`. Any other input is returned trimmed but
/// otherwise unchanged.
fn normalize_possible_protocol_relative(value: &str) -> String {
    let candidate = value.trim();
    match candidate.strip_prefix("//") {
        Some(rest) => format!("https://{rest}"),
        None => candidate.to_owned(),
    }
}
|
||||||
|
|
||||||
|
fn host_from_url(url: &str) -> Option<String> {
|
||||||
|
let parsed = url::Url::parse(url).ok()?;
|
||||||
|
parsed.host_str().map(|value| value.to_ascii_lowercase())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_mixdrop_host(url: &str) -> bool {
|
||||||
|
let Some(host) = Self::host_from_url(url) else {
|
||||||
|
return false;
|
||||||
|
};
|
||||||
|
host.contains("mixdrop") || host.contains("m1xdrop")
|
||||||
|
}
|
||||||
|
|
||||||
|
fn first_video_source_from_html(html: &str) -> Option<String> {
|
||||||
|
let document = Html::parse_document(html);
|
||||||
|
let source_selector = Selector::parse("video source[src]").ok()?;
|
||||||
|
let video_src_selector = Selector::parse("video[src]").ok()?;
|
||||||
|
|
||||||
|
if let Some(value) = document
|
||||||
|
.select(&source_selector)
|
||||||
|
.next()
|
||||||
|
.and_then(|node| node.value().attr("src"))
|
||||||
|
{
|
||||||
|
return Some(value.to_string());
|
||||||
|
}
|
||||||
|
document
|
||||||
|
.select(&video_src_selector)
|
||||||
|
.next()
|
||||||
|
.and_then(|node| node.value().attr("src"))
|
||||||
|
.map(|value| value.to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn first_iframe_source_from_html(html: &str) -> Option<String> {
|
||||||
|
let document = Html::parse_document(html);
|
||||||
|
let iframe_selector = Selector::parse("iframe[src]").ok()?;
|
||||||
|
document
|
||||||
|
.select(&iframe_selector)
|
||||||
|
.next()
|
||||||
|
.and_then(|node| node.value().attr("src"))
|
||||||
|
.map(|value| value.to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn resolve_mixdrop_media_from_iframe(
|
||||||
|
&self,
|
||||||
|
iframe_url: &str,
|
||||||
|
referer: &str,
|
||||||
|
options: &ServerOptions,
|
||||||
|
) -> Option<String> {
|
||||||
|
let mut requester = requester_or_default(options, CHANNEL_ID, "resolve_mixdrop_media");
|
||||||
|
let iframe_html = requester
|
||||||
|
.get_with_headers(
|
||||||
|
iframe_url,
|
||||||
|
self.html_headers(referer),
|
||||||
|
Some(wreq::Version::HTTP_11),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.ok()?;
|
||||||
|
Self::extract_mixdrop_media_url(&iframe_html)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn enrich_video(&self, item: VideoItem, options: &ServerOptions) -> VideoItem {
|
||||||
|
let page_url = item.url.clone();
|
||||||
|
let mut requester = requester_or_default(options, CHANNEL_ID, "archivebate.enrich_video");
|
||||||
|
let detail_html = match requester
|
||||||
|
.get_with_headers(
|
||||||
|
&page_url,
|
||||||
|
self.html_headers(&format!("{}/", self.url)),
|
||||||
|
Some(wreq::Version::HTTP_11),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(value) => value,
|
||||||
|
Err(error) => {
|
||||||
|
report_provider_error_background(
|
||||||
|
CHANNEL_ID,
|
||||||
|
"enrich_video.fetch_detail",
|
||||||
|
&format!("url={page_url}; error={error}"),
|
||||||
|
);
|
||||||
|
return item;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut media_url = Self::first_video_source_from_html(&detail_html)
|
||||||
|
.map(|value| self.absolute_url(&value));
|
||||||
|
|
||||||
|
if media_url.is_none() {
|
||||||
|
let iframe_url = Self::first_iframe_source_from_html(&detail_html)
|
||||||
|
.map(|value| self.absolute_url(&value));
|
||||||
|
if let Some(iframe_url) = iframe_url {
|
||||||
|
if Self::is_mixdrop_host(&iframe_url) {
|
||||||
|
if let Some(resolved) = self
|
||||||
|
.resolve_mixdrop_media_from_iframe(&iframe_url, &page_url, options)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
media_url = Some(resolved);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let Some(media_url) = media_url else {
|
||||||
|
return item;
|
||||||
|
};
|
||||||
|
|
||||||
|
let format = VideoFormat::new(media_url, "source".to_string(), "mp4".to_string());
|
||||||
|
let mut enriched = item;
|
||||||
|
enriched.formats = Some(vec![format]);
|
||||||
|
enriched
|
||||||
|
}
|
||||||
|
|
||||||
fn extract_csrf_token(html: &str) -> Option<String> {
|
fn extract_csrf_token(html: &str) -> Option<String> {
|
||||||
let regex = Regex::new(r#"<meta name="csrf-token" content="([^"]+)""#).ok()?;
|
let regex = Regex::new(r#"<meta name="csrf-token" content="([^"]+)""#).ok()?;
|
||||||
regex
|
regex
|
||||||
@@ -1047,13 +1260,35 @@ impl Provider for ArchivebateProvider {
|
|||||||
|
|
||||||
let result = match query {
|
let result = match query {
|
||||||
Some(query) if !query.trim().is_empty() => {
|
Some(query) if !query.trim().is_empty() => {
|
||||||
self.query(cache, page, per_page, &query, options).await
|
self.query(cache, page, per_page, &query, options.clone()).await
|
||||||
}
|
}
|
||||||
_ => self.get_default(cache, page, per_page, options).await,
|
_ => self.get_default(cache, page, per_page, options.clone()).await,
|
||||||
};
|
};
|
||||||
|
|
||||||
match result {
|
match result {
|
||||||
Ok(videos) => videos,
|
Ok(videos) => {
|
||||||
|
if videos.is_empty() {
|
||||||
|
return videos;
|
||||||
|
}
|
||||||
|
stream::iter(videos.into_iter().map(|video| {
|
||||||
|
let provider = self.clone();
|
||||||
|
let options = options.clone();
|
||||||
|
async move {
|
||||||
|
let timeout_result = timeout(
|
||||||
|
StdDuration::from_secs(8),
|
||||||
|
provider.enrich_video(video.clone(), &options),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
match timeout_result {
|
||||||
|
Ok(enriched) => enriched,
|
||||||
|
Err(_) => video,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
.buffer_unordered(4)
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
.await
|
||||||
|
}
|
||||||
Err(error) => {
|
Err(error) => {
|
||||||
report_provider_error(CHANNEL_ID, "get_videos", &error.to_string()).await;
|
report_provider_error(CHANNEL_ID, "get_videos", &error.to_string()).await;
|
||||||
vec![]
|
vec![]
|
||||||
@@ -1065,3 +1300,23 @@ impl Provider for ArchivebateProvider {
|
|||||||
Some(self.build_channel(clientversion))
|
Some(self.build_channel(clientversion))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::ArchivebateProvider;

    /// A page whose only copy of `MDCore.wurl` lives inside a packed
    /// `eval(...)` script must still yield the absolute https media URL.
    #[test]
    fn extracts_mixdrop_wurl_from_packed_eval() {
        let packed_page = r#"
<script>
eval(function(p,a,c,k,e,d){e=function(c){return c};if(!''.replace(/^/,String)){while(c--){d[c]=k[c]||c}k=[function(e){return d[e]}];e=function(){return'\\w+'};c=1};while(c--){if(k[c]){p=p.replace(new RegExp('\\b'+e(c)+'\\b','g'),k[c])}}return p}('1.2="//3.4.5/6/7.8?9=a&b=c";',13,13,'|MDCore|wurl|o230m5y6z|mxcontent|net|v2|r6pkwozjber741|mp4|s|TvNTJe3_z_6nKveumEHk8Q|e|1776460168'.split('|'),0,{}))
</script>
"#;
        let expected =
            "https://o230m5y6z.mxcontent.net/v2/r6pkwozjber741.mp4?s=TvNTJe3_z_6nKveumEHk8Q&e=1776460168";

        let resolved = ArchivebateProvider::extract_mixdrop_media_url(packed_page)
            .expect("expected mixdrop media url");

        assert_eq!(resolved, expected);
    }
}
|
||||||
|
|||||||
Reference in New Issue
Block a user