From c88611a77345fd6acd606182250d0a1d8441813b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Dro=C5=BCak?= Date: Thu, 30 Jul 2020 22:28:23 +0200 Subject: [PATCH 1/2] [cda] Fix extractor (fixes #24458) --- youtube_dl/extractor/cda.py | 63 ++++++++++++++++++++++++++++++++----- 1 file changed, 55 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index 0c3af23d5..6b1b411e2 100644 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -5,10 +5,12 @@ import codecs import re from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, float_or_none, int_or_none, + try_get, multipart_encode, parse_duration, random_birthday, @@ -98,6 +100,14 @@ class CDAIE(InfoExtractor): formats = [] + metadata_json = self._html_search_regex(r'''(?x) + <script[^>]+type=(["\'])application/ld\+json\1[^>]*> + (?P<metadata_json>(?:.|\n)+?) + </script> + ''', webpage, 'metadata_json', fatal=False, group='metadata_json') + + metadata = self._parse_json(metadata_json, 'metadata', fatal=False) + uploader = self._search_regex(r'''(?x) <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*> (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*?
@@ -106,9 +116,7 @@ class CDAIE(InfoExtractor): view_count = self._search_regex( r'Odsłony:(?:\s|&nbsp;)*([0-9]+)', webpage, 'view_count', default=None) - average_rating = self._search_regex( - r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)', - webpage, 'rating', fatal=False, group='rating_value') + average_rating = try_get(metadata, lambda x: x[0]['aggregateRating']['ratingValue'], str) info_dict = { 'id': video_id, @@ -123,6 +131,47 @@ class CDAIE(InfoExtractor): 'age_limit': 18 if need_confirm_age else 0, } + # Function extracted from cda.pl player.js script + def deobfuscate_video_url(url): + if not any(word in url for word in ['http', '.mp4', 'uggcf://']): + word_list = [ + '_XDDD', + '_CDA', + '_ADC', + '_CXD', + '_QWE', + '_Q5', + '_IKSDE', + ] + for word in word_list: + url = url.replace(word, '') + + url = compat_urllib_parse_unquote(url) + + char_list = list(url) + for i, char in enumerate(char_list): + char_code = ord(char) + if 33 <= char_code <= 126: + char_list[i] = chr(33 + ((char_code + 14) % 94)) + url = ''.join(char_list) + + url = url.replace('.cda.mp4', '') + url = url.replace('.2cda.pl', '.cda.pl') + url = url.replace('.3cda.pl', '.cda.pl') + + url = 'https://' + (url.replace('/upstream', '.mp4/upstream') + if '/upstream' in url else url + '.mp4') + + if 'http' not in url: + url = codecs.decode(url, 'rot_13') + + if 'mp4' not in url: + url += '.mp4' + + url = url.replace('adc.mp4', '.mp4') + + return url + def extract_format(page, version): json_str = self._html_search_regex( r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page, @@ -137,12 +186,10 @@ class CDAIE(InfoExtractor): if not video or 'file' not in video: self.report_warning('Unable to extract %s version information' % version) return - if video['file'].startswith('uggc'): - video['file'] = codecs.decode(video['file'], 'rot_13') - if video['file'].endswith('adc.mp4'): - video['file'] = video['file'].replace('adc.mp4', '.mp4') + + url = deobfuscate_video_url(video['file']) f = { -
'url': video['file'], + 'url': url, } m = re.search( r'<a[^>]+data-quality="(?P<format_id>[^"]+)"[^>]+href="[^"]+"[^>]+class="[^"]*quality-btn-active[^"]*">(?P<height>[0-9]+)p', From 12e9f45a37034e4c9752326e8768edbe2f11d9c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Dro=C5=BCak?= Date: Fri, 31 Jul 2020 13:26:17 +0200 Subject: [PATCH 2/2] Shorten the code --- youtube_dl/extractor/cda.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index 6b1b411e2..eb0e0657e 100644 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -148,12 +148,8 @@ class CDAIE(InfoExtractor): url = compat_urllib_parse_unquote(url) - char_list = list(url) - for i, char in enumerate(char_list): - char_code = ord(char) - if 33 <= char_code <= 126: - char_list[i] = chr(33 + ((char_code + 14) % 94)) - url = ''.join(char_list) + url = ''.join(map(lambda char: chr(33 + ((ord(char) + 14) % 94)) + if 33 <= ord(char) <= 126 else char, url)) url = url.replace('.cda.mp4', '') url = url.replace('.2cda.pl', '.cda.pl')