From 54269370755d7cb2462a0615a4ef5a3f2c050045 Mon Sep 17 00:00:00 2001 From: Niklas Sombert Date: Thu, 18 Jan 2018 13:29:18 +0100 Subject: [PATCH 1/6] [hhu] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/hhu.py | 72 ++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 youtube_dl/extractor/hhu.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 44120cae2..81021d5b5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -438,6 +438,7 @@ from .hellporno import HellPornoIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE from .hgtv import HGTVComShowIE +from .hhu import HHUIE from .hketv import HKETVIE from .hidive import HiDiveIE from .historicfilms import HistoricFilmsIE diff --git a/youtube_dl/extractor/hhu.py b/youtube_dl/extractor/hhu.py new file mode 100644 index 000000000..5ecf4a9bb --- /dev/null +++ b/youtube_dl/extractor/hhu.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class HHUIE(InfoExtractor): + _VALID_URL = r'https://mediathek\.hhu\.de/watch/(?P.+)' + _TEST = { + 'url': 'https://mediathek.hhu.de/watch/2dd05982-ea45-4108-9620-0c36e6ed8df5', + 'md5': 'b99ff77f2148b1e754555abdf53f0e51', + 'info_dict': { + 'id': '2dd05982-ea45-4108-9620-0c36e6ed8df5', + 'ext': 'mp4', + 'title': 'Das Multimediazentrum', + 'description': '', + 'uploader_id': 'clames', + 'thumbnail': 'https://mediathek.hhu.de/thumbs/2dd05982-ea45-4108-9620-0c36e6ed8df5/thumb_000.jpg', + } + } + + def _real_extract(self, url): + # TODO: Login for some videos. + video_id = self._match_id(url) + webpage, webpage_url = self._download_webpage_handle(url, video_id) + if webpage_url.geturl().startswith("https://sts."): + self.raise_login_required() + file_id = self._html_search_regex( + r"{ file: '\/movies\/(.+?)\/v_100\.mp4', label: '", + webpage, 'file_id' + ) + formats = [ + ({'url': format_url.format(file_id)}) + for format_url in ( + 'https://mediathek.hhu.de/movies/{}/v_10.webm', + 'https://mediathek.hhu.de/movies/{}/v_10.mp4', + 'https://mediathek.hhu.de/movies/{}/v_50.webm', + 'https://mediathek.hhu.de/movies/{}/v_50.mp4', + 'https://mediathek.hhu.de/movies/{}/v_100.webm', + 'https://mediathek.hhu.de/movies/{}/v_100.mp4', + ) + ] + try: + title = self._og_search_title(webpage) + except: + title = self._html_search_regex( + r'

\s+(.+?)\s+<\/h1>', + webpage, 'title' + ) + try: + description = self._og_search_description(webpage) + except: + description = self._html_search_regex( + r'

\s+(.+?)\s+<\/p>', + webpage, 'description', fatal=False + ) + thumbnail = self._og_search_property( + 'image:secure_url', webpage, 'thumbnail' + ) + uploader_id = self._html_search_regex( + r'(.+?)<\/a>', + webpage, 'uploader', fatal=False + ) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'uploader_id': uploader_id, + 'thumbnail': thumbnail, + 'formats': formats, + } From f08371c07ced991c580412e0b1672e52a4e1e5b5 Mon Sep 17 00:00:00 2001 From: Niklas Sombert Date: Wed, 2 Oct 2019 21:18:36 +0200 Subject: [PATCH 2/6] [hhu] Parse video player config --- youtube_dl/extractor/hhu.py | 98 ++++++++++++++++++++++++++----------- 1 file changed, 69 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/hhu.py b/youtube_dl/extractor/hhu.py index 5ecf4a9bb..c994662e2 100644 --- a/youtube_dl/extractor/hhu.py +++ b/youtube_dl/extractor/hhu.py @@ -2,6 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import js_to_json, RegexNotFoundError, urljoin + +import json +import re class HHUIE(InfoExtractor): @@ -20,53 +24,89 @@ class HHUIE(InfoExtractor): } def _real_extract(self, url): - # TODO: Login for some videos. video_id = self._match_id(url) webpage, webpage_url = self._download_webpage_handle(url, video_id) if webpage_url.geturl().startswith("https://sts."): self.raise_login_required() - file_id = self._html_search_regex( - r"{ file: '\/movies\/(.+?)\/v_100\.mp4', label: '", - webpage, 'file_id' - ) - formats = [ - ({'url': format_url.format(file_id)}) - for format_url in ( - 'https://mediathek.hhu.de/movies/{}/v_10.webm', - 'https://mediathek.hhu.de/movies/{}/v_10.mp4', - 'https://mediathek.hhu.de/movies/{}/v_50.webm', - 'https://mediathek.hhu.de/movies/{}/v_50.mp4', - 'https://mediathek.hhu.de/movies/{}/v_100.webm', - 'https://mediathek.hhu.de/movies/{}/v_100.mp4', - ) - ] + # Some videos need a login, maybe TODO. try: - title = self._og_search_title(webpage) - except: + config_js = self._search_regex( + r'playerInstance\.setup\(([^;]+)\);', webpage, 'config_js' + ) + # remove 'link: encodeURI(""),' + if 'link: encodeURI' in config_js: + encode_begin = config_js.find('link: encodeURI') + encode_end = config_js.find(')', encode_begin) + config_js = ( + config_js[:encode_begin] + config_js[encode_end + 2:] + ) + del encode_begin, encode_end + config = json.loads(js_to_json(config_js)) + if len(config['playlist']) > 1: + self.report_warning( + 'more than one video, just taking the first one' + ) + video = config['playlist'][0] + formats = [ + { + 'url': urljoin('https://mediathek.hhu.de/', source['file']), + 'format_note': source.get('label'), + 'format_id': source['file'].split("/")[-1], + } + for source in video['sources'] + ] + formats.reverse() # config sorts from highest to lowest quality + title = video.get('title') + thumbnail = video.get('image') + thumbnail = urljoin('https://mediathek.hhu.de/', thumbnail) if thumbnail else None + + except (RegexNotFoundError, ValueError): + self.report_warning('failed to get player config, guessing formats') + # This will likely work but better warn. + file_id = self._html_search_regex( + r"{ file: '\/movies\/(.+?)\/v_100\.mp4', label: '", + webpage, 'file_id' + ) + formats = [ + ({'url': format_url.format(file_id)}) + for format_url in ( + 'https://mediathek.hhu.de/movies/{}/v_10.webm', + 'https://mediathek.hhu.de/movies/{}/v_10.mp4', + 'https://mediathek.hhu.de/movies/{}/v_50.webm', + 'https://mediathek.hhu.de/movies/{}/v_50.mp4', + 'https://mediathek.hhu.de/movies/{}/v_100.webm', + 'https://mediathek.hhu.de/movies/{}/v_100.mp4', + ) + ] + title = thumbnail = None + if not title: title = self._html_search_regex( r'

\s+(.+?)\s+<\/h1>', webpage, 'title' ) - try: - description = self._og_search_description(webpage) - except: - description = self._html_search_regex( - r'

\s+(.+?)\s+<\/p>', - webpage, 'description', fatal=False - ) - thumbnail = self._og_search_property( - 'image:secure_url', webpage, 'thumbnail' + if not title: + title = self._og_search_title(webpage, fatal=False) + description = self._html_search_regex( + r'

\s+(.+?)\s+<\/p>', + webpage, 'description', fatal=False ) - uploader_id = self._html_search_regex( + if not description: + description = self._og_search_description(webpage, default='') + if not thumbnail: + thumbnail = self._og_search_property( + 'image:secure_url', webpage, 'thumbnail', fatal=False + ) + uploader = self._html_search_regex( r'(.+?)<\/a>', webpage, 'uploader', fatal=False ) + return { 'id': video_id, 'title': title, 'description': description, - 'uploader_id': uploader_id, + 'uploader': uploader, 'thumbnail': thumbnail, 'formats': formats, } From 9f07fb23820cc962e365c4f712c9da7c81e9841d Mon Sep 17 00:00:00 2001 From: Niklas Sombert Date: Wed, 2 Oct 2019 21:18:57 +0200 Subject: [PATCH 3/6] [hhu] Add more details --- youtube_dl/extractor/hhu.py | 57 +++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/hhu.py b/youtube_dl/extractor/hhu.py index c994662e2..ca4a36e18 100644 --- a/youtube_dl/extractor/hhu.py +++ b/youtube_dl/extractor/hhu.py @@ -2,7 +2,9 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import js_to_json, RegexNotFoundError, urljoin +from ..utils import ( + js_to_json, RegexNotFoundError, urljoin, get_element_by_id, unified_strdate +) import json import re @@ -18,7 +20,18 @@ class HHUIE(InfoExtractor): 'ext': 'mp4', 'title': 'Das Multimediazentrum', 'description': '', + 'categories': ['Imagefilme'], + 'tags': [ + 'MMZ', 'Multimediazentrum', 'Heinrich-Heine-Universität', + 'UKD', 'eLearning', 'Abstimmsysteme', 'Portale', + 'Studierendenportal', 'Lehrfilme', 'Lehrfilm', + 'Operationsfilme', 'Vorlesungsaufzeichnung', 'Multimedia', + 'ZIM', 'HHU', 'Ute', 'Clames', # yes, that's incorrect + ], + 'uploader': 'clames', 'uploader_id': 'clames', + 'license': 'CC BY 3.0 DE', + 'upload_date': '20150126', 'thumbnail': 'https://mediathek.hhu.de/thumbs/2dd05982-ea45-4108-9620-0c36e6ed8df5/thumb_000.jpg', } } @@ -100,13 +113,53 @@ class HHUIE(InfoExtractor): r'(.+?)<\/a>', webpage, 'uploader', fatal=False ) - + uploader_id = self._html_search_regex( + r'.+?<\/a>', + webpage, 'uploader_id', fatal=False + ) + # CC licenses get a image with an appropriate alt text + license_img = get_element_by_id('mt_watch_license', webpage) + if license_img: + license = self._search_regex( + r'alt="(.+)"', license_img, 'license_img', fatal=False + ) + if not license_img or not license: + # other licenses are just text + license = self._html_search_regex( + r'

(.+)<\/div>', + webpage, 'license_text', fatal=False + ) + upload_date = _date(self._html_search_regex( + r'(.+?)<\/span>', + webpage, 'upload_date', fatal=False + )) + category = self._html_search_regex( + r'(.+)', webpage, 'category', fatal=False + ) + tags_html = get_element_by_id('mt_watch_info_tag_list', webpage) + tags = _tags(tags_html) return { 'id': video_id, 'title': title, 'description': description, + 'license': license, + 'categories': [category], # there's just one category per video + 'tags': tags, 'uploader': uploader, + 'uploader_id': uploader_id, + 'upload_date': upload_date, 'thumbnail': thumbnail, 'formats': formats, } + + +def _date(str_containing_date): + """Parse the string 'at (M)M/(D)D/YYYY' to YYYYMMDD.""" + return unified_strdate(str_containing_date.split(' ')[1], day_first=False) + + +def _tags(tags_html): + """Parse the HTML markup containing the tags.""" + matches = re.findall(r'(.+)<\/a>', tags_html) + return [match.rstrip(',') for match in matches] From 233400f3d92dd3ac8ef4664f9679eb09c9132d98 Mon Sep 17 00:00:00 2001 From: Niklas Sombert Date: Wed, 9 Oct 2019 20:04:51 +0200 Subject: [PATCH 4/6] [hhu] Don't place closing braces/brackets/parentheses on their own lines --- youtube_dl/extractor/hhu.py | 59 ++++++++++++------------------------- 1 file changed, 19 insertions(+), 40 deletions(-) diff --git a/youtube_dl/extractor/hhu.py b/youtube_dl/extractor/hhu.py index ca4a36e18..a23ae7d96 100644 --- a/youtube_dl/extractor/hhu.py +++ b/youtube_dl/extractor/hhu.py @@ -26,15 +26,12 @@ class HHUIE(InfoExtractor): 'UKD', 'eLearning', 'Abstimmsysteme', 'Portale', 'Studierendenportal', 'Lehrfilme', 'Lehrfilm', 'Operationsfilme', 'Vorlesungsaufzeichnung', 'Multimedia', - 'ZIM', 'HHU', 'Ute', 'Clames', # yes, that's incorrect - ], + 'ZIM', 'HHU', 'Ute', 'Clames', ], # yes, that's incorrect 'uploader': 'clames', 'uploader_id': 'clames', 'license': 'CC BY 3.0 DE', 'upload_date': '20150126', - 'thumbnail': 'https://mediathek.hhu.de/thumbs/2dd05982-ea45-4108-9620-0c36e6ed8df5/thumb_000.jpg', - } - } + 'thumbnail': 'https://mediathek.hhu.de/thumbs/2dd05982-ea45-4108-9620-0c36e6ed8df5/thumb_000.jpg', }} def _real_extract(self, url): video_id = self._match_id(url) @@ -44,30 +41,25 @@ class HHUIE(InfoExtractor): # Some videos need a login, maybe TODO. try: config_js = self._search_regex( - r'playerInstance\.setup\(([^;]+)\);', webpage, 'config_js' - ) + r'playerInstance\.setup\(([^;]+)\);', webpage, 'config_js') # remove 'link: encodeURI(""),' if 'link: encodeURI' in config_js: encode_begin = config_js.find('link: encodeURI') encode_end = config_js.find(')', encode_begin) config_js = ( - config_js[:encode_begin] + config_js[encode_end + 2:] - ) + config_js[:encode_begin] + config_js[encode_end + 2:]) del encode_begin, encode_end config = json.loads(js_to_json(config_js)) if len(config['playlist']) > 1: self.report_warning( - 'more than one video, just taking the first one' - ) + 'more than one video, just taking the first one') video = config['playlist'][0] formats = [ { 'url': urljoin('https://mediathek.hhu.de/', source['file']), 'format_note': source.get('label'), - 'format_id': source['file'].split("/")[-1], - } - for source in video['sources'] - ] + 'format_id': source['file'].split("/")[-1], } + for source in video['sources']] formats.reverse() # config sorts from highest to lowest quality title = video.get('title') thumbnail = video.get('image') @@ -78,8 +70,7 @@ class HHUIE(InfoExtractor): # This will likely work but better warn. file_id = self._html_search_regex( r"{ file: '\/movies\/(.+?)\/v_100\.mp4', label: '", - webpage, 'file_id' - ) + webpage, 'file_id') formats = [ ({'url': format_url.format(file_id)}) for format_url in ( @@ -88,54 +79,43 @@ class HHUIE(InfoExtractor): 'https://mediathek.hhu.de/movies/{}/v_50.webm', 'https://mediathek.hhu.de/movies/{}/v_50.mp4', 'https://mediathek.hhu.de/movies/{}/v_100.webm', - 'https://mediathek.hhu.de/movies/{}/v_100.mp4', - ) - ] + 'https://mediathek.hhu.de/movies/{}/v_100.mp4',)] title = thumbnail = None if not title: title = self._html_search_regex( r'

\s+(.+?)\s+<\/h1>', - webpage, 'title' - ) + webpage, 'title') if not title: title = self._og_search_title(webpage, fatal=False) description = self._html_search_regex( r'

\s+(.+?)\s+<\/p>', - webpage, 'description', fatal=False - ) + webpage, 'description', fatal=False) if not description: description = self._og_search_description(webpage, default='') if not thumbnail: thumbnail = self._og_search_property( - 'image:secure_url', webpage, 'thumbnail', fatal=False - ) + 'image:secure_url', webpage, 'thumbnail', fatal=False) uploader = self._html_search_regex( r'(.+?)<\/a>', - webpage, 'uploader', fatal=False - ) + webpage, 'uploader', fatal=False) uploader_id = self._html_search_regex( r'.+?<\/a>', - webpage, 'uploader_id', fatal=False - ) + webpage, 'uploader_id', fatal=False) # CC licenses get a image with an appropriate alt text license_img = get_element_by_id('mt_watch_license', webpage) if license_img: license = self._search_regex( - r'alt="(.+)"', license_img, 'license_img', fatal=False - ) + r'alt="(.+)"', license_img, 'license_img', fatal=False) if not license_img or not license: # other licenses are just text license = self._html_search_regex( r'

(.+)<\/div>', - webpage, 'license_text', fatal=False - ) + webpage, 'license_text', fatal=False) upload_date = _date(self._html_search_regex( r'(.+?)<\/span>', - webpage, 'upload_date', fatal=False - )) + webpage, 'upload_date', fatal=False)) category = self._html_search_regex( - r'(.+)', webpage, 'category', fatal=False - ) + r'(.+)', webpage, 'category', fatal=False) tags_html = get_element_by_id('mt_watch_info_tag_list', webpage) tags = _tags(tags_html) @@ -150,8 +130,7 @@ class HHUIE(InfoExtractor): 'uploader_id': uploader_id, 'upload_date': upload_date, 'thumbnail': thumbnail, - 'formats': formats, - } + 'formats': formats, } def _date(str_containing_date): From dada9f6db9427f0e8738ecf6551a9a0fc865bbb0 Mon Sep 17 00:00:00 2001 From: Niklas Sombert Date: Wed, 9 Oct 2019 21:46:05 +0200 Subject: [PATCH 5/6] [common] _parse_jwplayer_formats: Accept more labels as formats Some labels are of the form 'low quality (320p)'. This commit changes the regex so, that the whole label is searched for the number, not just the beginning. --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 859786617..4d9ee81ab 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2750,7 +2750,7 @@ class InfoExtractor(object): # Often no height is provided but there is a label in # format like "1080p", "720p SD", or 1080. height = int_or_none(self._search_regex( - r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''), + r'(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''), 'height', default=None)) a_format = { 'url': source_url, From 4d52506dbefbd4dafd90689fcce820f522bad983 Mon Sep 17 00:00:00 2001 From: Niklas Sombert Date: Wed, 9 Oct 2019 21:39:40 +0200 Subject: [PATCH 6/6] [hhu] Use _parse_jwplayer_data --- youtube_dl/extractor/hhu.py | 92 ++++++++++++++----------------------- 1 file changed, 35 insertions(+), 57 deletions(-) diff --git a/youtube_dl/extractor/hhu.py b/youtube_dl/extractor/hhu.py index a23ae7d96..13eaca6fc 100644 --- a/youtube_dl/extractor/hhu.py +++ b/youtube_dl/extractor/hhu.py @@ -3,10 +3,9 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - js_to_json, RegexNotFoundError, urljoin, get_element_by_id, unified_strdate + js_to_json, RegexNotFoundError, get_element_by_id, unified_strdate ) -import json import re @@ -49,88 +48,67 @@ class HHUIE(InfoExtractor): config_js = ( config_js[:encode_begin] + config_js[encode_end + 2:]) del encode_begin, encode_end - config = json.loads(js_to_json(config_js)) - if len(config['playlist']) > 1: - self.report_warning( - 'more than one video, just taking the first one') - video = config['playlist'][0] - formats = [ - { - 'url': urljoin('https://mediathek.hhu.de/', source['file']), - 'format_note': source.get('label'), - 'format_id': source['file'].split("/")[-1], } - for source in video['sources']] - formats.reverse() # config sorts from highest to lowest quality - title = video.get('title') - thumbnail = video.get('image') - thumbnail = urljoin('https://mediathek.hhu.de/', thumbnail) if thumbnail else None - + config = self._parse_json( + config_js, video_id, transform_source=js_to_json) + info = self._parse_jwplayer_data( + config, video_id, require_title=False, + base_url='https://mediathek.hhu.de/') except (RegexNotFoundError, ValueError): self.report_warning('failed to get player config, guessing formats') # This will likely work but better warn. file_id = self._html_search_regex( r"{ file: '\/movies\/(.+?)\/v_100\.mp4', label: '", webpage, 'file_id') - formats = [ - ({'url': format_url.format(file_id)}) - for format_url in ( - 'https://mediathek.hhu.de/movies/{}/v_10.webm', - 'https://mediathek.hhu.de/movies/{}/v_10.mp4', - 'https://mediathek.hhu.de/movies/{}/v_50.webm', - 'https://mediathek.hhu.de/movies/{}/v_50.mp4', - 'https://mediathek.hhu.de/movies/{}/v_100.webm', - 'https://mediathek.hhu.de/movies/{}/v_100.mp4',)] - title = thumbnail = None - if not title: - title = self._html_search_regex( + info = { + 'video_id': video_id, + 'formats': [ + ({'url': format_url.format(file_id)}) + for format_url in ( + 'https://mediathek.hhu.de/movies/{}/v_10.webm', + 'https://mediathek.hhu.de/movies/{}/v_10.mp4', + 'https://mediathek.hhu.de/movies/{}/v_50.webm', + 'https://mediathek.hhu.de/movies/{}/v_50.mp4', + 'https://mediathek.hhu.de/movies/{}/v_100.webm', + 'https://mediathek.hhu.de/movies/{}/v_100.mp4',)]} + if not info.get('title'): + info['title'] = self._html_search_regex( r'

\s+(.+?)\s+<\/h1>', webpage, 'title') - if not title: - title = self._og_search_title(webpage, fatal=False) - description = self._html_search_regex( + if not info.get('title'): + info['title'] = self._og_search_title(webpage, fatal=False) + info['description'] = self._html_search_regex( r'

\s+(.+?)\s+<\/p>', webpage, 'description', fatal=False) - if not description: - description = self._og_search_description(webpage, default='') - if not thumbnail: - thumbnail = self._og_search_property( + if not info.get('description'): + info['description'] = self._og_search_description(webpage, default='') + if not info.get('thumbnail'): + info['thumbnail'] = self._og_search_property( 'image:secure_url', webpage, 'thumbnail', fatal=False) - uploader = self._html_search_regex( + info['uploader'] = self._html_search_regex( r'(.+?)<\/a>', webpage, 'uploader', fatal=False) - uploader_id = self._html_search_regex( + info['uploader_id'] = self._html_search_regex( r'.+?<\/a>', webpage, 'uploader_id', fatal=False) # CC licenses get a image with an appropriate alt text license_img = get_element_by_id('mt_watch_license', webpage) if license_img: - license = self._search_regex( + info['license'] = self._search_regex( r'alt="(.+)"', license_img, 'license_img', fatal=False) - if not license_img or not license: + if not license_img or not info.get('license'): # other licenses are just text - license = self._html_search_regex( + info['license'] = self._html_search_regex( r'

(.+)<\/div>', webpage, 'license_text', fatal=False) - upload_date = _date(self._html_search_regex( + info['upload_date'] = _date(self._html_search_regex( r'(.+?)<\/span>', webpage, 'upload_date', fatal=False)) category = self._html_search_regex( r'(.+)', webpage, 'category', fatal=False) + info['categories'] = [category] # there's just one category per video tags_html = get_element_by_id('mt_watch_info_tag_list', webpage) - tags = _tags(tags_html) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'license': license, - 'categories': [category], # there's just one category per video - 'tags': tags, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'upload_date': upload_date, - 'thumbnail': thumbnail, - 'formats': formats, } + info['tags'] = _tags(tags_html) + return info def _date(str_containing_date):