From 94b27f66ef055fcbc04619907794de88560ffb28 Mon Sep 17 00:00:00 2001 From: Kyle Date: Sun, 19 May 2019 21:55:00 +0900 Subject: [PATCH 1/3] [criterion] Fix extractor --- youtube_dl/extractor/criterion.py | 124 ++++++++++++++++++++++++------ 1 file changed, 102 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/criterion.py b/youtube_dl/extractor/criterion.py index f7815b905..fe7dce080 100644 --- a/youtube_dl/extractor/criterion.py +++ b/youtube_dl/extractor/criterion.py @@ -2,38 +2,118 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import int_or_none, mimetype2ext, str_or_none, try_get, url_or_none class CriterionIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?criterion\.com/films/(?P<id>[0-9]+)-.+' - _TEST = { - 'url': 'http://www.criterion.com/films/184-le-samourai', - 'md5': 'bc51beba55685509883a9a7830919ec3', - 'info_dict': { - 'id': '184', - 'ext': 'mp4', - 'title': 'Le Samouraï', - 'description': 'md5:a2b4b116326558149bef81f76dcbb93f', - 'thumbnail': r're:^https?://.*\.jpg$', - } - } + _VALID_URL = r'https?://(?:www\.)?criterion\.com/films/(?P<id>\d+)-.+' + _TESTS = [ + { + 'url': 'http://www.criterion.com/films/184-le-samourai', + 'md5': 'e80a6ec09375c58e0050b809238c4d39', + 'info_dict': { + 'id': '265399901', + 'title': 'Le samouraï', + 'ext': 'mp4', + 'description': 'md5:56ad66935158c6c88d4e391397c00d22', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, + { + 'url': 'https://www.criterion.com/films/28986-the-heiress', + 'md5': '7178b368986eed7c9bf362dd90472c74', + 'info_dict': { + 'id': '315291282', + 'title': 'The Heiress', + 'ext': 'mp4', + 'description': 'md5:3d34fe5e6ff5520998b13137eba0f7ce', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, + { + 'url': 'https://www.criterion.com/films/28836-funny-games', + 'md5': '6e36a90749755e600eeb57dc632e920d', + 'info_dict': { + 'id': '316586307', + 'title': 'Funny Games', + 'ext': 'mp4', + 'description': 'md5:64326c0cd08a6a582c10d63349941250', + 
'thumbnail': r're:^https?://.*\.jpg$', + }, + }, + { + 'url': 'https://www.criterion.com/films/613-the-magic-flute', + 'md5': '8458ac11d5809f3f2d8f9aec1afa2fd6', + 'info_dict': { + 'id': '305845790', + 'title': 'The Magic Flute', + 'ext': 'mp4', + 'description': 'md5:9f232dcf15d9861c6a551662973482a5', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, + ] + + def _extract_embedded_url(self, pattern, html, group): + return self._search_regex(pattern, html, 'embedded url', group=group) def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - final_url = self._search_regex( - r'so\.addVariable\("videoURL", "(.+?)"\)\;', webpage, 'video url') - title = self._og_search_title(webpage) - description = self._html_search_meta('description', webpage) - thumbnail = self._search_regex( - r'so\.addVariable\("thumbnailURL", "(.+?)"\)\;', - webpage, 'thumbnail url') + title = self._og_search_title(webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage + ) + description = self._og_search_description( + webpage, default=None + ) or self._html_search_meta('twitter:description', webpage, fatal=False) + + # Follow embedded url + embedded_re = r'<iframe[^>]+?src=["\'](?P<url>https?:\/\/player\.vimeo\.com\/video\/(?P<id>\d+)\?[^"\']+?)["\']' + embedded_id = str_or_none( + self._extract_embedded_url(embedded_re, webpage, group='id') + ) + embedded_url = url_or_none( + self._extract_embedded_url(embedded_re, webpage, group='url') + ) + + embedded_webpage = self._download_webpage( + embedded_url, embedded_id, headers={'Referer': url} + ) + + # Parse json data + data_re = r'var\s*config\s*=\s*(?P<data>.*?);' + data_str = self._search_regex( + data_re, embedded_webpage, 'json data', group='data' + ) + data = self._parse_json(data_str, embedded_id) + + final_id = str_or_none(try_get(data, lambda x: x['video']['id'])) + + videos = try_get(data, lambda x: x['request']['files']['progressive'], list) + formats = [] + for vid in videos: + 
formats.append( + { + 'url': url_or_none(vid.get('url')), + 'ext': mimetype2ext(vid.get('mime')), + 'resolution': str_or_none(vid.get('quality')), + 'height': int_or_none(vid.get('height')), + 'width': int_or_none(vid.get('width')), + 'fps': int_or_none(vid.get('fps')), + } + ) + formats.sort(key=lambda x: x['height']) + + thumb_data = try_get(data, lambda x: x['video']['thumbs'], dict) + thumbnails = [] + for url in thumb_data: + thumbnails.append({'url': url_or_none(thumb_data[url])}) return { - 'id': video_id, - 'url': final_url, + 'id': final_id, 'title': title, 'description': description, - 'thumbnail': thumbnail, + 'formats': formats, + 'thumbnails': thumbnails, } From 9aff332b535447f467775004df9bfcc7a463fb5f Mon Sep 17 00:00:00 2001 From: Kyle Date: Tue, 21 May 2019 18:25:30 +0900 Subject: [PATCH 2/3] Add hls and dash formats. --- youtube_dl/extractor/criterion.py | 105 ++++++++++++++++++++++++------ 1 file changed, 84 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/criterion.py b/youtube_dl/extractor/criterion.py index fe7dce080..22820b216 100644 --- a/youtube_dl/extractor/criterion.py +++ b/youtube_dl/extractor/criterion.py @@ -2,7 +2,14 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none, mimetype2ext, str_or_none, try_get, url_or_none +from ..utils import ( + determine_ext, + int_or_none, + mimetype2ext, + str_or_none, + try_get, + url_or_none, +) class CriterionIE(InfoExtractor): @@ -10,7 +17,6 @@ class CriterionIE(InfoExtractor): _TESTS = [ { 'url': 'http://www.criterion.com/films/184-le-samourai', - 'md5': 'e80a6ec09375c58e0050b809238c4d39', 'info_dict': { 'id': '265399901', 'title': 'Le samouraï', @@ -18,10 +24,10 @@ class CriterionIE(InfoExtractor): 'description': 'md5:56ad66935158c6c88d4e391397c00d22', 'thumbnail': r're:^https?://.*\.jpg$', }, + 'params': {'skip_download': True}, }, { 'url': 'https://www.criterion.com/films/28986-the-heiress', - 'md5': 
'7178b368986eed7c9bf362dd90472c74', 'info_dict': { 'id': '315291282', 'title': 'The Heiress', @@ -29,10 +35,10 @@ class CriterionIE(InfoExtractor): 'description': 'md5:3d34fe5e6ff5520998b13137eba0f7ce', 'thumbnail': r're:^https?://.*\.jpg$', }, + 'params': {'skip_download': True}, }, { 'url': 'https://www.criterion.com/films/28836-funny-games', - 'md5': '6e36a90749755e600eeb57dc632e920d', 'info_dict': { 'id': '316586307', 'title': 'Funny Games', @@ -40,10 +46,10 @@ class CriterionIE(InfoExtractor): 'description': 'md5:64326c0cd08a6a582c10d63349941250', 'thumbnail': r're:^https?://.*\.jpg$', }, + 'params': {'skip_download': True}, }, { 'url': 'https://www.criterion.com/films/613-the-magic-flute', - 'md5': '8458ac11d5809f3f2d8f9aec1afa2fd6', 'info_dict': { 'id': '305845790', 'title': 'The Magic Flute', @@ -51,12 +57,66 @@ class CriterionIE(InfoExtractor): 'description': 'md5:9f232dcf15d9861c6a551662973482a5', 'thumbnail': r're:^https?://.*\.jpg$', }, + 'params': {'skip_download': True}, }, ] def _extract_embedded_url(self, pattern, html, group): return self._search_regex(pattern, html, 'embedded url', group=group) + def _extract_formats_from_stream(self, data, final_id): + ext_formats = [] + if isinstance(data, dict): + cdns = data.get('cdns') + if cdns: + for cdn_name, cdn_data in cdns.items(): + url = url_or_none(cdn_data.get('url')) + + ext = determine_ext(url) + + if ext == 'm3u8': + ext_formats.extend( + self._extract_m3u8_formats( + url, + final_id, + ext='mp4', + m3u8_id=str_or_none(cdn_name) or 'hls', + fatal=False, + ) + ) + # dash mpd + if ext == 'json': + ext_formats.extend( + self._extract_mpd_formats( + url.replace('master.json', 'master.mpd'), + final_id, + mpd_id=cdn_name, + fatal=False, + ) + ) + return ext_formats + + def _extract_formats_from_other(self, src, data): + if not isinstance(data, list): + data = [data] + ext_formats = [] + if src == 'progressive': + for vid in data: + profile = str_or_none(vid.get('profile')) + ext_formats.append( + { + 
'url': url_or_none(vid.get('url')), + 'ext': mimetype2ext(vid.get('mime')), + 'format_id': vid.get('cdn') + (profile if profile else ''), + 'height': int_or_none(vid.get('height')), + 'width': int_or_none(vid.get('width')), + 'fps': int_or_none(vid.get('fps')), + } + ) + vid_id = vid.get('id') or '' + self.to_screen('%s: Downloading mp4 information' % (vid_id,)) + return ext_formats + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -81,7 +141,7 @@ class CriterionIE(InfoExtractor): embedded_url, embedded_id, headers={'Referer': url} ) - # Parse json data + # Grab json data data_re = r'var\s*config\s*=\s*(?P<data>.*?);' data_str = self._search_regex( data_re, embedded_webpage, 'json data', group='data' @@ -90,25 +150,28 @@ class CriterionIE(InfoExtractor): final_id = str_or_none(try_get(data, lambda x: x['video']['id'])) - videos = try_get(data, lambda x: x['request']['files']['progressive'], list) + stream_types = ('hls', 'dash',) + nonstream_types = ('progressive',) + + # Collect formats formats = [] - for vid in videos: - formats.append( - { - 'url': url_or_none(vid.get('url')), - 'ext': mimetype2ext(vid.get('mime')), - 'resolution': str_or_none(vid.get('quality')), - 'height': int_or_none(vid.get('height')), - 'width': int_or_none(vid.get('width')), - 'fps': int_or_none(vid.get('fps')), - } - ) - formats.sort(key=lambda x: x['height']) + sources = try_get(data, lambda x: x['request']['files'], dict) + if sources: + for src, src_data in sources.items(): + if src in stream_types: + formats.extend( + self._extract_formats_from_stream(src_data, final_id) + ) + elif src in nonstream_types: + formats.extend(self._extract_formats_from_other(src, src_data)) + + self._sort_formats(formats) thumb_data = try_get(data, lambda x: x['video']['thumbs'], dict) thumbnails = [] - for url in thumb_data: - thumbnails.append({'url': url_or_none(thumb_data[url])}) + if thumb_data: + for url in thumb_data: + 
thumbnails.append({'url': url_or_none(thumb_data[url])}) return { 'id': final_id, From 9e4dcb4a5bea040bca35bdb74dedc8d2dd00cec2 Mon Sep 17 00:00:00 2001 From: Kyle Date: Thu, 23 May 2019 14:59:26 +0900 Subject: [PATCH 3/3] Sort formats based on vimeo sort. --- youtube_dl/extractor/criterion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/criterion.py b/youtube_dl/extractor/criterion.py index 22820b216..a2df5b943 100644 --- a/youtube_dl/extractor/criterion.py +++ b/youtube_dl/extractor/criterion.py @@ -10,6 +10,7 @@ from ..utils import ( try_get, url_or_none, ) +from .vimeo import VimeoBaseInfoExtractor as VimeoBaseIE class CriterionIE(InfoExtractor): @@ -165,7 +166,7 @@ class CriterionIE(InfoExtractor): elif src in nonstream_types: formats.extend(self._extract_formats_from_other(src, src_data)) - self._sort_formats(formats) + VimeoBaseIE._vimeo_sort_formats(self, formats) thumb_data = try_get(data, lambda x: x['video']['thumbs'], dict) thumbnails = []