From a613d24c4bb4b7ec6088102a4e4cb646a1567c4b Mon Sep 17 00:00:00 2001 From: Kyle Date: Thu, 23 May 2019 19:48:42 +0900 Subject: [PATCH] [criterion] Fix extractor --- youtube_dl/extractor/criterion.py | 143 +++++++++++++++++++++++------- 1 file changed, 113 insertions(+), 30 deletions(-) diff --git a/youtube_dl/extractor/criterion.py b/youtube_dl/extractor/criterion.py index f7815b905..00c91d7ce 100644 --- a/youtube_dl/extractor/criterion.py +++ b/youtube_dl/extractor/criterion.py @@ -1,39 +1,122 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor +import re + +from ..utils import RegexNotFoundError, str_or_none, url_or_none +from .vimeo import VimeoBaseInfoExtractor -class CriterionIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?criterion\.com/films/(?P[0-9]+)-.+' - _TEST = { - 'url': 'http://www.criterion.com/films/184-le-samourai', - 'md5': 'bc51beba55685509883a9a7830919ec3', - 'info_dict': { - 'id': '184', - 'ext': 'mp4', - 'title': 'Le Samouraï', - 'description': 'md5:a2b4b116326558149bef81f76dcbb93f', - 'thumbnail': r're:^https?://.*\.jpg$', - } - } +class CriterionIE(VimeoBaseInfoExtractor): + IE_NAME = 'criterion' + IE_DESC = 'The Criterion Collection' + _VALID_URL = r'https?://(?:www\.)?criterion\.com/films/(?P\d+)-.+' + _TESTS = [ + { + 'url': 'http://www.criterion.com/films/184-le-samourai', + 'info_dict': { + 'id': '265399901', + 'title': 'Le samouraï', + 'description': 'md5:56ad66935158c6c88d4e391397c00d22', + 'uploader': 'Criterion Collection', + 'uploader_id': 'criterioncollection', + 'uploader_url': 'https://vimeo.com/criterioncollection', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': {'skip_download': True}, + }, + { + 'url': 'https://www.criterion.com/films/28986-the-heiress', + 'info_dict': { + 'id': '315291282', + 'title': 'The Heiress', + 'description': 'md5:3d34fe5e6ff5520998b13137eba0f7ce', + 'uploader': 'Criterion Collection', + 'uploader_id': 'criterioncollection', + 'uploader_url': 'https://vimeo.com/criterioncollection', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': {'skip_download': True}, + }, + { + 'url': 'https://www.criterion.com/films/28836-funny-games', + 'info_dict': { + 'id': '316586307', + 'title': 'Funny Games', + 'description': 'md5:64326c0cd08a6a582c10d63349941250', + 'uploader': 'Criterion Collection', + 'uploader_id': 'criterioncollection', + 'uploader_url': 'https://vimeo.com/criterioncollection', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': {'skip_download': True}, + }, + { + 'url': 'https://www.criterion.com/films/613-the-magic-flute', + 'info_dict': { + 'id': '305845790', + 'title': 'The Magic Flute', + 'description': 'md5:9f232dcf15d9861c6a551662973482a5', + 'uploader': 'Criterion Collection', + 'uploader_id': 'criterioncollection', + 'uploader_url': 'https://vimeo.com/criterioncollection', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': {'skip_download': True}, + }, + ] + + @staticmethod + def _extract_video_id_and_url(webpage): + emb_re = r']+?src=["\'](?Phttps?:\/\/player\.vimeo\.com\/video\/(?P\d+)\?[^"\']+?)["\']' + + mobj = re.search(emb_re, webpage) + if mobj is None: + raise RegexNotFoundError('Unable to extract embedded id and url.') + + emb_id = str_or_none(mobj.group('id')) + emb_url = url_or_none(mobj.group('url')) + + return (emb_id, emb_url) + + def _get_config(self, webpage, video_id): + config_re = r'var\s*config\s*=\s*(?P.*?);' + config_str = self._search_regex( + config_re, webpage, 'json config data', group='data' + ) + + return self._parse_json(config_str, video_id) def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + criterion_id = self._match_id(url) + criterion_webpage = self._download_webpage(url, criterion_id) - final_url = self._search_regex( - r'so\.addVariable\("videoURL", "(.+?)"\)\;', webpage, 'video url') - title = self._og_search_title(webpage) - description = self._html_search_meta('description', webpage) - thumbnail = self._search_regex( - r'so\.addVariable\("thumbnailURL", "(.+?)"\)\;', - webpage, 'thumbnail url') + title = self._og_search_title( + criterion_webpage, default=None + ) or self._html_search_meta('twitter:title', criterion_webpage) + description = self._og_search_description( + criterion_webpage, default=None + ) or self._html_search_meta( + 'twitter:description', criterion_webpage, fatal=False + ) - return { - 'id': video_id, - 'url': final_url, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - } + video_id, video_url = CriterionIE._extract_video_id_and_url(criterion_webpage) + + video_webpage = self._download_webpage( + video_url, video_id, headers={'Referer': url} + ) + + config = self._get_config(video_webpage, video_id) + + info_dict = self._parse_config(config, video_id) + self._vimeo_sort_formats(info_dict['formats']) + + info_dict['id'] = video_id + info_dict['title'] = title + info_dict['description'] = description + + return info_dict