From 12efabd86d9dbbaeff161a1f3851fe751931facb Mon Sep 17 00:00:00 2001 From: Forcer78 Date: Wed, 8 Jun 2016 23:13:49 -0400 Subject: [PATCH] [appletrailers] Fix for newer JSON data sources --- youtube_dl/extractor/appletrailers.py | 154 ++++++++++++++++---------- 1 file changed, 96 insertions(+), 58 deletions(-) diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index be40f85b4..d9b3a850b 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -7,6 +7,8 @@ from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( int_or_none, + unified_strdate, + parse_duration, ) @@ -63,19 +65,28 @@ class AppleTrailersIE(InfoExtractor): 'uploader_id': 'wb', }, }, - ] + ], + 'expected_warnings': ['Unable to download JSON metadata: HTTP Error 404: Not Found'], }, { 'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/', 'info_dict': { 'id': 'blackthorn', }, 'playlist_mincount': 2, + 'expected_warnings': ['Unable to download JSON metadata: HTTP Error 404: Not Found'], }, { 'url': 'http://trailers.apple.com/ca/metropole/autrui/', - 'only_matching': True, + 'info_dict': { + 'id': 'autrui', + }, + 'playlist_mincount': 1, + 'expected_warnings': ['Unable to download JSON metadata: HTTP Error 404: Not Found'], }, { 'url': 'http://movietrailers.apple.com/trailers/focus_features/kuboandthetwostrings/', - 'only_matching': True, + 'info_dict': { + 'id': 'kuboandthetwostrings', + }, + 'playlist_mincount': 4, }] _JSON_RE = r'iTunes.playURL\((.*?)\);' @@ -85,71 +96,98 @@ class AppleTrailersIE(InfoExtractor): movie = mobj.group('movie') uploader_id = mobj.group('company') - playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc') - - def fix_html(s): - s = re.sub(r'(?s).*?', '', s) - s = re.sub(r'', r'', s) - # The ' in the onClick attributes are not escaped, it couldn't be parsed - # like: http://trailers.apple.com/trailers/wb/gravity/ - - def _clean_json(m): - return 'iTunes.playURL(%s);' % m.group(1).replace('\'', ''') - s = re.sub(self._JSON_RE, _clean_json, s) - s = '%s' % s - return s - doc = self._download_xml(playlist_url, movie, transform_source=fix_html) - playlist = [] - for li in doc.findall('./div/ul/li'): - on_click = li.find('.//a').attrib['onClick'] - trailer_info_json = self._search_regex(self._JSON_RE, - on_click, 'trailer info') - trailer_info = json.loads(trailer_info_json) - first_url = trailer_info.get('url') - if not first_url: - continue - title = trailer_info['title'] + + def create_playlist_item(source, uncommon={}): + title = source['title'] video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() - thumbnail = li.find('.//img').attrib['src'] - upload_date = trailer_info['posted'].replace('-', '') - runtime = trailer_info['runtime'] - m = re.search(r'(?P[0-9]+):(?P[0-9]{1,2})', runtime) - duration = None - if m: - duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) - - trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower() - settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id) - settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json') - - formats = [] - for format in settings['metadata']['sizes']: - # The src is a file pointing to the real video file - format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src']) - formats.append({ - 'url': format_url, - 'format': format['type'], - 'width': int_or_none(format['width']), - 'height': int_or_none(format['height']), - }) - - self._sort_formats(formats) - - playlist.append({ + item = { '_type': 'video', 'id': video_id, - 'formats': formats, 'title': title, - 'duration': duration, - 'thumbnail': thumbnail, - 'upload_date': upload_date, + 'duration': parse_duration(source['runtime']), + 'upload_date': unified_strdate(source['posted']), 'uploader_id': uploader_id, 'http_headers': { 'User-Agent': 'QuickTime compatible (youtube-dl)', }, - }) + } + item.update(uncommon) + return item + + def create_format_item(source, uncommon={}): + item = { + # The src is a file pointing to the real video file + 'url': re.sub(r'_(\d*p.mov)' , r'_h\1', source['src']), + 'width': int_or_none(source['width']), + 'height': int_or_none(source['height']), + } + item.update(uncommon) + return item + + # Test for presence of newer JSON format. + html = self._download_webpage(url, movie, fatal=False) + film_id = self._search_regex(r"^\s*var\s+FilmId\s*=\s*'(?P\d+)'", html, 'FilmId', fatal=False, flags=re.MULTILINE, group='film_id') + json_data = None + if film_id: + data_url = 'http://trailers.apple.com/trailers/feeds/data/%s.json' % film_id + json_data = self._download_json(data_url, movie, 'Downloading trailer JSON', fatal=False) + + if json_data: + for clip in json_data['clips']: + formats = [] + for kind, meta in clip['versions']['enus']['sizes'].items(): + formats.append(create_format_item(meta, { + 'format_id': kind, + })) + self._sort_formats(formats) + + playlist.append(create_playlist_item(clip, { + 'formats': formats, + 'thumbnail': clip['thumb'], + })) + else: + playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc') + + def fix_html(s): + s = re.sub(r'(?s).*?', '', s) + s = re.sub(r'', r'', s) + # The ' in the onClick attributes are not escaped, it couldn't be parsed + # like: http://trailers.apple.com/trailers/wb/gravity/ + + def _clean_json(m): + return 'iTunes.playURL(%s);' % m.group(1).replace('\'', ''') + s = re.sub(self._JSON_RE, _clean_json, s) + s = '%s' % s + return s + doc = self._download_xml(playlist_url, movie, transform_source=fix_html) + + for li in doc.findall('./div/ul/li'): + on_click = li.find('.//a').attrib['onClick'] + trailer_info_json = self._search_regex(self._JSON_RE, + on_click, 'trailer info') + trailer_info = json.loads(trailer_info_json) + first_url = trailer_info.get('url') + if not first_url: + continue + thumbnail = li.find('.//img').attrib['src'] + trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower() + settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id) + settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json') + + formats = [] + for format in settings['metadata']['sizes']: + formats.append(create_format_item(format, { + 'format': format['type'], + })) + + self._sort_formats(formats) + + playlist.append(create_playlist_item(trailer_info, { + 'formats': formats, + 'thumbnail': thumbnail, + })) return { '_type': 'playlist',