From 8e01f3ca811e15aae04c7f2c5345c5eca38f99d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 28 Oct 2017 22:58:01 +0700 Subject: [PATCH 01/17] [dctptv] Fix extraction (closes #14599) --- youtube_dl/extractor/dctp.py | 68 ++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py index 00fbbff2f..3a6d0560e 100644 --- a/youtube_dl/extractor/dctp.py +++ b/youtube_dl/extractor/dctp.py @@ -2,53 +2,85 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import unified_strdate +from ..compat import compat_str +from ..utils import ( + float_or_none, + unified_strdate, +) class DctpTvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dctp\.tv/(#/)?filme/(?P.+?)/$' + _VALID_URL = r'https?://(?:www\.)?dctp\.tv/(?:#/)?filme/(?P[^/?#&]+)' _TEST = { 'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/', - 'md5': '174dd4a8a6225cf5655952f969cfbe24', 'info_dict': { 'id': '95eaa4f33dad413aa17b4ee613cccc6c', 'display_id': 'videoinstallation-fuer-eine-kaufhausfassade', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Videoinstallation für eine Kaufhausfassade', 'description': 'Kurzfilm', 'upload_date': '20110407', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 71.24, + }, + 'params': { + # rtmp download + 'skip_download': True, }, } def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + display_id = self._match_id(url) - object_id = self._html_search_meta('DC.identifier', webpage) + webpage = self._download_webpage(url, display_id) - servers_json = self._download_json( - 'http://www.dctp.tv/elastic_streaming_client/get_streaming_server/', - video_id, note='Downloading server list') - server = servers_json[0]['server'] - m3u8_path = self._search_regex( - r'\'([^\'"]+/playlist\.m3u8)"', webpage, 'm3u8 path') - formats = self._extract_m3u8_formats( - 'http://%s%s' % (server, m3u8_path), video_id, ext='mp4', - entry_protocol='m3u8_native') + video_id = self._html_search_meta( + 'DC.identifier', webpage, 'video id', + default=None) or self._search_regex( + r'id=["\']uuid[^>]+>([^<]+)<', webpage, 'video id') title = self._og_search_title(webpage) + + servers = self._download_json( + 'http://www.dctp.tv/streaming_servers/', display_id, + note='Downloading server list', fatal=False) + + if servers: + endpoint = next( + server['endpoint'] + for server in servers + if isinstance(server.get('endpoint'), compat_str) and + 'cloudfront' in server['endpoint']) + else: + endpoint = 'rtmpe://s2pqqn4u96e4j8.cloudfront.net/cfx/st/' + + app = self._search_regex( + r'^rtmpe?://[^/]+/(?P.*)$', endpoint, 'app') + + formats = [{ + 'url': endpoint, + 'app': app, + 'play_path': 'mp4:%s_dctp_0500_4x3.m4v' % video_id, + 'page_url': url, + 'player_url': 'http://svm-prod-dctptv-static.s3.amazonaws.com/dctptv-relaunch2012-109.swf', + 'ext': 'flv', + }] + description = self._html_search_meta('DC.description', webpage) upload_date = unified_strdate( self._html_search_meta('DC.date.created', webpage)) thumbnail = self._og_search_thumbnail(webpage) + duration = float_or_none(self._search_regex( + r'id=["\']duration_in_ms[^+]>(\d+)', webpage, 'duration', + default=None), scale=1000) return { - 'id': object_id, + 'id': video_id, 'title': title, 'formats': formats, - 'display_id': video_id, + 'display_id': display_id, 'description': description, 'upload_date': upload_date, 'thumbnail': thumbnail, + 'duration': duration, } From 47a8587915668ef82632a7a75f8bc9862679623a Mon Sep 17 00:00:00 2001 From: Andrew Udvare Date: Sun, 12 Mar 2017 18:19:32 -0400 Subject: [PATCH 02/17] [younow] Add extractor --- youtube_dl/extractor/extractors.py | 5 + youtube_dl/extractor/younow.py | 197 +++++++++++++++++++++++++++++ 2 files changed, 202 insertions(+) create mode 100644 youtube_dl/extractor/younow.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 18350810b..b6ad50ec7 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1335,6 +1335,11 @@ from .youku import ( YoukuIE, YoukuShowIE, ) +from .younow import ( + YouNowIE, + YouNowChannelIE, + YouNowMomentIE, +) from .youporn import YouPornIE from .yourupload import YourUploadIE from .youtube import ( diff --git a/youtube_dl/extractor/younow.py b/youtube_dl/extractor/younow.py new file mode 100644 index 000000000..99abd66a8 --- /dev/null +++ b/youtube_dl/extractor/younow.py @@ -0,0 +1,197 @@ +# coding: utf-8 +from __future__ import unicode_literals +from datetime import date, datetime + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import int_or_none, UnsupportedError + +MOMENT_URL_FORMAT = 'https://cdn.younow.com/php/api/moment/fetch/id=%s' +STREAM_URL_FORMAT = 'https://hls.younow.com/momentsplaylists/live/%s/%s.m3u8' + + +class YouNowIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?younow\.com/(?P[^/]+)' + _TEST = { + 'url': 'https://www.younow.com/AmandaPadeezy', + 'info_dict': { + 'id': 'AmandaPadeezy', + 'ext': 'mp4', + 'is_live': True, + 'title': 'March 26, 2017', + 'description': 'YouNow is the best way to broadcast live and get an audience to watch you.', + 'thumbnail': 'https://ynassets.s3.amazonaws.com/broadcast/live/157869188/157869188.jpg', + 'tags': ['girls'], + 'categories': ['girls'], + 'uploader': 'AmandaPadeezy', + 'uploader_id': '6716501', + 'uploader_url': 'https://www.younow.com/AmandaPadeezy', + 'creator': 'AmandaPadeezy', + 'formats': [{ + 'url': 'https://cdn.younow.com/php/api/broadcast/videoPath/hls=1/broadcastId=157869188/channelId=6716501', + 'ext': 'mp4', + 'protocol': 'm3u8', + }], + } + } + + def _real_extract(self, url): + username = self._match_id(url) + data = self._download_json('https://api.younow.com/php/api/broadcast/info/curId=0/user=%s' % (username), username) + + if data.get('media'): + stream_url = 'https://cdn.younow.com/php/api/broadcast/videoPath/hls=1/broadcastId=%s/channelId=%s' % ( + data.get('broadcastId'), + data.get('userId'), + ) + else: + raise UnsupportedError('Unsupported stream or user is not streaming at this time') + + webpage = self._download_webpage(url, username) + try: + uploader = data['user']['profileUrlString'] + except KeyError: + uploader = username + try: + title = data['title'] + except KeyError: + title = date.today().strftime('%B %d, %Y') + + return { + 'id': uploader, + 'is_live': True, + 'title': title, + 'description': self._og_search_description(webpage), + 'thumbnail': data.get('awsUrl'), + 'tags': data.get('tags'), + 'categories': data.get('tags'), + 'uploader': uploader, + 'uploader_id': data.get('userId'), + 'uploader_url': 'https://www.younow.com/%s' % (data['user']['profileUrlString'],), + 'creator': uploader, + 'view_count': int_or_none(data.get('viewers')), + 'like_count': int_or_none(data.get('likes')), + 'formats': [{ + 'url': stream_url, + 'ext': 'mp4', + 'protocol': 'm3u8', + }], + } + + +def _moment_to_entry(item): + title = item.get('text') + title_type = item.get('titleType') + if not title: + if title_type: + title = 'YouNow %s' % item.get('titleType') + else: + title = 'YouNow moment' + + entry = { + 'id': compat_str(item['momentId']), + 'title': title, + 'view_count': int_or_none(item.get('views')), + 'like_count': int_or_none(item.get('likes')), + 'timestamp': int_or_none(item.get('created')), + 'formats': [{ + 'url': STREAM_URL_FORMAT % (item['momentId'], item['momentId']), + 'ext': 'mp4', + 'protocol': 'm3u8', + }], + } + + try: + entry['uploader'] = entry['creator'] = item['owner']['name'] + entry['uploader_url'] = 'https://www.younow.com/%s' % (item['owner']['name'],) + entry['uploader_id'] = item['owner']['userId'] + except KeyError: + pass + + return entry + + +class YouNowChannelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?younow\.com/(?P[^/]+)/channel' + _TEST = { + 'url': 'https://www.younow.com/Kate_Swiz/channel', + 'info_dict': { + 'title': 'Kate_Swiz moments' + }, + 'playlist_count': 6, + } + + MOMENTS_URL_FORMAT = 'https://cdn.younow.com/php/api/moment/profile/channelId=%s/createdBefore=%d/records=20' + + def _real_extract(self, url): + entries = [] + username = self._match_id(url) + user_info = self._download_json('https://api.younow.com/php/api/broadcast/info/curId=0/user=%s' % (username), username, note='Downloading user information') + channel_id = user_info['userId'] + created_before = 0 + moment_ids = [] + moment_ids_processed = [] + err = False + + while True: + if created_before: + cb = datetime.fromtimestamp(created_before) + else: + cb = datetime.now() + info = self._download_json(self.MOMENTS_URL_FORMAT % (channel_id, created_before), username, note='Downloading moments data (created before %s)' % (cb)) + + for item in info['items']: + if item['type'] == 'moment': + entry = _moment_to_entry(item) + moment_ids_processed.append(entry['id']) + entries.append(entry) + elif item['type'] == 'collection': + moment_ids += [compat_str(x) for x in item['momentsIds']] + + try: + created_before = int_or_none(item['created']) + except KeyError: + err = True + break + + if (err or + not info['hasMore'] or + 'items' not in info or + not info['items']): + break + + for mid in set(moment_ids): + if mid in moment_ids_processed: + continue + item = self._download_json(MOMENT_URL_FORMAT % (mid), mid) + entries.append(_moment_to_entry(item['item'])) + + return self.playlist_result(entries, playlist_title='%s moments' % (username)) + + +class YouNowMomentIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?younow\.com/[^/]+/(?P[^/]+)/[^/]+' + _TEST = { + 'url': 'https://www.younow.com/GABO.../20712117/36319236/3b316doc/m', + 'info_dict': { + 'id': '20712117', + 'ext': 'mp4', + 'title': 'YouNow capture', + 'view_count': 19, + 'like_count': 0, + 'timestamp': 1490432040, + 'formats': [{ + 'url': 'https://hls.younow.com/momentsplaylists/live/20712117/20712117.m3u8', + 'ext': 'mp4', + 'protocol': 'm3u8', + }], + 'upload_date': '20170325', + 'uploader': 'GABO...', + 'uploader_id': 35917228, + }, + } + + def _real_extract(self, url): + mid = self._match_id(url) + item = self._download_json(MOMENT_URL_FORMAT % (mid), mid) + return _moment_to_entry(item['item']) From eb4b5818e2a297bd001eb1b4962d709b1245fd24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Oct 2017 04:16:07 +0700 Subject: [PATCH 03/17] [younow] Fix issues and improve extraction (closes #9255, closes #9432, closes #12436) --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/younow.py | 221 +++++++++++++++-------------- 2 files changed, 114 insertions(+), 109 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b6ad50ec7..2eed706f9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1336,7 +1336,7 @@ from .youku import ( YoukuShowIE, ) from .younow import ( - YouNowIE, + YouNowLiveIE, YouNowChannelIE, YouNowMomentIE, ) diff --git a/youtube_dl/extractor/younow.py b/youtube_dl/extractor/younow.py index 99abd66a8..04dbc87fc 100644 --- a/youtube_dl/extractor/younow.py +++ b/youtube_dl/extractor/younow.py @@ -1,17 +1,22 @@ # coding: utf-8 from __future__ import unicode_literals -from datetime import date, datetime + +import itertools from .common import InfoExtractor from ..compat import compat_str -from ..utils import int_or_none, UnsupportedError +from ..utils import ( + ExtractorError, + int_or_none, + try_get, +) -MOMENT_URL_FORMAT = 'https://cdn.younow.com/php/api/moment/fetch/id=%s' -STREAM_URL_FORMAT = 'https://hls.younow.com/momentsplaylists/live/%s/%s.m3u8' +CDN_API_BASE = 'https://cdn.younow.com/php/api' +MOMENT_URL_FORMAT = '%s/moment/fetch/id=%%s' % CDN_API_BASE -class YouNowIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?younow\.com/(?P[^/]+)' +class YouNowLiveIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?younow\.com/(?P[^/?#&]+)' _TEST = { 'url': 'https://www.younow.com/AmandaPadeezy', 'info_dict': { @@ -19,179 +24,179 @@ class YouNowIE(InfoExtractor): 'ext': 'mp4', 'is_live': True, 'title': 'March 26, 2017', - 'description': 'YouNow is the best way to broadcast live and get an audience to watch you.', - 'thumbnail': 'https://ynassets.s3.amazonaws.com/broadcast/live/157869188/157869188.jpg', + 'thumbnail': r're:^https?://.*\.jpg$', 'tags': ['girls'], 'categories': ['girls'], 'uploader': 'AmandaPadeezy', 'uploader_id': '6716501', 'uploader_url': 'https://www.younow.com/AmandaPadeezy', 'creator': 'AmandaPadeezy', - 'formats': [{ - 'url': 'https://cdn.younow.com/php/api/broadcast/videoPath/hls=1/broadcastId=157869188/channelId=6716501', - 'ext': 'mp4', - 'protocol': 'm3u8', - }], - } + }, + 'skip': True, } + @classmethod + def suitable(cls, url): + return (False + if YouNowChannelIE.suitable(url) or YouNowMomentIE.suitable(url) + else super(YouNowLiveIE, cls).suitable(url)) + def _real_extract(self, url): username = self._match_id(url) - data = self._download_json('https://api.younow.com/php/api/broadcast/info/curId=0/user=%s' % (username), username) - if data.get('media'): - stream_url = 'https://cdn.younow.com/php/api/broadcast/videoPath/hls=1/broadcastId=%s/channelId=%s' % ( - data.get('broadcastId'), - data.get('userId'), - ) - else: - raise UnsupportedError('Unsupported stream or user is not streaming at this time') + data = self._download_json( + 'https://api.younow.com/php/api/broadcast/info/curId=0/user=%s' + % username, username) - webpage = self._download_webpage(url, username) - try: - uploader = data['user']['profileUrlString'] - except KeyError: - uploader = username - try: - title = data['title'] - except KeyError: - title = date.today().strftime('%B %d, %Y') + if data.get('errorCode') != 0: + raise ExtractorError(data['errorMsg'], expected=True) + + uploader = try_get( + data, lambda x: x['user']['profileUrlString'], + compat_str) or username return { 'id': uploader, 'is_live': True, - 'title': title, - 'description': self._og_search_description(webpage), + 'title': self._live_title(uploader), 'thumbnail': data.get('awsUrl'), 'tags': data.get('tags'), 'categories': data.get('tags'), 'uploader': uploader, 'uploader_id': data.get('userId'), - 'uploader_url': 'https://www.younow.com/%s' % (data['user']['profileUrlString'],), + 'uploader_url': 'https://www.younow.com/%s' % username, 'creator': uploader, 'view_count': int_or_none(data.get('viewers')), 'like_count': int_or_none(data.get('likes')), 'formats': [{ - 'url': stream_url, + 'url': '%s/broadcast/videoPath/hls=1/broadcastId=%s/channelId=%s' + % (CDN_API_BASE, data['broadcastId'], data['userId']), 'ext': 'mp4', 'protocol': 'm3u8', }], } -def _moment_to_entry(item): +def _extract_moment(item, fatal=True): + moment_id = item.get('momentId') + if not moment_id: + if not fatal: + return + raise ExtractorError('Unable to extract moment id') + + moment_id = compat_str(moment_id) + title = item.get('text') - title_type = item.get('titleType') if not title: - if title_type: - title = 'YouNow %s' % item.get('titleType') - else: - title = 'YouNow moment' + title = 'YouNow %s' % ( + item.get('momentType') or item.get('titleType') or 'moment') + + uploader = try_get(item, lambda x: x['owner']['name'], compat_str) + uploader_id = try_get(item, lambda x: x['owner']['userId']) + uploader_url = 'https://www.younow.com/%s' % uploader if uploader else None entry = { - 'id': compat_str(item['momentId']), + 'extractor_key': 'YouNowMoment', + 'id': moment_id, 'title': title, 'view_count': int_or_none(item.get('views')), 'like_count': int_or_none(item.get('likes')), 'timestamp': int_or_none(item.get('created')), + 'creator': uploader, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, 'formats': [{ - 'url': STREAM_URL_FORMAT % (item['momentId'], item['momentId']), + 'url': 'https://hls.younow.com/momentsplaylists/live/%s/%s.m3u8' + % (moment_id, moment_id), 'ext': 'mp4', - 'protocol': 'm3u8', + 'protocol': 'm3u8_native', }], } - try: - entry['uploader'] = entry['creator'] = item['owner']['name'] - entry['uploader_url'] = 'https://www.younow.com/%s' % (item['owner']['name'],) - entry['uploader_id'] = item['owner']['userId'] - except KeyError: - pass - return entry class YouNowChannelIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?younow\.com/(?P[^/]+)/channel' _TEST = { - 'url': 'https://www.younow.com/Kate_Swiz/channel', + 'url': 'https://www.younow.com/its_Kateee_/channel', 'info_dict': { - 'title': 'Kate_Swiz moments' + 'id': '14629760', + 'title': 'its_Kateee_ moments' }, - 'playlist_count': 6, + 'playlist_mincount': 8, } - MOMENTS_URL_FORMAT = 'https://cdn.younow.com/php/api/moment/profile/channelId=%s/createdBefore=%d/records=20' + def _entries(self, username, channel_id): + created_before = 0 + for page_num in itertools.count(1): + if created_before is None: + break + info = self._download_json( + '%s/moment/profile/channelId=%s/createdBefore=%d/records=20' + % (CDN_API_BASE, channel_id, created_before), username, + note='Downloading moments page %d' % page_num) + items = info.get('items') + if not items or not isinstance(items, list): + break + for item in items: + if not isinstance(item, dict): + continue + item_type = item.get('type') + if item_type == 'moment': + entry = _extract_moment(item, fatal=False) + if entry: + yield entry + elif item_type == 'collection': + moments = item.get('momentsIds') + if isinstance(moments, list): + for moment_id in moments: + m = self._download_json( + MOMENT_URL_FORMAT % moment_id, username, + note='Downloading %s moment JSON' % moment_id, + fatal=False) + if m and isinstance(m, dict) and m.get('item'): + entry = _extract_moment(m['item']) + if entry: + yield entry + created_before = int_or_none(item.get('created')) def _real_extract(self, url): - entries = [] username = self._match_id(url) - user_info = self._download_json('https://api.younow.com/php/api/broadcast/info/curId=0/user=%s' % (username), username, note='Downloading user information') - channel_id = user_info['userId'] - created_before = 0 - moment_ids = [] - moment_ids_processed = [] - err = False - - while True: - if created_before: - cb = datetime.fromtimestamp(created_before) - else: - cb = datetime.now() - info = self._download_json(self.MOMENTS_URL_FORMAT % (channel_id, created_before), username, note='Downloading moments data (created before %s)' % (cb)) - - for item in info['items']: - if item['type'] == 'moment': - entry = _moment_to_entry(item) - moment_ids_processed.append(entry['id']) - entries.append(entry) - elif item['type'] == 'collection': - moment_ids += [compat_str(x) for x in item['momentsIds']] - - try: - created_before = int_or_none(item['created']) - except KeyError: - err = True - break - - if (err or - not info['hasMore'] or - 'items' not in info or - not info['items']): - break - - for mid in set(moment_ids): - if mid in moment_ids_processed: - continue - item = self._download_json(MOMENT_URL_FORMAT % (mid), mid) - entries.append(_moment_to_entry(item['item'])) - - return self.playlist_result(entries, playlist_title='%s moments' % (username)) + channel_id = compat_str(self._download_json( + 'https://api.younow.com/php/api/broadcast/info/curId=0/user=%s' + % username, username, note='Downloading user information')['userId']) + return self.playlist_result( + self._entries(username, channel_id), channel_id, + '%s moments' % username) class YouNowMomentIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?younow\.com/[^/]+/(?P[^/]+)/[^/]+' + _VALID_URL = r'https?://(?:www\.)?younow\.com/[^/]+/(?P[^/?#&]+)' _TEST = { 'url': 'https://www.younow.com/GABO.../20712117/36319236/3b316doc/m', + 'md5': 'a30c70eadb9fb39a1aa3c8c0d22a0807', 'info_dict': { 'id': '20712117', 'ext': 'mp4', 'title': 'YouNow capture', - 'view_count': 19, - 'like_count': 0, + 'view_count': int, + 'like_count': int, 'timestamp': 1490432040, - 'formats': [{ - 'url': 'https://hls.younow.com/momentsplaylists/live/20712117/20712117.m3u8', - 'ext': 'mp4', - 'protocol': 'm3u8', - }], 'upload_date': '20170325', 'uploader': 'GABO...', 'uploader_id': 35917228, }, } + @classmethod + def suitable(cls, url): + return (False + if YouNowChannelIE.suitable(url) + else super(YouNowMomentIE, cls).suitable(url)) + def _real_extract(self, url): - mid = self._match_id(url) - item = self._download_json(MOMENT_URL_FORMAT % (mid), mid) - return _moment_to_entry(item['item']) + video_id = self._match_id(url) + item = self._download_json(MOMENT_URL_FORMAT % video_id, video_id) + return _extract_moment(item['item']) From c3206d02e94ce98c6467762a228a9e58616c6d8f Mon Sep 17 00:00:00 2001 From: enigmaquip Date: Sat, 28 Oct 2017 16:20:18 -0600 Subject: [PATCH 04/17] [fxnetworks] Extract series metadata --- youtube_dl/extractor/fxnetworks.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/fxnetworks.py b/youtube_dl/extractor/fxnetworks.py index 629897317..37549fb01 100644 --- a/youtube_dl/extractor/fxnetworks.py +++ b/youtube_dl/extractor/fxnetworks.py @@ -3,27 +3,31 @@ from __future__ import unicode_literals from .adobepass import AdobePassIE from ..utils import ( - update_url_query, extract_attributes, + int_or_none, parse_age_limit, smuggle_url, + update_url_query, ) class FXNetworksIE(AdobePassIE): _VALID_URL = r'https?://(?:www\.)?(?:fxnetworks|simpsonsworld)\.com/video/(?P\d+)' _TESTS = [{ - 'url': 'http://www.fxnetworks.com/video/719841347694', - 'md5': '1447d4722e42ebca19e5232ab93abb22', + 'url': 'http://www.fxnetworks.com/video/1032565827847', + 'md5': '8d99b97b4aa7a202f55b6ed47ea7e703', 'info_dict': { - 'id': '719841347694', + 'id': 'dRzwHC_MMqIv', 'ext': 'mp4', - 'title': 'Vanpage', - 'description': 'F*ck settling down. You\'re the Worst returns for an all new season August 31st on FXX.', + 'title': 'First Look: Better Things - Season 2', + 'description': 'Because real life is like a fart. Watch this FIRST LOOK to see what inspired the new season of Better Things.', 'age_limit': 14, 'uploader': 'NEWA-FNG-FX', - 'upload_date': '20160706', - 'timestamp': 1467844741, + 'upload_date': '20170825', + 'timestamp': 1503686274, + 'episode_number': 0, + 'season_number': 2, + 'series': 'Better Things', }, 'add_ie': ['ThePlatform'], }, { @@ -64,6 +68,9 @@ class FXNetworksIE(AdobePassIE): 'id': video_id, 'title': title, 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), + 'series': video_data.get('data-show-title'), + 'episode_number': int_or_none(video_data.get('data-episode')), + 'season_number': int_or_none(video_data.get('data-season')), 'thumbnail': video_data.get('data-large-thumb'), 'age_limit': parse_age_limit(rating), 'ie_key': 'ThePlatform', From 056653bbb1b94ba04f331ed4c27a1c0d24fe1e30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Oct 2017 07:04:48 +0700 Subject: [PATCH 05/17] [utils] Add support for zero years and months in parse_duration --- test/test_utils.py | 1 + youtube_dl/utils.py | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index efa73d0f4..cc13f795c 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -540,6 +540,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_duration('87 Min.'), 5220) self.assertEqual(parse_duration('PT1H0.040S'), 3600.04) self.assertEqual(parse_duration('PT00H03M30SZ'), 210) + self.assertEqual(parse_duration('P0Y0M0DT0H4M20.880S'), 260.88) def test_fix_xml_ampersands(self): self.assertEqual( diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 59fb33435..34866a54b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1835,10 +1835,20 @@ def parse_duration(s): days, hours, mins, secs, ms = m.groups() else: m = re.match( - r'''(?ix)(?:P?T)? + r'''(?ix)(?:P? + (?: + [0-9]+\s*y(?:ears?)?\s* + )? + (?: + [0-9]+\s*m(?:onths?)?\s* + )? + (?: + [0-9]+\s*w(?:eeks?)?\s* + )? (?: (?P[0-9]+)\s*d(?:ays?)?\s* )? + T)? (?: (?P[0-9]+)\s*h(?:ours?)?\s* )? From 9211e3319e6006373d8b5055f7a3d0bbd734c57b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Oct 2017 07:05:55 +0700 Subject: [PATCH 06/17] [extractor/common] Prefix format id for audio only HLS formats --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a69240693..52f2055b5 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1401,7 +1401,7 @@ class InfoExtractor(object): media_url = media.get('URI') if media_url: format_id = [] - for v in (group_id, name): + for v in (m3u8_id, group_id, name): if v: format_id.append(v) f = { From 514e8aefd488b385122fa989e937e5f9ae62d136 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Oct 2017 07:11:37 +0700 Subject: [PATCH 07/17] [egghead] Fix extraction (closes #14388) --- youtube_dl/extractor/egghead.py | 81 +++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py index e4a3046af..edabaafe6 100644 --- a/youtube_dl/extractor/egghead.py +++ b/youtube_dl/extractor/egghead.py @@ -2,7 +2,9 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( + determine_ext, int_or_none, try_get, unified_timestamp, @@ -17,7 +19,7 @@ class EggheadCourseIE(InfoExtractor): 'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript', 'playlist_count': 29, 'info_dict': { - 'id': 'professor-frisby-introduces-composable-functional-javascript', + 'id': '72', 'title': 'Professor Frisby Introduces Composable Functional JavaScript', 'description': 're:(?s)^This course teaches the ubiquitous.*You\'ll start composing functionality before you know it.$', }, @@ -26,14 +28,28 @@ class EggheadCourseIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - course = self._download_json( - 'https://egghead.io/api/v1/series/%s' % playlist_id, playlist_id) + lessons = self._download_json( + 'https://egghead.io/api/v1/series/%s/lessons' % playlist_id, + playlist_id, 'Downloading course lessons JSON') - entries = [ - self.url_result( - 'wistia:%s' % lesson['wistia_id'], ie='Wistia', - video_id=lesson['wistia_id'], video_title=lesson.get('title')) - for lesson in course['lessons'] if lesson.get('wistia_id')] + entries = [] + for lesson in lessons: + lesson_url = lesson.get('http_url') + if not lesson_url or not isinstance(lesson_url, compat_str): + continue + lesson_id = lesson.get('id') + if lesson_id: + lesson_id = compat_str(lesson_id) + entries.append(self.url_result( + lesson_url, ie=EggheadLessonIE.ie_key(), video_id=lesson_id)) + + course = self._download_json( + 'https://egghead.io/api/v1/series/%s' % playlist_id, + playlist_id, 'Downloading course JSON', fatal=False) or {} + + playlist_id = course.get('id') + if playlist_id: + playlist_id = compat_str(playlist_id) return self.playlist_result( entries, playlist_id, course.get('title'), @@ -43,11 +59,12 @@ class EggheadCourseIE(InfoExtractor): class EggheadLessonIE(InfoExtractor): IE_DESC = 'egghead.io lesson' IE_NAME = 'egghead:lesson' - _VALID_URL = r'https://egghead\.io/lessons/(?P[^/?#&]+)' - _TEST = { + _VALID_URL = r'https://egghead\.io/(?:api/v1/)?lessons/(?P[^/?#&]+)' + _TESTS = [{ 'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box', 'info_dict': { - 'id': 'fv5yotjxcg', + 'id': '1196', + 'display_id': 'javascript-linear-data-flow-with-container-style-types-box', 'ext': 'mp4', 'title': 'Create linear data flow with container style types (Box)', 'description': 'md5:9aa2cdb6f9878ed4c39ec09e85a8150e', @@ -60,25 +77,51 @@ class EggheadLessonIE(InfoExtractor): }, 'params': { 'skip_download': True, + 'format': 'bestvideo', }, - } + }, { + 'url': 'https://egghead.io/api/v1/lessons/react-add-redux-to-a-react-application', + 'only_matching': True, + }] def _real_extract(self, url): - lesson_id = self._match_id(url) + display_id = self._match_id(url) lesson = self._download_json( - 'https://egghead.io/api/v1/lessons/%s' % lesson_id, lesson_id) + 'https://egghead.io/api/v1/lessons/%s' % display_id, display_id) + + lesson_id = compat_str(lesson['id']) + title = lesson['title'] + + formats = [] + for _, format_url in lesson['media_urls'].items(): + if not format_url or not isinstance(format_url, compat_str): + continue + ext = determine_ext(format_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, lesson_id, 'mp4', entry_protocol='m3u8', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, lesson_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': format_url, + }) + self._sort_formats(formats) return { - '_type': 'url_transparent', - 'ie_key': 'Wistia', - 'url': 'wistia:%s' % lesson['wistia_id'], - 'id': lesson['wistia_id'], - 'title': lesson.get('title'), + 'id': lesson_id, + 'display_id': display_id, + 'title': title, 'description': lesson.get('summary'), 'thumbnail': lesson.get('thumb_nail'), 'timestamp': unified_timestamp(lesson.get('published_at')), 'duration': int_or_none(lesson.get('duration')), 'view_count': int_or_none(lesson.get('plays_count')), 'tags': try_get(lesson, lambda x: x['tag_list'], list), + 'series': try_get( + lesson, lambda x: x['series']['title'], compat_str), + 'formats': formats, } From 518d357b46cb840224d69cc543667be3a87b9b9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Oct 2017 07:21:33 +0700 Subject: [PATCH 08/17] [ChangeLog] Actualize --- ChangeLog | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/ChangeLog b/ChangeLog index 547b55981..795491d34 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,28 @@ +version + +Core +* [extractor/common] Prefix format id for audio only HLS formats ++ [utils] Add support for zero years and months in parse_duration + +Extractors +* [egghead] Fix extraction (#14388) ++ [fxnetworks] Extract series metadata (#14603) ++ [younow] Add support for younow.com (#9255, #9432, #12436) +* [dctptv] Fix extraction (#14599) +* [youtube] Restrict embed regex (#14600) +* [vimeo] Restrict iframe embed regex (#14600) +* [soundgasm] Improve extraction (#14588) +- [myvideo] Remove extractor (#8557) ++ [nbc] Add support for classic-tv videos (#14575) ++ [vrtnu] Add support for cookies authentication and simplify (#11873) ++ [canvas] Add support for vrt.be/vrtnu (#11873) +* [twitch:clips] Fix title extraction (#14566) ++ [ndtv] Add support for sub-sites (#14534) +* [dramafever] Fix login error message extraction ++ [nick] Add support for more nickelodeon sites (no, dk, se, ch, fr, es, pt, + ro, hu) (#14553) + + version 2017.10.20 Core From 6d0630d8801fd3278a05fa7e55a73bd454403e5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Oct 2017 07:22:53 +0700 Subject: [PATCH 09/17] release 2017.10.29 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 5 ++++- youtube_dl/version.py | 2 +- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 81fe10d54..881475878 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.10.20*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.10.20** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.10.29*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.10.29** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.10.20 +[debug] youtube-dl version 2017.10.29 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 795491d34..d33a710fb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2017.10.29 Core * [extractor/common] Prefix format id for audio only HLS formats diff --git a/docs/supportedsites.md b/docs/supportedsites.md index be5de22df..7b8e7403a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -498,7 +498,6 @@ - **MySpace:album** - **MySpass** - **Myvi** - - **myvideo** (Currently broken) - **MyVidster** - **n-tv.de** - **natgeo** @@ -977,6 +976,7 @@ - **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **Vrak** - **VRT**: deredactie.be, sporza.be, cobra.be and cobra.canvas.be + - **VrtNU**: VrtNU.be - **vrv** - **vrv:series** - **VShare** @@ -1035,6 +1035,9 @@ - **YouJizz** - **youku**: 优酷 - **youku:show** + - **YouNowChannel** + - **YouNowLive** + - **YouNowMoment** - **YouPorn** - **YourUpload** - **youtube**: YouTube.com diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 4d1686670..43f080bc3 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.10.20' +__version__ = '2017.10.29' From 8fe767e07261abb8013b18ca2ed31ebb8d95c7d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 31 Oct 2017 23:05:25 +0700 Subject: [PATCH 10/17] [spankbang] Detect unavailable videos (closes #14644) --- youtube_dl/extractor/spankbang.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index 3394c7e6b..2863e53b5 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ExtractorError class SpankBangIE(InfoExtractor): @@ -33,6 +34,10 @@ class SpankBangIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + if re.search(r'<[^>]+\bid=["\']video_removed', webpage): + raise ExtractorError( + 'Video %s is not available' % video_id, expected=True) + stream_key = self._html_search_regex( r'''var\s+stream_key\s*=\s*['"](.+?)['"]''', webpage, 'stream key') From 044eeb145556cb41485b6b6644f40b2161a4e0f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 1 Nov 2017 23:39:26 +0700 Subject: [PATCH 11/17] [extractor/common] Respect URL query in _extract_wowza_formats (closes #14645) --- youtube_dl/extractor/common.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 52f2055b5..a67ac4411 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2233,27 +2233,35 @@ class InfoExtractor(object): return formats def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): + query = compat_urlparse.urlparse(url).query url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) url_base = self._search_regex( r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url') http_base_url = '%s:%s' % ('http', url_base) formats = [] + + def manifest_url(manifest): + m_url = '%s/%s' % (http_base_url, manifest) + if query: + m_url += '?%s' % query + return m_url + if 'm3u8' not in skip_protocols: formats.extend(self._extract_m3u8_formats( - http_base_url + '/playlist.m3u8', video_id, 'mp4', + manifest_url('playlist.m3u8'), video_id, 'mp4', m3u8_entry_protocol, m3u8_id='hls', fatal=False)) if 'f4m' not in skip_protocols: formats.extend(self._extract_f4m_formats( - http_base_url + '/manifest.f4m', + manifest_url('manifest.f4m'), video_id, f4m_id='hds', fatal=False)) if 'dash' not in skip_protocols: formats.extend(self._extract_mpd_formats( - http_base_url + '/manifest.mpd', + manifest_url('manifest.mpd'), video_id, mpd_id='dash', fatal=False)) if re.search(r'(?:/smil:|\.smil)', url_base): if 'smil' not in skip_protocols: rtmp_formats = self._extract_smil_formats( - http_base_url + '/jwplayer.smil', + manifest_url('jwplayer.smil'), video_id, fatal=False) for rtmp_format in rtmp_formats: rtsp_format = rtmp_format.copy() From b0f4331002798522621aff55deaf18406d17e081 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 2 Nov 2017 13:30:01 +0100 Subject: [PATCH 12/17] [gamespot] extract formats referenced with new data fields(#14652) --- youtube_dl/extractor/gamespot.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 02804d297..6d177cbaf 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -14,7 +14,7 @@ from ..utils import ( class GameSpotIE(OnceIE): - _VALID_URL = r'https?://(?:www\.)?gamespot\.com/.*-(?P\d+)/?' + _VALID_URL = r'https?://(?:www\.)?gamespot\.com/videos/(?:[^/]+/\d+-|embed/)(?P\d+)' _TESTS = [{ 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/', 'md5': 'b2a30deaa8654fcccd43713a6b6a4825', @@ -35,6 +35,9 @@ class GameSpotIE(OnceIE): 'params': { 'skip_download': True, # m3u8 downloads }, + }, { + 'url': 'https://www.gamespot.com/videos/embed/6439218/', + 'only_matching': True, }] def _real_extract(self, url): @@ -52,7 +55,7 @@ class GameSpotIE(OnceIE): manifest_url = f4m_url formats.extend(self._extract_f4m_formats( f4m_url + '?hdcore=3.7.0', page_id, f4m_id='hds', fatal=False)) - m3u8_url = streams.get('m3u8_stream') + m3u8_url = dict_get(streams, ('m3u8_stream', 'adaptive_stream')) if m3u8_url: manifest_url = m3u8_url m3u8_formats = self._extract_m3u8_formats( @@ -60,7 +63,7 @@ class GameSpotIE(OnceIE): m3u8_id='hls', fatal=False) formats.extend(m3u8_formats) progressive_url = dict_get( - streams, ('progressive_hd', 'progressive_high', 'progressive_low')) + streams, ('progressive_hd', 'progressive_high', 'progressive_low', 'other_lr')) if progressive_url and manifest_url: qualities_basename = self._search_regex( r'/([^/]+)\.csmil/', From 44cca168cc36a71ea77d8635a44f6ee9d2c33a99 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 2 Nov 2017 14:16:15 +0100 Subject: [PATCH 13/17] [skysport] add support ooyala embed_token protected videos(fixes #14641) --- youtube_dl/extractor/skysports.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/skysports.py b/youtube_dl/extractor/skysports.py index 4ca9f6b3c..efcbb36a9 100644 --- a/youtube_dl/extractor/skysports.py +++ b/youtube_dl/extractor/skysports.py @@ -2,7 +2,12 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import strip_or_none +from ..utils import ( + extract_attributes, + smuggle_url, + strip_or_none, + urljoin, +) class SkySportsIE(InfoExtractor): @@ -22,12 +27,22 @@ class SkySportsIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + video_data = extract_attributes(self._search_regex( + r'(]+>)', webpage, 'video data')) + + video_url = 'ooyala:%s' % video_data['data-video-id'] + if video_data.get('data-token-required') == 'true': + token_fetch_options = self._parse_json(video_data.get('data-token-fetch-options', '{}'), video_id, fatal=False) or {} + token_fetch_url = token_fetch_options.get('url') + if token_fetch_url: + embed_token = self._download_webpage(urljoin(url, token_fetch_url), video_id, fatal=False) + if embed_token: + video_url = smuggle_url(video_url, {'embed_token': embed_token.strip('"')}) return { '_type': 'url_transparent', 'id': video_id, - 'url': 'ooyala:%s' % self._search_regex( - r'data-video-id="([^"]+)"', webpage, 'ooyala id'), + 'url': video_url, 'title': self._og_search_title(webpage), 'description': strip_or_none(self._og_search_description(webpage)), 'ie_key': 'Ooyala', From cd670befc4c823a38a88fffbaa6c493e539dd79d Mon Sep 17 00:00:00 2001 From: Jimbolino Date: Thu, 2 Nov 2017 17:48:43 +0100 Subject: [PATCH 14/17] [22tracks] Remove extractor (closes #11024) --- youtube_dl/extractor/extractors.py | 4 -- youtube_dl/extractor/twentytwotracks.py | 86 ------------------------- 2 files changed, 90 deletions(-) delete mode 100644 youtube_dl/extractor/twentytwotracks.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2eed706f9..92f7e9027 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1110,10 +1110,6 @@ from .tvplayer import TVPlayerIE from .tweakers import TweakersIE from .twentyfourvideo import TwentyFourVideoIE from .twentymin import TwentyMinutenIE -from .twentytwotracks import ( - TwentyTwoTracksIE, - TwentyTwoTracksGenreIE -) from .twitch import ( TwitchVideoIE, TwitchChapterIE, diff --git a/youtube_dl/extractor/twentytwotracks.py b/youtube_dl/extractor/twentytwotracks.py deleted file mode 100644 index d6c0ab184..000000000 --- a/youtube_dl/extractor/twentytwotracks.py +++ /dev/null @@ -1,86 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import int_or_none - -# 22Tracks regularly replace the audio tracks that can be streamed on their -# site. The tracks usually expire after 1 months, so we can't add tests. - - -class TwentyTwoTracksIE(InfoExtractor): - _VALID_URL = r'https?://22tracks\.com/(?P[a-z]+)/(?P[\da-z]+)/(?P\d+)' - IE_NAME = '22tracks:track' - - _API_BASE = 'http://22tracks.com/api' - - def _extract_info(self, city, genre_name, track_id=None): - item_id = track_id if track_id else genre_name - - cities = self._download_json( - '%s/cities' % self._API_BASE, item_id, - 'Downloading cities info', - 'Unable to download cities info') - city_id = [x['id'] for x in cities if x['slug'] == city][0] - - genres = self._download_json( - '%s/genres/%s' % (self._API_BASE, city_id), item_id, - 'Downloading %s genres info' % city, - 'Unable to download %s genres info' % city) - genre = [x for x in genres if x['slug'] == genre_name][0] - genre_id = genre['id'] - - tracks = self._download_json( - '%s/tracks/%s' % (self._API_BASE, genre_id), item_id, - 'Downloading %s genre tracks info' % genre_name, - 'Unable to download track info') - - return [x for x in tracks if x['id'] == item_id][0] if track_id else [genre['title'], tracks] - - def _get_track_url(self, filename, track_id): - token = self._download_json( - 'http://22tracks.com/token.php?desktop=true&u=/128/%s' % filename, - track_id, 'Downloading token', 'Unable to download token') - return 'http://audio.22tracks.com%s?st=%s&e=%d' % (token['filename'], token['st'], token['e']) - - def _extract_track_info(self, track_info, track_id): - download_url = self._get_track_url(track_info['filename'], track_id) - title = '%s - %s' % (track_info['artist'].strip(), track_info['title'].strip()) - return { - 'id': track_id, - 'url': download_url, - 'ext': 'mp3', - 'title': title, - 'duration': int_or_none(track_info.get('duration')), - 'timestamp': int_or_none(track_info.get('published_at') or track_info.get('created')) - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - city = mobj.group('city') - genre = mobj.group('genre') - track_id = mobj.group('id') - - track_info = self._extract_info(city, genre, track_id) - return self._extract_track_info(track_info, track_id) - - -class TwentyTwoTracksGenreIE(TwentyTwoTracksIE): - _VALID_URL = r'https?://22tracks\.com/(?P[a-z]+)/(?P[\da-z]+)/?$' - IE_NAME = '22tracks:genre' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - city = mobj.group('city') - genre = mobj.group('genre') - - genre_title, tracks = self._extract_info(city, genre) - - entries = [ - self._extract_track_info(track_info, track_info['id']) - for track_info in tracks] - - return self.playlist_result(entries, genre, genre_title) From 48107c198bd76e611e3d4c2486cdc5403829a05a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 4 Nov 2017 22:10:55 +0700 Subject: [PATCH 15/17] [f4m] Prefer baseURL for relative URLs (closes #14660) --- youtube_dl/downloader/f4m.py | 25 +++++++++++++++++-------- youtube_dl/extractor/common.py | 14 +++++++------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index c8fde9a89..fdb80f42a 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -243,8 +243,17 @@ def remove_encrypted_media(media): media)) -def _add_ns(prop): - return '{http://ns.adobe.com/f4m/1.0}%s' % prop +def _add_ns(prop, ver=1): + return '{http://ns.adobe.com/f4m/%d.0}%s' % (ver, prop) + + +def get_base_url(manifest): + base_url = xpath_text( + manifest, [_add_ns('baseURL'), _add_ns('baseURL', 2)], + 'base URL', default=None) + if base_url: + base_url = base_url.strip() + return base_url class F4mFD(FragmentFD): @@ -330,13 +339,13 @@ class F4mFD(FragmentFD): rate, media = list(filter( lambda f: int(f[0]) == requested_bitrate, formats))[0] - base_url = compat_urlparse.urljoin(man_url, media.attrib['url']) + # Prefer baseURL for relative URLs as per 11.2 of F4M 3.0 spec. + man_base_url = get_base_url(doc) or man_url + + base_url = compat_urlparse.urljoin(man_base_url, media.attrib['url']) bootstrap_node = doc.find(_add_ns('bootstrapInfo')) - # From Adobe F4M 3.0 spec: - # The element SHALL be the base URL for all relative - # (HTTP-based) URLs in the manifest. If is not present, said - # URLs should be relative to the location of the containing document. - boot_info, bootstrap_url = self._parse_bootstrap_node(bootstrap_node, man_url) + boot_info, bootstrap_url = self._parse_bootstrap_node( + bootstrap_node, man_base_url) live = boot_info['live'] metadata_node = media.find(_add_ns('metadata')) if metadata_node is not None: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a67ac4411..64fb869aa 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -29,7 +29,10 @@ from ..compat import ( compat_urlparse, compat_xml_parse_error, ) -from ..downloader.f4m import remove_encrypted_media +from ..downloader.f4m import ( + get_base_url, + remove_encrypted_media, +) from ..utils import ( NO_DEFAULT, age_restricted, @@ -1239,11 +1242,8 @@ class InfoExtractor(object): media_nodes = remove_encrypted_media(media_nodes) if not media_nodes: return formats - base_url = xpath_text( - manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'], - 'base URL', default=None) - if base_url: - base_url = base_url.strip() + + manifest_base_url = get_base_url(manifest) bootstrap_info = xpath_element( manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'], @@ -1275,7 +1275,7 @@ class InfoExtractor(object): continue manifest_url = ( media_url if media_url.startswith('http://') or media_url.startswith('https://') - else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) + else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) # If media_url is itself a f4m manifest do the recursive extraction # since bitrates in parent manifest (this one) and media_url manifest # may differ leading to inability to resolve the format by requested From 187ee66c941d9c397a46ffa490375e2c405500e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 4 Nov 2017 22:11:39 +0700 Subject: [PATCH 16/17] [extractor/common] Add protocol for f4m formats --- youtube_dl/extractor/common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 64fb869aa..e2d9f52b0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1310,6 +1310,7 @@ class InfoExtractor(object): 'url': manifest_url, 'manifest_url': manifest_url, 'ext': 'flv' if bootstrap_info is not None else None, + 'protocol': 'f4m', 'tbr': tbr, 'width': width, 'height': height, From 181e381fda4ddb9083f3834a8bd1bab72c937545 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 4 Nov 2017 22:12:20 +0700 Subject: [PATCH 17/17] [test_InfoExtractor] Add test for #14660 --- test/test_InfoExtractor.py | 26 ++++++++++++++++++++++++++ test/testdata/f4m/custom_base_url.f4m | 10 ++++++++++ 2 files changed, 36 insertions(+) create mode 100644 test/testdata/f4m/custom_base_url.f4m diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index f18a823fc..686c63efa 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -574,6 +574,32 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ self.ie._sort_formats(formats) expect_value(self, formats, expected_formats, None) + def test_parse_f4m_formats(self): + _TEST_CASES = [ + ( + # https://github.com/rg3/youtube-dl/issues/14660 + 'custom_base_url', + 'http://api.new.livestream.com/accounts/6115179/events/6764928/videos/144884262.f4m', + [{ + 'manifest_url': 'http://api.new.livestream.com/accounts/6115179/events/6764928/videos/144884262.f4m', + 'ext': 'flv', + 'format_id': '2148', + 'protocol': 'f4m', + 'tbr': 2148, + 'width': 1280, + 'height': 720, + }] + ), + ] + + for f4m_file, f4m_url, expected_formats in _TEST_CASES: + with io.open('./test/testdata/f4m/%s.f4m' % f4m_file, + mode='r', encoding='utf-8') as f: + formats = self.ie._parse_f4m_formats( + compat_etree_fromstring(f.read().encode('utf-8')), + f4m_url, None) + self.ie._sort_formats(formats) + expect_value(self, formats, expected_formats, None) if __name__ == '__main__': unittest.main() diff --git a/test/testdata/f4m/custom_base_url.f4m b/test/testdata/f4m/custom_base_url.f4m new file mode 100644 index 000000000..74e1539e8 --- /dev/null +++ b/test/testdata/f4m/custom_base_url.f4m @@ -0,0 +1,10 @@ + + + recorded + http://vod.livestream.com/events/0000000000673980/ + 269.293 + AAAAm2Fic3QAAAAAAAAAAQAAAAPoAAAAAAAEG+0AAAAAAAAAAAAAAAAAAQAAABlhc3J0AAAAAAAAAAABAAAAAQAAAC4BAAAAVmFmcnQAAAAAAAAD6AAAAAAEAAAAAQAAAAAAAAAAAAAXcAAAAC0AAAAAAAQHQAAAE5UAAAAuAAAAAAAEGtUAAAEYAAAAAAAAAAAAAAAAAAAAAAA= + + AgAKb25NZXRhRGF0YQgAAAAIAAhkdXJhdGlvbgBAcNSwIMSbpgAFd2lkdGgAQJQAAAAAAAAABmhlaWdodABAhoAAAAAAAAAJZnJhbWVyYXRlAEA4/7DoLwW3AA12aWRlb2RhdGFyYXRlAECe1DLgjcobAAx2aWRlb2NvZGVjaWQAQBwAAAAAAAAADWF1ZGlvZGF0YXJhdGUAQGSimlvaPKQADGF1ZGlvY29kZWNpZABAJAAAAAAAAAAACQ== + +