From fea7295b1400f27218422cb37f70e7c4e2c66c29 Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 31 Mar 2016 10:48:22 +0100 Subject: [PATCH 001/116] [brightcove] relax embed_in_page regex --- youtube_dl/extractor/brightcove.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index c9e43a275..0d162d337 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -439,7 +439,7 @@ class BrightcoveNewIE(InfoExtractor): .*? ]+ src=["\'](?:https?:)?//players\.brightcove\.net/ - (\d+)/([\da-f-]+)_([^/]+)/index(?:\.min)?\.js + (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js ''', webpage): entries.append( 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' From dd17041c82169b862e7f91cee9c5a5ed86b68ca9 Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 31 Mar 2016 11:59:40 +0100 Subject: [PATCH 002/116] [tenplay] remove extractor(fixes #6927) --- youtube_dl/extractor/__init__.py | 1 - youtube_dl/extractor/tenplay.py | 90 -------------------------------- 2 files changed, 91 deletions(-) delete mode 100644 youtube_dl/extractor/tenplay.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 7b0f2b21a..0a2dee40a 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -737,7 +737,6 @@ from .telecinco import TelecincoIE from .telegraaf import TelegraafIE from .telemb import TeleMBIE from .teletask import TeleTaskIE -from .tenplay import TenPlayIE from .testurl import TestURLIE from .tf1 import TF1IE from .theintercept import TheInterceptIE diff --git a/youtube_dl/extractor/tenplay.py b/youtube_dl/extractor/tenplay.py deleted file mode 100644 index 02a31a609..000000000 --- a/youtube_dl/extractor/tenplay.py +++ /dev/null @@ -1,90 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - float_or_none, -) - - 
-class TenPlayIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ten(play)?\.com\.au/.+' - _TEST = { - 'url': 'http://tenplay.com.au/ten-insider/extra/season-2013/tenplay-tv-your-way', - 'info_dict': { - 'id': '2695695426001', - 'ext': 'flv', - 'title': 'TENplay: TV your way', - 'description': 'Welcome to a new TV experience. Enjoy a taste of the TENplay benefits.', - 'timestamp': 1380150606.889, - 'upload_date': '20130925', - 'uploader': 'TENplay', - }, - 'params': { - 'skip_download': True, # Requires rtmpdump - } - } - - _video_fields = [ - 'id', 'name', 'shortDescription', 'longDescription', 'creationDate', - 'publishedDate', 'lastModifiedDate', 'customFields', 'videoStillURL', - 'thumbnailURL', 'referenceId', 'length', 'playsTotal', - 'playsTrailingWeek', 'renditions', 'captioning', 'startDate', 'endDate'] - - def _real_extract(self, url): - webpage = self._download_webpage(url, url) - video_id = self._html_search_regex( - r'videoID: "(\d+?)"', webpage, 'video_id') - api_token = self._html_search_regex( - r'apiToken: "([a-zA-Z0-9-_\.]+?)"', webpage, 'api_token') - title = self._html_search_regex( - r'', - webpage, 'title') - - json = self._download_json('https://api.brightcove.com/services/library?command=find_video_by_id&video_id=%s&token=%s&video_fields=%s' % (video_id, api_token, ','.join(self._video_fields)), title) - - formats = [] - for rendition in json['renditions']: - url = rendition['remoteUrl'] or rendition['url'] - protocol = 'rtmp' if url.startswith('rtmp') else 'http' - ext = 'flv' if protocol == 'rtmp' else rendition['videoContainer'].lower() - - if protocol == 'rtmp': - url = url.replace('&mp4:', '') - - tbr = int_or_none(rendition.get('encodingRate'), 1000) - - formats.append({ - 'format_id': '_'.join( - ['rtmp', rendition['videoContainer'].lower(), - rendition['videoCodec'].lower(), '%sk' % tbr]), - 'width': int_or_none(rendition['frameWidth']), - 'height': int_or_none(rendition['frameHeight']), - 'tbr': tbr, - 'filesize': 
int_or_none(rendition['size']), - 'protocol': protocol, - 'ext': ext, - 'vcodec': rendition['videoCodec'].lower(), - 'container': rendition['videoContainer'].lower(), - 'url': url, - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': json['referenceId'], - 'title': json['name'], - 'description': json['shortDescription'] or json['longDescription'], - 'formats': formats, - 'thumbnails': [{ - 'url': json['videoStillURL'] - }, { - 'url': json['thumbnailURL'] - }], - 'thumbnail': json['videoStillURL'], - 'duration': float_or_none(json.get('length'), 1000), - 'timestamp': float_or_none(json.get('creationDate'), 1000), - 'uploader': json.get('customFields', {}).get('production_company_distributor') or 'TENplay', - 'view_count': int_or_none(json.get('playsTotal')), - } From d84b48e3f1d9c2099a2a8ba48df3a2bd5e591807 Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 31 Mar 2016 13:44:55 +0100 Subject: [PATCH 003/116] [nationalgeographic] improve extraction --- youtube_dl/extractor/nationalgeographic.py | 30 +++++++++++----------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index d5e53365c..1560e3e81 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -8,14 +8,16 @@ from ..utils import ( class NationalGeographicIE(InfoExtractor): + IE_NAME = 'natgeo' _VALID_URL = r'https?://video\.nationalgeographic\.com/.*?' 
_TESTS = [ { 'url': 'http://video.nationalgeographic.com/video/news/150210-news-crab-mating-vin?source=featuredvideo', + 'md5': '730855d559abbad6b42c2be1fa584917', 'info_dict': { - 'id': '4DmDACA6Qtk_', - 'ext': 'flv', + 'id': '0000014b-70a1-dd8c-af7f-f7b559330001', + 'ext': 'mp4', 'title': 'Mating Crabs Busted by Sharks', 'description': 'md5:16f25aeffdeba55aaa8ec37e093ad8b3', }, @@ -23,9 +25,10 @@ class NationalGeographicIE(InfoExtractor): }, { 'url': 'http://video.nationalgeographic.com/wild/when-sharks-attack/the-real-jaws', + 'md5': '6a3105eb448c070503b3105fb9b320b5', 'info_dict': { - 'id': '_JeBD_D7PlS5', - 'ext': 'flv', + 'id': 'ngc-I0IauNSWznb_UV008GxSbwY35BZvgi2e', + 'ext': 'mp4', 'title': 'The Real Jaws', 'description': 'md5:8d3e09d9d53a85cd397b4b21b2c77be6', }, @@ -37,18 +40,15 @@ class NationalGeographicIE(InfoExtractor): name = url_basename(url) webpage = self._download_webpage(url, name) - feed_url = self._search_regex( - r'data-feed-url="([^"]+)"', webpage, 'feed url') guid = self._search_regex( r'id="(?:videoPlayer|player-container)"[^>]+data-guid="([^"]+)"', webpage, 'guid') - feed = self._download_xml('%s?byGuid=%s' % (feed_url, guid), name) - content = feed.find('.//{http://search.yahoo.com/mrss/}content') - theplatform_id = url_basename(content.attrib.get('url')) - - return self.url_result(smuggle_url( - 'http://link.theplatform.com/s/ngs/%s?formats=MPEG4&manifest=f4m' % theplatform_id, - # For some reason, the normal links don't work and we must force - # the use of f4m - {'force_smil_url': True})) + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': smuggle_url( + 'http://link.theplatform.com/s/ngs/media/guid/2423130747/%s?mbr=true' % guid, + {'force_smil_url': True}), + 'id': guid, + } From c9c39c22c5740c1eedcc9ce7a10f5df199ea5c78 Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 31 Mar 2016 13:47:38 +0100 Subject: [PATCH 004/116] [nationalgeographic] add support for channel.nationalgeographic.com urls --- 
youtube_dl/extractor/__init__.py | 5 ++- youtube_dl/extractor/nationalgeographic.py | 47 ++++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 0a2dee40a..76354b67e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -438,7 +438,10 @@ from .myspass import MySpassIE from .myvi import MyviIE from .myvideo import MyVideoIE from .myvidster import MyVidsterIE -from .nationalgeographic import NationalGeographicIE +from .nationalgeographic import ( + NationalGeographicIE, + NationalGeographicChannelIE, +) from .naver import NaverIE from .nba import NBAIE from .nbc import ( diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 1560e3e81..61b5c700e 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -4,6 +4,7 @@ from .common import InfoExtractor from ..utils import ( smuggle_url, url_basename, + update_url_query, ) @@ -52,3 +53,49 @@ class NationalGeographicIE(InfoExtractor): {'force_smil_url': True}), 'id': guid, } + + +class NationalGeographicChannelIE(InfoExtractor): + IE_NAME = 'natgeo:channel' + _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?[^/]+/videos/(?P[^/?]+)' + + _TESTS = [ + { + 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/videos/uncovering-a-universal-knowledge/', + 'md5': '518c9aa655686cf81493af5cc21e2a04', + 'info_dict': { + 'id': 'nB5vIAfmyllm', + 'ext': 'mp4', + 'title': 'Uncovering a Universal Knowledge', + 'description': 'md5:1a89148475bf931b3661fcd6ddb2ae3a', + }, + 'add_ie': ['ThePlatform'], + }, + { + 'url': 'http://channel.nationalgeographic.com/wild/destination-wild/videos/the-stunning-red-bird-of-paradise/', + 'md5': 'c4912f656b4cbe58f3e000c489360989', + 'info_dict': { + 'id': '3TmMv9OvGwIR', + 'ext': 'mp4', + 'title': 'The Stunning Red Bird of 
Paradise', + 'description': 'md5:7bc8cd1da29686be4d17ad1230f0140c', + }, + 'add_ie': ['ThePlatform'], + }, + ] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + release_url = self._search_regex( + r'video_auth_playlist_url\s*=\s*"([^"]+)"', + webpage, 'release url') + + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': smuggle_url( + update_url_query(release_url, {'mbr': 'true', 'switch': 'http'}), + {'force_smil_url': True}), + 'display_id': display_id, + } From 5299bc3f91ffbb784addaee002611a52232134a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 31 Mar 2016 20:42:41 +0600 Subject: [PATCH 005/116] [beeg] Switch to api v6 (Closes #9036) --- youtube_dl/extractor/beeg.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index 34c2a756f..9072949dd 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -34,7 +34,7 @@ class BeegIE(InfoExtractor): video_id = self._match_id(url) video = self._download_json( - 'https://api.beeg.com/api/v5/video/%s' % video_id, video_id) + 'https://api.beeg.com/api/v6/1738/video/%s' % video_id, video_id) def split(o, e): def cut(s, x): @@ -50,8 +50,8 @@ class BeegIE(InfoExtractor): return n def decrypt_key(key): - # Reverse engineered from http://static.beeg.com/cpl/1105.js - a = '5ShMcIQlssOd7zChAIOlmeTZDaUxULbJRnywYaiB' + # Reverse engineered from http://static.beeg.com/cpl/1738.js + a = 'GUuyodcfS8FW8gQp4OKLMsZBcX0T7B' e = compat_urllib_parse_unquote(key) o = ''.join([ compat_chr(compat_ord(e[n]) - compat_ord(a[n % len(a)]) % 21) From 81da8cbc4513df16d0d04dc2992d6de9ab0f4038 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 31 Mar 2016 22:05:25 +0600 Subject: [PATCH 006/116] [udemy] Switch to api 2.0 (Closes #9035) --- youtube_dl/extractor/udemy.py | 60 
+++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 71bea5363..2e54dbc11 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -17,6 +17,7 @@ from ..utils import ( int_or_none, sanitized_Request, unescapeHTML, + update_url_query, urlencode_postdata, ) @@ -54,6 +55,16 @@ class UdemyIE(InfoExtractor): 'only_matching': True, }] + def _extract_course_info(self, webpage, video_id): + course = self._parse_json( + unescapeHTML(self._search_regex( + r'ng-init=["\'].*\bcourse=({.+?});', webpage, 'course', default='{}')), + video_id, fatal=False) or {} + course_id = course.get('id') or self._search_regex( + (r'"id"\s*:\s*(\d+)', r'data-course-id=["\'](\d+)'), + webpage, 'course id') + return course_id, course.get('title') + def _enroll_course(self, base_url, webpage, course_id): def combine_url(base_url, url): return compat_urlparse.urljoin(base_url, url) if not url.startswith('http') else url @@ -98,7 +109,7 @@ class UdemyIE(InfoExtractor): error_str += ' - %s' % error_data.get('formErrors') raise ExtractorError(error_str, expected=True) - def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata'): + def _download_json(self, url_or_request, *args, **kwargs): headers = { 'X-Udemy-Snail-Case': 'true', 'X-Requested-With': 'XMLHttpRequest', @@ -116,7 +127,7 @@ class UdemyIE(InfoExtractor): else: url_or_request = sanitized_Request(url_or_request, headers=headers) - response = super(UdemyIE, self)._download_json(url_or_request, video_id, note) + response = super(UdemyIE, self)._download_json(url_or_request, *args, **kwargs) self._handle_error(response) return response @@ -166,9 +177,7 @@ class UdemyIE(InfoExtractor): webpage = self._download_webpage(url, lecture_id) - course_id = self._search_regex( - (r'data-course-id=["\'](\d+)', r'"id"\s*:\s*(\d+)'), - webpage, 'course id') + course_id, _ = 
self._extract_course_info(webpage, lecture_id) try: lecture = self._download_lecture(course_id, lecture_id) @@ -309,29 +318,32 @@ class UdemyCourseIE(UdemyIE): webpage = self._download_webpage(url, course_path) - response = self._download_json( - 'https://www.udemy.com/api-1.1/courses/%s' % course_path, - course_path, 'Downloading course JSON') - - course_id = response['id'] - course_title = response.get('title') + course_id, title = self._extract_course_info(webpage, course_path) self._enroll_course(url, webpage, course_id) + course_url = update_url_query( + 'https://www.udemy.com/api-2.0/courses/%s/cached-subscriber-curriculum-items' % course_id, + { + 'fields[chapter]': 'title,object_index', + 'fields[lecture]': 'title', + 'page_size': '1000', + }) + response = self._download_json( - 'https://www.udemy.com/api-1.1/courses/%s/curriculum' % course_id, - course_id, 'Downloading course curriculum') + course_url, course_id, 'Downloading course curriculum') entries = [] - chapter, chapter_number = None, None - for asset in response: - asset_type = asset.get('assetType') or asset.get('asset_type') - if asset_type == 'Video': - asset_id = asset.get('id') - if asset_id: + chapter, chapter_number = [None] * 2 + for entry in response['results']: + clazz = entry.get('_class') + if clazz == 'lecture': + lecture_id = entry.get('id') + if lecture_id: entry = { '_type': 'url_transparent', - 'url': 'https://www.udemy.com/%s/#/lecture/%s' % (course_path, asset['id']), + 'url': 'https://www.udemy.com/%s/#/lecture/%s' % (course_path, entry['id']), + 'title': entry.get('title'), 'ie_key': UdemyIE.ie_key(), } if chapter_number: @@ -339,8 +351,8 @@ class UdemyCourseIE(UdemyIE): if chapter: entry['chapter'] = chapter entries.append(entry) - elif asset.get('type') == 'chapter': - chapter_number = asset.get('index') or asset.get('object_index') - chapter = asset.get('title') + elif clazz == 'chapter': + chapter_number = entry.get('object_index') + chapter = entry.get('title') - return 
self.playlist_result(entries, course_id, course_title) + return self.playlist_result(entries, course_id, title) From ed0291d1533600b21903cb98f070791a20e47433 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 31 Mar 2016 22:55:01 +0600 Subject: [PATCH 007/116] [utils] Add update_Request --- youtube_dl/utils.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 6d27b80c0..4532b737b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1804,6 +1804,20 @@ def update_url_query(url, query): query=compat_urllib_parse_urlencode(qs, True))) +def update_Request(req, url=None, data=None, headers={}, query={}): + req_headers = req.headers.copy() + req_headers.update(headers) + req_data = data or req.data + req_url = update_url_query(url or req.get_full_url(), query) + req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request + new_req = req_type( + req_url, data=req_data, headers=req_headers, + origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) + if hasattr(req, 'timeout'): + new_req.timeout = req.timeout + return new_req + + def dict_get(d, key_or_keys, default=None, skip_false_values=True): if isinstance(key_or_keys, (list, tuple)): for key in key_or_keys: From 15d260ebaa48409112270685d306a5d9152260c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 31 Mar 2016 22:55:49 +0600 Subject: [PATCH 008/116] [utils] Use update_Request in http_request --- youtube_dl/utils.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 4532b737b..5c4ab2748 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -778,12 +778,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): # Substitute URL if any change after escaping if url != url_escaped: - req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request - new_req = 
req_type( - url_escaped, data=req.data, headers=req.headers, - origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) - new_req.timeout = req.timeout - req = new_req + req = update_Request(req, url=url_escaped) for h, v in std_headers.items(): # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275 From 41d06b042427aa019200bb80a1935d12110f50b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 31 Mar 2016 22:58:38 +0600 Subject: [PATCH 009/116] [extractor/common] Improve _request_webpage * Do not ignore data, headers and query for Requests * Default values for headers and query switched to dicts since these are used by urllib itself --- youtube_dl/extractor/common.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 9b7ab8924..85ac0400c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -22,6 +22,7 @@ from ..compat import ( compat_str, compat_urllib_error, compat_urllib_parse_urlencode, + compat_urllib_request, compat_urlparse, ) from ..downloader.f4m import remove_encrypted_media @@ -49,6 +50,7 @@ from ..utils import ( determine_protocol, parse_duration, mimetype2ext, + update_Request, update_url_query, ) @@ -347,7 +349,7 @@ class InfoExtractor(object): def IE_NAME(self): return compat_str(type(self).__name__[:-2]) - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None): + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): """ Returns the response handle """ if note is None: self.report_download_webpage(video_id) @@ -357,11 +359,14 @@ class InfoExtractor(object): else: self.to_screen('%s: %s' % (video_id, note)) # data, headers and query params will be ignored for `Request` objects - if isinstance(url_or_request, 
compat_str): + if isinstance(url_or_request, compat_urllib_request.Request): + url_or_request = update_Request( + url_or_request, data=data, headers=headers, query=query) + else: if query: url_or_request = update_url_query(url_or_request, query) if data or headers: - url_or_request = sanitized_Request(url_or_request, data, headers or {}) + url_or_request = sanitized_Request(url_or_request, data, headers) try: return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: @@ -377,7 +382,7 @@ class InfoExtractor(object): self._downloader.report_warning(errmsg) return False - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers=None, query=None): + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}): """ Returns a tuple (page content as string, URL handle) """ # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): @@ -470,7 +475,7 @@ class InfoExtractor(object): return content - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers=None, query=None): + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}): """ Returns the data of the page as a string """ success = False try_count = 0 @@ -491,7 +496,7 @@ class InfoExtractor(object): def _download_xml(self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True, encoding=None, data=None, headers=None, query=None): + transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}): """Return the xml as an xml.etree.ElementTree.Element""" xml_string = 
self._download_webpage( url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) @@ -505,7 +510,7 @@ class InfoExtractor(object): note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers=None, query=None): + fatal=True, encoding=None, data=None, headers={}, query={}): json_string = self._download_webpage( url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) From 6bb4600717cb5378d392845e5c9bebe236a1b224 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 31 Mar 2016 22:59:19 +0600 Subject: [PATCH 010/116] [udemy:course] Simplify course curriculum downloading --- youtube_dl/extractor/udemy.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 2e54dbc11..1784599e9 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -322,17 +322,14 @@ class UdemyCourseIE(UdemyIE): self._enroll_course(url, webpage, course_id) - course_url = update_url_query( + response = self._download_json( 'https://www.udemy.com/api-2.0/courses/%s/cached-subscriber-curriculum-items' % course_id, - { + course_id, 'Downloading course curriculum', query={ 'fields[chapter]': 'title,object_index', 'fields[lecture]': 'title', 'page_size': '1000', }) - response = self._download_json( - course_url, course_id, 'Downloading course curriculum') - entries = [] chapter, chapter_number = [None] * 2 for entry in response['results']: From 86f3b66cec3a1ba6d3b197018a954865ca14c323 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 31 Mar 2016 23:00:11 +0600 Subject: [PATCH 011/116] [udemy] Remove unused import --- youtube_dl/extractor/udemy.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/udemy.py 
b/youtube_dl/extractor/udemy.py index 1784599e9..e91cf44fe 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -17,7 +17,6 @@ from ..utils import ( int_or_none, sanitized_Request, unescapeHTML, - update_url_query, urlencode_postdata, ) From 8369942773aec76a5b97582f77b3e67f701bcf47 Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 31 Mar 2016 18:36:41 +0100 Subject: [PATCH 012/116] [voxmedia] Add new extractor(closes #3182) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/voxmedia.py | 99 ++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 youtube_dl/extractor/voxmedia.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 76354b67e..19f802411 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -904,6 +904,7 @@ from .vk import ( from .vlive import VLiveIE from .vodlocker import VodlockerIE from .voicerepublic import VoiceRepublicIE +from .voxmedia import VoxMediaIE from .vporn import VpornIE from .vrt import VRTIE from .vube import VubeIE diff --git a/youtube_dl/extractor/voxmedia.py b/youtube_dl/extractor/voxmedia.py new file mode 100644 index 000000000..3b13d38a1 --- /dev/null +++ b/youtube_dl/extractor/voxmedia.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote + + +class VoxMediaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?:theverge|vox|sbnation|eater|polygon|curbed|racked)\.com/(?:[^/]+/)*(?P[^/?]+)' + _TESTS = [{ + 'url': 'http://www.theverge.com/2014/6/27/5849272/material-world-how-google-discovered-what-software-is-made-of', + 'md5': '73856edf3e89a711e70d5cf7cb280b37', + 'info_dict': { + 'id': '11eXZobjrG8DCSTgrNjVinU-YmmdYjhe', + 'ext': 'mp4', + 'title': 'Google\'s new material design direction', + 'description': 'md5:2f44f74c4d14a1f800ea73e1c6832ad2', + } + }, { + # data-ooyala-id 
+ 'url': 'http://www.theverge.com/2014/10/21/7025853/google-nexus-6-hands-on-photos-video-android-phablet', + 'md5': 'd744484ff127884cd2ba09e3fa604e4b', + 'info_dict': { + 'id': 'RkZXU4cTphOCPDMZg5oEounJyoFI0g-B', + 'ext': 'mp4', + 'title': 'The Nexus 6: hands-on with Google\'s phablet', + 'description': 'md5:87a51fe95ff8cea8b5bdb9ac7ae6a6af', + } + }, { + # volume embed + 'url': 'http://www.vox.com/2016/3/31/11336640/mississippi-lgbt-religious-freedom-bill', + 'md5': '375c483c5080ab8cd85c9c84cfc2d1e4', + 'info_dict': { + 'id': 'wydzk3dDpmRz7PQoXRsTIX6XTkPjYL0b', + 'ext': 'mp4', + 'title': 'The new frontier of LGBTQ civil rights, explained', + 'description': 'md5:0dc58e94a465cbe91d02950f770eb93f', + } + }, { + # youtube embed + 'url': 'http://www.vox.com/2016/3/24/11291692/robot-dance', + 'md5': '83b3080489fb103941e549352d3e0977', + 'info_dict': { + 'id': 'FcNHTJU1ufM', + 'ext': 'mp4', + 'title': 'How "the robot" became the greatest novelty dance of all time', + 'description': 'md5:b081c0d588b8b2085870cda55e6da176', + 'upload_date': '20160324', + 'uploader_id': 'voxdotcom', + 'uploader': 'Vox', + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = compat_urllib_parse_unquote(self._download_webpage(url, display_id)) + + title = None + description = None + provider_video_id = None + provider_video_type = None + + entry = self._search_regex([ + r'Chorus\.VideoContext\.addVideo\(\[({.+})\]\);', + r'var\s+entry\s*=\s*({.+});' + ], webpage, 'video data', default=None) + if entry: + video_data = self._parse_json(entry, display_id) + provider_video_id = video_data.get('provider_video_id') + provider_video_type = video_data.get('provider_video_type') + if provider_video_id and provider_video_type: + title = video_data.get('title') + description = video_data.get('description') + + if not provider_video_id or not provider_video_type: + provider_video_id = self._search_regex( + r'data-ooyala-id="([^"]+)"', webpage, 'ooyala id', default=None) 
+ if provider_video_id: + provider_video_type = 'ooyala' + else: + volume_uuid = self._search_regex(r'data-volume-uuid="([^"]+)"', webpage, 'volume uuid') + volume_webpage = self._download_webpage( + 'http://volume.vox-cdn.com/embed/%s' % volume_uuid, volume_uuid) + video_data = self._parse_json(self._search_regex( + r'Volume\.createVideo\(({.+})\s*,\s*{.*}\);', volume_webpage, 'video data'), volume_uuid) + title = video_data.get('title_short') + description = video_data.get('description_long') or video_data.get('description_short') + for pvtype in ('ooyala', 'youtube'): + provider_video_id = video_data.get('%s_id' % pvtype) + if provider_video_id: + provider_video_type = pvtype + break + + return { + '_type': 'url_transparent', + 'url': provider_video_id if provider_video_type == 'youtube' else '%s:%s' % (provider_video_type, provider_video_id), + 'title': title or self._og_search_title(webpage), + 'description': description or self._og_search_description(webpage), + } From 5b012dfce87e4f7dd9ab4a2f5a01ab8f2e16adad Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 1 Apr 2016 01:42:16 +0800 Subject: [PATCH 013/116] [tudou] Improve error handling (closes #8988) --- youtube_dl/extractor/tudou.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 9892e8a62..63b5d5924 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -5,6 +5,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + ExtractorError, int_or_none, InAdvancePagedList, float_or_none, @@ -46,6 +47,19 @@ class TudouIE(InfoExtractor): _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf' + # Translated from tudou/tools/TVCHelper.as in PortalPlayer_193.swf + # 0001, 0002 and 4001 are not included as they indicate temporary issues + TVC_ERRORS = { + '0003': 'The video is deleted or does not 
exist', + '1001': 'This video is unavailable due to licensing issues', + '1002': 'This video is unavailable as it\'s under review', + '1003': 'This video is unavailable as it\'s under review', + '3001': 'Password required', + '5001': 'This video is available in Mainland China only due to licensing issues', + '7001': 'This video is unavailable', + '8001': 'This video is unavailable due to licensing issues', + } + def _url_for_id(self, video_id, quality=None): info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id) if quality: @@ -63,6 +77,15 @@ class TudouIE(InfoExtractor): if youku_vcode: return self.url_result('youku:' + youku_vcode, ie='Youku') + if not item_data.get('itemSegs'): + tvc_code = item_data.get('tvcCode') + if tvc_code: + err_msg = self.TVC_ERRORS.get(tvc_code) + if err_msg: + raise ExtractorError('Tudou said: %s' % err_msg, expected=True) + raise ExtractorError('Unexpected error %s returned from Tudou' % tvc_code) + raise ExtractorError('Unxpected error returned from Tudou') + title = unescapeHTML(item_data['kw']) description = item_data.get('desc') thumbnail_url = item_data.get('pic') From 4ecc1fc6387d900b7d61d43a112becff9e293206 Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 31 Mar 2016 21:09:27 +0100 Subject: [PATCH 014/116] [howstuffworks] improve extraction --- youtube_dl/extractor/howstuffworks.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/howstuffworks.py b/youtube_dl/extractor/howstuffworks.py index 663e6632a..76b74c51d 100644 --- a/youtube_dl/extractor/howstuffworks.py +++ b/youtube_dl/extractor/howstuffworks.py @@ -6,6 +6,7 @@ from ..utils import ( int_or_none, js_to_json, unescapeHTML, + determine_ext, ) @@ -39,7 +40,7 @@ class HowStuffWorksIE(InfoExtractor): 'url': 'http://entertainment.howstuffworks.com/arts/2706-sword-swallowing-1-by-dan-meyer-video.htm', 'info_dict': { 'id': '440011', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Sword Swallowing #1 by Dan Meyer', 
'description': 'Video footage (1 of 3) used by permission of the owner Dan Meyer through Sword Swallowers Association International ', 'display_id': 'sword-swallowing-1-by-dan-meyer', @@ -63,13 +64,19 @@ class HowStuffWorksIE(InfoExtractor): video_id = clip_info['content_id'] formats = [] m3u8_url = clip_info.get('m3u8') - if m3u8_url: - formats += self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + if m3u8_url and determine_ext(m3u8_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', format_id='hls', fatal=True)) + flv_url = clip_info.get('flv_url') + if flv_url: + formats.append({ + 'url': flv_url, + 'format_id': 'flv', + }) for video in clip_info.get('mp4', []): formats.append({ 'url': video['src'], - 'format_id': video['bitrate'], - 'vbr': int(video['bitrate'].rstrip('k')), + 'format_id': 'mp4-%s' % video['bitrate'], + 'vbr': int_or_none(video['bitrate'].rstrip('k')), }) if not formats: @@ -102,6 +109,6 @@ class HowStuffWorksIE(InfoExtractor): 'title': unescapeHTML(clip_info['clip_title']), 'description': unescapeHTML(clip_info.get('caption')), 'thumbnail': clip_info.get('video_still_url'), - 'duration': clip_info.get('duration'), + 'duration': int_or_none(clip_info.get('duration')), 'formats': formats, } From b53a06e3b9f2c4ad86b09d35051f5eff2ad1bff0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Apr 2016 02:24:22 +0600 Subject: [PATCH 015/116] [udemy:course] Use new URL format --- youtube_dl/extractor/udemy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index e91cf44fe..a788cdd77 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -305,7 +305,7 @@ class UdemyIE(InfoExtractor): class UdemyCourseIE(UdemyIE): IE_NAME = 'udemy:course' - _VALID_URL = r'https?://www\.udemy\.com/(?P[\da-z-]+)' + _VALID_URL = r'https?://www\.udemy\.com/(?P[^/?#&]+)' _TESTS = [] @classmethod @@ -338,7 
+338,7 @@ class UdemyCourseIE(UdemyIE): if lecture_id: entry = { '_type': 'url_transparent', - 'url': 'https://www.udemy.com/%s/#/lecture/%s' % (course_path, entry['id']), + 'url': 'https://www.udemy.com/%s/learn/v4/t/lecture/%s' % (course_path, entry['id']), 'title': entry.get('title'), 'ie_key': UdemyIE.ie_key(), } From 961fc024d218275221dce8de2015c3900f2557c8 Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 31 Mar 2016 23:33:36 +0100 Subject: [PATCH 016/116] [voxmedia] improve sbnation support --- youtube_dl/extractor/voxmedia.py | 109 ++++++++++++++++++++----------- 1 file changed, 71 insertions(+), 38 deletions(-) diff --git a/youtube_dl/extractor/voxmedia.py b/youtube_dl/extractor/voxmedia.py index 3b13d38a1..0c6b1f030 100644 --- a/youtube_dl/extractor/voxmedia.py +++ b/youtube_dl/extractor/voxmedia.py @@ -49,51 +49,84 @@ class VoxMediaIE(InfoExtractor): 'uploader_id': 'voxdotcom', 'uploader': 'Vox', } + }, { + # SBN.VideoLinkset.entryGroup multiple ooyala embeds + 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok', + 'info_dict': { + 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok', + 'title': '25 lies you will tell yourself on National Signing Day', + 'description': 'It\'s the most self-delusional time of the year, and everyone\'s gonna tell the same lies together!', + }, + 'playlist': [{ + 'md5': '721fededf2ab74ae4176c8c8cbfe092e', + 'info_dict': { + 'id': 'p3cThlMjE61VDi_SD9JlIteSNPWVDBB9', + 'ext': 'mp4', + 'title': 'Buddy Hield vs Steph Curry (and the world)', + 'description': 'Let’s dissect only the most important Final Four storylines.', + }, + }, { + 'md5': 'bf0c5cc115636af028be1bab79217ea9', + 'info_dict': { + 'id': 'BmbmVjMjE6esPHxdALGubTrouQ0jYLHj', + 'ext': 'mp4', + 'title': 'Chasing Cinderella 2016: Syracuse basketball', + 'description': 'md5:e02d56b026d51aa32c010676765a690d', + }, + }], }] def _real_extract(self, url): display_id = 
self._match_id(url) webpage = compat_urllib_parse_unquote(self._download_webpage(url, display_id)) - title = None - description = None - provider_video_id = None - provider_video_type = None + def create_entry(provider_video_id, provider_video_type, title=None, description=None): + return { + '_type': 'url_transparent', + 'url': provider_video_id if provider_video_type == 'youtube' else '%s:%s' % (provider_video_type, provider_video_id), + 'title': title or self._og_search_title(webpage), + 'description': description or self._og_search_description(webpage), + } - entry = self._search_regex([ - r'Chorus\.VideoContext\.addVideo\(\[({.+})\]\);', - r'var\s+entry\s*=\s*({.+});' + entries = [] + entries_data = self._search_regex([ + r'Chorus\.VideoContext\.addVideo\((\[{.+}\])\);', + r'var\s+entry\s*=\s*({.+});', + r'SBN\.VideoLinkset\.entryGroup\(\s*(\[.+\])', ], webpage, 'video data', default=None) - if entry: - video_data = self._parse_json(entry, display_id) - provider_video_id = video_data.get('provider_video_id') - provider_video_type = video_data.get('provider_video_type') - if provider_video_id and provider_video_type: - title = video_data.get('title') - description = video_data.get('description') + if entries_data: + entries_data = self._parse_json(entries_data, display_id) + if isinstance(entries_data, dict): + entries_data = [entries_data] + for video_data in entries_data: + provider_video_id = video_data.get('provider_video_id') + provider_video_type = video_data.get('provider_video_type') + if provider_video_id and provider_video_type: + entries.append(create_entry( + provider_video_id, provider_video_type, + video_data.get('title'), video_data.get('description'))) - if not provider_video_id or not provider_video_type: - provider_video_id = self._search_regex( - r'data-ooyala-id="([^"]+)"', webpage, 'ooyala id', default=None) - if provider_video_id: - provider_video_type = 'ooyala' - else: - volume_uuid = self._search_regex(r'data-volume-uuid="([^"]+)"', 
webpage, 'volume uuid') - volume_webpage = self._download_webpage( - 'http://volume.vox-cdn.com/embed/%s' % volume_uuid, volume_uuid) - video_data = self._parse_json(self._search_regex( - r'Volume\.createVideo\(({.+})\s*,\s*{.*}\);', volume_webpage, 'video data'), volume_uuid) - title = video_data.get('title_short') - description = video_data.get('description_long') or video_data.get('description_short') - for pvtype in ('ooyala', 'youtube'): - provider_video_id = video_data.get('%s_id' % pvtype) - if provider_video_id: - provider_video_type = pvtype - break + provider_video_id = self._search_regex( + r'data-ooyala-id="([^"]+)"', webpage, 'ooyala id', default=None) + if provider_video_id: + entries.append(create_entry(provider_video_id, 'ooyala')) - return { - '_type': 'url_transparent', - 'url': provider_video_id if provider_video_type == 'youtube' else '%s:%s' % (provider_video_type, provider_video_id), - 'title': title or self._og_search_title(webpage), - 'description': description or self._og_search_description(webpage), - } + volume_uuid = self._search_regex( + r'data-volume-uuid="([^"]+)"', webpage, 'volume uuid', default=None) + if volume_uuid: + volume_webpage = self._download_webpage( + 'http://volume.vox-cdn.com/embed/%s' % volume_uuid, volume_uuid) + video_data = self._parse_json(self._search_regex( + r'Volume\.createVideo\(({.+})\s*,\s*{.*}\);', volume_webpage, 'video data'), volume_uuid) + for provider_video_type in ('ooyala', 'youtube'): + provider_video_id = video_data.get('%s_id' % provider_video_type) + if provider_video_id: + description = video_data.get('description_long') or video_data.get('description_short') + entries.append(create_entry( + provider_video_id, provider_video_type, video_data.get('title_short'), description)) + break + + if len(entries) == 1: + return entries[0] + else: + return self.playlist_result(entries, display_id, self._og_search_title(webpage), self._og_search_description(webpage)) From 
f9b1529af8aec98bffd42edb5be15e1ada791a20 Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 31 Mar 2016 23:50:45 +0100 Subject: [PATCH 017/116] [generic] remove sbnation test(handled by VoxMediaIE) --- youtube_dl/extractor/generic.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index f3de738f7..589d1e152 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -406,19 +406,6 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, - # multiple ooyala embeds on SBN network websites - { - 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok', - 'info_dict': { - 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok', - 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com', - }, - 'playlist_mincount': 3, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Ooyala'], - }, # embed.ly video { 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/', From 63c55e9f2245b24f437564e616ebff76994978cf Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 1 Apr 2016 07:33:37 +0100 Subject: [PATCH 018/116] [cbs] improve extraction(closes #6321) --- youtube_dl/extractor/cbs.py | 78 +++++++++++++++++++++++++++---------- 1 file changed, 57 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 40d07ab18..0cf56b9c7 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -1,21 +1,24 @@ from __future__ import unicode_literals -from .common import InfoExtractor +from .theplatform import ThePlatformIE from ..utils import ( - sanitized_Request, - smuggle_url, + xpath_text, + xpath_element, + int_or_none, + ExtractorError, + find_xpath_attr, ) -class CBSIE(InfoExtractor): +class CBSIE(ThePlatformIE): _VALID_URL = 
r'https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P[^/]+)' _TESTS = [{ 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', 'info_dict': { - 'id': '4JUVEwq3wUT7', + 'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_', 'display_id': 'connect-chat-feat-garth-brooks', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Connect Chat feat. Garth Brooks', 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', 'duration': 1495, @@ -47,22 +50,55 @@ class CBSIE(InfoExtractor): 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', 'only_matching': True, }] + TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?manifest=m3u&mbr=true' + + def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): + closed_caption_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', 'ClosedCaptionURL') + return { + 'en': [{ + 'ext': 'ttml', + 'url': closed_caption_e.attrib['value'], + }] + } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else [] def _real_extract(self, url): display_id = self._match_id(url) - request = sanitized_Request(url) - # Android UA is served with higher quality (720p) streams (see - # https://github.com/rg3/youtube-dl/issues/7490) - request.add_header('User-Agent', 'Mozilla/5.0 (Linux; Android 4.4; Nexus 5)') - webpage = self._download_webpage(request, display_id) - real_id = self._search_regex( - [r"video\.settings\.pid\s*=\s*'([^']+)';", r"cbsplayer\.pid\s*=\s*'([^']+)';"], - webpage, 'real video ID') - return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': smuggle_url( - 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true&manifest=m3u' % real_id, - 
{'force_smil_url': True}), + webpage = self._download_webpage(url, display_id) + content_id = self._search_regex( + [r"video\.settings\.content_id\s*=\s*'([^']+)';", r"cbsplayer\.contentId\s*=\s*'([^']+)';"], + webpage, 'content id') + items_data = self._download_xml( + 'http://can.cbs.com/thunder/player/videoPlayerService.php', + content_id, query={'partner': 'cbs', 'contentId': content_id}) + video_data = xpath_element(items_data, './/item') + title = xpath_text(video_data, 'videoTitle', 'title', True) + + subtitles = {} + formats = [] + for item in items_data.findall('.//item'): + pid = xpath_text(item, 'pid') + if not pid: + continue + try: + tp_formats, tp_subtitles = self._extract_theplatform_smil( + self.TP_RELEASE_URL_TEMPLATE % pid, content_id, 'Downloading %s SMIL data' % pid) + except ExtractorError: + continue + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + self._sort_formats(formats) + + info = self.get_metadata('dJ5BDC/media/guid/2198311517/%s' % content_id, content_id) + info.update({ + 'id': content_id, 'display_id': display_id, - } + 'title': title, + 'series': xpath_text(video_data, 'seriesTitle'), + 'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')), + 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), + 'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000), + 'thumbnail': xpath_text(video_data, 'previewImageURL'), + 'formats': formats, + 'subtitles': subtitles, + }) + return info From 1e02bc7ba28bfb5859dcd0c8d960b965d9c59f12 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 1 Apr 2016 09:07:40 +0200 Subject: [PATCH 019/116] release 2016.04.01 --- .github/ISSUE_TEMPLATE.md | 6 +++--- docs/supportedsites.md | 7 +++++-- youtube_dl/version.py | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 5b1f573e7..128ba2fc0 100644 --- a/.github/ISSUE_TEMPLATE.md +++ 
b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.03.27*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.03.27** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.04.01*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.04.01** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.03.27 +[debug] youtube-dl version 2016.04.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 00b8c247c..b9bcf450c 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -118,6 +118,7 @@ - **Clubic** - **Clyp** - **cmt.com** + - **CNBC** - **CNET** - **CNN** - **CNNArticle** @@ -134,6 +135,7 @@ - **CrooksAndLiars** - **Crunchyroll** - **crunchyroll:playlist** + - **CSNNE** - **CSpan**: C-SPAN - **CtsNews**: 華視新聞 - **culturebox.francetvinfo.fr** @@ -376,7 +378,8 @@ - 
**myvideo** (Currently broken) - **MyVidster** - **n-tv.de** - - **NationalGeographic** + - **natgeo** + - **natgeo:channel** - **Naver** - **NBA** - **NBC** @@ -618,7 +621,6 @@ - **Telegraaf** - **TeleMB** - **TeleTask** - - **TenPlay** - **TF1** - **TheIntercept** - **TheOnion** @@ -740,6 +742,7 @@ - **vlive** - **Vodlocker** - **VoiceRepublic** + - **VoxMedia** - **Vporn** - **vpro**: npo.nl and ntr.nl - **VRT** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 5daa7f4e8..d9e1cb2a8 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.03.27' +__version__ = '2016.04.01' From db1c969da5dc22b36228b50ac2c09f3a50dd17ae Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 1 Apr 2016 09:55:46 +0100 Subject: [PATCH 020/116] [theplatform] sign https urls --- youtube_dl/extractor/theplatform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 236c99972..bf6f82f5a 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -154,7 +154,7 @@ class ThePlatformIE(ThePlatformBaseIE): def hex_to_str(hex): return binascii.a2b_hex(hex) - relative_path = url.split('http://link.theplatform.com/s/')[1].split('?')[0] + relative_path = re.match(r'https?://link.theplatform.com/s/([^?]+)', url).group(1) clear_text = hex_to_str(flags + expiration_date + str_to_hex(relative_path)) checksum = hmac.new(sig_key.encode('ascii'), clear_text, hashlib.sha1).hexdigest() sig = flags + expiration_date + checksum + str_to_hex(sig_secret) From d8873d4defdd527c82634bea8566370f5d385020 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 1 Apr 2016 09:56:18 +0100 Subject: [PATCH 021/116] [aenetworks] improve format extraction --- youtube_dl/extractor/aenetworks.py | 32 +++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git 
a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 6018ae79a..b7232c904 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -1,13 +1,19 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..utils import smuggle_url +from ..utils import ( + smuggle_url, + update_url_query, + unescapeHTML, +) class AENetworksIE(InfoExtractor): IE_NAME = 'aenetworks' IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network' - _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?:[^/]+/)+(?P[^/]+?)(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?P[^/]+)/(?:[^/]+/)+(?P[^/]+?)(?:$|[?#])' _TESTS = [{ 'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false', @@ -25,16 +31,13 @@ class AENetworksIE(InfoExtractor): 'expected_warnings': ['JSON-LD'], }, { 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', + 'md5': '8ff93eb073449f151d6b90c0ae1ef0c7', 'info_dict': { 'id': 'eg47EERs_JsZ', 'ext': 'mp4', 'title': 'Winter Is Coming', 'description': 'md5:641f424b7a19d8e24f26dea22cf59d74', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, 'add_ie': ['ThePlatform'], }, { 'url': 'http://www.aetv.com/shows/duck-dynasty/video/inlawful-entry', @@ -48,7 +51,7 @@ class AENetworksIE(InfoExtractor): }] def _real_extract(self, url): - video_id = self._match_id(url) + page_type, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, video_id) @@ -56,11 +59,22 @@ class AENetworksIE(InfoExtractor): r'data-href="[^"]*/%s"[^>]+data-release-url="([^"]+)"' % video_id, r"media_url\s*=\s*'([^']+)'" ] - video_url = self._search_regex(video_url_re, webpage, 'video url') + video_url = unescapeHTML(self._search_regex(video_url_re, webpage, 'video url')) + query 
= {'mbr': 'true'} + if page_type == 'shows': + query['assetTypes'] = 'medium_video_s3' + if 'switch=hds' in video_url: + query['switch'] = 'hls' info = self._search_json_ld(webpage, video_id, fatal=False) info.update({ '_type': 'url_transparent', - 'url': smuggle_url(video_url, {'sig': {'key': 'crazyjava', 'secret': 's3cr3t'}}), + 'url': smuggle_url(update_url_query( + video_url, query), { + 'sig': { + 'key': 'crazyjava', + 'secret': 's3cr3t'}, + 'force_smil_url': True + }), }) return info From 3e0c3d14d9ab49f3a98324e91763c849ef52a1df Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 1 Apr 2016 10:12:29 +0100 Subject: [PATCH 022/116] [cbs] add base extractor --- youtube_dl/extractor/cbs.py | 22 ++++++++++++---------- youtube_dl/extractor/cbsnews.py | 13 ++----------- 2 files changed, 14 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 0cf56b9c7..6e4079ca3 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -10,7 +10,18 @@ from ..utils import ( ) -class CBSIE(ThePlatformIE): +class CBSBaseIE(ThePlatformIE): + def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): + closed_caption_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', 'ClosedCaptionURL') + return { + 'en': [{ + 'ext': 'ttml', + 'url': closed_caption_e.attrib['value'], + }] + } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else [] + + +class CBSIE(CBSBaseIE): _VALID_URL = r'https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P[^/]+)' _TESTS = [{ @@ -52,15 +63,6 @@ class CBSIE(ThePlatformIE): }] TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?manifest=m3u&mbr=true' - def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): - closed_caption_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', 'ClosedCaptionURL') - return { - 'en': [{ - 
'ext': 'ttml', - 'url': closed_caption_e.attrib['value'], - }] - } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else [] - def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index e6b7f3584..b5e78a65d 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -2,14 +2,14 @@ from __future__ import unicode_literals from .common import InfoExtractor -from .theplatform import ThePlatformIE +from .cbs import CBSBaseIE from ..utils import ( parse_duration, find_xpath_attr, ) -class CBSNewsIE(ThePlatformIE): +class CBSNewsIE(CBSBaseIE): IE_DESC = 'CBS News' _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P[\da-z_-]+)' @@ -49,15 +49,6 @@ class CBSNewsIE(ThePlatformIE): }, ] - def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): - closed_caption_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', 'ClosedCaptionURL') - return { - 'en': [{ - 'ext': 'ttml', - 'url': closed_caption_e.attrib['value'], - }] - } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else [] - def _real_extract(self, url): video_id = self._match_id(url) From 3f64379eda3477306df013466045ab1a711533f4 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 1 Apr 2016 16:22:06 +0100 Subject: [PATCH 023/116] [movieclips] fix extraction --- youtube_dl/extractor/movieclips.py | 43 ++++++++++++++++++------------ 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/movieclips.py b/youtube_dl/extractor/movieclips.py index 1564cb71f..d0cb8278e 100644 --- a/youtube_dl/extractor/movieclips.py +++ b/youtube_dl/extractor/movieclips.py @@ -2,39 +2,48 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import sanitized_Request +from ..utils import ( + smuggle_url, + float_or_none, 
+ parse_iso8601, + update_url_query, +) class MovieClipsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?movieclips\.com/videos/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www.)?movieclips\.com/videos/.+-(?P\d+)(?:\?|$)' _TEST = { - 'url': 'http://www.movieclips.com/videos/warcraft-trailer-1-561180739597?autoPlay=true&playlistId=5', + 'url': 'http://www.movieclips.com/videos/warcraft-trailer-1-561180739597', + 'md5': '42b5a0352d4933a7bd54f2104f481244', 'info_dict': { 'id': 'pKIGmG83AqD9', - 'display_id': 'warcraft-trailer-1-561180739597', 'ext': 'mp4', 'title': 'Warcraft Trailer 1', 'description': 'Watch Trailer 1 from Warcraft (2016). Legendary’s WARCRAFT is a 3D epic adventure of world-colliding conflict based.', 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1446843055, + 'upload_date': '20151106', + 'uploader': 'Movieclips', }, 'add_ie': ['ThePlatform'], } def _real_extract(self, url): - display_id = self._match_id(url) - - req = sanitized_Request(url) - # it doesn't work if it thinks the browser it's too old - req.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/43.0 (Chrome)') - webpage = self._download_webpage(req, display_id) - theplatform_link = self._html_search_regex(r'src="(http://player.theplatform.com/p/.*?)"', webpage, 'theplatform link') - title = self._html_search_regex(r']*>([^>]+)-\s*\d+\s*|\s*Movieclips.com', webpage, 'title') - description = self._html_search_meta('description', webpage) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video = next(v for v in self._parse_json(self._search_regex( + r'var\s+__REACT_ENGINE__\s*=\s*({.+});', + webpage, 'react engine'), video_id)['playlist']['videos'] if v['id'] == video_id) return { '_type': 'url_transparent', - 'url': theplatform_link, - 'title': title, - 'display_id': display_id, - 'description': description, + 'ie_key': 'ThePlatform', + 'url': smuggle_url(update_url_query( + video['contentUrl'], {'mbr': 
'true'}), {'force_smil_url': True}), + 'title': self._og_search_title(webpage), + 'description': self._html_search_meta('description', webpage), + 'duration': float_or_none(video.get('duration')), + 'timestamp': parse_iso8601(video.get('dateCreated')), + 'thumbnail': video.get('defaultImage'), + 'uploader': video.get('provider'), } From 03caa463e73c2ae2f666b85febf25ddb03f961ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Apr 2016 22:38:56 +0600 Subject: [PATCH 024/116] [udemy:course] Skip non-video lectures --- youtube_dl/extractor/udemy.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index a788cdd77..bc69e6e41 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -193,12 +193,12 @@ class UdemyIE(InfoExtractor): asset = lecture['asset'] - asset_type = asset.get('assetType') or asset.get('asset_type') + asset_type = asset.get('asset_type') or asset.get('assetType') if asset_type != 'Video': raise ExtractorError( 'Lecture %s is not a video' % lecture_id, expected=True) - stream_url = asset.get('streamUrl') or asset.get('stream_url') + stream_url = asset.get('stream_url') or asset.get('streamUrl') if stream_url: youtube_url = self._search_regex( r'(https?://www\.youtube\.com/watch\?v=.*)', stream_url, 'youtube URL', default=None) @@ -206,7 +206,7 @@ class UdemyIE(InfoExtractor): return self.url_result(youtube_url, 'Youtube') video_id = asset['id'] - thumbnail = asset.get('thumbnailUrl') or asset.get('thumbnail_url') + thumbnail = asset.get('thumbnail_url') or asset.get('thumbnailUrl') duration = float_or_none(asset.get('data', {}).get('duration')) formats = [] @@ -325,7 +325,7 @@ class UdemyCourseIE(UdemyIE): 'https://www.udemy.com/api-2.0/courses/%s/cached-subscriber-curriculum-items' % course_id, course_id, 'Downloading course curriculum', query={ 'fields[chapter]': 'title,object_index', - 'fields[lecture]': 
'title', + 'fields[lecture]': 'title,asset', 'page_size': '1000', }) @@ -334,6 +334,11 @@ class UdemyCourseIE(UdemyIE): for entry in response['results']: clazz = entry.get('_class') if clazz == 'lecture': + asset = entry.get('asset') + if isinstance(asset, dict): + asset_type = asset.get('asset_type') or asset.get('assetType') + if asset_type != 'Video': + continue lecture_id = entry.get('id') if lecture_id: entry = { From a3373823e1bd0239e0f58d5dd16ef5a4ec6bceb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Apr 2016 22:42:09 +0600 Subject: [PATCH 025/116] [udemy] Remove unnecessary login/password encode This is now covered by compat_urllib_parse_urlencode --- youtube_dl/extractor/udemy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index bc69e6e41..d1e6f2703 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -151,8 +151,8 @@ class UdemyIE(InfoExtractor): login_form = self._form_hidden_inputs('login-form', login_popup) login_form.update({ - 'email': username.encode('utf-8'), - 'password': password.encode('utf-8'), + 'email': username, + 'password': password, }) request = sanitized_Request( From fbdaced256f9d7d9b0adb97d093f0f381c9483f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Apr 2016 22:45:20 +0600 Subject: [PATCH 026/116] [lynda] Remove unnecessary login/password encode --- youtube_dl/extractor/lynda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 655627479..86d47266f 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -28,8 +28,8 @@ class LyndaBaseIE(InfoExtractor): return login_form = { - 'username': username.encode('utf-8'), - 'password': password.encode('utf-8'), + 'username': username, + 'password': password, 'remember': 'false', 'stayPut': 'false' } From 
244cd04237fe4a1e4d92421711f41de3c2566d5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Apr 2016 22:46:46 +0600 Subject: [PATCH 027/116] [pluralsight] Remove unnecessary login/password encode --- youtube_dl/extractor/pluralsight.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index df03dd419..9aab77645 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -64,8 +64,8 @@ class PluralsightIE(PluralsightBaseIE): login_form = self._hidden_inputs(login_page) login_form.update({ - 'Username': username.encode('utf-8'), - 'Password': password.encode('utf-8'), + 'Username': username, + 'Password': password, }) post_url = self._search_regex( From 83cedc1cf224206adf513f5bdd5f5ce915d67933 Mon Sep 17 00:00:00 2001 From: Martin Trigaux Date: Tue, 29 Mar 2016 14:18:44 +0200 Subject: [PATCH 028/116] screencast.com: support missing www The "www." 
part of the URL is not mandatory --- youtube_dl/extractor/screencast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py index dfd897ba3..d5111c629 100644 --- a/youtube_dl/extractor/screencast.py +++ b/youtube_dl/extractor/screencast.py @@ -12,7 +12,7 @@ from ..utils import ( class ScreencastIE(InfoExtractor): - _VALID_URL = r'https?://www\.screencast\.com/t/(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?screencast\.com/t/(?P[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'http://www.screencast.com/t/3ZEjQXlT', 'md5': '917df1c13798a3e96211dd1561fded83', @@ -34,7 +34,7 @@ class ScreencastIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.(?:gif|jpg)$', } }, { - 'url': 'http://www.screencast.com/t/aAB3iowa', + 'url': 'http://screencast.com/t/aAB3iowa', 'md5': 'dedb2734ed00c9755761ccaee88527cd', 'info_dict': { 'id': 'aAB3iowa', From 81de73e5b43e5009a14f569aed92fe73e61d4f03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Apr 2016 23:00:45 +0600 Subject: [PATCH 029/116] [screencast] Add test --- youtube_dl/extractor/screencast.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py index d5111c629..c69451151 100644 --- a/youtube_dl/extractor/screencast.py +++ b/youtube_dl/extractor/screencast.py @@ -34,7 +34,7 @@ class ScreencastIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.(?:gif|jpg)$', } }, { - 'url': 'http://screencast.com/t/aAB3iowa', + 'url': 'http://www.screencast.com/t/aAB3iowa', 'md5': 'dedb2734ed00c9755761ccaee88527cd', 'info_dict': { 'id': 'aAB3iowa', @@ -53,8 +53,10 @@ class ScreencastIE(InfoExtractor): 'description': 'md5:7b9f393bc92af02326a5c5889639eab0', 'thumbnail': 're:^https?://.*\.(?:gif|jpg)$', } - }, - ] + }, { + 'url': 'http://screencast.com/t/aAB3iowa', + 'only_matching': True, + }] def _real_extract(self, url): video_id = 
self._match_id(url) From 791d6aaeccd2efae2c4c5fa1e72010be85eb89b8 Mon Sep 17 00:00:00 2001 From: Martin Trigaux Date: Tue, 29 Mar 2016 14:34:58 +0200 Subject: [PATCH 030/116] screencast.com: fallback on page title When determining the title of the page, use the tag of the page --- youtube_dl/extractor/screencast.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py index c69451151..32f31fdd7 100644 --- a/youtube_dl/extractor/screencast.py +++ b/youtube_dl/extractor/screencast.py @@ -97,7 +97,8 @@ class ScreencastIE(InfoExtractor): if title is None: title = self._html_search_regex( [r'<b>Title:</b> ([^<]*)</div>', - r'class="tabSeperator">></span><span class="tabText">(.*?)<'], + r'class="tabSeperator">></span><span class="tabText">(.*?)<', + r'<title>([^<]*)'], webpage, 'title') thumbnail = self._og_search_thumbnail(webpage) description = self._og_search_description(webpage, default=None) From 75d572e9fb8d3e26e4ab45e65cd5e23c6b1c6915 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Apr 2016 23:01:55 +0600 Subject: [PATCH 031/116] [screencast] Improve title regexes (Closes #9025) --- youtube_dl/extractor/screencast.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/screencast.py b/youtube_dl/extractor/screencast.py index 32f31fdd7..356631700 100644 --- a/youtube_dl/extractor/screencast.py +++ b/youtube_dl/extractor/screencast.py @@ -96,9 +96,9 @@ class ScreencastIE(InfoExtractor): title = self._og_search_title(webpage, default=None) if title is None: title = self._html_search_regex( - [r'Title: ([^<]*)', - r'class="tabSeperator">>(.*?)<', - r'([^<]*)'], + [r'Title: ([^<]+)', + r'class="tabSeperator">>(.+?)<', + r'([^<]+)'], webpage, 'title') thumbnail = self._og_search_thumbnail(webpage) description = self._og_search_description(webpage, default=None) From 79ba9140dc8fcf5883b7473596e8f20cba6b479f Mon Sep 17 
00:00:00 2001 From: remitamine Date: Fri, 1 Apr 2016 18:06:11 +0100 Subject: [PATCH 032/116] [theplatform] extract timestamp and uploader --- youtube_dl/extractor/aenetworks.py | 6 ++++++ youtube_dl/extractor/bravotv.py | 3 +++ youtube_dl/extractor/cbs.py | 3 +++ youtube_dl/extractor/cnbc.py | 3 +++ youtube_dl/extractor/fox.py | 3 +++ youtube_dl/extractor/nationalgeographic.py | 12 ++++++++++++ youtube_dl/extractor/nbc.py | 14 ++++++++++++++ youtube_dl/extractor/sbs.py | 6 +++++- youtube_dl/extractor/theplatform.py | 10 ++++++++++ 9 files changed, 59 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index b7232c904..3fddaba54 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -22,6 +22,9 @@ class AENetworksIE(InfoExtractor): 'ext': 'mp4', 'title': "Bet You Didn't Know: Valentine's Day", 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', + 'timestamp': 1375819729, + 'upload_date': '20130806', + 'uploader': 'AENE-NEW', }, 'params': { # m3u8 download @@ -37,6 +40,9 @@ class AENetworksIE(InfoExtractor): 'ext': 'mp4', 'title': 'Winter Is Coming', 'description': 'md5:641f424b7a19d8e24f26dea22cf59d74', + 'timestamp': 1338306241, + 'upload_date': '20120529', + 'uploader': 'AENE-NEW', }, 'add_ie': ['ThePlatform'], }, { diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py index 34d451f38..541c76944 100644 --- a/youtube_dl/extractor/bravotv.py +++ b/youtube_dl/extractor/bravotv.py @@ -15,6 +15,9 @@ class BravoTVIE(InfoExtractor): 'ext': 'mp4', 'title': 'Last Chance Kitchen Returns', 'description': 'S13: Last Chance Kitchen Returns for Top Chef Season 13', + 'timestamp': 1448926740, + 'upload_date': '20151130', + 'uploader': 'NBCU-BRAV', } } diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 6e4079ca3..c621a08d5 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -33,6 +33,9 @@ class 
CBSIE(CBSBaseIE): 'title': 'Connect Chat feat. Garth Brooks', 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', 'duration': 1495, + 'timestamp': 1385585425, + 'upload_date': '20131127', + 'uploader': 'CBSI-NEW', }, 'params': { # rtmp download diff --git a/youtube_dl/extractor/cnbc.py b/youtube_dl/extractor/cnbc.py index 25b308752..d354d9f95 100644 --- a/youtube_dl/extractor/cnbc.py +++ b/youtube_dl/extractor/cnbc.py @@ -14,6 +14,9 @@ class CNBCIE(InfoExtractor): 'ext': 'mp4', 'title': 'Fighting zombies is big business', 'description': 'md5:0c100d8e1a7947bd2feec9a5550e519e', + 'timestamp': 1459332000, + 'upload_date': '20160330', + 'uploader': 'NBCU-CNBC', }, 'params': { # m3u8 download diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index fa05af50d..95c1abf94 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -16,6 +16,9 @@ class FOXIE(InfoExtractor): 'title': 'Official Trailer: Gotham', 'description': 'Tracing the rise of the great DC Comics Super-Villains and vigilantes, Gotham reveals an entirely new chapter that has never been told.', 'duration': 129, + 'timestamp': 1400020798, + 'upload_date': '20140513', + 'uploader': 'NEWA-FNG-FOXCOM', }, 'add_ie': ['ThePlatform'], } diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 61b5c700e..722518663 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -21,6 +21,9 @@ class NationalGeographicIE(InfoExtractor): 'ext': 'mp4', 'title': 'Mating Crabs Busted by Sharks', 'description': 'md5:16f25aeffdeba55aaa8ec37e093ad8b3', + 'timestamp': 1423523799, + 'upload_date': '20150209', + 'uploader': 'NAGS', }, 'add_ie': ['ThePlatform'], }, @@ -32,6 +35,9 @@ class NationalGeographicIE(InfoExtractor): 'ext': 'mp4', 
'title': 'The Real Jaws', 'description': 'md5:8d3e09d9d53a85cd397b4b21b2c77be6', + 'timestamp': 1433772632, + 'upload_date': '20150608', + 'uploader': 'NAGS', }, 'add_ie': ['ThePlatform'], }, @@ -68,6 +74,9 @@ class NationalGeographicChannelIE(InfoExtractor): 'ext': 'mp4', 'title': 'Uncovering a Universal Knowledge', 'description': 'md5:1a89148475bf931b3661fcd6ddb2ae3a', + 'timestamp': 1458680907, + 'upload_date': '20160322', + 'uploader': 'NEWA-FNG-NGTV', }, 'add_ie': ['ThePlatform'], }, @@ -79,6 +88,9 @@ class NationalGeographicChannelIE(InfoExtractor): 'ext': 'mp4', 'title': 'The Stunning Red Bird of Paradise', 'description': 'md5:7bc8cd1da29686be4d17ad1230f0140c', + 'timestamp': 1459362152, + 'upload_date': '20160330', + 'uploader': 'NEWA-FNG-NGTV', }, 'add_ie': ['ThePlatform'], }, diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 43d75d3ca..e67025ff6 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -27,6 +27,9 @@ class NBCIE(InfoExtractor): 'ext': 'mp4', 'title': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s', 'description': 'Jimmy gives out free scoops of his new "Tonight Dough" ice cream flavor by surprising customers at the Ben & Jerry\'s scoop shop.', + 'timestamp': 1424246400, + 'upload_date': '20150218', + 'uploader': 'NBCU-COM', }, 'params': { # m3u8 download @@ -50,6 +53,9 @@ class NBCIE(InfoExtractor): 'ext': 'mp4', 'title': 'Star Wars Teaser', 'description': 'md5:0b40f9cbde5b671a7ff62fceccc4f442', + 'timestamp': 1417852800, + 'upload_date': '20141206', + 'uploader': 'NBCU-COM', }, 'params': { # m3u8 download @@ -78,6 +84,7 @@ class NBCIE(InfoExtractor): theplatform_url = 'http:' + theplatform_url return { '_type': 'url_transparent', + 'ie_key': 'ThePlatform', 'url': smuggle_url(theplatform_url, {'source_url': url}), 'id': video_id, } @@ -93,6 +100,9 @@ class NBCSportsVPlayerIE(InfoExtractor): 'ext': 'flv', 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', 'title': 'Tyler Kalinoski hits 
buzzer-beater to lift Davidson', + 'timestamp': 1426270238, + 'upload_date': '20150313', + 'uploader': 'NBCU-SPORTS', } }, { 'url': 'http://vplayer.nbcsports.com/p/BxmELC/nbc_embedshare/select/_hqLjQ95yx8Z', @@ -144,6 +154,9 @@ class CSNNEIE(InfoExtractor): 'ext': 'mp4', 'title': 'SNC evening update: Wright named Red Sox\' No. 5 starter.', 'description': 'md5:1753cfee40d9352b19b4c9b3e589b9e3', + 'timestamp': 1459369979, + 'upload_date': '20160330', + 'uploader': 'NBCU-SPORTS', } } @@ -331,6 +344,7 @@ class MSNBCIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', 'timestamp': 1406937606, 'upload_date': '20140802', + 'uploader': 'NBCU-NEWS', 'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'], }, } diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py index 2f96477ca..96472fbc4 100644 --- a/youtube_dl/extractor/sbs.py +++ b/youtube_dl/extractor/sbs.py @@ -24,6 +24,9 @@ class SBSIE(InfoExtractor): 'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5', 'thumbnail': 're:http://.*\.jpg', 'duration': 308, + 'timestamp': 1408613220, + 'upload_date': '20140821', + 'uploader': 'SBSC', }, }, { 'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed', @@ -57,6 +60,7 @@ class SBSIE(InfoExtractor): return { '_type': 'url_transparent', + 'ie_key': 'ThePlatform', 'id': video_id, - 'url': smuggle_url(theplatform_url, {'force_smil_url': True}), + 'url': smuggle_url(self._proto_relative_url(theplatform_url), {'force_smil_url': True}), } diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index bf6f82f5a..6da701a39 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -76,6 +76,8 @@ class ThePlatformBaseIE(OnceIE): 'description': info['description'], 'thumbnail': info['defaultThumbnailUrl'], 'duration': int_or_none(info.get('duration'), 1000), + 'timestamp': int_or_none(info.get('pubDate'), 1000) or None, + 'uploader': 
info.get('billingCode'), } @@ -94,6 +96,9 @@ class ThePlatformIE(ThePlatformBaseIE): 'title': 'Blackberry\'s big, bold Z30', 'description': 'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.', 'duration': 247, + 'timestamp': 1383239700, + 'upload_date': '20131031', + 'uploader': 'CBSI-NEW', }, 'params': { # rtmp download @@ -107,6 +112,9 @@ class ThePlatformIE(ThePlatformBaseIE): 'ext': 'flv', 'description': 'md5:ac330c9258c04f9d7512cf26b9595409', 'title': 'Tesla Model S: A second step towards a cleaner motoring future', + 'timestamp': 1426176191, + 'upload_date': '20150312', + 'uploader': 'CBSI-NEW', }, 'params': { # rtmp download @@ -119,6 +127,7 @@ class ThePlatformIE(ThePlatformBaseIE): 'ext': 'mp4', 'description': 'md5:644ad9188d655b742f942bf2e06b002d', 'title': 'HIGHLIGHTS: USA bag first ever series Cup win', + 'uploader': 'EGSM', } }, { 'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7', @@ -135,6 +144,7 @@ class ThePlatformIE(ThePlatformBaseIE): 'thumbnail': 're:^https?://.*\.jpg$', 'timestamp': 1435752600, 'upload_date': '20150701', + 'uploader': 'NBCU-NEWS', }, }, { # From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1 From 04819db58edfa7a169e7ba6fd2d5734500754571 Mon Sep 17 00:00:00 2001 From: theGeekPirate Date: Sat, 26 Mar 2016 05:37:40 -0700 Subject: [PATCH 033/116] [camwithher] Add extractor Corrected unnecessary test Sane variable naming RTMP all .flv & url_id for _download_webpage() Corrected all outstanding issues, next up is a squash! 
--- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/camwithher.py | 55 ++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 youtube_dl/extractor/camwithher.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 19f802411..438e1cc63 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -95,6 +95,7 @@ from .camdemy import ( CamdemyIE, CamdemyFolderIE ) +from .camwithher import CamWithHerIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .canvas import CanvasIE diff --git a/youtube_dl/extractor/camwithher.py b/youtube_dl/extractor/camwithher.py new file mode 100644 index 000000000..eb0a4ec56 --- /dev/null +++ b/youtube_dl/extractor/camwithher.py @@ -0,0 +1,55 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class CamWithHerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?camwithher\.tv/view_video\.php\?.*viewkey=(?P\w+)' + + _TESTS = [ + { + 'url': 'http://camwithher.tv/view_video.php?viewkey=6e9a24e2c0e842e1f177&page=&viewtype=&category=', + 'info_dict': { + 'id': '5644', + 'ext': 'flv', + 'title': 'Periscope Tease', + }, + 'params': { + 'skip_download': True, + } + }, + { + 'url': 'http://camwithher.tv/view_video.php?viewkey=6dfd8b7c97531a459937', + 'only_matching': True, + }, + { + 'url': 'http://camwithher.tv/view_video.php?page=&viewkey=6e9a24e2c0e842e1f177&viewtype=&category=', + 'only_matching': True, + }, + { + 'url': 'http://camwithher.tv/view_video.php?viewkey=b6c3b5bea9515d1a1fc4&page=&viewtype=&category=mv', + 'only_matching': True, + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + flv_id = self._html_search_regex(r' 2010 else flv_id) + + title = self._html_search_regex(r'
\s+

(.+?)

', webpage, 'title') + + return { + 'id': flv_id, + 'url': rtmp_url, + 'no_resume': True, + 'ext': 'flv', + 'title': title, + } From 9aaaf8e8e8ae12ed8fbc62461558a4cdb8640ad5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 1 Apr 2016 23:47:27 +0600 Subject: [PATCH 034/116] [camwithher] Improve extraction (Closes #8989) --- youtube_dl/extractor/camwithher.py | 95 ++++++++++++++++++++---------- 1 file changed, 64 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/camwithher.py b/youtube_dl/extractor/camwithher.py index eb0a4ec56..9809096ec 100644 --- a/youtube_dl/extractor/camwithher.py +++ b/youtube_dl/extractor/camwithher.py @@ -1,55 +1,88 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + unified_strdate, +) class CamWithHerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?camwithher\.tv/view_video\.php\?.*viewkey=(?P\w+)' + _VALID_URL = r'https?://(?:www\.)?camwithher\.tv/view_video\.php\?.*\bviewkey=(?P\w+)' - _TESTS = [ - { - 'url': 'http://camwithher.tv/view_video.php?viewkey=6e9a24e2c0e842e1f177&page=&viewtype=&category=', - 'info_dict': { - 'id': '5644', - 'ext': 'flv', - 'title': 'Periscope Tease', - }, - 'params': { - 'skip_download': True, - } + _TESTS = [{ + 'url': 'http://camwithher.tv/view_video.php?viewkey=6e9a24e2c0e842e1f177&page=&viewtype=&category=', + 'info_dict': { + 'id': '5644', + 'ext': 'flv', + 'title': 'Periscope Tease', + 'description': 'In the clouds teasing on periscope to my favorite song', + 'duration': 240, + 'view_count': int, + 'comment_count': int, + 'uploader': 'MileenaK', + 'upload_date': '20160322', }, - { - 'url': 'http://camwithher.tv/view_video.php?viewkey=6dfd8b7c97531a459937', - 'only_matching': True, - }, - { - 'url': 'http://camwithher.tv/view_video.php?page=&viewkey=6e9a24e2c0e842e1f177&viewtype=&category=', - 'only_matching': True, - }, - { - 'url': 
'http://camwithher.tv/view_video.php?viewkey=b6c3b5bea9515d1a1fc4&page=&viewtype=&category=mv', - 'only_matching': True, + 'params': { + 'skip_download': True, } - ] + }, { + 'url': 'http://camwithher.tv/view_video.php?viewkey=6dfd8b7c97531a459937', + 'only_matching': True, + }, { + 'url': 'http://camwithher.tv/view_video.php?page=&viewkey=6e9a24e2c0e842e1f177&viewtype=&category=', + 'only_matching': True, + }, { + 'url': 'http://camwithher.tv/view_video.php?viewkey=b6c3b5bea9515d1a1fc4&page=&viewtype=&category=mv', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - flv_id = self._html_search_regex(r'
2010 else flv_id) + # Video URL construction algorithm is reverse-engineered from cwhplayer.swf + rtmp_url = 'rtmp://camwithher.tv/clipshare/%s' % ( + ('mp4:%s.mp4' % flv_id) if int(flv_id) > 2010 else flv_id) + + title = self._html_search_regex( + r']+style="float:left"[^>]*>\s*

(.+?)

', webpage, 'title') + description = self._html_search_regex( + r'>Description:(.+?)
', webpage, 'description', default=None) + + runtime = self._search_regex( + r'Runtime\s*:\s*(.+?) \|', webpage, 'duration', default=None) + if runtime: + runtime = re.sub(r'[\s-]', '', runtime) + duration = parse_duration(runtime) + view_count = int_or_none(self._search_regex( + r'Views\s*:\s*(\d+)', webpage, 'view count', default=None)) + comment_count = int_or_none(self._search_regex( + r'Comments\s*:\s*(\d+)', webpage, 'comment count', default=None)) + + uploader = self._search_regex( + r'Added by\s*:\s*]+>([^<]+)', webpage, 'uploader', default=None) + upload_date = unified_strdate(self._search_regex( + r'Added on\s*:\s*([\d-]+)', webpage, 'upload date', default=None)) - title = self._html_search_regex(r'
\s+

(.+?)

', webpage, 'title') return { 'id': flv_id, 'url': rtmp_url, - 'no_resume': True, 'ext': 'flv', + 'no_resume': True, 'title': title, + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'comment_count': comment_count, + 'uploader': uploader, + 'upload_date': upload_date, } From 329c1eae54bf71ae8602f79f71570eaf90ef7d2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 1 Apr 2016 20:42:19 +0200 Subject: [PATCH 035/116] [aenetworks] Make pep8 happy --- youtube_dl/extractor/aenetworks.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 3fddaba54..1bbfe2641 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -75,8 +75,9 @@ class AENetworksIE(InfoExtractor): info = self._search_json_ld(webpage, video_id, fatal=False) info.update({ '_type': 'url_transparent', - 'url': smuggle_url(update_url_query( - video_url, query), { + 'url': smuggle_url( + update_url_query(video_url, query), + { 'sig': { 'key': 'crazyjava', 'secret': 's3cr3t'}, From 0f28777f58b5c21226d8e02477834dbb08b170ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 1 Apr 2016 20:43:14 +0200 Subject: [PATCH 036/116] [cbsnews] Remove unused import --- youtube_dl/extractor/cbsnews.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index b5e78a65d..79ddc20a0 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -5,7 +5,6 @@ from .common import InfoExtractor from .cbs import CBSBaseIE from ..utils import ( parse_duration, - find_xpath_attr, ) From 6d628fafcadf6b9d2bc16c34c8cda8b53860e406 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 1 Apr 2016 20:45:21 +0200 Subject: [PATCH 037/116] [camwithher] Remove extra 
blank line --- youtube_dl/extractor/camwithher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/camwithher.py b/youtube_dl/extractor/camwithher.py index 9809096ec..afbc5ea26 100644 --- a/youtube_dl/extractor/camwithher.py +++ b/youtube_dl/extractor/camwithher.py @@ -72,7 +72,6 @@ class CamWithHerIE(InfoExtractor): upload_date = unified_strdate(self._search_regex( r'Added on\s*:\s*([\d-]+)', webpage, 'upload date', default=None)) - return { 'id': flv_id, 'url': rtmp_url, From df634be2ed85b33968973a3e85935bb5d578ce42 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 1 Apr 2016 19:39:02 +0100 Subject: [PATCH 038/116] [common] prefer using mime type over ext for smil subtitle extraction the subtitle ext for http://www.cnet.com/videos/download-amazon-prime-movies-and-tv/ is adb_xml while using the mime type it get tt(application/smptett+xml) --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 85ac0400c..94a583891 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1335,7 +1335,7 @@ class InfoExtractor(object): if not src or src in urls: continue urls.append(src) - ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type')) + ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src) lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang subtitles.setdefault(lang, []).append({ 'url': src, From 0750b2491f5f14e51c2bf91584fd490944154393 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 1 Apr 2016 19:47:20 +0100 Subject: [PATCH 039/116] [ffmpeg] try to convert tt subtitles usng dfxp2srt --- youtube_dl/postprocessor/ffmpeg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 
06b8c0548..b64cd396b 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -536,7 +536,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): sub_filenames.append(old_file) new_file = subtitles_filename(filename, lang, new_ext) - if ext == 'dfxp' or ext == 'ttml': + if ext == 'dfxp' or ext == 'ttml' or ext == 'tt': self._downloader.report_warning( 'You have requested to convert dfxp (TTML) subtitles into another format, ' 'which results in style information loss') From 5f705baf5ecda6be678481ff9ab9c27a6cd54dc0 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 1 Apr 2016 20:42:15 +0100 Subject: [PATCH 040/116] [cnet] extract more formats --- youtube_dl/extractor/cnet.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index c154b3e19..58c26f20f 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -17,6 +17,8 @@ class CNETIE(ThePlatformIE): 'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861', 'uploader': 'Sarah Mitroff', 'duration': 70, + 'timestamp': 1396479627, + 'upload_date': '20140402', }, }, { 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', @@ -28,8 +30,11 @@ class CNETIE(ThePlatformIE): 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', 'uploader': 'Ashley Esqueda', 'duration': 1482, + 'timestamp': 1433289889, + 'upload_date': '20150603', }, }] + TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/kYEXFC/%s?mbr=true' def _real_extract(self, url): display_id = self._match_id(url) @@ -51,16 +56,12 @@ class CNETIE(ThePlatformIE): uploader = None uploader_id = None - metadata = self.get_metadata('kYEXFC/%s' % list(vdata['files'].values())[0], video_id) - description = vdata.get('description') or metadata.get('description') - duration = int_or_none(vdata.get('duration')) or metadata.get('duration') - - formats = [] 
- subtitles = {} + media_guid_path = 'media/guid/2288573011/%s' % vdata['mpxRefId'] + formats, subtitles = self._extract_theplatform_smil(self.TP_RELEASE_URL_TEMPLATE % media_guid_path, video_id) for (fkey, vid) in vdata['files'].items(): if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']: continue - release_url = 'http://link.theplatform.com/s/kYEXFC/%s?mbr=true' % vid + release_url = self.TP_RELEASE_URL_TEMPLATE % vid if fkey == 'hds': release_url += '&manifest=f4m' tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % fkey) @@ -68,15 +69,15 @@ class CNETIE(ThePlatformIE): subtitles = self._merge_subtitles(subtitles, tp_subtitles) self._sort_formats(formats) - return { + info = self.get_metadata('kYEXFC/%s' % media_guid_path, video_id) + info.update({ 'id': video_id, 'display_id': display_id, 'title': title, - 'description': description, - 'thumbnail': metadata.get('thumbnail'), - 'duration': duration, + 'duration': int_or_none(vdata.get('duration')), 'uploader': uploader, 'uploader_id': uploader_id, 'subtitles': subtitles, 'formats': formats, - } + }) + return info From fe7ef95e91cec1c1794692029561a68e2aaa7809 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 1 Apr 2016 23:53:32 +0100 Subject: [PATCH 041/116] [cbsinteractive] Add support for ZDNet videos --- youtube_dl/extractor/__init__.py | 2 +- .../extractor/{cnet.py => cbsinteractive.py} | 37 ++++++++++++++++--- 2 files changed, 32 insertions(+), 7 deletions(-) rename youtube_dl/extractor/{cnet.py => cbsinteractive.py} (69%) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 438e1cc63..ebf5ca3e9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -104,6 +104,7 @@ from .cbc import ( CBCPlayerIE, ) from .cbs import CBSIE +from .cbsinteractive import CBSInteractiveIE from .cbsnews import ( CBSNewsIE, CBSNewsLiveVideoIE, @@ -129,7 +130,6 @@ from .clubic import ClubicIE from 
.clyp import ClypIE from .cmt import CMTIE from .cnbc import CNBCIE -from .cnet import CNETIE from .cnn import ( CNNIE, CNNBlogsIE, diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cbsinteractive.py similarity index 69% rename from youtube_dl/extractor/cnet.py rename to youtube_dl/extractor/cbsinteractive.py index 58c26f20f..0011c3029 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cbsinteractive.py @@ -1,12 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .theplatform import ThePlatformIE from ..utils import int_or_none -class CNETIE(ThePlatformIE): - _VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P[^/]+)/' +class CBSInteractiveIE(ThePlatformIE): + _VALID_URL = r'https?://(?:www\.)?(?Pcnet|zdnet)\.com/(?:videos|video/share)/(?P[^/?]+)' _TESTS = [{ 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', 'info_dict': { @@ -33,15 +35,35 @@ class CNETIE(ThePlatformIE): 'timestamp': 1433289889, 'upload_date': '20150603', }, + }, { + 'url': 'http://www.zdnet.com/video/share/video-keeping-android-smartphones-and-tablets-secure/', + 'info_dict': { + 'id': 'bc1af9f0-a2b5-4e54-880d-0d95525781c0', + 'ext': 'mp4', + 'title': 'Video: Keeping Android smartphones and tablets secure', + 'description': 'Here\'s the best way to keep Android devices secure, and what you do when they\'ve come to the end of their lives.', + 'uploader_id': 'f2d97ea2-8175-11e2-9d12-0018fe8a00b0', + 'uploader': 'Adrian Kingsley-Hughes', + 'timestamp': 1448961720, + 'upload_date': '20151201', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } }] TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/kYEXFC/%s?mbr=true' + MPX_ACCOUNTS = { + 'cnet': 2288573011, + 'zdnet': 2387448114, + } def _real_extract(self, url): - display_id = self._match_id(url) + site, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) data_json = 
self._html_search_regex( - r"data-cnet-video(?:-uvp)?-options='([^']+)'", + r"data-(?:cnet|zdnet)-video(?:-uvp)?-options='([^']+)'", webpage, 'data json') data = self._parse_json(data_json, display_id) vdata = data.get('video') or data['videos'][0] @@ -56,8 +78,11 @@ class CNETIE(ThePlatformIE): uploader = None uploader_id = None - media_guid_path = 'media/guid/2288573011/%s' % vdata['mpxRefId'] - formats, subtitles = self._extract_theplatform_smil(self.TP_RELEASE_URL_TEMPLATE % media_guid_path, video_id) + media_guid_path = 'media/guid/%d/%s' % (self.MPX_ACCOUNTS[site], vdata['mpxRefId']) + formats, subtitles = [], {} + if site == 'cnet': + formats, subtitles = self._extract_theplatform_smil( + self.TP_RELEASE_URL_TEMPLATE % media_guid_path, video_id) for (fkey, vid) in vdata['files'].items(): if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']: continue From 08136dc13805abb1832587e03e68066f07bd5776 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 2 Apr 2016 10:57:57 +0100 Subject: [PATCH 042/116] [brightcove] fix format sorting --- youtube_dl/extractor/brightcove.py | 3 ++- youtube_dl/extractor/thestar.py | 4 ++++ youtube_dl/extractor/tv3.py | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 0d162d337..a8919001d 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -396,6 +396,7 @@ class BrightcoveNewIE(InfoExtractor): 'formats': 'mincount:41', }, 'params': { + # m3u8 download 'skip_download': True, } }, { @@ -533,7 +534,7 @@ class BrightcoveNewIE(InfoExtractor): f.update({ 'url': src or streaming_src, 'format_id': build_format_id('http' if src else 'http-streaming'), - 'preference': 2 if src else 1, + 'source_preference': 0 if src else -1, }) else: f.update({ diff --git a/youtube_dl/extractor/thestar.py b/youtube_dl/extractor/thestar.py index b7e9af2af..ba1380abc 100644 --- a/youtube_dl/extractor/thestar.py +++ 
b/youtube_dl/extractor/thestar.py @@ -19,6 +19,10 @@ class TheStarIE(InfoExtractor): 'uploader_id': '794267642001', 'timestamp': 1454353482, 'upload_date': '20160201', + }, + 'params': { + # m3u8 download + 'skip_download': True, } } BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/794267642001/default_default/index.html?videoId=%s' diff --git a/youtube_dl/extractor/tv3.py b/youtube_dl/extractor/tv3.py index d3f690dc7..3867ec90d 100644 --- a/youtube_dl/extractor/tv3.py +++ b/youtube_dl/extractor/tv3.py @@ -21,6 +21,7 @@ class TV3IE(InfoExtractor): 'Failed to download MPD manifest' ], 'params': { + # m3u8 download 'skip_download': True, }, } From db8ee7ec0598f8893e4259ac9373c44726e4f84f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 2 Apr 2016 18:48:05 +0600 Subject: [PATCH 043/116] [extractor/common] Fix numeric identifiers conversion in DASH URL templates --- youtube_dl/extractor/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 94a583891..011edcc0a 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1515,7 +1515,8 @@ class InfoExtractor(object): representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) media_template = representation_ms_info['media_template'] media_template = media_template.replace('$RepresentationID$', representation_id) - media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template) + media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template) + media_template = re.sub(r'\$(Number|Bandwidth)%(\d+)\$', r'%(\1)\2d', media_template) media_template.replace('$$', '$') representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], 
representation_ms_info['total_number'] + representation_ms_info['start_number'])] if 'segment_urls' in representation_ms_info: From b507cc925b8dbb37b0abce748ff73a7ad102494a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 2 Apr 2016 18:49:58 +0600 Subject: [PATCH 044/116] [extractor/common] Carry long line --- youtube_dl/extractor/common.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 011edcc0a..ec6625eea 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1518,7 +1518,13 @@ class InfoExtractor(object): media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template) media_template = re.sub(r'\$(Number|Bandwidth)%(\d+)\$', r'%(\1)\2d', media_template) media_template.replace('$$', '$') - representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])] + representation_ms_info['segment_urls'] = [ + media_template % { + 'Number': segment_number, + 'Bandwidth': representation_attrib.get('bandwidth')} + for segment_number in range( + representation_ms_info['start_number'], + representation_ms_info['total_number'] + representation_ms_info['start_number'])] if 'segment_urls' in representation_ms_info: f.update({ 'segment_urls': representation_ms_info['segment_urls'], From bbc26c8a012d215e98a98a671471cd75e7765286 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 2 Apr 2016 19:00:38 +0600 Subject: [PATCH 045/116] [bbc] Set vcodec to none for audio formats --- youtube_dl/extractor/bbc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index dedf721bd..425f08f2b 100644 --- a/youtube_dl/extractor/bbc.py +++ 
b/youtube_dl/extractor/bbc.py @@ -328,6 +328,7 @@ class BBCCoUkIE(InfoExtractor): 'format_id': '%s_%s' % (service, format['format_id']), 'abr': abr, 'acodec': acodec, + 'vcodec': 'none', }) formats.extend(conn_formats) return formats From 2defa7d75aa424c16ca76a25a05297daed0bb5bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 2 Apr 2016 18:01:58 +0200 Subject: [PATCH 046/116] [instagram:user] Fix extraction (fixes #9059) The URL for the next page was incorrect and we always got the same page, therefore it got trapped in an infinite loop. --- youtube_dl/extractor/instagram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 4e62098b0..11bb58d8a 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -152,7 +152,7 @@ class InstagramUserIE(InfoExtractor): if not page['items']: break - max_id = page['items'][-1]['id'] + max_id = page['items'][-1]['id'].split('_')[0] media_url = ( 'http://instagram.com/%s/media?max_id=%s' % ( uploader_id, max_id)) From 41f5492fbcddfcbae133dc27e8d94ece3755df2e Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 2 Apr 2016 18:54:40 +0100 Subject: [PATCH 047/116] [brightcove:legacy] improve format extraction and extract uploader_id, duration and timestamp --- youtube_dl/extractor/brightcove.py | 48 +++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index a8919001d..a5091238b 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -46,6 +46,9 @@ class BrightcoveLegacyIE(InfoExtractor): 'title': 'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', 'uploader': '8TV', 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', + 'timestamp': 1368213670, + 'upload_date': '20130510', + 
'uploader_id': 1589608506001, } }, { @@ -57,6 +60,9 @@ class BrightcoveLegacyIE(InfoExtractor): 'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges', 'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.', 'uploader': 'Oracle', + 'timestamp': 1344975024, + 'upload_date': '20120814', + 'uploader_id': 1460825906, }, }, { @@ -68,6 +74,9 @@ class BrightcoveLegacyIE(InfoExtractor): 'title': 'This Bracelet Acts as a Personal Thermostat', 'description': 'md5:547b78c64f4112766ccf4e151c20b6a0', 'uploader': 'Mashable', + 'timestamp': 1382041798, + 'upload_date': '20131017', + 'uploader_id': 1130468786001, }, }, { @@ -85,14 +94,17 @@ class BrightcoveLegacyIE(InfoExtractor): { # test flv videos served by akamaihd.net # From http://www.redbull.com/en/bike/stories/1331655643987/replay-uci-dh-world-cup-2014-from-fort-william - 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3ABC2996102916001&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D', + 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3Aevent-stream-356&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D', # The md5 checksum changes on each download 'info_dict': { - 'id': '2996102916001', + 'id': '3750436379001', 'ext': 'flv', 'title': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', 
- 'uploader': 'Red Bull TV', + 'uploader': 'RBTV Old (do not use)', 'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', + 'timestamp': 1409122195, + 'upload_date': '20140827', + 'uploader_id': 710858724001, }, }, { @@ -106,6 +118,12 @@ class BrightcoveLegacyIE(InfoExtractor): 'playlist_mincount': 7, }, ] + FLV_VCODECS = { + 1: 'SORENSON', + 2: 'ON2', + 3: 'H264', + 4: 'VP8', + } @classmethod def _build_brighcove_url(cls, object_str): @@ -295,6 +313,9 @@ class BrightcoveLegacyIE(InfoExtractor): 'description': video_info.get('shortDescription'), 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), 'uploader': video_info.get('publisherName'), + 'uploader_id': video_info.get('publisherId'), + 'duration': float_or_none(video_info.get('length'), 1000), + 'timestamp': int_or_none(video_info.get('creationDate'), 1000), } renditions = video_info.get('renditions', []) + video_info.get('IOSRenditions', []) @@ -318,19 +339,30 @@ class BrightcoveLegacyIE(InfoExtractor): ext = 'flv' if ext is None: ext = determine_ext(url) - size = rend.get('size') + tbr = int_or_none(rend.get('encodingRate'), 1000), a_format = { + 'format_id': 'http%s' % ('-%s' % tbr if tbr else ''), 'url': url, 'ext': ext, - 'height': rend.get('frameHeight'), - 'width': rend.get('frameWidth'), - 'filesize': size if size != 0 else None, + 'filesize': int_or_none(rend.get('size')) or None, + 'tbr': tbr, } + if rend.get('audioOnly'): + a_format.update({ + 'vcodec': 'none', + }) + else: + a_format.update({ + 'height': int_or_none(rend.get('frameHeight')), + 'width': int_or_none(rend.get('frameWidth')), + 'vcodec': rend.get('videoCodec'), + }) # m3u8 manifests with remote == false are media playlists # Not calling _extract_m3u8_formats here to save network traffic if ext == 'm3u8': a_format.update({ + 'format_id': 'hls%s' % ('-%s' % tbr if tbr else ''), 'ext': 'mp4', 'protocol': 'm3u8', }) @@ -341,6 +373,8 @@ class BrightcoveLegacyIE(InfoExtractor): elif 
video_info.get('FLVFullLengthURL') is not None: info.update({ 'url': video_info['FLVFullLengthURL'], + 'vcodec': self.FLV_VCODECS.get(video_info.get('FLVFullCodec')), + 'filesize': int_or_none(video_info.get('FLVFullSize')), }) if self._downloader.params.get('include_ads', False): From e47d19e991456fe4afdab1a76a653f7821e99c3f Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 2 Apr 2016 18:56:01 +0100 Subject: [PATCH 048/116] [brightcove:new] extract subtitles and strip video title --- youtube_dl/extractor/brightcove.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index a5091238b..6128b6762 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -515,7 +515,7 @@ class BrightcoveNewIE(InfoExtractor): raise ExtractorError(json_data[0]['message'], expected=True) raise - title = json_data['name'] + title = json_data['name'].strip() formats = [] for source in json_data.get('sources', []): @@ -579,20 +579,22 @@ class BrightcoveNewIE(InfoExtractor): formats.append(f) self._sort_formats(formats) - description = json_data.get('description') - thumbnail = json_data.get('thumbnail') - timestamp = parse_iso8601(json_data.get('published_at')) - duration = float_or_none(json_data.get('duration'), 1000) - tags = json_data.get('tags', []) + subtitles = {} + for text_track in json_data.get('text_tracks', []): + if text_track.get('src'): + subtitles.setdefault(text_track.get('srclang'), []).append({ + 'url': text_track['src'], + }) return { 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, + 'description': json_data.get('description'), + 'thumbnail': json_data.get('thumbnail') or json_data.get('poster'), + 'duration': float_or_none(json_data.get('duration'), 1000), + 'timestamp': parse_iso8601(json_data.get('published_at')), 'uploader_id': 
account_id, 'formats': formats, - 'tags': tags, + 'subtitles': subtitles, + 'tags': json_data.get('tags', []), } From 3aac9b2fb1a103f1d350ba10060e59bb04a6a2e8 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 2 Apr 2016 18:56:31 +0100 Subject: [PATCH 049/116] [nowness] update tests --- youtube_dl/extractor/nowness.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py index 446f5901c..74860eb20 100644 --- a/youtube_dl/extractor/nowness.py +++ b/youtube_dl/extractor/nowness.py @@ -63,8 +63,11 @@ class NownessIE(NownessBaseIE): 'title': 'Candor: The Art of Gesticulation', 'description': 'Candor: The Art of Gesticulation', 'thumbnail': 're:^https?://.*\.jpg', - 'uploader': 'Nowness', + 'timestamp': 1446745676, + 'upload_date': '20151105', + 'uploader_id': '2385340575001', }, + 'add_ie': ['BrightcoveNew'], }, { 'url': 'https://cn.nowness.com/story/kasper-bjorke-ft-jaakko-eino-kalevi-tnr', 'md5': 'e79cf125e387216f86b2e0a5b5c63aa3', @@ -74,8 +77,11 @@ class NownessIE(NownessBaseIE): 'title': 'Kasper Bjørke ft. Jaakko Eino Kalevi: TNR', 'description': 'Kasper Bjørke ft. 
Jaakko Eino Kalevi: TNR', 'thumbnail': 're:^https?://.*\.jpg', - 'uploader': 'Nowness', + 'timestamp': 1407315371, + 'upload_date': '20140806', + 'uploader_id': '2385340575001', }, + 'add_ie': ['BrightcoveNew'], }, { # vimeo 'url': 'https://www.nowness.com/series/nowness-picks/jean-luc-godard-supercut', @@ -90,6 +96,7 @@ class NownessIE(NownessBaseIE): 'uploader': 'Cinema Sem Lei', 'uploader_id': 'cinemasemlei', }, + 'add_ie': ['Vimeo'], }] def _real_extract(self, url): From 4d4cd35f485c652a39a631fbf3d40c9f4353e807 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 2 Apr 2016 20:55:44 +0100 Subject: [PATCH 050/116] [brightcove:legacy] extract uploader_id as a string --- youtube_dl/extractor/brightcove.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 6128b6762..f9056f514 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -48,7 +48,7 @@ class BrightcoveLegacyIE(InfoExtractor): 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', 'timestamp': 1368213670, 'upload_date': '20130510', - 'uploader_id': 1589608506001, + 'uploader_id': '1589608506001', } }, { @@ -62,7 +62,7 @@ class BrightcoveLegacyIE(InfoExtractor): 'uploader': 'Oracle', 'timestamp': 1344975024, 'upload_date': '20120814', - 'uploader_id': 1460825906, + 'uploader_id': '1460825906', }, }, { @@ -76,7 +76,7 @@ class BrightcoveLegacyIE(InfoExtractor): 'uploader': 'Mashable', 'timestamp': 1382041798, 'upload_date': '20131017', - 'uploader_id': 1130468786001, + 'uploader_id': '1130468786001', }, }, { @@ -104,7 +104,7 @@ class BrightcoveLegacyIE(InfoExtractor): 'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', 'timestamp': 1409122195, 'upload_date': '20140827', - 'uploader_id': 710858724001, + 'uploader_id': '710858724001', }, }, { @@ -313,7 +313,7 @@ class BrightcoveLegacyIE(InfoExtractor): 'description': 
video_info.get('shortDescription'), 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), 'uploader': video_info.get('publisherName'), - 'uploader_id': video_info.get('publisherId'), + 'uploader_id': compat_str(video_info.get('publisherId')), 'duration': float_or_none(video_info.get('length'), 1000), 'timestamp': int_or_none(video_info.get('creationDate'), 1000), } From 23576edbfcaa3d7f0283631516c82715a85c6856 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 2 Apr 2016 21:31:21 +0100 Subject: [PATCH 051/116] [brightcove:legacy] skip None value for uploader_id --- youtube_dl/extractor/brightcove.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index f9056f514..c718cf385 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -307,13 +307,14 @@ class BrightcoveLegacyIE(InfoExtractor): playlist_title=playlist_info['mediaCollectionDTO']['displayName']) def _extract_video_info(self, video_info): + publisher_id = video_info.get('publisherId') info = { 'id': compat_str(video_info['id']), 'title': video_info['displayName'].strip(), 'description': video_info.get('shortDescription'), 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), 'uploader': video_info.get('publisherName'), - 'uploader_id': compat_str(video_info.get('publisherId')), + 'uploader_id': compat_str(publisher_id) if publisher_id else None, 'duration': float_or_none(video_info.get('length'), 1000), 'timestamp': int_or_none(video_info.get('creationDate'), 1000), } From 6d4fc66bfc9bb3ed2a4f68366f372a9bedf6e708 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 4 Apr 2016 02:26:20 +0600 Subject: [PATCH 052/116] [youtube] Add support for zwearz (Closes #9062) --- youtube_dl/extractor/youtube.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py 
b/youtube_dl/extractor/youtube.py index 28355bf46..188066561 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -270,7 +270,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): )) |(?: youtu\.be| # just youtu.be/xxxx - vid\.plus # or vid.plus/xxxx + vid\.plus| # or vid.plus/xxxx + zwearz\.com/watch| # or zwearz.com/watch/xxxx )/ |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= ) @@ -758,6 +759,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'http://vid.plus/FlRa-iH7PGw', 'only_matching': True, }, + { + 'url': 'http://zwearz.com/watch/9lWxNJF-ufM/electra-woman-dyna-girl-official-trailer-grace-helbig.html', + 'only_matching': True, + }, { # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468) # Also tests cut-off URL expansion in video description (see From 8c7d6e8e2279beccf638cd0fae9d91876e0486b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 4 Apr 2016 20:44:06 +0200 Subject: [PATCH 053/116] [zdf] Extract subtitles (closes #9081) --- youtube_dl/extractor/zdf.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 81c22a627..2ef177275 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -85,6 +85,13 @@ class ZDFIE(InfoExtractor): uploader = xpath_text(doc, './/details/originChannelTitle', 'uploader') uploader_id = xpath_text(doc, './/details/originChannelId', 'uploader id') upload_date = unified_strdate(xpath_text(doc, './/details/airtime', 'upload date')) + subtitles = {} + captions_url = doc.find('.//caption/url') + if captions_url is not None: + subtitles['de'] = [{ + 'url': captions_url.text, + 'ext': 'ttml', + }] def xml_to_thumbnails(fnode): thumbnails = [] @@ -190,6 +197,7 @@ class ZDFIE(InfoExtractor): 'uploader_id': uploader_id, 'upload_date': upload_date, 'formats': formats, + 'subtitles': subtitles, } def _real_extract(self, url): 
From 5bf28d7864d83be98233b6d1e478d7911f99e2cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 4 Apr 2016 20:46:35 +0200 Subject: [PATCH 054/116] [utils] dfxp2srt: add additional namespace Used by the ZDF subtitles (#9081). --- youtube_dl/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 5c4ab2748..8e53962c9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2131,6 +2131,7 @@ def dfxp2srt(dfxp_data): _x = functools.partial(xpath_with_ns, ns_map={ 'ttml': 'http://www.w3.org/ns/ttml', 'ttaf1': 'http://www.w3.org/2006/10/ttaf1', + 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1', }) class TTMLPElementParser(object): @@ -2157,7 +2158,7 @@ def dfxp2srt(dfxp_data): dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8')) out = [] - paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p') + paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p') if not paras: raise ValueError('Invalid dfxp/TTML subtitle') From 995cf05c96e888f7f1a818f9886345ea25671aa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 5 Apr 2016 21:40:43 +0600 Subject: [PATCH 055/116] [novamov] Make title fatal --- youtube_dl/extractor/novamov.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/novamov.py b/youtube_dl/extractor/novamov.py index a131f7dbd..63a77b9bf 100644 --- a/youtube_dl/extractor/novamov.py +++ b/youtube_dl/extractor/novamov.py @@ -81,7 +81,7 @@ class NovaMovIE(InfoExtractor): filekey = extract_filekey() - title = self._html_search_regex(self._TITLE_REGEX, webpage, 'title', fatal=False) + title = self._html_search_regex(self._TITLE_REGEX, webpage, 'title') description = self._html_search_regex(self._DESCRIPTION_REGEX, webpage, 'description', default='', fatal=False) api_response = 
self._download_webpage( From e7d77efb9dddc145b3d431047f2f98e19df4114e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 5 Apr 2016 21:52:07 +0600 Subject: [PATCH 056/116] [auroravid] Add extractor (Closes #9070) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/novamov.py | 35 ++++++++++++++++++++++---------- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index ebf5ca3e9..69ea21a23 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -501,6 +501,7 @@ from .novamov import ( NowVideoIE, VideoWeedIE, CloudTimeIE, + AuroraVidIE, ) from .nowness import ( NownessIE, diff --git a/youtube_dl/extractor/novamov.py b/youtube_dl/extractor/novamov.py index 63a77b9bf..5771a675d 100644 --- a/youtube_dl/extractor/novamov.py +++ b/youtube_dl/extractor/novamov.py @@ -27,17 +27,7 @@ class NovaMovIE(InfoExtractor): _DESCRIPTION_REGEX = r'(?s)
\s*

[^<]+

([^<]+)

' _URL_TEMPLATE = 'http://%s/video/%s' - _TEST = { - 'url': 'http://www.novamov.com/video/4rurhn9x446jj', - 'md5': '7205f346a52bbeba427603ba10d4b935', - 'info_dict': { - 'id': '4rurhn9x446jj', - 'ext': 'flv', - 'title': 'search engine optimization', - 'description': 'search engine optimization is used to rank the web page in the google search engine' - }, - 'skip': '"Invalid token" errors abound (in web interface as well as youtube-dl, there is nothing we can do about it.)' - } + _TEST = None def _check_existence(self, webpage, video_id): if re.search(self._FILE_DELETED_REGEX, webpage) is not None: @@ -187,3 +177,26 @@ class CloudTimeIE(NovaMovIE): _TITLE_REGEX = r']+class=["\']video_det["\'][^>]*>\s*([^<]+)' _TEST = None + + +class AuroraVidIE(NovaMovIE): + IE_NAME = 'auroravid' + IE_DESC = 'AuroraVid' + + _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'auroravid\.to'} + + _HOST = 'www.auroravid.to' + + _FILE_DELETED_REGEX = r'This file no longer exists on our servers!<' + + _TEST = { + 'url': 'http://www.auroravid.to/video/4rurhn9x446jj', + 'md5': '7205f346a52bbeba427603ba10d4b935', + 'info_dict': { + 'id': '4rurhn9x446jj', + 'ext': 'flv', + 'title': 'search engine optimization', + 'description': 'search engine optimization is used to rank the web page in the google search engine' + }, + 'skip': '"Invalid token" errors abound (in web interface as well as youtube-dl, there is nothing we can do about it.)' + } From 40a056d85d7711e2f93bd5f7cc057672650386b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 5 Apr 2016 21:54:09 +0600 Subject: [PATCH 057/116] [extractor/__init__] Remove novamov extractor and sort novamov based extractors alphabetically --- youtube_dl/extractor/__init__.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 69ea21a23..c3121d83c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ 
-496,12 +496,11 @@ from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE from .nova import NovaIE from .novamov import ( - NovaMovIE, - WholeCloudIE, + AuroraVidIE, + CloudTimeIE, NowVideoIE, VideoWeedIE, - CloudTimeIE, - AuroraVidIE, + WholeCloudIE, ) from .nowness import ( NownessIE, From e42f413716de822918356d15b0cb3c66608681b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 5 Apr 2016 22:23:20 +0600 Subject: [PATCH 058/116] [rte] Improve thumbnail extraction (Closes #9085) --- youtube_dl/extractor/rte.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/rte.py b/youtube_dl/extractor/rte.py index 9c89974e7..ebe563ebb 100644 --- a/youtube_dl/extractor/rte.py +++ b/youtube_dl/extractor/rte.py @@ -39,9 +39,14 @@ class RteIE(InfoExtractor): duration = float_or_none(self._html_search_meta( 'duration', webpage, 'duration', fatal=False), 1000) - thumbnail_id = self._search_regex( - r'', webpage, 'thumbnail') - thumbnail = 'http://img.rasset.ie/' + thumbnail_id + '.jpg' + thumbnail = None + thumbnail_meta = self._html_search_meta('thumbnail', webpage) + if thumbnail_meta: + thumbnail_id = self._search_regex( + r'uri:irus:(.+)', thumbnail_meta, + 'thumbnail id', fatal=False) + if thumbnail_id: + thumbnail = 'http://img.rasset.ie/%s.jpg' % thumbnail_id feeds_url = self._html_search_meta('feeds-prefix', webpage, 'feeds url') + video_id json_string = self._download_json(feeds_url, video_id) From 65150b41bb3055820938c3c572ccb2ffd9f312aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 5 Apr 2016 22:27:33 +0600 Subject: [PATCH 059/116] [deezer] Fix extraction (Closes #9086) --- youtube_dl/extractor/deezer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/deezer.py b/youtube_dl/extractor/deezer.py index c3205ff5f..7a07f3267 100644 --- a/youtube_dl/extractor/deezer.py +++ b/youtube_dl/extractor/deezer.py @@ -41,7 +41,9 
@@ class DeezerPlaylistIE(InfoExtractor): 'Deezer said: %s' % geoblocking_msg, expected=True) data_json = self._search_regex( - r'naboo\.display\(\'[^\']+\',\s*(.*?)\);\n', webpage, 'data JSON') + (r'__DZR_APP_STATE__\s*=\s*({.+?})\s*', + r'naboo\.display\(\'[^\']+\',\s*(.*?)\);\n'), + webpage, 'data JSON') data = json.loads(data_json) playlist_title = data.get('DATA', {}).get('TITLE') From 416930d45007cb1e24e4cd8638d9867dd84f3961 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 5 Apr 2016 18:36:24 +0200 Subject: [PATCH 060/116] release 2016.04.05 --- .github/ISSUE_TEMPLATE.md | 6 +++--- docs/supportedsites.md | 5 +++-- youtube_dl/version.py | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 128ba2fc0..e0a7d8282 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.04.01*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.04.01** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.04.05*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.04.05** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.04.01 +[debug] youtube-dl version 2016.04.05 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/docs/supportedsites.md b/docs/supportedsites.md index b9bcf450c..d6ee8476b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -57,6 +57,7 @@ - **AudioBoom** - **audiomack** - **audiomack:album** + - **auroravid**: AuroraVid - **Azubu** - **AzubuLive** - **BaiduVideo**: 百度视频 @@ -92,12 +93,14 @@ - **BYUtv** - **Camdemy** - **CamdemyFolder** + - **CamWithHer** - **canalc2.tv** - **Canalplus**: canalplus.fr, piwiplus.fr and d8.tv - **Canvas** - **CBC** - **CBCPlayer** - **CBS** + - **CBSInteractive** - **CBSNews**: CBS News - **CBSNewsLiveVideo**: CBS News Live Videos - **CBSSports** @@ -119,7 +122,6 @@ - **Clyp** - **cmt.com** - **CNBC** - - **CNET** - **CNN** - **CNNArticle** - **CNNBlogs** @@ -419,7 +421,6 @@ - **Normalboots** - **NosVideo** - **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz - - **novamov**: NovaMov - **nowness** - **nowness:playlist** - **nowness:series** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d9e1cb2a8..45d152902 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.04.01' 
+__version__ = '2016.04.05' From 8790249c6860374b4d7eeeffae9e7b30d346eaf7 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 6 Apr 2016 16:12:16 +0800 Subject: [PATCH 061/116] [iqiyi] Improve error detection for VIP-only videos Closes #9071 --- youtube_dl/extractor/iqiyi.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 9e8c9432a..88570f261 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -368,7 +368,10 @@ class IqiyiIE(InfoExtractor): auth_req, video_id, note='Downloading video authentication JSON', errnote='Unable to download video authentication JSON') - if auth_result['code'] == 'Q00506': # requires a VIP account + + if auth_result['code'] == 'Q00505': # No preview available (不允许试看鉴权失败) + raise ExtractorError('This video requires a VIP account', expected=True) + if auth_result['code'] == 'Q00506': # End of preview time (试看结束鉴权失败) if do_report_warning: self.report_warning('Needs a VIP account for full video') return False From 92d5477d84c6663e71d6d22e261753a16b78000f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 6 Apr 2016 18:29:54 +0800 Subject: [PATCH 062/116] [compat] Handle tuples properly in urlencode() Fixes #9055 --- test/test_compat.py | 4 ++++ youtube_dl/compat.py | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/test/test_compat.py b/test/test_compat.py index cc105807a..618668210 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -76,6 +76,10 @@ class TestCompat(unittest.TestCase): self.assertEqual(compat_urllib_parse_urlencode({'abc': b'def'}), 'abc=def') self.assertEqual(compat_urllib_parse_urlencode({b'abc': 'def'}), 'abc=def') self.assertEqual(compat_urllib_parse_urlencode({b'abc': b'def'}), 'abc=def') + self.assertEqual(compat_urllib_parse_urlencode([('abc', 'def')]), 'abc=def') + self.assertEqual(compat_urllib_parse_urlencode([('abc', b'def')]), 'abc=def') + 
self.assertEqual(compat_urllib_parse_urlencode([(b'abc', 'def')]), 'abc=def') + self.assertEqual(compat_urllib_parse_urlencode([(b'abc', b'def')]), 'abc=def') def test_compat_shlex_split(self): self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two']) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 76b6b0e38..0b6c5ca7a 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -181,7 +181,8 @@ except ImportError: # Python 2 if isinstance(e, dict): e = encode_dict(e) elif isinstance(e, (list, tuple,)): - e = encode_list(e) + list_e = encode_list(e) + e = tuple(list_e) if isinstance(e, tuple) else list_e elif isinstance(e, compat_str): e = e.encode(encoding) return e From be0c7009fb21ebbbe530ad594a9465dff9d72467 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 6 Apr 2016 14:09:31 +0200 Subject: [PATCH 063/116] Makefile: use full path for the ISSUE_TEMPLATE.md file --- Makefile | 4 ++-- devscripts/release.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 3a6c37944..1b08a4783 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -all: youtube-dl README.md CONTRIBUTING.md ISSUE_TEMPLATE.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites +all: youtube-dl README.md CONTRIBUTING.md .github/ISSUE_TEMPLATE.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites clean: rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe @@ -59,7 +59,7 @@ README.md: youtube_dl/*.py youtube_dl/*/*.py CONTRIBUTING.md: README.md $(PYTHON) devscripts/make_contributing.py README.md CONTRIBUTING.md -ISSUE_TEMPLATE.md: 
+.github/ISSUE_TEMPLATE.md: devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl.md youtube_dl/version.py $(PYTHON) devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl.md .github/ISSUE_TEMPLATE.md supportedsites: diff --git a/devscripts/release.sh b/devscripts/release.sh index 6718ce39b..8dea55dbb 100755 --- a/devscripts/release.sh +++ b/devscripts/release.sh @@ -46,7 +46,7 @@ fi sed -i "s/__version__ = '.*'/__version__ = '$version'/" youtube_dl/version.py /bin/echo -e "\n### Committing documentation, templates and youtube_dl/version.py..." -make README.md CONTRIBUTING.md ISSUE_TEMPLATE.md supportedsites +make README.md CONTRIBUTING.md .github/ISSUE_TEMPLATE.md supportedsites git add README.md CONTRIBUTING.md .github/ISSUE_TEMPLATE.md docs/supportedsites.md youtube_dl/version.py git commit -m "release $version" From ec4a4c6fccebc1c8ae7a35129171b1181d8badb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 6 Apr 2016 14:16:05 +0200 Subject: [PATCH 064/116] Makefile: remove ISSUE_TEMPLATE.md from the 'all' target (fixes #9088) It isn't included in the tar file, causing build failures. Since it's only used for GitHub, I think we don't need to store it in the tar file. 
--- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 1b08a4783..ba7f7ed36 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -all: youtube-dl README.md CONTRIBUTING.md .github/ISSUE_TEMPLATE.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites +all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites clean: rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe From c41cf65d4ab41f08f98905082b7a137ac57927ca Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 6 Apr 2016 15:13:08 +0200 Subject: [PATCH 065/116] release 2016.04.06 --- .github/ISSUE_TEMPLATE.md | 6 +++--- youtube_dl/version.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index e0a7d8282..bf9494646 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.04.05*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.04.05** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.04.06*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.04.06** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.04.05 +[debug] youtube-dl version 2016.04.06 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 45d152902..167b16e24 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.04.05' +__version__ = '2016.04.06' From 7a93ab5f3f1535efc948376869f61716ed2af1f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 7 Apr 2016 02:53:53 +0600 Subject: [PATCH 066/116] [extractor/common] Introduce music album metafields --- youtube_dl/extractor/common.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index ec6625eea..2b40f3b7c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -232,6 +232,24 @@ class InfoExtractor(object): episode_number: Number of the video episode within a season, as an integer. episode_id: Id of the video episode, as a unicode string. + The following fields should only be used when the media is a track or a part of + a music album: + + track: Title of the track. + track_number: Number of the track within an album or a disc, as an integer. 
+ track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii), + as a unicode string. + artist: Artist(s) of the track. + genre: Genre(s) of the track. + album: Title of the album the track belongs to. + album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc). + album_artist: List of all artists appeared on the album (e.g. + "Ash Borer / Fell Voices" or "Various Artists", useful for splits + and compilations). + disc_number: Number of the disc or other physical medium the track belongs to, + as an integer. + release_year: Year (YYYY) when the album was released. + Unless mentioned otherwise, the fields should be Unicode strings. Unless mentioned otherwise, None is equivalent to absence of information. From e90d175436e61e207e0b0cae7f699494dcf15922 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 7 Apr 2016 02:56:13 +0600 Subject: [PATCH 067/116] [yandexmusic] Extract music album metafields (Closes #7354) --- youtube_dl/extractor/yandexmusic.py | 40 ++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 025716958..7a90cc60c 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -39,9 +39,14 @@ class YandexMusicTrackIE(YandexMusicBaseIE): 'info_dict': { 'id': '4878838', 'ext': 'mp3', - 'title': 'Carlo Ambrosio - Gypsy Eyes 1', + 'title': 'Carlo Ambrosio & Fabio Di Bari, Carlo Ambrosio - Gypsy Eyes 1', 'filesize': 4628061, 'duration': 193.04, + 'track': 'Gypsy Eyes 1', + 'album': 'Gypsy Soul', + 'album_artist': 'Carlo Ambrosio', + 'artist': 'Carlo Ambrosio & Fabio Di Bari, Carlo Ambrosio', + 'release_year': '2009', } } @@ -64,16 +69,45 @@ class YandexMusicTrackIE(YandexMusicBaseIE): thumbnail = cover_uri.replace('%%', 'orig') if not thumbnail.startswith('http'): thumbnail = 'http://' + thumbnail - return { + + track_title = track['title'] + 
track_info = { 'id': track['id'], 'ext': 'mp3', 'url': self._get_track_url(track['storageDir'], track['id']), - 'title': '%s - %s' % (track['artists'][0]['name'], track['title']), 'filesize': int_or_none(track.get('fileSize')), 'duration': float_or_none(track.get('durationMs'), 1000), 'thumbnail': thumbnail, + 'track': track_title, } + def extract_artist(artist_list): + if artist_list and isinstance(artist_list, list): + artists_names = [a['name'] for a in artist_list if a.get('name')] + if artists_names: + return ', '.join(artists_names) + + albums = track.get('albums') + if albums and isinstance(albums, list): + album = albums[0] + if isinstance(album, dict): + year = album.get('year') + track_info.update({ + 'album': album.get('title'), + 'album_artist': extract_artist(album.get('artists')), + 'release_year': compat_str(year) if year else None, + }) + + track_artist = extract_artist(track.get('artists')) + if track_artist: + track_info.update({ + 'artist': track_artist, + 'title': '%s - %s' % (track_artist, track_title), + }) + else: + track_info['title'] = track_title + return track_info + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) album_id, track_id = mobj.group('album_id'), mobj.group('id') From 3afef2e3fc60a7baa2d923e9cfbaf521c7f5ca0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 7 Apr 2016 22:40:35 +0600 Subject: [PATCH 068/116] [beeg] Improve extraction --- youtube_dl/extractor/beeg.py | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index 9072949dd..956c7680e 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -33,8 +33,33 @@ class BeegIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + cpl_url = self._search_regex( + r']+src=(["\'])(?P(?:https?:)?//static\.beeg\.com/cpl/\d+\.js.*?)\1', 
+ webpage, 'cpl', default=None, group='url') + + beeg_version, beeg_salt = [None] * 2 + + if cpl_url: + cpl = self._download_webpage( + self._proto_relative_url(cpl_url), video_id, + 'Downloading cpl JS', fatal=False) + if cpl: + beeg_version = self._search_regex( + r'beeg_version\s*=\s*(\d+)', cpl, + 'beeg version', default=None) or self._search_regex( + r'/(\d+)\.js', cpl_url, 'beeg version', default=None) + beeg_salt = self._search_regex( + r'beeg_salt\s*=\s*(["\'])(?P.+?)\1', cpl, 'beeg beeg_salt', + default=None, group='beeg_salt') + + beeg_version = beeg_version or '1750' + beeg_salt = beeg_salt or 'MIDtGaw96f0N1kMMAM1DE46EC9pmFr' + video = self._download_json( - 'https://api.beeg.com/api/v6/1738/video/%s' % video_id, video_id) + 'http://api.beeg.com/api/v6/%s/video/%s' % (beeg_version, video_id), + video_id) def split(o, e): def cut(s, x): @@ -51,7 +76,7 @@ class BeegIE(InfoExtractor): def decrypt_key(key): # Reverse engineered from http://static.beeg.com/cpl/1738.js - a = 'GUuyodcfS8FW8gQp4OKLMsZBcX0T7B' + a = beeg_salt e = compat_urllib_parse_unquote(key) o = ''.join([ compat_chr(compat_ord(e[n]) - compat_ord(a[n % len(a)]) % 21) @@ -101,5 +126,5 @@ class BeegIE(InfoExtractor): 'duration': duration, 'tags': tags, 'formats': formats, - 'age_limit': 18, + 'age_limit': self._rta_search(webpage), } From ed6fb8b804448724fcd1ba4abc3fa028b817efe2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 7 Apr 2016 23:22:43 +0600 Subject: [PATCH 069/116] [vrt] Add support for direct hls playlists and YouTube (Closes #9108) --- youtube_dl/extractor/vrt.py | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py index 2b6bae89b..8e35f24e8 100644 --- a/youtube_dl/extractor/vrt.py +++ b/youtube_dl/extractor/vrt.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import float_or_none 
+from ..utils import ( + determine_ext, + float_or_none, +) class VRTIE(InfoExtractor): @@ -52,6 +55,11 @@ class VRTIE(InfoExtractor): 'duration': 661, } }, + { + # YouTube video + 'url': 'http://deredactie.be/cm/vrtnieuws/videozone/nieuws/cultuurenmedia/1.2622957', + 'only_matching': True, + }, { 'url': 'http://cobra.canvas.be/cm/cobra/videozone/rubriek/film-videozone/1.2377055', 'only_matching': True, @@ -66,7 +74,17 @@ class VRTIE(InfoExtractor): video_id = self._search_regex( r'data-video-id="([^"]+)_[^"]+"', webpage, 'video id', fatal=False) + src = self._search_regex( + r'data-video-src="([^"]+)"', webpage, 'video src', default=None) + + video_type = self._search_regex( + r'data-video-type="([^"]+)"', webpage, 'video type', default=None) + + if video_type == 'YouTubeVideo': + return self.url_result(src, 'Youtube') + formats = [] + mobj = re.search( r'data-video-iphone-server="(?P[^"]+)"\s+data-video-iphone-path="(?P[^"]+)"', webpage) @@ -74,11 +92,15 @@ class VRTIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( '%s/%s' % (mobj.group('server'), mobj.group('path')), video_id, 'mp4', m3u8_id='hls', fatal=False)) - mobj = re.search(r'data-video-src="(?P[^"]+)"', webpage) - if mobj: - formats.extend(self._extract_f4m_formats( - '%s/manifest.f4m' % mobj.group('src'), - video_id, f4m_id='hds', fatal=False)) + + if src: + if determine_ext(src) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.extend(self._extract_f4m_formats( + '%s/manifest.f4m' % src, video_id, f4m_id='hds', fatal=False)) if not formats and 'data-video-geoblocking="true"' in webpage: self.raise_geo_restricted('This video is only available in Belgium') From 536a55dabd7bcc2f34195beb84211028c934ed7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 8 Apr 2016 00:17:47 +0600 Subject: [PATCH 070/116] [YoutubeDL] Sanitize single thumbnail URL --- 
youtube_dl/YoutubeDL.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index d7aa951ff..cd0805303 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1240,7 +1240,10 @@ class YoutubeDL(object): self.list_thumbnails(info_dict) return - if thumbnails and 'thumbnail' not in info_dict: + thumbnail = info_dict.get('thumbnail') + if thumbnail: + info_dict['thumbnail'] = sanitize_url(thumbnail) + elif thumbnails: info_dict['thumbnail'] = thumbnails[-1]['url'] if 'display_id' not in info_dict and 'id' in info_dict: From 9a32e80477f470b8d8d320db38b5de02e0e6bc92 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 8 Apr 2016 14:51:00 +0100 Subject: [PATCH 071/116] [acast] fix extraction(#9117) --- youtube_dl/extractor/acast.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index 92eee8119..79a17e73a 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -26,13 +26,7 @@ class ACastIE(InfoExtractor): def _real_extract(self, url): channel, display_id = re.match(self._VALID_URL, url).groups() - - embed_page = self._download_webpage( - re.sub('(?:www\.)?acast\.com', 'embedcdn.acast.com', url), display_id) - cast_data = self._parse_json(self._search_regex( - r'window\[\'acast/queries\'\]\s*=\s*([^;]+);', embed_page, 'acast data'), - display_id)['GetAcast/%s/%s' % (channel, display_id)] - + cast_data = self._download_json('https://embed.acast.com/api/acasts/%s/%s' % (channel, display_id), display_id) return { 'id': compat_str(cast_data['id']), 'display_id': display_id, From a1ff3cd5f98980e37b5bdb1fd24cdba56ed6e618 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 8 Apr 2016 15:15:34 +0100 Subject: [PATCH 072/116] [acast] fix channel extraction(closes #9117) --- youtube_dl/extractor/acast.py | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) 
diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index 79a17e73a..94ce88c83 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -2,10 +2,14 @@ from __future__ import unicode_literals import re +import functools from .common import InfoExtractor from ..compat import compat_str -from ..utils import int_or_none +from ..utils import ( + int_or_none, + OnDemandPagedList, +) class ACastIE(InfoExtractor): @@ -26,7 +30,8 @@ class ACastIE(InfoExtractor): def _real_extract(self, url): channel, display_id = re.match(self._VALID_URL, url).groups() - cast_data = self._download_json('https://embed.acast.com/api/acasts/%s/%s' % (channel, display_id), display_id) + cast_data = self._download_json( + 'https://embed.acast.com/api/acasts/%s/%s' % (channel, display_id), display_id) return { 'id': compat_str(cast_data['id']), 'display_id': display_id, @@ -52,15 +57,26 @@ class ACastChannelIE(InfoExtractor): 'playlist_mincount': 20, } _API_BASE_URL = 'https://www.acast.com/api/' + _PAGE_SIZE = 10 @classmethod def suitable(cls, url): return False if ACastIE.suitable(url) else super(ACastChannelIE, cls).suitable(url) - def _real_extract(self, url): - display_id = self._match_id(url) - channel_data = self._download_json(self._API_BASE_URL + 'channels/%s' % display_id, display_id) - casts = self._download_json(self._API_BASE_URL + 'channels/%s/acasts' % display_id, display_id) - entries = [self.url_result('https://www.acast.com/%s/%s' % (display_id, cast['url']), 'ACast') for cast in casts] + def _fetch_page(self, channel_slug, page): + casts = self._download_json( + self._API_BASE_URL + 'channels/%s/acasts?page=%s' % (channel_slug, page), + channel_slug, note='Download page %d of channel data' % page) + for cast in casts: + yield self.url_result( + 'https://www.acast.com/%s/%s' % (channel_slug, cast['url']), + 'ACast', cast['id']) - return self.playlist_result(entries, compat_str(channel_data['id']), channel_data['name'], 
channel_data.get('description')) + def _real_extract(self, url): + channel_slug = self._match_id(url) + channel_data = self._download_json( + self._API_BASE_URL + 'channels/%s' % channel_slug, channel_slug) + entries = OnDemandPagedList(functools.partial( + self._fetch_page, channel_slug), self._PAGE_SIZE) + return self.playlist_result(entries, compat_str( + channel_data['id']), channel_data['name'], channel_data.get('description')) From 56019444cb2fe64f0937fb52aff9e36f30b3b343 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 8 Apr 2016 21:26:42 +0600 Subject: [PATCH 073/116] [novamov] Improve _VALID_URL template (Closes #9116) --- youtube_dl/extractor/novamov.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/novamov.py b/youtube_dl/extractor/novamov.py index 5771a675d..3bbd47355 100644 --- a/youtube_dl/extractor/novamov.py +++ b/youtube_dl/extractor/novamov.py @@ -16,7 +16,14 @@ class NovaMovIE(InfoExtractor): IE_NAME = 'novamov' IE_DESC = 'NovaMov' - _VALID_URL_TEMPLATE = r'http://(?:(?:www\.)?%(host)s/(?:file|video|mobile/#/videos)/|(?:(?:embed|www)\.)%(host)s/embed\.php\?(?:.*?&)?v=)(?P[a-z\d]{13})' + _VALID_URL_TEMPLATE = r'''(?x) + http:// + (?: + (?:www\.)?%(host)s/(?:file|video|mobile/\#/videos)/| + (?:(?:embed|www)\.)%(host)s/embed(?:\.php|/)?\?(?:.*?&)?\bv= + ) + (?P[a-z\d]{13}) + ''' _VALID_URL = _VALID_URL_TEMPLATE % {'host': 'novamov\.com'} _HOST = 'www.novamov.com' @@ -189,7 +196,7 @@ class AuroraVidIE(NovaMovIE): _FILE_DELETED_REGEX = r'This file no longer exists on our servers!<' - _TEST = { + _TESTS = [{ 'url': 'http://www.auroravid.to/video/4rurhn9x446jj', 'md5': '7205f346a52bbeba427603ba10d4b935', 'info_dict': { @@ -199,4 +206,7 @@ class AuroraVidIE(NovaMovIE): 'description': 'search engine optimization is used to rank the web page in the google search engine' }, 'skip': '"Invalid token" errors abound (in web interface as well as youtube-dl, there is nothing we can 
do about it.)' - } + }, { + 'url': 'http://www.auroravid.to/embed/?v=4rurhn9x446jj', + 'only_matching': True, + }] From a64c0c9b06b24085b1332452b5859fe6ac0a26d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 8 Apr 2016 22:15:36 +0600 Subject: [PATCH 074/116] [democracynow] Make description optional (Closes #9115) --- youtube_dl/extractor/democracynow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 6cd395e11..4b6d2652a 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -38,7 +38,7 @@ class DemocracynowIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - description = self._og_search_description(webpage) + description = self._og_search_description(webpage, default=None) json_data = self._parse_json(self._search_regex( r']+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json'), From a134426d619ac711f6adc24242b1e7d66d0b346a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 8 Apr 2016 22:19:16 +0600 Subject: [PATCH 075/116] [democracynow] Fix tests --- youtube_dl/extractor/democracynow.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 4b6d2652a..188f890ce 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -17,22 +17,23 @@ class DemocracynowIE(InfoExtractor): IE_NAME = 'democracynow' _TESTS = [{ 'url': 'http://www.democracynow.org/shows/2015/7/3', - 'md5': 'fbb8fe3d7a56a5e12431ce2f9b2fab0d', + 'md5': '3757c182d3d84da68f5c8f506c18c196', 'info_dict': { 'id': '2015-0703-001', 'ext': 'mp4', - 'title': 'July 03, 2015 - Democracy Now!', - 'description': 'A daily independent global news hour with Amy Goodman & Juan González "What to the Slave is 4th of 
July?": James Earl Jones Reads Frederick Douglass\u2019 Historic Speech : "This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag : "We Shall Overcome": Remembering Folk Icon, Activist Pete Seeger in His Own Words & Songs', + 'title': 'Daily Show', }, }, { 'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree', - 'md5': 'fbb8fe3d7a56a5e12431ce2f9b2fab0d', 'info_dict': { 'id': '2015-0703-001', 'ext': 'mp4', 'title': '"This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag', 'description': 'md5:4d2bc4f0d29f5553c2210a4bc7761a21', }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): From 18da24634c38ff6af4deaf606badfcbb9e6c3d68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 8 Apr 2016 22:27:27 +0600 Subject: [PATCH 076/116] [democracynow] Improve extraction --- youtube_dl/extractor/democracynow.py | 36 ++++++++++++++++------------ 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 188f890ce..65a98d789 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -38,17 +38,32 @@ class DemocracynowIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) - description = self._og_search_description(webpage, default=None) json_data = self._parse_json(self._search_regex( r']+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json'), display_id) - video_id = None + + title = json_data['title'] formats = [] - default_lang = 'en' + video_id = None + for key in ('file', 'audio', 'video', 'high_res_video'): + media_url = json_data.get(key, '') + if not media_url: + continue + media_url = re.sub(r'\?.*', '', compat_urlparse.urljoin(url, media_url)) + video_id = video_id or 
remove_start(os.path.splitext(url_basename(media_url))[0], 'dn') + formats.append({ + 'url': media_url, + 'vcodec': 'none' if key == 'audio' else None, + }) + + self._sort_formats(formats) + + default_lang = 'en' subtitles = {} def add_subtitle_item(lang, info_dict): @@ -68,22 +83,13 @@ class DemocracynowIE(InfoExtractor): 'url': compat_urlparse.urljoin(url, subtitle_item['url']), }) - for key in ('file', 'audio', 'video'): - media_url = json_data.get(key, '') - if not media_url: - continue - media_url = re.sub(r'\?.*', '', compat_urlparse.urljoin(url, media_url)) - video_id = video_id or remove_start(os.path.splitext(url_basename(media_url))[0], 'dn') - formats.append({ - 'url': media_url, - }) - - self._sort_formats(formats) + description = self._og_search_description(webpage, default=None) return { 'id': video_id or display_id, - 'title': json_data['title'], + 'title': title, 'description': description, + 'thumbnail': json_data.get('image'), 'subtitles': subtitles, 'formats': formats, } From fb38aa8b53d25606d2582e1043d09ad1a077bf61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 8 Apr 2016 22:48:08 +0600 Subject: [PATCH 077/116] [extractor/common] Support arbitrary format strings for template based identifiers in mpd manifests (Closes #9119, closes #9120) --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 2b40f3b7c..a7324af5c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1534,7 +1534,7 @@ class InfoExtractor(object): media_template = representation_ms_info['media_template'] media_template = media_template.replace('$RepresentationID$', representation_id) media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template) - media_template = re.sub(r'\$(Number|Bandwidth)%(\d+)\$', r'%(\1)\2d', media_template) + media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', 
r'%(\1)\2', media_template) media_template.replace('$$', '$') representation_ms_info['segment_urls'] = [ media_template % { From 3c6c7e7d7e5a7cbf87385bf3c649342c3ee4327d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 8 Apr 2016 23:16:02 +0600 Subject: [PATCH 078/116] [gdcvault] Fix extraction (Closes #9107, closes #9114) --- youtube_dl/extractor/gdcvault.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 59ed4c38f..25e93c9a4 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -159,9 +159,10 @@ class GDCVaultIE(InfoExtractor): 'title': title, } + PLAYER_REGEX = r'', - start_page, 'xml root', default=None) + PLAYER_REGEX, start_page, 'xml root', default=None) if xml_root is None: # Probably need to authenticate login_res = self._login(webpage_url, display_id) @@ -171,18 +172,19 @@ class GDCVaultIE(InfoExtractor): start_page = login_res # Grab the url from the authenticated page xml_root = self._html_search_regex( - r'', start_page, 'xml filename', default=None) if xml_name is None: # Fallback to the older format - xml_name = self._html_search_regex(r'', + start_page, 'xml filename') - xml_description_url = xml_root + 'xml/' + xml_name - xml_description = self._download_xml(xml_description_url, display_id) + xml_description = self._download_xml( + '%s/xml/%s' % (xml_root, xml_name), display_id) video_title = xml_description.find('./metadata/title').text video_formats = self._parse_mp4(xml_description) From bacec0397ff2abddac460148e8ceb49989fce6a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 8 Apr 2016 23:33:45 +0600 Subject: [PATCH 079/116] [extractor/common] Relax _hidden_inputs --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a7324af5c..17d00721c 
100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -843,7 +843,7 @@ class InfoExtractor(object): for input in re.findall(r'(?i)]+)>', html): if not re.search(r'type=(["\'])(?:hidden|submit)\1', input): continue - name = re.search(r'name=(["\'])(?P.+?)\1', input) + name = re.search(r'(?:name|id)=(["\'])(?P.+?)\1', input) if not name: continue value = re.search(r'value=(["\'])(?P.*?)\1', input) From 2f2fcf1a3304a7134d280726ae1a3d7010adbd59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 8 Apr 2016 23:34:59 +0600 Subject: [PATCH 080/116] [tnaflix] Fix extraction (Closes #9074) --- youtube_dl/extractor/tnaflix.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index 79f036fe4..bc32c0926 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -76,7 +76,11 @@ class TNAFlixNetworkBaseIE(InfoExtractor): webpage = self._download_webpage(url, display_id) cfg_url = self._proto_relative_url(self._html_search_regex( - self._CONFIG_REGEX, webpage, 'flashvars.config'), 'http:') + self._CONFIG_REGEX, webpage, 'flashvars.config', default=None), 'http:') + + if not cfg_url: + inputs = self._hidden_inputs(webpage) + cfg_url = 'https://cdn-fck.tnaflix.com/tnaflix/%s.fid?key=%s' % (inputs['vkey'], inputs['nkey']) cfg_xml = self._download_xml( cfg_url, display_id, 'Downloading metadata', From 568d2f78d635c3993e95334b9f8f6d2b47ecee51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 9 Apr 2016 00:27:24 +0600 Subject: [PATCH 081/116] [tnaflix] Fix metadata extraction --- youtube_dl/extractor/tnaflix.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index bc32c0926..78174178e 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -136,7 +136,7 @@ class 
TNAFlixNetworkBaseIE(InfoExtractor): average_rating = float_or_none(extract_field(self._AVERAGE_RATING_REGEX, 'average rating')) categories_str = extract_field(self._CATEGORIES_REGEX, 'categories') - categories = categories_str.split(', ') if categories_str is not None else [] + categories = [c.strip() for c in categories_str.split(',')] if categories_str is not None else [] return { 'id': video_id, @@ -190,13 +190,14 @@ class TNAFlixIE(TNAFlixNetworkBaseIE): _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P[^/]+)/video(?P\d+)' _TITLE_REGEX = r'(.+?) - TNAFlix Porn Videos' - _DESCRIPTION_REGEX = r'

([^<]+)

' - _UPLOADER_REGEX = r'(?s)]+class="infoTitle"[^>]*>Uploaded By:(.+?)]+name="description"[^>]+content="([^"]+)"' + _UPLOADER_REGEX = r'\s*Verified Member\s*\s*

(.+?)

' + _CATEGORIES_REGEX = r'(?s)]*>Categories:(.+?)
' _TESTS = [{ # anonymous uploader, no categories 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', - 'md5': 'ecf3498417d09216374fc5907f9c6ec0', + 'md5': '7e569419fe6d69543d01e6be22f5f7c4', 'info_dict': { 'id': '553878', 'display_id': 'Carmella-Decesare-striptease', @@ -205,17 +206,16 @@ class TNAFlixIE(TNAFlixNetworkBaseIE): 'thumbnail': 're:https?://.*\.jpg$', 'duration': 91, 'age_limit': 18, - 'uploader': 'Anonymous', - 'categories': [], + 'categories': ['Porn Stars'], } }, { # non-anonymous uploader, categories 'url': 'https://www.tnaflix.com/teen-porn/Educational-xxx-video/video6538', - 'md5': '0f5d4d490dbfd117b8607054248a07c0', + 'md5': 'fcba2636572895aba116171a899a5658', 'info_dict': { 'id': '6538', 'display_id': 'Educational-xxx-video', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Educational xxx video', 'description': 'md5:b4fab8f88a8621c8fabd361a173fe5b8', 'thumbnail': 're:https?://.*\.jpg$', From e52d7f85f25e806527d7b618d8c3ad16d27681f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 10 Feb 2016 13:16:18 +0100 Subject: [PATCH 082/116] Delay initialization of InfoExtractors until they are needed --- youtube_dl/YoutubeDL.py | 10 ++++++---- youtube_dl/extractor/__init__.py | 9 ++++++++- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index cd0805303..f18a8e840 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -82,7 +82,7 @@ from .utils import ( YoutubeDLHandler, ) from .cache import Cache -from .extractor import get_info_extractor, gen_extractors +from .extractor import get_info_extractor, gen_extractor_classes from .downloader import get_suitable_downloader from .downloader.rtmp import rtmpdump_version from .postprocessor import ( @@ -378,8 +378,9 @@ class YoutubeDL(object): def add_info_extractor(self, ie): """Add an InfoExtractor object to the end of the list.""" self._ies.append(ie) - 
self._ies_instances[ie.ie_key()] = ie - ie.set_downloader(self) + if not isinstance(ie, type): + self._ies_instances[ie.ie_key()] = ie + ie.set_downloader(self) def get_info_extractor(self, ie_key): """ @@ -397,7 +398,7 @@ class YoutubeDL(object): """ Add the InfoExtractors returned by gen_extractors to the end of the list """ - for ie in gen_extractors(): + for ie in gen_extractor_classes(): self.add_info_extractor(ie) def add_post_processor(self, pp): @@ -661,6 +662,7 @@ class YoutubeDL(object): if not ie.suitable(url): continue + ie = self.get_info_extractor(ie.ie_key()) if not ie.working(): self.report_warning('The program functionality for this site has been marked as broken, ' 'and will probably not work.') diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index c3121d83c..cd1f116e2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -997,11 +997,18 @@ _ALL_CLASSES = [ _ALL_CLASSES.append(GenericIE) +def gen_extractor_classes(): + """ Return a list of supported extractors. + The order does matter; the first extractor matched is the one handling the URL. + """ + return _ALL_CLASSES + + def gen_extractors(): """ Return a list of an instance of every supported extractor. The order does matter; the first extractor matched is the one handling the URL. 
""" - return [klass() for klass in _ALL_CLASSES] + return [klass() for klass in gen_extractor_classes()] def list_extractors(age_limit): From 1b3d5e05a824f880f1171eb840235e13cd8848dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 10 Feb 2016 13:24:49 +0100 Subject: [PATCH 083/116] Move the extreactors import to youtube_dl/extractor/extractors.py --- README.md | 4 +- youtube_dl/extractor/__init__.py | 989 +--------------------------- youtube_dl/extractor/extractors.py | 991 +++++++++++++++++++++++++++++ 3 files changed, 994 insertions(+), 990 deletions(-) create mode 100644 youtube_dl/extractor/extractors.py diff --git a/README.md b/README.md index e972bf69f..cd18edd87 100644 --- a/README.md +++ b/README.md @@ -889,14 +889,14 @@ After you have ensured this site is distributing it's content legally, you can f # TODO more properties (see youtube_dl/extractor/common.py) } ``` -5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). +5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/58525c94d547be1c8167d16c298bdd75506db328/youtube_dl/extractor/common.py#L68-L226). 
Add tests and code for as many as you want. 8. Keep in mind that the only mandatory fields in info dict for successful extraction process are `id`, `title` and either `url` or `formats`, i.e. these are the critical data the extraction does not make any sense without. This means that [any field](https://github.com/rg3/youtube-dl/blob/58525c94d547be1c8167d16c298bdd75506db328/youtube_dl/extractor/common.py#L138-L226) apart from aforementioned mandatory ones should be treated **as optional** and extraction should be **tolerate** to situations when sources for these fields can potentially be unavailable (even if they always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. For example, if you have some intermediate dict `meta` that is a source of metadata and it has a key `summary` that you want to extract and put into resulting info dict as `description`, you should be ready that this key may be missing from the `meta` dict, i.e. you should extract it as `meta.get('summary')` and not `meta['summary']`. Similarly, you should pass `fatal=False` when extracting data from a webpage with `_search_regex/_html_search_regex`. 9. Check the code with [flake8](https://pypi.python.org/pypi/flake8). 10. 
When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: - $ git add youtube_dl/extractor/__init__.py + $ git add youtube_dl/extractor/extractors.py $ git add youtube_dl/extractor/yourextractor.py $ git commit -m '[yourextractor] Add new extractor' $ git push origin yourextractor diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index cd1f116e2..a0a53445a 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -1,993 +1,6 @@ from __future__ import unicode_literals -from .abc import ABCIE -from .abc7news import Abc7NewsIE -from .academicearth import AcademicEarthCourseIE -from .acast import ( - ACastIE, - ACastChannelIE, -) -from .addanime import AddAnimeIE -from .adobetv import ( - AdobeTVIE, - AdobeTVShowIE, - AdobeTVChannelIE, - AdobeTVVideoIE, -) -from .adultswim import AdultSwimIE -from .aenetworks import AENetworksIE -from .aftonbladet import AftonbladetIE -from .airmozilla import AirMozillaIE -from .aljazeera import AlJazeeraIE -from .alphaporno import AlphaPornoIE -from .animeondemand import AnimeOnDemandIE -from .anitube import AnitubeIE -from .anysex import AnySexIE -from .aol import ( - AolIE, - AolFeaturesIE, -) -from .allocine import AllocineIE -from .aparat import AparatIE -from .appleconnect import AppleConnectIE -from .appletrailers import ( - AppleTrailersIE, - AppleTrailersSectionIE, -) -from .archiveorg import ArchiveOrgIE -from .ard import ( - ARDIE, - ARDMediathekIE, - SportschauIE, -) -from .arte import ( - ArteTvIE, - ArteTVPlus7IE, - ArteTVCreativeIE, - ArteTVConcertIE, - ArteTVFutureIE, - ArteTVCinemaIE, - ArteTVDDCIE, - ArteTVMagazineIE, - ArteTVEmbedIE, -) -from .atresplayer import AtresPlayerIE -from .atttechchannel import ATTTechChannelIE -from .audimedia import AudiMediaIE -from .audioboom import AudioBoomIE -from .audiomack import 
AudiomackIE, AudiomackAlbumIE -from .azubu import AzubuIE, AzubuLiveIE -from .baidu import BaiduVideoIE -from .bambuser import BambuserIE, BambuserChannelIE -from .bandcamp import BandcampIE, BandcampAlbumIE -from .bbc import ( - BBCCoUkIE, - BBCCoUkArticleIE, - BBCIE, -) -from .beeg import BeegIE -from .behindkink import BehindKinkIE -from .beatportpro import BeatportProIE -from .bet import BetIE -from .bigflix import BigflixIE -from .bild import BildIE -from .bilibili import BiliBiliIE -from .biobiochiletv import BioBioChileTVIE -from .bleacherreport import ( - BleacherReportIE, - BleacherReportCMSIE, -) -from .blinkx import BlinkxIE -from .bloomberg import BloombergIE -from .bokecc import BokeCCIE -from .bpb import BpbIE -from .br import BRIE -from .bravotv import BravoTVIE -from .breakcom import BreakIE -from .brightcove import ( - BrightcoveLegacyIE, - BrightcoveNewIE, -) -from .buzzfeed import BuzzFeedIE -from .byutv import BYUtvIE -from .c56 import C56IE -from .camdemy import ( - CamdemyIE, - CamdemyFolderIE -) -from .camwithher import CamWithHerIE -from .canalplus import CanalplusIE -from .canalc2 import Canalc2IE -from .canvas import CanvasIE -from .cbc import ( - CBCIE, - CBCPlayerIE, -) -from .cbs import CBSIE -from .cbsinteractive import CBSInteractiveIE -from .cbsnews import ( - CBSNewsIE, - CBSNewsLiveVideoIE, -) -from .cbssports import CBSSportsIE -from .ccc import CCCIE -from .cda import CDAIE -from .ceskatelevize import CeskaTelevizeIE -from .channel9 import Channel9IE -from .chaturbate import ChaturbateIE -from .chilloutzone import ChilloutzoneIE -from .chirbit import ( - ChirbitIE, - ChirbitProfileIE, -) -from .cinchcast import CinchcastIE -from .cinemassacre import CinemassacreIE -from .clipfish import ClipfishIE -from .cliphunter import CliphunterIE -from .clipsyndicate import ClipsyndicateIE -from .cloudy import CloudyIE -from .clubic import ClubicIE -from .clyp import ClypIE -from .cmt import CMTIE -from .cnbc import CNBCIE -from .cnn import 
( - CNNIE, - CNNBlogsIE, - CNNArticleIE, -) -from .collegehumor import CollegeHumorIE -from .collegerama import CollegeRamaIE -from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE -from .comcarcoff import ComCarCoffIE -from .commonmistakes import CommonMistakesIE, UnicodeBOMIE -from .commonprotocols import RtmpIE -from .condenast import CondeNastIE -from .cracked import CrackedIE -from .crackle import CrackleIE -from .criterion import CriterionIE -from .crooksandliars import CrooksAndLiarsIE -from .crunchyroll import ( - CrunchyrollIE, - CrunchyrollShowPlaylistIE -) -from .cspan import CSpanIE -from .ctsnews import CtsNewsIE -from .cultureunplugged import CultureUnpluggedIE -from .cwtv import CWTVIE -from .dailymotion import ( - DailymotionIE, - DailymotionPlaylistIE, - DailymotionUserIE, - DailymotionCloudIE, -) -from .daum import ( - DaumIE, - DaumClipIE, - DaumPlaylistIE, - DaumUserIE, -) -from .dbtv import DBTVIE -from .dcn import ( - DCNIE, - DCNVideoIE, - DCNLiveIE, - DCNSeasonIE, -) -from .dctp import DctpTvIE -from .deezer import DeezerPlaylistIE -from .democracynow import DemocracynowIE -from .dfb import DFBIE -from .dhm import DHMIE -from .dotsub import DotsubIE -from .douyutv import DouyuTVIE -from .dplay import DPlayIE -from .dramafever import ( - DramaFeverIE, - DramaFeverSeriesIE, -) -from .dreisat import DreiSatIE -from .drbonanza import DRBonanzaIE -from .drtuber import DrTuberIE -from .drtv import DRTVIE -from .dvtv import DVTVIE -from .dump import DumpIE -from .dumpert import DumpertIE -from .defense import DefenseGouvFrIE -from .discovery import DiscoveryIE -from .dropbox import DropboxIE -from .dw import ( - DWIE, - DWArticleIE, -) -from .eagleplatform import EaglePlatformIE -from .ebaumsworld import EbaumsWorldIE -from .echomsk import EchoMskIE -from .ehow import EHowIE -from .eighttracks import EightTracksIE -from .einthusan import EinthusanIE -from .eitb import EitbIE -from .ellentv import ( - EllenTVIE, - EllenTVClipsIE, -) -from 
.elpais import ElPaisIE -from .embedly import EmbedlyIE -from .engadget import EngadgetIE -from .eporner import EpornerIE -from .eroprofile import EroProfileIE -from .escapist import EscapistIE -from .espn import ESPNIE -from .esri import EsriVideoIE -from .europa import EuropaIE -from .everyonesmixtape import EveryonesMixtapeIE -from .exfm import ExfmIE -from .expotv import ExpoTVIE -from .extremetube import ExtremeTubeIE -from .facebook import FacebookIE -from .faz import FazIE -from .fc2 import FC2IE -from .fczenit import FczenitIE -from .firstpost import FirstpostIE -from .firsttv import FirstTVIE -from .fivemin import FiveMinIE -from .fivetv import FiveTVIE -from .fktv import FKTVIE -from .flickr import FlickrIE -from .folketinget import FolketingetIE -from .footyroom import FootyRoomIE -from .fourtube import FourTubeIE -from .fox import FOXIE -from .foxgay import FoxgayIE -from .foxnews import FoxNewsIE -from .foxsports import FoxSportsIE -from .franceculture import ( - FranceCultureIE, - FranceCultureEmissionIE, -) -from .franceinter import FranceInterIE -from .francetv import ( - PluzzIE, - FranceTvInfoIE, - FranceTVIE, - GenerationQuoiIE, - CultureboxIE, -) -from .freesound import FreesoundIE -from .freespeech import FreespeechIE -from .freevideo import FreeVideoIE -from .funimation import FunimationIE -from .funnyordie import FunnyOrDieIE -from .gameinformer import GameInformerIE -from .gamekings import GamekingsIE -from .gameone import ( - GameOneIE, - GameOnePlaylistIE, -) -from .gamersyde import GamersydeIE -from .gamespot import GameSpotIE -from .gamestar import GameStarIE -from .gametrailers import GametrailersIE -from .gazeta import GazetaIE -from .gdcvault import GDCVaultIE -from .generic import GenericIE -from .gfycat import GfycatIE -from .giantbomb import GiantBombIE -from .giga import GigaIE -from .glide import GlideIE -from .globo import ( - GloboIE, - GloboArticleIE, -) -from .godtube import GodTubeIE -from .goldenmoustache import 
GoldenMoustacheIE -from .golem import GolemIE -from .googledrive import GoogleDriveIE -from .googleplus import GooglePlusIE -from .googlesearch import GoogleSearchIE -from .goshgay import GoshgayIE -from .gputechconf import GPUTechConfIE -from .groupon import GrouponIE -from .hark import HarkIE -from .hbo import HBOIE -from .hearthisat import HearThisAtIE -from .heise import HeiseIE -from .hellporno import HellPornoIE -from .helsinki import HelsinkiIE -from .hentaistigma import HentaiStigmaIE -from .historicfilms import HistoricFilmsIE -from .hitbox import HitboxIE, HitboxLiveIE -from .hornbunny import HornBunnyIE -from .hotnewhiphop import HotNewHipHopIE -from .hotstar import HotStarIE -from .howcast import HowcastIE -from .howstuffworks import HowStuffWorksIE -from .huffpost import HuffPostIE -from .hypem import HypemIE -from .iconosquare import IconosquareIE -from .ign import ( - IGNIE, - OneUPIE, - PCMagIE, -) -from .imdb import ( - ImdbIE, - ImdbListIE -) -from .imgur import ( - ImgurIE, - ImgurAlbumIE, -) -from .ina import InaIE -from .indavideo import ( - IndavideoIE, - IndavideoEmbedIE, -) -from .infoq import InfoQIE -from .instagram import InstagramIE, InstagramUserIE -from .internetvideoarchive import InternetVideoArchiveIE -from .iprima import IPrimaIE -from .iqiyi import IqiyiIE -from .ir90tv import Ir90TvIE -from .ivi import ( - IviIE, - IviCompilationIE -) -from .ivideon import IvideonIE -from .izlesene import IzleseneIE -from .jadorecettepub import JadoreCettePubIE -from .jeuxvideo import JeuxVideoIE -from .jove import JoveIE -from .jwplatform import JWPlatformIE -from .jpopsukitv import JpopsukiIE -from .kaltura import KalturaIE -from .kanalplay import KanalPlayIE -from .kankan import KankanIE -from .karaoketv import KaraoketvIE -from .karrierevideos import KarriereVideosIE -from .keezmovies import KeezMoviesIE -from .khanacademy import KhanAcademyIE -from .kickstarter import KickStarterIE -from .keek import KeekIE -from .konserthusetplay import 
KonserthusetPlayIE -from .kontrtube import KontrTubeIE -from .krasview import KrasViewIE -from .ku6 import Ku6IE -from .kusi import KUSIIE -from .kuwo import ( - KuwoIE, - KuwoAlbumIE, - KuwoChartIE, - KuwoSingerIE, - KuwoCategoryIE, - KuwoMvIE, -) -from .la7 import LA7IE -from .laola1tv import Laola1TvIE -from .lecture2go import Lecture2GoIE -from .lemonde import LemondeIE -from .leeco import ( - LeIE, - LePlaylistIE, - LetvCloudIE, -) -from .libsyn import LibsynIE -from .lifenews import ( - LifeNewsIE, - LifeEmbedIE, -) -from .limelight import ( - LimelightMediaIE, - LimelightChannelIE, - LimelightChannelListIE, -) -from .liveleak import LiveLeakIE -from .livestream import ( - LivestreamIE, - LivestreamOriginalIE, - LivestreamShortenerIE, -) -from .lnkgo import LnkGoIE -from .lovehomeporn import LoveHomePornIE -from .lrt import LRTIE -from .lynda import ( - LyndaIE, - LyndaCourseIE -) -from .m6 import M6IE -from .macgamestore import MacGameStoreIE -from .mailru import MailRuIE -from .makerschannel import MakersChannelIE -from .makertv import MakerTVIE -from .malemotion import MalemotionIE -from .matchtv import MatchTVIE -from .mdr import MDRIE -from .metacafe import MetacafeIE -from .metacritic import MetacriticIE -from .mgoon import MgoonIE -from .minhateca import MinhatecaIE -from .ministrygrid import MinistryGridIE -from .minoto import MinotoIE -from .miomio import MioMioIE -from .mit import TechTVMITIE, MITIE, OCWMITIE -from .mitele import MiTeleIE -from .mixcloud import MixcloudIE -from .mlb import MLBIE -from .mnet import MnetIE -from .mpora import MporaIE -from .moevideo import MoeVideoIE -from .mofosex import MofosexIE -from .mojvideo import MojvideoIE -from .moniker import MonikerIE -from .mooshare import MooshareIE -from .morningstar import MorningstarIE -from .motherless import MotherlessIE -from .motorsport import MotorsportIE -from .movieclips import MovieClipsIE -from .moviezine import MoviezineIE -from .mtv import ( - MTVIE, - 
MTVServicesEmbeddedIE, - MTVIggyIE, - MTVDEIE, -) -from .muenchentv import MuenchenTVIE -from .musicplayon import MusicPlayOnIE -from .muzu import MuzuTVIE -from .mwave import MwaveIE -from .myspace import MySpaceIE, MySpaceAlbumIE -from .myspass import MySpassIE -from .myvi import MyviIE -from .myvideo import MyVideoIE -from .myvidster import MyVidsterIE -from .nationalgeographic import ( - NationalGeographicIE, - NationalGeographicChannelIE, -) -from .naver import NaverIE -from .nba import NBAIE -from .nbc import ( - CSNNEIE, - NBCIE, - NBCNewsIE, - NBCSportsIE, - NBCSportsVPlayerIE, - MSNBCIE, -) -from .ndr import ( - NDRIE, - NJoyIE, - NDREmbedBaseIE, - NDREmbedIE, - NJoyEmbedIE, -) -from .ndtv import NDTVIE -from .netzkino import NetzkinoIE -from .nerdcubed import NerdCubedFeedIE -from .nerdist import NerdistIE -from .neteasemusic import ( - NetEaseMusicIE, - NetEaseMusicAlbumIE, - NetEaseMusicSingerIE, - NetEaseMusicListIE, - NetEaseMusicMvIE, - NetEaseMusicProgramIE, - NetEaseMusicDjRadioIE, -) -from .newgrounds import NewgroundsIE -from .newstube import NewstubeIE -from .nextmedia import ( - NextMediaIE, - NextMediaActionNewsIE, - AppleDailyIE, -) -from .nextmovie import NextMovieIE -from .nfb import NFBIE -from .nfl import NFLIE -from .nhl import ( - NHLIE, - NHLNewsIE, - NHLVideocenterIE, -) -from .nick import NickIE -from .niconico import NiconicoIE, NiconicoPlaylistIE -from .ninegag import NineGagIE -from .noco import NocoIE -from .normalboots import NormalbootsIE -from .nosvideo import NosVideoIE -from .nova import NovaIE -from .novamov import ( - AuroraVidIE, - CloudTimeIE, - NowVideoIE, - VideoWeedIE, - WholeCloudIE, -) -from .nowness import ( - NownessIE, - NownessPlaylistIE, - NownessSeriesIE, -) -from .nowtv import ( - NowTVIE, - NowTVListIE, -) -from .noz import NozIE -from .npo import ( - NPOIE, - NPOLiveIE, - NPORadioIE, - NPORadioFragmentIE, - SchoolTVIE, - VPROIE, - WNLIE -) -from .npr import NprIE -from .nrk import ( - NRKIE, - 
NRKPlaylistIE, - NRKSkoleIE, - NRKTVIE, -) -from .ntvde import NTVDeIE -from .ntvru import NTVRuIE -from .nytimes import ( - NYTimesIE, - NYTimesArticleIE, -) -from .nuvid import NuvidIE -from .odnoklassniki import OdnoklassnikiIE -from .oktoberfesttv import OktoberfestTVIE -from .onionstudios import OnionStudiosIE -from .ooyala import ( - OoyalaIE, - OoyalaExternalIE, -) -from .openload import OpenloadIE -from .ora import OraTVIE -from .orf import ( - ORFTVthekIE, - ORFOE1IE, - ORFFM4IE, - ORFIPTVIE, -) -from .pandoratv import PandoraTVIE -from .parliamentliveuk import ParliamentLiveUKIE -from .patreon import PatreonIE -from .pbs import PBSIE -from .periscope import PeriscopeIE -from .philharmoniedeparis import PhilharmonieDeParisIE -from .phoenix import PhoenixIE -from .photobucket import PhotobucketIE -from .pinkbike import PinkbikeIE -from .planetaplay import PlanetaPlayIE -from .pladform import PladformIE -from .played import PlayedIE -from .playfm import PlayFMIE -from .plays import PlaysTVIE -from .playtvak import PlaytvakIE -from .playvid import PlayvidIE -from .playwire import PlaywireIE -from .pluralsight import ( - PluralsightIE, - PluralsightCourseIE, -) -from .podomatic import PodomaticIE -from .porn91 import Porn91IE -from .pornhd import PornHdIE -from .pornhub import ( - PornHubIE, - PornHubPlaylistIE, - PornHubUserVideosIE, -) -from .pornotube import PornotubeIE -from .pornovoisines import PornoVoisinesIE -from .pornoxo import PornoXOIE -from .primesharetv import PrimeShareTVIE -from .promptfile import PromptFileIE -from .prosiebensat1 import ProSiebenSat1IE -from .puls4 import Puls4IE -from .pyvideo import PyvideoIE -from .qqmusic import ( - QQMusicIE, - QQMusicSingerIE, - QQMusicAlbumIE, - QQMusicToplistIE, - QQMusicPlaylistIE, -) -from .quickvid import QuickVidIE -from .r7 import R7IE -from .radiode import RadioDeIE -from .radiojavan import RadioJavanIE -from .radiobremen import RadioBremenIE -from .radiofrance import RadioFranceIE -from .rai 
import ( - RaiTVIE, - RaiIE, -) -from .rbmaradio import RBMARadioIE -from .rds import RDSIE -from .redtube import RedTubeIE -from .regiotv import RegioTVIE -from .restudy import RestudyIE -from .reverbnation import ReverbNationIE -from .revision3 import Revision3IE -from .rice import RICEIE -from .ringtv import RingTVIE -from .ro220 import Ro220IE -from .rottentomatoes import RottenTomatoesIE -from .roxwel import RoxwelIE -from .rtbf import RTBFIE -from .rte import RteIE, RteRadioIE -from .rtlnl import RtlNlIE -from .rtl2 import RTL2IE -from .rtp import RTPIE -from .rts import RTSIE -from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE -from .rtvnh import RTVNHIE -from .ruhd import RUHDIE -from .ruleporn import RulePornIE -from .rutube import ( - RutubeIE, - RutubeChannelIE, - RutubeEmbedIE, - RutubeMovieIE, - RutubePersonIE, -) -from .rutv import RUTVIE -from .ruutu import RuutuIE -from .sandia import SandiaIE -from .safari import ( - SafariIE, - SafariApiIE, - SafariCourseIE, -) -from .sapo import SapoIE -from .savefrom import SaveFromIE -from .sbs import SBSIE -from .scivee import SciVeeIE -from .screencast import ScreencastIE -from .screencastomatic import ScreencastOMaticIE -from .screenjunkies import ScreenJunkiesIE -from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE -from .senateisvp import SenateISVPIE -from .servingsys import ServingSysIE -from .sexu import SexuIE -from .sexykarma import SexyKarmaIE -from .shahid import ShahidIE -from .shared import SharedIE -from .sharesix import ShareSixIE -from .sina import SinaIE -from .skynewsarabia import ( - SkyNewsArabiaIE, - SkyNewsArabiaArticleIE, -) -from .slideshare import SlideshareIE -from .slutload import SlutloadIE -from .smotri import ( - SmotriIE, - SmotriCommunityIE, - SmotriUserIE, - SmotriBroadcastIE, -) -from .snagfilms import ( - SnagFilmsIE, - SnagFilmsEmbedIE, -) -from .snotr import SnotrIE -from .sohu import SohuIE -from .soundcloud import ( - SoundcloudIE, - SoundcloudSetIE, - 
SoundcloudUserIE, - SoundcloudPlaylistIE, - SoundcloudSearchIE -) -from .soundgasm import ( - SoundgasmIE, - SoundgasmProfileIE -) -from .southpark import ( - SouthParkIE, - SouthParkDeIE, - SouthParkDkIE, - SouthParkEsIE, - SouthParkNlIE -) -from .spankbang import SpankBangIE -from .spankwire import SpankwireIE -from .spiegel import SpiegelIE, SpiegelArticleIE -from .spiegeltv import SpiegeltvIE -from .spike import SpikeIE -from .stitcher import StitcherIE -from .sport5 import Sport5IE -from .sportbox import ( - SportBoxIE, - SportBoxEmbedIE, -) -from .sportdeutschland import SportDeutschlandIE -from .srgssr import ( - SRGSSRIE, - SRGSSRPlayIE, -) -from .srmediathek import SRMediathekIE -from .ssa import SSAIE -from .stanfordoc import StanfordOpenClassroomIE -from .steam import SteamIE -from .streamcloud import StreamcloudIE -from .streamcz import StreamCZIE -from .streetvoice import StreetVoiceIE -from .sunporno import SunPornoIE -from .svt import ( - SVTIE, - SVTPlayIE, -) -from .swrmediathek import SWRMediathekIE -from .syfy import SyfyIE -from .sztvhu import SztvHuIE -from .tagesschau import TagesschauIE -from .tapely import TapelyIE -from .tass import TassIE -from .teachertube import ( - TeacherTubeIE, - TeacherTubeUserIE, -) -from .teachingchannel import TeachingChannelIE -from .teamcoco import TeamcocoIE -from .techtalks import TechTalksIE -from .ted import TEDIE -from .tele13 import Tele13IE -from .telebruxelles import TeleBruxellesIE -from .telecinco import TelecincoIE -from .telegraaf import TelegraafIE -from .telemb import TeleMBIE -from .teletask import TeleTaskIE -from .testurl import TestURLIE -from .tf1 import TF1IE -from .theintercept import TheInterceptIE -from .theonion import TheOnionIE -from .theplatform import ( - ThePlatformIE, - ThePlatformFeedIE, -) -from .thescene import TheSceneIE -from .thesixtyone import TheSixtyOneIE -from .thestar import TheStarIE -from .thisamericanlife import ThisAmericanLifeIE -from .thisav import ThisAVIE -from 
.tinypic import TinyPicIE -from .tlc import TlcDeIE -from .tmz import ( - TMZIE, - TMZArticleIE, -) -from .tnaflix import ( - TNAFlixNetworkEmbedIE, - TNAFlixIE, - EMPFlixIE, - MovieFapIE, -) -from .toggle import ToggleIE -from .thvideo import ( - THVideoIE, - THVideoPlaylistIE -) -from .toutv import TouTvIE -from .toypics import ToypicsUserIE, ToypicsIE -from .traileraddict import TrailerAddictIE -from .trilulilu import TriluliluIE -from .trollvids import TrollvidsIE -from .trutube import TruTubeIE -from .tube8 import Tube8IE -from .tubitv import TubiTvIE -from .tudou import ( - TudouIE, - TudouPlaylistIE, - TudouAlbumIE, -) -from .tumblr import TumblrIE -from .tunein import ( - TuneInClipIE, - TuneInStationIE, - TuneInProgramIE, - TuneInTopicIE, - TuneInShortenerIE, -) -from .turbo import TurboIE -from .tutv import TutvIE -from .tv2 import ( - TV2IE, - TV2ArticleIE, -) -from .tv3 import TV3IE -from .tv4 import TV4IE -from .tvc import ( - TVCIE, - TVCArticleIE, -) -from .tvigle import TvigleIE -from .tvland import TVLandIE -from .tvp import TvpIE, TvpSeriesIE -from .tvplay import TVPlayIE -from .tweakers import TweakersIE -from .twentyfourvideo import TwentyFourVideoIE -from .twentymin import TwentyMinutenIE -from .twentytwotracks import ( - TwentyTwoTracksIE, - TwentyTwoTracksGenreIE -) -from .twitch import ( - TwitchVideoIE, - TwitchChapterIE, - TwitchVodIE, - TwitchProfileIE, - TwitchPastBroadcastsIE, - TwitchBookmarksIE, - TwitchStreamIE, -) -from .twitter import ( - TwitterCardIE, - TwitterIE, - TwitterAmplifyIE, -) -from .ubu import UbuIE -from .udemy import ( - UdemyIE, - UdemyCourseIE -) -from .udn import UDNEmbedIE -from .digiteka import DigitekaIE -from .unistra import UnistraIE -from .urort import UrortIE -from .usatoday import USATodayIE -from .ustream import UstreamIE, UstreamChannelIE -from .ustudio import UstudioIE -from .varzesh3 import Varzesh3IE -from .vbox7 import Vbox7IE -from .veehd import VeeHDIE -from .veoh import VeohIE -from .vessel import 
VesselIE -from .vesti import VestiIE -from .vevo import VevoIE -from .vgtv import ( - BTArticleIE, - BTVestlendingenIE, - VGTVIE, -) -from .vh1 import VH1IE -from .vice import ( - ViceIE, - ViceShowIE, -) -from .viddler import ViddlerIE -from .videodetective import VideoDetectiveIE -from .videofyme import VideofyMeIE -from .videomega import VideoMegaIE -from .videomore import ( - VideomoreIE, - VideomoreVideoIE, - VideomoreSeasonIE, -) -from .videopremium import VideoPremiumIE -from .videott import VideoTtIE -from .vidme import ( - VidmeIE, - VidmeUserIE, - VidmeUserLikesIE, -) -from .vidzi import VidziIE -from .vier import VierIE, VierVideosIE -from .viewster import ViewsterIE -from .viidea import ViideaIE -from .vimeo import ( - VimeoIE, - VimeoAlbumIE, - VimeoChannelIE, - VimeoGroupsIE, - VimeoLikesIE, - VimeoOndemandIE, - VimeoReviewIE, - VimeoUserIE, - VimeoWatchLaterIE, -) -from .vimple import VimpleIE -from .vine import ( - VineIE, - VineUserIE, -) -from .viki import ( - VikiIE, - VikiChannelIE, -) -from .vk import ( - VKIE, - VKUserVideosIE, -) -from .vlive import VLiveIE -from .vodlocker import VodlockerIE -from .voicerepublic import VoiceRepublicIE -from .voxmedia import VoxMediaIE -from .vporn import VpornIE -from .vrt import VRTIE -from .vube import VubeIE -from .vuclip import VuClipIE -from .vulture import VultureIE -from .walla import WallaIE -from .washingtonpost import WashingtonPostIE -from .wat import WatIE -from .wayofthemaster import WayOfTheMasterIE -from .wdr import ( - WDRIE, - WDRMobileIE, - WDRMausIE, -) -from .webofstories import ( - WebOfStoriesIE, - WebOfStoriesPlaylistIE, -) -from .weibo import WeiboIE -from .weiqitv import WeiqiTVIE -from .wimp import WimpIE -from .wistia import WistiaIE -from .worldstarhiphop import WorldStarHipHopIE -from .wrzuta import WrzutaIE -from .wsj import WSJIE -from .xbef import XBefIE -from .xboxclips import XboxClipsIE -from .xfileshare import XFileShareIE -from .xhamster import ( - XHamsterIE, - 
XHamsterEmbedIE, -) -from .xminus import XMinusIE -from .xnxx import XNXXIE -from .xstream import XstreamIE -from .xtube import XTubeUserIE, XTubeIE -from .xuite import XuiteIE -from .xvideos import XVideosIE -from .xxxymovies import XXXYMoviesIE -from .yahoo import ( - YahooIE, - YahooSearchIE, -) -from .yam import YamIE -from .yandexmusic import ( - YandexMusicTrackIE, - YandexMusicAlbumIE, - YandexMusicPlaylistIE, -) -from .yesjapan import YesJapanIE -from .yinyuetai import YinYueTaiIE -from .ynet import YnetIE -from .youjizz import YouJizzIE -from .youku import YoukuIE -from .youporn import YouPornIE -from .yourupload import YourUploadIE -from .youtube import ( - YoutubeIE, - YoutubeChannelIE, - YoutubeFavouritesIE, - YoutubeHistoryIE, - YoutubeLiveIE, - YoutubePlaylistIE, - YoutubePlaylistsIE, - YoutubeRecommendedIE, - YoutubeSearchDateIE, - YoutubeSearchIE, - YoutubeSearchURLIE, - YoutubeShowIE, - YoutubeSubscriptionsIE, - YoutubeTruncatedIDIE, - YoutubeTruncatedURLIE, - YoutubeUserIE, - YoutubeWatchLaterIE, -) -from .zapiks import ZapiksIE -from .zdf import ZDFIE, ZDFChannelIE -from .zingmp3 import ( - ZingMp3SongIE, - ZingMp3AlbumIE, -) -from .zippcast import ZippCastIE +from .extractors import * _ALL_CLASSES = [ klass diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py new file mode 100644 index 000000000..de29c7956 --- /dev/null +++ b/youtube_dl/extractor/extractors.py @@ -0,0 +1,991 @@ +# flake8: noqa +from __future__ import unicode_literals + +from .abc import ABCIE +from .abc7news import Abc7NewsIE +from .academicearth import AcademicEarthCourseIE +from .acast import ( + ACastIE, + ACastChannelIE, +) +from .addanime import AddAnimeIE +from .adobetv import ( + AdobeTVIE, + AdobeTVShowIE, + AdobeTVChannelIE, + AdobeTVVideoIE, +) +from .adultswim import AdultSwimIE +from .aenetworks import AENetworksIE +from .aftonbladet import AftonbladetIE +from .airmozilla import AirMozillaIE +from .aljazeera import AlJazeeraIE +from 
.alphaporno import AlphaPornoIE +from .animeondemand import AnimeOnDemandIE +from .anitube import AnitubeIE +from .anysex import AnySexIE +from .aol import ( + AolIE, + AolFeaturesIE, +) +from .allocine import AllocineIE +from .aparat import AparatIE +from .appleconnect import AppleConnectIE +from .appletrailers import ( + AppleTrailersIE, + AppleTrailersSectionIE, +) +from .archiveorg import ArchiveOrgIE +from .ard import ( + ARDIE, + ARDMediathekIE, + SportschauIE, +) +from .arte import ( + ArteTvIE, + ArteTVPlus7IE, + ArteTVCreativeIE, + ArteTVConcertIE, + ArteTVFutureIE, + ArteTVCinemaIE, + ArteTVDDCIE, + ArteTVMagazineIE, + ArteTVEmbedIE, +) +from .atresplayer import AtresPlayerIE +from .atttechchannel import ATTTechChannelIE +from .audimedia import AudiMediaIE +from .audioboom import AudioBoomIE +from .audiomack import AudiomackIE, AudiomackAlbumIE +from .azubu import AzubuIE, AzubuLiveIE +from .baidu import BaiduVideoIE +from .bambuser import BambuserIE, BambuserChannelIE +from .bandcamp import BandcampIE, BandcampAlbumIE +from .bbc import ( + BBCCoUkIE, + BBCCoUkArticleIE, + BBCIE, +) +from .beeg import BeegIE +from .behindkink import BehindKinkIE +from .beatportpro import BeatportProIE +from .bet import BetIE +from .bigflix import BigflixIE +from .bild import BildIE +from .bilibili import BiliBiliIE +from .biobiochiletv import BioBioChileTVIE +from .bleacherreport import ( + BleacherReportIE, + BleacherReportCMSIE, +) +from .blinkx import BlinkxIE +from .bloomberg import BloombergIE +from .bokecc import BokeCCIE +from .bpb import BpbIE +from .br import BRIE +from .bravotv import BravoTVIE +from .breakcom import BreakIE +from .brightcove import ( + BrightcoveLegacyIE, + BrightcoveNewIE, +) +from .buzzfeed import BuzzFeedIE +from .byutv import BYUtvIE +from .c56 import C56IE +from .camdemy import ( + CamdemyIE, + CamdemyFolderIE +) +from .camwithher import CamWithHerIE +from .canalplus import CanalplusIE +from .canalc2 import Canalc2IE +from .canvas import 
CanvasIE +from .cbc import ( + CBCIE, + CBCPlayerIE, +) +from .cbs import CBSIE +from .cbsinteractive import CBSInteractiveIE +from .cbsnews import ( + CBSNewsIE, + CBSNewsLiveVideoIE, +) +from .cbssports import CBSSportsIE +from .ccc import CCCIE +from .cda import CDAIE +from .ceskatelevize import CeskaTelevizeIE +from .channel9 import Channel9IE +from .chaturbate import ChaturbateIE +from .chilloutzone import ChilloutzoneIE +from .chirbit import ( + ChirbitIE, + ChirbitProfileIE, +) +from .cinchcast import CinchcastIE +from .cinemassacre import CinemassacreIE +from .clipfish import ClipfishIE +from .cliphunter import CliphunterIE +from .clipsyndicate import ClipsyndicateIE +from .cloudy import CloudyIE +from .clubic import ClubicIE +from .clyp import ClypIE +from .cmt import CMTIE +from .cnbc import CNBCIE +from .cnn import ( + CNNIE, + CNNBlogsIE, + CNNArticleIE, +) +from .collegehumor import CollegeHumorIE +from .collegerama import CollegeRamaIE +from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE +from .comcarcoff import ComCarCoffIE +from .commonmistakes import CommonMistakesIE, UnicodeBOMIE +from .commonprotocols import RtmpIE +from .condenast import CondeNastIE +from .cracked import CrackedIE +from .crackle import CrackleIE +from .criterion import CriterionIE +from .crooksandliars import CrooksAndLiarsIE +from .crunchyroll import ( + CrunchyrollIE, + CrunchyrollShowPlaylistIE +) +from .cspan import CSpanIE +from .ctsnews import CtsNewsIE +from .cultureunplugged import CultureUnpluggedIE +from .cwtv import CWTVIE +from .dailymotion import ( + DailymotionIE, + DailymotionPlaylistIE, + DailymotionUserIE, + DailymotionCloudIE, +) +from .daum import ( + DaumIE, + DaumClipIE, + DaumPlaylistIE, + DaumUserIE, +) +from .dbtv import DBTVIE +from .dcn import ( + DCNIE, + DCNVideoIE, + DCNLiveIE, + DCNSeasonIE, +) +from .dctp import DctpTvIE +from .deezer import DeezerPlaylistIE +from .democracynow import DemocracynowIE +from .dfb import DFBIE +from .dhm 
import DHMIE +from .dotsub import DotsubIE +from .douyutv import DouyuTVIE +from .dplay import DPlayIE +from .dramafever import ( + DramaFeverIE, + DramaFeverSeriesIE, +) +from .dreisat import DreiSatIE +from .drbonanza import DRBonanzaIE +from .drtuber import DrTuberIE +from .drtv import DRTVIE +from .dvtv import DVTVIE +from .dump import DumpIE +from .dumpert import DumpertIE +from .defense import DefenseGouvFrIE +from .discovery import DiscoveryIE +from .dropbox import DropboxIE +from .dw import ( + DWIE, + DWArticleIE, +) +from .eagleplatform import EaglePlatformIE +from .ebaumsworld import EbaumsWorldIE +from .echomsk import EchoMskIE +from .ehow import EHowIE +from .eighttracks import EightTracksIE +from .einthusan import EinthusanIE +from .eitb import EitbIE +from .ellentv import ( + EllenTVIE, + EllenTVClipsIE, +) +from .elpais import ElPaisIE +from .embedly import EmbedlyIE +from .engadget import EngadgetIE +from .eporner import EpornerIE +from .eroprofile import EroProfileIE +from .escapist import EscapistIE +from .espn import ESPNIE +from .esri import EsriVideoIE +from .europa import EuropaIE +from .everyonesmixtape import EveryonesMixtapeIE +from .exfm import ExfmIE +from .expotv import ExpoTVIE +from .extremetube import ExtremeTubeIE +from .facebook import FacebookIE +from .faz import FazIE +from .fc2 import FC2IE +from .fczenit import FczenitIE +from .firstpost import FirstpostIE +from .firsttv import FirstTVIE +from .fivemin import FiveMinIE +from .fivetv import FiveTVIE +from .fktv import FKTVIE +from .flickr import FlickrIE +from .folketinget import FolketingetIE +from .footyroom import FootyRoomIE +from .fourtube import FourTubeIE +from .fox import FOXIE +from .foxgay import FoxgayIE +from .foxnews import FoxNewsIE +from .foxsports import FoxSportsIE +from .franceculture import ( + FranceCultureIE, + FranceCultureEmissionIE, +) +from .franceinter import FranceInterIE +from .francetv import ( + PluzzIE, + FranceTvInfoIE, + FranceTVIE, + 
GenerationQuoiIE, + CultureboxIE, +) +from .freesound import FreesoundIE +from .freespeech import FreespeechIE +from .freevideo import FreeVideoIE +from .funimation import FunimationIE +from .funnyordie import FunnyOrDieIE +from .gameinformer import GameInformerIE +from .gamekings import GamekingsIE +from .gameone import ( + GameOneIE, + GameOnePlaylistIE, +) +from .gamersyde import GamersydeIE +from .gamespot import GameSpotIE +from .gamestar import GameStarIE +from .gametrailers import GametrailersIE +from .gazeta import GazetaIE +from .gdcvault import GDCVaultIE +from .generic import GenericIE +from .gfycat import GfycatIE +from .giantbomb import GiantBombIE +from .giga import GigaIE +from .glide import GlideIE +from .globo import ( + GloboIE, + GloboArticleIE, +) +from .godtube import GodTubeIE +from .goldenmoustache import GoldenMoustacheIE +from .golem import GolemIE +from .googledrive import GoogleDriveIE +from .googleplus import GooglePlusIE +from .googlesearch import GoogleSearchIE +from .goshgay import GoshgayIE +from .gputechconf import GPUTechConfIE +from .groupon import GrouponIE +from .hark import HarkIE +from .hbo import HBOIE +from .hearthisat import HearThisAtIE +from .heise import HeiseIE +from .hellporno import HellPornoIE +from .helsinki import HelsinkiIE +from .hentaistigma import HentaiStigmaIE +from .historicfilms import HistoricFilmsIE +from .hitbox import HitboxIE, HitboxLiveIE +from .hornbunny import HornBunnyIE +from .hotnewhiphop import HotNewHipHopIE +from .hotstar import HotStarIE +from .howcast import HowcastIE +from .howstuffworks import HowStuffWorksIE +from .huffpost import HuffPostIE +from .hypem import HypemIE +from .iconosquare import IconosquareIE +from .ign import ( + IGNIE, + OneUPIE, + PCMagIE, +) +from .imdb import ( + ImdbIE, + ImdbListIE +) +from .imgur import ( + ImgurIE, + ImgurAlbumIE, +) +from .ina import InaIE +from .indavideo import ( + IndavideoIE, + IndavideoEmbedIE, +) +from .infoq import InfoQIE +from .instagram 
import InstagramIE, InstagramUserIE +from .internetvideoarchive import InternetVideoArchiveIE +from .iprima import IPrimaIE +from .iqiyi import IqiyiIE +from .ir90tv import Ir90TvIE +from .ivi import ( + IviIE, + IviCompilationIE +) +from .ivideon import IvideonIE +from .izlesene import IzleseneIE +from .jadorecettepub import JadoreCettePubIE +from .jeuxvideo import JeuxVideoIE +from .jove import JoveIE +from .jwplatform import JWPlatformIE +from .jpopsukitv import JpopsukiIE +from .kaltura import KalturaIE +from .kanalplay import KanalPlayIE +from .kankan import KankanIE +from .karaoketv import KaraoketvIE +from .karrierevideos import KarriereVideosIE +from .keezmovies import KeezMoviesIE +from .khanacademy import KhanAcademyIE +from .kickstarter import KickStarterIE +from .keek import KeekIE +from .konserthusetplay import KonserthusetPlayIE +from .kontrtube import KontrTubeIE +from .krasview import KrasViewIE +from .ku6 import Ku6IE +from .kusi import KUSIIE +from .kuwo import ( + KuwoIE, + KuwoAlbumIE, + KuwoChartIE, + KuwoSingerIE, + KuwoCategoryIE, + KuwoMvIE, +) +from .la7 import LA7IE +from .laola1tv import Laola1TvIE +from .lecture2go import Lecture2GoIE +from .lemonde import LemondeIE +from .leeco import ( + LeIE, + LePlaylistIE, + LetvCloudIE, +) +from .libsyn import LibsynIE +from .lifenews import ( + LifeNewsIE, + LifeEmbedIE, +) +from .limelight import ( + LimelightMediaIE, + LimelightChannelIE, + LimelightChannelListIE, +) +from .liveleak import LiveLeakIE +from .livestream import ( + LivestreamIE, + LivestreamOriginalIE, + LivestreamShortenerIE, +) +from .lnkgo import LnkGoIE +from .lovehomeporn import LoveHomePornIE +from .lrt import LRTIE +from .lynda import ( + LyndaIE, + LyndaCourseIE +) +from .m6 import M6IE +from .macgamestore import MacGameStoreIE +from .mailru import MailRuIE +from .makerschannel import MakersChannelIE +from .makertv import MakerTVIE +from .malemotion import MalemotionIE +from .matchtv import MatchTVIE +from .mdr import MDRIE 
+from .metacafe import MetacafeIE +from .metacritic import MetacriticIE +from .mgoon import MgoonIE +from .minhateca import MinhatecaIE +from .ministrygrid import MinistryGridIE +from .minoto import MinotoIE +from .miomio import MioMioIE +from .mit import TechTVMITIE, MITIE, OCWMITIE +from .mitele import MiTeleIE +from .mixcloud import MixcloudIE +from .mlb import MLBIE +from .mnet import MnetIE +from .mpora import MporaIE +from .moevideo import MoeVideoIE +from .mofosex import MofosexIE +from .mojvideo import MojvideoIE +from .moniker import MonikerIE +from .mooshare import MooshareIE +from .morningstar import MorningstarIE +from .motherless import MotherlessIE +from .motorsport import MotorsportIE +from .movieclips import MovieClipsIE +from .moviezine import MoviezineIE +from .mtv import ( + MTVIE, + MTVServicesEmbeddedIE, + MTVIggyIE, + MTVDEIE, +) +from .muenchentv import MuenchenTVIE +from .musicplayon import MusicPlayOnIE +from .muzu import MuzuTVIE +from .mwave import MwaveIE +from .myspace import MySpaceIE, MySpaceAlbumIE +from .myspass import MySpassIE +from .myvi import MyviIE +from .myvideo import MyVideoIE +from .myvidster import MyVidsterIE +from .nationalgeographic import ( + NationalGeographicIE, + NationalGeographicChannelIE, +) +from .naver import NaverIE +from .nba import NBAIE +from .nbc import ( + CSNNEIE, + NBCIE, + NBCNewsIE, + NBCSportsIE, + NBCSportsVPlayerIE, + MSNBCIE, +) +from .ndr import ( + NDRIE, + NJoyIE, + NDREmbedBaseIE, + NDREmbedIE, + NJoyEmbedIE, +) +from .ndtv import NDTVIE +from .netzkino import NetzkinoIE +from .nerdcubed import NerdCubedFeedIE +from .nerdist import NerdistIE +from .neteasemusic import ( + NetEaseMusicIE, + NetEaseMusicAlbumIE, + NetEaseMusicSingerIE, + NetEaseMusicListIE, + NetEaseMusicMvIE, + NetEaseMusicProgramIE, + NetEaseMusicDjRadioIE, +) +from .newgrounds import NewgroundsIE +from .newstube import NewstubeIE +from .nextmedia import ( + NextMediaIE, + NextMediaActionNewsIE, + AppleDailyIE, +) +from 
.nextmovie import NextMovieIE +from .nfb import NFBIE +from .nfl import NFLIE +from .nhl import ( + NHLIE, + NHLNewsIE, + NHLVideocenterIE, +) +from .nick import NickIE +from .niconico import NiconicoIE, NiconicoPlaylistIE +from .ninegag import NineGagIE +from .noco import NocoIE +from .normalboots import NormalbootsIE +from .nosvideo import NosVideoIE +from .nova import NovaIE +from .novamov import ( + AuroraVidIE, + CloudTimeIE, + NowVideoIE, + VideoWeedIE, + WholeCloudIE, +) +from .nowness import ( + NownessIE, + NownessPlaylistIE, + NownessSeriesIE, +) +from .nowtv import ( + NowTVIE, + NowTVListIE, +) +from .noz import NozIE +from .npo import ( + NPOIE, + NPOLiveIE, + NPORadioIE, + NPORadioFragmentIE, + SchoolTVIE, + VPROIE, + WNLIE +) +from .npr import NprIE +from .nrk import ( + NRKIE, + NRKPlaylistIE, + NRKSkoleIE, + NRKTVIE, +) +from .ntvde import NTVDeIE +from .ntvru import NTVRuIE +from .nytimes import ( + NYTimesIE, + NYTimesArticleIE, +) +from .nuvid import NuvidIE +from .odnoklassniki import OdnoklassnikiIE +from .oktoberfesttv import OktoberfestTVIE +from .onionstudios import OnionStudiosIE +from .ooyala import ( + OoyalaIE, + OoyalaExternalIE, +) +from .openload import OpenloadIE +from .ora import OraTVIE +from .orf import ( + ORFTVthekIE, + ORFOE1IE, + ORFFM4IE, + ORFIPTVIE, +) +from .pandoratv import PandoraTVIE +from .parliamentliveuk import ParliamentLiveUKIE +from .patreon import PatreonIE +from .pbs import PBSIE +from .periscope import PeriscopeIE +from .philharmoniedeparis import PhilharmonieDeParisIE +from .phoenix import PhoenixIE +from .photobucket import PhotobucketIE +from .pinkbike import PinkbikeIE +from .planetaplay import PlanetaPlayIE +from .pladform import PladformIE +from .played import PlayedIE +from .playfm import PlayFMIE +from .plays import PlaysTVIE +from .playtvak import PlaytvakIE +from .playvid import PlayvidIE +from .playwire import PlaywireIE +from .pluralsight import ( + PluralsightIE, + PluralsightCourseIE, +) +from 
.podomatic import PodomaticIE +from .porn91 import Porn91IE +from .pornhd import PornHdIE +from .pornhub import ( + PornHubIE, + PornHubPlaylistIE, + PornHubUserVideosIE, +) +from .pornotube import PornotubeIE +from .pornovoisines import PornoVoisinesIE +from .pornoxo import PornoXOIE +from .primesharetv import PrimeShareTVIE +from .promptfile import PromptFileIE +from .prosiebensat1 import ProSiebenSat1IE +from .puls4 import Puls4IE +from .pyvideo import PyvideoIE +from .qqmusic import ( + QQMusicIE, + QQMusicSingerIE, + QQMusicAlbumIE, + QQMusicToplistIE, + QQMusicPlaylistIE, +) +from .quickvid import QuickVidIE +from .r7 import R7IE +from .radiode import RadioDeIE +from .radiojavan import RadioJavanIE +from .radiobremen import RadioBremenIE +from .radiofrance import RadioFranceIE +from .rai import ( + RaiTVIE, + RaiIE, +) +from .rbmaradio import RBMARadioIE +from .rds import RDSIE +from .redtube import RedTubeIE +from .regiotv import RegioTVIE +from .restudy import RestudyIE +from .reverbnation import ReverbNationIE +from .revision3 import Revision3IE +from .rice import RICEIE +from .ringtv import RingTVIE +from .ro220 import Ro220IE +from .rottentomatoes import RottenTomatoesIE +from .roxwel import RoxwelIE +from .rtbf import RTBFIE +from .rte import RteIE, RteRadioIE +from .rtlnl import RtlNlIE +from .rtl2 import RTL2IE +from .rtp import RTPIE +from .rts import RTSIE +from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE +from .rtvnh import RTVNHIE +from .ruhd import RUHDIE +from .ruleporn import RulePornIE +from .rutube import ( + RutubeIE, + RutubeChannelIE, + RutubeEmbedIE, + RutubeMovieIE, + RutubePersonIE, +) +from .rutv import RUTVIE +from .ruutu import RuutuIE +from .sandia import SandiaIE +from .safari import ( + SafariIE, + SafariApiIE, + SafariCourseIE, +) +from .sapo import SapoIE +from .savefrom import SaveFromIE +from .sbs import SBSIE +from .scivee import SciVeeIE +from .screencast import ScreencastIE +from .screencastomatic import 
ScreencastOMaticIE +from .screenjunkies import ScreenJunkiesIE +from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE +from .senateisvp import SenateISVPIE +from .servingsys import ServingSysIE +from .sexu import SexuIE +from .sexykarma import SexyKarmaIE +from .shahid import ShahidIE +from .shared import SharedIE +from .sharesix import ShareSixIE +from .sina import SinaIE +from .skynewsarabia import ( + SkyNewsArabiaIE, + SkyNewsArabiaArticleIE, +) +from .slideshare import SlideshareIE +from .slutload import SlutloadIE +from .smotri import ( + SmotriIE, + SmotriCommunityIE, + SmotriUserIE, + SmotriBroadcastIE, +) +from .snagfilms import ( + SnagFilmsIE, + SnagFilmsEmbedIE, +) +from .snotr import SnotrIE +from .sohu import SohuIE +from .soundcloud import ( + SoundcloudIE, + SoundcloudSetIE, + SoundcloudUserIE, + SoundcloudPlaylistIE, + SoundcloudSearchIE +) +from .soundgasm import ( + SoundgasmIE, + SoundgasmProfileIE +) +from .southpark import ( + SouthParkIE, + SouthParkDeIE, + SouthParkDkIE, + SouthParkEsIE, + SouthParkNlIE +) +from .spankbang import SpankBangIE +from .spankwire import SpankwireIE +from .spiegel import SpiegelIE, SpiegelArticleIE +from .spiegeltv import SpiegeltvIE +from .spike import SpikeIE +from .stitcher import StitcherIE +from .sport5 import Sport5IE +from .sportbox import ( + SportBoxIE, + SportBoxEmbedIE, +) +from .sportdeutschland import SportDeutschlandIE +from .srgssr import ( + SRGSSRIE, + SRGSSRPlayIE, +) +from .srmediathek import SRMediathekIE +from .ssa import SSAIE +from .stanfordoc import StanfordOpenClassroomIE +from .steam import SteamIE +from .streamcloud import StreamcloudIE +from .streamcz import StreamCZIE +from .streetvoice import StreetVoiceIE +from .sunporno import SunPornoIE +from .svt import ( + SVTIE, + SVTPlayIE, +) +from .swrmediathek import SWRMediathekIE +from .syfy import SyfyIE +from .sztvhu import SztvHuIE +from .tagesschau import TagesschauIE +from .tapely import TapelyIE +from .tass import TassIE +from 
.teachertube import ( + TeacherTubeIE, + TeacherTubeUserIE, +) +from .teachingchannel import TeachingChannelIE +from .teamcoco import TeamcocoIE +from .techtalks import TechTalksIE +from .ted import TEDIE +from .tele13 import Tele13IE +from .telebruxelles import TeleBruxellesIE +from .telecinco import TelecincoIE +from .telegraaf import TelegraafIE +from .telemb import TeleMBIE +from .teletask import TeleTaskIE +from .testurl import TestURLIE +from .tf1 import TF1IE +from .theintercept import TheInterceptIE +from .theonion import TheOnionIE +from .theplatform import ( + ThePlatformIE, + ThePlatformFeedIE, +) +from .thescene import TheSceneIE +from .thesixtyone import TheSixtyOneIE +from .thestar import TheStarIE +from .thisamericanlife import ThisAmericanLifeIE +from .thisav import ThisAVIE +from .tinypic import TinyPicIE +from .tlc import TlcDeIE +from .tmz import ( + TMZIE, + TMZArticleIE, +) +from .tnaflix import ( + TNAFlixNetworkEmbedIE, + TNAFlixIE, + EMPFlixIE, + MovieFapIE, +) +from .toggle import ToggleIE +from .thvideo import ( + THVideoIE, + THVideoPlaylistIE +) +from .toutv import TouTvIE +from .toypics import ToypicsUserIE, ToypicsIE +from .traileraddict import TrailerAddictIE +from .trilulilu import TriluliluIE +from .trollvids import TrollvidsIE +from .trutube import TruTubeIE +from .tube8 import Tube8IE +from .tubitv import TubiTvIE +from .tudou import ( + TudouIE, + TudouPlaylistIE, + TudouAlbumIE, +) +from .tumblr import TumblrIE +from .tunein import ( + TuneInClipIE, + TuneInStationIE, + TuneInProgramIE, + TuneInTopicIE, + TuneInShortenerIE, +) +from .turbo import TurboIE +from .tutv import TutvIE +from .tv2 import ( + TV2IE, + TV2ArticleIE, +) +from .tv3 import TV3IE +from .tv4 import TV4IE +from .tvc import ( + TVCIE, + TVCArticleIE, +) +from .tvigle import TvigleIE +from .tvland import TVLandIE +from .tvp import TvpIE, TvpSeriesIE +from .tvplay import TVPlayIE +from .tweakers import TweakersIE +from .twentyfourvideo import TwentyFourVideoIE 
+from .twentymin import TwentyMinutenIE +from .twentytwotracks import ( + TwentyTwoTracksIE, + TwentyTwoTracksGenreIE +) +from .twitch import ( + TwitchVideoIE, + TwitchChapterIE, + TwitchVodIE, + TwitchProfileIE, + TwitchPastBroadcastsIE, + TwitchBookmarksIE, + TwitchStreamIE, +) +from .twitter import ( + TwitterCardIE, + TwitterIE, + TwitterAmplifyIE, +) +from .ubu import UbuIE +from .udemy import ( + UdemyIE, + UdemyCourseIE +) +from .udn import UDNEmbedIE +from .digiteka import DigitekaIE +from .unistra import UnistraIE +from .urort import UrortIE +from .usatoday import USATodayIE +from .ustream import UstreamIE, UstreamChannelIE +from .ustudio import UstudioIE +from .varzesh3 import Varzesh3IE +from .vbox7 import Vbox7IE +from .veehd import VeeHDIE +from .veoh import VeohIE +from .vessel import VesselIE +from .vesti import VestiIE +from .vevo import VevoIE +from .vgtv import ( + BTArticleIE, + BTVestlendingenIE, + VGTVIE, +) +from .vh1 import VH1IE +from .vice import ( + ViceIE, + ViceShowIE, +) +from .viddler import ViddlerIE +from .videodetective import VideoDetectiveIE +from .videofyme import VideofyMeIE +from .videomega import VideoMegaIE +from .videomore import ( + VideomoreIE, + VideomoreVideoIE, + VideomoreSeasonIE, +) +from .videopremium import VideoPremiumIE +from .videott import VideoTtIE +from .vidme import ( + VidmeIE, + VidmeUserIE, + VidmeUserLikesIE, +) +from .vidzi import VidziIE +from .vier import VierIE, VierVideosIE +from .viewster import ViewsterIE +from .viidea import ViideaIE +from .vimeo import ( + VimeoIE, + VimeoAlbumIE, + VimeoChannelIE, + VimeoGroupsIE, + VimeoLikesIE, + VimeoOndemandIE, + VimeoReviewIE, + VimeoUserIE, + VimeoWatchLaterIE, +) +from .vimple import VimpleIE +from .vine import ( + VineIE, + VineUserIE, +) +from .viki import ( + VikiIE, + VikiChannelIE, +) +from .vk import ( + VKIE, + VKUserVideosIE, +) +from .vlive import VLiveIE +from .vodlocker import VodlockerIE +from .voicerepublic import VoiceRepublicIE +from 
.voxmedia import VoxMediaIE +from .vporn import VpornIE +from .vrt import VRTIE +from .vube import VubeIE +from .vuclip import VuClipIE +from .vulture import VultureIE +from .walla import WallaIE +from .washingtonpost import WashingtonPostIE +from .wat import WatIE +from .wayofthemaster import WayOfTheMasterIE +from .wdr import ( + WDRIE, + WDRMobileIE, + WDRMausIE, +) +from .webofstories import ( + WebOfStoriesIE, + WebOfStoriesPlaylistIE, +) +from .weibo import WeiboIE +from .weiqitv import WeiqiTVIE +from .wimp import WimpIE +from .wistia import WistiaIE +from .worldstarhiphop import WorldStarHipHopIE +from .wrzuta import WrzutaIE +from .wsj import WSJIE +from .xbef import XBefIE +from .xboxclips import XboxClipsIE +from .xfileshare import XFileShareIE +from .xhamster import ( + XHamsterIE, + XHamsterEmbedIE, +) +from .xminus import XMinusIE +from .xnxx import XNXXIE +from .xstream import XstreamIE +from .xtube import XTubeUserIE, XTubeIE +from .xuite import XuiteIE +from .xvideos import XVideosIE +from .xxxymovies import XXXYMoviesIE +from .yahoo import ( + YahooIE, + YahooSearchIE, +) +from .yam import YamIE +from .yandexmusic import ( + YandexMusicTrackIE, + YandexMusicAlbumIE, + YandexMusicPlaylistIE, +) +from .yesjapan import YesJapanIE +from .yinyuetai import YinYueTaiIE +from .ynet import YnetIE +from .youjizz import YouJizzIE +from .youku import YoukuIE +from .youporn import YouPornIE +from .yourupload import YourUploadIE +from .youtube import ( + YoutubeIE, + YoutubeChannelIE, + YoutubeFavouritesIE, + YoutubeHistoryIE, + YoutubeLiveIE, + YoutubePlaylistIE, + YoutubePlaylistsIE, + YoutubeRecommendedIE, + YoutubeSearchDateIE, + YoutubeSearchIE, + YoutubeSearchURLIE, + YoutubeShowIE, + YoutubeSubscriptionsIE, + YoutubeTruncatedIDIE, + YoutubeTruncatedURLIE, + YoutubeUserIE, + YoutubeWatchLaterIE, +) +from .zapiks import ZapiksIE +from .zdf import ZDFIE, ZDFChannelIE +from .zingmp3 import ( + ZingMp3SongIE, + ZingMp3AlbumIE, +) +from .zippcast import 
ZippCastIE From 779822d945dc7ebba7062ac9a5e760d21a7f362a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 10 Feb 2016 14:01:31 +0100 Subject: [PATCH 084/116] Add experimental support for lazy loading the info extractors 'make lazy-extractors' creates the youtube_dl/extractor/lazy_extractors.py (imported by youtube_dl/extractor/__init__.py), which contains simplified classes that only have the 'suitable' class method and that load the appropriate class with the '__new__' method when an instance is created. --- .gitignore | 1 + Makefile | 8 +++- devscripts/lazy_load_template.py | 17 ++++++++ devscripts/make_lazy_extractors.py | 63 ++++++++++++++++++++++++++++++ youtube_dl/extractor/__init__.py | 18 +++++---- 5 files changed, 99 insertions(+), 8 deletions(-) create mode 100644 devscripts/lazy_load_template.py create mode 100644 devscripts/make_lazy_extractors.py diff --git a/.gitignore b/.gitignore index 26dbde73d..72c10425d 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,7 @@ README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.fish +youtube_dl/extractor/lazy_extractors.py youtube-dl youtube-dl.exe youtube-dl.tar.gz diff --git a/Makefile b/Makefile index ba7f7ed36..06cffcb71 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites clean: - rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe + rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi 
CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe find . -name "*.pyc" -delete find . -name "*.class" -delete @@ -88,6 +88,12 @@ youtube-dl.fish: youtube_dl/*.py youtube_dl/*/*.py devscripts/fish-completion.in fish-completion: youtube-dl.fish +lazy-extractors: youtube_dl/extractor/lazy_extractors.py + +_EXTRACTOR_FILES != find youtube_dl/extractor -iname '*.py' -and -not -iname 'lazy_extractors.py' +youtube_dl/extractor/lazy_extractors.py: devscripts/make_lazy_extractors.py devscripts/lazy_load_template.py $(_EXTRACTOR_FILES) + $(PYTHON) devscripts/make_lazy_extractors.py $@ + youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish @tar -czf youtube-dl.tar.gz --transform "s|^|youtube-dl/|" --owner 0 --group 0 \ --exclude '*.DS_Store' \ diff --git a/devscripts/lazy_load_template.py b/devscripts/lazy_load_template.py new file mode 100644 index 000000000..ae2bd2701 --- /dev/null +++ b/devscripts/lazy_load_template.py @@ -0,0 +1,17 @@ +# flake8: noqa +from __future__ import unicode_literals + +import re + + +class LazyLoadExtractor(object): + _module = None + + @classmethod + def ie_key(cls): + return cls.__name__[:-2] + + def __new__(cls): + mod = __import__(cls._module, fromlist=(cls.__name__,)) + real_cls = getattr(mod, cls.__name__) + return real_cls.__new__(real_cls) diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py new file mode 100644 index 000000000..8627d0b1c --- /dev/null +++ b/devscripts/make_lazy_extractors.py @@ -0,0 +1,63 @@ +from __future__ import unicode_literals, print_function + +from inspect import getsource +import os +from os.path import dirname as dirn +import sys + +print('WARNING: Lazy loading extractors is an experimental feature that may not always work', file=sys.stderr) + +sys.path.insert(0, dirn(dirn((os.path.abspath(__file__))))) + +lazy_extractors_filename = sys.argv[1] +if os.path.exists(lazy_extractors_filename): + 
os.remove(lazy_extractors_filename) + +from youtube_dl.extractor import _ALL_CLASSES +from youtube_dl.extractor.common import InfoExtractor + +with open('devscripts/lazy_load_template.py', 'rt') as f: + module_template = f.read() + +module_contents = [module_template + '\n' + getsource(InfoExtractor.suitable)] + +ie_template = ''' +class {name}(LazyLoadExtractor): + _VALID_URL = {valid_url!r} + _module = '{module}' +''' + +make_valid_template = ''' + @classmethod + def _make_valid_url(cls): + return {!r} +''' + + +def build_lazy_ie(ie, name): + valid_url = getattr(ie, '_VALID_URL', None) + s = ie_template.format( + name=name, + valid_url=valid_url, + module=ie.__module__) + if ie.suitable.__func__ is not InfoExtractor.suitable.__func__: + s += getsource(ie.suitable) + if hasattr(ie, '_make_valid_url'): + # search extractors + s += make_valid_template.format(ie._make_valid_url()) + return s + +names = [] +for ie in _ALL_CLASSES: + name = ie.ie_key() + 'IE' + src = build_lazy_ie(ie, name) + module_contents.append(src) + names.append(name) + +module_contents.append( + '_ALL_CLASSES = [{}]'.format(', '.join(names))) + +module_src = '\n'.join(module_contents) + +with open(lazy_extractors_filename, 'wt') as f: + f.write(module_src) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a0a53445a..b0d4d156b 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -1,13 +1,17 @@ from __future__ import unicode_literals -from .extractors import * +try: + from .lazy_extractors import * + from .lazy_extractors import _ALL_CLASSES +except ImportError: + from .extractors import * -_ALL_CLASSES = [ - klass - for name, klass in globals().items() - if name.endswith('IE') and name != 'GenericIE' -] -_ALL_CLASSES.append(GenericIE) + _ALL_CLASSES = [ + klass + for name, klass in globals().items() + if name.endswith('IE') and name != 'GenericIE' + ] + _ALL_CLASSES.append(GenericIE) def gen_extractor_classes(): From 
0d778b1db909c8d096be4e199384fff96a722fc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 11 Feb 2016 14:49:02 +0100 Subject: [PATCH 085/116] lazy extractors: specify the encoding When building with python3 the unicode characters are not escaped, python2 needs to know the encoding. --- devscripts/lazy_load_template.py | 1 + 1 file changed, 1 insertion(+) diff --git a/devscripts/lazy_load_template.py b/devscripts/lazy_load_template.py index ae2bd2701..563d629f8 100644 --- a/devscripts/lazy_load_template.py +++ b/devscripts/lazy_load_template.py @@ -1,3 +1,4 @@ +# encoding: utf-8 # flake8: noqa from __future__ import unicode_literals From c1ce6acdd73da7744f4bbe27698e96275467e14d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 21 Feb 2016 11:53:48 +0100 Subject: [PATCH 086/116] lazy extractors: Fix building with python2.6 --- devscripts/make_lazy_extractors.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index 8627d0b1c..5d0ddb401 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -30,7 +30,7 @@ class {name}(LazyLoadExtractor): make_valid_template = ''' @classmethod def _make_valid_url(cls): - return {!r} + return {valid_url!r} ''' @@ -44,7 +44,7 @@ def build_lazy_ie(ie, name): s += getsource(ie.suitable) if hasattr(ie, '_make_valid_url'): # search extractors - s += make_valid_template.format(ie._make_valid_url()) + s += make_valid_template.format(valid_url=ie._make_valid_url()) return s names = [] @@ -55,7 +55,7 @@ for ie in _ALL_CLASSES: names.append(name) module_contents.append( - '_ALL_CLASSES = [{}]'.format(', '.join(names))) + '_ALL_CLASSES = [{0}]'.format(', '.join(names))) module_src = '\n'.join(module_contents) From 6b97ca96fc242c1d7639d080e2c8e3ee9f9d0bed Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 21 Feb 2016 12:22:12 +0100 Subject: [PATCH 087/116] lazy extractors: Style fixes * Sort extractors alphabetically * Add newlines when needed (youtube_dl/extractors/lazy_extractors.py pass the flake8 test now) --- devscripts/lazy_load_template.py | 1 - devscripts/make_lazy_extractors.py | 6 +++--- setup.cfg | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/devscripts/lazy_load_template.py b/devscripts/lazy_load_template.py index 563d629f8..b984aab9b 100644 --- a/devscripts/lazy_load_template.py +++ b/devscripts/lazy_load_template.py @@ -1,5 +1,4 @@ # encoding: utf-8 -# flake8: noqa from __future__ import unicode_literals import re diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index 5d0ddb401..b5a8b9190 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -41,14 +41,14 @@ def build_lazy_ie(ie, name): valid_url=valid_url, module=ie.__module__) if ie.suitable.__func__ is not InfoExtractor.suitable.__func__: - s += getsource(ie.suitable) + s += '\n' + getsource(ie.suitable) if hasattr(ie, '_make_valid_url'): # search extractors s += make_valid_template.format(valid_url=ie._make_valid_url()) return s names = [] -for ie in _ALL_CLASSES: +for ie in list(sorted(_ALL_CLASSES[:-1], key=lambda cls: cls.ie_key())) + _ALL_CLASSES[-1:]: name = ie.ie_key() + 'IE' src = build_lazy_ie(ie, name) module_contents.append(src) @@ -57,7 +57,7 @@ for ie in _ALL_CLASSES: module_contents.append( '_ALL_CLASSES = [{0}]'.format(', '.join(names))) -module_src = '\n'.join(module_contents) +module_src = '\n'.join(module_contents) + '\n' with open(lazy_extractors_filename, 'wt') as f: f.write(module_src) diff --git a/setup.cfg b/setup.cfg index 5760112d4..2dc06ffe4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,5 +2,5 @@ universal = True [flake8] -exclude = 
youtube_dl/extractor/__init__.py,devscripts/buildserver.py,devscripts/make_issue_template.py,setup.py,build,.git +exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,devscripts/lazy_load_template.py,devscripts/make_issue_template.py,setup.py,build,.git ignore = E402,E501,E731 From e0986e31cfd57392aaf3cc84b17fbf32c6134ff6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 21 Feb 2016 12:28:58 +0100 Subject: [PATCH 088/116] lazy extractors: Output if it's enabled in the verbose log --- youtube_dl/YoutubeDL.py | 4 +++- youtube_dl/extractor/__init__.py | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index f18a8e840..a89a71a25 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -82,7 +82,7 @@ from .utils import ( YoutubeDLHandler, ) from .cache import Cache -from .extractor import get_info_extractor, gen_extractor_classes +from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER from .downloader import get_suitable_downloader from .downloader.rtmp import rtmpdump_version from .postprocessor import ( @@ -1959,6 +1959,8 @@ class YoutubeDL(object): write_string(encoding_str, encoding=None) self._write_string('[debug] youtube-dl version ' + __version__ + '\n') + if _LAZY_LOADER: + self._write_string('[debug] Lazy loading extractors enabled' + '\n') try: sp = subprocess.Popen( ['git', 'rev-parse', '--short', 'HEAD'], diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index b0d4d156b..18d8dbcd6 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -3,7 +3,9 @@ from __future__ import unicode_literals try: from .lazy_extractors import * from .lazy_extractors import _ALL_CLASSES + _LAZY_LOADER = True except ImportError: + _LAZY_LOADER = False from .extractors import * _ALL_CLASSES = [ From 8a5dc1c1e14cc19f143c84702f4bbc29e4f91e47 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 21 Feb 2016 12:46:14 +0100 Subject: [PATCH 089/116] lazy extractors: Initialize the real info extractor According to the docs '__init__' is only called automatically if '__new__' returns an instance of the original class. --- devscripts/lazy_load_template.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/devscripts/lazy_load_template.py b/devscripts/lazy_load_template.py index b984aab9b..2e6e6641b 100644 --- a/devscripts/lazy_load_template.py +++ b/devscripts/lazy_load_template.py @@ -11,7 +11,9 @@ class LazyLoadExtractor(object): def ie_key(cls): return cls.__name__[:-2] - def __new__(cls): + def __new__(cls, *args, **kwargs): mod = __import__(cls._module, fromlist=(cls.__name__,)) real_cls = getattr(mod, cls.__name__) - return real_cls.__new__(real_cls) + instance = real_cls.__new__(real_cls) + instance.__init__(*args, **kwargs) + return instance From 5a9858bfa9aba01c9dec549b83f5a0b17a520f13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 6 Mar 2016 19:36:39 +0100 Subject: [PATCH 090/116] setup.py: add command for building the lazy_extractors module --- setup.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index bfe931f5b..9444d403d 100644 --- a/setup.py +++ b/setup.py @@ -8,11 +8,12 @@ import warnings import sys try: - from setuptools import setup + from setuptools import setup, Command setuptools_available = True except ImportError: - from distutils.core import setup + from distutils.core import setup, Command setuptools_available = False +from distutils.spawn import spawn try: # This will create an exe that needs Microsoft Visual C++ 2008 @@ -70,6 +71,22 @@ else: else: params['scripts'] = ['bin/youtube-dl'] +class build_lazy_extractors(Command): + description = "Build the extractor lazy loading module" + user_options = [] + + def 
initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + spawn( + [sys.executable, 'devscripts/make_lazy_extractors.py', 'youtube_dl/extractor/lazy_extractors.py'], + dry_run=self.dry_run, + ) + # Get the version from youtube_dl/version.py without importing the package exec(compile(open('youtube_dl/version.py').read(), 'youtube_dl/version.py', 'exec')) @@ -107,5 +124,6 @@ setup( "Programming Language :: Python :: 3.4", ], + cmdclass={'build_lazy_extractors': build_lazy_extractors}, **params ) From bffb245a4882b10b5e66015fa89ef1cadf974415 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 9 Apr 2016 10:47:46 +0100 Subject: [PATCH 091/116] [aol] add support for videos with vidible IDs(closes #9124) --- youtube_dl/extractor/aol.py | 78 +++++++++++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index 95a99c6b0..b729157d2 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -1,11 +1,17 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, +) class AolIE(InfoExtractor): IE_NAME = 'on.aol.com' - _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/video/.*-)(?P[0-9]+)(?:$|\?)' + _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/video/.*-)(?P[^/?-]+)' _TESTS = [{ 'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img', @@ -14,13 +20,79 @@ class AolIE(InfoExtractor): 'id': '518167793', 'ext': 'mp4', 'title': 'U.S. 
Official Warns Of \'Largest Ever\' IRS Phone Scam', + 'description': 'A major phone scam has cost thousands of taxpayers more than $1 million, with less than a month until income tax returns are due to the IRS.', + 'timestamp': 1395405060, + 'upload_date': '20140321', + 'uploader': 'Newsy Studio', }, - 'add_ie': ['FiveMin'], + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'http://on.aol.com/video/netflix-is-raising-rates-5707d6b8e4b090497b04f706?context=PC:homepage:PL1944:1460189336183', + 'info_dict': { + 'id': '5707d6b8e4b090497b04f706', + 'ext': 'mp4', + 'title': 'Netflix is Raising Rates', + 'description': 'Netflix is rewarding millions of it’s long-standing members with an increase in cost. Veuer’s Carly Figueroa has more.', + 'upload_date': '20160408', + 'timestamp': 1460123280, + 'uploader': 'Veuer', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } }] def _real_extract(self, url): video_id = self._match_id(url) - return self.url_result('5min:%s' % video_id) + + response = self._download_json( + 'https://feedapi.b2c.on.aol.com/v1.0/app/videos/aolon/%s/details' % video_id, + video_id)['response'] + if response['statusText'] != 'Ok': + raise ExtractorError('%s said: %s' % (self.IE_NAME, response['statusText']), expected=True) + + video_data = response['data'] + formats = [] + m3u8_url = video_data.get('videoMasterPlaylist') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + for rendition in video_data.get('renditions', []): + video_url = rendition.get('url') + if not video_url: + continue + ext = rendition.get('format') + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + else: + f = { + 'url': video_url, + 'format_id': rendition.get('quality'), + } + mobj = re.search(r'(\d+)x(\d+)', video_url) + if mobj: + f.update({ + 'width': int(mobj.group(1)), + 'height': 
int(mobj.group(2)), + }) + formats.append(f) + self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id')) + + return { + 'id': video_id, + 'title': video_data['title'], + 'duration': int_or_none(video_data.get('duration')), + 'timestamp': int_or_none(video_data.get('publishDate')), + 'view_count': int_or_none(video_data.get('views')), + 'description': video_data.get('description'), + 'uploader': video_data.get('videoOwner'), + 'formats': formats, + } class AolFeaturesIE(InfoExtractor): From cacd9966624883523b264fa9ac48138074597730 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 9 Apr 2016 19:27:54 +0800 Subject: [PATCH 092/116] [utils] Don't touch URLs if not necessary Fix test_Generic_15 (Google redirect) --- youtube_dl/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 8e53962c9..999dfabb5 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1792,6 +1792,8 @@ def urlencode_postdata(*args, **kargs): def update_url_query(url, query): + if not query: + return url parsed_url = compat_urlparse.urlparse(url) qs = compat_parse_qs(parsed_url.query) qs.update(query) From 92c7f3157aad87096aa1fdd1a4daed3bdf262178 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 9 Apr 2016 17:32:23 +0600 Subject: [PATCH 093/116] [aol] Add coding cookie --- youtube_dl/extractor/aol.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index b729157d2..d4801a25b 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re From ab481b48e536dd2e03d6022abb7f4d1593294721 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 9 Apr 2016 20:12:11 +0800 Subject: [PATCH 094/116] [funnyordie] Relax M3U8 URL matching Also, m3u8_url extraction should be fatal as all formats depends directly or indirectly on it. 
This change fixes test_Generic_26 and TestFunnyOrDieSubtitles --- youtube_dl/extractor/funnyordie.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 4c4a87e2a..8c5ffc9e8 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -46,8 +46,8 @@ class FunnyOrDieIE(InfoExtractor): links.sort(key=lambda link: 1 if link[1] == 'mp4' else 0) m3u8_url = self._search_regex( - r']+src=(["\'])(?P.+?/master\.m3u8)\1', - webpage, 'm3u8 url', default=None, group='url') + r']+src=(["\'])(?P.+?/master\.m3u8[^"\']*)\1', + webpage, 'm3u8 url', group='url') formats = [] From bfe96d7bea7c5227456bf1aecca51907c8f30c51 Mon Sep 17 00:00:00 2001 From: Philip Huppert Date: Fri, 9 Oct 2015 18:38:11 +0200 Subject: [PATCH 095/116] [presstv] Added extractor PressTV. Fixes #7060 --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/presstv.py | 80 ++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 youtube_dl/extractor/presstv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index de29c7956..c2fa83918 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -583,6 +583,7 @@ from .pornhub import ( from .pornotube import PornotubeIE from .pornovoisines import PornoVoisinesIE from .pornoxo import PornoXOIE +from .presstv import PressTVIE from .primesharetv import PrimeShareTVIE from .promptfile import PromptFileIE from .prosiebensat1 import ProSiebenSat1IE diff --git a/youtube_dl/extractor/presstv.py b/youtube_dl/extractor/presstv.py new file mode 100644 index 000000000..724d8b1c4 --- /dev/null +++ b/youtube_dl/extractor/presstv.py @@ -0,0 +1,80 @@ +# coding: utf-8 +from __future__ import unicode_literals +import re + +from .common import InfoExtractor +from ..utils import str_to_int + + +class PressTVIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:www\.)?presstv\.ir/Video/(?P[0-9]+)/(?P[0-9]+)/(?P[0-9]+)/(?P[0-9]+)/' + + _TEST = { + 'url': 'http://www.presstv.ir/Video/2015/10/04/431915/Max-Igan-Press-TV-Face-to-Face', + 'md5': 'e95736ac75088b5f1e5bbb68f248f90d', + 'info_dict': { + 'id': '431915', + 'ext': 'mp4', + 'title': 'Press TV’s full interview with Max Igan', + 'upload_date': '20151004', + 'thumbnail': 'http://217.218.67.233/photo/20151004/d5c333ad-98f9-4bd3-bc3e-a1ad6a192803.jpg', + 'description': ('Watch Press TV’s full interview with Max Igan, a radio talk show host and political ' + 'commentator.\nThe interview, conducted on Press TV’s Face ' + 'to Face program, was aired on October 3, 2015.') + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # extract video URL from webpage + video_url = self._html_search_regex(r'', webpage, + 'Video URL') + + # build list of available formats + # specified in http://www.presstv.ir/Scripts/playback.js + base_url = 'http://192.99.219.222:82/presstv' + formats = [ + { + 'url': base_url + video_url, + 'format': '1080p mp4', + 'format_id': '1080p' + }, { + 'url': base_url + video_url.replace(".mp4", "_low800.mp4"), + 'format': '720p mp4', + 'format_id': '720p' + }, { + 'url': base_url + video_url.replace(".mp4", "_low400.mp4"), + 'format': '360p mp4', + 'format_id': '360p' + }, { + 'url': base_url + video_url.replace(".mp4", "_low200.mp4"), + 'format': '180p mp4', + 'format_id': '180p' + } + ] + formats.reverse() + + # extract video metadata + title = self._html_search_meta('title', webpage, 'Title', True) + title = title.partition(' - ')[2] + + description = self._html_search_regex(r'
(.*?)
', webpage, + 'Description', flags=re.DOTALL) + + thumbnail = self._html_search_meta('og:image', webpage, 'Thumbnail', True) + + year = str_to_int(self._search_regex(PressTVIE._VALID_URL, url, 'Upload year', group='y')) + month = str_to_int(self._search_regex(PressTVIE._VALID_URL, url, 'Upload month', group='m')) + day = str_to_int(self._search_regex(PressTVIE._VALID_URL, url, 'Upload day', group='d')) + upload_date = '%04d%02d%02d' % (year, month, day) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'description': description + } From c05025fdd79993314e20a6074aed084889199e50 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 9 Apr 2016 21:46:51 +0800 Subject: [PATCH 096/116] [internetvideoarchive] Fix extraction and support json URLs --- youtube_dl/extractor/internetvideoarchive.py | 118 +++++++++---------- 1 file changed, 58 insertions(+), 60 deletions(-) diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index e60145b3d..45add007f 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -1,93 +1,91 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import ( + compat_parse_qs, compat_urlparse, - compat_urllib_parse_urlencode, ) from ..utils import ( - xpath_with_ns, + determine_ext, + int_or_none, + xpath_text, ) class InternetVideoArchiveIE(InfoExtractor): - _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?' + _VALID_URL = r'https?://video\.internetvideoarchive\.net/(?:player|flash/players)/.*?\?.*?publishedid.*?' 
_TEST = { - 'url': 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247', + 'url': 'http://video.internetvideoarchive.net/player/6/configuration.ashx?customerid=69249&publishedid=194487&reporttag=vdbetatitle&playerid=641&autolist=0&domain=www.videodetective.com&maxrate=high&minrate=low&socialplayer=false', 'info_dict': { - 'id': '452693', + 'id': '194487', 'ext': 'mp4', - 'title': 'SKYFALL', - 'description': 'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.', - 'duration': 152, + 'title': 'KICK-ASS 2', + 'description': 'md5:c189d5b7280400630a1d3dd17eaa8d8a', + }, + 'params': { + # m3u8 download + 'skip_download': True, }, } @staticmethod - def _build_url(query): - return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query + def _build_json_url(query): + return 'http://video.internetvideoarchive.net/player/6/configuration.ashx?' + query @staticmethod - def _clean_query(query): - NEEDED_ARGS = ['publishedid', 'customerid'] - query_dic = compat_urlparse.parse_qs(query) - cleaned_dic = dict((k, v[0]) for (k, v) in query_dic.items() if k in NEEDED_ARGS) - # Other player ids return m3u8 urls - cleaned_dic['playerid'] = '247' - cleaned_dic['videokbrate'] = '100000' - return compat_urllib_parse_urlencode(cleaned_dic) + def _build_xml_url(query): + return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' 
+ query def _real_extract(self, url): query = compat_urlparse.urlparse(url).query - query_dic = compat_urlparse.parse_qs(query) + query_dic = compat_parse_qs(query) video_id = query_dic['publishedid'][0] - url = self._build_url(query) - flashconfiguration = self._download_xml(url, video_id, - 'Downloading flash configuration') - file_url = flashconfiguration.find('file').text - file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx') - # Replace some of the parameters in the query to get the best quality - # and http links (no m3u8 manifests) - file_url = re.sub(r'(?<=\?)(.+)$', - lambda m: self._clean_query(m.group()), - file_url) - info = self._download_xml(file_url, video_id, - 'Downloading video info') - item = info.find('channel/item') + if '/player/' in url: + configuration = self._download_json(url, video_id) - def _bp(p): - return xpath_with_ns( - p, - { - 'media': 'http://search.yahoo.com/mrss/', - 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats', - } - ) - formats = [] - for content in item.findall(_bp('media:group/media:content')): - attr = content.attrib - f_url = attr['url'] - width = int(attr['width']) - bitrate = int(attr['bitrate']) - format_id = '%d-%dk' % (width, bitrate) - formats.append({ - 'format_id': format_id, - 'url': f_url, - 'width': width, - 'tbr': bitrate, - }) + # There are multiple videos in the playlist whlie only the first one + # matches the video played in browsers + video_info = configuration['playlist'][0] - self._sort_formats(formats) + formats = [] + for source in video_info['sources']: + file_url = source['file'] + if determine_ext(file_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + file_url, video_id, ext='mp4', m3u8_id='hls')) + else: + a_format = { + 'url': file_url, + } + + if source.get('label') and source['label'][-4:] == ' kbs': + tbr = int_or_none(source['label'][:-4]) + a_format.update({ + 'tbr': tbr, + 'format_id': 'http-%d' % tbr, + }) + formats.append(a_format) + + 
self._sort_formats(formats) + + title = video_info['title'] + description = video_info.get('description') + thumbnail = video_info.get('image') + else: + configuration = self._download_xml(url, video_id) + formats = [{ + 'url': xpath_text(configuration, './file', 'file URL', fatal=True), + }] + thumbnail = xpath_text(configuration, './image', 'thumbnail') + title = 'InternetVideoArchive video %s' % video_id + description = None return { 'id': video_id, - 'title': item.find('title').text, + 'title': title, 'formats': formats, - 'thumbnail': item.find(_bp('media:thumbnail')).attrib['url'], - 'description': item.find('description').text, - 'duration': int(attr['duration']), + 'thumbnail': thumbnail, + 'description': description, } From dae2a058de81e42d73bdbe0041a598262703c352 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 9 Apr 2016 21:47:12 +0800 Subject: [PATCH 097/116] [rottentomatoes] Adapt to InternetVideoArchiveIE --- youtube_dl/extractor/rottentomatoes.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/rottentomatoes.py b/youtube_dl/extractor/rottentomatoes.py index e8bb20a08..f9cd48790 100644 --- a/youtube_dl/extractor/rottentomatoes.py +++ b/youtube_dl/extractor/rottentomatoes.py @@ -1,11 +1,11 @@ from __future__ import unicode_literals -from .videodetective import VideoDetectiveIE +from .common import InfoExtractor +from ..compat import compat_urlparse +from .internetvideoarchive import InternetVideoArchiveIE -# It just uses the same method as videodetective.com, -# the internetvideoarchive.com is extracted from the og:video property -class RottenTomatoesIE(VideoDetectiveIE): +class RottenTomatoesIE(InfoExtractor): _VALID_URL = r'https?://www\.rottentomatoes\.com/m/[^/]+/trailers/(?P\d+)' _TEST = { @@ -13,7 +13,19 @@ class RottenTomatoesIE(VideoDetectiveIE): 'info_dict': { 'id': '613340', 'ext': 'mp4', - 'title': 'TOY STORY 3', - 'description': 'From the creators of the beloved TOY 
STORY films, comes a story that will reunite the gang in a whole new way.', + 'title': 'Toy Story 3', }, } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + og_video = self._og_search_video_url(webpage) + query = compat_urlparse.urlparse(og_video).query + + return { + '_type': 'url_transparent', + 'url': InternetVideoArchiveIE._build_xml_url(query), + 'ie_key': InternetVideoArchiveIE.ie_key(), + 'title': self._og_search_title(webpage), + } From c991106706c05401090bcba79e65feae5c7e3fda Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 9 Apr 2016 21:47:35 +0800 Subject: [PATCH 098/116] [videodetective] Adapt to InternetVideoArchiveIE --- youtube_dl/extractor/videodetective.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py index 0ffc7ff7d..2ed5d9643 100644 --- a/youtube_dl/extractor/videodetective.py +++ b/youtube_dl/extractor/videodetective.py @@ -14,8 +14,11 @@ class VideoDetectiveIE(InfoExtractor): 'id': '194487', 'ext': 'mp4', 'title': 'KICK-ASS 2', - 'description': 'md5:65ba37ad619165afac7d432eaded6013', - 'duration': 138, + 'description': 'md5:c189d5b7280400630a1d3dd17eaa8d8a', + }, + 'params': { + # m3u8 download + 'skip_download': True, }, } @@ -24,4 +27,4 @@ class VideoDetectiveIE(InfoExtractor): webpage = self._download_webpage(url, video_id) og_video = self._og_search_video_url(webpage) query = compat_urlparse.urlparse(og_video).query - return self.url_result(InternetVideoArchiveIE._build_url(query), ie=InternetVideoArchiveIE.ie_key()) + return self.url_result(InternetVideoArchiveIE._build_json_url(query), ie=InternetVideoArchiveIE.ie_key()) From 6c4c7539f222cd9e80dfae0b1c9dabbd45d1b3dc Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 9 Apr 2016 22:04:48 +0800 Subject: [PATCH 099/116] [test/helper] Check got values to be strings for md5: fields Seen in PBSIE tests --- 
test/helper.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/helper.py b/test/helper.py index f2d878212..b8e22c5cb 100644 --- a/test/helper.py +++ b/test/helper.py @@ -143,6 +143,9 @@ def expect_value(self, got, expected, field): expect_value(self, item_got, item_expected, field) else: if isinstance(expected, compat_str) and expected.startswith('md5:'): + self.assertTrue( + isinstance(got, compat_str), + 'Expected field %s to be a unicode object, but got value %r of type %r' % (field, got, type(got))) got = 'md5:' + md5(got) elif isinstance(expected, compat_str) and expected.startswith('mincount:'): self.assertTrue( From 95153a960d098d75e6100e38e77fdaa32f5267a2 Mon Sep 17 00:00:00 2001 From: Philip Huppert Date: Sat, 9 Apr 2016 16:14:05 +0200 Subject: [PATCH 100/116] [presstv] updated extractor and tests to work with current PressTV website --- youtube_dl/extractor/presstv.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/presstv.py b/youtube_dl/extractor/presstv.py index 724d8b1c4..9af6780c1 100644 --- a/youtube_dl/extractor/presstv.py +++ b/youtube_dl/extractor/presstv.py @@ -7,20 +7,20 @@ from ..utils import str_to_int class PressTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?presstv\.ir/Video/(?P[0-9]+)/(?P[0-9]+)/(?P[0-9]+)/(?P[0-9]+)/' + _VALID_URL = r'https?://(?:www\.)?presstv\.ir/[^/]+/(?P[0-9]+)/(?P[0-9]+)/(?P[0-9]+)/(?P[0-9]+)/' _TEST = { - 'url': 'http://www.presstv.ir/Video/2015/10/04/431915/Max-Igan-Press-TV-Face-to-Face', - 'md5': 'e95736ac75088b5f1e5bbb68f248f90d', + 'url': 'http://www.presstv.ir/Detail/2016/04/09/459911/Australian-sewerage-treatment-facility-/', + 'md5': '5d7e3195a447cb13e9267e931d8dd5a5', 'info_dict': { - 'id': '431915', + 'id': '459911', 'ext': 'mp4', - 'title': 'Press TV’s full interview with Max Igan', - 'upload_date': '20151004', - 'thumbnail': 'http://217.218.67.233/photo/20151004/d5c333ad-98f9-4bd3-bc3e-a1ad6a192803.jpg', - 
'description': ('Watch Press TV’s full interview with Max Igan, a radio talk show host and political ' - 'commentator.\nThe interview, conducted on Press TV’s Face ' - 'to Face program, was aired on October 3, 2015.') + 'title': 'Organic mattresses used to clean waste water', + 'upload_date': '20160409', + 'thumbnail': 'http://media.presstv.com/photo/20160409/41719129-76fa-4372-a09d-bf348278eb5d.jpg', + 'description': ('A trial program at an Australian sewerage treatment facility hopes to change ' + 'the way waste water is treated by using plant mattresses to reduce chemical ' + 'and electricity use.') } } @@ -58,12 +58,10 @@ class PressTVIE(InfoExtractor): # extract video metadata title = self._html_search_meta('title', webpage, 'Title', True) - title = title.partition(' - ')[2] - - description = self._html_search_regex(r'
(.*?)
', webpage, - 'Description', flags=re.DOTALL) + title = title.partition('-')[2].strip() thumbnail = self._html_search_meta('og:image', webpage, 'Thumbnail', True) + description = self._html_search_meta('og:description', webpage, 'Description', True) year = str_to_int(self._search_regex(PressTVIE._VALID_URL, url, 'Upload year', group='y')) month = str_to_int(self._search_regex(PressTVIE._VALID_URL, url, 'Upload month', group='m')) From eb9c3edd5ec970abb349bd4c71040b75e9d19e0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 9 Apr 2016 22:40:05 +0200 Subject: [PATCH 101/116] [test/utils] Add test for date_from_str --- test/test_utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index a35debfe1..0f36bb9f0 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -20,6 +20,7 @@ from youtube_dl.utils import ( args_to_str, encode_base_n, clean_html, + date_from_str, DateRange, detect_exe_version, determine_ext, @@ -234,6 +235,13 @@ class TestUtil(unittest.TestCase): self.assertEqual(unescapeHTML('é'), 'é') self.assertEqual(unescapeHTML('�'), '�') + def test_date_from_str(self): + self.assertEqual(date_from_str('yesterday'), date_from_str('now-1day')) + self.assertEqual(date_from_str('now+7day'), date_from_str('now+1week')) + self.assertEqual(date_from_str('now+14day'), date_from_str('now+2week')) + self.assertEqual(date_from_str('now+365day'), date_from_str('now+1year')) + self.assertEqual(date_from_str('now+30day'), date_from_str('now+1month')) + def test_daterange(self): _20century = DateRange("19000101", "20000101") self.assertFalse("17890714" in _20century) From 61dd350a04a77abe86e46cfe8b7603514e8f2ca0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 Apr 2016 03:02:35 +0600 Subject: [PATCH 102/116] [1tv] Fix extraction (Closes #9103) --- youtube_dl/extractor/firsttv.py | 145 ++++++++++++++++++++++---------- 1 file changed, 100 
insertions(+), 45 deletions(-) diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py index 98b165143..88bca1007 100644 --- a/youtube_dl/extractor/firsttv.py +++ b/youtube_dl/extractor/firsttv.py @@ -2,78 +2,133 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none +from ..compat import compat_xpath +from ..utils import ( + int_or_none, + qualities, + unified_strdate, + xpath_attr, + xpath_element, + xpath_text, + xpath_with_ns, +) class FirstTVIE(InfoExtractor): IE_NAME = '1tv' IE_DESC = 'Первый канал' - _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P.+)' + _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+p?(?P\d+)' _TESTS = [{ - 'url': 'http://www.1tv.ru/videoarchive/73390', - 'md5': '777f525feeec4806130f4f764bc18a4f', - 'info_dict': { - 'id': '73390', - 'ext': 'mp4', - 'title': 'Олимпийские канатные дороги', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', - 'duration': 149, - 'like_count': int, - 'dislike_count': int, - }, - 'skip': 'Only works from Russia', - }, { + # single format via video_materials.json API 'url': 'http://www.1tv.ru/prj/inprivate/vypusk/35930', - 'md5': 'a1b6b60d530ebcf8daacf4565762bbaf', + 'md5': '82a2777648acae812d58b3f5bd42882b', 'info_dict': { 'id': '35930', 'ext': 'mp4', - 'title': 'Наедине со всеми. Людмила Сенчина', - 'description': 'md5:89553aed1d641416001fe8d450f06cb9', + 'title': 'Гость Людмила Сенчина. Наедине со всеми. Выпуск от 12.02.2015', + 'description': 'md5:357933adeede13b202c7c21f91b871b2', 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', + 'upload_date': '20150212', 'duration': 2694, }, - 'skip': 'Only works from Russia', + }, { + # multiple formats via video_materials.json API + 'url': 'http://www.1tv.ru/video_archive/projects/dobroeutro/p113641', + 'info_dict': { + 'id': '113641', + 'ext': 'mp4', + 'title': 'Весенняя аллергия. Доброе утро. 
Фрагмент выпуска от 07.04.2016', + 'description': 'md5:8dcebb3dded0ff20fade39087fd1fee2', + 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', + 'upload_date': '20160407', + 'duration': 179, + 'formats': 'mincount:3', + }, + 'params': { + 'skip_download': True, + }, + }, { + # single format only available via ONE_ONLINE_VIDEOS.archive_single_xml API + 'url': 'http://www.1tv.ru/video_archive/series/f7552/p47038', + 'md5': '519d306c5b5669761fd8906c39dbee23', + 'info_dict': { + 'id': '47038', + 'ext': 'mp4', + 'title': '"Побег". Второй сезон. 3 серия', + 'description': 'md5:3abf8f6b9bce88201c33e9a3d794a00b', + 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', + 'upload_date': '20120516', + 'duration': 3080, + }, + }, { + 'url': 'http://www.1tv.ru/videoarchive/9967', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id, 'Downloading page') + # Videos with multiple formats only available via this API + video = self._download_json( + 'http://www.1tv.ru/video_materials.json?legacy_id=%s' % video_id, + video_id, fatal=False) - video_url = self._html_search_regex( - r'''(?s)(?:jwplayer\('flashvideoportal_1'\)\.setup\({|var\s+playlistObj\s*=).*?'file'\s*:\s*'([^']+)'.*?}\);''', - webpage, 'video URL') + description, thumbnail, upload_date, duration = [None] * 4 - title = self._html_search_regex( - [r'
\s*

([^<]*)', - r"'title'\s*:\s*'([^']+)'"], webpage, 'title') - description = self._html_search_regex( - r'
\s*
 
\s*

([^<]*)

', - webpage, 'description', default=None) or self._html_search_meta( + if video: + item = video[0] + title = item['title'] + quality = qualities(('ld', 'sd', 'hd', )) + formats = [{ + 'url': f['src'], + 'format_id': f.get('name'), + 'quality': quality(f.get('name')), + } for f in item['mbr'] if f.get('src')] + thumbnail = item.get('poster') + else: + # Some videos are not available via video_materials.json + video = self._download_xml( + 'http://www.1tv.ru/owa/win/ONE_ONLINE_VIDEOS.archive_single_xml?pid=%s' % video_id, + video_id) + + NS_MAP = { + 'media': 'http://search.yahoo.com/mrss/', + } + + item = xpath_element(video, './channel/item', fatal=True) + title = xpath_text(item, './title', fatal=True) + formats = [{ + 'url': content.attrib['url'], + } for content in item.findall( + compat_xpath(xpath_with_ns('./media:content', NS_MAP))) if content.attrib.get('url')] + thumbnail = xpath_attr( + item, xpath_with_ns('./media:thumbnail', NS_MAP), 'url') + + self._sort_formats(formats) + + webpage = self._download_webpage(url, video_id, 'Downloading page', fatal=False) + if webpage: + title = self._html_search_regex( + (r'
\s*

([^<]*)', + r"'title'\s*:\s*'([^']+)'"), + webpage, 'title', default=None) or title + description = self._html_search_regex( + r'
\s*
 
\s*

([^<]*)

', + webpage, 'description', default=None) or self._html_search_meta( 'description', webpage, 'description') - - thumbnail = self._og_search_thumbnail(webpage) - duration = self._og_search_property( - 'video:duration', webpage, - 'video duration', fatal=False) - - like_count = self._html_search_regex( - r'title="Понравилось".*?/> \[(\d+)\]', - webpage, 'like count', default=None) - dislike_count = self._html_search_regex( - r'title="Не понравилось".*?/> \[(\d+)\]', - webpage, 'dislike count', default=None) + thumbnail = thumbnail or self._og_search_thumbnail(webpage) + duration = int_or_none(self._html_search_meta( + 'video:duration', webpage, 'video duration', fatal=False)) + upload_date = unified_strdate(self._html_search_meta( + 'ya:ovs:upload_date', webpage, 'upload date', fatal=False)) return { 'id': video_id, - 'url': video_url, 'thumbnail': thumbnail, 'title': title, 'description': description, + 'upload_date': upload_date, 'duration': int_or_none(duration), - 'like_count': int_or_none(like_count), - 'dislike_count': int_or_none(dislike_count), + 'formats': formats } From 6a801f44704c3df49563852108c104c43a0551cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 9 Apr 2016 23:18:41 +0200 Subject: [PATCH 103/116] [test/InfoExtractors] add test for _download_json --- test/test_InfoExtractor.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 938466a80..6404ac89f 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -11,6 +11,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL from youtube_dl.extractor.common import InfoExtractor from youtube_dl.extractor import YoutubeIE, get_info_extractor +from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError class TestIE(InfoExtractor): @@ -66,5 +67,14 @@ class TestInfoExtractor(unittest.TestCase): 
self.assertEqual(ie._html_search_meta('e', html), '5') self.assertEqual(ie._html_search_meta('f', html), '6') + def test_download_json(self): + uri = encode_data_uri(b'{"foo": "blah"}', 'application/json') + self.assertEqual(self.ie._download_json(uri, None), {'foo': 'blah'}) + uri = encode_data_uri(b'callback({"foo": "blah"})', 'application/javascript') + self.assertEqual(self.ie._download_json(uri, None, transform_source=strip_jsonp), {'foo': 'blah'}) + uri = encode_data_uri(b'{"foo": invalid}', 'application/json') + self.assertRaises(ExtractorError, self.ie._download_json, uri, None) + self.assertEqual(self.ie._download_json(uri, None, fatal=False), None) + if __name__ == '__main__': unittest.main() From 49caf3307f1ae713acaeed651984a6338293b8d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 Apr 2016 17:10:27 +0600 Subject: [PATCH 104/116] [extractor/common] Remove irrelevant comment --- youtube_dl/extractor/common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 17d00721c..5269059d0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -376,7 +376,6 @@ class InfoExtractor(object): self.to_screen('%s' % (note,)) else: self.to_screen('%s: %s' % (video_id, note)) - # data, headers and query params will be ignored for `Request` objects if isinstance(url_or_request, compat_urllib_request.Request): url_or_request = update_Request( url_or_request, data=data, headers=headers, query=query) From a1fa60a9340f61a8455a0cd85c18f63d9bdfe681 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 Apr 2016 18:43:40 +0600 Subject: [PATCH 105/116] [cliprs] Add extractor (Closes #9099) --- youtube_dl/extractor/cliprs.py | 90 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 7 +++ 2 files changed, 97 insertions(+) create mode 100644 youtube_dl/extractor/cliprs.py diff --git a/youtube_dl/extractor/cliprs.py 
b/youtube_dl/extractor/cliprs.py new file mode 100644 index 000000000..4f9320ea5 --- /dev/null +++ b/youtube_dl/extractor/cliprs.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + parse_iso8601, +) + + +class ClipRsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P[^/]+)/\d+' + _TEST = { + 'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732', + 'md5': 'c412d57815ba07b56f9edc7b5d6a14e5', + 'info_dict': { + 'id': '1488842.1399140381', + 'ext': 'mp4', + 'title': 'PREMIJERA Frajle predstavljaju novi spot za pesmu Moli me, moli', + 'description': 'md5:56ce2c3b4ab31c5a2e0b17cb9a453026', + 'duration': 229, + 'timestamp': 1459850243, + 'upload_date': '20160405', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_id = self._search_regex( + r'id=(["\'])mvp:(?P.+?)\1', webpage, 'mvp id', group='id') + + response = self._download_json( + 'http://qi.ckm.onetapi.pl/', video_id, + query={ + 'body[id]': video_id, + 'body[jsonrpc]': '2.0', + 'body[method]': 'get_asset_detail', + 'body[params][ID_Publikacji]': video_id, + 'body[params][Service]': 'www.onet.pl', + 'content-type': 'application/jsonp', + 'x-onet-app': 'player.front.onetapi.pl', + }) + + error = response.get('error') + if error: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error['message']), expected=True) + + video = response['result'].get('0') + + formats = [] + for _, formats_dict in video['formats'].items(): + if not isinstance(formats_dict, dict): + continue + for format_id, format_list in formats_dict.items(): + if not isinstance(format_list, list): + continue + for f in format_list: + if not f.get('url'): + continue + formats.append({ + 'url': f['url'], + 'format_id': format_id, + 'height': 
int_or_none(f.get('vertical_resolution')), + 'width': int_or_none(f.get('horizontal_resolution')), + 'abr': float_or_none(f.get('audio_bitrate')), + 'vbr': float_or_none(f.get('video_bitrate')), + }) + self._sort_formats(formats) + + meta = video.get('meta', {}) + + title = self._og_search_title(webpage, default=None) or meta['title'] + description = self._og_search_description(webpage, default=None) or meta.get('description') + duration = meta.get('length') or meta.get('lenght') + timestamp = parse_iso8601(meta.get('addDate'), ' ') + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index de29c7956..aefc4df01 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -123,6 +123,7 @@ from .chirbit import ( ) from .cinchcast import CinchcastIE from .cinemassacre import CinemassacreIE +from .cliprs import ClipRsIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE from .clipsyndicate import ClipsyndicateIE @@ -939,6 +940,12 @@ from .xhamster import ( XHamsterIE, XHamsterEmbedIE, ) +from .xiami import ( + XiamiIE, + XiamiAlbumIE, + XiamiArtistIE, + XiamiCollectionIE +) from .xminus import XMinusIE from .xnxx import XNXXIE from .xstream import XstreamIE From f44c2768421bc3b0ead3ccf86b5e499d498674c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 Apr 2016 19:21:58 +0600 Subject: [PATCH 106/116] [extractor/extractors] Remove non-existant imports --- youtube_dl/extractor/extractors.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index aefc4df01..c1a13c982 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -940,12 +940,6 @@ from .xhamster import ( XHamsterIE, XHamsterEmbedIE, ) -from 
.xiami import ( - XiamiIE, - XiamiAlbumIE, - XiamiArtistIE, - XiamiCollectionIE -) from .xminus import XMinusIE from .xnxx import XNXXIE from .xstream import XstreamIE From de728757ad7218ce175649ec0d3f0b5723f2c580 Mon Sep 17 00:00:00 2001 From: Philip Huppert Date: Sun, 10 Apr 2016 16:36:44 +0200 Subject: [PATCH 107/116] [presstv] Refactored extractor. --- youtube_dl/extractor/presstv.py | 52 +++++++++++++++------------------ 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/presstv.py b/youtube_dl/extractor/presstv.py index 9af6780c1..755e32528 100644 --- a/youtube_dl/extractor/presstv.py +++ b/youtube_dl/extractor/presstv.py @@ -17,10 +17,8 @@ class PressTVIE(InfoExtractor): 'ext': 'mp4', 'title': 'Organic mattresses used to clean waste water', 'upload_date': '20160409', - 'thumbnail': 'http://media.presstv.com/photo/20160409/41719129-76fa-4372-a09d-bf348278eb5d.jpg', - 'description': ('A trial program at an Australian sewerage treatment facility hopes to change ' - 'the way waste water is treated by using plant mattresses to reduce chemical ' - 'and electricity use.') + 'thumbnail': 're:^https?://.*\.jpg', + 'description': 'md5:20002e654bbafb6908395a5c0cfcd125' } } @@ -35,38 +33,34 @@ class PressTVIE(InfoExtractor): # build list of available formats # specified in http://www.presstv.ir/Scripts/playback.js base_url = 'http://192.99.219.222:82/presstv' - formats = [ - { - 'url': base_url + video_url, - 'format': '1080p mp4', - 'format_id': '1080p' - }, { - 'url': base_url + video_url.replace(".mp4", "_low800.mp4"), - 'format': '720p mp4', - 'format_id': '720p' - }, { - 'url': base_url + video_url.replace(".mp4", "_low400.mp4"), - 'format': '360p mp4', - 'format_id': '360p' - }, { - 'url': base_url + video_url.replace(".mp4", "_low200.mp4"), - 'format': '180p mp4', - 'format_id': '180p' - } + _formats = [ + ("180p", "_low200.mp4"), + ("360p", "_low400.mp4"), + ("720p", "_low800.mp4"), + ("1080p", ".mp4") ] - formats.reverse() + + 
formats = [] + for fmt in _formats: + format_id, extension = fmt + formats.append({ + 'url': base_url + video_url[:-4] + extension, + 'format_id': format_id + }) # extract video metadata title = self._html_search_meta('title', webpage, 'Title', True) title = title.partition('-')[2].strip() - thumbnail = self._html_search_meta('og:image', webpage, 'Thumbnail', True) - description = self._html_search_meta('og:description', webpage, 'Description', True) + thumbnail = self._og_search_thumbnail(webpage) + description = self._og_search_description(webpage) - year = str_to_int(self._search_regex(PressTVIE._VALID_URL, url, 'Upload year', group='y')) - month = str_to_int(self._search_regex(PressTVIE._VALID_URL, url, 'Upload month', group='m')) - day = str_to_int(self._search_regex(PressTVIE._VALID_URL, url, 'Upload day', group='d')) - upload_date = '%04d%02d%02d' % (year, month, day) + match = re.match(PressTVIE._VALID_URL, url) + upload_date = '%04d%02d%02d' % ( + str_to_int(match.group('y')), + str_to_int(match.group('m')), + str_to_int(match.group('d')) + ) return { 'id': video_id, From 443285aabef470f546f0b01b8e8194ca988bb315 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 Apr 2016 22:15:11 +0600 Subject: [PATCH 108/116] [ebaumsworlds] Update _VALID_URL (Closes #9135) --- youtube_dl/extractor/ebaumsworld.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ebaumsworld.py b/youtube_dl/extractor/ebaumsworld.py index b6bfd2b2d..c97682cd3 100644 --- a/youtube_dl/extractor/ebaumsworld.py +++ b/youtube_dl/extractor/ebaumsworld.py @@ -4,10 +4,10 @@ from .common import InfoExtractor class EbaumsWorldIE(InfoExtractor): - _VALID_URL = r'https?://www\.ebaumsworld\.com/video/watch/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?ebaumsworld\.com/videos/[^/]+/(?P\d+)' _TEST = { - 'url': 'http://www.ebaumsworld.com/video/watch/83367677/', + 'url': 
'http://www.ebaumsworld.com/videos/a-giant-python-opens-the-door/83367677/', 'info_dict': { 'id': '83367677', 'ext': 'mp4', From 66fa49586879418e357337ff82794fe851e71e7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 Apr 2016 22:37:14 +0600 Subject: [PATCH 109/116] [screencastomatic] Fix extraction (Closes #9136) --- youtube_dl/extractor/screencastomatic.py | 35 ++++++++---------------- 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/screencastomatic.py b/youtube_dl/extractor/screencastomatic.py index 05337421c..c08c89d94 100644 --- a/youtube_dl/extractor/screencastomatic.py +++ b/youtube_dl/extractor/screencastomatic.py @@ -1,15 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - ExtractorError, - js_to_json, -) +from .jwplatform import JWPlatformBaseIE +from ..utils import js_to_json -class ScreencastOMaticIE(InfoExtractor): +class ScreencastOMaticIE(JWPlatformBaseIE): _VALID_URL = r'https?://screencast-o-matic\.com/watch/(?P[0-9a-zA-Z]+)' _TEST = { 'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl', @@ -27,23 +23,14 @@ class ScreencastOMaticIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - setup_js = self._search_regex( - r"(?s)jwplayer\('mp4Player'\).setup\((\{.*?\})\);", - webpage, 'setup code') - data = self._parse_json(setup_js, video_id, transform_source=js_to_json) - try: - video_data = next( - m for m in data['modes'] if m.get('type') == 'html5') - except StopIteration: - raise ExtractorError('Could not find any video entries!') - video_url = compat_urlparse.urljoin(url, video_data['config']['file']) - thumbnail = data.get('image') + jwplayer_data = self._parse_json( + self._search_regex( + r"(?s)jwplayer\('mp4Player'\).setup\((\{.*?\})\);", webpage, 'setup code'), + video_id, transform_source=js_to_json) - return { - 'id': 
video_id, + info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False) + info_dict.update({ 'title': self._og_search_title(webpage), 'description': self._og_search_description(webpage), - 'url': video_url, - 'ext': 'mp4', - 'thumbnail': thumbnail, - } + }) + return info_dict From a6d6722c8fc2174ce72ed462e649d397d1448a0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 Apr 2016 22:47:38 +0600 Subject: [PATCH 110/116] [jwplatform:base] Extract duration --- youtube_dl/extractor/jwplatform.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index 6770685d7..01601c59e 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + float_or_none, + int_or_none, +) class JWPlatformBaseIE(InfoExtractor): @@ -41,6 +44,7 @@ class JWPlatformBaseIE(InfoExtractor): 'description': video_data.get('description'), 'thumbnail': self._proto_relative_url(video_data.get('image')), 'timestamp': int_or_none(video_data.get('pubdate')), + 'duration': float_or_none(jwplayer_data.get('duration')), 'subtitles': subtitles, 'formats': formats, } From d7eb052fa2ab26839b050a7c3fa3f8874d508a02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 Apr 2016 22:48:04 +0600 Subject: [PATCH 111/116] [screencastomatic] Add duration to test --- youtube_dl/extractor/screencastomatic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/screencastomatic.py b/youtube_dl/extractor/screencastomatic.py index c08c89d94..7a88a42cd 100644 --- a/youtube_dl/extractor/screencastomatic.py +++ b/youtube_dl/extractor/screencastomatic.py @@ -16,6 +16,7 @@ class ScreencastOMaticIE(JWPlatformBaseIE): 'title': 'Welcome to 3-4 Philosophy @ DECV!', 
'thumbnail': 're:^https?://.*\.jpg$', 'description': 'as the title says! also: some general info re 1) VCE philosophy and 2) distance learning.', + 'duration': 369.163, } } From 7ebc36900d15888321a45f04113eeda169469004 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 Apr 2016 22:55:07 +0600 Subject: [PATCH 112/116] [jwplatform:base] Improve subtitles extraction --- youtube_dl/extractor/jwplatform.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index 01601c59e..8a5e562db 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -13,10 +13,6 @@ from ..utils import ( class JWPlatformBaseIE(InfoExtractor): def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True): video_data = jwplayer_data['playlist'][0] - subtitles = {} - for track in video_data['tracks']: - if track['kind'] == 'captions': - subtitles[track['label']] = [{'url': self._proto_relative_url(track['file'])}] formats = [] for source in video_data['sources']: @@ -38,6 +34,15 @@ class JWPlatformBaseIE(InfoExtractor): }) self._sort_formats(formats) + subtitles = {} + tracks = video_data.get('tracks') + if tracks and isinstance(tracks, list): + for track in tracks: + if track.get('file') and track.get('kind') == 'captions': + subtitles.setdefault(track.get('label') or 'en', []).append({ + 'url': self._proto_relative_url(track['file']) + }) + return { 'id': video_id, 'title': video_data['title'] if require_title else video_data.get('title'), From 4a121d29bb0700beb19e8b6edb5d479e9fe7ac1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 Apr 2016 23:45:17 +0600 Subject: [PATCH 113/116] [glide] Fix extraction (Closes #9141) --- youtube_dl/extractor/glide.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/glide.py b/youtube_dl/extractor/glide.py index 
9561ed5fb..0ab23f766 100644 --- a/youtube_dl/extractor/glide.py +++ b/youtube_dl/extractor/glide.py @@ -23,8 +23,9 @@ class GlideIE(InfoExtractor): webpage = self._download_webpage(url, video_id) title = self._html_search_regex( r'(.*?)', webpage, 'title') - video_url = self.http_scheme() + self._search_regex( - r'', webpage, 'video URL') + video_url = self._proto_relative_url(self._search_regex( + r']+src=(["\'])(?P.+?)\1', + webpage, 'video URL', group='url'), self.http_scheme()) thumbnail_url = self._search_regex( r' Date: Sun, 10 Apr 2016 23:56:23 +0600 Subject: [PATCH 114/116] [glide] Improve extraction and extract upload info --- youtube_dl/extractor/glide.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/glide.py b/youtube_dl/extractor/glide.py index 0ab23f766..62ff84835 100644 --- a/youtube_dl/extractor/glide.py +++ b/youtube_dl/extractor/glide.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import unified_strdate class GlideIE(InfoExtractor): @@ -15,27 +16,38 @@ class GlideIE(InfoExtractor): 'ext': 'mp4', 'title': 'Damon Timm\'s Glide message', 'thumbnail': 're:^https?://.*?\.cloudfront\.net/.*\.jpg$', + 'uploader': 'Damon Timm', + 'upload_date': '20140919', } } def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex( - r'(.*?)', webpage, 'title') + r'(.+?)', webpage, 'title') video_url = self._proto_relative_url(self._search_regex( r']+src=(["\'])(?P.+?)\1', - webpage, 'video URL', group='url'), self.http_scheme()) - thumbnail_url = self._search_regex( - r']+id=["\']video-thumbnail["\'][^>]+src=(["\'])(?P.+?)\1', + webpage, 'thumbnail url', default=None, + group='url')) or self._og_search_thumbnail(webpage) + uploader = self._search_regex( + r']+class=["\']info-name["\'][^>]*>([^<]+)', + webpage, 'uploader', fatal=False) + upload_date = 
unified_strdate(self._search_regex( + r']+class="info-date"[^>]*>([^<]+)', + webpage, 'upload date', fatal=False)) return { 'id': video_id, 'title': title, 'url': video_url, 'thumbnail': thumbnail, + 'uploader': uploader, + 'upload_date': upload_date, } From 452908b257da1a5b228a2c0522c89fff87296622 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 11 Apr 2016 00:06:05 +0600 Subject: [PATCH 115/116] [telebruxelles] Fix extraction (Closes #9142) --- youtube_dl/extractor/telebruxelles.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/telebruxelles.py b/youtube_dl/extractor/telebruxelles.py index a3d05f97d..eefecc490 100644 --- a/youtube_dl/extractor/telebruxelles.py +++ b/youtube_dl/extractor/telebruxelles.py @@ -1,11 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor class TeleBruxellesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?telebruxelles\.be/(news|sport|dernier-jt)/?(?P[^/#?]+)' + _VALID_URL = r'https?://(?:www\.)?(?:telebruxelles|bx1)\.be/(news|sport|dernier-jt)/?(?P[^/#?]+)' _TESTS = [{ 'url': 'http://www.telebruxelles.be/news/auditions-devant-parlement-francken-galant-tres-attendus/', 'md5': '59439e568c9ee42fb77588b2096b214f', @@ -39,18 +41,18 @@ class TeleBruxellesIE(InfoExtractor): webpage = self._download_webpage(url, display_id) article_id = self._html_search_regex( - r"
(.*?)

', webpage, 'title') - description = self._og_search_description(webpage) + description = self._og_search_description(webpage, default=None) rtmp_url = self._html_search_regex( - r"file: \"(rtmp://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}/vod/mp4:\" \+ \"\w+\" \+ \".mp4)\"", + r'file\s*:\s*"(rtmp://[^/]+/vod/mp4:"\s*\+\s*"[^"]+"\s*\+\s*".mp4)"', webpage, 'RTMP url') - rtmp_url = rtmp_url.replace("\" + \"", "") + rtmp_url = re.sub(r'"\s*\+\s*"', '', rtmp_url) return { - 'id': article_id, + 'id': article_id or display_id, 'display_id': display_id, 'title': title, 'description': description, From dfbc7f7f3f44ff7f9ed2beff76dc37edbb66af8d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 11 Apr 2016 16:14:07 +0800 Subject: [PATCH 116/116] [presstv] Improve and simplify --- youtube_dl/extractor/presstv.py | 48 +++++++++++++++++---------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/presstv.py b/youtube_dl/extractor/presstv.py index 755e32528..2da93ed34 100644 --- a/youtube_dl/extractor/presstv.py +++ b/youtube_dl/extractor/presstv.py @@ -1,19 +1,21 @@ # coding: utf-8 from __future__ import unicode_literals + import re from .common import InfoExtractor -from ..utils import str_to_int +from ..utils import remove_start class PressTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?presstv\.ir/[^/]+/(?P[0-9]+)/(?P[0-9]+)/(?P[0-9]+)/(?P[0-9]+)/' + _VALID_URL = r'https?://(?:www\.)?presstv\.ir/[^/]+/(?P\d+)/(?P\d+)/(?P\d+)/(?P\d+)/(?P[^/]+)?' 
_TEST = { 'url': 'http://www.presstv.ir/Detail/2016/04/09/459911/Australian-sewerage-treatment-facility-/', 'md5': '5d7e3195a447cb13e9267e931d8dd5a5', 'info_dict': { 'id': '459911', + 'display_id': 'Australian-sewerage-treatment-facility-', 'ext': 'mp4', 'title': 'Organic mattresses used to clean waste water', 'upload_date': '20160409', @@ -23,47 +25,47 @@ class PressTVIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) # extract video URL from webpage - video_url = self._html_search_regex(r'', webpage, - 'Video URL') + video_url = self._hidden_inputs(webpage)['inpPlayback'] # build list of available formats # specified in http://www.presstv.ir/Scripts/playback.js base_url = 'http://192.99.219.222:82/presstv' _formats = [ - ("180p", "_low200.mp4"), - ("360p", "_low400.mp4"), - ("720p", "_low800.mp4"), - ("1080p", ".mp4") + (180, '_low200.mp4'), + (360, '_low400.mp4'), + (720, '_low800.mp4'), + (1080, '.mp4') ] - formats = [] - for fmt in _formats: - format_id, extension = fmt - formats.append({ - 'url': base_url + video_url[:-4] + extension, - 'format_id': format_id - }) + formats = [{ + 'url': base_url + video_url[:-4] + extension, + 'format_id': '%dp' % height, + 'height': height, + } for height, extension in _formats] # extract video metadata - title = self._html_search_meta('title', webpage, 'Title', True) - title = title.partition('-')[2].strip() + title = remove_start( + self._html_search_meta('title', webpage, fatal=True), 'PressTV-') thumbnail = self._og_search_thumbnail(webpage) description = self._og_search_description(webpage) - match = re.match(PressTVIE._VALID_URL, url) upload_date = '%04d%02d%02d' % ( - str_to_int(match.group('y')), - str_to_int(match.group('m')), - str_to_int(match.group('d')) + 
int(mobj.group('y')), + int(mobj.group('m')), + int(mobj.group('d')), ) return { 'id': video_id, + 'display_id': display_id, 'title': title, 'formats': formats, 'thumbnail': thumbnail,