From 1f9fb20fcda76f165ce39b01fe907fc74c8054d3 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 25 Sep 2015 07:39:22 +0100 Subject: [PATCH 001/150] [nextmedia] update AppleDailyIE tests --- youtube_dl/extractor/nextmedia.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py index c10784f6b..d1688457f 100644 --- a/youtube_dl/extractor/nextmedia.py +++ b/youtube_dl/extractor/nextmedia.py @@ -126,7 +126,8 @@ class AppleDailyIE(NextMediaIE): 'thumbnail': 're:^https?://.*\.jpg$', 'description': 'md5:23c0aac567dc08c9c16a3161a2c2e3cd', 'upload_date': '20150128', - } + }, + 'skip': 'redirect to http://www.appledaily.com.tw/animation/', }, { # No thumbnail 'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003673/', @@ -140,10 +141,19 @@ class AppleDailyIE(NextMediaIE): }, 'expected_warnings': [ 'video thumbnail', - ] + ], + 'skip': 'redirect to http://www.appledaily.com.tw/animation/', }, { 'url': 'http://www.appledaily.com.tw/appledaily/article/supplement/20140417/35770334/', - 'only_matching': True, + 'md5': 'eaa20e6b9df418c912d7f5dec2ba734d', + 'info_dict': { + 'id': '35770334', + 'ext': 'mp4', + 'title': '咖啡占卜測 XU裝熟指數', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'md5:7b859991a6a4fedbdf3dd3b66545c748', + 'upload_date': '20140417', + }, }] _URL_PATTERN = r'\{url: \'(.+)\'\}' From 8a64969404ecbbac9cb8b970f7ca36a9e6c03cf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 29 Sep 2015 21:33:21 +0600 Subject: [PATCH 002/150] [adultswim] Prefer stream (Closes #7015) --- youtube_dl/extractor/adultswim.py | 37 ++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 4327c2f61..27de07587 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..utils import ( + determine_ext, ExtractorError, float_or_none, xpath_text, @@ -123,7 +124,6 @@ class AdultSwimIE(InfoExtractor): else: collections = bootstrapped_data['show']['collections'] collection, video_info = self.find_collection_containing_video(collections, episode_path) - # Video wasn't found in the collections, let's try `slugged_video`. if video_info is None: if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path: @@ -133,7 +133,9 @@ class AdultSwimIE(InfoExtractor): show = bootstrapped_data['show'] show_title = show['title'] - segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']] + stream = video_info.get('stream') + clips = [stream] if stream else video_info['clips'] + segment_ids = [clip['videoPlaybackID'] for clip in clips] episode_id = video_info['id'] episode_title = video_info['title'] @@ -142,7 +144,7 @@ class AdultSwimIE(InfoExtractor): entries = [] for part_num, segment_id in enumerate(segment_ids): - segment_url = 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=mobile' % segment_id + segment_url = 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=desktop' % segment_id segment_title = '%s - %s' % (show_title, episode_title) if len(segment_ids) > 1: @@ -158,17 +160,30 @@ class AdultSwimIE(InfoExtractor): formats = [] file_els = idoc.findall('.//files/file') or idoc.findall('./files/file') + unique_urls = [] + unique_file_els = [] for file_el in file_els: + media_url = file_el.text + if not media_url or determine_ext(media_url) == 'f4m': + continue + if file_el.text not in unique_urls: + unique_urls.append(file_el.text) + unique_file_els.append(file_el) + + for file_el in unique_file_els: bitrate = file_el.attrib.get('bitrate') ftype = file_el.attrib.get('type') - - formats.append({ - 'format_id': '%s_%s' % (bitrate, ftype), - 'url': file_el.text.strip(), - # The bitrate may not be a number (for example: 'iphone') - 'tbr': int(bitrate) if bitrate.isdigit() else None, - 'quality': 1 if ftype == 'hd' else -1 - }) + media_url = file_el.text + if determine_ext(media_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + media_url, segment_title, 'mp4', 'm3u8_native', preference=0, m3u8_id='hls')) + else: + formats.append({ + 'format_id': '%s_%s' % (bitrate, ftype), + 'url': file_el.text.strip(), + # The bitrate may not be a number (for example: 'iphone') + 'tbr': int(bitrate) if bitrate.isdigit() else None, + }) self._sort_formats(formats) From c596ce91cd22914e031b9ac94ec38d1137e50d60 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 30 Sep 2015 15:39:47 +0200 Subject: [PATCH 003/150] [comedycentral] Fix youtube-dl :thedailyshow We'll let the generic IE follow the redirect and call back to us with the episode URL --- youtube_dl/extractor/comedycentral.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 91ebb0ce5..3e4bd10b6 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -151,12 +151,7 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor): mobj = re.match(self._VALID_URL, url) if mobj.group('shortname'): - if mobj.group('shortname') in ('tds', 'thedailyshow'): - url = 'http://thedailyshow.cc.com/full-episodes/' - else: - url = 'http://thecolbertreport.cc.com/full-episodes/' - mobj = re.match(self._VALID_URL, url, re.VERBOSE) - assert mobj is not None + return self.url_result('http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes') if mobj.group('clip'): if mobj.group('videotitle'): From aedb930cfcd068600d3b5b0b3aef32df1a28f0c0 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 30 Sep 2015 13:30:48 +0100 Subject: [PATCH 004/150] [nfl] fix content id regex(fixes #7012) --- youtube_dl/extractor/nfl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py index 55dc6107d..4c35c561b 100644 --- a/youtube_dl/extractor/nfl.py +++ b/youtube_dl/extractor/nfl.py @@ -151,7 +151,7 @@ class NFLIE(InfoExtractor): group='config')) # For articles, the id in the url is not the video id video_id = self._search_regex( - r'(?:]+data-contentId\s*=\s*|contentId\s*:\s*)(["\'])(?P.+?)\1', + r'(?:]+data-content[Ii]d\s*=\s*|content[Ii]d\s*:\s*)(["\'])(?P.+?)\1', webpage, 'video id', default=video_id, group='id') config = self._download_json(config_url, video_id, 'Downloading player config') url_template = NFLIE.prepend_host( From ee2d190253ae3e0a8254029dc6e8d7e42c194042 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 30 Sep 2015 20:06:21 +0600 Subject: [PATCH 005/150] [nfl] Add test for #7012 --- youtube_dl/extractor/nfl.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py index 4c35c561b..200874d68 100644 --- a/youtube_dl/extractor/nfl.py +++ b/youtube_dl/extractor/nfl.py @@ -107,6 +107,20 @@ class NFLIE(InfoExtractor): 'timestamp': 1442618809, 'upload_date': '20150918', }, + }, { + # lowercase data-contentid + 'url': 'http://www.steelers.com/news/article-1/Tomlin-on-Ben-getting-Vick-ready/56399c96-4160-48cf-a7ad-1d17d4a3aef7', + 'info_dict': { + 'id': '12693586-6ea9-4743-9c1c-02c59e4a5ef2', + 'ext': 'mp4', + 'title': 'Tomlin looks ahead to Ravens on a short week', + 'description': 'md5:32f3f7b139f43913181d5cbb24ecad75', + 'timestamp': 1443459651, + 'upload_date': '20150928', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.nfl.com/videos/nfl-network-top-ten/09000d5d810a6bd4/Top-10-Gutsiest-Performances-Jack-Youngblood', 'only_matching': True, From 93bc7ef165731c729fd6fbbb3e95ecbbb16b4c21 Mon Sep 17 00:00:00 2001 From: Qijiang Fan Date: Sun, 30 Aug 2015 14:33:12 +0800 Subject: [PATCH 006/150] [test] recursively check dict and list in expect_info_dict This allows to use md5:, re:, etc within the str inside a list or dict. --- test/helper.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/test/helper.py b/test/helper.py index cb6eec8d9..6612970ef 100644 --- a/test/helper.py +++ b/test/helper.py @@ -89,7 +89,7 @@ def gettestcases(include_onlymatching=False): md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() -def expect_info_dict(self, got_dict, expected_dict): +def expect_dict(self, got_dict, expected_dict): for info_field, expected in expected_dict.items(): if isinstance(expected, compat_str) and expected.startswith('re:'): got = got_dict.get(info_field) @@ -127,6 +127,22 @@ def expect_info_dict(self, got_dict, expected_dict): got = got_dict.get(info_field) self.assertTrue(isinstance(got, expected), 'Expected type %r for field %s, but got value %r of type %r' % (expected, info_field, got, type(got))) + elif isinstance(expected, dict) and isinstance(got_dict.get(info_field, None), dict): + expect_dict(self, got_dict.get(info_field), expected) + elif isinstance(expected, list) and isinstance(got_dict.get(info_field, None), list): + got = got_dict.get(info_field, None) + self.assertEqual(len(expected), len(got), + 'Expect a list of length %d, but got a list of length %d' % ( + len(expected), len(got))) + _id = 0 + for i, j in zip(got, expected): + _type_i = type(i) + _type_j = type(j) + self.assertEqual(_type_j, _type_i, + 'Type doesn\'t match at element %d of the list in field %s, expect %s, got %s' % ( + _id, info_field, _type_j, _type_i)) + expect_dict(self, {'_': i}, {'_': j}) + _id += 1 else: if isinstance(expected, compat_str) and expected.startswith('md5:'): got = 'md5:' + md5(got_dict.get(info_field)) @@ -149,6 +165,9 @@ def expect_info_dict(self, got_dict, expected_dict): self.assertEqual(expected, got, 'invalid value for field %s, expected %r, got %r' % (info_field, expected, got)) + +def expect_info_dict(self, got_dict, expected_dict): + expect_dict(self, got_dict, expected_dict) # Check for the presence of mandatory fields if got_dict.get('_type') not in ('playlist', 'multi_video'): for key in ('id', 'url', 'title', 'ext'): From 40c931de4b3c06c1959362c2380f4bf243ef5702 Mon Sep 17 00:00:00 2001 From: Qijiang Fan Date: Sat, 26 Sep 2015 23:10:38 +0800 Subject: [PATCH 007/150] [test] split expect_dict to two functions --- test/helper.py | 143 ++++++++++++++++++++++++------------------------- 1 file changed, 70 insertions(+), 73 deletions(-) diff --git a/test/helper.py b/test/helper.py index 6612970ef..e50d7ff28 100644 --- a/test/helper.py +++ b/test/helper.py @@ -89,81 +89,78 @@ def gettestcases(include_onlymatching=False): md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() +def expect_value(self, got, expected, field): + if isinstance(expected, compat_str) and expected.startswith('re:'): + match_str = expected[len('re:'):] + match_rex = re.compile(match_str) + + self.assertTrue( + isinstance(got, compat_str), + 'Expected a %s object, but got %s for field %s' % ( + compat_str.__name__, type(got).__name__, field)) + self.assertTrue( + match_rex.match(got), + 'field %s (value: %r) should match %r' % (field, got, match_str)) + elif isinstance(expected, compat_str) and expected.startswith('startswith:'): + start_str = expected[len('startswith:'):] + self.assertTrue( + isinstance(got, compat_str), + 'Expected a %s object, but got %s for field %s' % ( + compat_str.__name__, type(got).__name__, field)) + self.assertTrue( + got.startswith(start_str), + 'field %s (value: %r) should start with %r' % (field, got, start_str)) + elif isinstance(expected, compat_str) and expected.startswith('contains:'): + contains_str = expected[len('contains:'):] + self.assertTrue( + isinstance(got, compat_str), + 'Expected a %s object, but got %s for field %s' % ( + compat_str.__name__, type(got).__name__, field)) + self.assertTrue( + contains_str in got, + 'field %s (value: %r) should contain %r' % (field, got, contains_str)) + elif isinstance(expected, type): + self.assertTrue(isinstance(got, expected), + 'Expected type %r for field %s, but got value %r of type %r' % (expected, field, got, type(got))) + elif isinstance(expected, dict) and isinstance(got, dict): + expect_dict(self, got, expected) + elif isinstance(expected, list) and isinstance(got, list): + self.assertEqual(len(expected), len(got), + 'Expect a list of length %d, but got a list of length %d' % ( + len(expected), len(got))) + _id = 0 + for i, j in zip(got, expected): + _type_i = type(i) + _type_j = type(j) + self.assertEqual(_type_j, _type_i, + 'Type doesn\'t match at element %d of the list in field %s, expect %s, got %s' % ( + _id, field, _type_j, _type_i)) + expect_value(self, i, j, field) + _id += 1 + else: + if isinstance(expected, compat_str) and expected.startswith('md5:'): + got = 'md5:' + md5(got) + elif isinstance(expected, compat_str) and expected.startswith('mincount:'): + self.assertTrue( + isinstance(got, (list, dict)), + 'Expected field %s to be a list or a dict, but it is of type %s' % ( + field, type(got).__name__)) + expected_num = int(expected.partition(':')[2]) + assertGreaterEqual( + self, len(got), expected_num, + 'Expected %d items in field %s, but only got %d' % ( + expected_num, field, len(got) + ) + ) + return + self.assertEqual(expected, got, + 'invalid value for field %s, expected %r, got %r' % (field, expected, got)) + + def expect_dict(self, got_dict, expected_dict): for info_field, expected in expected_dict.items(): - if isinstance(expected, compat_str) and expected.startswith('re:'): - got = got_dict.get(info_field) - match_str = expected[len('re:'):] - match_rex = re.compile(match_str) - - self.assertTrue( - isinstance(got, compat_str), - 'Expected a %s object, but got %s for field %s' % ( - compat_str.__name__, type(got).__name__, info_field)) - self.assertTrue( - match_rex.match(got), - 'field %s (value: %r) should match %r' % (info_field, got, match_str)) - elif isinstance(expected, compat_str) and expected.startswith('startswith:'): - got = got_dict.get(info_field) - start_str = expected[len('startswith:'):] - self.assertTrue( - isinstance(got, compat_str), - 'Expected a %s object, but got %s for field %s' % ( - compat_str.__name__, type(got).__name__, info_field)) - self.assertTrue( - got.startswith(start_str), - 'field %s (value: %r) should start with %r' % (info_field, got, start_str)) - elif isinstance(expected, compat_str) and expected.startswith('contains:'): - got = got_dict.get(info_field) - contains_str = expected[len('contains:'):] - self.assertTrue( - isinstance(got, compat_str), - 'Expected a %s object, but got %s for field %s' % ( - compat_str.__name__, type(got).__name__, info_field)) - self.assertTrue( - contains_str in got, - 'field %s (value: %r) should contain %r' % (info_field, got, contains_str)) - elif isinstance(expected, type): - got = got_dict.get(info_field) - self.assertTrue(isinstance(got, expected), - 'Expected type %r for field %s, but got value %r of type %r' % (expected, info_field, got, type(got))) - elif isinstance(expected, dict) and isinstance(got_dict.get(info_field, None), dict): - expect_dict(self, got_dict.get(info_field), expected) - elif isinstance(expected, list) and isinstance(got_dict.get(info_field, None), list): - got = got_dict.get(info_field, None) - self.assertEqual(len(expected), len(got), - 'Expect a list of length %d, but got a list of length %d' % ( - len(expected), len(got))) - _id = 0 - for i, j in zip(got, expected): - _type_i = type(i) - _type_j = type(j) - self.assertEqual(_type_j, _type_i, - 'Type doesn\'t match at element %d of the list in field %s, expect %s, got %s' % ( - _id, info_field, _type_j, _type_i)) - expect_dict(self, {'_': i}, {'_': j}) - _id += 1 - else: - if isinstance(expected, compat_str) and expected.startswith('md5:'): - got = 'md5:' + md5(got_dict.get(info_field)) - elif isinstance(expected, compat_str) and expected.startswith('mincount:'): - got = got_dict.get(info_field) - self.assertTrue( - isinstance(got, (list, dict)), - 'Expected field %s to be a list or a dict, but it is of type %s' % ( - info_field, type(got).__name__)) - expected_num = int(expected.partition(':')[2]) - assertGreaterEqual( - self, len(got), expected_num, - 'Expected %d items in field %s, but only got %d' % ( - expected_num, info_field, len(got) - ) - ) - continue - else: - got = got_dict.get(info_field) - self.assertEqual(expected, got, - 'invalid value for field %s, expected %r, got %r' % (info_field, expected, got)) + got = got_dict.get(info_field) + expect_value(self, got, expected, info_field) def expect_info_dict(self, got_dict, expected_dict): From 687c04cbb8d80f76b45caf9377b2d3b64462c203 Mon Sep 17 00:00:00 2001 From: Qijiang Fan Date: Wed, 30 Sep 2015 10:30:04 +0200 Subject: [PATCH 008/150] [test] use descriptive variable name --- test/helper.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/test/helper.py b/test/helper.py index e50d7ff28..5c44105d4 100644 --- a/test/helper.py +++ b/test/helper.py @@ -128,15 +128,13 @@ def expect_value(self, got, expected, field): self.assertEqual(len(expected), len(got), 'Expect a list of length %d, but got a list of length %d' % ( len(expected), len(got))) - _id = 0 - for i, j in zip(got, expected): - _type_i = type(i) - _type_j = type(j) - self.assertEqual(_type_j, _type_i, + for index, (item_got, item_expected) in enumerate(zip(got, expected)): + type_got = type(item_got) + type_expected = type(item_expected) + self.assertEqual(type_expected, type_got, 'Type doesn\'t match at element %d of the list in field %s, expect %s, got %s' % ( - _id, field, _type_j, _type_i)) - expect_value(self, i, j, field) - _id += 1 + index, field, type_expected, type_got)) + expect_value(self, item_got, item_expected, field) else: if isinstance(expected, compat_str) and expected.startswith('md5:'): got = 'md5:' + md5(got) From 2e885de796e211300fd70709da17325b84c86dd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 30 Sep 2015 20:21:01 +0600 Subject: [PATCH 009/150] [test/helper] Formatting --- test/helper.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/test/helper.py b/test/helper.py index 5c44105d4..288ed237d 100644 --- a/test/helper.py +++ b/test/helper.py @@ -120,20 +120,22 @@ def expect_value(self, got, expected, field): contains_str in got, 'field %s (value: %r) should contain %r' % (field, got, contains_str)) elif isinstance(expected, type): - self.assertTrue(isinstance(got, expected), - 'Expected type %r for field %s, but got value %r of type %r' % (expected, field, got, type(got))) + self.assertTrue( + isinstance(got, expected), + 'Expected type %r for field %s, but got value %r of type %r' % (expected, field, got, type(got))) elif isinstance(expected, dict) and isinstance(got, dict): expect_dict(self, got, expected) elif isinstance(expected, list) and isinstance(got, list): - self.assertEqual(len(expected), len(got), - 'Expect a list of length %d, but got a list of length %d' % ( - len(expected), len(got))) + self.assertEqual( + len(expected), len(got), + 'Expect a list of length %d, but got a list of length %d' % (len(expected), len(got))) for index, (item_got, item_expected) in enumerate(zip(got, expected)): type_got = type(item_got) type_expected = type(item_expected) - self.assertEqual(type_expected, type_got, - 'Type doesn\'t match at element %d of the list in field %s, expect %s, got %s' % ( - index, field, type_expected, type_got)) + self.assertEqual( + type_expected, type_got, + 'Type doesn\'t match at element %d of the list in field %s, expect %s, got %s' % ( + index, field, type_expected, type_got)) expect_value(self, item_got, item_expected, field) else: if isinstance(expected, compat_str) and expected.startswith('md5:'): @@ -146,13 +148,11 @@ def expect_value(self, got, expected, field): expected_num = int(expected.partition(':')[2]) assertGreaterEqual( self, len(got), expected_num, - 'Expected %d items in field %s, but only got %d' % ( - expected_num, field, len(got) - ) - ) + 'Expected %d items in field %s, but only got %d' % (expected_num, field, len(got))) return - self.assertEqual(expected, got, - 'invalid value for field %s, expected %r, got %r' % (field, expected, got)) + self.assertEqual( + expected, got, + 'invalid value for field %s, expected %r, got %r' % (field, expected, got)) def expect_dict(self, got_dict, expected_dict): From 386a7b52d548da0aaf11e5805d67d5ab30d9dfab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 30 Sep 2015 20:26:42 +0600 Subject: [PATCH 010/150] [test/helper] Spelling --- test/helper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/helper.py b/test/helper.py index 288ed237d..0ce9a0fb3 100644 --- a/test/helper.py +++ b/test/helper.py @@ -134,7 +134,7 @@ def expect_value(self, got, expected, field): type_expected = type(item_expected) self.assertEqual( type_expected, type_got, - 'Type doesn\'t match at element %d of the list in field %s, expect %s, got %s' % ( + 'Type mismatch for list item at index %d for field %s, expected %r, got %r' % ( index, field, type_expected, type_got)) expect_value(self, item_got, item_expected, field) else: @@ -152,7 +152,7 @@ def expect_value(self, got, expected, field): return self.assertEqual( expected, got, - 'invalid value for field %s, expected %r, got %r' % (field, expected, got)) + 'Invalid value for field %s, expected %r, got %r' % (field, expected, got)) def expect_dict(self, got_dict, expected_dict): From f88f1b40ce7f70a3a510259154364a25d7dceff6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 30 Sep 2015 20:31:29 +0600 Subject: [PATCH 011/150] [test/helper] Clarify field for list length mismatch --- test/helper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/helper.py b/test/helper.py index 0ce9a0fb3..28fd135b2 100644 --- a/test/helper.py +++ b/test/helper.py @@ -128,7 +128,8 @@ def expect_value(self, got, expected, field): elif isinstance(expected, list) and isinstance(got, list): self.assertEqual( len(expected), len(got), - 'Expect a list of length %d, but got a list of length %d' % (len(expected), len(got))) + 'Expect a list of length %d, but got a list of length %d for field %s' % ( + len(expected), len(got), field)) for index, (item_got, item_expected) in enumerate(zip(got, expected)): type_got = type(item_got) type_expected = type(item_expected) From 8466336104164b37b3f32c32278e2426590db42f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 30 Sep 2015 22:12:52 +0600 Subject: [PATCH 012/150] [vk] Detect vimeo embeds (Closes #7021) --- youtube_dl/extractor/vk.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index c30c5a8e5..765e9e6fd 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -17,6 +17,7 @@ from ..utils import ( unescapeHTML, unified_strdate, ) +from .vimeo import VimeoIE class VKIE(InfoExtractor): @@ -249,6 +250,10 @@ class VKIE(InfoExtractor): if youtube_url: return self.url_result(youtube_url, 'Youtube') + vimeo_url = VimeoIE._extract_vimeo_url(url, info_page) + if vimeo_url is not None: + return self.url_result(vimeo_url) + m_rutube = re.search( r'\ssrc="((?:https?:)?//rutube\.ru\\?/video\\?/embed(?:.*?))\\?"', info_page) if m_rutube is not None: From f540b937062a4de2cd0a097245ec82dd9fd23277 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Oct 2015 02:33:48 +0600 Subject: [PATCH 013/150] [naver] Improve error regex --- youtube_dl/extractor/naver.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index 925967753..35cbb3e6d 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -46,11 +46,11 @@ class NaverIE(InfoExtractor): m_id = re.search(r'var rmcPlayer = new nhn.rmcnmv.RMCVideoPlayer\("(.+?)", "(.+?)"', webpage) if m_id is None: - m_error = re.search( - r'(?s)
\s*(?:)?\s*

(?P.+?)

\s*
', - webpage) - if m_error: - raise ExtractorError(clean_html(m_error.group('msg')), expected=True) + error = self._html_search_regex( + r'(?s)
\s*(?:)?\s*

(?P.+?)

\s*
', + webpage, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) raise ExtractorError('couldn\'t extract vid and key') vid = m_id.group(1) key = m_id.group(2) From b83831df1fb0dd241210a2c992ac9753c5df59cd Mon Sep 17 00:00:00 2001 From: Joakim Fremstad Date: Thu, 1 Oct 2015 14:58:49 +0200 Subject: [PATCH 014/150] [nrk] Spelling --- youtube_dl/extractor/nrk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index d066a96db..5ed235f1e 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -49,7 +49,7 @@ class NRKIE(InfoExtractor): if data['usageRights']['isGeoBlocked']: raise ExtractorError( - 'NRK har ikke rettig-heter til å vise dette programmet utenfor Norge', + 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', expected=True) video_url = data['mediaUrl'] + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81' From 2b3f951a2e2e01adfb068eac05b976ff445db073 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Oct 2015 20:33:17 +0600 Subject: [PATCH 015/150] [nrktv] Rework subtitles and eliminate downloading twice --- youtube_dl/extractor/nrk.py | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 5ed235f1e..8ac38a174 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( ExtractorError, float_or_none, @@ -196,20 +197,6 @@ class NRKTVIE(InfoExtractor): } ] - def _debug_print(self, txt): - if self._downloader.params.get('verbose', False): - self.to_screen('[debug] %s' % txt) - - def _get_subtitles(self, subtitlesurl, video_id, baseurl): - url = "%s%s" % (baseurl, subtitlesurl) - self._debug_print('%s: Subtitle url: %s' % (video_id, url)) - captions = self._download_xml( - url, video_id, 'Downloading subtitles') - lang = captions.get('lang', 'no') - return {lang: [ - {'ext': 'ttml', 'url': url}, - ]} - def _extract_f4m(self, manifest_url, video_id): return self._extract_f4m_formats( manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id, f4m_id='hds') @@ -218,7 +205,7 @@ class NRKTVIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') part_id = mobj.group('part_id') - baseurl = mobj.group('baseurl') + base_url = mobj.group('baseurl') webpage = self._download_webpage(url, video_id) @@ -278,11 +265,14 @@ class NRKTVIE(InfoExtractor): self._sort_formats(formats) subtitles_url = self._html_search_regex( - r'data-subtitlesurl[ ]*=[ ]*"([^"]+)"', - webpage, 'subtitle URL', default=None) - subtitles = None + r'data-subtitlesurl\s*=\s*(["\'])(?P.+?)\1', + webpage, 'subtitle URL', default=None, group='url') + subtitles = {} if subtitles_url: - subtitles = self.extract_subtitles(subtitles_url, video_id, baseurl) + subtitles['no'] = [{ + 'ext': 'ttml', + 'url': compat_urlparse.urljoin(base_url, subtitles_url), + }] return { 'id': video_id, From e7d8e98a9ffdec2502bedb21a4f043df6da225a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Oct 2015 21:51:04 +0600 Subject: [PATCH 016/150] [extractor/common] Allow float bitrates --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 2fe0d5d37..9c40d56a9 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1084,7 +1084,7 @@ class InfoExtractor(object): if not src: continue - bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) filesize = int_or_none(video.get('size') or video.get('fileSize')) width = int_or_none(video.get('width')) height = int_or_none(video.get('height')) From 1e5bcdec0264190ed2a05ee49c1f9f5b20ba3aa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Oct 2015 22:08:16 +0600 Subject: [PATCH 017/150] [extractor/common] Extract images from SMIL --- youtube_dl/extractor/common.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 9c40d56a9..5684227dc 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1054,10 +1054,18 @@ class InfoExtractor(object): elif not description and name in ('description', 'abstract'): description = content + thumbnails = [{ + 'id': image.get('type'), + 'url': image.get('src'), + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')] + return { 'id': video_id, 'title': title or video_id, 'description': description, + 'thumbnails': thumbnails, 'formats': formats, 'subtitles': subtitles, } From 647eab4541d1f55dea6aa20af9b35c2726dda48a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Oct 2015 22:18:59 +0600 Subject: [PATCH 018/150] [extractor/common] Extract upload date from SMIL --- youtube_dl/extractor/common.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5684227dc..b86d06523 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -39,6 +39,7 @@ from ..utils import ( RegexNotFoundError, sanitize_filename, unescapeHTML, + unified_strdate, url_basename, xpath_text, xpath_with_ns, @@ -1044,6 +1045,7 @@ class InfoExtractor(object): video_id = os.path.splitext(url_basename(smil_url))[0] title = None description = None + upload_date = None for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): name = meta.attrib.get('name') content = meta.attrib.get('content') @@ -1053,6 +1055,8 @@ class InfoExtractor(object): title = content elif not description and name in ('description', 'abstract'): description = content + elif not upload_date and name == 'date': + upload_date = unified_strdate(content) thumbnails = [{ 'id': image.get('type'), @@ -1065,6 +1069,7 @@ class InfoExtractor(object): 'id': video_id, 'title': title or video_id, 'description': description, + 'upload_date': upload_date, 'thumbnails': thumbnails, 'formats': formats, 'subtitles': subtitles, From acfb717a18cfa9c0f377372068e16862bba345b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Oct 2015 22:19:39 +0600 Subject: [PATCH 019/150] [videolecturesnet] Use generic SMIL extraction --- youtube_dl/extractor/videolecturesnet.py | 62 +++--------------------- 1 file changed, 8 insertions(+), 54 deletions(-) diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py index ef2da5632..160dbb590 100644 --- a/youtube_dl/extractor/videolecturesnet.py +++ b/youtube_dl/extractor/videolecturesnet.py @@ -1,13 +1,8 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( - find_xpath_attr, - int_or_none, parse_duration, - unified_strdate, ) @@ -29,58 +24,17 @@ class VideoLecturesNetIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) smil_url = 'http://videolectures.net/%s/video/1/smil.xml' % video_id - smil = self._download_xml(smil_url, video_id) + smil = self._download_smil(smil_url, video_id) - title = find_xpath_attr(smil, './/meta', 'name', 'title').attrib['content'] - description_el = find_xpath_attr(smil, './/meta', 'name', 'abstract') - description = ( - None if description_el is None - else description_el.attrib['content']) - upload_date = unified_strdate( - find_xpath_attr(smil, './/meta', 'name', 'date').attrib['content']) + info = self._parse_smil(smil, smil_url, video_id) + + info['id'] = video_id switch = smil.find('.//switch') - duration = parse_duration(switch.attrib.get('dur')) - thumbnail_el = find_xpath_attr(switch, './image', 'type', 'thumbnail') - thumbnail = ( - None if thumbnail_el is None else thumbnail_el.attrib.get('src')) + if switch is not None: + info['duration'] = parse_duration(switch.attrib.get('dur')) - formats = [] - for v in switch.findall('./video'): - proto = v.attrib.get('proto') - if proto not in ['http', 'rtmp']: - continue - f = { - 'width': int_or_none(v.attrib.get('width')), - 'height': int_or_none(v.attrib.get('height')), - 'filesize': int_or_none(v.attrib.get('size')), - 'tbr': int_or_none(v.attrib.get('systemBitrate')) / 1000.0, - 'ext': v.attrib.get('ext'), - } - src = v.attrib['src'] - if proto == 'http': - if self._is_valid_url(src, video_id): - f['url'] = src - formats.append(f) - elif proto == 'rtmp': - f.update({ - 'url': v.attrib['streamer'], - 'play_path': src, - 'rtmp_real_time': True, - }) - formats.append(f) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'upload_date': upload_date, - 'duration': duration, - 'thumbnail': thumbnail, - 'formats': formats, - } + return info From 0c996b9f488bfaa74d79e94739af80a0be38e125 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Oct 2015 22:39:38 +0600 Subject: [PATCH 020/150] [videolecturesnet] Add support for playlists (Closes #7031) --- youtube_dl/extractor/videolecturesnet.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py index 160dbb590..eadff8d18 100644 --- a/youtube_dl/extractor/videolecturesnet.py +++ b/youtube_dl/extractor/videolecturesnet.py @@ -1,9 +1,10 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..utils import ( - parse_duration, -) +from ..compat import compat_urlparse +from ..utils import parse_duration class VideoLecturesNetIE(InfoExtractor): @@ -27,7 +28,17 @@ class VideoLecturesNetIE(InfoExtractor): video_id = self._match_id(url) smil_url = 'http://videolectures.net/%s/video/1/smil.xml' % video_id - smil = self._download_smil(smil_url, video_id) + smil = self._download_smil(smil_url, video_id, fatal=False) + + # Probably a playlist + if smil is False: + webpage = self._download_webpage(url, video_id) + entries = [ + self.url_result(compat_urlparse.urljoin(url, video_url), 'VideoLecturesNet') + for _, video_url in re.findall(r']+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', webpage)] + playlist_title = self._html_search_meta('title', webpage, 'title', fatal=True) + playlist_description = self._html_search_meta('description', webpage, 'description') + return self.playlist_result(entries, video_id, playlist_title, playlist_description) info = self._parse_smil(smil, smil_url, video_id) From fb97809e64a145768346345ff2b5e5539880c014 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Oct 2015 22:44:51 +0600 Subject: [PATCH 021/150] [videolecturesnet] Improve playlist extraction --- youtube_dl/extractor/videolecturesnet.py | 32 +++++++++++++++--------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py index eadff8d18..704165406 100644 --- a/youtube_dl/extractor/videolecturesnet.py +++ b/youtube_dl/extractor/videolecturesnet.py @@ -3,8 +3,14 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import parse_duration +from ..compat import ( + compat_HTTPError, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + parse_duration, +) class VideoLecturesNetIE(InfoExtractor): @@ -28,17 +34,19 @@ class VideoLecturesNetIE(InfoExtractor): video_id = self._match_id(url) smil_url = 'http://videolectures.net/%s/video/1/smil.xml' % video_id - smil = self._download_smil(smil_url, video_id, fatal=False) - # Probably a playlist - if smil is False: - webpage = self._download_webpage(url, video_id) - entries = [ - self.url_result(compat_urlparse.urljoin(url, video_url), 'VideoLecturesNet') - for _, video_url in re.findall(r']+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', webpage)] - playlist_title = self._html_search_meta('title', webpage, 'title', fatal=True) - playlist_description = self._html_search_meta('description', webpage, 'description') - return self.playlist_result(entries, video_id, playlist_title, playlist_description) + try: + smil = self._download_smil(smil_url, video_id) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + # Probably a playlist + webpage = self._download_webpage(url, video_id) + entries = [ + self.url_result(compat_urlparse.urljoin(url, video_url), 'VideoLecturesNet') + for _, video_url in re.findall(r']+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', webpage)] + playlist_title = self._html_search_meta('title', webpage, 'title', fatal=True) + playlist_description = self._html_search_meta('description', webpage, 'description') + return self.playlist_result(entries, video_id, playlist_title, playlist_description) info = self._parse_smil(smil, smil_url, video_id) From 6edaa0e25b880be5d1ea513d9d907c91e18a8347 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Oct 2015 22:45:10 +0600 Subject: [PATCH 022/150] [videolecturesnet] Add playlist test --- youtube_dl/extractor/videolecturesnet.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py index 704165406..113a2289b 100644 --- a/youtube_dl/extractor/videolecturesnet.py +++ b/youtube_dl/extractor/videolecturesnet.py @@ -17,7 +17,7 @@ class VideoLecturesNetIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P[^/#?]+)/*(?:[#?].*)?$' IE_NAME = 'videolectures.net' - _TEST = { + _TESTS = [{ 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/', 'info_dict': { 'id': 'promogram_igor_mekjavic_eng', @@ -28,7 +28,15 @@ class VideoLecturesNetIE(InfoExtractor): 'duration': 565, 'thumbnail': 're:http://.*\.jpg', }, - } + }, { + 'url': 'http://videolectures.net/deeplearning2015_montreal/', + 'info_dict': { + 'id': 'deeplearning2015_montreal', + 'title': 'Deep Learning Summer School, Montreal 2015', + 'description': 'md5:90121a40cc6926df1bf04dcd8563ed3b', + }, + 'playlist_count': 30, + }] def _real_extract(self, url): video_id = self._match_id(url) From c78e48177c72baa0c985c4d47f95e6edc6ec33dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Oct 2015 22:54:54 +0600 Subject: [PATCH 023/150] [extractor/common] Check validity of direct URLs --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index b86d06523..b928e24be 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1145,7 +1145,7 @@ class InfoExtractor(object): formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds')) continue - if src_url.startswith('http'): + if src_url.startswith('http') and self._is_valid_url(src, video_id): http_count += 1 formats.append({ 'url': src_url, From 3a1341a7bc9f11d972c7649a33e143fab72f8bc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Oct 2015 22:59:20 +0600 Subject: [PATCH 024/150] [extractor/common] Make m3u8 extraction for SMIL non fatal --- youtube_dl/extractor/common.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index b928e24be..9a5a7cc2c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1129,8 +1129,10 @@ class InfoExtractor(object): src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) if proto == 'm3u8' or src_ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src_url, video_id, ext or 'mp4', m3u8_id='hls')) + m3u8_formats = self._extract_m3u8_formats( + src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) continue if src_ext == 'f4m': From 4de6131090e0232c7cc99bcaafe6a3e71269b7af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Oct 2015 23:03:31 +0600 Subject: [PATCH 025/150] [extractor/common] Add fatal to _extract_f4m_formats --- youtube_dl/extractor/common.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 9a5a7cc2c..c2aa3a749 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -870,13 +870,18 @@ class InfoExtractor(object): time.sleep(timeout) def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None, - transform_source=lambda s: fix_xml_ampersands(s).strip()): + transform_source=lambda s: fix_xml_ampersands(s).strip(), + fatal=True): manifest = self._download_xml( manifest_url, video_id, 'Downloading f4m manifest', 'Unable to download f4m manifest', # Some manifests may be malformed, e.g. prosiebensat1 generated manifests # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244) - transform_source=transform_source) + transform_source=transform_source, + fatal=fatal) + + if manifest is False: + return manifest formats = [] manifest_version = '1.0' @@ -897,7 +902,10 @@ class InfoExtractor(object): # may differ leading to inability to resolve the format by requested # bitrate in f4m downloader if determine_ext(manifest_url) == 'f4m': - formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id)) + f4m_formats = self._extract_f4m_formats( + manifest_url, video_id, preference, f4m_id, fatal=fatal) + if f4m_formats: + formats.extend(f4m_formats) continue tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ From e5851b963ab0872f27f2db1cff3c89548b6bf95e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Oct 2015 23:04:56 +0600 Subject: [PATCH 026/150] [extractor/common] Make f4m extraction for SMIL non fatal --- youtube_dl/extractor/common.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index c2aa3a749..4fe2307cd 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1152,7 +1152,9 @@ class InfoExtractor(object): } f4m_url += '&' if '?' in f4m_url else '?' f4m_url += compat_urllib_parse.urlencode(f4m_params) - formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds')) + f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) continue if src_url.startswith('http') and self._is_valid_url(src, video_id): From 06c6efa9701d0527147875adf0caf437ffbb0397 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Oct 2015 23:10:36 +0600 Subject: [PATCH 027/150] [videolecturesnet] Add test video with broken direct format links --- youtube_dl/extractor/videolecturesnet.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py index 113a2289b..649ac9433 100644 --- a/youtube_dl/extractor/videolecturesnet.py +++ b/youtube_dl/extractor/videolecturesnet.py @@ -28,6 +28,21 @@ class VideoLecturesNetIE(InfoExtractor): 'duration': 565, 'thumbnail': 're:http://.*\.jpg', }, + }, { + # video with invalid direct format links (HTTP 403) + 'url': 'http://videolectures.net/russir2010_filippova_nlp/', + 'info_dict': { + 'id': 'russir2010_filippova_nlp', + 'ext': 'flv', + 'title': 'NLP at Google', + 'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3', + 'duration': 5352, + 'thumbnail': 're:http://.*\.jpg', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, }, { 'url': 'http://videolectures.net/deeplearning2015_montreal/', 'info_dict': { From 44451f22d5d1a5bed5f5851b27a963860813ecd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 2 Oct 2015 13:41:52 +0200 Subject: [PATCH 028/150] [naver] Remove unused import --- youtube_dl/extractor/naver.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index 35cbb3e6d..1f5fc2145 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -10,7 +10,6 @@ from ..compat import ( ) from ..utils import ( ExtractorError, - clean_html, ) From 7d0ada5ff907824c66c466ee9b83008210250d5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 2 Oct 2015 13:42:11 +0200 Subject: [PATCH 029/150] [test/helper] Fix style Use the correct indentation to please flake8 --- test/helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/helper.py b/test/helper.py index 28fd135b2..bdd7acca4 100644 --- a/test/helper.py +++ b/test/helper.py @@ -136,7 +136,7 @@ def expect_value(self, got, expected, field): self.assertEqual( type_expected, type_got, 'Type mismatch for list item at index %d for field %s, expected %r, got %r' % ( - index, field, type_expected, type_got)) + index, field, type_expected, type_got)) expect_value(self, item_got, item_expected, field) else: if isinstance(expected, compat_str) and expected.startswith('md5:'): From 0facd2af3ebfda68b79c7e2e1c575d73f9680802 Mon Sep 17 00:00:00 2001 From: fluks Date: Fri, 2 Oct 2015 04:08:13 +0300 Subject: [PATCH 030/150] Fix ruutu extractor bug If there's no resolution attribute in xml, only width gets a value, height doesn't and ValueError is raised. --- youtube_dl/extractor/ruutu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py index c67ad25ce..7720f1383 100644 --- a/youtube_dl/extractor/ruutu.py +++ b/youtube_dl/extractor/ruutu.py @@ -74,7 +74,7 @@ class RuutuIE(InfoExtractor): preference = -1 if proto == 'rtmp' else 1 label = child.get('label') tbr = int_or_none(child.get('bitrate')) - width, height = [int_or_none(x) for x in child.get('resolution', '').split('x')] + width, height = [int_or_none(x) for x in child.get('resolution', 'x').split('x')] formats.append({ 'format_id': '%s-%s' % (proto, label if label else tbr), 'url': video_url, From 59a9efe85b15e53c5928b7fdb810c150f5bf4b78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 2 Oct 2015 20:48:39 +0600 Subject: [PATCH 031/150] [ruutu] Limit resolution split to 2 pieces (Closes #7037, closes #7042) --- youtube_dl/extractor/ruutu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py index 7720f1383..a16b73ff4 100644 --- a/youtube_dl/extractor/ruutu.py +++ b/youtube_dl/extractor/ruutu.py @@ -74,7 +74,7 @@ class RuutuIE(InfoExtractor): preference = -1 if proto == 'rtmp' else 1 label = child.get('label') tbr = int_or_none(child.get('bitrate')) - width, height = [int_or_none(x) for x in child.get('resolution', 'x').split('x')] + width, height = [int_or_none(x) for x in child.get('resolution', 'x').split('x')[:2]] formats.append({ 'format_id': '%s-%s' % (proto, label if label else tbr), 'url': video_url, From 3bb3f0410822d3d21c6199bb8915b598990628e6 Mon Sep 17 00:00:00 2001 From: ngld Date: Wed, 12 Aug 2015 16:59:04 +0200 Subject: [PATCH 032/150] [europa] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/europa.py | 60 ++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 youtube_dl/extractor/europa.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a73a1317e..495a18c17 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -158,6 +158,7 @@ from .eroprofile import EroProfileIE from .escapist import EscapistIE from .espn import ESPNIE from .esri import EsriVideoIE +from .europa import EuropaIE from .everyonesmixtape import EveryonesMixtapeIE from .exfm import ExfmIE from .expotv import ExpoTVIE diff --git a/youtube_dl/extractor/europa.py b/youtube_dl/extractor/europa.py new file mode 100644 index 000000000..c437c4886 --- /dev/null +++ b/youtube_dl/extractor/europa.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + compat_urlparse, + xpath_text +) + + +class EuropaIE(InfoExtractor): + _VALID_URL = r'https?://ec\.europa\.eu/avservices/video/player\.cfm\?(?:[^&]|&(?!ref))*ref=(?P[A-Za-z0-9]+)' + _TEST = { + 'url': 'http://ec.europa.eu/avservices/video/player.cfm?ref=I107758', + 'md5': '728cca2fd41d5aa7350cec1141fbe620', + 'info_dict': { + 'id': 'I107758', + 'ext': 'mp4', + 'title': 'TRADE - Wikileaks on TTIP', + 'description': 'NEW LIVE EC Midday press briefing of 11/08/2015', + 'thumbnail': 're:^http://defiris\.ec\.streamcloud\.be/findmedia/18/107758/THUMB_[0-9A-Z]+\.jpg$' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + lang = query.get('sitelang', ['en'])[0] + + playlist = self._download_xml('http://ec.europa.eu/avservices/video/player/playlist.cfm?ID=' + video_id, video_id) + videos = {} + formats = [] + + for item in playlist.findall('info/title/item'): + videos[xpath_text(item, 'lg')] = {'title': xpath_text(item, 'label').strip()} + + for item in playlist.findall('info/description/item'): + videos[xpath_text(item, 'lg')]['description'] = xpath_text(item, 'label').strip() + + for item in playlist.findall('files/file'): + lg = xpath_text(item, 'lg') + vid = videos[lg] + vid['format_note'] = xpath_text(item, 'lglabel') + vid['url'] = xpath_text(item, 'url') + + if lg == lang: + vid['language_preference'] = 10 + + formats.append(vid) + + formats.reverse() + def_video = videos.get(lang, videos['int']) + + return { + 'id': video_id, + 'title': def_video['title'], + 'description': def_video['description'], + 'thumbnail': xpath_text(playlist, 'info/thumburl', 'thumburl'), + 'formats': formats + } From af17794c654bd24bbd5f47997596430b201ea08e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 2 Oct 2015 22:29:15 +0600 Subject: [PATCH 033/150] [europa] Improve extraction --- youtube_dl/extractor/europa.py | 94 ++++++++++++++++++++++------------ 1 file changed, 62 insertions(+), 32 deletions(-) diff --git a/youtube_dl/extractor/europa.py b/youtube_dl/extractor/europa.py index c437c4886..02ba8d63c 100644 --- a/youtube_dl/extractor/europa.py +++ b/youtube_dl/extractor/europa.py @@ -2,59 +2,89 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( - compat_urlparse, + int_or_none, + orderedSet, + parse_duration, + qualities, + unified_strdate, xpath_text ) class EuropaIE(InfoExtractor): - _VALID_URL = r'https?://ec\.europa\.eu/avservices/video/player\.cfm\?(?:[^&]|&(?!ref))*ref=(?P[A-Za-z0-9]+)' - _TEST = { + _VALID_URL = r'https?://ec\.europa\.eu/avservices/video/player\.cfm\?.*?\bref=(?P[A-Za-z0-9]+)' + _TESTS = [{ 'url': 'http://ec.europa.eu/avservices/video/player.cfm?ref=I107758', - 'md5': '728cca2fd41d5aa7350cec1141fbe620', + 'md5': '574f080699ddd1e19a675b0ddf010371', 'info_dict': { 'id': 'I107758', 'ext': 'mp4', 'title': 'TRADE - Wikileaks on TTIP', 'description': 'NEW LIVE EC Midday press briefing of 11/08/2015', - 'thumbnail': 're:^http://defiris\.ec\.streamcloud\.be/findmedia/18/107758/THUMB_[0-9A-Z]+\.jpg$' + 'thumbnail': 're:^https?://.*\.jpg$', + 'upload_date': '20150811', + 'duration': 34, + 'view_count': int, + 'formats': 'mincount:3', } - } + }, { + 'url': 'http://ec.europa.eu/avservices/video/player.cfm?sitelang=en&ref=I107786', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) + + playlist = self._download_xml( + 'http://ec.europa.eu/avservices/video/player/playlist.cfm?ID=%s' % video_id, video_id) + + def get_item(type_, preference): + items = {} + for item in playlist.findall('./info/%s/item' % type_): + lang, label = xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None) + if lang and label: + items[lang] = label.strip() + for p in preference: + if items.get(p): + return items[p] + query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) - lang = query.get('sitelang', ['en'])[0] + preferred_lang = query.get('sitelang', ('en', ))[0] + + preferred_langs = orderedSet((preferred_lang, 'en', 'int')) + + title = get_item('title', preferred_langs) or video_id + description = get_item('description', preferred_langs) + thumbnmail = xpath_text(playlist, './info/thumburl', 'thumbnail') + upload_date = unified_strdate(xpath_text(playlist, './info/date', 'upload date')) + duration = parse_duration(xpath_text(playlist, './info/duration', 'duration')) + view_count = int_or_none(xpath_text(playlist,'./info/views', 'views')) + + language_preference = qualities(preferred_langs[::-1]) - playlist = self._download_xml('http://ec.europa.eu/avservices/video/player/playlist.cfm?ID=' + video_id, video_id) - videos = {} formats = [] - - for item in playlist.findall('info/title/item'): - videos[xpath_text(item, 'lg')] = {'title': xpath_text(item, 'label').strip()} - - for item in playlist.findall('info/description/item'): - videos[xpath_text(item, 'lg')]['description'] = xpath_text(item, 'label').strip() - - for item in playlist.findall('files/file'): - lg = xpath_text(item, 'lg') - vid = videos[lg] - vid['format_note'] = xpath_text(item, 'lglabel') - vid['url'] = xpath_text(item, 'url') - - if lg == lang: - vid['language_preference'] = 10 - - formats.append(vid) - - formats.reverse() - def_video = videos.get(lang, videos['int']) + for file_ in playlist.findall('./files/file'): + video_url = xpath_text(file_, './url') + if not video_url: + continue + lang = xpath_text(file_, './lg') + formats.append({ + 'url': video_url, + 'format_id': lang, + 'format_note': xpath_text(file_, './lglabel'), + 'language_preference': language_preference(lang) + }) + self._sort_formats(formats) return { 'id': video_id, - 'title': def_video['title'], - 'description': def_video['description'], - 'thumbnail': xpath_text(playlist, 'info/thumburl', 'thumburl'), + 'title': title, + 'description': description, + 'thumbnail': thumbnmail, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, 'formats': formats } From f3b098fb90b985484d800fcdbfe18add2360e4df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 2 Oct 2015 23:22:53 +0600 Subject: [PATCH 034/150] [europa] Add support for audio URLs --- youtube_dl/extractor/europa.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/europa.py b/youtube_dl/extractor/europa.py index 02ba8d63c..9e33cacff 100644 --- a/youtube_dl/extractor/europa.py +++ b/youtube_dl/extractor/europa.py @@ -14,7 +14,7 @@ from ..utils import ( class EuropaIE(InfoExtractor): - _VALID_URL = r'https?://ec\.europa\.eu/avservices/video/player\.cfm\?.*?\bref=(?P[A-Za-z0-9]+)' + _VALID_URL = r'https?://ec\.europa\.eu/avservices/(?:video/player|audio/audioDetails)\.cfm\?.*?\bref=(?P[A-Za-z0-9-]+)' _TESTS = [{ 'url': 'http://ec.europa.eu/avservices/video/player.cfm?ref=I107758', 'md5': '574f080699ddd1e19a675b0ddf010371', @@ -32,6 +32,9 @@ class EuropaIE(InfoExtractor): }, { 'url': 'http://ec.europa.eu/avservices/video/player.cfm?sitelang=en&ref=I107786', 'only_matching': True, + }, { + 'url': 'http://ec.europa.eu/avservices/audio/audioDetails.cfm?ref=I-109295&sitelang=en', + 'only_matching': True, }] def _real_extract(self, url): From b203095d4c95d471bc2ac7045693c6938ee914d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 2 Oct 2015 22:40:35 +0200 Subject: [PATCH 035/150] [europa] Style fix: add whitespace after comma --- youtube_dl/extractor/europa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/europa.py b/youtube_dl/extractor/europa.py index 9e33cacff..adc43919e 100644 --- a/youtube_dl/extractor/europa.py +++ b/youtube_dl/extractor/europa.py @@ -63,7 +63,7 @@ class EuropaIE(InfoExtractor): thumbnmail = xpath_text(playlist, './info/thumburl', 'thumbnail') upload_date = unified_strdate(xpath_text(playlist, './info/date', 'upload date')) duration = parse_duration(xpath_text(playlist, './info/duration', 'duration')) - view_count = int_or_none(xpath_text(playlist,'./info/views', 'views')) + view_count = int_or_none(xpath_text(playlist, './info/views', 'views')) language_preference = qualities(preferred_langs[::-1]) From 5495937f461268a850a6a54d3fe19ed1f0f01eef Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 13 Sep 2015 20:00:10 +0800 Subject: [PATCH 036/150] [options] Cleanup double spaces in help texts --- youtube_dl/options.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 5eccc0a70..3dd6d290b 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -276,7 +276,7 @@ def parseOpts(overrideArguments=None): 'For example, to only match videos that have been liked more than ' '100 times and disliked less than 50 times (or the dislike ' 'functionality is not available at the given service), but who ' - 'also have a description, use --match-filter ' + 'also have a description, use --match-filter ' '"like_count > 100 & dislike_count Date: Sun, 13 Sep 2015 20:04:27 +0800 Subject: [PATCH 037/150] [compat] Allow overriding by only COLUMNS or LINES in compat_get_terminal_size Now the semantic of this function is identical to shutil.get_terminal_size() in Python 3.3+. The new behavior also corresponds to the old get_term_width(), which is removed in 003c69a84b68cadb46aeb8e03115848a722fd675 --- youtube_dl/compat.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 1ff42d94b..c36c9c23f 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -416,7 +416,7 @@ if hasattr(shutil, 'get_terminal_size'): # Python >= 3.3 else: _terminal_size = collections.namedtuple('terminal_size', ['columns', 'lines']) - def compat_get_terminal_size(): + def compat_get_terminal_size(fallback=(80, 24)): columns = compat_getenv('COLUMNS', None) if columns: columns = int(columns) @@ -428,14 +428,20 @@ else: else: lines = None - try: - sp = subprocess.Popen( - ['stty', 'size'], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = sp.communicate() - lines, columns = map(int, out.split()) - except Exception: - pass + if columns <= 0 or lines <= 0: + try: + sp = subprocess.Popen( + ['stty', 'size'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = sp.communicate() + _columns, _lines = map(int, out.split()) + except Exception: + _columns, _lines = _terminal_size(*fallback) + + if columns <= 0: + columns = _columns + if lines <= 0: + lines = _lines return _terminal_size(columns, lines) try: From bad84757eb135b85d5a1b29524a064d23ab4e1e9 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 13 Sep 2015 20:10:23 +0800 Subject: [PATCH 038/150] [doc] Better formatting of youtube-dl.1 (closes #6510) --- devscripts/prepare_manpage.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/devscripts/prepare_manpage.py b/devscripts/prepare_manpage.py index 7ece37754..776e6556e 100644 --- a/devscripts/prepare_manpage.py +++ b/devscripts/prepare_manpage.py @@ -8,6 +8,35 @@ import re ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) README_FILE = os.path.join(ROOT_DIR, 'README.md') + +def filter_options(readme): + ret = '' + in_options = False + for line in readme.split('\n'): + if line.startswith('# '): + if line[2:].startswith('OPTIONS'): + in_options = True + else: + in_options = False + + if in_options: + if line.lstrip().startswith('-'): + option, description = re.split(r'\s{2,}', line.lstrip()) + split_option = option.split(' ') + + if not split_option[-1].startswith('-'): # metavar + option = ' '.join(split_option[:-1] + ['*%s*' % split_option[-1]]) + + # Pandoc's definition_lists. See http://pandoc.org/README.html + # for more information. + ret += '\n%s\n: %s\n' % (option, description) + else: + ret += line.lstrip() + '\n' + else: + ret += line + '\n' + + return ret + with io.open(README_FILE, encoding='utf-8') as f: readme = f.read() @@ -26,6 +55,8 @@ readme = re.sub(r'(?s)^.*?(?=# DESCRIPTION)', '', readme) readme = re.sub(r'\s+youtube-dl \[OPTIONS\] URL \[URL\.\.\.\]', '', readme) readme = PREFIX + readme +readme = filter_options(readme) + if sys.version_info < (3, 0): print(readme.encode('utf-8')) else: From 97d5bfcba65c8575ab06a34e91fae30a5fda3161 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 3 Oct 2015 14:17:17 +0100 Subject: [PATCH 039/150] [engadget] accept short video urls --- youtube_dl/extractor/engadget.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py index 4ea37ebd9..e4180701d 100644 --- a/youtube_dl/extractor/engadget.py +++ b/youtube_dl/extractor/engadget.py @@ -10,7 +10,7 @@ from ..utils import ( class EngadgetIE(InfoExtractor): _VALID_URL = r'''(?x)https?://www.engadget.com/ - (?:video/5min/(?P\d+)| + (?:video(?:/5min)?/(?P\d+)| [\d/]+/.*?) ''' From 60d23e5e592aebe4a77dfb4ab70e87337967721c Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis Date: Sat, 3 Oct 2015 16:25:33 +0300 Subject: [PATCH 040/150] [tapely] Improve _VALID_URL --- youtube_dl/extractor/tapely.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tapely.py b/youtube_dl/extractor/tapely.py index f1f43d0a7..744f9db38 100644 --- a/youtube_dl/extractor/tapely.py +++ b/youtube_dl/extractor/tapely.py @@ -16,7 +16,7 @@ from ..utils import ( class TapelyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tape\.ly/(?P[A-Za-z0-9\-_]+)(?:/(?P\d+))?' + _VALID_URL = r'https?://(?:www\.)?(?:tape\.ly|tapely\.com)/(?P[A-Za-z0-9\-_]+)(?:/(?P\d+))?' _API_URL = 'http://tape.ly/showtape?id={0:}' _S3_SONG_URL = 'http://mytape.s3.amazonaws.com/{0:}' _SOUNDCLOUD_SONG_URL = 'http://api.soundcloud.com{0:}' @@ -42,6 +42,10 @@ class TapelyIE(InfoExtractor): 'ext': 'm4a', }, }, + { + 'url': 'https://tapely.com/my-grief-as-told-by-water', + 'only_matching': True, + }, ] def _real_extract(self, url): From ef5acfe32de4c995625f9800cfe0776237961436 Mon Sep 17 00:00:00 2001 From: remitamine Date: Tue, 1 Sep 2015 23:05:19 +0100 Subject: [PATCH 041/150] [limelight] Add new extractor --- youtube_dl/extractor/__init__.py | 5 + youtube_dl/extractor/limelight.py | 176 ++++++++++++++++++++++++++++++ 2 files changed, 181 insertions(+) create mode 100644 youtube_dl/extractor/limelight.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 495a18c17..20cc3660c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -295,6 +295,11 @@ from .lifenews import ( LifeNewsIE, LifeEmbedIE, ) +from .limelight import ( + LimeLightMediaIE, + LimeLightChannelIE, + LimeLightChannelListIE, +) from .liveleak import LiveLeakIE from .livestream import ( LivestreamIE, diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py new file mode 100644 index 000000000..dcfc215c7 --- /dev/null +++ b/youtube_dl/extractor/limelight.py @@ -0,0 +1,176 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + determine_ext, +) + + +class LimeLightBaseIE(InfoExtractor): + + def get_playlist_service(self, id, method): + return self._download_json(self.PLAYLIST_SERVICE_URL % (id, method), id) + + def get_api(self, orgId, id, method): + return self._download_json(self.API_URL % (orgId, id, method), id) + + def process_data(self, mobileUrls, streams, properties): + video_id = properties['media_id'] + formats = [] + + for mobileUrl in mobileUrls: + if '.m3u8' in mobileUrl['mobileUrl']: + formats.extend(self._extract_m3u8_formats(mobileUrl['mobileUrl'], video_id)) + else: + formats.append({'url': mobileUrl['mobileUrl']}) + + for stream in streams: + if '.f4m' in stream['url']: + formats.extend(self._extract_f4m_formats(stream['url'], video_id)) + else: + fmt = { + 'url': stream.get('url'), + 'abr': stream.get('audioBitRate'), + 'vbr': stream.get('videoBitRate'), + 'fps': stream.get('videoFrameRate'), + 'width': stream.get('videoWidthInPixels'), + 'height': stream.get('videoHeightInPixels'), + 'ext': determine_ext(stream.get('url')) + } + rtmp = re.search(r'^(?Prtmp://[^/]+/(?P.+))/(?Pmp4:.+)$', stream['url']) + if rtmp: + fmt.update({ + 'url': rtmp.group('url'), + 'play_path': rtmp.group('playpath'), + 'app': rtmp.group('app'), + }) + formats.append(fmt) + + self._sort_formats(formats) + + title = properties['title'] + description = properties.get('description') + timestamp = properties.get('create_date') + duration = int_or_none(properties.get('duration_in_milliseconds')) + filesize = properties.get('total_storage_in_bytes') + categories = [properties.get('category')] + thumbnails = [{ + 'url': thumbnail.get('url'), + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + } for thumbnail in properties.get('thumbnails')] + subtitles = {caption.get('language_code'): [{'url': caption.get('url')}] for caption in properties.get('captions')} + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'timestamp': timestamp, + 'duration': duration, + 'filesize': filesize, + 'categories': categories, + 'thumbnails': thumbnails, + 'subtitles': subtitles, + } + + +class LimeLightMediaIE(LimeLightBaseIE): + IE_NAME = 'limelight' + _VALID_URL = r'http://link\.videoplatform\.limelight\.com/media/?.*mediaId=(?P[a-z0-9]{32})' + _TEST = { + 'url': 'http://link.videoplatform.limelight.com/media/?mediaId=3ffd040b522b4485b6d84effc750cd86', + 'md5': '3213605088be599705677ef785db6972', + 'info_dict': { + 'id': '3ffd040b522b4485b6d84effc750cd86', + 'ext': 'mp4', + 'title': 'HaP and the HB Prince Trailer', + 'description': 'As Harry Potter begins his 6th year at Hogwarts School of Witchcraft and Wizardry, he discovers an old book marked mysteriously "This book is the property of the Half-Blood Prince" and begins to learn more about Lord Voldemort\'s dark past.', + 'thumbnail': 're:^https?://.*\.jpeg$', + 'duration': 144230, + 'timestamp': 1244136834, + "upload_date": "20090604", + } + } + PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/media/%s/%s' + API_URL = 'http://api.video.limelight.com/rest/organizations/%s/media/%s/%s.json' + + def _real_extract(self, url): + video_id = self._match_id(url) + + mobile_json_data = self.get_playlist_service(video_id, 'getMobilePlaylistByMediaId') + pc_json_data = self.get_playlist_service(video_id, 'getPlaylistByMediaId') + properties = self.get_api(pc_json_data['orgId'], video_id, 'properties') + + return self.process_data(mobile_json_data['mediaList'][0]['mobileUrls'], pc_json_data['playlistItems'][0]['streams'], properties) + + +class LimeLightChannelIE(LimeLightBaseIE): + IE_NAME = 'limelight:channel' + _VALID_URL = r'http://link\.videoplatform\.limelight\.com/media/?.*channelId=(?P[a-z0-9]{32})' + _TEST = { + 'url': 'http://link.videoplatform.limelight.com/media/?channelId=ab6a524c379342f9b23642917020c082', + 'info_dict': { + 'id': 'ab6a524c379342f9b23642917020c082', + 'title': 'Javascript Sample Code', + }, + 'playlist_mincount': 3, + } + PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/channel/%s/%s' + API_URL = 'http://api.video.limelight.com/rest/organizations/%s/channels/%s/%s.json' + + def _real_extract(self, url): + channel_id = self._match_id(url) + + mobile_json_data = self.get_playlist_service(channel_id, 'getMobilePlaylistWithNItemsByChannelId?begin=0&count=-1') + pc_json_data = self.get_playlist_service(channel_id, 'getPlaylistByChannelId') + medias = self.get_api(pc_json_data['orgId'], channel_id, 'media') + + entries = [] + for i in range(len(medias['media_list'])): + entries.append(self.process_data(mobile_json_data['mediaList'][i]['mobileUrls'], pc_json_data['playlistItems'][i]['streams'], medias['media_list'][i])) + + return { + 'id': channel_id, + 'title': pc_json_data['title'], + 'entries': entries, + '_type': 'playlist', + } + + +class LimeLightChannelListIE(LimeLightBaseIE): + IE_NAME = 'limelight:channel_list' + _VALID_URL = r'http://link\.videoplatform\.limelight\.com/media/?.*channelListId=(?P[a-z0-9]{32})' + _TEST = { + 'url': 'http://link.videoplatform.limelight.com/media/?channelListId=301b117890c4465c8179ede21fd92e2b', + 'info_dict': { + 'id': '301b117890c4465c8179ede21fd92e2b', + 'title': 'Website - Hero Player', + }, + 'playlist_mincount': 2, + } + PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/channel_list/%s/%s' + + def _real_extract(self, url): + channel_list_id = self._match_id(url) + + json_data = self.get_playlist_service(channel_list_id, 'getMobileChannelListById') + + entries = [] + for channel in json_data['channelList']: + entries.append({ + 'url': 'http://link.videoplatform.limelight.com/media/?channelId=%s' % channel['id'], + '_type': 'url', + 'ie_key': 'LimeLightChannel', + }) + + return { + 'id': channel_list_id, + 'title': json_data['title'], + 'entries': entries, + '_type': 'playlist', + } From 4bba371644818d79b5f8481b5b31c53ea8ecbcc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 4 Oct 2015 20:33:42 +0600 Subject: [PATCH 042/150] [YoutubeDL] Autocalculate ext for subtitles when missing --- youtube_dl/YoutubeDL.py | 11 +++++++++-- youtube_dl/extractor/common.py | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index d65253882..adf70d658 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1232,13 +1232,20 @@ class YoutubeDL(object): except (ValueError, OverflowError, OSError): pass + subtitles = info_dict.get('subtitles') + if subtitles: + for _, subtitle in subtitles.items(): + for subtitle_format in subtitle: + if 'ext' not in subtitle_format: + subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() + if self.params.get('listsubtitles', False): if 'automatic_captions' in info_dict: self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions') - self.list_subtitles(info_dict['id'], info_dict.get('subtitles'), 'subtitles') + self.list_subtitles(info_dict['id'], subtitles, 'subtitles') return info_dict['requested_subtitles'] = self.process_subtitles( - info_dict['id'], info_dict.get('subtitles'), + info_dict['id'], subtitles, info_dict.get('automatic_captions')) # We now pick which formats have to be downloaded diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 4fe2307cd..dbae75406 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -165,6 +165,7 @@ class InfoExtractor(object): with the "ext" entry and one of: * "data": The subtitles file contents * "url": A URL pointing to the subtitles file + "ext" will be calculated from URL if missing automatic_captions: Like 'subtitles', used by the YoutubeIE for automatically generated captions duration: Length of the video in seconds, as an integer. From d7fc56318b72607758e7484c22076ec2999f10b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 4 Oct 2015 20:41:57 +0600 Subject: [PATCH 043/150] [limelight] Fix python 2.6, simplify, make more robust (Closes #6734) --- youtube_dl/extractor/__init__.py | 6 +- youtube_dl/extractor/limelight.py | 185 ++++++++++++++++++------------ 2 files changed, 113 insertions(+), 78 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 20cc3660c..3ace1cc2c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -296,9 +296,9 @@ from .lifenews import ( LifeEmbedIE, ) from .limelight import ( - LimeLightMediaIE, - LimeLightChannelIE, - LimeLightChannelListIE, + LimelightMediaIE, + LimelightChannelIE, + LimelightChannelListIE, ) from .liveleak import LiveLeakIE from .livestream import ( diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index dcfc215c7..599d8413d 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -5,65 +5,105 @@ import re from .common import InfoExtractor from ..utils import ( - int_or_none, determine_ext, + float_or_none, + int_or_none, ) -class LimeLightBaseIE(InfoExtractor): +class LimelightBaseIE(InfoExtractor): + _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s' + _API_URL = 'http://api.video.limelight.com/rest/organizations/%s/%s/%s/%s.json' - def get_playlist_service(self, id, method): - return self._download_json(self.PLAYLIST_SERVICE_URL % (id, method), id) + def _call_playlist_service(self, item_id, method, fatal=True): + return self._download_json( + self._PLAYLIST_SERVICE_URL % (self._PLAYLIST_SERVICE_PATH, item_id, method), + item_id, 'Downloading PlaylistService %s JSON' % method, fatal=fatal) - def get_api(self, orgId, id, method): - return self._download_json(self.API_URL % (orgId, id, method), id) + def _call_api(self, organization_id, item_id, method): + return self._download_json( + self._API_URL % (organization_id, self._API_PATH, item_id, method), + item_id, 'Downloading API %s JSON' % method) - def process_data(self, mobileUrls, streams, properties): + def _extract(self, item_id, pc_method, mobile_method, meta_method): + pc = self._call_playlist_service(item_id, pc_method) + metadata = self._call_api(pc['orgId'], item_id, meta_method) + mobile = self._call_playlist_service(item_id, mobile_method, fatal=False) + return pc, mobile, metadata + + def _extract_info(self, streams, mobile_urls, properties): video_id = properties['media_id'] formats = [] - for mobileUrl in mobileUrls: - if '.m3u8' in mobileUrl['mobileUrl']: - formats.extend(self._extract_m3u8_formats(mobileUrl['mobileUrl'], video_id)) - else: - formats.append({'url': mobileUrl['mobileUrl']}) - for stream in streams: - if '.f4m' in stream['url']: - formats.extend(self._extract_f4m_formats(stream['url'], video_id)) + stream_url = stream.get('url') + if not stream_url: + continue + if '.f4m' in stream_url: + formats.extend(self._extract_f4m_formats(stream_url, video_id)) else: fmt = { - 'url': stream.get('url'), - 'abr': stream.get('audioBitRate'), - 'vbr': stream.get('videoBitRate'), - 'fps': stream.get('videoFrameRate'), - 'width': stream.get('videoWidthInPixels'), - 'height': stream.get('videoHeightInPixels'), - 'ext': determine_ext(stream.get('url')) + 'url': stream_url, + 'abr': float_or_none(stream.get('audioBitRate')), + 'vbr': float_or_none(stream.get('videoBitRate')), + 'fps': float_or_none(stream.get('videoFrameRate')), + 'width': int_or_none(stream.get('videoWidthInPixels')), + 'height': int_or_none(stream.get('videoHeightInPixels')), + 'ext': determine_ext(stream_url) } - rtmp = re.search(r'^(?Prtmp://[^/]+/(?P.+))/(?Pmp4:.+)$', stream['url']) + rtmp = re.search(r'^(?Prtmpe?://[^/]+/(?P.+))/(?Pmp4:.+)$', stream_url) if rtmp: + format_id = 'rtmp' + if stream.get('videoBitRate'): + format_id += '-%d' % int_or_none(stream['videoBitRate']) fmt.update({ 'url': rtmp.group('url'), 'play_path': rtmp.group('playpath'), 'app': rtmp.group('app'), + 'ext': 'flv', + 'format_id': format_id, }) formats.append(fmt) + for mobile_url in mobile_urls: + media_url = mobile_url.get('mobileUrl') + if not media_url: + continue + format_id = mobile_url.get('targetMediaPlatform') + if determine_ext(media_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + media_url, video_id, 'mp4', entry_protocol='m3u8_native', + preference=-1, m3u8_id=format_id)) + else: + formats.append({ + 'url': media_url, + 'format_id': format_id, + 'preference': -1, + }) + self._sort_formats(formats) title = properties['title'] description = properties.get('description') - timestamp = properties.get('create_date') - duration = int_or_none(properties.get('duration_in_milliseconds')) - filesize = properties.get('total_storage_in_bytes') + timestamp = int_or_none(properties.get('publish_date') or properties.get('create_date')) + duration = float_or_none(properties.get('duration_in_milliseconds'), 1000) + filesize = int_or_none(properties.get('total_storage_in_bytes')) categories = [properties.get('category')] + tags = properties.get('tags', []) thumbnails = [{ - 'url': thumbnail.get('url'), + 'url': thumbnail['url'], 'width': int_or_none(thumbnail.get('width')), 'height': int_or_none(thumbnail.get('height')), - } for thumbnail in properties.get('thumbnails')] - subtitles = {caption.get('language_code'): [{'url': caption.get('url')}] for caption in properties.get('captions')} + } for thumbnail in properties.get('thumbnails', []) if thumbnail.get('url')] + + subtitles = {} + for caption in properties.get('captions', {}): + lang = caption.get('language_code') + subtitles_url = caption.get('url') + if lang and subtitles_url: + subtitles[lang] = [{ + 'url': subtitles_url, + }] return { 'id': video_id, @@ -74,44 +114,50 @@ class LimeLightBaseIE(InfoExtractor): 'duration': duration, 'filesize': filesize, 'categories': categories, + 'tags': tags, 'thumbnails': thumbnails, 'subtitles': subtitles, } -class LimeLightMediaIE(LimeLightBaseIE): +class LimelightMediaIE(LimelightBaseIE): IE_NAME = 'limelight' - _VALID_URL = r'http://link\.videoplatform\.limelight\.com/media/?.*mediaId=(?P[a-z0-9]{32})' + _VALID_URL = r'(?:limelight:media:|http://link\.videoplatform\.limelight\.com/media/\??\bmediaId=)(?P[a-z0-9]{32})' _TEST = { 'url': 'http://link.videoplatform.limelight.com/media/?mediaId=3ffd040b522b4485b6d84effc750cd86', - 'md5': '3213605088be599705677ef785db6972', 'info_dict': { 'id': '3ffd040b522b4485b6d84effc750cd86', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'HaP and the HB Prince Trailer', 'description': 'As Harry Potter begins his 6th year at Hogwarts School of Witchcraft and Wizardry, he discovers an old book marked mysteriously "This book is the property of the Half-Blood Prince" and begins to learn more about Lord Voldemort\'s dark past.', 'thumbnail': 're:^https?://.*\.jpeg$', - 'duration': 144230, + 'duration': 144.23, 'timestamp': 1244136834, - "upload_date": "20090604", - } + 'upload_date': '20090604', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, } - PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/media/%s/%s' - API_URL = 'http://api.video.limelight.com/rest/organizations/%s/media/%s/%s.json' + _PLAYLIST_SERVICE_PATH = 'media' + _API_PATH = 'media' def _real_extract(self, url): video_id = self._match_id(url) - mobile_json_data = self.get_playlist_service(video_id, 'getMobilePlaylistByMediaId') - pc_json_data = self.get_playlist_service(video_id, 'getPlaylistByMediaId') - properties = self.get_api(pc_json_data['orgId'], video_id, 'properties') + pc, mobile, metadata = self._extract( + video_id, 'getPlaylistByMediaId', 'getMobilePlaylistByMediaId', 'properties') - return self.process_data(mobile_json_data['mediaList'][0]['mobileUrls'], pc_json_data['playlistItems'][0]['streams'], properties) + return self._extract_info( + pc['playlistItems'][0].get('streams', []), + mobile['mediaList'][0].get('mobileUrls', []) if mobile else [], + metadata) -class LimeLightChannelIE(LimeLightBaseIE): +class LimelightChannelIE(LimelightBaseIE): IE_NAME = 'limelight:channel' - _VALID_URL = r'http://link\.videoplatform\.limelight\.com/media/?.*channelId=(?P[a-z0-9]{32})' + _VALID_URL = r'(?:limelight:channel:|http://link\.videoplatform\.limelight\.com/media/\??\bchannelId=)(?P[a-z0-9]{32})' _TEST = { 'url': 'http://link.videoplatform.limelight.com/media/?channelId=ab6a524c379342f9b23642917020c082', 'info_dict': { @@ -120,31 +166,29 @@ class LimeLightChannelIE(LimeLightBaseIE): }, 'playlist_mincount': 3, } - PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/channel/%s/%s' - API_URL = 'http://api.video.limelight.com/rest/organizations/%s/channels/%s/%s.json' + _PLAYLIST_SERVICE_PATH = 'channel' + _API_PATH = 'channels' def _real_extract(self, url): channel_id = self._match_id(url) - mobile_json_data = self.get_playlist_service(channel_id, 'getMobilePlaylistWithNItemsByChannelId?begin=0&count=-1') - pc_json_data = self.get_playlist_service(channel_id, 'getPlaylistByChannelId') - medias = self.get_api(pc_json_data['orgId'], channel_id, 'media') + pc, mobile, medias = self._extract( + channel_id, 'getPlaylistByChannelId', + 'getMobilePlaylistWithNItemsByChannelId?begin=0&count=-1', 'media') - entries = [] - for i in range(len(medias['media_list'])): - entries.append(self.process_data(mobile_json_data['mediaList'][i]['mobileUrls'], pc_json_data['playlistItems'][i]['streams'], medias['media_list'][i])) + entries = [ + self._extract_info( + pc['playlistItems'][i].get('streams', []), + mobile['mediaList'][i].get('mobileUrls', []) if mobile else [], + medias['media_list'][i]) + for i in range(len(medias['media_list']))] - return { - 'id': channel_id, - 'title': pc_json_data['title'], - 'entries': entries, - '_type': 'playlist', - } + return self.playlist_result(entries, channel_id, pc['title']) -class LimeLightChannelListIE(LimeLightBaseIE): +class LimelightChannelListIE(LimelightBaseIE): IE_NAME = 'limelight:channel_list' - _VALID_URL = r'http://link\.videoplatform\.limelight\.com/media/?.*channelListId=(?P[a-z0-9]{32})' + _VALID_URL = r'(?:limelight:channel_list:|http://link\.videoplatform\.limelight\.com/media/\?.*?\bchannelListId=)(?P[a-z0-9]{32})' _TEST = { 'url': 'http://link.videoplatform.limelight.com/media/?channelListId=301b117890c4465c8179ede21fd92e2b', 'info_dict': { @@ -153,24 +197,15 @@ class LimeLightChannelListIE(LimeLightBaseIE): }, 'playlist_mincount': 2, } - PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/channel_list/%s/%s' + _PLAYLIST_SERVICE_PATH = 'channel_list' def _real_extract(self, url): channel_list_id = self._match_id(url) - json_data = self.get_playlist_service(channel_list_id, 'getMobileChannelListById') + channel_list = self._call_playlist_service(channel_list_id, 'getMobileChannelListById') - entries = [] - for channel in json_data['channelList']: - entries.append({ - 'url': 'http://link.videoplatform.limelight.com/media/?channelId=%s' % channel['id'], - '_type': 'url', - 'ie_key': 'LimeLightChannel', - }) + entries = [ + self.url_result('limelight:channel:%s' % channel['id'], 'LimelightChannel') + for channel in channel_list['channelList']] - return { - 'id': channel_list_id, - 'title': json_data['title'], - 'entries': entries, - '_type': 'playlist', - } + return self.playlist_result(entries, channel_list_id, channel_list['title']) From 9c544e2537abda1d65e96f2b33a79984f3ab7c10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 4 Oct 2015 20:48:44 +0600 Subject: [PATCH 044/150] [limelight] Add test video with subtitles --- youtube_dl/extractor/limelight.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index 599d8413d..fb03dd527 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -123,13 +123,13 @@ class LimelightBaseIE(InfoExtractor): class LimelightMediaIE(LimelightBaseIE): IE_NAME = 'limelight' _VALID_URL = r'(?:limelight:media:|http://link\.videoplatform\.limelight\.com/media/\??\bmediaId=)(?P[a-z0-9]{32})' - _TEST = { + _TESTS = [{ 'url': 'http://link.videoplatform.limelight.com/media/?mediaId=3ffd040b522b4485b6d84effc750cd86', 'info_dict': { 'id': '3ffd040b522b4485b6d84effc750cd86', 'ext': 'flv', 'title': 'HaP and the HB Prince Trailer', - 'description': 'As Harry Potter begins his 6th year at Hogwarts School of Witchcraft and Wizardry, he discovers an old book marked mysteriously "This book is the property of the Half-Blood Prince" and begins to learn more about Lord Voldemort\'s dark past.', + 'description': 'md5:8005b944181778e313d95c1237ddb640', 'thumbnail': 're:^https?://.*\.jpeg$', 'duration': 144.23, 'timestamp': 1244136834, @@ -139,7 +139,25 @@ class LimelightMediaIE(LimelightBaseIE): # rtmp download 'skip_download': True, }, - } + }, { + # video with subtitles + 'url': 'limelight:media:a3e00274d4564ec4a9b29b9466432335', + 'info_dict': { + 'id': 'a3e00274d4564ec4a9b29b9466432335', + 'ext': 'flv', + 'title': '3Play Media Overview Video', + 'description': '', + 'thumbnail': 're:^https?://.*\.jpeg$', + 'duration': 78.101, + 'timestamp': 1338929955, + 'upload_date': '20120605', + 'subtitles': 'mincount:9', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }] _PLAYLIST_SERVICE_PATH = 'media' _API_PATH = 'media' From 0659dfccfea9df3206c476e83a2b090456c25a83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 4 Oct 2015 21:13:13 +0600 Subject: [PATCH 045/150] [pbs] Improve player regex (Closes #7059) --- youtube_dl/extractor/pbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 683c81de3..0bca3152b 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -167,7 +167,7 @@ class PBSIE(InfoExtractor): return media_id, presumptive_id, upload_date url = self._search_regex( - r']*\s+src=["\']([^\'"]+partnerplayer[^\'"]+)["\']', + r'(?s)]+?(?:[a-z-]+?=["\'].+?["\'][^>]+?)*?\bsrc=["\']([^\'"]+partnerplayer[^\'"]+)["\']', webpage, 'player URL') mobj = re.match(self._VALID_URL, url) From 96229998c29705c8ee4230915ec7ff050bcfecf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 4 Oct 2015 21:19:47 +0600 Subject: [PATCH 046/150] [pbs] Allow empty attribute in player regex --- youtube_dl/extractor/pbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 0bca3152b..66b3dda47 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -167,7 +167,7 @@ class PBSIE(InfoExtractor): return media_id, presumptive_id, upload_date url = self._search_regex( - r'(?s)]+?(?:[a-z-]+?=["\'].+?["\'][^>]+?)*?\bsrc=["\']([^\'"]+partnerplayer[^\'"]+)["\']', + r'(?s)]+?(?:[a-z-]+?=["\'].*?["\'][^>]+?)*?\bsrc=["\']([^\'"]+partnerplayer[^\'"]+)["\']', webpage, 'player URL') mobj = re.match(self._VALID_URL, url) From 90ab741e909c949039e31805da04f5e546a1a8c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 4 Oct 2015 21:37:49 +0600 Subject: [PATCH 047/150] [pbs] Add test for #7059 --- youtube_dl/extractor/pbs.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 66b3dda47..6923c6094 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -134,6 +134,24 @@ class PBSIE(InfoExtractor): 'params': { 'skip_download': True, # requires ffmpeg }, + }, + { + # Video embedded in iframe containing angle brackets as attribute's value (e.g. + # "