From 62b1a974962af7202051b32610007f4a5b679423 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 15 Jan 2016 01:11:04 +0100 Subject: [PATCH 1/7] [YoutubeDL] add basic support for multipart videos --- youtube_dl/YoutubeDL.py | 46 +++++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index d50b7cfed..88e932bca 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1274,8 +1274,19 @@ class YoutubeDL(object): # We check that all the formats have the format and format_id fields for i, format in enumerate(formats): - if 'url' not in format: - raise ExtractorError('Missing "url" key in result (index %d)' % i) + if 'parts' in format: + if len(format['parts']) == 0: + raise ExtractorError('Empty "parts" array in result (index %d)' % i) + elif len(format['parts']) == 1: + format.update(format['parts'][0]) + del format['parts'] + if 'parts' in format: + for j, part in enumerate(format['parts']): + if 'url' not in part: + raise ExtractorError('Missing "url" key in result (index %d, part %d)' % (i, j)) + else: + if 'url' not in format: + raise ExtractorError('Missing "url" key in result (index %d)' % i) if format.get('format_id') is None: format['format_id'] = compat_str(i) @@ -1302,9 +1313,15 @@ class YoutubeDL(object): format['ext'] = determine_ext(format['url']).lower() # Add HTTP headers, so that external programs can use them from the # json output - full_format_info = info_dict.copy() - full_format_info.update(format) - format['http_headers'] = self._calc_headers(full_format_info) + if 'parts' in format: + for part in format['parts']: + full_format_info = info_dict.copy() + full_format_info.update(part) + part['http_headers'] = self._calc_headers(full_format_info) + else: + full_format_info = info_dict.copy() + full_format_info.update(format) + format['http_headers'] = self._calc_headers(full_format_info) # TODO Central sorting goes here @@ -1430,12 
+1447,19 @@ class YoutubeDL(object): if self.params.get('forceid', False): self.to_stdout(info_dict['id']) if self.params.get('forceurl', False): + def print_format_url(format_info): + if 'parts' in format_info: + for f in format_info['parts']: + self.to_stdout(f['url'] + f.get('play_path', '')) + self.to_stdout('') + else: + self.to_stdout(format_info['url'] + format_info.get('play_path', '')) if info_dict.get('requested_formats') is not None: for f in info_dict['requested_formats']: - self.to_stdout(f['url'] + f.get('play_path', '')) + print_format_url(f) else: # For RTMP URLs, also include the playpath - self.to_stdout(info_dict['url'] + info_dict.get('play_path', '')) + print_format_url(info_dict) if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None: self.to_stdout(info_dict['thumbnail']) if self.params.get('forcedescription', False) and info_dict.get('description') is not None: @@ -1546,8 +1570,8 @@ class YoutubeDL(object): fd = get_suitable_downloader(info, self.params)(self, self.params) for ph in self._progress_hooks: fd.add_progress_hook(ph) - if self.params.get('verbose'): - self.to_stdout('[debug] Invoking downloader on %r' % info.get('url')) + if self.params.get('verbose') and 'url' in info: + self.to_stdout('[debug] Invoking downloader on %r' % info['url']) return fd.download(name, info) if info_dict.get('requested_formats') is not None: @@ -1839,6 +1863,10 @@ class YoutubeDL(object): if res: res += ', ' res += '~' + format_bytes(fdict['filesize_approx']) + if fdict.get('parts'): + if res: + res += ', ' + res += '%d parts' % len(fdict['parts']) return res def list_formats(self, info_dict): From f48137f157c74a48e93d24196e0e59779d787e96 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 15 Jan 2016 01:13:32 +0100 Subject: [PATCH 2/7] [multipart] add simple implementation for a multipart downloader --- youtube_dl/downloader/__init__.py | 2 ++ youtube_dl/downloader/multipart.py | 29 +++++++++++++++++++++++++++++ 
youtube_dl/utils.py | 2 ++ 3 files changed, 33 insertions(+) create mode 100644 youtube_dl/downloader/multipart.py diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index dccc59212..d654028d3 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -9,6 +9,7 @@ from .http import HttpFD from .rtsp import RtspFD from .rtmp import RtmpFD from .dash import DashSegmentsFD +from .multipart import MultiPartFD from ..utils import ( determine_protocol, @@ -22,6 +23,7 @@ PROTOCOL_MAP = { 'rtsp': RtspFD, 'f4m': F4mFD, 'http_dash_segments': DashSegmentsFD, + 'multipart': MultiPartFD, } diff --git a/youtube_dl/downloader/multipart.py b/youtube_dl/downloader/multipart.py new file mode 100644 index 000000000..b955cb14a --- /dev/null +++ b/youtube_dl/downloader/multipart.py @@ -0,0 +1,29 @@ +from __future__ import unicode_literals + +from .common import FileDownloader + +from ..utils import prepend_extension + + +class MultiPartFD(FileDownloader): + + FD_NAME = 'multipart' + + def real_download(self, filename, info_dict): + parts = info_dict['parts'] + self.to_screen('[%s] Total parts: %d' % (self.FD_NAME, len(parts))) + + for i in range(len(parts)): + fd = get_suitable_downloader(parts[i], self.params)(self.ydl, self.params) + frag_filename = prepend_extension(filename, 'part%d' % i) + success = fd.download(frag_filename, parts[i]) + if not success: + return False + # We only download the first fragment during the test + if self.params.get('test', False): + break + + return True + +# workaround circular imports +from .__init__ import get_suitable_downloader diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9c1c0e0bd..e1a0dd91d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1891,6 +1891,8 @@ def determine_protocol(info_dict): if protocol is not None: return protocol + if 'parts' in info_dict: + return 'multipart' url = info_dict['url'] if url.startswith('rtmp'): return 'rtmp' From 
799e01d70914b259097b77a31a9bcaf1244dfbfb Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 15 Jan 2016 01:19:40 +0100 Subject: [PATCH 3/7] [common] calculate format metadata if they exist in parts array --- youtube_dl/extractor/common.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index b05b22a94..42886d1b2 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -789,10 +789,28 @@ class InfoExtractor(object): raise ExtractorError('No video formats found') def _formats_key(f): + if 'parts' in f: + for part_key, format_key in [('duration', 'duration'), ('filesize', 'filesize_approx')]: + if format_key in f: + continue + total = 0 + for part in f['parts']: + value = part.get(part_key) + if not value: + total = None + break + total += value + f[format_key] = total # TODO remove the following workaround from ..utils import determine_ext - if not f.get('ext') and 'url' in f: - f['ext'] = determine_ext(f['url']) + if not f.get('ext'): + if 'url' in f: + f['ext'] = determine_ext(f['url']) + elif 'parts' in f: + for part in f['parts']: + if 'url' in part: + f['ext'] = determine_ext(part['url']) + break if isinstance(field_preference, (list, tuple)): return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference) From 40de7b914e650518a13ea033b2e6028640760e35 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 15 Jan 2016 01:22:13 +0100 Subject: [PATCH 4/7] [youku] fix multipart formats extraction(fixes #6193) --- youtube_dl/extractor/youku.py | 60 +++++++++++++++-------------------- 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index f767fa15f..d9bb46ca5 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -14,6 +14,8 @@ from ..compat import ( from ..utils import ( ExtractorError, sanitized_Request, + 
int_or_none, + float_or_none, ) @@ -66,7 +68,7 @@ class YoukuIE(InfoExtractor): }, }] - def construct_video_urls(self, data): + def construct_formats(self, data): # get sid, token def yk_t(s1, s2): ls = list(range(256)) @@ -114,11 +116,10 @@ class YoukuIE(InfoExtractor): ep = base64.b64encode(ep_t).decode('ascii') return ep - # generate video_urls - video_urls_dict = {} + formats = [] for stream in data['stream']: format = stream.get('stream_type') - video_urls = [] + parts = [] for dt in stream['segs']: n = str(stream['segs'].index(dt)) param = { @@ -139,10 +140,20 @@ class YoukuIE(InfoExtractor): '/st/' + self.parse_ext_l(format) + \ '/fileid/' + get_fileid(format, n) + '?' + \ compat_urllib_parse.urlencode(param) - video_urls.append(video_url) - video_urls_dict[format] = video_urls - - return video_urls_dict + parts.append({ + 'url': video_url, + 'filesize': int_or_none(dt.get('size')), + }) + formats.append({ + 'format_id': self.get_format_name(format), + 'parts': parts, + 'width': int_or_none(stream.get('width')), + 'height': int_or_none(stream.get('height')), + 'filesize': int_or_none(stream.get('size')), + 'ext': self.parse_ext_l(format), + }) + self._sort_formats(formats) + return formats @staticmethod def get_ysuid(): @@ -235,34 +246,13 @@ class YoukuIE(InfoExtractor): msg += ': ' + error_note raise ExtractorError(msg) - # get video title - title = data['video']['title'] - - # generate video_urls_dict - video_urls_dict = self.construct_video_urls(data) - - # construct info - entries = [{ - 'id': '%s_part%d' % (video_id, i + 1), - 'title': title, - 'formats': [], - # some formats are not available for all parts, we have to detect - # which one has all - } for i in range(max(len(v.get('segs')) for v in data['stream']))] - for stream in data['stream']: - fm = stream.get('stream_type') - video_urls = video_urls_dict[fm] - for video_url, seg, entry in zip(video_urls, stream['segs'], entries): - entry['formats'].append({ - 'url': video_url, - 'format_id': 
self.get_format_name(fm), - 'ext': self.parse_ext_l(fm), - 'filesize': int(seg['size']), - }) + video_data = data['video'] return { - '_type': 'multi_video', 'id': video_id, - 'title': title, - 'entries': entries, + 'title': video_data['title'], + 'duration': float_or_none(video_data.get('seconds')), + 'uploader': video_data.get('username'), + 'uploader_id': video_data.get('userid'), + 'formats': self.construct_formats(data), } From 3f97d9fd5984dd5548d1408dcc44e46f53d806a1 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 15 Jan 2016 01:23:39 +0100 Subject: [PATCH 5/7] [iqiyi] fix multipart formats extraction --- youtube_dl/extractor/iqiyi.py | 94 ++++++++++++++--------------------- 1 file changed, 37 insertions(+), 57 deletions(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 66a70a181..463c7a4e7 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -6,10 +6,14 @@ import math import random import time import uuid +import re from .common import InfoExtractor from ..compat import compat_urllib_parse -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + int_or_none, +) class IqiyiIE(InfoExtractor): @@ -108,7 +112,7 @@ class IqiyiIE(InfoExtractor): def md5_text(text): return hashlib.md5(text.encode('utf-8')).hexdigest() - def construct_video_urls(self, data, video_id, _uuid): + def construct_formats(self, data, video_id, _uuid): def do_xor(x, y): a = y % 3 if a == 1: @@ -136,14 +140,14 @@ class IqiyiIE(InfoExtractor): t = str(int(math.floor(int(tm) / (600.0)))) return self.md5_text(t + mg + x) - video_urls_dict = {} + formats = [] for format_item in data['vp']['tkl'][0]['vs']: if 0 < int(format_item['bid']) <= 10: format_id = self.get_format(format_item['bid']) else: continue - video_urls = [] + parts = [] video_urls_info = format_item['fs'] if not format_item['fs'][0]['l'].startswith('/'): @@ -151,13 +155,12 @@ class IqiyiIE(InfoExtractor): if t.endswith('mp4'): video_urls_info 
= format_item['flvs'] - for segment_index, segment in enumerate(video_urls_info): - vl = segment['l'] + for part_index, part in enumerate(video_urls_info): + vl = part['l'] if not vl.startswith('/'): vl = get_encode_code(vl) key = get_path_key( - vl.split('/')[-1].split('.')[0], format_id, segment_index) - filesize = segment['b'] + vl.split('/')[-1].split('.')[0], format_id, part_index) base_url = data['vp']['du'].split('/') base_url.insert(-1, key) base_url = '/'.join(base_url) @@ -174,13 +177,29 @@ class IqiyiIE(InfoExtractor): compat_urllib_parse.urlencode(param) js = self._download_json( api_video_url, video_id, - note='Download video info of segment %d for format %s' % (segment_index + 1, format_id)) + note='Download video info of part %d for format %s' % (part_index + 1, format_id)) video_url = js['l'] - video_urls.append( - (video_url, filesize)) + parts.append({ + 'url': video_url, + 'filesize': int_or_none(part['b']), + }) - video_urls_dict[format_id] = video_urls - return video_urls_dict + format_info = { + 'format_id': format_id, + 'parts': parts, + 'duration': int_or_none(format_item.get('duration')), + } + scrsz = format_item.get('scrsz') + if scrsz: + mobj = re.match(r'(\d+)x(\d+)', scrsz) + if mobj: + format_info.update({ + 'width': int(mobj.group(1)), + 'height': int(mobj.group(2)), + }) + formats.append(format_info) + self._sort_formats(formats) + return formats def get_format(self, bid): matched_format_ids = [_format_id for _bid, _format_id in self._FORMATS_MAP if _bid == str(bid)] @@ -241,47 +260,8 @@ class IqiyiIE(InfoExtractor): data = raw_data['data'] - title = data['vi']['vn'] - - # generate video_urls_dict - video_urls_dict = self.construct_video_urls( - data, video_id, _uuid) - - # construct info - entries = [] - for format_id in video_urls_dict: - video_urls = video_urls_dict[format_id] - for i, video_url_info in enumerate(video_urls): - if len(entries) < i + 1: - entries.append({'formats': []}) - entries[i]['formats'].append( - { - 'url': 
video_url_info[0], - 'filesize': video_url_info[-1], - 'format_id': format_id, - 'preference': int(self.get_bid(format_id)) - } - ) - - for i in range(len(entries)): - self._sort_formats(entries[i]['formats']) - entries[i].update( - { - 'id': '%s_part%d' % (video_id, i + 1), - 'title': title, - } - ) - - if len(entries) > 1: - info = { - '_type': 'multi_video', - 'id': video_id, - 'title': title, - 'entries': entries, - } - else: - info = entries[0] - info['id'] = video_id - info['title'] = title - - return info + return { + 'id': video_id, + 'title': data['vi']['vn'], + 'formats': self.construct_formats(data, video_id, _uuid), + } From c56621f173bac431fae17e4acc9d9c22ac31b7fc Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 15 Jan 2016 01:25:33 +0100 Subject: [PATCH 6/7] [sohu] fix multipart formats extraction --- youtube_dl/extractor/sohu.py | 66 ++++++++++++++---------------------- 1 file changed, 26 insertions(+), 40 deletions(-) diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index ea8fc258d..38b640dd0 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -11,6 +11,7 @@ from ..compat import ( from ..utils import ( ExtractorError, sanitized_Request, + int_or_none, ) @@ -127,6 +128,7 @@ class SohuIE(InfoExtractor): raise ExtractorError( 'Sohu said: The video is only licensed to users in Mainland China.', expected=True) + vid = compat_str(vid_data['id']) formats_json = {} for format_id in ('nor', 'high', 'super', 'ori', 'h2644k', 'h2654k'): @@ -136,18 +138,12 @@ class SohuIE(InfoExtractor): vid_id = compat_str(vid_id) formats_json[format_id] = vid_data if vid == vid_id else _fetch_data(vid_id, mytv) - part_count = vid_data['data']['totalBlocks'] - - playlist = [] - for i in range(part_count): - formats = [] - for format_id, format_data in formats_json.items(): - allot = format_data['allot'] - - data = format_data['data'] - clips_url = data['clipsURL'] - su = data['su'] - + formats = [] + for format_id, 
format_data in formats_json.items(): + data = format_data['data'] + part_count = data['totalBlocks'] + parts = [] + for i in range(part_count): video_url = 'newflv.sohu.ccgslb.net' cdnId = None retries = 0 @@ -155,8 +151,8 @@ class SohuIE(InfoExtractor): while 'newflv.sohu.ccgslb.net' in video_url: params = { 'prot': 9, - 'file': clips_url[i], - 'new': su[i], + 'file': data['clipsURL'][i], + 'new': data['su'][i], 'prod': 'flash', 'rb': 1, } @@ -170,7 +166,7 @@ class SohuIE(InfoExtractor): if retries > 0: download_note += ' (retry #%d)' % retries part_info = self._parse_json(self._download_webpage( - 'http://%s/?%s' % (allot, compat_urllib_parse.urlencode(params)), + 'http://%s/?%s' % (format_data['allot'], compat_urllib_parse.urlencode(params)), video_id, download_note), video_id) video_url = part_info['url'] @@ -180,32 +176,22 @@ class SohuIE(InfoExtractor): if retries > 5: raise ExtractorError('Failed to get video URL') - formats.append({ + parts.append({ 'url': video_url, - 'format_id': format_id, - 'filesize': data['clipsBytes'][i], - 'width': data['width'], - 'height': data['height'], - 'fps': data['fps'], + 'duration': int_or_none(vid_data['data']['clipsDuration'][i]), + 'filesize': int_or_none(data['clipsBytes'][i]), }) - self._sort_formats(formats) - - playlist.append({ - 'id': '%s_part%d' % (video_id, i + 1), - 'title': title, - 'duration': vid_data['data']['clipsDuration'][i], - 'formats': formats, + formats.append({ + 'format_id': format_id, + 'parts': parts, + 'width': int_or_none(data['width']), + 'height': int_or_none(data['height']), + 'fps': int_or_none(data['fps']), }) + self._sort_formats(formats) - if len(playlist) == 1: - info = playlist[0] - info['id'] = video_id - else: - info = { - '_type': 'multi_video', - 'entries': playlist, - 'id': video_id, - 'title': title, - } - - return info + return { + 'id': video_id, + 'title': title, + 'formats': formats, + } From 4cb71dd78527beef299f25729ab893f11dba57cb Mon Sep 17 00:00:00 2001 From: remitamine 
Date: Fri, 15 Jan 2016 01:26:20 +0100 Subject: [PATCH 7/7] [tudou] fix multipart formats extraction --- youtube_dl/extractor/tudou.py | 68 ++++++++++++++++------------------- 1 file changed, 30 insertions(+), 38 deletions(-) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index da3cd76f7..a22919fe2 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -62,49 +62,41 @@ class TudouIE(InfoExtractor): if youku_vcode: return self.url_result('youku:' + youku_vcode, ie='Youku') - title = unescapeHTML(item_data['kw']) - description = item_data.get('desc') - thumbnail_url = item_data.get('pic') - view_count = int_or_none(item_data.get('playTimes')) - timestamp = int_or_none(item_data.get('pt')) - segments = self._parse_json(item_data['itemSegs'], video_id) # It looks like the keys are the arguments that have to be passed as - # the hd field in the request url, we pick the higher - # Also, filter non-number qualities (see issue #3643). - quality = sorted(filter(lambda k: k.isdigit(), segments.keys()), - key=lambda k: int(k))[-1] - parts = segments[quality] - result = [] - len_parts = len(parts) - if len_parts > 1: - self.to_screen('%s: found %s parts' % (video_id, len_parts)) - for part in parts: - part_id = part['k'] - final_url = self._url_for_id(part_id, quality) - ext = (final_url.split('?')[0]).split('.')[-1] - part_info = { - 'id': '%s' % part_id, - 'url': final_url, - 'ext': ext, - 'title': title, - 'thumbnail': thumbnail_url, - 'description': description, - 'view_count': view_count, - 'timestamp': timestamp, - 'duration': float_or_none(part.get('seconds'), 1000), - 'filesize': int_or_none(part.get('size')), - 'http_headers': { - 'Referer': self._PLAYER_URL, - }, - } - result.append(part_info) + # the hd field in the request url, we filter non-number qualities (see issue #3643). 
+ qualities = sorted(filter(lambda k: k.isdigit(), segments.keys()), + key=lambda k: int(k)) + formats = [] + for quality in qualities: + parts = [] + for part in segments[quality]: + final_url = self._url_for_id(part['k'], quality) + ext = (final_url.split('?')[0]).split('.')[-1] + part_info = { + 'url': final_url, + 'ext': ext, + 'duration': float_or_none(part.get('seconds'), 1000), + 'filesize': int_or_none(part.get('size')), + 'http_headers': { + 'Referer': self._PLAYER_URL, + }, + } + parts.append(part_info) + formats.append({ + 'format_id': compat_str(quality), + 'parts': parts, + }) + self._sort_formats(formats) return { - '_type': 'multi_video', - 'entries': result, 'id': video_id, - 'title': title, + 'title': unescapeHTML(item_data['kw']), + 'thumbnail': item_data.get('pic'), + 'description': item_data.get('desc'), + 'view_count': int_or_none(item_data.get('playTimes')), + 'timestamp': int_or_none(item_data.get('pt')), + 'formats': formats, }