From 6bb3efe8dffc5f93716eb58e3af6fb5e6a51a336 Mon Sep 17 00:00:00 2001 From: Marc Abonce Seguin Date: Sat, 9 Dec 2017 00:33:28 -0600 Subject: [PATCH 1/5] [youtube] add storyboards meta field with list and write options Storyboards are grids of small images that appear when the user hovers their cursor over a video's timeline. See related issue #9868. Options added: * --list-storyboards * --write-storyboards --- youtube_dl/YoutubeDL.py | 58 ++++++++++++++++++++++--------- youtube_dl/__init__.py | 2 ++ youtube_dl/extractor/common.py | 6 ++++ youtube_dl/extractor/youtube.py | 60 +++++++++++++++++++++++++++++++++ youtube_dl/options.py | 8 +++++ 5 files changed, 118 insertions(+), 16 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 19370f62b..d1c13726e 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -181,6 +181,7 @@ class YoutubeDL(object): writeannotations: Write the video annotations to a .annotations.xml file writethumbnail: Write the thumbnail image to a file write_all_thumbnails: Write all thumbnail formats to files + writestoryboards: Write all storyboards (grid of video frames) to a file writesubtitles: Write the video subtitles to a file writeautomaticsub: Write the automatically generated subtitles to a file allsubtitles: Downloads all the subtitles of the video @@ -277,6 +278,7 @@ class YoutubeDL(object): [sleep_interval; max_sleep_interval]. listformats: Print an overview of available video formats and exit. list_thumbnails: Print a table of all thumbnails and exit. + list_storyboards: Print a table of all storyboards and exit. match_filter: A function that gets called with the info_dict of every video. If it returns a message, the video is ignored. @@ -1467,6 +1469,10 @@ class YoutubeDL(object): self.list_thumbnails(info_dict) return + if self.params.get('list_storyboards'): + self.list_thumbnails(info_dict, item_name='storyboards') + return + thumbnail = info_dict.get('thumbnail') if thumbnail: info_dict['thumbnail'] = sanitize_url(thumbnail) @@ -2208,17 +2214,27 @@ class YoutubeDL(object): '[info] Available formats for %s:\n%s' % (info_dict['id'], render_table(header_line, table))) - def list_thumbnails(self, info_dict): - thumbnails = info_dict.get('thumbnails') + def list_thumbnails(self, info_dict, item_name='thumbnails'): + thumbnails = info_dict.get(item_name) if not thumbnails: - self.to_screen('[info] No thumbnails present for %s' % info_dict['id']) + self.to_screen('[info] No %s present for %s' % (item_name, info_dict['id'])) return self.to_screen( - '[info] Thumbnails for %s:' % info_dict['id']) - self.to_screen(render_table( - ['ID', 'width', 'height', 'URL'], - [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) + '[info] %s for %s:' % (item_name.title(), info_dict['id'])) + + columns = ['ID', 'width', 'height'] + if item_name == 'storyboards': + columns += ['cols', 'rows', 'frames'] + columns += ['URL'] + + table = [] + for t in thumbnails: + table.append([]) + for column in columns: + table[-1].append(t.get(column.lower(), 'unknown')) + + self.to_screen(render_table(columns, table)) def list_subtitles(self, video_id, subtitles, name='subtitles'): if not subtitles: @@ -2383,12 +2399,16 @@ class YoutubeDL(object): return encoding def _write_thumbnails(self, info_dict, filename): + item_name = 'thumbnail' if self.params.get('writethumbnail', False): thumbnails = info_dict.get('thumbnails') if thumbnails: thumbnails = [thumbnails[-1]] elif self.params.get('write_all_thumbnails', False): thumbnails = info_dict.get('thumbnails') + elif self.params.get('writestoryboards', False): + thumbnails = info_dict.get('storyboards') + item_name = 'storyboard' else: return @@ -2398,22 +2418,28 @@ class YoutubeDL(object): for t in thumbnails: thumb_ext = determine_ext(t['url'], 'jpg') - suffix = '_%s' % t['id'] if len(thumbnails) > 1 else '' + if item_name == 'thumbnails': + suffix = '_%s' % t['id'] if len(thumbnails) > 1 else '' + else: + suffix = '_%s_%s' % (item_name, t['id']) thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else '' t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)): - self.to_screen('[%s] %s: Thumbnail %sis already present' % - (info_dict['extractor'], info_dict['id'], thumb_display_id)) + self.to_screen('[%s] %s: %s %sis already present' % + (info_dict['extractor'], info_dict['id'], + item_name.title(), thumb_display_id)) else: - self.to_screen('[%s] %s: Downloading thumbnail %s...' % - (info_dict['extractor'], info_dict['id'], thumb_display_id)) + self.to_screen('[%s] %s: Downloading %s %s...' % + (info_dict['extractor'], info_dict['id'], + item_name, thumb_display_id)) try: uf = self.urlopen(t['url']) with open(encodeFilename(thumb_filename), 'wb') as thumbf: shutil.copyfileobj(uf, thumbf) - self.to_screen('[%s] %s: Writing thumbnail %sto: %s' % - (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename)) + self.to_screen('[%s] %s: Writing %s %sto: %s' % + (info_dict['extractor'], info_dict['id'], + item_name, thumb_display_id, thumb_filename)) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self.report_warning('Unable to download thumbnail "%s": %s' % - (t['url'], error_to_compat_str(err))) + self.report_warning('Unable to download %s "%s": %s' % + (t['url'], item_name, error_to_compat_str(err))) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 9a659fc65..22321e6ac 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -371,6 +371,7 @@ def _real_main(argv=None): 'writeinfojson': opts.writeinfojson, 'writethumbnail': opts.writethumbnail, 'write_all_thumbnails': opts.write_all_thumbnails, + 'writestoryboards': opts.writestoryboards, 'writesubtitles': opts.writesubtitles, 'writeautomaticsub': opts.writeautomaticsub, 'allsubtitles': opts.allsubtitles, @@ -418,6 +419,7 @@ def _real_main(argv=None): 'max_sleep_interval': opts.max_sleep_interval, 'external_downloader': opts.external_downloader, 'list_thumbnails': opts.list_thumbnails, + 'list_storyboards': opts.list_storyboards, 'playlist_items': opts.playlist_items, 'xattr_set_filesize': opts.xattr_set_filesize, 'match_filter': match_filter, diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a61753b17..0fad73792 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -224,6 +224,12 @@ class InfoExtractor(object): deprecated) * "filesize" (optional, int) thumbnail: Full URL to a video thumbnail image. + storyboards: A list of dictionaries representing storyboards. + A storyboard is an image grid made of frames from the video. + This has the same structure as the thumbnails list, plus: + * "cols" (optional, int) + * "rows" (optional, int) + * "frames" (optional, int) description: Full video description. uploader: Full name of the video uploader. license: License name the video is licensed under. diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index fec17987b..06964aeaa 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -10,6 +10,7 @@ import random import re import time import traceback +import math from .common import InfoExtractor, SearchInfoExtractor from ..jsinterp import JSInterpreter @@ -1740,8 +1741,65 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if dash_mpd and dash_mpd not in dash_mpds: dash_mpds.append(dash_mpd) + def get_storyboards(video_info): + storyboards = [] + spec = video_info.get('storyboard_spec', []) + + for s in spec: + s_parts = s.split('|') + base_url = s_parts[0] + i = 0 + for params in s_parts[1:]: + storyboard_attrib = params.split('#') + if len(storyboard_attrib) != 8: + self._downloader.report_warning('Unable to extract storyboard') + continue + + frame_width = int_or_none(storyboard_attrib[0]) + frame_height = int_or_none(storyboard_attrib[1]) + total_frames = int_or_none(storyboard_attrib[2]) + cols = int_or_none(storyboard_attrib[3]) + rows = int_or_none(storyboard_attrib[4]) + filename = storyboard_attrib[6] + sigh = storyboard_attrib[7] + + if frame_width and frame_height and cols and rows and total_frames: + frames = cols * rows + width, height = frame_width * cols, frame_height * rows + n_images = int(math.ceil(total_frames / float(cols * rows))) + else: + self._downloader.report_warning('Unable to extract storyboard') + continue + + storyboards_url = base_url.replace('$L', compat_str(i)) + '?' + for j in range(n_images): + url = storyboards_url.replace('$N', filename).replace('$M', compat_str(j)) + 'sigh=' + sigh + if j == n_images-1: + remaining_frames = total_frames % (cols * rows) + if remaining_frames != 0: + frames = remaining_frames + rows = int(math.ceil(float(remaining_frames) / rows)) + height = rows * frame_height + if rows == 1: + cols = remaining_frames + width = cols * frame_width + + storyboards.append({ + 'id': 'L' + compat_str(i) + '-M' + compat_str(j), + 'width': width, + 'height': height, + 'cols': cols, + 'rows': rows, + 'frames': frames, + 'url': url + }) + i += 1 + + return storyboards + is_live = None view_count = None + storyboards = None def extract_view_count(v_info): return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) @@ -1786,6 +1844,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_response = extract_player_response(pl_response, video_id) add_dash_mpd(video_info) view_count = extract_view_count(video_info) + storyboards = get_storyboards(video_info) else: age_gate = False # Try looking directly into the video webpage @@ -2416,6 +2475,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': video_title, 'alt_title': video_alt_title or track, 'thumbnail': video_thumbnail, + 'storyboards': storyboards, 'description': video_description, 'categories': video_categories, 'tags': video_tags, diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 6d5ac62b3..e0169347b 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -777,6 +777,14 @@ def parseOpts(overrideArguments=None): '--list-thumbnails', action='store_true', dest='list_thumbnails', default=False, help='Simulate and list all available thumbnail formats') + thumbnail.add_option( + '--write-storyboards', + action='store_true', dest='writestoryboards', default=False, + help='Write all storyboards (grid of video frames) to disk') + thumbnail.add_option( + '--list-storyboards', + action='store_true', dest='list_storyboards', default=False, + help='Simulate and list all available storyboards') postproc = optparse.OptionGroup(parser, 'Post-processing Options') postproc.add_option( From 6e1fd3a3b275b1ee22432354a9746bc3abcce5d9 Mon Sep 17 00:00:00 2001 From: Benoit Favre Date: Fri, 30 Nov 2018 14:53:04 +1000 Subject: [PATCH 2/5] Update parsing of storyboards The format for getting storyboards seems to have changed. They are now available in the 'player_response' field. --- youtube_dl/extractor/youtube.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 06964aeaa..adc6d044d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1743,7 +1743,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def get_storyboards(video_info): storyboards = [] - spec = video_info.get('storyboard_spec', []) + + player_response = video_info.get('player_response', []) + if len(player_response) > 0 and isinstance(player_response[0], compat_str): + player_response = self._parse_json( + player_response[0], video_id, fatal=False) + if player_response: + spec = [player_response['storyboards']['playerStoryboardSpecRenderer']['spec']] + + else: + spec = video_info.get('storyboard_spec', []) for s in spec: s_parts = s.split('|') From fca5e0439848ed4a4a9bafc8bc923abf98e347e3 Mon Sep 17 00:00:00 2001 From: Benoit Favre Date: Fri, 30 Nov 2018 15:31:09 +1000 Subject: [PATCH 3/5] Fix double question mark in url --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index adc6d044d..3e614882b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1780,7 +1780,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._downloader.report_warning('Unable to extract storyboard') continue - storyboards_url = base_url.replace('$L', compat_str(i)) + '?' + storyboards_url = base_url.replace('$L', compat_str(i)) + '&' for j in range(n_images): url = storyboards_url.replace('$N', filename).replace('$M', compat_str(j)) + 'sigh=' + sigh if j == n_images-1: From 089a84a81dbd64ce1f9635b2354c3083b7f9bf9a Mon Sep 17 00:00:00 2001 From: Marc Abonce Seguin Date: Mon, 11 Mar 2019 18:11:35 -0600 Subject: [PATCH 4/5] [youtube] find storyboards in webpage if not found in info page --- youtube_dl/extractor/youtube.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 3e614882b..f60438e8b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1741,19 +1741,30 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if dash_mpd and dash_mpd not in dash_mpds: dash_mpds.append(dash_mpd) - def get_storyboards(video_info): + def get_storyboards(video_info, video_webpage): storyboards = [] + # Try to extract storyborads from video_info player_response = video_info.get('player_response', []) if len(player_response) > 0 and isinstance(player_response[0], compat_str): player_response = self._parse_json( player_response[0], video_id, fatal=False) - if player_response: + if player_response and 'storyboards' in player_response: spec = [player_response['storyboards']['playerStoryboardSpecRenderer']['spec']] - + else: + spec = [] else: spec = video_info.get('storyboard_spec', []) + if len(spec) == 0: + # Try to extract storyborads from video_webpage + sb_index = video_webpage.find('playerStoryboardSpecRenderer') + if sb_index != -1: + sb_spec_renderer = video_webpage[sb_index:] + sb_str = sb_spec_renderer[sb_spec_renderer.find('{'):sb_spec_renderer.find('}')+1] + sb_json = json.loads(sb_str.encode("utf-8").decode("unicode_escape")) + spec = [sb_json['spec']] + for s in spec: s_parts = s.split('|') base_url = s_parts[0] From 5cea6a3b4307aa0d5453551e64ab8a3d4d89e0f0 Mon Sep 17 00:00:00 2001 From: Marc Abonce Seguin Date: Mon, 29 Apr 2019 23:28:00 -0500 Subject: [PATCH 5/5] [youtube] download storyboards even with no dash manifest When youtube_include_dash_manifest was set to false the storyboards wouldn't download anymore, even if these two things are completely unrelated. --- youtube_dl/extractor/youtube.py | 161 ++++++++++++++++---------------- 1 file changed, 83 insertions(+), 78 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f60438e8b..909bec2a8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1608,6 +1608,86 @@ class YoutubeIE(YoutubeBaseInfoExtractor): playback_url, video_id, 'Marking watched', 'Unable to mark watched', fatal=False) + def _get_storyboards(self, video_id, video_info, video_webpage): + storyboards = [] + + # Try to extract storyboards from video_info + player_response = video_info.get('player_response', []) + if len(player_response) > 0 and isinstance(player_response[0], compat_str): + player_response = self._parse_json( + player_response[0], video_id, fatal=False) + if player_response and 'storyboards' in player_response: + sb_spec = [try_get(player_response, + lambda x: x['storyboards']['playerStoryboardSpecRenderer']['spec'], + compat_str)] + else: + sb_spec = [] + else: + sb_spec = video_info.get('storyboard_spec', []) + + # Try to extract storyboards from video_webpage + if len(sb_spec) == 0: + sb_index = video_webpage.find('playerStoryboardSpecRenderer') + if sb_index != -1: + sb_spec_renderer = video_webpage[sb_index:] + sb_str = sb_spec_renderer[sb_spec_renderer.find('{'):sb_spec_renderer.find('}') + 1] + sb_json = self._parse_json( + sb_str.encode("utf-8").decode("unicode_escape"), video_id, fatal=False) + sb_spec = [sb_json.get('spec')] if sb_json else [] + + # Extract information of each storyboard + for s in filter(None, sb_spec): + s_parts = s.split('|') + base_url = s_parts[0] + i = 0 + for params in s_parts[1:]: + storyboard_attrib = params.split('#') + if len(storyboard_attrib) != 8: + self._downloader.report_warning('Unable to extract storyboard') + continue + + frame_width = int_or_none(storyboard_attrib[0]) + frame_height = int_or_none(storyboard_attrib[1]) + total_frames = int_or_none(storyboard_attrib[2]) + cols = int_or_none(storyboard_attrib[3]) + rows = int_or_none(storyboard_attrib[4]) + filename = storyboard_attrib[6] + sigh = storyboard_attrib[7] + + if frame_width and frame_height and cols and rows and total_frames: + frames = cols * rows + width, height = frame_width * cols, frame_height * rows + n_images = int(math.ceil(total_frames / float(cols * rows))) + else: + self._downloader.report_warning('Unable to extract storyboard') + continue + + storyboards_url = base_url.replace('$L', compat_str(i)) + '&' + for j in range(n_images): + url = storyboards_url.replace('$N', filename).replace('$M', compat_str(j)) + 'sigh=' + sigh + if j == n_images - 1: + remaining_frames = total_frames % (cols * rows) + if remaining_frames != 0: + frames = remaining_frames + rows = int(math.ceil(float(remaining_frames) / rows)) + height = rows * frame_height + if rows == 1: + cols = remaining_frames + width = cols * frame_width + + storyboards.append({ + 'id': 'L' + compat_str(i) + '-M' + compat_str(j), + 'width': width, + 'height': height, + 'cols': cols, + 'rows': rows, + 'frames': frames, + 'url': url + }) + i += 1 + + return storyboards + @staticmethod def _extract_urls(webpage): # Embedded YouTube player @@ -1741,85 +1821,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if dash_mpd and dash_mpd not in dash_mpds: dash_mpds.append(dash_mpd) - def get_storyboards(video_info, video_webpage): - storyboards = [] - - # Try to extract storyborads from video_info - player_response = video_info.get('player_response', []) - if len(player_response) > 0 and isinstance(player_response[0], compat_str): - player_response = self._parse_json( - player_response[0], video_id, fatal=False) - if player_response and 'storyboards' in player_response: - spec = [player_response['storyboards']['playerStoryboardSpecRenderer']['spec']] - else: - spec = [] - else: - spec = video_info.get('storyboard_spec', []) - - if len(spec) == 0: - # Try to extract storyborads from video_webpage - sb_index = video_webpage.find('playerStoryboardSpecRenderer') - if sb_index != -1: - sb_spec_renderer = video_webpage[sb_index:] - sb_str = sb_spec_renderer[sb_spec_renderer.find('{'):sb_spec_renderer.find('}')+1] - sb_json = json.loads(sb_str.encode("utf-8").decode("unicode_escape")) - spec = [sb_json['spec']] - - for s in spec: - s_parts = s.split('|') - base_url = s_parts[0] - i = 0 - for params in s_parts[1:]: - storyboard_attrib = params.split('#') - if len(storyboard_attrib) != 8: - self._downloader.report_warning('Unable to extract storyboard') - continue - - frame_width = int_or_none(storyboard_attrib[0]) - frame_height = int_or_none(storyboard_attrib[1]) - total_frames = int_or_none(storyboard_attrib[2]) - cols = int_or_none(storyboard_attrib[3]) - rows = int_or_none(storyboard_attrib[4]) - filename = storyboard_attrib[6] - sigh = storyboard_attrib[7] - - if frame_width and frame_height and cols and rows and total_frames: - frames = cols * rows - width, height = frame_width * cols, frame_height * rows - n_images = int(math.ceil(total_frames / float(cols * rows))) - else: - self._downloader.report_warning('Unable to extract storyboard') - continue - - storyboards_url = base_url.replace('$L', compat_str(i)) + '&' - for j in range(n_images): - url = storyboards_url.replace('$N', filename).replace('$M', compat_str(j)) + 'sigh=' + sigh - if j == n_images-1: - remaining_frames = total_frames % (cols * rows) - if remaining_frames != 0: - frames = remaining_frames - rows = int(math.ceil(float(remaining_frames) / rows)) - height = rows * frame_height - if rows == 1: - cols = remaining_frames - width = cols * frame_width - - storyboards.append({ - 'id': 'L' + compat_str(i) + '-M' + compat_str(j), - 'width': width, - 'height': height, - 'cols': cols, - 'rows': rows, - 'frames': frames, - 'url': url - }) - i += 1 - - return storyboards - is_live = None view_count = None - storyboards = None def extract_view_count(v_info): return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) @@ -1864,7 +1867,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_response = extract_player_response(pl_response, video_id) add_dash_mpd(video_info) view_count = extract_view_count(video_info) - storyboards = get_storyboards(video_info) else: age_gate = False # Try looking directly into the video webpage @@ -2263,6 +2265,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: # don't panic if we can't find it video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0]) + # storyboards + storyboards = self._get_storyboards(video_id, video_info, video_webpage) + # upload date upload_date = self._html_search_meta( 'datePublished', video_webpage, 'upload date', default=None)