From 04eb2f977caa5bd8ab8204537528f66f5513c053 Mon Sep 17 00:00:00 2001 From: ajj8 <35781586+ajj8@users.noreply.github.com> Date: Sun, 15 Dec 2019 00:53:30 +0000 Subject: [PATCH 1/9] [bbc] Fix errors Fixes errors on news articles, archive pages, etc --- youtube_dl/extractor/bbc.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 901c5a54f..645dcc502 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -608,9 +608,10 @@ class BBCIE(BBCCoUkIE): _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P[^/#?]+)' _MEDIASELECTOR_URLS = [ + 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/iptv-all/vpid/%s/format/xml/', # Provides HQ HLS streams but fails with geolocation in some cases when it's # even not geo restricted at all - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/legacy-iptv-all/vpid/%s', # Provides more formats, namely direct mp4 links, but fails on some videos with # notukerror for non UK (?) users (e.g. # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) @@ -977,20 +978,12 @@ class BBCIE(BBCCoUkIE): if entries: return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) - # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227 - group_id = self._search_regex( - r']+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX, - webpage, 'group id', default=None) - if playlist_id: - return self.url_result( - 'https://www.bbc.co.uk/programmes/%s' % group_id, - ie=BBCCoUkIE.ie_key()) - # single video story (e.g. 
http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) programme_id = self._search_regex( [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX, r']+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX, - r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX], + r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX, + r'"vpid":"(%s)"' % self._ID_REGEX], webpage, 'vpid', default=None) if programme_id: @@ -1014,6 +1007,15 @@ class BBCIE(BBCCoUkIE): 'subtitles': subtitles, } + # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227 + group_id = self._search_regex( + r']+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX, + webpage, 'group id', default=None) + if playlist_id: + return self.url_result( + 'https://www.bbc.co.uk/programmes/%s' % group_id, + ie=BBCCoUkIE.ie_key()) + # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975) # There are several setPayload calls may be present but the video # seems to be always related to the first one From cb833c1fa0e59a3d839b7f2058d2812bc25a38f8 Mon Sep 17 00:00:00 2001 From: ajj8 <35781586+ajj8@users.noreply.github.com> Date: Mon, 16 Mar 2020 03:13:49 +0000 Subject: [PATCH 2/9] Update bbc.py --- youtube_dl/extractor/bbc.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 645dcc502..3d128570a 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -983,9 +983,17 @@ class BBCIE(BBCCoUkIE): [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX, r']+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX, r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX, - r'"vpid":"(%s)"' % self._ID_REGEX], + r'"vpid":"(%s)"' % self._ID_REGEX, + r'"versionPid":"(%s)"' % self._ID_REGEX], webpage, 'vpid', default=None) + # bbc reel (e.g. 
https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness) + initial_data = self._search_regex( + r']+id="initial-data"[^>]+data-json=\'(.+)\'>', + webpage, 'initial data', fatal=False, default=None) + if initial_data: + programme_id = self._parse_json(unescapeHTML(initial_data), playlist_id)['initData']['items'][0]['smpData']['items'][0]['versionID'] + if programme_id: formats, subtitles = self._download_media_selector(programme_id) self._sort_formats(formats) From a95e7ecf1460405791ae70be6b9c6e52e4cfd057 Mon Sep 17 00:00:00 2001 From: ajj8 <35781586+ajj8@users.noreply.github.com> Date: Tue, 17 Mar 2020 13:40:45 +0000 Subject: [PATCH 3/9] Update bbc.py --- youtube_dl/extractor/bbc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 3d128570a..278a810d2 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -984,7 +984,8 @@ class BBCIE(BBCCoUkIE): r']+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX, r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX, r'"vpid":"(%s)"' % self._ID_REGEX, - r'"versionPid":"(%s)"' % self._ID_REGEX], + r'"versionPid":"(%s)"' % self._ID_REGEX, + r'"pid":"(%s)"' % self._ID_REGEX,], webpage, 'vpid', default=None) # bbc reel (e.g. 
https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness) From d738e2b472210e9882d225b687379a78be8537f9 Mon Sep 17 00:00:00 2001 From: ajj8 <35781586+ajj8@users.noreply.github.com> Date: Tue, 17 Mar 2020 14:13:17 +0000 Subject: [PATCH 4/9] Update bbc.py --- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 278a810d2..45667bf54 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -985,7 +985,7 @@ class BBCIE(BBCCoUkIE): r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX, r'"vpid":"(%s)"' % self._ID_REGEX, r'"versionPid":"(%s)"' % self._ID_REGEX, - r'"pid":"(%s)"' % self._ID_REGEX,], + r'"pid":"(%s)"' % self._ID_REGEX], webpage, 'vpid', default=None) # bbc reel (e.g. https://www.bbc.com/reel/video/p07c6sb6/how-positive-thinking-is-harming-your-happiness) From 2008a3161d30f49d6bb14605714829c1ef3d8e04 Mon Sep 17 00:00:00 2001 From: ajj8 <35781586+ajj8@users.noreply.github.com> Date: Sun, 29 Mar 2020 02:02:42 +0100 Subject: [PATCH 5/9] Update bbc.py --- youtube_dl/extractor/bbc.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 45667bf54..0ce9a24e1 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -755,7 +755,7 @@ class BBCIE(BBCCoUkIE): }, 'skip': 'Georestricted to UK', }, { - # single video with playlist.sxml URL in playlist param + # single video with "pid" parameter 'url': 'http://www.bbc.com/sport/0/football/33653409', 'info_dict': { 'id': 'p02xycnp', @@ -794,10 +794,11 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.co.uk/news/science-environment-33661876', 'only_matching': True, }, { - # single video article embedded with data-media-vpid + # single video article embedded with Morph "vpid" parameter 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', 'only_matching': True, }, { + # 
single video with "vpid" parameter 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1', 'info_dict': { 'id': 'p06556y7', @@ -810,6 +811,7 @@ class BBCIE(BBCCoUkIE): } }, { # window.__PRELOADED_STATE__ + # 404 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl', 'info_dict': { 'id': 'b0b9z4vz', @@ -820,6 +822,7 @@ class BBCIE(BBCCoUkIE): 'uploader_id': 'bbc_radio_three', }, }, { + # article with embedded video using data-pid parameter 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227', 'info_dict': { 'id': 'p06w9tws', @@ -984,7 +987,6 @@ class BBCIE(BBCCoUkIE): r']+name="externalIdentifier"[^>]+value="(%s)"' % self._ID_REGEX, r'videoId\s*:\s*["\'](%s)["\']' % self._ID_REGEX, r'"vpid":"(%s)"' % self._ID_REGEX, - r'"versionPid":"(%s)"' % self._ID_REGEX, r'"pid":"(%s)"' % self._ID_REGEX], webpage, 'vpid', default=None) @@ -993,7 +995,10 @@ class BBCIE(BBCCoUkIE): r']+id="initial-data"[^>]+data-json=\'(.+)\'>', webpage, 'initial data', fatal=False, default=None) if initial_data: - programme_id = self._parse_json(unescapeHTML(initial_data), playlist_id)['initData']['items'][0]['smpData']['items'][0]['versionID'] + programme_id = self._search_regex( + r'"versionID":"(%s)"' % self._ID_REGEX, + unescapeHTML(initial_data), + 'programme id', fatal=False, default=None) if programme_id: formats, subtitles = self._download_media_selector(programme_id) From 047b4ea871bf35b46efbb9c2240c259326c66955 Mon Sep 17 00:00:00 2001 From: ajj8 <35781586+ajj8@users.noreply.github.com> Date: Tue, 20 Oct 2020 18:19:33 +0100 Subject: [PATCH 6/9] Update bbc.py Fixed news videos by implementing new window.__INITIAL_DATA__ data, and also fixed Morph videos. 
--- youtube_dl/extractor/bbc.py | 120 +++++++++++++++++++++++------------- 1 file changed, 77 insertions(+), 43 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 0ce9a24e1..2fb2ba0b8 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -908,6 +908,37 @@ class BBCIE(BBCCoUkIE): entries = [] + initial_data_re = self._search_regex( + r']*>window.__INITIAL_DATA__=(.*?);', webpage, + 'initial data', default=None) + if initial_data_re: + initial_data = self._parse_json(initial_data_re, playlist_id) + for key in initial_data['data']: + data = initial_data['data'][key].get('data') + if data and isinstance(data, dict): + mediaItem = None + initialItem = data.get('initialItem') + blocks = data.get('blocks') + if initialItem: + mediaItem = initialItem.get('mediaItem') + elif blocks: + for block in blocks: + if block.get('type') == 'media': + mediaItem = block.get('model') + if mediaItem: + title = mediaItem['title']['content'] if mediaItem.get('title') else mediaItem.get('caption') + description = '\n'.join([block['model']['text'] for block in mediaItem['summary']['blocks']]) if mediaItem.get('summary') else None + programme_id = mediaItem['media']['items'][0]['id'] + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + entries.append({ + 'id': programme_id, + 'title': title, + 'description': description, + 'formats': formats, + 'subtitles': subtitles, + }) + # article with multiple videos embedded with playlist.sxml (e.g. 
# http://www.bbc.com/sport/0/football/34475836) playlists = re.findall(r']+name="playlist"[^>]+value="([^"]+)"', webpage) @@ -1025,54 +1056,57 @@ class BBCIE(BBCCoUkIE): group_id = self._search_regex( r']+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX, webpage, 'group id', default=None) - if playlist_id: + if group_id: return self.url_result( 'https://www.bbc.co.uk/programmes/%s' % group_id, ie=BBCCoUkIE.ie_key()) # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975) - # There are several setPayload calls may be present but the video - # seems to be always related to the first one - morph_payload = self._parse_json( - self._search_regex( - r'Morph\.setPayload\([^,]+,\s*({.+?})\);', - webpage, 'morph payload', default='{}'), - playlist_id, fatal=False) - if morph_payload: - components = try_get(morph_payload, lambda x: x['body']['components'], list) or [] - for component in components: - if not isinstance(component, dict): - continue - lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict) - if not lead_media: - continue - identifiers = lead_media.get('identifiers') - if not identifiers or not isinstance(identifiers, dict): - continue - programme_id = identifiers.get('vpid') or identifiers.get('playablePid') - if not programme_id: - continue - title = lead_media.get('title') or self._og_search_title(webpage) - formats, subtitles = self._download_media_selector(programme_id) - self._sort_formats(formats) - description = lead_media.get('summary') - uploader = lead_media.get('masterBrand') - uploader_id = lead_media.get('mid') - duration = None - duration_d = lead_media.get('duration') - if isinstance(duration_d, dict): - duration = parse_duration(dict_get( - duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration'))) - return { - 'id': programme_id, - 'title': title, - 'description': description, - 'duration': duration, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'formats': formats, - 
'subtitles': subtitles, - } + morph_payloads = re.findall( + r'Morph\.setPayload\([^,]+,\s*({.+?})\);', webpage) + if morph_payloads: + for morph_payload_text in morph_payloads: + morph_payload = self._parse_json( + morph_payload_text, playlist_id, fatal=False) + if morph_payload: + body_text = try_get(morph_payload, lambda x: x['body']['content']['article']['body']) or None + if not body_text: + continue + body = self._parse_json( + body_text, playlist_id, fatal=False) + if not isinstance(body, list): + continue + for item in body: + if not isinstance(item, dict): + continue + videoData = item.get('videoData') + if videoData: + programme_id = videoData.get('vpid') or videoData.get('playablePid') + if not programme_id: + continue + title = videoData.get('title') or self._og_search_title(webpage) + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + description = videoData.get('caption') or videoData.get('summary') + uploader = videoData.get('masterBrand') + uploader_id = videoData.get('mid') + duration = None + duration_d = videoData.get('duration') + if isinstance(duration_d, dict): + duration = parse_duration(dict_get( + duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration'))) + entries.append({ + 'id': programme_id, + 'title': title, + 'description': description, + 'duration': duration, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'formats': formats, + 'subtitles': subtitles, + }) + return self.playlist_result( + entries, playlist_id) preload_state = self._parse_json(self._search_regex( r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage, From 464ff372fd5832a731dcd5d4c6e90a5486c8c23e Mon Sep 17 00:00:00 2001 From: ajj8 <35781586+ajj8@users.noreply.github.com> Date: Tue, 20 Oct 2020 18:24:04 +0100 Subject: [PATCH 7/9] Update bbc.py --- youtube_dl/extractor/bbc.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 
2fb2ba0b8..5880c0e93 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -1105,8 +1105,7 @@ class BBCIE(BBCCoUkIE): 'formats': formats, 'subtitles': subtitles, }) - return self.playlist_result( - entries, playlist_id) + return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) preload_state = self._parse_json(self._search_regex( r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage, From 40caf37ad0c805f96808f847c1f98d3b9fd0e62d Mon Sep 17 00:00:00 2001 From: ajj8 <35781586+ajj8@users.noreply.github.com> Date: Tue, 20 Oct 2020 22:15:28 +0100 Subject: [PATCH 8/9] Update bbc.py flake8 --- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 5880c0e93..819e076f3 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -1063,7 +1063,7 @@ class BBCIE(BBCCoUkIE): # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975) morph_payloads = re.findall( - r'Morph\.setPayload\([^,]+,\s*({.+?})\);', webpage) + r'Morph\.setPayload\([^,]+,\s*({.+?})\);', webpage) if morph_payloads: for morph_payload_text in morph_payloads: morph_payload = self._parse_json( From 009cf65c74b1e98ac0086834d5ce93814c46e322 Mon Sep 17 00:00:00 2001 From: ajj8 <35781586+ajj8@users.noreply.github.com> Date: Wed, 21 Oct 2020 15:23:10 +0100 Subject: [PATCH 9/9] Update bbc.py --- youtube_dl/extractor/bbc.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 819e076f3..1573456db 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -916,16 +916,16 @@ class BBCIE(BBCCoUkIE): for key in initial_data['data']: data = initial_data['data'][key].get('data') if data and isinstance(data, dict): - mediaItem = None + mediaItems = [] initialItem = data.get('initialItem') blocks = data.get('blocks') if 
initialItem: - mediaItem = initialItem.get('mediaItem') - elif blocks: + mediaItems.append(initialItem.get('mediaItem')) + if blocks: for block in blocks: if block.get('type') == 'media': - mediaItem = block.get('model') - if mediaItem: + mediaItems.append(block.get('model')) + for mediaItem in mediaItems: title = mediaItem['title']['content'] if mediaItem.get('title') else mediaItem.get('caption') description = '\n'.join([block['model']['text'] for block in mediaItem['summary']['blocks']]) if mediaItem.get('summary') else None programme_id = mediaItem['media']['items'][0]['id']