From 999c87a618544af629e65dcc028c40e5cd795a48 Mon Sep 17 00:00:00 2001 From: Daniel Cassidy Date: Tue, 17 Dec 2019 07:02:41 +0000 Subject: [PATCH 1/4] Fix playlist download from BBC iPlayer. --- youtube_dl/extractor/bbc.py | 96 +++++++++++++++++++++++++------------ 1 file changed, 65 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 901c5a54f..48486fbe8 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -1247,31 +1247,13 @@ class BBCCoUkArticleIE(InfoExtractor): class BBCCoUkPlaylistBaseIE(InfoExtractor): - def _entries(self, webpage, url, playlist_id): - single_page = 'page' in compat_urlparse.parse_qs( - compat_urlparse.urlparse(url).query) - for page_num in itertools.count(2): - for video_id in re.findall( - self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage): - yield self.url_result( - self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key()) - if single_page: - return - next_page = self._search_regex( - r']+class=(["\'])pagination_+next\1[^>]*>]+href=(["\'])(?P(?:(?!\2).)+)\2', - webpage, 'next page url', default=None, group='url') - if not next_page: - break - webpage = self._download_webpage( - compat_urlparse.urljoin(url, next_page), playlist_id, - 'Downloading page %d' % page_num, page_num) - def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) - title, description = self._extract_title_and_description(webpage) + title = self._og_search_title(webpage, fatal=False) + description = self._og_search_description(webpage) return self.playlist_result( self._entries(webpage, url, playlist_id), @@ -1282,7 +1264,6 @@ class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): IE_NAME = 'bbc.co.uk:iplayer:playlist' _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P%s)' % BBCCoUkIE._ID_REGEX _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s' - _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)' _TESTS = [{ 
'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v', 'info_dict': { @@ -1303,12 +1284,51 @@ class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): 'playlist_mincount': 10, }] - def _extract_title_and_description(self, webpage): - title = self._search_regex(r'

<h1>([^<]+)</h1>', webpage, 'title', fatal=False) - description = self._search_regex( - r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>
', - webpage, 'description', fatal=False, group='value') - return title, description + def _entries(self, webpage, url, playlist_id): + query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + single_season = 'seriesId' in query + single_page = 'page' in query + + redux_state = self._redux_state(webpage, playlist_id) + slices = redux_state.get('header', {}).get('availableSlices', []) + season_ids = list(map(lambda s: s.get('id'), slices)) + + for season in itertools.count(1): + while True: + pagination = redux_state.get('pagination') + page_num = pagination.get('currentPage') + total_pages = pagination.get('totalPages') + + for entity in redux_state.get('entities'): + video_id = entity.get('id') + yield self.url_result(self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key()) + + if single_page or page_num >= total_pages: + break + + next_page_num = page_num + 1 + next_page_href = pagination.get('pageUrl') % next_page_num + url = compat_urlparse.urljoin(url, next_page_href) + + webpage = self._download_webpage(url, playlist_id, + 'Downloading season %d page %d' % (season, next_page_num), + 'season %d page %d' % (season, next_page_num)) + redux_state = self._redux_state(webpage, playlist_id) + + if single_season or season >= len(season_ids): + break + + next_season_id = season_ids[season] + url = compat_urlparse.urljoin(url, '?seriesId=' + next_season_id) + webpage = self._download_webpage(url, playlist_id, + 'Downloading season %d page 1' % (season + 1), + 'season %d page 1' % (season + 1)) + redux_state = self._redux_state(webpage, playlist_id) + + def _redux_state(self, webpage, playlist_id): + redux_state_regex = r']*>\s*window.__IPLAYER_REDUX_STATE__\s*=\s*(.*?);?\s*' + redux_state_json = self._search_regex(redux_state_regex, webpage, 'redux_state') + return self._parse_json(redux_state_json, playlist_id, transform_source=unescapeHTML) class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE): @@ -1353,7 +1373,21 @@ class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE): 
'only_matching': True, }] - def _extract_title_and_description(self, webpage): - title = self._og_search_title(webpage, fatal=False) - description = self._og_search_description(webpage) - return title, description + def _entries(self, webpage, url, playlist_id): + single_page = 'page' in compat_urlparse.parse_qs( + compat_urlparse.urlparse(url).query) + for page_num in itertools.count(2): + for video_id in re.findall( + self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage): + yield self.url_result( + self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key()) + if single_page: + return + next_page = self._search_regex( + r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2', + webpage, 'next page url', default=None, group='url') + if not next_page: + break + webpage = self._download_webpage( + compat_urlparse.urljoin(url, next_page), playlist_id, + 'Downloading page %d' % page_num, page_num) From c1294dc9012d955cf33e191e13bdb0c8b54f0515 Mon Sep 17 00:00:00 2001 From: Daniel Cassidy Date: Tue, 17 Dec 2019 07:04:37 +0000 Subject: [PATCH 2/4] Fix downloading multi-page playlists from bbc.co.uk. --- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 48486fbe8..bfe6db99d 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -1384,7 +1384,7 @@ class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE): if single_page: return next_page = self._search_regex( - r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2', + r'<li[^>]+class=(["\'])pagination_+next\1[^>]*>\s*<a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2', webpage, 'next page url', default=None, group='url') if not next_page: break From 93678422ef28b85e1318dc1d180b91ce45600bf5 Mon Sep 17 00:00:00 2001 From: Daniel Cassidy Date: Tue, 17 Dec 2019 07:44:00 +0000 Subject: [PATCH 3/4] Fix crash when downloading iPlayer playlists that are not divided into seasons.
--- youtube_dl/extractor/bbc.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index bfe6db99d..51effeba3 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -1291,7 +1291,10 @@ class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): redux_state = self._redux_state(webpage, playlist_id) slices = redux_state.get('header', {}).get('availableSlices', []) - season_ids = list(map(lambda s: s.get('id'), slices)) + if slices: + season_ids = list(map(lambda s: s.get('id'), slices)) + else: + season_ids = [] for season in itertools.count(1): while True: From da6790a3a318b9891e628b8af40179f41670852d Mon Sep 17 00:00:00 2001 From: Daniel Cassidy Date: Mon, 23 Dec 2019 08:59:54 +0000 Subject: [PATCH 4/4] BBC iPlayer: Fix multi-page handling for shows with only one season. --- youtube_dl/extractor/bbc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 51effeba3..cca3689a7 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -1310,7 +1310,8 @@ class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): break next_page_num = page_num + 1 - next_page_href = pagination.get('pageUrl') % next_page_num + page_url_template = pagination.get('pageUrl') or '?page=%s' + next_page_href = page_url_template % next_page_num url = compat_urlparse.urljoin(url, next_page_href) webpage = self._download_webpage(url, playlist_id,