From 1a26d8160b230b2e6832f73c06837fc82729b6e8 Mon Sep 17 00:00:00 2001 From: Parmjit Virk Date: Tue, 22 May 2018 17:39:38 -0500 Subject: [PATCH 1/3] [CBC] Fix title extraction --- youtube_dl/extractor/cbc.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 54b4b9be9..de374b44b 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -129,6 +129,9 @@ class CBCIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) + title = self._og_search_title(webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', default=None) or self._html_search_regex( + r'<title>(.*?)</title>', webpage, 'title', fatal=True) entries = [ self._extract_player_init(player_init, display_id) for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] @@ -136,8 +139,7 @@ class CBCIE(InfoExtractor): self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) for media_id in re.findall(r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"', webpage)]) return self.playlist_result( - entries, display_id, - self._og_search_title(webpage, fatal=False), + entries, display_id, title.strip(), self._og_search_description(webpage)) From 6dd658348cacaa6e690f6052f10fc7679c7055f5 Mon Sep 17 00:00:00 2001 From: Parmjit Virk Date: Sat, 26 May 2018 02:46:19 -0500 Subject: [PATCH 2/3] [CBC] Set document title regex as non-fatal --- youtube_dl/extractor/cbc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index de374b44b..06dee46c9 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -131,7 +131,7 @@ class CBCIE(InfoExtractor): webpage = self._download_webpage(url, display_id) title = self._og_search_title(webpage, default=None) or self._html_search_meta( 'twitter:title', webpage, 'title', default=None) or 
self._html_search_regex( - r'<title>(.*?)</title>', webpage, 'title', fatal=True) + r'<title>(.*?)\s*</title>', webpage, 'title', fatal=False) entries = [ self._extract_player_init(player_init, display_id) for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] @@ -139,7 +139,7 @@ class CBCIE(InfoExtractor): self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) for media_id in re.findall(r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"', webpage)]) return self.playlist_result( - entries, display_id, title.strip(), + entries, display_id, title, self._og_search_description(webpage)) From d67c85882d52f28a3273c0a1b20c5fa59a38fa9e Mon Sep 17 00:00:00 2001 From: Sergey M Date: Sat, 26 May 2018 20:05:01 +0700 Subject: [PATCH 3/3] Update cbc.py --- youtube_dl/extractor/cbc.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 06dee46c9..ce8e3d346 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -20,6 +20,7 @@ from ..utils import ( parse_duration, parse_iso8601, parse_age_limit, + strip_or_none, int_or_none, ExtractorError, ) @@ -131,7 +132,7 @@ class CBCIE(InfoExtractor): webpage = self._download_webpage(url, display_id) title = self._og_search_title(webpage, default=None) or self._html_search_meta( 'twitter:title', webpage, 'title', default=None) or self._html_search_regex( - r'<title>(.*?)\s*</title>', webpage, 'title', fatal=False) + r'<title>([^<]+)</title>', webpage, 'title', fatal=False) entries = [ self._extract_player_init(player_init, display_id) for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] @@ -139,7 +140,7 @@ class CBCIE(InfoExtractor): self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) for media_id in re.findall(r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"', webpage)]) return self.playlist_result( - entries, display_id, title, + entries, display_id, strip_or_none(title), self._og_search_description(webpage))