From bd3851b9dde71157cb45c1092991944dc6de2d15 Mon Sep 17 00:00:00 2001 From: Paul Wise Date: Tue, 8 Jan 2019 10:10:51 +0800 Subject: [PATCH 1/5] [libsyn] fix extracting the episode title The episode-title div had extra classes added to it, so match the class name in any part of the class field. --- youtube_dl/extractor/libsyn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/libsyn.py b/youtube_dl/extractor/libsyn.py index f7311f483..a1f748c7b 100644 --- a/youtube_dl/extractor/libsyn.py +++ b/youtube_dl/extractor/libsyn.py @@ -48,7 +48,7 @@ class LibsynIE(InfoExtractor): if podcast_title: podcast_title = podcast_title.strip() episode_title = self._search_regex( - r'(?:
|

)([^<]+)|

)([^<]+) Date: Tue, 8 Jan 2019 10:16:58 +0800 Subject: [PATCH 2/5] [libsyn] fix extracting the release date The "Released:" text was removed from the page but the release_date JSON data field has the same info. --- youtube_dl/extractor/libsyn.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/libsyn.py b/youtube_dl/extractor/libsyn.py index a1f748c7b..51ce03753 100644 --- a/youtube_dl/extractor/libsyn.py +++ b/youtube_dl/extractor/libsyn.py @@ -60,8 +60,6 @@ class LibsynIE(InfoExtractor): if description: # Strip non-breaking and normal spaces description = description.replace('\u00A0', ' ').strip() - release_date = unified_strdate(self._search_regex( - r'
Released: ([^<]+)<', webpage, 'release date', fatal=False)) data_json = self._search_regex(r'var\s+playlistItem\s*=\s*(\{.*?\});\n', webpage, 'JSON data block') data = json.loads(data_json) @@ -75,6 +73,7 @@ class LibsynIE(InfoExtractor): }] thumbnail = data.get('thumbnail_url') duration = parse_duration(data.get('duration')) + release_date = unified_strdate(data['release_date']) return { 'id': video_id, From b46b0ddbd6020e81cc4059e38611bfdc1dd36c6b Mon Sep 17 00:00:00 2001 From: Paul Wise Date: Tue, 8 Jan 2019 10:18:47 +0800 Subject: [PATCH 3/5] [libsyn] fix extracting the JSON data block The page switched to \r\n Windows line endings. --- youtube_dl/extractor/libsyn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/libsyn.py b/youtube_dl/extractor/libsyn.py index 51ce03753..0ad168826 100644 --- a/youtube_dl/extractor/libsyn.py +++ b/youtube_dl/extractor/libsyn.py @@ -61,7 +61,7 @@ class LibsynIE(InfoExtractor): # Strip non-breaking and normal spaces description = description.replace('\u00A0', ' ').strip() - data_json = self._search_regex(r'var\s+playlistItem\s*=\s*(\{.*?\});\n', webpage, 'JSON data block') + data_json = self._search_regex(r'var\s+playlistItem\s*=\s*(\{.*?\});', webpage, 'JSON data block') data = json.loads(data_json) formats = [{ From 058e4d198924ed433f57f0318b1c7df1941a9194 Mon Sep 17 00:00:00 2001 From: Paul Wise Date: Tue, 8 Jan 2019 10:42:29 +0800 Subject: [PATCH 4/5] [libsyn] fix extraction of the podcast title The podcast title HTML element switched from an h3 to a div, the title moved to the next line, and there is another tag embedded in the div. 
--- youtube_dl/extractor/libsyn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/libsyn.py b/youtube_dl/extractor/libsyn.py index 0ad168826..a8b58dd6b 100644 --- a/youtube_dl/extractor/libsyn.py +++ b/youtube_dl/extractor/libsyn.py @@ -44,7 +44,8 @@ class LibsynIE(InfoExtractor): webpage = self._download_webpage(url, video_id) podcast_title = self._search_regex( - r'

([^<]+)

', webpage, 'podcast title', default=None) + r'(?:
|

)([^<]+)<', webpage, + 'podcast title', default=None, flags=re.MULTILINE) if podcast_title: podcast_title = podcast_title.strip() episode_title = self._search_regex( From 7932cf67a71f7fd4fea02ee8e1601d7be761e230 Mon Sep 17 00:00:00 2001 From: Paul Wise Date: Tue, 8 Jan 2019 10:42:39 +0800 Subject: [PATCH 5/5] [libsyn] fix extraction of the description The description moved to a JSON API endpoint. --- youtube_dl/extractor/libsyn.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/libsyn.py b/youtube_dl/extractor/libsyn.py index a8b58dd6b..077d41edd 100644 --- a/youtube_dl/extractor/libsyn.py +++ b/youtube_dl/extractor/libsyn.py @@ -58,6 +58,9 @@ class LibsynIE(InfoExtractor): description = self._html_search_regex( r'(.+?)

', webpage, 'description', default=None) + if not description: + details = self._download_json('https://html5-player.libsyn.com/embed/getitemdetails?item_id=' + video_id, video_id) + description = self._html_search_regex(r'

(.+?)

', details.get('item_body'), 'description', default=None) if description: # Strip non-breaking and normal spaces description = description.replace('\u00A0', ' ').strip()