From 7b6a386dde6680f995bcf6f71bd4caee08e523d3 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Fri, 14 Oct 2016 22:16:43 -0400 Subject: [PATCH 01/13] [nytimes] Handle NYT podcasts --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/nytimes.py | 42 +++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 08bed8b0c..64f5073e1 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -641,6 +641,7 @@ from .ntvde import NTVDeIE from .ntvru import NTVRuIE from .nytimes import ( NYTimesIE, + NYTimesPodcastIE, NYTimesArticleIE, ) from .nuvid import NuvidIE diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index 142c34256..dc47ed537 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import hmac @@ -8,6 +9,7 @@ from .common import InfoExtractor from ..utils import ( float_or_none, int_or_none, + js_to_json, parse_iso8601, mimetype2ext, determine_ext, @@ -123,9 +125,47 @@ class NYTimesIE(NYTimesBaseIE): return self._extract_video_from_id(video_id) +class NYTimesPodcastIE(InfoExtractor): + _VALID_URL = r'(?i)https?://(?:www\.)?nytimes\.com/.*/podcasts/(?P[^.]+)(?:\.html)?' + _TESTS = [{ + 'url': 'http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html', + 'md5': 'e0d52040cafb07662acf3c9132db3575', + 'info_dict': { + 'id': '20', + 'title': "The Run-Up: He Was Like an Octopus", + 'ext': 'mp3', + 'description': 'We go behind the story of the two women who told us that Donald Trump touched them inappropriately (which he denies) and check in on Hillary Clinton’s campaign.', + } + } ] + + def _real_extract(self, url): + page_id = self._match_id(url) + + webpage = self._download_webpage(url, page_id) + + data_json = self._html_search_regex(r'NYTD.FlexTypes.push\(({[^)]*)\)', webpage, 'json data'); + audio_data = self._parse_json(data_json, page_id, transform_source=js_to_json)['data'] + + episode_title = audio_data['track']['title'].strip(u"‘’") # strip curlyquotes + episode_number = audio_data['podcast']['episode'].split()[1] + + info_dict = { + 'id': episode_number, + 'title': "%s: %s" % (audio_data['podcast']['title'], episode_title), + 'series': audio_data['podcast']['title'], + 'episode': episode_title, + 'episode_number': episode_number, + 'url': audio_data['track']['source'], + 'duration': audio_data['track']['duration'], + 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage) + } + + return info_dict + + class NYTimesArticleIE(NYTimesBaseIE): - _VALID_URL = r'https?://(?:www\.)?nytimes\.com/(.(?[^.]+)(?:\.html)?' + _VALID_URL = r'https?://(?:www\.)?nytimes\.com/(.(?[^.]+)(?:\.html)?' _TESTS = [{ 'url': 'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0', 'md5': 'e2076d58b4da18e6a001d53fd56db3c9', From 5c31899eb0211a98ab1fc91cd30058253afbf2ce Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Fri, 14 Oct 2016 22:56:41 -0400 Subject: [PATCH 02/13] [nytimes] Podcasts aren't always recognizable by url So merge NYTimesPodcastIE class into NYTArticleIE --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/nytimes.py | 107 +++++++++++++++++------------ 2 files changed, 64 insertions(+), 44 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 64f5073e1..08bed8b0c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -641,7 +641,6 @@ from .ntvde import NTVDeIE from .ntvru import NTVRuIE from .nytimes import ( NYTimesIE, - NYTimesPodcastIE, NYTimesArticleIE, ) from .nuvid import NuvidIE diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index dc47ed537..eb642ccf6 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -125,47 +125,9 @@ class NYTimesIE(NYTimesBaseIE): return self._extract_video_from_id(video_id) -class NYTimesPodcastIE(InfoExtractor): - _VALID_URL = r'(?i)https?://(?:www\.)?nytimes\.com/.*/podcasts/(?P[^.]+)(?:\.html)?' - _TESTS = [{ - 'url': 'http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html', - 'md5': 'e0d52040cafb07662acf3c9132db3575', - 'info_dict': { - 'id': '20', - 'title': "The Run-Up: He Was Like an Octopus", - 'ext': 'mp3', - 'description': 'We go behind the story of the two women who told us that Donald Trump touched them inappropriately (which he denies) and check in on Hillary Clinton’s campaign.', - } - } ] - - - def _real_extract(self, url): - page_id = self._match_id(url) - - webpage = self._download_webpage(url, page_id) - - data_json = self._html_search_regex(r'NYTD.FlexTypes.push\(({[^)]*)\)', webpage, 'json data'); - audio_data = self._parse_json(data_json, page_id, transform_source=js_to_json)['data'] - - episode_title = audio_data['track']['title'].strip(u"‘’") # strip curlyquotes - episode_number = audio_data['podcast']['episode'].split()[1] - - info_dict = { - 'id': episode_number, - 'title': "%s: %s" % (audio_data['podcast']['title'], episode_title), - 'series': audio_data['podcast']['title'], - 'episode': episode_title, - 'episode_number': episode_number, - 'url': audio_data['track']['source'], - 'duration': audio_data['track']['duration'], - 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage) - } - - return info_dict - class NYTimesArticleIE(NYTimesBaseIE): - _VALID_URL = r'https?://(?:www\.)?nytimes\.com/(.(?[^.]+)(?:\.html)?' + _VALID_URL = r'https?://(?:www\.)?nytimes\.com/(.(?[^.]+)(?:\.html)?' _TESTS = [{ 'url': 'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0', 'md5': 'e2076d58b4da18e6a001d53fd56db3c9', @@ -178,16 +140,75 @@ class NYTimesArticleIE(NYTimesBaseIE): 'upload_date': '20150414', 'uploader': 'Matthew Williams', } + }, { + 'url': 'http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html', + 'md5': 'e0d52040cafb07662acf3c9132db3575', + 'info_dict': { + 'id': '20', + 'title': "The Run-Up: He Was Like an Octopus", + 'ext': 'mp3', + 'description': 'We go behind the story of the two women who told us that Donald Trump touched them inappropriately (which he denies) and check in on Hillary Clinton’s campaign.', + } + }, { + 'url': 'http://www.nytimes.com/2016/10/16/books/review/inside-the-new-york-times-book-review-the-rise-of-hitler.html', + 'md5': '66fb5471d7ef15da98af176dc1af4cb9', + 'info_dict': { + 'id': 'inside-the-new-york-times-book-review-the-rise-of-hitler', + 'title': "The Rise of Hitler", + 'ext': 'mp3', + 'description': 'Adam Kirsch discusses Volker Ullrich\'s new biography of Hitler; Billy Collins talks about his latest collection of poems; and iO Tillett Wright on his new memoir, "Darling Days."', + } }, { 'url': 'http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1', 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + page_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, page_id) - video_id = self._html_search_regex(r'data-videoid="(\d+)"', webpage, 'video id') + video_id = self._html_search_regex(r'data-videoid="(\d+)"', webpage, 'video id', None, False) + if video_id is not None: + return self._extract_video_from_id(video_id) + + data_json = self._html_search_regex(r'NYTD.FlexTypes.push\(({[^)]*)\)', webpage, 'json data'); + if data_json is not None: + audio_data = self._parse_json(data_json, page_id, transform_source=js_to_json)['data'] - return self._extract_video_from_id(video_id) + print audio_data + + description = audio_data['track']['description'] + if not len(description): + description = self._html_search_meta(['og:description', 'twitter:description'], webpage) + + + episode_title = audio_data['track']['title'].strip(u"‘’") # strip curlyquotes + episode_number = None + episode = audio_data['podcast']['episode'].split() + if len(episode): + episode_number = int_or_none(episode[-1]) + video_id = episode[-1] + else: + video_id = page_id + + + podcast_title = audio_data['podcast']['title'] + title = None + if podcast_title: + title = "%s: %s" % (podcast_title, episode_title) + else: + title = episode_title + + info_dict = { + 'id': video_id, + 'title': title, + 'series': audio_data['podcast']['title'], + 'episode': episode_title, + 'episode_number': episode_number, + 'url': audio_data['track']['source'], + 'duration': audio_data['track']['duration'], + 'description': description + } + + return info_dict From 7096a99ce7c505470316d0d40943d52f366dd4be Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Fri, 14 Oct 2016 23:03:17 -0400 Subject: [PATCH 03/13] Move podcast extraction into base class --- youtube_dl/extractor/nytimes.py | 80 +++++++++++++++++---------------- 1 file changed, 42 insertions(+), 38 deletions(-) diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index eb642ccf6..48f336072 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -98,7 +98,48 @@ class NYTimesBaseIE(InfoExtractor): 'thumbnails': thumbnails, } + def _extract_podcast_from_json(self, json, page_id, webpage): + audio_data = self._parse_json(json, page_id, transform_source=js_to_json)['data'] + + print audio_data + + description = audio_data['track']['description'] + if not len(description): + description = self._html_search_meta(['og:description', 'twitter:description'], webpage) + + episode_title = audio_data['track']['title'].strip(u"‘’") # strip curlyquotes + episode_number = None + episode = audio_data['podcast']['episode'].split() + if len(episode): + episode_number = int_or_none(episode[-1]) + video_id = episode[-1] + else: + video_id = page_id + + + podcast_title = audio_data['podcast']['title'] + title = None + if podcast_title: + title = "%s: %s" % (podcast_title, episode_title) + else: + title = episode_title + + info_dict = { + 'id': video_id, + 'title': title, + 'series': audio_data['podcast']['title'], + 'episode': episode_title, + 'episode_number': episode_number, + 'url': audio_data['track']['source'], + 'duration': audio_data['track']['duration'], + 'description': description, + } + + return info_dict + + + class NYTimesIE(NYTimesBaseIE): _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P\d+)' @@ -174,41 +215,4 @@ class NYTimesArticleIE(NYTimesBaseIE): data_json = self._html_search_regex(r'NYTD.FlexTypes.push\(({[^)]*)\)', webpage, 'json data'); if data_json is not None: - audio_data = self._parse_json(data_json, page_id, transform_source=js_to_json)['data'] - - print audio_data - - description = audio_data['track']['description'] - if not len(description): - description = self._html_search_meta(['og:description', 'twitter:description'], webpage) - - - episode_title = audio_data['track']['title'].strip(u"‘’") # strip curlyquotes - episode_number = None - episode = audio_data['podcast']['episode'].split() - if len(episode): - episode_number = int_or_none(episode[-1]) - video_id = episode[-1] - else: - video_id = page_id - - - podcast_title = audio_data['podcast']['title'] - title = None - if podcast_title: - title = "%s: %s" % (podcast_title, episode_title) - else: - title = episode_title - - info_dict = { - 'id': video_id, - 'title': title, - 'series': audio_data['podcast']['title'], - 'episode': episode_title, - 'episode_number': episode_number, - 'url': audio_data['track']['source'], - 'duration': audio_data['track']['duration'], - 'description': description - } - - return info_dict + return self._extract_podcast_from_json(data_json, page_id, webpage) From 2117b36d09bfc55cb633f4cf8acad92184e8d475 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Fri, 14 Oct 2016 23:28:59 -0400 Subject: [PATCH 04/13] [nytimes] cleanup, add a fallback from json parsing --- youtube_dl/extractor/nytimes.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index 48f336072..4105802a7 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -101,13 +101,10 @@ class NYTimesBaseIE(InfoExtractor): def _extract_podcast_from_json(self, json, page_id, webpage): audio_data = self._parse_json(json, page_id, transform_source=js_to_json)['data'] - print audio_data - description = audio_data['track']['description'] if not len(description): description = self._html_search_meta(['og:description', 'twitter:description'], webpage) - episode_title = audio_data['track']['title'].strip(u"‘’") # strip curlyquotes episode_number = None episode = audio_data['podcast']['episode'].split() @@ -116,7 +113,6 @@ class NYTimesBaseIE(InfoExtractor): video_id = episode[-1] else: video_id = page_id - podcast_title = audio_data['podcast']['title'] title = None @@ -128,6 +124,7 @@ class NYTimesBaseIE(InfoExtractor): info_dict = { 'id': video_id, 'title': title, + 'creator': audio_data['track'].get('credit'), 'series': audio_data['podcast']['title'], 'episode': episode_title, 'episode_number': episode_number, @@ -139,7 +136,6 @@ class NYTimesBaseIE(InfoExtractor): return info_dict - class NYTimesIE(NYTimesBaseIE): _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P\d+)' @@ -166,7 +162,7 @@ class NYTimesIE(NYTimesBaseIE): return self._extract_video_from_id(video_id) - + class NYTimesArticleIE(NYTimesBaseIE): _VALID_URL = r'https?://(?:www\.)?nytimes\.com/(.(?[^.]+)(?:\.html)?' _TESTS = [{ @@ -213,6 +209,17 @@ class NYTimesArticleIE(NYTimesBaseIE): if video_id is not None: return self._extract_video_from_id(video_id) - data_json = self._html_search_regex(r'NYTD.FlexTypes.push\(({[^)]*)\)', webpage, 'json data'); + data_json = self._html_search_regex(r'NYTD.FlexTypes.push\(({[^)]*)\)', webpage, 'json data', None, False); if data_json is not None: return self._extract_podcast_from_json(data_json, page_id, webpage) + + # Fallback case + # "source":"https:\/\/rss.art19.com\/episodes\/0e2bd0b3-10ef-42c4-9494-0e3d21d2b82a.mp3"," + url=self._html_search_regex(r'"source":"(https?:[^"]+)"', webpage, 'mp3 url') + url = url.replace('\\/','/') + if url is not None: + return { + 'id': page_id, + 'title': self._og_search_title(webpage), + 'url': url + } From 5375b7ff9cd64f0a9e98260205193804eb890903 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 15 Oct 2016 05:15:37 -0400 Subject: [PATCH 05/13] [nytimes] No u"" unicode literals in python3 --- youtube_dl/extractor/nytimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index 4105802a7..d5012af1d 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -105,7 +105,7 @@ class NYTimesBaseIE(InfoExtractor): if not len(description): description = self._html_search_meta(['og:description', 'twitter:description'], webpage) - episode_title = audio_data['track']['title'].strip(u"‘’") # strip curlyquotes + episode_title = audio_data['track']['title'].strip("‘’") # strip curlyquotes episode_number = None episode = audio_data['podcast']['episode'].split() if len(episode): From ccb27a857cd70f761de60224d259efefa5c9b898 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 15 Oct 2016 05:23:47 -0400 Subject: [PATCH 06/13] [nytimes] Remove untestable fallback for JS parsing failure Per @yan12125, who I guess is concerned about testability. Fair enough, though fragility seems an issue too. I could imagine using smuggle_url() to pass in a directive to bypass JS parsing, but that seems way too ugly. Does that imply something about the testing framework? Partial revert of 2117b36d09bfc55cb633f4cf8acad92184e8d475 --- youtube_dl/extractor/nytimes.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index d5012af1d..4cae23b1b 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -212,14 +212,5 @@ class NYTimesArticleIE(NYTimesBaseIE): data_json = self._html_search_regex(r'NYTD.FlexTypes.push\(({[^)]*)\)', webpage, 'json data', None, False); if data_json is not None: return self._extract_podcast_from_json(data_json, page_id, webpage) - - # Fallback case - # "source":"https:\/\/rss.art19.com\/episodes\/0e2bd0b3-10ef-42c4-9494-0e3d21d2b82a.mp3"," - url=self._html_search_regex(r'"source":"(https?:[^"]+)"', webpage, 'mp3 url') - url = url.replace('\\/','/') - if url is not None: - return { - 'id': page_id, - 'title': self._og_search_title(webpage), - 'url': url - } + else: + raise UnsupportedError(url) From 12bf439cc676a6a03dc99cfc055975686b159b87 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 15 Oct 2016 13:25:04 -0400 Subject: [PATCH 07/13] [nytimes] "if var" not "if len(var)" --- youtube_dl/extractor/nytimes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index 4cae23b1b..781d8755a 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -102,13 +102,13 @@ class NYTimesBaseIE(InfoExtractor): audio_data = self._parse_json(json, page_id, transform_source=js_to_json)['data'] description = audio_data['track']['description'] - if not len(description): + if not description: description = self._html_search_meta(['og:description', 'twitter:description'], webpage) episode_title = audio_data['track']['title'].strip("‘’") # strip curlyquotes episode_number = None episode = audio_data['podcast']['episode'].split() - if len(episode): + if episode: episode_number = int_or_none(episode[-1]) video_id = episode[-1] else: From 9fb7e1bd6cd0946f7e760cd0f103b751e913f571 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 15 Oct 2016 13:29:41 -0400 Subject: [PATCH 08/13] [nytimes] Optional fields shouldn't break extraction Also leverage podcast_title var --- youtube_dl/extractor/nytimes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index 781d8755a..009752826 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -125,11 +125,11 @@ class NYTimesBaseIE(InfoExtractor): 'id': video_id, 'title': title, 'creator': audio_data['track'].get('credit'), - 'series': audio_data['podcast']['title'], + 'series': podcast_title, 'episode': episode_title, 'episode_number': episode_number, 'url': audio_data['track']['source'], - 'duration': audio_data['track']['duration'], + 'duration': audio_data['track'].get('duration'), 'description': description, } From 3915b5193e3cfbeb72462c52b184bb38e5f1aab7 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 15 Oct 2016 13:38:48 -0400 Subject: [PATCH 09/13] [nytimes] import UnsupportedError before raising it --- youtube_dl/extractor/nytimes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index 009752826..c4225acf7 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -13,6 +13,7 @@ from ..utils import ( parse_iso8601, mimetype2ext, determine_ext, + UnsupportedError, ) From 001a30f3352bcc829d8e6d4060af2a19cc2c4a82 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 15 Oct 2016 13:41:12 -0400 Subject: [PATCH 10/13] [nytimes] Allow _html_search_regex() to fail Per @dstftw, allow _html_search_regex() to fail instead of raising UnsupportedError() on our own. --- youtube_dl/extractor/nytimes.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index c4225acf7..d1579a2be 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -13,7 +13,6 @@ from ..utils import ( parse_iso8601, mimetype2ext, determine_ext, - UnsupportedError, ) @@ -210,8 +209,5 @@ class NYTimesArticleIE(NYTimesBaseIE): if video_id is not None: return self._extract_video_from_id(video_id) - data_json = self._html_search_regex(r'NYTD.FlexTypes.push\(({[^)]*)\)', webpage, 'json data', None, False); - if data_json is not None: - return self._extract_podcast_from_json(data_json, page_id, webpage) - else: - raise UnsupportedError(url) + data_json = self._html_search_regex(r'NYTD.FlexTypes.push\(({[^)]*)\)', webpage, 'json data') + return self._extract_podcast_from_json(data_json, page_id, webpage) From 804629ecb9296b08e0052307441b7a1c368b07a7 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 15 Oct 2016 17:32:03 -0400 Subject: [PATCH 11/13] [nytimes] json regexp cleanup Escape dots: \. Favor \(({.*})\); over \(({[^)]*)\) in case there are internal close parens in the json data. --- youtube_dl/extractor/nytimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index d1579a2be..c69f40412 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -209,5 +209,5 @@ class NYTimesArticleIE(NYTimesBaseIE): if video_id is not None: return self._extract_video_from_id(video_id) - data_json = self._html_search_regex(r'NYTD.FlexTypes.push\(({[^)]*)\)', webpage, 'json data') + data_json = self._html_search_regex(r'NYTD\.FlexTypes\.push\(({.*})\);', webpage, 'json data') return self._extract_podcast_from_json(data_json, page_id, webpage) From b0eaeedd437007d638af2113a3cad4bff1848cdb Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 15 Oct 2016 17:33:42 -0400 Subject: [PATCH 12/13] [nytimes] don't break if track.description is missing --- youtube_dl/extractor/nytimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index c69f40412..4978cef42 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -101,7 +101,7 @@ class NYTimesBaseIE(InfoExtractor): def _extract_podcast_from_json(self, json, page_id, webpage): audio_data = self._parse_json(json, page_id, transform_source=js_to_json)['data'] - description = audio_data['track']['description'] + description = audio_data['track'].get('description') if not description: description = self._html_search_meta(['og:description', 'twitter:description'], webpage) From 86596efa05cea9ea48e15bc62d5f8760069b10e5 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 15 Oct 2016 17:34:48 -0400 Subject: [PATCH 13/13] [nytimes] don't strip curlyquotes --- youtube_dl/extractor/nytimes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index 4978cef42..245d0e9a6 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -105,7 +105,7 @@ class NYTimesBaseIE(InfoExtractor): if not description: description = self._html_search_meta(['og:description', 'twitter:description'], webpage) - episode_title = audio_data['track']['title'].strip("‘’") # strip curlyquotes + episode_title = audio_data['track']['title'] episode_number = None episode = audio_data['podcast']['episode'].split() if episode: @@ -182,7 +182,7 @@ class NYTimesArticleIE(NYTimesBaseIE): 'md5': 'e0d52040cafb07662acf3c9132db3575', 'info_dict': { 'id': '20', - 'title': "The Run-Up: He Was Like an Octopus", + 'title': "The Run-Up: \u2018He Was Like an Octopus\u2019", 'ext': 'mp3', 'description': 'We go behind the story of the two women who told us that Donald Trump touched them inappropriately (which he denies) and check in on Hillary Clinton’s campaign.', }