From e4ae478a0d12aef68de696125ae08877a806c43e Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sat, 25 Mar 2017 19:47:48 -0400 Subject: [PATCH 1/5] [WSJ] Extract videos linked from articles, too --- youtube_dl/extractor/extractors.py | 5 ++++- youtube_dl/extractor/wsj.py | 23 +++++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6a7028a4d..84bbec6c4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1210,7 +1210,10 @@ from .wrzuta import ( WrzutaIE, WrzutaPlaylistIE, ) -from .wsj import WSJIE +from .wsj import ( + WSJIE, + WSJArticleIE, +) from .xbef import XBefIE from .xboxclips import XboxClipsIE from .xfileshare import XFileShareIE diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py index deb7483ae..cd03bdc6d 100644 --- a/youtube_dl/extractor/wsj.py +++ b/youtube_dl/extractor/wsj.py @@ -37,7 +37,9 @@ class WSJIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + return self._extract_video(video_id) + def _extract_video(self, video_id): api_url = ( 'http://video-api.wsj.com/api-video/find_all_videos.asp?' 
'type=guid&count=1&query=%s&fields=type,hls,videoMP4List,' @@ -87,3 +89,24 @@ class WSJIE(InfoExtractor): 'title': title, 'categories': info.get('keywords'), } + + +class WSJArticleIE(WSJIE): + _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>\w[^/]+)' + _TESTS = [{ + 'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?', + 'info_dict': { + 'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362', + 'ext': 'mp4', + 'upload_date': '20170221', + 'uploader_id': 'ralcaraz', + 'title': 'Bao Bao the Panda Leaves for China', + } + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + video_id = self._html_search_regex(r'data-src=["\']([A-Z0-9\-]+)', + webpage, 'video id') + return self._extract_video(video_id) From aa4f4ee753a183a9c01904aff28c98999f77258c Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sun, 9 Apr 2017 08:55:50 -0400 Subject: [PATCH 2/5] [WSJArticle] _search_regex not _html_* data-src attr --- youtube_dl/extractor/wsj.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py index cd03bdc6d..dfbad933e 100644 --- a/youtube_dl/extractor/wsj.py +++ b/youtube_dl/extractor/wsj.py @@ -107,6 +107,6 @@ class WSJArticleIE(WSJIE): def _real_extract(self, url): article_id = self._match_id(url) webpage = self._download_webpage(url, article_id) - video_id = self._html_search_regex(r'data-src=["\']([A-Z0-9\-]+)', - webpage, 'video id') + video_id = self._search_regex(r'data-src=["\']([A-Z0-9\-]+)', + webpage, 'video id') return self._extract_video(video_id) From fdf743bb55accd650e4eecc790b60c35e66bd985 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sun, 9 Apr 2017 09:23:19 -0400 Subject: [PATCH 3/5] [WSJArticle] Delegate via url_result() ...to base WSJIE class. Don't call an inherited _extract_video() method.
--- youtube_dl/extractor/wsj.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py index dfbad933e..37fe64456 100644 --- a/youtube_dl/extractor/wsj.py +++ b/youtube_dl/extractor/wsj.py @@ -17,6 +17,7 @@ class WSJIE(InfoExtractor): ) (?P<id>[a-zA-Z0-9-]+)''' IE_DESC = 'Wall Street Journal' + TEMPLATE_URL = 'https://wsj.com/video/./%s' _TESTS = [{ 'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A', 'md5': 'e230a5bb249075e40793b655a54a02e4', @@ -109,4 +110,4 @@ class WSJArticleIE(WSJIE): webpage = self._download_webpage(url, article_id) video_id = self._search_regex(r'data-src=["\']([A-Z0-9\-]+)', webpage, 'video id') - return self._extract_video(video_id) + return self.url_result(self.TEMPLATE_URL % video_id, WSJIE.ie_key()) From 349b15f380567fc2baff9928237c58d347e95ed5 Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sun, 9 Apr 2017 09:32:31 -0400 Subject: [PATCH 4/5] [WSJArticle] revert separate _extract_video() method --- youtube_dl/extractor/wsj.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py index 37fe64456..7c3521243 100644 --- a/youtube_dl/extractor/wsj.py +++ b/youtube_dl/extractor/wsj.py @@ -38,9 +38,7 @@ class WSJIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - return self._extract_video(video_id) - def _extract_video(self, video_id): api_url = ( 'http://video-api.wsj.com/api-video/find_all_videos.asp?' 'type=guid&count=1&query=%s&fields=type,hls,videoMP4List,' From 3365000d2a994338c92d9a5321980ff651f4df0b Mon Sep 17 00:00:00 2001 From: John Hawkinson Date: Sun, 9 Apr 2017 10:03:08 -0400 Subject: [PATCH 5/5] [WSJArticle] cleanups... . Create a wsj: shortcut, revert TEMPLATE_URL stuff . Don't inherit from WSJIE .
Pass video_id to url_result() --- youtube_dl/extractor/wsj.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py index 7c3521243..ec38a2ad8 100644 --- a/youtube_dl/extractor/wsj.py +++ b/youtube_dl/extractor/wsj.py @@ -10,14 +10,14 @@ from ..utils import ( class WSJIE(InfoExtractor): - _VALID_URL = r'''(?x)https?:// + _VALID_URL = r'''(?x) (?: - video-api\.wsj\.com/api-video/player/iframe\.html\?guid=| - (?:www\.)?wsj\.com/video/[^/]+/ + https?://video-api\.wsj\.com/api-video/player/iframe\.html\?guid=| + https?://(?:www\.)?wsj\.com/video/[^/]+/| + wsj: ) (?P<id>[a-zA-Z0-9-]+)''' IE_DESC = 'Wall Street Journal' - TEMPLATE_URL = 'https://wsj.com/video/./%s' _TESTS = [{ 'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A', 'md5': 'e230a5bb249075e40793b655a54a02e4', @@ -90,7 +90,7 @@ class WSJIE(InfoExtractor): } -class WSJArticleIE(WSJIE): +class WSJArticleIE(InfoExtractor): _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>\w[^/]+)' _TESTS = [{ 'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?', @@ -108,4 +108,4 @@ class WSJArticleIE(WSJIE): webpage = self._download_webpage(url, article_id) video_id = self._search_regex(r'data-src=["\']([A-Z0-9\-]+)', webpage, 'video id') - return self.url_result(self.TEMPLATE_URL % video_id, WSJIE.ie_key()) + return self.url_result('wsj:%s' % video_id, WSJIE.ie_key(), video_id)