From e4ae478a0d12aef68de696125ae08877a806c43e Mon Sep 17 00:00:00 2001
From: John Hawkinson <jhawk@mit.edu>
Date: Sat, 25 Mar 2017 19:47:48 -0400
Subject: [PATCH 1/5] [WSJ] Extract videos linked from articles, too

---
 youtube_dl/extractor/extractors.py |  5 ++++-
 youtube_dl/extractor/wsj.py        | 23 +++++++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 6a7028a4d..84bbec6c4 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1210,7 +1210,10 @@ from .wrzuta import (
     WrzutaIE,
     WrzutaPlaylistIE,
 )
-from .wsj import WSJIE
+from .wsj import (
+    WSJIE,
+    WSJArticleIE,
+)
 from .xbef import XBefIE
 from .xboxclips import XboxClipsIE
 from .xfileshare import XFileShareIE
diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py
index deb7483ae..cd03bdc6d 100644
--- a/youtube_dl/extractor/wsj.py
+++ b/youtube_dl/extractor/wsj.py
@@ -37,7 +37,9 @@ class WSJIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
+        return self._extract_video(video_id)
 
+    def _extract_video(self, video_id):
         api_url = (
             'http://video-api.wsj.com/api-video/find_all_videos.asp?'
             'type=guid&count=1&query=%s&fields=type,hls,videoMP4List,'
@@ -87,3 +89,24 @@ class WSJIE(InfoExtractor):
             'title': title,
             'categories': info.get('keywords'),
         }
+
+
+class WSJArticleIE(WSJIE):
+    _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>\w[^/]+)'
+    _TESTS = [{
+        'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?',
+        'info_dict': {
+            'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362',
+            'ext': 'mp4',
+            'upload_date': '20170221',
+            'uploader_id': 'ralcaraz',
+            'title': 'Bao Bao the Panda Leaves for China',
+        }
+    }]
+
+    def _real_extract(self, url):
+        article_id = self._match_id(url)
+        webpage = self._download_webpage(url, article_id)
+        video_id = self._html_search_regex(r'data-src=["\']([A-Z0-9\-]+)',
+                                           webpage, 'video id')
+        return self._extract_video(video_id)

From aa4f4ee753a183a9c01904aff28c98999f77258c Mon Sep 17 00:00:00 2001
From: John Hawkinson <jhawk@mit.edu>
Date: Sun, 9 Apr 2017 08:55:50 -0400
Subject: [PATCH 2/5] [WSJArticle] Use _search_regex, not _html_search_regex, for data-src attr

---
 youtube_dl/extractor/wsj.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py
index cd03bdc6d..dfbad933e 100644
--- a/youtube_dl/extractor/wsj.py
+++ b/youtube_dl/extractor/wsj.py
@@ -107,6 +107,6 @@ class WSJArticleIE(WSJIE):
     def _real_extract(self, url):
         article_id = self._match_id(url)
         webpage = self._download_webpage(url, article_id)
-        video_id = self._html_search_regex(r'data-src=["\']([A-Z0-9\-]+)',
-                                           webpage, 'video id')
+        video_id = self._search_regex(r'data-src=["\']([A-Z0-9\-]+)',
+                                      webpage, 'video id')
         return self._extract_video(video_id)

From fdf743bb55accd650e4eecc790b60c35e66bd985 Mon Sep 17 00:00:00 2001
From: John Hawkinson <jhawk@mit.edu>
Date: Sun, 9 Apr 2017 09:23:19 -0400
Subject: [PATCH 3/5] [WSJArticle] Delegate via url_result()

Delegate to the base WSJIE class via url_result() instead of
calling an inherited _extract_video() method.
---
 youtube_dl/extractor/wsj.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py
index dfbad933e..37fe64456 100644
--- a/youtube_dl/extractor/wsj.py
+++ b/youtube_dl/extractor/wsj.py
@@ -17,6 +17,7 @@ class WSJIE(InfoExtractor):
         )
         (?P<id>[a-zA-Z0-9-]+)'''
     IE_DESC = 'Wall Street Journal'
+    TEMPLATE_URL = 'https://wsj.com/video/./%s'
     _TESTS = [{
         'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
         'md5': 'e230a5bb249075e40793b655a54a02e4',
@@ -109,4 +110,4 @@ class WSJArticleIE(WSJIE):
         webpage = self._download_webpage(url, article_id)
         video_id = self._search_regex(r'data-src=["\']([A-Z0-9\-]+)',
                                       webpage, 'video id')
-        return self._extract_video(video_id)
+        return self.url_result(self.TEMPLATE_URL % video_id, WSJIE.ie_key())

From 349b15f380567fc2baff9928237c58d347e95ed5 Mon Sep 17 00:00:00 2001
From: John Hawkinson <jhawk@mit.edu>
Date: Sun, 9 Apr 2017 09:32:31 -0400
Subject: [PATCH 4/5] [WSJArticle] revert separate _extract_video() method

---
 youtube_dl/extractor/wsj.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py
index 37fe64456..7c3521243 100644
--- a/youtube_dl/extractor/wsj.py
+++ b/youtube_dl/extractor/wsj.py
@@ -38,9 +38,7 @@ class WSJIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        return self._extract_video(video_id)
 
-    def _extract_video(self, video_id):
         api_url = (
             'http://video-api.wsj.com/api-video/find_all_videos.asp?'
             'type=guid&count=1&query=%s&fields=type,hls,videoMP4List,'

From 3365000d2a994338c92d9a5321980ff651f4df0b Mon Sep 17 00:00:00 2001
From: John Hawkinson <jhawk@mit.edu>
Date: Sun, 9 Apr 2017 10:03:08 -0400
Subject: [PATCH 5/5] [WSJArticle] cleanups...

- Add a wsj:<id> shortcut to WSJIE._VALID_URL and revert TEMPLATE_URL
- Don't inherit WSJArticleIE from WSJIE
- Pass video_id to url_result()
---
 youtube_dl/extractor/wsj.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py
index 7c3521243..ec38a2ad8 100644
--- a/youtube_dl/extractor/wsj.py
+++ b/youtube_dl/extractor/wsj.py
@@ -10,14 +10,14 @@ from ..utils import (
 
 
 class WSJIE(InfoExtractor):
-    _VALID_URL = r'''(?x)https?://
+    _VALID_URL = r'''(?x)
         (?:
-            video-api\.wsj\.com/api-video/player/iframe\.html\?guid=|
-            (?:www\.)?wsj\.com/video/[^/]+/
+            https?://video-api\.wsj\.com/api-video/player/iframe\.html\?guid=|
+            https?://(?:www\.)?wsj\.com/video/[^/]+/|
+            wsj:
         )
         (?P<id>[a-zA-Z0-9-]+)'''
     IE_DESC = 'Wall Street Journal'
-    TEMPLATE_URL = 'https://wsj.com/video/./%s'
     _TESTS = [{
         'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
         'md5': 'e230a5bb249075e40793b655a54a02e4',
@@ -90,7 +90,7 @@ class WSJIE(InfoExtractor):
         }
 
 
-class WSJArticleIE(WSJIE):
+class WSJArticleIE(InfoExtractor):
     _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>\w[^/]+)'
     _TESTS = [{
         'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?',
@@ -108,4 +108,4 @@ class WSJArticleIE(WSJIE):
         webpage = self._download_webpage(url, article_id)
         video_id = self._search_regex(r'data-src=["\']([A-Z0-9\-]+)',
                                       webpage, 'video id')
-        return self.url_result(self.TEMPLATE_URL % video_id, WSJIE.ie_key())
+        return self.url_result('wsj:%s' % video_id, WSJIE.ie_key(), video_id)