From 7c1a8dc8de52d11f0d25e9d20e5ca4d8fef011cc Mon Sep 17 00:00:00 2001 From: Kai Curtis Date: Sat, 10 Mar 2018 17:09:02 -0800 Subject: [PATCH 1/4] [TastyTrade] Fix old test, handle alt urls Since TastyTrade also emails out urls for episodes in another format (and actually puts the content into different templates, thus the alternative handling), additional handling is needed to correctly recognize the URL and extract the metadata. --- youtube_dl/extractor/tastytrade.py | 56 ++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/tastytrade.py b/youtube_dl/extractor/tastytrade.py index 7fe96bd5f..74cb681a5 100644 --- a/youtube_dl/extractor/tastytrade.py +++ b/youtube_dl/extractor/tastytrade.py @@ -2,19 +2,28 @@ from __future__ import unicode_literals from .common import InfoExtractor from .ooyala import OoyalaIE +from youtube_dl.utils import ( + ExtractorError, +) + +import json +import re +import sys class TastyTradeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tastytrade\.com/tt/shows/[^/]+/episodes/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?tastytrade\.com/tt/(shows|daily_recaps)/[^/]+/episodes/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://www.tastytrade.com/tt/shows/market-measures/episodes/correlation-in-short-volatility-06-28-2017', 'info_dict': { - 'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM', + 'id': '8xZW5xYjE6aLXhPwseCpyIf50oQw69JM', 'ext': 'mp4', - 'title': 'A History of Teaming', - 'description': 'md5:2a9033db8da81f2edffa4c99888140b3', - 'duration': 422.255, + 'title': 'Correlation in Short Volatility', + 'description': '[Correlation](https://www.tastytrade.com/tt/learn/correlation) is always changing and positions can be more correlated than we suspect. 
We can even have...', + 'duration': 753.0, + 'upload_date': '20170628', + 'timestamp': 1498608000, }, 'params': { 'skip_download': True, @@ -23,6 +32,18 @@ class TastyTradeIE(InfoExtractor): }, { 'url': 'https://www.tastytrade.com/tt/shows/daily-dose/episodes/daily-dose-06-30-2017', 'only_matching': True, + }, { + 'url': 'https://www.tastytrade.com/tt/daily_recaps/2018-03-09/episodes/soybeans-corn-its-planting-time-03-09-2018', + 'info_dict': { + 'id': 'lud3BtZTE6vnRdolxKRlwNoZQvb3z_LT', + 'ext': 'mp4', + 'title': 'Soybeans & Corn: It\'s Planting Time', + 'description': 'md5:a523504b1227de1b81faeba2876a6d23', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Ooyala'], }] def _real_extract(self, url): @@ -33,7 +54,30 @@ class TastyTradeIE(InfoExtractor): r'data-media-id=(["\'])(?P<code>(?:(?!\1).)+)\1', webpage, 'ooyala code', group='code') - info = self._search_json_ld(webpage, display_id, fatal=False) + info = {'id': None, 'title': None, 'description': None} + + try: + info = self._search_json_ld(webpage, display_id, fatal=False) + except ExtractorError as ex: + json_string_match = re.search( + r'var episodeData = \$.parseJSON\("(?P<episode_json>.*)"\)', webpage, 0) + + if (json_string_match): + escaped_json_string = json_string_match.group('episode_json') + + if sys.version_info[0] >= 3: + unescaped_json_string = bytes( + escaped_json_string, "utf-8").decode('unicode_escape') + else: + unescaped_json_string = escaped_json_string.decode( + 'string_escape') + metadata = json.loads(unescaped_json_string) + info = { + 'id': metadata.get('mediaId'), + 'title': metadata.get('title'), + 'description': metadata.get('description') + } + info.update({ '_type': 'url_transparent', 'ie_key': OoyalaIE.ie_key(), From 28fd6fade54d149f30edaa563d1d1e12f33b0c21 Mon Sep 17 00:00:00 2001 From: Kai Curtis Date: Sun, 11 Mar 2018 22:19:47 -0700 Subject: [PATCH 2/4] Don't capture groups unnecessarily --- youtube_dl/extractor/tastytrade.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/youtube_dl/extractor/tastytrade.py b/youtube_dl/extractor/tastytrade.py index 74cb681a5..9d6bbc1c0 100644 --- a/youtube_dl/extractor/tastytrade.py +++ b/youtube_dl/extractor/tastytrade.py @@ -12,7 +12,7 @@ import sys class TastyTradeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tastytrade\.com/tt/(shows|daily_recaps)/[^/]+/episodes/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?tastytrade\.com/tt/(?:shows|daily_recaps)/[^/]+/episodes/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://www.tastytrade.com/tt/shows/market-measures/episodes/correlation-in-short-volatility-06-28-2017', From 71b7f6133b79601202e777a90e0dfcbd56d8ed34 Mon Sep 17 00:00:00 2001 From: Kai Curtis Date: Sun, 11 Mar 2018 22:20:08 -0700 Subject: [PATCH 3/4] Prefer helper funcs, control flow w/ return values Native regex and json parse have been replaced by library versions with more comprehensive error handling. There was flow control via try/catch before that was based on an error being thrown only in test. This has been replaced by an if/else based on return value. 
--- youtube_dl/extractor/tastytrade.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/tastytrade.py b/youtube_dl/extractor/tastytrade.py index 9d6bbc1c0..31acdaed8 100644 --- a/youtube_dl/extractor/tastytrade.py +++ b/youtube_dl/extractor/tastytrade.py @@ -2,12 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from .ooyala import OoyalaIE -from youtube_dl.utils import ( - ExtractorError, -) -import json -import re import sys @@ -56,22 +51,28 @@ class TastyTradeIE(InfoExtractor): info = {'id': None, 'title': None, 'description': None} - try: - info = self._search_json_ld(webpage, display_id, fatal=False) - except ExtractorError as ex: - json_string_match = re.search( - r'var episodeData = \$.parseJSON\("(?P<episode_json>.*)"\)', webpage, 0) + json_ld_info = self._search_json_ld( + webpage, display_id, default=None, fatal=False) - if (json_string_match): - escaped_json_string = json_string_match.group('episode_json') + if (json_ld_info): + info = json_ld_info + else: + escaped_json_string = self._search_regex( + r'var episodeData = \$.parseJSON\("(?P<episode_json>.*)"\)', + webpage, + 'episode json', + fatal=False, + group='episode_json' + ) + if (escaped_json_string): if sys.version_info[0] >= 3: unescaped_json_string = bytes( escaped_json_string, "utf-8").decode('unicode_escape') else: unescaped_json_string = escaped_json_string.decode( 'string_escape') - metadata = json.loads(unescaped_json_string) + metadata = self._parse_json(unescaped_json_string, ooyala_code) info = { 'id': metadata.get('mediaId'), 'title': metadata.get('title'), From 43fff33749397d82cea18090434d7d7d62029478 Mon Sep 17 00:00:00 2001 From: Kai Curtis Date: Mon, 12 Mar 2018 10:23:33 -0700 Subject: [PATCH 4/4] Remove extra metadata parsing Since my goal in modifying this extractor was just to be able to get videos from another set of urls on the site and the code that I had added was doing more than that, I've pared it down 
to the bare essentials. --- youtube_dl/extractor/tastytrade.py | 36 +++--------------------------- 1 file changed, 3 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/tastytrade.py b/youtube_dl/extractor/tastytrade.py index 31acdaed8..d5b0bff4b 100644 --- a/youtube_dl/extractor/tastytrade.py +++ b/youtube_dl/extractor/tastytrade.py @@ -3,8 +3,6 @@ from __future__ import unicode_literals from .common import InfoExtractor from .ooyala import OoyalaIE -import sys - class TastyTradeIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tastytrade\.com/tt/(?:shows|daily_recaps)/[^/]+/episodes/(?P<id>[^/?#&]+)' @@ -32,8 +30,8 @@ class TastyTradeIE(InfoExtractor): 'info_dict': { 'id': 'lud3BtZTE6vnRdolxKRlwNoZQvb3z_LT', 'ext': 'mp4', - 'title': 'Soybeans & Corn: It\'s Planting Time', - 'description': 'md5:a523504b1227de1b81faeba2876a6d23', + 'title': 'TTL_CTGFE_180309_SEG_EDIT.mp4', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', }, 'params': { 'skip_download': True, @@ -49,35 +47,7 @@ class TastyTradeIE(InfoExtractor): r'data-media-id=(["\'])(?P<code>(?:(?!\1).)+)\1', webpage, 'ooyala code', group='code') - info = {'id': None, 'title': None, 'description': None} - - json_ld_info = self._search_json_ld( - webpage, display_id, default=None, fatal=False) - - if (json_ld_info): - info = json_ld_info - else: - escaped_json_string = self._search_regex( - r'var episodeData = \$.parseJSON\("(?P<episode_json>.*)"\)', - webpage, - 'episode json', - fatal=False, - group='episode_json' - ) - - if (escaped_json_string): - if sys.version_info[0] >= 3: - unescaped_json_string = bytes( - escaped_json_string, "utf-8").decode('unicode_escape') - else: - unescaped_json_string = escaped_json_string.decode( - 'string_escape') - metadata = self._parse_json(unescaped_json_string, ooyala_code) - info = { - 'id': metadata.get('mediaId'), - 'title': metadata.get('title'), - 'description': metadata.get('description') - } + info = self._search_json_ld(webpage, display_id, default={}) info.update({ 
'_type': 'url_transparent',