From d1ec773e4f9208ca7cad2c93b7ed33ce7b91480f Mon Sep 17 00:00:00 2001 From: "Jeremie J. Jarosh" Date: Sun, 7 Aug 2016 00:57:10 -0500 Subject: [PATCH 1/7] [Go90] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/go90.py | 80 ++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 youtube_dl/extractor/go90.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 11b64eeaa..17d12e315 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -308,6 +308,7 @@ from .globo import ( GloboIE, GloboArticleIE, ) +from .go90 import Go90IE from .godtube import GodTubeIE from .godtv import GodTVIE from .goldenmoustache import GoldenMoustacheIE diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py new file mode 100644 index 000000000..cf51d7308 --- /dev/null +++ b/youtube_dl/extractor/go90.py @@ -0,0 +1,80 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import urllib #DEBUG + +from .common import InfoExtractor + + +class Go90IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?go90\.com/profiles/va_(?P[a-f0-9]+)' + _TEST = { + 'url': 'https://www.go90.com/profiles/va_07d47f43a7b04eb5b693252f2bd1086b', + 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', + 'info_dict': { + 'id': '07d47f43a7b04eb5b693252f2bd1086b', + 'ext': 'mp4', + 'title': 't@gged S1:E1 #shotgun', + 'thumbnail': 're:^https?://.*\.jpg$', + # TODO more properties, either as: + # * A value + # * MD5 checksum; start the string with md5: + # * A regular expression; start the string with re: + # * Any Python type (for example int or float) + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # TODO more code goes here, for example ... + #title = self._html_search_regex(r'

(.+?)

', webpage, 'title') + + series_title = self._html_search_regex(r']* data-reactid="90">(.+?)', webpage, 'series_title') + season_episode_numbers = self._html_search_regex(r'(.+?)', webpage, 'season_episode_numbers') + episode_title = self._html_search_regex(r'(.+?)', webpage, 'episode_title') + + title = series_title + " " + season_episode_numbers + " " + episode_title + #print "[!!!] " + title + + #page_data_json = self._search_regex(r']*>window\.__data=(.+?);\s*', webpage, 'page_data', flags=re.DOTALL) + #print self.transform_source(page_data_json) + #page_data = self._parse_json(page_data_json, video_id, transform_source=self.transform_source) + + + + video_api = "https://www.go90.com/api/metadata/video/" + video_id + + video_api_data = self._download_json(video_api, video_id) #TODO: overwrite `note=` to output better explanation + #print "[!!!] " + video_api_data['url'] + + video_token_url = re.sub(r'^//', 'https://', video_api_data['url']) #TODO: use utils.sanitize_url() + #print "[!!!] " + video_token_url + + video_token_data = self._download_json(video_token_url, video_id) #TODO: overwrite `note=` to output better explanation + #print "[!!!] " + video_token_data['playURL'] + + m3u8_url = video_token_data['playURL'] + + #DEBUG + testfile = urllib.URLopener() + testfile.retrieve(m3u8_url, video_id + ".m3u8") + #/DEBUG + + formats = [] + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'formats': formats, + #'uploader': self._search_regex(r']+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), + # TODO more properties (see youtube_dl/extractor/common.py) + } + + #def transform_source(self, json_string): + # return re.sub(re.sub(r':function.*?},([\[{"])', ':"",\g<1>', json_string, flags=re.DOTALL) \ No newline at end of file From f7fe731cbfa34cb96d5cfb97fae317d7dc5557b7 Mon Sep 17 00:00:00 2001 From: "Jeremie J. Jarosh" Date: Sat, 4 Feb 2017 12:11:21 -0600 Subject: [PATCH 2/7] [Go90] Use `UplynkPreplayIE` to extract data --- youtube_dl/extractor/go90.py | 82 +++++++++++++----------------------- 1 file changed, 30 insertions(+), 52 deletions(-) diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py index cf51d7308..c4322cdc7 100644 --- a/youtube_dl/extractor/go90.py +++ b/youtube_dl/extractor/go90.py @@ -1,10 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals -import re -import urllib #DEBUG - from .common import InfoExtractor +from .uplynk import UplynkPreplayIE +from ..utils import sanitize_url class Go90IE(InfoExtractor): @@ -16,7 +15,7 @@ class Go90IE(InfoExtractor): 'id': '07d47f43a7b04eb5b693252f2bd1086b', 'ext': 'mp4', 'title': 't@gged S1:E1 #shotgun', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', # TODO more properties, either as: # * A value # * MD5 checksum; start the string with md5: @@ -29,52 +28,31 @@ class Go90IE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - # TODO more code goes here, for example ... - #title = self._html_search_regex(r'

(.+?)

', webpage, 'title') - - series_title = self._html_search_regex(r']* data-reactid="90">(.+?)', webpage, 'series_title') - season_episode_numbers = self._html_search_regex(r'(.+?)', webpage, 'season_episode_numbers') - episode_title = self._html_search_regex(r'(.+?)', webpage, 'episode_title') - - title = series_title + " " + season_episode_numbers + " " + episode_title - #print "[!!!] " + title - - #page_data_json = self._search_regex(r']*>window\.__data=(.+?);\s*', webpage, 'page_data', flags=re.DOTALL) - #print self.transform_source(page_data_json) - #page_data = self._parse_json(page_data_json, video_id, transform_source=self.transform_source) - - - - video_api = "https://www.go90.com/api/metadata/video/" + video_id - - video_api_data = self._download_json(video_api, video_id) #TODO: overwrite `note=` to output better explanation - #print "[!!!] " + video_api_data['url'] - - video_token_url = re.sub(r'^//', 'https://', video_api_data['url']) #TODO: use utils.sanitize_url() - #print "[!!!] " + video_token_url - - video_token_data = self._download_json(video_token_url, video_id) #TODO: overwrite `note=` to output better explanation - #print "[!!!] " + video_token_data['playURL'] - - m3u8_url = video_token_data['playURL'] - - #DEBUG - testfile = urllib.URLopener() - testfile.retrieve(m3u8_url, video_id + ".m3u8") - #/DEBUG - - formats = [] - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - return { - 'id': video_id, - 'title': title, - 'description': self._og_search_description(webpage), - 'formats': formats, - #'uploader': self._search_regex(r']+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), - # TODO more properties (see youtube_dl/extractor/common.py) - } - - #def transform_source(self, json_string): - # return re.sub(re.sub(r':function.*?},([\[{"])', ':"",\g<1>', json_string, flags=re.DOTALL) \ No newline at end of file + # scrape data from webpage + page_data = {} + self.to_screen("Scrape data from webpage") + + page_data['id'] = video_id + + video_title = self._html_search_regex( + r']*>\s*(.*)\s*', webpage, 'title') + page_data['title'] = video_title + self.to_screen("Title: " + page_data['title']) + + + # retrieve upLynk data + video_api = "https://www.go90.com/api/metadata/video/" + video_id + video_api_data = self._download_json(video_api, video_id) #TODO: overwrite `note=` to output better explanation + video_token_url = sanitize_url(video_api_data['url']) + + uplynk_preplay = UplynkPreplayIE(self._downloader) + uplynk_data = uplynk_preplay.extract(video_token_url) + + + # merge data + video_data = uplynk_data.copy() + video_data.update(page_data) + # TODO more properties (see youtube_dl/extractor/common.py) + + return video_data From f0870a92aa473c9a34a15acf46858d1591c31d7a Mon Sep 17 00:00:00 2001 From: "Jeremie J. Jarosh" Date: Sat, 4 Feb 2017 12:47:46 -0600 Subject: [PATCH 3/7] [Go90] Correctly pass the preplay URL to the uplynk extractor --- youtube_dl/extractor/go90.py | 37 ++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py index c4322cdc7..d60a4d698 100644 --- a/youtube_dl/extractor/go90.py +++ b/youtube_dl/extractor/go90.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals from .common import InfoExtractor -from .uplynk import UplynkPreplayIE from ..utils import sanitize_url @@ -14,14 +13,20 @@ class Go90IE(InfoExtractor): 'info_dict': { 'id': '07d47f43a7b04eb5b693252f2bd1086b', 'ext': 'mp4', - 'title': 't@gged S1:E1 #shotgun', + 'title': 't@gged | #shotgun | go90', 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader_id': '98ac1613c7624a8387596b5d5e441064', # TODO more properties, either as: # * A value # * MD5 checksum; start the string with md5: # * A regular expression; start the string with re: # * Any Python type (for example int or float) - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['UplynkPreplay'], } def _real_extract(self, url): @@ -33,26 +38,22 @@ class Go90IE(InfoExtractor): page_data = {} self.to_screen("Scrape data from webpage") - page_data['id'] = video_id - video_title = self._html_search_regex( r']*>\s*(.*)\s*', webpage, 'title') - page_data['title'] = video_title - self.to_screen("Title: " + page_data['title']) + self.to_screen("Title: " + video_title) - # retrieve upLynk data + # retrieve upLynk url video_api = "https://www.go90.com/api/metadata/video/" + video_id video_api_data = self._download_json(video_api, video_id) #TODO: overwrite `note=` to output better explanation - video_token_url = sanitize_url(video_api_data['url']) - - uplynk_preplay = UplynkPreplayIE(self._downloader) - uplynk_data = uplynk_preplay.extract(video_token_url) + uplynk_preplay_url = sanitize_url(video_api_data['url']) - # merge data - video_data = uplynk_data.copy() - video_data.update(page_data) - # TODO more properties (see youtube_dl/extractor/common.py) - - return video_data + return { + '_type': 'url_transparent', + 'url': uplynk_preplay_url, + 'id': video_id, + 'title': video_title, + 'ie_key': 'UplynkPreplay', + # TODO more properties (see youtube_dl/extractor/common.py) + } From 2e8076b1bb04b616efc618078fd9c20ffa321b37 Mon Sep 17 00:00:00 2001 From: "Jeremie J. Jarosh" Date: Sat, 4 Feb 2017 14:49:50 -0600 Subject: [PATCH 4/7] [Go90] Add properties for episodic content --- youtube_dl/extractor/go90.py | 61 +++++++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py index d60a4d698..fc5d7f739 100644 --- a/youtube_dl/extractor/go90.py +++ b/youtube_dl/extractor/go90.py @@ -1,8 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals +import re +from datetime import datetime + from .common import InfoExtractor -from ..utils import sanitize_url +from ..utils import ( + clean_html, + get_element_by_id, + int_or_none, + sanitize_url, +) class Go90IE(InfoExtractor): @@ -13,14 +21,10 @@ class Go90IE(InfoExtractor): 'info_dict': { 'id': '07d47f43a7b04eb5b693252f2bd1086b', 'ext': 'mp4', - 'title': 't@gged | #shotgun | go90', + 'title': 't@gged S1E1 #shotgun', 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:1ebcc7a686d93456a822d435d2ac7719', 'uploader_id': '98ac1613c7624a8387596b5d5e441064', - # TODO more properties, either as: - # * A value - # * MD5 checksum; start the string with md5: - # * A regular expression; start the string with re: - # * Any Python type (for example int or float) }, 'params': { # m3u8 download @@ -35,17 +39,45 @@ class Go90IE(InfoExtractor): # scrape data from webpage - page_data = {} self.to_screen("Scrape data from webpage") - video_title = self._html_search_regex( - r']*>\s*(.*)\s*', webpage, 'title') + series_title = clean_html(get_element_by_id('series-title', webpage)) + self.to_screen("Series Title: " + series_title) + + episode_info = clean_html(get_element_by_id('episode-title', webpage)) + + season_number = None + episode_number = None + episode_title = None + + episode_match = re.match( + r'S(?P\d+):E(?P\d+)\s+(?P.*)', + episode_info) + if episode_match is not None: + season_number, episode_number, episode_title = episode_match.groups() + self.to_screen("Season: " + season_number) + self.to_screen("Episode Number: " + episode_number) + self.to_screen("Episode Title: " + episode_title) + + video_title = series_title + if episode_match is not None: + video_title = '{} S{}E{} {}'.format( + series_title, season_number, episode_number, episode_title) self.to_screen("Title: " + video_title) + video_description = self._og_search_description(webpage) + + release_date = None + air_date = clean_html(get_element_by_id('asset-air-date', webpage)) + if air_date: + self.to_screen("Air Date: " + air_date) + release_datetime = datetime.strptime(air_date, '%b %d, %Y') + release_date = release_datetime.strftime('%Y%m%d') + # retrieve upLynk url video_api = "https://www.go90.com/api/metadata/video/" + video_id - video_api_data = self._download_json(video_api, video_id) #TODO: overwrite `note=` to output better explanation + video_api_data = self._download_json(video_api, video_id) uplynk_preplay_url = sanitize_url(video_api_data['url']) @@ -54,6 +86,11 @@ class Go90IE(InfoExtractor): 'url': uplynk_preplay_url, 'id': video_id, 'title': video_title, + 'series': series_title, + 'episode': episode_title, + 'season_number': int_or_none(season_number), + 'episode_number': int_or_none(episode_number), + 'description': video_description, + 'release_date': release_date, 'ie_key': 'UplynkPreplay', - # TODO more properties (see youtube_dl/extractor/common.py) } From 57e846fcb74388722f257523f4031113b7ea911e Mon Sep 17 00:00:00 2001 From: "Jeremie J. Jarosh" Date: Sat, 4 Feb 2017 16:00:58 -0600 Subject: [PATCH 5/7] [Go90] Add preceding zeros to season and episode numbers --- youtube_dl/extractor/go90.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py index fc5d7f739..22d8eae3c 100644 --- a/youtube_dl/extractor/go90.py +++ b/youtube_dl/extractor/go90.py @@ -17,11 +17,10 @@ class Go90IE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?go90\.com/profiles/va_(?P[a-f0-9]+)' _TEST = { 'url': 'https://www.go90.com/profiles/va_07d47f43a7b04eb5b693252f2bd1086b', - 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', 'info_dict': { 'id': '07d47f43a7b04eb5b693252f2bd1086b', 'ext': 'mp4', - 'title': 't@gged S1E1 #shotgun', + 'title': 't@gged S01E01 #shotgun', 'thumbnail': r're:^https?://.*\.jpg$', 'description': 'md5:1ebcc7a686d93456a822d435d2ac7719', 'uploader_id': '98ac1613c7624a8387596b5d5e441064', @@ -61,8 +60,8 @@ class Go90IE(InfoExtractor): video_title = series_title if episode_match is not None: - video_title = '{} S{}E{} {}'.format( - series_title, season_number, episode_number, episode_title) + video_title = '{} S{:02d}E{:02d} {}'.format( + series_title, int_or_none(season_number), int_or_none(episode_number), episode_title) self.to_screen("Title: " + video_title) video_description = self._og_search_description(webpage) From 61799cc758ec3ae70ecafa5f2c4030fd53c2942f Mon Sep 17 00:00:00 2001 From: "Jeremie J. Jarosh" Date: Sat, 4 Feb 2017 16:26:35 -0600 Subject: [PATCH 6/7] [Go90] Add `episode_info` into the title if not episodic --- youtube_dl/extractor/go90.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py index 22d8eae3c..5ee048564 100644 --- a/youtube_dl/extractor/go90.py +++ b/youtube_dl/extractor/go90.py @@ -61,7 +61,10 @@ class Go90IE(InfoExtractor): video_title = series_title if episode_match is not None: video_title = '{} S{:02d}E{:02d} {}'.format( - series_title, int_or_none(season_number), int_or_none(episode_number), episode_title) + series_title, int_or_none(season_number), + int_or_none(episode_number), episode_title) + elif episode_info: + video_title = '{} -- {}'.format(series_title, episode_info) self.to_screen("Title: " + video_title) video_description = self._og_search_description(webpage) From f4a31cf69443226e95edbf0ceb145bb69312be01 Mon Sep 17 00:00:00 2001 From: "Jeremie J. Jarosh" Date: Sat, 4 Feb 2017 17:15:48 -0600 Subject: [PATCH 7/7] [Go90] Use old string formatting in order to support python2.6 --- youtube_dl/extractor/go90.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py index 5ee048564..24048a764 100644 --- a/youtube_dl/extractor/go90.py +++ b/youtube_dl/extractor/go90.py @@ -60,11 +60,11 @@ class Go90IE(InfoExtractor): video_title = series_title if episode_match is not None: - video_title = '{} S{:02d}E{:02d} {}'.format( + video_title = '%s S%02dE%02d %s' % ( series_title, int_or_none(season_number), int_or_none(episode_number), episode_title) elif episode_info: - video_title = '{} -- {}'.format(series_title, episode_info) + video_title = '%s -- %s' % (series_title, episode_info) self.to_screen("Title: " + video_title) video_description = self._og_search_description(webpage)