From cf2b0fe00e25c40ea591524892b530a1b2697cf9 Mon Sep 17 00:00:00 2001 From: FA Date: Thu, 25 Apr 2019 13:07:14 -0700 Subject: [PATCH 1/7] [earthcams] Add new extractor per #19293 --- youtube_dl/extractor/earthcams.py | 49 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 50 insertions(+) create mode 100644 youtube_dl/extractor/earthcams.py diff --git a/youtube_dl/extractor/earthcams.py b/youtube_dl/extractor/earthcams.py new file mode 100644 index 000000000..31a893de8 --- /dev/null +++ b/youtube_dl/extractor/earthcams.py @@ -0,0 +1,49 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + urljoin, +) + + +class EarthCamsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?earthcam\.com/.*?cam=(?P<id>\w+)' + _TEST = { + 'url': 'https://www.earthcam.com/usa/newyork/timessquare/?cam=tsrobo1', + 'info_dict': { + 'id': 'tsrobo1', + 'ext': 'mp4', + 'title': 'Times Square, NYC', + 'description': 'EarthCam brings you an HD, panoramic view of Times Square looking up, down, and across 7th Avenue and Broadway. 
See why Times Square is called the "Crossroads of the World!"', + 'view_count': int, + 'is_live': True, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + json_str = self._search_regex(r'var\sjson_base\s*=\s*(?P<jstr>{.*});', webpage, 'jstr') + json_base = self._parse_json(json_str, video_id) + + title = json_base["cam"][video_id]["long_title"] + description = json_base["cam"][video_id]["description"] + thumbnail = json_base["cam"][video_id]["thumbimage"] + view_count = int(json_base["cam"][video_id]["streamviews"]) + + domain = json_base["cam"][video_id]["html5_streamingdomain"] + path = json_base["cam"][video_id]["html5_streampath"] + m3u8_url = urljoin(domain, path) + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', 'm3u8_native'), + 'title': title, + 'description': description, + 'view_count': view_count, + 'is_live': True, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 0e3ccb82d..55bc96d79 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -313,6 +313,7 @@ from .dw import ( DWArticleIE, ) from .eagleplatform import EaglePlatformIE +from .earthcams import EarthCamsIE from .ebaumsworld import EbaumsWorldIE from .echomsk import EchoMskIE from .egghead import ( From 7b392e10fbd09fd77cf1e63a6ca22c1cbeb2fb8b Mon Sep 17 00:00:00 2001 From: FA Date: Thu, 25 Apr 2019 13:19:22 -0700 Subject: [PATCH 2/7] [earthcams] Add new extractor per #19293 --- youtube_dl/extractor/earthcams.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/earthcams.py b/youtube_dl/extractor/earthcams.py index 31a893de8..574e11516 100644 --- a/youtube_dl/extractor/earthcams.py +++ b/youtube_dl/extractor/earthcams.py @@ -4,6 +4,9 @@ from __future__ import 
unicode_literals from .common import InfoExtractor from ..utils import ( urljoin, + int_or_none, + url_or_none, + try_get, ) @@ -25,25 +28,23 @@ class EarthCamsIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - - json_str = self._search_regex(r'var\sjson_base\s*=\s*(?P<jstr>{.*});', webpage, 'jstr') + json_str = self._search_regex(r'var\s+json_base\s*=\s*(?P<jstr>{\s*"cam"\s*:\s*{.*}.*});', webpage, 'jstr') json_base = self._parse_json(json_str, video_id) - - title = json_base["cam"][video_id]["long_title"] - description = json_base["cam"][video_id]["description"] - thumbnail = json_base["cam"][video_id]["thumbimage"] - view_count = int(json_base["cam"][video_id]["streamviews"]) - - domain = json_base["cam"][video_id]["html5_streamingdomain"] - path = json_base["cam"][video_id]["html5_streampath"] + video_info = try_get(json_base, lambda x: x['cam'][video_id], dict) or {} + title = video_info.get("long_title") + description = video_info.get("description") + thumbnail = video_info.get("thumbimage") + view_count = int_or_none(video_info.get("streamviews")) + domain = video_info.get("html5_streamingdomain") + path = video_info.get("html5_streampath") m3u8_url = urljoin(domain, path) return { 'id': video_id, 'formats': self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', 'm3u8_native'), - 'title': title, - 'description': description, + 'title': title or self._og_search_title(webpage), + 'description': description or self._og_search_description(webpage), 'view_count': view_count, 'is_live': True, - 'thumbnail': thumbnail, + 'thumbnail': url_or_none(thumbnail), } From 31278775bb6070d80279b715902e345858e38e09 Mon Sep 17 00:00:00 2001 From: FA Date: Thu, 25 Apr 2019 16:21:02 -0700 Subject: [PATCH 3/7] [earthcams] Add new extractor per #19293 --- youtube_dl/extractor/earthcams.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/earthcams.py 
b/youtube_dl/extractor/earthcams.py index 574e11516..e722fa9ea 100644 --- a/youtube_dl/extractor/earthcams.py +++ b/youtube_dl/extractor/earthcams.py @@ -11,7 +11,7 @@ from ..utils import ( class EarthCamsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?earthcam\.com/.*?cam=(?P<id>\w+)' + _VALID_URL = r'https?://(?:www\.)?earthcam\.com/.*\?.*cam=(?P<id>\w+)' _TEST = { 'url': 'https://www.earthcam.com/usa/newyork/timessquare/?cam=tsrobo1', 'info_dict': { @@ -21,7 +21,7 @@ class EarthCamsIE(InfoExtractor): 'description': 'EarthCam brings you an HD, panoramic view of Times Square looking up, down, and across 7th Avenue and Broadway. See why Times Square is called the "Crossroads of the World!"', 'view_count': int, 'is_live': True, - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.(jpg|png)$', }, } From eaabac9a1e7db75e458e3a85183c6a72f3865cfc Mon Sep 17 00:00:00 2001 From: FA Date: Thu, 25 Apr 2019 16:46:07 -0700 Subject: [PATCH 4/7] [earthcam] Add new extractor per #19293 --- youtube_dl/extractor/{earthcams.py => earthcam.py} | 2 +- youtube_dl/extractor/extractors.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename youtube_dl/extractor/{earthcams.py => earthcam.py} (98%) diff --git a/youtube_dl/extractor/earthcams.py b/youtube_dl/extractor/earthcam.py similarity index 98% rename from youtube_dl/extractor/earthcams.py rename to youtube_dl/extractor/earthcam.py index e722fa9ea..2488cbef5 100644 --- a/youtube_dl/extractor/earthcams.py +++ b/youtube_dl/extractor/earthcam.py @@ -10,7 +10,7 @@ from ..utils import ( ) -class EarthCamsIE(InfoExtractor): +class EarthCamIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?earthcam\.com/.*\?.*cam=(?P<id>\w+)' _TEST = { 'url': 'https://www.earthcam.com/usa/newyork/timessquare/?cam=tsrobo1', diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 55bc96d79..18c64129f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ 
-313,7 +313,7 @@ from .dw import ( DWArticleIE, ) from .eagleplatform import EaglePlatformIE -from .earthcams import EarthCamsIE +from .earthcam import EarthCamIE from .ebaumsworld import EbaumsWorldIE from .echomsk import EchoMskIE from .egghead import ( From e7e8e4b4453ffb7b3d1a9d78dce1b61bc65d33d5 Mon Sep 17 00:00:00 2001 From: FA Date: Sat, 27 Apr 2019 17:57:43 -0700 Subject: [PATCH 5/7] [earthcam] Add new extractor --- youtube_dl/extractor/earthcam.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/earthcam.py b/youtube_dl/extractor/earthcam.py index 2488cbef5..0c82c7ee0 100644 --- a/youtube_dl/extractor/earthcam.py +++ b/youtube_dl/extractor/earthcam.py @@ -28,7 +28,7 @@ class EarthCamIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - json_str = self._search_regex(r'var\s+json_base\s*=\s*(?P<jstr>{\s*"cam"\s*:\s*{.*}.*});', webpage, 'jstr') + json_str = self._html_search_regex(r'var\s+json_base\s*=\s*(?P<jstr>{\s*"cam"\s*:\s*{.*}.*});', webpage, 'json', group='jstr') json_base = self._parse_json(json_str, video_id) video_info = try_get(json_base, lambda x: x['cam'][video_id], dict) or {} title = video_info.get("long_title") From c06ca8dcc9cdc70d461769284b1abdcfe4c50b90 Mon Sep 17 00:00:00 2001 From: FA Date: Tue, 7 May 2019 17:17:43 -0700 Subject: [PATCH 6/7] Fail if mandatory fields absent. Add test. 
--- youtube_dl/extractor/earthcam.py | 48 +++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/earthcam.py b/youtube_dl/extractor/earthcam.py index 0c82c7ee0..2544e8c59 100644 --- a/youtube_dl/extractor/earthcam.py +++ b/youtube_dl/extractor/earthcam.py @@ -3,16 +3,18 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + ExtractorError, urljoin, int_or_none, url_or_none, try_get, + js_to_json, ) class EarthCamIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?earthcam\.com/.*\?.*cam=(?P<id>\w+)' - _TEST = { + _TESTS = [{ 'url': 'https://www.earthcam.com/usa/newyork/timessquare/?cam=tsrobo1', 'info_dict': { 'id': 'tsrobo1', @@ -22,29 +24,43 @@ class EarthCamIE(InfoExtractor): 'view_count': int, 'is_live': True, 'thumbnail': r're:^https?://.*\.(jpg|png)$', - }, - } + }, + }, { + 'url': 'https://www.earthcam.com/usa/louisiana/neworleans/bourbonstreet/?cam=catsmeowkaraoke', + 'info_dict': { + 'id': 'catsmeowkaraoke', + 'ext': 'mp4', + 'title': 'New Orleans, LA', + 'description': 'Get a front row seat to all the wild and crazy stage performances happening at the Cat\'s Meow Karaoke Bar! 
Over the years, thousands of guests have enjoyed their moment singing in the spotlight at this popular local spot!', + 'view_count': int, + 'is_live': True, + 'thumbnail': r're:^https?://.*\.(jpg|png)$', + } + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - json_str = self._html_search_regex(r'var\s+json_base\s*=\s*(?P<jstr>{\s*"cam"\s*:\s*{.*}.*});', webpage, 'json', group='jstr') - json_base = self._parse_json(json_str, video_id) - video_info = try_get(json_base, lambda x: x['cam'][video_id], dict) or {} - title = video_info.get("long_title") - description = video_info.get("description") - thumbnail = video_info.get("thumbimage") - view_count = int_or_none(video_info.get("streamviews")) - domain = video_info.get("html5_streamingdomain") - path = video_info.get("html5_streampath") + json_str = self._html_search_regex(r'var\s+json_base\s*=\s*(?P<json_str>{\s*"cam"\s*:\s*{.*}.*});', webpage, 'json', group='json_str', default='{}') + json_base = self._parse_json(js_to_json(json_str), video_id) + + video_info = jsonn_base['cam'][video_id] + domain = video_info['html5_streamingdomain'] + path = video_info['html5_streampath'] m3u8_url = urljoin(domain, path) + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', 'm3u8_native') + title = video_info.get('long_title') or self._og_search_title(webpage) + description = video_info.get('description') or self._og_search_description(webpage) + thumbnail = url_or_none(video_info.get('thumbimage')) or self._og_search_thumbnail(webpage) + view_count = int_or_none(video_info.get("streamviews")) + return { 'id': video_id, - 'formats': self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', 'm3u8_native'), - 'title': title or self._og_search_title(webpage), - 'description': description or self._og_search_description(webpage), + 'formats': formats, + 'title': title, + 'description': description, 'view_count': view_count, 'is_live': True, - 'thumbnail': url_or_none(thumbnail), + 
'thumbnail': thumbnail, } From 2ca5809c1dcb9e1d22a55a6061dbf6288d2d9927 Mon Sep 17 00:00:00 2001 From: FA Date: Wed, 8 May 2019 11:28:21 -0700 Subject: [PATCH 7/7] Remove unused import --- youtube_dl/extractor/earthcam.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/earthcam.py b/youtube_dl/extractor/earthcam.py index 2544e8c59..fc1c3e542 100644 --- a/youtube_dl/extractor/earthcam.py +++ b/youtube_dl/extractor/earthcam.py @@ -3,7 +3,6 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - ExtractorError, urljoin, int_or_none, url_or_none, @@ -43,8 +42,8 @@ class EarthCamIE(InfoExtractor): webpage = self._download_webpage(url, video_id) json_str = self._html_search_regex(r'var\s+json_base\s*=\s*(?P<json_str>{\s*"cam"\s*:\s*{.*}.*});', webpage, 'json', group='json_str', default='{}') json_base = self._parse_json(js_to_json(json_str), video_id) - - video_info = jsonn_base['cam'][video_id] + + video_info = json_base['cam'][video_id] domain = video_info['html5_streamingdomain'] path = video_info['html5_streampath'] m3u8_url = urljoin(domain, path)