From 3d20fc10bffd75b68f2a691766c7c4ccc605ede0 Mon Sep 17 00:00:00 2001 From: andrejsky Date: Mon, 17 Sep 2018 15:58:22 +0200 Subject: [PATCH 1/3] [RT] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/rt.py | 92 ++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 youtube_dl/extractor/rt.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 7dc569724..2f2111b02 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -936,6 +936,7 @@ from .roosterteeth import RoosterTeethIE from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rozhlas import RozhlasIE +from .rt import RTIE from .rtbf import RTBFIE from .rte import RteIE, RteRadioIE from .rtlnl import RtlNlIE diff --git a/youtube_dl/extractor/rt.py b/youtube_dl/extractor/rt.py new file mode 100644 index 000000000..b6d4d97e6 --- /dev/null +++ b/youtube_dl/extractor/rt.py @@ -0,0 +1,92 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .ora import OraTVIE +from .youtube import YoutubeIE + + +class RTIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rt\.com/.*/(?P\d+)-.*' + _TESTS = [ + { + 'url': 'https://www.rt.com/shows/alex-salmond-show/438343-britain-railway-london-communities/', + 'md5': '0d8f6f86673ee8d72215c8d060170a5e', + 'info_dict': { + 'id': '438343', + 'ext': 'mp4', + 'title': 'HS2: The human cost… RT — The Alex Salmond Show' + }, + 'params': { + 'skip_download': False, + } + }, + { + 'url': 'https://www.rt.com/shows/larry-king-now/438502-andie-macdowell-on-ageism-in/', + 'md5': '5852a10576b4add6b250f864546033f4', + 'info_dict': { + 'id': '57786', + 'ext': 'mp4', + 'title': 'Andie MacDowell on ageism in Hollywood, fame, & forest protection', + 'description': 'md5:07b6bce4ad4043b136e21ef9539d46c5' + }, + 'params': { + 'skip_download': False, + } + }, + { + 'url': 
'https://www.rt.com/shows/icymi-with-polly-boiko/438450-musk-smoke-marijuana-radio/', + 'md5': '2c2fe0f78f1ca225e82fb7b27c8fd3f5', + 'info_dict': { + 'id': 'SHxygmDAkNE', + 'ext': 'mp4', + 'title': u'ICYMI: Is Elon Musk Tony Stark, or just stark raving mad?', + 'description': 'md5:99e8c3456f6904383399aeeb10784c8b', + 'upload_date': '20180914', + 'uploader_id': 'UCdgFmrDeP9nWj_eDKW6j9kQ', + 'uploader': 'ICYMI' + }, + 'params': { + 'skip_download': False, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_title = self._search_regex( + r'(.+?)', webpage, 'title') + # default RT's CDN + video_url = self._search_regex( + r'file: "(https://cdnv.+?)",', webpage, 'url', fatal=False, default=None) + + if video_url is None: + + oratv = self._search_regex( + r'src="(//www\.ora\.tv.+?)"', webpage, 'oratv', fatal=False, default=None) + + if oratv is not None: + # some videos are embedded from ORATV + + oratv_embedded_webpage = self._download_webpage(oratv, video_id) + ora_website_url = self._search_regex( + r' Date: Mon, 17 Sep 2018 16:59:05 +0200 Subject: [PATCH 2/3] Remove unicode --- youtube_dl/extractor/rt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rt.py b/youtube_dl/extractor/rt.py index b6d4d97e6..220e6fc99 100644 --- a/youtube_dl/extractor/rt.py +++ b/youtube_dl/extractor/rt.py @@ -40,7 +40,7 @@ class RTIE(InfoExtractor): 'info_dict': { 'id': 'SHxygmDAkNE', 'ext': 'mp4', - 'title': u'ICYMI: Is Elon Musk Tony Stark, or just stark raving mad?', + 'title': 'ICYMI: Is Elon Musk Tony Stark, or just stark raving mad?', 'description': 'md5:99e8c3456f6904383399aeeb10784c8b', 'upload_date': '20180914', 'uploader_id': 'UCdgFmrDeP9nWj_eDKW6j9kQ', From a98310bbf863aafdd68d93994c43d5ee922b3dab Mon Sep 17 00:00:00 2001 From: andrejsky Date: Tue, 18 Sep 2018 16:07:25 +0200 Subject: [PATCH 3/3] [RT] Attempt to fix delegation and regexps Attempts to 
address issues raised in https://github.com/rg3/youtube-dl/pull/17594#issuecomment-422050733 --- youtube_dl/extractor/rt.py | 84 +++++++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/rt.py b/youtube_dl/extractor/rt.py index 220e6fc99..89b2ddc16 100644 --- a/youtube_dl/extractor/rt.py +++ b/youtube_dl/extractor/rt.py @@ -4,10 +4,11 @@ from __future__ import unicode_literals from .common import InfoExtractor from .ora import OraTVIE from .youtube import YoutubeIE +from .generic import GenericIE class RTIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rt\.com/.*/(?P\d+)-.*' + _VALID_URL = r'https?://(?:www\.)?rt\.com/.+/(?P\d+)-.+' _TESTS = [ { 'url': 'https://www.rt.com/shows/alex-salmond-show/438343-britain-railway-london-communities/', @@ -27,7 +28,7 @@ class RTIE(InfoExtractor): 'info_dict': { 'id': '57786', 'ext': 'mp4', - 'title': 'Andie MacDowell on ageism in Hollywood, fame, & forest protection', + 'title': 'md5:fa0da906fbfc7974da14ca53424b1a3a', 'description': 'md5:07b6bce4ad4043b136e21ef9539d46c5' }, 'params': { @@ -40,7 +41,7 @@ class RTIE(InfoExtractor): 'info_dict': { 'id': 'SHxygmDAkNE', 'ext': 'mp4', - 'title': 'ICYMI: Is Elon Musk Tony Stark, or just stark raving mad?', + 'title': 'md5:004bcbc650d8294c5cdefcc470c3cd3d', 'description': 'md5:99e8c3456f6904383399aeeb10784c8b', 'upload_date': '20180914', 'uploader_id': 'UCdgFmrDeP9nWj_eDKW6j9kQ', @@ -49,44 +50,61 @@ class RTIE(InfoExtractor): 'params': { 'skip_download': False, } + }, + { + 'url': 'https://www.rt.com/news/438686-syria-russia-s200-il20/', + 'md5': '03acfb2a27a13fb74eb5c192e53bf7e0', + 'info_dict': { + 'id': 'YEioP7zJzMc', + 'ext': 'mp4', + 'title': 'md5:e703b7c8d88725c1530661d61a626303', + 'description': 'md5:8ab844abcd296d15f4a99b089e1e1347', + 'upload_date': '20180918', + 'uploader_id': 'RussiaToday', + 'uploader': 'RT' + }, + 'params': { + 'skip_download': False, + } }] def _real_extract(self, url): video_id = 
self._match_id(url) - webpage = self._download_webpage(url, video_id) + video_title = self._html_search_regex( + r'(.+?)', webpage, 'title', fatal=False) or self._html_search_meta(['og:title', 'twitter:title'], webpage) + + oratv = self._search_regex( + r'src=["\']((https?:)?//(?:www\.)ora\.tv[^\'"]+)', webpage, 'oratv', fatal=False, default=None) + + # some videos are embedded from ORATV + if oratv is not None: + + oratv_embedded_webpage = self._download_webpage(oratv, video_id) + ora_website_url = self._search_regex( + r']rel=["\']canonical["\'].+href=["\']([^\'"]+)', oratv_embedded_webpage, 'orawebsite') + + return self.url_result(ora_website_url, ie=OraTVIE.ie_key()) + + # some videos are embedded from youtube + yturl = self._search_regex( + r']+\bdata-url=["\']((https?:)?//(?:www\.)youtube\.[^\'"]+)', webpage, 'youtube', fatal=False, default=None) or self._search_regex( + r']+\bsrc=["\']((https?:)?//(?:www\.)youtube\.[^\'"]+)', webpage, 'youtube', fatal=False, default=None) + + if yturl is not None: + return self.url_result(yturl, ie=YoutubeIE.ie_key()) - video_title = self._search_regex( - r'(.+?)', webpage, 'title') # default RT's CDN video_url = self._search_regex( - r'file: "(https://cdnv.+?)",', webpage, 'url', fatal=False, default=None) + r'file:\s*["\'](https?://[^\'"]+)', webpage, 'url', fatal=False, default=None) - if video_url is None: + if video_url is not None: - oratv = self._search_regex( - r'src="(//www\.ora\.tv.+?)"', webpage, 'oratv', fatal=False, default=None) + return { + 'id': video_id, + 'title': video_title, + 'url': video_url + } - if oratv is not None: - # some videos are embedded from ORATV - - oratv_embedded_webpage = self._download_webpage(oratv, video_id) - ora_website_url = self._search_regex( - r'