1
0
mirror of https://github.com/l1ving/youtube-dl synced 2025-01-23 12:02:51 +08:00

[RT] Attempt to fix delegation and regexps

Attempts to address the issues raised in https://github.com/rg3/youtube-dl/pull/17594#issuecomment-422050733
This commit is contained in:
andrejsky 2018-09-18 16:07:25 +02:00 committed by GitHub
parent 5b42aa585c
commit a98310bbf8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -4,10 +4,11 @@ from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from .ora import OraTVIE from .ora import OraTVIE
from .youtube import YoutubeIE from .youtube import YoutubeIE
from .generic import GenericIE
class RTIE(InfoExtractor): class RTIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?rt\.com/.*/(?P<id>\d+)-.*' _VALID_URL = r'https?://(?:www\.)?rt\.com/.+/(?P<id>\d+)-.+'
_TESTS = [ _TESTS = [
{ {
'url': 'https://www.rt.com/shows/alex-salmond-show/438343-britain-railway-london-communities/', 'url': 'https://www.rt.com/shows/alex-salmond-show/438343-britain-railway-london-communities/',
@ -27,7 +28,7 @@ class RTIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': '57786', 'id': '57786',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Andie MacDowell on ageism in Hollywood, fame, & forest protection', 'title': 'md5:fa0da906fbfc7974da14ca53424b1a3a',
'description': 'md5:07b6bce4ad4043b136e21ef9539d46c5' 'description': 'md5:07b6bce4ad4043b136e21ef9539d46c5'
}, },
'params': { 'params': {
@ -40,7 +41,7 @@ class RTIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': 'SHxygmDAkNE', 'id': 'SHxygmDAkNE',
'ext': 'mp4', 'ext': 'mp4',
'title': 'ICYMI: Is Elon Musk Tony Stark, or just stark raving mad?', 'title': 'md5:004bcbc650d8294c5cdefcc470c3cd3d',
'description': 'md5:99e8c3456f6904383399aeeb10784c8b', 'description': 'md5:99e8c3456f6904383399aeeb10784c8b',
'upload_date': '20180914', 'upload_date': '20180914',
'uploader_id': 'UCdgFmrDeP9nWj_eDKW6j9kQ', 'uploader_id': 'UCdgFmrDeP9nWj_eDKW6j9kQ',
@ -49,44 +50,61 @@ class RTIE(InfoExtractor):
'params': { 'params': {
'skip_download': False, 'skip_download': False,
} }
},
{
'url': 'https://www.rt.com/news/438686-syria-russia-s200-il20/',
'md5': '03acfb2a27a13fb74eb5c192e53bf7e0',
'info_dict': {
'id': 'YEioP7zJzMc',
'ext': 'mp4',
'title': 'md5:e703b7c8d88725c1530661d61a626303',
'description': 'md5:8ab844abcd296d15f4a99b089e1e1347',
'upload_date': '20180918',
'uploader_id': 'RussiaToday',
'uploader': 'RT'
},
'params': {
'skip_download': False,
}
}] }]
def _real_extract(self, url):
    """Extract a video from an RT page, delegating to embedded providers.

    The page is probed, in order, for:
      1. an Ora TV embed   -> delegated to OraTVIE via its canonical URL,
      2. a YouTube embed   -> delegated to YoutubeIE,
      3. a direct ``file: "<url>"`` entry in the page -> returned as-is,
    and if none of these is found the page URL is handed to GenericIE.
    """
    video_id = self._match_id(url)
    webpage = self._download_webpage(url, video_id)

    # ``default=None`` keeps the <title> lookup silent so that the meta-tag
    # fallback can kick in without a spurious "unable to extract" warning.
    video_title = self._html_search_regex(
        r'<title>(.+?)</title>', webpage, 'title',
        default=None) or self._html_search_meta(
        ['og:title', 'twitter:title'], webpage)

    # Some videos are embedded from Ora TV.
    oratv = self._search_regex(
        r'src=["\']((?:https?:)?//(?:www\.)?ora\.tv[^\'"]+)',
        webpage, 'oratv', default=None)
    if oratv is not None:
        # The embed URL may be protocol-relative ("//www.ora.tv/...").
        oratv_embedded_webpage = self._download_webpage(
            self._proto_relative_url(oratv), video_id)
        # Stay inside the <link> tag: ``[^>]+`` instead of a single
        # ``[^>]`` / greedy ``.+`` that could run into following tags.
        ora_website_url = self._search_regex(
            r'<link[^>]+rel=["\']canonical["\'][^>]+href=["\']([^\'"]+)',
            oratv_embedded_webpage, 'orawebsite')
        return self.url_result(ora_website_url, ie=OraTVIE.ie_key())

    # Some videos are embedded from YouTube, either through a
    # ``data-url`` attribute on a <div> or through a plain <iframe>.
    yturl = self._search_regex(
        (r'<div[^>]+\bdata-url=["\']((?:https?:)?//(?:www\.)?youtube\.[^\'"]+)',
         r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?youtube\.[^\'"]+)'),
        webpage, 'youtube', default=None)
    if yturl is not None:
        return self.url_result(yturl, ie=YoutubeIE.ie_key())

    # Default: a direct media URL (RT's CDN) listed as ``file: "..."``.
    video_url = self._search_regex(
        r'file:\s*["\'](https?://[^\'"]+)', webpage, 'url', default=None)
    if video_url is not None:
        return {
            'id': video_id,
            'title': video_title,
            'url': video_url,
        }

    # Nothing recognised on the page: let the generic extractor try.
    return self.url_result(url, ie=GenericIE.ie_key())