From 62962240b1d17c4060fd125f50505c249c7288b4 Mon Sep 17 00:00:00 2001 From: metalgamer Date: Tue, 20 Jan 2015 11:02:01 +0100 Subject: [PATCH 1/4] Added rtl.lu extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/rtllu.py | 108 +++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 youtube_dl/extractor/rtllu.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 0902eb437..a603fc900 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -346,6 +346,7 @@ from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rtbf import RTBFIE from .rte import RteIE +from .rtllu import RtlluIE from .rtlnl import RtlXlIE from .rtlnow import RTLnowIE from .rtp import RTPIE diff --git a/youtube_dl/extractor/rtllu.py b/youtube_dl/extractor/rtllu.py new file mode 100644 index 000000000..7220b226f --- /dev/null +++ b/youtube_dl/extractor/rtllu.py @@ -0,0 +1,108 @@ +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor + + +class RtlluIE(InfoExtractor): + IE_NAME = 'rtl.lu' + + _VALID_URL = r'https?://(www|tele|radio|5minutes)\.rtl\.lu\/.*?\/(?P[0-9]+)' + + _TEST = { + 'url': 'http://radio.rtl.lu/emissiounen/background/599319.html', + 'md5': 'TODO:', + 'info_dict': { + 'id': '599319', + 'ext': 'mp4', + }, + } + + def _real_extract(self, url): + match = self._VALID_URL_RE.match(url) + id = match.group('id') + + webpage = self._download_webpage(url, id) + + javascript_regex = r'' + javascript = self._html_search_regex(javascript_regex, webpage, 'javascript') + + try: + javascript_sources_regex = r'object.*\.sources = \'(?P.*?)\';' + sources = json.loads(re.search(javascript_sources_regex, javascript).group('value')) + + javascript_thumbnail_regex = r'object.*\.title = \'(?P.*?)\';' + javascript_thumbnail = re.search(javascript_thumbnail_regex, javascript).group('value') + + javascript_videoid_regex = r'object.*\.videoid = \'(?P.*?)\';' + javascript_videoid = re.search(javascript_videoid_regex, javascript).group('value') + + javascript_publicdate_regex = r'object.*\.publicdate = \'(?P.*?)\';' + javascript_publicdate = re.search(javascript_publicdate_regex, javascript).group('value') + + formats = [ + { + 'url': sources['httplq']['src'], + 'format': 'Low Quality', + 'format_id': 'lq', + 'protocol': 'http', + }, + { + 'url': sources['http']['src'], + 'format': 'Standard Quality', + 'format_id': 'sd', + 'protocol': 'http', + }, + { + 'url': sources['httphq']['src'], + 'format': 'High Quality', + 'format_id': 'hq', + 'protocol': 'http', + }, + ] + + return { + 'id': javascript_videoid or id, + 'title': self.get_video_title(webpage, javascript), + 'formats': formats, + 'thumbnail': javascript_thumbnail, + 'upload_date': javascript_publicdate, + } + except AttributeError: + javascript_mp3_regex = r'play_mp3\("object[0-9]*", "(?P.*?)",' + javascript_mp3 = re.search(javascript_mp3_regex, javascript).group('value') + print(javascript_mp3) + + return { + 'id': id, + 'title': self.get_audio_title(webpage), + 'url': javascript_mp3, + } + + def get_video_title(self, webpage, javascript): + + title_regex = r'.*

(?P.*?)</h1>.*?<p class="sub">' + title = re.findall(title_regex, webpage, flags=re.S) + + if title: + title = title[-1] + + javascript_title_regex = r'object.*\.title = \'(?P<value>.*?)\';' + javascript_title = re.search(javascript_title_regex, javascript).group('value') + return javascript_title or title or self._og_search_title(webpage) + + def get_audio_title(self, webpage): + + title_regex = r'<header><h1><span>(?P<span>.*?)</span>(?P<title>.*?)</h1>' + title = self._html_search_regex(title_regex, webpage, 'title', group='title', fatal=False) + span = self._html_search_regex(title_regex, webpage, 'span', group='span', fatal=False) + + if title or span: + title = ' - '.join([span, title]) + + h5_title_regex = r'<h5>(?P<title>.*?)</h5>' + h5_title = self._html_search_regex(h5_title_regex, webpage, 'title', group='title', fatal=False) + + return title or h5_title or self._og_search_title(webpage) From 53ca134ca9e91639330723543c7deefd7369a0c6 Mon Sep 17 00:00:00 2001 From: Dennis Fink <dennis.fink@c3l.lu> Date: Mon, 3 Oct 2016 00:29:17 +0200 Subject: [PATCH 2/4] [Rtllu] Added new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/rtllu.py | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e8928307c..7910ed978 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -748,6 +748,7 @@ from .roxwel import RoxwelIE from .rozhlas import RozhlasIE from .rtbf import RTBFIE from .rte import RteIE, RteRadioIE +from .rtllu import RtlluIE from .rtlnl import RtlNlIE from .rtl2 import RTL2IE from .rtp import RTPIE diff --git a/youtube_dl/extractor/rtllu.py b/youtube_dl/extractor/rtllu.py index 7220b226f..454005388 100644 --- a/youtube_dl/extractor/rtllu.py +++ b/youtube_dl/extractor/rtllu.py @@ -12,11 +12,12 @@ class RtlluIE(InfoExtractor): _VALID_URL = r'https?://(www|tele|radio|5minutes)\.rtl\.lu\/.*?\/(?P<id>[0-9]+)' _TEST = { - 'url': 'http://radio.rtl.lu/emissiounen/background/599319.html', - 'md5': 'TODO:', + 'url': 'http://tele.rtl.lu/emissiounen/documentaire-routwaissgro/lu/890363.html', + 'md5': '38a2d2286ff4b8ccc300e847294cb90a', 'info_dict': { 'id': '599319', 'ext': 'mp4', + 'title': '"Vënz de Prënz" (18.03.2016)', }, } @@ -33,16 +34,23 @@ class RtlluIE(InfoExtractor): javascript_sources_regex = r'object.*\.sources = \'(?P<value>.*?)\';' sources = json.loads(re.search(javascript_sources_regex, javascript).group('value')) - javascript_thumbnail_regex = r'object.*\.title = \'(?P<value>.*?)\';' - javascript_thumbnail = re.search(javascript_thumbnail_regex, javascript).group('value') - javascript_videoid_regex = r'object.*\.videoid = \'(?P<value>.*?)\';' javascript_videoid = re.search(javascript_videoid_regex, javascript).group('value') javascript_publicdate_regex = r'object.*\.publicdate = \'(?P<value>.*?)\';' javascript_publicdate = re.search(javascript_publicdate_regex, javascript).group('value') + javascript_thumbnail_regex = r'object.*\.thumbnail = \'(?P<value>.*?)\';' + javascript_thumbnail = re.search(javascript_thumbnail_regex, javascript).group('value') + formats = [ + { + 'url': sources['rtmp']['src'], + 'format': 'RTMP Stream', + 'format_id': 'rtmp', + 'protocol': 'rtmp', + }, + { 'url': sources['httplq']['src'], 'format': 'Low Quality', @@ -73,7 +81,6 @@ class RtlluIE(InfoExtractor): except AttributeError: javascript_mp3_regex = r'play_mp3\("object[0-9]*", "(?P<value>.*?)",' javascript_mp3 = re.search(javascript_mp3_regex, javascript).group('value') - print(javascript_mp3) return { 'id': id, From 36ce480413f3e316023565e4e6b3d686913fc29c Mon Sep 17 00:00:00 2001 From: Dennis Fink <dennis.fink@c3l.lu> Date: Mon, 3 Oct 2016 18:19:29 +0200 Subject: [PATCH 3/4] Fixed issues from travis-ci --- youtube_dl/extractor/rtllu.py | 116 +++++++++++++++++++++------------- 1 file changed, 73 insertions(+), 43 deletions(-) diff --git a/youtube_dl/extractor/rtllu.py b/youtube_dl/extractor/rtllu.py index 454005388..24f95cd2a 100644 --- a/youtube_dl/extractor/rtllu.py +++ b/youtube_dl/extractor/rtllu.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -22,9 +23,7 @@ class RtlluIE(InfoExtractor): } def _real_extract(self, url): - match = self._VALID_URL_RE.match(url) - id = match.group('id') - + id = self._match_id(url) webpage = self._download_webpage(url, id) javascript_regex = r'<script language="Javascript">((\n*?.*?)*?)</script>' @@ -32,60 +31,91 @@ class RtlluIE(InfoExtractor): try: javascript_sources_regex = r'object.*\.sources = \'(?P<value>.*?)\';' - sources = json.loads(re.search(javascript_sources_regex, javascript).group('value')) + sources = self._search_regex(javascript_sources_regex, javascript, 'sources') + sources = json.loads(sources) - javascript_videoid_regex = r'object.*\.videoid = \'(?P<value>.*?)\';' - javascript_videoid = re.search(javascript_videoid_regex, javascript).group('value') + videoid_regex = r'object.*\.videoid = \'(?P<value>.*?)\';' + videoid = self._search_regex(videoid_regex, javascript, 'videoid', fatal=False, default=id) - javascript_publicdate_regex = r'object.*\.publicdate = \'(?P<value>.*?)\';' - javascript_publicdate = re.search(javascript_publicdate_regex, javascript).group('value') + publicdate_regex = r'object.*\.publicdate = \'(?P<value>.*?)\';' + publicdate = self._search_regex(publicdate_regex, javascript, 'publicdate', fatal=False) - javascript_thumbnail_regex = r'object.*\.thumbnail = \'(?P<value>.*?)\';' - javascript_thumbnail = re.search(javascript_thumbnail_regex, javascript).group('value') + thumbnail_regex = r'object.*\.thumbnail = \'(?P<value>.*?)\';' + thumbnail = self._search_regex(thumbnail_regex, javascript, 'thumbnail', fatal=False) - formats = [ - { - 'url': sources['rtmp']['src'], - 'format': 'RTMP Stream', - 'format_id': 'rtmp', - 'protocol': 'rtmp', - }, + formats = [] - { - 'url': sources['httplq']['src'], - 'format': 'Low Quality', - 'format_id': 'lq', - 'protocol': 'http', - }, - { - 'url': sources['http']['src'], - 'format': 'Standard Quality', - 'format_id': 'sd', - 'protocol': 'http', - }, - { - 'url': sources['httphq']['src'], - 'format': 'High Quality', - 'format_id': 'hq', - 'protocol': 'http', - }, - ] + rtmp_source = sources.get('rtmp') + if rtmp_source is not None: + rtmp_url = rtmp_source.get('src') + + if rtmp_url is not None: + formats.append( + { + 'url': rtmp_url, + 'format': 'RTMP Stream', + 'format_id': 'rtmp', + 'protocol': 'rtmp' + } + ) + + httplq_source = sources.get('httplq') + if httplq_source is not None: + httplq_url = httplq_source.get('src') + + if httplq_url is not None: + formats.append( + { + 'url': httplq_url, + 'format': 'Low Quality', + 'format_id': 'lq', + 'protocol': 'http', + } + ) + + http_source = sources.get('http') + if http_source is not None: + http_url = http_source.get('src') + + if http_url is not None: + formats.append( + { + 'url': http_url, + 'format': 'Standard Quality', + 'format_id': 'sd', + 'protocol': 'http', + } + ) + + httphq_source = sources.get('httphq') + if httphq_source is not None: + httphq_url = httphq_source.get('src') + + if httphq_url is not None: + formats.append( + { + 'url': httphq_url, + 'format': 'High Quality', + 'format_id': 'hq', + 'protocol': 'http', + } + ) return { - 'id': javascript_videoid or id, + 'id': videoid, 'title': self.get_video_title(webpage, javascript), 'formats': formats, - 'thumbnail': javascript_thumbnail, - 'upload_date': javascript_publicdate, + 'thumbnail': thumbnail, + 'upload_date': publicdate, } except AttributeError: - javascript_mp3_regex = r'play_mp3\("object[0-9]*", "(?P<value>.*?)",' - javascript_mp3 = re.search(javascript_mp3_regex, javascript).group('value') + mp3_regex = r'play_mp3\("object[0-9]*", "(?P<value>.*?)",' + mp3_url = self._search_regex(mp3_regex, javascript, 'mp3_url') return { 'id': id, 'title': self.get_audio_title(webpage), - 'url': javascript_mp3, + 'url': mp3_url, } def get_video_title(self, webpage, javascript): @@ -97,7 +127,7 @@ class RtlluIE(InfoExtractor): title = title[-1] javascript_title_regex = r'object.*\.title = \'(?P<value>.*?)\';' - javascript_title = re.search(javascript_title_regex, javascript).group('value') + javascript_title = self._search_regex(javascript_title_regex, javascript, 'javascript_title', fatal=False) return javascript_title or title or self._og_search_title(webpage) def get_audio_title(self, webpage): From 20155ed96a8778641e22582f1bfa153105ca6f14 Mon Sep 17 00:00:00 2001 From: Dennis Fink <dennis.fink@c3l.lu> Date: Mon, 3 Oct 2016 22:15:34 +0200 Subject: [PATCH 4/4] Fixed tests --- youtube_dl/extractor/rtllu.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/rtllu.py b/youtube_dl/extractor/rtllu.py index 24f95cd2a..df9ff3e54 100644 --- a/youtube_dl/extractor/rtllu.py +++ b/youtube_dl/extractor/rtllu.py @@ -14,11 +14,12 @@ class RtlluIE(InfoExtractor): _TEST = { 'url': 'http://tele.rtl.lu/emissiounen/documentaire-routwaissgro/lu/890363.html', - 'md5': '38a2d2286ff4b8ccc300e847294cb90a', + 'md5': 'a9f34b9c8a20a61c2332b1f2f8c084d6', 'info_dict': { - 'id': '599319', + 'id': '3057497', 'ext': 'mp4', 'title': '"Vënz de Prënz" (18.03.2016)', + 'upload_date': '20160318', }, }