From 98866cc6cad5f11a94325bc75644ba9f3b82b883 Mon Sep 17 00:00:00 2001
From: Alexander Seiler
Date: Wed, 12 Dec 2018 12:41:00 +0100
Subject: [PATCH] [ViMP] Add information extractor

---
Review notes (ignored by git am, not part of the commit message):
- vimp.py: added the "|" that was missing after www\.medien-tube\.de, restored
  the named groups read via mobj.group(), passed media_id plus a note to the
  non-fatal _download_xml call, and guarded against a missing <files> node.
- The "<source" regex prefix was rebuilt from context; check it against a
  live embed page.
- The "index" blob id of vimp.py and the SOAP_TEMPLATE line in the regiotv.py
  removal hunk could not be regenerated here; refresh both against the tree
  before applying.

 youtube_dl/extractor/extractors.py |   2 +-
 youtube_dl/extractor/generic.py    |  16 +++
 youtube_dl/extractor/regiotv.py    |  62 ---------
 youtube_dl/extractor/vimp.py       | 200 +++++++++++++++++++++++++++++
 4 files changed, 217 insertions(+), 63 deletions(-)
 delete mode 100644 youtube_dl/extractor/regiotv.py
 create mode 100644 youtube_dl/extractor/vimp.py

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 6a5d12ab1..ad2981ea6 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -929,7 +929,6 @@ from .reddit import (
     RedditRIE,
 )
 from .redtube import RedTubeIE
-from .regiotv import RegioTVIE
 from .rentv import (
     RENTVIE,
     RENTVArticleIE,
@@ -1314,6 +1313,7 @@ from .vimeo import (
     VimeoWatchLaterIE,
     VHXEmbedIE,
 )
+from .vimp import ViMPIE
 from .vimple import VimpleIE
 from .vine import (
     VineIE,
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 65b482333..37fd11f0f 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -109,6 +109,7 @@ from .vice import ViceIE
 from .xfileshare import XFileShareIE
 from .cloudflarestream import CloudflareStreamIE
 from .peertube import PeerTubeIE
+from .vimp import ViMPIE
 from .teachable import TeachableIE
 from .indavideo import IndavideoEmbedIE
 from .apa import APAIE
@@ -2018,6 +2019,16 @@ class GenericIE(InfoExtractor):
             },
             'playlist_count': 2,
         },
+        {  # ViMP embed
+            'url': 'https://www.regio-tv.de/video_video,-auf-ein-bier-mit-verena-schneider-_vidid,151368.html',
+            'info_dict': {
+                'id': '31575',
+                'ext': 'mp4',
+                'title': 'Auf ein Bier mit Verena Schneider',
+                'description': 'md5:8fb85042ea46dfaa60f49943256382c9',
+                'uploader': 'EckleA',
+            }
+        },
         {
             # Indavideo embed
             'url': 'https://streetkitchen.hu/receptek/igy_kell_otthon_hamburgert_sutni/',
@@ -3113,6 +3124,11 @@ class GenericIE(InfoExtractor):
             return self.playlist_from_matches(
                 peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key())
 
+        vimp_urls = ViMPIE._extract_urls(webpage, url)
+        if vimp_urls:
+            return self.playlist_from_matches(
+                vimp_urls, video_id, video_title, ie=ViMPIE.ie_key())
+
         teachable_url = TeachableIE._extract_url(webpage, url)
         if teachable_url:
             return self.url_result(teachable_url)
diff --git a/youtube_dl/extractor/regiotv.py b/youtube_dl/extractor/regiotv.py
deleted file mode 100644
index e250a52f0..000000000
--- a/youtube_dl/extractor/regiotv.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-
-from ..utils import (
-    sanitized_Request,
-    xpath_text,
-    xpath_with_ns,
-)
-
-
-class RegioTVIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?regio-tv\.de/video/(?P<id>[0-9]+)'
-    _TESTS = [{
-        'url': 'http://www.regio-tv.de/video/395808.html',
-        'info_dict': {
-            'id': '395808',
-            'ext': 'mp4',
-            'title': 'Wir in Ludwigsburg',
-            'description': 'Mit unseren zuckersüßen Adventskindern, außerdem besuchen wir die Abendsterne!',
-        }
-    }, {
-        'url': 'http://www.regio-tv.de/video/395808',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, video_id)
-
-        key = self._search_regex(
-            r'key\s*:\s*(["\'])(?P<key>.+?)\1', webpage, 'key', group='key')
-        title = self._og_search_title(webpage)
-
-        SOAP_TEMPLATE = '<{0} xmlns="http://v.telvi.de/">{1}'
-
-        request = sanitized_Request(
-            'http://v.telvi.de/',
-            SOAP_TEMPLATE.format('GetHTML5VideoData', key).encode('utf-8'))
-        video_data = self._download_xml(request, video_id, 'Downloading video XML')
-
-        NS_MAP = {
-            'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
-            'soap': 'http://schemas.xmlsoap.org/soap/envelope/',
-        }
-
-        video_url = xpath_text(
-            video_data, xpath_with_ns('.//video', NS_MAP), 'video url', fatal=True)
-        thumbnail = xpath_text(
-            video_data, xpath_with_ns('.//image', NS_MAP), 'thumbnail')
-        description = self._og_search_description(
-            webpage) or self._html_search_meta('description', webpage)
-
-        return {
-            'id': video_id,
-            'url': video_url,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-        }
diff --git a/youtube_dl/extractor/vimp.py b/youtube_dl/extractor/vimp.py
new file mode 100644
index 000000000..e1016aa18
--- /dev/null
+++ b/youtube_dl/extractor/vimp.py
@@ -0,0 +1,200 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    int_or_none,
+    xpath_element,
+    xpath_text,
+)
+
+
+class ViMPIE(InfoExtractor):
+    """Extract media from the ViMP portal hosts listed in _INSTANCES_RE."""
+
+    # Hosts handled by this extractor. The pattern is only interpolated into
+    # verbose ((?x)) regexes, where whitespace and newlines are ignored, so
+    # every alternative except the last needs an explicit trailing "|".
+    _INSTANCES_RE = r'''(?:
+                            vimp\.schwaebische\.de|
+                            www\.videoportal\.uni-freiburg\.de|
+                            k100186\.vimp\.mivitec\.net|
+                            www\.hd-campus\.tv|
+                            media\.hwr-berlin\.de|
+                            vimp\.oth-regensburg\.de|
+                            video\.irtshdf\.fr|
+                            univideo\.uni-kassel\.de|
+                            mediathek\.htw-berlin\.de|
+                            hd-campus\.de|
+                            medien\.hs-merseburg\.de|
+                            www\.webdancetv\.com|
+                            sign7tv\.com|
+                            www\.salzburgtube\.at|
+                            medienportal-polizei\.land-bw\.de|
+                            www\.bn1\.tv|
+                            video\.tanedo\.de|
+                            www\.fh-bielefeld\.de/medienportal|
+                            framework\.auvica\.net|
+                            schanzer\.tv|
+                            www\.salzi\.tv|
+                            parrotfiles\.com|
+                            bestvision\.tv|
+                            www\.webtv\.coop|
+                            www\.abruzzoinvideo\.tv|
+                            www\.medien-tube\.de|
+                            print7tv\.com|
+                            kanutube\.de|
+                            www\.bigcitytv\.de|
+                            www\.drehzahl\.tv|
+                            ursula\.genetics\.emory\.edu|
+                            www\.region-bergstrasse\.tv|
+                            www2\.univ-sba\.dz|
+                            videos\.uni-paderborn\.de
+                        )'''
+    _UUID_RE = r'[\da-f]{32}'
+    # Matches the internal "vimp:<s or empty>:<host>:<key>" form as well as
+    # embed, video, category and channel page URLs on the known hosts.
+    _VALID_URL = r'''(?x)
+                    (?:
+                        vimp:(?P<secure>s?):(?P<host>[^:]+):|
+                        http(?P<secure2>s?)://(?P<host2>%s)/(?:media/embed\?key=|(?:category/|channel/)?video/.+/)
+                    )
+                    (?P<media_key>%s)
+                    ''' % (_INSTANCES_RE, _UUID_RE)
+    _TESTS = [{
+        'url': 'https://www.videoportal.uni-freiburg.de/video/Konzert-des-Akademischen-Orchesters-Freiburg/e2537c92d1d5ff61beba7ed5855c8f7e',
+        'md5': '569c906738571d4e17cd91502720b981',
+        'info_dict': {
+            'id': '6030',
+            'ext': 'mp4',
+            'title': 'Konzert des Akademischen Orchesters Freiburg',
+            'description': 'md5:da634544fde2c5b7556a129eb3c7674b',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'uploader': 'un0',
+        }
+    }, {
+        'url': 'vimp::vimp.schwaebische.de:e72974cd8a604c8e9c8970d237f07bbf',
+        'only_matching': True,
+    }, {
+        'url': 'https://univideo.uni-kassel.de/category/video/12-13-Bauer-et-al/57562e3ed05bc4d74896aa984d518cb1/10',
+        'only_matching': True,
+    }, {
+        'url': 'https://univideo.uni-kassel.de/channel/video/Allgemeine-Chemie-vom-04122018/22bc28a5c2cf908cc6ad0f84c0368a89/27',
+        'only_matching': True,
+    }, {
+        'url': 'http://vimp.schwaebische.de/media/embed?key=870a38f3dbd9ed10b4e1a1b189e3cf9f',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage, source_url):
+        """Return the ViMP embed URLs found in iframe data-src attributes.
+
+        source_url is currently unused.
+        """
+        return re.findall(
+            r'''(?x)<iframe[^>]+\bdata-src=["\'](?P<url>(?:https?:)?//%s/media/embed\?key=%s)'''
+            % (ViMPIE._INSTANCES_RE, ViMPIE._UUID_RE), webpage)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        # Only one of the two URL forms matches, so at most one group of each
+        # pair is set.
+        secure = mobj.group('secure') or mobj.group('secure2') or ''
+        host = mobj.group('host') or mobj.group('host2')
+        media_key = mobj.group('media_key')
+        base_url = 'http%s://%s' % (secure, host)
+
+        webpage = self._download_webpage(
+            '%s/media/embed?key=%s' % (base_url, media_key), media_key)
+
+        # Sources appear either as JavaScript addSource() calls or as plain
+        # <source> tags. The captures stop at the closing quote so that
+        # several declarations on one line are not merged into one match.
+        sources = re.findall(
+            r'addSource\(\'(?P<url>[^\']+)\'\s*,\s*\'video/(?P<format>[^\']+)\'\)',
+            webpage)
+        if not sources:
+            sources = re.findall(
+                r'<source\s+src="(?P<url>[^"]+)"\s+type="video/(?P<format>[^"]+)"\s*/>',
+                webpage)
+
+        height = int_or_none(self._search_regex(
+            r'preSetVideoHeight\((\d+)\)', webpage, 'height', default=None))
+        width = int_or_none(self._search_regex(
+            r'preSetVideoWidth\((\d+)\)', webpage, 'width', default=None))
+        formats = [{
+            'url': source_url,
+            'ext': source_ext,
+            'height': height,
+            'width': width,
+        } for source_url, source_ext in sources]
+
+        # NOTE(review): "preSetCurrendID" is kept exactly as submitted; it
+        # presumably mirrors the name used by the player script - confirm.
+        media_id = self._search_regex(
+            r'(?:preSetCurrendID\((\d+)\)|mediaid=(\d+))', webpage, 'media id')
+
+        media_info = self._download_xml(
+            '%s/media/flashcomm?action=getmediainfo&context=normal&mediaid=%s' % (
+                base_url, media_id), media_id)
+
+        def media_text(name, fatal=False):
+            # Text of one child of ./active_media/media in the info XML.
+            return xpath_text(
+                media_info, './active_media/media/%s' % name, fatal=fatal)
+
+        title = clean_html(media_text('title', fatal=True))
+
+        media_file_url = media_text('file')
+        if media_file_url:
+            formats.append({
+                'url': media_file_url,
+                'width': int_or_none(media_text('width')),
+                'height': int_or_none(media_text('height')),
+            })
+
+        # The format list is optional, hence the non-fatal download. The
+        # result is compared against None/False explicitly because the truth
+        # value of an XML element depends on whether it has children.
+        formats_info = self._download_xml(
+            '%s/webplayer/flashcomm?action=getmediaformats&key=%s' % (
+                base_url, media_key),
+            media_id, 'Downloading media formats XML', fatal=False)
+        files = None
+        if formats_info is not None and formats_info is not False:
+            files = xpath_element(formats_info, './files')
+        for f in (files if files is not None else ()):
+            fkey = xpath_text(f, './key')
+            ftype = xpath_text(f, './type')
+            if not fkey or fkey == 'default' or not ftype:
+                continue
+            formats.append({
+                'url': '%s/getMedium/%s.%s?format=%s' % (
+                    base_url, media_key, ftype, fkey),
+                'height': int_or_none(fkey.strip('p')),
+            })
+
+        if not formats:
+            # Nothing was advertised: fall back to the getMedia.php MP4 URL.
+            formats.append({
+                'url': '%s/getMedia.php?key=%s&type=mp4' % (
+                    base_url, media_key),
+                'ext': 'mp4',
+            })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': media_id,
+            'title': title,
+            'uploader': media_text('author'),
+            'description': clean_html(media_text('description')),
+            'duration': int_or_none(media_text('duration_sec')),
+            'thumbnail': media_text('previewpic') or media_text('description_pic'),
+            'view_count': int_or_none(media_text('views')),
+            'formats': formats,
+        }