[telebasel] [simplex] Add new information extractors

2025-03-13 11:50:00 +08:00 · 2017-02-06 17:01:34 +01:00 · 2017-02-06 17:01:34 +01:00 · 91d21e0a84
commit 91d21e0a84
parent d5d904ff7d
3 changed files with 366 additions and 0 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -849,6 +849,10 @@ from .shared import (
    VivoIE,
 )
 from .showroomlive import ShowRoomLiveIE
+from .simplex import (
+    SimplexIE,
+    SimplexHostsIE,
+)
 from .sina import SinaIE
 from .sixplay import SixPlayIE
 from .skynewsarabia import (
@ -931,6 +935,10 @@ from .teamfourstar import TeamFourStarIE
 from .techtalks import TechTalksIE
 from .ted import TEDIE
 from .tele13 import Tele13IE
+from .telebasel import (
+    TelebaselMediathekIE,
+    TelebaselArticleIE,
+)
 from .telebruxelles import TeleBruxellesIE
 from .telecinco import TelecincoIE
 from .telegraaf import TelegraafIE
--- a/youtube_dl/extractor/simplex.py
+++ b/youtube_dl/extractor/simplex.py
@ -0,0 +1,233 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    str_or_none,
+    try_get,
+    urljoin,
+)
+
+
+class SimplexIE(InfoExtractor):
+    """Extractor for videos served by a Simplex player installation.
+
+    It handles internal URLs of the form
+    ``simplex:<server_url>:<customer_id>:<author_id>:<project_id>``; the
+    resulting video id is ``<customer_id>-<author_id>-<project_id>``.
+    """
+
+    IE_DESC = 'Simplex Player'
+    _VALID_URL = r'''(?x)
+                simplex:
+                (?P<server_url>https?://(?:www\.)?.+):
+                (?P<customer_id>\d+):
+                (?P<author_id>\d+):
+                (?P<project_id>\d+)
+                '''
+
+    _TEST = {
+        'url': 'simplex:http://video.telebasel.ch:4062:4063:62349',
+        'only_matching': True,
+    }
+
+    @staticmethod
+    def _extract_width_height(resolution):
+        """Split a ``'<width>x<height>'`` string into a ``(width, height)`` pair.
+
+        Returns ``(None, None)`` when ``resolution`` is not a string or does
+        not consist of exactly two ``x``-separated parts.
+        """
+        try:
+            w, h = resolution.split('x')
+            return int_or_none(w), int_or_none(h)
+        except (AttributeError, ValueError):
+            return None, None
+
+    @staticmethod
+    def _strip_json_wrapper(s):
+        """Reduce ``s`` to the span from its first ``{`` to its last ``}``.
+
+        ``s`` is returned unchanged when it contains no such span, so that a
+        malformed response is rejected by the JSON parser instead of raising
+        a bare ValueError from ``str.index`` here.
+        NOTE(review): assumes _download_json reports unparsable input as an
+        ExtractorError - confirm against InfoExtractor.
+        """
+        start, end = s.find('{'), s.rfind('}')
+        if start == -1 or end < start:
+            return s
+        return s[start:end + 1]
+
+    def _known_simplex_format(self, simplex_formats, fid):
+        """Return the entry of ``simplex_formats`` whose id(s) include ``fid``.
+
+        The ``'id'`` of an entry is either a single id or a list of ids.
+        Returns ``None`` when no entry matches.
+        """
+        for sf in simplex_formats:
+            known_ids = sf['id']
+            # Do not compare against the ``str`` type: this module uses
+            # unicode_literals, so on Python 2 the ids are ``unicode`` objects
+            # and a ``type(...) == str`` test never matches them.
+            if not isinstance(known_ids, (list, tuple)):
+                known_ids = [known_ids]
+            if fid in known_ids:
+                return sf
+        return None
+
+    def _real_extract(self, url):
+        """Build the info dict for a ``simplex:`` URL.
+
+        Raises ExtractorError when the file information or the list of
+        qualities is missing from the downloaded metadata.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        server_url = mobj.group('server_url')
+        customer_id = mobj.group('customer_id')
+        author_id = mobj.group('author_id')
+        project_id = mobj.group('project_id')
+        video_id = '%s-%s-%s' % (customer_id, author_id, project_id)
+
+        # All metadata and media files live below this directory.
+        content_url = urljoin(
+            server_url,
+            'content/%s/%s/%s/' % (customer_id, author_id, project_id))
+
+        player_data = self._download_json(
+            urljoin(content_url, 'data.sid'),
+            video_id,
+            note='Downloading player data JSON',
+            errnote='Unable to download player data JSON')
+        video_data = self._download_json(
+            urljoin(content_url, 'pl01.sid'),
+            video_id,
+            note='Downloading video data JSON',
+            errnote='Unable to download video data JSON',
+            # Only the outermost {...} span of the response is parsed.
+            transform_source=self._strip_json_wrapper)
+
+        title = str_or_none(player_data['title'])
+        description = str_or_none(player_data.get('description'))
+        timestamp = int_or_none(player_data.get('createDate'))
+        language = str_or_none(player_data.get('language'))
+        # presumably reported in tenths of a second (hence scale=10) - TODO confirm
+        duration = float_or_none(player_data.get('duration'), scale=10)
+
+        file_information = try_get(video_data, lambda x: x['data'], dict)
+        if not file_information:
+            raise ExtractorError('Cannot extract file information data.')
+
+        filename = str_or_none(file_information.get('filename'))
+        thumbname = str_or_none(file_information.get('thumb'))
+        thumbnail = urljoin(content_url, thumbname + '.jpg') if thumbname else None
+
+        qualities = try_get(player_data, lambda x: x['qualities'], list)
+        if not qualities:
+            raise ExtractorError('Cannot find available formats.')
+
+        # simplex_formats is the list of known simplex player formats.
+        # There might be some more format ids, but it is unclear what they do:
+        # id 400: It was indicated to be for Apple TV.
+        # id 500: No additional information found.
+        simplex_formats = []
+        if filename:
+            # The direct downloads are named after the file name; without it
+            # only the HLS variant can be offered (concatenating None would
+            # raise a TypeError).
+            simplex_formats.extend([
+                {'id': '20', 'filename': filename + '.flv', 'method': 'url'},
+                {'id': '40', 'filename': filename + '_40.flv', 'method': 'url'},
+                {'id': '200', 'filename': filename + '.mp4', 'method': 'url'},
+            ])
+        simplex_formats.append(
+            {'id': ['300', '350', '355', '360'], 'filename': 'index.m3u8', 'method': 'm3u8'})
+
+        formats = []
+        format_infos = []
+        # All HLS ids share one master playlist, so it is fetched only once.
+        m3u8_done = False
+        for quali in qualities:
+            fid = str_or_none(quali.get('id'))
+
+            vbr = int_or_none(quali.get('b'))
+            resolution = str_or_none(quali.get('s'))
+            width, height = self._extract_width_height(resolution)
+            # NOTE(review): the 'hls-' format_id is also applied to the
+            # direct flv/mp4 downloads below - confirm this is intended.
+            form_info = {
+                'resolution': resolution,
+                'width': width,
+                'height': height,
+                'vbr': vbr,
+                'abr': int_or_none(quali.get('ab')),
+                'asr': int_or_none(quali.get('ar')),
+                'fps': int_or_none(quali.get('r')),
+                'language': language,
+                'format_id': 'hls-%s' % str_or_none(vbr)
+            }
+            format_infos.append(form_info)
+
+            simplex_format = self._known_simplex_format(simplex_formats, fid)
+            if not simplex_format:
+                continue
+
+            format_url = urljoin(content_url, simplex_format['filename'])
+            if simplex_format['method'] == 'url':
+                form = {
+                    'url': format_url
+                }
+                form.update(form_info)
+                formats.append(form)
+            elif simplex_format['method'] == 'm3u8' and not m3u8_done:
+                formats.extend(self._extract_m3u8_formats(
+                    format_url,
+                    video_id,
+                    ext='mp4',
+                    entry_protocol='m3u8_native'))
+                m3u8_done = True
+
+        # Try to add additional information to the formats extracted by
+        # _extract_m3u8_formats: the video bitrate taken from the variant
+        # playlist URL selects the matching quality entry.
+        for form in formats:
+            if not form['url'].endswith('.m3u8'):
+                continue
+            vbr = int_or_none(self._search_regex(
+                r'(\d+)kb\.m3u8', form['url'], 'm3u8 vbr', default=None))
+            if not vbr:
+                continue
+            form_info = next(
+                (f for f in format_infos if f['vbr'] == vbr), None)
+            if form_info is not None:
+                form.update(form_info)
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'thumbnail': thumbnail,
+            'timestamp': timestamp,
+            'formats': formats,
+        }
+
+
+class SimplexHostsIE(InfoExtractor):
+    """Map content URLs of known Simplex hosts onto ``simplex:`` URLs.
+
+    The server URL and the three numeric ids are taken from the page URL
+    and handed over to SimplexIE.
+    """
+
+    _VALID_URL = r'''(?x)
+                (?P<server_url>https?://(?:www\.)?
+                    (?:
+                        video\.telebasel\.ch|
+                        media10\.simplex\.tv
+                    )
+                )
+                /content/
+                (?P<customer_id>\d+)/
+                (?P<author_id>\d+)/
+                (?P<project_id>\d+)
+                '''
+
+    _TESTS = [{
+        'url': 'http://media10.simplex.tv/content/906/907/76997/',
+        'md5': 'e6b8ebefac5aeae4a6790fec18382ca0',
+        'info_dict': {
+            'id': '906-907-76997',
+            'ext': 'flv',
+            'title': '03.02.17: Der Trailer zum Rückrunden-Start',
+            'description': None,
+            'duration': 44.0,
+            'timestamp': 1486135964,
+            'upload_date': '20170203',
+            'url': 'http://media10.simplex.tv/content/906/907/76997/simvid_1_40.flv',
+            'thumbnail': 'http://media10.simplex.tv/content/906/907/76997/simvid_1.jpg',
+            'language': 'de',
+            'width': 1280,
+            'height': 720,
+            'vbr': 2304,
+            'abr': 160,
+            'fps': 25,
+            'asr': 44100,
+            'resolution': '1280x720'
+        }
+    }, {
+        'url': 'https://video.telebasel.ch/content/4062/4063/77067',
+        'info_dict': {
+            'id': '4062-4063-77067',
+            'ext': 'flv',
+            'title': 'News vom 05.02.2017',
+            'description': 'md5:23fb960068621263d5d4418996387674',
+            'timestamp': 1486314961,
+            'upload_date': '20170205',
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }]
+
+    def _real_extract(self, url):
+        """Return a url_result that delegates the extraction to SimplexIE."""
+        match = re.match(self._VALID_URL, url)
+        server_url = match.group('server_url')
+        # customer, author and project id, in the order both the simplex:
+        # URL and the video id expect them
+        ids = match.group('customer_id', 'author_id', 'project_id')
+
+        return self.url_result(
+            'simplex:%s:%s:%s:%s' % ((server_url,) + ids),
+            ie=SimplexIE.ie_key(),
+            video_id='%s-%s-%s' % ids)
--- a/youtube_dl/extractor/telebasel.py
+++ b/youtube_dl/extractor/telebasel.py
@ -0,0 +1,125 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .simplex import SimplexIE
+from ..utils import (
+    ExtractorError,
+    str_or_none,
+    strip_or_none,
+    remove_end,
+    try_get,
+    urljoin,
+)
+
+
+class TelebaselBaseIE(InfoExtractor):
+    """Constants shared by the telebasel.ch extractors."""
+
+    # Base URL of the video server; used to build the episodes JSON URL and
+    # as the server part of the simplex: URLs handed to SimplexIE.
+    _SERVER_URL = 'https://video.telebasel.ch/'
+    # Customer id used in the multichannel/content paths and simplex: URLs.
+    _CUSTOMER_ID = '4062'
+    # Author id used in the content paths and simplex: URLs.
+    _AUTHOR_ID = '4063'
+
+
+class TelebaselMediathekIE(TelebaselBaseIE):
+    """Extractor for show pages of the telebasel.ch Mediathek.
+
+    The project id is taken from the ``pid`` URL parameter when present;
+    otherwise the first project listed for the show's channel is used.  The
+    actual extraction is delegated to SimplexIE.
+    """
+
+    IE_DESC = 'telebasel.ch Mediathek'
+    _VALID_URL = r'''(?x)
+                https?://
+                    (?:www\.)?
+                    telebasel\.ch/
+                    (?!telebasel-archiv)
+                    (?!\d+)
+                    (?P<show_name>[^/]+)
+                    (?:
+                        /.*pid=(?P<pid>\d+).*
+                    )?
+                '''
+
+    _TESTS = [{
+        'url': 'https://telebasel.ch/telebasel-gastro-tipp/?aid=4063&pid=75290&channel=15881',
+        'only_matching': True,
+    }, {
+        'url': 'https://telebasel.ch/telebasel-reihe-8',
+        'only_matching': True,
+    }, {
+        'url': 'https://telebasel.ch/telebasel-talk/?channel=15881',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _strip_json_wrapper(s):
+        """Reduce ``s`` to the span from its first ``{`` to its last ``}``.
+
+        ``s`` is returned unchanged when it contains no such span, so that a
+        malformed response is rejected by the JSON parser instead of raising
+        a bare ValueError from ``str.index`` here.
+        NOTE(review): assumes _download_json reports unparsable input as an
+        ExtractorError - confirm against InfoExtractor.
+        """
+        start, end = s.find('{'), s.rfind('}')
+        if start == -1 or end < start:
+            return s
+        return s[start:end + 1]
+
+    def _extract_video_id(self, url, show_name):
+        """Look up the project id of the first episode listed for a show.
+
+        The show page provides the channel id, which in turn selects the
+        episodes JSON on the video server.  Raises ExtractorError when no
+        integer project id is found in that JSON.
+        """
+        webpage = self._download_webpage(url, show_name)
+        channel_id = self._html_search_regex(
+            r'<div[^>]+class=["\']tb-mediathek-videos["\'][^>]+data-channels=["\'](\d+)["\']',
+            webpage, 'channel id')
+
+        episodes_url = urljoin(
+            self._SERVER_URL,
+            'multichannel/%s/%s/.ofdd/json' % (self._CUSTOMER_ID, channel_id))
+        episodes = self._download_json(
+            episodes_url,
+            channel_id,
+            note='Downloading episodes JSON',
+            errnote='Unable to download episodes JSON',
+            # Only the outermost {...} span of the response is parsed.
+            transform_source=self._strip_json_wrapper)
+
+        video_id = str_or_none(
+            try_get(episodes, lambda x: x['projects'][0]['projectId'], int))
+        if not video_id:
+            raise ExtractorError('Could not extract video id from the webpage.')
+
+        return video_id
+
+    def _real_extract(self, url):
+        """Return a url_result that delegates the extraction to SimplexIE."""
+        mobj = re.match(self._VALID_URL, url)
+        show_name = mobj.group('show_name')
+        video_id = mobj.group('pid')
+
+        if not video_id:
+            video_id = self._extract_video_id(url, show_name)
+
+        ids = (self._CUSTOMER_ID, self._AUTHOR_ID, video_id)
+        return self.url_result(
+            'simplex:%s:%s:%s:%s' % ((self._SERVER_URL,) + ids),
+            ie=SimplexIE.ie_key(),
+            # Same id scheme SimplexIE assigns to the extracted video.
+            video_id='%s-%s-%s' % ids)
+
+
+class TelebaselArticleIE(TelebaselBaseIE):
+    """Extractor for telebasel.ch articles.
+
+    Every Simplex player iframe embedded in the article becomes one entry
+    of the returned playlist; the entries are resolved by SimplexIE.
+    """
+
+    IE_DESC = 'telebasel.ch articles'
+    # The id stops at '?' and '#' so that a query string or fragment never
+    # ends up in the display id when the URL has no trailing slash.
+    _VALID_URL = r'https?://(?:www\.)?telebasel\.ch/(?P<id>\d{4}/\d{2}/\d{2}/[^/?#]+)/?'
+
+    _TEST = {
+        'url': 'https://telebasel.ch/2017/02/01/report-usr-iii-einfach-erklaert/?channel=105100',
+        'info_dict': {
+            'id': '2017/02/01/report-usr-iii-einfach-erklaert',
+            'title': 'Report: USR III einfach erklärt',
+            'description': 'md5:2cb2b94ac023a6a9517cffc58d500c7e',
+        },
+        'playlist_count': 3,
+    }
+
+    def _real_extract(self, url):
+        """Return a playlist of all Simplex videos embedded in the article."""
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        # Literal URL prefix of an embedded player.  It is escaped before it
+        # goes into the pattern, and the regex part is appended afterwards
+        # rather than being passed through urljoin, which would treat the
+        # '?' of the named group as the start of a query string.
+        content_prefix = urljoin(
+            self._SERVER_URL,
+            'content/%s/%s/' % (self._CUSTOMER_ID, self._AUTHOR_ID))
+        # [^"\']* keeps the match inside the src attribute; a greedy '.+'
+        # could shorten the pid or merge several iframes on one line.
+        embed_regex = (
+            r'<iframe[^>]+src=["\']%s(?P<pid>\d+)[^"\']*["\']'
+            % re.escape(content_prefix))
+
+        entries = []
+        for m in re.finditer(embed_regex, webpage):
+            ids = (self._CUSTOMER_ID, self._AUTHOR_ID, m.group('pid'))
+            entries.append(self.url_result(
+                'simplex:%s:%s:%s:%s' % ((self._SERVER_URL,) + ids),
+                ie=SimplexIE.ie_key(),
+                # Same id scheme SimplexIE assigns to the extracted video.
+                video_id='%s-%s-%s' % ids))
+
+        title = strip_or_none(
+            remove_end(self._og_search_title(webpage), '- Telebasel'))
+        description = self._og_search_description(webpage)
+
+        return self.playlist_result(
+            entries,
+            playlist_id=display_id,
+            playlist_title=title,
+            playlist_description=description)