Added WEBVTT subtitle support for TV4.

2025-02-13 20:52:52 +08:00 · 2018-07-22 20:20:19 +02:00 · 2018-07-22 20:20:19 +02:00 · 2a42334a87
commit 2a42334a87
parent d4e7065111
2 changed files with 257 additions and 10 deletions
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@ -1606,7 +1606,7 @@ class InfoExtractor(object):
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
-            if media_type not in ('VIDEO', 'AUDIO'):
+            if media_type not in ('AUDIO', 'SUBTITLES', 'VIDEO'):
                return
            media_url = media.get('URI')
            if media_url:
@ -1623,7 +1623,9 @@ class InfoExtractor(object):
                    'protocol': entry_protocol,
                    'preference': preference,
                }
-                if media_type == 'AUDIO':
+                if media_type in ['SUBTITLES']:
                    f['acodec'] = 'none'
                if media_type in ['AUDIO', 'SUBTITLES']:
                    f['vcodec'] = 'none'
                formats.append(f)
--- a/youtube_dl/extractor/tv4.py
+++ b/youtube_dl/extractor/tv4.py
@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from io import StringIO
 import re
 from .common import InfoExtractor
@ -68,6 +69,20 @@ class TV4IE(InfoExtractor):
        }
    ]
    _ENCODING = 'UTF-8'
    _MPEG2_PTS_RATE_HZ = 90000
    """:type Pattern"""
    _REGEX_SUB_ENTRY_TIMELINE = re.compile(
        r'^(\d+):(\d+):(\d+).(\d+)[^0-9]+(\d+):(\d+):(\d+).(\d+)$'
    )
    """:type Pattern"""
    _REGEX_X_TIMESTAMP = re.compile(
        r'^X-TIMESTAMP-MAP=MPEGTS:(\d+),LOCAL:(\d+):(\d+):(\d+).(\d+)$'
    )
    """:type Pattern"""
    _REGEX_WEBVTT = re.compile(r'^(.+-(\d+)\.webvtt.*)$')
    """:type Pattern"""
    def _real_extract(self, url):
        video_id = self._match_id(url)
@ -84,32 +99,262 @@ class TV4IE(InfoExtractor):
                'device': 'browser',
                'protocol': 'hls',
            })['playbackItem']['manifestUrl']
-        formats = self._extract_m3u8_formats(
+        all_formats = self._extract_m3u8_formats(
            manifest_url, video_id, 'mp4',
            'm3u8_native', m3u8_id='hls', fatal=False)
-        formats.extend(self._extract_mpd_formats(
+        all_formats.extend(self._extract_mpd_formats(
            manifest_url.replace('.m3u8', '.mpd'),
            video_id, mpd_id='dash', fatal=False))
-        formats.extend(self._extract_f4m_formats(
+        all_formats.extend(self._extract_f4m_formats(
            manifest_url.replace('.m3u8', '.f4m'),
            video_id, f4m_id='hds', fatal=False))
-        formats.extend(self._extract_ism_formats(
+        all_formats.extend(self._extract_ism_formats(
            re.sub(r'\.ism/.+?\.m3u8', r'.ism/Manifest', manifest_url),
            video_id, ism_id='mss', fatal=False))
-        if not formats and info.get('is_geo_restricted'):
+        if not all_formats and info.get('is_geo_restricted'):
            self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
-        self._sort_formats(formats)
+        subtitle_formats = []
        other_formats = []
        for _index, _format in enumerate(all_formats):
            if re.match(r'^.*textstream.*$', _format['format_id']):
                subtitle_formats.append(_format)
            else:
                other_formats.append(_format)
        self._sort_formats(other_formats)
        subtitles = self._webvtt_download_all_subtitle_data(
            video_id,
            subtitle_formats
        )
        return {
            'id': video_id,
            'title': title,
-            'formats': formats,
+            'formats': other_formats,
-            # 'subtitles': subtitles,
+            'subtitles': subtitles,
            'description': info.get('description'),
            'timestamp': parse_iso8601(info.get('broadcast_date_time')),
            'duration': int_or_none(info.get('duration')),
            'thumbnail': info.get('image'),
            'is_live': info.get('is_live') is True,
        }
    @staticmethod
    def _webvtt_adjust_time(reference_sec, ahead_sec, actual_sec):
        """
        :param reference_sec:
        :type reference_sec: float
        :param ahead_sec:
        :type ahead_sec: float
        :param actual_sec:
        :type actual_sec: float
        :return:
        :rtype: float
        """
        return reference_sec - ahead_sec + actual_sec
    def _webvtt_download_all_subtitle_data(self, video_id, subtitle_formats):
        subtitles = {}
        for subtitle_format in subtitle_formats:
            tag = subtitle_format['language']
            subtitle = self._webvtt_download_subtitle_data(
                video_id, subtitle_format
            )
            if subtitle is not None:
                if tag not in subtitles.keys():
                    subtitles[tag] = []
                subtitles[tag].append(subtitle)
        return subtitles
    def _webvtt_download_subtitle_data(self, video_id, subtitle_format):
        """
        Download one WebVTT rendition and merge its fragments into one text.

        The rendition playlist at ``subtitle_format['url']`` is fetched, every
        line matching ``_REGEX_WEBVTT`` is treated as a fragment URL relative
        to the directory of ``subtitle_format['manifest_url']``, and each
        fragment that could be downloaded is appended to the output via
        ``_webvtt_write_fragment``.  All requests are non-fatal.

        :param video_id: identifier used when reporting the requests
        :param subtitle_format: format dict with ``'url'`` and
            ``'manifest_url'`` keys
        :return: ``{'ext': 'vtt', 'data': <merged text>}``; ``data`` is an
            empty string when nothing could be downloaded
        :rtype: dict
        """
        subs_m3u8_url = subtitle_format['url']
        urlh = self._request_webpage(subs_m3u8_url, video_id, fatal=False)
        subs_m3u8_body = ''
        if urlh:
            subs_m3u8_data = urlh.read()
            if subs_m3u8_data:
                subs_m3u8_body = subs_m3u8_data.decode(encoding=self._ENCODING)

        # Everything up to the last path separator of the manifest URL.
        base_url = re.search(
            r'^(.+)/[^/]+',
            subtitle_format['manifest_url']
        ).group(1)

        first_fragment = True
        # The context manager releases the buffer even if a fragment fails
        # to decode or parse.
        with StringIO() as subs_body_io:
            for subs_m3u8_line in subs_m3u8_body.split('\n'):
                # Playlists may use CRLF line endings; without stripping,
                # the trailing '\r' would end up inside the fragment URL.
                match = self._REGEX_WEBVTT.match(subs_m3u8_line.strip())
                if not match:
                    continue
                subs_fragment_partial_url = match.group(1)
                subs_fragment_index = match.group(2)
                subs_fragment_url = '/'.join(
                    [base_url, subs_fragment_partial_url]
                )
                urlh = self._request_webpage(
                    subs_fragment_url,
                    '{}-{}'.format(video_id, subs_fragment_index),
                    fatal=False
                )
                if not urlh:
                    continue
                subs_fragment_data = urlh.read()
                if not subs_fragment_data:
                    continue
                self._webvtt_write_fragment(
                    subs_fragment_data, subs_body_io, first_fragment
                )
                # Only the first fragment written contributes the header.
                first_fragment = False
            return {'ext': 'vtt', 'data': subs_body_io.getvalue()}
    def _webvtt_handle_one_fragment(
            self,
            webvtt_bytes,
            vtt_file,
            first_fragment=False
    ):
        """
        :param webvtt_bytes:
        :type webvtt_bytes: bytes
        :param vtt_file:
        :type vtt_file: TextIO
        :param first_fragment:
        :type first_fragment: bool
        :return:
        :rtype: int
        """
        mpeg_ref_sec: float = None
        local_ref_sec: float = 0.0
        for line_index, line in enumerate(
                StringIO(
                    webvtt_bytes.decode(encoding=self._ENCODING)
                ).readlines()
        ):
            line = line.strip()
            if line_index == 0:
                if line == 'WEBVTT':
                    if first_fragment:
                        print(line, file=vtt_file)
                    continue
                else:
                    break
            elif line_index == 1:
                match = self._REGEX_X_TIMESTAMP.match(line)
                """:type: Match"""
                if match:
                    mpeg_ref_sec = (
                       int(match.group(1)) - (10 * self._MPEG2_PTS_RATE_HZ)
                    ) / self._MPEG2_PTS_RATE_HZ
                    local_ref_sec: float = self._webvtt_time_parts_to_float(
                        int(match.group(2)),
                        int(match.group(3)),
                        int(match.group(4)),
                        int(match.group(5))
                    )
                continue
            else:
                if len(line.strip()) > 0:
                    match = self._REGEX_SUB_ENTRY_TIMELINE.match(line)
                    """:type: Match"""
                    if match:
                        print('', file=vtt_file)
                        print(self._webvtt_make_timeline(
                            self._webvtt_adjust_time(
                                mpeg_ref_sec,
                                local_ref_sec,
                                self._webvtt_time_parts_to_float(
                                    int(match.group(1)),
                                    int(match.group(2)),
                                    int(match.group(3)),
                                    int(match.group(4))
                                )
                            ),
                            self._webvtt_adjust_time(
                                mpeg_ref_sec,
                                local_ref_sec,
                                self._webvtt_time_parts_to_float(
                                    int(match.group(5)),
                                    int(match.group(6)),
                                    int(match.group(7)),
                                    int(match.group(8))
                                )
                            )
                        ), file=vtt_file)
                    else:
                        print('{}'.format(line), file=vtt_file)
    def _webvtt_make_timeline(self, start_sec=0.0, stop_sec=0.0):
        """
        :param start_sec:
        :type start_sec: float
        :param stop_sec:
        :type stop_sec: float
        :return:
        :rtype: str
        """
        return (
            '{:02d}:{:02d}:{:02d}.{:03d} --> {:02d}:{:02d}:{:02d}.{:03d}'
        ).format(
            *self._webvtt_time_float_to_parts(start_sec),
            *self._webvtt_time_float_to_parts(stop_sec)
        )
    @staticmethod
    def _webvtt_time_parts_to_float(
            hours=0, minutes=0, seconds=0, milli_seconds=0
    ):
        """
        :param hours:
        :type hours: int
        :param minutes:
        :type minutes: int
        :param seconds:
        :type seconds: int
        :param milli_seconds:
        :type milli_seconds: int
        :return:
        :rtype: float
        """
        return seconds + 60 * (minutes + 60 * hours) + milli_seconds / 1000
    @staticmethod
    def _webvtt_time_float_to_parts(input_sec=0.0):
        """
        :param input_sec:
        :type input_sec: float
        :return:
        :rtype: Tuple[int, int, int, int]
        """
        minutes, seconds = divmod(input_sec, 60)
        hours, minutes = divmod(minutes, 60)
        milli_seconds: int = int(1000 * (input_sec % 1))
        return int(hours), int(minutes), int(seconds), milli_seconds
    def _webvtt_write_fragment(
            self,
            webvtt_data,
            output_stream,
            first_fragment=False
    ):
        """
        :param webvtt_data:
        :type webvtt_data: bytes
        :param output_stream:
        :type output_stream: TextIO
        :param first_fragment:
        :type first_fragment: bool
        """
        self._webvtt_handle_one_fragment(
            webvtt_data, output_stream, first_fragment
        )