Added WEBVTT subtitle support for TV4.

2025-03-12 19:10:02 +08:00 · 2018-07-22 20:20:19 +02:00 · 2018-07-22 20:20:19 +02:00 · 2a42334a87
commit 2a42334a87
parent d4e7065111
2 changed files with 257 additions and 10 deletions
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@ -1606,7 +1606,7 @@ class InfoExtractor(object):
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
-            if media_type not in ('VIDEO', 'AUDIO'):
+            if media_type not in ('AUDIO', 'SUBTITLES', 'VIDEO'):
                return
            media_url = media.get('URI')
            if media_url:
@ -1623,7 +1623,9 @@ class InfoExtractor(object):
                    'protocol': entry_protocol,
                    'preference': preference,
                }
-                if media_type == 'AUDIO':
+                if media_type in ['SUBTITLES']:
+                    f['acodec'] = 'none'
+                if media_type in ['AUDIO', 'SUBTITLES']:
                    f['vcodec'] = 'none'
                formats.append(f)

--- a/youtube_dl/extractor/tv4.py
+++ b/youtube_dl/extractor/tv4.py
@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals

+from io import StringIO
 import re

 from .common import InfoExtractor
@ -68,6 +69,20 @@ class TV4IE(InfoExtractor):
        }
    ]

+    # Text encoding used to decode the subtitle playlist and its fragments.
+    _ENCODING = 'UTF-8'
+    # Ticks per second of the MPEGTS value found in X-TIMESTAMP-MAP; used to
+    # convert that value into seconds.
+    _MPEG2_PTS_RATE_HZ = 90000
+    # Cue timing line, e.g. "00:00:01.000 --> 00:00:04.500".
+    # Groups 1-4: start hours/minutes/seconds/fraction,
+    # groups 5-8: end hours/minutes/seconds/fraction.
+    # The decimal point is escaped so that only a literal "." is accepted.
+    _REGEX_SUB_ENTRY_TIMELINE = re.compile(
+        r'^(\d+):(\d+):(\d+)\.(\d+)[^0-9]+(\d+):(\d+):(\d+)\.(\d+)$'
+    )
+    # Fragment header line mapping an MPEG-TS timestamp to a local cue time.
+    # Group 1: MPEGTS ticks, groups 2-5: LOCAL hours/minutes/seconds/fraction.
+    # NOTE(review): only the "MPEGTS:...,LOCAL:..." attribute order is
+    # recognised - confirm the service never emits LOCAL first.
+    _REGEX_X_TIMESTAMP = re.compile(
+        r'^X-TIMESTAMP-MAP=MPEGTS:(\d+),LOCAL:(\d+):(\d+):(\d+)\.(\d+)$'
+    )
+    # Playlist line that references a WebVTT fragment.
+    # Group 1: the whole line (used as the fragment's relative URL),
+    # group 2: the numeric fragment index.
+    _REGEX_WEBVTT = re.compile(r'^(.+-(\d+)\.webvtt.*)$')
+
    def _real_extract(self, url):
        video_id = self._match_id(url)

@ -84,32 +99,262 @@ class TV4IE(InfoExtractor):
                'device': 'browser',
                'protocol': 'hls',
            })['playbackItem']['manifestUrl']
-        formats = self._extract_m3u8_formats(
+        all_formats = self._extract_m3u8_formats(
            manifest_url, video_id, 'mp4',
            'm3u8_native', m3u8_id='hls', fatal=False)
-        formats.extend(self._extract_mpd_formats(
+        all_formats.extend(self._extract_mpd_formats(
            manifest_url.replace('.m3u8', '.mpd'),
            video_id, mpd_id='dash', fatal=False))
-        formats.extend(self._extract_f4m_formats(
+        all_formats.extend(self._extract_f4m_formats(
            manifest_url.replace('.m3u8', '.f4m'),
            video_id, f4m_id='hds', fatal=False))
-        formats.extend(self._extract_ism_formats(
+        all_formats.extend(self._extract_ism_formats(
            re.sub(r'\.ism/.+?\.m3u8', r'.ism/Manifest', manifest_url),
            video_id, ism_id='mss', fatal=False))

-        if not formats and info.get('is_geo_restricted'):
+        if not all_formats and info.get('is_geo_restricted'):
            self.raise_geo_restricted(countries=self._GEO_COUNTRIES)

-        self._sort_formats(formats)
+        subtitle_formats = []
+        other_formats = []
+        for _index, _format in enumerate(all_formats):
+            if re.match(r'^.*textstream.*$', _format['format_id']):
+                subtitle_formats.append(_format)
+            else:
+                other_formats.append(_format)
+
+        self._sort_formats(other_formats)
+
+        subtitles = self._webvtt_download_all_subtitle_data(
+            video_id,
+            subtitle_formats
+        )

        return {
            'id': video_id,
            'title': title,
-            'formats': formats,
-            # 'subtitles': subtitles,
+            'formats': other_formats,
+            'subtitles': subtitles,
            'description': info.get('description'),
            'timestamp': parse_iso8601(info.get('broadcast_date_time')),
            'duration': int_or_none(info.get('duration')),
            'thumbnail': info.get('image'),
            'is_live': info.get('is_live') is True,
        }
+
+    @staticmethod
+    def _webvtt_adjust_time(reference_sec, ahead_sec, actual_sec):
+        """
+        Rebase a time value onto a reference point.
+
+        Evaluates ``reference_sec - ahead_sec + actual_sec``.
+
+        :param reference_sec: reference time, in seconds
+        :type reference_sec: float
+        :param ahead_sec: amount subtracted from the reference, in seconds
+        :type ahead_sec: float
+        :param actual_sec: time to shift, in seconds
+        :type actual_sec: float
+        :return: the shifted time, in seconds
+        :rtype: float
+        """
+        offset_sec = reference_sec - ahead_sec
+        return offset_sec + actual_sec
+
+    def _webvtt_download_all_subtitle_data(self, video_id, subtitle_formats):
+        """
+        Download every subtitle rendition and group the results by language.
+
+        :param video_id: identifier forwarded to the per-rendition download
+        :param subtitle_formats: format dicts; each is read for its
+            'language' key and then passed on to
+            ``_webvtt_download_subtitle_data``
+        :return: dict mapping each language value to a list of subtitle dicts
+        :rtype: dict
+        """
+        by_language = {}
+        for rendition in subtitle_formats:
+            # Read the language before downloading, so a format dict without
+            # that key fails before any request is made.
+            language = rendition['language']
+            track = self._webvtt_download_subtitle_data(video_id, rendition)
+            if track is None:
+                continue
+            by_language.setdefault(language, []).append(track)
+
+        return by_language
+
+    def _webvtt_download_subtitle_data(self, video_id, subtitle_format):
+        """
+        Download a segmented WebVTT rendition and merge it into one document.
+
+        The rendition playlist is requested from ``subtitle_format['url']``.
+        Every playlist line matching ``_REGEX_WEBVTT`` is taken as a fragment
+        path relative to the directory part of
+        ``subtitle_format['manifest_url']``.  Fragments that cannot be
+        fetched, or that are empty, are skipped.
+
+        :param video_id: video identifier, used to label the requests
+        :param subtitle_format: format dict providing 'url' and 'manifest_url'
+        :type subtitle_format: dict
+        :return: ``{'ext': 'vtt', 'data': merged_text}``, or None when no
+            subtitle text could be assembled (the caller skips None)
+        :rtype: dict | None
+        """
+        # NOTE(review): relies on _request_webpage(..., fatal=False) giving a
+        # falsy value on failure, as the original truthiness checks did.
+        playlist_handle = self._request_webpage(
+            subtitle_format['url'], video_id, fatal=False)
+        playlist_data = playlist_handle.read() if playlist_handle else None
+        if not playlist_data:
+            return None
+        playlist_text = playlist_data.decode(self._ENCODING)
+
+        # Directory part of the master manifest URL.
+        base_url = re.search(
+            r'^(.+)/[^/]+', subtitle_format['manifest_url']).group(1)
+
+        merged_io = StringIO()
+        try:
+            first_fragment = True
+            # splitlines() also removes the '\r' of CRLF playlists, which the
+            # pattern would otherwise capture into the fragment URL.
+            for playlist_line in playlist_text.splitlines():
+                match = self._REGEX_WEBVTT.match(playlist_line)
+                if not match:
+                    continue
+                fragment_url = '/'.join((base_url, match.group(1)))
+                fragment_handle = self._request_webpage(
+                    fragment_url,
+                    '%s-%s' % (video_id, match.group(2)),
+                    fatal=False)
+                fragment_data = (
+                    fragment_handle.read() if fragment_handle else None)
+                if not fragment_data:
+                    continue
+                self._webvtt_write_fragment(
+                    fragment_data, merged_io, first_fragment)
+                # Only the first fragment actually written carries the header.
+                first_fragment = False
+            merged_text = merged_io.getvalue()
+        finally:
+            merged_io.close()
+
+        if not merged_text:
+            # Do not advertise an empty subtitle track.
+            return None
+
+        return {'ext': 'vtt', 'data': merged_text}
+
+    def _webvtt_handle_one_fragment(
+            self,
+            webvtt_bytes,
+            vtt_file,
+            first_fragment=False
+    ):
+        """
+        Append one WebVTT fragment to ``vtt_file`` with rebased cue times.
+
+        The first line must be the ``WEBVTT`` signature, otherwise the whole
+        fragment is ignored.  The second line is consumed as the
+        ``X-TIMESTAMP-MAP`` header and never copied.  From the third line on,
+        blank lines are dropped, cue timing lines are rewritten through
+        ``_webvtt_adjust_time`` and preceded by an empty line, and all other
+        lines are copied with surrounding whitespace stripped.
+
+        :param webvtt_bytes: raw fragment, decoded with ``_ENCODING``
+        :type webvtt_bytes: bytes
+        :param vtt_file: text stream receiving the converted lines
+        :type vtt_file: TextIO
+        :param first_fragment: write the signature line when True
+        :type first_fragment: bool
+        :return: None
+        """
+        # Both references default to zero, so a fragment without a usable
+        # X-TIMESTAMP-MAP keeps its cue times instead of raising a TypeError.
+        mpeg_ref_sec = 0.0
+        local_ref_sec = 0.0
+
+        def parse_time(match, first_group):
+            # Four consecutive groups: hours, minutes, seconds, fraction.
+            return self._webvtt_time_parts_to_float(
+                *(int(match.group(first_group + i)) for i in range(4)))
+
+        fragment_lines = webvtt_bytes.decode(self._ENCODING).split('\n')
+        for line_index, raw_line in enumerate(fragment_lines):
+            line = raw_line.strip()
+
+            if line_index == 0:
+                # A byte order mark may precede the signature, and the
+                # signature may be followed by a space or tab and free text.
+                line = line.lstrip('\ufeff')
+                is_signature = (
+                    line == 'WEBVTT'
+                    or line.startswith(('WEBVTT ', 'WEBVTT\t')))
+                if not is_signature:
+                    break
+                if first_fragment:
+                    vtt_file.write(line + '\n')
+                continue
+
+            if line_index == 1:
+                match = self._REGEX_X_TIMESTAMP.match(line)
+                if match:
+                    # NOTE(review): the 10 seconds subtracted here presumably
+                    # compensate a fixed initial timestamp offset of the
+                    # stream - confirm.  The float divisor keeps the fraction
+                    # on Python 2 as well.
+                    mpeg_ref_sec = (
+                        int(match.group(1)) - 10 * self._MPEG2_PTS_RATE_HZ
+                    ) / float(self._MPEG2_PTS_RATE_HZ)
+                    local_ref_sec = parse_time(match, 2)
+                continue
+
+            if not line:
+                continue
+
+            match = self._REGEX_SUB_ENTRY_TIMELINE.match(line)
+            if not match:
+                vtt_file.write(line + '\n')
+                continue
+
+            start_sec = self._webvtt_adjust_time(
+                mpeg_ref_sec, local_ref_sec, parse_time(match, 1))
+            stop_sec = self._webvtt_adjust_time(
+                mpeg_ref_sec, local_ref_sec, parse_time(match, 5))
+            # An empty line separates this cue from the previous block.
+            vtt_file.write('\n')
+            vtt_file.write(
+                self._webvtt_make_timeline(start_sec, stop_sec) + '\n')
+
+    def _webvtt_make_timeline(self, start_sec=0.0, stop_sec=0.0):
+        """
+        Format a cue timing line of the form
+        ``HH:MM:SS.mmm --> HH:MM:SS.mmm``.
+
+        Both times are split with ``_webvtt_time_float_to_parts``.
+
+        :param start_sec: cue start, in seconds
+        :type start_sec: float
+        :param stop_sec: cue end, in seconds
+        :type stop_sec: float
+        :return: the formatted timing line
+        :rtype: str
+        """
+        start_parts = tuple(self._webvtt_time_float_to_parts(start_sec))
+        stop_parts = tuple(self._webvtt_time_float_to_parts(stop_sec))
+        # %-formatting over one concatenated tuple avoids two '*' unpackings
+        # in a single call, which older interpreters reject as a syntax error.
+        return '%02d:%02d:%02d.%03d --> %02d:%02d:%02d.%03d' % (
+            start_parts + stop_parts)
+
+    @staticmethod
+    def _webvtt_time_parts_to_float(
+            hours=0, minutes=0, seconds=0, milli_seconds=0
+    ):
+        """
+        Combine the parts of a timestamp into a number of seconds.
+
+        :param hours: hours part
+        :type hours: int
+        :param minutes: minutes part
+        :type minutes: int
+        :param seconds: seconds part
+        :type seconds: int
+        :param milli_seconds: milliseconds part
+        :type milli_seconds: int
+        :return: total time, in seconds
+        :rtype: float
+        """
+        whole_seconds = seconds + 60 * (minutes + 60 * hours)
+        # Divide by a float so the fraction is not lost to integer division
+        # when this runs on Python 2.
+        return whole_seconds + milli_seconds / 1000.0
+
+    @staticmethod
+    def _webvtt_time_float_to_parts(input_sec=0.0):
+        """
+        Split a number of seconds into hours, minutes, seconds, milliseconds.
+
+        The value is rounded to the nearest millisecond first and then split
+        with integer arithmetic.  Truncating the fractional part instead
+        loses a millisecond to float representation error (1.001 would give
+        0 ms).
+
+        :param input_sec: time, in seconds
+        :type input_sec: float
+        :return: ``(hours, minutes, seconds, milli_seconds)``
+        :rtype: Tuple[int, int, int, int]
+        """
+        total_ms = int(round(input_sec * 1000))
+        total_sec, milli_seconds = divmod(total_ms, 1000)
+        total_min, seconds = divmod(total_sec, 60)
+        hours, minutes = divmod(total_min, 60)
+
+        return hours, minutes, seconds, milli_seconds
+
+    def _webvtt_write_fragment(
+            self,
+            webvtt_data,
+            output_stream,
+            first_fragment=False
+    ):
+        """
+        Write one downloaded WebVTT fragment to the merged output.
+
+        Delegates to ``_webvtt_handle_one_fragment`` and returns nothing.
+
+        :param webvtt_data: raw fragment content
+        :type webvtt_data: bytes
+        :param output_stream: text stream the fragment is appended to
+        :type output_stream: TextIO
+        :param first_fragment: passed through unchanged to the handler
+        :type first_fragment: bool
+        """
+        self._webvtt_handle_one_fragment(
+            webvtt_bytes=webvtt_data,
+            vtt_file=output_stream,
+            first_fragment=first_fragment,
+        )
+