1
0
mirror of https://github.com/l1ving/youtube-dl synced 2025-02-10 07:42:53 +08:00

Added WEBVTT subtitle support for TV4.

This commit is contained in:
Johan Westin 2018-07-22 20:20:19 +02:00
parent d4e7065111
commit 2a42334a87
2 changed files with 257 additions and 10 deletions

View File

@ -1606,7 +1606,7 @@ class InfoExtractor(object):
if not (media_type and group_id and name):
return
groups.setdefault(group_id, []).append(media)
if media_type not in ('VIDEO', 'AUDIO'):
if media_type not in ('AUDIO', 'SUBTITLES', 'VIDEO'):
return
media_url = media.get('URI')
if media_url:
@ -1623,7 +1623,9 @@ class InfoExtractor(object):
'protocol': entry_protocol,
'preference': preference,
}
if media_type == 'AUDIO':
if media_type in ['SUBTITLES']:
f['acodec'] = 'none'
if media_type in ['AUDIO', 'SUBTITLES']:
f['vcodec'] = 'none'
formats.append(f)

View File

@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
from io import StringIO
import re
from .common import InfoExtractor
@ -68,6 +69,20 @@ class TV4IE(InfoExtractor):
}
]
_ENCODING = 'UTF-8'
_MPEG2_PTS_RATE_HZ = 90000
""":type Pattern"""
_REGEX_SUB_ENTRY_TIMELINE = re.compile(
r'^(\d+):(\d+):(\d+).(\d+)[^0-9]+(\d+):(\d+):(\d+).(\d+)$'
)
""":type Pattern"""
_REGEX_X_TIMESTAMP = re.compile(
r'^X-TIMESTAMP-MAP=MPEGTS:(\d+),LOCAL:(\d+):(\d+):(\d+).(\d+)$'
)
""":type Pattern"""
_REGEX_WEBVTT = re.compile(r'^(.+-(\d+)\.webvtt.*)$')
""":type Pattern"""
def _real_extract(self, url):
video_id = self._match_id(url)
@ -84,32 +99,262 @@ class TV4IE(InfoExtractor):
'device': 'browser',
'protocol': 'hls',
})['playbackItem']['manifestUrl']
formats = self._extract_m3u8_formats(
all_formats = self._extract_m3u8_formats(
manifest_url, video_id, 'mp4',
'm3u8_native', m3u8_id='hls', fatal=False)
formats.extend(self._extract_mpd_formats(
all_formats.extend(self._extract_mpd_formats(
manifest_url.replace('.m3u8', '.mpd'),
video_id, mpd_id='dash', fatal=False))
formats.extend(self._extract_f4m_formats(
all_formats.extend(self._extract_f4m_formats(
manifest_url.replace('.m3u8', '.f4m'),
video_id, f4m_id='hds', fatal=False))
formats.extend(self._extract_ism_formats(
all_formats.extend(self._extract_ism_formats(
re.sub(r'\.ism/.+?\.m3u8', r'.ism/Manifest', manifest_url),
video_id, ism_id='mss', fatal=False))
if not formats and info.get('is_geo_restricted'):
if not all_formats and info.get('is_geo_restricted'):
self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
self._sort_formats(formats)
subtitle_formats = []
other_formats = []
for _index, _format in enumerate(all_formats):
if re.match(r'^.*textstream.*$', _format['format_id']):
subtitle_formats.append(_format)
else:
other_formats.append(_format)
self._sort_formats(other_formats)
subtitles = self._webvtt_download_all_subtitle_data(
video_id,
subtitle_formats
)
return {
'id': video_id,
'title': title,
'formats': formats,
# 'subtitles': subtitles,
'formats': other_formats,
'subtitles': subtitles,
'description': info.get('description'),
'timestamp': parse_iso8601(info.get('broadcast_date_time')),
'duration': int_or_none(info.get('duration')),
'thumbnail': info.get('image'),
'is_live': info.get('is_live') is True,
}
@staticmethod
def _webvtt_adjust_time(reference_sec, ahead_sec, actual_sec):
"""
:param reference_sec:
:type reference_sec: float
:param ahead_sec:
:type ahead_sec: float
:param actual_sec:
:type actual_sec: float
:return:
:rtype: float
"""
return reference_sec - ahead_sec + actual_sec
def _webvtt_download_all_subtitle_data(self, video_id, subtitle_formats):
    """
    Download the subtitle track behind each of the given formats.

    :param video_id: video identifier, passed on to the per-track download
    :param subtitle_formats: iterable of format dicts; each one is indexed
        by ``'language'`` here and then handed to
        ``_webvtt_download_subtitle_data``
    :return: dict mapping each language tag to the list of subtitle
        entries downloaded for it
    :rtype: dict
    """
    subtitles = {}
    for subtitle_format in subtitle_formats:
        tag = subtitle_format['language']
        subtitle = self._webvtt_download_subtitle_data(
            video_id, subtitle_format)
        if subtitle is None:
            # Nothing was downloaded for this track; do not create an
            # empty entry for its language.
            continue
        subtitles.setdefault(tag, []).append(subtitle)
    return subtitles
def _webvtt_download_subtitle_data(self, video_id, subtitle_format):
    """
    Download a segmented WebVTT track and merge it into one document.

    The playlist at ``subtitle_format['url']`` is fetched, every line of
    it that matches ``_REGEX_WEBVTT`` is treated as a fragment URL relative
    to the directory of ``subtitle_format['manifest_url']``, and each
    fragment that can be fetched is appended through
    ``_webvtt_write_fragment``.

    :param video_id: video identifier, used when requesting the pages
    :param subtitle_format: format dict with a ``'url'`` entry and,
        normally, a ``'manifest_url'`` entry (``'url'`` is used as the
        base when ``'manifest_url'`` is missing)
    :type subtitle_format: dict
    :return: ``{'ext': 'vtt', 'data': <merged text>}``, or None when the
        playlist could not be fetched or no fragment produced any output
    :rtype: dict or None
    """
    subs_m3u8_url = subtitle_format['url']
    urlh = self._request_webpage(subs_m3u8_url, video_id, fatal=False)
    if not urlh:
        return None
    subs_m3u8_data = urlh.read()
    if not subs_m3u8_data:
        return None
    subs_m3u8_body = subs_m3u8_data.decode(self._ENCODING)

    base_match = re.search(
        r'^(.+)/[^/]+',
        subtitle_format.get('manifest_url') or subs_m3u8_url
    )
    if not base_match:
        return None
    base_url = base_match.group(1)

    subs_body_io = StringIO()
    try:
        # splitlines() + strip() keep a trailing '\r' of CRLF playlists out
        # of the fragment URLs.
        for subs_m3u8_line in subs_m3u8_body.splitlines():
            match = self._REGEX_WEBVTT.match(subs_m3u8_line.strip())
            if not match:
                continue
            subs_fragment_url = '/'.join([base_url, match.group(1)])
            urlh = self._request_webpage(
                subs_fragment_url,
                '%s-%s' % (video_id, match.group(2)),
                fatal=False
            )
            if not urlh:
                continue
            subs_fragment_data = urlh.read()
            if not subs_fragment_data:
                continue
            # Until something has actually been written, the next fragment
            # is still the one that has to contribute the WEBVTT header.
            self._webvtt_write_fragment(
                subs_fragment_data,
                subs_body_io,
                subs_body_io.tell() == 0
            )
        subs_body = subs_body_io.getvalue()
    finally:
        subs_body_io.close()

    if not subs_body:
        # The caller skips None results, so report "nothing downloaded"
        # instead of handing back an empty subtitle document.
        return None
    return {'ext': 'vtt', 'data': subs_body}
def _webvtt_handle_one_fragment(
    self,
    webvtt_bytes,
    vtt_file,
    first_fragment=False
):
    """
    Append the content of one WebVTT fragment to ``vtt_file``.

    The fragment must start with a ``WEBVTT`` line, otherwise nothing is
    written.  That line is copied only when ``first_fragment`` is true.
    The second line is always consumed as a header line; when it is an
    ``X-TIMESTAMP-MAP`` header, its values are used to shift every cue
    timing line that follows.  Blank lines are dropped, an empty line is
    written in front of each timing line, and every other line is copied
    as it is.

    :param webvtt_bytes: raw fragment, decoded with ``_ENCODING``
    :type webvtt_bytes: bytes
    :param vtt_file: text stream that receives the output
    :type vtt_file: TextIO
    :param first_fragment: whether the ``WEBVTT`` header line is written
    :type first_fragment: bool
    :return: None
    """
    # Without an X-TIMESTAMP-MAP header both references stay at 0.0, so the
    # cue times pass through unshifted instead of failing on a None
    # reference.
    mpeg_ref_sec = 0.0
    local_ref_sec = 0.0
    # Float divisor: keeps the division exact on interpreters where '/'
    # between two ints truncates.
    pts_rate_hz = float(self._MPEG2_PTS_RATE_HZ)

    fragment_text = webvtt_bytes.decode(self._ENCODING)
    for line_index, line in enumerate(fragment_text.split('\n')):
        line = line.strip()

        if line_index == 0:
            # A byte order mark may precede the signature and is not
            # removed by strip().
            if line.lstrip('\ufeff') != 'WEBVTT':
                break
            if first_fragment:
                vtt_file.write('WEBVTT\n')
            continue

        if line_index == 1:
            match = self._REGEX_X_TIMESTAMP.match(line)
            if match:
                # NOTE(review): the 10 seconds taken off the MPEGTS value
                # are carried over unchanged; presumably the stream's
                # initial timestamp offset - confirm.
                mpeg_ref_sec = (
                    int(match.group(1)) - 10 * pts_rate_hz
                ) / pts_rate_hz
                local_ref_sec = self._webvtt_time_parts_to_float(
                    int(match.group(2)),
                    int(match.group(3)),
                    int(match.group(4)),
                    int(match.group(5))
                )
            continue

        if not line:
            continue

        match = self._REGEX_SUB_ENTRY_TIMELINE.match(line)
        if not match:
            vtt_file.write(line + '\n')
            continue

        numbers = [int(part) for part in match.groups()]
        start_sec = self._webvtt_adjust_time(
            mpeg_ref_sec,
            local_ref_sec,
            self._webvtt_time_parts_to_float(*numbers[:4])
        )
        stop_sec = self._webvtt_adjust_time(
            mpeg_ref_sec,
            local_ref_sec,
            self._webvtt_time_parts_to_float(*numbers[4:])
        )
        # Cues are separated by an empty line.
        vtt_file.write('\n')
        vtt_file.write(
            self._webvtt_make_timeline(start_sec, stop_sec) + '\n'
        )
def _webvtt_make_timeline(self, start_sec=0.0, stop_sec=0.0):
    """
    Format a cue timing line, ``HH:MM:SS.mmm --> HH:MM:SS.mmm``.

    :param start_sec: cue start time in seconds
    :type start_sec: float
    :param stop_sec: cue end time in seconds
    :type stop_sec: float
    :return: the timing line, without a trailing newline
    :rtype: str
    """
    # Join the two part tuples and use %-formatting: several '*'
    # unpackings in one call need Python 3.5+, and auto-numbered '{}'
    # fields are not available on the oldest interpreters this code base
    # supports.
    parts = (
        tuple(self._webvtt_time_float_to_parts(start_sec))
        + tuple(self._webvtt_time_float_to_parts(stop_sec))
    )
    return '%02d:%02d:%02d.%03d --> %02d:%02d:%02d.%03d' % parts
@staticmethod
def _webvtt_time_parts_to_float(
hours=0, minutes=0, seconds=0, milli_seconds=0
):
"""
:param hours:
:type hours: int
:param minutes:
:type minutes: int
:param seconds:
:type seconds: int
:param milli_seconds:
:type milli_seconds: int
:return:
:rtype: float
"""
return seconds + 60 * (minutes + 60 * hours) + milli_seconds / 1000
@staticmethod
def _webvtt_time_float_to_parts(input_sec=0.0):
"""
:param input_sec:
:type input_sec: float
:return:
:rtype: Tuple[int, int, int, int]
"""
minutes, seconds = divmod(input_sec, 60)
hours, minutes = divmod(minutes, 60)
milli_seconds: int = int(1000 * (input_sec % 1))
return int(hours), int(minutes), int(seconds), milli_seconds
def _webvtt_write_fragment(
    self,
    webvtt_data,
    output_stream,
    first_fragment=False
):
    """
    Write one WebVTT fragment to ``output_stream``.

    Delegates to ``_webvtt_handle_one_fragment``.

    :param webvtt_data: raw fragment content
    :type webvtt_data: bytes
    :param output_stream: text stream that receives the output
    :type output_stream: TextIO
    :param first_fragment: passed through unchanged to the handler
    :type first_fragment: bool
    """
    self._webvtt_handle_one_fragment(
        webvtt_bytes=webvtt_data,
        vtt_file=output_stream,
        first_fragment=first_fragment,
    )