From 2a42334a87810acdcfd41ae42816c11f2a9ec862 Mon Sep 17 00:00:00 2001
From: Johan Westin
Date: Sun, 22 Jul 2018 20:20:19 +0200
Subject: [PATCH 1/3] Added WEBVTT subtitle support for TV4.

---
Review notes (this section is ignored by git am):
- Line structure restored; context indentation was re-derived from the
  surrounding youtube-dl sources and should be checked against the base tree.
- The blob ids after ".." on the index lines are those of the original
  submission and no longer describe the revised post-image.
- Revised relative to the original submission: escaped the "." in the
  timestamp regexes, removed Python 3 only syntax (variable annotations,
  print(..., file=...), double star-unpacking), made the millisecond
  conversion round instead of truncate, and made a missing X-TIMESTAMP-MAP
  header or language tag non-fatal.
- The common.py change exposes SUBTITLES renditions as formats to every
  HLS-based extractor; only TV4 filters them out again here.

 youtube_dl/extractor/common.py |   6 +-
 youtube_dl/extractor/tv4.py    | 212 +++++++++++++++++++++++++++++++--
 2 files changed, 208 insertions(+), 10 deletions(-)

diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index b8bbaf81a..6d6af9de8 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1606,7 +1606,7 @@ class InfoExtractor(object):
             if not (media_type and group_id and name):
                 return
             groups.setdefault(group_id, []).append(media)
-            if media_type not in ('VIDEO', 'AUDIO'):
+            if media_type not in ('AUDIO', 'SUBTITLES', 'VIDEO'):
                 return
             media_url = media.get('URI')
             if media_url:
@@ -1623,7 +1623,9 @@ class InfoExtractor(object):
                     'protocol': entry_protocol,
                     'preference': preference,
                 }
-                if media_type == 'AUDIO':
+                if media_type == 'SUBTITLES':
+                    f['acodec'] = 'none'
+                if media_type in ('AUDIO', 'SUBTITLES'):
                     f['vcodec'] = 'none'
                 formats.append(f)
 
diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py
index 51923e44a..499c16ec4 100644
--- a/youtube_dl/extractor/tv4.py
+++ b/youtube_dl/extractor/tv4.py
@@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+from io import StringIO
 import re
 
 from .common import InfoExtractor
@@ -68,6 +69,21 @@ class TV4IE(InfoExtractor):
         }
     ]
 
+    # Encoding used to decode subtitle playlists and WebVTT fragments.
+    _ENCODING = 'UTF-8'
+    # Tick rate of the MPEG-2 presentation timestamps in X-TIMESTAMP-MAP.
+    _MPEG2_PTS_RATE_HZ = 90000
+    # Cue timing line, e.g. "00:00:01.000 --> 00:00:04.000".
+    _REGEX_SUB_ENTRY_TIMELINE = re.compile(
+        r'^(\d+):(\d+):(\d+)\.(\d+)[^0-9]+(\d+):(\d+):(\d+)\.(\d+)$'
+    )
+    # HLS header mapping WebVTT local time to an MPEG-2 timestamp.
+    _REGEX_X_TIMESTAMP = re.compile(
+        r'^X-TIMESTAMP-MAP=MPEGTS:(\d+),LOCAL:(\d+):(\d+):(\d+)\.(\d+)$'
+    )
+    # Playlist line naming a numbered WebVTT fragment.
+    _REGEX_WEBVTT = re.compile(r'^(.+-(\d+)\.webvtt.*)$')
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
@@ -84,32 +100,212 @@ class TV4IE(InfoExtractor):
                 'device': 'browser',
                 'protocol': 'hls',
             })['playbackItem']['manifestUrl']
-        formats = self._extract_m3u8_formats(
+        all_formats = self._extract_m3u8_formats(
             manifest_url, video_id, 'mp4',
             'm3u8_native', m3u8_id='hls', fatal=False)
-        formats.extend(self._extract_mpd_formats(
+        all_formats.extend(self._extract_mpd_formats(
             manifest_url.replace('.m3u8', '.mpd'),
             video_id, mpd_id='dash', fatal=False))
-        formats.extend(self._extract_f4m_formats(
+        all_formats.extend(self._extract_f4m_formats(
             manifest_url.replace('.m3u8', '.f4m'),
             video_id, f4m_id='hds', fatal=False))
-        formats.extend(self._extract_ism_formats(
+        all_formats.extend(self._extract_ism_formats(
             re.sub(r'\.ism/.+?\.m3u8', r'.ism/Manifest', manifest_url),
             video_id, ism_id='mss', fatal=False))
 
-        if not formats and info.get('is_geo_restricted'):
+        if not all_formats and info.get('is_geo_restricted'):
             self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
 
-        self._sort_formats(formats)
+        # HLS text streams are subtitle renditions, not playable formats.
+        subtitle_formats = []
+        other_formats = []
+        for fmt in all_formats:
+            if 'textstream' in (fmt.get('format_id') or ''):
+                subtitle_formats.append(fmt)
+            else:
+                other_formats.append(fmt)
+
+        self._sort_formats(other_formats)
+
+        subtitles = self._webvtt_download_all_subtitle_data(
+            video_id, subtitle_formats)
 
         return {
             'id': video_id,
             'title': title,
-            'formats': formats,
-            # 'subtitles': subtitles,
+            'formats': other_formats,
+            'subtitles': subtitles,
             'description': info.get('description'),
             'timestamp': parse_iso8601(info.get('broadcast_date_time')),
             'duration': int_or_none(info.get('duration')),
             'thumbnail': info.get('image'),
             'is_live': info.get('is_live') is True,
         }
+
+    @staticmethod
+    def _webvtt_adjust_time(reference_sec, ahead_sec, actual_sec):
+        """Shift a cue time from fragment-local time to stream time.
+
+        :param reference_sec: stream time of the fragment's sync point
+        :param ahead_sec: fragment-local time of the same sync point
+        :param actual_sec: fragment-local cue time to convert
+        :return: the cue time on the stream timeline, in seconds
+        :rtype: float
+        """
+        return reference_sec - ahead_sec + actual_sec
+
+    def _webvtt_download_all_subtitle_data(self, video_id, subtitle_formats):
+        """Download every subtitle rendition, grouped by language tag.
+
+        :return: dict mapping language tag to a list of subtitle dicts
+        :rtype: dict
+        """
+        subtitles = {}
+        for subtitle_format in subtitle_formats:
+            subtitle = self._webvtt_download_subtitle_data(
+                video_id, subtitle_format)
+            if subtitle is None:
+                continue
+            # LANGUAGE is optional in the playlist; never use None as a key.
+            tag = subtitle_format.get('language') or 'und'
+            subtitles.setdefault(tag, []).append(subtitle)
+        return subtitles
+
+    def _webvtt_download_subtitle_data(self, video_id, subtitle_format):
+        """Fetch a subtitle playlist and merge its fragments into one file.
+
+        :return: subtitle dict with 'ext' and 'data', or None if nothing
+            could be downloaded
+        :rtype: dict or None
+        """
+        urlh = self._request_webpage(
+            subtitle_format['url'], video_id, fatal=False)
+        playlist = urlh.read().decode(self._ENCODING) if urlh else ''
+        # Fragment names are resolved against the master manifest directory.
+        base_match = re.search(
+            r'^(.+)/[^/]+', subtitle_format.get('manifest_url') or '')
+        if not base_match:
+            return None
+        base_url = base_match.group(1)
+        vtt_file = StringIO()
+        first_fragment = True
+        for playlist_line in playlist.splitlines():
+            match = self._REGEX_WEBVTT.match(playlist_line)
+            if not match:
+                continue
+            urlh = self._request_webpage(
+                '%s/%s' % (base_url, match.group(1)),
+                '%s-%s' % (video_id, match.group(2)), fatal=False)
+            fragment = urlh.read() if urlh else None
+            if fragment:
+                self._webvtt_write_fragment(
+                    fragment, vtt_file, first_fragment)
+                first_fragment = False
+        data = vtt_file.getvalue()
+        vtt_file.close()
+        # An empty result is a failed download, not a subtitle track.
+        return {'ext': 'vtt', 'data': data} if data else None
+
+    def _webvtt_handle_one_fragment(
+            self, webvtt_bytes, vtt_file, first_fragment=False):
+        """Append one WebVTT fragment to vtt_file with rebased cue times.
+
+        :param webvtt_bytes: raw fragment as downloaded
+        :type webvtt_bytes: bytes
+        :param vtt_file: text stream receiving the merged subtitles
+        :param first_fragment: write the WEBVTT header line when true
+        :type first_fragment: bool
+        """
+        # Without an X-TIMESTAMP-MAP header both references stay at zero
+        # and cue times are copied through unchanged.
+        mpeg_ref_sec = 0.0
+        local_ref_sec = 0.0
+
+        lines = webvtt_bytes.decode(self._ENCODING).splitlines()
+        for line_index, line in enumerate(lines):
+            line = line.strip()
+            if line_index == 0:
+                if line != 'WEBVTT':
+                    # Not a WebVTT fragment; ignore it entirely.
+                    break
+                if first_fragment:
+                    vtt_file.write(line + '\n')
+                continue
+            if line_index == 1:
+                match = self._REGEX_X_TIMESTAMP.match(line)
+                if match:
+                    # NOTE(review): the fixed 10 s offset is taken from the
+                    # original submission; confirm it holds for all streams.
+                    mpeg_ref_sec = (
+                        int(match.group(1)) - 10 * self._MPEG2_PTS_RATE_HZ
+                    ) / float(self._MPEG2_PTS_RATE_HZ)
+                    local_ref_sec = self._webvtt_time_parts_to_float(
+                        *(int(part) for part in match.groups()[1:]))
+                    continue
+            if not line:
+                continue
+            match = self._REGEX_SUB_ENTRY_TIMELINE.match(line)
+            if match:
+                parts = [int(part) for part in match.groups()]
+                start_sec = self._webvtt_adjust_time(
+                    mpeg_ref_sec, local_ref_sec,
+                    self._webvtt_time_parts_to_float(*parts[:4]))
+                stop_sec = self._webvtt_adjust_time(
+                    mpeg_ref_sec, local_ref_sec,
+                    self._webvtt_time_parts_to_float(*parts[4:]))
+                # A blank line separates each cue from the previous one.
+                line = '\n' + self._webvtt_make_timeline(start_sec, stop_sec)
+            vtt_file.write(line + '\n')
+
+    def _webvtt_make_timeline(self, start_sec=0.0, stop_sec=0.0):
+        """Format a WebVTT cue timing line.
+
+        :param start_sec: cue start in seconds
+        :type start_sec: float
+        :param stop_sec: cue end in seconds
+        :type stop_sec: float
+        :return: e.g. '00:00:01.000 --> 00:00:04.000'
+        :rtype: str
+        """
+        return '%02d:%02d:%02d.%03d --> %02d:%02d:%02d.%03d' % (
+            self._webvtt_time_float_to_parts(start_sec)
+            + self._webvtt_time_float_to_parts(stop_sec))
+
+    @staticmethod
+    def _webvtt_time_parts_to_float(
+            hours=0, minutes=0, seconds=0, milli_seconds=0):
+        """Combine clock components into a number of seconds.
+
+        :rtype: float
+        """
+        # Divide by a float so Python 2 does not truncate the milliseconds.
+        return seconds + 60 * (minutes + 60 * hours) + milli_seconds / 1000.0
+
+    @staticmethod
+    def _webvtt_time_float_to_parts(input_sec=0.0):
+        """Split seconds into (hours, minutes, seconds, milliseconds).
+
+        :param input_sec: time in seconds
+        :type input_sec: float
+        :rtype: tuple
+        """
+        # Round once to whole milliseconds; truncating the float fraction
+        # turns e.g. 1.001 into 1.000. Negative times are clamped to zero.
+        total_ms = max(0, int(round(input_sec * 1000)))
+        total_sec, milli_seconds = divmod(total_ms, 1000)
+        minutes, seconds = divmod(total_sec, 60)
+        hours, minutes = divmod(minutes, 60)
+        return hours, minutes, seconds, milli_seconds
+
+    def _webvtt_write_fragment(
+            self, webvtt_data, output_stream, first_fragment=False):
+        """Append one downloaded fragment to output_stream.
+
+        :param webvtt_data: raw fragment as downloaded
+        :type webvtt_data: bytes
+        :param output_stream: text stream receiving the merged subtitles
+        :param first_fragment: write the WEBVTT header line when true
+        :type first_fragment: bool
+        """
+        self._webvtt_handle_one_fragment(
+            webvtt_data, output_stream, first_fragment)

From 6fe0146485b1fa183325f095d8ea3d3dfba6151a Mon Sep 17 00:00:00 2001
From: Remita Amine
Date: Mon, 23 Jul 2018 06:20:00 +0100
Subject: [PATCH 2/3] [facebook] fix tahoe request for authenticated users(closes #16655)

---
 youtube_dl/extractor/facebook.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index f78479b92..97cfe0fc3 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -355,7 +355,6 @@ class FacebookIE(InfoExtractor):
             tahoe_data = self._download_webpage(
                 self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id,
                 data=urlencode_postdata({
-                    '__user': 0,
                     '__a': 1,
                     '__pc': self._search_regex(
                         r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage,
@@ -363,6 +362,9 @@ class FacebookIE(InfoExtractor):
                     '__rev': self._search_regex(
                         r'client_revision["\']\s*:\s*(\d+),', webpage,
                         'client revision', default='3944515'),
+                    'fb_dtsg': self._search_regex(
+                        r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"',
+                        webpage, 'dtsg token', default=''),
                 }),
                 headers={
                     'Content-Type': 'application/x-www-form-urlencoded',

From 15d862ddf7a0d3237c392bf4c55354ae7201ff8e Mon Sep 17 00:00:00 2001
From: Johan Westin
Date: Wed, 25 Jul 2018 12:56:19 +0200
Subject: [PATCH 3/3] Set version 2018.07.21-JMW.

---
 youtube_dl/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 9bf0ea30d..5fb635e40 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2018.07.21'
+__version__ = '2018.07.21-JMW'