From 2a42334a87810acdcfd41ae42816c11f2a9ec862 Mon Sep 17 00:00:00 2001
From: Johan Westin <johan.m.westin@gmail.com>
Date: Sun, 22 Jul 2018 20:20:19 +0200
Subject: [PATCH 1/3] Added WEBVTT subtitle support for TV4.

---
 youtube_dl/extractor/common.py |   6 +-
 youtube_dl/extractor/tv4.py    | 261 ++++++++++++++++++++++++++++++++-
 2 files changed, 257 insertions(+), 10 deletions(-)

diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index b8bbaf81a..6d6af9de8 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1606,7 +1606,7 @@ class InfoExtractor(object):
             if not (media_type and group_id and name):
                 return
             groups.setdefault(group_id, []).append(media)
-            if media_type not in ('VIDEO', 'AUDIO'):
+            if media_type not in ('AUDIO', 'SUBTITLES', 'VIDEO'):
                 return
             media_url = media.get('URI')
             if media_url:
@@ -1623,7 +1623,9 @@ class InfoExtractor(object):
                     'protocol': entry_protocol,
                     'preference': preference,
                 }
-                if media_type == 'AUDIO':
+                if media_type in ['SUBTITLES']:
+                    f['acodec'] = 'none'
+                if media_type in ['AUDIO', 'SUBTITLES']:
                     f['vcodec'] = 'none'
                 formats.append(f)
 
diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py
index 51923e44a..499c16ec4 100644
--- a/youtube_dl/extractor/tv4.py
+++ b/youtube_dl/extractor/tv4.py
@@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+from io import StringIO
 import re
 
 from .common import InfoExtractor
@@ -68,6 +69,20 @@ class TV4IE(InfoExtractor):
         }
     ]
 
+    _ENCODING = 'UTF-8'
+    # Divisor applied to the MPEGTS value of X-TIMESTAMP-MAP to get seconds.
+    _MPEG2_PTS_RATE_HZ = 90000
+    # Cue timing line "H:M:S.f <sep> H:M:S.f": groups 1-4 start, 5-8 end.
+    _REGEX_SUB_ENTRY_TIMELINE = re.compile(
+        r'^(\d+):(\d+):(\d+)\.(\d+)[^0-9]+(\d+):(\d+):(\d+)\.(\d+)$'
+    )
+    # Fragment header line: group 1 is MPEGTS, groups 2-5 the LOCAL time.
+    _REGEX_X_TIMESTAMP = re.compile(
+        r'^X-TIMESTAMP-MAP=MPEGTS:(\d+),LOCAL:(\d+):(\d+):(\d+)\.(\d+)$'
+    )
+    # Playlist line naming a fragment: group 1 the URI, group 2 its index.
+    _REGEX_WEBVTT = re.compile(r'^(.+-(\d+)\.webvtt.*)$')
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
@@ -84,32 +99,262 @@ class TV4IE(InfoExtractor):
                 'device': 'browser',
                 'protocol': 'hls',
             })['playbackItem']['manifestUrl']
-        formats = self._extract_m3u8_formats(
+        all_formats = self._extract_m3u8_formats(
             manifest_url, video_id, 'mp4',
             'm3u8_native', m3u8_id='hls', fatal=False)
-        formats.extend(self._extract_mpd_formats(
+        all_formats.extend(self._extract_mpd_formats(
             manifest_url.replace('.m3u8', '.mpd'),
             video_id, mpd_id='dash', fatal=False))
-        formats.extend(self._extract_f4m_formats(
+        all_formats.extend(self._extract_f4m_formats(
             manifest_url.replace('.m3u8', '.f4m'),
             video_id, f4m_id='hds', fatal=False))
-        formats.extend(self._extract_ism_formats(
+        all_formats.extend(self._extract_ism_formats(
             re.sub(r'\.ism/.+?\.m3u8', r'.ism/Manifest', manifest_url),
             video_id, ism_id='mss', fatal=False))
 
-        if not formats and info.get('is_geo_restricted'):
+        if not all_formats and info.get('is_geo_restricted'):
             self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
 
-        self._sort_formats(formats)
+        subtitle_formats = []
+        other_formats = []
+        for _index, _format in enumerate(all_formats):
+            if re.match(r'^.*textstream.*$', _format['format_id']):
+                subtitle_formats.append(_format)
+            else:
+                other_formats.append(_format)
+
+        self._sort_formats(other_formats)
+
+        subtitles = self._webvtt_download_all_subtitle_data(
+            video_id,
+            subtitle_formats
+        )
 
         return {
             'id': video_id,
             'title': title,
-            'formats': formats,
-            # 'subtitles': subtitles,
+            'formats': other_formats,
+            'subtitles': subtitles,
             'description': info.get('description'),
             'timestamp': parse_iso8601(info.get('broadcast_date_time')),
             'duration': int_or_none(info.get('duration')),
             'thumbnail': info.get('image'),
             'is_live': info.get('is_live') is True,
         }
+
+    @staticmethod
+    def _webvtt_adjust_time(reference_sec, ahead_sec, actual_sec):
+        """Re-base a time value: remove one offset and apply another.
+
+        :param reference_sec: offset that is added to the value
+        :type reference_sec: float
+        :param ahead_sec: offset that is subtracted from the value
+        :type ahead_sec: float
+        :param actual_sec: the time value being adjusted
+        :type actual_sec: float
+        :return: ``reference_sec - ahead_sec + actual_sec``
+        :rtype: float
+        """
+        return actual_sec + (reference_sec - ahead_sec)
+
+    def _webvtt_download_all_subtitle_data(self, video_id, subtitle_formats):
+        """Download each subtitle track and group the results by language.
+
+        Tracks that yielded no subtitle text are left out of the result.
+        """
+        subtitles = {}
+        for subtitle_format in subtitle_formats:
+            subtitle = self._webvtt_download_subtitle_data(
+                video_id, subtitle_format)
+            if subtitle and subtitle.get('data'):
+                tag = subtitle_format.get('language') or 'und'  # undetermined
+                subtitles.setdefault(tag, []).append(subtitle)
+        return subtitles
+
+    def _webvtt_download_subtitle_data(self, video_id, subtitle_format):
+        """Download one segmented WebVTT track and merge it into one file.
+
+        Fetches the subtitle playlist at subtitle_format['url'], downloads
+        every fragment it lists and appends each one through
+        _webvtt_write_fragment. All requests are made with fatal=False.
+
+        :return: {'ext': 'vtt', 'data': <merged text>}; 'data' is empty
+            when nothing could be downloaded
+        :rtype: dict
+        """
+        from ..compat import compat_urlparse
+
+        subs_m3u8_url = subtitle_format['url']
+        urlh = self._request_webpage(subs_m3u8_url, video_id, fatal=False)
+        subs_m3u8_data = urlh.read() if urlh else None
+        subs_m3u8_body = (
+            subs_m3u8_data.decode(self._ENCODING) if subs_m3u8_data else '')
+        subs_body_io = StringIO()
+        first_fragment = True
+        # splitlines() + strip() so CRLF playlists do not leak '\r' into URLs.
+        for subs_m3u8_line in subs_m3u8_body.splitlines():
+            match = self._REGEX_WEBVTT.match(subs_m3u8_line.strip())
+            if not match:
+                continue
+            # Relative URIs resolve against the playlist that lists them.
+            subs_fragment_url = compat_urlparse.urljoin(
+                subs_m3u8_url, match.group(1))
+            urlh = self._request_webpage(
+                subs_fragment_url,
+                '%s-%s' % (video_id, match.group(2)),
+                fatal=False)
+            subs_fragment_data = urlh.read() if urlh else None
+            if subs_fragment_data:
+                self._webvtt_write_fragment(
+                    subs_fragment_data, subs_body_io, first_fragment)
+                first_fragment = False
+        return {'ext': 'vtt', 'data': subs_body_io.getvalue()}
+
+    def _webvtt_handle_one_fragment(
+            self,
+            webvtt_bytes,
+            vtt_file,
+            first_fragment=False
+    ):
+        """Append one WebVTT fragment to vtt_file with re-based cue times.
+
+        The fragment must start with a 'WEBVTT' line, otherwise it is
+        ignored. The second line is consumed as the X-TIMESTAMP-MAP header
+        and is never copied to the output. Blank lines are dropped and one
+        blank line is written in front of every cue timing line.
+
+        Uses stream.write(), plain assignments and float division so that
+        the code also parses and computes correctly on Python 2, which the
+        file's ``from __future__ import unicode_literals`` suggests it targets.
+
+        :param webvtt_bytes: raw fragment, decoded with self._ENCODING
+        :type webvtt_bytes: bytes
+        :param vtt_file: text stream receiving the merged subtitles
+        :type vtt_file: TextIO
+        :param first_fragment: write the 'WEBVTT' header line when True
+        :type first_fragment: bool
+        :return: nothing, all output goes to vtt_file
+        :rtype: None
+        """
+        # Without an X-TIMESTAMP-MAP header both references stay 0.0 and the
+        # cue times are copied unchanged instead of raising a TypeError.
+        mpeg_ref_sec = 0.0
+        local_ref_sec = 0.0
+
+        for line_index, line in enumerate(
+                StringIO(webvtt_bytes.decode(self._ENCODING))):
+            line = line.strip()
+            # Line 0 is the signature; anything else is not a WebVTT file.
+            if line_index == 0:
+                if line != 'WEBVTT':
+                    break
+                if first_fragment:
+                    vtt_file.write(line + '\n')
+                continue
+            # Line 1 is always consumed, whether or not it is the timestamp
+            # mapping header.
+            if line_index == 1:
+                match = self._REGEX_X_TIMESTAMP.match(line)
+                if match:
+                    # NOTE(review): the fixed 10 second deduction is kept
+                    # from the original code; presumably the stream's
+                    # initial timestamp offset - confirm.
+                    mpeg_ref_sec = (
+                        int(match.group(1)) - 10 * self._MPEG2_PTS_RATE_HZ
+                    ) / float(self._MPEG2_PTS_RATE_HZ)
+                    local_ref_sec = self._webvtt_time_parts_to_float(
+                        int(match.group(2)),
+                        int(match.group(3)),
+                        int(match.group(4)),
+                        int(match.group(5))
+                    )
+                continue
+            # Blank separators are re-created in front of each cue below.
+            if not line:
+                continue
+            match = self._REGEX_SUB_ENTRY_TIMELINE.match(line)
+            if not match:
+                vtt_file.write(line + '\n')
+                continue
+            # Groups 1-4 are the cue start, groups 5-8 the cue end.
+            parts = [int(part) for part in match.groups()]
+            start_sec = self._webvtt_adjust_time(
+                mpeg_ref_sec, local_ref_sec,
+                self._webvtt_time_parts_to_float(*parts[:4]))
+            stop_sec = self._webvtt_adjust_time(
+                mpeg_ref_sec, local_ref_sec,
+                self._webvtt_time_parts_to_float(*parts[4:]))
+            # The leading newline separates this cue from the previous one.
+            vtt_file.write(
+                '\n' + self._webvtt_make_timeline(start_sec, stop_sec) + '\n')
+
+    def _webvtt_make_timeline(self, start_sec=0.0, stop_sec=0.0):
+        """Format a WebVTT cue timing line from two times in seconds.
+
+        :param start_sec: cue start time
+        :type start_sec: float
+        :param stop_sec: cue end time
+        :type stop_sec: float
+        :return: text of the form 'HH:MM:SS.mmm --> HH:MM:SS.mmm'
+        :rtype: str
+        """
+        # Join the two tuples before formatting: passing two '*' unpackings
+        # to one call is a syntax error on Python 2 and on Python 3 releases
+        # older than 3.5.
+        parts = (self._webvtt_time_float_to_parts(start_sec)
+                 + self._webvtt_time_float_to_parts(stop_sec))
+        return '%02d:%02d:%02d.%03d --> %02d:%02d:%02d.%03d' % parts
+
+    @staticmethod
+    def _webvtt_time_parts_to_float(
+            hours=0, minutes=0, seconds=0, milli_seconds=0):
+        """Combine split clock fields into a single number of seconds.
+
+        :param hours: hours field
+        :type hours: int
+        :param minutes: minutes field
+        :type minutes: int
+        :param seconds: seconds field
+        :type seconds: int
+        :param milli_seconds: milliseconds field
+        :type milli_seconds: int
+        :return: total time in seconds, fractional part included
+        :rtype: float
+        """
+        # 1000.0, not 1000: int / int drops the fraction on Python 2.
+        return seconds + 60 * (minutes + 60 * hours) + milli_seconds / 1000.0
+
+    @staticmethod
+    def _webvtt_time_float_to_parts(input_sec=0.0):
+        """Split a time in seconds into clock fields.
+
+        Rounds to whole milliseconds; truncation maps 1.001 to 1.000.
+
+        :param input_sec: time in seconds
+        :return: (hours, minutes, seconds, milli_seconds) as ints
+        """
+        total_ms = int(round(input_sec * 1000))
+        seconds, milli_seconds = divmod(total_ms, 1000)
+        minutes, seconds = divmod(seconds, 60)
+        hours, minutes = divmod(minutes, 60)
+        return hours, minutes, seconds, milli_seconds
+
+    def _webvtt_write_fragment(
+            self,
+            webvtt_data,
+            output_stream,
+            first_fragment=False
+    ):
+        """Forward one fragment to _webvtt_handle_one_fragment unchanged.
+
+        :param webvtt_data: fragment content handed on as webvtt_bytes
+        :type webvtt_data: bytes
+        :param output_stream: stream handed on as vtt_file
+        :type output_stream: TextIO
+        :param first_fragment: handed on under the same name
+        :type first_fragment: bool
+        """
+        self._webvtt_handle_one_fragment(
+            webvtt_bytes=webvtt_data, vtt_file=output_stream,
+            first_fragment=first_fragment)
+

From 6fe0146485b1fa183325f095d8ea3d3dfba6151a Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Mon, 23 Jul 2018 06:20:00 +0100
Subject: [PATCH 2/3] [facebook] fix tahoe request for authenticated
 users (closes #16655)

---
 youtube_dl/extractor/facebook.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index f78479b92..97cfe0fc3 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -355,7 +355,6 @@ class FacebookIE(InfoExtractor):
             tahoe_data = self._download_webpage(
                 self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id,
                 data=urlencode_postdata({
-                    '__user': 0,
                     '__a': 1,
                     '__pc': self._search_regex(
                         r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage,
@@ -363,6 +362,9 @@ class FacebookIE(InfoExtractor):
                     '__rev': self._search_regex(
                         r'client_revision["\']\s*:\s*(\d+),', webpage,
                         'client revision', default='3944515'),
+                    'fb_dtsg': self._search_regex(
+                        r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"',
+                        webpage, 'dtsg token', default=''),
                 }),
                 headers={
                     'Content-Type': 'application/x-www-form-urlencoded',

From 15d862ddf7a0d3237c392bf4c55354ae7201ff8e Mon Sep 17 00:00:00 2001
From: Johan Westin <johan.m.westin@gmail.com>
Date: Wed, 25 Jul 2018 12:56:19 +0200
Subject: [PATCH 3/3] Set version 2018.07.21-JMW.

---
 youtube_dl/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 9bf0ea30d..5fb635e40 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2018.07.21'
+__version__ = '2018.07.21-JMW'