From 8f738c28476bdbe03c0149055feaad5c65947eea Mon Sep 17 00:00:00 2001 From: Emmanuel Froissart Date: Wed, 12 Jun 2019 13:55:07 +0200 Subject: [PATCH 1/9] [tf1] fix wat id extraction(closes ytdl-org#21365) --- youtube_dl/extractor/tf1.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 903f47380..f54bd28ec 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -43,12 +43,25 @@ class TF1IE(InfoExtractor): }, { 'url': 'http://www.tf1.fr/hd1/documentaire/videos/mylene-farmer-d-une-icone.html', 'only_matching': True, + }, { + 'url': 'https://www.tf1.fr/tmc/quotidien-avec-yann-barthes/videos/quotidien-premiere-partie-11-juin-2019.html', + 'info_dict': { + 'id': '13641379', + 'ext': 'mp4', + 'title': 'Quotidien, première partie du 11 juin 2019', + 'description': 'Retrouvez l’intégralité du replay de la première partie de Quotidien du 11 juin. On parle des enfants français rapatriés de Syrie avec Salhia Brakhlia, de la décision du New York Times d’arrêter les dessins politiques avec Lilia Hassaine, on part voir les Bleues à J-1 de leur rencontre avec la ...', + 'upload_date': '20190611', + }, + 'params': { + # Sometimes wat serves the whole file with the --test option + 'skip_download': True, + }, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) wat_id = self._html_search_regex( - r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1', + r'"streamId":"(?P<id>\d{8})"', webpage, 'wat id', group='id') return self.url_result('wat:%s' % wat_id, 'Wat') From 6e6151aa5e870008920fbf552d64ed4ddca09bb0 Mon Sep 17 00:00:00 2001 From: Emmanuel Froissart Date: Wed, 12 Jun 2019 14:42:27 +0200 Subject: [PATCH 2/9] [tf1] relax wat id regex --- youtube_dl/extractor/tf1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index
f54bd28ec..638e18235 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -62,6 +62,6 @@ class TF1IE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) wat_id = self._html_search_regex( - r'"streamId":"(?P<id>\d{8})"', + r'\bstreamId\W+(?P<id>\d+)', webpage, 'wat id', group='id') return self.url_result('wat:%s' % wat_id, 'Wat') From 77d6e33ac6198a33c07fd94947348d47739cb271 Mon Sep 17 00:00:00 2001 From: Emmanuel Froissart Date: Wed, 12 Jun 2019 14:50:45 +0200 Subject: [PATCH 3/9] [tf1] reintroduce old wat id pattern --- youtube_dl/extractor/tf1.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 638e18235..f1e5035a4 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -3,6 +3,9 @@ from __future__ import unicode_literals from .common import InfoExtractor +from youtube_dl.utils import ( + RegexNotFoundError, +) class TF1IE(InfoExtractor): """TF1 uses the wat.tv player.""" @@ -61,7 +64,13 @@ class TF1IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - wat_id = self._html_search_regex( - r'\bstreamId\W+(?P<id>\d+)', - webpage, 'wat id', group='id') + try: + wat_id = self._html_search_regex( + # the old pattern.
Should no longer work as of 2019-06-12 + r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1', + webpage, 'wat id', group='id') + except RegexNotFoundError: + wat_id = self._html_search_regex( + r'\bstreamId\W+(?P<id>\d+)', + webpage, 'wat id', group='id') return self.url_result('wat:%s' % wat_id, 'Wat') From ae4df7d3469c7b4adf26c4b55d475eb1f11e9c7f Mon Sep 17 00:00:00 2001 From: Emmanuel Froissart Date: Wed, 12 Jun 2019 15:21:54 +0200 Subject: [PATCH 4/9] [tf1] proper multipattern and relaxed regex --- youtube_dl/extractor/tf1.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index f1e5035a4..e908e2327 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -64,13 +64,8 @@ class TF1IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - try: - wat_id = self._html_search_regex( - # the old pattern.
Should no longer work as of 2019-06-12 - r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1', - webpage, 'wat id', group='id') - except RegexNotFoundError: - wat_id = self._html_search_regex( - r'\bstreamId\W+(?P<id>\d+)', - webpage, 'wat id', group='id') + wat_id = self._html_search_regex( + [r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1', + r'(["\']?)streamId\1\s*:\s*["\']?(?P<id>\d+)'], + webpage, 'wat id', group='id') return self.url_result('wat:%s' % wat_id, 'Wat') From eb5d7a45d1068f9d757c8362b9742309587cfa50 Mon Sep 17 00:00:00 2001 From: Emmanuel Froissart Date: Wed, 12 Jun 2019 23:23:50 +0200 Subject: [PATCH 5/9] remove unused import --- youtube_dl/extractor/tf1.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index e908e2327..97aa601b0 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -3,9 +3,6 @@ from __future__ import unicode_literals from .common import InfoExtractor -from youtube_dl.utils import ( - RegexNotFoundError, -) class TF1IE(InfoExtractor): """TF1 uses the wat.tv player.""" From c0319bc4da9eac176c9d19dd6401b4b14c657874 Mon Sep 17 00:00:00 2001 From: Emmanuel Froissart Date: Wed, 12 Jun 2019 23:25:08 +0200 Subject: [PATCH 6/9] replaced long strings with md5 --- youtube_dl/extractor/tf1.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 97aa601b0..8e92ab626 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -48,8 +48,8 @@ class TF1IE(InfoExtractor): 'info_dict': { 'id': '13641379', 'ext': 'mp4', - 'title': 'Quotidien, première partie du 11 juin 2019', - 'description': 'Retrouvez l’intégralité du replay de la première partie de Quotidien du 11 juin.
On parle des enfants français rapatriés de Syrie avec Salhia Brakhlia, de la décision du New York Times d’arrêter les dessins politiques avec Lilia Hassaine, on part voir les Bleues à J-1 de leur rencontre avec la ...', + 'title': 'md5:f392bc52245dc5ad43771650c96fb620', + 'description': 'md5:44bc54f0a21322f5b91d68e76a544eae', 'upload_date': '20190611', }, 'params': { From acda141738fa45710205a67558e22ee5750e1dbd Mon Sep 17 00:00:00 2001 From: Emmanuel Froissart Date: Wed, 12 Jun 2019 23:27:04 +0200 Subject: [PATCH 7/9] disambiguated id patterns using the slug --- youtube_dl/extractor/tf1.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 8e92ab626..124472db0 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -61,8 +61,16 @@ class TF1IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + slug = self._search_regex( + r'(?<=/)(?P<slug>[^/]+)(?=\.html$)', + url, 'slug', group='slug', default='') wat_id = self._html_search_regex( [r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1', - r'(["\']?)streamId\1\s*:\s*["\']?(?P<id>\d+)'], + r'(["\']?)streamId\1\s*:\s*(["\']?)(?P<id>\d+)\2\s*,[^}}]*(["\']?)slug\4\s*:\s*(["\']){}\5' + .format(slug), + r'(["\']?)slug\1\s*:\s*(["\']){}\2,[^}}]*(["\']?)streamId\3\s*:\s*(["\']?)(?P<id>\d+)\4' + .format(slug), + r'(["\']?)streamId\1\s*:\s*(["\']?)(?P<id>\d+)\2' + ], webpage, 'wat id', group='id') return self.url_result('wat:%s' % wat_id, 'Wat') From c381d375108ad6ff9d035c6c9969556eb7548a9b Mon Sep 17 00:00:00 2001 From: Emmanuel Froissart Date: Fri, 14 Jun 2019 19:38:19 +0200 Subject: [PATCH 8/9] extract wat_id by parsing JSON --- youtube_dl/extractor/tf1.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 124472db0..82ae3dc29 100644 ---
a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import js_to_json + class TF1IE(InfoExtractor): """TF1 uses the wat.tv player.""" @@ -61,16 +63,22 @@ class TF1IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - slug = self._search_regex( - r'(?<=/)(?P<slug>[^/]+)(?=\.html$)', - url, 'slug', group='slug', default='') - wat_id = self._html_search_regex( - [r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1', - r'(["\']?)streamId\1\s*:\s*(["\']?)(?P<id>\d+)\2\s*,[^}}]*(["\']?)slug\4\s*:\s*(["\']){}\5' - .format(slug), - r'(["\']?)slug\1\s*:\s*(["\']){}\2,[^}}]*(["\']?)streamId\3\s*:\s*(["\']?)(?P<id>\d+)\4' - .format(slug), - r'(["\']?)streamId\1\s*:\s*(["\']?)(?P<id>\d+)\2' - ], - webpage, 'wat id', group='id') + vids_data_string = self._html_search_regex( + r'', + webpage, 'videos data string', group='vids_data_string', default=None) + wat_id = None + if vids_data_string is not None: + vids_data = self._parse_json( + vids_data_string, video_id, + transform_source=js_to_json) + video_data = [v for v in vids_data.values() + if 'slug' in v and v['slug'] == video_id] + if len(video_data) > 0 and 'streamId' in video_data[0]: + wat_id = video_data[0]['streamId'] + if wat_id is None: + wat_id = self._html_search_regex( + [r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1', + r'(["\']?)streamId\1\s*:\s*(["\']?)(?P<id>\d+)\2' + ], + webpage, 'wat id', group='id') return self.url_result('wat:%s' % wat_id, 'Wat') From c352f7438a6dabba1185b2f00d50946228250258 Mon Sep 17 00:00:00 2001 From: froiss Date: Sun, 16 Jun 2019 15:36:43 +0200 Subject: [PATCH 9/9] improved regex to extract videos data --- youtube_dl/extractor/tf1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 82ae3dc29..091350848 100644 ---
a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -64,7 +64,7 @@ class TF1IE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) vids_data_string = self._html_search_regex( - r'', + r'', webpage, 'videos data string', group='vids_data_string', default=None) wat_id = None if vids_data_string is not None: