From 60098e149109a8e612a6571704503c256e5a3de9 Mon Sep 17 00:00:00 2001 From: Christophe de Vienne Date: Thu, 11 Jan 2018 23:02:20 +0100 Subject: [PATCH 1/7] [mytaratata] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/mytaratata.py | 68 ++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 youtube_dl/extractor/mytaratata.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 37624d37a..706bf3b7a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -631,6 +631,7 @@ from .musicplayon import MusicPlayOnIE from .mwave import MwaveIE, MwaveMeetGreetIE from .myspace import MySpaceIE, MySpaceAlbumIE from .myspass import MySpassIE +from .mytaratata import MyTaratataIE from .myvi import MyviIE from .myvidster import MyVidsterIE from .nationalgeographic import ( diff --git a/youtube_dl/extractor/mytaratata.py b/youtube_dl/extractor/mytaratata.py new file mode 100644 index 000000000..44e32a42f --- /dev/null +++ b/youtube_dl/extractor/mytaratata.py @@ -0,0 +1,68 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class MyTaratataIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mytaratata\.com/taratata/(?P[a-zA-Z0-9_\-/]+)' + _TEST = { + 'url': 'http://mytaratata.com/taratata/519/shaka-ponk-camille-et-julie-berthollet-smells-like-teen-spirit-nirvana', + 'md5': 'c2876e18716b350c9de69cfda2662919', + 'info_dict': { + 'id': '519/shaka-ponk-camille-et-julie-berthollet-smells-like-teen-spirit-nirvana', + 'ext': 'mp4', + 'title': 'Taratata - Shaka Ponk / Camille et Julie Berthollet "Smells Like Teen Spirit" (Nirvana)', + 'uploader': 'Taratata', + 'description': 'Shaka Ponk / Camille et Julie Berthollet "Smells Like Teen Spirit" (Nirvana)', + # 'thumbnail': r're:^https?://.*\.jpg$', + # TODO more properties, either as: + # * A value + # * MD5 checksum; start the string with 
md5: + # * A regular expression; start the string with re: + # * Any Python type (for example int or float) + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # TODO more code goes here, for example ... + title = self._html_search_regex(r'(.+?)', webpage, 'title') + + formats = [] + + video_source_re = re.compile( + r'data-source="(?Phttp://videos.air-productions.cdn.sfr.net' + r'/mytaratata/Taratata[^"]+\.mp4)"' + ) + + last_vid = None + for url in video_source_re.findall(webpage): + info_m = re.match(r'.*(?P[0-9]+)-[a-f0-9]+-(?P[0-9]+)x(?P[0-9]+)\.mp4', url) + if info_m is None: + continue + vid = info_m.group('vid') + w = info_m.group('w') + h = info_m.group('h') + if last_vid is None: + last_vid = vid + if vid != last_vid: + break + + formats.append({'url': url, 'width': int(w), 'height': int(h)}) + + formats = list(sorted(formats, key=lambda f: f['width'])) + + return { + 'id': video_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'uploader': "Taratata", + # TODO more properties (see youtube_dl/extractor/common.py) + 'formats': formats, + } From 63c3a458cb3df4e182338d0ec4b5b700edf4c3f7 Mon Sep 17 00:00:00 2001 From: Christophe de Vienne Date: Fri, 12 Jan 2018 13:20:11 +0100 Subject: [PATCH 2/7] [mytaratata] Improve title & id extraction, add thumbnail --- youtube_dl/extractor/mytaratata.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/mytaratata.py b/youtube_dl/extractor/mytaratata.py index 44e32a42f..b4a7e49c4 100644 --- a/youtube_dl/extractor/mytaratata.py +++ b/youtube_dl/extractor/mytaratata.py @@ -11,11 +11,11 @@ class MyTaratataIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?mytaratata\.com/taratata/(?P[a-zA-Z0-9_\-/]+)' _TEST = { 'url': 'http://mytaratata.com/taratata/519/shaka-ponk-camille-et-julie-berthollet-smells-like-teen-spirit-nirvana', - 'md5': 
'c2876e18716b350c9de69cfda2662919', + 'md5': '99657330eb7dec6d63a329d7f26ec93e', 'info_dict': { - 'id': '519/shaka-ponk-camille-et-julie-berthollet-smells-like-teen-spirit-nirvana', + 'id': '7174', 'ext': 'mp4', - 'title': 'Taratata - Shaka Ponk / Camille et Julie Berthollet "Smells Like Teen Spirit" (Nirvana)', + 'title': u'TARATATA N°519 - Shaka Ponk / Camille et Julie Berthollet "Smells Like Teen Spirit" (Nirvana)', 'uploader': 'Taratata', 'description': 'Shaka Ponk / Camille et Julie Berthollet "Smells Like Teen Spirit" (Nirvana)', # 'thumbnail': r're:^https?://.*\.jpg$', @@ -31,8 +31,8 @@ class MyTaratataIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - # TODO more code goes here, for example ... - title = self._html_search_regex(r'(.+?)', webpage, 'title') + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) formats = [] @@ -43,7 +43,7 @@ class MyTaratataIE(InfoExtractor): last_vid = None for url in video_source_re.findall(webpage): - info_m = re.match(r'.*(?P[0-9]+)-[a-f0-9]+-(?P[0-9]+)x(?P[0-9]+)\.mp4', url) + info_m = re.match(r'.*/(?P[0-9]+)-[a-f0-9]+-(?P[0-9]+)x(?P[0-9]+)\.mp4', url) if info_m is None: continue vid = info_m.group('vid') @@ -54,15 +54,22 @@ class MyTaratataIE(InfoExtractor): if vid != last_vid: break - formats.append({'url': url, 'width': int(w), 'height': int(h)}) + formats.append({ + 'url': url, + 'width': int(w), + 'height': int(h), + }) formats = list(sorted(formats, key=lambda f: f['width'])) return { - 'id': video_id, - 'title': title, - 'description': self._og_search_description(webpage), + 'id': last_vid, + 'title': '%s - %s' % (title, description), + 'description': description, + # TODO Improve the filename, id, title. 
'uploader': "Taratata", - # TODO more properties (see youtube_dl/extractor/common.py) 'formats': formats, + 'thumbnails': [ + {'url': self._og_search_thumbnail(webpage)}, + ], } From 52f319ddd0485cf8ba602925be63edf2cebcc828 Mon Sep 17 00:00:00 2001 From: Christophe de Vienne Date: Fri, 12 Jan 2018 13:40:06 +0100 Subject: [PATCH 3/7] Cleanup --- youtube_dl/extractor/mytaratata.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/mytaratata.py b/youtube_dl/extractor/mytaratata.py index b4a7e49c4..52733594d 100644 --- a/youtube_dl/extractor/mytaratata.py +++ b/youtube_dl/extractor/mytaratata.py @@ -18,12 +18,7 @@ class MyTaratataIE(InfoExtractor): 'title': u'TARATATA N°519 - Shaka Ponk / Camille et Julie Berthollet "Smells Like Teen Spirit" (Nirvana)', 'uploader': 'Taratata', 'description': 'Shaka Ponk / Camille et Julie Berthollet "Smells Like Teen Spirit" (Nirvana)', - # 'thumbnail': r're:^https?://.*\.jpg$', - # TODO more properties, either as: - # * A value - # * MD5 checksum; start the string with md5: - # * A regular expression; start the string with re: - # * Any Python type (for example int or float) + 'thumbnail': 'http://static.mytaratata.com/content/image/5a2562a1a5ee5.jpeg', } } @@ -41,6 +36,8 @@ class MyTaratataIE(InfoExtractor): r'/mytaratata/Taratata[^"]+\.mp4)"' ) + # The first videos are the live videos, coming in 2 formats. The next videos are + # bonuses, multi-cams... that we won't download. last_vid = None for url in video_source_re.findall(webpage): info_m = re.match(r'.*/(?P[0-9]+)-[a-f0-9]+-(?P[0-9]+)x(?P[0-9]+)\.mp4', url) @@ -49,9 +46,12 @@ class MyTaratataIE(InfoExtractor): vid = info_m.group('vid') w = info_m.group('w') h = info_m.group('h') + if last_vid is None: last_vid = vid + if vid != last_vid: + # We found a new video, not another format of the same. Stops here. 
break formats.append({ @@ -66,10 +66,7 @@ class MyTaratataIE(InfoExtractor): 'id': last_vid, 'title': '%s - %s' % (title, description), 'description': description, - # TODO Improve the filename, id, title. 'uploader': "Taratata", 'formats': formats, - 'thumbnails': [ - {'url': self._og_search_thumbnail(webpage)}, - ], + 'thumbnail': self._og_search_thumbnail(webpage), } From 553b8b28f97636dd34b029e71bd324cda4b79452 Mon Sep 17 00:00:00 2001 From: Christophe de Vienne Date: Sun, 14 Jan 2018 23:07:16 +0100 Subject: [PATCH 4/7] [mytaratata] Cleanup Cleanup a few things based on the PR review. --- youtube_dl/extractor/mytaratata.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/mytaratata.py b/youtube_dl/extractor/mytaratata.py index 52733594d..e21972249 100644 --- a/youtube_dl/extractor/mytaratata.py +++ b/youtube_dl/extractor/mytaratata.py @@ -15,7 +15,7 @@ class MyTaratataIE(InfoExtractor): 'info_dict': { 'id': '7174', 'ext': 'mp4', - 'title': u'TARATATA N°519 - Shaka Ponk / Camille et Julie Berthollet "Smells Like Teen Spirit" (Nirvana)', + 'title': 'TARATATA N°519 - Shaka Ponk / Camille et Julie Berthollet "Smells Like Teen Spirit" (Nirvana)', 'uploader': 'Taratata', 'description': 'Shaka Ponk / Camille et Julie Berthollet "Smells Like Teen Spirit" (Nirvana)', 'thumbnail': 'http://static.mytaratata.com/content/image/5a2562a1a5ee5.jpeg', @@ -26,21 +26,25 @@ class MyTaratataIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + # The title contains only the program name and episode number. + # Each episode contains many videos title = self._og_search_title(webpage) + # The description is the title of the video within the episode.
description = self._og_search_description(webpage) formats = [] video_source_re = re.compile( - r'data-source="(?Phttp://videos.air-productions.cdn.sfr.net' - r'/mytaratata/Taratata[^"]+\.mp4)"' + r'data-source="(?Phttp://[^/]*/mytaratata/Taratata[^"]+\.mp4)"' ) # The first videos are the live videos, coming in 2 formats. The next videos are # bonuses, multi-cams... that we won't download. last_vid = None - for url in video_source_re.findall(webpage): - info_m = re.match(r'.*/(?P[0-9]+)-[a-f0-9]+-(?P[0-9]+)x(?P[0-9]+)\.mp4', url) + for video_url in video_source_re.findall(webpage): + info_m = re.match( + r'.*/(?P[0-9]+)-[a-f0-9]+-(?P[0-9]+)x(?P[0-9]+)\.mp4', + video_url) if info_m is None: continue vid = info_m.group('vid') @@ -55,12 +59,12 @@ class MyTaratataIE(InfoExtractor): break formats.append({ - 'url': url, + 'url': video_url, 'width': int(w), 'height': int(h), }) - formats = list(sorted(formats, key=lambda f: f['width'])) + self._sort_formats(formats) return { 'id': last_vid, From 202846d5c70d2eb1a1bd3817eb577e6049ea10bb Mon Sep 17 00:00:00 2001 From: Christophe de Vienne Date: Mon, 15 Jan 2018 09:40:44 +0100 Subject: [PATCH 5/7] [mytaratata] Use more relaxed regex --- youtube_dl/extractor/mytaratata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/mytaratata.py b/youtube_dl/extractor/mytaratata.py index e21972249..542653a7d 100644 --- a/youtube_dl/extractor/mytaratata.py +++ b/youtube_dl/extractor/mytaratata.py @@ -18,7 +18,7 @@ class MyTaratataIE(InfoExtractor): 'title': 'TARATATA N°519 - Shaka Ponk / Camille et Julie Berthollet "Smells Like Teen Spirit" (Nirvana)', 'uploader': 'Taratata', 'description': 'Shaka Ponk / Camille et Julie Berthollet "Smells Like Teen Spirit" (Nirvana)', - 'thumbnail': 'http://static.mytaratata.com/content/image/5a2562a1a5ee5.jpeg', + 'thumbnail': 're:https?://.*\.jpeg$', } } @@ -35,7 +35,7 @@ class MyTaratataIE(InfoExtractor): formats = [] video_source_re = re.compile( - 
r'data-source="(?Phttp://[^/]*/mytaratata/Taratata[^"]+\.mp4)"' + r'
]*class="jwplayer" [^\>]*data-source="(?Phttps?://.*/Taratata[^"]+\.mp4)"' ) # The first videos are the live videos, coming in 2 formats. The next videos are From 8d1e047e861ca0d27a2a3e609b7e6fe47f1f76b8 Mon Sep 17 00:00:00 2001 From: Christophe de Vienne Date: Mon, 22 Jan 2018 16:56:18 +0100 Subject: [PATCH 6/7] Use a more relaxed regex for video links --- youtube_dl/extractor/mytaratata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mytaratata.py b/youtube_dl/extractor/mytaratata.py index 542653a7d..0e6f67626 100644 --- a/youtube_dl/extractor/mytaratata.py +++ b/youtube_dl/extractor/mytaratata.py @@ -35,7 +35,7 @@ class MyTaratataIE(InfoExtractor): formats = [] video_source_re = re.compile( - r'
]*class="jwplayer" [^\>]*data-source="(?Phttps?://.*/Taratata[^"]+\.mp4)"' + r'
]*class="jwplayer" [^\>]*data-source="(?P[^"]+)"' ) # The first videos are the live videos, coming in 2 formats. The next videos are From 275853a4d2741afe88bc36637de263c51507c250 Mon Sep 17 00:00:00 2001 From: Christophe de Vienne Date: Mon, 22 Jan 2018 17:00:07 +0100 Subject: [PATCH 7/7] Remove the 'uploader' static value --- youtube_dl/extractor/mytaratata.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/mytaratata.py b/youtube_dl/extractor/mytaratata.py index 0e6f67626..281e919da 100644 --- a/youtube_dl/extractor/mytaratata.py +++ b/youtube_dl/extractor/mytaratata.py @@ -16,7 +16,6 @@ class MyTaratataIE(InfoExtractor): 'id': '7174', 'ext': 'mp4', 'title': 'TARATATA N°519 - Shaka Ponk / Camille et Julie Berthollet "Smells Like Teen Spirit" (Nirvana)', - 'uploader': 'Taratata', 'description': 'Shaka Ponk / Camille et Julie Berthollet "Smells Like Teen Spirit" (Nirvana)', 'thumbnail': 're:https?://.*\.jpeg$', } @@ -70,7 +69,6 @@ class MyTaratataIE(InfoExtractor): 'id': last_vid, 'title': '%s - %s' % (title, description), 'description': description, - 'uploader': "Taratata", 'formats': formats, 'thumbnail': self._og_search_thumbnail(webpage), }