From fa675891347fd3344a25dec2039599902959dba9 Mon Sep 17 00:00:00 2001 From: Rob van Bekkum Date: Sat, 27 Feb 2016 18:21:42 +0100 Subject: [PATCH 01/11] [LCP] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/lcp.py | 137 +++++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+) create mode 100644 youtube_dl/extractor/lcp.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1ae606f1e..8bb2631a7 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -349,6 +349,7 @@ from .kuwo import ( ) from .la7 import LA7IE from .laola1tv import Laola1TvIE +from .lcp import LcpIE from .lecture2go import Lecture2GoIE from .lemonde import LemondeIE from .letv import ( diff --git a/youtube_dl/extractor/lcp.py b/youtube_dl/extractor/lcp.py new file mode 100644 index 000000000..be586e1f1 --- /dev/null +++ b/youtube_dl/extractor/lcp.py @@ -0,0 +1,137 @@ +# coding: utf-8 +from __future__ import unicode_literals +from .common import InfoExtractor +from ..utils import ( + int_or_none +) + + +class LcpIE(InfoExtractor): + IE_NAME = 'LCP' + _VALID_URL = r'https?:\/\/(?:www\.)?lcp\.fr\/(?:[^\/]+/)*(?P[^/]+)' + + _TESTS = [{ + 'url': 'http://www.lcp.fr/la-politique-en-video/schwartzenberg-prg-preconise-francois-hollande-de-participer-une-primaire', + 'md5': 'aecf5a330cfc1061445a9af5b2df392d', + 'info_dict': { + 'id': 'd56d03e9', + 'url': 're:http://httpod.scdn.arkena.com/11970/d56d03e9_[0-9]+.mp4', + 'ext': 'mp4', + 'title': 'Schwartzenberg (PRG) préconise à François Hollande de participer à une primaire à gauche' + } + }, { + 'url': 'http://www.lcp.fr/emissions/politique-matin/271085-politique-matin', + 'md5': '6cea4f7d13810464ef8485a924fc3333', + 'info_dict': { + 'id': '327336', + 'url': 're:http://httpod.scdn.arkena.com/11970/327336_[0-9]+.mp4', + 'ext': 'mp4', + 'title': 'Politique Matin - Politique matin' + } + }] + + def _real_extract(self, url): + """Extracts the information 
for a given url and returns it in a dictionary""" + display_id = self._match_id(url) + + # Extract the web page + webpage = self._download_webpage(url, display_id) + + # Extract the required info of the media files + media_files_info = self.__extract_from_webpage(display_id, webpage) + # Some web pages embed videos from other platforms like dailymotion, therefore we pass on these URL + if media_files_info is None: + return self.url_result(self.__extract_embed_url(webpage)) + + # Extract the video formats from the media info + video_formats = self.__get_video_formats(media_files_info) + # Extract the thumbnails from the media info + video_thumbnails = self.__get_thumbnails(media_files_info) + + # Return the dictionary with the information about the video to download + return { + 'id': media_files_info['EntryName'], + 'title': self._og_search_title(webpage), + 'formats': video_formats, + 'thumbnails': video_thumbnails + } + + def __extract_from_webpage(self, display_id, webpage): + """Extracts the media info JSON object for the video for the provided web page.""" + embed_url = self.__extract_embed_url(webpage) + embed_regex = r'(?:[a-zA-Z0-9]+\.)?lcp\.fr\/embed\/(?P[A-za-z0-9]+)\/(?P[A-za-z0-9]+)\/(?P[^\/]+)' + + # Extract the identifying attributes from the embed url of the web page + clip_id = self._search_regex(embed_regex, embed_url, 'clip id', group='clip_id', fatal=False) + player_id = self._search_regex(embed_regex, embed_url, 'player id', group='player_id', fatal=False) + skin_name = self._search_regex(embed_regex, embed_url, 'skin name', group='skin_name', fatal=False) + + # Check whether the extraction of the clip id, player id or skin name + if (clip_id is None) or (player_id is None) or (skin_name is None): + return None + + # Extract the video url from the embedded player + return self.__extract_from_player(display_id, clip_id, player_id, skin_name) + + def __extract_embed_url(self, webpage): + """Extracts the embedded player url for the video.""" + 
return self._search_regex( + r']+src=(["\'])(?P.+?)\1', + webpage, 'embed url', group='url') + + def __extract_from_player(self, display_id, clip_id, player_id, skin_name): + """Extracts the JSON object containing the required media info from the embedded arkena player""" + arkena_url = 'http://play.arkena.com/config/avp/v1/player/media/{0}/{1}/{2}/?callbackMethod=?'.format(clip_id, + skin_name, + player_id) + arkena_info = self._download_webpage(arkena_url, 'clip_info_' + clip_id) + + # Extract the json containing information about the video files + arkena_info_regex = r'\?\((?P.*)\);' + info_json = self._parse_json(self._search_regex(arkena_info_regex, arkena_info, 'json', group='json'), + display_id) + + # All videos are part of a playlist, a single video is in a playlist of size 1 + media_files_info = info_json.get('Playlist') + if media_files_info is not None: + media_files_info = media_files_info[0] + return media_files_info + + def __get_thumbnails(self, media_files_info): + """Retrieves the thumbnails contained in the media info""" + thumbnails = [] + media_thumbnail_info = media_files_info.get('MediaInfo', {}).get('Poster') + if media_thumbnail_info is not None: + for thumbnail in media_thumbnail_info: + thumbnails.append({ + 'url': thumbnail.get('Url'), + 'width': int_or_none(thumbnail.get('Size')) + }) + return thumbnails + + def __get_video_formats(self, media_files_info): + """Retrieves the video formats contained in the media file info""" + formats = [] + media_files = media_files_info.get('MediaFiles') + + if media_files is not None: + formats.extend(self.__get_mp4_video_formats(media_files)) + self._sort_formats(formats) + + return formats + + def __get_mp4_video_formats(self, media_files_json): + """Retrieves all mp4 video formats contained in the media file info""" + formats = [] + mp4_files_json = media_files_json.get('Mp4') + if mp4_files_json is not None: + for video_info in mp4_files_json: + bitrate = int_or_none(video_info.get('Bitrate')) + 
if bitrate is not None: + bitrate /= 1000 # Set bitrate to KBit/s + formats.append({ + 'url': video_info.get('Url'), + 'ext': 'mp4', + 'tbr': bitrate + }) + return formats From ccdbe1e8faac13fa78950f3001d6cdbc02798c70 Mon Sep 17 00:00:00 2001 From: Rob van Bekkum Date: Sat, 27 Feb 2016 19:54:38 +0100 Subject: [PATCH 02/11] Removed redundant comments, needless escaping of slashes in regular expressions, using scale argument of int_or_none and direct use of the generic extractor for already supported embeds --- youtube_dl/extractor/lcp.py | 31 ++++++++----------------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/lcp.py b/youtube_dl/extractor/lcp.py index be586e1f1..eb3f16698 100644 --- a/youtube_dl/extractor/lcp.py +++ b/youtube_dl/extractor/lcp.py @@ -8,7 +8,7 @@ from ..utils import ( class LcpIE(InfoExtractor): IE_NAME = 'LCP' - _VALID_URL = r'https?:\/\/(?:www\.)?lcp\.fr\/(?:[^\/]+/)*(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?lcp\.fr/(?:[^\/]+/)*(?P[^/]+)' _TESTS = [{ 'url': 'http://www.lcp.fr/la-politique-en-video/schwartzenberg-prg-preconise-francois-hollande-de-participer-une-primaire', @@ -31,24 +31,18 @@ class LcpIE(InfoExtractor): }] def _real_extract(self, url): - """Extracts the information for a given url and returns it in a dictionary""" display_id = self._match_id(url) - - # Extract the web page webpage = self._download_webpage(url, display_id) - # Extract the required info of the media files + # Extract the required info of the media files gathered in a dictionary media_files_info = self.__extract_from_webpage(display_id, webpage) - # Some web pages embed videos from other platforms like dailymotion, therefore we pass on these URL + # Some web pages embed videos from other platforms like dailymotion, therefore we pass on these URLs if media_files_info is None: - return self.url_result(self.__extract_embed_url(webpage)) + return self.url_result(url, 'Generic') - # Extract the video formats from the media 
info video_formats = self.__get_video_formats(media_files_info) - # Extract the thumbnails from the media info video_thumbnails = self.__get_thumbnails(media_files_info) - # Return the dictionary with the information about the video to download return { 'id': media_files_info['EntryName'], 'title': self._og_search_title(webpage), @@ -59,22 +53,19 @@ class LcpIE(InfoExtractor): def __extract_from_webpage(self, display_id, webpage): """Extracts the media info JSON object for the video for the provided web page.""" embed_url = self.__extract_embed_url(webpage) - embed_regex = r'(?:[a-zA-Z0-9]+\.)?lcp\.fr\/embed\/(?P[A-za-z0-9]+)\/(?P[A-za-z0-9]+)\/(?P[^\/]+)' + embed_regex = r'(?:[a-zA-Z0-9]+\.)?lcp\.fr/embed/(?P[A-za-z0-9]+)/(?P[A-za-z0-9]+)/(?P[^\/]+)' - # Extract the identifying attributes from the embed url of the web page clip_id = self._search_regex(embed_regex, embed_url, 'clip id', group='clip_id', fatal=False) player_id = self._search_regex(embed_regex, embed_url, 'player id', group='player_id', fatal=False) skin_name = self._search_regex(embed_regex, embed_url, 'skin name', group='skin_name', fatal=False) - # Check whether the extraction of the clip id, player id or skin name + # Check whether the matches failed, which might be when dealing with other players (e.g., dailymotion stream) if (clip_id is None) or (player_id is None) or (skin_name is None): return None - # Extract the video url from the embedded player return self.__extract_from_player(display_id, clip_id, player_id, skin_name) def __extract_embed_url(self, webpage): - """Extracts the embedded player url for the video.""" return self._search_regex( r']+src=(["\'])(?P.+?)\1', webpage, 'embed url', group='url') @@ -86,19 +77,17 @@ class LcpIE(InfoExtractor): player_id) arkena_info = self._download_webpage(arkena_url, 'clip_info_' + clip_id) - # Extract the json containing information about the video files arkena_info_regex = r'\?\((?P.*)\);' info_json = 
self._parse_json(self._search_regex(arkena_info_regex, arkena_info, 'json', group='json'), display_id) - # All videos are part of a playlist, a single video is in a playlist of size 1 + # All videos are part of a playlist, a single video is also put in a playlist media_files_info = info_json.get('Playlist') if media_files_info is not None: media_files_info = media_files_info[0] return media_files_info def __get_thumbnails(self, media_files_info): - """Retrieves the thumbnails contained in the media info""" thumbnails = [] media_thumbnail_info = media_files_info.get('MediaInfo', {}).get('Poster') if media_thumbnail_info is not None: @@ -110,7 +99,6 @@ class LcpIE(InfoExtractor): return thumbnails def __get_video_formats(self, media_files_info): - """Retrieves the video formats contained in the media file info""" formats = [] media_files = media_files_info.get('MediaFiles') @@ -121,14 +109,11 @@ class LcpIE(InfoExtractor): return formats def __get_mp4_video_formats(self, media_files_json): - """Retrieves all mp4 video formats contained in the media file info""" formats = [] mp4_files_json = media_files_json.get('Mp4') if mp4_files_json is not None: for video_info in mp4_files_json: - bitrate = int_or_none(video_info.get('Bitrate')) - if bitrate is not None: - bitrate /= 1000 # Set bitrate to KBit/s + bitrate = int_or_none(video_info.get('Bitrate'), scale=0.001) # Scale bitrate to KBit/s formats.append({ 'url': video_info.get('Url'), 'ext': 'mp4', From e13bd53a94fe0153c7cce2a38e5c072b9e9123e0 Mon Sep 17 00:00:00 2001 From: Rob van Bekkum Date: Sat, 27 Feb 2016 21:40:05 +0100 Subject: [PATCH 03/11] Check for None through truth value checks, added test for scenario for delegating the url --- youtube_dl/extractor/lcp.py | 63 +++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/lcp.py b/youtube_dl/extractor/lcp.py index eb3f16698..9bc0e8f0b 100644 --- a/youtube_dl/extractor/lcp.py +++ 
b/youtube_dl/extractor/lcp.py @@ -28,6 +28,13 @@ class LcpIE(InfoExtractor): 'ext': 'mp4', 'title': 'Politique Matin - Politique matin' } + }, { + 'url': 'http://www.lcp.fr/le-direct', + 'info_dict': { + 'title': 'Le direct | LCP Assembl\xe9e nationale', + 'id': 'le-direct', + }, + 'playlist_mincount': 1 }] def _real_extract(self, url): @@ -35,9 +42,9 @@ class LcpIE(InfoExtractor): webpage = self._download_webpage(url, display_id) # Extract the required info of the media files gathered in a dictionary - media_files_info = self.__extract_from_webpage(display_id, webpage) + media_files_info = None #self.__extract_from_webpage(display_id, webpage) # Some web pages embed videos from other platforms like dailymotion, therefore we pass on these URLs - if media_files_info is None: + if not media_files_info: return self.url_result(url, 'Generic') video_formats = self.__get_video_formats(media_files_info) @@ -55,12 +62,12 @@ class LcpIE(InfoExtractor): embed_url = self.__extract_embed_url(webpage) embed_regex = r'(?:[a-zA-Z0-9]+\.)?lcp\.fr/embed/(?P[A-za-z0-9]+)/(?P[A-za-z0-9]+)/(?P[^\/]+)' - clip_id = self._search_regex(embed_regex, embed_url, 'clip id', group='clip_id', fatal=False) - player_id = self._search_regex(embed_regex, embed_url, 'player id', group='player_id', fatal=False) - skin_name = self._search_regex(embed_regex, embed_url, 'skin name', group='skin_name', fatal=False) + clip_id = self._search_regex(embed_regex, embed_url, 'clip id', group='clip_id', default=None) + player_id = self._search_regex(embed_regex, embed_url, 'player id', group='player_id', default=None) + skin_name = self._search_regex(embed_regex, embed_url, 'skin name', group='skin_name', default=None) # Check whether the matches failed, which might be when dealing with other players (e.g., dailymotion stream) - if (clip_id is None) or (player_id is None) or (skin_name is None): + if not clip_id or not player_id or not skin_name: return None return self.__extract_from_player(display_id, 
clip_id, player_id, skin_name) @@ -83,40 +90,42 @@ class LcpIE(InfoExtractor): # All videos are part of a playlist, a single video is also put in a playlist media_files_info = info_json.get('Playlist') - if media_files_info is not None: - media_files_info = media_files_info[0] - return media_files_info + if not media_files_info: + return None + return media_files_info[0] def __get_thumbnails(self, media_files_info): thumbnails = [] media_thumbnail_info = media_files_info.get('MediaInfo', {}).get('Poster') - if media_thumbnail_info is not None: - for thumbnail in media_thumbnail_info: - thumbnails.append({ - 'url': thumbnail.get('Url'), - 'width': int_or_none(thumbnail.get('Size')) - }) + if not media_thumbnail_info: + return None + for thumbnail in media_thumbnail_info: + thumbnails.append({ + 'url': thumbnail.get('Url'), + 'width': int_or_none(thumbnail.get('Size')) + }) return thumbnails def __get_video_formats(self, media_files_info): formats = [] media_files = media_files_info.get('MediaFiles') + if not media_files: + return None - if media_files is not None: - formats.extend(self.__get_mp4_video_formats(media_files)) - self._sort_formats(formats) - + formats.extend(self.__get_mp4_video_formats(media_files)) + self._sort_formats(formats) return formats def __get_mp4_video_formats(self, media_files_json): formats = [] mp4_files_json = media_files_json.get('Mp4') - if mp4_files_json is not None: - for video_info in mp4_files_json: - bitrate = int_or_none(video_info.get('Bitrate'), scale=0.001) # Scale bitrate to KBit/s - formats.append({ - 'url': video_info.get('Url'), - 'ext': 'mp4', - 'tbr': bitrate - }) + if not mp4_files_json: + return None + for video_info in mp4_files_json: + bitrate = int_or_none(video_info.get('Bitrate'), scale=1000) # Scale bitrate to KBit/s + formats.append({ + 'url': video_info.get('Url'), + 'ext': 'mp4', + 'tbr': bitrate + }) return formats From c7293db7b7c3fa365374789e4374bc54e039eaeb Mon Sep 17 00:00:00 2001 From: Rob van Bekkum 
Date: Sat, 27 Feb 2016 22:07:38 +0100 Subject: [PATCH 04/11] Skip formats and thumbnails missing Url --- youtube_dl/extractor/lcp.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/lcp.py b/youtube_dl/extractor/lcp.py index 9bc0e8f0b..eb4a14056 100644 --- a/youtube_dl/extractor/lcp.py +++ b/youtube_dl/extractor/lcp.py @@ -100,8 +100,11 @@ class LcpIE(InfoExtractor): if not media_thumbnail_info: return None for thumbnail in media_thumbnail_info: + thumbnail_url = thumbnail.get('Url') + if not thumbnail_url: + continue thumbnails.append({ - 'url': thumbnail.get('Url'), + 'url': thumbnail_url, 'width': int_or_none(thumbnail.get('Size')) }) return thumbnails @@ -123,8 +126,11 @@ class LcpIE(InfoExtractor): return None for video_info in mp4_files_json: bitrate = int_or_none(video_info.get('Bitrate'), scale=1000) # Scale bitrate to KBit/s + video_url = video_info.get('Url') + if not video_url: + continue formats.append({ - 'url': video_info.get('Url'), + 'url': video_url, 'ext': 'mp4', 'tbr': bitrate }) From 4e23fd0d368e0685f67d15581d9f3863850518a6 Mon Sep 17 00:00:00 2001 From: Rob van Bekkum Date: Sat, 27 Feb 2016 22:11:48 +0100 Subject: [PATCH 05/11] Restored line of code with function call to retrieve media info --- youtube_dl/extractor/lcp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/lcp.py b/youtube_dl/extractor/lcp.py index eb4a14056..315e9ab44 100644 --- a/youtube_dl/extractor/lcp.py +++ b/youtube_dl/extractor/lcp.py @@ -42,7 +42,7 @@ class LcpIE(InfoExtractor): webpage = self._download_webpage(url, display_id) # Extract the required info of the media files gathered in a dictionary - media_files_info = None #self.__extract_from_webpage(display_id, webpage) + media_files_info = self.__extract_from_webpage(display_id, webpage) # Some web pages embed videos from other platforms like dailymotion, therefore we pass on these URLs if not media_files_info: return 
self.url_result(url, 'Generic') From a4b7e38dc4a7780c9b0e82d90011d3f2a33a5a8f Mon Sep 17 00:00:00 2001 From: Rob van Bekkum Date: Sat, 27 Feb 2016 23:20:45 +0100 Subject: [PATCH 06/11] Added timestamp and description, which is the title of the webpage when the description of the page is unavailable --- youtube_dl/extractor/lcp.py | 48 +++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/lcp.py b/youtube_dl/extractor/lcp.py index 315e9ab44..608b14923 100644 --- a/youtube_dl/extractor/lcp.py +++ b/youtube_dl/extractor/lcp.py @@ -2,7 +2,8 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - int_or_none + int_or_none, + parse_iso8601 ) @@ -17,7 +18,10 @@ class LcpIE(InfoExtractor): 'id': 'd56d03e9', 'url': 're:http://httpod.scdn.arkena.com/11970/d56d03e9_[0-9]+.mp4', 'ext': 'mp4', - 'title': 'Schwartzenberg (PRG) préconise à François Hollande de participer à une primaire à gauche' + 'title': 'Schwartzenberg (PRG) préconise à François Hollande de participer à une primaire à gauche', + 'upload_date': '20160226', + 'description': 'Le président du groupe parlementaire radical, républicain, démocrate et progressiste (RRDP) y voit une bonne occasion pour le président de la République de se "relégitimer".', + 'timestamp': 1456488895 } }, { 'url': 'http://www.lcp.fr/emissions/politique-matin/271085-politique-matin', @@ -26,7 +30,10 @@ class LcpIE(InfoExtractor): 'id': '327336', 'url': 're:http://httpod.scdn.arkena.com/11970/327336_[0-9]+.mp4', 'ext': 'mp4', - 'title': 'Politique Matin - Politique matin' + 'title': 'Politique Matin - Politique matin', + 'upload_date': '20160225', + 'description': 'Politique Matin - Politique matin', + 'timestamp': 1456391602 } }, { 'url': 'http://www.lcp.fr/le-direct', @@ -42,19 +49,31 @@ class LcpIE(InfoExtractor): webpage = self._download_webpage(url, display_id) # Extract the required info of the media files gathered in a 
dictionary - media_files_info = self.__extract_from_webpage(display_id, webpage) + media_dict = self.__extract_from_webpage(display_id, webpage) # Some web pages embed videos from other platforms like dailymotion, therefore we pass on these URLs - if not media_files_info: + if not media_dict: return self.url_result(url, 'Generic') + # All videos are part of a playlist, a single video is also put in a playlist + playlist_files_info = media_dict.get('Playlist') + if not playlist_files_info: + return self.url_result(url, 'Generic') + + media_files_info = playlist_files_info[0] video_formats = self.__get_video_formats(media_files_info) video_thumbnails = self.__get_thumbnails(media_files_info) + video_timestamp = parse_iso8601(media_files_info.get('MediaInfo', {}).get('PublishDate')) + + title = self._og_search_title(webpage) + description = self._html_search_regex(self._meta_regex('description'), webpage, 'description', group='content', default=title) return { - 'id': media_files_info['EntryName'], - 'title': self._og_search_title(webpage), + 'id': media_files_info.get('EntryName'), + 'title': title, 'formats': video_formats, - 'thumbnails': video_thumbnails + 'thumbnails': video_thumbnails, + 'description': description, + 'timestamp': video_timestamp } def __extract_from_webpage(self, display_id, webpage): @@ -83,16 +102,9 @@ class LcpIE(InfoExtractor): skin_name, player_id) arkena_info = self._download_webpage(arkena_url, 'clip_info_' + clip_id) - arkena_info_regex = r'\?\((?P.*)\);' - info_json = self._parse_json(self._search_regex(arkena_info_regex, arkena_info, 'json', group='json'), - display_id) - - # All videos are part of a playlist, a single video is also put in a playlist - media_files_info = info_json.get('Playlist') - if not media_files_info: - return None - return media_files_info[0] + return self._parse_json(self._search_regex(arkena_info_regex, arkena_info, 'json', group='json'), + display_id) def __get_thumbnails(self, media_files_info): thumbnails = 
[] @@ -125,7 +137,7 @@ class LcpIE(InfoExtractor): if not mp4_files_json: return None for video_info in mp4_files_json: - bitrate = int_or_none(video_info.get('Bitrate'), scale=1000) # Scale bitrate to KBit/s + bitrate = int_or_none(video_info.get('Bitrate'), scale=1000) # Scale bitrate to KBit/s video_url = video_info.get('Url') if not video_url: continue From 620856702658e6525e93cc6b612dc611911d4f0a Mon Sep 17 00:00:00 2001 From: Rob van Bekkum Date: Sun, 28 Feb 2016 00:29:09 +0100 Subject: [PATCH 07/11] Changed _html_search_regex to _html_search_meta for retrieving description when available, otherwise leave it empty --- youtube_dl/extractor/lcp.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/lcp.py b/youtube_dl/extractor/lcp.py index 608b14923..e49744e91 100644 --- a/youtube_dl/extractor/lcp.py +++ b/youtube_dl/extractor/lcp.py @@ -32,7 +32,6 @@ class LcpIE(InfoExtractor): 'ext': 'mp4', 'title': 'Politique Matin - Politique matin', 'upload_date': '20160225', - 'description': 'Politique Matin - Politique matin', 'timestamp': 1456391602 } }, { @@ -65,7 +64,7 @@ class LcpIE(InfoExtractor): video_timestamp = parse_iso8601(media_files_info.get('MediaInfo', {}).get('PublishDate')) title = self._og_search_title(webpage) - description = self._html_search_regex(self._meta_regex('description'), webpage, 'description', group='content', default=title) + description = self._html_search_meta('description', webpage, default=None) return { 'id': media_files_info.get('EntryName'), From 26bab54ef5239dcfa3f2ff180e472f828225c08a Mon Sep 17 00:00:00 2001 From: Rob van Bekkum Date: Mon, 7 Mar 2016 01:14:18 +0100 Subject: [PATCH 08/11] Added ArkenaPlay extractor [Extraction for different formats needs to be implemented] --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/arkenaplay.py | 129 +++++++++++++++++++++++++++++ youtube_dl/extractor/lcp.py | 124 +++------------------------ 3 files changed, 143 insertions(+), 111 
deletions(-) create mode 100644 youtube_dl/extractor/arkenaplay.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 8bb2631a7..a15572d7e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -37,6 +37,7 @@ from .ard import ( ARDMediathekIE, SportschauIE, ) +from .arkenaplay import ArkenaPlayIE from .arte import ( ArteTvIE, ArteTVPlus7IE, diff --git a/youtube_dl/extractor/arkenaplay.py b/youtube_dl/extractor/arkenaplay.py new file mode 100644 index 000000000..ea6faac31 --- /dev/null +++ b/youtube_dl/extractor/arkenaplay.py @@ -0,0 +1,129 @@ +# coding: utf-8 +from __future__ import unicode_literals +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601 +) + + +class ArkenaPlayIE(InfoExtractor): + IE_NAME = 'ArkenaPlay' + _VALID_URL = r'(?Phttps?://(?:www\.)?play\..*\..*)/embed/.*(?P\d+)?/.*' + + _TESTS = [{ + 'url': 'http://play.lcp.fr/embed/327336/131064/darkmatter/0', + 'md5': '6cea4f7d13810464ef8485a924fc3333', + 'info_dict': { + 'id': '327336', + 'url': 're:http://httpod.scdn.arkena.com/11970/327336_[0-9]+.mp4', + 'ext': 'mp4', + 'title': '327336', + 'upload_date': '20160225', + 'timestamp': 1456391602 + } + }, { + 'url': 'https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411', + 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', + 'info_dict': { + 'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', + 'url': 'http://88e04ec095b07cd1aa3ea588be47e870.httpcache0.90034-httpcache0.dna.qbrick.com/90034-httpcache0/4bf759a1-00090034/bbb_sunflower_2160p_60fps_normal_720p.mp4', + 'ext': 'mp4', + 'title': 'Big Buck Bunny', + 'description': 'Royalty free test video', + 'upload_date': '20150528', + 'timestamp': 1432816365 + } + }] + + def _real_extract(self, url): + display_id = self._search_regex(self._VALID_URL, url, 'host_name', group='id') + webpage = self._download_webpage(url, display_id) + + media_url_regex = 
'"(?P(?P.*)/config/avp/.*/\?callbackMethod=\?)"' + media_url = self._html_search_regex(media_url_regex, webpage, 'arkena_media_info_url') + hostname = self._html_search_regex(media_url_regex, webpage, 'arkena_media_host', group='host') + if not hostname: + hostname = self._search_regex(self._VALID_URL, url, 'host_name', group='host') + media_url = hostname + media_url + + # Extract the required info of the media files gathered in a dictionary + arkena_info = self._download_webpage(media_url, 'arkena_info_') + arkena_info_regex = r'\?\((?P.*)\);' + media_dict = self._parse_json(self._search_regex(arkena_info_regex, arkena_info, 'json', group='json'), + display_id) + + # All videos are part of a playlist, a single video is also put in a playlist + playlist_items = media_dict.get('Playlist', []) + if len(playlist_items) == 0: + return self.url_result(url, 'Generic') + elif len(playlist_items) == 1: + arkena_media_info = playlist_items[0] + return self.__extract_from_playlistentry(arkena_media_info) + else: + entries_info = [] + for arkena_playlist_item in playlist_items: + entries_info.append(self.__extract_from_playlistentry(arkena_playlist_item)) + return { + 'id': display_id, + 'entries': entries_info + } + + def __extract_from_playlistentry(self, arkena_playlistentry_info): + formats = self.__get_video_formats(arkena_playlistentry_info) + media_info = arkena_playlistentry_info.get('MediaInfo', {}) + thumbnails = self.__get_thumbnails(media_info) + title = media_info.get('Title') + description = media_info.get('Description') + timestamp = parse_iso8601(media_info.get('PublishDate')) + return { + 'id': arkena_playlistentry_info.get('EntryName'), + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': description, + 'timestamp': timestamp + } + + def __get_thumbnails(self, arkena_mediainfo): + thumbnails = [] + thumbnails_info = arkena_mediainfo.get('Poster') + if not thumbnails_info: + return None + for thumbnail in thumbnails_info: + 
thumbnail_url = thumbnail.get('Url') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('Size')) + }) + return thumbnails + + def __get_video_formats(self, media_files_info): + formats = [] + media_files = media_files_info.get('MediaFiles') + if not media_files: + return None + + formats.extend(self.__get_mp4_video_formats(media_files)) + # TODO + self._sort_formats(formats) + return formats + + def __get_mp4_video_formats(self, media_files_json): + formats = [] + mp4_files_json = media_files_json.get('Mp4') + if not mp4_files_json: + return None + for video_info in mp4_files_json: + bitrate = int_or_none(video_info.get('Bitrate'), scale=1000) # Scale bitrate to KBit/s + video_url = video_info.get('Url') + if not video_url: + continue + formats.append({ + 'url': video_url, + 'ext': 'mp4', + 'tbr': bitrate + }) + return formats diff --git a/youtube_dl/extractor/lcp.py b/youtube_dl/extractor/lcp.py index e49744e91..22c9642a6 100644 --- a/youtube_dl/extractor/lcp.py +++ b/youtube_dl/extractor/lcp.py @@ -1,11 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_iso8601 -) - class LcpIE(InfoExtractor): IE_NAME = 'LCP' @@ -18,21 +13,20 @@ class LcpIE(InfoExtractor): 'id': 'd56d03e9', 'url': 're:http://httpod.scdn.arkena.com/11970/d56d03e9_[0-9]+.mp4', 'ext': 'mp4', - 'title': 'Schwartzenberg (PRG) préconise à François Hollande de participer à une primaire à gauche', + 'title': 'd56d03e9', 'upload_date': '20160226', - 'description': 'Le président du groupe parlementaire radical, républicain, démocrate et progressiste (RRDP) y voit une bonne occasion pour le président de la République de se "relégitimer".', 'timestamp': 1456488895 } }, { - 'url': 'http://www.lcp.fr/emissions/politique-matin/271085-politique-matin', - 'md5': '6cea4f7d13810464ef8485a924fc3333', + 'url': 'http://www.lcp.fr/emissions/parlementair', + 
'md5': '9b63769445cbe5f26952bef71f281e8c', 'info_dict': { - 'id': '327336', - 'url': 're:http://httpod.scdn.arkena.com/11970/327336_[0-9]+.mp4', + 'id': '327499', + 'url': 're:http://httpod.scdn.arkena.com/11970/327499_[0-9]+.mp4', 'ext': 'mp4', - 'title': 'Politique Matin - Politique matin', - 'upload_date': '20160225', - 'timestamp': 1456391602 + 'title': '327499', + 'upload_date': '20160304', + 'timestamp': 1457098658 } }, { 'url': 'http://www.lcp.fr/le-direct', @@ -47,102 +41,10 @@ class LcpIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - # Extract the required info of the media files gathered in a dictionary - media_dict = self.__extract_from_webpage(display_id, webpage) - # Some web pages embed videos from other platforms like dailymotion, therefore we pass on these URLs - if not media_dict: + embed_url_regex = r'"(?P(?:https?://(?:www\.)?)?play\.lcp\.fr/embed/[A-za-z0-9]+/[A-za-z0-9]+/[A-za-z0-9]+/[A-za-z0-9]+)"' + embed_url = self._html_search_regex(embed_url_regex, webpage, 'player_url', default=None, fatal=False) + if not embed_url: return self.url_result(url, 'Generic') - # All videos are part of a playlist, a single video is also put in a playlist - playlist_files_info = media_dict.get('Playlist') - if not playlist_files_info: - return self.url_result(url, 'Generic') - - media_files_info = playlist_files_info[0] - video_formats = self.__get_video_formats(media_files_info) - video_thumbnails = self.__get_thumbnails(media_files_info) - video_timestamp = parse_iso8601(media_files_info.get('MediaInfo', {}).get('PublishDate')) - - title = self._og_search_title(webpage) - description = self._html_search_meta('description', webpage, default=None) - - return { - 'id': media_files_info.get('EntryName'), - 'title': title, - 'formats': video_formats, - 'thumbnails': video_thumbnails, - 'description': description, - 'timestamp': video_timestamp - } - - def __extract_from_webpage(self, display_id, webpage): - 
"""Extracts the media info JSON object for the video for the provided web page.""" - embed_url = self.__extract_embed_url(webpage) - embed_regex = r'(?:[a-zA-Z0-9]+\.)?lcp\.fr/embed/(?P[A-za-z0-9]+)/(?P[A-za-z0-9]+)/(?P[^\/]+)' - - clip_id = self._search_regex(embed_regex, embed_url, 'clip id', group='clip_id', default=None) - player_id = self._search_regex(embed_regex, embed_url, 'player id', group='player_id', default=None) - skin_name = self._search_regex(embed_regex, embed_url, 'skin name', group='skin_name', default=None) - - # Check whether the matches failed, which might be when dealing with other players (e.g., dailymotion stream) - if not clip_id or not player_id or not skin_name: - return None - - return self.__extract_from_player(display_id, clip_id, player_id, skin_name) - - def __extract_embed_url(self, webpage): - return self._search_regex( - r']+src=(["\'])(?P.+?)\1', - webpage, 'embed url', group='url') - - def __extract_from_player(self, display_id, clip_id, player_id, skin_name): - """Extracts the JSON object containing the required media info from the embedded arkena player""" - arkena_url = 'http://play.arkena.com/config/avp/v1/player/media/{0}/{1}/{2}/?callbackMethod=?'.format(clip_id, - skin_name, - player_id) - arkena_info = self._download_webpage(arkena_url, 'clip_info_' + clip_id) - arkena_info_regex = r'\?\((?P.*)\);' - return self._parse_json(self._search_regex(arkena_info_regex, arkena_info, 'json', group='json'), - display_id) - - def __get_thumbnails(self, media_files_info): - thumbnails = [] - media_thumbnail_info = media_files_info.get('MediaInfo', {}).get('Poster') - if not media_thumbnail_info: - return None - for thumbnail in media_thumbnail_info: - thumbnail_url = thumbnail.get('Url') - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'width': int_or_none(thumbnail.get('Size')) - }) - return thumbnails - - def __get_video_formats(self, media_files_info): - formats = [] - media_files = 
media_files_info.get('MediaFiles') - if not media_files: - return None - - formats.extend(self.__get_mp4_video_formats(media_files)) - self._sort_formats(formats) - return formats - - def __get_mp4_video_formats(self, media_files_json): - formats = [] - mp4_files_json = media_files_json.get('Mp4') - if not mp4_files_json: - return None - for video_info in mp4_files_json: - bitrate = int_or_none(video_info.get('Bitrate'), scale=1000) # Scale bitrate to KBit/s - video_url = video_info.get('Url') - if not video_url: - continue - formats.append({ - 'url': video_url, - 'ext': 'mp4', - 'tbr': bitrate - }) - return formats + title = self._og_search_title(webpage, default=None) + return self.url_result(embed_url, 'ArkenaPlay', video_id=display_id, video_title=title) From 32719d16c375a7df702a36969599ea3a01e460c1 Mon Sep 17 00:00:00 2001 From: Sander van den Oever Date: Wed, 9 Mar 2016 17:26:10 +0100 Subject: [PATCH 09/11] Add flash support to Arkenaplay Extractor --- youtube_dl/extractor/arkenaplay.py | 33 +++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/arkenaplay.py b/youtube_dl/extractor/arkenaplay.py index ea6faac31..5f4f65fe2 100644 --- a/youtube_dl/extractor/arkenaplay.py +++ b/youtube_dl/extractor/arkenaplay.py @@ -70,14 +70,15 @@ class ArkenaPlayIE(InfoExtractor): } def __extract_from_playlistentry(self, arkena_playlistentry_info): - formats = self.__get_video_formats(arkena_playlistentry_info) media_info = arkena_playlistentry_info.get('MediaInfo', {}) thumbnails = self.__get_thumbnails(media_info) title = media_info.get('Title') description = media_info.get('Description') + video_id = media_info.get('VideoId') timestamp = parse_iso8601(media_info.get('PublishDate')) + formats = self.__get_video_formats(arkena_playlistentry_info, video_id) return { - 'id': arkena_playlistentry_info.get('EntryName'), + 'id': video_id, 'title': title, 'formats': formats, 'thumbnails': thumbnails, @@ -100,13 +101,15 @@ 
class ArkenaPlayIE(InfoExtractor): }) return thumbnails - def __get_video_formats(self, media_files_info): + def __get_video_formats(self, media_files_info, video_id): formats = [] media_files = media_files_info.get('MediaFiles') if not media_files: return None formats.extend(self.__get_mp4_video_formats(media_files)) + formats.extend(self.__get_m3u8_video_formats(media_files, video_id)) + formats.extend(self.__get_f4m_video_formats(media_files, video_id)) # TODO self._sort_formats(formats) return formats @@ -127,3 +130,27 @@ class ArkenaPlayIE(InfoExtractor): 'tbr': bitrate }) return formats + + def __get_m3u8_video_formats(self, media_files_json, video_id): + formats = [] + m3u8_files_json = media_files_json.get("M3u8") + if not m3u8_files_json: + return None + for video_info in m3u8_files_json: + video_url = video_info.get('Url') + if not video_url: + continue + formats = self._extract_m3u8_formats(video_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + return formats + + def __get_f4m_video_formats(self, media_files_json, video_id): + formats = [] + f4m_files_json = media_files_json.get("Flash") + if not f4m_files_json: + return None + for video_info in f4m_files_json: + video_url = video_info.get("Url") + if not video_url: + continue + formats = self._extract_f4m_formats(video_url, video_id, 'f4m', f4m_id='hds', fatal=False) + return formats From ff32acc9bd2dad5714ee4fc93ee1b12436af985a Mon Sep 17 00:00:00 2001 From: Rob van Bekkum Date: Thu, 10 Mar 2016 01:33:10 +0100 Subject: [PATCH 10/11] Fixed some bugs in ArkenaPlay Extractor and made distinction between flv and fmd --- youtube_dl/extractor/arkenaplay.py | 33 ++++++++++++++++++------------ youtube_dl/extractor/lcp.py | 15 ++------------ 2 files changed, 22 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/arkenaplay.py b/youtube_dl/extractor/arkenaplay.py index 5f4f65fe2..81c6d6e35 100644 --- a/youtube_dl/extractor/arkenaplay.py +++ b/youtube_dl/extractor/arkenaplay.py @@ -13,10 +13,10 @@ 
class ArkenaPlayIE(InfoExtractor): _TESTS = [{ 'url': 'http://play.lcp.fr/embed/327336/131064/darkmatter/0', - 'md5': '6cea4f7d13810464ef8485a924fc3333', + 'md5': '7d857b1af491ec0f6c2610e52df1ff82', 'info_dict': { 'id': '327336', - 'url': 're:http://httpod.scdn.arkena.com/11970/327336_[0-9]+.mp4', + 'url': 're:http://httpod.scdn.arkena.com/11970/327336.*', 'ext': 'mp4', 'title': '327336', 'upload_date': '20160225', @@ -40,7 +40,7 @@ class ArkenaPlayIE(InfoExtractor): display_id = self._search_regex(self._VALID_URL, url, 'host_name', group='id') webpage = self._download_webpage(url, display_id) - media_url_regex = '"(?P(?P.*)/config/avp/.*/\?callbackMethod=\?)"' + media_url_regex = '"(?P(?P.*)/(c|C)onfig/.*\?callbackMethod=\?)"' media_url = self._html_search_regex(media_url_regex, webpage, 'arkena_media_info_url') hostname = self._html_search_regex(media_url_regex, webpage, 'arkena_media_host', group='host') if not hostname: @@ -109,8 +109,8 @@ class ArkenaPlayIE(InfoExtractor): formats.extend(self.__get_mp4_video_formats(media_files)) formats.extend(self.__get_m3u8_video_formats(media_files, video_id)) - formats.extend(self.__get_f4m_video_formats(media_files, video_id)) - # TODO + formats.extend(self.__get_flash_video_formats(media_files, video_id)) + # TODO self._sort_formats(formats) return formats @@ -133,24 +133,31 @@ class ArkenaPlayIE(InfoExtractor): def __get_m3u8_video_formats(self, media_files_json, video_id): formats = [] - m3u8_files_json = media_files_json.get("M3u8") + m3u8_files_json = media_files_json.get('M3u8') if not m3u8_files_json: return None for video_info in m3u8_files_json: video_url = video_info.get('Url') if not video_url: continue - formats = self._extract_m3u8_formats(video_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) return formats - def __get_f4m_video_formats(self, media_files_json, video_id): + def 
__get_flash_video_formats(self, media_files_json, video_id): formats = [] - f4m_files_json = media_files_json.get("Flash") - if not f4m_files_json: + flash_files_json = media_files_json.get('Flash') + if not flash_files_json: return None - for video_info in f4m_files_json: - video_url = video_info.get("Url") + for video_info in flash_files_json: + video_url = video_info.get('Url') if not video_url: continue - formats = self._extract_f4m_formats(video_url, video_id, 'f4m', f4m_id='hds', fatal=False) + video_type = video_info.get('Type') + if video_type == 'application/hds+xml': + formats.extend(self._extract_f4m_formats(video_url, video_id, f4m_id='hds', fatal=False)) + elif video_type == 'video/x-flv': + formats.append({ + 'url': video_url, + 'ext': 'flv' + }) return formats diff --git a/youtube_dl/extractor/lcp.py b/youtube_dl/extractor/lcp.py index 22c9642a6..38d7502df 100644 --- a/youtube_dl/extractor/lcp.py +++ b/youtube_dl/extractor/lcp.py @@ -8,26 +8,15 @@ class LcpIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.lcp.fr/la-politique-en-video/schwartzenberg-prg-preconise-francois-hollande-de-participer-une-primaire', - 'md5': 'aecf5a330cfc1061445a9af5b2df392d', + 'md5': 'ab96c4dae94322ece1e98d97c8dc7807', 'info_dict': { 'id': 'd56d03e9', - 'url': 're:http://httpod.scdn.arkena.com/11970/d56d03e9_[0-9]+.mp4', + 'url': 're:http://httpod.scdn.arkena.com/11970/d56d03e9_.*', 'ext': 'mp4', 'title': 'd56d03e9', 'upload_date': '20160226', 'timestamp': 1456488895 } - }, { - 'url': 'http://www.lcp.fr/emissions/parlementair', - 'md5': '9b63769445cbe5f26952bef71f281e8c', - 'info_dict': { - 'id': '327499', - 'url': 're:http://httpod.scdn.arkena.com/11970/327499_[0-9]+.mp4', - 'ext': 'mp4', - 'title': '327499', - 'upload_date': '20160304', - 'timestamp': 1457098658 - } }, { 'url': 'http://www.lcp.fr/le-direct', 'info_dict': { From d4951f7489e9b84b3b76842a6c8bfa1f3b47cd44 Mon Sep 17 00:00:00 2001 From: Rob van Bekkum Date: Fri, 11 Mar 2016 01:24:51 +0100 Subject: [PATCH 
11/11] Added support for different formats (DASH mpd, webm) and refactored ArkenaPlay extractor code --- youtube_dl/extractor/arkenaplay.py | 116 +++++++++++++---------------- 1 file changed, 52 insertions(+), 64 deletions(-) diff --git a/youtube_dl/extractor/arkenaplay.py b/youtube_dl/extractor/arkenaplay.py index 81c6d6e35..0061ea196 100644 --- a/youtube_dl/extractor/arkenaplay.py +++ b/youtube_dl/extractor/arkenaplay.py @@ -5,15 +5,16 @@ from ..utils import ( int_or_none, parse_iso8601 ) +import re class ArkenaPlayIE(InfoExtractor): IE_NAME = 'ArkenaPlay' - _VALID_URL = r'(?Phttps?://(?:www\.)?play\..*\..*)/embed/.*(?P\d+)?/.*' + _VALID_URL = r'(?Parkena:(?P[0-9]+):(?P[A-Za-z0-9]+):(?P[^:]+):(?P[A-Za-z0-9]+):(?P[A-Za-z0-9]+))|(?:(?Phttps?://(?:www\.)?play\..*\..*)/embed/(?:avp/v[0-9]+/player/[A-Za-z0-9]+/)?(?P.*)?)' _TESTS = [{ 'url': 'http://play.lcp.fr/embed/327336/131064/darkmatter/0', - 'md5': '7d857b1af491ec0f6c2610e52df1ff82', + 'md5': '6cea4f7d13810464ef8485a924fc3333', 'info_dict': { 'id': '327336', 'url': 're:http://httpod.scdn.arkena.com/11970/327336.*', @@ -23,7 +24,8 @@ class ArkenaPlayIE(InfoExtractor): 'timestamp': 1456391602 } }, { - 'url': 'https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411', + # Shortcut for: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411 + 'url': 'arkena:2:media:b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe:1:129411', 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', 'info_dict': { 'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', @@ -37,15 +39,26 @@ class ArkenaPlayIE(InfoExtractor): }] def _real_extract(self, url): - display_id = self._search_regex(self._VALID_URL, url, 'host_name', group='id') - webpage = self._download_webpage(url, display_id) + mobj = re.match(self._VALID_URL, url) + if mobj.group('shortcut'): + version = mobj.group('version') + mediatype = mobj.group('mediatype') + mediaid = mobj.group('mediaId') + widgetsettingid = 
mobj.group('widgetsettingId') + accountid = mobj.group('accountId') + display_id = '{0}:{1}:{2}:{3}'.format(mediatype, mediaid, widgetsettingid, accountid) + media_url = 'https://play.arkena.com/config/avp/v{0}/player/{1}/{2}/{3}/{4}/?callbackMethod=?'.format( + version, mediatype, mediaid, widgetsettingid, accountid) + else: + display_id = self._search_regex(self._VALID_URL, url, 'host_name', group='id') + webpage = self._download_webpage(url, display_id) - media_url_regex = '"(?P(?P.*)/(c|C)onfig/.*\?callbackMethod=\?)"' - media_url = self._html_search_regex(media_url_regex, webpage, 'arkena_media_info_url') - hostname = self._html_search_regex(media_url_regex, webpage, 'arkena_media_host', group='host') - if not hostname: - hostname = self._search_regex(self._VALID_URL, url, 'host_name', group='host') - media_url = hostname + media_url + media_url_regex = '"(?P(?P.*)/(c|C)onfig/.*\?callbackMethod=\?)"' + media_url = self._html_search_regex(media_url_regex, webpage, 'arkena_media_info_url') + hostname = self._html_search_regex(media_url_regex, webpage, 'arkena_media_host', group='host') + if not hostname: + hostname = self._search_regex(self._VALID_URL, url, 'host_name', group='host') + media_url = hostname + media_url # Extract the required info of the media files gathered in a dictionary arkena_info = self._download_webpage(media_url, 'arkena_info_') @@ -107,57 +120,32 @@ class ArkenaPlayIE(InfoExtractor): if not media_files: return None - formats.extend(self.__get_mp4_video_formats(media_files)) - formats.extend(self.__get_m3u8_video_formats(media_files, video_id)) - formats.extend(self.__get_flash_video_formats(media_files, video_id)) - # TODO + for type_name, video_files_json in media_files.iteritems(): + for video_info in video_files_json: + video_url = video_info.get('Url') + if not video_url: + continue + type = video_info.get('Type') + if type_name in ['Mp4', 'WebM', 'Flash']: + bitrate = int_or_none(video_info.get('Bitrate'), scale=1000) + ext = None + 
if type == 'video/mp4': + ext = 'mp4' + elif type == 'video/webm': + ext = 'webm' + elif type == 'video/x-flv': + ext = 'flv' + formats.append({ + 'url': video_url, + 'ext': ext, + 'tbr': bitrate + }) + elif type_name == 'M3u8' and type == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + elif type_name == 'Flash' and type == 'application/hds+xml': + formats.extend(self._extract_f4m_formats(video_url, video_id, f4m_id='hds', fatal=False)) + elif type_name == 'Dash' and type == 'application/dash+xml': + formats.extend(self._extract_mpd_formats(video_url, video_id, mpd_id='dash', fatal=False)) + self._sort_formats(formats) - return formats - - def __get_mp4_video_formats(self, media_files_json): - formats = [] - mp4_files_json = media_files_json.get('Mp4') - if not mp4_files_json: - return None - for video_info in mp4_files_json: - bitrate = int_or_none(video_info.get('Bitrate'), scale=1000) # Scale bitrate to KBit/s - video_url = video_info.get('Url') - if not video_url: - continue - formats.append({ - 'url': video_url, - 'ext': 'mp4', - 'tbr': bitrate - }) - return formats - - def __get_m3u8_video_formats(self, media_files_json, video_id): - formats = [] - m3u8_files_json = media_files_json.get('M3u8') - if not m3u8_files_json: - return None - for video_info in m3u8_files_json: - video_url = video_info.get('Url') - if not video_url: - continue - formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) - return formats - - def __get_flash_video_formats(self, media_files_json, video_id): - formats = [] - flash_files_json = media_files_json.get('Flash') - if not flash_files_json: - return None - for video_info in flash_files_json: - video_url = video_info.get('Url') - if not video_url: - continue - video_type = video_info.get('Type') - if video_type == 'application/hds+xml': - formats.extend(self._extract_f4m_formats(video_url, video_id, 
f4m_id='hds', fatal=False)) - elif video_type == 'video/x-flv': - formats.append({ - 'url': video_url, - 'ext': 'flv' - }) - return formats + return formats \ No newline at end of file