1
0
mirror of https://github.com/l1ving/youtube-dl synced 2025-02-03 15:12:50 +08:00

[ellentube] Improve extraction of playlists.

Instead of searching the webpage for video ids and making a api
request for each video id, we now make one single api request to
get all the required information needed to extract a playlist.
This commit is contained in:
Alex Seiler 2017-10-30 01:47:49 +01:00
parent 061b47ce71
commit 5d81166f78
2 changed files with 52 additions and 39 deletions

View File

@ -1,10 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
clean_html,
int_or_none,
urljoin,
)
@ -13,24 +12,21 @@ from ..utils import (
class EllenTubeBaseIE(InfoExtractor):
API_URL = 'https://api-prod.ellentube.com/'
def _extract_from_video_id(self, video_id, display_id=None):
video_data = self._download_json(
urljoin(self.API_URL, 'ellenapi/api/item/%s' % video_id), video_id)
title = video_data['title']
description = video_data.get('description')
publish_time = int_or_none(video_data.get('publishTime'))
thumbnail = video_data.get('thumbnail')
def _extract_video_from_json(self, data, video_id, display_id=None):
title = data['title']
description = data.get('description')
publish_time = int_or_none(data.get('publishTime'))
thumbnail = data.get('thumbnail')
formats = []
duration = None
for entry in video_data.get('media'):
for entry in data.get('media'):
if entry.get('id') == 'm3u8':
formats = self._extract_m3u8_formats(entry.get(
'url'), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
formats = self._extract_m3u8_formats(
entry.get('url'), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
duration = int_or_none(entry.get('duration'))
break
self._sort_formats(formats)
return {
'id': video_id,
'title': title,
@ -42,10 +38,31 @@ class EllenTubeBaseIE(InfoExtractor):
'formats': formats,
}
def _extract_video_ids_from_api_search(self, api_search, display_id):
feed_data = self._download_json(
def _extract_playlist_entries_from_json(self, data, display_id):
return [self._extract_video_from_json(elem, elem['id'])
for elem in data if elem.get('type') == 'VIDEO']
def _extract_from_video_id(self, video_id, display_id=None):
api_data = self._download_json(
urljoin(self.API_URL, 'ellenapi/api/item/%s' % video_id), video_id)
return self._extract_video_from_json(api_data, video_id, display_id)
def _extract_playlist(self, url, display_id, extract_description=True):
webpage = self._download_webpage(url, display_id)
playlist_data = self._html_search_regex(
r'<div\s+data-component\s*=\s*"Details"(.+)</div>', webpage, 'playlist data')
playlist_title = self._search_regex(
r'"title"\s*:\s*"(.+?)"', playlist_data, 'playlist title')
playlist_description = clean_html(self._search_regex(
r'"description"\s*:\s*"(.+?)"', playlist_data, 'playlist description',
fatal=False)) if extract_description else None
api_search = self._search_regex(
r'"filter"\s*:\s*"(.+?)"', playlist_data, 'playlist api request')
api_data = self._download_json(
urljoin(self.API_URL, 'ellenapi/api/feed/?%s' % api_search), display_id)
return [entry.get('id') for entry in feed_data if entry.get('type') == 'VIDEO']
return self.playlist_result(
self._extract_playlist_entries_from_json(api_data, display_id),
display_id, playlist_title, playlist_description)
class EllenTubeVideoIE(EllenTubeBaseIE):
@ -74,41 +91,36 @@ class EllenTubeVideoIE(EllenTubeBaseIE):
return self._extract_from_video_id(video_id, display_id)
class EllenTubePlaylistIE(EllenTubeBaseIE):
_VALID_URL = r'https?://(?:www\.)?ellentube\.com/(?:episode|studios)/(?P<id>.+)\.html'
class EllenTubeEpisodeIE(EllenTubeBaseIE):
_VALID_URL = r'https?://(?:www\.)?ellentube\.com/episode/(?P<id>.+)\.html'
_TESTS = [{
_TEST = {
'url': 'https://www.ellentube.com/episode/dax-shepard-jordan-fisher-haim.html',
'info_dict': {
'id': 'dax-shepard-jordan-fisher-haim',
'title': 'Dax Shepard, \'DWTS\' Team Jordan Fisher & Lindsay Arnold, HAIM',
'description': 'md5:aed85d42892f6126e71ec5ed2aea2a0d'
},
'playlist_count': 6,
}, {
}
def _real_extract(self, url):
display_id = self._match_id(url)
return self._extract_playlist(url, display_id)
class EllenTubeStudioIE(EllenTubeBaseIE):
_VALID_URL = r'https?://(?:www\.)?ellentube\.com/studios/(?P<id>.+)\.html'
_TEST = {
'url': 'https://www.ellentube.com/studios/macey-goes-rving0.html',
'info_dict': {
'id': 'macey-goes-rving0',
'title': 'Macey Goes RVing',
},
'playlist_mincount': 3,
}]
}
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
playlist_data = self._html_search_regex(
r'<div\s+data-component\s*=\s*"Details"(.+)</div>', webpage, 'episode data')
playlist_title = self._search_regex(
r'title"\s*:\s*"(.+?)"', playlist_data, 'playlist title')
entries = [self._extract_from_video_id(m.group('vid')) for m in re.finditer(
r'pid=(?P<vid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', playlist_data)]
if not entries:
api_search = self._search_regex(
r'filter"\s*:\s*"(.+?)"', playlist_data, 'api search')
video_ids = self._extract_video_ids_from_api_search(
api_search, display_id)
entries = [self._extract_from_video_id(
vid, display_id) for vid in video_ids]
return self.playlist_result(entries, display_id, playlist_title)
return self._extract_playlist(url, display_id, False)

View File

@ -309,7 +309,8 @@ from .eighttracks import EightTracksIE
from .einthusan import EinthusanIE
from .eitb import EitbIE
from .ellentube import (
EllenTubePlaylistIE,
EllenTubeEpisodeIE,
EllenTubeStudioIE,
EllenTubeVideoIE,
)
from .elpais import ElPaisIE