1
0
mirror of https://github.com/l1ving/youtube-dl synced 2025-02-09 11:42:54 +08:00

Merge branch 'master' into Vimeo-issue-16717

This commit is contained in:
Parmjit Virk 2018-07-28 19:58:51 -05:00
commit 899387c453
20 changed files with 633 additions and 225 deletions

View File

@ -6,8 +6,8 @@
---
### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.07.21*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with an outdated version will be rejected.
- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.07.21**
### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.07.29*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with an outdated version will be rejected.
- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.07.29**
### Before submitting an *issue* make sure you have:
- [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl
[debug] User config: []
[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
[debug] youtube-dl version 2018.07.21
[debug] youtube-dl version 2018.07.29
[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
[debug] Proxy map: {}

View File

@ -239,3 +239,10 @@ Martin Weinelt
Surya Oktafendri
TingPing
Alexandre Macabies
Bastian de Groot
Niklas Haas
András Veres-Szentkirályi
Enes Solak
Nathan Rossi
Thomas van der Berg
Luca Cherubin

View File

@ -1,3 +1,21 @@
version 2018.07.29
Extractors
* [crunchyroll:playlist] Restrict URL regular expression (#17069, #17076)
+ [pornhub] Add support for subtitles (#16924, #17088)
* [ceskatelevize] Use https for API call (#16997, #16999)
* [dailymotion:playlist] Fix extraction (#16894)
* [ted] Improve extraction
* [ted] Fix extraction for videos without nativeDownloads (#16756, #17085)
* [telecinco] Fix extraction (#17080)
* [mitele] Reduce number of requests
* [rai] Return non HTTP relinker URL intact (#17055)
* [vk] Fix extraction for inline only videos (#16923)
* [streamcloud] Fix extraction (#17054)
* [facebook] Fix tahoe player extraction with authentication (#16655)
+ [puhutv] Add support for puhutv.com (#12712, #16010, #16269)
version 2018.07.21
Core

View File

@ -870,7 +870,7 @@ Either prepend `https://www.youtube.com/watch?v=` or separate the ID from the op
Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`.
In order to extract cookies from a browser, use any conforming browser extension for exporting cookies. For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [Export Cookies](https://addons.mozilla.org/en-US/firefox/addon/export-cookies/) (for Firefox).
In order to extract cookies from a browser, use any conforming browser extension for exporting cookies. For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/) (for Firefox).
Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows and `LF` (`\n`) for Unix and Unix-like systems (Linux, macOS, etc.). `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format.

View File

@ -672,6 +672,8 @@
- **PrimeShareTV**
- **PromptFile**
- **prosiebensat1**: ProSiebenSat.1 Digital
- **puhutv**
- **puhutv:serie**
- **Puls4**
- **Pyvideo**
- **qqmusic**: QQ音乐

View File

@ -108,7 +108,7 @@ class CeskaTelevizeIE(InfoExtractor):
for user_agent in (None, USER_AGENTS['Safari']):
req = sanitized_Request(
'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
data=urlencode_postdata(data))
req.add_header('Content-type', 'application/x-www-form-urlencoded')

View File

@ -262,6 +262,9 @@ class CrunchyrollIE(CrunchyrollBaseIE):
# Just test metadata extraction
'skip_download': True,
},
}, {
'url': 'http://www.crunchyroll.com/media-723735',
'only_matching': True,
}]
_FORMAT_IDS = {
@ -580,7 +583,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
IE_NAME = 'crunchyroll:playlist'
_VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?(?:\?|$)'
_VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login|media-\d+))(?P<id>[\w\-]+))/?(?:\?|$)'
_TESTS = [{
'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals
import base64
import functools
import hashlib
import itertools
import json
@ -16,11 +17,13 @@ from ..utils import (
error_to_compat_str,
ExtractorError,
int_or_none,
mimetype2ext,
OnDemandPagedList,
parse_iso8601,
sanitized_Request,
str_to_int,
unescapeHTML,
mimetype2ext,
urlencode_postdata,
)
@ -343,17 +346,93 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
IE_NAME = 'dailymotion:playlist'
_VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>[^/?#&]+)'
_MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"'
_PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s'
_VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>x[0-9a-z]+)'
_TESTS = [{
'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q',
'info_dict': {
'title': 'SPORT',
'id': 'xv4bw_nqtv_sport',
'id': 'xv4bw',
},
'playlist_mincount': 20,
}]
_PAGE_SIZE = 100
def _fetch_page(self, playlist_id, authorizaion, page):
    """Yield url_result entries for one page of a playlist.

    ``page`` is the zero-based index supplied by the paged-list helper;
    the GraphQL request is issued with ``page + 1``. ``authorizaion``
    (sic) is sent verbatim as the ``Authorization`` request header.
    """
    page_number = page + 1
    # The query text is kept exactly as it appears in the source.
    query = '''{
collection(xid: "%s") {
videos(first: %d, page: %d) {
pageInfo {
hasNextPage
nextPage
}
edges {
node {
xid
url
}
}
}
}
}''' % (playlist_id, self._PAGE_SIZE, page_number)
    request_headers = {
        'Authorization': authorizaion,
        'Origin': 'https://www.dailymotion.com',
    }
    response = self._download_json(
        'https://graphql.api.dailymotion.com',
        playlist_id, 'Downloading page %d' % page_number,
        data=json.dumps({'query': query}).encode(),
        headers=request_headers)
    videos = response['data']['collection']['videos']
    for edge in videos['edges']:
        node = edge['node']
        yield self.url_result(
            node['url'], DailymotionIE.ie_key(), node['xid'])
def _real_extract(self, url):
    """Build a lazily paged playlist result for a Dailymotion playlist URL.

    The playlist page is downloaded to read the embedded player config,
    whose ``context.api`` section supplies the OAuth endpoint and client
    credentials (hard-coded fallbacks are used for any missing key). The
    resulting token is passed to ``_fetch_page`` for every page request.
    """
    playlist_id = self._match_id(url)
    webpage = self._download_webpage(url, playlist_id)

    player_config = self._search_regex(
        r'__PLAYER_CONFIG__\s*=\s*({.+?});',
        webpage, 'player config')
    api = self._parse_json(player_config, playlist_id)['context']['api']

    auth_url = api.get(
        'auth_url', 'https://graphql.api.dailymotion.com/oauth/token')
    credentials = {
        'client_id': api.get('client_id', 'f1a362d288c1b98099c7'),
        'client_secret': api.get('client_secret', 'eea605b96e01c796ff369935357eca920c5da4c5'),
        'grant_type': 'client_credentials',
    }
    auth = self._download_json(
        auth_url, playlist_id, data=urlencode_postdata(credentials))

    # Header value of the form "<token_type> <access_token>".
    token_type = auth.get('token_type', 'Bearer')
    authorization = '%s %s' % (token_type, auth['access_token'])

    page_fetcher = functools.partial(
        self._fetch_page, playlist_id, authorization)
    entries = OnDemandPagedList(page_fetcher, self._PAGE_SIZE)

    playlist_title = self._og_search_title(webpage)
    return self.playlist_result(entries, playlist_id, playlist_title)
class DailymotionUserIE(DailymotionBaseInfoExtractor):
IE_NAME = 'dailymotion:user'
_VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)'
_MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"'
_PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
_TESTS = [{
'url': 'https://www.dailymotion.com/user/nqtv',
'info_dict': {
'id': 'nqtv',
'title': 'Rémi Gaillard',
},
'playlist_mincount': 100,
}, {
'url': 'http://www.dailymotion.com/user/UnderProject',
'info_dict': {
'id': 'UnderProject',
'title': 'UnderProject',
},
'playlist_mincount': 1800,
'expected_warnings': [
'Stopped at duplicated page',
],
'skip': 'Takes too long time',
}]
def _extract_entries(self, id):
video_ids = set()
@ -379,43 +458,6 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
if re.search(self._MORE_PAGES_INDICATOR, webpage) is None:
break
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
playlist_id = mobj.group('id')
webpage = self._download_webpage(url, playlist_id)
return {
'_type': 'playlist',
'id': playlist_id,
'title': self._og_search_title(webpage),
'entries': self._extract_entries(playlist_id),
}
class DailymotionUserIE(DailymotionPlaylistIE):
IE_NAME = 'dailymotion:user'
_VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)'
_PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
_TESTS = [{
'url': 'https://www.dailymotion.com/user/nqtv',
'info_dict': {
'id': 'nqtv',
'title': 'Rémi Gaillard',
},
'playlist_mincount': 100,
}, {
'url': 'http://www.dailymotion.com/user/UnderProject',
'info_dict': {
'id': 'UnderProject',
'title': 'UnderProject',
},
'playlist_mincount': 1800,
'expected_warnings': [
'Stopped at duplicated page',
],
'skip': 'Takes too long time',
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
user = mobj.group('user')

View File

@ -860,6 +860,10 @@ from .pornhub import (
from .pornotube import PornotubeIE
from .pornovoisines import PornoVoisinesIE
from .pornoxo import PornoXOIE
from .puhutv import (
PuhuTVIE,
PuhuTVSerieIE,
)
from .presstv import PressTVIE
from .primesharetv import PrimeShareTVIE
from .promptfile import PromptFileIE

View File

@ -355,7 +355,6 @@ class FacebookIE(InfoExtractor):
tahoe_data = self._download_webpage(
self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id,
data=urlencode_postdata({
'__user': 0,
'__a': 1,
'__pc': self._search_regex(
r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage,
@ -363,6 +362,9 @@ class FacebookIE(InfoExtractor):
'__rev': self._search_regex(
r'client_revision["\']\s*:\s*(\d+),', webpage,
'client revision', default='3944515'),
'fb_dtsg': self._search_regex(
r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"',
webpage, 'dtsg token', default=''),
}),
headers={
'Content-Type': 'application/x-www-form-urlencoded',

View File

@ -1,84 +1,14 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import uuid
from .common import InfoExtractor
from .ooyala import OoyalaIE
from ..compat import (
compat_str,
compat_urlparse,
)
from ..utils import (
int_or_none,
extract_attributes,
determine_ext,
smuggle_url,
parse_duration,
)
class MiTeleBaseIE(InfoExtractor):
def _get_player_info(self, url, webpage):
player_data = extract_attributes(self._search_regex(
r'(?s)(<ms-video-player.+?</ms-video-player>)',
webpage, 'ms video player'))
video_id = player_data['data-media-id']
if player_data.get('data-cms-id') == 'ooyala':
return self.url_result(
'ooyala:%s' % video_id, ie=OoyalaIE.ie_key(), video_id=video_id)
config_url = compat_urlparse.urljoin(url, player_data['data-config'])
config = self._download_json(
config_url, video_id, 'Downloading config JSON')
mmc_url = config['services']['mmc']
duration = None
formats = []
for m_url in (mmc_url, mmc_url.replace('/flash.json', '/html5.json')):
mmc = self._download_json(
m_url, video_id, 'Downloading mmc JSON')
if not duration:
duration = int_or_none(mmc.get('duration'))
for location in mmc['locations']:
gat = self._proto_relative_url(location.get('gat'), 'http:')
gcp = location.get('gcp')
ogn = location.get('ogn')
if None in (gat, gcp, ogn):
continue
token_data = {
'gcp': gcp,
'ogn': ogn,
'sta': 0,
}
media = self._download_json(
gat, video_id, data=json.dumps(token_data).encode('utf-8'),
headers={
'Content-Type': 'application/json;charset=utf-8',
'Referer': url,
})
stream = media.get('stream') or media.get('file')
if not stream:
continue
ext = determine_ext(stream)
if ext == 'f4m':
formats.extend(self._extract_f4m_formats(
stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
video_id, f4m_id='hds', fatal=False))
elif ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
stream, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
self._sort_formats(formats)
return {
'id': video_id,
'formats': formats,
'thumbnail': player_data.get('data-poster') or config.get('poster', {}).get('imageUrl'),
'duration': duration,
}
class MiTeleIE(InfoExtractor):
IE_DESC = 'mitele.es'
_VALID_URL = r'https?://(?:www\.)?mitele\.es/(?:[^/]+/)+(?P<id>[^/]+)/player'
@ -86,7 +16,7 @@ class MiTeleIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.mitele.es/programas-tv/diario-de/57b0dfb9c715da65618b4afa/player',
'info_dict': {
'id': '57b0dfb9c715da65618b4afa',
'id': 'FhYW1iNTE6J6H7NkQRIEzfne6t2quqPg',
'ext': 'mp4',
'title': 'Tor, la web invisible',
'description': 'md5:3b6fce7eaa41b2d97358726378d9369f',
@ -104,7 +34,7 @@ class MiTeleIE(InfoExtractor):
# no explicit title
'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/57b0de3dc915da14058b4876/player',
'info_dict': {
'id': '57b0de3dc915da14058b4876',
'id': 'oyNG1iNTE6TAPP-JmCjbwfwJqqMMX3Vq',
'ext': 'mp4',
'title': 'Cuarto Milenio Temporada 6 Programa 226',
'description': 'md5:5ff132013f0cd968ffbf1f5f3538a65f',
@ -128,40 +58,21 @@ class MiTeleIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
gigya_url = self._search_regex(
r'<gigya-api>[^>]*</gigya-api>[^>]*<script\s+src="([^"]*)">[^>]*</script>',
webpage, 'gigya', default=None)
gigya_sc = self._download_webpage(
compat_urlparse.urljoin('http://www.mitele.es/', gigya_url),
video_id, 'Downloading gigya script')
# Get a appKey/uuid for getting the session key
appKey = self._search_regex(
r'constant\s*\(\s*["\']_appGridApplicationKey["\']\s*,\s*["\']([0-9a-f]+)',
gigya_sc, 'appKey')
session_json = self._download_json(
'https://appgrid-api.cloud.accedo.tv/session',
video_id, 'Downloading session keys', query={
'appKey': appKey,
'uuid': compat_str(uuid.uuid4()),
})
paths = self._download_json(
'https://appgrid-api.cloud.accedo.tv/metadata/general_configuration,%20web_configuration',
video_id, 'Downloading paths JSON',
query={'sessionKey': compat_str(session_json['sessionKey'])})
'https://www.mitele.es/amd/agp/web/metadata/general_configuration',
video_id, 'Downloading paths JSON')
ooyala_s = paths['general_configuration']['api_configuration']['ooyala_search']
base_url = ooyala_s.get('base_url', 'cdn-search-mediaset.carbyne.ps.ooyala.com')
full_path = ooyala_s.get('full_path', '/search/v1/full/providers/')
source = self._download_json(
'http://%s%s%s/docs/%s' % (
ooyala_s['base_url'], ooyala_s['full_path'],
ooyala_s['provider_id'], video_id),
'%s://%s%s%s/docs/%s' % (
ooyala_s.get('protocol', 'https'), base_url, full_path,
ooyala_s.get('provider_id', '104951'), video_id),
video_id, 'Downloading data JSON', query={
'include_titles': 'Series,Season',
'product_name': 'test',
'product_name': ooyala_s.get('product_name', 'test'),
'format': 'full',
})['hits']['hits'][0]['_source']

View File

@ -18,6 +18,7 @@ from ..utils import (
orderedSet,
remove_quotes,
str_to_int,
url_or_none,
)
@ -68,6 +69,31 @@ class PornHubIE(InfoExtractor):
'params': {
'skip_download': True,
},
}, {
# subtitles
'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7',
'info_dict': {
'id': 'ph5af5fef7c2aa7',
'ext': 'mp4',
'title': 'BFFS - Cute Teen Girls Share Cock On the Floor',
'uploader': 'BFFs',
'duration': 622,
'view_count': int,
'like_count': int,
'dislike_count': int,
'comment_count': int,
'age_limit': 18,
'tags': list,
'categories': list,
'subtitles': {
'en': [{
"ext": 'srt'
}]
},
},
'params': {
'skip_download': True,
},
}, {
'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
'only_matching': True,
@ -139,12 +165,19 @@ class PornHubIE(InfoExtractor):
video_urls = []
video_urls_set = set()
subtitles = {}
flashvars = self._parse_json(
self._search_regex(
r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
video_id)
if flashvars:
subtitle_url = url_or_none(flashvars.get('closedCaptionsFile'))
if subtitle_url:
subtitles.setdefault('en', []).append({
'url': subtitle_url,
'ext': 'srt',
})
thumbnail = flashvars.get('image_url')
duration = int_or_none(flashvars.get('video_duration'))
media_definitions = flashvars.get('mediaDefinitions')
@ -256,6 +289,7 @@ class PornHubIE(InfoExtractor):
'age_limit': 18,
'tags': tags,
'categories': categories,
'subtitles': subtitles,
}

View File

@ -0,0 +1,247 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
compat_str,
)
from ..utils import (
ExtractorError,
int_or_none,
float_or_none,
parse_resolution,
str_or_none,
try_get,
unified_timestamp,
url_or_none,
urljoin,
)
class PuhuTVIE(InfoExtractor):
    """Extractor for single puhutv.com watch pages (URLs ending in ``-izle``)."""

    _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-izle'
    IE_NAME = 'puhutv'
    _TESTS = [{
        # film
        'url': 'https://puhutv.com/sut-kardesler-izle',
        'md5': 'fbd8f2d8e7681f8bcd51b592475a6ae7',
        'info_dict': {
            'id': '5085',
            'display_id': 'sut-kardesler',
            'ext': 'mp4',
            'title': 'Süt Kardeşler',
            'description': 'md5:405fd024df916ca16731114eb18e511a',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 4832.44,
            'creator': 'Arzu Film',
            'timestamp': 1469778212,
            'upload_date': '20160729',
            'release_year': 1976,
            'view_count': int,
            'tags': ['Aile', 'Komedi', 'Klasikler'],
        },
    }, {
        # episode, geo restricted, bypassable with --geo-verification-proxy
        'url': 'https://puhutv.com/jet-sosyete-1-bolum-izle',
        'only_matching': True,
    }, {
        # 4k, with subtitles
        'url': 'https://puhutv.com/dip-1-bolum-izle',
        'only_matching': True,
    }]
    # Maps the language labels used by the API to language codes; labels
    # not listed here are used as the subtitle key unchanged.
    _SUBTITLE_LANGS = {
        'English': 'en',
        'Deutsch': 'de',
        'عربى': 'ar'
    }

    def _real_extract(self, url):
        """Resolve a watch page URL into a single-video info dict.

        Downloads the slug metadata first, then the asset's video list
        (an HTTP 403 on that request is reported as geo restriction), and
        assembles formats, thumbnails, tags and subtitles from the two
        JSON documents.
        """
        display_id = self._match_id(url)

        slug_api_url = urljoin(url, '/api/slug/%s-izle' % display_id)
        info = self._download_json(slug_api_url, display_id)['data']

        video_id = compat_str(info['id'])
        title = info.get('name') or info['title']['name']
        display_name = info.get('display_name')
        if display_name:
            title = '%s %s' % (title, display_name)

        try:
            videos = self._download_json(
                'https://puhutv.com/api/assets/%s/videos' % video_id,
                display_id, 'Downloading video JSON',
                headers=self.geo_verification_headers())
        except ExtractorError as error:
            cause = error.cause
            if isinstance(cause, compat_HTTPError) and cause.code == 403:
                self.raise_geo_restricted()
            raise

        formats = []
        for video in videos['data']['videos']:
            media_url = url_or_none(video.get('url'))
            if not media_url:
                continue

            is_playlist = video.get('is_playlist')
            if is_playlist is True and video.get('stream_type') == 'hls':
                # Master playlist: expand it into its variant formats.
                formats.extend(self._extract_m3u8_formats(
                    media_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False))
                continue

            # Single rendition: only non-playlist HLS and plain mp4 are kept.
            video_format = video.get('video_format')
            if video_format == 'hls' and is_playlist is False:
                format_id, protocol = 'hls', 'm3u8_native'
            elif video_format == 'mp4':
                format_id, protocol = 'http', None
            else:
                continue

            height = int_or_none(video.get('quality'))
            if height:
                format_id += '-%sp' % height

            fmt = {
                'url': media_url,
                'ext': 'mp4',
                'height': height,
                'format_id': format_id,
            }
            if protocol is not None:
                fmt['protocol'] = protocol
            formats.append(fmt)
        self._sort_formats(formats)

        # Thumbnails come as a mapping of image id -> image URL.
        wide_images = try_get(
            info, lambda x: x['content']['images']['wide'], dict) or {}
        thumbnails = []
        for image_id, image_url in wide_images.items():
            if not isinstance(image_url, compat_str):
                continue
            if not image_url.startswith(('http', '//')):
                image_url = 'https://%s' % image_url
            thumbnail = parse_resolution(image_id)
            thumbnail['id'] = image_id
            thumbnail['url'] = image_url
            thumbnails.append(thumbnail)

        tags = []
        genres = try_get(info, lambda x: x['title']['genres'], list) or []
        for genre in genres:
            if not isinstance(genre, dict):
                continue
            genre_name = genre.get('name')
            if not genre_name or not isinstance(genre_name, compat_str):
                continue
            tags.append(genre_name)

        subtitles = {}
        subtitle_items = try_get(
            info, lambda x: x['content']['subtitles'], list) or []
        for subtitle in subtitle_items:
            if not isinstance(subtitle, dict):
                continue
            lang = subtitle.get('language')
            sub_url = url_or_none(subtitle.get('url'))
            if not (lang and isinstance(lang, compat_str) and sub_url):
                continue
            lang_key = self._SUBTITLE_LANGS.get(lang, lang)
            subtitles[lang_key] = [{
                'url': sub_url
            }]

        duration_ms = try_get(
            info, lambda x: x['content']['duration_in_ms'], int)

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': try_get(
                info, lambda x: x['title']['description'],
                compat_str) or info.get('description'),
            'season_id': str_or_none(info.get('season_id')),
            'season_number': int_or_none(info.get('season_number')),
            'episode_number': int_or_none(info.get('episode_number')),
            'release_year': try_get(
                info, lambda x: x['title']['released_at'], int),
            'timestamp': unified_timestamp(info.get('created_at')),
            'creator': try_get(
                info, lambda x: x['title']['producer']['name'], compat_str),
            'view_count': try_get(
                info, lambda x: x['content']['watch_count'], int),
            'duration': float_or_none(duration_ms, scale=1000),
            'tags': tags,
            'subtitles': subtitles,
            'thumbnails': thumbnails,
            'formats': formats
        }
class PuhuTVSerieIE(InfoExtractor):
    """Extractor for puhutv.com detail pages (URLs ending in ``-detay``)."""

    _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-detay'
    IE_NAME = 'puhutv:serie'
    _TESTS = [{
        'url': 'https://puhutv.com/deniz-yildizi-detay',
        'info_dict': {
            'title': 'Deniz Yıldızı',
            'id': 'deniz-yildizi',
        },
        'playlist_mincount': 205,
    }, {
        # a film detail page that uses the same URL scheme as a series page
        'url': 'https://puhutv.com/kaybedenler-kulubu-detay',
        'only_matching': True,
    }]

    def _extract_entries(self, seasons):
        """Yield a url_result for every episode of every season.

        Seasons without an ``id`` are skipped. Each season is fetched page
        by page (40 per request) until the response's ``hasMore`` field is
        no longer ``True``.
        """
        for season in seasons:
            season_id = season.get('id')
            if not season_id:
                continue
            page = 1
            while True:
                season_page = self._download_json(
                    'https://galadriel.puhutv.com/seasons/%s' % season_id,
                    season_id, 'Downloading page %s' % page, query={
                        'page': page,
                        'per': 40,
                    })
                episodes = season_page.get('episodes')
                if not isinstance(episodes, list):
                    episodes = []
                for episode in episodes:
                    slug_path = str_or_none(episode.get('slugPath'))
                    if not slug_path:
                        continue
                    episode_id = str_or_none(int_or_none(episode.get('id')))
                    episode_title = (
                        episode.get('name') or episode.get('eventLabel'))
                    yield self.url_result(
                        'https://puhutv.com/%s' % slug_path,
                        ie=PuhuTVIE.ie_key(), video_id=episode_id,
                        video_title=episode_title)
                page += 1
                if season_page.get('hasMore') is not True:
                    break

    def _real_extract(self, url):
        """Return a playlist for a series, or delegate a film to PuhuTVIE."""
        playlist_id = self._match_id(url)

        detail_api_url = urljoin(url, '/api/slug/%s-detay' % playlist_id)
        info = self._download_json(detail_api_url, playlist_id)['data']

        seasons = info.get('seasons')
        if not seasons:
            # Films share the detail-page URL scheme with series; hand them
            # over to the single-video extractor via their watch-page URL.
            video_id = info.get('slug') or info['assets'][0]['slug']
            return self.url_result(
                'https://puhutv.com/%s-izle' % video_id,
                PuhuTVIE.ie_key(), video_id)

        return self.playlist_result(
            self._extract_entries(seasons), playlist_id, info.get('name'))

View File

@ -32,6 +32,9 @@ class RaiBaseIE(InfoExtractor):
_GEO_BYPASS = False
def _extract_relinker_info(self, relinker_url, video_id):
if not re.match(r'https?://', relinker_url):
return {'formats': [{'url': relinker_url}]}
formats = []
geoprotection = None
is_live = None
@ -369,6 +372,10 @@ class RaiIE(RaiBaseIE):
'params': {
'skip_download': True,
},
}, {
# Direct MMS URL
'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html',
'only_matching': True,
}]
def _extract_from_content_id(self, content_id, url):

View File

@ -72,4 +72,7 @@ class StreamcloudIE(InfoExtractor):
'title': title,
'url': video_url,
'thumbnail': thumbnail,
'http_headers': {
'Referer': url,
},
}

View File

@ -7,8 +7,10 @@ from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
float_or_none,
int_or_none,
try_get,
url_or_none,
)
@ -30,7 +32,7 @@ class TEDIE(InfoExtractor):
'''
_TESTS = [{
'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
'md5': '0de43ac406aa3e4ea74b66c9c7789b13',
'md5': 'b0ce2b05ca215042124fbc9e3886493a',
'info_dict': {
'id': '102',
'ext': 'mp4',
@ -42,24 +44,30 @@ class TEDIE(InfoExtractor):
'uploader': 'Dan Dennett',
'width': 853,
'duration': 1308,
}
}, {
'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
'md5': 'b899ac15e345fb39534d913f7606082b',
'info_dict': {
'id': 'tSVI8ta_P4w',
'ext': 'mp4',
'title': 'Vishal Sikka: The beauty and power of algorithms',
'thumbnail': r're:^https?://.+\.jpg',
'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4',
'upload_date': '20140122',
'uploader_id': 'TEDInstitute',
'uploader': 'TED Institute',
'view_count': int,
'comment_count': int,
'tags': list,
},
'params': {
'skip_download': True,
},
}, {
# missing HTTP bitrates
'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms',
'info_dict': {
'id': '6069',
'ext': 'mp4',
'title': 'The beauty and power of algorithms',
'thumbnail': r're:^https?://.+\.jpg',
'description': 'md5:734e352710fb00d840ab87ae31aaf688',
'uploader': 'Vishal Sikka',
},
'params': {
'skip_download': True,
},
'add_ie': ['Youtube'],
}, {
'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
'md5': '71b3ab2f4233012dce09d515c9c39ce2',
'md5': 'e6b9617c01a7970ceac8bb2c92c346c0',
'info_dict': {
'id': '1972',
'ext': 'mp4',
@ -68,6 +76,9 @@ class TEDIE(InfoExtractor):
'description': 'md5:5174aed4d0f16021b704120360f72b92',
'duration': 1128,
},
'params': {
'skip_download': True,
},
}, {
'url': 'http://www.ted.com/playlists/who_are_the_hackers',
'info_dict': {
@ -92,17 +103,17 @@ class TEDIE(InfoExtractor):
'skip_download': True,
},
}, {
# YouTube video
'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
'add_ie': ['Youtube'],
# no nativeDownloads
'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth',
'info_dict': {
'id': 'aFBIPO-P7LM',
'id': '1792',
'ext': 'mp4',
'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
'uploader': 'TEDx Talks',
'uploader_id': 'TEDxTalks',
'upload_date': '20111216',
'title': 'The orchestra in my mouth',
'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a',
'uploader': 'Tom Thum',
'view_count': int,
'comment_count': int,
'tags': list,
},
'params': {
'skip_download': True,
@ -161,27 +172,16 @@ class TEDIE(InfoExtractor):
info = self._extract_info(webpage)
talk_info = try_get(
info, lambda x: x['__INITIAL_DATA__']['talks'][0],
dict) or info['talks'][0]
data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info
talk_info = data['talks'][0]
title = talk_info['title'].strip()
external = talk_info.get('external')
if external:
service = external['service']
self.to_screen('Found video from %s' % service)
ext_url = None
if service.lower() == 'youtube':
ext_url = external.get('code')
return {
'_type': 'url',
'url': ext_url or external['uri'],
}
native_downloads = try_get(
talk_info, lambda x: x['downloads']['nativeDownloads'],
dict) or talk_info['nativeDownloads']
talk_info,
(lambda x: x['downloads']['nativeDownloads'],
lambda x: x['nativeDownloads']),
dict) or {}
formats = [{
'url': format_url,
@ -196,10 +196,24 @@ class TEDIE(InfoExtractor):
player_talk = talk_info['player_talks'][0]
external = player_talk.get('external')
if isinstance(external, dict):
service = external.get('service')
if isinstance(service, compat_str):
ext_url = None
if service.lower() == 'youtube':
ext_url = external.get('code')
return {
'_type': 'url',
'url': ext_url or external['uri'],
}
resources_ = player_talk.get('resources') or talk_info.get('resources')
http_url = None
for format_id, resources in resources_.items():
if not isinstance(resources, dict):
continue
if format_id == 'h264':
for resource in resources:
h264_url = resource.get('file')
@ -228,8 +242,12 @@ class TEDIE(InfoExtractor):
'tbr': int_or_none(resource.get('bitrate')),
})
elif format_id == 'hls':
stream_url = url_or_none(resources.get('stream'))
if not stream_url:
continue
formats.extend(self._extract_m3u8_formats(
resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False))
stream_url, video_name, 'mp4', m3u8_id=format_id,
fatal=False))
m3u8_formats = list(filter(
lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
@ -239,9 +257,13 @@ class TEDIE(InfoExtractor):
bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
if not bitrate:
continue
bitrate_url = re.sub(r'\d+k', bitrate, http_url)
if not self._is_valid_url(
bitrate_url, video_name, '%s bitrate' % bitrate):
continue
f = m3u8_format.copy()
f.update({
'url': re.sub(r'\d+k', bitrate, http_url),
'url': bitrate_url,
'format_id': m3u8_format['format_id'].replace('hls', 'http'),
'protocol': 'http',
})
@ -267,7 +289,11 @@ class TEDIE(InfoExtractor):
'description': self._og_search_description(webpage),
'subtitles': self._get_subtitles(video_id, talk_info),
'formats': formats,
'duration': talk_info.get('duration'),
'duration': float_or_none(talk_info.get('duration')),
'view_count': int_or_none(data.get('viewed_count')),
'comment_count': int_or_none(
try_get(data, lambda x: x['comments']['count'])),
'tags': try_get(talk_info, lambda x: x['tags'], list),
}
def _get_subtitles(self, video_id, talk_info):

View File

@ -1,26 +1,43 @@
# coding: utf-8
from __future__ import unicode_literals
from .mitele import MiTeleBaseIE
import json
import re
from .common import InfoExtractor
from .ooyala import OoyalaIE
from ..utils import (
clean_html,
determine_ext,
int_or_none,
str_or_none,
urljoin,
)
class TelecincoIE(MiTeleBaseIE):
class TelecincoIE(InfoExtractor):
IE_DESC = 'telecinco.es, cuatro.com and mediaset.es'
_VALID_URL = r'https?://(?:www\.)?(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html'
_TESTS = [{
'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html',
'md5': '8d7b2d5f699ee2709d992a63d5cd1712',
'info_dict': {
'id': 'JEA5ijCnF6p5W08A1rNKn7',
'ext': 'mp4',
'id': '1876350223',
'title': 'Bacalao con kokotxas al pil-pil',
'description': 'md5:1382dacd32dd4592d478cbdca458e5bb',
'duration': 662,
},
'playlist': [{
'md5': 'adb28c37238b675dad0f042292f209a7',
'info_dict': {
'id': 'JEA5ijCnF6p5W08A1rNKn7',
'ext': 'mp4',
'title': 'Con Martín Berasategui, hacer un bacalao al pil-pil es fácil y divertido',
'duration': 662,
},
}]
}, {
'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html',
'md5': '284393e5387b3b947b77c613ef04749a',
'md5': '9468140ebc300fbb8b9d65dc6e5c4b43',
'info_dict': {
'id': 'jn24Od1zGLG4XUZcnUnZB6',
'ext': 'mp4',
@ -30,7 +47,7 @@ class TelecincoIE(MiTeleBaseIE):
},
}, {
'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html',
'md5': '749afab6ea5a136a8806855166ae46a2',
'md5': 'ae2dc6b7b50b2392076a51c0f70e01f6',
'info_dict': {
'id': 'aywerkD2Sv1vGNqq9b85Q2',
'ext': 'mp4',
@ -50,17 +67,90 @@ class TelecincoIE(MiTeleBaseIE):
'only_matching': True,
}]
def _parse_content(self, content, url):
    """Turn one embedded player description into an info dict.

    ``content`` supplies ``dataMediaId`` and ``dataConfig`` (and optionally
    ``dataCmsId`` / ``dataPoster``). Ooyala-hosted media is delegated to
    OoyalaIE; otherwise the player config is downloaded and both the
    ``flash`` and ``html5`` mmc documents are probed for stream locations.
    ``url`` is the page URL, used to resolve the config URL and as Referer.
    """
    video_id = content['dataMediaId']
    if content.get('dataCmsId') == 'ooyala':
        return self.url_result(
            'ooyala:%s' % video_id, OoyalaIE.ie_key(), video_id)

    config = self._download_json(
        urljoin(url, content['dataConfig']),
        video_id, 'Downloading config JSON')
    title = config['info']['title']

    # The configured mmc URL points at one flavour; the other flavour is
    # reached by swapping the trailing "<flavour>.json" path component.
    mmc_service_url = config['services']['mmc']

    duration = None
    formats = []
    for mmc_type in ('flash', 'html5'):
        mmc = self._download_json(
            re.sub(
                r'/(?:flash|html5)\.json', '/%s.json' % mmc_type,
                mmc_service_url),
            video_id, 'Downloading %s mmc JSON' % mmc_type, fatal=False)
        if not mmc:
            continue
        if not duration:
            duration = int_or_none(mmc.get('duration'))
        for location in mmc['locations']:
            gat = self._proto_relative_url(location.get('gat'), 'http:')
            gcp = location.get('gcp')
            ogn = location.get('ogn')
            if gat is None or gcp is None or ogn is None:
                continue
            # POST the location's gcp/ogn pair to its gat URL to obtain
            # the actual stream URL.
            token_payload = json.dumps({
                'gcp': gcp,
                'ogn': ogn,
                'sta': 0,
            }).encode('utf-8')
            media = self._download_json(
                gat, video_id, data=token_payload,
                headers={
                    'Content-Type': 'application/json;charset=utf-8',
                    'Referer': url,
                }, fatal=False) or {}
            stream = media.get('stream') or media.get('file')
            if not stream:
                continue
            ext = determine_ext(stream)
            if ext == 'f4m':
                formats.extend(self._extract_f4m_formats(
                    stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
                    video_id, f4m_id='hds', fatal=False))
            elif ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    stream, video_id, 'mp4', 'm3u8_native',
                    m3u8_id='hls', fatal=False))
    self._sort_formats(formats)

    thumbnail = (
        content.get('dataPoster')
        or config.get('poster', {}).get('imageUrl'))
    return {
        'id': video_id,
        'title': title,
        'formats': formats,
        'thumbnail': thumbnail,
        'duration': duration,
    }
def _real_extract(self, url):
    """Extract the video, or the playlist of videos, of an article page.

    The page embeds its article state as JSON assigned to
    window.$REACTBASE_STATE.article. When the article's 'editorialType'
    is 'VID' its opening content is extracted as a single video;
    otherwise every body part of type 'video' that carries content
    becomes one playlist entry.

    Returns an info dict (single video) or a playlist result.

    Raises KeyError when the parsed state has no 'article' key or, for
    a 'VID' article, no opening content.
    """
    display_id = self._match_id(url)
    webpage = self._download_webpage(url, display_id)
    article = self._parse_json(self._search_regex(
        r'window\.\$REACTBASE_STATE\.article\s*=\s*({.+})',
        webpage, 'article'), display_id)['article']
    title = article.get('title')
    description = clean_html(article.get('leadParagraph'))

    if article.get('editorialType') != 'VID':
        entries = []
        for p in article.get('body', []):
            content = p.get('content')
            if p.get('type') != 'video' or not content:
                continue
            entries.append(self._parse_content(content, url))
        return self.playlist_result(
            entries, str_or_none(article.get('id')), title, description)

    content = article['opening']['content']
    info = self._parse_content(content, url)
    # Only the description comes from the article; the title stays the
    # per-video one that _parse_content read from the player config.
    info.update({
        'description': description,
    })
    return info

View File

@ -17,6 +17,7 @@ from ..utils import (
int_or_none,
orderedSet,
remove_start,
str_or_none,
str_to_int,
unescapeHTML,
unified_timestamp,
@ -106,10 +107,10 @@ class VKIE(VKBaseIE):
'ext': 'mp4',
'title': 'ProtivoGunz - Хуёвая песня',
'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*',
'uploader_id': '-77521',
'duration': 195,
'timestamp': 1329060660,
'timestamp': 1329049880,
'upload_date': '20120212',
'view_count': int,
},
},
{
@ -118,12 +119,12 @@ class VKIE(VKBaseIE):
'info_dict': {
'id': '165548505',
'ext': 'mp4',
'uploader': 'Tom Cruise',
'title': 'No name',
'uploader': 'Tom Cruise',
'uploader_id': '205387401',
'duration': 9,
'timestamp': 1374374880,
'upload_date': '20130721',
'view_count': int,
'timestamp': 1374364108,
'upload_date': '20130720',
}
},
{
@ -207,10 +208,10 @@ class VKIE(VKBaseIE):
'id': 'V3K4mi0SYkc',
'ext': 'webm',
'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate",
'description': 'md5:d9903938abdc74c738af77f527ca0596',
'duration': 178,
'description': 'md5:bf9c26cfa4acdfb146362682edd3827a',
'duration': 179,
'upload_date': '20130116',
'uploader': "Children's Joy Foundation",
'uploader': "Children's Joy Foundation Inc.",
'uploader_id': 'thecjf',
'view_count': int,
},
@ -222,6 +223,7 @@ class VKIE(VKBaseIE):
'id': 'k3lz2cmXyRuJQSjGHUv',
'ext': 'mp4',
'title': 'md5:d52606645c20b0ddbb21655adaa4f56f',
# TODO: fix test by fixing dailymotion description extraction
'description': 'md5:c651358f03c56f1150b555c26d90a0fd',
'uploader': 'AniLibria.Tv',
'upload_date': '20160914',
@ -241,9 +243,12 @@ class VKIE(VKBaseIE):
'ext': 'mp4',
'title': 'S-Dance, репетиции к The way show',
'uploader': 'THE WAY SHOW | 17 апреля',
'timestamp': 1454870100,
'uploader_id': '-110305615',
'timestamp': 1454859345,
'upload_date': '20160207',
'view_count': int,
},
'params': {
'skip_download': True,
},
},
{
@ -296,7 +301,7 @@ class VKIE(VKBaseIE):
video_id = mobj.group('videoid')
if video_id:
info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id
info_url = 'https://vk.com/al_video.php?act=show_inline&al=1&video=' + video_id
# Some videos (removed?) can only be downloaded with list id specified
list_id = mobj.group('list_id')
if list_id:
@ -346,6 +351,9 @@ class VKIE(VKBaseIE):
r'<!>This video is no longer available, because its author has been blocked.':
'Video %s is no longer available, because its author has been blocked.',
r'<!>This video is no longer available, because it has been deleted.':
'Video %s is no longer available, because it has been deleted.',
}
for error_re, error_msg in ERRORS.items():
@ -394,7 +402,8 @@ class VKIE(VKBaseIE):
if not data:
data = self._parse_json(
self._search_regex(
r'<!json>\s*({.+?})\s*<!>', info_page, 'json', default='{}'),
[r'<!json>\s*({.+?})\s*<!>', r'<!json>\s*({.+})'],
info_page, 'json', default='{}'),
video_id)
if data:
data = data['player']['params'][0]
@ -416,7 +425,7 @@ class VKIE(VKBaseIE):
timestamp = unified_timestamp(self._html_search_regex(
r'class=["\']mv_info_date[^>]+>([^<]+)(?:<|from)', info_page,
'upload date', fatal=False))
'upload date', default=None)) or int_or_none(data.get('date'))
view_count = str_to_int(self._search_regex(
r'class=["\']mv_views_count[^>]+>\s*([\d,.]+)',
@ -454,9 +463,12 @@ class VKIE(VKBaseIE):
'title': title,
'thumbnail': data.get('jpg'),
'uploader': data.get('md_author'),
'uploader_id': str_or_none(data.get('author_id')),
'duration': data.get('duration'),
'timestamp': timestamp,
'view_count': view_count,
'like_count': int_or_none(data.get('liked')),
'dislike_count': int_or_none(data.get('nolikes')),
'is_live': is_live,
}

View File

@ -3569,7 +3569,7 @@ class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
setattr(self, '%s_open' % type,
lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
meth(r, proxy, type))
return compat_urllib_request.ProxyHandler.__init__(self, proxies)
compat_urllib_request.ProxyHandler.__init__(self, proxies)
def proxy_open(self, req, proxy, type):
req_proxy = req.headers.get('Ytdl-request-proxy')

View File

@ -1,3 +1,3 @@
from __future__ import unicode_literals
# Package version string.
__version__ = '2018.07.29'