1
0
mirror of https://github.com/l1ving/youtube-dl synced 2025-03-10 23:37:18 +08:00

Merge branch 'master' into fix.25.12.2018

# Conflicts:
#	youtube_dl/version.py
This commit is contained in:
Avi Peretz 2019-03-04 15:40:02 +02:00
commit 1e4e1b5962
31 changed files with 638 additions and 133 deletions

View File

@ -6,8 +6,8 @@
---
### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.02.08*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.02.08**
### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.03.01*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.03.01**
### Before submitting an *issue* make sure you have:
- [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl
[debug] User config: []
[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
[debug] youtube-dl version 2019.02.08
[debug] youtube-dl version 2019.03.01
[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
[debug] Proxy map: {}

View File

@ -9,7 +9,6 @@ python:
- "3.6"
- "pypy"
- "pypy3"
sudo: false
env:
- YTDL_TEST_SET=core
- YTDL_TEST_SET=download

View File

@ -1,3 +1,43 @@
version 2019.03.01
Core
+ [downloader/external] Add support for rate limit and retries for wget
* [downloader/external] Fix infinite retries for curl (#19303)
Extractors
* [npo] Fix extraction (#20084)
* [francetv:site] Extend video id regex (#20029, #20071)
+ [periscope] Extract width and height (#20015)
* [servus] Fix extraction (#19297)
* [bbccouk] Make subtitles non fatal (#19651)
* [metacafe] Fix family filter bypass (#19287)
version 2019.02.18
Extractors
* [tvp:website] Fix and improve extraction
+ [tvp] Detect unavailable videos
* [tvp] Fix description extraction and make thumbnail optional
+ [linuxacademy] Add support for linuxacademy.com (#12207)
* [bilibili] Update keys (#19233)
* [udemy] Extend URL regular expressions (#14330, #15883)
* [udemy] Update User-Agent and detect captcha (#14713, #15839, #18126)
* [noovo] Fix extraction (#19230)
* [rai] Relax URL regular expression (#19232)
+ [vshare] Pass Referer to download request (#19205, #19221)
+ [openload] Add support for oload.live (#19222)
* [imgur] Use video id as title fallback (#18590)
+ [twitch] Add new source format detection approach (#19193)
* [tvplayhome] Fix video id extraction (#19190)
* [tvplayhome] Fix episode metadata extraction (#19190)
* [rutube:embed] Fix extraction (#19163)
+ [rutube:embed] Add support for private videos (#19163)
+ [soundcloud] Extract more metadata
+ [trunews] Add support for trunews.com (#19153)
+ [linkedin:learning] Extract chapter_number and chapter_id (#19162)
version 2019.02.08
Core

View File

@ -458,6 +458,7 @@
- **LineTV**
- **linkedin:learning**
- **linkedin:learning:course**
- **LinuxAcademy**
- **LiTV**
- **LiveLeak**
- **LiveLeakEmbed**
@ -915,6 +916,7 @@
- **ToypicsUser**: Toypics user profile
- **TrailerAddict** (Currently broken)
- **Trilulilu**
- **TruNews**
- **TruTV**
- **Tube8**
- **TubiTv**

View File

@ -29,6 +29,16 @@ class TestYoutubeDLCookieJar(unittest.TestCase):
tf.close()
os.remove(tf.name)
def test_strip_httponly_prefix(self):
    # The fixture holds one cookie on a '#HttpOnly_'-prefixed line and one on
    # a plain line; both must be readable from the jar after loading.
    jar = YoutubeDLCookieJar('./test/testdata/cookies/httponly_cookies.txt')
    jar.load(ignore_discard=True, ignore_expires=True)

    # Every cookie in the fixture is stored as NAME -> NAME + '_VALUE'.
    for name in ('HTTPONLY_COOKIE', 'JS_ACCESSIBLE_COOKIE'):
        stored = jar._cookies['www.foobar.foobar']['/'][name]
        self.assertEqual(stored.value, name + '_VALUE')
if __name__ == '__main__':
unittest.main()

View File

@ -0,0 +1,6 @@
# Netscape HTTP Cookie File
# http://curl.haxx.se/rfc/cookie_spec.html
# This is a generated file! Do not edit.
#HttpOnly_www.foobar.foobar FALSE / TRUE 2147483647 HTTPONLY_COOKIE HTTPONLY_COOKIE_VALUE
www.foobar.foobar FALSE / TRUE 2147483647 JS_ACCESSIBLE_COOKIE JS_ACCESSIBLE_COOKIE_VALUE

View File

@ -121,7 +121,11 @@ class CurlFD(ExternalFD):
cmd += self._valueless_option('--silent', 'noprogress')
cmd += self._valueless_option('--verbose', 'verbose')
cmd += self._option('--limit-rate', 'ratelimit')
cmd += self._option('--retry', 'retries')
retry = self._option('--retry', 'retries')
if len(retry) == 2:
if retry[1] in ('inf', 'infinite'):
retry[1] = '2147483647'
cmd += retry
cmd += self._option('--max-filesize', 'max_filesize')
cmd += self._option('--interface', 'source_address')
cmd += self._option('--proxy', 'proxy')
@ -160,6 +164,12 @@ class WgetFD(ExternalFD):
cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies']
for key, val in info_dict['http_headers'].items():
cmd += ['--header', '%s: %s' % (key, val)]
cmd += self._option('--limit-rate', 'ratelimit')
retry = self._option('--tries', 'retries')
if len(retry) == 2:
if retry[1] in ('inf', 'infinite'):
retry[1] = '0'
cmd += retry
cmd += self._option('--bind-address', 'source_address')
cmd += self._option('--proxy', 'proxy')
cmd += self._valueless_option('--no-check-certificate', 'nocheckcertificate')

View File

@ -1,8 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals
import re
import itertools
import re
import xml
from .common import InfoExtractor
from ..utils import (
@ -17,6 +18,7 @@ from ..utils import (
parse_iso8601,
try_get,
unescapeHTML,
url_or_none,
urlencode_postdata,
urljoin,
)
@ -310,7 +312,13 @@ class BBCCoUkIE(InfoExtractor):
def _get_subtitles(self, media, programme_id):
subtitles = {}
for connection in self._extract_connections(media):
captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
cc_url = url_or_none(connection.get('href'))
if not cc_url:
continue
captions = self._download_xml(
cc_url, programme_id, 'Downloading captions', fatal=False)
if not isinstance(captions, xml.etree.ElementTree.Element):
continue
lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
subtitles[lang] = [
{

View File

@ -93,8 +93,8 @@ class BiliBiliIE(InfoExtractor):
}]
}]
_APP_KEY = '84956560bc028eb7'
_BILIBILI_KEY = '94aba54af9065f71de72f5508f1cd42e'
_APP_KEY = 'iVGUTjsxvpLeuDCf'
_BILIBILI_KEY = 'aHRmhWMLkdeMuILqORnYZocwMBpMEOdt'
def _report_error(self, result):
if 'message' in result:

View File

@ -56,22 +56,11 @@ class CrunchyrollBaseIE(InfoExtractor):
if username is None:
return
self._download_webpage(
'https://www.crunchyroll.com/?a=formhandler',
None, 'Logging in', 'Wrong login info',
data=urlencode_postdata({
'formname': 'RpcApiUser_Login',
'next_url': 'https://www.crunchyroll.com/acct/membership',
'name': username,
'password': password,
}))
'''
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login page')
def is_logged(webpage):
return '<title>Redirecting' in webpage
return 'href="/logout"' in webpage
# Already logged in
if is_logged(login_page):
@ -110,7 +99,6 @@ class CrunchyrollBaseIE(InfoExtractor):
raise ExtractorError('Unable to login: %s' % error, expected=True)
raise ExtractorError('Unable to log in')
'''
def _real_initialize(self):
self._login()

View File

@ -29,7 +29,8 @@ class ESPNIE(OnceIE):
(?:
.*?\?.*?\bid=|
/_/id/
)
)|
[^/]+/video/
)
)|
(?:www\.)espnfc\.(?:com|us)/(?:video/)?[^/]+/\d+/video/
@ -94,6 +95,9 @@ class ESPNIE(OnceIE):
}, {
'url': 'http://www.espnfc.com/english-premier-league/23/video/3324163/premier-league-in-90-seconds-golden-tweets',
'only_matching': True,
}, {
'url': 'http://www.espn.com/espnw/video/26066627/arkansas-gibson-completes-hr-cycle-four-innings',
'only_matching': True,
}]
def _real_extract(self, url):

View File

@ -593,6 +593,7 @@ from .linkedin import (
LinkedInLearningIE,
LinkedInLearningCourseIE,
)
from .linuxacademy import LinuxAcademyIE
from .litv import LiTVIE
from .liveleak import (
LiveLeakIE,
@ -1217,7 +1218,7 @@ from .tvnow import (
from .tvp import (
TVPEmbedIE,
TVPIE,
TVPSeriesIE,
TVPWebsiteIE,
)
from .tvplay import (
TVPlayIE,

View File

@ -271,7 +271,7 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):
catalogue = None
video_id = self._search_regex(
r'data-main-video=(["\'])(?P<id>(?:(?!\1).)+)\1',
r'(?:data-main-video\s*=|videoId\s*:)\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
webpage, 'video id', default=None, group='id')
if not video_id:

View File

@ -1,12 +1,14 @@
# coding: utf-8
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from ..utils import (
clean_html,
get_element_by_class,
parse_duration,
strip_or_none,
unified_strdate,
)
@ -21,7 +23,9 @@ class LibsynIE(InfoExtractor):
'id': '6385796',
'ext': 'mp3',
'title': "Champion Minded - Developing a Growth Mindset",
'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.',
# description fetched using another request:
# http://html5-player.libsyn.com/embed/getitemdetails?item_id=6385796
# 'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.',
'upload_date': '20180320',
'thumbnail': 're:^https?://.*',
},
@ -38,22 +42,36 @@ class LibsynIE(InfoExtractor):
}]
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
video_id = m.group('id')
url = m.group('mainurl')
url, video_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, video_id)
podcast_title = self._search_regex(
r'<h3>([^<]+)</h3>', webpage, 'podcast title', default=None)
if podcast_title:
podcast_title = podcast_title.strip()
episode_title = self._search_regex(
r'(?:<div class="episode-title">|<h4>)([^<]+)</', webpage, 'episode title')
if episode_title:
episode_title = episode_title.strip()
data = self._parse_json(self._search_regex(
r'var\s+playlistItem\s*=\s*({.+?});',
webpage, 'JSON data block'), video_id)
episode_title = data.get('item_title') or get_element_by_class('episode-title', webpage)
if not episode_title:
self._search_regex(
[r'data-title="([^"]+)"', r'<title>(.+?)</title>'],
webpage, 'episode title')
episode_title = episode_title.strip()
podcast_title = strip_or_none(clean_html(self._search_regex(
r'<h3>([^<]+)</h3>', webpage, 'podcast title',
default=None) or get_element_by_class('podcast-title', webpage)))
title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title
formats = []
for k, format_id in (('media_url_libsyn', 'libsyn'), ('media_url', 'main'), ('download_link', 'download')):
f_url = data.get(k)
if not f_url:
continue
formats.append({
'url': f_url,
'format_id': format_id,
})
description = self._html_search_regex(
r'<p\s+id="info_text_body">(.+?)</p>', webpage,
'description', default=None)
@ -61,27 +79,15 @@ class LibsynIE(InfoExtractor):
# Strip non-breaking and normal spaces
description = description.replace('\u00A0', ' ').strip()
release_date = unified_strdate(self._search_regex(
r'<div class="release_date">Released: ([^<]+)<', webpage, 'release date', fatal=False))
data_json = self._search_regex(r'var\s+playlistItem\s*=\s*(\{.*?\});\n', webpage, 'JSON data block')
data = json.loads(data_json)
formats = [{
'url': data['media_url'],
'format_id': 'main',
}, {
'url': data['media_url_libsyn'],
'format_id': 'libsyn',
}]
thumbnail = data.get('thumbnail_url')
duration = parse_duration(data.get('duration'))
r'<div class="release_date">Released: ([^<]+)<',
webpage, 'release date', default=None) or data.get('release_date'))
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'thumbnail': data.get('thumbnail_url'),
'upload_date': release_date,
'duration': duration,
'duration': parse_duration(data.get('duration')),
'formats': formats,
}

View File

@ -0,0 +1,174 @@
from __future__ import unicode_literals
import json
import random
import re
from .common import InfoExtractor
from ..compat import (
compat_b64decode,
compat_HTTPError,
compat_str,
)
from ..utils import (
ExtractorError,
orderedSet,
unescapeHTML,
urlencode_postdata,
urljoin,
)
class LinuxAcademyIE(InfoExtractor):
    """Extractor for linuxacademy.com lesson and course-module pages.

    A lesson URL (``/cp/courses/lesson/course/<n>/lesson/<n>``) produces a
    single video; a module URL (``/cp/modules/view/id/<n>``) produces a
    playlist built from the lesson links found on the module page.
    """

    _VALID_URL = r'''(?x)
                    https?://
                        (?:www\.)?linuxacademy\.com/cp/
                        (?:
                            courses/lesson/course/(?P<chapter_id>\d+)/lesson/(?P<lesson_id>\d+)|
                            modules/view/id/(?P<course_id>\d+)
                        )
                    '''
    _TESTS = [{
        'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2/module/154',
        'info_dict': {
            'id': '1498-2',
            'ext': 'mp4',
            'title': "Introduction to the Practitioner's Brief",
        },
        'params': {
            'skip_download': True,
        },
        'skip': 'Requires Linux Academy account credentials',
    }, {
        'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2',
        'only_matching': True,
    }, {
        'url': 'https://linuxacademy.com/cp/modules/view/id/154',
        'info_dict': {
            'id': '154',
            'title': 'AWS Certified Cloud Practitioner',
            'description': 'md5:039db7e60e4aac9cf43630e0a75fa834',
        },
        'playlist_count': 41,
        'skip': 'Requires Linux Academy account credentials',
    }]

    _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize'
    _ORIGIN_URL = 'https://linuxacademy.com'
    _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx'
    _NETRC_MACHINE = 'linuxacademy'

    def _real_initialize(self):
        self._login()

    def _login(self):
        """Log in with the configured credentials, if any.

        Does nothing when no username is available. Otherwise it loads the
        authorize page, posts the credentials as JSON, replays the hidden
        form of the response to the callback endpoint and finally validates
        the access token taken from the callback redirect URL.

        Raises ExtractorError (expected) with the server-supplied message
        when the credentials request is answered with HTTP 401.
        """
        username, password = self._get_login_info()
        if username is None:
            return

        def random_string():
            # 32 random characters used for the 'state' and 'nonce' values.
            return ''.join([
                random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~')
                for _ in range(32)])

        webpage, urlh = self._download_webpage_handle(
            self._AUTHORIZE_URL, None, 'Downloading authorize page', query={
                'client_id': self._CLIENT_ID,
                'response_type': 'token id_token',
                'redirect_uri': self._ORIGIN_URL,
                'scope': 'openid email user_impersonation profile',
                'audience': self._ORIGIN_URL,
                'state': random_string(),
                'nonce': random_string(),
            })

        # The authorize page embeds a base64-encoded JSON blob inside an
        # atob(...) call; its 'extraParams' member seeds the login payload.
        login_data = self._parse_json(
            self._search_regex(
                r'atob\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
                'login info', group='value'), None,
            transform_source=lambda x: compat_b64decode(x).decode('utf-8')
        )['extraParams']

        login_data.update({
            'client_id': self._CLIENT_ID,
            'redirect_uri': self._ORIGIN_URL,
            'tenant': 'lacausers',
            'connection': 'Username-Password-Authentication',
            'username': username,
            'password': password,
            'sso': 'true',
        })

        # Final URL of the authorize request (after any redirects); sent as
        # Referer on the two requests below.
        login_state_url = compat_str(urlh.geturl())

        try:
            login_page = self._download_webpage(
                'https://login.linuxacademy.com/usernamepassword/login', None,
                'Downloading login page', data=json.dumps(login_data).encode(),
                headers={
                    'Content-Type': 'application/json',
                    'Origin': 'https://login.linuxacademy.com',
                    'Referer': login_state_url,
                })
        except ExtractorError as e:
            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
                # read() returns bytes; json.loads() only accepts bytes from
                # Python 3.6 on, so decode explicitly to keep the real login
                # error visible on older interpreters.
                error = self._parse_json(
                    e.cause.read().decode('utf-8', 'replace'), None)
                message = error.get('description') or error['code']
                raise ExtractorError(
                    '%s said: %s' % (self.IE_NAME, message), expected=True)
            raise

        callback_page, urlh = self._download_webpage_handle(
            'https://login.linuxacademy.com/login/callback', None,
            'Downloading callback page',
            data=urlencode_postdata(self._hidden_inputs(login_page)),
            headers={
                'Content-Type': 'application/x-www-form-urlencoded',
                'Origin': 'https://login.linuxacademy.com',
                'Referer': login_state_url,
            })

        # The access token is read from the URL the callback request ends on.
        access_token = self._search_regex(
            r'access_token=([^=&]+)', compat_str(urlh.geturl()),
            'access token')

        self._download_webpage(
            'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s'
            % access_token, None, 'Downloading token validation page')

    def _real_extract(self, url):
        """Return a playlist for a module URL or a video info dict for a lesson URL."""
        mobj = re.match(self._VALID_URL, url)
        chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id')
        item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id)

        webpage = self._download_webpage(url, item_id)

        # course path
        if course_id:
            # Each distinct lesson link on the module page becomes an entry
            # handled again by this extractor.
            entries = [
                self.url_result(
                    urljoin(url, lesson_url), ie=LinuxAcademyIE.ie_key())
                for lesson_url in orderedSet(re.findall(
                    r'<a[^>]+\bhref=["\'](/cp/courses/lesson/course/\d+/lesson/\d+/module/\d+)',
                    webpage))]
            title = unescapeHTML(self._html_search_regex(
                (r'class=["\']course-title["\'][^>]*>(?P<value>[^<]+)',
                 r'var\s+title\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'),
                webpage, 'title', default=None, group='value'))
            description = unescapeHTML(self._html_search_regex(
                r'var\s+description\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
                webpage, 'description', default=None, group='value'))
            return self.playlist_result(entries, course_id, title, description)

        # single video path
        info = self._extract_jwplayer_data(
            webpage, item_id, require_title=False, m3u8_id='hls')
        title = self._search_regex(
            (r'>Lecture\s*:\s*(?P<value>[^<]+)',
             r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
            'title', group='value')
        info.update({
            'id': item_id,
            'title': title,
        })
        return info

View File

@ -1,12 +1,13 @@
from __future__ import unicode_literals
import json
import re
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_urllib_parse,
compat_urllib_parse_unquote,
compat_urllib_parse_urlencode,
)
from ..utils import (
determine_ext,
@ -144,7 +145,7 @@ class MetacafeIE(InfoExtractor):
headers = {
# Disable family filter
'Cookie': 'user=%s; ' % compat_urllib_parse_urlencode({'ffilter': False})
'Cookie': 'user=%s; ' % compat_urllib_parse.quote(json.dumps({'ffilter': False}))
}
# AnyClip videos require the flashversion cookie so that we get the link

View File

@ -57,7 +57,8 @@ class NoovoIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
bc_url = BrightcoveNewIE._extract_url(self, webpage)
brightcove_id = self._search_regex(
r'data-video-id=["\'](\d+)', webpage, 'brightcove id')
data = self._parse_json(
self._search_regex(
@ -89,7 +90,10 @@ class NoovoIE(InfoExtractor):
return {
'_type': 'url_transparent',
'ie_key': BrightcoveNewIE.ie_key(),
'url': smuggle_url(bc_url, {'geo_countries': ['CA']}),
'url': smuggle_url(
self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
{'geo_countries': ['CA']}),
'id': brightcove_id,
'title': title,
'description': description,
'series': series,

View File

@ -12,11 +12,16 @@ from ..utils import (
ExtractorError,
fix_xml_ampersands,
int_or_none,
merge_dicts,
orderedSet,
parse_duration,
qualities,
str_or_none,
strip_jsonp,
unified_strdate,
unified_timestamp,
url_or_none,
urlencode_postdata,
)
@ -176,9 +181,118 @@ class NPOIE(NPOBaseIE):
def _real_extract(self, url):
video_id = self._match_id(url)
return self._get_info(video_id)
try:
return self._get_info(url, video_id)
except ExtractorError:
return self._get_old_info(video_id)
def _get_info(self, video_id):
def _get_info(self, url, video_id):
    # Extract formats, and metadata when available, via the npostart.nl
    # player endpoints.
    #
    # url:      page URL; sent as Referer and as the 'pageUrl' form field.
    # video_id: identifier used in the player and stream endpoint URLs.
    #
    # Returns an info dict. When the player's embed page yields a 'video'
    # object, its metadata is merged on top of the basic id/title/formats
    # dict; otherwise the basic dict is returned as is.

    # Step 1: API token, requested with the page URL as Referer.
    token = self._download_json(
        'https://www.npostart.nl/api/token', video_id,
        'Downloading token', headers={
            'Referer': url,
            'X-Requested-With': 'XMLHttpRequest',
        })['token']

    # Step 2: post that token to the player endpoint; the response carries
    # the token used for the stream requests and (optionally) 'embedUrl'.
    player = self._download_json(
        'https://www.npostart.nl/player/%s' % video_id, video_id,
        'Downloading player JSON', data=urlencode_postdata({
            'autoplay': 0,
            'share': 1,
            'pageUrl': url,
            'hasAdConsent': 0,
            '_token': token,
        }))

    player_token = player['token']

    # Step 3: one non-fatal stream request per profile; format_urls
    # de-duplicates stream URLs returned by more than one profile.
    format_urls = set()
    formats = []
    for profile in ('hls', 'dash-widevine', 'dash-playready', 'smooth'):
        streams = self._download_json(
            'https://start-player.npo.nl/video/%s/streams' % video_id,
            video_id, 'Downloading %s profile JSON' % profile, fatal=False,
            query={
                'profile': profile,
                'quality': 'npo',
                'tokenId': player_token,
                'streamType': 'broadcast',
            })
        if not streams:
            continue
        stream = streams.get('stream')
        if not isinstance(stream, dict):
            continue
        stream_url = url_or_none(stream.get('src'))
        if not stream_url or stream_url in format_urls:
            continue
        format_urls.add(stream_url)
        # Streams with a 'protection' value are skipped (presumably DRM
        # protected - TODO confirm); the URL is still remembered above.
        if stream.get('protection') is not None:
            continue
        # Pick the manifest parser from the declared MIME type, falling
        # back to the URL extension; anything else is a plain URL format.
        stream_type = stream.get('type')
        stream_ext = determine_ext(stream_url)
        if stream_type == 'application/dash+xml' or stream_ext == 'mpd':
            formats.extend(self._extract_mpd_formats(
                stream_url, video_id, mpd_id='dash', fatal=False))
        elif stream_type == 'application/vnd.apple.mpegurl' or stream_ext == 'm3u8':
            formats.extend(self._extract_m3u8_formats(
                stream_url, video_id, ext='mp4',
                entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
        elif '.ism/Manifest' in stream_url:
            formats.extend(self._extract_ism_formats(
                stream_url, video_id, ism_id='mss', fatal=False))
        else:
            formats.append({
                'url': stream_url,
            })

    self._sort_formats(formats)

    # Baseline result; the video id doubles as title until a better one
    # is found on the embed page.
    info = {
        'id': video_id,
        'title': video_id,
        'formats': formats,
    }

    # Step 4 (best effort): the embed page assigns a JSON object to
    # 'video'; the regex defaults to '{}' so a missing object is not fatal.
    embed_url = url_or_none(player.get('embedUrl'))
    if embed_url:
        webpage = self._download_webpage(
            embed_url, video_id, 'Downloading embed page', fatal=False)
        if webpage:
            video = self._parse_json(
                self._search_regex(
                    r'\bvideo\s*=\s*({.+?})\s*;', webpage, 'video',
                    default='{}'), video_id)
            if video:
                title = video.get('episodeTitle')
                subtitles = {}
                subtitles_list = video.get('subtitles')
                if isinstance(subtitles_list, list):
                    for cc in subtitles_list:
                        cc_url = url_or_none(cc.get('src'))
                        if not cc_url:
                            continue
                        # Language falls back to 'nl' when not given.
                        lang = str_or_none(cc.get('language')) or 'nl'
                        subtitles.setdefault(lang, []).append({
                            'url': cc_url,
                        })
                # Embed-page metadata is listed first, the baseline dict
                # second (merge_dicts precedence is defined in utils -
                # presumably the first non-empty value wins; confirm).
                return merge_dicts({
                    'title': title,
                    'description': video.get('description'),
                    'thumbnail': url_or_none(
                        video.get('still_image_url') or video.get('orig_image_url')),
                    'duration': int_or_none(video.get('duration')),
                    'timestamp': unified_timestamp(video.get('broadcastDate')),
                    'creator': video.get('channel'),
                    'series': video.get('title'),
                    'episode': title,
                    'episode_number': int_or_none(video.get('episodeNumber')),
                    'subtitles': subtitles,
                }, info)

    return info
def _get_old_info(self, video_id):
metadata = self._download_json(
'http://e.omroep.nl/metadata/%s' % video_id,
video_id,
@ -280,7 +394,7 @@ class NPOIE(NPOBaseIE):
# JSON
else:
video_url = stream_info.get('url')
if not video_url or video_url in urls:
if not video_url or 'vodnotavailable.' in video_url or video_url in urls:
continue
urls.add(video_url)
if determine_ext(video_url) == 'm3u8':

View File

@ -249,7 +249,7 @@ class OpenloadIE(InfoExtractor):
(?:www\.)?
(?:
openload\.(?:co|io|link|pw)|
oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|pw)
oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|pw|live)
)
)/
(?:f|embed)/
@ -346,6 +346,9 @@ class OpenloadIE(InfoExtractor):
}, {
'url': 'https://oload.pw/f/WyKgK8s94N0',
'only_matching': True,
}, {
'url': 'https://oload.live/f/-Z58UZ-GR4M',
'only_matching': True,
}]
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'

View File

@ -5,6 +5,7 @@ import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
parse_iso8601,
unescapeHTML,
)
@ -75,6 +76,14 @@ class PeriscopeIE(PeriscopeBaseIE):
'url': broadcast[image],
} for image in ('image_url', 'image_url_small') if broadcast.get(image)]
width = int_or_none(broadcast.get('width'))
height = int_or_none(broadcast.get('height'))
def add_width_and_height(f):
for key, val in (('width', width), ('height', height)):
if not f.get(key):
f[key] = val
video_urls = set()
formats = []
for format_id in ('replay', 'rtmp', 'hls', 'https_hls', 'lhls', 'lhlsweb'):
@ -83,16 +92,21 @@ class PeriscopeIE(PeriscopeBaseIE):
continue
video_urls.add(video_url)
if format_id != 'rtmp':
formats.extend(self._extract_m3u8_formats(
m3u8_formats = self._extract_m3u8_formats(
video_url, token, 'mp4',
entry_protocol='m3u8_native'
if state in ('ended', 'timed_out') else 'm3u8',
m3u8_id=format_id, fatal=False))
m3u8_id=format_id, fatal=False)
if len(m3u8_formats) == 1:
add_width_and_height(m3u8_formats[0])
formats.extend(m3u8_formats)
continue
formats.append({
rtmp_format = {
'url': video_url,
'ext': 'flv' if format_id == 'rtmp' else 'mp4',
})
}
add_width_and_height(rtmp_format)
formats.append(rtmp_format)
self._sort_formats(formats)
return {

View File

@ -288,7 +288,7 @@ class RaiPlayPlaylistIE(InfoExtractor):
class RaiIE(RaiBaseIE):
_VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE
_VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE
_TESTS = [{
# var uniquename = "ContentItem-..."
# data-id="ContentItem-..."
@ -375,6 +375,9 @@ class RaiIE(RaiBaseIE):
# Direct MMS URL
'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html',
'only_matching': True,
}, {
'url': 'https://www.rainews.it/tgr/marche/notiziari/video/2019/02/ContentItem-6ba945a2-889c-4a80-bdeb-8489c70a8db9.html',
'only_matching': True,
}]
def _extract_from_content_id(self, content_id, url):

View File

@ -1,31 +1,44 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class ServusIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?servus\.com/(?:at|de)/p/[^/]+/(?P<id>AA-\w+|\d+-\d+)'
_VALID_URL = r'https?://(?:www\.)?servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)/(?P<id>[aA]{2}-\w+|\d+-\d+)'
_TESTS = [{
'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/',
'md5': '046dee641cda1c4cabe13baef3be2c1c',
'md5': '3e1dd16775aa8d5cbef23628cfffc1f4',
'info_dict': {
'id': 'AA-1T6VBU5PW1W12',
'ext': 'mp4',
'title': 'Die Grünen aus Volkssicht',
'description': 'md5:052b5da1cb2cd7d562ef1f19be5a5cba',
'thumbnail': r're:^https?://.*\.jpg$',
'title': 'Die Grünen aus Sicht des Volkes',
'description': 'md5:1247204d85783afe3682644398ff2ec4',
'thumbnail': r're:^https?://.*\.jpg',
}
}, {
'url': 'https://www.servus.com/at/p/Wie-das-Leben-beginnt/1309984137314-381415152/',
'only_matching': True,
}, {
'url': 'https://www.servus.com/tv/videos/aa-1t6vbu5pw1w12/',
'only_matching': True,
}, {
'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
video_id = self._match_id(url).upper()
webpage = self._download_webpage(url, video_id)
title = self._og_search_title(webpage)
title = self._search_regex(
(r'videoLabel\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
r'<h\d+[^>]+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P<title>[^<]+)'),
webpage, 'title', default=None,
group='title') or self._og_search_title(webpage)
title = re.sub(r'\s*-\s*Servus TV\s*$', '', title)
description = self._og_search_description(webpage)
thumbnail = self._og_search_thumbnail(webpage)

View File

@ -61,7 +61,8 @@ class SixPlayIE(InfoExtractor):
quality_key = qualities(['lq', 'sd', 'hq', 'hd'])
formats = []
subtitles = {}
for asset in clip_data['assets']:
assets = clip_data.get('assets') or []
for asset in assets:
asset_url = asset.get('full_physical_path')
protocol = asset.get('protocol')
if not asset_url or protocol == 'primetime' or asset.get('type') == 'usp_hlsfp_h264' or asset_url in urls:

View File

@ -9,6 +9,8 @@ from ..utils import (
parse_duration,
parse_resolution,
str_to_int,
url_or_none,
urlencode_postdata,
)
@ -64,16 +66,49 @@ class SpankBangIE(InfoExtractor):
'Video %s is not available' % video_id, expected=True)
formats = []
for mobj in re.finditer(
r'stream_url_(?P<id>[^\s=]+)\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2',
webpage):
format_id, format_url = mobj.group('id', 'url')
def extract_format(format_id, format_url):
f_url = url_or_none(format_url)
if not f_url:
return
f = parse_resolution(format_id)
f.update({
'url': format_url,
'url': f_url,
'format_id': format_id,
})
formats.append(f)
STREAM_URL_PREFIX = 'stream_url_'
for mobj in re.finditer(
r'%s(?P<id>[^\s=]+)\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2'
% STREAM_URL_PREFIX, webpage):
extract_format(mobj.group('id', 'url'))
if not formats:
stream_key = self._search_regex(
r'data-streamkey\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
webpage, 'stream key', group='value')
sb_csrf_session = self._get_cookies(
'https://spankbang.com')['sb_csrf_session'].value
stream = self._download_json(
'https://spankbang.com/api/videos/stream', video_id,
'Downloading stream JSON', data=urlencode_postdata({
'id': stream_key,
'data': 0,
'sb_csrf_session': sb_csrf_session,
}), headers={
'Referer': url,
'X-CSRFToken': sb_csrf_session,
})
for format_id, format_url in stream.items():
if format_id.startswith(STREAM_URL_PREFIX):
extract_format(
format_id[len(STREAM_URL_PREFIX):], format_url)
self._sort_formats(formats)
title = self._html_search_regex(

View File

@ -1,14 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals
import itertools
import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
clean_html,
get_element_by_attribute,
determine_ext,
ExtractorError,
get_element_by_attribute,
orderedSet,
)
@ -19,12 +21,12 @@ class TVPIE(InfoExtractor):
_TESTS = [{
'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536',
'md5': '8aa518c15e5cc32dfe8db400dc921fbb',
'md5': 'a21eb0aa862f25414430f15fdfb9e76c',
'info_dict': {
'id': '194536',
'ext': 'mp4',
'title': 'Czas honoru, I seria odc. 13',
'description': 'md5:381afa5bca72655fe94b05cfe82bf53d',
'title': 'Czas honoru, odc. 13 Władek',
'description': 'md5:437f48b93558370b031740546b696e24',
},
}, {
'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176',
@ -45,6 +47,7 @@ class TVPIE(InfoExtractor):
'title': 'Wiadomości, 28.09.2017, 19:30',
'description': 'Wydanie główne codziennego serwisu informacyjnego.'
},
'skip': 'HTTP Error 404: Not Found',
}, {
'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272',
'only_matching': True,
@ -75,8 +78,10 @@ class TVPIE(InfoExtractor):
return {
'_type': 'url_transparent',
'url': 'tvp:' + video_id,
'description': self._og_search_description(webpage, default=None),
'thumbnail': self._og_search_thumbnail(webpage),
'description': self._og_search_description(
webpage, default=None) or self._html_search_meta(
'description', webpage, default=None),
'thumbnail': self._og_search_thumbnail(webpage, default=None),
'ie_key': 'TVPEmbed',
}
@ -87,6 +92,15 @@ class TVPEmbedIE(InfoExtractor):
_VALID_URL = r'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P<id>\d+)'
_TESTS = [{
'url': 'tvp:194536',
'md5': 'a21eb0aa862f25414430f15fdfb9e76c',
'info_dict': {
'id': '194536',
'ext': 'mp4',
'title': 'Czas honoru, odc. 13 Władek',
},
}, {
# not available
'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268',
'md5': '8c9cd59d16edabf39331f93bf8a766c7',
'info_dict': {
@ -94,6 +108,7 @@ class TVPEmbedIE(InfoExtractor):
'ext': 'mp4',
'title': 'Panorama, 07.12.2015, 15:40',
},
'skip': 'Transmisja została zakończona lub materiał niedostępny',
}, {
'url': 'tvp:22670268',
'only_matching': True,
@ -105,10 +120,13 @@ class TVPEmbedIE(InfoExtractor):
webpage = self._download_webpage(
'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id)
error_massage = get_element_by_attribute('class', 'msg error', webpage)
if error_massage:
error = self._html_search_regex(
r'(?s)<p[^>]+\bclass=["\']notAvailable__text["\'][^>]*>(.+?)</p>',
webpage, 'error', default=None) or clean_html(
get_element_by_attribute('class', 'msg error', webpage))
if error:
raise ExtractorError('%s said: %s' % (
self.IE_NAME, clean_html(error_massage)), expected=True)
self.IE_NAME, clean_html(error)), expected=True)
title = self._search_regex(
r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1',
@ -180,48 +198,55 @@ class TVPEmbedIE(InfoExtractor):
}
class TVPSeriesIE(InfoExtractor):
class TVPWebsiteIE(InfoExtractor):
IE_NAME = 'tvp:series'
_VALID_URL = r'https?://vod\.tvp\.pl/(?:[^/]+/){2}(?P<id>[^/]+)/?$'
_VALID_URL = r'https?://vod\.tvp\.pl/website/(?P<display_id>[^,]+),(?P<id>\d+)'
_TESTS = [{
'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem',
# series
'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312/video',
'info_dict': {
'title': 'Ogniem i mieczem',
'id': '4278026',
'id': '38678312',
},
'playlist_count': 4,
'playlist_count': 115,
}, {
'url': 'http://vod.tvp.pl/audycje/podroze/boso-przez-swiat',
# film
'url': 'https://vod.tvp.pl/website/gloria,35139666',
'info_dict': {
'title': 'Boso przez świat',
'id': '9329207',
'id': '36637049',
'ext': 'mp4',
'title': 'Gloria, Gloria',
},
'playlist_count': 86,
'params': {
'skip_download': True,
},
'add_ie': ['TVPEmbed'],
}, {
'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312',
'only_matching': True,
}]
def _entries(self, display_id, playlist_id):
url = 'https://vod.tvp.pl/website/%s,%s/video' % (display_id, playlist_id)
for page_num in itertools.count(1):
page = self._download_webpage(
url, display_id, 'Downloading page %d' % page_num,
query={'page': page_num})
video_ids = orderedSet(re.findall(
r'<a[^>]+\bhref=["\']/video/%s,[^,]+,(\d+)' % display_id,
page))
if not video_ids:
break
for video_id in video_ids:
yield self.url_result(
'tvp:%s' % video_id, ie=TVPEmbedIE.ie_key(),
video_id=video_id)
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id, tries=5)
title = self._html_search_regex(
r'(?s) id=[\'"]path[\'"]>(?:.*? / ){2}(.*?)</span>', webpage, 'series')
playlist_id = self._search_regex(r'nodeId:\s*(\d+)', webpage, 'playlist id')
playlist = self._download_webpage(
'http://vod.tvp.pl/vod/seriesAjax?type=series&nodeId=%s&recommend'
'edId=0&sort=&page=0&pageSize=10000' % playlist_id, display_id, tries=5,
note='Downloading playlist')
videos_paths = re.findall(
'(?s)class="shortTitle">.*?href="(/[^"]+)', playlist)
entries = [
self.url_result('http://vod.tvp.pl%s' % v_path, ie=TVPIE.ie_key())
for v_path in videos_paths]
return {
'_type': 'playlist',
'id': playlist_id,
'display_id': display_id,
'title': title,
'entries': entries,
}
mobj = re.match(self._VALID_URL, url)
display_id, playlist_id = mobj.group('display_id', 'id')
return self.playlist_result(
self._entries(display_id, playlist_id), playlist_id)

View File

@ -29,7 +29,7 @@ class UdemyIE(InfoExtractor):
IE_NAME = 'udemy'
_VALID_URL = r'''(?x)
https?://
www\.udemy\.com/
(?:[^/]+\.)?udemy\.com/
(?:
[^#]+\#/lecture/|
lecture/view/?\?lectureId=|
@ -64,6 +64,9 @@ class UdemyIE(InfoExtractor):
# only outputs rendition
'url': 'https://www.udemy.com/how-you-can-help-your-local-community-5-amazing-examples/learn/v4/t/lecture/3225750?start=0',
'only_matching': True,
}, {
'url': 'https://wipro.udemy.com/java-tutorial/#/lecture/172757',
'only_matching': True,
}]
def _extract_course_info(self, webpage, video_id):
@ -123,10 +126,22 @@ class UdemyIE(InfoExtractor):
def _download_webpage_handle(self, *args, **kwargs):
headers = kwargs.get('headers', {}).copy()
headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.1.1 Safari/603.2.4'
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'
kwargs['headers'] = headers
return super(UdemyIE, self)._download_webpage_handle(
ret = super(UdemyIE, self)._download_webpage_handle(
*args, **compat_kwargs(kwargs))
if not ret:
return ret
webpage, _ = ret
if any(p in webpage for p in (
'>Please verify you are a human',
'Access to this page has been denied because we believe you are using automation tools to browse the website',
'"_pxCaptcha"')):
raise ExtractorError(
'Udemy asks you to solve a CAPTCHA. Login with browser, '
'solve CAPTCHA, then export cookies and pass cookie file to '
'youtube-dl with --cookies.', expected=True)
return ret
def _download_json(self, url_or_request, *args, **kwargs):
headers = {
@ -403,8 +418,14 @@ class UdemyIE(InfoExtractor):
class UdemyCourseIE(UdemyIE):
IE_NAME = 'udemy:course'
_VALID_URL = r'https?://(?:www\.)?udemy\.com/(?P<id>[^/?#&]+)'
_TESTS = []
_VALID_URL = r'https?://(?:[^/]+\.)?udemy\.com/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://www.udemy.com/java-tutorial/',
'only_matching': True,
}, {
'url': 'https://wipro.udemy.com/java-tutorial/',
'only_matching': True,
}]
@classmethod
def suitable(cls, url):

View File

@ -503,7 +503,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
orig_url = url
if mobj.group('pro') or mobj.group('player'):
if mobj.group('pro'):
# some videos require portfolio_id to be present in player url
# https://github.com/rg3/youtube-dl/issues/20070
url = self._extract_url(url, self._download_webpage(url, video_id))
elif mobj.group('player'):
url = 'https://player.vimeo.com/video/' + video_id
elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')):
url = 'https://vimeo.com/' + video_id

View File

@ -48,7 +48,7 @@ class VShareIE(InfoExtractor):
webpage = self._download_webpage(
'https://vshare.io/v/%s/width-650/height-430/1' % video_id,
video_id)
video_id, headers={'Referer': url})
title = self._html_search_regex(
r'<title>([^<]+)</title>', webpage, 'title')

View File

@ -352,6 +352,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
(?:www\.)?yourepeat\.com/|
tube\.majestyc\.net/|
(?:www\.)?invidio\.us/|
(?:www\.)?invidious\.snopyta\.org/|
(?:www\.)?invidious\.kabi\.tk/|
(?:www\.)?vid\.wxzm\.sx/|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:

View File

@ -1141,6 +1141,8 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
_HTTPONLY_PREFIX = '#HttpOnly_'
def save(self, filename=None, ignore_discard=False, ignore_expires=False):
# Store session cookies with `expires` set to 0 instead of an empty
# string
@ -1150,7 +1152,21 @@ class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
compat_cookiejar.MozillaCookieJar.save(self, filename, ignore_discard, ignore_expires)
def load(self, filename=None, ignore_discard=False, ignore_expires=False):
compat_cookiejar.MozillaCookieJar.load(self, filename, ignore_discard, ignore_expires)
"""Load cookies from a file."""
if filename is None:
if self.filename is not None:
filename = self.filename
else:
raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
cf = io.StringIO()
with open(filename) as f:
for line in f:
if line.startswith(self._HTTPONLY_PREFIX):
line = line[len(self._HTTPONLY_PREFIX):]
cf.write(compat_str(line))
cf.seek(0)
self._really_load(cf, filename, ignore_discard, ignore_expires)
# Session cookies are denoted by either `expires` field set to
# an empty string or 0. MozillaCookieJar only recognizes the former
# (see [1]). So we need force the latter to be recognized as session

View File

@ -1,3 +1,3 @@
from __future__ import unicode_literals
__version__ = 'vc.2019.02.14'
__version__ = 'vc.2019.03.04'