1
0
mirror of https://github.com/l1ving/youtube-dl synced 2025-01-24 11:43:21 +08:00

Merge branch 'master' into BlenderCloud-issue-13282

This commit is contained in:
Parmjit Virk 2017-09-02 20:56:36 -05:00
commit 7feaf65da7
17 changed files with 431 additions and 226 deletions

View File

@ -6,8 +6,8 @@
--- ---
### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.08.23*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. ### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.09.02*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.08.23** - [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.09.02**
### Before submitting an *issue* make sure you have: ### Before submitting an *issue* make sure you have:
- [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
@ -35,7 +35,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl
[debug] User config: [] [debug] User config: []
[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
[debug] youtube-dl version 2017.08.23 [debug] youtube-dl version 2017.09.02
[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
[debug] Proxy map: {} [debug] Proxy map: {}

View File

@ -1,3 +1,43 @@
version 2017.09.02
Extractors
* [youtube] Force old layout for each webpage (#14068, #14072, #14074, #14076,
#14077, #14079, #14082, #14083, #14094, #14095, #14096)
* [youtube] Fix upload date extraction (#14065)
+ [charlierose] Add support for episodes (#14062)
+ [bbccouk] Add support for w-prefixed ids (#14056)
* [googledrive] Extend URL regular expression (#9785)
+ [googledrive] Add support for source format (#14046)
* [pornhd] Fix extraction (#14005)
version 2017.08.27.1
Extractors
* [youtube] Fix extraction with --youtube-skip-dash-manifest enabled (#14037)
version 2017.08.27
Core
+ [extractor/common] Extract height and format id for HTML5 videos (#14034)
* [downloader/http] Rework HTTP downloader (#506, #809, #2849, #4240, #6023,
#8625, #9483)
* Simplify code and split into separate routines to facilitate maintaining
* Make retry mechanism work on errors during actual download not only
during connection establishment phase
* Retry on ECONNRESET and ETIMEDOUT during reading data from network
* Retry on content too short
* Show error description on retry
Extractors
* [generic] Lower preference for extraction from LD-JSON
* [rai] Fix audio formats extraction (#14024)
* [youtube] Fix controversy videos extraction (#14027, #14029)
* [mixcloud] Fix extraction (#14015, #14020)
version 2017.08.23 version 2017.08.23
Core Core

View File

@ -49,11 +49,11 @@ youtube-dl: youtube_dl/*.py youtube_dl/*/*.py
mkdir -p zip mkdir -p zip
for d in youtube_dl youtube_dl/downloader youtube_dl/extractor youtube_dl/postprocessor ; do \ for d in youtube_dl youtube_dl/downloader youtube_dl/extractor youtube_dl/postprocessor ; do \
mkdir -p zip/$$d ;\ mkdir -p zip/$$d ;\
cp -a $$d/*.py zip/$$d/ ;\ cp -pPR $$d/*.py zip/$$d/ ;\
done done
touch -t 200001010101 zip/youtube_dl/*.py zip/youtube_dl/*/*.py touch -t 200001010101 zip/youtube_dl/*.py zip/youtube_dl/*/*.py
mv zip/youtube_dl/__main__.py zip/ mv zip/youtube_dl/__main__.py zip/
cd zip ; zip --quiet ../youtube-dl youtube_dl/*.py youtube_dl/*/*.py __main__.py cd zip ; zip -q ../youtube-dl youtube_dl/*.py youtube_dl/*/*.py __main__.py
rm -rf zip rm -rf zip
echo '#!$(PYTHON)' > youtube-dl echo '#!$(PYTHON)' > youtube-dl
cat youtube-dl.zip >> youtube-dl cat youtube-dl.zip >> youtube-dl

View File

@ -304,11 +304,11 @@ class FileDownloader(object):
"""Report attempt to resume at given byte.""" """Report attempt to resume at given byte."""
self.to_screen('[download] Resuming download at byte %s' % resume_len) self.to_screen('[download] Resuming download at byte %s' % resume_len)
def report_retry(self, count, retries): def report_retry(self, err, count, retries):
"""Report retry in case of HTTP error 5xx""" """Report retry in case of HTTP error 5xx"""
self.to_screen( self.to_screen(
'[download] Got server HTTP error. Retrying (attempt %d of %s)...' '[download] Got server HTTP error: %s. Retrying (attempt %d of %s)...'
% (count, self.format_retries(retries))) % (error_to_compat_str(err), count, self.format_retries(retries)))
def report_file_already_downloaded(self, file_name): def report_file_already_downloaded(self, file_name):
"""Report file has already been fully downloaded.""" """Report file has already been fully downloaded."""

View File

@ -22,8 +22,16 @@ from ..utils import (
class HttpFD(FileDownloader): class HttpFD(FileDownloader):
def real_download(self, filename, info_dict): def real_download(self, filename, info_dict):
url = info_dict['url'] url = info_dict['url']
tmpfilename = self.temp_name(filename)
stream = None class DownloadContext(dict):
__getattr__ = dict.get
__setattr__ = dict.__setitem__
__delattr__ = dict.__delitem__
ctx = DownloadContext()
ctx.filename = filename
ctx.tmpfilename = self.temp_name(filename)
ctx.stream = None
# Do not include the Accept-Encoding header # Do not include the Accept-Encoding header
headers = {'Youtubedl-no-compression': 'True'} headers = {'Youtubedl-no-compression': 'True'}
@ -38,46 +46,51 @@ class HttpFD(FileDownloader):
if is_test: if is_test:
request.add_header('Range', 'bytes=0-%s' % str(self._TEST_FILE_SIZE - 1)) request.add_header('Range', 'bytes=0-%s' % str(self._TEST_FILE_SIZE - 1))
# Establish possible resume length ctx.open_mode = 'wb'
if os.path.isfile(encodeFilename(tmpfilename)): ctx.resume_len = 0
resume_len = os.path.getsize(encodeFilename(tmpfilename))
else:
resume_len = 0
open_mode = 'wb' if self.params.get('continuedl', True):
if resume_len != 0: # Establish possible resume length
if self.params.get('continuedl', True): if os.path.isfile(encodeFilename(ctx.tmpfilename)):
self.report_resuming_byte(resume_len) ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename))
request.add_header('Range', 'bytes=%d-' % resume_len)
open_mode = 'ab'
else:
resume_len = 0
count = 0 count = 0
retries = self.params.get('retries', 0) retries = self.params.get('retries', 0)
while count <= retries:
class SucceedDownload(Exception):
pass
class RetryDownload(Exception):
def __init__(self, source_error):
self.source_error = source_error
def establish_connection():
if ctx.resume_len != 0:
self.report_resuming_byte(ctx.resume_len)
request.add_header('Range', 'bytes=%d-' % ctx.resume_len)
ctx.open_mode = 'ab'
# Establish connection # Establish connection
try: try:
data = self.ydl.urlopen(request) ctx.data = self.ydl.urlopen(request)
# When trying to resume, Content-Range HTTP header of response has to be checked # When trying to resume, Content-Range HTTP header of response has to be checked
# to match the value of requested Range HTTP header. This is due to a webservers # to match the value of requested Range HTTP header. This is due to a webservers
# that don't support resuming and serve a whole file with no Content-Range # that don't support resuming and serve a whole file with no Content-Range
# set in response despite of requested Range (see # set in response despite of requested Range (see
# https://github.com/rg3/youtube-dl/issues/6057#issuecomment-126129799) # https://github.com/rg3/youtube-dl/issues/6057#issuecomment-126129799)
if resume_len > 0: if ctx.resume_len > 0:
content_range = data.headers.get('Content-Range') content_range = ctx.data.headers.get('Content-Range')
if content_range: if content_range:
content_range_m = re.search(r'bytes (\d+)-', content_range) content_range_m = re.search(r'bytes (\d+)-', content_range)
# Content-Range is present and matches requested Range, resume is possible # Content-Range is present and matches requested Range, resume is possible
if content_range_m and resume_len == int(content_range_m.group(1)): if content_range_m and ctx.resume_len == int(content_range_m.group(1)):
break return
# Content-Range is either not present or invalid. Assuming remote webserver is # Content-Range is either not present or invalid. Assuming remote webserver is
# trying to send the whole file, resume is not possible, so wiping the local file # trying to send the whole file, resume is not possible, so wiping the local file
# and performing entire redownload # and performing entire redownload
self.report_unable_to_resume() self.report_unable_to_resume()
resume_len = 0 ctx.resume_len = 0
open_mode = 'wb' ctx.open_mode = 'wb'
break return
except (compat_urllib_error.HTTPError, ) as err: except (compat_urllib_error.HTTPError, ) as err:
if (err.code < 500 or err.code >= 600) and err.code != 416: if (err.code < 500 or err.code >= 600) and err.code != 416:
# Unexpected HTTP error # Unexpected HTTP error
@ -86,15 +99,15 @@ class HttpFD(FileDownloader):
# Unable to resume (requested range not satisfiable) # Unable to resume (requested range not satisfiable)
try: try:
# Open the connection again without the range header # Open the connection again without the range header
data = self.ydl.urlopen(basic_request) ctx.data = self.ydl.urlopen(basic_request)
content_length = data.info()['Content-Length'] content_length = ctx.data.info()['Content-Length']
except (compat_urllib_error.HTTPError, ) as err: except (compat_urllib_error.HTTPError, ) as err:
if err.code < 500 or err.code >= 600: if err.code < 500 or err.code >= 600:
raise raise
else: else:
# Examine the reported length # Examine the reported length
if (content_length is not None and if (content_length is not None and
(resume_len - 100 < int(content_length) < resume_len + 100)): (ctx.resume_len - 100 < int(content_length) < ctx.resume_len + 100)):
# The file had already been fully downloaded. # The file had already been fully downloaded.
# Explanation to the above condition: in issue #175 it was revealed that # Explanation to the above condition: in issue #175 it was revealed that
# YouTube sometimes adds or removes a few bytes from the end of the file, # YouTube sometimes adds or removes a few bytes from the end of the file,
@ -102,152 +115,184 @@ class HttpFD(FileDownloader):
# I decided to implement a suggested change and consider the file # I decided to implement a suggested change and consider the file
# completely downloaded if the file size differs less than 100 bytes from # completely downloaded if the file size differs less than 100 bytes from
# the one in the hard drive. # the one in the hard drive.
self.report_file_already_downloaded(filename) self.report_file_already_downloaded(ctx.filename)
self.try_rename(tmpfilename, filename) self.try_rename(ctx.tmpfilename, ctx.filename)
self._hook_progress({ self._hook_progress({
'filename': filename, 'filename': ctx.filename,
'status': 'finished', 'status': 'finished',
'downloaded_bytes': resume_len, 'downloaded_bytes': ctx.resume_len,
'total_bytes': resume_len, 'total_bytes': ctx.resume_len,
}) })
return True raise SucceedDownload()
else: else:
# The length does not match, we start the download over # The length does not match, we start the download over
self.report_unable_to_resume() self.report_unable_to_resume()
resume_len = 0 ctx.resume_len = 0
open_mode = 'wb' ctx.open_mode = 'wb'
break return
except socket.error as e: raise RetryDownload(err)
if e.errno != errno.ECONNRESET: except socket.error as err:
if err.errno != errno.ECONNRESET:
# Connection reset is no problem, just retry # Connection reset is no problem, just retry
raise raise
raise RetryDownload(err)
# Retry def download():
count += 1 data_len = ctx.data.info().get('Content-length', None)
if count <= retries:
self.report_retry(count, retries)
if count > retries: # Range HTTP header may be ignored/unsupported by a webserver
self.report_error('giving up after %s retries' % retries) # (e.g. extractor/scivee.py, extractor/bambuser.py).
return False # However, for a test we still would like to download just a piece of a file.
# To achieve this we limit data_len to _TEST_FILE_SIZE and manually control
# block size when downloading a file.
if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE):
data_len = self._TEST_FILE_SIZE
data_len = data.info().get('Content-length', None) if data_len is not None:
data_len = int(data_len) + ctx.resume_len
# Range HTTP header may be ignored/unsupported by a webserver min_data_len = self.params.get('min_filesize')
# (e.g. extractor/scivee.py, extractor/bambuser.py). max_data_len = self.params.get('max_filesize')
# However, for a test we still would like to download just a piece of a file. if min_data_len is not None and data_len < min_data_len:
# To achieve this we limit data_len to _TEST_FILE_SIZE and manually control self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len))
# block size when downloading a file. return False
if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE): if max_data_len is not None and data_len > max_data_len:
data_len = self._TEST_FILE_SIZE self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
if data_len is not None:
data_len = int(data_len) + resume_len
min_data_len = self.params.get('min_filesize')
max_data_len = self.params.get('max_filesize')
if min_data_len is not None and data_len < min_data_len:
self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len))
return False
if max_data_len is not None and data_len > max_data_len:
self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
return False
byte_counter = 0 + resume_len
block_size = self.params.get('buffersize', 1024)
start = time.time()
# measure time over whole while-loop, so slow_down() and best_block_size() work together properly
now = None # needed for slow_down() in the first loop run
before = start # start measuring
while True:
# Download and write
data_block = data.read(block_size if not is_test else min(block_size, data_len - byte_counter))
byte_counter += len(data_block)
# exit loop when download is finished
if len(data_block) == 0:
break
# Open destination file just in time
if stream is None:
try:
(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
assert stream is not None
filename = self.undo_temp_name(tmpfilename)
self.report_destination(filename)
except (OSError, IOError) as err:
self.report_error('unable to open for writing: %s' % str(err))
return False return False
if self.params.get('xattr_set_filesize', False) and data_len is not None: byte_counter = 0 + ctx.resume_len
block_size = self.params.get('buffersize', 1024)
start = time.time()
# measure time over whole while-loop, so slow_down() and best_block_size() work together properly
now = None # needed for slow_down() in the first loop run
before = start # start measuring
def retry(e):
if ctx.tmpfilename != '-':
ctx.stream.close()
ctx.stream = None
ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename))
raise RetryDownload(e)
while True:
try:
# Download and write
data_block = ctx.data.read(block_size if not is_test else min(block_size, data_len - byte_counter))
# socket.timeout is a subclass of socket.error but may not have
# errno set
except socket.timeout as e:
retry(e)
except socket.error as e:
if e.errno not in (errno.ECONNRESET, errno.ETIMEDOUT):
raise
retry(e)
byte_counter += len(data_block)
# exit loop when download is finished
if len(data_block) == 0:
break
# Open destination file just in time
if ctx.stream is None:
try: try:
write_xattr(tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8')) ctx.stream, ctx.tmpfilename = sanitize_open(
except (XAttrUnavailableError, XAttrMetadataError) as err: ctx.tmpfilename, ctx.open_mode)
self.report_error('unable to set filesize xattr: %s' % str(err)) assert ctx.stream is not None
ctx.filename = self.undo_temp_name(ctx.tmpfilename)
self.report_destination(ctx.filename)
except (OSError, IOError) as err:
self.report_error('unable to open for writing: %s' % str(err))
return False
try: if self.params.get('xattr_set_filesize', False) and data_len is not None:
stream.write(data_block) try:
except (IOError, OSError) as err: write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8'))
except (XAttrUnavailableError, XAttrMetadataError) as err:
self.report_error('unable to set filesize xattr: %s' % str(err))
try:
ctx.stream.write(data_block)
except (IOError, OSError) as err:
self.to_stderr('\n')
self.report_error('unable to write data: %s' % str(err))
return False
# Apply rate limit
self.slow_down(start, now, byte_counter - ctx.resume_len)
# end measuring of one loop run
now = time.time()
after = now
# Adjust block size
if not self.params.get('noresizebuffer', False):
block_size = self.best_block_size(after - before, len(data_block))
before = after
# Progress message
speed = self.calc_speed(start, now, byte_counter - ctx.resume_len)
if data_len is None:
eta = None
else:
eta = self.calc_eta(start, time.time(), data_len - ctx.resume_len, byte_counter - ctx.resume_len)
self._hook_progress({
'status': 'downloading',
'downloaded_bytes': byte_counter,
'total_bytes': data_len,
'tmpfilename': ctx.tmpfilename,
'filename': ctx.filename,
'eta': eta,
'speed': speed,
'elapsed': now - start,
})
if is_test and byte_counter == data_len:
break
if ctx.stream is None:
self.to_stderr('\n') self.to_stderr('\n')
self.report_error('unable to write data: %s' % str(err)) self.report_error('Did not get any data blocks')
return False return False
if ctx.tmpfilename != '-':
ctx.stream.close()
# Apply rate limit if data_len is not None and byte_counter != data_len:
self.slow_down(start, now, byte_counter - resume_len) err = ContentTooShortError(byte_counter, int(data_len))
if count <= retries:
retry(err)
raise err
# end measuring of one loop run self.try_rename(ctx.tmpfilename, ctx.filename)
now = time.time()
after = now
# Adjust block size # Update file modification time
if not self.params.get('noresizebuffer', False): if self.params.get('updatetime', True):
block_size = self.best_block_size(after - before, len(data_block)) info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.info().get('last-modified', None))
before = after
# Progress message
speed = self.calc_speed(start, now, byte_counter - resume_len)
if data_len is None:
eta = None
else:
eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
self._hook_progress({ self._hook_progress({
'status': 'downloading',
'downloaded_bytes': byte_counter, 'downloaded_bytes': byte_counter,
'total_bytes': data_len, 'total_bytes': byte_counter,
'tmpfilename': tmpfilename, 'filename': ctx.filename,
'filename': filename, 'status': 'finished',
'eta': eta, 'elapsed': time.time() - start,
'speed': speed,
'elapsed': now - start,
}) })
if is_test and byte_counter == data_len: return True
break
if stream is None: while count <= retries:
self.to_stderr('\n') try:
self.report_error('Did not get any data blocks') establish_connection()
return False download()
if tmpfilename != '-': return True
stream.close() except RetryDownload as e:
count += 1
if count <= retries:
self.report_retry(e.source_error, count, retries)
continue
except SucceedDownload:
return True
if data_len is not None and byte_counter != data_len: self.report_error('giving up after %s retries' % retries)
raise ContentTooShortError(byte_counter, int(data_len)) return False
self.try_rename(tmpfilename, filename)
# Update file modification time
if self.params.get('updatetime', True):
info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
self._hook_progress({
'downloaded_bytes': byte_counter,
'total_bytes': byte_counter,
'filename': filename,
'status': 'finished',
'elapsed': time.time() - start,
})
return True

View File

@ -29,7 +29,7 @@ from ..compat import (
class BBCCoUkIE(InfoExtractor): class BBCCoUkIE(InfoExtractor):
IE_NAME = 'bbc.co.uk' IE_NAME = 'bbc.co.uk'
IE_DESC = 'BBC iPlayer' IE_DESC = 'BBC iPlayer'
_ID_REGEX = r'[pb][\da-z]{7}' _ID_REGEX = r'[pbw][\da-z]{7}'
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
https?:// https?://
(?:www\.)?bbc\.co\.uk/ (?:www\.)?bbc\.co\.uk/
@ -233,6 +233,9 @@ class BBCCoUkIE(InfoExtractor):
}, { }, {
'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55', 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
'only_matching': True, 'only_matching': True,
}, {
'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
'only_matching': True,
}] }]
_USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8' _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8'

View File

@ -5,7 +5,7 @@ from ..utils import remove_end
class CharlieRoseIE(InfoExtractor): class CharlieRoseIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?charlierose\.com/video(?:s|/player)/(?P<id>\d+)' _VALID_URL = r'https?://(?:www\.)?charlierose\.com/(?:video|episode)(?:s|/player)/(?P<id>\d+)'
_TESTS = [{ _TESTS = [{
'url': 'https://charlierose.com/videos/27996', 'url': 'https://charlierose.com/videos/27996',
'md5': 'fda41d49e67d4ce7c2411fd2c4702e09', 'md5': 'fda41d49e67d4ce7c2411fd2c4702e09',
@ -24,6 +24,9 @@ class CharlieRoseIE(InfoExtractor):
}, { }, {
'url': 'https://charlierose.com/videos/27996', 'url': 'https://charlierose.com/videos/27996',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://charlierose.com/episodes/30887?autoplay=true',
'only_matching': True,
}] }]
_PLAYER_BASE = 'https://charlierose.com/video/player/%s' _PLAYER_BASE = 'https://charlierose.com/video/player/%s'

View File

@ -2184,6 +2184,12 @@ class InfoExtractor(object):
f = parse_content_type(source_attributes.get('type')) f = parse_content_type(source_attributes.get('type'))
is_plain_url, formats = _media_formats(src, media_type, f) is_plain_url, formats = _media_formats(src, media_type, f)
if is_plain_url: if is_plain_url:
# res attribute is not standard but seen several times
# in the wild
f.update({
'height': int_or_none(source_attributes.get('res')),
'format_id': source_attributes.get('label'),
})
f.update(formats[0]) f.update(formats[0])
media_info['formats'].append(f) media_info['formats'].append(f)
else: else:

View File

@ -2871,12 +2871,6 @@ class GenericIE(InfoExtractor):
merged[k] = v merged[k] = v
return merged return merged
# Looking for http://schema.org/VideoObject
json_ld = self._search_json_ld(
webpage, video_id, default={}, expected_type='VideoObject')
if json_ld.get('url'):
return merge_dicts(json_ld, info_dict)
# Look for HTML5 media # Look for HTML5 media
entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
if entries: if entries:
@ -2895,6 +2889,12 @@ class GenericIE(InfoExtractor):
jwplayer_data, video_id, require_title=False, base_url=url) jwplayer_data, video_id, require_title=False, base_url=url)
return merge_dicts(info, info_dict) return merge_dicts(info, info_dict)
# Looking for http://schema.org/VideoObject
json_ld = self._search_json_ld(
webpage, video_id, default={}, expected_type='VideoObject')
if json_ld.get('url'):
return merge_dicts(json_ld, info_dict)
def check_video(vurl): def check_video(vurl):
if YoutubeIE.suitable(vurl): if YoutubeIE.suitable(vurl):
return True return True

View File

@ -4,6 +4,7 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
determine_ext,
ExtractorError, ExtractorError,
int_or_none, int_or_none,
lowercase_escape, lowercase_escape,
@ -12,27 +13,53 @@ from ..utils import (
class GoogleDriveIE(InfoExtractor): class GoogleDriveIE(InfoExtractor):
_VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})' _VALID_URL = r'''(?x)
https?://
(?:
(?:docs|drive)\.google\.com/
(?:
(?:uc|open)\?.*?id=|
file/d/
)|
video\.google\.com/get_player\?.*?docid=
)
(?P<id>[a-zA-Z0-9_-]{28,})
'''
_TESTS = [{ _TESTS = [{
'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
'md5': 'd109872761f7e7ecf353fa108c0dbe1e', 'md5': '5c602afbbf2c1db91831f5d82f678554',
'info_dict': { 'info_dict': {
'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Big Buck Bunny.mp4', 'title': 'Big Buck Bunny.mp4',
'duration': 45, 'duration': 45,
} }
}, {
# video can't be watched anonymously due to view count limit reached,
# but can be downloaded (see https://github.com/rg3/youtube-dl/issues/14046)
'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
'md5': 'bfbd670d03a470bb1e6d4a257adec12e',
'info_dict': {
'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ',
'ext': 'mp4',
'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4',
}
}, { }, {
# video id is longer than 28 characters # video id is longer than 28 characters
'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
'md5': 'c230c67252874fddd8170e3fd1a45886',
'info_dict': { 'info_dict': {
'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ', 'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4', 'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4',
'duration': 189, 'duration': 189,
}, },
'only_matching': True 'only_matching': True,
}, {
'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
'only_matching': True,
}, {
'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
'only_matching': True,
}] }]
_FORMATS_EXT = { _FORMATS_EXT = {
'5': 'flv', '5': 'flv',
@ -147,47 +174,84 @@ class GoogleDriveIE(InfoExtractor):
webpage = self._download_webpage( webpage = self._download_webpage(
'http://docs.google.com/file/d/%s' % video_id, video_id) 'http://docs.google.com/file/d/%s' % video_id, video_id)
reason = self._search_regex( title = self._search_regex(
r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) r'"title"\s*,\s*"([^"]+)', webpage, 'title',
if reason: default=None) or self._og_search_title(webpage)
raise ExtractorError(reason)
title = self._search_regex(r'"title"\s*,\s*"([^"]+)', webpage, 'title')
duration = int_or_none(self._search_regex( duration = int_or_none(self._search_regex(
r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds',
default=None)) default=None))
fmt_stream_map = self._search_regex(
r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage,
'fmt stream map').split(',')
fmt_list = self._search_regex(
r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list').split(',')
resolutions = {}
for fmt in fmt_list:
mobj = re.search(
r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt)
if mobj:
resolutions[mobj.group('format_id')] = (
int(mobj.group('width')), int(mobj.group('height')))
formats = [] formats = []
for fmt_stream in fmt_stream_map: fmt_stream_map = self._search_regex(
fmt_stream_split = fmt_stream.split('|') r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage,
if len(fmt_stream_split) < 2: 'fmt stream map', default='').split(',')
continue fmt_list = self._search_regex(
format_id, format_url = fmt_stream_split[:2] r'"fmt_list"\s*,\s*"([^"]+)', webpage,
f = { 'fmt_list', default='').split(',')
'url': lowercase_escape(format_url), if fmt_stream_map and fmt_list:
'format_id': format_id, resolutions = {}
'ext': self._FORMATS_EXT[format_id], for fmt in fmt_list:
} mobj = re.search(
resolution = resolutions.get(format_id) r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt)
if resolution: if mobj:
f.update({ resolutions[mobj.group('format_id')] = (
'width': resolution[0], int(mobj.group('width')), int(mobj.group('height')))
'height': resolution[1],
for fmt_stream in fmt_stream_map:
fmt_stream_split = fmt_stream.split('|')
if len(fmt_stream_split) < 2:
continue
format_id, format_url = fmt_stream_split[:2]
f = {
'url': lowercase_escape(format_url),
'format_id': format_id,
'ext': self._FORMATS_EXT[format_id],
}
resolution = resolutions.get(format_id)
if resolution:
f.update({
'width': resolution[0],
'height': resolution[1],
})
formats.append(f)
source_url = update_url_query(
'https://drive.google.com/uc', {
'id': video_id,
'export': 'download',
})
urlh = self._request_webpage(
source_url, video_id, note='Requesting source file',
errnote='Unable to request source file', fatal=False)
if urlh:
def add_source_format(src_url):
formats.append({
'url': src_url,
'ext': determine_ext(title, 'mp4').lower(),
'format_id': 'source',
'quality': 1,
}) })
formats.append(f) if urlh.headers.get('Content-Disposition'):
add_source_format(source_url)
else:
confirmation_webpage = self._webpage_read_content(
urlh, url, video_id, note='Downloading confirmation page',
errnote='Unable to confirm download', fatal=False)
if confirmation_webpage:
confirm = self._search_regex(
r'confirm=([^&"\']+)', confirmation_webpage,
'confirmation code', fatal=False)
if confirm:
add_source_format(update_url_query(source_url, {
'confirm': confirm,
}))
if not formats:
reason = self._search_regex(
r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None)
if reason:
raise ExtractorError(reason, expected=True)
self._sort_formats(formats) self._sort_formats(formats)
hl = self._search_regex( hl = self._search_regex(

View File

@ -92,7 +92,7 @@ class MixcloudIE(InfoExtractor):
js = self._download_webpage(js_url, track_id, fatal=False) js = self._download_webpage(js_url, track_id, fatal=False)
if js: if js:
KEY_RE_TEMPLATE = r'player\s*:\s*{.*?\b%s\s*:\s*(["\'])(?P<key>(?:(?!\1).)+)\1' KEY_RE_TEMPLATE = r'player\s*:\s*{.*?\b%s\s*:\s*(["\'])(?P<key>(?:(?!\1).)+)\1'
for key_name in ('value', 'key_value', 'key_value_two'): for key_name in ('value', 'key_value', 'key_value.*?', '.*?value.*?'):
key = self._search_regex( key = self._search_regex(
KEY_RE_TEMPLATE % key_name, js, 'key', KEY_RE_TEMPLATE % key_name, js, 'key',
default=None, group='key') default=None, group='key')

View File

@ -54,7 +54,7 @@ class PornHdIE(InfoExtractor):
r'<title>(.+?) - .*?[Pp]ornHD.*?</title>'], webpage, 'title') r'<title>(.+?) - .*?[Pp]ornHD.*?</title>'], webpage, 'title')
sources = self._parse_json(js_to_json(self._search_regex( sources = self._parse_json(js_to_json(self._search_regex(
r"(?s)sources'?\s*:\s*(\{.+?\})\s*\}[;,)]", r"(?s)sources'?\s*[:=]\s*(\{.+?\})",
webpage, 'sources', default='{}')), video_id) webpage, 'sources', default='{}')), video_id)
if not sources: if not sources:
@ -82,7 +82,8 @@ class PornHdIE(InfoExtractor):
view_count = int_or_none(self._html_search_regex( view_count = int_or_none(self._html_search_regex(
r'(\d+) views\s*<', webpage, 'view count', fatal=False)) r'(\d+) views\s*<', webpage, 'view count', fatal=False))
thumbnail = self._search_regex( thumbnail = self._search_regex(
r"'poster'\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False) r"poster'?\s*:\s*([\"'])(?P<url>(?:(?!\1).)+)\1", webpage,
'thumbnail', fatal=False, group='url')
return { return {
'id': video_id, 'id': video_id,

View File

@ -59,6 +59,7 @@ class RadioCanadaIE(InfoExtractor):
device_types.append('android') device_types.append('android')
formats = [] formats = []
error = None
# TODO: extract f4m formats # TODO: extract f4m formats
# f4m formats can be extracted using flashhd device_type but they produce unplayable file # f4m formats can be extracted using flashhd device_type but they produce unplayable file
for device_type in device_types: for device_type in device_types:
@ -84,8 +85,8 @@ class RadioCanadaIE(InfoExtractor):
if not v_url: if not v_url:
continue continue
if v_url == 'null': if v_url == 'null':
raise ExtractorError('%s said: %s' % ( error = xpath_text(v_data, 'message')
self.IE_NAME, xpath_text(v_data, 'message')), expected=True) continue
ext = determine_ext(v_url) ext = determine_ext(v_url)
if ext == 'm3u8': if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(
@ -129,6 +130,9 @@ class RadioCanadaIE(InfoExtractor):
formats.extend(self._extract_f4m_formats( formats.extend(self._extract_f4m_formats(
base_url + '/manifest.f4m', video_id, base_url + '/manifest.f4m', video_id,
f4m_id='hds', fatal=False)) f4m_id='hds', fatal=False))
if not formats and error:
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, error), expected=True)
self._sort_formats(formats) self._sort_formats(formats)
subtitles = {} subtitles = {}

View File

@ -345,11 +345,11 @@ class RaiIE(RaiBaseIE):
media_type = media['type'] media_type = media['type']
if 'Audio' in media_type: if 'Audio' in media_type:
relinker_info = { relinker_info = {
'formats': { 'formats': [{
'format_id': media.get('formatoAudio'), 'format_id': media.get('formatoAudio'),
'url': media['audioUrl'], 'url': media['audioUrl'],
'ext': media.get('formatoAudio'), 'ext': media.get('formatoAudio'),
} }]
} }
elif 'Video' in media_type: elif 'Video' in media_type:
relinker_info = self._extract_relinker_info(media['mediaUri'], content_id) relinker_info = self._extract_relinker_info(media['mediaUri'], content_id)

View File

@ -4,12 +4,14 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_urlparse, compat_HTTPError,
compat_str, compat_str,
compat_urlparse,
) )
from ..utils import ( from ..utils import (
parse_duration, ExtractorError,
js_to_json, js_to_json,
parse_duration,
parse_iso8601, parse_iso8601,
) )
@ -128,9 +130,16 @@ class ViideaIE(InfoExtractor):
base_url = self._proto_relative_url(cfg['livepipe'], 'http:') base_url = self._proto_relative_url(cfg['livepipe'], 'http:')
lecture_data = self._download_json( try:
'%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), lecture_data = self._download_json(
lecture_id)['lecture'][0] '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id),
lecture_id)['lecture'][0]
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
msg = self._parse_json(
e.cause.read().decode('utf-8'), lecture_id)
raise ExtractorError(msg['detail'], expected=True)
raise
lecture_info = { lecture_info = {
'id': lecture_id, 'id': lecture_id,

View File

@ -16,6 +16,7 @@ from ..jsinterp import JSInterpreter
from ..swfinterp import SWFInterpreter from ..swfinterp import SWFInterpreter
from ..compat import ( from ..compat import (
compat_chr, compat_chr,
compat_kwargs,
compat_parse_qs, compat_parse_qs,
compat_urllib_parse_unquote, compat_urllib_parse_unquote,
compat_urllib_parse_unquote_plus, compat_urllib_parse_unquote_plus,
@ -245,6 +246,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
return True return True
def _download_webpage(self, *args, **kwargs):
kwargs.setdefault('query', {})['disable_polymer'] = 'true'
return super(YoutubeBaseInfoExtractor, self)._download_webpage(
*args, **compat_kwargs(kwargs))
def _real_initialize(self): def _real_initialize(self):
if self._downloader is None: if self._downloader is None:
return return
@ -1003,6 +1009,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'Skipping DASH manifest', 'Skipping DASH manifest',
], ],
}, },
{
# The following content has been identified by the YouTube community
# as inappropriate or offensive to some audiences.
'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
'info_dict': {
'id': '6SJNVb0GnPI',
'ext': 'mp4',
'title': 'Race Differences in Intelligence',
'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
'duration': 965,
'upload_date': '20140124',
'uploader': 'New Century Foundation',
'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
'license': 'Standard YouTube License',
'view_count': int,
},
'params': {
'skip_download': True,
},
},
{ {
# itag 212 # itag 212
'url': '1t24XAntNCY', 'url': '1t24XAntNCY',
@ -1437,9 +1464,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if dash_mpd and dash_mpd[0] not in dash_mpds: if dash_mpd and dash_mpd[0] not in dash_mpds:
dash_mpds.append(dash_mpd[0]) dash_mpds.append(dash_mpd[0])
is_live = None
view_count = None
def extract_view_count(v_info):
return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
# Get video info # Get video info
embed_webpage = None embed_webpage = None
is_live = None
if re.search(r'player-age-gate-content">', video_webpage) is not None: if re.search(r'player-age-gate-content">', video_webpage) is not None:
age_gate = True age_gate = True
# We simulate the access to the video from www.youtube.com/v/{video_id} # We simulate the access to the video from www.youtube.com/v/{video_id}
@ -1509,6 +1541,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
continue continue
get_video_info = compat_parse_qs(video_info_webpage) get_video_info = compat_parse_qs(video_info_webpage)
add_dash_mpd(get_video_info) add_dash_mpd(get_video_info)
if view_count is None:
view_count = extract_view_count(get_video_info)
if not video_info: if not video_info:
video_info = get_video_info video_info = get_video_info
if 'token' in get_video_info: if 'token' in get_video_info:
@ -1592,10 +1626,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return self.playlist_result(entries, video_id, video_title, video_description) return self.playlist_result(entries, video_id, video_title, video_description)
self.to_screen('Downloading just video %s because of --no-playlist' % video_id) self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
if 'view_count' in video_info: if view_count is None:
view_count = int(video_info['view_count'][0]) view_count = extract_view_count(video_info)
else:
view_count = None
# Check for "rental" videos # Check for "rental" videos
if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
@ -1639,10 +1671,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if not upload_date: if not upload_date:
upload_date = self._search_regex( upload_date = self._search_regex(
[r'(?s)id="eow-date.*?>(.*?)</span>', [r'(?s)id="eow-date.*?>(.*?)</span>',
r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'], r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
video_webpage, 'upload date', default=None) video_webpage, 'upload date', default=None)
if upload_date:
upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
upload_date = unified_strdate(upload_date) upload_date = unified_strdate(upload_date)
video_license = self._html_search_regex( video_license = self._html_search_regex(
@ -2028,7 +2058,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
| |
(%(playlist_id)s) (%(playlist_id)s)
)""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&disable_polymer=true' _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?' _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
IE_NAME = 'youtube:playlist' IE_NAME = 'youtube:playlist'
_TESTS = [{ _TESTS = [{

View File

@ -1,3 +1,3 @@
from __future__ import unicode_literals from __future__ import unicode_literals
__version__ = '2017.08.23' __version__ = '2017.09.02'