
Merge remote-tracking branch 'upstream/master' into myversion

Andrew Udvare 2018-07-22 19:29:54 -04:00
commit 9cc3f3ced3
No known key found for this signature in database
GPG Key ID: 1AFD9AFC120C26DD
79 changed files with 1493 additions and 452 deletions


@@ -6,8 +6,8 @@
 ---
-### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.25*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
-- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.25**
+### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.07.21*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.07.21**
 
 ### Before submitting an *issue* make sure you have:
 - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
@@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl
 [debug] User config: []
 [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
 [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
-[debug] youtube-dl version 2018.06.25
+[debug] youtube-dl version 2018.07.21
 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
 [debug] Proxy map: {}


@@ -239,3 +239,10 @@ Martin Weinelt
 Surya Oktafendri
 TingPing
 Alexandre Macabies
+Bastian de Groot
+Niklas Haas
+András Veres-Szentkirályi
+Enes Solak
+Nathan Rossi
+Thomas van der Berg
+Luca Cherubin


@@ -1,3 +1,64 @@
+version 2018.07.21
+
+Core
++ [utils] Introduce url_or_none
+* [utils] Allow JSONP without function name (#17028)
++ [extractor/common] Extract DASH and MSS formats from SMIL manifests
+
+Extractors
++ [bbc] Add support for BBC Radio Play pages (#17022)
+* [iwara] Fix download URLs (#17026)
+* [vrtnu] Relax title extraction and extract JSON-LD (#17018)
++ [viu] Pass Referer and Origin headers and area id (#16992)
++ [vimeo] Add another config regular expression (#17013)
++ [facebook] Extract view count (#16942)
+* [dailymotion] Improve description extraction (#16984)
+* [slutload] Fix and improve extraction (#17001)
+* [mediaset] Fix extraction (#16977)
++ [theplatform] Add support for theplatform TLD customization (#16977)
+* [imgur] Relax URL regular expression (#16987)
+* [pornhub] Improve extraction and extract all formats (#12166, #15891, #16262,
+  #16959)
+
+
+version 2018.07.10
+
+Core
+* [utils] Share JSON-LD regular expression
+* [downloader/dash] Improve error handling (#16927)
+
+Extractors
++ [nrktv] Add support for new season and serie URL schema
++ [nrktv] Add support for new episode URL schema (#16909)
++ [frontendmasters] Add support for frontendmasters.com (#3661, #16328)
+* [funk] Fix extraction (#16918)
+* [watchbox] Fix extraction (#16904)
+* [dplayit] Sort formats
+* [dplayit] Fix extraction (#16901)
+* [youtube] Improve login error handling (#13822)
+
+
+version 2018.07.04
+
+Core
+* [extractor/common] Properly escape % in MPD templates (#16867)
+* [extractor/common] Use source URL as Referer for HTML5 entries (16849)
+* Prefer ffmpeg over avconv by default (#8622)
+
+Extractors
+* [pluralsight] Switch to graphql (#16889, #16895, #16896, #16899)
+* [lynda] Simplify login and improve error capturing (#16891)
++ [go90] Add support for embed URLs (#16873)
+* [go90] Detect geo restriction error and pass geo verification headers
+  (#16874)
+* [vlive] Fix live streams extraction (#16871)
+* [npo] Fix typo (#16872)
++ [mediaset] Add support for new videos and extract all formats (#16568)
+* [dctptv] Restore extraction based on REST API (#16850)
+* [svt] Improve extraction and add support for pages (#16802)
+* [porncom] Fix extraction (#16808)
+
+
 version 2018.06.25
 
 Extractors
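
Note: the `[utils] Allow JSONP without function name` entry means `strip_jsonp` now also unwraps anonymous wrappers such as `({"status": "success"});` (see the new test in `test/test_utils.py` further down). A rough, self-contained sketch of that behaviour, not the shipped implementation:

```python
import json
import re

def strip_jsonp_sketch(code):
    # Make the callback name optional so both 'cb({...});'
    # and the bare '({...});' form unwrap to the JSON payload.
    return re.sub(r'(?s)^[\w.$]*\s*\(\s*(?P<data>.*)\)\s*;?\s*$',
                  r'\g<data>', code)

assert json.loads(strip_jsonp_sketch('cb({"status": "success"});')) == \
    {'status': 'success'}
assert json.loads(strip_jsonp_sketch('({"status": "success"});')) == \
    {'status': 'success'}
```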


@@ -17,7 +17,7 @@ youtube-dl - download videos from youtube.com or other video platforms
 
 # INSTALLATION
 
-To install it right away for all UNIX users (Linux, OS X, etc.), type:
+To install it right away for all UNIX users (Linux, macOS, etc.), type:
 
     sudo curl -L https://yt-dl.org/downloads/latest/youtube-dl -o /usr/local/bin/youtube-dl
    sudo chmod a+rx /usr/local/bin/youtube-dl
@@ -35,7 +35,7 @@ You can also use pip:
 
 This command will update youtube-dl if you have already installed it. See the [pypi page](https://pypi.python.org/pypi/youtube_dl) for more information.
 
-OS X users can install youtube-dl with [Homebrew](https://brew.sh/):
+macOS users can install youtube-dl with [Homebrew](https://brew.sh/):
 
    brew install youtube-dl
@@ -427,9 +427,9 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo
                                      default; fix file if we can, warn
                                      otherwise)
     --prefer-avconv                  Prefer avconv over ffmpeg for running the
-                                     postprocessors (default)
+                                     postprocessors
     --prefer-ffmpeg                  Prefer ffmpeg over avconv for running the
-                                     postprocessors
+                                     postprocessors (default)
     --ffmpeg-location PATH           Location of the ffmpeg/avconv binary;
                                      either the path to the binary or its
                                      containing directory.
@@ -442,7 +442,7 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo
 
 # CONFIGURATION
 
-You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux and OS X, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`. Note that by default configuration file may not exist so you may need to create it yourself.
+You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux and macOS, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`. Note that by default configuration file may not exist so you may need to create it yourself.
 
 For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime, use a proxy and save all videos under `Movies` directory in your home directory:
 ```
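
Note: the README's sample configuration block is truncated in this view. Purely for illustration (not the file's verbatim example), a config matching the description above could look like:

```
# Always extract audio
-x
# Do not copy the mtime
--no-mtime
# Use this proxy
--proxy 127.0.0.1:3128
# Save all videos under Movies directory in your home directory
-o ~/Movies/%(title)s.%(ext)s
```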


@@ -302,6 +302,9 @@
  - **Freesound**
  - **freespeech.org**
  - **FreshLive**
+ - **FrontendMasters**
+ - **FrontendMastersCourse**
+ - **FrontendMastersLesson**
  - **Funimation**
  - **FunkChannel**
  - **FunkMix**
@@ -589,7 +592,9 @@
  - **NRKSkole**: NRK Skole
  - **NRKTV**: NRK TV and NRK Radio
  - **NRKTVDirekte**: NRK TV Direkte and NRK Radio Direkte
+ - **NRKTVEpisode**
  - **NRKTVEpisodes**
+ - **NRKTVSeason**
  - **NRKTVSeries**
  - **ntv.ru**
  - **Nuvid**
@@ -813,6 +818,7 @@
  - **StretchInternet**
  - **SunPorno**
  - **SVT**
+ - **SVTPage**
  - **SVTPlay**: SVT Play and Öppet arkiv
  - **SVTSeries**
  - **SWRMediathek**


@@ -78,6 +78,7 @@ from youtube_dl.utils import (
     uppercase_escape,
     lowercase_escape,
     url_basename,
+    url_or_none,
     base_url,
     urljoin,
     urlencode_postdata,
@@ -507,6 +508,16 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(urljoin('http://foo.de/', ['foobar']), None)
         self.assertEqual(urljoin('http://foo.de/a/b/c.txt', '.././../d.txt'), 'http://foo.de/d.txt')
 
+    def test_url_or_none(self):
+        self.assertEqual(url_or_none(None), None)
+        self.assertEqual(url_or_none(''), None)
+        self.assertEqual(url_or_none('foo'), None)
+        self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de')
+        self.assertEqual(url_or_none('https://foo.de'), 'https://foo.de')
+        self.assertEqual(url_or_none('http$://foo.de'), None)
+        self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de')
+        self.assertEqual(url_or_none('//foo.de'), '//foo.de')
+
     def test_parse_age_limit(self):
         self.assertEqual(parse_age_limit(None), None)
         self.assertEqual(parse_age_limit(False), None)
@@ -717,6 +728,10 @@ class TestUtil(unittest.TestCase):
         d = json.loads(stripped)
         self.assertEqual(d, {'status': 'success'})
 
+        stripped = strip_jsonp('({"status": "success"});')
+        d = json.loads(stripped)
+        self.assertEqual(d, {'status': 'success'})
+
     def test_uppercase_escape(self):
         self.assertEqual(uppercase_escape(''), '')
         self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐')
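
Note: `url_or_none` only has to accept strings that look like HTTP(S) or protocol-relative URLs. A minimal sketch consistent with the assertions above (the shipped helper lives in `youtube_dl/utils.py`; this is an illustration, not that code):

```python
import re

def url_or_none_sketch(url):
    # None/empty/non-URL strings come back as None; URL-looking
    # strings (scheme:// or //) come back unchanged.
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None

assert url_or_none_sketch('http$://foo.de') is None
assert url_or_none_sketch('//foo.de') == '//foo.de'
```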


@@ -2,7 +2,10 @@ from __future__ import unicode_literals
 
 from .fragment import FragmentFD
 from ..compat import compat_urllib_error
-from ..utils import urljoin
+from ..utils import (
+    DownloadError,
+    urljoin,
+)
 
 
 class DashSegmentsFD(FragmentFD):
@@ -57,6 +60,14 @@ class DashSegmentsFD(FragmentFD):
                     count += 1
                     if count <= fragment_retries:
                         self.report_retry_fragment(err, frag_index, count, fragment_retries)
+                except DownloadError:
+                    # Don't retry fragment if error occurred during HTTP downloading
+                    # itself since it has own retry settings
+                    if not fatal:
+                        self.report_skip_fragment(frag_index)
+                        break
+                    raise
                 if count > fragment_retries:
                     if not fatal:
                         self.report_skip_fragment(frag_index)
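
Note: the new `except DownloadError` branch short-circuits the per-fragment retry loop: HTTP-level errors are retried up to `--fragment-retries` times, while a `DownloadError` raised by the inner HTTP downloader (which has already applied its own retry settings) is not retried again. A self-contained toy model of that control flow; names mirror the diff, everything else is illustrative:

```python
class DownloadError(Exception):
    """Raised by the inner downloader after its own retries are spent."""

class TransientHTTPError(Exception):
    """Stands in for compat_urllib_error.HTTPError here."""

def fetch_fragment(download, fragment_retries=3, fatal=True):
    count = 0
    while count <= fragment_retries:
        try:
            return download()
        except TransientHTTPError:
            count += 1  # retry the fragment itself
        except DownloadError:
            if not fatal:
                return None  # skip this fragment, keep the download going
            raise
    raise DownloadError('fragment failed after %d retries' % fragment_retries)
```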


@@ -7,6 +7,7 @@ from .turner import TurnerBaseIE
 from ..utils import (
     int_or_none,
     strip_or_none,
+    url_or_none,
 )
@@ -98,7 +99,7 @@ class AdultSwimIE(TurnerBaseIE):
         if not video_id:
             entries = []
             for episode in video_data.get('archiveEpisodes', []):
-                episode_url = episode.get('url')
+                episode_url = url_or_none(episode.get('url'))
                 if not episode_url:
                     continue
                 entries.append(self.url_result(


@@ -9,6 +9,7 @@ from ..utils import (
     determine_ext,
     ExtractorError,
     int_or_none,
+    url_or_none,
     urlencode_postdata,
     xpath_text,
 )
@@ -304,7 +305,7 @@ class AfreecaTVIE(InfoExtractor):
                 file_elements = video_element.findall(compat_xpath('./file'))
                 one = len(file_elements) == 1
                 for file_num, file_element in enumerate(file_elements, start=1):
-                    file_url = file_element.text
+                    file_url = url_or_none(file_element.text)
                     if not file_url:
                         continue
                     key = file_element.get('key', '')


@@ -3,11 +3,12 @@ from __future__ import unicode_literals
 
 from .common import InfoExtractor
 from ..utils import (
-    int_or_none,
-    parse_iso8601,
-    mimetype2ext,
     determine_ext,
     ExtractorError,
+    int_or_none,
+    mimetype2ext,
+    parse_iso8601,
+    url_or_none,
 )
@@ -35,7 +36,7 @@ class AMPIE(InfoExtractor):
                 media_thumbnail = [media_thumbnail]
             for thumbnail_data in media_thumbnail:
                 thumbnail = thumbnail_data.get('@attributes', {})
-                thumbnail_url = thumbnail.get('url')
+                thumbnail_url = url_or_none(thumbnail.get('url'))
                 if not thumbnail_url:
                     continue
                 thumbnails.append({
@@ -51,7 +52,7 @@ class AMPIE(InfoExtractor):
                 media_subtitle = [media_subtitle]
             for subtitle_data in media_subtitle:
                 subtitle = subtitle_data.get('@attributes', {})
-                subtitle_href = subtitle.get('href')
+                subtitle_href = url_or_none(subtitle.get('href'))
                 if not subtitle_href:
                     continue
                 subtitles.setdefault(subtitle.get('lang') or 'en', []).append({
@@ -65,7 +66,7 @@ class AMPIE(InfoExtractor):
                 media_content = [media_content]
             for media_data in media_content:
                 media = media_data.get('@attributes', {})
-                media_url = media.get('url')
+                media_url = url_or_none(media.get('url'))
                 if not media_url:
                     continue
                 ext = mimetype2ext(media.get('type')) or determine_ext(media_url)
@@ -79,7 +80,7 @@ class AMPIE(InfoExtractor):
             else:
                 formats.append({
                     'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'),
-                    'url': media['url'],
+                    'url': media_url,
                     'tbr': int_or_none(media.get('bitrate')),
                     'filesize': int_or_none(media.get('fileSize')),
                     'ext': ext,


@@ -8,6 +8,7 @@ from ..utils import (
     determine_ext,
     extract_attributes,
     ExtractorError,
+    url_or_none,
     urlencode_postdata,
     urljoin,
 )
@@ -165,7 +166,7 @@ class AnimeOnDemandIE(InfoExtractor):
                 }, fatal=False)
                 if not playlist:
                     continue
-                stream_url = playlist.get('streamurl')
+                stream_url = url_or_none(playlist.get('streamurl'))
                 if stream_url:
                     rtmp = re.search(
                         r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+/))(?P<playpath>mp[34]:.+)',


@@ -7,6 +7,7 @@ from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
     int_or_none,
+    url_or_none,
 )
@@ -77,7 +78,7 @@ class AolIE(InfoExtractor):
             formats.extend(self._extract_m3u8_formats(
                 m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
         for rendition in video_data.get('renditions', []):
-            video_url = rendition.get('url')
+            video_url = url_or_none(rendition.get('url'))
             if not video_url:
                 continue
             ext = rendition.get('format')


@@ -4,10 +4,10 @@ from __future__ import unicode_literals
 
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     determine_ext,
     js_to_json,
+    url_or_none,
 )
@@ -68,8 +68,8 @@ class APAIE(InfoExtractor):
         for source in sources:
             if not isinstance(source, dict):
                 continue
-            source_url = source.get('file')
-            if not source_url or not isinstance(source_url, compat_str):
+            source_url = url_or_none(source.get('file'))
+            if not source_url:
                 continue
             ext = determine_ext(source_url)
             if ext == 'm3u8':


@@ -5,6 +5,7 @@ from .common import InfoExtractor
 from ..utils import (
     int_or_none,
     mimetype2ext,
+    url_or_none,
 )
@@ -43,7 +44,7 @@ class AparatIE(InfoExtractor):
         formats = []
         for item in file_list[0]:
-            file_url = item.get('file')
+            file_url = url_or_none(item.get('file'))
             if not file_url:
                 continue
             ext = mimetype2ext(item.get('type'))


@@ -5,7 +5,6 @@ import re
 
 from .common import InfoExtractor
 from .generic import GenericIE
-from ..compat import compat_str
 from ..utils import (
     determine_ext,
     ExtractorError,
@@ -15,6 +14,7 @@ from ..utils import (
     unified_strdate,
     xpath_text,
     update_url_query,
+    url_or_none,
 )
 from ..compat import compat_etree_fromstring
@@ -100,7 +100,7 @@ class ARDMediathekIE(InfoExtractor):
             quality = stream.get('_quality')
             server = stream.get('_server')
             for stream_url in stream_urls:
-                if not isinstance(stream_url, compat_str) or '//' not in stream_url:
+                if not url_or_none(stream_url):
                     continue
                 ext = determine_ext(stream_url)
                 if quality != 'auto' and ext in ('f4m', 'm3u8'):


@@ -19,6 +19,7 @@ from ..utils import (
     unescapeHTML,
     update_url_query,
     unified_strdate,
+    url_or_none,
 )
@@ -131,8 +132,8 @@ class BandcampIE(InfoExtractor):
                     fatal=False)
                 if not stat:
                     continue
-                retry_url = stat.get('retry_url')
-                if not isinstance(retry_url, compat_str):
+                retry_url = url_or_none(stat.get('retry_url'))
+                if not retry_url:
                     continue
                 formats.append({
                     'url': self._proto_relative_url(retry_url, 'http:'),
@@ -306,7 +307,7 @@ class BandcampWeeklyIE(InfoExtractor):
         formats = []
         for format_id, format_url in show['audio_stream'].items():
-            if not isinstance(format_url, compat_str):
+            if not url_or_none(format_url):
                 continue
             for known_ext in KNOWN_EXTENSIONS:
                 if known_ext in format_id:


@@ -778,6 +778,17 @@ class BBCIE(BBCCoUkIE):
         'params': {
             'skip_download': True,
         }
+    }, {
+        # window.__PRELOADED_STATE__
+        'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
+        'info_dict': {
+            'id': 'b0b9z4vz',
+            'ext': 'mp4',
+            'title': 'Prom 6: An American in Paris and Turangalila',
+            'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
+            'uploader': 'Radio 3',
+            'uploader_id': 'bbc_radio_three',
+        },
     }]
 
     @classmethod
@@ -1000,6 +1011,36 @@ class BBCIE(BBCCoUkIE):
                 'subtitles': subtitles,
             }
 
+        preload_state = self._parse_json(self._search_regex(
+            r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
+            'preload state', default='{}'), playlist_id, fatal=False)
+        if preload_state:
+            current_programme = preload_state.get('programmes', {}).get('current') or {}
+            programme_id = current_programme.get('id')
+            if current_programme and programme_id and current_programme.get('type') == 'playable_item':
+                title = current_programme.get('titles', {}).get('tertiary') or playlist_title
+                formats, subtitles = self._download_media_selector(programme_id)
+                self._sort_formats(formats)
+                synopses = current_programme.get('synopses') or {}
+                network = current_programme.get('network') or {}
+                duration = int_or_none(
+                    current_programme.get('duration', {}).get('value'))
+                thumbnail = None
+                image_url = current_programme.get('image_url')
+                if image_url:
+                    thumbnail = image_url.replace('{recipe}', '1920x1920')
+                return {
+                    'id': programme_id,
+                    'title': title,
+                    'description': dict_get(synopses, ('long', 'medium', 'short')),
+                    'thumbnail': thumbnail,
+                    'duration': duration,
+                    'uploader': network.get('short_title'),
+                    'uploader_id': network.get('id'),
+                    'formats': formats,
+                    'subtitles': subtitles,
+                }
+
         bbc3_config = self._parse_json(
             self._search_regex(
                 r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
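
Note: the lookups in this branch imply a `window.__PRELOADED_STATE__` payload shaped roughly like the following (values are illustrative, not captured from the page):

```json
{
  "programmes": {
    "current": {
      "type": "playable_item",
      "id": "b0b9z4vz",
      "titles": {"tertiary": "Prom 6: An American in Paris and Turangalila"},
      "synopses": {"short": "…"},
      "network": {"id": "bbc_radio_three", "short_title": "Radio 3"},
      "duration": {"value": 6300},
      "image_url": "https://example.invalid/images/{recipe}/p0xxxxxx.jpg"
    }
  }
}
```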


@@ -4,8 +4,10 @@ import re
 
 from .common import InfoExtractor
 from .youtube import YoutubeIE
-from ..compat import compat_str
-from ..utils import int_or_none
+from ..utils import (
+    int_or_none,
+    url_or_none,
+)
 
 
 class BreakIE(InfoExtractor):
@@ -55,8 +57,8 @@ class BreakIE(InfoExtractor):
 
         formats = []
         for video in content:
-            video_url = video.get('url')
-            if not video_url or not isinstance(video_url, compat_str):
+            video_url = url_or_none(video.get('url'))
+            if not video_url:
                 continue
             bitrate = int_or_none(self._search_regex(
                 r'(\d+)_kbps', video_url, 'tbr', default=None))


@@ -2,10 +2,10 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     int_or_none,
+    url_or_none,
 )
@@ -56,8 +56,8 @@ class CamModelsIE(InfoExtractor):
             for media in encodings:
                 if not isinstance(media, dict):
                     continue
-                media_url = media.get('location')
-                if not media_url or not isinstance(media_url, compat_str):
+                media_url = url_or_none(media.get('location'))
+                if not media_url:
                     continue
                 format_id_list = [format_id]


@@ -11,6 +11,7 @@ from ..utils import (
     strip_or_none,
     float_or_none,
     int_or_none,
+    merge_dicts,
     parse_iso8601,
 )
@@ -248,9 +249,13 @@ class VrtNUIE(GigyaBaseIE):
         webpage, urlh = self._download_webpage_handle(url, display_id)
 
-        title = self._html_search_regex(
+        info = self._search_json_ld(webpage, display_id, default={})
+
+        # title is optional here since it may be extracted by extractor
+        # that is delegated from here
+        title = strip_or_none(self._html_search_regex(
             r'(?ms)<h1 class="content__heading">(.+?)</h1>',
-            webpage, 'title').strip()
+            webpage, 'title', default=None))
 
         description = self._html_search_regex(
             r'(?ms)<div class="content__description">(.+?)</div>',
@@ -295,7 +300,7 @@ class VrtNUIE(GigyaBaseIE):
             # the first one
             video_id = list(video.values())[0].get('videoid')
 
-        return {
+        return merge_dicts(info, {
             '_type': 'url_transparent',
             'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id,
             'ie_key': CanvasIE.ie_key(),
@@ -307,4 +312,4 @@ class VrtNUIE(GigyaBaseIE):
             'season_number': season_number,
             'episode_number': episode_number,
             'release_date': release_date,
-        }
+        })


@@ -4,13 +4,13 @@ from __future__ import unicode_literals
 
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     clean_html,
     int_or_none,
     parse_duration,
     parse_iso8601,
     parse_resolution,
+    url_or_none,
 )
@@ -53,8 +53,8 @@ class CCMAIE(InfoExtractor):
         media_url = media['media']['url']
         if isinstance(media_url, list):
             for format_ in media_url:
-                format_url = format_.get('file')
-                if not format_url or not isinstance(format_url, compat_str):
+                format_url = url_or_none(format_.get('file'))
+                if not format_url:
                     continue
                 label = format_.get('label')
                 f = parse_resolution(label)


@@ -52,6 +52,7 @@ from ..utils import (
     GeoUtils,
     int_or_none,
     js_to_json,
+    JSON_LD_RE,
     mimetype2ext,
     orderedSet,
     parse_codecs,
@@ -1149,8 +1150,7 @@ class InfoExtractor(object):
     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
         json_ld = self._search_regex(
-            r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
-            html, 'JSON-LD', group='json_ld', **kwargs)
+            JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs)
         default = kwargs.get('default', NO_DEFAULT)
         if not json_ld:
             return default if default is not NO_DEFAULT else {}
@@ -1859,9 +1859,7 @@ class InfoExtractor(object):
                     'height': height,
                 })
                 formats.extend(m3u8_formats)
-                continue
-
-            if src_ext == 'f4m':
+            elif src_ext == 'f4m':
                 f4m_url = src_url
                 if not f4m_params:
                     f4m_params = {
@@ -1871,9 +1869,13 @@ class InfoExtractor(object):
                     f4m_url += '&' if '?' in f4m_url else '?'
                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
-                continue
-
-            if src_url.startswith('http') and self._is_valid_url(src, video_id):
+            elif src_ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    src_url, video_id, mpd_id='dash', fatal=False))
+            elif re.search(r'\.ism/[Mm]anifest', src_url):
+                formats.extend(self._extract_ism_formats(
+                    src_url, video_id, ism_id='mss', fatal=False))
+            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                 http_count += 1
                 formats.append({
                     'url': src_url,
@@ -1884,7 +1886,6 @@ class InfoExtractor(object):
                     'width': width,
                     'height': height,
                 })
-                continue
 
         return formats
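
Note: with this change, SMIL manifests that reference DASH (`.mpd`) or Microsoft Smooth Streaming (`.ism/Manifest`) sources are expanded via `_extract_mpd_formats` and `_extract_ism_formats` instead of being treated as plain HTTP downloads. An illustrative SMIL fragment (hosts and paths are made up) that would exercise the new branches:

```xml
<smil>
  <body>
    <switch>
      <!-- routed to _extract_mpd_formats (src_ext == 'mpd') -->
      <video src="https://example.invalid/stream/manifest.mpd"/>
      <!-- routed to _extract_ism_formats (matches \.ism/[Mm]anifest) -->
      <video src="https://example.invalid/stream.ism/Manifest"/>
    </switch>
  </body>
</smil>
```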


@@ -4,16 +4,14 @@ from __future__ import unicode_literals, division
 
 import re
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_str,
-    compat_HTTPError,
-)
+from ..compat import compat_HTTPError
 from ..utils import (
     determine_ext,
     float_or_none,
     int_or_none,
     parse_age_limit,
     parse_duration,
+    url_or_none,
     ExtractorError
 )
@@ -86,8 +84,8 @@ class CrackleIE(InfoExtractor):
                 for e in media['MediaURLs']:
                     if e.get('UseDRM') is True:
                         continue
-                    format_url = e.get('Path')
-                    if not format_url or not isinstance(format_url, compat_str):
+                    format_url = url_or_none(e.get('Path'))
+                    if not format_url:
                         continue
                     ext = determine_ext(format_url)
                     if ext == 'm3u8':
@@ -124,8 +122,8 @@ class CrackleIE(InfoExtractor):
                 for cc_file in cc_files:
                     if not isinstance(cc_file, dict):
                         continue
-                    cc_url = cc_file.get('Path')
-                    if not cc_url or not isinstance(cc_url, compat_str):
+                    cc_url = url_or_none(cc_file.get('Path'))
+                    if not cc_url:
                         continue
                     lang = cc_file.get('Locale') or 'en'
                     subtitles.setdefault(lang, []).append({'url': cc_url})


@@ -144,7 +144,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
         age_limit = self._rta_search(webpage)
 
-        description = self._og_search_description(webpage) or self._html_search_meta(
+        description = self._og_search_description(
+            webpage, default=None) or self._html_search_meta(
             'description', webpage, 'description')
 
         view_count_str = self._search_regex(


@@ -7,6 +7,7 @@ from ..utils import (
     float_or_none,
     int_or_none,
     unified_timestamp,
+    url_or_none,
 )
@@ -69,7 +70,7 @@ class DctpTvIE(InfoExtractor):
             endpoint = next(
                 server['endpoint']
                 for server in servers
-                if isinstance(server.get('endpoint'), compat_str) and
+                if url_or_none(server.get('endpoint')) and
                 'cloudfront' in server['endpoint'])
         else:
             endpoint = 'rtmpe://s2pqqn4u96e4j8.cloudfront.net/cfx/st/'
@@ -92,8 +93,8 @@ class DctpTvIE(InfoExtractor):
         for image in images:
             if not isinstance(image, dict):
                 continue
-            image_url = image.get('url')
-            if not image_url or not isinstance(image_url, compat_str):
+            image_url = url_or_none(image.get('url'))
+            if not image_url:
                 continue
             thumbnails.append({
                 'url': image_url,


@@ -3,7 +3,6 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     determine_ext,
     extract_attributes,
@@ -12,6 +11,7 @@ from ..utils import (
     parse_age_limit,
     remove_end,
     unescapeHTML,
+    url_or_none,
 )
@@ -69,9 +69,8 @@ class DiscoveryGoBaseIE(InfoExtractor):
         captions = stream.get('captions')
         if isinstance(captions, list):
             for caption in captions:
-                subtitle_url = caption.get('fileUrl')
-                if (not subtitle_url or not isinstance(subtitle_url, compat_str) or
-                        not subtitle_url.startswith('http')):
+                subtitle_url = url_or_none(caption.get('fileUrl'))
+                if not subtitle_url or not subtitle_url.startswith('http'):
                     continue
                 lang = caption.get('fileLang', 'en')
                 ext = determine_ext(subtitle_url)


@@ -21,6 +21,7 @@ from ..utils import (
     unified_strdate,
     unified_timestamp,
     update_url_query,
+    urljoin,
     USER_AGENTS,
 )
@@ -310,9 +311,11 @@ class DPlayItIE(InfoExtractor):
         if not info:
             info_url = self._search_regex(
-                r'url\s*[:=]\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)',
-                webpage, 'info url')
+                (r'playback_json_url\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
+                 r'url\s*[:=]\s*["\'](?P<url>(?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)'),
+                webpage, 'info url', group='url')
 
+            info_url = urljoin(url, info_url)
             video_id = info_url.rpartition('/')[-1]
 
             try:
@@ -322,6 +325,8 @@ class DPlayItIE(InfoExtractor):
                         'dplayit_token').value,
                     'Referer': url,
                 })
+                if isinstance(info, compat_str):
+                    info = self._parse_json(info, display_id)
             except ExtractorError as e:
                 if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403):
                     info = self._parse_json(e.cause.read().decode('utf-8'), display_id)
@@ -337,6 +342,7 @@ class DPlayItIE(InfoExtractor):
         formats = self._extract_m3u8_formats(
             hls_url, display_id, ext='mp4', entry_protocol='m3u8_native',
             m3u8_id='hls')
+        self._sort_formats(formats)
 
         series = self._html_search_regex(
             r'(?s)<h1[^>]+class=["\'].*?\bshow_title\b.*?["\'][^>]*>(.+?)</h1>',


@@ -7,7 +7,6 @@ import json
 from .common import InfoExtractor
 from ..compat import (
     compat_HTTPError,
-    compat_str,
     compat_urlparse,
 )
 from ..utils import (
@@ -17,6 +16,7 @@ from ..utils import (
     parse_age_limit,
     parse_duration,
     unified_timestamp,
+    url_or_none,
 )
@@ -139,8 +139,8 @@ class DramaFeverIE(DramaFeverBaseIE):
             for sub in subs:
                 if not isinstance(sub, dict):
                     continue
-                sub_url = sub.get('url')
-                if not sub_url or not isinstance(sub_url, compat_str):
+                sub_url = url_or_none(sub.get('url'))
+                if not sub_url:
                     continue
                 subtitles.setdefault(
                     sub.get('code') or sub.get('language') or 'en', []).append({
@@ -163,8 +163,8 @@ class DramaFeverIE(DramaFeverBaseIE):
         for format_id, format_dict in download_assets.items():
             if not isinstance(format_dict, dict):
                 continue
-            format_url = format_dict.get('url')
-            if not format_url or not isinstance(format_url, compat_str):
+            format_url = url_or_none(format_dict.get('url'))
+            if not format_url:
                 continue
             formats.append({
                 'url': format_url,


@@ -4,14 +4,12 @@ from __future__ import unicode_literals
 
 import re
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_HTTPError,
-    compat_str,
-)
+from ..compat import compat_HTTPError
 from ..utils import (
     ExtractorError,
     int_or_none,
     unsmuggle_url,
+    url_or_none,
 )
@@ -177,7 +175,7 @@ class EaglePlatformIE(InfoExtractor):
             video_id, 'Downloading mp4 JSON', fatal=False)
         if mp4_data:
             for format_id, format_url in mp4_data.get('data', {}).items():
-                if not isinstance(format_url, compat_str):
+                if not url_or_none(format_url):
                     continue
                 height = int_or_none(format_id)
                 if height is not None and m3u8_formats_dict.get(height):


@@ -8,6 +8,7 @@ from ..utils import (
     int_or_none,
     try_get,
     unified_timestamp,
+    url_or_none,
 )
@@ -34,8 +35,8 @@ class EggheadCourseIE(InfoExtractor):
 
         entries = []
         for lesson in lessons:
-            lesson_url = lesson.get('http_url')
-            if not lesson_url or not isinstance(lesson_url, compat_str):
+            lesson_url = url_or_none(lesson.get('http_url'))
+            if not lesson_url:
                 continue
             lesson_id = lesson.get('id')
             if lesson_id:
@@ -95,7 +96,8 @@ class EggheadLessonIE(InfoExtractor):
         formats = []
         for _, format_url in lesson['media_urls'].items():
-            if not format_url or not isinstance(format_url, compat_str):
+            format_url = url_or_none(format_url)
+            if not format_url:
                 continue
             ext = determine_ext(format_url)
             if ext == 'm3u8':


@@ -11,6 +11,7 @@ from ..utils import (
     int_or_none,
     parse_duration,
     str_to_int,
+    url_or_none,
 )
@@ -82,8 +83,8 @@ class EpornerIE(InfoExtractor):
             for format_id, format_dict in formats_dict.items():
                 if not isinstance(format_dict, dict):
                     continue
-                src = format_dict.get('src')
-                if not isinstance(src, compat_str) or not src.startswith('http'):
+                src = url_or_none(format_dict.get('src'))
+                if not src or not src.startswith('http'):
                     continue
                 if kind == 'hls':
                     formats.extend(self._extract_m3u8_formats(


@@ -390,6 +390,11 @@ from .francetv import (
 from .freesound import FreesoundIE
 from .freespeech import FreespeechIE
 from .freshlive import FreshLiveIE
+from .frontendmasters import (
+    FrontendMastersIE,
+    FrontendMastersLessonIE,
+    FrontendMastersCourseIE
+)
 from .funimation import FunimationIE
 from .funk import (
     FunkMixIE,
@@ -763,7 +768,9 @@ from .nrk import (
     NRKSkoleIE,
     NRKTVIE,
     NRKTVDirekteIE,
+    NRKTVEpisodeIE,
     NRKTVEpisodesIE,
+    NRKTVSeasonIE,
     NRKTVSeriesIE,
 )
 from .ntvde import NTVDeIE
@@ -853,6 +860,10 @@ from .pornhub import (
 from .pornotube import PornotubeIE
 from .pornovoisines import PornoVoisinesIE
 from .pornoxo import PornoXOIE
+from .puhutv import (
+    PuhuTVIE,
+    PuhuTVSerieIE,
+)
 from .presstv import PressTVIE
 from .primesharetv import PrimeShareTVIE
 from .promptfile import PromptFileIE


@@ -20,6 +20,7 @@ from ..utils import (
     int_or_none,
     js_to_json,
     limit_length,
+    parse_count,
     sanitized_Request,
     try_get,
     urlencode_postdata,
@@ -75,7 +76,7 @@ class FacebookIE(InfoExtractor):
         'info_dict': {
             'id': '274175099429670',
             'ext': 'mp4',
-            'title': 'Asif Nawab Butt posted a video to his Timeline.',
+            'title': 're:^Asif Nawab Butt posted a video',
             'uploader': 'Asif Nawab Butt',
             'upload_date': '20140506',
             'timestamp': 1399398998,
@@ -133,7 +134,7 @@ class FacebookIE(InfoExtractor):
     }, {
         # have 1080P, but only up to 720p in swf params
         'url': 'https://www.facebook.com/cnn/videos/10155529876156509/',
-        'md5': '0d9813160b146b3bc8744e006027fcc6',
+        'md5': '9571fae53d4165bbbadb17a94651dcdc',
         'info_dict': {
             'id': '10155529876156509',
             'ext': 'mp4',
@@ -142,6 +143,7 @@ class FacebookIE(InfoExtractor):
             'upload_date': '20161030',
             'uploader': 'CNN',
             'thumbnail': r're:^https?://.*',
+            'view_count': int,
         },
     }, {
         # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall
@@ -149,7 +151,7 @@ class FacebookIE(InfoExtractor):
         'info_dict': {
             'id': '1417995061575415',
             'ext': 'mp4',
-            'title': 'md5:a7b86ca673f51800cd54687b7f4012fe',
+            'title': 'md5:1db063d6a8c13faa8da727817339c857',
             'timestamp': 1486648217,
             'upload_date': '20170209',
             'uploader': 'Yaroslav Korpan',
@@ -176,7 +178,7 @@ class FacebookIE(InfoExtractor):
         'info_dict': {
             'id': '1396382447100162',
             'ext': 'mp4',
-            'title': 'md5:e2d2700afdf84e121f5d0f999bad13a3',
+            'title': 'md5:19a428bbde91364e3de815383b54a235',
             'timestamp': 1486035494,
             'upload_date': '20170202',
             'uploader': 'Elisabeth Ahtn',
@@ -426,6 +428,10 @@ class FacebookIE(InfoExtractor):
                 'timestamp', default=None))
         thumbnail = self._og_search_thumbnail(webpage)
 
+        view_count = parse_count(self._search_regex(
+            r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count',
+            default=None))
+
         info_dict = {
             'id': video_id,
             'title': video_title,
@@ -433,6 +439,7 @@ class FacebookIE(InfoExtractor):
             'uploader': uploader,
             'timestamp': timestamp,
             'thumbnail': thumbnail,
+            'view_count': view_count,
         }
 
         return webpage, info_dict


@@ -10,6 +10,7 @@ from ..utils import (
     int_or_none,
     qualities,
     unified_strdate,
+    url_or_none,
 )
@@ -88,8 +89,8 @@ class FirstTVIE(InfoExtractor):
             formats = []
             path = None
             for f in item.get('mbr', []):
-                src = f.get('src')
-                if not src or not isinstance(src, compat_str):
+                src = url_or_none(f.get('src'))
+                if not src:
                     continue
                 tbr = int_or_none(self._search_regex(
                     r'_(\d{3,})\.mp4', src, 'tbr', default=None))


@@ -16,6 +16,7 @@ from ..utils import (
     int_or_none,
     parse_duration,
     try_get,
+    url_or_none,
 )
 from .dailymotion import DailymotionIE
@@ -115,14 +116,13 @@ class FranceTVIE(InfoExtractor):
 
         def sign(manifest_url, manifest_id):
             for host in ('hdfauthftv-a.akamaihd.net', 'hdfauth.francetv.fr'):
-                signed_url = self._download_webpage(
+                signed_url = url_or_none(self._download_webpage(
                     'https://%s/esi/TA' % host, video_id,
                     'Downloading signed %s manifest URL' % manifest_id,
                     fatal=False, query={
                         'url': manifest_url,
-                    })
-                if (signed_url and isinstance(signed_url, compat_str) and
-                        re.search(r'^(?:https?:)?//', signed_url)):
+                    }))
+                if signed_url:
                     return signed_url
             return manifest_url


@@ -0,0 +1,263 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
+from ..utils import (
+    ExtractorError,
+    parse_duration,
+    url_or_none,
+    urlencode_postdata,
+)
+
+
+class FrontendMastersBaseIE(InfoExtractor):
+    _API_BASE = 'https://api.frontendmasters.com/v1/kabuki'
+    _LOGIN_URL = 'https://frontendmasters.com/login/'
+
+    _NETRC_MACHINE = 'frontendmasters'
+
+    _QUALITIES = {
+        'low': {'width': 480, 'height': 360},
+        'mid': {'width': 1280, 'height': 720},
+        'high': {'width': 1920, 'height': 1080}
+    }
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+
+        login_page = self._download_webpage(
+            self._LOGIN_URL, None, 'Downloading login page')
+
+        login_form = self._hidden_inputs(login_page)
+
+        login_form.update({
+            'username': username,
+            'password': password
+        })
+
+        post_url = self._search_regex(
+            r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
+            'post_url', default=self._LOGIN_URL, group='url')
+
+        if not post_url.startswith('http'):
+            post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)
+
+        response = self._download_webpage(
+            post_url, None, 'Logging in', data=urlencode_postdata(login_form),
+            headers={'Content-Type': 'application/x-www-form-urlencoded'})
+
+        # Successful login
+        if any(p in response for p in (
+                'wp-login.php?action=logout', '>Logout')):
+            return
+
+        error = self._html_search_regex(
+            r'class=(["\'])(?:(?!\1).)*\bMessageAlert\b(?:(?!\1).)*\1[^>]*>(?P<error>[^<]+)<',
+            response, 'error message', default=None, group='error')
+        if error:
+            raise ExtractorError('Unable to login: %s' % error, expected=True)
+        raise ExtractorError('Unable to log in')
+
+
+class FrontendMastersPageBaseIE(FrontendMastersBaseIE):
+    def _download_course(self, course_name, url):
+        return self._download_json(
+            '%s/courses/%s' % (self._API_BASE, course_name), course_name,
+            'Downloading course JSON', headers={'Referer': url})
+
+    @staticmethod
+    def _extract_chapters(course):
+        chapters = []
+        lesson_elements = course.get('lessonElements')
+        if isinstance(lesson_elements, list):
+            chapters = [url_or_none(e) for e in lesson_elements if url_or_none(e)]
+        return chapters
+
+    @staticmethod
+    def _extract_lesson(chapters, lesson_id, lesson):
+        title = lesson.get('title') or lesson_id
+        display_id = lesson.get('slug')
+        description = lesson.get('description')
+        thumbnail = lesson.get('thumbnail')
+
+        chapter_number = None
+        index = lesson.get('index')
+        element_index = lesson.get('elementIndex')
+        if (isinstance(index, int) and isinstance(element_index, int) and
+                index < element_index):
+            chapter_number = element_index - index
+        chapter = (chapters[chapter_number - 1]
+                   if chapter_number - 1 < len(chapters) else None)
+
+        duration = None
+        timestamp = lesson.get('timestamp')
+        if isinstance(timestamp, compat_str):
+            mobj = re.search(
+                r'(?P<start>\d{1,2}:\d{1,2}:\d{1,2})\s*-(?P<end>\s*\d{1,2}:\d{1,2}:\d{1,2})',
+                timestamp)
+            if mobj:
+                duration = parse_duration(mobj.group('end')) - parse_duration(
+                    mobj.group('start'))
+
+        return {
+            '_type': 'url_transparent',
+            'url': 'frontendmasters:%s' % lesson_id,
+            'ie_key': FrontendMastersIE.ie_key(),
+            'id': lesson_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'chapter': chapter,
+            'chapter_number': chapter_number,
+        }
+
+
+class FrontendMastersIE(FrontendMastersBaseIE):
+    _VALID_URL = r'(?:frontendmasters:|https?://api\.frontendmasters\.com/v\d+/kabuki/video/)(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'https://api.frontendmasters.com/v1/kabuki/video/a2qogef6ba',
+        'md5': '7f161159710d6b7016a4f4af6fcb05e2',
+        'info_dict': {
+            'id': 'a2qogef6ba',
+            'ext': 'mp4',
+            'title': 'a2qogef6ba',
+        },
+        'skip': 'Requires FrontendMasters account credentials',
+    }, {
+        'url': 'frontendmasters:a2qogef6ba',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        lesson_id = self._match_id(url)
+
+        source_url = '%s/video/%s/source' % (self._API_BASE, lesson_id)
+
+        formats = []
+        for ext in ('webm', 'mp4'):
+            for quality in ('low', 'mid', 'high'):
+                resolution = self._QUALITIES[quality].copy()
+                format_id = '%s-%s' % (ext, quality)
+                format_url = self._download_json(
+                    source_url, lesson_id,
+                    'Downloading %s source JSON' % format_id, query={
+                        'f': ext,
+                        'r': resolution['height'],
+                    }, headers={
+                        'Referer': url,
+                    }, fatal=False)['url']
+                if not format_url:
+                    continue
+                f = resolution.copy()
+                f.update({
+                    'url': format_url,
+                    'ext': ext,
+                    'format_id': format_id,
+                })
+                formats.append(f)
+        self._sort_formats(formats)
+
+        subtitles = {
+            'en': [{
+                'url': '%s/transcripts/%s.vtt' % (self._API_BASE, lesson_id),
+            }]
+        }
+
+        return {
+            'id': lesson_id,
+            'title': lesson_id,
+            'formats': formats,
+            'subtitles': subtitles
+        }
+
+
+class FrontendMastersLessonIE(FrontendMastersPageBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<course_name>[^/]+)/(?P<lesson_name>[^/]+)'
+    _TEST = {
+        'url': 'https://frontendmasters.com/courses/web-development/tools',
+        'info_dict': {
+            'id': 'a2qogef6ba',
+            'display_id': 'tools',
+            'ext': 'mp4',
+            'title': 'Tools',
+            'description': 'md5:82c1ea6472e88ed5acd1829fe992e4f7',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'chapter': 'Introduction',
+            'chapter_number': 1,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'skip': 'Requires FrontendMasters account credentials',
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        course_name, lesson_name = mobj.group('course_name', 'lesson_name')
+
+        course = self._download_course(course_name, url)
+
+        lesson_id, lesson = next(
+            (video_id, data)
+            for video_id, data in course['lessonData'].items()
+            if data.get('slug') == lesson_name)
+
+        chapters = self._extract_chapters(course)
+        return self._extract_lesson(chapters, lesson_id, lesson)
+
+
+class FrontendMastersCourseIE(FrontendMastersPageBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'https://frontendmasters.com/courses/web-development/',
+        'info_dict': {
+            'id': 'web-development',
+            'title': 'Introduction to Web Development',
+            'description': 'md5:9317e6e842098bf725d62360e52d49a6',
+        },
+        'playlist_count': 81,
+        'skip': 'Requires FrontendMasters account credentials',
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        return False if FrontendMastersLessonIE.suitable(url) else super(
+            FrontendMastersBaseIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        course_name = self._match_id(url)
+
+        course = self._download_course(course_name, url)
+
+        chapters = self._extract_chapters(course)
+
+        lessons = sorted(
+            course['lessonData'].values(), key=lambda data: data['index'])
+
+        entries = []
+        for lesson in lessons:
+            lesson_name = lesson.get('slug')
+            if not lesson_name:
+                continue
+            lesson_id = lesson.get('hash') or lesson.get('statsId')
+            entries.append(self._extract_lesson(chapters, lesson_id, lesson))
+
+        title = course.get('title')
+        description = course.get('description')
+
+        return self.playlist_result(entries, course_name, title, description)
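
Note: since the site requires an account, the new extractors are driven like any other youtube-dl extractor with login options; for instance (credentials are placeholders):

```
youtube-dl --username you@example.com --password secret \
    https://frontendmasters.com/courses/web-development/tools
```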


@@ -5,6 +5,7 @@ import re
 
 from .common import InfoExtractor
 from .nexx import NexxIE
+from ..compat import compat_str
 from ..utils import (
     int_or_none,
     try_get,
@@ -12,6 +13,19 @@ from ..utils import (
 
 
 class FunkBaseIE(InfoExtractor):
+    _HEADERS = {
+        'Accept': '*/*',
+        'Accept-Language': 'en-US,en;q=0.9,ru;q=0.8',
+        'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoid2ViYXBwLXYzMSIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxuZXh4LWNvbnRlbnQtYXBpLXYzMSx3ZWJhcHAtYXBpIn0.mbuG9wS9Yf5q6PqgR4fiaRFIagiHk9JhwoKES7ksVX4',
+    }
+    _AUTH = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoid2ViYXBwLXYzMSIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxuZXh4LWNvbnRlbnQtYXBpLXYzMSx3ZWJhcHAtYXBpIn0.mbuG9wS9Yf5q6PqgR4fiaRFIagiHk9JhwoKES7ksVX4'
+
+    @staticmethod
+    def _make_headers(referer):
+        headers = FunkBaseIE._HEADERS.copy()
+        headers['Referer'] = referer
+        return headers
+
     def _make_url_result(self, video):
         return {
             '_type': 'url_transparent',
@@ -48,19 +62,19 @@ class FunkMixIE(FunkBaseIE):
 
         lists = self._download_json(
             'https://www.funk.net/api/v3.1/curation/curatedLists/',
-            mix_id, headers={
-                'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoiY3VyYXRpb24tdG9vbC12Mi4wIiwic2NvcGUiOiJzdGF0aWMtY29udGVudC1hcGksY3VyYXRpb24tc2VydmljZSxzZWFyY2gtYXBpIn0.SGCC1IXHLtZYoo8PvRKlU2gXH1su8YSu47sB3S4iXBI',
-                'Referer': url,
-            }, query={
+            mix_id, headers=self._make_headers(url), query={
                 'size': 100,
-            })['result']['lists']
+            })['_embedded']['curatedListList']
 
         metas = next(
             l for l in lists
             if mix_id in (l.get('entityId'), l.get('alias')))['videoMetas']
         video = next(
             meta['videoDataDelegate']
-            for meta in metas if meta.get('alias') == alias)
+            for meta in metas
+            if try_get(
+                meta, lambda x: x['videoDataDelegate']['alias'],
+                compat_str) == alias)
 
         return self._make_url_result(video)
@@ -104,25 +118,39 @@ class FunkChannelIE(FunkBaseIE):
         channel_id = mobj.group('id')
         alias = mobj.group('alias')
 
-        headers = {
-            'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoiY3VyYXRpb24tdG9vbCIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxzZWFyY2gtYXBpIn0.q4Y2xZG8PFHai24-4Pjx2gym9RmJejtmK6lMXP5wAgc',
-            'Referer': url,
-        }
+        headers = self._make_headers(url)
 
         video = None
 
-        by_id_list = self._download_json(
-            'https://www.funk.net/api/v3.0/content/videos/byIdList', channel_id,
-            headers=headers, query={
-                'ids': alias,
+        # Id-based channels are currently broken on their side: webplayer
+        # tries to process them via byChannelAlias endpoint and fails
+        # predictably.
+        by_channel_alias = self._download_json(
+            'https://www.funk.net/api/v3.1/webapp/videos/byChannelAlias/%s'
+            % channel_id,
+            'Downloading byChannelAlias JSON', headers=headers, query={
+                'size': 100,
             }, fatal=False)
-        if by_id_list:
-            video = try_get(by_id_list, lambda x: x['result'][0], dict)
+        if by_channel_alias:
+            video_list = try_get(
+                by_channel_alias, lambda x: x['_embedded']['videoList'], list)
+            if video_list:
+                video = next(r for r in video_list if r.get('alias') == alias)
+
+        if not video:
+            by_id_list = self._download_json(
+                'https://www.funk.net/api/v3.0/content/videos/byIdList',
+                channel_id, 'Downloading byIdList JSON', headers=headers,
+                query={
+                    'ids': alias,
+                }, fatal=False)
+            if by_id_list:
+                video = try_get(by_id_list, lambda x: x['result'][0], dict)
 
         if not video:
             results = self._download_json(
-                'https://www.funk.net/api/v3.0/content/videos/filter', channel_id,
-                headers=headers, query={
+                'https://www.funk.net/api/v3.0/content/videos/filter',
+                channel_id, 'Downloading filter JSON', headers=headers, query={
                     'channelId': channel_id,
                     'size': 100,
                 })['result']

@@ -32,6 +32,7 @@ from ..utils import (
     unified_strdate,
     unsmuggle_url,
     UnsupportedError,
+    url_or_none,
     xpath_text,
 )
 from .commonprotocols import RtmpIE
@@ -3130,8 +3131,8 @@ class GenericIE(InfoExtractor):
                 sources = [sources]
             formats = []
             for source in sources:
-                src = source.get('src')
-                if not src or not isinstance(src, compat_str):
+                src = url_or_none(source.get('src'))
+                if not src:
                     continue
                 src = compat_urlparse.urljoin(url, src)
                 src_type = source.get('type')


@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_HTTPError
 from ..utils import (
     determine_ext,
     ExtractorError,
@@ -14,8 +15,8 @@ from ..utils import (
 
 
 class Go90IE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?go90\.com/videos/(?P<id>[0-9a-zA-Z]+)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?go90\.com/(?:videos|embed)/(?P<id>[0-9a-zA-Z]+)'
+    _TESTS = [{
         'url': 'https://www.go90.com/videos/84BUqjLpf9D',
         'md5': 'efa7670dbbbf21a7b07b360652b24a32',
         'info_dict': {
@@ -27,15 +28,31 @@ class Go90IE(InfoExtractor):
             'upload_date': '20170411',
             'age_limit': 14,
         }
-    }
+    }, {
+        'url': 'https://www.go90.com/embed/261MflWkD3N',
+        'only_matching': True,
+    }]
+    _GEO_BYPASS = False
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        video_data = self._download_json(
-            'https://www.go90.com/api/view/items/' + video_id,
-            video_id, headers={
+
+        try:
+            headers = self.geo_verification_headers()
+            headers.update({
                 'Content-Type': 'application/json; charset=utf-8',
-            }, data=b'{"client":"web","device_type":"pc"}')
+            })
+            video_data = self._download_json(
+                'https://www.go90.com/api/view/items/' + video_id, video_id,
+                headers=headers, data=b'{"client":"web","device_type":"pc"}')
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+                message = self._parse_json(e.cause.read().decode(), None)['error']['message']
+                if 'region unavailable' in message:
+                    self.raise_geo_restricted(countries=['US'])
+                raise ExtractorError(message, expected=True)
+            raise
+
         if video_data.get('requires_drm'):
             raise ExtractorError('This video is DRM protected.', expected=True)
         main_video_asset = video_data['main_video_asset']

View File

@ -8,6 +8,7 @@ from ..compat import compat_str
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
url_or_none,
urlencode_postdata, urlencode_postdata,
) )
@ -80,8 +81,8 @@ class HiDiveIE(InfoExtractor):
bitrates = rendition.get('bitrates') bitrates = rendition.get('bitrates')
if not isinstance(bitrates, dict): if not isinstance(bitrates, dict):
continue continue
m3u8_url = bitrates.get('hls') m3u8_url = url_or_none(bitrates.get('hls'))
if not isinstance(m3u8_url, compat_str): if not m3u8_url:
continue continue
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
@ -93,9 +94,8 @@ class HiDiveIE(InfoExtractor):
if not isinstance(cc_file, list) or len(cc_file) < 3: if not isinstance(cc_file, list) or len(cc_file) < 3:
continue continue
cc_lang = cc_file[0] cc_lang = cc_file[0]
cc_url = cc_file[2] cc_url = url_or_none(cc_file[2])
if not isinstance(cc_lang, compat_str) or not isinstance( if not isinstance(cc_lang, compat_str) or not cc_url:
cc_url, compat_str):
continue continue
subtitles.setdefault(cc_lang, []).append({ subtitles.setdefault(cc_lang, []).append({
'url': cc_url, 'url': cc_url,

View File

@ -3,12 +3,12 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
mimetype2ext, mimetype2ext,
parse_duration, parse_duration,
qualities, qualities,
url_or_none,
) )
@ -61,8 +61,8 @@ class ImdbIE(InfoExtractor):
for encoding in video_metadata.get('encodings', []): for encoding in video_metadata.get('encodings', []):
if not encoding or not isinstance(encoding, dict): if not encoding or not isinstance(encoding, dict):
continue continue
video_url = encoding.get('videoUrl') video_url = url_or_none(encoding.get('videoUrl'))
if not video_url or not isinstance(video_url, compat_str): if not video_url:
continue continue
ext = determine_ext(video_url, mimetype2ext(encoding.get('mimeType'))) ext = determine_ext(video_url, mimetype2ext(encoding.get('mimeType')))
if ext == 'm3u8': if ext == 'm3u8':

View File

@ -12,7 +12,7 @@ from ..utils import (
class ImgurIE(InfoExtractor): class ImgurIE(InfoExtractor):
_VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|(?:topic|r)/[^/]+)/)?(?P<id>[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z]+)?$' _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|(?:topic|r)/[^/]+)/)?(?P<id>[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z0-9]+)?$'
_TESTS = [{ _TESTS = [{
'url': 'https://i.imgur.com/A61SaA1.gifv', 'url': 'https://i.imgur.com/A61SaA1.gifv',
@ -43,6 +43,9 @@ class ImgurIE(InfoExtractor):
}, { }, {
'url': 'http://imgur.com/r/aww/VQcQPhM', 'url': 'http://imgur.com/r/aww/VQcQPhM',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://i.imgur.com/crGpqCV.mp4',
'only_matching': True,
}] }]
def _real_extract(self, url): def _real_extract(self, url):

View File

@ -17,6 +17,7 @@ from ..utils import (
lowercase_escape, lowercase_escape,
std_headers, std_headers,
try_get, try_get,
url_or_none,
) )
@ -170,7 +171,7 @@ class InstagramIE(InfoExtractor):
node = try_get(edge, lambda x: x['node'], dict) node = try_get(edge, lambda x: x['node'], dict)
if not node: if not node:
continue continue
node_video_url = try_get(node, lambda x: x['video_url'], compat_str) node_video_url = url_or_none(node.get('video_url'))
if not node_video_url: if not node_video_url:
continue continue
entries.append({ entries.append({

View File

@ -20,6 +20,7 @@ from ..utils import (
merge_dicts, merge_dicts,
parse_duration, parse_duration,
smuggle_url, smuggle_url,
url_or_none,
xpath_with_ns, xpath_with_ns,
xpath_element, xpath_element,
xpath_text, xpath_text,
@ -250,8 +251,8 @@ class ITVIE(InfoExtractor):
for sub in subs: for sub in subs:
if not isinstance(sub, dict): if not isinstance(sub, dict):
continue continue
href = sub.get('Href') href = url_or_none(sub.get('Href'))
if isinstance(href, compat_str): if href:
extract_subtitle(href) extract_subtitle(href)
if not info.get('duration'): if not info.get('duration'):
info['duration'] = parse_duration(video_data.get('Duration')) info['duration'] = parse_duration(video_data.get('Duration'))

View File

@ -7,6 +7,7 @@ from ..utils import (
int_or_none, int_or_none,
mimetype2ext, mimetype2ext,
remove_end, remove_end,
url_or_none,
) )
@ -73,11 +74,14 @@ class IwaraIE(InfoExtractor):
formats = [] formats = []
for a_format in video_data: for a_format in video_data:
format_uri = url_or_none(a_format.get('uri'))
if not format_uri:
continue
format_id = a_format.get('resolution') format_id = a_format.get('resolution')
height = int_or_none(self._search_regex( height = int_or_none(self._search_regex(
r'(\d+)p', format_id, 'height', default=None)) r'(\d+)p', format_id, 'height', default=None))
formats.append({ formats.append({
'url': a_format['uri'], 'url': self._proto_relative_url(format_uri, 'https:'),
'format_id': format_id, 'format_id': format_id,
'ext': mimetype2ext(a_format.get('mime')) or 'mp4', 'ext': mimetype2ext(a_format.get('mime')) or 'mp4',
'height': height, 'height': height,

View File

@ -4,16 +4,14 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..aes import aes_decrypt_text from ..aes import aes_decrypt_text
from ..compat import ( from ..compat import compat_urllib_parse_unquote
compat_str,
compat_urllib_parse_unquote,
)
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
ExtractorError, ExtractorError,
int_or_none, int_or_none,
str_to_int, str_to_int,
strip_or_none, strip_or_none,
url_or_none,
) )
@ -55,7 +53,8 @@ class KeezMoviesIE(InfoExtractor):
encrypted = False encrypted = False
def extract_format(format_url, height=None): def extract_format(format_url, height=None):
if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//')): format_url = url_or_none(format_url)
if not format_url or not format_url.startswith(('http', '//')):
return return
if format_url in format_urls: if format_url in format_urls:
return return

View File

@ -2,11 +2,11 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
float_or_none, float_or_none,
int_or_none, int_or_none,
url_or_none,
) )
@ -109,7 +109,8 @@ class KonserthusetPlayIE(InfoExtractor):
captions = source.get('captionsAvailableLanguages') captions = source.get('captionsAvailableLanguages')
if isinstance(captions, dict): if isinstance(captions, dict):
for lang, subtitle_url in captions.items(): for lang, subtitle_url in captions.items():
if lang != 'none' and isinstance(subtitle_url, compat_str): subtitle_url = url_or_none(subtitle_url)
if lang != 'none' and subtitle_url:
subtitles.setdefault(lang, []).append({'url': subtitle_url}) subtitles.setdefault(lang, []).append({'url': subtitle_url})
return { return {

View File

@ -4,7 +4,6 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_HTTPError,
compat_str, compat_str,
compat_urlparse, compat_urlparse,
) )
@ -44,21 +43,15 @@ class LyndaBaseIE(InfoExtractor):
form_data = self._hidden_inputs(form_html) form_data = self._hidden_inputs(form_html)
form_data.update(extra_form_data) form_data.update(extra_form_data)
try: response = self._download_json(
response = self._download_json( action_url, None, note,
action_url, None, note, data=urlencode_postdata(form_data),
data=urlencode_postdata(form_data), headers={
headers={ 'Referer': referrer_url,
'Referer': referrer_url, 'X-Requested-With': 'XMLHttpRequest',
'X-Requested-With': 'XMLHttpRequest', }, expected_status=(418, 500, ))
})
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
response = self._parse_json(e.cause.read().decode('utf-8'), None)
self._check_error(response, ('email', 'password'))
raise
self._check_error(response, 'ErrorMessage') self._check_error(response, ('email', 'password', 'ErrorMessage'))
return response, action_url return response, action_url

View File

@ -3,75 +3,75 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .theplatform import ThePlatformBaseIE
from ..compat import compat_str
from ..utils import ( from ..utils import (
determine_ext, ExtractorError,
parse_duration, int_or_none,
try_get, update_url_query,
unified_strdate,
) )
class MediasetIE(InfoExtractor): class MediasetIE(ThePlatformBaseIE):
_TP_TLD = 'eu'
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
(?: (?:
mediaset:| mediaset:|
https?:// https?://
(?:www\.)?video\.mediaset\.it/ (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/
(?: (?:
(?:video|on-demand)/(?:[^/]+/)+[^/]+_| (?:video|on-demand)/(?:[^/]+/)+[^/]+_|
player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid= player/index\.html\?.*?\bprogramGuid=
) )
)(?P<id>[0-9]+) )(?P<id>[0-9A-Z]{16})
''' '''
_TESTS = [{ _TESTS = [{
# full episode # full episode
'url': 'http://www.video.mediaset.it/video/hello_goodbye/full/quarta-puntata_661824.html', 'url': 'https://www.mediasetplay.mediaset.it/video/hellogoodbye/quarta-puntata_FAFU000000661824',
'md5': '9b75534d42c44ecef7bf1ffeacb7f85d', 'md5': '9b75534d42c44ecef7bf1ffeacb7f85d',
'info_dict': { 'info_dict': {
'id': '661824', 'id': 'FAFU000000661824',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Quarta puntata', 'title': 'Quarta puntata',
'description': 'md5:7183696d6df570e3412a5ef74b27c5e2', 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'duration': 1414, 'duration': 1414.26,
'creator': 'mediaset',
'upload_date': '20161107', 'upload_date': '20161107',
'series': 'Hello Goodbye', 'series': 'Hello Goodbye',
'categories': ['reality'], 'timestamp': 1478532900,
'uploader': 'Rete 4',
'uploader_id': 'R4',
}, },
'expected_warnings': ['is not a supported codec'],
}, { }, {
'url': 'http://www.video.mediaset.it/video/matrix/full_chiambretti/puntata-del-25-maggio_846685.html', 'url': 'https://www.mediasetplay.mediaset.it/video/matrix/puntata-del-25-maggio_F309013801000501',
'md5': '1276f966ac423d16ba255ce867de073e', 'md5': '288532f0ad18307705b01e581304cd7b',
'info_dict': { 'info_dict': {
'id': '846685', 'id': 'F309013801000501',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Puntata del 25 maggio', 'title': 'Puntata del 25 maggio',
'description': 'md5:ee2e456e3eb1dba5e814596655bb5296', 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'duration': 6565, 'duration': 6565.007,
'creator': 'mediaset', 'upload_date': '20180526',
'upload_date': '20180525',
'series': 'Matrix', 'series': 'Matrix',
'categories': ['infotainment'], 'timestamp': 1527326245,
'uploader': 'Canale 5',
'uploader_id': 'C5',
}, },
'expected_warnings': ['HTTP Error 403: Forbidden'], 'expected_warnings': ['HTTP Error 403: Forbidden'],
}, { }, {
# clip # clip
'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html', 'url': 'https://www.mediasetplay.mediaset.it/video/gogglebox/un-grande-classico-della-commedia-sexy_FAFU000000661680',
'only_matching': True, 'only_matching': True,
}, { }, {
# iframe simple # iframe simple
'url': 'http://www.video.mediaset.it/player/playerIFrame.shtml?id=665924&autoplay=true', 'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665924&id=665924',
'only_matching': True, 'only_matching': True,
}, { }, {
# iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/) # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/)
'url': 'https://www.video.mediaset.it/player/playerIFrameTwitter.shtml?id=665104&playrelated=false&autoplay=false&related=true&hidesocial=true', 'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665104&id=665104',
'only_matching': True, 'only_matching': True,
}, { }, {
'url': 'mediaset:661824', 'url': 'mediaset:FAFU000000665924',
'only_matching': True, 'only_matching': True,
}] }]
@ -84,61 +84,54 @@ class MediasetIE(InfoExtractor):
webpage)] webpage)]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) guid = self._match_id(url)
tp_path = 'PR1GhC/media/guid/2702976343/' + guid
video = self._download_json( info = self._extract_theplatform_metadata(tp_path, guid)
'https://www.video.mediaset.it/html/metainfo.sjson',
video_id, 'Downloading media info', query={
'id': video_id
})['video']
title = video['title']
media_id = video.get('guid') or video_id
video_list = self._download_json(
'http://cdnsel01.mediaset.net/GetCdn2018.aspx',
video_id, 'Downloading video CDN JSON', query={
'streamid': media_id,
'format': 'json',
})['videoList']
formats = [] formats = []
for format_url in video_list: subtitles = {}
ext = determine_ext(format_url) first_e = None
if ext == 'm3u8': for asset_type in ('SD', 'HD'):
formats.extend(self._extract_m3u8_formats( for f in ('MPEG4', 'MPEG-DASH', 'M3U', 'ISM'):
format_url, video_id, 'mp4', entry_protocol='m3u8_native', try:
m3u8_id='hls', fatal=False)) tp_formats, tp_subtitles = self._extract_theplatform_smil(
elif ext == 'mpd': update_url_query('http://link.theplatform.%s/s/%s' % (self._TP_TLD, tp_path), {
formats.extend(self._extract_mpd_formats( 'mbr': 'true',
format_url, video_id, mpd_id='dash', fatal=False)) 'formats': f,
elif ext == 'ism' or '.ism' in format_url: 'assetTypes': asset_type,
formats.extend(self._extract_ism_formats( }), guid, 'Downloading %s %s SMIL data' % (f, asset_type))
format_url, video_id, ism_id='mss', fatal=False)) except ExtractorError as e:
else: if not first_e:
formats.append({ first_e = e
'url': format_url, break
'format_id': determine_ext(format_url), for tp_f in tp_formats:
}) tp_f['quality'] = 1 if asset_type == 'HD' else 0
formats.extend(tp_formats)
subtitles = self._merge_subtitles(subtitles, tp_subtitles)
if first_e and not formats:
raise first_e
self._sort_formats(formats) self._sort_formats(formats)
creator = try_get( fields = []
video, lambda x: x['brand-info']['publisher'], compat_str) for templ, repls in (('tvSeason%sNumber', ('', 'Episode')), ('mediasetprogram$%s', ('brandTitle', 'numberOfViews', 'publishInfo'))):
category = try_get( fields.extend(templ % repl for repl in repls)
video, lambda x: x['brand-info']['category'], compat_str) feed_data = self._download_json(
categories = [category] if category else None 'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs/guid/-/' + guid,
guid, fatal=False, query={'fields': ','.join(fields)})
if feed_data:
publish_info = feed_data.get('mediasetprogram$publishInfo') or {}
info.update({
'episode_number': int_or_none(feed_data.get('tvSeasonEpisodeNumber')),
'season_number': int_or_none(feed_data.get('tvSeasonNumber')),
'series': feed_data.get('mediasetprogram$brandTitle'),
'uploader': publish_info.get('description'),
'uploader_id': publish_info.get('channel'),
'view_count': int_or_none(feed_data.get('mediasetprogram$numberOfViews')),
})
return { info.update({
'id': video_id, 'id': guid,
'title': title,
'description': video.get('short-description'),
'thumbnail': video.get('thumbnail'),
'duration': parse_duration(video.get('duration')),
'creator': creator,
'upload_date': unified_strdate(video.get('production-date')),
'webpage_url': video.get('url'),
'series': video.get('brand-value'),
'season': video.get('season'),
'categories': categories,
'formats': formats, 'formats': formats,
} 'subtitles': subtitles,
})
return info

View File

@ -15,6 +15,7 @@ from ..utils import (
mimetype2ext, mimetype2ext,
unescapeHTML, unescapeHTML,
unsmuggle_url, unsmuggle_url,
url_or_none,
urljoin, urljoin,
) )
@ -156,8 +157,8 @@ class MediasiteIE(InfoExtractor):
stream_formats = [] stream_formats = []
for unum, VideoUrl in enumerate(video_urls): for unum, VideoUrl in enumerate(video_urls):
video_url = VideoUrl.get('Location') video_url = url_or_none(VideoUrl.get('Location'))
if not video_url or not isinstance(video_url, compat_str): if not video_url:
continue continue
# XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS # XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS

View File

@ -4,12 +4,18 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote from ..compat import (
compat_str,
compat_urllib_parse_unquote,
)
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
JSON_LD_RE,
NO_DEFAULT,
parse_age_limit, parse_age_limit,
parse_duration, parse_duration,
try_get,
) )
@ -359,6 +365,182 @@ class NRKTVIE(NRKBaseIE):
}] }]
class NRKTVEpisodeIE(InfoExtractor):
_VALID_URL = r'https?://tv\.nrk\.no/serie/(?P<id>[^/]+/sesong/\d+/episode/\d+)'
_TEST = {
'url': 'https://tv.nrk.no/serie/backstage/sesong/1/episode/8',
'info_dict': {
'id': 'MSUI14000816AA',
'ext': 'mp4',
'title': 'Backstage 8:30',
'description': 'md5:de6ca5d5a2d56849e4021f2bf2850df4',
'duration': 1320,
'series': 'Backstage',
'season_number': 1,
'episode_number': 8,
'episode': '8:30',
},
'params': {
'skip_download': True,
},
}
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
nrk_id = self._parse_json(
self._search_regex(JSON_LD_RE, webpage, 'JSON-LD', group='json_ld'),
display_id)['@id']
assert re.match(NRKTVIE._EPISODE_RE, nrk_id)
return self.url_result(
'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)
class NRKTVSerieBaseIE(InfoExtractor):
def _extract_series(self, webpage, display_id, fatal=True):
config = self._parse_json(
self._search_regex(
r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>', webpage, 'config',
default='{}' if not fatal else NO_DEFAULT),
display_id, fatal=False)
if not config:
return
return try_get(config, lambda x: x['series'], dict)
def _extract_episodes(self, season):
entries = []
if not isinstance(season, dict):
return entries
episodes = season.get('episodes')
if not isinstance(episodes, list):
return entries
for episode in episodes:
nrk_id = episode.get('prfId')
if not nrk_id or not isinstance(nrk_id, compat_str):
continue
entries.append(self.url_result(
'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id))
return entries
class NRKTVSeasonIE(NRKTVSerieBaseIE):
_VALID_URL = r'https?://tv\.nrk\.no/serie/[^/]+/sesong/(?P<id>\d+)'
_TEST = {
'url': 'https://tv.nrk.no/serie/backstage/sesong/1',
'info_dict': {
'id': '1',
'title': 'Sesong 1',
},
'playlist_mincount': 30,
}
@classmethod
def suitable(cls, url):
return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url)
else super(NRKTVSeasonIE, cls).suitable(url))
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
series = self._extract_series(webpage, display_id)
season = next(
s for s in series['seasons']
if int(display_id) == s.get('seasonNumber'))
title = try_get(season, lambda x: x['titles']['title'], compat_str)
return self.playlist_result(
self._extract_episodes(season), display_id, title)
class NRKTVSeriesIE(NRKTVSerieBaseIE):
_VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)'
_ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)'
_TESTS = [{
# new layout
'url': 'https://tv.nrk.no/serie/backstage',
'info_dict': {
'id': 'backstage',
'title': 'Backstage',
'description': 'md5:c3ec3a35736fca0f9e1207b5511143d3',
},
'playlist_mincount': 60,
}, {
# old layout
'url': 'https://tv.nrk.no/serie/groenn-glede',
'info_dict': {
'id': 'groenn-glede',
'title': 'Grønn glede',
'description': 'md5:7576e92ae7f65da6993cf90ee29e4608',
},
'playlist_mincount': 9,
}, {
'url': 'http://tv.nrksuper.no/serie/labyrint',
'info_dict': {
'id': 'labyrint',
'title': 'Labyrint',
'description': 'md5:58afd450974c89e27d5a19212eee7115',
},
'playlist_mincount': 3,
}, {
'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene',
'only_matching': True,
}, {
'url': 'https://tv.nrk.no/serie/saving-the-human-race',
'only_matching': True,
}, {
'url': 'https://tv.nrk.no/serie/postmann-pat',
'only_matching': True,
}]
@classmethod
def suitable(cls, url):
return (
False if any(ie.suitable(url)
for ie in (NRKTVIE, NRKTVEpisodeIE, NRKTVSeasonIE))
else super(NRKTVSeriesIE, cls).suitable(url))
def _real_extract(self, url):
series_id = self._match_id(url)
webpage = self._download_webpage(url, series_id)
# New layout (e.g. https://tv.nrk.no/serie/backstage)
series = self._extract_series(webpage, series_id, fatal=False)
if series:
title = try_get(series, lambda x: x['titles']['title'], compat_str)
description = try_get(
series, lambda x: x['titles']['subtitle'], compat_str)
entries = []
for season in series['seasons']:
entries.extend(self._extract_episodes(season))
return self.playlist_result(entries, series_id, title, description)
# Old layout (e.g. https://tv.nrk.no/serie/groenn-glede)
entries = [
self.url_result(
'https://tv.nrk.no/program/Episodes/{series}/{season}'.format(
series=series_id, season=season_id))
for season_id in re.findall(self._ITEM_RE, webpage)
]
title = self._html_search_meta(
'seriestitle', webpage,
'title', default=None) or self._og_search_title(
webpage, fatal=False)
description = self._html_search_meta(
'series_description', webpage,
'description', default=None) or self._og_search_description(webpage)
return self.playlist_result(entries, series_id, title, description)
class NRKTVDirekteIE(NRKTVIE): class NRKTVDirekteIE(NRKTVIE):
IE_DESC = 'NRK TV Direkte and NRK Radio Direkte' IE_DESC = 'NRK TV Direkte and NRK Radio Direkte'
_VALID_URL = r'https?://(?:tv|radio)\.nrk\.no/direkte/(?P<id>[^/?#&]+)' _VALID_URL = r'https?://(?:tv|radio)\.nrk\.no/direkte/(?P<id>[^/?#&]+)'
@ -438,64 +620,6 @@ class NRKTVEpisodesIE(NRKPlaylistBaseIE):
r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False) r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
class NRKTVSeriesIE(InfoExtractor):
_VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)'
_ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)'
_TESTS = [{
'url': 'https://tv.nrk.no/serie/groenn-glede',
'info_dict': {
'id': 'groenn-glede',
'title': 'Grønn glede',
'description': 'md5:7576e92ae7f65da6993cf90ee29e4608',
},
'playlist_mincount': 9,
}, {
'url': 'http://tv.nrksuper.no/serie/labyrint',
'info_dict': {
'id': 'labyrint',
'title': 'Labyrint',
'description': 'md5:58afd450974c89e27d5a19212eee7115',
},
'playlist_mincount': 3,
}, {
'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene',
'only_matching': True,
}, {
'url': 'https://tv.nrk.no/serie/saving-the-human-race',
'only_matching': True,
}, {
'url': 'https://tv.nrk.no/serie/postmann-pat',
'only_matching': True,
}]
@classmethod
def suitable(cls, url):
return False if NRKTVIE.suitable(url) else super(NRKTVSeriesIE, cls).suitable(url)
def _real_extract(self, url):
series_id = self._match_id(url)
webpage = self._download_webpage(url, series_id)
entries = [
self.url_result(
'https://tv.nrk.no/program/Episodes/{series}/{season}'.format(
series=series_id, season=season_id))
for season_id in re.findall(self._ITEM_RE, webpage)
]
title = self._html_search_meta(
'seriestitle', webpage,
'title', default=None) or self._og_search_title(
webpage, fatal=False)
description = self._html_search_meta(
'series_description', webpage,
'description', default=None) or self._og_search_description(webpage)
return self.playlist_result(entries, series_id, title, description)
class NRKSkoleIE(InfoExtractor): class NRKSkoleIE(InfoExtractor):
IE_DESC = 'NRK Skole' IE_DESC = 'NRK Skole'
_VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/?\?.*\bmediaId=(?P<id>\d+)' _VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/?\?.*\bmediaId=(?P<id>\d+)'

View File

@ -10,6 +10,7 @@ from ..utils import (
parse_resolution, parse_resolution,
try_get, try_get,
unified_timestamp, unified_timestamp,
url_or_none,
urljoin, urljoin,
) )
@ -200,8 +201,8 @@ class PeerTubeIE(InfoExtractor):
for file_ in video['files']: for file_ in video['files']:
if not isinstance(file_, dict): if not isinstance(file_, dict):
continue continue
file_url = file_.get('fileUrl') file_url = url_or_none(file_.get('fileUrl'))
if not file_url or not isinstance(file_url, compat_str): if not file_url:
continue continue
file_size = int_or_none(file_.get('size')) file_size = int_or_none(file_.get('size'))
format_id = try_get( format_id = try_get(

View File

@ -27,6 +27,60 @@ from ..utils import (
class PluralsightBaseIE(InfoExtractor): class PluralsightBaseIE(InfoExtractor):
_API_BASE = 'https://app.pluralsight.com' _API_BASE = 'https://app.pluralsight.com'
_GRAPHQL_EP = '%s/player/api/graphql' % _API_BASE
_GRAPHQL_HEADERS = {
'Content-Type': 'application/json;charset=UTF-8',
}
_GRAPHQL_COURSE_TMPL = '''
query BootstrapPlayer {
rpc {
bootstrapPlayer {
profile {
firstName
lastName
email
username
userHandle
authed
isAuthed
plan
}
course(courseId: "%s") {
name
title
courseHasCaptions
translationLanguages {
code
name
}
supportsWideScreenVideoFormats
timestamp
modules {
name
title
duration
formattedDuration
author
authorized
clips {
authorized
clipId
duration
formattedDuration
id
index
moduleIndex
moduleTitle
name
title
watched
}
}
}
}
}
}'''
def _download_course(self, course_id, url, display_id): def _download_course(self, course_id, url, display_id):
try: try:
return self._download_course_rpc(course_id, url, display_id) return self._download_course_rpc(course_id, url, display_id)
@ -39,20 +93,14 @@ class PluralsightBaseIE(InfoExtractor):
def _download_course_rpc(self, course_id, url, display_id): def _download_course_rpc(self, course_id, url, display_id):
response = self._download_json( response = self._download_json(
'%s/player/functions/rpc' % self._API_BASE, display_id, self._GRAPHQL_EP, display_id, data=json.dumps({
'Downloading course JSON', 'query': self._GRAPHQL_COURSE_TMPL % course_id,
data=json.dumps({ 'variables': {}
'fn': 'bootstrapPlayer', }).encode('utf-8'), headers=self._GRAPHQL_HEADERS)
'payload': {
'courseId': course_id,
},
}).encode('utf-8'),
headers={
'Content-Type': 'application/json;charset=utf-8',
'Referer': url,
})
course = try_get(response, lambda x: x['payload']['course'], dict) course = try_get(
response, lambda x: x['data']['rpc']['bootstrapPlayer']['course'],
dict)
if course: if course:
return course return course
@ -90,6 +138,28 @@ class PluralsightIE(PluralsightBaseIE):
'only_matching': True, 'only_matching': True,
}] }]
GRAPHQL_VIEWCLIP_TMPL = '''
query viewClip {
viewClip(input: {
author: "%(author)s",
clipIndex: %(clipIndex)d,
courseName: "%(courseName)s",
includeCaptions: %(includeCaptions)s,
locale: "%(locale)s",
mediaType: "%(mediaType)s",
moduleName: "%(moduleName)s",
quality: "%(quality)s"
}) {
urls {
url
cdn
rank
source
},
status
}
}'''
def _real_initialize(self): def _real_initialize(self):
self._login() self._login()
@ -277,7 +347,7 @@ class PluralsightIE(PluralsightBaseIE):
f = QUALITIES[quality].copy() f = QUALITIES[quality].copy()
clip_post = { clip_post = {
'author': author, 'author': author,
'includeCaptions': False, 'includeCaptions': 'false',
'clipIndex': int(clip_idx), 'clipIndex': int(clip_idx),
'courseName': course_name, 'courseName': course_name,
'locale': 'en', 'locale': 'en',
@ -286,11 +356,23 @@ class PluralsightIE(PluralsightBaseIE):
'quality': '%dx%d' % (f['width'], f['height']), 'quality': '%dx%d' % (f['width'], f['height']),
} }
format_id = '%s-%s' % (ext, quality) format_id = '%s-%s' % (ext, quality)
viewclip = self._download_json(
'%s/video/clips/viewclip' % self._API_BASE, display_id, try:
'Downloading %s viewclip JSON' % format_id, fatal=False, viewclip = self._download_json(
data=json.dumps(clip_post).encode('utf-8'), self._GRAPHQL_EP, display_id,
headers={'Content-Type': 'application/json;charset=utf-8'}) 'Downloading %s viewclip graphql' % format_id,
data=json.dumps({
'query': self.GRAPHQL_VIEWCLIP_TMPL % clip_post,
'variables': {}
}).encode('utf-8'),
headers=self._GRAPHQL_HEADERS)['data']['viewClip']
except ExtractorError:
# Still works but most likely will go soon
viewclip = self._download_json(
'%s/video/clips/viewclip' % self._API_BASE, display_id,
'Downloading %s viewclip JSON' % format_id, fatal=False,
data=json.dumps(clip_post).encode('utf-8'),
headers={'Content-Type': 'application/json;charset=utf-8'})
# Pluralsight tracks multiple sequential calls to ViewClip API and start # Pluralsight tracks multiple sequential calls to ViewClip API and start
# to return 429 HTTP errors after some time (see # to return 429 HTTP errors after some time (see

View File

@ -4,28 +4,21 @@ from __future__ import unicode_literals
import functools import functools
import itertools import itertools
import operator import operator
# import os
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_HTTPError, compat_HTTPError,
# compat_urllib_parse_unquote, compat_str,
# compat_urllib_parse_unquote_plus,
# compat_urllib_parse_urlparse,
) )
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
js_to_json, js_to_json,
orderedSet, orderedSet,
# sanitized_Request,
remove_quotes, remove_quotes,
str_to_int, str_to_int,
) )
# from ..aes import (
# aes_decrypt_text
# )
class PornHubIE(InfoExtractor): class PornHubIE(InfoExtractor):
@ -62,7 +55,7 @@ class PornHubIE(InfoExtractor):
'id': '1331683002', 'id': '1331683002',
'ext': 'mp4', 'ext': 'mp4',
'title': '重庆婷婷女王足交', 'title': '重庆婷婷女王足交',
'uploader': 'cj397186295', 'uploader': 'Unknown',
'duration': 1753, 'duration': 1753,
'view_count': int, 'view_count': int,
'like_count': int, 'like_count': int,
@ -121,7 +114,7 @@ class PornHubIE(InfoExtractor):
self._set_cookie('pornhub.com', 'platform', platform) self._set_cookie('pornhub.com', 'platform', platform)
return self._download_webpage( return self._download_webpage(
'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id, 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id,
video_id) video_id, 'Downloading %s webpage' % platform)
webpage = dl_webpage('pc') webpage = dl_webpage('pc')
@ -134,48 +127,19 @@ class PornHubIE(InfoExtractor):
'PornHub said: %s' % error_msg, 'PornHub said: %s' % error_msg,
expected=True, video_id=video_id) expected=True, video_id=video_id)
tv_webpage = dl_webpage('tv')
assignments = self._search_regex(
r'(var.+?mediastring.+?)</script>', tv_webpage,
'encoded url').split(';')
js_vars = {}
def parse_js_value(inp):
inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
if '+' in inp:
inps = inp.split('+')
return functools.reduce(
operator.concat, map(parse_js_value, inps))
inp = inp.strip()
if inp in js_vars:
return js_vars[inp]
return remove_quotes(inp)
for assn in assignments:
assn = assn.strip()
if not assn:
continue
assn = re.sub(r'var\s+', '', assn)
vname, value = assn.split('=', 1)
js_vars[vname] = parse_js_value(value)
video_url = js_vars['mediastring']
title = self._search_regex(
r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None)
# video_title from flashvars contains whitespace instead of non-ASCII (see # video_title from flashvars contains whitespace instead of non-ASCII (see
# http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
# on that anymore. # on that anymore.
title = title or self._html_search_meta( title = self._html_search_meta(
'twitter:title', webpage, default=None) or self._search_regex( 'twitter:title', webpage, default=None) or self._search_regex(
(r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)', (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)',
r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1', r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1',
r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'), r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'),
webpage, 'title', group='title') webpage, 'title', group='title')
video_urls = []
video_urls_set = set()
flashvars = self._parse_json( flashvars = self._parse_json(
self._search_regex( self._search_regex(
r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
@ -183,8 +147,78 @@ class PornHubIE(InfoExtractor):
if flashvars: if flashvars:
thumbnail = flashvars.get('image_url') thumbnail = flashvars.get('image_url')
duration = int_or_none(flashvars.get('video_duration')) duration = int_or_none(flashvars.get('video_duration'))
media_definitions = flashvars.get('mediaDefinitions')
if isinstance(media_definitions, list):
for definition in media_definitions:
if not isinstance(definition, dict):
continue
video_url = definition.get('videoUrl')
if not video_url or not isinstance(video_url, compat_str):
continue
if video_url in video_urls_set:
continue
video_urls_set.add(video_url)
video_urls.append(
(video_url, int_or_none(definition.get('quality'))))
else: else:
title, thumbnail, duration = [None] * 3 thumbnail, duration = [None] * 2
if not video_urls:
tv_webpage = dl_webpage('tv')
assignments = self._search_regex(
r'(var.+?mediastring.+?)</script>', tv_webpage,
'encoded url').split(';')
js_vars = {}
def parse_js_value(inp):
inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
if '+' in inp:
inps = inp.split('+')
return functools.reduce(
operator.concat, map(parse_js_value, inps))
inp = inp.strip()
if inp in js_vars:
return js_vars[inp]
return remove_quotes(inp)
for assn in assignments:
assn = assn.strip()
if not assn:
continue
assn = re.sub(r'var\s+', '', assn)
vname, value = assn.split('=', 1)
js_vars[vname] = parse_js_value(value)
video_url = js_vars['mediastring']
if video_url not in video_urls_set:
video_urls.append((video_url, None))
video_urls_set.add(video_url)
for mobj in re.finditer(
r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1',
webpage):
video_url = mobj.group('url')
if video_url not in video_urls_set:
video_urls.append((video_url, None))
video_urls_set.add(video_url)
formats = []
for video_url, height in video_urls:
tbr = None
mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url)
if mobj:
if not height:
height = int(mobj.group('height'))
tbr = int(mobj.group('tbr'))
formats.append({
'url': video_url,
'format_id': '%dp' % height if height else None,
'height': height,
'tbr': tbr,
})
self._sort_formats(formats)
video_uploader = self._html_search_regex( video_uploader = self._html_search_regex(
r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:user|channel)s/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:user|channel)s/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
@ -210,7 +244,6 @@ class PornHubIE(InfoExtractor):
return { return {
'id': video_id, 'id': video_id,
'url': video_url,
'uploader': video_uploader, 'uploader': video_uploader,
'title': title, 'title': title,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
@ -219,7 +252,7 @@ class PornHubIE(InfoExtractor):
'like_count': like_count, 'like_count': like_count,
'dislike_count': dislike_count, 'dislike_count': dislike_count,
'comment_count': comment_count, 'comment_count': comment_count,
# 'formats': formats, 'formats': formats,
'age_limit': 18, 'age_limit': 18,
'tags': tags, 'tags': tags,
'categories': categories, 'categories': categories,

View File

@ -0,0 +1,247 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
compat_str,
)
from ..utils import (
ExtractorError,
int_or_none,
float_or_none,
parse_resolution,
str_or_none,
try_get,
unified_timestamp,
url_or_none,
urljoin,
)
class PuhuTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-izle'
IE_NAME = 'puhutv'
_TESTS = [{
# film
'url': 'https://puhutv.com/sut-kardesler-izle',
'md5': 'fbd8f2d8e7681f8bcd51b592475a6ae7',
'info_dict': {
'id': '5085',
'display_id': 'sut-kardesler',
'ext': 'mp4',
'title': 'Süt Kardeşler',
'description': 'md5:405fd024df916ca16731114eb18e511a',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 4832.44,
'creator': 'Arzu Film',
'timestamp': 1469778212,
'upload_date': '20160729',
'release_year': 1976,
'view_count': int,
'tags': ['Aile', 'Komedi', 'Klasikler'],
},
}, {
# episode, geo restricted, bypassable with --geo-verification-proxy
'url': 'https://puhutv.com/jet-sosyete-1-bolum-izle',
'only_matching': True,
}, {
# 4k, with subtitles
'url': 'https://puhutv.com/dip-1-bolum-izle',
'only_matching': True,
}]
_SUBTITLE_LANGS = {
'English': 'en',
'Deutsch': 'de',
'عربى': 'ar'
}
def _real_extract(self, url):
display_id = self._match_id(url)
info = self._download_json(
urljoin(url, '/api/slug/%s-izle' % display_id),
display_id)['data']
video_id = compat_str(info['id'])
title = info.get('name') or info['title']['name']
if info.get('display_name'):
title = '%s %s' % (title, info.get('display_name'))
try:
videos = self._download_json(
'https://puhutv.com/api/assets/%s/videos' % video_id,
display_id, 'Downloading video JSON',
headers=self.geo_verification_headers())
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
self.raise_geo_restricted()
raise
formats = []
for video in videos['data']['videos']:
media_url = url_or_none(video.get('url'))
if not media_url:
continue
playlist = video.get('is_playlist')
if video.get('stream_type') == 'hls' and playlist is True:
formats.extend(self._extract_m3u8_formats(
media_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False))
continue
quality = int_or_none(video.get('quality'))
f = {
'url': media_url,
'ext': 'mp4',
'height': quality
}
video_format = video.get('video_format')
if video_format == 'hls' and playlist is False:
format_id = 'hls'
f['protocol'] = 'm3u8_native'
elif video_format == 'mp4':
format_id = 'http'
else:
continue
if quality:
format_id += '-%sp' % quality
f['format_id'] = format_id
formats.append(f)
self._sort_formats(formats)
description = try_get(
info, lambda x: x['title']['description'],
compat_str) or info.get('description')
timestamp = unified_timestamp(info.get('created_at'))
creator = try_get(
info, lambda x: x['title']['producer']['name'], compat_str)
duration = float_or_none(
try_get(info, lambda x: x['content']['duration_in_ms'], int),
scale=1000)
view_count = try_get(info, lambda x: x['content']['watch_count'], int)
images = try_get(
info, lambda x: x['content']['images']['wide'], dict) or {}
thumbnails = []
for image_id, image_url in images.items():
if not isinstance(image_url, compat_str):
continue
if not image_url.startswith(('http', '//')):
image_url = 'https://%s' % image_url
t = parse_resolution(image_id)
t.update({
'id': image_id,
'url': image_url
})
thumbnails.append(t)
release_year = try_get(info, lambda x: x['title']['released_at'], int)
season_number = int_or_none(info.get('season_number'))
season_id = str_or_none(info.get('season_id'))
episode_number = int_or_none(info.get('episode_number'))
tags = []
for genre in try_get(info, lambda x: x['title']['genres'], list) or []:
if not isinstance(genre, dict):
continue
genre_name = genre.get('name')
if genre_name and isinstance(genre_name, compat_str):
tags.append(genre_name)
subtitles = {}
for subtitle in try_get(
info, lambda x: x['content']['subtitles'], list) or []:
if not isinstance(subtitle, dict):
continue
lang = subtitle.get('language')
sub_url = url_or_none(subtitle.get('url'))
if not lang or not isinstance(lang, compat_str) or not sub_url:
continue
subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = [{
'url': sub_url
}]
return {
'id': video_id,
'display_id': display_id,
'title': title,
'description': description,
'season_id': season_id,
'season_number': season_number,
'episode_number': episode_number,
'release_year': release_year,
'timestamp': timestamp,
'creator': creator,
'view_count': view_count,
'duration': duration,
'tags': tags,
'subtitles': subtitles,
'thumbnails': thumbnails,
'formats': formats
}
class PuhuTVSerieIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-detay'
IE_NAME = 'puhutv:serie'
_TESTS = [{
'url': 'https://puhutv.com/deniz-yildizi-detay',
'info_dict': {
'title': 'Deniz Yıldızı',
'id': 'deniz-yildizi',
},
'playlist_mincount': 205,
}, {
# a film detail page which is using same url with serie page
'url': 'https://puhutv.com/kaybedenler-kulubu-detay',
'only_matching': True,
}]
def _extract_entries(self, seasons):
for season in seasons:
season_id = season.get('id')
if not season_id:
continue
page = 1
has_more = True
while has_more is True:
season = self._download_json(
'https://galadriel.puhutv.com/seasons/%s' % season_id,
season_id, 'Downloading page %s' % page, query={
'page': page,
'per': 40,
})
episodes = season.get('episodes')
if isinstance(episodes, list):
for ep in episodes:
slug_path = str_or_none(ep.get('slugPath'))
if not slug_path:
continue
video_id = str_or_none(int_or_none(ep.get('id')))
yield self.url_result(
'https://puhutv.com/%s' % slug_path,
ie=PuhuTVIE.ie_key(), video_id=video_id,
video_title=ep.get('name') or ep.get('eventLabel'))
page += 1
has_more = season.get('hasMore')
def _real_extract(self, url):
playlist_id = self._match_id(url)
info = self._download_json(
urljoin(url, '/api/slug/%s-detay' % playlist_id),
playlist_id)['data']
seasons = info.get('seasons')
if seasons:
return self.playlist_result(
self._extract_entries(seasons), playlist_id, info.get('name'))
# For films, these are using same url with series
video_id = info.get('slug') or info['assets'][0]['slug']
return self.url_result(
'https://puhutv.com/%s-izle' % video_id,
PuhuTVIE.ie_key(), video_id)

View File

@ -3,12 +3,12 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
str_to_int, str_to_int,
unified_strdate, unified_strdate,
url_or_none,
) )
@ -71,8 +71,8 @@ class RedTubeIE(InfoExtractor):
video_id, fatal=False) video_id, fatal=False)
if medias and isinstance(medias, list): if medias and isinstance(medias, list):
for media in medias: for media in medias:
format_url = media.get('videoUrl') format_url = url_or_none(media.get('videoUrl'))
if not format_url or not isinstance(format_url, compat_str): if not format_url:
continue continue
format_id = media.get('quality') format_id = media.get('quality')
formats.append({ formats.append({

View File

@ -6,6 +6,7 @@ from ..compat import compat_str
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
int_or_none, int_or_none,
url_or_none,
) )
@ -37,8 +38,8 @@ class RENTVIE(InfoExtractor):
title = config['title'] title = config['title']
formats = [] formats = []
for video in config['src']: for video in config['src']:
src = video.get('src') src = url_or_none(video.get('src'))
if not src or not isinstance(src, compat_str): if not src:
continue continue
ext = determine_ext(src) ext = determine_ext(src)
if ext == 'm3u8': if ext == 'm3u8':

View File

@ -16,6 +16,7 @@ from ..utils import (
int_or_none, int_or_none,
try_get, try_get,
unified_timestamp, unified_timestamp,
url_or_none,
) )
@ -176,8 +177,8 @@ class RutubePlaylistBaseIE(RutubeBaseIE):
break break
for result in results: for result in results:
video_url = result.get('video_url') video_url = url_or_none(result.get('video_url'))
if not video_url or not isinstance(video_url, compat_str): if not video_url:
continue continue
entry = self._extract_video(result, require_title=False) entry = self._extract_video(result, require_title=False)
entry.update({ entry.update({

View File

@ -1,12 +1,10 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import re
from .common import InfoExtractor from .common import InfoExtractor
class SlutloadIE(InfoExtractor): class SlutloadIE(InfoExtractor):
_VALID_URL = r'^https?://(?:\w+\.)?slutload\.com/video/[^/]+/(?P<id>[^/]+)/?$' _VALID_URL = r'https?://(?:\w+\.)?slutload\.com/(?:video/[^/]+|embed_player|watch)/(?P<id>[^/]+)'
_TESTS = [{ _TESTS = [{
'url': 'http://www.slutload.com/video/virginie-baisee-en-cam/TD73btpBqSxc/', 'url': 'http://www.slutload.com/video/virginie-baisee-en-cam/TD73btpBqSxc/',
'md5': '868309628ba00fd488cf516a113fd717', 'md5': '868309628ba00fd488cf516a113fd717',
@ -16,33 +14,52 @@ class SlutloadIE(InfoExtractor):
'title': 'virginie baisee en cam', 'title': 'virginie baisee en cam',
'age_limit': 18, 'age_limit': 18,
'thumbnail': r're:https?://.*?\.jpg' 'thumbnail': r're:https?://.*?\.jpg'
} },
}, { }, {
# mobile site # mobile site
'url': 'http://mobile.slutload.com/video/masturbation-solo/fviFLmc6kzJ/', 'url': 'http://mobile.slutload.com/video/masturbation-solo/fviFLmc6kzJ/',
'only_matching': True, 'only_matching': True,
}, {
'url': 'http://www.slutload.com/embed_player/TD73btpBqSxc/',
'only_matching': True,
}, {
'url': 'http://www.slutload.com/watch/TD73btpBqSxc/Virginie-Baisee-En-Cam.html',
'only_matching': True,
}] }]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
desktop_url = re.sub(r'^(https?://)mobile\.', r'\1', url) embed_page = self._download_webpage(
webpage = self._download_webpage(desktop_url, video_id) 'http://www.slutload.com/embed_player/%s' % video_id, video_id,
'Downloading embed page', fatal=False)
video_title = self._html_search_regex(r'<h1><strong>([^<]+)</strong>', if embed_page:
webpage, 'title').strip() def extract(what):
return self._html_search_regex(
r'data-video-%s=(["\'])(?P<url>(?:(?!\1).)+)\1' % what,
embed_page, 'video %s' % what, default=None, group='url')
video_url = self._html_search_regex( video_url = extract('url')
r'(?s)<div id="vidPlayer"\s+data-url="([^"]+)"', if video_url:
webpage, 'video URL') title = self._html_search_regex(
thumbnail = self._html_search_regex( r'<title>([^<]+)', embed_page, 'title', default=video_id)
r'(?s)<div id="vidPlayer"\s+.*?previewer-file="([^"]+)"', return {
webpage, 'thumbnail', fatal=False) 'id': video_id,
'url': video_url,
'title': title,
'thumbnail': extract('preview'),
'age_limit': 18
}
return { webpage = self._download_webpage(
'http://www.slutload.com/video/_/%s/' % video_id, video_id)
title = self._html_search_regex(
r'<h1><strong>([^<]+)</strong>', webpage, 'title').strip()
info = self._parse_html5_media_entries(url, webpage, video_id)[0]
info.update({
'id': video_id, 'id': video_id,
'url': video_url, 'title': title,
'title': video_title, 'age_limit': 18,
'thumbnail': thumbnail, })
'age_limit': 18 return info
}

View File

@ -32,13 +32,15 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns})
class ThePlatformBaseIE(OnceIE): class ThePlatformBaseIE(OnceIE):
_TP_TLD = 'com'
def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'):
meta = self._download_xml( meta = self._download_xml(
smil_url, video_id, note=note, query={'format': 'SMIL'}, smil_url, video_id, note=note, query={'format': 'SMIL'},
headers=self.geo_verification_headers()) headers=self.geo_verification_headers())
error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src') error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src')
if error_element is not None and error_element.attrib['src'].startswith( if error_element is not None and error_element.attrib['src'].startswith(
'http://link.theplatform.com/s/errorFiles/Unavailable.'): 'http://link.theplatform.%s/s/errorFiles/Unavailable.' % self._TP_TLD):
raise ExtractorError(error_element.attrib['abstract'], expected=True) raise ExtractorError(error_element.attrib['abstract'], expected=True)
smil_formats = self._parse_smil_formats( smil_formats = self._parse_smil_formats(
@ -66,7 +68,7 @@ class ThePlatformBaseIE(OnceIE):
return formats, subtitles return formats, subtitles
def _download_theplatform_metadata(self, path, video_id): def _download_theplatform_metadata(self, path, video_id):
info_url = 'http://link.theplatform.com/s/%s?format=preview' % path info_url = 'http://link.theplatform.%s/s/%s?format=preview' % (self._TP_TLD, path)
return self._download_json(info_url, video_id) return self._download_json(info_url, video_id)
def _parse_theplatform_metadata(self, info): def _parse_theplatform_metadata(self, info):

View File

@ -15,6 +15,7 @@ from ..utils import (
update_url_query, update_url_query,
ExtractorError, ExtractorError,
strip_or_none, strip_or_none,
url_or_none,
) )
@ -154,8 +155,8 @@ class TurnerBaseIE(AdobePassIE):
subtitles = {} subtitles = {}
for source in video_data.findall('closedCaptions/source'): for source in video_data.findall('closedCaptions/source'):
for track in source.findall('track'): for track in source.findall('track'):
track_url = track.get('url') track_url = url_or_none(track.get('url'))
if not isinstance(track_url, compat_str) or track_url.endswith('/big'): if not track_url or track_url.endswith('/big'):
continue continue
lang = track.get('lang') or track.get('label') or 'en' lang = track.get('lang') or track.get('label') or 'en'
subtitles.setdefault(lang, []).append({ subtitles.setdefault(lang, []).append({

View File

@ -4,10 +4,10 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
int_or_none, int_or_none,
unescapeHTML, unescapeHTML,
url_or_none,
) )
@ -106,9 +106,8 @@ class TVNetIE(InfoExtractor):
for stream in self._download_json(data_file, video_id): for stream in self._download_json(data_file, video_id):
if not isinstance(stream, dict): if not isinstance(stream, dict):
continue continue
stream_url = stream.get('url') stream_url = url_or_none(stream.get('url'))
if (stream_url in stream_urls or not stream_url or if stream_url in stream_urls or not stream_url:
not isinstance(stream_url, compat_str)):
continue continue
stream_urls.add(stream_url) stream_urls.add(stream_url)
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(

View File

@ -19,6 +19,7 @@ from ..utils import (
try_get, try_get,
unsmuggle_url, unsmuggle_url,
update_url_query, update_url_query,
url_or_none,
) )
@ -255,7 +256,8 @@ class TVPlayIE(InfoExtractor):
quality = qualities(['hls', 'medium', 'high']) quality = qualities(['hls', 'medium', 'high'])
formats = [] formats = []
for format_id, video_url in streams.get('streams', {}).items(): for format_id, video_url in streams.get('streams', {}).items():
if not video_url or not isinstance(video_url, compat_str): video_url = url_or_none(video_url)
if not video_url:
continue continue
ext = determine_ext(video_url) ext = determine_ext(video_url)
if ext == 'f4m': if ext == 'f4m':

View File

@ -27,6 +27,7 @@ from ..utils import (
unified_timestamp, unified_timestamp,
update_url_query, update_url_query,
urlencode_postdata, urlencode_postdata,
url_or_none,
urljoin, urljoin,
) )
@ -663,8 +664,8 @@ class TwitchClipsIE(TwitchBaseIE):
for option in status['quality_options']: for option in status['quality_options']:
if not isinstance(option, dict): if not isinstance(option, dict):
continue continue
source = option.get('source') source = url_or_none(option.get('source'))
if not source or not isinstance(source, compat_str): if not source:
continue continue
formats.append({ formats.append({
'url': source, 'url': source,

View File

@ -20,6 +20,7 @@ from ..utils import (
sanitized_Request, sanitized_Request,
try_get, try_get,
unescapeHTML, unescapeHTML,
url_or_none,
urlencode_postdata, urlencode_postdata,
) )
@ -265,8 +266,8 @@ class UdemyIE(InfoExtractor):
if not isinstance(source_list, list): if not isinstance(source_list, list):
return return
for source in source_list: for source in source_list:
video_url = source.get('file') or source.get('src') video_url = url_or_none(source.get('file') or source.get('src'))
if not video_url or not isinstance(video_url, compat_str): if not video_url:
continue continue
if source.get('type') == 'application/x-mpegURL' or determine_ext(video_url) == 'm3u8': if source.get('type') == 'application/x-mpegURL' or determine_ext(video_url) == 'm3u8':
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(
@ -293,8 +294,8 @@ class UdemyIE(InfoExtractor):
continue continue
if track.get('kind') != 'captions': if track.get('kind') != 'captions':
continue continue
src = track.get('src') src = url_or_none(track.get('src'))
if not src or not isinstance(src, compat_str): if not src:
continue continue
lang = track.get('language') or track.get( lang = track.get('language') or track.get(
'srclang') or track.get('label') 'srclang') or track.get('label')
@ -314,8 +315,8 @@ class UdemyIE(InfoExtractor):
for cc in captions: for cc in captions:
if not isinstance(cc, dict): if not isinstance(cc, dict):
continue continue
cc_url = cc.get('url') cc_url = url_or_none(cc.get('url'))
if not cc_url or not isinstance(cc_url, compat_str): if not cc_url:
continue continue
lang = try_get(cc, lambda x: x['locale']['locale'], compat_str) lang = try_get(cc, lambda x: x['locale']['locale'], compat_str)
sub_dict = (automatic_captions if cc.get('source') == 'auto' sub_dict = (automatic_captions if cc.get('source') == 'auto'

View File

@ -3,15 +3,13 @@ from __future__ import unicode_literals
import itertools import itertools
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import compat_HTTPError
compat_HTTPError,
compat_str,
)
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
float_or_none, float_or_none,
parse_iso8601, parse_iso8601,
url_or_none,
) )
@ -166,8 +164,8 @@ class VidmeIE(InfoExtractor):
formats = [] formats = []
for f in video.get('formats', []): for f in video.get('formats', []):
format_url = f.get('uri') format_url = url_or_none(f.get('uri'))
if not format_url or not isinstance(format_url, compat_str): if not format_url:
continue continue
format_type = f.get('type') format_type = f.get('type')
if format_type == 'dash': if format_type == 'dash':

View File

@@ -539,9 +539,10 @@ class VimeoIE(VimeoBaseInfoExtractor):
             # We try to find out to which variable is assigned the config dic
             m_variable_name = re.search(r'(\w)\.video\.id', webpage)
             if m_variable_name is not None:
-                config_re = r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))
+                config_re = [r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))]
             else:
                 config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});']
+            config_re.append(r'\bvar\s+r\s*=\s*({.+?})\s*;')
             config = self._search_regex(config_re, webpage, 'info section',
                 flags=re.DOTALL)
             config = json.loads(config)
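
Note on the hunk above: `config_re` is now a list in both branches, so the extra fallback pattern can be appended unconditionally and the patterns are tried in order. A minimal sketch of that pattern-list behaviour — the `search_regex` helper and the `webpage` snippet here are hypothetical stand-ins, not youtube-dl's actual `_search_regex`:

```python
import re

def search_regex(patterns, haystack):
    # Try each pattern in order and return the first capture group,
    # roughly how a list of regexes is consumed here.
    for pattern in patterns:
        m = re.search(pattern, haystack, re.DOTALL)
        if m:
            return m.group(1)
    return None

webpage = 'var r = {"config": {"video": {"id": 1}}} ;'  # hypothetical page excerpt
print(search_regex([r'\bvar\s+r\s*=\s*({.+?})\s*;'], webpage))
# {"config": {"video": {"id": 1}}}
```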

View File

@@ -195,16 +195,29 @@ class ViuOTTIE(InfoExtractor):
         'skip': 'Geo-restricted to Hong Kong',
     }]
+    _AREA_ID = {
+        'HK': 1,
+        'SG': 2,
+        'TH': 4,
+        'PH': 5,
+    }
     def _real_extract(self, url):
         country_code, video_id = re.match(self._VALID_URL, url).groups()
+        query = {
+            'r': 'vod/ajax-detail',
+            'platform_flag_label': 'web',
+            'product_id': video_id,
+        }
+        area_id = self._AREA_ID.get(country_code.upper())
+        if area_id:
+            query['area_id'] = area_id
         product_data = self._download_json(
             'http://www.viu.com/ott/%s/index.php' % country_code, video_id,
-            'Downloading video info', query={
-                'r': 'vod/ajax-detail',
-                'platform_flag_label': 'web',
-                'product_id': video_id,
-            })['data']
+            'Downloading video info', query=query)['data']
         video_data = product_data.get('current_product')
         if not video_data:
@@ -214,6 +227,9 @@ class ViuOTTIE(InfoExtractor):
             'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code,
             video_id, 'Downloading stream info', query={
                 'ccs_product_id': video_data['ccs_product_id'],
+            }, headers={
+                'Referer': url,
+                'Origin': re.search(r'https?://[^/]+', url).group(0),
             })['data']['stream']
         stream_sizes = stream_data.get('size', {})
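
The new `Origin` header above is derived by taking just the scheme and host off the page URL; a quick illustration (the URL is hypothetical):

```python
import re

url = 'https://www.viu.com/ott/sg/en-us/vod/3421/some-show'  # hypothetical page URL
origin = re.search(r'https?://[^/]+', url).group(0)
print(origin)  # https://www.viu.com
```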

View File

@@ -20,6 +20,7 @@ from ..utils import (
     str_to_int,
     unescapeHTML,
     unified_timestamp,
+    url_or_none,
     urlencode_postdata,
 )
 from .dailymotion import DailymotionIE
@@ -423,7 +424,8 @@ class VKIE(VKBaseIE):
         formats = []
         for format_id, format_url in data.items():
-            if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//', 'rtmp')):
+            format_url = url_or_none(format_url)
+            if not format_url or not format_url.startswith(('http', '//', 'rtmp')):
                 continue
             if (format_id.startswith(('url', 'cache')) or
                     format_id in ('extra_data', 'live_mp4', 'postlive_mp4')):

View File

@@ -67,11 +67,12 @@ class WatchBoxIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
-        source = self._parse_json(
+        source = (self._parse_json(
             self._search_regex(
-                r'(?s)source["\']?\s*:\s*({.+?})\s*[,}]', webpage, 'source',
+                r'playerConf\s*=\s*({.+?})\s*;', webpage, 'player config',
                 default='{}'),
-            video_id, transform_source=js_to_json, fatal=False) or {}
+            video_id, transform_source=js_to_json,
+            fatal=False) or {}).get('source') or {}
         video_id = compat_str(source.get('videoId') or video_id)

View File

@@ -13,6 +13,7 @@ from ..utils import (
     parse_duration,
     try_get,
     unified_strdate,
+    url_or_none,
 )
@@ -137,7 +138,8 @@ class XHamsterIE(InfoExtractor):
             else:
                 format_url = format_item
                 filesize = None
-            if not isinstance(format_url, compat_str):
+            format_url = url_or_none(format_url)
+            if not format_url:
                 continue
             formats.append({
                 'format_id': '%s-%s' % (format_id, quality),
@@ -198,7 +200,8 @@ class XHamsterIE(InfoExtractor):
                     default='{}'),
                 video_id, fatal=False)
             for format_id, format_url in sources.items():
-                if not isinstance(format_url, compat_str):
+                format_url = url_or_none(format_url)
+                if not format_url:
                     continue
                 if format_url in format_urls:
                     continue

View File

@@ -4,12 +4,12 @@ from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     int_or_none,
     qualities,
     unescapeHTML,
+    url_or_none,
 )
@@ -80,9 +80,9 @@ class YapFilesIE(InfoExtractor):
         formats = []
         for format_id in QUALITIES:
             is_hd = format_id == 'hd'
-            format_url = playlist.get(
-                'file%s' % ('_hd' if is_hd else ''))
-            if not format_url or not isinstance(format_url, compat_str):
+            format_url = url_or_none(playlist.get(
+                'file%s' % ('_hd' if is_hd else '')))
+            if not format_url:
                 continue
             formats.append({
                 'url': format_url,

View File

@@ -3,11 +3,11 @@ from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     determine_ext,
     int_or_none,
     parse_duration,
+    url_or_none,
 )
@@ -50,8 +50,8 @@ class YouJizzIE(InfoExtractor):
         for encoding in encodings:
             if not isinstance(encoding, dict):
                 continue
-            format_url = encoding.get('filename')
-            if not isinstance(format_url, compat_str):
+            format_url = url_or_none(encoding.get('filename'))
+            if not format_url:
                 continue
             if determine_ext(format_url) == 'm3u8':
                 formats.extend(self._extract_m3u8_formats(

View File

@@ -3,13 +3,13 @@ from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     int_or_none,
     sanitized_Request,
     str_to_int,
     unescapeHTML,
     unified_strdate,
+    url_or_none,
 )
 from ..aes import aes_decrypt_text
@@ -88,8 +88,8 @@ class YouPornIE(InfoExtractor):
         for definition in definitions:
             if not isinstance(definition, dict):
                 continue
-            video_url = definition.get('videoUrl')
-            if isinstance(video_url, compat_str) and video_url:
+            video_url = url_or_none(definition.get('videoUrl'))
+            if video_url:
                 links.append(video_url)
         # Fallback #1, this also contains extra low quality 180p format

View File

@@ -178,13 +178,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
             warn('Unable to extract result entry')
             return False
-        tfa = try_get(res, lambda x: x[0][0], list)
-        if tfa:
-            tfa_str = try_get(tfa, lambda x: x[2], compat_str)
-            if tfa_str == 'TWO_STEP_VERIFICATION':
+        login_challenge = try_get(res, lambda x: x[0][0], list)
+        if login_challenge:
+            challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
+            if challenge_str == 'TWO_STEP_VERIFICATION':
                 # SEND_SUCCESS - TFA code has been successfully sent to phone
                 # QUOTA_EXCEEDED - reached the limit of TFA codes
-                status = try_get(tfa, lambda x: x[5], compat_str)
+                status = try_get(login_challenge, lambda x: x[5], compat_str)
                 if status == 'QUOTA_EXCEEDED':
                     warn('Exceeded the limit of TFA codes, try later')
                     return False
@@ -228,6 +228,17 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
                 check_cookie_url = try_get(
                     tfa_results, lambda x: x[0][-1][2], compat_str)
+            else:
+                CHALLENGES = {
+                    'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
+                    'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
+                    'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
+                }
+                challenge = CHALLENGES.get(
+                    challenge_str,
+                    '%s returned error %s.' % (self.IE_NAME, challenge_str))
+                warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
+                return False
         else:
             check_cookie_url = try_get(res, lambda x: x[2], compat_str)

View File

@@ -13,6 +13,7 @@ from ..utils import (
     ExtractorError,
     int_or_none,
     try_get,
+    url_or_none,
     urlencode_postdata,
 )
@@ -150,8 +151,8 @@ class ZattooBaseIE(InfoExtractor):
         for watch in watch_urls:
             if not isinstance(watch, dict):
                 continue
-            watch_url = watch.get('url')
-            if not watch_url or not isinstance(watch_url, compat_str):
+            watch_url = url_or_none(watch.get('url'))
+            if not watch_url:
                 continue
             format_id_list = [stream_type]
             maxrate = watch.get('maxrate')

View File

@@ -15,6 +15,7 @@ from ..utils import (
     try_get,
     unified_timestamp,
     update_url_query,
+    url_or_none,
     urljoin,
 )
@@ -67,8 +68,8 @@ class ZDFIE(ZDFBaseIE):
     def _extract_subtitles(src):
         subtitles = {}
         for caption in try_get(src, lambda x: x['captions'], list) or []:
-            subtitle_url = caption.get('uri')
-            if subtitle_url and isinstance(subtitle_url, compat_str):
+            subtitle_url = url_or_none(caption.get('uri'))
+            if subtitle_url:
                 lang = caption.get('language', 'deu')
                 subtitles.setdefault(lang, []).append({
                     'url': subtitle_url,
@@ -76,8 +77,8 @@ class ZDFIE(ZDFBaseIE):
         return subtitles
     def _extract_format(self, video_id, formats, format_urls, meta):
-        format_url = meta.get('url')
-        if not format_url or not isinstance(format_url, compat_str):
+        format_url = url_or_none(meta.get('url'))
+        if not format_url:
             return
         if format_url in format_urls:
             return
@@ -152,7 +153,8 @@ class ZDFIE(ZDFBaseIE):
             content, lambda x: x['teaserImageRef']['layouts'], dict)
         if layouts:
             for layout_key, layout_url in layouts.items():
-                if not isinstance(layout_url, compat_str):
+                layout_url = url_or_none(layout_url)
+                if not layout_url:
                     continue
                 thumbnail = {
                     'url': layout_url,

View File

@@ -184,6 +184,7 @@ DATE_FORMATS_MONTH_FIRST.extend([
 ])
 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
+JSON_LD_RE = r'(?is)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
 def preferredencoding():
@@ -1900,6 +1901,13 @@ def strip_or_none(v):
     return None if v is None else v.strip()
+def url_or_none(url):
+    if not url or not isinstance(url, compat_str):
+        return None
+    url = url.strip()
+    return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None
 def parse_duration(s):
     if not isinstance(s, compat_basestring):
         return None
@@ -2316,7 +2324,7 @@ def parse_age_limit(s):
 def strip_jsonp(code):
     return re.sub(
         r'''(?sx)^
-            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
+            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
             (?:\s*&&\s*(?P=func_name))?
             \s*\(\s*(?P<callback_data>.*)\);?
             \s*?(?://[^\n]*)*$''',
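
The new `url_or_none` helper accepts only strings that start with a scheme or are protocol-relative, which is what lets all the extractor hunks above collapse their `isinstance`/truthiness checks into a single call; the relaxed `strip_jsonp` quantifier (`*` instead of `+` on `func_name`) additionally tolerates JSONP responses with no callback name. A minimal self-contained sketch of both behaviours, assuming Python 3 with plain `str` in place of `compat_str` (the real helpers live in `youtube_dl/utils.py`):

```python
import re

def url_or_none(url):
    # Mirror of the helper above: reject non-strings and anything
    # that is not an absolute or protocol-relative URL.
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None

print(url_or_none('https://example.com/v.mp4'))  # https://example.com/v.mp4
print(url_or_none('//cdn.example.com/v.mp4'))    # //cdn.example.com/v.mp4 (protocol-relative)
print(url_or_none('not a url'))                  # None
print(url_or_none(42))                           # None

# JSONP without a function name is now stripped too, because the
# func_name group may match the empty string:
JSONP_RE = r'''(?sx)^
    (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
    (?:\s*&&\s*(?P=func_name))?
    \s*\(\s*(?P<callback_data>.*)\);?
    \s*?(?://[^\n]*)*$'''
print(re.sub(JSONP_RE, r'\g<callback_data>', 'cb({"id": 1});'))  # {"id": 1}
print(re.sub(JSONP_RE, r'\g<callback_data>', '({"id": 1});'))    # {"id": 1}
```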

View File

@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
-__version__ = '2018.06.25'
+__version__ = '2018.07.21'