
Merge changes

AlexAplin 2018-07-30 11:03:55 -04:00
commit 6b31024945
109 changed files with 2980 additions and 1005 deletions

.github/ISSUE_TEMPLATE.md

@@ -6,8 +6,8 @@
 ---
-### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.06.14*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
+### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.07.29*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
-- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.06.14**
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.07.29**

 ### Before submitting an *issue* make sure you have:
 - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
@@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl
 [debug] User config: []
 [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
 [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
-[debug] youtube-dl version 2018.06.14
+[debug] youtube-dl version 2018.07.29
 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
 [debug] Proxy map: {}

AUTHORS

@@ -239,3 +239,10 @@ Martin Weinelt
 Surya Oktafendri
 TingPing
 Alexandre Macabies
+Bastian de Groot
+Niklas Haas
+András Veres-Szentkirályi
+Enes Solak
+Nathan Rossi
+Thomas van der Berg
+Luca Cherubin

ChangeLog

@@ -1,3 +1,130 @@
+version 2018.07.29
+
+Extractors
+* [crunchyroll:playlist] Restrict URL regular expression (#17069, #17076)
++ [pornhub] Add support for subtitles (#16924, #17088)
+* [ceskatelevize] Use https for API call (#16997, #16999)
+* [dailymotion:playlist] Fix extraction (#16894)
+* [ted] Improve extraction
+* [ted] Fix extraction for videos without nativeDownloads (#16756, #17085)
+* [telecinco] Fix extraction (#17080)
+* [mitele] Reduce number of requests
+* [rai] Return non HTTP relinker URL intact (#17055)
+* [vk] Fix extraction for inline only videos (#16923)
+* [streamcloud] Fix extraction (#17054)
+* [facebook] Fix tahoe player extraction with authentication (#16655)
++ [puhutv] Add support for puhutv.com (#12712, #16010, #16269)
+
+
+version 2018.07.21
+
+Core
++ [utils] Introduce url_or_none
+* [utils] Allow JSONP without function name (#17028)
++ [extractor/common] Extract DASH and MSS formats from SMIL manifests
+
+Extractors
++ [bbc] Add support for BBC Radio Play pages (#17022)
+* [iwara] Fix download URLs (#17026)
+* [vrtnu] Relax title extraction and extract JSON-LD (#17018)
++ [viu] Pass Referer and Origin headers and area id (#16992)
++ [vimeo] Add another config regular expression (#17013)
++ [facebook] Extract view count (#16942)
+* [dailymotion] Improve description extraction (#16984)
+* [slutload] Fix and improve extraction (#17001)
+* [mediaset] Fix extraction (#16977)
++ [theplatform] Add support for theplatform TLD customization (#16977)
+* [imgur] Relax URL regular expression (#16987)
+* [pornhub] Improve extraction and extract all formats (#12166, #15891, #16262,
+  #16959)
+
+
+version 2018.07.10
+
+Core
+* [utils] Share JSON-LD regular expression
+* [downloader/dash] Improve error handling (#16927)
+
+Extractors
++ [nrktv] Add support for new season and serie URL schema
++ [nrktv] Add support for new episode URL schema (#16909)
++ [frontendmasters] Add support for frontendmasters.com (#3661, #16328)
+* [funk] Fix extraction (#16918)
+* [watchbox] Fix extraction (#16904)
+* [dplayit] Sort formats
+* [dplayit] Fix extraction (#16901)
+* [youtube] Improve login error handling (#13822)
+
+
+version 2018.07.04
+
+Core
+* [extractor/common] Properly escape % in MPD templates (#16867)
+* [extractor/common] Use source URL as Referer for HTML5 entries (16849)
+* Prefer ffmpeg over avconv by default (#8622)
+
+Extractors
+* [pluralsight] Switch to graphql (#16889, #16895, #16896, #16899)
+* [lynda] Simplify login and improve error capturing (#16891)
++ [go90] Add support for embed URLs (#16873)
+* [go90] Detect geo restriction error and pass geo verification headers
+  (#16874)
+* [vlive] Fix live streams extraction (#16871)
+* [npo] Fix typo (#16872)
++ [mediaset] Add support for new videos and extract all formats (#16568)
+* [dctptv] Restore extraction based on REST API (#16850)
+* [svt] Improve extraction and add support for pages (#16802)
+* [porncom] Fix extraction (#16808)
+
+
+version 2018.06.25
+
+Extractors
+* [joj] Relax URL regular expression (#16771)
+* [brightcove] Workaround sonyliv DRM protected videos (#16807)
+* [motherless] Fix extraction (#16786)
+* [itv] Make SOAP request non fatal and extract metadata from webpage (#16780)
+- [foxnews:insider] Remove extractor (#15810)
++ [foxnews] Add support for iframe embeds (#15810, #16711)
+
+
+version 2018.06.19
+
+Core
++ [extractor/common] Introduce expected_status in _download_* methods
+  for convenient accept of HTTP requests failed with non 2xx status codes
++ [compat] Introduce compat_integer_types
+
+Extractors
+* [peertube] Improve generic support (#16733)
++ [6play] Use geo verification headers
+* [rtbf] Fix extraction for python 3.2
+* [vgtv] Improve HLS formats extraction
++ [vgtv] Add support for www.aftonbladet.se/tv URLs
+* [bbccouk] Use expected_status
+* [markiza] Expect 500 HTTP status code
+* [tvnow] Try all clear manifest URLs (#15361)
+
+
+version 2018.06.18
+
+Core
+* [downloader/rtmp] Fix downloading in verbose mode (#16736)
+
+Extractors
++ [markiza] Add support for markiza.sk (#16750)
+* [wat] Try all supported adaptive URLs
++ [6play] Add support for rtlplay.be and extract hd usp formats
++ [rtbf] Add support for audio and live streams (#9638, #11923)
++ [rtbf] Extract HLS, DASH and all HTTP formats
++ [rtbf] Extract subtitles
++ [rtbf] Fixup specific HTTP URLs (#16101)
++ [expressen] Add support for expressen.se
+* [vidzi] Fix extraction (#16678)
+* [pbs] Improve extraction (#16623, #16684)
+* [bilibili] Restrict cid regular expression (#16638, #16734)
+
+
 version 2018.06.14

 Core

README.md

@@ -17,7 +17,7 @@ youtube-dl - download videos from youtube.com or other video platforms

 # INSTALLATION

-To install it right away for all UNIX users (Linux, OS X, etc.), type:
+To install it right away for all UNIX users (Linux, macOS, etc.), type:

     sudo curl -L https://yt-dl.org/downloads/latest/youtube-dl -o /usr/local/bin/youtube-dl
     sudo chmod a+rx /usr/local/bin/youtube-dl
@@ -35,7 +35,7 @@ You can also use pip:

 This command will update youtube-dl if you have already installed it. See the [pypi page](https://pypi.python.org/pypi/youtube_dl) for more information.

-OS X users can install youtube-dl with [Homebrew](https://brew.sh/):
+macOS users can install youtube-dl with [Homebrew](https://brew.sh/):

     brew install youtube-dl
@@ -427,9 +427,9 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo
                                      default; fix file if we can, warn
                                      otherwise)
-    --prefer-avconv                  Prefer avconv over ffmpeg for running the
-                                     postprocessors (default)
-    --prefer-ffmpeg                  Prefer ffmpeg over avconv for running the
-                                     postprocessors
+    --prefer-avconv                  Prefer avconv over ffmpeg for running the
+                                     postprocessors
+    --prefer-ffmpeg                  Prefer ffmpeg over avconv for running the
+                                     postprocessors (default)
     --ffmpeg-location PATH           Location of the ffmpeg/avconv binary;
                                      either the path to the binary or its
                                      containing directory.
@@ -442,7 +442,7 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo

 # CONFIGURATION

-You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux and OS X, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`. Note that by default configuration file may not exist so you may need to create it yourself.
+You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux and macOS, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`. Note that by default configuration file may not exist so you may need to create it yourself.

 For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime, use a proxy and save all videos under `Movies` directory in your home directory:
 ```
@@ -870,7 +870,7 @@ Either prepend `https://www.youtube.com/watch?v=` or separate the ID from the op

 Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`.

-In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [Export Cookies](https://addons.mozilla.org/en-US/firefox/addon/export-cookies/) (for Firefox).
+In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/) (for Firefox).

 Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows and `LF` (`\n`) for Unix and Unix-like systems (Linux, macOS, etc.). `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format.
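The format requirements above are easy to verify programmatically; a quick sanity check might look like this (a hypothetical helper, not part of youtube-dl):

```python
def check_cookies_file(path):
    """Check a cookies file against the Netscape-format notes above."""
    with open(path, 'rb') as f:
        data = f.read()
    first_line = data.split(b'\n', 1)[0].rstrip(b'\r')
    if first_line not in (b'# HTTP Cookie File', b'# Netscape HTTP Cookie File'):
        raise ValueError('first line is not a Netscape cookie file header')
    if b'\r\n' in data:
        print('CRLF newlines found: fine on Windows, convert to LF elsewhere')
```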

docs/supportedsites.md

@@ -266,6 +266,7 @@
 - **Europa**
 - **EveryonesMixtape**
 - **ExpoTV**
+- **Expressen**
 - **ExtremeTube**
 - **EyedoTV**
 - **facebook**
@@ -289,7 +290,6 @@
 - **Foxgay**
 - **foxnews**: Fox News and Fox Business Video
 - **foxnews:article**
-- **foxnews:insider**
 - **FoxSports**
 - **france2.fr:generation-what**
 - **FranceCulture**
@@ -302,6 +302,9 @@
 - **Freesound**
 - **freespeech.org**
 - **FreshLive**
+- **FrontendMasters**
+- **FrontendMastersCourse**
+- **FrontendMastersLesson**
 - **Funimation**
 - **FunkChannel**
 - **FunkMix**
@@ -455,6 +458,8 @@
 - **mangomolo:live**
 - **mangomolo:video**
 - **ManyVids**
+- **Markiza**
+- **MarkizaPage**
 - **massengeschmack.tv**
 - **MatchTV**
 - **MDR**: MDR.DE and KiKA
@@ -587,7 +592,9 @@
 - **NRKSkole**: NRK Skole
 - **NRKTV**: NRK TV and NRK Radio
 - **NRKTVDirekte**: NRK TV Direkte and NRK Radio Direkte
+- **NRKTVEpisode**
 - **NRKTVEpisodes**
+- **NRKTVSeason**
 - **NRKTVSeries**
 - **ntv.ru**
 - **Nuvid**
@@ -665,6 +672,8 @@
 - **PrimeShareTV**
 - **PromptFile**
 - **prosiebensat1**: ProSiebenSat.1 Digital
+- **puhutv**
+- **puhutv:serie**
 - **Puls4**
 - **Pyvideo**
 - **qqmusic**: QQ音乐
@@ -811,6 +820,7 @@
 - **StretchInternet**
 - **SunPorno**
 - **SVT**
+- **SVTPage**
 - **SVTPlay**: SVT Play and Öppet arkiv
 - **SVTSeries**
 - **SWRMediathek**

test/test_utils.py

@@ -78,6 +78,7 @@ from youtube_dl.utils import (
     uppercase_escape,
     lowercase_escape,
     url_basename,
+    url_or_none,
     base_url,
     urljoin,
     urlencode_postdata,
@@ -507,6 +508,16 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(urljoin('http://foo.de/', ['foobar']), None)
         self.assertEqual(urljoin('http://foo.de/a/b/c.txt', '.././../d.txt'), 'http://foo.de/d.txt')

+    def test_url_or_none(self):
+        self.assertEqual(url_or_none(None), None)
+        self.assertEqual(url_or_none(''), None)
+        self.assertEqual(url_or_none('foo'), None)
+        self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de')
+        self.assertEqual(url_or_none('https://foo.de'), 'https://foo.de')
+        self.assertEqual(url_or_none('http$://foo.de'), None)
+        self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de')
+        self.assertEqual(url_or_none('//foo.de'), '//foo.de')
+
     def test_parse_age_limit(self):
         self.assertEqual(parse_age_limit(None), None)
         self.assertEqual(parse_age_limit(False), None)
@@ -717,6 +728,10 @@ class TestUtil(unittest.TestCase):
         d = json.loads(stripped)
         self.assertEqual(d, {'status': 'success'})

+        stripped = strip_jsonp('({"status": "success"});')
+        d = json.loads(stripped)
+        self.assertEqual(d, {'status': 'success'})
+
     def test_uppercase_escape(self):
         self.assertEqual(uppercase_escape(''), '')
         self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐')
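For reference, the semantics pinned down by `test_url_or_none` can be satisfied by a minimal implementation along these lines (a sketch consistent with the tests, not necessarily the exact code added to `youtube_dl/utils.py`):

```python
import re

def url_or_none(url):
    # Accept absolute URLs with a scheme and protocol-relative '//host/...'
    # URLs; return None for anything else.
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None

assert url_or_none('http$://foo.de') is None  # '$' is not valid in a scheme
assert url_or_none('//foo.de') == '//foo.de'  # protocol-relative is accepted
```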

youtube_dl/YoutubeDL.py

@@ -305,8 +305,8 @@ class YoutubeDL(object):
                        http_chunk_size.

     The following options are used by the post processors:
-    prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
-                       otherwise prefer avconv.
+    prefer_ffmpeg:     If False, use avconv instead of ffmpeg if both are available,
+                       otherwise prefer ffmpeg.
     postprocessor_args: A list of additional command-line arguments for the
                         postprocessor.
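A usage sketch for the documented option (assuming only what the docstring above states): with the new default, ffmpeg is preferred when both tools are installed, and passing `prefer_ffmpeg=False` restores the old avconv preference.

```python
from youtube_dl import YoutubeDL

# Explicitly opt back into avconv when both avconv and ffmpeg are available.
ydl = YoutubeDL({'prefer_ffmpeg': False})
```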

youtube_dl/compat.py

@@ -2787,6 +2787,12 @@ except NameError: # Python 3
     compat_numeric_types = (int, float, complex)


+try:
+    compat_integer_types = (int, long)
+except NameError: # Python 3
+    compat_integer_types = (int, )
+
+
 if sys.version_info < (2, 7):
     def compat_socket_create_connection(address, timeout, source_address=None):
         host, port = address
@@ -2974,6 +2980,7 @@ __all__ = [
     'compat_http_client',
     'compat_http_server',
     'compat_input',
+    'compat_integer_types',
     'compat_itertools_count',
     'compat_kwargs',
     'compat_numeric_types',
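`compat_integer_types` exists so isinstance checks cover both Python 2 (`int` and `long`) and Python 3 (`int` only); a small illustration (hypothetical helper, mirroring the check `expected_status` uses later in this commit):

```python
from youtube_dl.compat import compat_integer_types

def is_exact_status(expected_status, code):
    # A single integer means "exactly this status code" on either Python version.
    return isinstance(expected_status, compat_integer_types) and expected_status == code

assert is_exact_status(404, 404)
```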

youtube_dl/downloader/dash.py

@@ -2,7 +2,10 @@ from __future__ import unicode_literals

 from .fragment import FragmentFD
 from ..compat import compat_urllib_error
-from ..utils import urljoin
+from ..utils import (
+    DownloadError,
+    urljoin,
+)


 class DashSegmentsFD(FragmentFD):
@@ -57,6 +60,14 @@ class DashSegmentsFD(FragmentFD):
                     count += 1
                     if count <= fragment_retries:
                         self.report_retry_fragment(err, frag_index, count, fragment_retries)
+                except DownloadError:
+                    # Don't retry fragment if error occurred during HTTP downloading
+                    # itself since it has own retry settings
+                    if not fatal:
+                        self.report_skip_fragment(frag_index)
+                        break
+                    raise
+
             if count > fragment_retries:
                 if not fatal:
                     self.report_skip_fragment(frag_index)

youtube_dl/extractor/adultswim.py

@@ -7,6 +7,7 @@ from .turner import TurnerBaseIE
 from ..utils import (
     int_or_none,
     strip_or_none,
+    url_or_none,
 )
@@ -98,7 +99,7 @@ class AdultSwimIE(TurnerBaseIE):
         if not video_id:
             entries = []
             for episode in video_data.get('archiveEpisodes', []):
-                episode_url = episode.get('url')
+                episode_url = url_or_none(episode.get('url'))
                 if not episode_url:
                     continue
                 entries.append(self.url_result(

youtube_dl/extractor/afreecatv.py

@@ -9,6 +9,7 @@ from ..utils import (
     determine_ext,
     ExtractorError,
     int_or_none,
+    url_or_none,
     urlencode_postdata,
     xpath_text,
 )
@@ -304,7 +305,7 @@ class AfreecaTVIE(InfoExtractor):
                 file_elements = video_element.findall(compat_xpath('./file'))
                 one = len(file_elements) == 1
                 for file_num, file_element in enumerate(file_elements, start=1):
-                    file_url = file_element.text
+                    file_url = url_or_none(file_element.text)
                     if not file_url:
                         continue
                     key = file_element.get('key', '')

youtube_dl/extractor/amp.py

@@ -3,11 +3,12 @@ from __future__ import unicode_literals

 from .common import InfoExtractor
 from ..utils import (
-    int_or_none,
-    parse_iso8601,
-    mimetype2ext,
     determine_ext,
     ExtractorError,
+    int_or_none,
+    mimetype2ext,
+    parse_iso8601,
+    url_or_none,
 )
@@ -35,7 +36,7 @@ class AMPIE(InfoExtractor):
                 media_thumbnail = [media_thumbnail]
             for thumbnail_data in media_thumbnail:
                 thumbnail = thumbnail_data.get('@attributes', {})
-                thumbnail_url = thumbnail.get('url')
+                thumbnail_url = url_or_none(thumbnail.get('url'))
                 if not thumbnail_url:
                     continue
                 thumbnails.append({
@@ -51,7 +52,7 @@ class AMPIE(InfoExtractor):
                 media_subtitle = [media_subtitle]
             for subtitle_data in media_subtitle:
                 subtitle = subtitle_data.get('@attributes', {})
-                subtitle_href = subtitle.get('href')
+                subtitle_href = url_or_none(subtitle.get('href'))
                 if not subtitle_href:
                     continue
                 subtitles.setdefault(subtitle.get('lang') or 'en', []).append({
@@ -65,7 +66,7 @@ class AMPIE(InfoExtractor):
             media_content = [media_content]
         for media_data in media_content:
             media = media_data.get('@attributes', {})
-            media_url = media.get('url')
+            media_url = url_or_none(media.get('url'))
             if not media_url:
                 continue
             ext = mimetype2ext(media.get('type')) or determine_ext(media_url)
@@ -79,7 +80,7 @@ class AMPIE(InfoExtractor):
             else:
                 formats.append({
                     'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'),
-                    'url': media['url'],
+                    'url': media_url,
                     'tbr': int_or_none(media.get('bitrate')),
                     'filesize': int_or_none(media.get('fileSize')),
                     'ext': ext,

youtube_dl/extractor/animeondemand.py

@@ -8,6 +8,7 @@ from ..utils import (
     determine_ext,
     extract_attributes,
     ExtractorError,
+    url_or_none,
     urlencode_postdata,
     urljoin,
 )
@@ -165,7 +166,7 @@ class AnimeOnDemandIE(InfoExtractor):
                 }, fatal=False)
                 if not playlist:
                     continue
-                stream_url = playlist.get('streamurl')
+                stream_url = url_or_none(playlist.get('streamurl'))
                 if stream_url:
                     rtmp = re.search(
                         r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+/))(?P<playpath>mp[34]:.+)',

youtube_dl/extractor/aol.py

@@ -7,6 +7,7 @@ from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
     int_or_none,
+    url_or_none,
 )
@@ -77,7 +78,7 @@ class AolIE(InfoExtractor):
             formats.extend(self._extract_m3u8_formats(
                 m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
         for rendition in video_data.get('renditions', []):
-            video_url = rendition.get('url')
+            video_url = url_or_none(rendition.get('url'))
             if not video_url:
                 continue
             ext = rendition.get('format')

youtube_dl/extractor/apa.py

@@ -4,10 +4,10 @@ from __future__ import unicode_literals
 import re

 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     determine_ext,
     js_to_json,
+    url_or_none,
 )
@@ -68,8 +68,8 @@ class APAIE(InfoExtractor):
         for source in sources:
             if not isinstance(source, dict):
                 continue
-            source_url = source.get('file')
-            if not source_url or not isinstance(source_url, compat_str):
+            source_url = url_or_none(source.get('file'))
+            if not source_url:
                 continue
             ext = determine_ext(source_url)
             if ext == 'm3u8':

youtube_dl/extractor/aparat.py

@@ -5,6 +5,7 @@ from .common import InfoExtractor
 from ..utils import (
     int_or_none,
     mimetype2ext,
+    url_or_none,
 )
@@ -43,7 +44,7 @@ class AparatIE(InfoExtractor):
         formats = []
         for item in file_list[0]:
-            file_url = item.get('file')
+            file_url = url_or_none(item.get('file'))
             if not file_url:
                 continue
             ext = mimetype2ext(item.get('type'))

youtube_dl/extractor/ard.py

@@ -5,7 +5,6 @@ import re

 from .common import InfoExtractor
 from .generic import GenericIE
-from ..compat import compat_str
 from ..utils import (
     determine_ext,
     ExtractorError,
@@ -15,6 +14,7 @@ from ..utils import (
     unified_strdate,
     xpath_text,
     update_url_query,
+    url_or_none,
 )
 from ..compat import compat_etree_fromstring
@@ -100,7 +100,7 @@ class ARDMediathekIE(InfoExtractor):
             quality = stream.get('_quality')
             server = stream.get('_server')
             for stream_url in stream_urls:
-                if not isinstance(stream_url, compat_str) or '//' not in stream_url:
+                if not url_or_none(stream_url):
                     continue
                 ext = determine_ext(stream_url)
                 if quality != 'auto' and ext in ('f4m', 'm3u8'):

youtube_dl/extractor/bandcamp.py

@@ -19,6 +19,7 @@ from ..utils import (
     unescapeHTML,
     update_url_query,
     unified_strdate,
+    url_or_none,
 )
@@ -131,8 +132,8 @@ class BandcampIE(InfoExtractor):
                     fatal=False)
                 if not stat:
                     continue
-                retry_url = stat.get('retry_url')
-                if not isinstance(retry_url, compat_str):
+                retry_url = url_or_none(stat.get('retry_url'))
+                if not retry_url:
                     continue
                 formats.append({
                     'url': self._proto_relative_url(retry_url, 'http:'),
@@ -306,7 +307,7 @@ class BandcampWeeklyIE(InfoExtractor):
         formats = []
         for format_id, format_url in show['audio_stream'].items():
-            if not isinstance(format_url, compat_str):
+            if not url_or_none(format_url):
                 continue
             for known_ext in KNOWN_EXTENSIONS:
                 if known_ext in format_id:

youtube_dl/extractor/bbc.py

@@ -21,7 +21,6 @@ from ..utils import (
     urljoin,
 )
 from ..compat import (
-    compat_etree_fromstring,
     compat_HTTPError,
     compat_urlparse,
 )
@@ -334,14 +333,9 @@ class BBCCoUkIE(InfoExtractor):
         self._raise_extractor_error(last_exception)

     def _download_media_selector_url(self, url, programme_id=None):
-        try:
-            media_selection = self._download_xml(
-                url, programme_id, 'Downloading media selection XML')
-        except ExtractorError as ee:
-            if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404):
-                media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8'))
-            else:
-                raise
+        media_selection = self._download_xml(
+            url, programme_id, 'Downloading media selection XML',
+            expected_status=(403, 404))
         return self._process_media_selector(media_selection, programme_id)

     def _process_media_selector(self, media_selection, programme_id):
@@ -784,6 +778,17 @@ class BBCIE(BBCCoUkIE):
         'params': {
             'skip_download': True,
         }
+    }, {
+        # window.__PRELOADED_STATE__
+        'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
+        'info_dict': {
+            'id': 'b0b9z4vz',
+            'ext': 'mp4',
+            'title': 'Prom 6: An American in Paris and Turangalila',
+            'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
+            'uploader': 'Radio 3',
+            'uploader_id': 'bbc_radio_three',
+        },
     }]

     @classmethod
@@ -1006,6 +1011,36 @@ class BBCIE(BBCCoUkIE):
                 'subtitles': subtitles,
             }

+        preload_state = self._parse_json(self._search_regex(
+            r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
+            'preload state', default='{}'), playlist_id, fatal=False)
+        if preload_state:
+            current_programme = preload_state.get('programmes', {}).get('current') or {}
+            programme_id = current_programme.get('id')
+            if current_programme and programme_id and current_programme.get('type') == 'playable_item':
+                title = current_programme.get('titles', {}).get('tertiary') or playlist_title
+                formats, subtitles = self._download_media_selector(programme_id)
+                self._sort_formats(formats)
+                synopses = current_programme.get('synopses') or {}
+                network = current_programme.get('network') or {}
+                duration = int_or_none(
+                    current_programme.get('duration', {}).get('value'))
+                thumbnail = None
+                image_url = current_programme.get('image_url')
+                if image_url:
+                    thumbnail = image_url.replace('{recipe}', '1920x1920')
+                return {
+                    'id': programme_id,
+                    'title': title,
+                    'description': dict_get(synopses, ('long', 'medium', 'short')),
+                    'thumbnail': thumbnail,
+                    'duration': duration,
+                    'uploader': network.get('short_title'),
+                    'uploader_id': network.get('id'),
+                    'formats': formats,
+                    'subtitles': subtitles,
+                }
+
         bbc3_config = self._parse_json(
             self._search_regex(
                 r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,

youtube_dl/extractor/breakcom.py

@@ -4,8 +4,10 @@ import re
 from .common import InfoExtractor
 from .youtube import YoutubeIE
-from ..compat import compat_str
-from ..utils import int_or_none
+from ..utils import (
+    int_or_none,
+    url_or_none,
+)


 class BreakIE(InfoExtractor):
@@ -55,8 +57,8 @@ class BreakIE(InfoExtractor):
         formats = []
         for video in content:
-            video_url = video.get('url')
-            if not video_url or not isinstance(video_url, compat_str):
+            video_url = url_or_none(video.get('url'))
+            if not video_url:
                 continue
             bitrate = int_or_none(self._search_regex(
                 r'(\d+)_kbps', video_url, 'tbr', default=None))

youtube_dl/extractor/brightcove.py

@@ -572,7 +572,8 @@ class BrightcoveNewIE(AdobePassIE):
             container = source.get('container')
             ext = mimetype2ext(source.get('type'))
             src = source.get('src')
-            if ext == 'ism' or container == 'WVM':
+            # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object
+            if ext == 'ism' or container == 'WVM' or source.get('key_systems'):
                 continue
             elif ext == 'm3u8' or container == 'M2TS':
                 if not src:
@@ -629,6 +630,14 @@ class BrightcoveNewIE(AdobePassIE):
                     'format_id': build_format_id('rtmp'),
                 })
             formats.append(f)
+        if not formats:
+            # for sonyliv.com DRM protected videos
+            s3_source_url = json_data.get('custom_fields', {}).get('s3sourceurl')
+            if s3_source_url:
+                formats.append({
+                    'url': s3_source_url,
+                    'format_id': 'source',
+                })

         errors = json_data.get('errors')
         if not formats and errors:

youtube_dl/extractor/cammodels.py

@@ -2,10 +2,10 @@
 from __future__ import unicode_literals

 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     int_or_none,
+    url_or_none,
 )
@@ -56,8 +56,8 @@ class CamModelsIE(InfoExtractor):
             for media in encodings:
                 if not isinstance(media, dict):
                     continue
-                media_url = media.get('location')
-                if not media_url or not isinstance(media_url, compat_str):
+                media_url = url_or_none(media.get('location'))
+                if not media_url:
                     continue

                 format_id_list = [format_id]

youtube_dl/extractor/canvas.py

@@ -11,6 +11,7 @@ from ..utils import (
     strip_or_none,
     float_or_none,
     int_or_none,
+    merge_dicts,
     parse_iso8601,
 )
@@ -248,9 +249,13 @@ class VrtNUIE(GigyaBaseIE):
         webpage, urlh = self._download_webpage_handle(url, display_id)

-        title = self._html_search_regex(
+        info = self._search_json_ld(webpage, display_id, default={})
+
+        # title is optional here since it may be extracted by extractor
+        # that is delegated from here
+        title = strip_or_none(self._html_search_regex(
             r'(?ms)<h1 class="content__heading">(.+?)</h1>',
-            webpage, 'title').strip()
+            webpage, 'title', default=None))

         description = self._html_search_regex(
             r'(?ms)<div class="content__description">(.+?)</div>',
@@ -295,7 +300,7 @@ class VrtNUIE(GigyaBaseIE):
         # the first one
         video_id = list(video.values())[0].get('videoid')

-        return {
+        return merge_dicts(info, {
             '_type': 'url_transparent',
             'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id,
             'ie_key': CanvasIE.ie_key(),
@@ -307,4 +312,4 @@ class VrtNUIE(GigyaBaseIE):
             'season_number': season_number,
             'episode_number': episode_number,
             'release_date': release_date,
-        }
+        })

youtube_dl/extractor/ccma.py

@@ -4,13 +4,13 @@ from __future__ import unicode_literals
 import re

 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     clean_html,
     int_or_none,
     parse_duration,
     parse_iso8601,
     parse_resolution,
+    url_or_none,
 )
@@ -53,8 +53,8 @@ class CCMAIE(InfoExtractor):
         media_url = media['media']['url']
         if isinstance(media_url, list):
             for format_ in media_url:
-                format_url = format_.get('file')
-                if not format_url or not isinstance(format_url, compat_str):
+                format_url = url_or_none(format_.get('file'))
+                if not format_url:
                     continue
                 label = format_.get('label')
                 f = parse_resolution(label)

youtube_dl/extractor/ceskatelevize.py

@@ -108,7 +108,7 @@ class CeskaTelevizeIE(InfoExtractor):
         for user_agent in (None, USER_AGENTS['Safari']):
             req = sanitized_Request(
-                'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
+                'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
                 data=urlencode_postdata(data))

             req.add_header('Content-type', 'application/x-www-form-urlencoded')

youtube_dl/extractor/common.py

@@ -19,6 +19,7 @@ from ..compat import (
     compat_cookies,
     compat_etree_fromstring,
     compat_getpass,
+    compat_integer_types,
     compat_http_client,
     compat_os_name,
     compat_str,
@@ -51,6 +52,7 @@ from ..utils import (
     GeoUtils,
     int_or_none,
     js_to_json,
+    JSON_LD_RE,
     mimetype2ext,
     orderedSet,
     parse_codecs,
@@ -548,8 +550,26 @@ class InfoExtractor(object):
     def IE_NAME(self):
         return compat_str(type(self).__name__[:-2])

+    @staticmethod
+    def __can_accept_status_code(err, expected_status):
+        assert isinstance(err, compat_urllib_error.HTTPError)
+        if expected_status is None:
+            return False
+        if isinstance(expected_status, compat_integer_types):
+            return err.code == expected_status
+        elif isinstance(expected_status, (list, tuple)):
+            return err.code in expected_status
+        elif callable(expected_status):
+            return expected_status(err.code) is True
+        else:
+            assert False
+
-    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
-        """ Returns the response handle """
+    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
+        """
+        Return the response handle.
+
+        See _download_webpage docstring for arguments specification.
+        """
         if note is None:
             self.report_download_webpage(video_id)
         elif note is not False:
@@ -578,6 +598,10 @@ class InfoExtractor(object):
         try:
             return self._downloader.urlopen(url_or_request)
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+            if isinstance(err, compat_urllib_error.HTTPError):
+                if self.__can_accept_status_code(err, expected_status):
+                    return err.fp
+
             if errnote is False:
                 return False
             if errnote is None:
@@ -590,13 +614,17 @@ class InfoExtractor(object):
             self._downloader.report_warning(errmsg)
             return False

-    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
-        """ Returns a tuple (page content as string, URL handle) """
+    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
+        """
+        Return a tuple (page content as string, URL handle).
+
+        See _download_webpage docstring for arguments specification.
+        """
         # Strip hashes from the URL (#1038)
         if isinstance(url_or_request, (compat_str, str)):
             url_or_request = url_or_request.partition('#')[0]

-        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
+        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
         if urlh is False:
             assert not fatal
             return False
@@ -685,13 +713,52 @@ class InfoExtractor(object):
         return content

-    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
-        """ Returns the data of the page as a string """
+    def _download_webpage(
+            self, url_or_request, video_id, note=None, errnote=None,
+            fatal=True, tries=1, timeout=5, encoding=None, data=None,
+            headers={}, query={}, expected_status=None):
+        """
+        Return the data of the page as a string.
+
+        Arguments:
+        url_or_request -- plain text URL as a string or
+            a compat_urllib_request.Request object
+        video_id -- Video/playlist/item identifier (string)
+
+        Keyword arguments:
+        note -- note printed before downloading (string)
+        errnote -- note printed in case of an error (string)
+        fatal -- flag denoting whether error should be considered fatal,
+            i.e. whether it should cause ExtractionError to be raised,
+            otherwise a warning will be reported and extraction continued
+        tries -- number of tries
+        timeout -- sleep interval between tries
+        encoding -- encoding for a page content decoding, guessed automatically
+            when not explicitly specified
+        data -- POST data (bytes)
+        headers -- HTTP headers (dict)
+        query -- URL query (dict)
+        expected_status -- allows to accept failed HTTP requests (non 2xx
+            status code) by explicitly specifying a set of accepted status
+            codes. Can be any of the following entities:
+                - an integer type specifying an exact failed status code to
+                  accept
+                - a list or a tuple of integer types specifying a list of
+                  failed status codes to accept
+                - a callable accepting an actual failed status code and
+                  returning True if it should be accepted
+            Note that this argument does not affect success status codes (2xx)
+            which are always accepted.
+        """
+
         success = False
         try_count = 0
         while success is False:
             try:
-                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
+                res = self._download_webpage_handle(
+                    url_or_request, video_id, note, errnote, fatal,
+                    encoding=encoding, data=data, headers=headers, query=query,
+                    expected_status=expected_status)
                 success = True
             except compat_http_client.IncompleteRead as e:
                 try_count += 1
@@ -707,11 +774,17 @@ class InfoExtractor(object):
     def _download_xml_handle(
             self, url_or_request, video_id, note='Downloading XML',
             errnote='Unable to download XML', transform_source=None,
-            fatal=True, encoding=None, data=None, headers={}, query={}):
-        """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)"""
+            fatal=True, encoding=None, data=None, headers={}, query={},
+            expected_status=None):
+        """
+        Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
+
+        See _download_webpage docstring for arguments specification.
+        """
         res = self._download_webpage_handle(
             url_or_request, video_id, note, errnote, fatal=fatal,
-            encoding=encoding, data=data, headers=headers, query=query)
+            encoding=encoding, data=data, headers=headers, query=query,
+            expected_status=expected_status)
         if res is False:
             return res
         xml_string, urlh = res
@@ -719,15 +792,21 @@ class InfoExtractor(object):
             xml_string, video_id, transform_source=transform_source,
             fatal=fatal), urlh

-    def _download_xml(self, url_or_request, video_id,
-                      note='Downloading XML', errnote='Unable to download XML',
-                      transform_source=None, fatal=True, encoding=None,
-                      data=None, headers={}, query={}):
-        """Return the xml as an xml.etree.ElementTree.Element"""
+    def _download_xml(
+            self, url_or_request, video_id,
+            note='Downloading XML', errnote='Unable to download XML',
+            transform_source=None, fatal=True, encoding=None,
+            data=None, headers={}, query={}, expected_status=None):
+        """
+        Return the xml as an xml.etree.ElementTree.Element.
+
+        See _download_webpage docstring for arguments specification.
+        """
         res = self._download_xml_handle(
             url_or_request, video_id, note=note, errnote=errnote,
             transform_source=transform_source, fatal=fatal, encoding=encoding,
-            data=data, headers=headers, query=query)
+            data=data, headers=headers, query=query,
+            expected_status=expected_status)
         return res if res is False else res[0]

     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
@@ -745,11 +824,17 @@ class InfoExtractor(object):
     def _download_json_handle(
             self, url_or_request, video_id, note='Downloading JSON metadata',
             errnote='Unable to download JSON metadata', transform_source=None,
-            fatal=True, encoding=None, data=None, headers={}, query={}):
-        """Return a tuple (JSON object, URL handle)"""
+            fatal=True, encoding=None, data=None, headers={}, query={},
+            expected_status=None):
+        """
+        Return a tuple (JSON object, URL handle).
+
+        See _download_webpage docstring for arguments specification.
+        """
         res = self._download_webpage_handle(
             url_or_request, video_id, note, errnote, fatal=fatal,
-            encoding=encoding, data=data, headers=headers, query=query)
+            encoding=encoding, data=data, headers=headers, query=query,
+            expected_status=expected_status)
         if res is False:
             return res
         json_string, urlh = res
@@ -760,11 +845,18 @@ class InfoExtractor(object):
     def _download_json(
             self, url_or_request, video_id, note='Downloading JSON metadata',
             errnote='Unable to download JSON metadata', transform_source=None,
-            fatal=True, encoding=None, data=None, headers={}, query={}):
+            fatal=True, encoding=None, data=None, headers={}, query={},
+            expected_status=None):
+        """
+        Return the JSON object as a dict.
+
+        See _download_webpage docstring for arguments specification.
+        """
         res = self._download_json_handle(
             url_or_request, video_id, note=note, errnote=errnote,
             transform_source=transform_source, fatal=fatal, encoding=encoding,
-            data=data, headers=headers, query=query)
+            data=data, headers=headers, query=query,
+            expected_status=expected_status)
         return res if res is False else res[0]

     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
@@ -1058,8 +1150,7 @@ class InfoExtractor(object):
     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
         json_ld = self._search_regex(
-            r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
-            html, 'JSON-LD', group='json_ld', **kwargs)
+            JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs)
         default = kwargs.get('default', NO_DEFAULT)
         if not json_ld:
             return default if default is not NO_DEFAULT else {}
@@ -1768,9 +1859,7 @@ class InfoExtractor(object):
                         'height': height,
                     })
                 formats.extend(m3u8_formats)
-                continue
-
-            if src_ext == 'f4m':
+            elif src_ext == 'f4m':
                 f4m_url = src_url
                 if not f4m_params:
                     f4m_params = {
@@ -1780,9 +1869,13 @@ class InfoExtractor(object):
                 f4m_url += '&' if '?' in f4m_url else '?'
                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
-                continue
-
-            if src_url.startswith('http') and self._is_valid_url(src, video_id):
+            elif src_ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    src_url, video_id, mpd_id='dash', fatal=False))
+            elif re.search(r'\.ism/[Mm]anifest', src_url):
+                formats.extend(self._extract_ism_formats(
+                    src_url, video_id, ism_id='mss', fatal=False))
+            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                 http_count += 1
                 formats.append({
                     'url': src_url,
@@ -1793,7 +1886,6 @@ class InfoExtractor(object):
                     'width': width,
                     'height': height,
                 })
-                continue

         return formats
@@ -2015,7 +2107,21 @@ class InfoExtractor(object):
                 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                 def prepare_template(template_name, identifiers):
-                    t = representation_ms_info[template_name]
+                    tmpl = representation_ms_info[template_name]
+                    # First of, % characters outside $...$ templates
+                    # must be escaped by doubling for proper processing
+                    # by % operator string formatting used further (see
+                    # https://github.com/rg3/youtube-dl/issues/16867).
+                    t = ''
+                    in_template = False
+                    for c in tmpl:
+                        t += c
+                        if c == '$':
+                            in_template = not in_template
+                        elif c == '%' and not in_template:
+                            t += c
+                    # Next, $...$ templates are translated to their
+                    # %(...) counterparts to be used with % operator
                     t = t.replace('$RepresentationID$', representation_id)
                     t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                     t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
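The escaping pass above can be read in isolation: every `%` outside a `$...$` template is doubled so that the later `%`-operator formatting reproduces it literally. A standalone sketch of just that step:

```python
def escape_percent_outside_templates(tmpl):
    # Double bare '%' characters that are not inside $...$ templates.
    t, in_template = '', False
    for c in tmpl:
        t += c
        if c == '$':
            in_template = not in_template
        elif c == '%' and not in_template:
            t += c
    return t

# The literal '%' is doubled; the one inside $Number%05d$ is left alone.
assert escape_percent_outside_templates('chunk_100%_$Number%05d$.m4s') == 'chunk_100%%_$Number%05d$.m4s'
```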
@@ -2346,6 +2452,8 @@ class InfoExtractor(object):
                 media_info['subtitles'].setdefault(lang, []).append({
                     'url': absolute_url(src),
                 })
+        for f in media_info['formats']:
+            f.setdefault('http_headers', {})['Referer'] = base_url
         if media_info['formats'] or media_info['subtitles']:
             entries.append(media_info)
         return entries
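Taken together, the `expected_status` changes let an extractor treat selected non-2xx responses as ordinary payloads. A hedged usage sketch (the URL and field names here are hypothetical):

```python
# Inside an extractor: accept a 404 whose body is still useful JSON,
# instead of raising an ExtractorError.
data = self._download_json(
    'https://example.com/api/video/%s' % video_id, video_id,
    expected_status=404)  # an int, a tuple like (403, 404), or a callable
error = data.get('error')
```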

youtube_dl/extractor/crackle.py

@@ -4,16 +4,14 @@ from __future__ import unicode_literals, division
 import re

 from .common import InfoExtractor
-from ..compat import (
-    compat_str,
-    compat_HTTPError,
-)
+from ..compat import compat_HTTPError
 from ..utils import (
     determine_ext,
     float_or_none,
     int_or_none,
     parse_age_limit,
     parse_duration,
+    url_or_none,
     ExtractorError
 )
@@ -86,8 +84,8 @@ class CrackleIE(InfoExtractor):
         for e in media['MediaURLs']:
             if e.get('UseDRM') is True:
                 continue
-            format_url = e.get('Path')
-            if not format_url or not isinstance(format_url, compat_str):
+            format_url = url_or_none(e.get('Path'))
+            if not format_url:
                 continue
             ext = determine_ext(format_url)
             if ext == 'm3u8':
@@ -124,8 +122,8 @@ class CrackleIE(InfoExtractor):
         for cc_file in cc_files:
             if not isinstance(cc_file, dict):
                 continue
-            cc_url = cc_file.get('Path')
-            if not cc_url or not isinstance(cc_url, compat_str):
+            cc_url = url_or_none(cc_file.get('Path'))
+            if not cc_url:
                 continue
             lang = cc_file.get('Locale') or 'en'
             subtitles.setdefault(lang, []).append({'url': cc_url})

youtube_dl/extractor/crunchyroll.py

@@ -262,6 +262,9 @@ class CrunchyrollIE(CrunchyrollBaseIE):
             # Just test metadata extraction
             'skip_download': True,
         },
+    }, {
+        'url': 'http://www.crunchyroll.com/media-723735',
+        'only_matching': True,
     }]

     _FORMAT_IDS = {
@@ -580,7 +583,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
 class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
     IE_NAME = 'crunchyroll:playlist'
-    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?(?:\?|$)'
+    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login|media-\d+))(?P<id>[\w\-]+))/?(?:\?|$)'

     _TESTS = [{
         'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',

youtube_dl/extractor/dailymotion.py

@@ -2,6 +2,7 @@
 from __future__ import unicode_literals

 import base64
+import functools
 import hashlib
 import itertools
 import json
@@ -16,11 +17,13 @@ from ..utils import (
     error_to_compat_str,
     ExtractorError,
     int_or_none,
+    mimetype2ext,
+    OnDemandPagedList,
     parse_iso8601,
     sanitized_Request,
     str_to_int,
     unescapeHTML,
-    mimetype2ext,
+    urlencode_postdata,
 )
@@ -144,7 +147,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
         age_limit = self._rta_search(webpage)

-        description = self._og_search_description(webpage) or self._html_search_meta(
+        description = self._og_search_description(
+            webpage, default=None) or self._html_search_meta(
             'description', webpage, 'description')

         view_count_str = self._search_regex(
@@ -342,17 +346,93 @@ class DailymotionIE(DailymotionBaseInfoExtractor):

 class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
     IE_NAME = 'dailymotion:playlist'
-    _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>[^/?#&]+)'
-    _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"'
-    _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s'
+    _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>x[0-9a-z]+)'
     _TESTS = [{
         'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q',
         'info_dict': {
             'title': 'SPORT',
-            'id': 'xv4bw_nqtv_sport',
+            'id': 'xv4bw',
         },
         'playlist_mincount': 20,
     }]
+    _PAGE_SIZE = 100
+
+    def _fetch_page(self, playlist_id, authorization, page):
+        page += 1
+        videos = self._download_json(
+            'https://graphql.api.dailymotion.com',
+            playlist_id, 'Downloading page %d' % page,
+            data=json.dumps({
+                'query': '''{
+  collection(xid: "%s") {
+    videos(first: %d, page: %d) {
+      pageInfo {
+        hasNextPage
+        nextPage
+      }
+      edges {
+        node {
+          xid
+          url
+        }
+      }
+    }
+  }
+}''' % (playlist_id, self._PAGE_SIZE, page)
+            }).encode(), headers={
+                'Authorization': authorization,
+                'Origin': 'https://www.dailymotion.com',
+            })['data']['collection']['videos']
+        for edge in videos['edges']:
+            node = edge['node']
+            yield self.url_result(
+                node['url'], DailymotionIE.ie_key(), node['xid'])
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        api = self._parse_json(self._search_regex(
+            r'__PLAYER_CONFIG__\s*=\s*({.+?});',
+            webpage, 'player config'), playlist_id)['context']['api']
+
+        auth = self._download_json(
+            api.get('auth_url', 'https://graphql.api.dailymotion.com/oauth/token'),
+            playlist_id, data=urlencode_postdata({
+                'client_id': api.get('client_id', 'f1a362d288c1b98099c7'),
+                'client_secret': api.get('client_secret', 'eea605b96e01c796ff369935357eca920c5da4c5'),
+                'grant_type': 'client_credentials',
+            }))
+
+        authorization = '%s %s' % (
+            auth.get('token_type', 'Bearer'), auth['access_token'])
+
+        entries = OnDemandPagedList(functools.partial(
+            self._fetch_page, playlist_id, authorization), self._PAGE_SIZE)
+
+        return self.playlist_result(
+            entries, playlist_id,
+            self._og_search_title(webpage))
+
+
+class DailymotionUserIE(DailymotionBaseInfoExtractor):
+    IE_NAME = 'dailymotion:user'
+    _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)'
+    _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"'
+    _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
+    _TESTS = [{
+        'url': 'https://www.dailymotion.com/user/nqtv',
+        'info_dict': {
+            'id': 'nqtv',
+            'title': 'Rémi Gaillard',
+        },
+        'playlist_mincount': 100,
+    }, {
+        'url': 'http://www.dailymotion.com/user/UnderProject',
+        'info_dict': {
+            'id': 'UnderProject',
+            'title': 'UnderProject',
+        },
+        'playlist_mincount': 1800,
+        'expected_warnings': [
+            'Stopped at duplicated page',
+        ],
+        'skip': 'Takes too long',
+    }]

     def _extract_entries(self, id):
         video_ids = set()
@@ -378,43 +458,6 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
             if re.search(self._MORE_PAGES_INDICATOR, webpage) is None:
                 break

-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        playlist_id = mobj.group('id')
-
-        webpage = self._download_webpage(url, playlist_id)
-
-        return {
-            '_type': 'playlist',
-            'id': playlist_id,
-            'title': self._og_search_title(webpage),
-            'entries': self._extract_entries(playlist_id),
-        }
-
-
-class DailymotionUserIE(DailymotionPlaylistIE):
-    IE_NAME = 'dailymotion:user'
-    _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)'
-    _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
-    _TESTS = [{
-        'url': 'https://www.dailymotion.com/user/nqtv',
-        'info_dict': {
-            'id': 'nqtv',
-            'title': 'Rémi Gaillard',
-        },
-        'playlist_mincount': 100,
-    }, {
-        'url': 'http://www.dailymotion.com/user/UnderProject',
-        'info_dict': {
-            'id': 'UnderProject',
-            'title': 'UnderProject',
-        },
-        'playlist_mincount': 1800,
-        'expected_warnings': [
-            'Stopped at duplicated page',
-        ],
-        'skip': 'Takes too long time',
-    }]
-
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         user = mobj.group('user')
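The rewrite above is the dailymotion:playlist fix (#16894): instead of scraping paginated HTML, the extractor obtains an anonymous OAuth token with client credentials scraped from __PLAYER_CONFIG__, then pages through the collection via GraphQL. A rough standalone sketch of that flow, stdlib only; the endpoints and the client-credentials grant come from the code above, while the function name and the lack of error handling are illustrative:

import json
from urllib.parse import urlencode
from urllib.request import Request, urlopen

def playlist_video_urls(playlist_id, client_id, client_secret, page_size=100):
    # 1. Anonymous OAuth token via the client-credentials grant.
    token = json.loads(urlopen(Request(
        'https://graphql.api.dailymotion.com/oauth/token',
        data=urlencode({
            'client_id': client_id,
            'client_secret': client_secret,
            'grant_type': 'client_credentials',
        }).encode())).read())['access_token']
    # 2. Page through collection(xid).videos until hasNextPage is false.
    page = 1
    while True:
        query = '''{
          collection(xid: "%s") {
            videos(first: %d, page: %d) {
              pageInfo { hasNextPage nextPage }
              edges { node { xid url } }
            }
          }
        }''' % (playlist_id, page_size, page)
        videos = json.loads(urlopen(Request(
            'https://graphql.api.dailymotion.com',
            data=json.dumps({'query': query}).encode(),
            headers={
                'Authorization': 'Bearer %s' % token,
                'Origin': 'https://www.dailymotion.com',
            })).read())['data']['collection']['videos']
        for edge in videos['edges']:
            yield edge['node']['url']
        if not videos['pageInfo']['hasNextPage']:
            break
        page = videos['pageInfo']['nextPage']

Because _fetch_page is wrapped in OnDemandPagedList, youtube-dl only issues these requests for the pages that options such as --playlist-items actually touch.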

View File

@@ -5,13 +5,16 @@ from .common import InfoExtractor
 from ..compat import compat_str
 from ..utils import (
     float_or_none,
-    unified_strdate,
+    int_or_none,
+    unified_timestamp,
+    url_or_none,
 )


 class DctpTvIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?dctp\.tv/(?:#/)?filme/(?P<id>[^/?#&]+)'
-    _TEST = {
+    _TESTS = [{
+        # 4x3
         'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/',
         'info_dict': {
             'id': '95eaa4f33dad413aa17b4ee613cccc6c',
@@ -19,37 +22,55 @@ class DctpTvIE(InfoExtractor):
             'ext': 'flv',
             'title': 'Videoinstallation für eine Kaufhausfassade',
             'description': 'Kurzfilm',
-            'upload_date': '20110407',
             'thumbnail': r're:^https?://.*\.jpg$',
             'duration': 71.24,
+            'timestamp': 1302172322,
+            'upload_date': '20110407',
         },
         'params': {
             # rtmp download
             'skip_download': True,
         },
-    }
+    }, {
+        # 16x9
+        'url': 'http://www.dctp.tv/filme/sind-youtuber-die-besseren-lehrer/',
+        'only_matching': True,
+    }]
+
+    _BASE_URL = 'http://dctp-ivms2-restapi.s3.amazonaws.com'

     def _real_extract(self, url):
         display_id = self._match_id(url)

-        webpage = self._download_webpage(url, display_id)
+        version = self._download_json(
+            '%s/version.json' % self._BASE_URL, display_id,
+            'Downloading version JSON')

-        video_id = self._html_search_meta(
-            'DC.identifier', webpage, 'video id',
-            default=None) or self._search_regex(
-            r'id=["\']uuid[^>]+>([^<]+)<', webpage, 'video id')
+        restapi_base = '%s/%s/restapi' % (
+            self._BASE_URL, version['version_name'])

-        title = self._og_search_title(webpage)
+        info = self._download_json(
+            '%s/slugs/%s.json' % (restapi_base, display_id), display_id,
+            'Downloading video info JSON')
+
+        media = self._download_json(
+            '%s/media/%s.json' % (restapi_base, compat_str(info['object_id'])),
+            display_id, 'Downloading media JSON')
+
+        uuid = media['uuid']
+        title = media['title']
+        ratio = '16x9' if media.get('is_wide') else '4x3'
+        play_path = 'mp4:%s_dctp_0500_%s.m4v' % (uuid, ratio)

         servers = self._download_json(
             'http://www.dctp.tv/streaming_servers/', display_id,
-            note='Downloading server list', fatal=False)
+            note='Downloading server list JSON', fatal=False)
         if servers:
             endpoint = next(
                 server['endpoint']
                 for server in servers
-                if isinstance(server.get('endpoint'), compat_str) and
+                if url_or_none(server.get('endpoint')) and
                 'cloudfront' in server['endpoint'])
         else:
             endpoint = 'rtmpe://s2pqqn4u96e4j8.cloudfront.net/cfx/st/'
@@ -60,27 +81,35 @@ class DctpTvIE(InfoExtractor):
         formats = [{
             'url': endpoint,
             'app': app,
-            'play_path': 'mp4:%s_dctp_0500_4x3.m4v' % video_id,
+            'play_path': play_path,
             'page_url': url,
-            'player_url': 'http://svm-prod-dctptv-static.s3.amazonaws.com/dctptv-relaunch2012-109.swf',
+            'player_url': 'http://svm-prod-dctptv-static.s3.amazonaws.com/dctptv-relaunch2012-110.swf',
             'ext': 'flv',
         }]

-        description = self._html_search_meta('DC.description', webpage)
-        upload_date = unified_strdate(
-            self._html_search_meta('DC.date.created', webpage))
-        thumbnail = self._og_search_thumbnail(webpage)
-        duration = float_or_none(self._search_regex(
-            r'id=["\']duration_in_ms[^+]>(\d+)', webpage, 'duration',
-            default=None), scale=1000)
+        thumbnails = []
+        images = media.get('images')
+        if isinstance(images, list):
+            for image in images:
+                if not isinstance(image, dict):
+                    continue
+                image_url = url_or_none(image.get('url'))
+                if not image_url:
+                    continue
+                thumbnails.append({
+                    'url': image_url,
+                    'width': int_or_none(image.get('width')),
+                    'height': int_or_none(image.get('height')),
+                })

         return {
-            'id': video_id,
-            'title': title,
-            'formats': formats,
+            'id': uuid,
             'display_id': display_id,
-            'description': description,
-            'upload_date': upload_date,
-            'thumbnail': thumbnail,
-            'duration': duration,
+            'title': title,
+            'alt_title': media.get('subtitle'),
+            'description': media.get('description') or media.get('teaser'),
+            'timestamp': unified_timestamp(media.get('created')),
+            'duration': float_or_none(media.get('duration_in_ms'), scale=1000),
+            'thumbnails': thumbnails,
+            'formats': formats,
        }
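The DctpTvIE rewrite replaces meta-tag scraping with a small REST chain on S3: version.json yields the API version, slugs/<slug>.json yields the object id, and media/<object_id>.json carries the title, timing and the aspect ratio that selects the RTMP play path. Roughly, with field names as in the diff above and no error handling:

import json
from urllib.request import urlopen

BASE_URL = 'http://dctp-ivms2-restapi.s3.amazonaws.com'

def fetch_dctp_media(slug):
    # Three-step metadata chain mirroring the extractor above.
    get = lambda url: json.loads(urlopen(url).read().decode('utf-8'))
    version = get('%s/version.json' % BASE_URL)
    restapi_base = '%s/%s/restapi' % (BASE_URL, version['version_name'])
    info = get('%s/slugs/%s.json' % (restapi_base, slug))
    media = get('%s/media/%s.json' % (restapi_base, info['object_id']))
    ratio = '16x9' if media.get('is_wide') else '4x3'
    play_path = 'mp4:%s_dctp_0500_%s.m4v' % (media['uuid'], ratio)
    return media, play_path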

View File

@ -3,7 +3,6 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
extract_attributes, extract_attributes,
@ -12,6 +11,7 @@ from ..utils import (
parse_age_limit, parse_age_limit,
remove_end, remove_end,
unescapeHTML, unescapeHTML,
url_or_none,
) )
@ -69,9 +69,8 @@ class DiscoveryGoBaseIE(InfoExtractor):
captions = stream.get('captions') captions = stream.get('captions')
if isinstance(captions, list): if isinstance(captions, list):
for caption in captions: for caption in captions:
-                subtitle_url = caption.get('fileUrl')
-                if (not subtitle_url or not isinstance(subtitle_url, compat_str) or
-                        not subtitle_url.startswith('http')):
+                subtitle_url = url_or_none(caption.get('fileUrl'))
+                if not subtitle_url or not subtitle_url.startswith('http'):
continue continue
lang = caption.get('fileLang', 'en') lang = caption.get('fileLang', 'en')
ext = determine_ext(subtitle_url) ext = determine_ext(subtitle_url)
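This hunk is the first of many in this commit that swap the old isinstance(x, compat_str) checks for url_or_none(), the helper introduced in 2018.07.21. Its contract, in a simplified Python 3 sketch (the real helper lives in youtube_dl/utils.py and uses compat_str):

import re

def url_or_none(url):
    # Return the URL only if it is a string that looks absolute or
    # protocol-relative; otherwise return None so callers can simply
    # write `if not url: continue`.
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None

assert url_or_none('https://example.com/v.mp4')
assert url_or_none('//cdn.example.com/v.mp4')
assert url_or_none('not a url') is None
assert url_or_none(None) is None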

View File

@ -21,6 +21,7 @@ from ..utils import (
unified_strdate, unified_strdate,
unified_timestamp, unified_timestamp,
update_url_query, update_url_query,
urljoin,
USER_AGENTS, USER_AGENTS,
) )
@ -310,9 +311,11 @@ class DPlayItIE(InfoExtractor):
if not info: if not info:
             info_url = self._search_regex(
-                r'url\s*[:=]\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)',
-                webpage, 'info url')
+                (r'playback_json_url\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
+                 r'url\s*[:=]\s*["\'](?P<url>(?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)'),
+                webpage, 'info url', group='url')
+
+            info_url = urljoin(url, info_url)
video_id = info_url.rpartition('/')[-1] video_id = info_url.rpartition('/')[-1]
try: try:
@ -322,6 +325,8 @@ class DPlayItIE(InfoExtractor):
'dplayit_token').value, 'dplayit_token').value,
'Referer': url, 'Referer': url,
}) })
if isinstance(info, compat_str):
info = self._parse_json(info, display_id)
except ExtractorError as e: except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403): if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403):
info = self._parse_json(e.cause.read().decode('utf-8'), display_id) info = self._parse_json(e.cause.read().decode('utf-8'), display_id)
@ -337,6 +342,7 @@ class DPlayItIE(InfoExtractor):
formats = self._extract_m3u8_formats( formats = self._extract_m3u8_formats(
hls_url, display_id, ext='mp4', entry_protocol='m3u8_native', hls_url, display_id, ext='mp4', entry_protocol='m3u8_native',
m3u8_id='hls') m3u8_id='hls')
self._sort_formats(formats)
series = self._html_search_regex( series = self._html_search_regex(
r'(?s)<h1[^>]+class=["\'].*?\bshow_title\b.*?["\'][^>]*>(.+?)</h1>', r'(?s)<h1[^>]+class=["\'].*?\bshow_title\b.*?["\'][^>]*>(.+?)</h1>',
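Two defensive fixes in this DPlayItIE hunk: the playback-info URL scraped from the page may now be relative, hence the urljoin against the page URL, and the payload occasionally arrives double-encoded (a JSON string containing JSON), hence the extra parse when info is still a string. For example, with made-up values:

import json
from urllib.parse import urljoin

page_url = 'https://it.dplay.com/nove/some-show/episode-1/'   # hypothetical
info_url = '/playback/videoPlaybackInfo/12345'                # relative variant
assert urljoin(page_url, info_url) == 'https://it.dplay.com/playback/videoPlaybackInfo/12345'

payload = '"{\\"data\\": {\\"type\\": \\"video\\"}}"'         # double-encoded JSON
info = json.loads(payload)
if isinstance(info, str):          # mirrors the isinstance(info, compat_str) check
    info = json.loads(info)
assert info == {'data': {'type': 'video'}}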

View File

@ -7,7 +7,6 @@ import json
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_HTTPError, compat_HTTPError,
compat_str,
compat_urlparse, compat_urlparse,
) )
from ..utils import ( from ..utils import (
@ -17,6 +16,7 @@ from ..utils import (
parse_age_limit, parse_age_limit,
parse_duration, parse_duration,
unified_timestamp, unified_timestamp,
url_or_none,
) )
@ -139,8 +139,8 @@ class DramaFeverIE(DramaFeverBaseIE):
for sub in subs: for sub in subs:
if not isinstance(sub, dict): if not isinstance(sub, dict):
continue continue
-                sub_url = sub.get('url')
-                if not sub_url or not isinstance(sub_url, compat_str):
+                sub_url = url_or_none(sub.get('url'))
+                if not sub_url:
continue continue
subtitles.setdefault( subtitles.setdefault(
sub.get('code') or sub.get('language') or 'en', []).append({ sub.get('code') or sub.get('language') or 'en', []).append({
@ -163,8 +163,8 @@ class DramaFeverIE(DramaFeverBaseIE):
for format_id, format_dict in download_assets.items(): for format_id, format_dict in download_assets.items():
if not isinstance(format_dict, dict): if not isinstance(format_dict, dict):
continue continue
-            format_url = format_dict.get('url')
-            if not format_url or not isinstance(format_url, compat_str):
+            format_url = url_or_none(format_dict.get('url'))
+            if not format_url:
continue continue
formats.append({ formats.append({
'url': format_url, 'url': format_url,

View File

@ -4,14 +4,12 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import compat_HTTPError
compat_HTTPError,
compat_str,
)
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
unsmuggle_url, unsmuggle_url,
url_or_none,
) )
@ -177,7 +175,7 @@ class EaglePlatformIE(InfoExtractor):
video_id, 'Downloading mp4 JSON', fatal=False) video_id, 'Downloading mp4 JSON', fatal=False)
if mp4_data: if mp4_data:
for format_id, format_url in mp4_data.get('data', {}).items(): for format_id, format_url in mp4_data.get('data', {}).items():
-                if not isinstance(format_url, compat_str):
+                if not url_or_none(format_url):
continue continue
height = int_or_none(format_id) height = int_or_none(format_id)
if height is not None and m3u8_formats_dict.get(height): if height is not None and m3u8_formats_dict.get(height):

View File

@ -8,6 +8,7 @@ from ..utils import (
int_or_none, int_or_none,
try_get, try_get,
unified_timestamp, unified_timestamp,
url_or_none,
) )
@ -34,8 +35,8 @@ class EggheadCourseIE(InfoExtractor):
entries = [] entries = []
for lesson in lessons: for lesson in lessons:
-            lesson_url = lesson.get('http_url')
-            if not lesson_url or not isinstance(lesson_url, compat_str):
+            lesson_url = url_or_none(lesson.get('http_url'))
+            if not lesson_url:
continue continue
lesson_id = lesson.get('id') lesson_id = lesson.get('id')
if lesson_id: if lesson_id:
@ -95,7 +96,8 @@ class EggheadLessonIE(InfoExtractor):
formats = [] formats = []
for _, format_url in lesson['media_urls'].items(): for _, format_url in lesson['media_urls'].items():
-            if not format_url or not isinstance(format_url, compat_str):
+            format_url = url_or_none(format_url)
+            if not format_url:
continue continue
ext = determine_ext(format_url) ext = determine_ext(format_url)
if ext == 'm3u8': if ext == 'm3u8':

View File

@ -11,6 +11,7 @@ from ..utils import (
int_or_none, int_or_none,
parse_duration, parse_duration,
str_to_int, str_to_int,
url_or_none,
) )
@ -82,8 +83,8 @@ class EpornerIE(InfoExtractor):
for format_id, format_dict in formats_dict.items(): for format_id, format_dict in formats_dict.items():
if not isinstance(format_dict, dict): if not isinstance(format_dict, dict):
continue continue
-            src = format_dict.get('src')
-            if not isinstance(src, compat_str) or not src.startswith('http'):
+            src = url_or_none(format_dict.get('src'))
+            if not src or not src.startswith('http'):
continue continue
if kind == 'hls': if kind == 'hls':
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(

View File

@ -0,0 +1,77 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
determine_ext,
int_or_none,
unescapeHTML,
unified_timestamp,
)
class ExpressenIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?expressen\.se/tv/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://www.expressen.se/tv/ledare/ledarsnack/ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden/',
'md5': '2fbbe3ca14392a6b1b36941858d33a45',
'info_dict': {
'id': '8690962',
'ext': 'mp4',
'title': 'Ledarsnack: Om arbetslösheten bland kvinnor i speciellt utsatta områden',
'description': 'md5:f38c81ff69f3de4d269bbda012fcbbba',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 788,
'timestamp': 1526639109,
'upload_date': '20180518',
},
}, {
'url': 'https://www.expressen.se/tv/kultur/kulturdebatt-med-expressens-karin-olsson/',
'only_matching': True,
}]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
def extract_data(name):
return self._parse_json(
self._search_regex(
r'data-%s=(["\'])(?P<value>(?:(?!\1).)+)\1' % name,
webpage, 'info', group='value'),
display_id, transform_source=unescapeHTML)
info = extract_data('video-tracking-info')
video_id = info['videoId']
data = extract_data('article-data')
stream = data['stream']
if determine_ext(stream) == 'm3u8':
formats = self._extract_m3u8_formats(
stream, display_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls')
else:
formats = [{
'url': stream,
}]
self._sort_formats(formats)
title = info.get('titleRaw') or data['title']
description = info.get('descriptionRaw')
thumbnail = info.get('socialMediaImage') or data.get('image')
duration = int_or_none(info.get('videoTotalSecondsDuration') or
data.get('totalSecondsDuration'))
timestamp = unified_timestamp(info.get('publishDate'))
return {
'id': video_id,
'display_id': display_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'duration': duration,
'timestamp': timestamp,
'formats': formats,
}
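The new Expressen extractor reads everything from HTML5 data attributes whose values are HTML-escaped JSON; extract_data pairs a tempered regex with unescapeHTML before parsing. A standalone sketch with a hypothetical snippet, using Python 3's html.unescape in place of youtube-dl's unescapeHTML:

import html
import json
import re

webpage = '<div data-video-tracking-info="{&quot;videoId&quot;:&quot;8690962&quot;}"></div>'

def extract_data(name):
    # Capture the quoted attribute value, unescape the entities, parse JSON.
    value = re.search(
        r'data-%s=(["\'])(?P<value>(?:(?!\1).)+)\1' % name,
        webpage).group('value')
    return json.loads(html.unescape(value))

assert extract_data('video-tracking-info') == {'videoId': '8690962'}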

View File

@ -335,6 +335,7 @@ from .esri import EsriVideoIE
from .europa import EuropaIE from .europa import EuropaIE
from .everyonesmixtape import EveryonesMixtapeIE from .everyonesmixtape import EveryonesMixtapeIE
from .expotv import ExpoTVIE from .expotv import ExpoTVIE
from .expressen import ExpressenIE
from .extremetube import ExtremeTubeIE from .extremetube import ExtremeTubeIE
from .eyedotv import EyedoTVIE from .eyedotv import EyedoTVIE
from .facebook import ( from .facebook import (
@ -372,7 +373,6 @@ from .foxgay import FoxgayIE
from .foxnews import ( from .foxnews import (
FoxNewsIE, FoxNewsIE,
FoxNewsArticleIE, FoxNewsArticleIE,
FoxNewsInsiderIE,
) )
from .foxsports import FoxSportsIE from .foxsports import FoxSportsIE
from .franceculture import FranceCultureIE from .franceculture import FranceCultureIE
@ -390,6 +390,11 @@ from .francetv import (
from .freesound import FreesoundIE from .freesound import FreesoundIE
from .freespeech import FreespeechIE from .freespeech import FreespeechIE
from .freshlive import FreshLiveIE from .freshlive import FreshLiveIE
from .frontendmasters import (
FrontendMastersIE,
FrontendMastersLessonIE,
FrontendMastersCourseIE
)
from .funimation import FunimationIE from .funimation import FunimationIE
from .funk import ( from .funk import (
FunkMixIE, FunkMixIE,
@ -589,6 +594,10 @@ from .mangomolo import (
MangomoloLiveIE, MangomoloLiveIE,
) )
from .manyvids import ManyVidsIE from .manyvids import ManyVidsIE
from .markiza import (
MarkizaIE,
MarkizaPageIE,
)
from .massengeschmacktv import MassengeschmackTVIE from .massengeschmacktv import MassengeschmackTVIE
from .matchtv import MatchTVIE from .matchtv import MatchTVIE
from .mdr import MDRIE from .mdr import MDRIE
@ -759,7 +768,9 @@ from .nrk import (
NRKSkoleIE, NRKSkoleIE,
NRKTVIE, NRKTVIE,
NRKTVDirekteIE, NRKTVDirekteIE,
NRKTVEpisodeIE,
NRKTVEpisodesIE, NRKTVEpisodesIE,
NRKTVSeasonIE,
NRKTVSeriesIE, NRKTVSeriesIE,
) )
from .ntvde import NTVDeIE from .ntvde import NTVDeIE
@ -849,6 +860,10 @@ from .pornhub import (
from .pornotube import PornotubeIE from .pornotube import PornotubeIE
from .pornovoisines import PornoVoisinesIE from .pornovoisines import PornoVoisinesIE
from .pornoxo import PornoXOIE from .pornoxo import PornoXOIE
from .puhutv import (
PuhuTVIE,
PuhuTVSerieIE,
)
from .presstv import PressTVIE from .presstv import PressTVIE
from .primesharetv import PrimeShareTVIE from .primesharetv import PrimeShareTVIE
from .promptfile import PromptFileIE from .promptfile import PromptFileIE
@ -1036,6 +1051,7 @@ from .stretchinternet import StretchInternetIE
from .sunporno import SunPornoIE from .sunporno import SunPornoIE
from .svt import ( from .svt import (
SVTIE, SVTIE,
SVTPageIE,
SVTPlayIE, SVTPlayIE,
SVTSeriesIE, SVTSeriesIE,
) )
@ -1275,6 +1291,7 @@ from .viki import (
VikiIE, VikiIE,
VikiChannelIE, VikiChannelIE,
) )
from .viqeo import ViqeoIE
from .viu import ( from .viu import (
ViuIE, ViuIE,
ViuPlaylistIE, ViuPlaylistIE,

View File

@ -20,6 +20,7 @@ from ..utils import (
int_or_none, int_or_none,
js_to_json, js_to_json,
limit_length, limit_length,
parse_count,
sanitized_Request, sanitized_Request,
try_get, try_get,
urlencode_postdata, urlencode_postdata,
@ -75,7 +76,7 @@ class FacebookIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': '274175099429670', 'id': '274175099429670',
'ext': 'mp4', 'ext': 'mp4',
-            'title': 'Asif Nawab Butt posted a video to his Timeline.',
+            'title': 're:^Asif Nawab Butt posted a video',
'uploader': 'Asif Nawab Butt', 'uploader': 'Asif Nawab Butt',
'upload_date': '20140506', 'upload_date': '20140506',
'timestamp': 1399398998, 'timestamp': 1399398998,
@ -133,7 +134,7 @@ class FacebookIE(InfoExtractor):
}, { }, {
# have 1080P, but only up to 720p in swf params # have 1080P, but only up to 720p in swf params
'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/',
-        'md5': '0d9813160b146b3bc8744e006027fcc6',
+        'md5': '9571fae53d4165bbbadb17a94651dcdc',
'info_dict': { 'info_dict': {
'id': '10155529876156509', 'id': '10155529876156509',
'ext': 'mp4', 'ext': 'mp4',
@ -142,6 +143,7 @@ class FacebookIE(InfoExtractor):
'upload_date': '20161030', 'upload_date': '20161030',
'uploader': 'CNN', 'uploader': 'CNN',
'thumbnail': r're:^https?://.*', 'thumbnail': r're:^https?://.*',
'view_count': int,
}, },
}, { }, {
# bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall
@ -149,7 +151,7 @@ class FacebookIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': '1417995061575415', 'id': '1417995061575415',
'ext': 'mp4', 'ext': 'mp4',
-            'title': 'md5:a7b86ca673f51800cd54687b7f4012fe',
+            'title': 'md5:1db063d6a8c13faa8da727817339c857',
'timestamp': 1486648217, 'timestamp': 1486648217,
'upload_date': '20170209', 'upload_date': '20170209',
'uploader': 'Yaroslav Korpan', 'uploader': 'Yaroslav Korpan',
@ -176,7 +178,7 @@ class FacebookIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': '1396382447100162', 'id': '1396382447100162',
'ext': 'mp4', 'ext': 'mp4',
-            'title': 'md5:e2d2700afdf84e121f5d0f999bad13a3',
+            'title': 'md5:19a428bbde91364e3de815383b54a235',
'timestamp': 1486035494, 'timestamp': 1486035494,
'upload_date': '20170202', 'upload_date': '20170202',
'uploader': 'Elisabeth Ahtn', 'uploader': 'Elisabeth Ahtn',
@ -353,7 +355,6 @@ class FacebookIE(InfoExtractor):
         tahoe_data = self._download_webpage(
             self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id,
             data=urlencode_postdata({
-                '__user': 0,
                 '__a': 1,
                 '__pc': self._search_regex(
                     r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage,
@@ -361,6 +362,9 @@
                 '__rev': self._search_regex(
                     r'client_revision["\']\s*:\s*(\d+),', webpage,
                     'client revision', default='3944515'),
+                'fb_dtsg': self._search_regex(
+                    r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"',
+                    webpage, 'dtsg token', default=''),
}), }),
headers={ headers={
'Content-Type': 'application/x-www-form-urlencoded', 'Content-Type': 'application/x-www-form-urlencoded',
@ -426,6 +430,10 @@ class FacebookIE(InfoExtractor):
'timestamp', default=None)) 'timestamp', default=None))
thumbnail = self._og_search_thumbnail(webpage) thumbnail = self._og_search_thumbnail(webpage)
view_count = parse_count(self._search_regex(
r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count',
default=None))
info_dict = { info_dict = {
'id': video_id, 'id': video_id,
'title': video_title, 'title': video_title,
@ -433,6 +441,7 @@ class FacebookIE(InfoExtractor):
'uploader': uploader, 'uploader': uploader,
'timestamp': timestamp, 'timestamp': timestamp,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
'view_count': view_count,
} }
return webpage, info_dict return webpage, info_dict
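The tahoe-player change is the authenticated-extraction fix (#16655): with a logged-in session the tahoe endpoint appears to require the session's fb_dtsg CSRF token echoed back, so the extractor now scrapes it from the DTSGInitialData blob and stops forcing __user to 0. Just the scrape, against a hypothetical page fragment:

import re

webpage = '..., ["DTSGInitialData", [], {"token": "AQHxYz123abc"}], ...'  # made-up token

m = re.search(
    r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"',
    webpage)
fb_dtsg = m.group(1) if m else ''   # the extractor defaults to '' when absent
assert fb_dtsg == 'AQHxYz123abc'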

View File

@ -10,6 +10,7 @@ from ..utils import (
int_or_none, int_or_none,
qualities, qualities,
unified_strdate, unified_strdate,
url_or_none,
) )
@ -88,8 +89,8 @@ class FirstTVIE(InfoExtractor):
formats = [] formats = []
path = None path = None
for f in item.get('mbr', []): for f in item.get('mbr', []):
-            src = f.get('src')
-            if not src or not isinstance(src, compat_str):
+            src = url_or_none(f.get('src'))
+            if not src:
continue continue
tbr = int_or_none(self._search_regex( tbr = int_or_none(self._search_regex(
r'_(\d{3,})\.mp4', src, 'tbr', default=None)) r'_(\d{3,})\.mp4', src, 'tbr', default=None))

View File

@ -58,6 +58,14 @@ class FoxNewsIE(AMPIE):
}, },
] ]
@staticmethod
def _extract_urls(webpage):
return [
mobj.group('url')
for mobj in re.finditer(
r'<(?:amp-)?iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.foxnews\.com/v/video-embed\.html?.*?\bvideo_id=\d+.*?)\1',
webpage)]
def _real_extract(self, url): def _real_extract(self, url):
host, video_id = re.match(self._VALID_URL, url).groups() host, video_id = re.match(self._VALID_URL, url).groups()
@ -68,21 +76,41 @@ class FoxNewsIE(AMPIE):
class FoxNewsArticleIE(InfoExtractor): class FoxNewsArticleIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)'
+    _VALID_URL = r'https?://(?:www\.)?(?:insider\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)'
     IE_NAME = 'foxnews:article'

-    _TEST = {
+    _TESTS = [{
+        # data-video-id
         'url': 'http://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html',
-        'md5': '62aa5a781b308fdee212ebb6f33ae7ef',
+        'md5': '83d44e1aff1433e7a29a7b537d1700b5',
         'info_dict': {
             'id': '5116295019001',
             'ext': 'mp4',
             'title': 'Trump and Clinton asked to defend positions on Iraq War',
             'description': 'Veterans react on \'The Kelly File\'',
-            'timestamp': 1473299755,
+            'timestamp': 1473301045,
             'upload_date': '20160908',
         },
-    }
+    }, {
+        # iframe embed
+        'url': 'http://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true',
+        'info_dict': {
+            'id': '5748266721001',
+            'ext': 'flv',
+            'title': 'Kyle Kashuv has a positive message for the Trump White House',
+            'description': 'Marjory Stoneman Douglas student disagrees with classmates.',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 229,
+            'timestamp': 1520594670,
+            'upload_date': '20180309',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words',
+        'only_matching': True,
+    }]
def _real_extract(self, url): def _real_extract(self, url):
display_id = self._match_id(url) display_id = self._match_id(url)
@ -90,51 +118,10 @@ class FoxNewsArticleIE(InfoExtractor):
video_id = self._html_search_regex( video_id = self._html_search_regex(
r'data-video-id=([\'"])(?P<id>[^\'"]+)\1', r'data-video-id=([\'"])(?P<id>[^\'"]+)\1',
-            webpage, 'video ID', group='id')
+            webpage, 'video ID', group='id', default=None)
+        if video_id:
+            return self.url_result(
+                'http://video.foxnews.com/v/' + video_id, FoxNewsIE.ie_key())
+
         return self.url_result(
-            'http://video.foxnews.com/v/' + video_id,
-            FoxNewsIE.ie_key())
+            FoxNewsIE._extract_urls(webpage)[0], FoxNewsIE.ie_key())
class FoxNewsInsiderIE(InfoExtractor):
_VALID_URL = r'https?://insider\.foxnews\.com/([^/]+/)+(?P<id>[a-z-]+)'
IE_NAME = 'foxnews:insider'
_TEST = {
'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words',
'md5': 'a10c755e582d28120c62749b4feb4c0c',
'info_dict': {
'id': '5099377331001',
'display_id': 'univ-wisconsin-student-group-pushing-silence-certain-words',
'ext': 'mp4',
'title': 'Student Group: Saying \'Politically Correct,\' \'Trash\' and \'Lame\' Is Offensive',
'description': 'Is campus censorship getting out of control?',
'timestamp': 1472168725,
'upload_date': '20160825',
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {
# m3u8 download
'skip_download': True,
},
'add_ie': [FoxNewsIE.ie_key()],
}
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
embed_url = self._html_search_meta('embedUrl', webpage, 'embed URL')
title = self._og_search_title(webpage)
description = self._og_search_description(webpage)
return {
'_type': 'url_transparent',
'ie_key': FoxNewsIE.ie_key(),
'url': embed_url,
'display_id': display_id,
'title': title,
'description': description,
}
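FoxNewsInsiderIE is folded away: insider.foxnews.com URLs now match FoxNewsArticleIE, which falls back from data-video-id to the first iframe embed found by the new static _extract_urls (the same scanner GenericIE picks up further down). Against a hypothetical AMP fragment:

import re

webpage = ('<amp-iframe width="480" src="//video.foxnews.com/v/'
           'video-embed.html?video_id=5748266721001&autoplay=true">'
           '</amp-iframe>')

urls = [m.group('url') for m in re.finditer(
    r'<(?:amp-)?iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.foxnews\.com/v/video-embed\.html?.*?\bvideo_id=\d+.*?)\1',
    webpage)]
assert urls == ['//video.foxnews.com/v/video-embed.html?video_id=5748266721001&autoplay=true']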

View File

@ -16,6 +16,7 @@ from ..utils import (
int_or_none, int_or_none,
parse_duration, parse_duration,
try_get, try_get,
url_or_none,
) )
from .dailymotion import DailymotionIE from .dailymotion import DailymotionIE
@ -115,14 +116,13 @@ class FranceTVIE(InfoExtractor):
def sign(manifest_url, manifest_id): def sign(manifest_url, manifest_id):
for host in ('hdfauthftv-a.akamaihd.net', 'hdfauth.francetv.fr'): for host in ('hdfauthftv-a.akamaihd.net', 'hdfauth.francetv.fr'):
-                signed_url = self._download_webpage(
+                signed_url = url_or_none(self._download_webpage(
                     'https://%s/esi/TA' % host, video_id,
                     'Downloading signed %s manifest URL' % manifest_id,
                     fatal=False, query={
                         'url': manifest_url,
-                    })
-                if (signed_url and isinstance(signed_url, compat_str) and
-                        re.search(r'^(?:https?:)?//', signed_url)):
+                    }))
+                if signed_url:
                     return signed_url
return manifest_url return manifest_url

View File

@ -0,0 +1,263 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_str,
compat_urlparse,
)
from ..utils import (
ExtractorError,
parse_duration,
url_or_none,
urlencode_postdata,
)
class FrontendMastersBaseIE(InfoExtractor):
_API_BASE = 'https://api.frontendmasters.com/v1/kabuki'
_LOGIN_URL = 'https://frontendmasters.com/login/'
_NETRC_MACHINE = 'frontendmasters'
_QUALITIES = {
'low': {'width': 480, 'height': 360},
'mid': {'width': 1280, 'height': 720},
'high': {'width': 1920, 'height': 1080}
}
def _real_initialize(self):
self._login()
def _login(self):
(username, password) = self._get_login_info()
if username is None:
return
login_page = self._download_webpage(
self._LOGIN_URL, None, 'Downloading login page')
login_form = self._hidden_inputs(login_page)
login_form.update({
'username': username,
'password': password
})
post_url = self._search_regex(
r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
'post_url', default=self._LOGIN_URL, group='url')
if not post_url.startswith('http'):
post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)
response = self._download_webpage(
post_url, None, 'Logging in', data=urlencode_postdata(login_form),
headers={'Content-Type': 'application/x-www-form-urlencoded'})
# Successful login
if any(p in response for p in (
'wp-login.php?action=logout', '>Logout')):
return
error = self._html_search_regex(
r'class=(["\'])(?:(?!\1).)*\bMessageAlert\b(?:(?!\1).)*\1[^>]*>(?P<error>[^<]+)<',
response, 'error message', default=None, group='error')
if error:
raise ExtractorError('Unable to login: %s' % error, expected=True)
raise ExtractorError('Unable to log in')
class FrontendMastersPageBaseIE(FrontendMastersBaseIE):
def _download_course(self, course_name, url):
return self._download_json(
'%s/courses/%s' % (self._API_BASE, course_name), course_name,
'Downloading course JSON', headers={'Referer': url})
@staticmethod
def _extract_chapters(course):
chapters = []
lesson_elements = course.get('lessonElements')
if isinstance(lesson_elements, list):
chapters = [url_or_none(e) for e in lesson_elements if url_or_none(e)]
return chapters
@staticmethod
def _extract_lesson(chapters, lesson_id, lesson):
title = lesson.get('title') or lesson_id
display_id = lesson.get('slug')
description = lesson.get('description')
thumbnail = lesson.get('thumbnail')
chapter_number = None
index = lesson.get('index')
element_index = lesson.get('elementIndex')
if (isinstance(index, int) and isinstance(element_index, int) and
index < element_index):
chapter_number = element_index - index
chapter = (chapters[chapter_number - 1]
if chapter_number - 1 < len(chapters) else None)
duration = None
timestamp = lesson.get('timestamp')
if isinstance(timestamp, compat_str):
mobj = re.search(
r'(?P<start>\d{1,2}:\d{1,2}:\d{1,2})\s*-(?P<end>\s*\d{1,2}:\d{1,2}:\d{1,2})',
timestamp)
if mobj:
duration = parse_duration(mobj.group('end')) - parse_duration(
mobj.group('start'))
return {
'_type': 'url_transparent',
'url': 'frontendmasters:%s' % lesson_id,
'ie_key': FrontendMastersIE.ie_key(),
'id': lesson_id,
'display_id': display_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'duration': duration,
'chapter': chapter,
'chapter_number': chapter_number,
}
class FrontendMastersIE(FrontendMastersBaseIE):
_VALID_URL = r'(?:frontendmasters:|https?://api\.frontendmasters\.com/v\d+/kabuki/video/)(?P<id>[^/]+)'
_TESTS = [{
'url': 'https://api.frontendmasters.com/v1/kabuki/video/a2qogef6ba',
'md5': '7f161159710d6b7016a4f4af6fcb05e2',
'info_dict': {
'id': 'a2qogef6ba',
'ext': 'mp4',
'title': 'a2qogef6ba',
},
'skip': 'Requires FrontendMasters account credentials',
}, {
'url': 'frontendmasters:a2qogef6ba',
'only_matching': True,
}]
def _real_extract(self, url):
lesson_id = self._match_id(url)
source_url = '%s/video/%s/source' % (self._API_BASE, lesson_id)
formats = []
for ext in ('webm', 'mp4'):
for quality in ('low', 'mid', 'high'):
resolution = self._QUALITIES[quality].copy()
format_id = '%s-%s' % (ext, quality)
format_url = self._download_json(
source_url, lesson_id,
'Downloading %s source JSON' % format_id, query={
'f': ext,
'r': resolution['height'],
}, headers={
'Referer': url,
}, fatal=False)['url']
if not format_url:
continue
f = resolution.copy()
f.update({
'url': format_url,
'ext': ext,
'format_id': format_id,
})
formats.append(f)
self._sort_formats(formats)
subtitles = {
'en': [{
'url': '%s/transcripts/%s.vtt' % (self._API_BASE, lesson_id),
}]
}
return {
'id': lesson_id,
'title': lesson_id,
'formats': formats,
'subtitles': subtitles
}
class FrontendMastersLessonIE(FrontendMastersPageBaseIE):
_VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<course_name>[^/]+)/(?P<lesson_name>[^/]+)'
_TEST = {
'url': 'https://frontendmasters.com/courses/web-development/tools',
'info_dict': {
'id': 'a2qogef6ba',
'display_id': 'tools',
'ext': 'mp4',
'title': 'Tools',
'description': 'md5:82c1ea6472e88ed5acd1829fe992e4f7',
'thumbnail': r're:^https?://.*\.jpg$',
'chapter': 'Introduction',
'chapter_number': 1,
},
'params': {
'skip_download': True,
},
'skip': 'Requires FrontendMasters account credentials',
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
course_name, lesson_name = mobj.group('course_name', 'lesson_name')
course = self._download_course(course_name, url)
lesson_id, lesson = next(
(video_id, data)
for video_id, data in course['lessonData'].items()
if data.get('slug') == lesson_name)
chapters = self._extract_chapters(course)
return self._extract_lesson(chapters, lesson_id, lesson)
class FrontendMastersCourseIE(FrontendMastersPageBaseIE):
_VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<id>[^/]+)'
_TEST = {
'url': 'https://frontendmasters.com/courses/web-development/',
'info_dict': {
'id': 'web-development',
'title': 'Introduction to Web Development',
'description': 'md5:9317e6e842098bf725d62360e52d49a6',
},
'playlist_count': 81,
'skip': 'Requires FrontendMasters account credentials',
}
@classmethod
def suitable(cls, url):
return False if FrontendMastersLessonIE.suitable(url) else super(
FrontendMastersBaseIE, cls).suitable(url)
def _real_extract(self, url):
course_name = self._match_id(url)
course = self._download_course(course_name, url)
chapters = self._extract_chapters(course)
lessons = sorted(
course['lessonData'].values(), key=lambda data: data['index'])
entries = []
for lesson in lessons:
lesson_name = lesson.get('slug')
if not lesson_name:
continue
lesson_id = lesson.get('hash') or lesson.get('statsId')
entries.append(self._extract_lesson(chapters, lesson_id, lesson))
title = course.get('title')
description = course.get('description')
return self.playlist_result(entries, course_name, title, description)
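FrontendMastersIE builds its format list by querying the /source endpoint once per container/quality pair; the _QUALITIES table supplies the r (height) parameter. The enumeration, with the network call stubbed out (a sketch of the loop, not the actual module):

QUALITIES = {
    'low': {'width': 480, 'height': 360},
    'mid': {'width': 1280, 'height': 720},
    'high': {'width': 1920, 'height': 1080},
}

def format_requests(lesson_id, api_base='https://api.frontendmasters.com/v1/kabuki'):
    # Yields (format_id, url, query) triples mirroring FrontendMastersIE;
    # the real code also sends a Referer header and reads back {'url': ...}.
    source_url = '%s/video/%s/source' % (api_base, lesson_id)
    for ext in ('webm', 'mp4'):
        for quality in ('low', 'mid', 'high'):
            yield ('%s-%s' % (ext, quality), source_url,
                   {'f': ext, 'r': QUALITIES[quality]['height']})

for format_id, url, query in format_requests('a2qogef6ba'):
    print(format_id, query)   # webm-low {'f': 'webm', 'r': 360} ...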

View File

@ -5,6 +5,7 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from .nexx import NexxIE from .nexx import NexxIE
from ..compat import compat_str
from ..utils import ( from ..utils import (
int_or_none, int_or_none,
try_get, try_get,
@ -12,6 +13,19 @@ from ..utils import (
class FunkBaseIE(InfoExtractor): class FunkBaseIE(InfoExtractor):
_HEADERS = {
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.9,ru;q=0.8',
'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoid2ViYXBwLXYzMSIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxuZXh4LWNvbnRlbnQtYXBpLXYzMSx3ZWJhcHAtYXBpIn0.mbuG9wS9Yf5q6PqgR4fiaRFIagiHk9JhwoKES7ksVX4',
}
_AUTH = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoid2ViYXBwLXYzMSIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxuZXh4LWNvbnRlbnQtYXBpLXYzMSx3ZWJhcHAtYXBpIn0.mbuG9wS9Yf5q6PqgR4fiaRFIagiHk9JhwoKES7ksVX4'
@staticmethod
def _make_headers(referer):
headers = FunkBaseIE._HEADERS.copy()
headers['Referer'] = referer
return headers
def _make_url_result(self, video): def _make_url_result(self, video):
return { return {
'_type': 'url_transparent', '_type': 'url_transparent',
@ -48,19 +62,19 @@ class FunkMixIE(FunkBaseIE):
         lists = self._download_json(
             'https://www.funk.net/api/v3.1/curation/curatedLists/',
-            mix_id, headers={
-                'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoiY3VyYXRpb24tdG9vbC12Mi4wIiwic2NvcGUiOiJzdGF0aWMtY29udGVudC1hcGksY3VyYXRpb24tc2VydmljZSxzZWFyY2gtYXBpIn0.SGCC1IXHLtZYoo8PvRKlU2gXH1su8YSu47sB3S4iXBI',
-                'Referer': url,
-            }, query={
+            mix_id, headers=self._make_headers(url), query={
                 'size': 100,
-            })['result']['lists']
+            })['_embedded']['curatedListList']

         metas = next(
             l for l in lists
             if mix_id in (l.get('entityId'), l.get('alias')))['videoMetas']
         video = next(
             meta['videoDataDelegate']
-            for meta in metas if meta.get('alias') == alias)
+            for meta in metas
+            if try_get(
+                meta, lambda x: x['videoDataDelegate']['alias'],
+                compat_str) == alias)

         return self._make_url_result(video)
@ -104,25 +118,39 @@ class FunkChannelIE(FunkBaseIE):
channel_id = mobj.group('id') channel_id = mobj.group('id')
alias = mobj.group('alias') alias = mobj.group('alias')
-        headers = {
-            'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoiY3VyYXRpb24tdG9vbCIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxzZWFyY2gtYXBpIn0.q4Y2xZG8PFHai24-4Pjx2gym9RmJejtmK6lMXP5wAgc',
-            'Referer': url,
-        }
+        headers = self._make_headers(url)
video = None video = None
-        by_id_list = self._download_json(
-            'https://www.funk.net/api/v3.0/content/videos/byIdList', channel_id,
-            headers=headers, query={
-                'ids': alias,
+        # Id-based channels are currently broken on their side: webplayer
+        # tries to process them via byChannelAlias endpoint and fails
+        # predictably.
+        by_channel_alias = self._download_json(
+            'https://www.funk.net/api/v3.1/webapp/videos/byChannelAlias/%s'
+            % channel_id,
+            'Downloading byChannelAlias JSON', headers=headers, query={
+                'size': 100,
             }, fatal=False)
-        if by_id_list:
-            video = try_get(by_id_list, lambda x: x['result'][0], dict)
+        if by_channel_alias:
+            video_list = try_get(
+                by_channel_alias, lambda x: x['_embedded']['videoList'], list)
+            if video_list:
+                video = next(r for r in video_list if r.get('alias') == alias)
+
+        if not video:
+            by_id_list = self._download_json(
+                'https://www.funk.net/api/v3.0/content/videos/byIdList',
+                channel_id, 'Downloading byIdList JSON', headers=headers,
+                query={
+                    'ids': alias,
+                }, fatal=False)
+            if by_id_list:
+                video = try_get(by_id_list, lambda x: x['result'][0], dict)

         if not video:
             results = self._download_json(
-                'https://www.funk.net/api/v3.0/content/videos/filter', channel_id,
-                headers=headers, query={
+                'https://www.funk.net/api/v3.0/content/videos/filter',
+                channel_id, 'Downloading filter JSON', headers=headers, query={
                     'channelId': channel_id,
                     'size': 100,
                 })['result']
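FunkChannelIE now walks three endpoints in order (byChannelAlias, then byIdList, then the filter API) and leans on try_get so that any missing key in a nested response yields None and falls through to the next source. try_get behaves roughly like this simplified single-getter version of the utils helper:

def try_get(src, getter, expected_type=None):
    # Apply the getter, swallowing the usual lookup errors; optionally
    # require the result to be of expected_type.
    try:
        v = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    return v if expected_type is None or isinstance(v, expected_type) else None

resp = {'_embedded': {'videoList': [{'alias': 'x'}]}}
assert try_get(resp, lambda x: x['_embedded']['videoList'], list) == [{'alias': 'x'}]
assert try_get(resp, lambda x: x['result'][0], dict) is None  # falls through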

View File

@ -32,6 +32,7 @@ from ..utils import (
unified_strdate, unified_strdate,
unsmuggle_url, unsmuggle_url,
UnsupportedError, UnsupportedError,
url_or_none,
xpath_text, xpath_text,
) )
from .commonprotocols import RtmpIE from .commonprotocols import RtmpIE
@ -111,6 +112,8 @@ from .cloudflarestream import CloudflareStreamIE
from .peertube import PeerTubeIE from .peertube import PeerTubeIE
from .indavideo import IndavideoEmbedIE from .indavideo import IndavideoEmbedIE
from .apa import APAIE from .apa import APAIE
from .foxnews import FoxNewsIE
from .viqeo import ViqeoIE
class GenericIE(InfoExtractor): class GenericIE(InfoExtractor):
@ -1394,17 +1397,6 @@ class GenericIE(InfoExtractor):
'skip_download': True, 'skip_download': True,
}, },
}, },
# SVT embed
{
'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
'info_dict': {
'id': '2900353',
'ext': 'flv',
'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
'duration': 27,
'age_limit': 0,
},
},
# Crooks and Liars embed # Crooks and Liars embed
{ {
'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists', 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
@ -2069,6 +2061,15 @@ class GenericIE(InfoExtractor):
}, },
'skip': 'TODO: fix nested playlists processing in tests', 'skip': 'TODO: fix nested playlists processing in tests',
}, },
{
# Viqeo embeds
'url': 'https://viqeo.tv/',
'info_dict': {
'id': 'viqeo',
'title': 'All-new video platform',
},
'playlist_count': 6,
},
# { # {
# # TODO: find another test # # TODO: find another test
# # http://schema.org/VideoObject # # http://schema.org/VideoObject
@ -3076,7 +3077,7 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches( return self.playlist_from_matches(
cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key()) cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key())
-        peertube_urls = PeerTubeIE._extract_urls(webpage)
+        peertube_urls = PeerTubeIE._extract_urls(webpage, url)
if peertube_urls: if peertube_urls:
return self.playlist_from_matches( return self.playlist_from_matches(
peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key())
@ -3091,6 +3092,11 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches( return self.playlist_from_matches(
apa_urls, video_id, video_title, ie=APAIE.ie_key()) apa_urls, video_id, video_title, ie=APAIE.ie_key())
foxnews_urls = FoxNewsIE._extract_urls(webpage)
if foxnews_urls:
return self.playlist_from_matches(
foxnews_urls, video_id, video_title, ie=FoxNewsIE.ie_key())
sharevideos_urls = [mobj.group('url') for mobj in re.finditer( sharevideos_urls = [mobj.group('url') for mobj in re.finditer(
r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1',
webpage)] webpage)]
@ -3098,6 +3104,11 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches( return self.playlist_from_matches(
sharevideos_urls, video_id, video_title) sharevideos_urls, video_id, video_title)
viqeo_urls = ViqeoIE._extract_urls(webpage)
if viqeo_urls:
return self.playlist_from_matches(
viqeo_urls, video_id, video_title, ie=ViqeoIE.ie_key())
# Look for HTML5 media # Look for HTML5 media
entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
if entries: if entries:
@ -3135,8 +3146,8 @@ class GenericIE(InfoExtractor):
sources = [sources] sources = [sources]
formats = [] formats = []
for source in sources: for source in sources:
-                    src = source.get('src')
-                    if not src or not isinstance(src, compat_str):
+                    src = url_or_none(source.get('src'))
+                    if not src:
continue continue
src = compat_urlparse.urljoin(url, src) src = compat_urlparse.urljoin(url, src)
src_type = source.get('type') src_type = source.get('type')

View File

@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_HTTPError
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
ExtractorError, ExtractorError,
@ -14,8 +15,8 @@ from ..utils import (
 class Go90IE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?go90\.com/videos/(?P<id>[0-9a-zA-Z]+)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?go90\.com/(?:videos|embed)/(?P<id>[0-9a-zA-Z]+)'
+    _TESTS = [{
         'url': 'https://www.go90.com/videos/84BUqjLpf9D',
         'md5': 'efa7670dbbbf21a7b07b360652b24a32',
         'info_dict': {
@@ -27,15 +28,31 @@ class Go90IE(InfoExtractor):
             'upload_date': '20170411',
             'age_limit': 14,
         }
-    }
+    }, {
+        'url': 'https://www.go90.com/embed/261MflWkD3N',
+        'only_matching': True,
+    }]
+    _GEO_BYPASS = False

     def _real_extract(self, url):
         video_id = self._match_id(url)
-        video_data = self._download_json(
-            'https://www.go90.com/api/view/items/' + video_id,
-            video_id, headers={
+
+        try:
+            headers = self.geo_verification_headers()
+            headers.update({
                 'Content-Type': 'application/json; charset=utf-8',
-            }, data=b'{"client":"web","device_type":"pc"}')
+            })
+            video_data = self._download_json(
+                'https://www.go90.com/api/view/items/' + video_id, video_id,
+                headers=headers, data=b'{"client":"web","device_type":"pc"}')
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+                message = self._parse_json(e.cause.read().decode(), None)['error']['message']
+                if 'region unavailable' in message:
+                    self.raise_geo_restricted(countries=['US'])
+                raise ExtractorError(message, expected=True)
+            raise
+
         if video_data.get('requires_drm'):
             raise ExtractorError('This video is DRM protected.', expected=True)
         main_video_asset = video_data['main_video_asset']
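Go90IE now opts out of the automatic geo bypass (_GEO_BYPASS = False), sends geo_verification_headers itself, and inspects HTTP 400 bodies: a "region unavailable" message is surfaced as a proper geo-restriction. The shape of that handling in plain urllib, with RuntimeError standing in for youtube-dl's error types:

import json
from urllib.error import HTTPError
from urllib.request import Request, urlopen

def fetch_view_item(video_id):
    req = Request(
        'https://www.go90.com/api/view/items/' + video_id,
        data=b'{"client":"web","device_type":"pc"}',
        headers={'Content-Type': 'application/json; charset=utf-8'})
    try:
        return json.loads(urlopen(req).read())
    except HTTPError as e:
        if e.code == 400:
            message = json.loads(e.read().decode())['error']['message']
            if 'region unavailable' in message:
                raise RuntimeError('geo-restricted to US')  # raise_geo_restricted
            raise RuntimeError(message)                     # expected error
        raise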

View File

@ -8,6 +8,7 @@ from ..compat import compat_str
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
url_or_none,
urlencode_postdata, urlencode_postdata,
) )
@ -80,8 +81,8 @@ class HiDiveIE(InfoExtractor):
bitrates = rendition.get('bitrates') bitrates = rendition.get('bitrates')
if not isinstance(bitrates, dict): if not isinstance(bitrates, dict):
continue continue
-                m3u8_url = bitrates.get('hls')
-                if not isinstance(m3u8_url, compat_str):
+                m3u8_url = url_or_none(bitrates.get('hls'))
+                if not m3u8_url:
continue continue
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
@ -93,9 +94,8 @@ class HiDiveIE(InfoExtractor):
if not isinstance(cc_file, list) or len(cc_file) < 3: if not isinstance(cc_file, list) or len(cc_file) < 3:
continue continue
cc_lang = cc_file[0] cc_lang = cc_file[0]
-            cc_url = cc_file[2]
-            if not isinstance(cc_lang, compat_str) or not isinstance(
-                    cc_url, compat_str):
+            cc_url = url_or_none(cc_file[2])
+            if not isinstance(cc_lang, compat_str) or not cc_url:
continue continue
subtitles.setdefault(cc_lang, []).append({ subtitles.setdefault(cc_lang, []).append({
'url': cc_url, 'url': cc_url,

View File

@ -3,12 +3,12 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
mimetype2ext, mimetype2ext,
parse_duration, parse_duration,
qualities, qualities,
url_or_none,
) )
@ -61,8 +61,8 @@ class ImdbIE(InfoExtractor):
for encoding in video_metadata.get('encodings', []): for encoding in video_metadata.get('encodings', []):
if not encoding or not isinstance(encoding, dict): if not encoding or not isinstance(encoding, dict):
continue continue
-            video_url = encoding.get('videoUrl')
-            if not video_url or not isinstance(video_url, compat_str):
+            video_url = url_or_none(encoding.get('videoUrl'))
+            if not video_url:
continue continue
ext = determine_ext(video_url, mimetype2ext(encoding.get('mimeType'))) ext = determine_ext(video_url, mimetype2ext(encoding.get('mimeType')))
if ext == 'm3u8': if ext == 'm3u8':

View File

@ -12,7 +12,7 @@ from ..utils import (
class ImgurIE(InfoExtractor): class ImgurIE(InfoExtractor):
_VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|(?:topic|r)/[^/]+)/)?(?P<id>[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z]+)?$' _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|(?:topic|r)/[^/]+)/)?(?P<id>[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z0-9]+)?$'
_TESTS = [{ _TESTS = [{
'url': 'https://i.imgur.com/A61SaA1.gifv', 'url': 'https://i.imgur.com/A61SaA1.gifv',
@ -43,6 +43,9 @@ class ImgurIE(InfoExtractor):
}, { }, {
'url': 'http://imgur.com/r/aww/VQcQPhM', 'url': 'http://imgur.com/r/aww/VQcQPhM',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://i.imgur.com/crGpqCV.mp4',
'only_matching': True,
}] }]
def _real_extract(self, url): def _real_extract(self, url):

View File

@ -17,6 +17,7 @@ from ..utils import (
lowercase_escape, lowercase_escape,
std_headers, std_headers,
try_get, try_get,
url_or_none,
) )
@ -170,7 +171,7 @@ class InstagramIE(InfoExtractor):
node = try_get(edge, lambda x: x['node'], dict) node = try_get(edge, lambda x: x['node'], dict)
if not node: if not node:
continue continue
-            node_video_url = try_get(node, lambda x: x['video_url'], compat_str)
+            node_video_url = url_or_none(node.get('video_url'))
if not node_video_url: if not node_video_url:
continue continue
entries.append({ entries.append({

View File

@ -13,15 +13,17 @@ from ..compat import (
compat_etree_register_namespace, compat_etree_register_namespace,
) )
 from ..utils import (
+    determine_ext,
+    ExtractorError,
     extract_attributes,
+    int_or_none,
+    merge_dicts,
+    parse_duration,
+    smuggle_url,
+    url_or_none,
     xpath_with_ns,
     xpath_element,
     xpath_text,
-    int_or_none,
-    parse_duration,
-    smuggle_url,
-    ExtractorError,
-    determine_ext,
 )
@ -129,64 +131,65 @@ class ITVIE(InfoExtractor):
resp_env = self._download_xml( resp_env = self._download_xml(
params['data-playlist-url'], video_id, params['data-playlist-url'], video_id,
headers=headers, data=etree.tostring(req_env)) headers=headers, data=etree.tostring(req_env), fatal=False)
playlist = xpath_element(resp_env, './/Playlist') if resp_env:
if playlist is None: playlist = xpath_element(resp_env, './/Playlist')
fault_code = xpath_text(resp_env, './/faultcode') if playlist is None:
fault_string = xpath_text(resp_env, './/faultstring') fault_code = xpath_text(resp_env, './/faultcode')
if fault_code == 'InvalidGeoRegion': fault_string = xpath_text(resp_env, './/faultstring')
self.raise_geo_restricted( if fault_code == 'InvalidGeoRegion':
msg=fault_string, countries=self._GEO_COUNTRIES) self.raise_geo_restricted(
elif fault_code not in ( msg=fault_string, countries=self._GEO_COUNTRIES)
'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'): elif fault_code not in (
raise ExtractorError( 'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'):
'%s said: %s' % (self.IE_NAME, fault_string), expected=True) raise ExtractorError(
info.update({ '%s said: %s' % (self.IE_NAME, fault_string), expected=True)
'title': self._og_search_title(webpage), info.update({
'episode_title': params.get('data-video-episode'), 'title': self._og_search_title(webpage),
'series': params.get('data-video-title'), 'episode_title': params.get('data-video-episode'),
}) 'series': params.get('data-video-title'),
else: })
title = xpath_text(playlist, 'EpisodeTitle', default=None) else:
info.update({ title = xpath_text(playlist, 'EpisodeTitle', default=None)
'title': title, info.update({
'episode_title': title, 'title': title,
'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), 'episode_title': title,
'series': xpath_text(playlist, 'ProgrammeTitle'), 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')),
'duration': parse_duration(xpath_text(playlist, 'Duration')), 'series': xpath_text(playlist, 'ProgrammeTitle'),
}) 'duration': parse_duration(xpath_text(playlist, 'Duration')),
video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) })
media_files = xpath_element(video_element, 'MediaFiles', fatal=True) video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True)
rtmp_url = media_files.attrib['base'] media_files = xpath_element(video_element, 'MediaFiles', fatal=True)
rtmp_url = media_files.attrib['base']
for media_file in media_files.findall('MediaFile'): for media_file in media_files.findall('MediaFile'):
play_path = xpath_text(media_file, 'URL') play_path = xpath_text(media_file, 'URL')
if not play_path: if not play_path:
continue continue
tbr = int_or_none(media_file.get('bitrate'), 1000) tbr = int_or_none(media_file.get('bitrate'), 1000)
f = { f = {
'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''),
'play_path': play_path, 'play_path': play_path,
# Providing this swfVfy allows to avoid truncated downloads # Providing this swfVfy allows to avoid truncated downloads
'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf',
'page_url': url, 'page_url': url,
'tbr': tbr, 'tbr': tbr,
'ext': 'flv', 'ext': 'flv',
} }
app = self._search_regex( app = self._search_regex(
'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None)
if app: if app:
f.update({ f.update({
'url': rtmp_url.split('?', 1)[0], 'url': rtmp_url.split('?', 1)[0],
'app': app, 'app': app,
}) })
else: else:
f['url'] = rtmp_url f['url'] = rtmp_url
formats.append(f) formats.append(f)
for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): for caption_url in video_element.findall('ClosedCaptioningURIs/URL'):
if caption_url.text: if caption_url.text:
extract_subtitle(caption_url.text) extract_subtitle(caption_url.text)
ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id') ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id')
hmac = params.get('data-video-hmac') hmac = params.get('data-video-hmac')
@ -248,8 +251,8 @@ class ITVIE(InfoExtractor):
for sub in subs: for sub in subs:
if not isinstance(sub, dict): if not isinstance(sub, dict):
continue continue
-                        href = sub.get('Href')
-                        if isinstance(href, compat_str):
+                        href = url_or_none(sub.get('Href'))
+                        if href:
extract_subtitle(href) extract_subtitle(href)
if not info.get('duration'): if not info.get('duration'):
info['duration'] = parse_duration(video_data.get('Duration')) info['duration'] = parse_duration(video_data.get('Duration'))
@ -261,7 +264,17 @@ class ITVIE(InfoExtractor):
'formats': formats, 'formats': formats,
'subtitles': subtitles, 'subtitles': subtitles,
}) })
return info
webpage_info = self._search_json_ld(webpage, video_id, default={})
if not webpage_info.get('title'):
webpage_info['title'] = self._html_search_regex(
r'(?s)<h\d+[^>]+\bclass=["\'][^>]*episode-title["\'][^>]*>([^<]+)<',
webpage, 'title', default=None) or self._og_search_title(
webpage, default=None) or self._html_search_meta(
'twitter:title', webpage, 'title',
default=None) or webpage_info['episode']
return merge_dicts(info, webpage_info)
class ITVBTCCIE(InfoExtractor): class ITVBTCCIE(InfoExtractor):
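With the Mercury playlist call now fatal=False, ITVIE can end up with holes in info; the JSON-LD and OG fallbacks are folded in with merge_dicts, where earlier dicts win and None never overwrites. A simplified sketch of that contract (the utils version also lets a non-empty string replace an empty one):

def merge_dicts(*dicts):
    merged = {}
    for d in dicts:
        for k, v in d.items():
            if v is not None and k not in merged:
                merged[k] = v
    return merged

info = {'title': None, 'duration': 1500.0}
webpage_info = {'title': 'Episode 1', 'duration': 1400.0}
assert merge_dicts(info, webpage_info) == {'title': 'Episode 1', 'duration': 1500.0}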

View File

@ -12,6 +12,7 @@ from ..utils import (
get_element_by_class, get_element_by_class,
get_element_by_id, get_element_by_id,
unified_strdate, unified_strdate,
url_or_none,
) )
@ -108,11 +109,14 @@ class IwaraIE(InfoExtractor):
formats = [] formats = []
for a_format in video_data: for a_format in video_data:
format_uri = url_or_none(a_format.get('uri'))
if not format_uri:
continue
format_id = a_format.get('resolution') format_id = a_format.get('resolution')
height = int_or_none(self._search_regex( height = int_or_none(self._search_regex(
r'(\d+)p', format_id, 'height', default=None)) r'(\d+)p', format_id, 'height', default=None))
formats.append({ formats.append({
-                'url': a_format['uri'],
+                'url': self._proto_relative_url(format_uri, 'https:'),
'format_id': format_id, 'format_id': format_id,
'ext': mimetype2ext(a_format.get('mime')) or 'mp4', 'ext': mimetype2ext(a_format.get('mime')) or 'mp4',
'height': height, 'height': height,

View File

@ -18,7 +18,7 @@ class JojIE(InfoExtractor):
joj:| joj:|
https?://media\.joj\.sk/embed/ https?://media\.joj\.sk/embed/
) )
(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}) (?P<id>[^/?#^]+)
''' '''
_TESTS = [{ _TESTS = [{
'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932', 'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932',
@ -29,16 +29,24 @@ class JojIE(InfoExtractor):
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'duration': 3118, 'duration': 3118,
} }
}, {
'url': 'https://media.joj.sk/embed/9i1cxv',
'only_matching': True,
}, { }, {
'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932', 'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932',
'only_matching': True, 'only_matching': True,
}, {
'url': 'joj:9i1cxv',
'only_matching': True,
}] }]
@staticmethod @staticmethod
def _extract_urls(webpage): def _extract_urls(webpage):
return re.findall( return [
r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//media\.joj\.sk/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', mobj.group('url')
webpage) for mobj in re.finditer(
r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1',
webpage)]
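The rewritten matcher replaces the fixed UUID pattern with the quote-backreference idiom: capture the opening quote, accept any run of characters that is not that quote via `(?:(?!\1).)+`, and require the same quote to close, so both quote styles and the new short IDs are covered. A standalone check (the HTML snippet is hypothetical):

import re

html = '<iframe src="https://media.joj.sk/embed/9i1cxv" width="640"></iframe>'
urls = [m.group('url') for m in re.finditer(
    r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1',
    html)]
print(urls)  # ['https://media.joj.sk/embed/9i1cxv']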
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)

View File

@ -4,16 +4,14 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..aes import aes_decrypt_text from ..aes import aes_decrypt_text
from ..compat import ( from ..compat import compat_urllib_parse_unquote
compat_str,
compat_urllib_parse_unquote,
)
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
ExtractorError, ExtractorError,
int_or_none, int_or_none,
str_to_int, str_to_int,
strip_or_none, strip_or_none,
url_or_none,
) )
@ -55,7 +53,8 @@ class KeezMoviesIE(InfoExtractor):
encrypted = False encrypted = False
def extract_format(format_url, height=None): def extract_format(format_url, height=None):
if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//')): format_url = url_or_none(format_url)
if not format_url or not format_url.startswith(('http', '//')):
return return
if format_url in format_urls: if format_url in format_urls:
return return

View File

@ -2,11 +2,11 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
float_or_none, float_or_none,
int_or_none, int_or_none,
url_or_none,
) )
@ -109,7 +109,8 @@ class KonserthusetPlayIE(InfoExtractor):
captions = source.get('captionsAvailableLanguages') captions = source.get('captionsAvailableLanguages')
if isinstance(captions, dict): if isinstance(captions, dict):
for lang, subtitle_url in captions.items(): for lang, subtitle_url in captions.items():
if lang != 'none' and isinstance(subtitle_url, compat_str): subtitle_url = url_or_none(subtitle_url)
if lang != 'none' and subtitle_url:
subtitles.setdefault(lang, []).append({'url': subtitle_url}) subtitles.setdefault(lang, []).append({'url': subtitle_url})
return { return {

View File

@ -4,7 +4,6 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_HTTPError,
compat_str, compat_str,
compat_urlparse, compat_urlparse,
) )
@ -44,21 +43,15 @@ class LyndaBaseIE(InfoExtractor):
form_data = self._hidden_inputs(form_html) form_data = self._hidden_inputs(form_html)
form_data.update(extra_form_data) form_data.update(extra_form_data)
try: response = self._download_json(
response = self._download_json( action_url, None, note,
action_url, None, note, data=urlencode_postdata(form_data),
data=urlencode_postdata(form_data), headers={
headers={ 'Referer': referrer_url,
'Referer': referrer_url, 'X-Requested-With': 'XMLHttpRequest',
'X-Requested-With': 'XMLHttpRequest', }, expected_status=(418, 500, ))
})
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
response = self._parse_json(e.cause.read().decode('utf-8'), None)
self._check_error(response, ('email', 'password'))
raise
self._check_error(response, 'ErrorMessage') self._check_error(response, ('email', 'password', 'ErrorMessage'))
return response, action_url return response, action_url
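Instead of catching HTTP 500 and re-parsing the error body by hand, the login now passes `expected_status=(418, 500)` so `_download_json` treats those responses as valid payloads. Roughly equivalent logic with plain urllib, as a sketch rather than youtube-dl's actual implementation:

import json
try:
    from urllib.request import urlopen, Request
    from urllib.error import HTTPError
except ImportError:  # Python 2
    from urllib2 import urlopen, Request, HTTPError

def fetch_json(url, data=None, headers=None, expected_status=()):
    req = Request(url, data=data, headers=headers or {})
    try:
        resp = urlopen(req)
    except HTTPError as e:
        if e.code not in expected_status:
            raise
        resp = e  # HTTPError doubles as a readable response
    return json.loads(resp.read().decode('utf-8'))

With that, a 500 carrying a JSON error message flows into the same `_check_error(response, ('email', 'password', 'ErrorMessage'))` path as a 200.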

View File

@ -0,0 +1,125 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
orderedSet,
parse_duration,
try_get,
)
class MarkizaIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?videoarchiv\.markiza\.sk/(?:video/(?:[^/]+/)*|embed/)(?P<id>\d+)(?:[_/]|$)'
_TESTS = [{
'url': 'http://videoarchiv.markiza.sk/video/oteckovia/84723_oteckovia-109',
'md5': 'ada4e9fad038abeed971843aa028c7b0',
'info_dict': {
'id': '139078',
'ext': 'mp4',
'title': 'Oteckovia 109',
'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 2760,
},
}, {
'url': 'http://videoarchiv.markiza.sk/video/televizne-noviny/televizne-noviny/85430_televizne-noviny',
'info_dict': {
'id': '85430',
'title': 'Televízne noviny',
},
'playlist_count': 23,
}, {
'url': 'http://videoarchiv.markiza.sk/video/oteckovia/84723',
'only_matching': True,
}, {
'url': 'http://videoarchiv.markiza.sk/video/84723',
'only_matching': True,
}, {
'url': 'http://videoarchiv.markiza.sk/video/filmy/85190_kamenak',
'only_matching': True,
}, {
'url': 'http://videoarchiv.markiza.sk/video/reflex/zo-zakulisia/84651_pribeh-alzbetky',
'only_matching': True,
}, {
'url': 'http://videoarchiv.markiza.sk/embed/85295',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
data = self._download_json(
'http://videoarchiv.markiza.sk/json/video_jwplayer7.json',
video_id, query={'id': video_id})
info = self._parse_jwplayer_data(data, m3u8_id='hls', mpd_id='dash')
if info.get('_type') == 'playlist':
info.update({
'id': video_id,
'title': try_get(
data, lambda x: x['details']['name'], compat_str),
})
else:
info['duration'] = parse_duration(
try_get(data, lambda x: x['details']['duration'], compat_str))
return info
class MarkizaPageIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?(?:(?:[^/]+\.)?markiza|tvnoviny)\.sk/(?:[^/]+/)*(?P<id>\d+)_'
_TESTS = [{
'url': 'http://www.markiza.sk/soubiz/zahranicny/1923705_oteckovia-maju-svoj-den-ti-slavni-nie-su-o-nic-menej-rozkosni',
'md5': 'ada4e9fad038abeed971843aa028c7b0',
'info_dict': {
'id': '139355',
'ext': 'mp4',
'title': 'Oteckovia 110',
'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 2604,
},
'params': {
'skip_download': True,
},
}, {
'url': 'http://dajto.markiza.sk/filmy-a-serialy/1774695_frajeri-vo-vegas',
'only_matching': True,
}, {
'url': 'http://superstar.markiza.sk/aktualne/1923870_to-je-ale-telo-spevacka-ukazala-sexy-postavicku-v-bikinach',
'only_matching': True,
}, {
'url': 'http://hybsa.markiza.sk/aktualne/1923790_uzasna-atmosfera-na-hybsa-v-poprade-superstaristi-si-prve-koncerty-pred-davom-ludi-poriadne-uzili',
'only_matching': True,
}, {
'url': 'http://doma.markiza.sk/filmy/1885250_moja-vysnivana-svadba',
'only_matching': True,
}, {
'url': 'http://www.tvnoviny.sk/domace/1923887_po-smrti-manzela-ju-cakalo-poriadne-prekvapenie',
'only_matching': True,
}]
@classmethod
def suitable(cls, url):
return False if MarkizaIE.suitable(url) else super(MarkizaPageIE, cls).suitable(url)
def _real_extract(self, url):
playlist_id = self._match_id(url)
webpage = self._download_webpage(
# Downloading for some hosts (e.g. dajto, doma) fails with 500
# although everything seems to be OK, so the 500 status
# code is treated as expected.
url, playlist_id, expected_status=500)
entries = [
self.url_result('http://videoarchiv.markiza.sk/video/%s' % video_id)
for video_id in orderedSet(re.findall(
r'(?:initPlayer_|data-entity=["\']|id=["\']player_)(\d+)',
webpage))]
return self.playlist_result(entries, playlist_id)
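MarkizaPageIE defers to MarkizaIE through the `suitable` override, the usual youtube-dl idiom when a broad page pattern overlaps a more specific video pattern. A toy illustration with hypothetical example.com extractors:

import re

class VideoIE(object):
    _VALID_URL = r'https?://(?:www\.)?example\.com/video/(?P<id>\d+)'

    @classmethod
    def suitable(cls, url):
        return re.match(cls._VALID_URL, url) is not None

class PageIE(object):
    _VALID_URL = r'https?://(?:www\.)?example\.com/.+'

    @classmethod
    def suitable(cls, url):
        # Yield to the more specific extractor when both patterns match.
        return not VideoIE.suitable(url) and re.match(cls._VALID_URL, url) is not None

print(PageIE.suitable('https://example.com/video/123'))     # False
print(PageIE.suitable('https://example.com/some-article'))  # True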

View File

@ -3,59 +3,75 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .theplatform import ThePlatformBaseIE
from ..compat import compat_str
from ..utils import ( from ..utils import (
determine_ext, ExtractorError,
parse_duration, int_or_none,
try_get, update_url_query,
unified_strdate,
) )
class MediasetIE(InfoExtractor): class MediasetIE(ThePlatformBaseIE):
_TP_TLD = 'eu'
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
(?: (?:
mediaset:| mediaset:|
https?:// https?://
(?:www\.)?video\.mediaset\.it/ (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/
(?: (?:
(?:video|on-demand)/(?:[^/]+/)+[^/]+_| (?:video|on-demand)/(?:[^/]+/)+[^/]+_|
player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid= player/index\.html\?.*?\bprogramGuid=
) )
)(?P<id>[0-9]+) )(?P<id>[0-9A-Z]{16})
''' '''
_TESTS = [{ _TESTS = [{
# full episode # full episode
'url': 'http://www.video.mediaset.it/video/hello_goodbye/full/quarta-puntata_661824.html', 'url': 'https://www.mediasetplay.mediaset.it/video/hellogoodbye/quarta-puntata_FAFU000000661824',
'md5': '9b75534d42c44ecef7bf1ffeacb7f85d', 'md5': '9b75534d42c44ecef7bf1ffeacb7f85d',
'info_dict': { 'info_dict': {
'id': '661824', 'id': 'FAFU000000661824',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Quarta puntata', 'title': 'Quarta puntata',
'description': 'md5:7183696d6df570e3412a5ef74b27c5e2', 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'duration': 1414, 'duration': 1414.26,
'creator': 'mediaset',
'upload_date': '20161107', 'upload_date': '20161107',
'series': 'Hello Goodbye', 'series': 'Hello Goodbye',
'categories': ['reality'], 'timestamp': 1478532900,
'uploader': 'Rete 4',
'uploader_id': 'R4',
}, },
'expected_warnings': ['is not a supported codec'], }, {
'url': 'https://www.mediasetplay.mediaset.it/video/matrix/puntata-del-25-maggio_F309013801000501',
'md5': '288532f0ad18307705b01e581304cd7b',
'info_dict': {
'id': 'F309013801000501',
'ext': 'mp4',
'title': 'Puntata del 25 maggio',
'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 6565.007,
'upload_date': '20180526',
'series': 'Matrix',
'timestamp': 1527326245,
'uploader': 'Canale 5',
'uploader_id': 'C5',
},
'expected_warnings': ['HTTP Error 403: Forbidden'],
}, { }, {
# clip # clip
'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html', 'url': 'https://www.mediasetplay.mediaset.it/video/gogglebox/un-grande-classico-della-commedia-sexy_FAFU000000661680',
'only_matching': True, 'only_matching': True,
}, { }, {
# iframe simple # iframe simple
'url': 'http://www.video.mediaset.it/player/playerIFrame.shtml?id=665924&autoplay=true', 'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665924&id=665924',
'only_matching': True, 'only_matching': True,
}, { }, {
# iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/) # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/)
'url': 'https://www.video.mediaset.it/player/playerIFrameTwitter.shtml?id=665104&playrelated=false&autoplay=false&related=true&hidesocial=true', 'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665104&id=665104',
'only_matching': True, 'only_matching': True,
}, { }, {
'url': 'mediaset:661824', 'url': 'mediaset:FAFU000000665924',
'only_matching': True, 'only_matching': True,
}] }]
@ -68,51 +84,54 @@ class MediasetIE(InfoExtractor):
webpage)] webpage)]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) guid = self._match_id(url)
tp_path = 'PR1GhC/media/guid/2702976343/' + guid
video_list = self._download_json( info = self._extract_theplatform_metadata(tp_path, guid)
'http://cdnsel01.mediaset.net/GetCdn.aspx',
video_id, 'Downloading video CDN JSON', query={
'streamid': video_id,
'format': 'json',
})['videoList']
formats = [] formats = []
for format_url in video_list: subtitles = {}
if '.ism' in format_url: first_e = None
formats.extend(self._extract_ism_formats( for asset_type in ('SD', 'HD'):
format_url, video_id, ism_id='mss', fatal=False)) for f in ('MPEG4', 'MPEG-DASH', 'M3U', 'ISM'):
else: try:
formats.append({ tp_formats, tp_subtitles = self._extract_theplatform_smil(
'url': format_url, update_url_query('http://link.theplatform.%s/s/%s' % (self._TP_TLD, tp_path), {
'format_id': determine_ext(format_url), 'mbr': 'true',
}) 'formats': f,
'assetTypes': asset_type,
}), guid, 'Downloading %s %s SMIL data' % (f, asset_type))
except ExtractorError as e:
if not first_e:
first_e = e
break
for tp_f in tp_formats:
tp_f['quality'] = 1 if asset_type == 'HD' else 0
formats.extend(tp_formats)
subtitles = self._merge_subtitles(subtitles, tp_subtitles)
if first_e and not formats:
raise first_e
self._sort_formats(formats) self._sort_formats(formats)
mediainfo = self._download_json( fields = []
'http://plr.video.mediaset.it/html/metainfo.sjson', for templ, repls in (('tvSeason%sNumber', ('', 'Episode')), ('mediasetprogram$%s', ('brandTitle', 'numberOfViews', 'publishInfo'))):
video_id, 'Downloading video info JSON', query={ fields.extend(templ % repl for repl in repls)
'id': video_id, feed_data = self._download_json(
})['video'] 'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs/guid/-/' + guid,
guid, fatal=False, query={'fields': ','.join(fields)})
if feed_data:
publish_info = feed_data.get('mediasetprogram$publishInfo') or {}
info.update({
'episode_number': int_or_none(feed_data.get('tvSeasonEpisodeNumber')),
'season_number': int_or_none(feed_data.get('tvSeasonNumber')),
'series': feed_data.get('mediasetprogram$brandTitle'),
'uploader': publish_info.get('description'),
'uploader_id': publish_info.get('channel'),
'view_count': int_or_none(feed_data.get('mediasetprogram$numberOfViews')),
})
title = mediainfo['title'] info.update({
'id': guid,
creator = try_get(
mediainfo, lambda x: x['brand-info']['publisher'], compat_str)
category = try_get(
mediainfo, lambda x: x['brand-info']['category'], compat_str)
categories = [category] if category else None
return {
'id': video_id,
'title': title,
'description': mediainfo.get('short-description'),
'thumbnail': mediainfo.get('thumbnail'),
'duration': parse_duration(mediainfo.get('duration')),
'creator': creator,
'upload_date': unified_strdate(mediainfo.get('production-date')),
'webpage_url': mediainfo.get('url'),
'series': mediainfo.get('brand-value'),
'categories': categories,
'formats': formats, 'formats': formats,
} 'subtitles': subtitles,
})
return info
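The SMIL loop above tries every format/asset-type combination, remembers only the first ExtractorError, and re-raises it solely when nothing at all was extracted. The pattern in isolation, as a generic sketch:

def collect_formats(variants, fetch):
    formats, first_exc = [], None
    for variant in variants:
        try:
            formats.extend(fetch(variant))
        except Exception as exc:
            # Keep trying other variants; surface the first failure only
            # if every variant came up empty.
            if first_exc is None:
                first_exc = exc
    if first_exc is not None and not formats:
        raise first_exc
    return formats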

View File

@ -15,6 +15,7 @@ from ..utils import (
mimetype2ext, mimetype2ext,
unescapeHTML, unescapeHTML,
unsmuggle_url, unsmuggle_url,
url_or_none,
urljoin, urljoin,
) )
@ -156,8 +157,8 @@ class MediasiteIE(InfoExtractor):
stream_formats = [] stream_formats = []
for unum, VideoUrl in enumerate(video_urls): for unum, VideoUrl in enumerate(video_urls):
video_url = VideoUrl.get('Location') video_url = url_or_none(VideoUrl.get('Location'))
if not video_url or not isinstance(video_url, compat_str): if not video_url:
continue continue
# XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS # XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS

View File

@ -1,84 +1,14 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import json
import uuid
from .common import InfoExtractor from .common import InfoExtractor
from .ooyala import OoyalaIE
from ..compat import (
compat_str,
compat_urlparse,
)
from ..utils import ( from ..utils import (
int_or_none, int_or_none,
extract_attributes,
determine_ext,
smuggle_url, smuggle_url,
parse_duration, parse_duration,
) )
class MiTeleBaseIE(InfoExtractor):
def _get_player_info(self, url, webpage):
player_data = extract_attributes(self._search_regex(
r'(?s)(<ms-video-player.+?</ms-video-player>)',
webpage, 'ms video player'))
video_id = player_data['data-media-id']
if player_data.get('data-cms-id') == 'ooyala':
return self.url_result(
'ooyala:%s' % video_id, ie=OoyalaIE.ie_key(), video_id=video_id)
config_url = compat_urlparse.urljoin(url, player_data['data-config'])
config = self._download_json(
config_url, video_id, 'Downloading config JSON')
mmc_url = config['services']['mmc']
duration = None
formats = []
for m_url in (mmc_url, mmc_url.replace('/flash.json', '/html5.json')):
mmc = self._download_json(
m_url, video_id, 'Downloading mmc JSON')
if not duration:
duration = int_or_none(mmc.get('duration'))
for location in mmc['locations']:
gat = self._proto_relative_url(location.get('gat'), 'http:')
gcp = location.get('gcp')
ogn = location.get('ogn')
if None in (gat, gcp, ogn):
continue
token_data = {
'gcp': gcp,
'ogn': ogn,
'sta': 0,
}
media = self._download_json(
gat, video_id, data=json.dumps(token_data).encode('utf-8'),
headers={
'Content-Type': 'application/json;charset=utf-8',
'Referer': url,
})
stream = media.get('stream') or media.get('file')
if not stream:
continue
ext = determine_ext(stream)
if ext == 'f4m':
formats.extend(self._extract_f4m_formats(
stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
video_id, f4m_id='hds', fatal=False))
elif ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
stream, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
self._sort_formats(formats)
return {
'id': video_id,
'formats': formats,
'thumbnail': player_data.get('data-poster') or config.get('poster', {}).get('imageUrl'),
'duration': duration,
}
class MiTeleIE(InfoExtractor): class MiTeleIE(InfoExtractor):
IE_DESC = 'mitele.es' IE_DESC = 'mitele.es'
_VALID_URL = r'https?://(?:www\.)?mitele\.es/(?:[^/]+/)+(?P<id>[^/]+)/player' _VALID_URL = r'https?://(?:www\.)?mitele\.es/(?:[^/]+/)+(?P<id>[^/]+)/player'
@ -86,7 +16,7 @@ class MiTeleIE(InfoExtractor):
_TESTS = [{ _TESTS = [{
'url': 'http://www.mitele.es/programas-tv/diario-de/57b0dfb9c715da65618b4afa/player', 'url': 'http://www.mitele.es/programas-tv/diario-de/57b0dfb9c715da65618b4afa/player',
'info_dict': { 'info_dict': {
'id': '57b0dfb9c715da65618b4afa', 'id': 'FhYW1iNTE6J6H7NkQRIEzfne6t2quqPg',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Tor, la web invisible', 'title': 'Tor, la web invisible',
'description': 'md5:3b6fce7eaa41b2d97358726378d9369f', 'description': 'md5:3b6fce7eaa41b2d97358726378d9369f',
@ -104,7 +34,7 @@ class MiTeleIE(InfoExtractor):
# no explicit title # no explicit title
'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/57b0de3dc915da14058b4876/player', 'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/57b0de3dc915da14058b4876/player',
'info_dict': { 'info_dict': {
'id': '57b0de3dc915da14058b4876', 'id': 'oyNG1iNTE6TAPP-JmCjbwfwJqqMMX3Vq',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Cuarto Milenio Temporada 6 Programa 226', 'title': 'Cuarto Milenio Temporada 6 Programa 226',
'description': 'md5:5ff132013f0cd968ffbf1f5f3538a65f', 'description': 'md5:5ff132013f0cd968ffbf1f5f3538a65f',
@ -128,40 +58,21 @@ class MiTeleIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
gigya_url = self._search_regex(
r'<gigya-api>[^>]*</gigya-api>[^>]*<script\s+src="([^"]*)">[^>]*</script>',
webpage, 'gigya', default=None)
gigya_sc = self._download_webpage(
compat_urlparse.urljoin('http://www.mitele.es/', gigya_url),
video_id, 'Downloading gigya script')
# Get a appKey/uuid for getting the session key
appKey = self._search_regex(
r'constant\s*\(\s*["\']_appGridApplicationKey["\']\s*,\s*["\']([0-9a-f]+)',
gigya_sc, 'appKey')
session_json = self._download_json(
'https://appgrid-api.cloud.accedo.tv/session',
video_id, 'Downloading session keys', query={
'appKey': appKey,
'uuid': compat_str(uuid.uuid4()),
})
paths = self._download_json( paths = self._download_json(
'https://appgrid-api.cloud.accedo.tv/metadata/general_configuration,%20web_configuration', 'https://www.mitele.es/amd/agp/web/metadata/general_configuration',
video_id, 'Downloading paths JSON', video_id, 'Downloading paths JSON')
query={'sessionKey': compat_str(session_json['sessionKey'])})
ooyala_s = paths['general_configuration']['api_configuration']['ooyala_search'] ooyala_s = paths['general_configuration']['api_configuration']['ooyala_search']
base_url = ooyala_s.get('base_url', 'cdn-search-mediaset.carbyne.ps.ooyala.com')
full_path = ooyala_s.get('full_path', '/search/v1/full/providers/')
source = self._download_json( source = self._download_json(
'http://%s%s%s/docs/%s' % ( '%s://%s%s%s/docs/%s' % (
ooyala_s['base_url'], ooyala_s['full_path'], ooyala_s.get('protocol', 'https'), base_url, full_path,
ooyala_s['provider_id'], video_id), ooyala_s.get('provider_id', '104951'), video_id),
video_id, 'Downloading data JSON', query={ video_id, 'Downloading data JSON', query={
'include_titles': 'Series,Season', 'include_titles': 'Series,Season',
'product_name': 'test', 'product_name': ooyala_s.get('product_name', 'test'),
'format': 'full', 'format': 'full',
})['hits']['hits'][0]['_source'] })['hits']['hits'][0]['_source']

View File

@ -77,8 +77,11 @@ class MotherlessIE(InfoExtractor):
title = self._html_search_regex( title = self._html_search_regex(
r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') r'id="view-upload-title">\s+([^<]+)<', webpage, 'title')
video_url = self._html_search_regex( video_url = (self._html_search_regex(
r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video URL') (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
r'fileurl\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'),
webpage, 'video URL', default=None, group='url') or
'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id)
age_limit = self._rta_search(webpage) age_limit = self._rta_search(webpage)
view_count = str_to_int(self._html_search_regex( view_count = str_to_int(self._html_search_regex(
r'<strong>Views</strong>\s+([^<]+)<', r'<strong>Views</strong>\s+([^<]+)<',
@ -120,7 +123,7 @@ class MotherlessIE(InfoExtractor):
class MotherlessGroupIE(InfoExtractor): class MotherlessGroupIE(InfoExtractor):
_VALID_URL = 'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)' _VALID_URL = r'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)'
_TESTS = [{ _TESTS = [{
'url': 'http://motherless.com/g/movie_scenes', 'url': 'http://motherless.com/g/movie_scenes',
'info_dict': { 'info_dict': {

View File

@ -282,7 +282,7 @@ class NPOIE(NPOBaseIE):
video_url = stream_info.get('url') video_url = stream_info.get('url')
if not video_url or video_url in urls: if not video_url or video_url in urls:
continue continue
urls.add(item_url) urls.add(video_url)
if determine_ext(video_url) == 'm3u8': if determine_ext(video_url) == 'm3u8':
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(
video_url, video_id, ext='mp4', video_url, video_id, ext='mp4',

View File

@ -4,12 +4,18 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote from ..compat import (
compat_str,
compat_urllib_parse_unquote,
)
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
JSON_LD_RE,
NO_DEFAULT,
parse_age_limit, parse_age_limit,
parse_duration, parse_duration,
try_get,
) )
@ -359,6 +365,182 @@ class NRKTVIE(NRKBaseIE):
}] }]
class NRKTVEpisodeIE(InfoExtractor):
_VALID_URL = r'https?://tv\.nrk\.no/serie/(?P<id>[^/]+/sesong/\d+/episode/\d+)'
_TEST = {
'url': 'https://tv.nrk.no/serie/backstage/sesong/1/episode/8',
'info_dict': {
'id': 'MSUI14000816AA',
'ext': 'mp4',
'title': 'Backstage 8:30',
'description': 'md5:de6ca5d5a2d56849e4021f2bf2850df4',
'duration': 1320,
'series': 'Backstage',
'season_number': 1,
'episode_number': 8,
'episode': '8:30',
},
'params': {
'skip_download': True,
},
}
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
nrk_id = self._parse_json(
self._search_regex(JSON_LD_RE, webpage, 'JSON-LD', group='json_ld'),
display_id)['@id']
assert re.match(NRKTVIE._EPISODE_RE, nrk_id)
return self.url_result(
'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)
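NRKTVEpisodeIE resolves the page to a canonical `nrk:` ID by reading the `@id` out of the page's JSON-LD block. A self-contained illustration of that step (the page fragment is hypothetical and `JSON_LD_RE` is approximated inline):

import json
import re

webpage = '''<script type="application/ld+json">
{"@context": "http://schema.org", "@type": "TVEpisode", "@id": "MSUI14000816AA"}
</script>'''

json_ld = re.search(
    r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
    webpage).group('json_ld')
print(json.loads(json_ld)['@id'])  # MSUI14000816AA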
class NRKTVSerieBaseIE(InfoExtractor):
def _extract_series(self, webpage, display_id, fatal=True):
config = self._parse_json(
self._search_regex(
r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>', webpage, 'config',
default='{}' if not fatal else NO_DEFAULT),
display_id, fatal=False)
if not config:
return
return try_get(config, lambda x: x['series'], dict)
def _extract_episodes(self, season):
entries = []
if not isinstance(season, dict):
return entries
episodes = season.get('episodes')
if not isinstance(episodes, list):
return entries
for episode in episodes:
nrk_id = episode.get('prfId')
if not nrk_id or not isinstance(nrk_id, compat_str):
continue
entries.append(self.url_result(
'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id))
return entries
class NRKTVSeasonIE(NRKTVSerieBaseIE):
_VALID_URL = r'https?://tv\.nrk\.no/serie/[^/]+/sesong/(?P<id>\d+)'
_TEST = {
'url': 'https://tv.nrk.no/serie/backstage/sesong/1',
'info_dict': {
'id': '1',
'title': 'Sesong 1',
},
'playlist_mincount': 30,
}
@classmethod
def suitable(cls, url):
return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url)
else super(NRKTVSeasonIE, cls).suitable(url))
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
series = self._extract_series(webpage, display_id)
season = next(
s for s in series['seasons']
if int(display_id) == s.get('seasonNumber'))
title = try_get(season, lambda x: x['titles']['title'], compat_str)
return self.playlist_result(
self._extract_episodes(season), display_id, title)
class NRKTVSeriesIE(NRKTVSerieBaseIE):
_VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)'
_ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)'
_TESTS = [{
# new layout
'url': 'https://tv.nrk.no/serie/backstage',
'info_dict': {
'id': 'backstage',
'title': 'Backstage',
'description': 'md5:c3ec3a35736fca0f9e1207b5511143d3',
},
'playlist_mincount': 60,
}, {
# old layout
'url': 'https://tv.nrk.no/serie/groenn-glede',
'info_dict': {
'id': 'groenn-glede',
'title': 'Grønn glede',
'description': 'md5:7576e92ae7f65da6993cf90ee29e4608',
},
'playlist_mincount': 9,
}, {
'url': 'http://tv.nrksuper.no/serie/labyrint',
'info_dict': {
'id': 'labyrint',
'title': 'Labyrint',
'description': 'md5:58afd450974c89e27d5a19212eee7115',
},
'playlist_mincount': 3,
}, {
'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene',
'only_matching': True,
}, {
'url': 'https://tv.nrk.no/serie/saving-the-human-race',
'only_matching': True,
}, {
'url': 'https://tv.nrk.no/serie/postmann-pat',
'only_matching': True,
}]
@classmethod
def suitable(cls, url):
return (
False if any(ie.suitable(url)
for ie in (NRKTVIE, NRKTVEpisodeIE, NRKTVSeasonIE))
else super(NRKTVSeriesIE, cls).suitable(url))
def _real_extract(self, url):
series_id = self._match_id(url)
webpage = self._download_webpage(url, series_id)
# New layout (e.g. https://tv.nrk.no/serie/backstage)
series = self._extract_series(webpage, series_id, fatal=False)
if series:
title = try_get(series, lambda x: x['titles']['title'], compat_str)
description = try_get(
series, lambda x: x['titles']['subtitle'], compat_str)
entries = []
for season in series['seasons']:
entries.extend(self._extract_episodes(season))
return self.playlist_result(entries, series_id, title, description)
# Old layout (e.g. https://tv.nrk.no/serie/groenn-glede)
entries = [
self.url_result(
'https://tv.nrk.no/program/Episodes/{series}/{season}'.format(
series=series_id, season=season_id))
for season_id in re.findall(self._ITEM_RE, webpage)
]
title = self._html_search_meta(
'seriestitle', webpage,
'title', default=None) or self._og_search_title(
webpage, fatal=False)
description = self._html_search_meta(
'series_description', webpage,
'description', default=None) or self._og_search_description(webpage)
return self.playlist_result(entries, series_id, title, description)
class NRKTVDirekteIE(NRKTVIE): class NRKTVDirekteIE(NRKTVIE):
IE_DESC = 'NRK TV Direkte and NRK Radio Direkte' IE_DESC = 'NRK TV Direkte and NRK Radio Direkte'
_VALID_URL = r'https?://(?:tv|radio)\.nrk\.no/direkte/(?P<id>[^/?#&]+)' _VALID_URL = r'https?://(?:tv|radio)\.nrk\.no/direkte/(?P<id>[^/?#&]+)'
@ -438,64 +620,6 @@ class NRKTVEpisodesIE(NRKPlaylistBaseIE):
r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False) r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
class NRKTVSeriesIE(InfoExtractor):
_VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)'
_ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)'
_TESTS = [{
'url': 'https://tv.nrk.no/serie/groenn-glede',
'info_dict': {
'id': 'groenn-glede',
'title': 'Grønn glede',
'description': 'md5:7576e92ae7f65da6993cf90ee29e4608',
},
'playlist_mincount': 9,
}, {
'url': 'http://tv.nrksuper.no/serie/labyrint',
'info_dict': {
'id': 'labyrint',
'title': 'Labyrint',
'description': 'md5:58afd450974c89e27d5a19212eee7115',
},
'playlist_mincount': 3,
}, {
'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene',
'only_matching': True,
}, {
'url': 'https://tv.nrk.no/serie/saving-the-human-race',
'only_matching': True,
}, {
'url': 'https://tv.nrk.no/serie/postmann-pat',
'only_matching': True,
}]
@classmethod
def suitable(cls, url):
return False if NRKTVIE.suitable(url) else super(NRKTVSeriesIE, cls).suitable(url)
def _real_extract(self, url):
series_id = self._match_id(url)
webpage = self._download_webpage(url, series_id)
entries = [
self.url_result(
'https://tv.nrk.no/program/Episodes/{series}/{season}'.format(
series=series_id, season=season_id))
for season_id in re.findall(self._ITEM_RE, webpage)
]
title = self._html_search_meta(
'seriestitle', webpage,
'title', default=None) or self._og_search_title(
webpage, fatal=False)
description = self._html_search_meta(
'series_description', webpage,
'description', default=None) or self._og_search_description(webpage)
return self.playlist_result(entries, series_id, title, description)
class NRKSkoleIE(InfoExtractor): class NRKSkoleIE(InfoExtractor):
IE_DESC = 'NRK Skole' IE_DESC = 'NRK Skole'
_VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/?\?.*\bmediaId=(?P<id>\d+)' _VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/?\?.*\bmediaId=(?P<id>\d+)'

View File

@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
determine_ext, determine_ext,
@ -375,6 +376,35 @@ class PBSIE(InfoExtractor):
}, },
'expected_warnings': ['HTTP Error 403: Forbidden'], 'expected_warnings': ['HTTP Error 403: Forbidden'],
}, },
{
'url': 'https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/',
'info_dict': {
'id': '3007193718',
'ext': 'mp4',
'title': "Victoria - A Soldier's Daughter / The Green-Eyed Monster",
'description': 'md5:37efbac85e0c09b009586523ec143652',
'duration': 6292,
'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$',
},
'params': {
'skip_download': True,
},
'expected_warnings': ['HTTP Error 403: Forbidden'],
},
{
'url': 'https://player.pbs.org/partnerplayer/tOz9tM5ljOXQqIIWke53UA==/',
'info_dict': {
'id': '3011407934',
'ext': 'mp4',
'title': 'Stories from the Stage - Road Trip',
'duration': 1619,
'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$',
},
'params': {
'skip_download': True,
},
'expected_warnings': ['HTTP Error 403: Forbidden'],
},
{ {
'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true', 'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
'only_matching': True, 'only_matching': True,
@ -438,6 +468,7 @@ class PBSIE(InfoExtractor):
r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer
r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',", r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',",
r'<div[^>]+\bdata-cove-id=["\'](\d+)"', # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/ r'<div[^>]+\bdata-cove-id=["\'](\d+)"', # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/
r'<iframe[^>]+\bsrc=["\'](?:https?:)?//video\.pbs\.org/widget/partnerplayer/(\d+)', # https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/
] ]
media_id = self._search_regex( media_id = self._search_regex(
@ -472,7 +503,8 @@ class PBSIE(InfoExtractor):
if not url: if not url:
url = self._og_search_url(webpage) url = self._og_search_url(webpage)
mobj = re.match(self._VALID_URL, url) mobj = re.match(
self._VALID_URL, self._proto_relative_url(url.strip()))
player_id = mobj.group('player_id') player_id = mobj.group('player_id')
if not display_id: if not display_id:
@ -482,13 +514,27 @@ class PBSIE(InfoExtractor):
url, display_id, note='Downloading player page', url, display_id, note='Downloading player page',
errnote='Could not download player page') errnote='Could not download player page')
video_id = self._search_regex( video_id = self._search_regex(
r'<div\s+id="video_([0-9]+)"', player_page, 'video ID') r'<div\s+id=["\']video_(\d+)', player_page, 'video ID',
default=None)
if not video_id:
video_info = self._extract_video_data(
player_page, 'video data', display_id)
video_id = compat_str(
video_info.get('id') or video_info['contentID'])
else: else:
video_id = mobj.group('id') video_id = mobj.group('id')
display_id = video_id display_id = video_id
return video_id, display_id, None, description return video_id, display_id, None, description
def _extract_video_data(self, string, name, video_id, fatal=True):
return self._parse_json(
self._search_regex(
[r'(?s)PBS\.videoData\s*=\s*({.+?});\n',
r'window\.videoBridge\s*=\s*({.+?});'],
string, name, default='{}'),
video_id, transform_source=js_to_json, fatal=fatal)
def _real_extract(self, url): def _real_extract(self, url):
video_id, display_id, upload_date, description = self._extract_webpage(url) video_id, display_id, upload_date, description = self._extract_webpage(url)
@ -519,11 +565,8 @@ class PBSIE(InfoExtractor):
'http://player.pbs.org/%s/%s' % (page, video_id), 'http://player.pbs.org/%s/%s' % (page, video_id),
display_id, 'Downloading %s page' % page, fatal=False) display_id, 'Downloading %s page' % page, fatal=False)
if player: if player:
video_info = self._parse_json( video_info = self._extract_video_data(
self._search_regex( player, '%s video data' % page, display_id, fatal=False)
[r'(?s)PBS\.videoData\s*=\s*({.+?});\n', r'window\.videoBridge\s*=\s*({.+?});'],
player, '%s video data' % page, default='{}'),
display_id, transform_source=js_to_json, fatal=False)
if video_info: if video_info:
extract_redirect_urls(video_info) extract_redirect_urls(video_info)
if not info: if not info:

View File

@ -10,6 +10,7 @@ from ..utils import (
parse_resolution, parse_resolution,
try_get, try_get,
unified_timestamp, unified_timestamp,
url_or_none,
urljoin, urljoin,
) )
@ -116,12 +117,14 @@ class PeerTubeIE(InfoExtractor):
videos\.tcit\.fr| videos\.tcit\.fr|
peertube\.cpy\.re peertube\.cpy\.re
)''' )'''
_UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}'
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
https?:// (?:
%s peertube:(?P<host>[^:]+):|
/(?:videos/(?:watch|embed)|api/v\d/videos)/ https?://(?P<host_2>%s)/(?:videos/(?:watch|embed)|api/v\d/videos)/
(?P<id>[^/?\#&]+) )
''' % _INSTANCES_RE (?P<id>%s)
''' % (_INSTANCES_RE, _UUID_RE)
_TESTS = [{ _TESTS = [{
'url': 'https://peertube.moe/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c', 'url': 'https://peertube.moe/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c',
'md5': '80f24ff364cc9d333529506a263e7feb', 'md5': '80f24ff364cc9d333529506a263e7feb',
@ -157,21 +160,40 @@ class PeerTubeIE(InfoExtractor):
}, { }, {
'url': 'https://tube.openalgeria.org/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8', 'url': 'https://tube.openalgeria.org/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8',
'only_matching': True, 'only_matching': True,
}, {
'url': 'peertube:video.blender.org:b37a5b9f-e6b5-415c-b700-04a5cd6ec205',
'only_matching': True,
}] }]
@staticmethod @staticmethod
def _extract_urls(webpage): def _extract_peertube_url(webpage, source_url):
return [ mobj = re.match(
mobj.group('url') r'https?://(?P<host>[^/]+)/videos/watch/(?P<id>%s)'
for mobj in re.finditer( % PeerTubeIE._UUID_RE, source_url)
r'''(?x)<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//%s/videos/embed/[^/?\#&]+)\1''' if mobj and any(p in webpage for p in (
% PeerTubeIE._INSTANCES_RE, webpage)] '<title>PeerTube<',
'There will be other non JS-based clients to access PeerTube',
'>We are sorry but it seems that PeerTube is not compatible with your web browser.<')):
return 'peertube:%s:%s' % mobj.group('host', 'id')
@staticmethod
def _extract_urls(webpage, source_url):
entries = re.findall(
r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//%s/videos/embed/%s)'''
% (PeerTubeIE._INSTANCES_RE, PeerTubeIE._UUID_RE), webpage)
if not entries:
peertube_url = PeerTubeIE._extract_peertube_url(webpage, source_url)
if peertube_url:
entries = [peertube_url]
return entries
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) mobj = re.match(self._VALID_URL, url)
host = mobj.group('host') or mobj.group('host_2')
video_id = mobj.group('id')
video = self._download_json( video = self._download_json(
urljoin(url, '/api/v1/videos/%s' % video_id), video_id) 'https://%s/api/v1/videos/%s' % (host, video_id), video_id)
title = video['name'] title = video['name']
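With the new internal `peertube:<host>:<uuid>` scheme, `_real_extract` takes the host from whichever alternative matched. A reduced version of that dispatch, with the instance whitelist collapsed to `[^/]+` for brevity:

import re

_VALID_URL = r'''(?x)
    (?:
        peertube:(?P<host>[^:]+):|
        https?://(?P<host_2>[^/]+)/videos/(?:watch|embed)/
    )
    (?P<id>[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12})'''

for url in ('peertube:video.blender.org:b37a5b9f-e6b5-415c-b700-04a5cd6ec205',
            'https://peertube.moe/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c'):
    mobj = re.match(_VALID_URL, url)
    host = mobj.group('host') or mobj.group('host_2')
    print(host, mobj.group('id'))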
@ -179,8 +201,8 @@ class PeerTubeIE(InfoExtractor):
for file_ in video['files']: for file_ in video['files']:
if not isinstance(file_, dict): if not isinstance(file_, dict):
continue continue
file_url = file_.get('fileUrl') file_url = url_or_none(file_.get('fileUrl'))
if not file_url or not isinstance(file_url, compat_str): if not file_url:
continue continue
file_size = int_or_none(file_.get('size')) file_size = int_or_none(file_.get('size'))
format_id = try_get( format_id = try_get(

View File

@ -27,6 +27,60 @@ from ..utils import (
class PluralsightBaseIE(InfoExtractor): class PluralsightBaseIE(InfoExtractor):
_API_BASE = 'https://app.pluralsight.com' _API_BASE = 'https://app.pluralsight.com'
_GRAPHQL_EP = '%s/player/api/graphql' % _API_BASE
_GRAPHQL_HEADERS = {
'Content-Type': 'application/json;charset=UTF-8',
}
_GRAPHQL_COURSE_TMPL = '''
query BootstrapPlayer {
rpc {
bootstrapPlayer {
profile {
firstName
lastName
email
username
userHandle
authed
isAuthed
plan
}
course(courseId: "%s") {
name
title
courseHasCaptions
translationLanguages {
code
name
}
supportsWideScreenVideoFormats
timestamp
modules {
name
title
duration
formattedDuration
author
authorized
clips {
authorized
clipId
duration
formattedDuration
id
index
moduleIndex
moduleTitle
name
title
watched
}
}
}
}
}
}'''
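On the wire, the GraphQL call is just an HTTP POST whose JSON body carries the query string plus an empty `variables` object. A minimal sketch of assembling such a request (the shortened query is illustrative, not the full template above):

import json

GRAPHQL_EP = 'https://app.pluralsight.com/player/api/graphql'
QUERY_TMPL = 'query BootstrapPlayer { rpc { bootstrapPlayer { course(courseId: "%s") { name } } } }'

def build_graphql_request(course_id):
    # The endpoint expects a JSON envelope: {"query": ..., "variables": {}}.
    data = json.dumps({
        'query': QUERY_TMPL % course_id,
        'variables': {},
    }).encode('utf-8')
    return GRAPHQL_EP, data, {'Content-Type': 'application/json;charset=UTF-8'}

url, data, headers = build_graphql_request('some-course')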
def _download_course(self, course_id, url, display_id): def _download_course(self, course_id, url, display_id):
try: try:
return self._download_course_rpc(course_id, url, display_id) return self._download_course_rpc(course_id, url, display_id)
@ -39,20 +93,14 @@ class PluralsightBaseIE(InfoExtractor):
def _download_course_rpc(self, course_id, url, display_id): def _download_course_rpc(self, course_id, url, display_id):
response = self._download_json( response = self._download_json(
'%s/player/functions/rpc' % self._API_BASE, display_id, self._GRAPHQL_EP, display_id, data=json.dumps({
'Downloading course JSON', 'query': self._GRAPHQL_COURSE_TMPL % course_id,
data=json.dumps({ 'variables': {}
'fn': 'bootstrapPlayer', }).encode('utf-8'), headers=self._GRAPHQL_HEADERS)
'payload': {
'courseId': course_id,
},
}).encode('utf-8'),
headers={
'Content-Type': 'application/json;charset=utf-8',
'Referer': url,
})
course = try_get(response, lambda x: x['payload']['course'], dict) course = try_get(
response, lambda x: x['data']['rpc']['bootstrapPlayer']['course'],
dict)
if course: if course:
return course return course
@ -90,6 +138,28 @@ class PluralsightIE(PluralsightBaseIE):
'only_matching': True, 'only_matching': True,
}] }]
GRAPHQL_VIEWCLIP_TMPL = '''
query viewClip {
viewClip(input: {
author: "%(author)s",
clipIndex: %(clipIndex)d,
courseName: "%(courseName)s",
includeCaptions: %(includeCaptions)s,
locale: "%(locale)s",
mediaType: "%(mediaType)s",
moduleName: "%(moduleName)s",
quality: "%(quality)s"
}) {
urls {
url
cdn
rank
source
},
status
}
}'''
def _real_initialize(self): def _real_initialize(self):
self._login() self._login()
@ -277,7 +347,7 @@ class PluralsightIE(PluralsightBaseIE):
f = QUALITIES[quality].copy() f = QUALITIES[quality].copy()
clip_post = { clip_post = {
'author': author, 'author': author,
'includeCaptions': False, 'includeCaptions': 'false',
'clipIndex': int(clip_idx), 'clipIndex': int(clip_idx),
'courseName': course_name, 'courseName': course_name,
'locale': 'en', 'locale': 'en',
@ -286,11 +356,23 @@ class PluralsightIE(PluralsightBaseIE):
'quality': '%dx%d' % (f['width'], f['height']), 'quality': '%dx%d' % (f['width'], f['height']),
} }
format_id = '%s-%s' % (ext, quality) format_id = '%s-%s' % (ext, quality)
viewclip = self._download_json(
'%s/video/clips/viewclip' % self._API_BASE, display_id, try:
'Downloading %s viewclip JSON' % format_id, fatal=False, viewclip = self._download_json(
data=json.dumps(clip_post).encode('utf-8'), self._GRAPHQL_EP, display_id,
headers={'Content-Type': 'application/json;charset=utf-8'}) 'Downloading %s viewclip graphql' % format_id,
data=json.dumps({
'query': self.GRAPHQL_VIEWCLIP_TMPL % clip_post,
'variables': {}
}).encode('utf-8'),
headers=self._GRAPHQL_HEADERS)['data']['viewClip']
except ExtractorError:
# Still works but most likely will go away soon
viewclip = self._download_json(
'%s/video/clips/viewclip' % self._API_BASE, display_id,
'Downloading %s viewclip JSON' % format_id, fatal=False,
data=json.dumps(clip_post).encode('utf-8'),
headers={'Content-Type': 'application/json;charset=utf-8'})
# Pluralsight tracks multiple sequential calls to the ViewClip API and starts # Pluralsight tracks multiple sequential calls to the ViewClip API and starts
# to return 429 HTTP errors after some time (see # to return 429 HTTP errors after some time (see

View File

@ -43,7 +43,8 @@ class PornComIE(InfoExtractor):
config = self._parse_json( config = self._parse_json(
self._search_regex( self._search_regex(
r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*=', (r'=\s*({.+?})\s*;\s*v1ar\b',
r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*='),
webpage, 'config', default='{}'), webpage, 'config', default='{}'),
display_id, transform_source=js_to_json, fatal=False) display_id, transform_source=js_to_json, fatal=False)
@ -69,7 +70,7 @@ class PornComIE(InfoExtractor):
'height': int(height), 'height': int(height),
'filesize_approx': parse_filesize(filesize), 'filesize_approx': parse_filesize(filesize),
} for format_url, height, filesize in re.findall( } for format_url, height, filesize in re.findall(
r'<a[^>]+href="(/download/[^"]+)">MPEG4 (\d+)p<span[^>]*>(\d+\s+[a-zA-Z]+)<', r'<a[^>]+href="(/download/[^"]+)">[^<]*?(\d+)p<span[^>]*>(\d+\s*[a-zA-Z]+)<',
webpage)] webpage)]
thumbnail = None thumbnail = None
duration = None duration = None

View File

@ -4,28 +4,22 @@ from __future__ import unicode_literals
import functools import functools
import itertools import itertools
import operator import operator
# import os
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_HTTPError, compat_HTTPError,
# compat_urllib_parse_unquote, compat_str,
# compat_urllib_parse_unquote_plus,
# compat_urllib_parse_urlparse,
) )
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
js_to_json, js_to_json,
orderedSet, orderedSet,
# sanitized_Request,
remove_quotes, remove_quotes,
str_to_int, str_to_int,
url_or_none,
) )
# from ..aes import (
# aes_decrypt_text
# )
class PornHubIE(InfoExtractor): class PornHubIE(InfoExtractor):
@ -62,7 +56,7 @@ class PornHubIE(InfoExtractor):
'id': '1331683002', 'id': '1331683002',
'ext': 'mp4', 'ext': 'mp4',
'title': '重庆婷婷女王足交', 'title': '重庆婷婷女王足交',
'uploader': 'cj397186295', 'uploader': 'Unknown',
'duration': 1753, 'duration': 1753,
'view_count': int, 'view_count': int,
'like_count': int, 'like_count': int,
@ -75,6 +69,31 @@ class PornHubIE(InfoExtractor):
'params': { 'params': {
'skip_download': True, 'skip_download': True,
}, },
}, {
# subtitles
'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7',
'info_dict': {
'id': 'ph5af5fef7c2aa7',
'ext': 'mp4',
'title': 'BFFS - Cute Teen Girls Share Cock On the Floor',
'uploader': 'BFFs',
'duration': 622,
'view_count': int,
'like_count': int,
'dislike_count': int,
'comment_count': int,
'age_limit': 18,
'tags': list,
'categories': list,
'subtitles': {
'en': [{
"ext": 'srt'
}]
},
},
'params': {
'skip_download': True,
},
}, { }, {
'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
'only_matching': True, 'only_matching': True,
@ -121,7 +140,7 @@ class PornHubIE(InfoExtractor):
self._set_cookie('pornhub.com', 'platform', platform) self._set_cookie('pornhub.com', 'platform', platform)
return self._download_webpage( return self._download_webpage(
'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id, 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id,
video_id) video_id, 'Downloading %s webpage' % platform)
webpage = dl_webpage('pc') webpage = dl_webpage('pc')
@ -134,57 +153,105 @@ class PornHubIE(InfoExtractor):
'PornHub said: %s' % error_msg, 'PornHub said: %s' % error_msg,
expected=True, video_id=video_id) expected=True, video_id=video_id)
tv_webpage = dl_webpage('tv')
assignments = self._search_regex(
r'(var.+?mediastring.+?)</script>', tv_webpage,
'encoded url').split(';')
js_vars = {}
def parse_js_value(inp):
inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
if '+' in inp:
inps = inp.split('+')
return functools.reduce(
operator.concat, map(parse_js_value, inps))
inp = inp.strip()
if inp in js_vars:
return js_vars[inp]
return remove_quotes(inp)
for assn in assignments:
assn = assn.strip()
if not assn:
continue
assn = re.sub(r'var\s+', '', assn)
vname, value = assn.split('=', 1)
js_vars[vname] = parse_js_value(value)
video_url = js_vars['mediastring']
title = self._search_regex(
r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None)
# video_title from flashvars contains whitespace instead of non-ASCII (see # video_title from flashvars contains whitespace instead of non-ASCII (see
# http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
# on that anymore. # on that anymore.
title = title or self._html_search_meta( title = self._html_search_meta(
'twitter:title', webpage, default=None) or self._search_regex( 'twitter:title', webpage, default=None) or self._search_regex(
(r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)', (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)',
r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1', r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1',
r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'), r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'),
webpage, 'title', group='title') webpage, 'title', group='title')
video_urls = []
video_urls_set = set()
subtitles = {}
flashvars = self._parse_json( flashvars = self._parse_json(
self._search_regex( self._search_regex(
r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'), r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
video_id) video_id)
if flashvars: if flashvars:
subtitle_url = url_or_none(flashvars.get('closedCaptionsFile'))
if subtitle_url:
subtitles.setdefault('en', []).append({
'url': subtitle_url,
'ext': 'srt',
})
thumbnail = flashvars.get('image_url') thumbnail = flashvars.get('image_url')
duration = int_or_none(flashvars.get('video_duration')) duration = int_or_none(flashvars.get('video_duration'))
media_definitions = flashvars.get('mediaDefinitions')
if isinstance(media_definitions, list):
for definition in media_definitions:
if not isinstance(definition, dict):
continue
video_url = definition.get('videoUrl')
if not video_url or not isinstance(video_url, compat_str):
continue
if video_url in video_urls_set:
continue
video_urls_set.add(video_url)
video_urls.append(
(video_url, int_or_none(definition.get('quality'))))
else: else:
title, thumbnail, duration = [None] * 3 thumbnail, duration = [None] * 2
if not video_urls:
tv_webpage = dl_webpage('tv')
assignments = self._search_regex(
r'(var.+?mediastring.+?)</script>', tv_webpage,
'encoded url').split(';')
js_vars = {}
def parse_js_value(inp):
inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
if '+' in inp:
inps = inp.split('+')
return functools.reduce(
operator.concat, map(parse_js_value, inps))
inp = inp.strip()
if inp in js_vars:
return js_vars[inp]
return remove_quotes(inp)
for assn in assignments:
assn = assn.strip()
if not assn:
continue
assn = re.sub(r'var\s+', '', assn)
vname, value = assn.split('=', 1)
js_vars[vname] = parse_js_value(value)
video_url = js_vars['mediastring']
if video_url not in video_urls_set:
video_urls.append((video_url, None))
video_urls_set.add(video_url)
for mobj in re.finditer(
r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1',
webpage):
video_url = mobj.group('url')
if video_url not in video_urls_set:
video_urls.append((video_url, None))
video_urls_set.add(video_url)
formats = []
for video_url, height in video_urls:
tbr = None
mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url)
if mobj:
if not height:
height = int(mobj.group('height'))
tbr = int(mobj.group('tbr'))
formats.append({
'url': video_url,
'format_id': '%dp' % height if height else None,
'height': height,
'tbr': tbr,
})
self._sort_formats(formats)
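The relocated tv-page fallback still leans on the tiny assignment evaluator: split the script on `;`, strip `var`, resolve `+`-concatenations recursively, substituting previously assigned variables and unquoting string literals. A standalone run on a hypothetical script of the same shape:

import functools
import operator
import re

script = 'var qa="https://cdn.example/"; var qb=qa+"video" + /* junk */ "_720p.mp4"; var mediastring=qb'
js_vars = {}

def remove_quotes(s):
    if len(s) > 1 and s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s

def parse_js_value(inp):
    inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)  # drop /* comments */
    if '+' in inp:
        return functools.reduce(
            operator.concat, map(parse_js_value, inp.split('+')))
    inp = inp.strip()
    return js_vars.get(inp, remove_quotes(inp))

for assn in script.split(';'):
    assn = re.sub(r'var\s+', '', assn.strip())
    if not assn:
        continue
    vname, value = assn.split('=', 1)
    js_vars[vname] = parse_js_value(value)

print(js_vars['mediastring'])  # https://cdn.example/video_720p.mp4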
video_uploader = self._html_search_regex( video_uploader = self._html_search_regex(
r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:user|channel)s/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:user|channel)s/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
@ -210,7 +277,6 @@ class PornHubIE(InfoExtractor):
return { return {
'id': video_id, 'id': video_id,
'url': video_url,
'uploader': video_uploader, 'uploader': video_uploader,
'title': title, 'title': title,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
@ -219,10 +285,11 @@ class PornHubIE(InfoExtractor):
'like_count': like_count, 'like_count': like_count,
'dislike_count': dislike_count, 'dislike_count': dislike_count,
'comment_count': comment_count, 'comment_count': comment_count,
# 'formats': formats, 'formats': formats,
'age_limit': 18, 'age_limit': 18,
'tags': tags, 'tags': tags,
'categories': categories, 'categories': categories,
'subtitles': subtitles,
} }

View File

@ -0,0 +1,247 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
compat_str,
)
from ..utils import (
ExtractorError,
int_or_none,
float_or_none,
parse_resolution,
str_or_none,
try_get,
unified_timestamp,
url_or_none,
urljoin,
)
class PuhuTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-izle'
IE_NAME = 'puhutv'
_TESTS = [{
# film
'url': 'https://puhutv.com/sut-kardesler-izle',
'md5': 'fbd8f2d8e7681f8bcd51b592475a6ae7',
'info_dict': {
'id': '5085',
'display_id': 'sut-kardesler',
'ext': 'mp4',
'title': 'Süt Kardeşler',
'description': 'md5:405fd024df916ca16731114eb18e511a',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 4832.44,
'creator': 'Arzu Film',
'timestamp': 1469778212,
'upload_date': '20160729',
'release_year': 1976,
'view_count': int,
'tags': ['Aile', 'Komedi', 'Klasikler'],
},
}, {
# episode, geo restricted, bypassable with --geo-verification-proxy
'url': 'https://puhutv.com/jet-sosyete-1-bolum-izle',
'only_matching': True,
}, {
# 4k, with subtitles
'url': 'https://puhutv.com/dip-1-bolum-izle',
'only_matching': True,
}]
_SUBTITLE_LANGS = {
'English': 'en',
'Deutsch': 'de',
'عربى': 'ar'
}
def _real_extract(self, url):
display_id = self._match_id(url)
info = self._download_json(
urljoin(url, '/api/slug/%s-izle' % display_id),
display_id)['data']
video_id = compat_str(info['id'])
title = info.get('name') or info['title']['name']
if info.get('display_name'):
title = '%s %s' % (title, info.get('display_name'))
try:
videos = self._download_json(
'https://puhutv.com/api/assets/%s/videos' % video_id,
display_id, 'Downloading video JSON',
headers=self.geo_verification_headers())
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
self.raise_geo_restricted()
raise
formats = []
for video in videos['data']['videos']:
media_url = url_or_none(video.get('url'))
if not media_url:
continue
playlist = video.get('is_playlist')
if video.get('stream_type') == 'hls' and playlist is True:
formats.extend(self._extract_m3u8_formats(
media_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False))
continue
quality = int_or_none(video.get('quality'))
f = {
'url': media_url,
'ext': 'mp4',
'height': quality
}
video_format = video.get('video_format')
if video_format == 'hls' and playlist is False:
format_id = 'hls'
f['protocol'] = 'm3u8_native'
elif video_format == 'mp4':
format_id = 'http'
else:
continue
if quality:
format_id += '-%sp' % quality
f['format_id'] = format_id
formats.append(f)
self._sort_formats(formats)
description = try_get(
info, lambda x: x['title']['description'],
compat_str) or info.get('description')
timestamp = unified_timestamp(info.get('created_at'))
creator = try_get(
info, lambda x: x['title']['producer']['name'], compat_str)
duration = float_or_none(
try_get(info, lambda x: x['content']['duration_in_ms'], int),
scale=1000)
view_count = try_get(info, lambda x: x['content']['watch_count'], int)
images = try_get(
info, lambda x: x['content']['images']['wide'], dict) or {}
thumbnails = []
for image_id, image_url in images.items():
if not isinstance(image_url, compat_str):
continue
if not image_url.startswith(('http', '//')):
image_url = 'https://%s' % image_url
t = parse_resolution(image_id)
t.update({
'id': image_id,
'url': image_url
})
thumbnails.append(t)
release_year = try_get(info, lambda x: x['title']['released_at'], int)
season_number = int_or_none(info.get('season_number'))
season_id = str_or_none(info.get('season_id'))
episode_number = int_or_none(info.get('episode_number'))
tags = []
for genre in try_get(info, lambda x: x['title']['genres'], list) or []:
if not isinstance(genre, dict):
continue
genre_name = genre.get('name')
if genre_name and isinstance(genre_name, compat_str):
tags.append(genre_name)
subtitles = {}
for subtitle in try_get(
info, lambda x: x['content']['subtitles'], list) or []:
if not isinstance(subtitle, dict):
continue
lang = subtitle.get('language')
sub_url = url_or_none(subtitle.get('url'))
if not lang or not isinstance(lang, compat_str) or not sub_url:
continue
subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = [{
'url': sub_url
}]
return {
'id': video_id,
'display_id': display_id,
'title': title,
'description': description,
'season_id': season_id,
'season_number': season_number,
'episode_number': episode_number,
'release_year': release_year,
'timestamp': timestamp,
'creator': creator,
'view_count': view_count,
'duration': duration,
'tags': tags,
'subtitles': subtitles,
'thumbnails': thumbnails,
'formats': formats
}
class PuhuTVSerieIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-detay'
IE_NAME = 'puhutv:serie'
_TESTS = [{
'url': 'https://puhutv.com/deniz-yildizi-detay',
'info_dict': {
'title': 'Deniz Yıldızı',
'id': 'deniz-yildizi',
},
'playlist_mincount': 205,
}, {
# a film detail page which uses the same URL as a serie page
'url': 'https://puhutv.com/kaybedenler-kulubu-detay',
'only_matching': True,
}]
def _extract_entries(self, seasons):
for season in seasons:
season_id = season.get('id')
if not season_id:
continue
            page = 1
            has_more = True
            while has_more is True:
                season = self._download_json(
                    'https://galadriel.puhutv.com/seasons/%s' % season_id,
                    season_id, 'Downloading page %s' % page, query={
                        'page': page,
                        'per': 40,
                    })
                episodes = season.get('episodes')
                if isinstance(episodes, list):
                    for ep in episodes:
                        slug_path = str_or_none(ep.get('slugPath'))
                        if not slug_path:
                            continue
                        video_id = str_or_none(int_or_none(ep.get('id')))
                        yield self.url_result(
                            'https://puhutv.com/%s' % slug_path,
                            ie=PuhuTVIE.ie_key(), video_id=video_id,
                            video_title=ep.get('name') or ep.get('eventLabel'))
                page += 1
                has_more = season.get('hasMore')

    def _real_extract(self, url):
        playlist_id = self._match_id(url)

        info = self._download_json(
            urljoin(url, '/api/slug/%s-detay' % playlist_id),
            playlist_id)['data']

        seasons = info.get('seasons')
        if seasons:
            return self.playlist_result(
                self._extract_entries(seasons), playlist_id, info.get('name'))

        # Films share the same URL pattern as series; hand the single video
        # over to PuhuTVIE
        video_id = info.get('slug') or info['assets'][0]['slug']
        return self.url_result(
            'https://puhutv.com/%s-izle' % video_id,
            PuhuTVIE.ie_key(), video_id)

youtube_dl/extractor/rai.py

@@ -32,6 +32,9 @@ class RaiBaseIE(InfoExtractor):
     _GEO_BYPASS = False

     def _extract_relinker_info(self, relinker_url, video_id):
+        if not re.match(r'https?://', relinker_url):
+            return {'formats': [{'url': relinker_url}]}
+
         formats = []
         geoprotection = None
         is_live = None

@@ -369,6 +372,10 @@ class RaiIE(RaiBaseIE):
         'params': {
             'skip_download': True,
         },
+    }, {
+        # Direct MMS URL
+        'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html',
+        'only_matching': True,
     }]

     def _extract_from_content_id(self, content_id, url):

youtube_dl/extractor/redtube.py

@@ -3,12 +3,12 @@ from __future__ import unicode_literals
 import re

 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     int_or_none,
     str_to_int,
     unified_strdate,
+    url_or_none,
 )

@@ -71,8 +71,8 @@ class RedTubeIE(InfoExtractor):
             video_id, fatal=False)
         if medias and isinstance(medias, list):
             for media in medias:
-                format_url = media.get('videoUrl')
-                if not format_url or not isinstance(format_url, compat_str):
+                format_url = url_or_none(media.get('videoUrl'))
+                if not format_url:
                     continue
                 format_id = media.get('quality')
                 formats.append({

youtube_dl/extractor/rentv.py

@@ -6,6 +6,7 @@ from ..compat import compat_str
 from ..utils import (
     determine_ext,
     int_or_none,
+    url_or_none,
 )

@@ -37,8 +38,8 @@ class RENTVIE(InfoExtractor):
         title = config['title']
         formats = []
         for video in config['src']:
-            src = video.get('src')
-            if not src or not isinstance(src, compat_str):
+            src = url_or_none(video.get('src'))
+            if not src:
                 continue
             ext = determine_ext(src)
             if ext == 'm3u8':

youtube_dl/extractor/rtbf.py

@@ -1,10 +1,14 @@
 # coding: utf-8
 from __future__ import unicode_literals

+import re
+
 from .common import InfoExtractor
 from ..utils import (
-    int_or_none,
     ExtractorError,
+    float_or_none,
+    int_or_none,
+    strip_or_none,
 )

@@ -14,20 +18,19 @@ class RTBFIE(InfoExtractor):
         (?:
             video/[^?]+\?.*\bid=|
             ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=|
-            auvio/[^/]+\?.*id=
+            auvio/[^/]+\?.*\b(?P<live>l)?id=
         )(?P<id>\d+)'''
     _TESTS = [{
         'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274',
-        'md5': '799f334ddf2c0a582ba80c44655be570',
+        'md5': '8c876a1cceeb6cf31b476461ade72384',
         'info_dict': {
             'id': '1921274',
             'ext': 'mp4',
             'title': 'Les Diables au coeur (épisode 2)',
-            'description': 'Football - Diables Rouges',
-            'duration': 3099,
+            'description': '(du 25/04/2014)',
+            'duration': 3099.54,
             'upload_date': '20140425',
-            'timestamp': 1398456336,
-            'uploader': 'rtbfsport',
+            'timestamp': 1398456300,
         }
     }, {
         # geo restricted

@@ -39,6 +42,18 @@ class RTBFIE(InfoExtractor):
     }, {
         'url': 'http://www.rtbf.be/auvio/detail_jeudi-en-prime-siegfried-bracke?id=2102996',
         'only_matching': True,
+    }, {
+        # Live
+        'url': 'https://www.rtbf.be/auvio/direct_pure-fm?lid=134775',
+        'only_matching': True,
+    }, {
+        # Audio
+        'url': 'https://www.rtbf.be/auvio/detail_cinq-heures-cinema?id=2360811',
+        'only_matching': True,
+    }, {
+        # With Subtitle
+        'url': 'https://www.rtbf.be/auvio/detail_les-carnets-du-bourlingueur?id=2361588',
+        'only_matching': True,
     }]
     _IMAGE_HOST = 'http://ds1.ds.static.rtbf.be'
     _PROVIDERS = {

@@ -53,46 +68,94 @@ class RTBFIE(InfoExtractor):
     ]

     def _real_extract(self, url):
-        video_id = self._match_id(url)
-        data = self._download_json(
-            'http://www.rtbf.be/api/media/video?method=getVideoDetail&args[]=%s' % video_id, video_id)
+        live, media_id = re.match(self._VALID_URL, url).groups()
+        embed_page = self._download_webpage(
+            'https://www.rtbf.be/auvio/embed/' + ('direct' if live else 'media'),
+            media_id, query={'id': media_id})
+        data = self._parse_json(self._html_search_regex(
+            r'data-media="([^"]+)"', embed_page, 'media data'), media_id)

         error = data.get('error')
         if error:
             raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)

-        data = data['data']
-
         provider = data.get('provider')
         if provider in self._PROVIDERS:
             return self.url_result(data['url'], self._PROVIDERS[provider])

+        title = data['title']
+        is_live = data.get('isLive')
+        if is_live:
+            title = self._live_title(title)
+        height_re = r'-(\d+)p\.'
         formats = []
-        for key, format_id in self._QUALITIES:
-            format_url = data.get(key + 'Url')
-            if format_url:
+
+        m3u8_url = data.get('urlHlsAes128') or data.get('urlHls')
+        if m3u8_url:
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False))
+
+        fix_url = lambda x: x.replace('//rtbf-vod.', '//rtbf.') if '/geo/drm/' in x else x
+        http_url = data.get('url')
+        if formats and http_url and re.search(height_re, http_url):
+            http_url = fix_url(http_url)
+            for m3u8_f in formats[:]:
+                height = m3u8_f.get('height')
+                if not height:
+                    continue
+                f = m3u8_f.copy()
+                del f['protocol']
+                f.update({
+                    'format_id': m3u8_f['format_id'].replace('hls-', 'http-'),
+                    'url': re.sub(height_re, '-%dp.' % height, http_url),
+                })
+                formats.append(f)
+        else:
+            sources = data.get('sources') or {}
+            for key, format_id in self._QUALITIES:
+                format_url = sources.get(key)
+                if not format_url:
+                    continue
+                height = int_or_none(self._search_regex(
+                    height_re, format_url, 'height', default=None))
                 formats.append({
                     'format_id': format_id,
-                    'url': format_url,
+                    'url': fix_url(format_url),
+                    'height': height,
                 })

-        thumbnails = []
-        for thumbnail_id, thumbnail_url in data.get('thumbnail', {}).items():
-            if thumbnail_id != 'default':
-                thumbnails.append({
-                    'url': self._IMAGE_HOST + thumbnail_url,
-                    'id': thumbnail_id,
-                })
+        mpd_url = data.get('urlDash')
+        if not data.get('drm') and mpd_url:
+            formats.extend(self._extract_mpd_formats(
+                mpd_url, media_id, mpd_id='dash', fatal=False))
+
+        audio_url = data.get('urlAudio')
+        if audio_url:
+            formats.append({
+                'format_id': 'audio',
+                'url': audio_url,
+                'vcodec': 'none',
+            })
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for track in (data.get('tracks') or {}).values():
+            sub_url = track.get('url')
+            if not sub_url:
+                continue
+            subtitles.setdefault(track.get('lang') or 'fr', []).append({
+                'url': sub_url,
+            })

         return {
-            'id': video_id,
+            'id': media_id,
             'formats': formats,
-            'title': data['title'],
-            'description': data.get('description') or data.get('subtitle'),
-            'thumbnails': thumbnails,
-            'duration': data.get('duration') or data.get('realDuration'),
-            'timestamp': int_or_none(data.get('created')),
-            'view_count': int_or_none(data.get('viewCount')),
-            'uploader': data.get('channel'),
-            'tags': data.get('tags'),
+            'title': title,
+            'description': strip_or_none(data.get('description')),
+            'thumbnail': data.get('thumbnail'),
+            'duration': float_or_none(data.get('realDuration')),
+            'timestamp': int_or_none(data.get('liveFrom')),
+            'series': data.get('programLabel'),
+            'subtitles': subtitles,
+            'is_live': is_live,
         }

youtube_dl/extractor/rutube.py

@@ -16,6 +16,7 @@ from ..utils import (
     int_or_none,
     try_get,
     unified_timestamp,
+    url_or_none,
 )

@@ -176,8 +177,8 @@ class RutubePlaylistBaseIE(RutubeBaseIE):
                 break
             for result in results:
-                video_url = result.get('video_url')
-                if not video_url or not isinstance(video_url, compat_str):
+                video_url = url_or_none(result.get('video_url'))
+                if not video_url:
                     continue
                 entry = self._extract_video(result, require_title=False)
                 entry.update({

youtube_dl/extractor/sixplay.py

@@ -19,29 +19,33 @@ from ..utils import (
 class SixPlayIE(InfoExtractor):
     IE_NAME = '6play'
-    _VALID_URL = r'(?:6play:|https?://(?:www\.)?6play\.fr/.+?-c_)(?P<id>[0-9]+)'
-    _TEST = {
-        'url': 'http://www.6play.fr/le-meilleur-patissier-p_1807/le-meilleur-patissier-special-fetes-mercredi-a-21-00-sur-m6-c_11638450',
-        'md5': '42310bffe4ba3982db112b9cd3467328',
+    _VALID_URL = r'(?:6play:|https?://(?:www\.)?(?P<domain>6play\.fr|rtlplay.be)/.+?-c_)(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'https://www.6play.fr/minute-par-minute-p_9533/le-but-qui-a-marque-lhistoire-du-football-francais-c_12041051',
+        'md5': '31fcd112637baa0c2ab92c4fcd8baf27',
         'info_dict': {
-            'id': '11638450',
+            'id': '12041051',
             'ext': 'mp4',
-            'title': 'Le Meilleur Pâtissier, spécial fêtes mercredi à 21:00 sur M6',
-            'description': 'md5:308853f6a5f9e2d55a30fc0654de415f',
-            'duration': 39,
-            'series': 'Le meilleur pâtissier',
+            'title': 'Le but qui a marqué l\'histoire du football français !',
+            'description': 'md5:b59e7e841d646ef1eb42a7868eb6a851',
         },
-        'params': {
-            'skip_download': True,
-        },
-    }
+    }, {
+        'url': 'https://www.rtlplay.be/rtl-info-13h-p_8551/les-titres-du-rtlinfo-13h-c_12045869',
+        'only_matching': True,
+    }]

     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        domain, video_id = re.search(self._VALID_URL, url).groups()
+        service, consumer_name = {
+            '6play.fr': ('6play', 'm6web'),
+            'rtlplay.be': ('rtlbe_rtl_play', 'rtlbe'),
+        }.get(domain, ('6play', 'm6web'))
         data = self._download_json(
-            'https://pc.middleware.6play.fr/6play/v2/platforms/m6group_web/services/6play/videos/clip_%s' % video_id,
-            video_id, query={
+            'https://pc.middleware.6play.fr/6play/v2/platforms/m6group_web/services/%s/videos/clip_%s' % (service, video_id),
+            video_id, headers={
+                'x-customer-name': consumer_name
+            }, query={
                 'csa': 5,
                 'with': 'clips',
             })

@@ -65,7 +69,14 @@ class SixPlayIE(InfoExtractor):
                     subtitles.setdefault('fr', []).append({'url': asset_url})
                     continue
                 if container == 'm3u8' or ext == 'm3u8':
-                    if protocol == 'usp' and not compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]:
+                    if protocol == 'usp':
+                        if compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]:
+                            urlh = self._request_webpage(
+                                asset_url, video_id, fatal=False,
+                                headers=self.geo_verification_headers())
+                            if not urlh:
+                                continue
+                            asset_url = urlh.geturl()
                         asset_url = re.sub(r'/([^/]+)\.ism/[^/]*\.m3u8', r'/\1.ism/\1.m3u8', asset_url)
                         formats.extend(self._extract_m3u8_formats(
                             asset_url, video_id, 'mp4', 'm3u8_native',

youtube_dl/extractor/slutload.py

@@ -1,12 +1,10 @@
 from __future__ import unicode_literals

-import re
-
 from .common import InfoExtractor


 class SlutloadIE(InfoExtractor):
-    _VALID_URL = r'^https?://(?:\w+\.)?slutload\.com/video/[^/]+/(?P<id>[^/]+)/?$'
+    _VALID_URL = r'https?://(?:\w+\.)?slutload\.com/(?:video/[^/]+|embed_player|watch)/(?P<id>[^/]+)'
     _TESTS = [{
         'url': 'http://www.slutload.com/video/virginie-baisee-en-cam/TD73btpBqSxc/',
         'md5': '868309628ba00fd488cf516a113fd717',

@@ -16,33 +14,52 @@ class SlutloadIE(InfoExtractor):
             'title': 'virginie baisee en cam',
             'age_limit': 18,
             'thumbnail': r're:https?://.*?\.jpg'
-        }
+        },
     }, {
         # mobile site
         'url': 'http://mobile.slutload.com/video/masturbation-solo/fviFLmc6kzJ/',
         'only_matching': True,
+    }, {
+        'url': 'http://www.slutload.com/embed_player/TD73btpBqSxc/',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.slutload.com/watch/TD73btpBqSxc/Virginie-Baisee-En-Cam.html',
+        'only_matching': True,
     }]

     def _real_extract(self, url):
         video_id = self._match_id(url)

-        desktop_url = re.sub(r'^(https?://)mobile\.', r'\1', url)
-        webpage = self._download_webpage(desktop_url, video_id)
+        embed_page = self._download_webpage(
+            'http://www.slutload.com/embed_player/%s' % video_id, video_id,
+            'Downloading embed page', fatal=False)

-        video_title = self._html_search_regex(r'<h1><strong>([^<]+)</strong>',
-                                              webpage, 'title').strip()
+        if embed_page:
+            def extract(what):
+                return self._html_search_regex(
+                    r'data-video-%s=(["\'])(?P<url>(?:(?!\1).)+)\1' % what,
+                    embed_page, 'video %s' % what, default=None, group='url')

-        video_url = self._html_search_regex(
-            r'(?s)<div id="vidPlayer"\s+data-url="([^"]+)"',
-            webpage, 'video URL')
-        thumbnail = self._html_search_regex(
-            r'(?s)<div id="vidPlayer"\s+.*?previewer-file="([^"]+)"',
-            webpage, 'thumbnail', fatal=False)
+            video_url = extract('url')
+            if video_url:
+                title = self._html_search_regex(
+                    r'<title>([^<]+)', embed_page, 'title', default=video_id)
+                return {
+                    'id': video_id,
+                    'url': video_url,
+                    'title': title,
+                    'thumbnail': extract('preview'),
+                    'age_limit': 18
+                }

-        return {
+        webpage = self._download_webpage(
+            'http://www.slutload.com/video/_/%s/' % video_id, video_id)
+        title = self._html_search_regex(
+            r'<h1><strong>([^<]+)</strong>', webpage, 'title').strip()
+        info = self._parse_html5_media_entries(url, webpage, video_id)[0]
+        info.update({
             'id': video_id,
-            'url': video_url,
-            'title': video_title,
-            'thumbnail': thumbnail,
-            'age_limit': 18
-        }
+            'title': title,
+            'age_limit': 18,
+        })
+        return info

youtube_dl/extractor/streamcloud.py

@@ -72,4 +72,7 @@ class StreamcloudIE(InfoExtractor):
             'title': title,
             'url': video_url,
             'thumbnail': thumbnail,
+            'http_headers': {
+                'Referer': url,
+            },
         }

youtube_dl/extractor/svt.py

@@ -12,6 +12,8 @@ from ..utils import (
     determine_ext,
     dict_get,
     int_or_none,
+    orderedSet,
+    strip_or_none,
     try_get,
     urljoin,
     compat_str,

@@ -137,7 +139,12 @@ class SVTPlayBaseIE(SVTBaseIE):

 class SVTPlayIE(SVTPlayBaseIE):
     IE_DESC = 'SVT Play and Öppet arkiv'
-    _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+)'
+    _VALID_URL = r'''(?x)
+                    (?:
+                        svt:(?P<svt_id>[^/?#&]+)|
+                        https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+)
+                    )
+                    '''
     _TESTS = [{
         'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2',
         'md5': '2b6704fe4a28801e1a098bbf3c5ac611',

@@ -164,10 +171,40 @@ class SVTPlayIE(SVTPlayBaseIE):
     }, {
         'url': 'https://www.svtplay.se/kanaler/svt1',
         'only_matching': True,
+    }, {
+        'url': 'svt:1376446-003A',
+        'only_matching': True,
+    }, {
+        'url': 'svt:14278044',
+        'only_matching': True,
     }]

+    def _adjust_title(self, info):
+        if info['is_live']:
+            info['title'] = self._live_title(info['title'])
+
+    def _extract_by_video_id(self, video_id, webpage=None):
+        data = self._download_json(
+            'https://api.svt.se/videoplayer-api/video/%s' % video_id,
+            video_id, headers=self.geo_verification_headers())
+        info_dict = self._extract_video(data, video_id)
+        if not info_dict.get('title'):
+            title = dict_get(info_dict, ('episode', 'series'))
+            if not title and webpage:
+                title = re.sub(
+                    r'\s*\|\s*.+?$', '', self._og_search_title(webpage))
+            if not title:
+                title = video_id
+            info_dict['title'] = title
+        self._adjust_title(info_dict)
+        return info_dict
+
     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        mobj = re.match(self._VALID_URL, url)
+        video_id, svt_id = mobj.group('id', 'svt_id')
+
+        if svt_id:
+            return self._extract_by_video_id(svt_id)

         webpage = self._download_webpage(url, video_id)

@@ -179,10 +216,6 @@ class SVTPlayIE(SVTPlayBaseIE):

         thumbnail = self._og_search_thumbnail(webpage)

-        def adjust_title(info):
-            if info['is_live']:
-                info['title'] = self._live_title(info['title'])
-
         if data:
             video_info = try_get(
                 data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'],

@@ -193,24 +226,14 @@ class SVTPlayIE(SVTPlayBaseIE):
                     'title': data['context']['dispatcher']['stores']['MetaStore']['title'],
                     'thumbnail': thumbnail,
                 })
-                adjust_title(info_dict)
+                self._adjust_title(info_dict)
                 return info_dict

-        video_id = self._search_regex(
+        svt_id = self._search_regex(
             r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)',
-            webpage, 'video id', default=None)
+            webpage, 'video id')

-        if video_id:
-            data = self._download_json(
-                'https://api.svt.se/videoplayer-api/video/%s' % video_id,
-                video_id, headers=self.geo_verification_headers())
-            info_dict = self._extract_video(data, video_id)
-            if not info_dict.get('title'):
-                info_dict['title'] = re.sub(
-                    r'\s*\|\s*.+?$', '',
-                    info_dict.get('episode') or self._og_search_title(webpage))
-            adjust_title(info_dict)
-            return info_dict
+        return self._extract_by_video_id(svt_id, webpage)


 class SVTSeriesIE(SVTPlayBaseIE):

@@ -292,3 +315,57 @@ class SVTSeriesIE(SVTPlayBaseIE):
         return self.playlist_result(
             entries, series_id, title, metadata.get('description'))
+
+
+class SVTPageIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?svt\.se/(?:[^/]+/)*(?P<id>[^/?&#]+)'
+    _TESTS = [{
+        'url': 'https://www.svt.se/sport/oseedat/guide-sommartraningen-du-kan-gora-var-och-nar-du-vill',
+        'info_dict': {
+            'id': 'guide-sommartraningen-du-kan-gora-var-och-nar-du-vill',
+            'title': 'GUIDE: Sommarträning du kan göra var och när du vill',
+        },
+        'playlist_count': 7,
+    }, {
+        'url': 'https://www.svt.se/nyheter/inrikes/ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner',
+        'info_dict': {
+            'id': 'ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner',
+            'title': 'Ebba Busch Thor har bara delvis rätt om ”no-go-zoner”',
+        },
+        'playlist_count': 1,
+    }, {
+        # only programTitle
+        'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
+        'info_dict': {
+            'id': '2900353',
+            'ext': 'mp4',
+            'title': 'Stjärnorna skojar till det - under SVT-intervjun',
+            'duration': 27,
+            'age_limit': 0,
+        },
+    }, {
+        'url': 'https://www.svt.se/nyheter/lokalt/vast/svt-testar-tar-nagon-upp-skrapet-1',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.svt.se/vader/manadskronikor/maj2018',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if SVTIE.suitable(url) else super(SVTPageIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        entries = [
+            self.url_result(
+                'svt:%s' % video_id, ie=SVTPlayIE.ie_key(), video_id=video_id)
+            for video_id in orderedSet(re.findall(
+                r'data-video-id=["\'](\d+)', webpage))]
+
+        title = strip_or_none(self._og_search_title(webpage, default=None))
+
+        return self.playlist_result(entries, playlist_id, title)

youtube_dl/extractor/ted.py

@@ -7,8 +7,10 @@ from .common import InfoExtractor
 from ..compat import compat_str
 from ..utils import (
+    float_or_none,
     int_or_none,
     try_get,
+    url_or_none,
 )

@@ -30,7 +32,7 @@ class TEDIE(InfoExtractor):
     '''
     _TESTS = [{
         'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
-        'md5': '0de43ac406aa3e4ea74b66c9c7789b13',
+        'md5': 'b0ce2b05ca215042124fbc9e3886493a',
         'info_dict': {
             'id': '102',
             'ext': 'mp4',

@@ -42,24 +44,30 @@ class TEDIE(InfoExtractor):
             'uploader': 'Dan Dennett',
             'width': 853,
             'duration': 1308,
-        }
-    }, {
-        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
-        'md5': 'b899ac15e345fb39534d913f7606082b',
-        'info_dict': {
-            'id': 'tSVI8ta_P4w',
-            'ext': 'mp4',
-            'title': 'Vishal Sikka: The beauty and power of algorithms',
-            'thumbnail': r're:^https?://.+\.jpg',
-            'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4',
-            'upload_date': '20140122',
-            'uploader_id': 'TEDInstitute',
-            'uploader': 'TED Institute',
+            'view_count': int,
+            'comment_count': int,
+            'tags': list,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # missing HTTP bitrates
+        'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms',
+        'info_dict': {
+            'id': '6069',
+            'ext': 'mp4',
+            'title': 'The beauty and power of algorithms',
+            'thumbnail': r're:^https?://.+\.jpg',
+            'description': 'md5:734e352710fb00d840ab87ae31aaf688',
+            'uploader': 'Vishal Sikka',
+        },
+        'params': {
+            'skip_download': True,
         },
-        'add_ie': ['Youtube'],
     }, {
         'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
-        'md5': '71b3ab2f4233012dce09d515c9c39ce2',
+        'md5': 'e6b9617c01a7970ceac8bb2c92c346c0',
         'info_dict': {
             'id': '1972',
             'ext': 'mp4',

@@ -68,6 +76,9 @@ class TEDIE(InfoExtractor):
             'description': 'md5:5174aed4d0f16021b704120360f72b92',
             'duration': 1128,
         },
+        'params': {
+            'skip_download': True,
+        },
     }, {
         'url': 'http://www.ted.com/playlists/who_are_the_hackers',
         'info_dict': {

@@ -92,17 +103,17 @@ class TEDIE(InfoExtractor):
             'skip_download': True,
         },
     }, {
-        # YouTube video
-        'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
-        'add_ie': ['Youtube'],
+        # no nativeDownloads
+        'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth',
         'info_dict': {
-            'id': 'aFBIPO-P7LM',
+            'id': '1792',
             'ext': 'mp4',
-            'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
-            'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
-            'uploader': 'TEDx Talks',
-            'uploader_id': 'TEDxTalks',
-            'upload_date': '20111216',
+            'title': 'The orchestra in my mouth',
+            'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a',
+            'uploader': 'Tom Thum',
+            'view_count': int,
+            'comment_count': int,
+            'tags': list,
         },
         'params': {
             'skip_download': True,

@@ -161,27 +172,16 @@ class TEDIE(InfoExtractor):

         info = self._extract_info(webpage)

-        talk_info = try_get(
-            info, lambda x: x['__INITIAL_DATA__']['talks'][0],
-            dict) or info['talks'][0]
+        data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info
+        talk_info = data['talks'][0]

         title = talk_info['title'].strip()

-        external = talk_info.get('external')
-        if external:
-            service = external['service']
-            self.to_screen('Found video from %s' % service)
-            ext_url = None
-            if service.lower() == 'youtube':
-                ext_url = external.get('code')
-            return {
-                '_type': 'url',
-                'url': ext_url or external['uri'],
-            }
-
         native_downloads = try_get(
-            talk_info, lambda x: x['downloads']['nativeDownloads'],
-            dict) or talk_info['nativeDownloads']
+            talk_info,
+            (lambda x: x['downloads']['nativeDownloads'],
+             lambda x: x['nativeDownloads']),
+            dict) or {}

         formats = [{
             'url': format_url,

@@ -196,10 +196,24 @@ class TEDIE(InfoExtractor):

         player_talk = talk_info['player_talks'][0]

+        external = player_talk.get('external')
+        if isinstance(external, dict):
+            service = external.get('service')
+            if isinstance(service, compat_str):
+                ext_url = None
+                if service.lower() == 'youtube':
+                    ext_url = external.get('code')
+                return {
+                    '_type': 'url',
+                    'url': ext_url or external['uri'],
+                }
+
         resources_ = player_talk.get('resources') or talk_info.get('resources')

         http_url = None
         for format_id, resources in resources_.items():
+            if not isinstance(resources, dict):
+                continue
             if format_id == 'h264':
                 for resource in resources:
                     h264_url = resource.get('file')

@@ -228,8 +242,12 @@ class TEDIE(InfoExtractor):
                         'tbr': int_or_none(resource.get('bitrate')),
                     })
             elif format_id == 'hls':
+                stream_url = url_or_none(resources.get('stream'))
+                if not stream_url:
+                    continue
                 formats.extend(self._extract_m3u8_formats(
-                    resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False))
+                    stream_url, video_name, 'mp4', m3u8_id=format_id,
+                    fatal=False))

         m3u8_formats = list(filter(
             lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',

@@ -239,9 +257,13 @@ class TEDIE(InfoExtractor):
             bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
             if not bitrate:
                 continue
+            bitrate_url = re.sub(r'\d+k', bitrate, http_url)
+            if not self._is_valid_url(
+                    bitrate_url, video_name, '%s bitrate' % bitrate):
+                continue
             f = m3u8_format.copy()
             f.update({
-                'url': re.sub(r'\d+k', bitrate, http_url),
+                'url': bitrate_url,
                 'format_id': m3u8_format['format_id'].replace('hls', 'http'),
                 'protocol': 'http',
             })

@@ -267,7 +289,11 @@ class TEDIE(InfoExtractor):
             'description': self._og_search_description(webpage),
             'subtitles': self._get_subtitles(video_id, talk_info),
             'formats': formats,
-            'duration': talk_info.get('duration'),
+            'duration': float_or_none(talk_info.get('duration')),
+            'view_count': int_or_none(data.get('viewed_count')),
+            'comment_count': int_or_none(
+                try_get(data, lambda x: x['comments']['count'])),
+            'tags': try_get(talk_info, lambda x: x['tags'], list),
         }

     def _get_subtitles(self, video_id, talk_info):

youtube_dl/extractor/telecinco.py

@@ -1,26 +1,43 @@
 # coding: utf-8
 from __future__ import unicode_literals

-from .mitele import MiTeleBaseIE
+import json
+import re
+
+from .common import InfoExtractor
+from .ooyala import OoyalaIE
+from ..utils import (
+    clean_html,
+    determine_ext,
+    int_or_none,
+    str_or_none,
+    urljoin,
+)


-class TelecincoIE(MiTeleBaseIE):
+class TelecincoIE(InfoExtractor):
     IE_DESC = 'telecinco.es, cuatro.com and mediaset.es'
     _VALID_URL = r'https?://(?:www\.)?(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html'

     _TESTS = [{
         'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html',
-        'md5': '8d7b2d5f699ee2709d992a63d5cd1712',
         'info_dict': {
-            'id': 'JEA5ijCnF6p5W08A1rNKn7',
-            'ext': 'mp4',
+            'id': '1876350223',
             'title': 'Bacalao con kokotxas al pil-pil',
             'description': 'md5:1382dacd32dd4592d478cbdca458e5bb',
-            'duration': 662,
         },
+        'playlist': [{
+            'md5': 'adb28c37238b675dad0f042292f209a7',
+            'info_dict': {
+                'id': 'JEA5ijCnF6p5W08A1rNKn7',
+                'ext': 'mp4',
+                'title': 'Con Martín Berasategui, hacer un bacalao al pil-pil es fácil y divertido',
+                'duration': 662,
+            },
+        }]
     }, {
         'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html',
-        'md5': '284393e5387b3b947b77c613ef04749a',
+        'md5': '9468140ebc300fbb8b9d65dc6e5c4b43',
         'info_dict': {
             'id': 'jn24Od1zGLG4XUZcnUnZB6',
             'ext': 'mp4',

@@ -30,7 +47,7 @@ class TelecincoIE(InfoExtractor):
         },
     }, {
         'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html',
-        'md5': '749afab6ea5a136a8806855166ae46a2',
+        'md5': 'ae2dc6b7b50b2392076a51c0f70e01f6',
         'info_dict': {
             'id': 'aywerkD2Sv1vGNqq9b85Q2',
             'ext': 'mp4',

@@ -50,17 +67,90 @@ class TelecincoIE(InfoExtractor):
         'only_matching': True,
     }]

+    def _parse_content(self, content, url):
+        video_id = content['dataMediaId']
+        if content.get('dataCmsId') == 'ooyala':
+            return self.url_result(
+                'ooyala:%s' % video_id, OoyalaIE.ie_key(), video_id)
+        config_url = urljoin(url, content['dataConfig'])
+        config = self._download_json(
+            config_url, video_id, 'Downloading config JSON')
+        title = config['info']['title']
+
+        def mmc_url(mmc_type):
+            return re.sub(
+                r'/(?:flash|html5)\.json', '/%s.json' % mmc_type,
+                config['services']['mmc'])
+
+        duration = None
+        formats = []
+        for mmc_type in ('flash', 'html5'):
+            mmc = self._download_json(
+                mmc_url(mmc_type), video_id,
+                'Downloading %s mmc JSON' % mmc_type, fatal=False)
+            if not mmc:
+                continue
+            if not duration:
+                duration = int_or_none(mmc.get('duration'))
+            for location in mmc['locations']:
+                gat = self._proto_relative_url(location.get('gat'), 'http:')
+                gcp = location.get('gcp')
+                ogn = location.get('ogn')
+                if None in (gat, gcp, ogn):
+                    continue
+                token_data = {
+                    'gcp': gcp,
+                    'ogn': ogn,
+                    'sta': 0,
+                }
+                media = self._download_json(
+                    gat, video_id, data=json.dumps(token_data).encode('utf-8'),
+                    headers={
+                        'Content-Type': 'application/json;charset=utf-8',
+                        'Referer': url,
+                    }, fatal=False) or {}
+                stream = media.get('stream') or media.get('file')
+                if not stream:
+                    continue
+                ext = determine_ext(stream)
+                if ext == 'f4m':
+                    formats.extend(self._extract_f4m_formats(
+                        stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
+                        video_id, f4m_id='hds', fatal=False))
+                elif ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        stream, video_id, 'mp4', 'm3u8_native',
+                        m3u8_id='hls', fatal=False))
+        self._sort_formats(formats)
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': content.get('dataPoster') or config.get('poster', {}).get('imageUrl'),
+            'duration': duration,
+        }
+
     def _real_extract(self, url):
         display_id = self._match_id(url)
         webpage = self._download_webpage(url, display_id)
-        title = self._html_search_meta(
-            ['og:title', 'twitter:title'], webpage, 'title')
-        info = self._get_player_info(url, webpage)
+        article = self._parse_json(self._search_regex(
+            r'window\.\$REACTBASE_STATE\.article\s*=\s*({.+})',
+            webpage, 'article'), display_id)['article']
+        title = article.get('title')
+        description = clean_html(article.get('leadParagraph'))
+        if article.get('editorialType') != 'VID':
+            entries = []
+            for p in article.get('body', []):
+                content = p.get('content')
+                if p.get('type') != 'video' or not content:
+                    continue
+                entries.append(self._parse_content(content, url))
+            return self.playlist_result(
+                entries, str_or_none(article.get('id')), title, description)
+        content = article['opening']['content']
+        info = self._parse_content(content, url)
         info.update({
-            'display_id': display_id,
-            'title': title,
-            'description': self._html_search_meta(
-                ['og:description', 'twitter:description'],
-                webpage, 'title', fatal=False),
+            'description': description,
         })
         return info

youtube_dl/extractor/tf1.py

@@ -19,6 +19,7 @@ class TF1IE(InfoExtractor):
             # Sometimes wat serves the whole file with the --test option
             'skip_download': True,
         },
+        'expected_warnings': ['HTTP Error 404'],
     }, {
         'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html',
         'info_dict': {

youtube_dl/extractor/theplatform.py

@@ -32,13 +32,15 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns})

 class ThePlatformBaseIE(OnceIE):
+    _TP_TLD = 'com'
+
     def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'):
         meta = self._download_xml(
             smil_url, video_id, note=note, query={'format': 'SMIL'},
             headers=self.geo_verification_headers())
         error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src')
         if error_element is not None and error_element.attrib['src'].startswith(
-                'http://link.theplatform.com/s/errorFiles/Unavailable.'):
+                'http://link.theplatform.%s/s/errorFiles/Unavailable.' % self._TP_TLD):
             raise ExtractorError(error_element.attrib['abstract'], expected=True)

         smil_formats = self._parse_smil_formats(

@@ -66,7 +68,7 @@ class ThePlatformBaseIE(OnceIE):
         return formats, subtitles

     def _download_theplatform_metadata(self, path, video_id):
-        info_url = 'http://link.theplatform.com/s/%s?format=preview' % path
+        info_url = 'http://link.theplatform.%s/s/%s?format=preview' % (self._TP_TLD, path)
         return self._download_json(info_url, video_id)

     def _parse_theplatform_metadata(self, info):

@@ -308,7 +310,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):

 class ThePlatformFeedIE(ThePlatformBaseIE):
     _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&%s'
-    _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*(?P<filter>by(?:Gui|I)d=(?P<id>[\w-]+))'
+    _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*(?P<filter>by(?:Gui|I)d=(?P<id>[^&]+))'
     _TESTS = [{
         # From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207
         'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207',

@@ -325,6 +327,9 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
             'categories': ['MSNBC/Issues/Democrats', 'MSNBC/Issues/Elections/Election 2016'],
             'uploader': 'NBCU-NEWS',
         },
+    }, {
+        'url': 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byGuid=nn_netcast_180306.Copy.01',
+        'only_matching': True,
     }]

     def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}, account_id=None):

youtube_dl/extractor/turner.py

@@ -15,6 +15,7 @@ from ..utils import (
     update_url_query,
     ExtractorError,
     strip_or_none,
+    url_or_none,
 )

@@ -154,8 +155,8 @@ class TurnerBaseIE(AdobePassIE):
         subtitles = {}
         for source in video_data.findall('closedCaptions/source'):
             for track in source.findall('track'):
-                track_url = track.get('url')
-                if not isinstance(track_url, compat_str) or track_url.endswith('/big'):
+                track_url = url_or_none(track.get('url'))
+                if not track_url or track_url.endswith('/big'):
                     continue
                 lang = track.get('lang') or track.get('label') or 'en'
                 subtitles.setdefault(lang, []).append({

youtube_dl/extractor/tvnet.py

@@ -4,10 +4,10 @@ from __future__ import unicode_literals
 import re

 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     int_or_none,
     unescapeHTML,
+    url_or_none,
 )

@@ -106,9 +106,8 @@ class TVNetIE(InfoExtractor):
         for stream in self._download_json(data_file, video_id):
             if not isinstance(stream, dict):
                 continue
-            stream_url = stream.get('url')
-            if (stream_url in stream_urls or not stream_url or
-                    not isinstance(stream_url, compat_str)):
+            stream_url = url_or_none(stream.get('url'))
+            if stream_url in stream_urls or not stream_url:
                 continue
             stream_urls.add(stream_url)
             formats.extend(self._extract_m3u8_formats(

youtube_dl/extractor/tvnow.py

@@ -19,8 +19,8 @@ class TVNowBaseIE(InfoExtractor):
     _VIDEO_FIELDS = (
         'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort',
         'broadcastStartDate', 'isDrm', 'duration', 'season', 'episode',
-        'manifest.dashclear', 'format.title', 'format.defaultImage169Format',
-        'format.defaultImage169Logo')
+        'manifest.dashclear', 'manifest.hlsclear', 'manifest.smoothclear',
+        'format.title', 'format.defaultImage169Format', 'format.defaultImage169Logo')

     def _call_api(self, path, video_id, query):
         return self._download_json(

@@ -31,27 +31,42 @@ class TVNowBaseIE(InfoExtractor):
         video_id = compat_str(info['id'])
         title = info['title']

-        mpd_url = info['manifest']['dashclear']
-        if not mpd_url:
+        paths = []
+        for manifest_url in (info.get('manifest') or {}).values():
+            if not manifest_url:
+                continue
+            manifest_url = update_url_query(manifest_url, {'filter': ''})
+            path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path')
+            if path in paths:
+                continue
+            paths.append(path)
+
+            def url_repl(proto, suffix):
+                return re.sub(
+                    r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub(
+                        r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)',
+                        '.ism/' + suffix, manifest_url))
+
+            formats = self._extract_mpd_formats(
+                url_repl('dash', '.mpd'), video_id,
+                mpd_id='dash', fatal=False)
+            formats.extend(self._extract_ism_formats(
+                url_repl('hss', 'Manifest'),
+                video_id, ism_id='mss', fatal=False))
+            formats.extend(self._extract_m3u8_formats(
+                url_repl('hls', '.m3u8'), video_id, 'mp4',
+                'm3u8_native', m3u8_id='hls', fatal=False))
+            if formats:
+                break
+        else:
             if info.get('isDrm'):
                 raise ExtractorError(
                     'Video %s is DRM protected' % video_id, expected=True)
             if info.get('geoblocked'):
-                raise ExtractorError(
-                    'Video %s is not available from your location due to geo restriction' % video_id,
-                    expected=True)
+                raise self.raise_geo_restricted()
             if not info.get('free', True):
                 raise ExtractorError(
                     'Video %s is not available for free' % video_id, expected=True)

-        mpd_url = update_url_query(mpd_url, {'filter': ''})
-        formats = self._extract_mpd_formats(mpd_url, video_id, mpd_id='dash', fatal=False)
-        formats.extend(self._extract_ism_formats(
-            mpd_url.replace('dash.', 'hss.').replace('/.mpd', '/Manifest'),
-            video_id, ism_id='mss', fatal=False))
-        formats.extend(self._extract_m3u8_formats(
-            mpd_url.replace('dash.', 'hls.').replace('/.mpd', '/.m3u8'),
-            video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
         self._sort_formats(formats)

         description = info.get('articleLong') or info.get('articleShort')

@@ -88,7 +103,7 @@ class TVNowBaseIE(InfoExtractor):
 class TVNowIE(TVNowBaseIE):
     _VALID_URL = r'''(?x)
                     https?://
-                        (?:www\.)?tvnow\.(?:de|at|ch)/[^/]+/
+                        (?:www\.)?tvnow\.(?:de|at|ch)/(?P<station>[^/]+)/
                         (?P<show_id>[^/]+)/
                         (?!(?:list|jahr)(?:/|$))(?P<id>[^/?\#&]+)
                     '''

@@ -140,11 +155,13 @@ class TVNowIE(TVNowBaseIE):
     }]

     def _real_extract(self, url):
-        display_id = '%s/%s' % re.match(self._VALID_URL, url).groups()
+        mobj = re.match(self._VALID_URL, url)
+        display_id = '%s/%s' % mobj.group(2, 3)

         info = self._call_api(
             'movies/' + display_id, display_id, query={
                 'fields': ','.join(self._VIDEO_FIELDS),
+                'station': mobj.group(1),
             })

         return self._extract_video(info, display_id)

youtube_dl/extractor/tvplay.py

@@ -19,6 +19,7 @@ from ..utils import (
     try_get,
     unsmuggle_url,
     update_url_query,
+    url_or_none,
 )

@@ -255,7 +256,8 @@ class TVPlayIE(InfoExtractor):
         quality = qualities(['hls', 'medium', 'high'])
         formats = []
         for format_id, video_url in streams.get('streams', {}).items():
-            if not video_url or not isinstance(video_url, compat_str):
+            video_url = url_or_none(video_url)
+            if not video_url:
                 continue
             ext = determine_ext(video_url)
             if ext == 'f4m':

youtube_dl/extractor/twitch.py

@@ -27,6 +27,7 @@ from ..utils import (
     unified_timestamp,
     update_url_query,
     urlencode_postdata,
+    url_or_none,
     urljoin,
 )

@@ -663,8 +664,8 @@ class TwitchClipsIE(TwitchBaseIE):
         for option in status['quality_options']:
             if not isinstance(option, dict):
                 continue
-            source = option.get('source')
-            if not source or not isinstance(source, compat_str):
+            source = url_or_none(option.get('source'))
+            if not source:
                 continue
             formats.append({
                 'url': source,

youtube_dl/extractor/udemy.py

@@ -20,6 +20,7 @@ from ..utils import (
     sanitized_Request,
     try_get,
     unescapeHTML,
+    url_or_none,
     urlencode_postdata,
 )

@@ -265,8 +266,8 @@ class UdemyIE(InfoExtractor):
             if not isinstance(source_list, list):
                 return
             for source in source_list:
-                video_url = source.get('file') or source.get('src')
-                if not video_url or not isinstance(video_url, compat_str):
+                video_url = url_or_none(source.get('file') or source.get('src'))
+                if not video_url:
                     continue
                 if source.get('type') == 'application/x-mpegURL' or determine_ext(video_url) == 'm3u8':
                     formats.extend(self._extract_m3u8_formats(

@@ -293,8 +294,8 @@ class UdemyIE(InfoExtractor):
                     continue
                 if track.get('kind') != 'captions':
                     continue
-                src = track.get('src')
-                if not src or not isinstance(src, compat_str):
+                src = url_or_none(track.get('src'))
+                if not src:
                     continue
                 lang = track.get('language') or track.get(
                     'srclang') or track.get('label')

@@ -314,8 +315,8 @@ class UdemyIE(InfoExtractor):
             for cc in captions:
                 if not isinstance(cc, dict):
                     continue
-                cc_url = cc.get('url')
-                if not cc_url or not isinstance(cc_url, compat_str):
+                cc_url = url_or_none(cc.get('url'))
+                if not cc_url:
                     continue
                 lang = try_get(cc, lambda x: x['locale']['locale'], compat_str)
                 sub_dict = (automatic_captions if cc.get('source') == 'auto'

youtube_dl/extractor/vgtv.py

@@ -24,6 +24,7 @@ class VGTVIE(XstreamIE):
         'aftenposten.no/webtv': 'aptv',
         'ap.vgtv.no/webtv': 'aptv',
         'tv.aftonbladet.se/abtv': 'abtv',
+        'www.aftonbladet.se/tv': 'abtv',
     }

     _APP_NAME_TO_VENDOR = {

@@ -44,7 +45,7 @@ class VGTVIE(XstreamIE):
                     (?:
                         (?:\#!/)?(?:video|live)/|
                         embed?.*id=|
-                        articles/
+                        a(?:rticles)?/
                     )|
                     (?P<appname>
                         %s

@@ -143,6 +144,10 @@ class VGTVIE(XstreamIE):
             'url': 'http://tv.aftonbladet.se/abtv/articles/36015',
             'only_matching': True,
         },
+        {
+            'url': 'https://www.aftonbladet.se/tv/a/36015',
+            'only_matching': True,
+        },
         {
             'url': 'abtv:140026',
             'only_matching': True,

@@ -178,13 +183,15 @@ class VGTVIE(XstreamIE):

         streams = data['streamUrls']
         stream_type = data.get('streamType')
+        is_live = stream_type == 'live'

         formats = []

         hls_url = streams.get('hls')
         if hls_url:
             formats.extend(self._extract_m3u8_formats(
-                hls_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+                hls_url, video_id, 'mp4',
+                entry_protocol='m3u8' if is_live else 'm3u8_native',
+                m3u8_id='hls', fatal=False))

         hds_url = streams.get('hds')
         if hds_url:

@@ -229,13 +236,13 @@ class VGTVIE(XstreamIE):

         info.update({
             'id': video_id,
-            'title': self._live_title(data['title']) if stream_type == 'live' else data['title'],
+            'title': self._live_title(data['title']) if is_live else data['title'],
             'description': data['description'],
             'thumbnail': data['images']['main'] + '?t[]=900x506q80',
             'timestamp': data['published'],
             'duration': float_or_none(data['duration'], 1000),
             'view_count': data['displays'],
-            'is_live': True if stream_type == 'live' else False,
+            'is_live': is_live,
         })

         return info

youtube_dl/extractor/vidme.py

@@ -3,15 +3,13 @@ from __future__ import unicode_literals
 import itertools

 from .common import InfoExtractor
-from ..compat import (
-    compat_HTTPError,
-    compat_str,
-)
+from ..compat import compat_HTTPError
 from ..utils import (
     ExtractorError,
     int_or_none,
     float_or_none,
     parse_iso8601,
+    url_or_none,
 )

@@ -166,8 +164,8 @@ class VidmeIE(InfoExtractor):

         formats = []
         for f in video.get('formats', []):
-            format_url = f.get('uri')
-            if not format_url or not isinstance(format_url, compat_str):
+            format_url = url_or_none(f.get('uri'))
+            if not format_url:
                 continue
             format_type = f.get('type')
             if format_type == 'dash':

youtube_dl/extractor/vidzi.py

@@ -54,7 +54,8 @@ class VidziIE(InfoExtractor):
                 self._search_regex(
                     r'setup\(([^)]+)\)', code, 'jwplayer data',
                     default=NO_DEFAULT if num == len(codes) else '{}'),
-                video_id, transform_source=js_to_json)
+                video_id, transform_source=lambda s: js_to_json(
+                    re.sub(r'\s*\+\s*window\[.+?\]', '', s)))
             if jwplayer_data:
                 break

youtube_dl/extractor/vimeo.py

@@ -539,9 +539,10 @@ class VimeoIE(VimeoBaseInfoExtractor):
             # We try to find out to which variable is assigned the config dic
             m_variable_name = re.search(r'(\w)\.video\.id', webpage)
             if m_variable_name is not None:
-                config_re = r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))
+                config_re = [r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))]
             else:
                 config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});']
+            config_re.append(r'\bvar\s+r\s*=\s*({.+?})\s*;')
             config = self._search_regex(config_re, webpage, 'info section',
                                         flags=re.DOTALL)
             config = json.loads(config)

youtube_dl/extractor/viqeo.py (new file)

@@ -0,0 +1,99 @@
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    int_or_none,
    str_or_none,
    url_or_none,
)


class ViqeoIE(InfoExtractor):
    _VALID_URL = r'''(?x)
                        (?:
                            viqeo:|
                            https?://cdn\.viqeo\.tv/embed/*\?.*?\bvid=|
                            https?://api\.viqeo\.tv/v\d+/data/startup?.*?\bvideo(?:%5B%5D|\[\])=
                        )
                        (?P<id>[\da-f]+)
                    '''
    _TESTS = [{
        'url': 'https://cdn.viqeo.tv/embed/?vid=cde96f09d25f39bee837',
        'md5': 'a169dd1a6426b350dca4296226f21e76',
        'info_dict': {
            'id': 'cde96f09d25f39bee837',
            'ext': 'mp4',
            'title': 'cde96f09d25f39bee837',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 76,
        },
    }, {
        'url': 'viqeo:cde96f09d25f39bee837',
        'only_matching': True,
    }, {
        'url': 'https://api.viqeo.tv/v1/data/startup?video%5B%5D=71bbec412ade45c3216c&profile=112',
        'only_matching': True,
    }]
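    # Lets the generic extractor discover Viqeo players embedded in
    # third-party pages by their iframe URLs.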
    @staticmethod
    def _extract_urls(webpage):
        return [
            mobj.group('url')
            for mobj in re.finditer(
                r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cdn\.viqeo\.tv/embed/*\?.*?\bvid=[\da-f]+.*?)\1',
                webpage)]

    def _real_extract(self, url):
        video_id = self._match_id(url)

        webpage = self._download_webpage(
            'https://cdn.viqeo.tv/embed/?vid=%s' % video_id, video_id)

        data = self._parse_json(
            self._search_regex(
                r'SLOT_DATA\s*=\s*({.+?})\s*;', webpage, 'slot data'),
            video_id)

        formats = []
        thumbnails = []
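        # SLOT_DATA mixes still images and audio/video renditions in a single
        # 'mediaFiles' list; dispatch on the MIME type's top-level kind.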
        for media_file in data['mediaFiles']:
            if not isinstance(media_file, dict):
                continue
            media_url = url_or_none(media_file.get('url'))
            if not media_url or not media_url.startswith(('http', '//')):
                continue
            media_type = str_or_none(media_file.get('type'))
            if not media_type:
                continue
            media_kind = media_type.split('/')[0].lower()
            f = {
                'url': media_url,
                'width': int_or_none(media_file.get('width')),
                'height': int_or_none(media_file.get('height')),
            }
            format_id = str_or_none(media_file.get('quality'))
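            # Images become thumbnail candidates; audio-only renditions are
            # marked vcodec 'none' so format selection treats them correctly.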
            if media_kind == 'image':
                f['id'] = format_id
                thumbnails.append(f)
            elif media_kind in ('video', 'audio'):
                is_audio = media_kind == 'audio'
                f.update({
                    'format_id': 'audio' if is_audio else format_id,
                    'fps': int_or_none(media_file.get('fps')),
                    'vcodec': 'none' if is_audio else None,
                })
                formats.append(f)
        self._sort_formats(formats)

        duration = int_or_none(data.get('duration'))

        return {
            'id': video_id,
            'title': video_id,
            'duration': duration,
            'thumbnails': thumbnails,
            'formats': formats,
        }

youtube_dl/extractor/viu.py

@@ -195,16 +195,29 @@ class ViuOTTIE(InfoExtractor):
         'skip': 'Geo-restricted to Hong Kong',
     }]

+    _AREA_ID = {
+        'HK': 1,
+        'SG': 2,
+        'TH': 4,
+        'PH': 5,
+    }
+
     def _real_extract(self, url):
         country_code, video_id = re.match(self._VALID_URL, url).groups()

+        query = {
+            'r': 'vod/ajax-detail',
+            'platform_flag_label': 'web',
+            'product_id': video_id,
+        }
+
+        area_id = self._AREA_ID.get(country_code.upper())
+        if area_id:
+            query['area_id'] = area_id
+
         product_data = self._download_json(
             'http://www.viu.com/ott/%s/index.php' % country_code, video_id,
-            'Downloading video info', query={
-                'r': 'vod/ajax-detail',
-                'platform_flag_label': 'web',
-                'product_id': video_id,
-            })['data']
+            'Downloading video info', query=query)['data']

         video_data = product_data.get('current_product')
         if not video_data:

@@ -214,6 +227,9 @@ class ViuOTTIE(InfoExtractor):
             'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code,
             video_id, 'Downloading stream info', query={
                 'ccs_product_id': video_data['ccs_product_id'],
+            }, headers={
+                'Referer': url,
+                'Origin': re.search(r'https?://[^/]+', url).group(0),
             })['data']['stream']

         stream_sizes = stream_data.get('size', {})

View File

@ -17,9 +17,11 @@ from ..utils import (
int_or_none, int_or_none,
orderedSet, orderedSet,
remove_start, remove_start,
str_or_none,
str_to_int, str_to_int,
unescapeHTML, unescapeHTML,
unified_timestamp, unified_timestamp,
url_or_none,
urlencode_postdata, urlencode_postdata,
) )
from .dailymotion import DailymotionIE from .dailymotion import DailymotionIE
@ -105,10 +107,10 @@ class VKIE(VKBaseIE):
'ext': 'mp4', 'ext': 'mp4',
'title': 'ProtivoGunz - Хуёвая песня', 'title': 'ProtivoGunz - Хуёвая песня',
'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*',
'uploader_id': '-77521',
'duration': 195, 'duration': 195,
'timestamp': 1329060660, 'timestamp': 1329049880,
'upload_date': '20120212', 'upload_date': '20120212',
'view_count': int,
}, },
}, },
{ {
@ -117,12 +119,12 @@ class VKIE(VKBaseIE):
'info_dict': { 'info_dict': {
'id': '165548505', 'id': '165548505',
'ext': 'mp4', 'ext': 'mp4',
'uploader': 'Tom Cruise',
'title': 'No name', 'title': 'No name',
'uploader': 'Tom Cruise',
'uploader_id': '205387401',
'duration': 9, 'duration': 9,
'timestamp': 1374374880, 'timestamp': 1374364108,
'upload_date': '20130721', 'upload_date': '20130720',
'view_count': int,
} }
}, },
{ {
@@ -206,10 +208,10 @@ class VKIE(VKBaseIE):
                 'id': 'V3K4mi0SYkc',
                 'ext': 'webm',
                 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate",
-                'description': 'md5:d9903938abdc74c738af77f527ca0596',
-                'duration': 178,
+                'description': 'md5:bf9c26cfa4acdfb146362682edd3827a',
+                'duration': 179,
                 'upload_date': '20130116',
-                'uploader': "Children's Joy Foundation",
+                'uploader': "Children's Joy Foundation Inc.",
                 'uploader_id': 'thecjf',
                 'view_count': int,
             },
@@ -221,6 +223,7 @@ class VKIE(VKBaseIE):
                 'id': 'k3lz2cmXyRuJQSjGHUv',
                 'ext': 'mp4',
                 'title': 'md5:d52606645c20b0ddbb21655adaa4f56f',
+                # TODO: fix test by fixing dailymotion description extraction
                 'description': 'md5:c651358f03c56f1150b555c26d90a0fd',
                 'uploader': 'AniLibria.Tv',
                 'upload_date': '20160914',
@@ -240,9 +243,12 @@ class VKIE(VKBaseIE):
                 'ext': 'mp4',
                 'title': 'S-Dance, репетиции к The way show',
                 'uploader': 'THE WAY SHOW | 17 апреля',
-                'timestamp': 1454870100,
+                'uploader_id': '-110305615',
+                'timestamp': 1454859345,
                 'upload_date': '20160207',
-                'view_count': int,
             },
+            'params': {
+                'skip_download': True,
+            },
         },
         {
@@ -295,7 +301,7 @@ class VKIE(VKBaseIE):
         video_id = mobj.group('videoid')
 
         if video_id:
-            info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id
+            info_url = 'https://vk.com/al_video.php?act=show_inline&al=1&video=' + video_id
             # Some videos (removed?) can only be downloaded with list id specified
             list_id = mobj.group('list_id')
             if list_id:
@@ -345,6 +351,9 @@ class VKIE(VKBaseIE):
 
             r'<!>This video is no longer available, because its author has been blocked.':
             'Video %s is no longer available, because its author has been blocked.',
+
+            r'<!>This video is no longer available, because it has been deleted.':
+            'Video %s is no longer available, because it has been deleted.',
         }
 
         for error_re, error_msg in ERRORS.items():
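
The new table entry follows the existing pattern: each key is a regular expression matched against the info page, and each value is a message with a %s slot for the video id. Roughly how such a table is consumed, per the loop shown at the end of the hunk above (a sketch; the extractor raises ExtractorError rather than a bare Exception):

    import re

    def check_errors(info_page, video_id, errors):
        for error_re, error_msg in errors.items():
            if re.search(error_re, info_page):
                # the real code raises ExtractorError here
                raise Exception(error_msg % video_id)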
@@ -393,7 +402,8 @@ class VKIE(VKBaseIE):
         if not data:
             data = self._parse_json(
                 self._search_regex(
-                    r'<!json>\s*({.+?})\s*<!>', info_page, 'json', default='{}'),
+                    [r'<!json>\s*({.+?})\s*<!>', r'<!json>\s*({.+})'],
+                    info_page, 'json', default='{}'),
                 video_id)
             if data:
                 data = data['player']['params'][0]
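
The extra pattern covers pages where the <!json> island is not terminated by a <!> marker. With both patterns tried in order, as _search_regex does when given a list (page payloads below are invented):

    import json
    import re

    pages = ('prefix <!json> {"player": 1} <!> suffix',  # terminated island
             'prefix <!json> {"player": 1}')             # island runs to end of page
    for page in pages:
        for pattern in (r'<!json>\s*({.+?})\s*<!>', r'<!json>\s*({.+})'):
            mobj = re.search(pattern, page)
            if mobj:
                print(json.loads(mobj.group(1)))  # {'player': 1} both times
                break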
@@ -415,7 +425,7 @@ class VKIE(VKBaseIE):
 
         timestamp = unified_timestamp(self._html_search_regex(
             r'class=["\']mv_info_date[^>]+>([^<]+)(?:<|from)', info_page,
-            'upload date', fatal=False))
+            'upload date', default=None)) or int_or_none(data.get('date'))
 
         view_count = str_to_int(self._search_regex(
             r'class=["\']mv_views_count[^>]+>\s*([\d,.]+)',
@@ -423,7 +433,8 @@ class VKIE(VKBaseIE):
 
         formats = []
         for format_id, format_url in data.items():
-            if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//', 'rtmp')):
+            format_url = url_or_none(format_url)
+            if not format_url or not format_url.startswith(('http', '//', 'rtmp')):
                 continue
             if (format_id.startswith(('url', 'cache')) or
                     format_id in ('extra_data', 'live_mp4', 'postlive_mp4')):
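
Several hunks in this merge swap ad-hoc isinstance checks for the url_or_none helper introduced in utils in this release cycle. Its behaviour is roughly the following (a sketch; the real helper lives in youtube_dl/utils.py and checks compat_str rather than str):

    import re

    def url_or_none(url):
        # returns the stripped URL when it looks like one, else None
        if not url or not isinstance(url, str):
            return None
        url = url.strip()
        return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None

    print(url_or_none(' https://vk.com/video.mp4 '))  # 'https://vk.com/video.mp4'
    print(url_or_none('rtmp://example.com/live'))     # 'rtmp://example.com/live'
    print(url_or_none({'bad': 'type'}))               # None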
@@ -452,9 +463,12 @@ class VKIE(VKBaseIE):
             'title': title,
             'thumbnail': data.get('jpg'),
             'uploader': data.get('md_author'),
+            'uploader_id': str_or_none(data.get('author_id')),
             'duration': data.get('duration'),
             'timestamp': timestamp,
             'view_count': view_count,
+            'like_count': int_or_none(data.get('liked')),
+            'dislike_count': int_or_none(data.get('nolikes')),
             'is_live': is_live,
         }
 

View File

@@ -57,7 +57,7 @@ class VLiveIE(InfoExtractor):
         video_id = self._match_id(url)
 
         webpage = self._download_webpage(
-            'http://www.vlive.tv/video/%s' % video_id, video_id)
+            'https://www.vlive.tv/video/%s' % video_id, video_id)
 
         VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)'
         VIDEO_PARAMS_FIELD = 'video params'
@@ -108,11 +108,11 @@ class VLiveIE(InfoExtractor):
 
     def _live(self, video_id, webpage):
         init_page = self._download_webpage(
-            'http://www.vlive.tv/video/init/view',
+            'https://www.vlive.tv/video/init/view',
             video_id, note='Downloading live webpage',
            data=urlencode_postdata({'videoSeq': video_id}),
             headers={
-                'Referer': 'http://www.vlive.tv/video/%s' % video_id,
+                'Referer': 'https://www.vlive.tv/video/%s' % video_id,
                 'Content-Type': 'application/x-www-form-urlencoded'
             })
 

View File

@@ -19,7 +19,6 @@ class WatIE(InfoExtractor):
     _TESTS = [
         {
             'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html',
-            'md5': '83d882d9de5c9d97f0bb2c6273cde56a',
             'info_dict': {
                 'id': '11713067',
                 'ext': 'mp4',
@@ -28,10 +27,15 @@ class WatIE(InfoExtractor):
                 'upload_date': '20140819',
                 'duration': 120,
             },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+            'expected_warnings': ['HTTP Error 404'],
         },
         {
             'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html',
-            'md5': '34bdfa5ca9fd3c7eb88601b635b0424c',
+            'md5': 'b16574df2c3cd1a36ca0098f2a791925',
             'info_dict': {
                 'id': '11713075',
                 'ext': 'mp4',
@@ -98,38 +102,25 @@ class WatIE(InfoExtractor):
         formats = []
         try:
+            alt_urls = lambda manifest_url: [re.sub(r'(?:wdv|ssm)?\.ism/', repl + '.ism/', manifest_url) for repl in ('', 'ssm')]
             manifest_urls = self._download_json(
                 'http://www.wat.tv/get/webhtml/' + video_id, video_id)
             m3u8_url = manifest_urls.get('hls')
             if m3u8_url:
                 m3u8_url = remove_bitrate_limit(m3u8_url)
-                m3u8_formats = self._extract_m3u8_formats(
-                    m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
-                if m3u8_formats:
-                    formats.extend(m3u8_formats)
+                for m3u8_alt_url in alt_urls(m3u8_url):
+                    formats.extend(self._extract_m3u8_formats(
+                        m3u8_alt_url, video_id, 'mp4',
+                        'm3u8_native', m3u8_id='hls', fatal=False))
                     formats.extend(self._extract_f4m_formats(
-                        m3u8_url.replace('ios', 'web').replace('.m3u8', '.f4m'),
+                        m3u8_alt_url.replace('ios', 'web').replace('.m3u8', '.f4m'),
                         video_id, f4m_id='hds', fatal=False))
-                    http_url = extract_url('android5/%s.mp4', 'http')
-                    if http_url:
-                        for m3u8_format in m3u8_formats:
-                            vbr, abr = m3u8_format.get('vbr'), m3u8_format.get('abr')
-                            if not vbr or not abr:
-                                continue
-                            format_id = m3u8_format['format_id'].replace('hls', 'http')
-                            fmt_url = re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url)
-                            if self._is_valid_url(fmt_url, video_id, format_id):
-                                f = m3u8_format.copy()
-                                f.update({
-                                    'url': fmt_url,
-                                    'format_id': format_id,
-                                    'protocol': 'http',
-                                })
-                                formats.append(f)
 
             mpd_url = manifest_urls.get('mpd')
             if mpd_url:
-                formats.extend(self._extract_mpd_formats(remove_bitrate_limit(
-                    mpd_url), video_id, mpd_id='dash', fatal=False))
+                mpd_url = remove_bitrate_limit(mpd_url)
+                for mpd_alt_url in alt_urls(mpd_url):
+                    formats.extend(self._extract_mpd_formats(
+                        mpd_alt_url, video_id, mpd_id='dash', fatal=False))
             self._sort_formats(formats)
         except ExtractorError:
             abr = 64
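
The alt_urls helper introduced above normalizes any wdv/ssm marker before the .ism/ path segment and emits both the bare and the ssm variant, so each manifest is retried under both names. What it actually yields for a hypothetical manifest URL:

    import re

    alt_urls = lambda manifest_url: [
        re.sub(r'(?:wdv|ssm)?\.ism/', repl + '.ism/', manifest_url)
        for repl in ('', 'ssm')]

    for u in alt_urls('http://vod.example.wat.tv/123wdv.ism/master.m3u8'):  # hypothetical URL
        print(u)
    # http://vod.example.wat.tv/123.ism/master.m3u8
    # http://vod.example.wat.tv/123ssm.ism/master.m3u8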

View File

@@ -67,11 +67,12 @@ class WatchBoxIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
 
-        source = self._parse_json(
+        source = (self._parse_json(
             self._search_regex(
-                r'(?s)source["\']?\s*:\s*({.+?})\s*[,}]', webpage, 'source',
+                r'playerConf\s*=\s*({.+?})\s*;', webpage, 'player config',
                 default='{}'),
-            video_id, transform_source=js_to_json, fatal=False) or {}
+            video_id, transform_source=js_to_json,
+            fatal=False) or {}).get('source') or {}
 
         video_id = compat_str(source.get('videoId') or video_id)
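
The source object moved: it now sits under a playerConf assignment rather than a bare source key, hence the new regex plus the extra .get('source') step. Against a hypothetical page snippet (real pages carry JavaScript object literals, which is why the extractor runs js_to_json first):

    import json
    import re

    webpage = 'var playerConf = {"source": {"videoId": 42, "hls": "https://example.com/x.m3u8"}} ;'
    conf = json.loads(re.search(r'playerConf\s*=\s*({.+?})\s*;', webpage).group(1))
    source = conf.get('source') or {}
    print(source['videoId'])  # 42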

View File

@@ -13,6 +13,7 @@ from ..utils import (
     parse_duration,
     try_get,
     unified_strdate,
+    url_or_none,
 )
@@ -137,7 +138,8 @@ class XHamsterIE(InfoExtractor):
                 else:
                     format_url = format_item
                     filesize = None
-                if not isinstance(format_url, compat_str):
+                format_url = url_or_none(format_url)
+                if not format_url:
                     continue
                 formats.append({
                     'format_id': '%s-%s' % (format_id, quality),
@@ -198,7 +200,8 @@ class XHamsterIE(InfoExtractor):
                     default='{}'),
                 video_id, fatal=False)
             for format_id, format_url in sources.items():
-                if not isinstance(format_url, compat_str):
+                format_url = url_or_none(format_url)
+                if not format_url:
                     continue
                 if format_url in format_urls:
                     continue
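
Both xhamster loops now funnel candidate URLs through url_or_none and then skip anything already collected, so malformed entries and duplicates share one rejection path. A sketch using a simplified stand-in for the helper (all URLs hypothetical):

    url_or_none = lambda u: u if isinstance(u, str) and u.startswith(('http://', 'https://')) else None  # simplified stand-in

    format_urls = {'https://example.com/v_480.mp4'}  # collected during the first pass
    sources = {
        '480p': 'https://example.com/v_480.mp4',  # duplicate -> skipped
        '720p': 'https://example.com/v_720.mp4',
        '1080p': None,                            # rejected by url_or_none
    }
    for format_id, format_url in sources.items():
        format_url = url_or_none(format_url)
        if not format_url or format_url in format_urls:
            continue
        print(format_id, format_url)  # only 720p survives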

View File

@@ -4,12 +4,12 @@ from __future__ import unicode_literals
 
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     int_or_none,
     qualities,
     unescapeHTML,
+    url_or_none,
 )
@@ -80,9 +80,9 @@ class YapFilesIE(InfoExtractor):
         formats = []
         for format_id in QUALITIES:
             is_hd = format_id == 'hd'
-            format_url = playlist.get(
-                'file%s' % ('_hd' if is_hd else ''))
-            if not format_url or not isinstance(format_url, compat_str):
+            format_url = url_or_none(playlist.get(
+                'file%s' % ('_hd' if is_hd else '')))
+            if not format_url:
                 continue
             formats.append({
                 'url': format_url,

Some files were not shown because too many files have changed in this diff.