diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index c5898701f..fc18e733b 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.13*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.13** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.09.03*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.09.03** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.07.13 +[debug] youtube-dl version 2016.09.03 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/AUTHORS b/AUTHORS index f762e8a16..c4bef040a 100644 --- a/AUTHORS +++ b/AUTHORS @@ -178,3 +178,8 @@ Artur Krysiak Jakub Adam Wieczorek Aleksandar Topuzović Nehal Patel +Rob van Bekkum +Petr Zvoníček +Pratyush Singh +Aleksander Nitecki +Sebastian Blunt diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 
fbf0ab7e8..95392030e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -46,7 +46,7 @@ Make sure that someone has not already opened the issue you're trying to open. S ### Why are existing options not enough? -Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/rg3/youtube-dl/blob/master/README.md#synopsis). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem. +Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/rg3/youtube-dl/blob/master/README.md#options). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem. ### Is there enough context in your bug report? diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 000000000..2809e55d7 --- /dev/null +++ b/ChangeLog @@ -0,0 +1,568 @@ +version <unreleased> + +Extractors +* [youjizz] Fix extraction (#10437) ++ [foxnews] Add support for FoxNews Insider (#10445) ++ [fc2] Recognize Flash player URLs (#10512) + + +version 2016.09.03 + +Core +* Restore usage of NAME attribute from EXT-X-MEDIA tag for formats codes in + _extract_m3u8_formats (#10522) +* Handle semicolon in mimetype2ext + + +Extractors ++ [youtube] Add support for rental videos' previews (#10532) +* [youtube:playlist] Fallback to video extraction for video/playlist URLs when + no playlist is actually served (#10537) ++ [drtv] Add support for dr.dk/nyheder (#10536) ++ [facebook:plugins:video] Add extractor (#10530) ++ [go] Add extractor for *.go.com sites +* [adobepass] Check for authz_token expiration (#10527) +* [nytimes] improve extraction +* [thestar] Fix extraction (#10465) +* [glide] Fix extraction (#10478) +- [exfm] Remove extractor (#10482) +* 
[youporn] Fix categories and tags extraction (#10521) ++ [curiositystream] Add extractor for app.curiositystream.com +- [thvideo] Remove extractor (#10464) +* [movingimage] Fix for the new site name (#10466) ++ [cbs] Add support for once formats (#10515) +* [limelight] Skip ism and duplicate manifests ++ [porncom] Extract categories and tags (#10510) ++ [facebook] Extract timestamp (#10508) ++ [yahoo] Extract more formats + + +version 2016.08.31 + +Extractors +* [soundcloud] Fix URL regular expression to avoid clashes with sets (#10505) +* [bandcamp:album] Fix title extraction (#10455) +* [pyvideo] Fix extraction (#10468) ++ [ctv] Add support for tsn.ca, bnn.ca and thecomedynetwork.ca (#10016) +* [9c9media] Extract more metadata +* [9c9media] Fix multiple stacks extraction (#10016) +* [adultswim] Improve video info extraction (#10492) +* [vodplatform] Improve embed regular expression +- [played] Remove extractor (#10470) ++ [tbs] Add extractor for tbs.com and tntdrama.com (#10222) ++ [cartoonnetwork] Add extractor for cartoonnetwork.com (#10110) +* [adultswim] Rework in terms of turner extractor +* [cnn] Rework in terms of turner extractor +* [nba] Rework in terms of turner extractor ++ [turner] Add base extractor for Turner Broadcasting System based sites +* [bilibili] Fix extraction (#10375) +* [openload] Fix extraction (#10408) + + +version 2016.08.28 + +Core ++ Add warning message that ffmpeg doesn't support SOCKS +* Improve thumbnail sorting ++ Extract formats from #EXT-X-MEDIA tags in _extract_m3u8_formats +* Fill IV with leading zeros for IVs shorter than 16 octets in hlsnative ++ Add ac-3 to the list of audio codecs in parse_codecs + +Extractors +* [periscope:user] Fix extraction (#10453) +* [douyutv] Fix extraction (#10153, #10318, #10444) ++ [nhk:vod] Add extractor for www3.nhk.or.jp on demand (#4437, #10424) +- [trutube] Remove extractor (#10438) ++ [usanetwork] Add extractor for usanetwork.com +* [crackle] Fix extraction (#10333) +* [spankbang] Fix 
description and uploader extraction (#10339) +* [discoverygo] Detect cable provider restricted videos (#10425) ++ [cbc] Add support for watch.cbc.ca +* [kickstarter] Silence the warning for og:description (#10415) +* [mtvservices:embedded] Fix extraction for the new 'edge' player (#10363) + + +version 2016.08.24.1 + +Extractors ++ [pluralsight] Add support for subtitles (#9681) + + +version 2016.08.24 + +Extractors +* [youtube] Fix authentication (#10392) +* [openload] Fix extraction (#10408) ++ [bravotv] Add support for Adobe Pass (#10407) +* [bravotv] Fix clip info extraction (#10407) +* [eagleplatform] Improve embedded videos detection (#10409) +* [awaan] Fix extraction +* [mtvservices:embedded] Update config URL ++ [abc:iview] Add extractor (#6148) + + +version 2016.08.22 + +Core +* Improve formats and subtitles extension auto calculation ++ Recognize full unit names in parse_filesize ++ Add support for m3u8 manifests in HTML5 multimedia tags +* Fix octal/hexadecimal number detection in js_to_json + +Extractors ++ [ivi] Add support for 720p and 1080p ++ [charlierose] Add new extractor (#10382) +* [1tv] Fix extraction (#9249) +* [twitch] Renew authentication +* [kaltura] Improve subtitles extension calculation ++ [zingmp3] Add support for video clips +* [zingmp3] Fix extraction (#10041) +* [kaltura] Improve subtitles extraction (#10279) +* [cultureunplugged] Fix extraction (#10330) ++ [cnn] Add support for money.cnn.com (#2797) +* [cbsnews] Fix extraction (#10362) +* [cbs] Fix extraction (#10393) ++ [litv] Support 'promo' URLs (#10385) +* [snotr] Fix extraction (#10338) +* [n-tv.de] Fix extraction (#10331) +* [globo:article] Relax URL and video id regular expressions (#10379) + + +version 2016.08.19 + +Core +- Remove output template description from --help +* Recognize lowercase units in parse_filesize + +Extractors ++ [porncom] Add extractor for porn.com (#2251, #10251) ++ [generic] Add support for DBTV embeds +* [vk:wallpost] Fix audio extraction for new site 
layout +* [vk] Fix authentication ++ [hgtvcom:show] Add extractor for hgtv.com shows (#10365) ++ [discoverygo] Add support for other GO network sites + + +version 2016.08.17 + +Core ++ Add _get_netrc_login_info + +Extractors +* [mofosex] Extract all formats (#10335) ++ [generic] Add support for vbox7 embeds ++ [vbox7] Add support for embed URLs ++ [viafree] Add extractor (#10358) ++ [mtg] Add support for viafree URLs (#10358) +* [theplatform] Extract all subtitles per language ++ [xvideos] Fix HLS extraction (#10356) ++ [amcnetworks] Add extractor ++ [bbc:playlist] Add support for pagination (#10349) ++ [fxnetworks] Add extractor (#9462) +* [cbslocal] Fix extraction for SendtoNews-based videos +* [sendtonews] Fix extraction +* [jwplatform] Extract video id from JWPlayer data +- [zippcast] Remove extractor (#10332) ++ [viceland] Add extractor (#8799) ++ [adobepass] Add base extractor for Adobe Pass Authentication +* [life:embed] Improve extraction +* [vgtv] Detect geo restricted videos (#10348) ++ [uplynk] Add extractor +* [xiami] Fix extraction (#10342) + + +version 2016.08.13 + +Core +* Show progress for curl external downloader +* Forward more options to curl external downloader + +Extractors +* [pbs] Fix description extraction +* [franceculture] Fix extraction (#10324) +* [pornotube] Fix extraction (#10322) +* [4tube] Fix metadata extraction (#10321) +* [imgur] Fix width and height extraction (#10325) +* [expotv] Improve extraction ++ [vbox7] Fix extraction (#10309) +- [tapely] Remove extractor (#10323) +* [muenchentv] Fix extraction (#10313) ++ [24video] Add support for .me and .xxx TLDs +* [24video] Fix comment count extraction +* [sunporno] Add support for embed URLs +* [sunporno] Fix metadata extraction (#10316) ++ [hgtv] Add extractor for hgtv.ca (#3999) +- [pbs] Remove request to unavailable API ++ [pbs] Add support for high quality HTTP formats ++ [crunchyroll] Add support for HLS formats (#10301) + + +version 2016.08.12 + +Core +* Subtitles are now 
written as is. Newline conversions are disabled. (#10268) ++ Recognize more formats in unified_timestamp + +Extractors +- [goldenmoustache] Remove extractor (#10298) +* [drtuber] Improve title extraction +* [drtuber] Make dislike count optional (#10297) +* [chirbit] Fix extraction (#10296) +* [francetvinfo] Relax URL regular expression +* [rtlnl] Relax URL regular expression (#10282) +* [formula1] Relax URL regular expression (#10283) +* [wat] Improve extraction (#10281) +* [ctsnews] Fix extraction + + +version 2016.08.10 + +Core +* Make --metadata-from-title non fatal when title does not match the pattern +* Introduce options for randomized sleep before each download + --min-sleep-interval and --max-sleep-interval (#9930) +* Respect default in _search_json_ld + +Extractors ++ [uol] Add extractor for uol.com.br (#4263) +* [rbmaradio] Fix extraction and extract all formats (#10242) ++ [sonyliv] Add extractor for sonyliv.com (#10258) +* [aparat] Fix extraction +* [cwtv] Extract HTTP formats ++ [rozhlas] Add extractor for prehravac.rozhlas.cz (#10253) +* [kuwo:singer] Fix extraction + + +version 2016.08.07 + +Core ++ Add support for TV Parental Guidelines ratings in parse_age_limit ++ Add decode_png (#9706) ++ Add support for partOfTVSeries in JSON-LD +* Lower master M3U8 manifest preference for better format sorting + +Extractors ++ [discoverygo] Add extractor (#10245) +* [flipagram] Make JSON-LD extraction non fatal +* [generic] Make JSON-LD extraction non fatal ++ [bbc] Add support for morph embeds (#10239) +* [tnaflixnetworkbase] Improve title extraction +* [tnaflix] Fix metadata extraction (#10249) +* [fox] Fix theplatform release URL query +* [openload] Fix extraction (#9706) +* [bbc] Skip duplicate manifest URLs +* [bbc] Improve format code ++ [bbc] Add support for DASH and F4M +* [bbc] Improve format sorting and listing +* [bbc] Improve playlist extraction ++ [pokemon] Add extractor (#10093) ++ [condenast] Add fallback scenario for video info extraction + + 
+version 2016.08.06 + +Core +* Add support for JSON-LD root list entries (#10203) +* Improve unified_timestamp +* Lower preference of RTSP formats in generic sorting ++ Add support for multiple properties in _og_search_property +* Improve password hiding from verbose output + +Extractors ++ [adultswim] Add support for trailers (#10235) +* [archiveorg] Improve extraction (#10219) ++ [jwplatform] Add support for playlists ++ [jwplatform] Add support for relative URLs +* [jwplatform] Improve audio detection ++ [tvplay] Capture and output native error message ++ [tvplay] Extract series metadata ++ [tvplay] Add support for subtitles (#10194) +* [tvp] Improve extraction (#7799) +* [cbslocal] Fix timestamp parsing (#10213) ++ [naver] Add support for subtitles (#8096) +* [naver] Improve extraction +* [condenast] Improve extraction +* [engadget] Relax URL regular expression +* [5min] Fix extraction ++ [nationalgeographic] Add support for Episode Guide ++ [kaltura] Add support for subtitles +* [kaltura] Optimize network requests ++ [vodplatform] Add extractor for vod-platform.net +- [gamekings] Remove extractor +* [limelight] Extract HTTP formats +* [ntvru] Fix extraction ++ [comedycentral] Re-add :tds and :thedailyshow shortnames + + +version 2016.08.01 + +Fixed/improved extractors +- [yandexmusic:track] Adapt to changes in track location JSON (#10193) +- [bloomberg] Support another form of player (#10187) +- [limelight] Skip DRM protected videos +- [safari] Relax regular expressions for URL matching (#10202) +- [cwtv] Add support for cwtvpr.com (#10196) + + +version 2016.07.30 + +Fixed/improved extractors +- [twitch:clips] Sort formats +- [tv2] Use m3u8_native +- [tv2:article] Fix video detection (#10188) +- rtve (#10076) +- [dailymotion:playlist] Optimize download archive processing (#10180) + + +version 2016.07.28 + +Fixed/improved extractors +- shared (#10170) +- soundcloud (#10179) +- twitch (#9767) + + +version 2016.07.26.2 + +Fixed/improved extractors +- smotri +- 
camdemy +- mtv +- comedycentral +- cmt +- cbc +- mgtv +- orf + + +version 2016.07.24 + +New extractors +- arkena (#8682) +- lcp (#8682) + +Fixed/improved extractors +- facebook (#10151) +- dailymail +- telegraaf +- dcn +- onet +- tvp + +Miscellaneous +- Support $Time$ in DASH manifests + + +version 2016.07.22 + +New extractors +- odatv (#9285) + +Fixed/improved extractors +- bbc +- youjizz (#10131) +- youtube (#10140) +- pornhub (#10138) +- eporner (#10139) + + +version 2016.07.17 + +New extractors +- nintendo (#9986) +- streamable (#9122) + +Fixed/improved extractors +- ard (#10095) +- mtv +- comedycentral (#10101) +- viki (#10098) +- spike (#10106) + +Miscellaneous +- Improved twitter player detection (#10090) + + +version 2016.07.16 + +New extractors +- ninenow (#5181) + +Fixed/improved extractors +- rtve (#10076) +- brightcove +- 3qsdn +- syfy (#9087, #3820, #2388) +- youtube (#10083) + +Miscellaneous +- Fix subtitle embedding for video-only and audio-only files (#10081) + + +version 2016.07.13 + +New extractors +- rudo + +Fixed/improved extractors +- biobiochiletv +- tvplay +- dbtv +- brightcove +- tmz +- youtube (#10059) +- shahid (#10062) +- vk +- ellentv (#10067) + + +version 2016.07.11 + +New Extractors +- roosterteeth (#9864) + +Fixed/improved extractors +- miomio (#9605) +- vuclip +- youtube +- vidzi (#10058) + + +version 2016.07.09.2 + +Fixed/improved extractors +- vimeo (#1638) +- facebook (#10048) +- lynda (#10047) +- animeondemand + +Fixed/improved features +- Embedding subtitles no longer throws an error with problematic inputs (#9063) + + +version 2016.07.09.1 + +Fixed/improved extractors +- youtube +- ard +- srmediatek (#9373) + + +version 2016.07.09 + +New extractors +- Flipagram (#9898) + +Fixed/improved extractors +- telecinco +- toutv +- radiocanada +- tweakers (#9516) +- lynda +- nick (#7542) +- polskieradio (#10028) +- le +- facebook (#9851) +- mgtv +- animeondemand (#10031) + +Fixed/improved features +- `--postprocessor-args` and 
`--downloader-args` now accept non-ASCII inputs + on non-Windows systems + + +version 2016.07.07 + +New extractors +- kamcord (#10001) + +Fixed/improved extractors +- spiegel (#10018) +- metacafe (#8539, #3253) +- onet (#9950) +- francetv (#9955) +- brightcove (#9965) +- daum (#9972) + + +version 2016.07.06 + +Fixed/improved extractors +- youtube (#10007, #10009) +- xuite +- stitcher +- spiegel +- slideshare +- sandia +- rtvnh +- prosiebensat1 +- onionstudios + + +version 2016.07.05 + +Fixed/improved extractors +- brightcove +- yahoo (#9995) +- pornhub (#9997) +- iqiyi +- kaltura (#5557) +- la7 +- Changed features +- Rename --cn-verification-proxy to --geo-verification-proxy +Miscellaneous +- Add script for displaying downloads statistics + + +version 2016.07.03.1 + +Fixed/improved extractors +- theplatform +- aenetworks +- nationalgeographic +- hrti (#9482) +- facebook (#5701) +- buzzfeed (#5701) +- rai (#8617, #9157, #9232, #8552, #8551) +- nationalgeographic (#9991) +- iqiyi + + +version 2016.07.03 + +New extractors +- hrti (#9482) + +Fixed/improved extractors +- vk (#9981) +- facebook (#9938) +- xtube (#9953, #9961) + + +version 2016.07.02 + +New extractors +- fusion (#9958) + +Fixed/improved extractors +- twitch (#9975) +- vine (#9970) +- periscope (#9967) +- pornhub (#8696) + + +version 2016.07.01 + +New extractors +- 9c9media +- ctvnews (#2156) +- ctv (#4077) + +Fixed/Improved extractors +- rds +- meta (#8789) +- pornhub (#9964) +- sixplay (#2183) + +New features +- Accept quoted strings across multiple lines (#9940) diff --git a/Makefile b/Makefile index 6ee4ba4eb..354052c50 100644 --- a/Makefile +++ b/Makefile @@ -94,7 +94,7 @@ _EXTRACTOR_FILES != find youtube_dl/extractor -iname '*.py' -and -not -iname 'la youtube_dl/extractor/lazy_extractors.py: devscripts/make_lazy_extractors.py devscripts/lazy_load_template.py $(_EXTRACTOR_FILES) $(PYTHON) devscripts/make_lazy_extractors.py $@ -youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 
youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish +youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish ChangeLog @tar -czf youtube-dl.tar.gz --transform "s|^|youtube-dl/|" --owner 0 --group 0 \ --exclude '*.DS_Store' \ --exclude '*.kate-swp' \ @@ -107,7 +107,7 @@ youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash- --exclude 'docs/_build' \ -- \ bin devscripts test youtube_dl docs \ - LICENSE README.md README.txt \ + ChangeLog LICENSE README.md README.txt \ Makefile MANIFEST.in youtube-dl.1 youtube-dl.bash-completion \ youtube-dl.zsh youtube-dl.fish setup.py \ youtube-dl diff --git a/README.md b/README.md index 44332ea9a..87465aa5e 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ youtube-dl - download videos from youtube.com or other video platforms To install it right away for all UNIX users (Linux, OS X, etc.), type: - sudo curl -L https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl + sudo curl -L https://yt-dl.org/downloads/latest/youtube-dl -o /usr/local/bin/youtube-dl sudo chmod a+rx /usr/local/bin/youtube-dl If you do not have curl, you can alternatively use a recent wget: @@ -201,32 +201,8 @@ which means you can modify it, redistribute it or use it however you like. -a, --batch-file FILE File containing URLs to download ('-' for stdin) --id Use only video ID in file name - -o, --output TEMPLATE Output filename template. 
Use %(title)s to - get the title, %(uploader)s for the - uploader name, %(uploader_id)s for the - uploader nickname if different, - %(autonumber)s to get an automatically - incremented number, %(ext)s for the - filename extension, %(format)s for the - format description (like "22 - 1280x720" or - "HD"), %(format_id)s for the unique id of - the format (like YouTube's itags: "137"), - %(upload_date)s for the upload date - (YYYYMMDD), %(extractor)s for the provider - (youtube, metacafe, etc), %(id)s for the - video id, %(playlist_title)s, - %(playlist_id)s, or %(playlist)s (=title if - present, ID otherwise) for the playlist the - video is in, %(playlist_index)s for the - position in the playlist. %(height)s and - %(width)s for the width and height of the - video format. %(resolution)s for a textual - description of the resolution of the video - format. %% for a literal percent. Use - to - output to stdout. Can also be used to - download to a different directory, for - example with -o '/my/downloads/%(uploader)s - /%(title)s-%(id)s.%(ext)s' . + -o, --output TEMPLATE Output filename template, see the "OUTPUT + TEMPLATE" for all the info --autonumber-size NUMBER Specify the number of digits in %(autonumber)s when it is present in output filename template or --auto-number option @@ -330,7 +306,15 @@ which means you can modify it, redistribute it or use it however you like. bidirectional text support. Requires bidiv or fribidi executable in PATH --sleep-interval SECONDS Number of seconds to sleep before each - download. + download when used alone or a lower bound + of a range for randomized sleep before each + download (minimum possible number of + seconds to sleep) when used along with + --max-sleep-interval. + --max-sleep-interval SECONDS Upper bound of a range for randomized sleep + before each download (maximum possible + number of seconds to sleep). Must only be + used along with --min-sleep-interval. 
## Video Format Options: -f, --format FORMAT Video format code, see the "FORMAT @@ -428,11 +412,19 @@ You can configure youtube-dl by placing any supported command line option to a c For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime, use a proxy and save all videos under `Movies` directory in your home directory: ``` --x ---no-mtime ---proxy 127.0.0.1:3128 --o ~/Movies/%(title)s.%(ext)s # Lines starting with # are comments + +# Always extract audio +-x + +# Do not copy the mtime +--no-mtime + +# Use this proxy +--proxy 127.0.0.1:3128 + +# Save all videos under Movies directory in your home directory +-o ~/Movies/%(title)s.%(ext)s ``` Note that options in configuration file are just the same options aka switches used in regular command line calls thus there **must be no whitespace** after `-` or `--`, e.g. `-o` or `--proxy` but not `- o` or `-- proxy`. @@ -661,7 +653,11 @@ $ youtube-dl -f 'best[filesize<50M]' # Download best format available via direct link over HTTP/HTTPS protocol $ youtube-dl -f '(bestvideo+bestaudio/best)[protocol^=http]' + +# Download the best video format and the best audio format without merging them +$ youtube-dl -f 'bestvideo,bestaudio' -o '%(title)s.f%(format_id)s.%(ext)s' ``` +Note that in the last example, an output template is recommended as bestvideo and bestaudio may have the same file name. # VIDEO SELECTION @@ -742,7 +738,7 @@ Videos or video formats streamed via RTMP protocol can only be downloaded when [ ### I have downloaded a video but how can I play it? -Once the video is fully downloaded, use any video player, such as [mpv](https://mpv.io/), [vlc](http://www.videolan.org) or [mplayer](http://www.mplayerhq.hu/). +Once the video is fully downloaded, use any video player, such as [mpv](https://mpv.io/), [vlc](http://www.videolan.org/) or [mplayer](http://www.mplayerhq.hu/). 
### I extracted a video URL with `-g`, but it does not play on another machine / in my webbrowser. @@ -824,10 +820,32 @@ Either prepend `http://www.youtube.com/watch?v=` or separate the ID from the opt ### How do I pass cookies to youtube-dl? -Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows, `LF` (`\n`) for Linux and `CR` (`\r`) for Mac OS. `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. +Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. + +In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [Export Cookies](https://addons.mozilla.org/en-US/firefox/addon/export-cookies/) (for Firefox). + +Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows, `LF` (`\n`) for Linux and `CR` (`\r`) for Mac OS. `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. Passing cookies to youtube-dl is a good way to workaround login when a particular extractor does not implement it explicitly. 
Another use case is working around [CAPTCHA](https://en.wikipedia.org/wiki/CAPTCHA) some websites require you to solve in particular cases in order to get access (e.g. YouTube, CloudFlare). +### How do I stream directly to media player? + +You will first need to tell youtube-dl to stream media to stdout with `-o -`, and also tell your media player to read from stdin (it must be capable of this for streaming) and then pipe the former to the latter. For example, streaming to [vlc](http://www.videolan.org/) can be achieved with: + + youtube-dl -o - "http://www.youtube.com/watch?v=BaW_jenozKcj" | vlc - + +### How do I download only new videos from a playlist? + +Use the download-archive feature. With this feature you should initially download the complete playlist with `--download-archive /path/to/download/archive/file.txt` that will record identifiers of all the videos in a special file. Each subsequent run with the same `--download-archive` will download only new videos and skip all videos that have been downloaded before. Note that only successful downloads are recorded in the file. + +For example, at first, + + youtube-dl --download-archive archive.txt "https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re" + +will download the complete `PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re` playlist and create a file `archive.txt`. Each subsequent run will only download new videos, if any: + + youtube-dl --download-archive archive.txt "https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re" + ### Can you add support for this anime video site, or site which shows current movies for free? As a matter of policy (as well as legality), youtube-dl does not include support for services that specialize in infringing copyright. As a rule of thumb, if you cannot easily find a video that the service is quite obviously allowed to distribute (i.e. 
that has been uploaded by the creator, the creator's distributor, or is published under a free license), the service is probably unfit for inclusion to youtube-dl. @@ -1196,7 +1214,7 @@ Make sure that someone has not already opened the issue you're trying to open. S ### Why are existing options not enough? -Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/rg3/youtube-dl/blob/master/README.md#synopsis). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem. +Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/rg3/youtube-dl/blob/master/README.md#options). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem. ### Is there enough context in your bug report? diff --git a/devscripts/prepare_manpage.py b/devscripts/prepare_manpage.py index e3f6339b5..ce548739f 100644 --- a/devscripts/prepare_manpage.py +++ b/devscripts/prepare_manpage.py @@ -54,17 +54,21 @@ def filter_options(readme): if in_options: if line.lstrip().startswith('-'): - option, description = re.split(r'\s{2,}', line.lstrip()) - split_option = option.split(' ') + split = re.split(r'\s{2,}', line.lstrip()) + # Description string may start with `-` as well. If there is + # only one piece then it's a description bit not an option. + if len(split) > 1: + option, description = split + split_option = option.split(' ') - if not split_option[-1].startswith('-'): # metavar - option = ' '.join(split_option[:-1] + ['*%s*' % split_option[-1]]) + if not split_option[-1].startswith('-'): # metavar + option = ' '.join(split_option[:-1] + ['*%s*' % split_option[-1]]) - # Pandoc's definition_lists. 
See http://pandoc.org/README.html - # for more information. - ret += '\n%s\n: %s\n' % (option, description) - else: - ret += line.lstrip() + '\n' + # Pandoc's definition_lists. See http://pandoc.org/README.html + # for more information. + ret += '\n%s\n: %s\n' % (option, description) + continue + ret += line.lstrip() + '\n' else: ret += line + '\n' diff --git a/devscripts/release.sh b/devscripts/release.sh index f8d466ba8..ca6ae1b49 100755 --- a/devscripts/release.sh +++ b/devscripts/release.sh @@ -71,9 +71,12 @@ fi /bin/echo -e "\n### Changing version in version.py..." sed -i "s/__version__ = '.*'/__version__ = '$version'/" youtube_dl/version.py +/bin/echo -e "\n### Changing version in ChangeLog..." +sed -i "s/<unreleased>/$version/" ChangeLog + /bin/echo -e "\n### Committing documentation, templates and youtube_dl/version.py..." make README.md CONTRIBUTING.md .github/ISSUE_TEMPLATE.md supportedsites -git add README.md CONTRIBUTING.md .github/ISSUE_TEMPLATE.md docs/supportedsites.md youtube_dl/version.py +git add README.md CONTRIBUTING.md .github/ISSUE_TEMPLATE.md docs/supportedsites.md youtube_dl/version.py ChangeLog git commit $gpg_sign_commits -m "release $version" /bin/echo -e "\n### Now tagging, signing and pushing..." 
diff --git a/devscripts/show-downloads-statistics.py b/devscripts/show-downloads-statistics.py index b591d3fc9..e25d28411 100644 --- a/devscripts/show-downloads-statistics.py +++ b/devscripts/show-downloads-statistics.py @@ -1,6 +1,7 @@ #!/usr/bin/env python from __future__ import unicode_literals +import itertools import json import os import re @@ -21,21 +22,26 @@ def format_size(bytes): total_bytes = 0 -releases = json.loads(compat_urllib_request.urlopen( - 'https://api.github.com/repos/rg3/youtube-dl/releases').read().decode('utf-8')) +for page in itertools.count(1): + releases = json.loads(compat_urllib_request.urlopen( + 'https://api.github.com/repos/rg3/youtube-dl/releases?page=%s' % page + ).read().decode('utf-8')) -for release in releases: - compat_print(release['name']) - for asset in release['assets']: - asset_name = asset['name'] - total_bytes += asset['download_count'] * asset['size'] - if all(not re.match(p, asset_name) for p in ( - r'^youtube-dl$', - r'^youtube-dl-\d{4}\.\d{2}\.\d{2}(?:\.\d+)?\.tar\.gz$', - r'^youtube-dl\.exe$')): - continue - compat_print( - ' %s size: %s downloads: %d' - % (asset_name, format_size(asset['size']), asset['download_count'])) + if not releases: + break + + for release in releases: + compat_print(release['name']) + for asset in release['assets']: + asset_name = asset['name'] + total_bytes += asset['download_count'] * asset['size'] + if all(not re.match(p, asset_name) for p in ( + r'^youtube-dl$', + r'^youtube-dl-\d{4}\.\d{2}\.\d{2}(?:\.\d+)?\.tar\.gz$', + r'^youtube-dl\.exe$')): + continue + compat_print( + ' %s size: %s downloads: %d' + % (asset_name, format_size(asset['size']), asset['download_count'])) compat_print('total downloads traffic: %s' % format_size(total_bytes)) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 380f93ed6..015332bca 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -13,8 +13,12 @@ - **5min** - **8tracks** - **91porn** + - **9c9media** + - **9c9media:stack** 
- **9gag** + - **9now.com.au** - **abc.net.au** + - **abc.net.au:iview** - **Abc7News** - **abcnews** - **abcnews:video** @@ -34,6 +38,7 @@ - **AlJazeera** - **Allocine** - **AlphaPorno** + - **AMCNetworks** - **AnimeOnDemand** - **anitube.se** - **AnySex** @@ -45,6 +50,7 @@ - **archive.org**: archive.org videos - **ARD** - **ARD:mediathek** + - **Arkena** - **arte.tv** - **arte.tv:+7** - **arte.tv:cinema** @@ -63,6 +69,10 @@ - **audiomack** - **audiomack:album** - **auroravid**: AuroraVid + - **AWAAN** + - **awaan:live** + - **awaan:season** + - **awaan:video** - **Azubu** - **AzubuLive** - **BaiduVideo**: 百度视频 @@ -107,8 +117,11 @@ - **Canvas** - **CarambaTV** - **CarambaTVPage** - - **CBC** - - **CBCPlayer** + - **CartoonNetwork** + - **cbc.ca** + - **cbc.ca:player** + - **cbc.ca:watch** + - **cbc.ca:watch:video** - **CBS** - **CBSInteractive** - **CBSLocal** @@ -118,6 +131,7 @@ - **CDA** - **CeskaTelevize** - **channel9**: Channel 9 + - **CharlieRose** - **Chaturbate** - **Chilloutzone** - **chirbit** @@ -140,7 +154,8 @@ - **CollegeRama** - **ComCarCoff** - **ComedyCentral** - - **ComedyCentralShows**: The Daily Show / The Colbert Report + - **ComedyCentralShortname** + - **ComedyCentralTV** - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED - **Coub** - **Cracked** @@ -156,6 +171,8 @@ - **CTVNews** - **culturebox.francetvinfo.fr** - **CultureUnplugged** + - **curiositystream** + - **curiositystream:collection** - **CWTV** - **DailyMail** - **dailymotion** @@ -167,10 +184,6 @@ - **daum.net:playlist** - **daum.net:user** - **DBTV** - - **DCN** - - **dcn:live** - - **dcn:season** - - **dcn:video** - **DctpTv** - **DeezerPlaylist** - **defense.gouv.fr** @@ -179,6 +192,7 @@ - **DigitallySpeaking** - **Digiteka** - **Discovery** + - **DiscoveryGo** - **Dotsub** - 
**DouyuTV**: 斗鱼 - **DPlay** @@ -211,11 +225,11 @@ - **EsriVideo** - **Europa** - **EveryonesMixtape** - - **exfm**: ex.fm - **ExpoTV** - **ExtremeTube** - **EyedoTV** - **facebook** + - **FacebookPluginsVideo** - **faz.net** - **fc2** - **Fczenit** @@ -234,7 +248,6 @@ - **FoxSports** - **france2.fr:generation-quoi** - **FranceCulture** - - **FranceCultureEmission** - **FranceInter** - **francetv**: France 2, 3, 4, 5 and Ô - **francetvinfo.fr** @@ -244,8 +257,8 @@ - **Funimation** - **FunnyOrDie** - **Fusion** + - **FXNetworks** - **GameInformer** - - **Gamekings** - **GameOne** - **gameone:playlist** - **Gamersyde** @@ -260,9 +273,9 @@ - **Glide**: Glide mobile video messages (glide.me) - **Globo** - **GloboArticle** + - **Go** - **GodTube** - **GodTV** - - **GoldenMoustache** - **Golem** - **GoogleDrive** - **Goshgay** @@ -275,6 +288,8 @@ - **HellPorno** - **Helsinki**: helsinki.fi - **HentaiStigma** + - **HGTV** + - **hgtv.com:show** - **HistoricFilms** - **history:topic**: History.com Topic - **hitbox** @@ -335,6 +350,8 @@ - **kuwo:song**: 酷我音乐 - **la7.it** - **Laola1Tv** + - **Lcp** + - **LcpPlay** - **Le**: 乐视网 - **Learnr** - **Lecture2Go** @@ -392,11 +409,12 @@ - **MovieClips** - **MovieFap** - **Moviezine** + - **MovingImage** - **MPORA** - **MSN** + - **mtg**: MTG services - **MTV** - **mtv.de** - - **mtviggy.com** - **mtvservices:embedded** - **MuenchenTV**: münchen.tv - **MusicPlayOn** @@ -412,7 +430,8 @@ - **MyVidster** - **n-tv.de** - **natgeo** - - **natgeo:channel** + - **natgeo:episodeguide** + - **natgeo:video** - **Naver** - **NBA** - **NBC** @@ -436,9 +455,9 @@ - **Newstube** - **NextMedia**: 蘋果日報 - **NextMediaActionNews**: 蘋果日報 - 動新聞 - - **nextmovie.com** - **nfb**: National Film Board of Canada - **nfl.com** + - **NhkVod** - **nhl.com** - **nhl.com:news**: NHL news - **nhl.com:videocenter** @@ -447,7 +466,7 @@ - **nick.de** - **niconico**: ニコニコ動画 - **NiconicoPlaylist** - - **NineCNineMedia** + - **Nintendo** - **njoy**: N-JOY - **njoy:embed** - 
**Noco** @@ -475,6 +494,7 @@ - **NYTimes** - **NYTimesArticle** - **ocw.mit.edu** + - **OdaTV** - **Odnoklassniki** - **OktoberfestTV** - **on.aol.com** @@ -503,7 +523,6 @@ - **Pinkbike** - **Pladform** - **play.fm** - - **played.to** - **PlaysTV** - **Playtvak**: Playtvak.cz, iDNES.cz and Lidovky.cz - **Playvid** @@ -513,7 +532,9 @@ - **plus.google**: Google Plus - **pluzz.francetv.fr** - **podomatic** + - **Pokemon** - **PolskieRadio** + - **PornCom** - **PornHd** - **PornHub**: PornHub and Thumbzilla - **PornHubPlaylist** @@ -557,6 +578,7 @@ - **RoosterTeeth** - **RottenTomatoes** - **Roxwel** + - **Rozhlas** - **RTBF** - **rte**: Raidió Teilifís Éireann TV - **rte:radio**: Raidió Teilifís Éireann radio @@ -567,6 +589,7 @@ - **rtve.es:alacarta**: RTVE a la carta - **rtve.es:infantil**: RTVE infantil - **rtve.es:live**: RTVE.es live streams + - **rtve.es:television** - **RTVNH** - **Rudo** - **RUHD** @@ -613,6 +636,7 @@ - **smotri:user**: Smotri.com user videos - **Snotr** - **Sohu** + - **SonyLIV** - **soundcloud** - **soundcloud:playlist** - **soundcloud:search**: Soundcloud search @@ -639,10 +663,10 @@ - **sr:mediathek**: Saarländischer Rundfunk - **SRGSSR** - **SRGSSRPlay**: srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites - - **SSA** - **stanfordoc**: Stanford Open ClassRoom - **Steam** - **Stitcher** + - **Streamable** - **streamcloud.eu** - **StreamCZ** - **StreetVoice** @@ -654,8 +678,8 @@ - **SztvHu** - **Tagesschau** - **tagesschau:player** - - **Tapely** - **Tass** + - **TBS** - **TDSLifeway** - **teachertube**: teachertube.com videos - **teachertube:user:collection**: teachertube.com user and collection videos @@ -681,8 +705,6 @@ - **TheStar** - **ThisAmericanLife** - **ThisAV** - - **THVideo** - - **THVideoPlaylist** - **tinypic**: tinypic.com videos - **tlc.de** - **TMZ** @@ -690,13 +712,13 @@ - **TNAFlix** - **TNAFlixNetworkEmbed** - **toggle** + - **Tosh**: Tosh.0 - **tou.tv** - **Toypics**: Toypics user profile - **ToypicsUser**: 
Toypics user profile - **TrailerAddict** (Currently broken) - **Trilulilu** - **trollvids** - - **TruTube** - **Tube8** - **TubiTv** - **tudou** @@ -719,9 +741,8 @@ - **tvigle**: Интернет-телевидение Tvigle.ru - **tvland.com** - **tvp**: Telewizja Polska + - **tvp:embed**: Telewizja Polska - **tvp:series** - - **TVPlay**: TV3Play and related services - - **tvple** - **Tweakers** - **twitch:chapter** - **twitch:clips** @@ -737,8 +758,12 @@ - **udemy:course** - **UDNEmbed**: 聯合影音 - **Unistra** + - **uol.com.br** + - **uplynk** + - **uplynk:preplay** - **Urort**: NRK P3 Urørt - **URPlay** + - **USANetwork** - **USAToday** - **ustream** - **ustream:channel** @@ -754,7 +779,9 @@ - **VevoPlaylist** - **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet - **vh1.com** + - **Viafree** - **Vice** + - **Viceland** - **ViceShow** - **Vidbit** - **Viddler** @@ -799,6 +826,7 @@ - **vk:wallpost** - **vlive** - **Vodlocker** + - **VODPlatform** - **VoiceRepublic** - **VoxMedia** - **Vporn** @@ -873,6 +901,4 @@ - **Zapiks** - **ZDF** - **ZDFChannel** - - **zingmp3:album**: mp3.zing.vn albums - - **zingmp3:song**: mp3.zing.vn songs - - **ZippCast** + - **zingmp3**: mp3.zing.vn diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 88e8ff904..a98305c74 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -48,6 +48,9 @@ class TestInfoExtractor(unittest.TestCase): self.assertEqual(ie._og_search_property('foobar', html), 'Foo') self.assertEqual(ie._og_search_property('test1', html), 'foo > < bar') self.assertEqual(ie._og_search_property('test2', html), 'foo >//< bar') + self.assertEqual(ie._og_search_property(('test0', 'test1'), html), 'foo > < bar') + self.assertRaises(RegexNotFoundError, ie._og_search_property, 'test0', html, None, fatal=True) + self.assertRaises(RegexNotFoundError, ie._og_search_property, ('test0', 'test00'), html, None, fatal=True) def test_html_search_meta(self): ie = self.ie diff --git a/test/test_YoutubeDL.py 
b/test/test_YoutubeDL.py index ca25025e2..0dfe25c00 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -335,6 +335,40 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], f1['format_id']) + def test_audio_only_extractor_format_selection(self): + # For extractors with incomplete formats (all formats are audio-only or + # video-only) best and worst should fallback to corresponding best/worst + # video-only or audio-only formats (as per + # https://github.com/rg3/youtube-dl/pull/5556) + formats = [ + {'format_id': 'low', 'ext': 'mp3', 'preference': 1, 'vcodec': 'none', 'url': TEST_URL}, + {'format_id': 'high', 'ext': 'mp3', 'preference': 2, 'vcodec': 'none', 'url': TEST_URL}, + ] + info_dict = _make_result(formats) + + ydl = YDL({'format': 'best'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'high') + + ydl = YDL({'format': 'worst'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'low') + + def test_format_not_available(self): + formats = [ + {'format_id': 'regular', 'ext': 'mp4', 'height': 360, 'url': TEST_URL}, + {'format_id': 'video', 'ext': 'mp4', 'height': 720, 'acodec': 'none', 'url': TEST_URL}, + ] + info_dict = _make_result(formats) + + # This must fail since complete video-audio format does not match filter + # and extractor does not provide incomplete only formats (i.e. only + # video-only or audio-only). 
+ ydl = YDL({'format': 'best[height>360]'}) + self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + def test_invalid_format_specs(self): def assert_syntax_error(format_spec): ydl = YDL({'format': format_spec}) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 1f6079c29..cd1cd4b24 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -101,8 +101,6 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch(':ytsubs', ['youtube:subscriptions']) self.assertMatch(':ytsubscriptions', ['youtube:subscriptions']) self.assertMatch(':ythistory', ['youtube:history']) - self.assertMatch(':thedailyshow', ['ComedyCentralShows']) - self.assertMatch(':tds', ['ComedyCentralShows']) def test_vimeo_matching(self): self.assertMatch('https://vimeo.com/channels/tributes', ['vimeo:channel']) diff --git a/test/test_utils.py b/test/test_utils.py index 2273b5a10..405c5d351 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -39,9 +39,11 @@ from youtube_dl.utils import ( is_html, js_to_json, limit_length, + mimetype2ext, ohdave_rsa_encrypt, OnDemandPagedList, orderedSet, + parse_age_limit, parse_duration, parse_filesize, parse_count, @@ -308,6 +310,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(unified_timestamp('25-09-2014'), 1411603200) self.assertEqual(unified_timestamp('27.02.2016 17:30'), 1456594200) self.assertEqual(unified_timestamp('UNKNOWN DATE FORMAT'), None) + self.assertEqual(unified_timestamp('May 16, 2016 11:15 PM'), 1463440500) def test_determine_ext(self): self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4') @@ -431,6 +434,20 @@ class TestUtil(unittest.TestCase): url_basename('http://media.w3.org/2010/05/sintel/trailer.mp4'), 'trailer.mp4') + def test_parse_age_limit(self): + self.assertEqual(parse_age_limit(None), None) + self.assertEqual(parse_age_limit(False), None) + self.assertEqual(parse_age_limit('invalid'), None) + self.assertEqual(parse_age_limit(0), 0) + 
self.assertEqual(parse_age_limit(18), 18) + self.assertEqual(parse_age_limit(21), 21) + self.assertEqual(parse_age_limit(22), None) + self.assertEqual(parse_age_limit('18'), 18) + self.assertEqual(parse_age_limit('18+'), 18) + self.assertEqual(parse_age_limit('PG-13'), 13) + self.assertEqual(parse_age_limit('TV-14'), 14) + self.assertEqual(parse_age_limit('TV-MA'), 17) + def test_parse_duration(self): self.assertEqual(parse_duration(None), None) self.assertEqual(parse_duration(False), None) @@ -609,6 +626,14 @@ class TestUtil(unittest.TestCase): limit_length('foo bar baz asd', 12).startswith('foo bar')) self.assertTrue('...' in limit_length('foo bar baz asd', 12)) + def test_mimetype2ext(self): + self.assertEqual(mimetype2ext(None), None) + self.assertEqual(mimetype2ext('video/x-flv'), 'flv') + self.assertEqual(mimetype2ext('application/x-mpegURL'), 'm3u8') + self.assertEqual(mimetype2ext('text/vtt'), 'vtt') + self.assertEqual(mimetype2ext('text/vtt;charset=utf-8'), 'vtt') + self.assertEqual(mimetype2ext('text/html; charset=utf-8'), 'html') + def test_parse_codecs(self): self.assertEqual(parse_codecs(''), {}) self.assertEqual(parse_codecs('avc1.77.30, mp4a.40.2'), { @@ -696,6 +721,9 @@ class TestUtil(unittest.TestCase): inp = '''{"foo":101}''' self.assertEqual(js_to_json(inp), '''{"foo":101}''') + inp = '''{"duration": "00:01:07"}''' + self.assertEqual(js_to_json(inp), '''{"duration": "00:01:07"}''') + def test_js_to_json_edgecases(self): on = js_to_json("{abc_def:'1\\'\\\\2\\\\\\'3\"4'}") self.assertEqual(json.loads(on), {"abc_def": "1'\\2\\'3\"4"}) @@ -801,7 +829,10 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_filesize('2 MiB'), 2097152) self.assertEqual(parse_filesize('5 GB'), 5000000000) self.assertEqual(parse_filesize('1.2Tb'), 1200000000000) + self.assertEqual(parse_filesize('1.2tb'), 1200000000000) self.assertEqual(parse_filesize('1,24 KB'), 1240) + self.assertEqual(parse_filesize('1,24 kb'), 1240) + self.assertEqual(parse_filesize('8.5 
megabytes'), 8500000) def test_parse_count(self): self.assertEqual(parse_count(None), None) @@ -952,6 +983,7 @@ The first line self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'), ['--proxy', '127.0.0.1:3128']) self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), []) self.assertEqual(cli_option({}, '--proxy', 'proxy'), []) + self.assertEqual(cli_option({'retries': 10}, '--retries', 'retries'), ['--retries', '10']) def test_cli_valueless_option(self): self.assertEqual(cli_valueless_option( diff --git a/test/test_verbose_output.py b/test/test_verbose_output.py new file mode 100644 index 000000000..96a66f7a0 --- /dev/null +++ b/test/test_verbose_output.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python +# coding: utf-8 + +from __future__ import unicode_literals + +import unittest + +import sys +import os +import subprocess +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +rootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + + +class TestVerboseOutput(unittest.TestCase): + def test_private_info_arg(self): + outp = subprocess.Popen( + [ + sys.executable, 'youtube_dl/__main__.py', '-v', + '--username', 'johnsmith@gmail.com', + '--password', 'secret', + ], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + sout, serr = outp.communicate() + self.assertTrue(b'--username' in serr) + self.assertTrue(b'johnsmith' not in serr) + self.assertTrue(b'--password' in serr) + self.assertTrue(b'secret' not in serr) + + def test_private_info_shortarg(self): + outp = subprocess.Popen( + [ + sys.executable, 'youtube_dl/__main__.py', '-v', + '-u', 'johnsmith@gmail.com', + '-p', 'secret', + ], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + sout, serr = outp.communicate() + self.assertTrue(b'-u' in serr) + self.assertTrue(b'johnsmith' not in serr) + self.assertTrue(b'-p' in serr) + self.assertTrue(b'secret' not in serr) + + def test_private_info_eq(self): + outp = subprocess.Popen( 
+ [ + sys.executable, 'youtube_dl/__main__.py', '-v', + '--username=johnsmith@gmail.com', + '--password=secret', + ], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + sout, serr = outp.communicate() + self.assertTrue(b'--username' in serr) + self.assertTrue(b'johnsmith' not in serr) + self.assertTrue(b'--password' in serr) + self.assertTrue(b'secret' not in serr) + + def test_private_info_shortarg_eq(self): + outp = subprocess.Popen( + [ + sys.executable, 'youtube_dl/__main__.py', '-v', + '-u=johnsmith@gmail.com', + '-p=secret', + ], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + sout, serr = outp.communicate() + self.assertTrue(b'-u' in serr) + self.assertTrue(b'johnsmith' not in serr) + self.assertTrue(b'-p' in serr) + self.assertTrue(b'secret' not in serr) + +if __name__ == '__main__': + unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index ba72ec6f3..805733fb7 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -5,6 +5,7 @@ from __future__ import absolute_import, unicode_literals import collections import contextlib +import copy import datetime import errno import fileinput @@ -248,7 +249,16 @@ class YoutubeDL(object): source_address: (Experimental) Client-side IP address to bind to. call_home: Boolean, true iff we are allowed to contact the youtube-dl servers for debugging. - sleep_interval: Number of seconds to sleep before each download. + sleep_interval: Number of seconds to sleep before each download when + used alone or a lower bound of a range for randomized + sleep before each download (minimum possible number + of seconds to sleep) when used along with + max_sleep_interval. + max_sleep_interval:Upper bound of a range for randomized sleep before each + download (maximum possible number of seconds to sleep). + Must only be used along with sleep_interval. + Actual sleep time will be a random float from range + [sleep_interval; max_sleep_interval]. 
listformats: Print an overview of available video formats and exit. list_thumbnails: Print a table of all thumbnails and exit. match_filter: A function that gets called with the info_dict of @@ -1051,9 +1061,9 @@ class YoutubeDL(object): if isinstance(selector, list): fs = [_build_selector_function(s) for s in selector] - def selector_function(formats): + def selector_function(ctx): for f in fs: - for format in f(formats): + for format in f(ctx): yield format return selector_function elif selector.type == GROUP: @@ -1061,17 +1071,17 @@ class YoutubeDL(object): elif selector.type == PICKFIRST: fs = [_build_selector_function(s) for s in selector.selector] - def selector_function(formats): + def selector_function(ctx): for f in fs: - picked_formats = list(f(formats)) + picked_formats = list(f(ctx)) if picked_formats: return picked_formats return [] elif selector.type == SINGLE: format_spec = selector.selector - def selector_function(formats): - formats = list(formats) + def selector_function(ctx): + formats = list(ctx['formats']) if not formats: return if format_spec == 'all': @@ -1084,9 +1094,10 @@ class YoutubeDL(object): if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] if audiovideo_formats: yield audiovideo_formats[format_idx] - # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format - elif (all(f.get('acodec') != 'none' for f in formats) or - all(f.get('vcodec') != 'none' for f in formats)): + # for extractors with incomplete formats (audio only (soundcloud) + # or video only (imgur)) we will fallback to best/worst + # {video,audio}-only format + elif ctx['incomplete_formats']: yield formats[format_idx] elif format_spec == 'bestaudio': audio_formats = [ @@ -1160,17 +1171,18 @@ class YoutubeDL(object): } video_selector, audio_selector = map(_build_selector_function, selector.selector) - def selector_function(formats): - formats = list(formats) - for pair in itertools.product(video_selector(formats), 
audio_selector(formats)): + def selector_function(ctx): + for pair in itertools.product( + video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))): yield _merge(pair) filters = [self._build_format_filter(f) for f in selector.filters] - def final_selector(formats): + def final_selector(ctx): + ctx_copy = copy.deepcopy(ctx) for _filter in filters: - formats = list(filter(_filter, formats)) - return selector_function(formats) + ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats'])) + return selector_function(ctx_copy) return final_selector stream = io.BytesIO(format_spec.encode('utf-8')) @@ -1244,8 +1256,10 @@ class YoutubeDL(object): info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}] if thumbnails: thumbnails.sort(key=lambda t: ( - t.get('preference'), t.get('width'), t.get('height'), - t.get('id'), t.get('url'))) + t.get('preference') if t.get('preference') is not None else -1, + t.get('width') if t.get('width') is not None else -1, + t.get('height') if t.get('height') is not None else -1, + t.get('id') if t.get('id') is not None else '', t.get('url'))) for i, t in enumerate(thumbnails): t['url'] = sanitize_url(t['url']) if t.get('width') and t.get('height'): @@ -1287,7 +1301,7 @@ class YoutubeDL(object): for subtitle_format in subtitle: if subtitle_format.get('url'): subtitle_format['url'] = sanitize_url(subtitle_format['url']) - if 'ext' not in subtitle_format: + if subtitle_format.get('ext') is None: subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() if self.params.get('listsubtitles', False): @@ -1342,7 +1356,7 @@ class YoutubeDL(object): note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '', ) # Automatically determine file extension if missing - if 'ext' not in format: + if format.get('ext') is None: format['ext'] = determine_ext(format['url']).lower() # Automatically determine protocol if missing (useful for format # selection purposes) @@ -1377,7 +1391,34 @@ class 
YoutubeDL(object): req_format_list.append('best') req_format = '/'.join(req_format_list) format_selector = self.build_format_selector(req_format) - formats_to_download = list(format_selector(formats)) + + # While in format selection we may need to have an access to the original + # format set in order to calculate some metrics or do some processing. + # For now we need to be able to guess whether original formats provided + # by extractor are incomplete or not (i.e. whether extractor provides only + # video-only or audio-only formats) for proper formats selection for + # extractors with such incomplete formats (see + # https://github.com/rg3/youtube-dl/pull/5556). + # Since formats may be filtered during format selection and may not match + # the original formats the results may be incorrect. Thus original formats + # or pre-calculated metrics should be passed to format selection routines + # as well. + # We will pass a context object containing all necessary additional data + # instead of just formats. + # This fixes incorrect format selection issue (see + # https://github.com/rg3/youtube-dl/issues/10083). 
+ incomplete_formats = ( + # All formats are video-only or + all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) or + # all formats are audio-only + all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)) + + ctx = { + 'formats': formats, + 'incomplete_formats': incomplete_formats, + } + + formats_to_download = list(format_selector(ctx)) if not formats_to_download: raise ExtractorError('requested format not available', expected=True) @@ -1564,7 +1605,9 @@ class YoutubeDL(object): self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format)) else: self.to_screen('[info] Writing video subtitles to: ' + sub_filename) - with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: + # Use newline='' to prevent conversion of newline characters + # See https://github.com/rg3/youtube-dl/issues/10268 + with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile: subfile.write(sub_data) except (OSError, IOError): self.report_error('Cannot write subtitles file ' + sub_filename) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 2b34bf9c2..42128272a 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -145,6 +145,16 @@ def _real_main(argv=None): if numeric_limit is None: parser.error('invalid max_filesize specified') opts.max_filesize = numeric_limit + if opts.sleep_interval is not None: + if opts.sleep_interval < 0: + parser.error('sleep interval must be positive or 0') + if opts.max_sleep_interval is not None: + if opts.max_sleep_interval < 0: + parser.error('max sleep interval must be positive or 0') + if opts.max_sleep_interval < opts.sleep_interval: + parser.error('max sleep interval must be greater than or equal to min sleep interval') + else: + opts.max_sleep_interval = opts.sleep_interval def parse_retries(retries): if retries in ('inf', 'infinite'): @@ -308,6 +318,7 @@ def _real_main(argv=None): 'nooverwrites': 
opts.nooverwrites, 'retries': opts.retries, 'fragment_retries': opts.fragment_retries, + 'skip_unavailable_fragments': opts.skip_unavailable_fragments, 'buffersize': opts.buffersize, 'noresizebuffer': opts.noresizebuffer, 'continuedl': opts.continue_dl, @@ -370,6 +381,7 @@ def _real_main(argv=None): 'source_address': opts.source_address, 'call_home': opts.call_home, 'sleep_interval': opts.sleep_interval, + 'max_sleep_interval': opts.max_sleep_interval, 'external_downloader': opts.external_downloader, 'list_thumbnails': opts.list_thumbnails, 'playlist_items': opts.playlist_items, diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 1dba9f49a..8482cbd84 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -4,6 +4,7 @@ import os import re import sys import time +import random from ..compat import compat_os_name from ..utils import ( @@ -342,8 +343,11 @@ class FileDownloader(object): }) return True - sleep_interval = self.params.get('sleep_interval') - if sleep_interval: + min_sleep_interval = self.params.get('sleep_interval') + if min_sleep_interval: + max_sleep_interval = self.params.get('max_sleep_interval', min_sleep_interval) + print(min_sleep_interval, max_sleep_interval) + sleep_interval = random.uniform(min_sleep_interval, max_sleep_interval) self.to_screen('[download] Sleeping %s seconds...' 
% sleep_interval) time.sleep(sleep_interval) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 8bbab9dbc..efeae02a3 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -38,6 +38,7 @@ class DashSegmentsFD(FragmentFD): segments_filenames = [] fragment_retries = self.params.get('fragment_retries', 0) + skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) def append_url_to_file(target_url, tmp_filename, segment_name): target_filename = '%s-%s' % (tmp_filename, segment_name) @@ -52,26 +53,30 @@ class DashSegmentsFD(FragmentFD): down.close() segments_filenames.append(target_sanitized) break - except (compat_urllib_error.HTTPError, ) as err: + except compat_urllib_error.HTTPError as err: # YouTube may often return 404 HTTP error for a fragment causing the # whole download to fail. However if the same fragment is immediately # retried with the same request data this usually succeeds (1-2 attemps # is usually enough) thus allowing to download the whole file successfully. - # So, we will retry all fragments that fail with 404 HTTP error for now. - if err.code != 404: - raise - # Retry fragment + # To be future-proof we will retry all fragments that fail with any + # HTTP error. 
count += 1 if count <= fragment_retries: - self.report_retry_fragment(segment_name, count, fragment_retries) + self.report_retry_fragment(err, segment_name, count, fragment_retries) if count > fragment_retries: + if skip_unavailable_fragments: + self.report_skip_fragment(segment_name) + return True self.report_error('giving up after %s fragment retries' % fragment_retries) return False + return True if initialization_url: - append_url_to_file(initialization_url, ctx['tmpfilename'], 'Init') + if not append_url_to_file(initialization_url, ctx['tmpfilename'], 'Init'): + return False for i, segment_url in enumerate(segment_urls): - append_url_to_file(segment_url, ctx['tmpfilename'], 'Seg%d' % i) + if not append_url_to_file(segment_url, ctx['tmpfilename'], 'Seg%d' % i): + return False self._finish_frag_download(ctx) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index fae245024..0aeae3b8f 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -96,6 +96,12 @@ class CurlFD(ExternalFD): cmd = [self.exe, '--location', '-o', tmpfilename] for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] + cmd += self._bool_option('--continue-at', 'continuedl', '-', '0') + cmd += self._valueless_option('--silent', 'noprogress') + cmd += self._valueless_option('--verbose', 'verbose') + cmd += self._option('--limit-rate', 'ratelimit') + cmd += self._option('--retry', 'retries') + cmd += self._option('--max-filesize', 'max_filesize') cmd += self._option('--interface', 'source_address') cmd += self._option('--proxy', 'proxy') cmd += self._valueless_option('--insecure', 'nocheckcertificate') @@ -103,6 +109,16 @@ class CurlFD(ExternalFD): cmd += ['--', info_dict['url']] return cmd + def _call_downloader(self, tmpfilename, info_dict): + cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)] + + self._debug_cmd(cmd) + + # curl writes the progress to stderr so don't 
capture it. + p = subprocess.Popen(cmd) + p.communicate() + return p.returncode + class AxelFD(ExternalFD): AVAILABLE_OPT = '-V' @@ -204,6 +220,12 @@ class FFmpegFD(ExternalFD): if proxy: if not re.match(r'^[\da-zA-Z]+://', proxy): proxy = 'http://%s' % proxy + + if proxy.startswith('socks'): + self.report_warning( + '%s does not support SOCKS proxies. Downloading is likely to fail. ' + 'Consider adding --hls-prefer-native to your command.' % self.get_basename()) + # Since December 2015 ffmpeg supports -http_proxy option (see # http://git.videolan.org/?p=ffmpeg.git;a=commit;h=b4eb1f29ebddd60c41a2eb39f5af701e38e0d3fd) # We could switch to the following code if we are able to detect version properly diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index ba903ae10..84aacf7db 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -6,6 +6,7 @@ import time from .common import FileDownloader from .http import HttpFD from ..utils import ( + error_to_compat_str, encodeFilename, sanitize_open, ) @@ -22,13 +23,19 @@ class FragmentFD(FileDownloader): Available options: - fragment_retries: Number of times to retry a fragment for HTTP error (DASH only) + fragment_retries: Number of times to retry a fragment for HTTP error (DASH + and hlsnative only) + skip_unavailable_fragments: + Skip unavailable fragments (DASH and hlsnative only) """ - def report_retry_fragment(self, fragment_name, count, retries): + def report_retry_fragment(self, err, fragment_name, count, retries): self.to_screen( - '[download] Got server HTTP error. Retrying fragment %s (attempt %d of %s)...' - % (fragment_name, count, self.format_retries(retries))) + '[download] Got server HTTP error: %s. Retrying fragment %s (attempt %d of %s)...' + % (error_to_compat_str(err), fragment_name, count, self.format_retries(retries))) + + def report_skip_fragment(self, fragment_name): + self.to_screen('[download] Skipping fragment %s...' 
% fragment_name) def _prepare_and_start_frag_download(self, ctx): self._prepare_frag_download(ctx) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 3b7bb3508..5d70abf62 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -13,6 +13,7 @@ from .fragment import FragmentFD from .external import FFmpegFD from ..compat import ( + compat_urllib_error, compat_urlparse, compat_struct_pack, ) @@ -20,6 +21,7 @@ from ..utils import ( encodeFilename, sanitize_open, parse_m3u8_attributes, + update_url_query, ) @@ -82,6 +84,14 @@ class HlsFD(FragmentFD): self._prepare_and_start_frag_download(ctx) + fragment_retries = self.params.get('fragment_retries', 0) + skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) + test = self.params.get('test', False) + + extra_query = None + extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url') + if extra_param_to_segment_url: + extra_query = compat_urlparse.parse_qs(extra_param_to_segment_url) i = 0 media_sequence = 0 decrypt_info = {'METHOD': 'NONE'} @@ -94,13 +104,37 @@ class HlsFD(FragmentFD): line if re.match(r'^https?://', line) else compat_urlparse.urljoin(man_url, line)) - frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i) - success = ctx['dl'].download(frag_filename, {'url': frag_url}) - if not success: + frag_name = 'Frag%d' % i + frag_filename = '%s-%s' % (ctx['tmpfilename'], frag_name) + if extra_query: + frag_url = update_url_query(frag_url, extra_query) + count = 0 + while count <= fragment_retries: + try: + success = ctx['dl'].download(frag_filename, {'url': frag_url}) + if not success: + return False + down, frag_sanitized = sanitize_open(frag_filename, 'rb') + frag_content = down.read() + down.close() + break + except compat_urllib_error.HTTPError as err: + # Unavailable (possibly temporary) fragments may be served. + # First we try to retry then either skip or abort. 
+ # See https://github.com/rg3/youtube-dl/issues/10165, + # https://github.com/rg3/youtube-dl/issues/10448). + count += 1 + if count <= fragment_retries: + self.report_retry_fragment(err, frag_name, count, fragment_retries) + if count > fragment_retries: + if skip_unavailable_fragments: + i += 1 + media_sequence += 1 + self.report_skip_fragment(frag_name) + continue + self.report_error( + 'giving up after %s fragment retries' % fragment_retries) return False - down, frag_sanitized = sanitize_open(frag_filename, 'rb') - frag_content = down.read() - down.close() if decrypt_info['METHOD'] == 'AES-128': iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) frag_content = AES.new( @@ -108,7 +142,7 @@ class HlsFD(FragmentFD): ctx['dest_stream'].write(frag_content) frags_filenames.append(frag_sanitized) # We only download the first fragment during the test - if self.params.get('test', False): + if test: break i += 1 media_sequence += 1 @@ -116,10 +150,12 @@ class HlsFD(FragmentFD): decrypt_info = parse_m3u8_attributes(line[11:]) if decrypt_info['METHOD'] == 'AES-128': if 'IV' in decrypt_info: - decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:]) + decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:].zfill(32)) if not re.match(r'^https?://', decrypt_info['URI']): decrypt_info['URI'] = compat_urlparse.urljoin( man_url, decrypt_info['URI']) + if extra_query: + decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query) decrypt_info['KEY'] = self.ydl.urlopen(decrypt_info['URI']).read() elif line.startswith('#EXT-X-MEDIA-SEQUENCE'): media_sequence = int(line[22:]) diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index b584277be..c7b6df7d0 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -7,6 +7,7 @@ from ..utils import ( ExtractorError, js_to_json, int_or_none, + parse_iso8601, ) @@ -93,3 +94,57 @@ class ABCIE(InfoExtractor): 'description': 
self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), } + + +class ABCIViewIE(InfoExtractor): + IE_NAME = 'abc.net.au:iview' + _VALID_URL = r'https?://iview\.abc\.net\.au/programs/[^/]+/(?P[^/?#]+)' + + _TESTS = [{ + 'url': 'http://iview.abc.net.au/programs/gardening-australia/FA1505V024S00', + 'md5': '979d10b2939101f0d27a06b79edad536', + 'info_dict': { + 'id': 'FA1505V024S00', + 'ext': 'mp4', + 'title': 'Series 27 Ep 24', + 'description': 'md5:b28baeae7504d1148e1d2f0e3ed3c15d', + 'upload_date': '20160820', + 'uploader_id': 'abc1', + 'timestamp': 1471719600, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_params = self._parse_json(self._search_regex( + r'videoParams\s*=\s*({.+?});', webpage, 'video params'), video_id) + title = video_params['title'] + stream = next(s for s in video_params['playlist'] if s.get('type') == 'program') + + formats = self._extract_akamai_formats(stream['hds-unmetered'], video_id) + self._sort_formats(formats) + + subtitles = {} + src_vtt = stream.get('captions', {}).get('src-vtt') + if src_vtt: + subtitles['en'] = [{ + 'url': src_vtt, + 'ext': 'vtt', + }] + + return { + 'id': video_id, + 'title': title, + 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage), + 'thumbnail': self._html_search_meta(['og:image', 'twitter:image:src'], webpage), + 'duration': int_or_none(video_params.get('eventDuration')), + 'timestamp': parse_iso8601(video_params.get('pubDate'), ' '), + 'series': video_params.get('seriesTitle'), + 'series_id': video_params.get('seriesHouseNumber') or video_id[:7], + 'episode_number': int_or_none(self._html_search_meta('episodeNumber', webpage)), + 'episode': self._html_search_meta('episode_title', webpage), + 'uploader_id': video_params.get('channel'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/adobepass.py 
b/youtube_dl/extractor/adobepass.py new file mode 100644 index 000000000..68ec37e00 --- /dev/null +++ b/youtube_dl/extractor/adobepass.py @@ -0,0 +1,137 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re +import time +import xml.etree.ElementTree as etree + +from .common import InfoExtractor +from ..utils import ( + unescapeHTML, + urlencode_postdata, + unified_timestamp, +) + + +class AdobePassIE(InfoExtractor): + _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' + _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' + + @staticmethod + def _get_mvpd_resource(provider_id, title, guid, rating): + channel = etree.Element('channel') + channel_title = etree.SubElement(channel, 'title') + channel_title.text = provider_id + item = etree.SubElement(channel, 'item') + resource_title = etree.SubElement(item, 'title') + resource_title.text = title + resource_guid = etree.SubElement(item, 'guid') + resource_guid.text = guid + resource_rating = etree.SubElement(item, 'media:rating') + resource_rating.attrib = {'scheme': 'urn:v-chip'} + resource_rating.text = rating + return '' + etree.tostring(channel).decode() + '' + + def _extract_mvpd_auth(self, url, video_id, requestor_id, resource): + def xml_text(xml_str, tag): + return self._search_regex( + '<%s>(.+?)' % (tag, tag), xml_str, tag) + + def is_expired(token, date_ele): + token_expires = unified_timestamp(re.sub(r'[_ ]GMT', '', xml_text(token, date_ele))) + return token_expires and token_expires <= int(time.time()) + + mvpd_headers = { + 'ap_42': 'anonymous', + 'ap_11': 'Linux i686', + 'ap_z': self._USER_AGENT, + 'User-Agent': self._USER_AGENT, + } + + guid = xml_text(resource, 'guid') + requestor_info = self._downloader.cache.load('mvpd', requestor_id) or {} + authn_token = requestor_info.get('authn_token') + if authn_token and is_expired(authn_token, 'simpleTokenExpires'): + authn_token = None + if not authn_token: + # TODO add support 
for other TV Providers + mso_id = 'DTV' + username, password = self._get_netrc_login_info(mso_id) + if not username or not password: + return '' + + def post_form(form_page, note, data={}): + post_url = self._html_search_regex(r']+action=(["\'])(?P.+?)\1', form_page, 'post url', group='url') + return self._download_webpage( + post_url, video_id, note, data=urlencode_postdata(data or self._hidden_inputs(form_page)), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + + provider_redirect_page = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, + 'Downloading Provider Redirect Page', query={ + 'noflash': 'true', + 'mso_id': mso_id, + 'requestor_id': requestor_id, + 'no_iframe': 'false', + 'domain_name': 'adobe.com', + 'redirect_url': url, + }) + provider_login_page = post_form( + provider_redirect_page, 'Downloading Provider Login Page') + mvpd_confirm_page = post_form(provider_login_page, 'Logging in', { + 'username': username, + 'password': password, + }) + post_form(mvpd_confirm_page, 'Confirming Login') + + session = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id, + 'Retrieving Session', data=urlencode_postdata({ + '_method': 'GET', + 'requestor_id': requestor_id, + }), headers=mvpd_headers) + if 'playlists/)?(?P[^/]+)/(?P[^/?#]+)/?' 
_TESTS = [{ @@ -83,6 +81,21 @@ class AdultSwimIE(InfoExtractor): # m3u8 download 'skip_download': True, } + }, { + # heroMetadata.trailer + 'url': 'http://www.adultswim.com/videos/decker/inside-decker-a-new-hero/', + 'info_dict': { + 'id': 'I0LQFQkaSUaFp8PnAWHhoQ', + 'ext': 'mp4', + 'title': 'Decker - Inside Decker: A New Hero', + 'description': 'md5:c916df071d425d62d70c86d4399d3ee0', + 'duration': 249.008, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'], }] @staticmethod @@ -133,79 +146,56 @@ class AdultSwimIE(InfoExtractor): if video_info is None: if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path: video_info = bootstrapped_data['slugged_video'] - else: - raise ExtractorError('Unable to find video info') + if not video_info: + video_info = bootstrapped_data.get( + 'heroMetadata', {}).get('trailer', {}).get('video') + if not video_info: + video_info = bootstrapped_data.get('onlineOriginals', [None])[0] + if not video_info: + raise ExtractorError('Unable to find video info') show = bootstrapped_data['show'] show_title = show['title'] stream = video_info.get('stream') - clips = [stream] if stream else video_info.get('clips') - if not clips: - raise ExtractorError( - 'This video is only available via cable service provider subscription that' - ' is not currently supported. You may want to use --cookies.' 
- if video_info.get('auth') is True else 'Unable to find stream or clips', - expected=True) - segment_ids = [clip['videoPlaybackID'] for clip in clips] + if stream and stream.get('videoPlaybackID'): + segment_ids = [stream['videoPlaybackID']] + elif video_info.get('clips'): + segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']] + elif video_info.get('videoPlaybackID'): + segment_ids = [video_info['videoPlaybackID']] + else: + if video_info.get('auth') is True: + raise ExtractorError( + 'This video is only available via cable service provider subscription that' + ' is not currently supported. You may want to use --cookies.', expected=True) + else: + raise ExtractorError('Unable to find stream or clips') episode_id = video_info['id'] episode_title = video_info['title'] - episode_description = video_info['description'] - episode_duration = video_info.get('duration') + episode_description = video_info.get('description') + episode_duration = int_or_none(video_info.get('duration')) + view_count = int_or_none(video_info.get('views')) entries = [] for part_num, segment_id in enumerate(segment_ids): - segment_url = 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=desktop' % segment_id - + segement_info = self._extract_cvp_info( + 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=desktop' % segment_id, + segment_id, { + 'secure': { + 'media_src': 'http://androidhls-secure.cdn.turner.com/adultswim/big', + 'tokenizer_src': 'http://www.adultswim.com/astv/mvpd/processors/services/token_ipadAdobe.do', + }, + }) segment_title = '%s - %s' % (show_title, episode_title) if len(segment_ids) > 1: segment_title += ' Part %d' % (part_num + 1) - - idoc = self._download_xml( - segment_url, segment_title, - 'Downloading segment information', 'Unable to download segment information') - - segment_duration = float_or_none( - xpath_text(idoc, './/trt', 'segment duration').strip()) - - formats = [] - file_els = idoc.findall('.//files/file') or 
idoc.findall('./files/file') - - unique_urls = [] - unique_file_els = [] - for file_el in file_els: - media_url = file_el.text - if not media_url or determine_ext(media_url) == 'f4m': - continue - if file_el.text not in unique_urls: - unique_urls.append(file_el.text) - unique_file_els.append(file_el) - - for file_el in unique_file_els: - bitrate = file_el.attrib.get('bitrate') - ftype = file_el.attrib.get('type') - media_url = file_el.text - if determine_ext(media_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - media_url, segment_title, 'mp4', preference=0, - m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'format_id': '%s_%s' % (bitrate, ftype), - 'url': file_el.text.strip(), - # The bitrate may not be a number (for example: 'iphone') - 'tbr': int(bitrate) if bitrate.isdigit() else None, - }) - - self._sort_formats(formats) - - entries.append({ + segement_info.update({ 'id': segment_id, 'title': segment_title, - 'formats': formats, - 'duration': segment_duration, - 'description': episode_description + 'description': episode_description, }) + entries.append(segement_info) return { '_type': 'playlist', @@ -214,5 +204,6 @@ class AdultSwimIE(InfoExtractor): 'entries': entries, 'title': '%s - %s' % (show_title, episode_title), 'description': episode_description, - 'duration': episode_duration + 'duration': episode_duration, + 'view_count': view_count, } diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 8f53050c9..6adb6d824 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -109,7 +109,10 @@ class AENetworksIE(AENetworksBaseIE): info = self._parse_theplatform_metadata(theplatform_metadata) if theplatform_metadata.get('AETN$isBehindWall'): requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain] - resource = '%s%s%s%s' % (requestor_id, theplatform_metadata['title'], theplatform_metadata['AETN$PPL_pplProgramId'], theplatform_metadata['ratings'][0]['rating']) + resource = 
self._get_mvpd_resource( + requestor_id, theplatform_metadata['title'], + theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), + theplatform_metadata['ratings'][0]['rating']) query['auth'] = self._extract_mvpd_auth( url, video_id, requestor_id, resource) info.update(self._search_json_ld(webpage, video_id, fatal=False)) diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py new file mode 100644 index 000000000..c739d2c99 --- /dev/null +++ b/youtube_dl/extractor/amcnetworks.py @@ -0,0 +1,91 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .theplatform import ThePlatformIE +from ..utils import ( + update_url_query, + parse_age_limit, + int_or_none, +) + + +class AMCNetworksIE(ThePlatformIE): + _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|wetv)\.com/(?:movies/|shows/[^/]+/(?:full-episodes/)?season-\d+/episode-\d+(?:-(?:[^/]+/)?|/))(?P[^/?#]+)' + _TESTS = [{ + 'url': 'http://www.ifc.com/shows/maron/season-04/episode-01/step-1', + 'md5': '', + 'info_dict': { + 'id': 's3MX01Nl4vPH', + 'ext': 'mp4', + 'title': 'Maron - Season 4 - Step 1', + 'description': 'In denial about his current situation, Marc is reluctantly convinced by his friends to enter rehab. 
Starring Marc Maron and Constance Zimmer.', + 'age_limit': 17, + 'upload_date': '20160505', + 'timestamp': 1462468831, + 'uploader': 'AMCN', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge', + 'only_matching': True, + }, { + 'url': 'http://www.amc.com/shows/preacher/full-episodes/season-01/episode-00/pilot', + 'only_matching': True, + }, { + 'url': 'http://www.wetv.com/shows/million-dollar-matchmaker/season-01/episode-06-the-dumped-dj-and-shallow-hal', + 'only_matching': True, + }, { + 'url': 'http://www.ifc.com/movies/chaos', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + query = { + 'mbr': 'true', + 'manifest': 'm3u', + } + media_url = self._search_regex(r'window\.platformLinkURL\s*=\s*[\'"]([^\'"]+)', webpage, 'media url') + theplatform_metadata = self._download_theplatform_metadata(self._search_regex( + r'https?://link.theplatform.com/s/([^?]+)', media_url, 'theplatform_path'), display_id) + info = self._parse_theplatform_metadata(theplatform_metadata) + video_id = theplatform_metadata['pid'] + title = theplatform_metadata['title'] + rating = theplatform_metadata['ratings'][0]['rating'] + auth_required = self._search_regex(r'window\.authRequired\s*=\s*(true|false);', webpage, 'auth required') + if auth_required == 'true': + requestor_id = self._search_regex(r'window\.requestor_id\s*=\s*[\'"]([^\'"]+)', webpage, 'requestor id') + resource = self._get_mvpd_resource(requestor_id, title, video_id, rating) + query['auth'] = self._extract_mvpd_auth(url, video_id, requestor_id, resource) + media_url = update_url_query(media_url, query) + formats, subtitles = self._extract_theplatform_smil(media_url, video_id) + self._sort_formats(formats) + info.update({ + 'id': video_id, + 'subtitles': subtitles, + 'formats': formats, + 'age_limit': 
parse_age_limit(parse_age_limit(rating)), + }) + ns_keys = theplatform_metadata.get('$xmlns', {}).keys() + if ns_keys: + ns = list(ns_keys)[0] + series = theplatform_metadata.get(ns + '$show') + season_number = int_or_none(theplatform_metadata.get(ns + '$season')) + episode = theplatform_metadata.get(ns + '$episodeTitle') + episode_number = int_or_none(theplatform_metadata.get(ns + '$episode')) + if season_number: + title = 'Season %d - %s' % (season_number, title) + if series: + title = '%s - %s' % (series, title) + info.update({ + 'title': title, + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + }) + return info diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index 42c21bf41..2cdee3320 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -123,6 +123,10 @@ class AolFeaturesIE(InfoExtractor): 'title': 'What To Watch - February 17, 2016', }, 'add_ie': ['FiveMin'], + 'params': { + # encrypted m3u8 download + 'skip_download': True, + }, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index 63429780e..025e29aa4 100644 --- a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -15,7 +13,7 @@ class AparatIE(InfoExtractor): _TEST = { 'url': 'http://www.aparat.com/v/wP8On', - 'md5': '6714e0af7e0d875c5a39c4dc4ab46ad1', + 'md5': '131aca2e14fe7c4dcb3c4877ba300c89', 'info_dict': { 'id': 'wP8On', 'ext': 'mp4', @@ -31,13 +29,13 @@ class AparatIE(InfoExtractor): # Note: There is an easier-to-parse configuration at # http://www.aparat.com/video/video/config/videohash/%video_id # but the URL in there does not work - embed_url = ('http://www.aparat.com/video/video/embed/videohash/' + - video_id + '/vt/frame') + embed_url = 
'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id webpage = self._download_webpage(embed_url, video_id) - video_urls = [video_url.replace('\\/', '/') for video_url in re.findall( - r'(?:fileList\[[0-9]+\]\s*=|"file"\s*:)\s*"([^"]+)"', webpage)] - for i, video_url in enumerate(video_urls): + file_list = self._parse_json(self._search_regex( + r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage, 'file list'), video_id) + for i, item in enumerate(file_list[0]): + video_url = item['file'] req = HEADRequest(video_url) res = self._request_webpage( req, video_id, note='Testing video URL %d' % i, errnote=False) diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 8feb7cb74..486dff82d 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -1,67 +1,65 @@ from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import unified_strdate +from .jwplatform import JWPlatformBaseIE +from ..utils import ( + unified_strdate, + clean_html, +) -class ArchiveOrgIE(InfoExtractor): +class ArchiveOrgIE(JWPlatformBaseIE): IE_NAME = 'archive.org' IE_DESC = 'archive.org videos' - _VALID_URL = r'https?://(?:www\.)?archive\.org/details/(?P[^?/]+)(?:[?].*)?$' + _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P[^/?#]+)(?:[?].*)?$' _TESTS = [{ 'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect', 'md5': '8af1d4cf447933ed3c7f4871162602db', 'info_dict': { 'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect', - 'ext': 'ogv', + 'ext': 'ogg', 'title': '1968 Demo - FJCC Conference Presentation Reel #1', - 'description': 'md5:1780b464abaca9991d8968c877bb53ed', + 'description': 'md5:da45c349df039f1cc8075268eb1b5c25', 'upload_date': '19681210', 'uploader': 'SRI International' } }, { 'url': 'https://archive.org/details/Cops1922', - 'md5': '18f2a19e6d89af8425671da1cf3d4e04', + 'md5': 
'bc73c8ab3838b5a8fc6c6651fa7b58ba', 'info_dict': { 'id': 'Cops1922', - 'ext': 'ogv', + 'ext': 'mp4', 'title': 'Buster Keaton\'s "Cops" (1922)', - 'description': 'md5:70f72ee70882f713d4578725461ffcc3', + 'description': 'md5:b4544662605877edd99df22f9620d858', } + }, { + 'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://archive.org/embed/' + video_id, video_id) + jwplayer_playlist = self._parse_json(self._search_regex( + r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\);", + webpage, 'jwplayer playlist'), video_id) + info = self._parse_jwplayer_data( + {'playlist': jwplayer_playlist}, video_id, base_url=url) - json_url = url + ('&' if '?' in url else '?') + 'output=json' - data = self._download_json(json_url, video_id) + def get_optional(metadata, field): + return metadata.get(field, [None])[0] - def get_optional(data_dict, field): - return data_dict['metadata'].get(field, [None])[0] - - title = get_optional(data, 'title') - description = get_optional(data, 'description') - uploader = get_optional(data, 'creator') - upload_date = unified_strdate(get_optional(data, 'date')) - - formats = [ - { - 'format': fdata['format'], - 'url': 'http://' + data['server'] + data['dir'] + fn, - 'file_size': int(fdata['size']), - } - for fn, fdata in data['files'].items() - if 'Video' in fdata['format']] - - self._sort_formats(formats) - - return { - '_type': 'video', - 'id': video_id, - 'title': title, - 'formats': formats, - 'description': description, - 'uploader': uploader, - 'upload_date': upload_date, - 'thumbnail': data.get('misc', {}).get('image'), - } + metadata = self._download_json( + 'http://archive.org/details/' + video_id, video_id, query={ + 'output': 'json', + })['metadata'] + info.update({ + 'title': get_optional(metadata, 'title') or info.get('title'), + 'description': 
clean_html(get_optional(metadata, 'description')), + }) + if info.get('_type') != 'playlist': + info.update({ + 'uploader': get_optional(metadata, 'creator'), + 'upload_date': unified_strdate(get_optional(metadata, 'date')), + }) + return info diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 13a06396d..07e67dd33 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -20,7 +20,7 @@ from ..compat import compat_etree_fromstring class ARDMediathekIE(InfoExtractor): IE_NAME = 'ARD:mediathek' - _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' + _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de)/(?:.*/)(?P[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' _TESTS = [{ 'url': 'http://www.ardmediathek.de/tv/Dokumentation-und-Reportage/Ich-liebe-das-Leben-trotzdem/rbb-Fernsehen/Video?documentId=29582122&bcastId=3822114', @@ -62,6 +62,18 @@ class ARDMediathekIE(InfoExtractor): }, { 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', 'only_matching': True, + }, { + # audio + 'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158', + 'md5': '4e8f00631aac0395fee17368ac0e9867', + 'info_dict': { + 'id': '30796318', + 'ext': 'mp3', + 'title': 'Vor dem Fest', + 'description': 'md5:c0c1c8048514deaed2a73b3a60eecacb', + 'duration': 3287, + }, + 'skip': 'Video is no longer available', }] def _extract_media_info(self, media_info_url, webpage, video_id): diff --git a/youtube_dl/extractor/arkena.py b/youtube_dl/extractor/arkena.py new file mode 100644 index 000000000..d45cae301 --- /dev/null +++ b/youtube_dl/extractor/arkena.py @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + 
determine_ext, + float_or_none, + int_or_none, + mimetype2ext, + parse_iso8601, + strip_jsonp, +) + + +class ArkenaIE(InfoExtractor): + _VALID_URL = r'https?://play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P[^/]+)/[^/]+/(?P\d+)' + _TESTS = [{ + 'url': 'https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411', + 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', + 'info_dict': { + 'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', + 'ext': 'mp4', + 'title': 'Big Buck Bunny', + 'description': 'Royalty free test video', + 'timestamp': 1432816365, + 'upload_date': '20150528', + 'is_live': False, + }, + }, { + 'url': 'https://play.arkena.com/config/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411/?callbackMethod=jQuery1111023664739129262213_1469227693893', + 'only_matching': True, + }, { + 'url': 'http://play.arkena.com/config/avp/v1/player/media/327336/darkmatter/131064/?callbackMethod=jQuery1111002221189684892677_1469227595972', + 'only_matching': True, + }, { + 'url': 'http://play.arkena.com/embed/avp/v1/player/media/327336/darkmatter/131064/', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + # See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video + mobj = re.search( + r']+src=(["\'])(?P(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + account_id = mobj.group('account_id') + + playlist = self._download_json( + 'https://play.arkena.com/config/avp/v2/player/media/%s/0/%s/?callbackMethod=_' + % (video_id, account_id), + video_id, transform_source=strip_jsonp)['Playlist'][0] + + media_info = playlist['MediaInfo'] + title = media_info['Title'] + media_files = playlist['MediaFiles'] + + is_live = False + formats = [] + for kind_case, kind_formats in media_files.items(): + kind = kind_case.lower() + for f in kind_formats: 
+ f_url = f.get('Url') + if not f_url: + continue + is_live = f.get('Live') == 'true' + exts = (mimetype2ext(f.get('Type')), determine_ext(f_url, None)) + if kind == 'm3u8' or 'm3u8' in exts: + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', + entry_protocol='m3u8' if is_live else 'm3u8_native', + m3u8_id=kind, fatal=False, live=is_live)) + elif kind == 'flash' or 'f4m' in exts: + formats.extend(self._extract_f4m_formats( + f_url, video_id, f4m_id=kind, fatal=False)) + elif kind == 'dash' or 'mpd' in exts: + formats.extend(self._extract_mpd_formats( + f_url, video_id, mpd_id=kind, fatal=False)) + elif kind == 'silverlight': + # TODO: process when ism is supported (see + # https://github.com/rg3/youtube-dl/issues/8118) + continue + else: + tbr = float_or_none(f.get('Bitrate'), 1000) + formats.append({ + 'url': f_url, + 'format_id': '%s-%d' % (kind, tbr) if tbr else kind, + 'tbr': tbr, + }) + self._sort_formats(formats) + + description = media_info.get('Description') + video_id = media_info.get('VideoId') or video_id + timestamp = parse_iso8601(media_info.get('PublishDate')) + thumbnails = [{ + 'url': thumbnail['Url'], + 'width': int_or_none(thumbnail.get('Size')), + } for thumbnail in (media_info.get('Poster') or []) if thumbnail.get('Url')] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'is_live': is_live, + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/awaan.py similarity index 72% rename from youtube_dl/extractor/dcn.py rename to youtube_dl/extractor/awaan.py index efb8585e8..bdf23c6a9 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/awaan.py @@ -12,46 +12,41 @@ from ..compat import ( from ..utils import ( int_or_none, parse_iso8601, - sanitized_Request, smuggle_url, unsmuggle_url, urlencode_postdata, ) -class DCNIE(InfoExtractor): +class AWAANIE(InfoExtractor): _VALID_URL = 
r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?show/(?P\d+)/[^/]+(?:/(?P\d+)/(?P\d+))?' def _real_extract(self, url): show_id, video_id, season_id = re.match(self._VALID_URL, url).groups() if video_id and int(video_id) > 0: return self.url_result( - 'http://www.dcndigital.ae/media/%s' % video_id, 'DCNVideo') + 'http://awaan.ae/media/%s' % video_id, 'AWAANVideo') elif season_id and int(season_id) > 0: return self.url_result(smuggle_url( - 'http://www.dcndigital.ae/program/season/%s' % season_id, - {'show_id': show_id}), 'DCNSeason') + 'http://awaan.ae/program/season/%s' % season_id, + {'show_id': show_id}), 'AWAANSeason') else: return self.url_result( - 'http://www.dcndigital.ae/program/%s' % show_id, 'DCNSeason') + 'http://awaan.ae/program/%s' % show_id, 'AWAANSeason') -class DCNBaseIE(InfoExtractor): - def _extract_video_info(self, video_data, video_id, is_live): +class AWAANBaseIE(InfoExtractor): + def _parse_video_data(self, video_data, video_id, is_live): title = video_data.get('title_en') or video_data['title_ar'] img = video_data.get('img') - thumbnail = 'http://admin.mangomolo.com/analytics/%s' % img if img else None - duration = int_or_none(video_data.get('duration')) - description = video_data.get('description_en') or video_data.get('description_ar') - timestamp = parse_iso8601(video_data.get('create_time'), ' ') return { 'id': video_id, 'title': self._live_title(title) if is_live else title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, + 'description': video_data.get('description_en') or video_data.get('description_ar'), + 'thumbnail': 'http://admin.mangomolo.com/analytics/%s' % img if img else None, + 'duration': int_or_none(video_data.get('duration')), + 'timestamp': parse_iso8601(video_data.get('create_time'), ' '), 'is_live': is_live, } @@ -62,11 +57,9 @@ class DCNBaseIE(InfoExtractor): r'file\s*:\s*"https?(://[^"]+)/playlist.m3u8', r']+href="rtsp(://[^"]+)"' ], webpage, 'format url') 
- # TODO: Current DASH formats are broken - $Time$ pattern in - # not implemented yet - # formats.extend(self._extract_mpd_formats( - # format_url_base + '/manifest.mpd', - # video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_mpd_formats( + format_url_base + '/manifest.mpd', + video_id, mpd_id='dash', fatal=False)) formats.extend(self._extract_m3u8_formats( format_url_base + '/playlist.m3u8', video_id, 'mp4', m3u8_entry_protocol, m3u8_id='hls', fatal=False)) @@ -77,11 +70,12 @@ class DCNBaseIE(InfoExtractor): return formats -class DCNVideoIE(DCNBaseIE): - IE_NAME = 'dcn:video' +class AWAANVideoIE(AWAANBaseIE): + IE_NAME = 'awaan:video' _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?(?:video(?:/[^/]+)?|media|catchup/[^/]+/[^/]+)/(?P\d+)' _TESTS = [{ 'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375', + 'md5': '5f61c33bfc7794315c671a62d43116aa', 'info_dict': { 'id': '17375', @@ -92,10 +86,6 @@ class DCNVideoIE(DCNBaseIE): 'timestamp': 1227504126, 'upload_date': '20081124', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, }, { 'url': 'http://awaan.ae/video/26723981/%D8%AF%D8%A7%D8%B1-%D8%A7%D9%84%D8%B3%D9%84%D8%A7%D9%85:-%D8%AE%D9%8A%D8%B1-%D8%AF%D9%88%D8%B1-%D8%A7%D9%84%D8%A3%D9%86%D8%B5%D8%A7%D8%B1', 'only_matching': True, @@ -104,11 +94,10 @@ class DCNVideoIE(DCNBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - request = sanitized_Request( + video_data = self._download_json( 'http://admin.mangomolo.com/analytics/index.php/plus/video?id=%s' % video_id, - headers={'Origin': 'http://www.dcndigital.ae'}) - video_data = self._download_json(request, video_id) - info = self._extract_video_info(video_data, video_id, False) + video_id, headers={'Origin': 'http://awaan.ae'}) + info = self._parse_video_data(video_data, video_id, False) webpage = self._download_webpage( 
'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' + @@ -123,19 +112,31 @@ class DCNVideoIE(DCNBaseIE): return info -class DCNLiveIE(DCNBaseIE): - IE_NAME = 'dcn:live' +class AWAANLiveIE(AWAANBaseIE): + IE_NAME = 'awaan:live' _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?live/(?P\d+)' + _TEST = { + 'url': 'http://awaan.ae/live/6/dubai-tv', + 'info_dict': { + 'id': '6', + 'ext': 'mp4', + 'title': 're:Dubai Al Oula [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'upload_date': '20150107', + 'timestamp': 1420588800, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } def _real_extract(self, url): channel_id = self._match_id(url) - request = sanitized_Request( + channel_data = self._download_json( 'http://admin.mangomolo.com/analytics/index.php/plus/getchanneldetails?channel_id=%s' % channel_id, - headers={'Origin': 'http://www.dcndigital.ae'}) - - channel_data = self._download_json(request, channel_id) - info = self._extract_video_info(channel_data, channel_id, True) + channel_id, headers={'Origin': 'http://awaan.ae'}) + info = self._parse_video_data(channel_data, channel_id, True) webpage = self._download_webpage( 'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' 
+ @@ -150,8 +151,8 @@ class DCNLiveIE(DCNBaseIE): return info -class DCNSeasonIE(InfoExtractor): - IE_NAME = 'dcn:season' +class AWAANSeasonIE(InfoExtractor): + IE_NAME = 'awaan:season' _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?program/(?:(?P\d+)|season/(?P\d+))' _TEST = { 'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A', @@ -172,21 +173,17 @@ class DCNSeasonIE(InfoExtractor): data['season'] = season_id show_id = smuggled_data.get('show_id') if show_id is None: - request = sanitized_Request( + season = self._download_json( 'http://admin.mangomolo.com/analytics/index.php/plus/season_info?id=%s' % season_id, - headers={'Origin': 'http://www.dcndigital.ae'}) - season = self._download_json(request, season_id) + season_id, headers={'Origin': 'http://awaan.ae'}) show_id = season['id'] data['show_id'] = show_id - request = sanitized_Request( + show = self._download_json( 'http://admin.mangomolo.com/analytics/index.php/plus/show', - urlencode_postdata(data), - { - 'Origin': 'http://www.dcndigital.ae', + show_id, data=urlencode_postdata(data), headers={ + 'Origin': 'http://awaan.ae', 'Content-Type': 'application/x-www-form-urlencoded' }) - - show = self._download_json(request, show_id) if not season_id: season_id = show['default_season'] for season in show['seasons']: @@ -197,6 +194,6 @@ class DCNSeasonIE(InfoExtractor): for video in show['videos']: video_id = compat_str(video['id']) entries.append(self.url_result( - 'http://www.dcndigital.ae/media/%s' % video_id, 'DCNVideo', video_id)) + 'http://awaan.ae/media/%s' % video_id, 'AWAANVideo', video_id)) return self.playlist_result(entries, season_id, title) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 991ab0676..249c3d956 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -162,6 +162,15 @@ class 
BandcampAlbumIE(InfoExtractor): 'uploader_id': 'dotscale', }, 'playlist_mincount': 7, + }, { + # with escaped quote in title + 'url': 'https://jstrecords.bandcamp.com/album/entropy-ep', + 'info_dict': { + 'title': '"Entropy" EP', + 'uploader_id': 'jstrecords', + 'id': 'entropy-ep', + }, + 'playlist_mincount': 3, }] def _real_extract(self, url): @@ -176,8 +185,11 @@ class BandcampAlbumIE(InfoExtractor): entries = [ self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key()) for t_path in tracks_paths] - title = self._search_regex( - r'album_title\s*:\s*"(.*?)"', webpage, 'title', fatal=False) + title = self._html_search_regex( + r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"', + webpage, 'title', fatal=False) + if title: + title = title.replace(r'\"', '"') return { '_type': 'playlist', 'uploader_id': uploader_id, diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 23c6e505b..deb9cc1c0 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -2,19 +2,23 @@ from __future__ import unicode_literals import re +import itertools from .common import InfoExtractor from ..utils import ( + dict_get, ExtractorError, float_or_none, int_or_none, parse_duration, parse_iso8601, + try_get, unescapeHTML, ) from ..compat import ( compat_etree_fromstring, compat_HTTPError, + compat_urlparse, ) @@ -55,11 +59,12 @@ class BBCCoUkIE(InfoExtractor): 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', 'info_dict': { 'id': 'b039d07m', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4', 'description': 'The Canadian poet and songwriter reflects on his musical career.', }, 'params': { + # rtmp download 'skip_download': True, } }, @@ -91,7 +96,7 @@ class BBCCoUkIE(InfoExtractor): # rtmp download 'skip_download': True, }, - 'skip': 'this episode is not currently available', + 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only', }, { 'url': 
'http://www.bbc.co.uk/iplayer/episode/p026c7jt/tomorrows-worlds-the-unearthly-history-of-science-fiction-2-invasion', @@ -106,7 +111,7 @@ class BBCCoUkIE(InfoExtractor): # rtmp download 'skip_download': True, }, - 'skip': 'this episode is not currently available', + 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only', }, { 'url': 'http://www.bbc.co.uk/programmes/b04v20dw', 'info_dict': { @@ -126,12 +131,13 @@ class BBCCoUkIE(InfoExtractor): 'note': 'Audio', 'info_dict': { 'id': 'p022h44j', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'BBC Proms Music Guides, Rachmaninov: Symphonic Dances', 'description': "In this Proms Music Guide, Andrew McGregor looks at Rachmaninov's Symphonic Dances.", 'duration': 227, }, 'params': { + # rtmp download 'skip_download': True, } }, { @@ -139,12 +145,13 @@ class BBCCoUkIE(InfoExtractor): 'note': 'Video', 'info_dict': { 'id': 'p025c103', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Reading and Leeds Festival, 2014, Rae Morris - Closer (Live on BBC Three)', 'description': 'Rae Morris performs Closer for BBC Three at Reading 2014', 'duration': 226, }, 'params': { + # rtmp download 'skip_download': True, } }, { @@ -160,7 +167,7 @@ class BBCCoUkIE(InfoExtractor): # rtmp download 'skip_download': True, }, - 'skip': 'this episode is not currently available', + 'skip': 'geolocation', }, { 'url': 'http://www.bbc.co.uk/iplayer/episode/b05zmgwn/royal-academy-summer-exhibition', 'info_dict': { @@ -174,7 +181,7 @@ class BBCCoUkIE(InfoExtractor): # rtmp download 'skip_download': True, }, - 'skip': 'this episode is not currently available', + 'skip': 'geolocation', }, { # iptv-all mediaset fails with geolocation however there is no geo restriction # for this programme at all @@ -189,17 +196,18 @@ class BBCCoUkIE(InfoExtractor): # rtmp download 'skip_download': True, }, - 'skip': 'this episode is not currently available on BBC iPlayer Radio', + 'skip': 'Now it\'s really geo-restricted', }, { # compact player 
(https://github.com/rg3/youtube-dl/issues/8147) 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player', 'info_dict': { 'id': 'p028bfkj', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews', 'description': 'Extract from BBC documentary Look Stranger - Giant Leeks and Magic Brews', }, 'params': { + # rtmp download 'skip_download': True, }, }, { @@ -225,51 +233,6 @@ class BBCCoUkIE(InfoExtractor): asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist') return [ref.get('href') for ref in asx.findall('./Entry/ref')] - def _extract_connection(self, connection, programme_id): - formats = [] - kind = connection.get('kind') - protocol = connection.get('protocol') - supplier = connection.get('supplier') - if protocol == 'http': - href = connection.get('href') - transfer_format = connection.get('transferFormat') - # ASX playlist - if supplier == 'asx': - for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): - formats.append({ - 'url': ref, - 'format_id': 'ref%s_%s' % (i, supplier), - }) - # Skip DASH until supported - elif transfer_format == 'dash': - pass - elif transfer_format == 'hls': - formats.extend(self._extract_m3u8_formats( - href, programme_id, 'mp4', 'm3u8_native', - m3u8_id=supplier, fatal=False)) - # Direct link - else: - formats.append({ - 'url': href, - 'format_id': supplier or kind or protocol, - }) - elif protocol == 'rtmp': - application = connection.get('application', 'ondemand') - auth_string = connection.get('authString') - identifier = connection.get('identifier') - server = connection.get('server') - formats.append({ - 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string), - 'play_path': identifier, - 'app': '%s?%s' % (application, auth_string), - 'page_url': 'http://www.bbc.co.uk', - 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf', - 'rtmp_live': False, 
- 'ext': 'flv', - 'format_id': supplier, - }) - return formats - def _extract_items(self, playlist): return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS) @@ -290,47 +253,6 @@ class BBCCoUkIE(InfoExtractor): def _extract_connections(self, media): return self._findall_ns(media, './{%s}connection') - def _extract_video(self, media, programme_id): - formats = [] - vbr = int_or_none(media.get('bitrate')) - vcodec = media.get('encoding') - service = media.get('service') - width = int_or_none(media.get('width')) - height = int_or_none(media.get('height')) - file_size = int_or_none(media.get('media_file_size')) - for connection in self._extract_connections(media): - conn_formats = self._extract_connection(connection, programme_id) - for format in conn_formats: - if format.get('protocol') != 'm3u8_native': - format.update({ - 'width': width, - 'height': height, - 'vbr': vbr, - 'vcodec': vcodec, - 'filesize': file_size, - }) - if service: - format['format_id'] = '%s_%s' % (service, format['format_id']) - formats.extend(conn_formats) - return formats - - def _extract_audio(self, media, programme_id): - formats = [] - abr = int_or_none(media.get('bitrate')) - acodec = media.get('encoding') - service = media.get('service') - for connection in self._extract_connections(media): - conn_formats = self._extract_connection(connection, programme_id) - for format in conn_formats: - format.update({ - 'format_id': '%s_%s' % (service, format['format_id']), - 'abr': abr, - 'acodec': acodec, - 'vcodec': 'none', - }) - formats.extend(conn_formats) - return formats - def _get_subtitles(self, media, programme_id): subtitles = {} for connection in self._extract_connections(media): @@ -376,13 +298,87 @@ class BBCCoUkIE(InfoExtractor): def _process_media_selector(self, media_selection, programme_id): formats = [] subtitles = None + urls = [] for media in self._extract_medias(media_selection): kind = media.get('kind') - if kind == 'audio': - formats.extend(self._extract_audio(media, 
programme_id)) - elif kind == 'video': - formats.extend(self._extract_video(media, programme_id)) + if kind in ('video', 'audio'): + bitrate = int_or_none(media.get('bitrate')) + encoding = media.get('encoding') + service = media.get('service') + width = int_or_none(media.get('width')) + height = int_or_none(media.get('height')) + file_size = int_or_none(media.get('media_file_size')) + for connection in self._extract_connections(media): + href = connection.get('href') + if href in urls: + continue + if href: + urls.append(href) + conn_kind = connection.get('kind') + protocol = connection.get('protocol') + supplier = connection.get('supplier') + transfer_format = connection.get('transferFormat') + format_id = supplier or conn_kind or protocol + if service: + format_id = '%s_%s' % (service, format_id) + # ASX playlist + if supplier == 'asx': + for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): + formats.append({ + 'url': ref, + 'format_id': 'ref%s_%s' % (i, format_id), + }) + elif transfer_format == 'dash': + formats.extend(self._extract_mpd_formats( + href, programme_id, mpd_id=format_id, fatal=False)) + elif transfer_format == 'hls': + formats.extend(self._extract_m3u8_formats( + href, programme_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) + elif transfer_format == 'hds': + formats.extend(self._extract_f4m_formats( + href, programme_id, f4m_id=format_id, fatal=False)) + else: + if not service and not supplier and bitrate: + format_id += '-%d' % bitrate + fmt = { + 'format_id': format_id, + 'filesize': file_size, + } + if kind == 'video': + fmt.update({ + 'width': width, + 'height': height, + 'vbr': bitrate, + 'vcodec': encoding, + }) + else: + fmt.update({ + 'abr': bitrate, + 'acodec': encoding, + 'vcodec': 'none', + }) + if protocol == 'http': + # Direct link + fmt.update({ + 'url': href, + }) + elif protocol == 'rtmp': + application = connection.get('application', 'ondemand') + auth_string = 
connection.get('authString') + identifier = connection.get('identifier') + server = connection.get('server') + fmt.update({ + 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string), + 'play_path': identifier, + 'app': '%s?%s' % (application, auth_string), + 'page_url': 'http://www.bbc.co.uk', + 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf', + 'rtmp_live': False, + 'ext': 'flv', + }) + formats.append(fmt) elif kind == 'captions': subtitles = self.extract_subtitles(media, programme_id) return formats, subtitles @@ -587,6 +583,7 @@ class BBCIE(BBCCoUkIE): 'id': '150615_telabyad_kentin_cogu', 'ext': 'mp4', 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde", + 'description': 'md5:33a4805a855c9baf7115fcbde57e7025', 'timestamp': 1434397334, 'upload_date': '20150615', }, @@ -600,6 +597,7 @@ class BBCIE(BBCCoUkIE): 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw', 'ext': 'mp4', 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', + 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8', 'timestamp': 1434713142, 'upload_date': '20150619', }, @@ -649,6 +647,23 @@ class BBCIE(BBCCoUkIE): # rtmp download 'skip_download': True, } + }, { + # single video embedded with Morph + 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975', + 'info_dict': { + 'id': 'p041vhd0', + 'ext': 'mp4', + 'title': "Nigeria v Japan - Men's First Round", + 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.', + 'duration': 7980, + 'uploader': 'BBC Sport', + 'uploader_id': 'bbc_sport', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Georestricted to UK', }, { # single video with playlist.sxml URL in playlist param 'url': 'http://www.bbc.com/sport/0/football/33653409', @@ -746,7 +761,7 @@ class BBCIE(BBCCoUkIE): webpage = self._download_webpage(url, playlist_id) - json_ld_info = self._search_json_ld(webpage, playlist_id, 
default=None) + json_ld_info = self._search_json_ld(webpage, playlist_id, default={}) timestamp = json_ld_info.get('timestamp') playlist_title = json_ld_info.get('title') @@ -815,8 +830,29 @@ class BBCIE(BBCCoUkIE): # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani) playlist = data_playable.get('otherSettings', {}).get('playlist', {}) if playlist: - entries.append(self._extract_from_playlist_sxml( - playlist.get('progressiveDownloadUrl'), playlist_id, timestamp)) + entry = None + for key in ('streaming', 'progressiveDownload'): + playlist_url = playlist.get('%sUrl' % key) + if not playlist_url: + continue + try: + info = self._extract_from_playlist_sxml( + playlist_url, playlist_id, timestamp) + if not entry: + entry = info + else: + entry['title'] = info['title'] + entry['formats'].extend(info['formats']) + except Exception as e: + # Some playlist URL may fail with 500, at the same time + # the other one may work fine (e.g. + # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500: + continue + raise + if entry: + self._sort_formats(entry['formats']) + entries.append(entry) if entries: return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) @@ -849,6 +885,50 @@ class BBCIE(BBCCoUkIE): 'subtitles': subtitles, } + # Morph based embed (e.g. 
http://www.bbc.co.uk/sport/live/olympics/36895975) + # There are several setPayload calls may be present but the video + # seems to be always related to the first one + morph_payload = self._parse_json( + self._search_regex( + r'Morph\.setPayload\([^,]+,\s*({.+?})\);', + webpage, 'morph payload', default='{}'), + playlist_id, fatal=False) + if morph_payload: + components = try_get(morph_payload, lambda x: x['body']['components'], list) or [] + for component in components: + if not isinstance(component, dict): + continue + lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict) + if not lead_media: + continue + identifiers = lead_media.get('identifiers') + if not identifiers or not isinstance(identifiers, dict): + continue + programme_id = identifiers.get('vpid') or identifiers.get('playablePid') + if not programme_id: + continue + title = lead_media.get('title') or self._og_search_title(webpage) + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + description = lead_media.get('summary') + uploader = lead_media.get('masterBrand') + uploader_id = lead_media.get('mid') + duration = None + duration_d = lead_media.get('duration') + if isinstance(duration_d, dict): + duration = parse_duration(dict_get( + duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration'))) + return { + 'id': programme_id, + 'title': title, + 'description': description, + 'duration': duration, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'formats': formats, + 'subtitles': subtitles, + } + def extract_all(pattern): return list(filter(None, map( lambda s: self._parse_json(s, playlist_id, fatal=False), @@ -866,7 +946,7 @@ class BBCIE(BBCCoUkIE): r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage)) if entries: return self.playlist_result( - [self.url_result(entry, 'BBCCoUk') for entry in entries], + [self.url_result(entry_, 'BBCCoUk') for entry_ in entries], playlist_id, playlist_title, playlist_description) # Multiple video 
article (e.g. http://www.bbc.com/news/world-europe-32668511) @@ -978,27 +1058,43 @@ class BBCCoUkArticleIE(InfoExtractor): class BBCCoUkPlaylistBaseIE(InfoExtractor): + def _entries(self, webpage, url, playlist_id): + single_page = 'page' in compat_urlparse.parse_qs( + compat_urlparse.urlparse(url).query) + for page_num in itertools.count(2): + for video_id in re.findall( + self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage): + yield self.url_result( + self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key()) + if single_page: + return + next_page = self._search_regex( + r']+class=(["\'])pagination_+next\1[^>]*>]+href=(["\'])(?P(?:(?!\2).)+)\2', + webpage, 'next page url', default=None, group='url') + if not next_page: + break + webpage = self._download_webpage( + compat_urlparse.urljoin(url, next_page), playlist_id, + 'Downloading page %d' % page_num, page_num) + def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) - entries = [ - self.url_result(self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key()) - for video_id in re.findall( - self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage)] - title, description = self._extract_title_and_description(webpage) - return self.playlist_result(entries, playlist_id, title, description) + return self.playlist_result( + self._entries(webpage, url, playlist_id), + playlist_id, title, description) class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): IE_NAME = 'bbc.co.uk:iplayer:playlist' - _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/episodes/(?P%s)' % BBCCoUkIE._ID_REGEX + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P%s)' % BBCCoUkIE._ID_REGEX _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s' _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)' - _TEST = { + _TESTS = [{ 'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v', 'info_dict': { 'id': 'b05rcz9v', @@ -1006,7 +1102,17 @@ class 
BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): 'description': 'French thriller serial about a missing teenager.', }, 'playlist_mincount': 6, - } + 'skip': 'This programme is not currently available on BBC iPlayer', + }, { + # Available for over a year unlike 30 days for most other programmes + 'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32', + 'info_dict': { + 'id': 'p02tcc32', + 'title': 'Bohemian Icons', + 'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7', + }, + 'playlist_mincount': 10, + }] def _extract_title_and_description(self, webpage): title = self._search_regex(r'

([^<]+)

', webpage, 'title', fatal=False) @@ -1029,6 +1135,24 @@ class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE): 'description': 'French thriller serial about a missing teenager.', }, 'playlist_mincount': 7, + }, { + # multipage playlist, explicit page + 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1', + 'info_dict': { + 'id': 'b00mfl7n', + 'title': 'Frozen Planet - Clips - BBC One', + 'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c', + }, + 'playlist_mincount': 24, + }, { + # multipage playlist, all pages + 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips', + 'info_dict': { + 'id': 'b00mfl7n', + 'title': 'Frozen Planet - Clips - BBC One', + 'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c', + }, + 'playlist_mincount': 142, }, { 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06', 'only_matching': True, diff --git a/youtube_dl/extractor/bet.py b/youtube_dl/extractor/bet.py index bd3ee2e2e..1f8ef0303 100644 --- a/youtube_dl/extractor/bet.py +++ b/youtube_dl/extractor/bet.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor from ..utils import unified_strdate -from ..compat import compat_urllib_parse_urlencode class BetIE(MTVServicesInfoExtractor): @@ -53,9 +52,9 @@ class BetIE(MTVServicesInfoExtractor): _FEED_URL = "http://feeds.mtvnservices.com/od/feed/bet-mrss-player" def _get_feed_query(self, uri): - return compat_urllib_parse_urlencode({ + return { 'uuid': uri, - }) + } def _extract_mgid(self, webpage): return self._search_regex(r'data-uri="([^"]+)', webpage, 'mgid') diff --git a/youtube_dl/extractor/bigflix.py b/youtube_dl/extractor/bigflix.py index 33762ad93..b4ce767af 100644 --- a/youtube_dl/extractor/bigflix.py +++ b/youtube_dl/extractor/bigflix.py @@ -11,22 +11,13 @@ from ..compat import compat_urllib_parse_unquote class BigflixIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?bigflix\.com/.+/(?P[0-9]+)' _TESTS = [{ - 'url': 
'http://www.bigflix.com/Hindi-movies/Action-movies/Singham-Returns/16537', - 'md5': 'ec76aa9b1129e2e5b301a474e54fab74', - 'info_dict': { - 'id': '16537', - 'ext': 'mp4', - 'title': 'Singham Returns', - 'description': 'md5:3d2ba5815f14911d5cc6a501ae0cf65d', - } - }, { # 2 formats 'url': 'http://www.bigflix.com/Tamil-movies/Drama-movies/Madarasapatinam/16070', 'info_dict': { 'id': '16070', 'ext': 'mp4', 'title': 'Madarasapatinam', - 'description': 'md5:63b9b8ed79189c6f0418c26d9a3452ca', + 'description': 'md5:9f0470b26a4ba8e824c823b5d95c2f6b', 'formats': 'mincount:2', }, 'params': { diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index b17047b39..a332fbb69 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -1,22 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import calendar -import datetime +import hashlib import re from .common import InfoExtractor -from ..compat import ( - compat_etree_fromstring, - compat_str, - compat_parse_qs, - compat_xml_parse_error, -) +from ..compat import compat_parse_qs from ..utils import ( - ExtractorError, int_or_none, float_or_none, - xpath_text, + unified_timestamp, ) @@ -25,13 +18,13 @@ class BiliBiliIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', - 'md5': '5f7d29e1a2872f3df0cf76b1f87d3788', + 'md5': '9fa226fe2b8a9a4d5a69b4c6a183417e', 'info_dict': { - 'id': '1554319', - 'ext': 'flv', + 'id': '1074402', + 'ext': 'mp4', 'title': '【金坷垃】金泡沫', 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', - 'duration': 308.067, + 'duration': 308.315, 'timestamp': 1398012660, 'upload_date': '20140420', 'thumbnail': 're:^https?://.+\.jpg', @@ -42,76 +35,42 @@ class BiliBiliIE(InfoExtractor): 'url': 'http://www.bilibili.com/video/av1041170/', 'info_dict': { 'id': '1041170', + 'ext': 'mp4', 'title': '【BD1080P】刀语【诸神&异域】', 'description': '这是个神奇的故事~每个人不留弹幕不给走哦~切利哦!~', + 'duration': 3382.259, + 'timestamp': 1396530060, + 
'upload_date': '20140403', + 'thumbnail': 're:^https?://.+\.jpg', + 'uploader': '枫叶逝去', + 'uploader_id': '520116', }, - 'playlist_count': 9, }, { 'url': 'http://www.bilibili.com/video/av4808130/', 'info_dict': { 'id': '4808130', + 'ext': 'mp4', 'title': '【长篇】哆啦A梦443【钉铛】', 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', + 'duration': 1493.995, + 'timestamp': 1464564180, + 'upload_date': '20160529', + 'thumbnail': 're:^https?://.+\.jpg', + 'uploader': '喜欢拉面', + 'uploader_id': '151066', }, - 'playlist': [{ - 'md5': '55cdadedf3254caaa0d5d27cf20a8f9c', - 'info_dict': { - 'id': '4808130_part1', - 'ext': 'flv', - 'title': '【长篇】哆啦A梦443【钉铛】', - 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', - 'timestamp': 1464564180, - 'upload_date': '20160529', - 'uploader': '喜欢拉面', - 'uploader_id': '151066', - }, - }, { - 'md5': '926f9f67d0c482091872fbd8eca7ea3d', - 'info_dict': { - 'id': '4808130_part2', - 'ext': 'flv', - 'title': '【长篇】哆啦A梦443【钉铛】', - 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', - 'timestamp': 1464564180, - 'upload_date': '20160529', - 'uploader': '喜欢拉面', - 'uploader_id': '151066', - }, - }, { - 'md5': '4b7b225b968402d7c32348c646f1fd83', - 'info_dict': { - 'id': '4808130_part3', - 'ext': 'flv', - 'title': '【长篇】哆啦A梦443【钉铛】', - 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', - 'timestamp': 1464564180, - 'upload_date': '20160529', - 'uploader': '喜欢拉面', - 'uploader_id': '151066', - }, - }, { - 'md5': '7b795e214166501e9141139eea236e91', - 'info_dict': { - 'id': '4808130_part4', - 'ext': 'flv', - 'title': '【长篇】哆啦A梦443【钉铛】', - 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 
封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', - 'timestamp': 1464564180, - 'upload_date': '20160529', - 'uploader': '喜欢拉面', - 'uploader_id': '151066', - }, - }], }, { # Missing upload time 'url': 'http://www.bilibili.com/video/av1867637/', 'info_dict': { - 'id': '2880301', - 'ext': 'flv', + 'id': '1867637', + 'ext': 'mp4', 'title': '【HDTV】【喜剧】岳父岳母真难当 (2014)【法国票房冠军】', 'description': '一个信奉天主教的法国旧式传统资产阶级家庭中有四个女儿。三个女儿却分别找了阿拉伯、犹太、中国丈夫,老夫老妻唯独期盼剩下未嫁的小女儿能找一个信奉天主教的法国白人,结果没想到小女儿找了一位非裔黑人……【这次应该不会跳帧了】', + 'duration': 5760.0, 'uploader': '黑夜为猫', 'uploader_id': '610729', + 'thumbnail': 're:^https?://.+\.jpg', }, 'params': { # Just to test metadata extraction @@ -120,86 +79,61 @@ class BiliBiliIE(InfoExtractor): 'expected_warnings': ['upload time'], }] - # BiliBili blocks keys from time to time. The current key is extracted from - # the Android client - # TODO: find the sign algorithm used in the flash player - _APP_KEY = '86385cdc024c0f6c' + _APP_KEY = '6f90a59ac58a4123' + _BILIBILI_KEY = '0bfd84cc3940035173f35e6777508326' def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - params = compat_parse_qs(self._search_regex( + cid = compat_parse_qs(self._search_regex( [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', r']+src="https://secure\.bilibili\.com/secure,([^"]+)"'], - webpage, 'player parameters')) - cid = params['cid'][0] + webpage, 'player parameters'))['cid'][0] - info_xml_str = self._download_webpage( - 'http://interface.bilibili.com/v_cdn_play', - cid, query={'appkey': self._APP_KEY, 'cid': cid}, - note='Downloading video info page') + payload = 'appkey=%s&cid=%s&otype=json&quality=2&type=mp4' % (self._APP_KEY, cid) + sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() - err_msg = None - durls = None - info_xml = None - try: - info_xml = compat_etree_fromstring(info_xml_str.encode('utf-8')) - except 
compat_xml_parse_error: - info_json = self._parse_json(info_xml_str, video_id, fatal=False) - err_msg = (info_json or {}).get('error_text') - else: - err_msg = xpath_text(info_xml, './message') - - if info_xml is not None: - durls = info_xml.findall('./durl') - if not durls: - if err_msg: - raise ExtractorError('%s said: %s' % (self.IE_NAME, err_msg), expected=True) - else: - raise ExtractorError('No videos found!') + video_info = self._download_json( + 'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign), + video_id, note='Downloading video info page') entries = [] - for durl in durls: - size = xpath_text(durl, ['./filesize', './size']) + for idx, durl in enumerate(video_info['durl']): formats = [{ - 'url': durl.find('./url').text, - 'filesize': int_or_none(size), + 'url': durl['url'], + 'filesize': int_or_none(durl['size']), }] - for backup_url in durl.findall('./backup_url/url'): + for backup_url in durl['backup_url']: formats.append({ - 'url': backup_url.text, + 'url': backup_url, # backup URLs have lower priorities - 'preference': -2 if 'hd.mp4' in backup_url.text else -3, + 'preference': -2 if 'hd.mp4' in backup_url else -3, }) self._sort_formats(formats) entries.append({ - 'id': '%s_part%s' % (cid, xpath_text(durl, './order')), - 'duration': int_or_none(xpath_text(durl, './length'), 1000), + 'id': '%s_part%s' % (video_id, idx), + 'duration': float_or_none(durl.get('length'), 1000), 'formats': formats, }) title = self._html_search_regex(']+title="([^"]+)">', webpage, 'title') description = self._html_search_meta('description', webpage) - datetime_str = self._html_search_regex( - r']+datetime="([^"]+)"', webpage, 'upload time', fatal=False) - timestamp = None - if datetime_str: - timestamp = calendar.timegm(datetime.datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M').timetuple()) + timestamp = unified_timestamp(self._html_search_regex( + r']+datetime="([^"]+)"', webpage, 'upload time', fatal=False)) # TODO 'view_count' requires deobfuscating 
Javascript info = { - 'id': compat_str(cid), + 'id': video_id, 'title': title, 'description': description, 'timestamp': timestamp, 'thumbnail': self._html_search_meta('thumbnailUrl', webpage), - 'duration': float_or_none(xpath_text(info_xml, './timelength'), scale=1000), + 'duration': float_or_none(video_info.get('timelength'), scale=1000), } uploader_mobj = re.search( diff --git a/youtube_dl/extractor/biqle.py b/youtube_dl/extractor/biqle.py index ae4579b33..beaebfd2a 100644 --- a/youtube_dl/extractor/biqle.py +++ b/youtube_dl/extractor/biqle.py @@ -24,7 +24,8 @@ class BIQLEIE(InfoExtractor): 'ext': 'mp4', 'title': 'Ребенок в шоке от автоматической мойки', 'uploader': 'Dmitry Kotov', - } + }, + 'skip': ' This video was marked as adult. Embedding adult videos on external sites is prohibited.', }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index bd538be50..2a8cd64b9 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -20,6 +21,18 @@ class BloombergIE(InfoExtractor): 'params': { 'format': 'best[format_id^=hds]', }, + }, { + # video ID in BPlayer(...) 
+ 'url': 'http://www.bloomberg.com/features/2016-hello-world-new-zealand/', + 'info_dict': { + 'id': '938c7e72-3f25-4ddb-8b85-a9be731baa74', + 'ext': 'flv', + 'title': 'Meet the Real-Life Tech Wizards of Middle Earth', + 'description': 'Hello World, Episode 1: New Zealand’s freaky AI babies, robot exoskeletons, and a virtual you.', + }, + 'params': { + 'format': 'best[format_id^=hds]', + }, }, { 'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets', 'only_matching': True, @@ -33,7 +46,11 @@ class BloombergIE(InfoExtractor): webpage = self._download_webpage(url, name) video_id = self._search_regex( r'["\']bmmrId["\']\s*:\s*(["\'])(?P.+?)\1', - webpage, 'id', group='url') + webpage, 'id', group='url', default=None) + if not video_id: + bplayer_data = self._parse_json(self._search_regex( + r'BPlayer\(null,\s*({[^;]+})\);', webpage, 'id'), name) + video_id = bplayer_data['id'] title = re.sub(': Video$', '', self._og_search_title(webpage)) embed_info = self._download_json( diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py index 541c76944..a25d500e4 100644 --- a/youtube_dl/extractor/bravotv.py +++ b/youtube_dl/extractor/bravotv.py @@ -1,31 +1,74 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import smuggle_url +from .adobepass import AdobePassIE +from ..utils import ( + smuggle_url, + update_url_query, + int_or_none, +) -class BravoTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+videos/(?P[^/?]+)' - _TEST = { +class BravoTVIE(AdobePassIE): + _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+(?P[^/?#]+)' + _TESTS = [{ 'url': 'http://www.bravotv.com/last-chance-kitchen/season-5/videos/lck-ep-12-fishy-finale', - 'md5': 'd60cdf68904e854fac669bd26cccf801', + 'md5': '9086d0b7ef0ea2aabc4781d75f4e5863', 'info_dict': { - 'id': 'LitrBdX64qLn', + 'id': 'zHyk1_HU_mPy', 'ext': 'mp4', - 
'title': 'Last Chance Kitchen Returns', - 'description': 'S13: Last Chance Kitchen Returns for Top Chef Season 13', - 'timestamp': 1448926740, - 'upload_date': '20151130', + 'title': 'LCK Ep 12: Fishy Finale', + 'description': 'S13/E12: Two eliminated chefs have just 12 minutes to cook up a delicious fish dish.', 'uploader': 'NBCU-BRAV', + 'upload_date': '20160302', + 'timestamp': 1456945320, } - } + }, { + 'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - account_pid = self._search_regex(r'"account_pid"\s*:\s*"([^"]+)"', webpage, 'account pid') - release_pid = self._search_regex(r'"release_pid"\s*:\s*"([^"]+)"', webpage, 'release pid') - return self.url_result(smuggle_url( - 'http://link.theplatform.com/s/%s/%s?mbr=true&switch=progressive' % (account_pid, release_pid), - {'force_smil_url': True}), 'ThePlatform', release_pid) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + settings = self._parse_json(self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', webpage, 'drupal settings'), + display_id) + info = {} + query = { + 'mbr': 'true', + } + account_pid, release_pid = [None] * 2 + tve = settings.get('sharedTVE') + if tve: + query['manifest'] = 'm3u' + account_pid = 'HNK2IC' + release_pid = tve['release_pid'] + if tve.get('entitlement') == 'auth': + adobe_pass = settings.get('adobePass', {}) + resource = self._get_mvpd_resource( + adobe_pass.get('adobePassResourceId', 'bravo'), + tve['title'], release_pid, tve.get('rating')) + query['auth'] = self._extract_mvpd_auth( + url, release_pid, adobe_pass.get('adobePassRequestorId', 'bravo'), resource) + else: + shared_playlist = settings['shared_playlist'] + account_pid = shared_playlist['account_pid'] + metadata = shared_playlist['video_metadata'][shared_playlist['default_clip']] + release_pid 
= metadata['release_pid'] + info.update({ + 'title': metadata['title'], + 'description': metadata.get('description'), + 'season_number': int_or_none(metadata.get('season_num')), + 'episode_number': int_or_none(metadata.get('episode_num')), + }) + query['switch'] = 'progressive' + info.update({ + '_type': 'url_transparent', + 'id': release_pid, + 'url': smuggle_url(update_url_query( + 'http://link.theplatform.com/s/%s/%s' % (account_pid, release_pid), + query), {'force_smil_url': True}), + 'ie_key': 'ThePlatform', + }) + return info diff --git a/youtube_dl/extractor/camdemy.py b/youtube_dl/extractor/camdemy.py index 6ffbeabd3..268c34392 100644 --- a/youtube_dl/extractor/camdemy.py +++ b/youtube_dl/extractor/camdemy.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import datetime import re from .common import InfoExtractor @@ -10,8 +9,10 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( - parse_iso8601, + clean_html, + parse_duration, str_to_int, + unified_strdate, ) @@ -26,14 +27,14 @@ class CamdemyIE(InfoExtractor): 'ext': 'mp4', 'title': 'Ch1-1 Introduction, Signals (02-23-2012)', 'thumbnail': 're:^https?://.*\.jpg$', - 'description': '', 'creator': 'ss11spring', + 'duration': 1591, 'upload_date': '20130114', - 'timestamp': 1358154556, 'view_count': int, } }, { # With non-empty description + # webpage returns "No permission or not login" 'url': 'http://www.camdemy.com/media/13885', 'md5': '4576a3bb2581f86c61044822adbd1249', 'info_dict': { @@ -41,64 +42,71 @@ class CamdemyIE(InfoExtractor): 'ext': 'mp4', 'title': 'EverCam + Camdemy QuickStart', 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'md5:050b62f71ed62928f8a35f1a41e186c9', + 'description': 'md5:2a9f989c2b153a2342acee579c6e7db6', 'creator': 'evercam', - 'upload_date': '20140620', - 'timestamp': 1403271569, + 'duration': 318, } }, { - # External source + # External source (YouTube) 'url': 'http://www.camdemy.com/media/14842', - 'md5': 
'50e1c3c3aa233d3d7b7daa2fa10b1cf7', 'info_dict': { 'id': '2vsYQzNIsJo', 'ext': 'mp4', + 'title': 'Excel 2013 Tutorial - How to add Password Protection', + 'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection', 'upload_date': '20130211', 'uploader': 'Hun Kim', - 'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection', 'uploader_id': 'hunkimtutorials', - 'title': 'Excel 2013 Tutorial - How to add Password Protection', - } + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): video_id = self._match_id(url) - page = self._download_webpage(url, video_id) + + webpage = self._download_webpage(url, video_id) src_from = self._html_search_regex( - r"
Source: ]*>Sources?(?:\s+from)?\s*:\s*]+(?:href|title)=(['\"])(?P(?:(?!\1).)+)\1", + webpage, 'external source', default=None, group='url') if src_from: return self.url_result(src_from) oembed_obj = self._download_json( 'http://www.camdemy.com/oembed/?format=json&url=' + url, video_id) + title = oembed_obj['title'] thumb_url = oembed_obj['thumbnail_url'] video_folder = compat_urlparse.urljoin(thumb_url, 'video/') file_list_doc = self._download_xml( compat_urlparse.urljoin(video_folder, 'fileList.xml'), - video_id, 'Filelist XML') + video_id, 'Downloading filelist XML') file_name = file_list_doc.find('./video/item/fileName').text video_url = compat_urlparse.urljoin(video_folder, file_name) - timestamp = parse_iso8601(self._html_search_regex( - r"
Posted\s*:
\s*
([^<>]+)<", - page, 'creation time', fatal=False), - delimiter=' ', timezone=datetime.timedelta(hours=8)) - view_count = str_to_int(self._html_search_regex( - r"
Views\s*:
\s*
([^<>]+)<", - page, 'view count', fatal=False)) + # Some URLs return "No permission or not login" in a webpage despite being + # freely available via oembed JSON URL (e.g. http://www.camdemy.com/media/13885) + upload_date = unified_strdate(self._search_regex( + r'>published on ([^<]+)<', webpage, + 'upload date', default=None)) + view_count = str_to_int(self._search_regex( + r'role=["\']viewCnt["\'][^>]*>([\d,.]+) views', + webpage, 'view count', default=None)) + description = self._html_search_meta( + 'description', webpage, default=None) or clean_html( + oembed_obj.get('description')) return { 'id': video_id, 'url': video_url, - 'title': oembed_obj['title'], + 'title': title, 'thumbnail': thumb_url, - 'description': self._html_search_meta('description', page), - 'creator': oembed_obj['author_name'], - 'duration': oembed_obj['duration'], - 'timestamp': timestamp, + 'description': description, + 'creator': oembed_obj.get('author_name'), + 'duration': parse_duration(oembed_obj.get('duration')), + 'upload_date': upload_date, 'view_count': view_count, } diff --git a/youtube_dl/extractor/cartoonnetwork.py b/youtube_dl/extractor/cartoonnetwork.py new file mode 100644 index 000000000..b3f30b1ca --- /dev/null +++ b/youtube_dl/extractor/cartoonnetwork.py @@ -0,0 +1,36 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .turner import TurnerBaseIE + + +class CartoonNetworkIE(TurnerBaseIE): + _VALID_URL = r'https?://(?:www\.)?cartoonnetwork\.com/video/(?:[^/]+/)+(?P[^/?#]+)-(?:clip|episode)\.html' + _TEST = { + 'url': 'http://www.cartoonnetwork.com/video/teen-titans-go/starfire-the-cat-lady-clip.html', + 'info_dict': { + 'id': '8a250ab04ed07e6c014ef3f1e2f9016c', + 'ext': 'mp4', + 'title': 'Starfire the Cat Lady', + 'description': 'Robin decides to become a cat so that Starfire will finally love him.', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + 
webpage = self._download_webpage(url, display_id) + id_type, video_id = re.search(r"_cnglobal\.cvp(Video|Title)Id\s*=\s*'([^']+)';", webpage).groups() + query = ('id' if id_type == 'Video' else 'titleId') + '=' + video_id + return self._extract_cvp_info( + 'http://www.cartoonnetwork.com/video-seo-svc/episodeservices/getCvpPlaylist?networkName=CN2&' + query, video_id, { + 'secure': { + 'media_src': 'http://apple-secure.cdn.turner.com/toon/big', + 'tokenizer_src': 'http://www.cartoonnetwork.com/cntv/mvpd/processors/services/token_ipadAdobe.do', + }, + }) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index ff663d079..d71fddf58 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -4,13 +4,24 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( js_to_json, smuggle_url, + try_get, + xpath_text, + xpath_element, + xpath_with_ns, + find_xpath_attr, + parse_iso8601, + parse_age_limit, + int_or_none, + ExtractorError, ) class CBCIE(InfoExtractor): + IE_NAME = 'cbc.ca' _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?!player/)(?:[^/]+/)+(?P[^/?#]+)' _TESTS = [{ # with mediaId @@ -25,8 +36,22 @@ class CBCIE(InfoExtractor): 'upload_date': '20160203', 'uploader': 'CBCC-NEW', }, + 'skip': 'Geo-restricted to Canada', }, { - # with clipId + # with clipId, feed available via tpfeed.cbc.ca and feed.theplatform.com + 'url': 'http://www.cbc.ca/22minutes/videos/22-minutes-update/22-minutes-update-episode-4', + 'md5': '162adfa070274b144f4fdc3c3b8207db', + 'info_dict': { + 'id': '2414435309', + 'ext': 'mp4', + 'title': '22 Minutes Update: What Not To Wear Quebec', + 'description': "This week's latest Canadian top political story is What Not To Wear Quebec.", + 'upload_date': '20131025', + 'uploader': 'CBCC-NEW', + 'timestamp': 1382717907, + }, + }, { + # with clipId, feed only available via tpfeed.cbc.ca 'url': 
'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live', 'md5': '0274a90b51a9b4971fe005c63f592f12', 'info_dict': { @@ -64,6 +89,7 @@ class CBCIE(InfoExtractor): 'uploader': 'CBCC-NEW', }, }], + 'skip': 'Geo-restricted to Canada', }] @classmethod @@ -81,9 +107,15 @@ class CBCIE(InfoExtractor): media_id = player_info.get('mediaId') if not media_id: clip_id = player_info['clipId'] - media_id = self._download_json( - 'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id, - clip_id)['entries'][0]['id'].split('/')[-1] + feed = self._download_json( + 'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={:mpsReleases}{%s}' % clip_id, + clip_id, fatal=False) + if feed: + media_id = try_get(feed, lambda x: x['entries'][0]['guid'], compat_str) + if not media_id: + media_id = self._download_json( + 'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id, + clip_id)['entries'][0]['id'].split('/')[-1] return self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) else: entries = [self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) for media_id in re.findall(r']+src="[^"]+?mediaId=(\d+)"', webpage)] @@ -91,6 +123,7 @@ class CBCIE(InfoExtractor): class CBCPlayerIE(InfoExtractor): + IE_NAME = 'cbc.ca:player' _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P\d+)' _TESTS = [{ 'url': 'http://www.cbc.ca/player/play/2683190193', @@ -104,6 +137,7 @@ class CBCPlayerIE(InfoExtractor): 'upload_date': '20160210', 'uploader': 'CBCC-NEW', }, + 'skip': 'Geo-restricted to Canada', }, { # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/ 'url': 'http://www.cbc.ca/player/play/2657631896', @@ -143,3 +177,165 @@ class CBCPlayerIE(InfoExtractor): }), 'id': video_id, } + + +class CBCWatchBaseIE(InfoExtractor): + _device_id = None + 
_device_token = None + _API_BASE_URL = 'https://api-cbc.cloud.clearleap.com/cloffice/client/' + _NS_MAP = { + 'media': 'http://search.yahoo.com/mrss/', + 'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/', + } + + def _call_api(self, path, video_id): + url = path if path.startswith('http') else self._API_BASE_URL + path + result = self._download_xml(url, video_id, headers={ + 'X-Clearleap-DeviceId': self._device_id, + 'X-Clearleap-DeviceToken': self._device_token, + }) + error_message = xpath_text(result, 'userMessage') or xpath_text(result, 'systemMessage') + if error_message: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message)) + return result + + def _real_initialize(self): + if not self._device_id or not self._device_token: + device = self._downloader.cache.load('cbcwatch', 'device') or {} + self._device_id, self._device_token = device.get('id'), device.get('token') + if not self._device_id or not self._device_token: + result = self._download_xml( + self._API_BASE_URL + 'device/register', + None, data=b'web') + self._device_id = xpath_text(result, 'deviceId', fatal=True) + self._device_token = xpath_text(result, 'deviceToken', fatal=True) + self._downloader.cache.store( + 'cbcwatch', 'device', { + 'id': self._device_id, + 'token': self._device_token, + }) + + def _parse_rss_feed(self, rss): + channel = xpath_element(rss, 'channel', fatal=True) + + def _add_ns(path): + return xpath_with_ns(path, self._NS_MAP) + + entries = [] + for item in channel.findall('item'): + guid = xpath_text(item, 'guid', fatal=True) + title = xpath_text(item, 'title', fatal=True) + + media_group = xpath_element(item, _add_ns('media:group'), fatal=True) + content = xpath_element(media_group, _add_ns('media:content'), fatal=True) + content_url = content.attrib['url'] + + thumbnails = [] + for thumbnail in media_group.findall(_add_ns('media:thumbnail')): + thumbnail_url = thumbnail.get('url') + if not thumbnail_url: + continue + thumbnails.append({ + 'id': 
thumbnail.get('profile'), + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + timestamp = None + release_date = find_xpath_attr( + item, _add_ns('media:credit'), 'role', 'releaseDate') + if release_date is not None: + timestamp = parse_iso8601(release_date.text) + + entries.append({ + '_type': 'url_transparent', + 'url': content_url, + 'id': guid, + 'title': title, + 'description': xpath_text(item, 'description'), + 'timestamp': timestamp, + 'duration': int_or_none(content.get('duration')), + 'age_limit': parse_age_limit(xpath_text(item, _add_ns('media:rating'))), + 'episode': xpath_text(item, _add_ns('clearleap:episode')), + 'episode_number': int_or_none(xpath_text(item, _add_ns('clearleap:episodeInSeason'))), + 'series': xpath_text(item, _add_ns('clearleap:series')), + 'season_number': int_or_none(xpath_text(item, _add_ns('clearleap:season'))), + 'thumbnails': thumbnails, + 'ie_key': 'CBCWatchVideo', + }) + + return self.playlist_result( + entries, xpath_text(channel, 'guid'), + xpath_text(channel, 'title'), + xpath_text(channel, 'description')) + + +class CBCWatchVideoIE(CBCWatchBaseIE): + IE_NAME = 'cbc.ca:watch:video' + _VALID_URL = r'https?://api-cbc\.cloud\.clearleap\.com/cloffice/client/web/play/?\?.*?\bcontentId=(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + + def _real_extract(self, url): + video_id = self._match_id(url) + result = self._call_api(url, video_id) + + m3u8_url = xpath_text(result, 'url', fatal=True) + formats = self._extract_m3u8_formats(re.sub(r'/([^/]+)/[^/?]+\.m3u8', r'/\1/\1.m3u8', m3u8_url), video_id, 'mp4', fatal=False) + if len(formats) < 2: + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + # Despite metadata in m3u8 all video+audio formats are + # actually video-only (no audio) + for f in formats: + if f.get('acodec') != 'none' and f.get('vcodec') != 'none': + f['acodec'] = 'none' + self._sort_formats(formats) + + info = { 
+ 'id': video_id, + 'title': video_id, + 'formats': formats, + } + + rss = xpath_element(result, 'rss') + if rss: + info.update(self._parse_rss_feed(rss)['entries'][0]) + del info['url'] + del info['_type'] + del info['ie_key'] + return info + + +class CBCWatchIE(CBCWatchBaseIE): + IE_NAME = 'cbc.ca:watch' + _VALID_URL = r'https?://watch\.cbc\.ca/(?:[^/]+/)+(?P[0-9a-f-]+)' + _TESTS = [{ + 'url': 'http://watch.cbc.ca/doc-zone/season-6/customer-disservice/38e815a-009e3ab12e4', + 'info_dict': { + 'id': '38e815a-009e3ab12e4', + 'ext': 'mp4', + 'title': 'Customer (Dis)Service', + 'description': 'md5:8bdd6913a0fe03d4b2a17ebe169c7c87', + 'upload_date': '20160219', + 'timestamp': 1455840000, + }, + 'params': { + # m3u8 download + 'skip_download': True, + 'format': 'bestvideo', + }, + 'skip': 'Geo-restricted to Canada', + }, { + 'url': 'http://watch.cbc.ca/arthur/all/1ed4b385-cd84-49cf-95f0-80f004680057', + 'info_dict': { + 'id': '1ed4b385-cd84-49cf-95f0-80f004680057', + 'title': 'Arthur', + 'description': 'Arthur, the sweetest 8-year-old aardvark, and his pals solve all kinds of problems with humour, kindness and teamwork.', + }, + 'playlist_mincount': 30, + 'skip': 'Geo-restricted to Canada', + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + rss = self._call_api('web/browse/' + video_id, video_id) + return self._parse_rss_feed(rss) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index a23173d6f..3f4dea40c 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -4,6 +4,7 @@ from .theplatform import ThePlatformFeedIE from ..utils import ( int_or_none, find_xpath_attr, + ExtractorError, ) @@ -17,19 +18,6 @@ class CBSBaseIE(ThePlatformFeedIE): }] } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else [] - def _extract_video_info(self, filter_query, video_id): - return self._extract_feed_info( - 'dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id, lambda entry: { - 'series': 
entry.get('cbs$SeriesTitle'), - 'season_number': int_or_none(entry.get('cbs$SeasonNumber')), - 'episode': entry.get('cbs$EpisodeTitle'), - 'episode_number': int_or_none(entry.get('cbs$EpisodeNumber')), - }, { - 'StreamPack': { - 'manifest': 'm3u', - } - }) - class CBSIE(CBSBaseIE): _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P[\w-]+)' @@ -38,7 +26,6 @@ class CBSIE(CBSBaseIE): 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', 'info_dict': { 'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_', - 'display_id': 'connect-chat-feat-garth-brooks', 'ext': 'mp4', 'title': 'Connect Chat feat. Garth Brooks', 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', @@ -47,7 +34,10 @@ class CBSIE(CBSBaseIE): 'upload_date': '20131127', 'uploader': 'CBSI-NEW', }, - 'expected_warnings': ['Failed to download m3u8 information'], + 'params': { + # m3u8 download + 'skip_download': True, + }, '_skip': 'Blocked outside the US', }, { 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/', @@ -56,8 +46,31 @@ class CBSIE(CBSBaseIE): 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', 'only_matching': True, }] - TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' + + def _extract_video_info(self, guid): + path = 'dJ5BDC/media/guid/2198311517/' + guid + smil_url = 'http://link.theplatform.com/s/%s?mbr=true' % path + formats, subtitles = self._extract_theplatform_smil(smil_url + '&manifest=m3u', guid) + for r in ('OnceURL&formats=M3U', 'HLS&formats=M3U', 'RTMP', 'WIFI', '3G'): + try: + tp_formats, _ = self._extract_theplatform_smil(smil_url + '&assetTypes=' + r, guid, 'Downloading %s SMIL data' 
% r.split('&')[0]) + formats.extend(tp_formats) + except ExtractorError: + continue + self._sort_formats(formats) + metadata = self._download_theplatform_metadata(path, guid) + info = self._parse_theplatform_metadata(metadata) + info.update({ + 'id': guid, + 'formats': formats, + 'subtitles': subtitles, + 'series': metadata.get('cbs$SeriesTitle'), + 'season_number': int_or_none(metadata.get('cbs$SeasonNumber')), + 'episode': metadata.get('cbs$EpisodeTitle'), + 'episode_number': int_or_none(metadata.get('cbs$EpisodeNumber')), + }) + return info def _real_extract(self, url): content_id = self._match_id(url) - return self._extract_video_info('byGuid=%s' % content_id, content_id) + return self._extract_video_info(content_id) diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py index 74adb38a6..4bcd104af 100644 --- a/youtube_dl/extractor/cbslocal.py +++ b/youtube_dl/extractor/cbslocal.py @@ -1,12 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals -import calendar -import datetime - from .anvato import AnvatoIE from .sendtonews import SendtoNewsIE from ..compat import compat_urlparse +from ..utils import unified_timestamp class CBSLocalIE(AnvatoIE): @@ -43,13 +41,8 @@ class CBSLocalIE(AnvatoIE): 'url': 'http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/', 'info_dict': { 'id': 'GxfCe0Zo7D-175909-5588', - 'ext': 'mp4', - 'title': 'Recap: CLE 15, CIN 6', - 'description': '5/16/16: Indians\' bats explode for 15 runs in a win', - 'upload_date': '20160516', - 'timestamp': 1463433840, - 'duration': 49, }, + 'playlist_count': 9, 'params': { # m3u8 download 'skip_download': True, @@ -62,19 +55,15 @@ class CBSLocalIE(AnvatoIE): sendtonews_url = SendtoNewsIE._extract_url(webpage) if sendtonews_url: - info_dict = { - '_type': 'url_transparent', - 'url': compat_urlparse.urljoin(url, sendtonews_url), - } - else: - info_dict = self._extract_anvato_videos(webpage, display_id) + 
return self.url_result( + compat_urlparse.urljoin(url, sendtonews_url), + ie=SendtoNewsIE.ie_key()) + + info_dict = self._extract_anvato_videos(webpage, display_id) time_str = self._html_search_regex( r'class="entry-date">([^<]+)<', webpage, 'released date', fatal=False) - timestamp = None - if time_str: - timestamp = calendar.timegm(datetime.datetime.strptime( - time_str, '%b %d, %Y %I:%M %p').timetuple()) + timestamp = unified_timestamp(time_str) info_dict.update({ 'display_id': display_id, diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 387537e76..4aa6917a0 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -2,13 +2,13 @@ from __future__ import unicode_literals from .common import InfoExtractor -from .cbs import CBSBaseIE +from .cbs import CBSIE from ..utils import ( parse_duration, ) -class CBSNewsIE(CBSBaseIE): +class CBSNewsIE(CBSIE): IE_DESC = 'CBS News' _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P[\da-z_-]+)' @@ -26,6 +26,7 @@ class CBSNewsIE(CBSBaseIE): # rtmp download 'skip_download': True, }, + 'skip': 'Subscribers only', }, { 'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/', @@ -34,7 +35,8 @@ class CBSNewsIE(CBSBaseIE): 'ext': 'mp4', 'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack', 'description': 'md5:4a6983e480542d8b333a947bfc64ddc7', - 'upload_date': '19700101', + 'upload_date': '20140404', + 'timestamp': 1396650660, 'uploader': 'CBSI-NEW', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 205, @@ -62,13 +64,14 @@ class CBSNewsIE(CBSBaseIE): item = video_info['item'] if 'item' in video_info else video_info guid = item['mpxRefId'] - return self._extract_video_info('byGuid=%s' % guid, guid) + return self._extract_video_info(guid) class CBSNewsLiveVideoIE(InfoExtractor): IE_DESC = 'CBS News Live Videos' _VALID_URL = 
r'https?://(?:www\.)?cbsnews\.com/live/video/(?P[\da-z_-]+)' + # Live videos get deleted soon. See http://www.cbsnews.com/live/ for the latest examples _TEST = { 'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/', 'info_dict': { @@ -77,6 +80,7 @@ class CBSNewsLiveVideoIE(InfoExtractor): 'title': 'Clinton, Sanders Prepare To Face Off In NH', 'duration': 334, }, + 'skip': 'Video gone', } def _real_extract(self, url): diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py index 78ca44b02..bf7915626 100644 --- a/youtube_dl/extractor/cbssports.py +++ b/youtube_dl/extractor/cbssports.py @@ -23,6 +23,9 @@ class CBSSportsIE(CBSBaseIE): } }] + def _extract_video_info(self, filter_query, video_id): + return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id) + def _real_extract(self, url): video_id = self._match_id(url) return self._extract_video_info('byId=%s' % video_id, video_id) diff --git a/youtube_dl/extractor/charlierose.py b/youtube_dl/extractor/charlierose.py new file mode 100644 index 000000000..4bf2cf7b0 --- /dev/null +++ b/youtube_dl/extractor/charlierose.py @@ -0,0 +1,51 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import remove_end + + +class CharlieRoseIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?charlierose\.com/video(?:s|/player)/(?P\d+)' + _TESTS = [{ + 'url': 'https://charlierose.com/videos/27996', + 'md5': 'fda41d49e67d4ce7c2411fd2c4702e09', + 'info_dict': { + 'id': '27996', + 'ext': 'mp4', + 'title': 'Remembering Zaha Hadid', + 'thumbnail': 're:^https?://.*\.jpg\?\d+', + 'description': 'We revisit past conversations with Zaha Hadid, in memory of the world renowned Iraqi architect.', + 'subtitles': { + 'en': [{ + 'ext': 'vtt', + }], + }, + }, + }, { + 'url': 'https://charlierose.com/videos/27996', + 'only_matching': True, + }] + + _PLAYER_BASE = 'https://charlierose.com/video/player/%s' + + def 
_real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(self._PLAYER_BASE % video_id, video_id) + + title = remove_end(self._og_search_title(webpage), ' - Charlie Rose') + + info_dict = self._parse_html5_media_entries( + self._PLAYER_BASE % video_id, webpage, video_id, + m3u8_entry_protocol='m3u8_native')[0] + + self._sort_formats(info_dict['formats']) + self._remove_duplicate_formats(info_dict['formats']) + + info_dict.update({ + 'id': video_id, + 'title': title, + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._og_search_description(webpage), + }) + + return info_dict diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py index b2234549e..29a8820d5 100644 --- a/youtube_dl/extractor/chaturbate.py +++ b/youtube_dl/extractor/chaturbate.py @@ -17,7 +17,8 @@ class ChaturbateIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Room is offline', }, { 'url': 'https://en.chaturbate.com/siswet19/', 'only_matching': True, diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py index b1eeaf101..b43518652 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/youtube_dl/extractor/chirbit.py @@ -1,30 +1,33 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 + from .common import InfoExtractor -from ..utils import ( - parse_duration, - int_or_none, -) +from ..utils import parse_duration class ChirbitIE(InfoExtractor): IE_NAME = 'chirbit' _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P[\da-zA-Z]+)' _TESTS = [{ - 'url': 'http://chirb.it/PrIPv5', - 'md5': '9847b0dad6ac3e074568bf2cfb197de8', + 'url': 'http://chirb.it/be2abG', 'info_dict': { - 'id': 'PrIPv5', + 'id': 'be2abG', 'ext': 'mp3', - 'title': 'Фасадстрой', - 'duration': 52, - 'view_count': int, - 'comment_count': int, + 'title': 'md5:f542ea253f5255240be4da375c6a5d7e', + 'description': 'md5:f24a4e22a71763e32da5fed59e47c770', + 
'duration': 306, + }, + 'params': { + 'skip_download': True, } }, { 'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5', 'only_matching': True, + }, { + 'url': 'https://chirb.it/wp/MN58c2', + 'only_matching': True, }] def _real_extract(self, url): @@ -33,27 +36,30 @@ class ChirbitIE(InfoExtractor): webpage = self._download_webpage( 'http://chirb.it/%s' % audio_id, audio_id) - audio_url = self._search_regex( - r'"setFile"\s*,\s*"([^"]+)"', webpage, 'audio url') + data_fd = self._search_regex( + r'data-fd=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'data fd', group='url') + + # Reverse engineered from https://chirb.it/js/chirbit.player.js (look + # for soundURL) + audio_url = base64.b64decode( + data_fd[::-1].encode('ascii')).decode('utf-8') title = self._search_regex( - r'itemprop="name">([^<]+)', webpage, 'title') - duration = parse_duration(self._html_search_meta( - 'duration', webpage, 'duration', fatal=False)) - view_count = int_or_none(self._search_regex( - r'itemprop="playCount"\s*>(\d+)', webpage, - 'listen count', fatal=False)) - comment_count = int_or_none(self._search_regex( - r'>(\d+) Comments?:', webpage, - 'comment count', fatal=False)) + r'class=["\']chirbit-title["\'][^>]*>([^<]+)', webpage, 'title') + description = self._search_regex( + r'

Description

\s*]*>([^<]+)', + webpage, 'description', default=None) + duration = parse_duration(self._search_regex( + r'class=["\']c-length["\'][^>]*>([^<]+)', + webpage, 'duration', fatal=False)) return { 'id': audio_id, 'url': audio_url, 'title': title, + 'description': description, 'duration': duration, - 'view_count': view_count, - 'comment_count': comment_count, } diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index 19f8b397e..252c2e846 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -23,7 +23,7 @@ class CliphunterIE(InfoExtractor): (?P[0-9]+)/ (?P.+?)(?:$|[#\?]) ''' - _TEST = { + _TESTS = [{ 'url': 'http://www.cliphunter.com/w/1012420/Fun_Jynx_Maze_solo', 'md5': 'b7c9bbd4eb3a226ab91093714dcaa480', 'info_dict': { @@ -32,8 +32,19 @@ class CliphunterIE(InfoExtractor): 'title': 'Fun Jynx Maze solo', 'thumbnail': 're:^https?://.*\.jpg$', 'age_limit': 18, - } - } + }, + 'skip': 'Video gone', + }, { + 'url': 'http://www.cliphunter.com/w/2019449/ShesNew__My_booty_girlfriend_Victoria_Paradices_pussy_filled_with_jizz', + 'md5': '55a723c67bfc6da6b0cfa00d55da8a27', + 'info_dict': { + 'id': '2019449', + 'ext': 'mp4', + 'title': 'ShesNew - My booty girlfriend, Victoria Paradice\'s pussy filled with jizz', + 'thumbnail': 're:^https?://.*\.jpg$', + 'age_limit': 18, + }, + }] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py index 9a28ef354..ae5ba0015 100644 --- a/youtube_dl/extractor/cloudy.py +++ b/youtube_dl/extractor/cloudy.py @@ -6,7 +6,6 @@ import re from .common import InfoExtractor from ..compat import ( compat_parse_qs, - compat_urllib_parse_urlencode, compat_HTTPError, ) from ..utils import ( @@ -17,37 +16,26 @@ from ..utils import ( class CloudyIE(InfoExtractor): - _IE_DESC = 'cloudy.ec and videoraj.ch' + _IE_DESC = 'cloudy.ec' _VALID_URL = r'''(?x) - https?://(?:www\.)?(?Pcloudy\.ec|videoraj\.(?:ch|to))/ + 
https?://(?:www\.)?cloudy\.ec/ (?:v/|embed\.php\?id=) (?P[A-Za-z0-9]+) ''' - _EMBED_URL = 'http://www.%s/embed.php?id=%s' - _API_URL = 'http://www.%s/api/player.api.php?%s' + _EMBED_URL = 'http://www.cloudy.ec/embed.php?id=%s' + _API_URL = 'http://www.cloudy.ec/api/player.api.php' _MAX_TRIES = 2 - _TESTS = [ - { - 'url': 'https://www.cloudy.ec/v/af511e2527aac', - 'md5': '5cb253ace826a42f35b4740539bedf07', - 'info_dict': { - 'id': 'af511e2527aac', - 'ext': 'flv', - 'title': 'Funny Cats and Animals Compilation june 2013', - } - }, - { - 'url': 'http://www.videoraj.to/v/47f399fd8bb60', - 'md5': '7d0f8799d91efd4eda26587421c3c3b0', - 'info_dict': { - 'id': '47f399fd8bb60', - 'ext': 'flv', - 'title': 'Burning a New iPhone 5 with Gasoline - Will it Survive?', - } + _TEST = { + 'url': 'https://www.cloudy.ec/v/af511e2527aac', + 'md5': '5cb253ace826a42f35b4740539bedf07', + 'info_dict': { + 'id': 'af511e2527aac', + 'ext': 'flv', + 'title': 'Funny Cats and Animals Compilation june 2013', } - ] + } - def _extract_video(self, video_host, video_id, file_key, error_url=None, try_num=0): + def _extract_video(self, video_id, file_key, error_url=None, try_num=0): if try_num > self._MAX_TRIES - 1: raise ExtractorError('Unable to extract video URL', expected=True) @@ -64,9 +52,8 @@ class CloudyIE(InfoExtractor): 'errorUrl': error_url, }) - data_url = self._API_URL % (video_host, compat_urllib_parse_urlencode(form)) player_data = self._download_webpage( - data_url, video_id, 'Downloading player data') + self._API_URL, video_id, 'Downloading player data', query=form) data = compat_parse_qs(player_data) try_num += 1 @@ -88,7 +75,7 @@ class CloudyIE(InfoExtractor): except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code in [404, 410]: self.report_warning('Invalid video URL, requesting another', video_id) - return self._extract_video(video_host, video_id, file_key, video_url, try_num) + return self._extract_video(video_id, file_key, video_url, try_num) return { 
'id': video_id, @@ -98,14 +85,13 @@ class CloudyIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_host = mobj.group('host') video_id = mobj.group('id') - url = self._EMBED_URL % (video_host, video_id) + url = self._EMBED_URL % video_id webpage = self._download_webpage(url, video_id) file_key = self._search_regex( [r'key\s*:\s*"([^"]+)"', r'filekey\s*=\s*"([^"]+)"'], webpage, 'file_key') - return self._extract_video(video_host, video_id, file_key) + return self._extract_video(video_id, file_key) diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py index f1311b14f..f24568dcc 100644 --- a/youtube_dl/extractor/cmt.py +++ b/youtube_dl/extractor/cmt.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals + from .mtv import MTVIE +from ..utils import ExtractorError class CMTIE(MTVIE): @@ -16,7 +18,27 @@ class CMTIE(MTVIE): 'title': 'Garth Brooks - "The Call (featuring Trisha Yearwood)"', 'description': 'Blame It All On My Roots', }, + 'skip': 'Video not available', + }, { + 'url': 'http://www.cmt.com/videos/misc/1504699/still-the-king-ep-109-in-3-minutes.jhtml#id=1739908', + 'md5': 'e61a801ca4a183a466c08bd98dccbb1c', + 'info_dict': { + 'id': '1504699', + 'ext': 'mp4', + 'title': 'Still The King Ep. 109 in 3 Minutes', + 'description': 'Relive or catch up with Still The King by watching this recap of season 1, episode 9. 
New episodes Sundays 9/8c.', + 'timestamp': 1469421000.0, + 'upload_date': '20160725', + }, }, { 'url': 'http://www.cmt.com/shows/party-down-south/party-down-south-ep-407-gone-girl/1738172/playlist/#id=1738172', 'only_matching': True, }] + + @classmethod + def _transform_rtmp_url(cls, rtmp_video_url): + if 'error_not_available.swf' in rtmp_video_url: + raise ExtractorError( + '%s said: video is not available' % cls.IE_NAME, expected=True) + + return super(CMTIE, cls)._transform_rtmp_url(rtmp_video_url) diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 53489a14e..5fc311f53 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -3,15 +3,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_duration, - url_basename, -) +from .turner import TurnerBaseIE +from ..utils import url_basename -class CNNIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/ +class CNNIE(TurnerBaseIE): + _VALID_URL = r'''(?x)https?://(?:(?Pedition|www|money)\.)?cnn\.com/(?:video/(?:data/.+?|\?)/)?videos?/ (?P.+?/(?P[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))''' _TESTS = [{ @@ -25,6 +22,7 @@ class CNNIE(InfoExtractor): 'duration': 135, 'upload_date': '20130609', }, + 'expected_warnings': ['Failed to download m3u8 information'], }, { 'url': 'http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29', 'md5': 'b5cc60c60a3477d185af8f19a2a26f4e', @@ -34,7 +32,8 @@ class CNNIE(InfoExtractor): 'title': "Student's epic speech stuns new freshmen", 'description': "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"", 'upload_date': '20130821', - } + }, + 'expected_warnings': ['Failed to download m3u8 
information'], }, { 'url': 'http://www.cnn.com/video/data/2.0/video/living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln.html', 'md5': 'f14d02ebd264df951feb2400e2c25a1b', @@ -44,80 +43,61 @@ class CNNIE(InfoExtractor): 'title': 'Nashville Ep. 1: Hand crafted skateboards', 'description': 'md5:e7223a503315c9f150acac52e76de086', 'upload_date': '20141222', - } + }, + 'expected_warnings': ['Failed to download m3u8 information'], + }, { + 'url': 'http://money.cnn.com/video/news/2016/08/19/netflix-stunning-stats.cnnmoney/index.html', + 'md5': '52a515dc1b0f001cd82e4ceda32be9d1', + 'info_dict': { + 'id': '/video/news/2016/08/19/netflix-stunning-stats.cnnmoney', + 'ext': 'mp4', + 'title': '5 stunning stats about Netflix', + 'description': 'Did you know that Netflix has more than 80 million members? Here are five facts about the online video distributor that you probably didn\'t know.', + 'upload_date': '20160819', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk', 'only_matching': True, }, { 'url': 'http://cnn.com/video/?/video/us/2015/04/06/dnt-baker-refuses-anti-gay-order.wkmg', 'only_matching': True, + }, { + 'url': 'http://edition.cnn.com/videos/arts/2016/04/21/olympic-games-cultural-a-z-brazil.cnn', + 'only_matching': True, }] + _CONFIG = { + # http://edition.cnn.com/.element/apps/cvp/3.0/cfg/spider/cnn/expansion/config.xml + 'edition': { + 'data_src': 'http://edition.cnn.com/video/data/3.0/video/%s/index.xml', + 'media_src': 'http://pmd.cdn.turner.com/cnn/big', + }, + # http://money.cnn.com/.element/apps/cvp2/cfg/config.xml + 'money': { + 'data_src': 'http://money.cnn.com/video/data/4.0/video/%s.xml', + 'media_src': 'http://ht3.cdn.turner.com/money/big', + }, + } + + def _extract_timestamp(self, video_data): + # TODO: fix timestamp extraction + return None + def _real_extract(self, url): - mobj = 
re.match(self._VALID_URL, url) - path = mobj.group('path') - page_title = mobj.group('title') - info_url = 'http://edition.cnn.com/video/data/3.0/%s/index.xml' % path - info = self._download_xml(info_url, page_title) - - formats = [] - rex = re.compile(r'''(?x) - (?P<width>[0-9]+)x(?P<height>[0-9]+) - (?:_(?P<bitrate>[0-9]+)k)? - ''') - for f in info.findall('files/file'): - video_url = 'http://ht.cdn.turner.com/cnn/big%s' % (f.text.strip()) - fdct = { - 'format_id': f.attrib['bitrate'], - 'url': video_url, - } - - mf = rex.match(f.attrib['bitrate']) - if mf: - fdct['width'] = int(mf.group('width')) - fdct['height'] = int(mf.group('height')) - fdct['tbr'] = int_or_none(mf.group('bitrate')) - else: - mf = rex.search(f.text) - if mf: - fdct['width'] = int(mf.group('width')) - fdct['height'] = int(mf.group('height')) - fdct['tbr'] = int_or_none(mf.group('bitrate')) - else: - mi = re.match(r'ios_(audio|[0-9]+)$', f.attrib['bitrate']) - if mi: - if mi.group(1) == 'audio': - fdct['vcodec'] = 'none' - fdct['ext'] = 'm4a' - else: - fdct['tbr'] = int(mi.group(1)) - - formats.append(fdct) - - self._sort_formats(formats) - - thumbnails = [{ - 'height': int(t.attrib['height']), - 'width': int(t.attrib['width']), - 'url': t.text, - } for t in info.findall('images/image')] - - metas_el = info.find('metas') - upload_date = ( - metas_el.attrib.get('version') if metas_el is not None else None) - - duration_el = info.find('length') - duration = parse_duration(duration_el.text) - - return { - 'id': info.attrib['id'], - 'title': info.find('headline').text, - 'formats': formats, - 'thumbnails': thumbnails, - 'description': info.find('description').text, - 'duration': duration, - 'upload_date': upload_date, - } + sub_domain, path, page_title = re.match(self._VALID_URL, url).groups() + if sub_domain not in ('money', 'edition'): + sub_domain = 'edition' + config = self._CONFIG[sub_domain] + return self._extract_cvp_info( + config['data_src'] % path, page_title, { + 'default': { + 
'media_src': config['media_src'], + } + }) class CNNBlogsIE(InfoExtractor): @@ -132,6 +112,7 @@ class CNNBlogsIE(InfoExtractor): 'description': 'Glenn Greenwald responds to comments made this week on Capitol Hill that journalists could be criminal accessories.', 'upload_date': '20140209', }, + 'expected_warnings': ['Failed to download m3u8 information'], 'add_ie': ['CNN'], } @@ -146,7 +127,7 @@ class CNNBlogsIE(InfoExtractor): class CNNArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!video/)' + _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!videos?/)' _TEST = { 'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/', 'md5': '689034c2a3d9c6dc4aa72d65a81efd01', @@ -154,9 +135,10 @@ class CNNArticleIE(InfoExtractor): 'id': 'bestoftv/2014/12/21/ip-north-korea-obama.cnn', 'ext': 'mp4', 'title': 'Obama: Cyberattack not an act of war', - 'description': 'md5:51ce6750450603795cad0cdfbd7d05c5', + 'description': 'md5:0a802a40d2376f60e6b04c8d5bcebc4b', 'upload_date': '20141221', }, + 'expected_warnings': ['Failed to download m3u8 information'], 'add_ie': ['CNN'], } diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 2b6aaa3aa..88346dde7 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -1,17 +1,7 @@ from __future__ import unicode_literals -import re - from .mtv import MTVServicesInfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_urlencode, -) -from ..utils import ( - ExtractorError, - float_or_none, - unified_strdate, -) +from .common import InfoExtractor class ComedyCentralIE(MTVServicesInfoExtractor): @@ -26,8 +16,10 @@ class ComedyCentralIE(MTVServicesInfoExtractor): 'info_dict': { 'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354', 'ext': 'mp4', - 'title': 'CC:Stand-Up|Greg Fitzsimmons: Life on Stage|Uncensored - Too Good of a Mother', + 'title': 'CC:Stand-Up|August 18, 
2013|1|0101|Uncensored - Too Good of a Mother', 'description': 'After a certain point, breastfeeding becomes c**kblocking.', + 'timestamp': 1376798400, + 'upload_date': '20130818', }, }, { 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/interviews/6yx39d/exclusive-rand-paul-extended-interview', @@ -35,241 +27,92 @@ class ComedyCentralIE(MTVServicesInfoExtractor): }] -class ComedyCentralShowsIE(MTVServicesInfoExtractor): - IE_DESC = 'The Daily Show / The Colbert Report' - # urls can be abbreviations like :thedailyshow - # urls for episodes like: - # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day - # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news - # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524 - _VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow) - |https?://(:www\.)? - (?P<showname>thedailyshow|thecolbertreport|tosh)\.(?:cc\.)?com/ - ((?:full-)?episodes/(?:[0-9a-z]{6}/)?(?P<episode>.*)| - (?P<clip> - (?:(?:guests/[^/]+|videos|video-(?:clips|playlists)|special-editions|news-team/[^/]+)/[^/]+/(?P<videotitle>[^/?#]+)) - |(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?)) - |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)) - )| - (?P<interview> - extended-interviews/(?P<interID>[0-9a-z]+)/ - (?:playlist_tds_extended_)?(?P<interview_title>[^/?#]*?) 
- (?:/[^/?#]?|[?#]|$)))) - ''' +class ToshIE(MTVServicesInfoExtractor): + IE_DESC = 'Tosh.0' + _VALID_URL = r'^https?://tosh\.cc\.com/video-(?:clips|collections)/[^/]+/(?P<videotitle>[^/?#]+)' + _FEED_URL = 'http://tosh.cc.com/feeds/mrss' + _TESTS = [{ - 'url': 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart', - 'md5': '4e2f5cb088a83cd8cdb7756132f9739d', - 'info_dict': { - 'id': 'ab9ab3e7-5a98-4dbe-8b21-551dc0523d55', - 'ext': 'mp4', - 'upload_date': '20121213', - 'description': 'Kristen Stewart learns to let loose in "On the Road."', - 'uploader': 'thedailyshow', - 'title': 'thedailyshow kristen-stewart part 1', - } - }, { - 'url': 'http://thedailyshow.cc.com/extended-interviews/b6364d/sarah-chayes-extended-interview', - 'info_dict': { - 'id': 'sarah-chayes-extended-interview', - 'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."', - 'title': 'thedailyshow Sarah Chayes Extended Interview', - }, - 'playlist': [ - { - 'info_dict': { - 'id': '0baad492-cbec-4ec1-9e50-ad91c291127f', - 'ext': 'mp4', - 'upload_date': '20150129', - 'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."', - 'uploader': 'thedailyshow', - 'title': 'thedailyshow sarah-chayes-extended-interview part 1', - }, - }, - { - 'info_dict': { - 'id': '1e4fb91b-8ce7-4277-bd7c-98c9f1bbd283', - 'ext': 'mp4', - 'upload_date': '20150129', - 'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."', - 'uploader': 'thedailyshow', - 'title': 'thedailyshow sarah-chayes-extended-interview part 2', - }, - }, - ], - 'params': { - 'skip_download': 
True, - }, - }, { - 'url': 'http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview', - 'only_matching': True, - }, { - 'url': 'http://thecolbertreport.cc.com/videos/29w6fx/-realhumanpraise-for-fox-news', - 'only_matching': True, - }, { - 'url': 'http://thecolbertreport.cc.com/videos/gh6urb/neil-degrasse-tyson-pt--1?xrs=eml_col_031114', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/guests/michael-lewis/3efna8/exclusive---michael-lewis-extended-interview-pt--3', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/episodes/sy7yv0/april-8--2014---denis-leary', - 'only_matching': True, - }, { - 'url': 'http://thecolbertreport.cc.com/episodes/8ase07/april-8--2014---jane-goodall', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/video-playlists/npde3s/the-daily-show-19088-highlights', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/video-playlists/t6d9sg/the-daily-show-20038-highlights/be3cwo', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/special-editions/2l8fdb/special-edition---a-look-back-at-food', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/news-team/michael-che/7wnfel/we-need-to-talk-about-israel', - 'only_matching': True, - }, { 'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans', + 'info_dict': { + 'description': 'Tosh asked fans to share their summer plans.', + 'title': 'Twitter Users Share Summer Plans', + }, + 'playlist': [{ + 'md5': 'f269e88114c1805bb6d7653fecea9e06', + 'info_dict': { + 'id': '90498ec2-ed00-11e0-aca6-0026b9414f30', + 'ext': 'mp4', + 'title': 'Tosh.0|June 9, 2077|2|211|Twitter Users Share Summer Plans', + 'description': 'Tosh asked fans to share their summer plans.', + 'thumbnail': 're:^https?://.*\.jpg', + # It's really reported to be published on year 2077 + 'upload_date': '20770610', + 'timestamp': 3390510600, + 'subtitles': { + 'en': 'mincount:3', + 
}, + }, + }] + }, { + 'url': 'http://tosh.cc.com/video-collections/x2iz7k/just-plain-foul/m5q4fp', 'only_matching': True, }] - _available_formats = ['3500', '2200', '1700', '1200', '750', '400'] + @classmethod + def _transform_rtmp_url(cls, rtmp_video_url): + new_urls = super(ToshIE, cls)._transform_rtmp_url(rtmp_video_url) + new_urls['rtmp'] = rtmp_video_url.replace('viacomccstrm', 'viacommtvstrm') + return new_urls - _video_extensions = { - '3500': 'mp4', - '2200': 'mp4', - '1700': 'mp4', - '1200': 'mp4', - '750': 'mp4', - '400': 'mp4', - } - _video_dimensions = { - '3500': (1280, 720), - '2200': (960, 540), - '1700': (768, 432), - '1200': (640, 360), - '750': (512, 288), - '400': (384, 216), - } + +class ComedyCentralTVIE(MTVServicesInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/(?:staffeln|shows)/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.comedycentral.tv/staffeln/7436-the-mindy-project-staffel-4', + 'info_dict': { + 'id': 'local_playlist-f99b626bdfe13568579a', + 'ext': 'flv', + 'title': 'Episode_the-mindy-project_shows_season-4_episode-3_full-episode_part1', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + 'url': 'http://www.comedycentral.tv/shows/1074-workaholics', + 'only_matching': True, + }, { + 'url': 'http://www.comedycentral.tv/shows/1727-the-mindy-project/bonus', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + video_id = self._match_id(url) - if mobj.group('shortname'): - return self.url_result('http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes') + webpage = self._download_webpage(url, video_id) - if mobj.group('clip'): - if mobj.group('videotitle'): - epTitle = mobj.group('videotitle') - elif mobj.group('showname') == 'thedailyshow': - epTitle = mobj.group('tdstitle') - else: - epTitle = mobj.group('cntitle') - dlNewest = False - elif mobj.group('interview'): - epTitle = mobj.group('interview_title') - dlNewest = False 
- else: - dlNewest = not mobj.group('episode') - if dlNewest: - epTitle = mobj.group('showname') - else: - epTitle = mobj.group('episode') - show_name = mobj.group('showname') + mrss_url = self._search_regex( + r'data-mrss=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'mrss url', group='url') - webpage, htmlHandle = self._download_webpage_handle(url, epTitle) - if dlNewest: - url = htmlHandle.geturl() - mobj = re.match(self._VALID_URL, url, re.VERBOSE) - if mobj is None: - raise ExtractorError('Invalid redirected URL: ' + url) - if mobj.group('episode') == '': - raise ExtractorError('Redirected URL is still not specific: ' + url) - epTitle = (mobj.group('episode') or mobj.group('videotitle')).rpartition('/')[-1] + return self._get_videos_info_from_url(mrss_url, video_id) - mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage) - if len(mMovieParams) == 0: - # The Colbert Report embeds the information in a without - # a URL prefix; so extract the alternate reference - # and then add the URL prefix manually. 
- altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video|playlist).*?:.*?)"', webpage) - if len(altMovieParams) == 0: - raise ExtractorError('unable to find Flash URL in webpage ' + url) - else: - mMovieParams = [('http://media.mtvnservices.com/' + altMovieParams[0], altMovieParams[0])] +class ComedyCentralShortnameIE(InfoExtractor): + _VALID_URL = r'^:(?P<id>tds|thedailyshow)$' + _TESTS = [{ + 'url': ':tds', + 'only_matching': True, + }, { + 'url': ':thedailyshow', + 'only_matching': True, + }] - uri = mMovieParams[0][1] - # Correct cc.com in uri - uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.com', uri) - - index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse_urlencode({'uri': uri})) - idoc = self._download_xml( - index_url, epTitle, - 'Downloading show index', 'Unable to download episode index') - - title = idoc.find('./channel/title').text - description = idoc.find('./channel/description').text - - entries = [] - item_els = idoc.findall('.//item') - for part_num, itemEl in enumerate(item_els): - upload_date = unified_strdate(itemEl.findall('./pubDate')[0].text) - thumbnail = itemEl.find('.//{http://search.yahoo.com/mrss/}thumbnail').attrib.get('url') - - content = itemEl.find('.//{http://search.yahoo.com/mrss/}content') - duration = float_or_none(content.attrib.get('duration')) - mediagen_url = content.attrib['url'] - guid = itemEl.find('./guid').text.rpartition(':')[-1] - - cdoc = self._download_xml( - mediagen_url, epTitle, - 'Downloading configuration for segment %d / %d' % (part_num + 1, len(item_els))) - - turls = [] - for rendition in cdoc.findall('.//rendition'): - finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text) - turls.append(finfo) - - formats = [] - for format, rtmp_video_url in turls: - w, h = self._video_dimensions.get(format, (None, None)) - formats.append({ - 'format_id': 'vhttp-%s' % format, - 'url': self._transform_rtmp_url(rtmp_video_url), - 'ext': 
self._video_extensions.get(format, 'mp4'), - 'height': h, - 'width': w, - }) - formats.append({ - 'format_id': 'rtmp-%s' % format, - 'url': rtmp_video_url.replace('viacomccstrm', 'viacommtvstrm'), - 'ext': self._video_extensions.get(format, 'mp4'), - 'height': h, - 'width': w, - }) - self._sort_formats(formats) - - subtitles = self._extract_subtitles(cdoc, guid) - - virtual_id = show_name + ' ' + epTitle + ' part ' + compat_str(part_num + 1) - entries.append({ - 'id': guid, - 'title': virtual_id, - 'formats': formats, - 'uploader': show_name, - 'upload_date': upload_date, - 'duration': duration, - 'thumbnail': thumbnail, - 'description': description, - 'subtitles': subtitles, - }) - - return { - '_type': 'playlist', - 'id': epTitle, - 'entries': entries, - 'title': show_name + ' ' + title, - 'description': description, + def _real_extract(self, url): + video_id = self._match_id(url) + shortcut_map = { + 'tds': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', + 'thedailyshow': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', } + return self.url_result(shortcut_map[video_id]) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 29544c1a8..a82968162 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -662,6 +662,24 @@ class InfoExtractor(object): else: return res + def _get_netrc_login_info(self, netrc_machine=None): + username = None + password = None + netrc_machine = netrc_machine or self._NETRC_MACHINE + + if self._downloader.params.get('usenetrc', False): + try: + info = netrc.netrc().authenticators(netrc_machine) + if info is not None: + username = info[0] + password = info[2] + else: + raise netrc.NetrcParseError('No authenticators for %s' % netrc_machine) + except (IOError, netrc.NetrcParseError) as err: + self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err)) + + return (username, password) + def _get_login_info(self): """ 
Get the login info as (username, password) @@ -679,16 +697,8 @@ class InfoExtractor(object): if downloader_params.get('username') is not None: username = downloader_params['username'] password = downloader_params['password'] - elif downloader_params.get('usenetrc', False): - try: - info = netrc.netrc().authenticators(self._NETRC_MACHINE) - if info is not None: - username = info[0] - password = info[2] - else: - raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) - except (IOError, netrc.NetrcParseError) as err: - self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err)) + else: + username, password = self._get_netrc_login_info() return (username, password) @@ -727,9 +737,14 @@ class InfoExtractor(object): [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop) def _og_search_property(self, prop, html, name=None, **kargs): + if not isinstance(prop, (list, tuple)): + prop = [prop] if name is None: - name = 'OpenGraph %s' % prop - escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs) + name = 'OpenGraph %s' % prop[0] + og_regexes = [] + for p in prop: + og_regexes.extend(self._og_regexes(p)) + escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs) if escaped is None: return None return unescapeHTML(escaped) @@ -811,11 +826,14 @@ class InfoExtractor(object): json_ld = self._search_regex( r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>', html, 'JSON-LD', group='json_ld', **kwargs) + default = kwargs.get('default', NO_DEFAULT) if not json_ld: - return {} - return self._json_ld( - json_ld, video_id, fatal=kwargs.get('fatal', True), - expected_type=expected_type) + return default if default is not NO_DEFAULT else {} + # JSON-LD may be malformed and thus `fatal` should be respected. + # At the same time `default` may be passed that assumes `fatal=False` + # for _search_regex. Let's simulate the same behavior here as well. 
+ fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False + return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): if isinstance(json_ld, compat_str): @@ -823,41 +841,47 @@ class InfoExtractor(object): if not json_ld: return {} info = {} - if json_ld.get('@context') == 'http://schema.org': - item_type = json_ld.get('@type') - if expected_type is not None and expected_type != item_type: - return info - if item_type == 'TVEpisode': - info.update({ - 'episode': unescapeHTML(json_ld.get('name')), - 'episode_number': int_or_none(json_ld.get('episodeNumber')), - 'description': unescapeHTML(json_ld.get('description')), - }) - part_of_season = json_ld.get('partOfSeason') - if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': - info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) - part_of_series = json_ld.get('partOfSeries') - if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': - info['series'] = unescapeHTML(part_of_series.get('name')) - elif item_type == 'Article': - info.update({ - 'timestamp': parse_iso8601(json_ld.get('datePublished')), - 'title': unescapeHTML(json_ld.get('headline')), - 'description': unescapeHTML(json_ld.get('articleBody')), - }) - elif item_type == 'VideoObject': - info.update({ - 'url': json_ld.get('contentUrl'), - 'title': unescapeHTML(json_ld.get('name')), - 'description': unescapeHTML(json_ld.get('description')), - 'thumbnail': json_ld.get('thumbnailUrl'), - 'duration': parse_duration(json_ld.get('duration')), - 'timestamp': unified_timestamp(json_ld.get('uploadDate')), - 'filesize': float_or_none(json_ld.get('contentSize')), - 'tbr': int_or_none(json_ld.get('bitrate')), - 'width': int_or_none(json_ld.get('width')), - 'height': int_or_none(json_ld.get('height')), - }) + if not isinstance(json_ld, (list, tuple, dict)): + return info + if isinstance(json_ld, 
dict): + json_ld = [json_ld] + for e in json_ld: + if e.get('@context') == 'http://schema.org': + item_type = e.get('@type') + if expected_type is not None and expected_type != item_type: + return info + if item_type == 'TVEpisode': + info.update({ + 'episode': unescapeHTML(e.get('name')), + 'episode_number': int_or_none(e.get('episodeNumber')), + 'description': unescapeHTML(e.get('description')), + }) + part_of_season = e.get('partOfSeason') + if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': + info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) + part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') + if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': + info['series'] = unescapeHTML(part_of_series.get('name')) + elif item_type == 'Article': + info.update({ + 'timestamp': parse_iso8601(e.get('datePublished')), + 'title': unescapeHTML(e.get('headline')), + 'description': unescapeHTML(e.get('articleBody')), + }) + elif item_type == 'VideoObject': + info.update({ + 'url': e.get('contentUrl'), + 'title': unescapeHTML(e.get('name')), + 'description': unescapeHTML(e.get('description')), + 'thumbnail': e.get('thumbnailUrl'), + 'duration': parse_duration(e.get('duration')), + 'timestamp': unified_timestamp(e.get('uploadDate')), + 'filesize': float_or_none(e.get('contentSize')), + 'tbr': int_or_none(e.get('bitrate')), + 'width': int_or_none(e.get('width')), + 'height': int_or_none(e.get('height')), + }) + break return dict((k, v) for k, v in info.items() if v is not None) @staticmethod @@ -911,7 +935,8 @@ class InfoExtractor(object): if f.get('ext') in ['f4f', 'f4m']: # Not yet supported preference -= 0.5 - proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1 + protocol = f.get('protocol') or determine_protocol(f) + proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1) if f.get('vcodec') == 'none': # audio only 
preference -= 50 @@ -1128,7 +1153,7 @@ class InfoExtractor(object): 'url': m3u8_url, 'ext': ext, 'protocol': 'm3u8', - 'preference': preference - 1 if preference else -1, + 'preference': preference - 100 if preference else -100, 'resolution': 'multiple', 'format_note': 'Quality selection URL', } @@ -1176,27 +1201,44 @@ class InfoExtractor(object): 'protocol': entry_protocol, 'preference': preference, }] - last_info = None - last_media = None + last_info = {} + last_media = {} for line in m3u8_doc.splitlines(): if line.startswith('#EXT-X-STREAM-INF:'): last_info = parse_m3u8_attributes(line) elif line.startswith('#EXT-X-MEDIA:'): - last_media = parse_m3u8_attributes(line) + media = parse_m3u8_attributes(line) + media_type = media.get('TYPE') + if media_type in ('VIDEO', 'AUDIO'): + media_url = media.get('URI') + if media_url: + format_id = [] + for v in (media.get('GROUP-ID'), media.get('NAME')): + if v: + format_id.append(v) + formats.append({ + 'format_id': '-'.join(format_id), + 'url': format_url(media_url), + 'language': media.get('LANGUAGE'), + 'vcodec': 'none' if media_type == 'AUDIO' else None, + 'ext': ext, + 'protocol': entry_protocol, + 'preference': preference, + }) + else: + # When there is no URI in EXT-X-MEDIA let this tag's + # data be used by regular URI lines below + last_media = media elif line.startswith('#') or not line.strip(): continue else: - if last_info is None: - formats.append({'url': format_url(line)}) - continue - tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000) + tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000) format_id = [] if m3u8_id: format_id.append(m3u8_id) - last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') not in ('SUBTITLES', 'CLOSED-CAPTIONS') else None # Despite specification does not mention NAME attribute for # EXT-X-STREAM-INF it still sometimes may be present - stream_name = last_info.get('NAME') or last_media_name + stream_name = 
last_info.get('NAME') or last_media.get('NAME') # Bandwidth of live streams may differ over time thus making # format_id unpredictable. So it's better to keep provided # format_id intact. @@ -1227,11 +1269,9 @@ class InfoExtractor(object): 'abr': abr, }) f.update(parse_codecs(last_info.get('CODECS'))) - if last_media is not None: - f['m3u8_media'] = last_media - last_media = None formats.append(f) last_info = {} + last_media = {} return formats @staticmethod @@ -1481,6 +1521,13 @@ class InfoExtractor(object): compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict) def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}): + """ + Parse formats from MPD manifest. + References: + 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E), + http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip + 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP + """ if mpd_doc.get('type') == 'dynamic': return [] @@ -1513,8 +1560,16 @@ class InfoExtractor(object): s_e = segment_timeline.findall(_add_ns('S')) if s_e: ms_info['total_number'] = 0 + ms_info['s'] = [] for s in s_e: - ms_info['total_number'] += 1 + int(s.get('r', '0')) + r = int(s.get('r', 0)) + ms_info['total_number'] += 1 + r + ms_info['s'].append({ + 't': int(s.get('t', 0)), + # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60]) + 'd': int(s.attrib['d']), + 'r': r, + }) else: timescale = segment_template.get('timescale') if timescale: @@ -1551,7 +1606,7 @@ class InfoExtractor(object): continue representation_attrib = adaptation_set.attrib.copy() representation_attrib.update(representation.attrib) - # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory + # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory mime_type = representation_attrib['mimeType'] content_type = mime_type.split('/')[0] if content_type == 'text': @@ -1595,16 +1650,40 @@ class InfoExtractor(object): 
representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) media_template = representation_ms_info['media_template'] media_template = media_template.replace('$RepresentationID$', representation_id) - media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template) - media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template) + media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template) + media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template) media_template.replace('$$', '$') - representation_ms_info['segment_urls'] = [ - media_template % { - 'Number': segment_number, - 'Bandwidth': representation_attrib.get('bandwidth')} - for segment_number in range( - representation_ms_info['start_number'], - representation_ms_info['total_number'] + representation_ms_info['start_number'])] + + # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ + # can't be used at the same time + if '%(Number' in media_template: + representation_ms_info['segment_urls'] = [ + media_template % { + 'Number': segment_number, + 'Bandwidth': representation_attrib.get('bandwidth'), + } + for segment_number in range( + representation_ms_info['start_number'], + representation_ms_info['total_number'] + representation_ms_info['start_number'])] + else: + representation_ms_info['segment_urls'] = [] + segment_time = 0 + + def add_segment_url(): + representation_ms_info['segment_urls'].append( + media_template % { + 'Time': segment_time, + 'Bandwidth': representation_attrib.get('bandwidth'), + } + ) + + for num, s in enumerate(representation_ms_info['s']): + segment_time = s.get('t') or segment_time + add_segment_url() + for r in range(s.get('r', 0)): + segment_time += s['d'] + add_segment_url() + segment_time += s['d'] if 'segment_urls' in representation_ms_info: f.update({ 'segment_urls': representation_ms_info['segment_urls'], @@ -1631,7 +1710,7 @@ class 
InfoExtractor(object): self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats - def _parse_html5_media_entries(self, base_url, webpage): + def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8'): def absolute_url(video_url): return compat_urlparse.urljoin(base_url, video_url) @@ -1646,6 +1725,21 @@ class InfoExtractor(object): return f return {} + def _media_formats(src, cur_media_type): + full_url = absolute_url(src) + if determine_ext(full_url) == 'm3u8': + is_plain_url = False + formats = self._extract_m3u8_formats( + full_url, video_id, ext='mp4', + entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id) + else: + is_plain_url = True + formats = [{ + 'url': full_url, + 'vcodec': 'none' if cur_media_type == 'audio' else None, + }] + return is_plain_url, formats + entries = [] for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage): media_info = { @@ -1655,10 +1749,8 @@ class InfoExtractor(object): media_attributes = extract_attributes(media_tag) src = media_attributes.get('src') if src: - media_info['formats'].append({ - 'url': absolute_url(src), - 'vcodec': 'none' if media_type == 'audio' else None, - }) + _, formats = _media_formats(src, media_type) + media_info['formats'].extend(formats) media_info['thumbnail'] = media_attributes.get('poster') if media_content: for source_tag in re.findall(r'<source[^>]+>', media_content): @@ -1666,12 +1758,13 @@ class InfoExtractor(object): src = source_attributes.get('src') if not src: continue - f = parse_content_type(source_attributes.get('type')) - f.update({ - 'url': absolute_url(src), - 'vcodec': 'none' if media_type == 'audio' else None, - }) - media_info['formats'].append(f) + is_plain_url, formats = _media_formats(src, media_type) + if is_plain_url: + f = parse_content_type(source_attributes.get('type')) + f.update(formats[0]) + media_info['formats'].append(f) + else: + 
media_info['formats'].extend(formats) for track_tag in re.findall(r'<track[^>]+>', media_content): track_attributes = extract_attributes(track_tag) kind = track_attributes.get('kind') @@ -1687,6 +1780,18 @@ class InfoExtractor(object): entries.append(media_info) return entries + def _extract_akamai_formats(self, manifest_url, video_id): + formats = [] + f4m_url = re.sub(r'(https?://.+?)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') + formats.extend(self._extract_f4m_formats( + update_url_query(f4m_url, {'hdcore': '3.7.0'}), + video_id, f4m_id='hds', fatal=False)) + m3u8_url = re.sub(r'(https?://.+?)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + return formats + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() @@ -1747,7 +1852,7 @@ class InfoExtractor(object): any_restricted = False for tc in self.get_testcases(include_onlymatching=False): - if 'playlist' in tc: + if tc.get('playlist', []): tc = tc['playlist'][0] is_restricted = age_restricted( tc.get('info_dict', {}).get('age_limit'), age_limit) diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index e8f2b5a07..8d8f60598 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -5,13 +5,17 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urlparse, ) from ..utils import ( orderedSet, remove_end, + extract_attributes, + mimetype2ext, + determine_ext, + int_or_none, + parse_iso8601, ) @@ -58,6 +62,9 @@ class CondeNastIE(InfoExtractor): 'ext': 'mp4', 'title': '3D Printed Speakers Lit With LED', 'description': 'Check out these beautiful 3D printed LED speakers. 
You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.', + 'uploader': 'wired', + 'upload_date': '20130314', + 'timestamp': 1363219200, } }, { # JS embed @@ -67,70 +74,93 @@ class CondeNastIE(InfoExtractor): 'id': '55f9cf8b61646d1acf00000c', 'ext': 'mp4', 'title': '3D printed TSA Travel Sentry keys really do open TSA locks', + 'uploader': 'arstechnica', + 'upload_date': '20150916', + 'timestamp': 1442434955, } }] def _extract_series(self, url, webpage): - title = self._html_search_regex(r'<div class="cne-series-info">.*?<h1>(.+?)</h1>', - webpage, 'series title', flags=re.DOTALL) + title = self._html_search_regex( + r'(?s)<div class="cne-series-info">.*?<h1>(.+?)</h1>', + webpage, 'series title') url_object = compat_urllib_parse_urlparse(url) base_url = '%s://%s' % (url_object.scheme, url_object.netloc) - m_paths = re.finditer(r'<p class="cne-thumb-title">.*?<a href="(/watch/.+?)["\?]', - webpage, flags=re.DOTALL) + m_paths = re.finditer( + r'(?s)<p class="cne-thumb-title">.*?<a href="(/watch/.+?)["\?]', webpage) paths = orderedSet(m.group(1) for m in m_paths) build_url = lambda path: compat_urlparse.urljoin(base_url, path) entries = [self.url_result(build_url(path), 'CondeNast') for path in paths] return self.playlist_result(entries, playlist_title=title) def _extract_video(self, webpage, url_type): - if url_type != 'embed': - description = self._html_search_regex( - [ - r'<div class="cne-video-description">(.+?)</div>', - r'<div class="video-post-content">(.+?)</div>', - ], - webpage, 'description', fatal=False, flags=re.DOTALL) + query = {} + params = self._search_regex( + r'(?s)var params = {(.+?)}[;,]', webpage, 'player params', default=None) + if params: + query.update({ + 'videoId': self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id'), + 'playerId': self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, 'player id'), + 'target': self._search_regex(r'target: [\'"](.+?)[\'"]', params, 'target'), + 
}) else: - description = None - params = self._search_regex(r'var params = {(.+?)}[;,]', webpage, - 'player params', flags=re.DOTALL) - video_id = self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id') - player_id = self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, 'player id') - target = self._search_regex(r'target: [\'"](.+?)[\'"]', params, 'target') - data = compat_urllib_parse_urlencode({'videoId': video_id, - 'playerId': player_id, - 'target': target, - }) - base_info_url = self._search_regex(r'url = [\'"](.+?)[\'"][,;]', - webpage, 'base info url', - default='http://player.cnevids.com/player/loader.js?') - info_url = base_info_url + data - info_page = self._download_webpage(info_url, video_id, - 'Downloading video info') - video_info = self._search_regex(r'var\s+video\s*=\s*({.+?});', info_page, 'video info') - video_info = self._parse_json(video_info, video_id) + params = extract_attributes(self._search_regex( + r'(<[^>]+data-js="video-player"[^>]+>)', + webpage, 'player params element')) + query.update({ + 'videoId': params['data-video'], + 'playerId': params['data-player'], + 'target': params['id'], + }) + video_id = query['videoId'] + video_info = None + info_page = self._download_webpage( + 'http://player.cnevids.com/player/video.js', + video_id, 'Downloading video info', query=query, fatal=False) + if info_page: + video_info = self._parse_json(self._search_regex( + r'loadCallback\(({.+})\)', info_page, 'video info'), video_id)['video'] + else: + info_page = self._download_webpage( + 'http://player.cnevids.com/player/loader.js', + video_id, 'Downloading loader info', query=query) + video_info = self._parse_json(self._search_regex( + r'var\s+video\s*=\s*({.+?});', info_page, 'video info'), video_id) + title = video_info['title'] - formats = [{ - 'format_id': '%s-%s' % (fdata['type'].split('/')[-1], fdata['quality']), - 'url': fdata['src'], - 'ext': fdata['type'].split('/')[-1], - 'quality': 1 if fdata['quality'] == 'high' else 0, - } 
for fdata in video_info['sources'][0]] + formats = [] + for fdata in video_info.get('sources', [{}])[0]: + src = fdata.get('src') + if not src: + continue + ext = mimetype2ext(fdata.get('type')) or determine_ext(src) + quality = fdata.get('quality') + formats.append({ + 'format_id': ext + ('-%s' % quality if quality else ''), + 'url': src, + 'ext': ext, + 'quality': 1 if quality == 'high' else 0, + }) self._sort_formats(formats) - return { + info = self._search_json_ld( + webpage, video_id, fatal=False) if url_type != 'embed' else {} + info.update({ 'id': video_id, 'formats': formats, - 'title': video_info['title'], - 'thumbnail': video_info['poster_frame'], - 'description': description, - } + 'title': title, + 'thumbnail': video_info.get('poster_frame'), + 'uploader': video_info.get('brand'), + 'duration': int_or_none(video_info.get('duration')), + 'tags': video_info.get('tags'), + 'series': video_info.get('series_title'), + 'season': video_info.get('season_title'), + 'timestamp': parse_iso8601(video_info.get('premiere_date')), + }) + return info def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - site = mobj.group('site') - url_type = mobj.group('type') - item_id = mobj.group('id') + site, url_type, item_id = re.match(self._VALID_URL, url).groups() # Convert JS embed to regular embed if url_type == 'embedjs': diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py index 79238cce7..cc68f1c00 100644 --- a/youtube_dl/extractor/crackle.py +++ b/youtube_dl/extractor/crackle.py @@ -1,5 +1,5 @@ # coding: utf-8 -from __future__ import unicode_literals +from __future__ import unicode_literals, division from .common import InfoExtractor from ..utils import int_or_none @@ -8,12 +8,22 @@ from ..utils import int_or_none class CrackleIE(InfoExtractor): _VALID_URL = r'(?:crackle:|https?://(?:www\.)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P<id>\d+)' _TEST = { - 'url': 'http://www.crackle.com/the-art-of-more/2496419', + 'url': 
'http://www.crackle.com/comedians-in-cars-getting-coffee/2498934', 'info_dict': { - 'id': '2496419', + 'id': '2498934', 'ext': 'mp4', - 'title': 'Heavy Lies the Head', - 'description': 'md5:bb56aa0708fe7b9a4861535f15c3abca', + 'title': 'Everybody Respects A Bloody Nose', + 'description': 'Jerry is kaffeeklatsching in L.A. with funnyman J.B. Smoove (Saturday Night Live, Real Husbands of Hollywood). They’re headed for brew at 10 Speed Coffee in a 1964 Studebaker Avanti.', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 906, + 'series': 'Comedians In Cars Getting Coffee', + 'season_number': 8, + 'episode_number': 4, + 'subtitles': { + 'en-US': [{ + 'ext': 'ttml', + }] + }, }, 'params': { # m3u8 download @@ -21,12 +31,8 @@ class CrackleIE(InfoExtractor): } } - # extracted from http://legacyweb-us.crackle.com/flash/QueryReferrer.ashx - _SUBTITLE_SERVER = 'http://web-us-az.crackle.com' - _UPLYNK_OWNER_ID = 'e8773f7770a44dbd886eee4fca16a66b' - _THUMBNAIL_TEMPLATE = 'http://images-us-am.crackle.com/%stnl_1920x1080.jpg?ts=20140107233116?c=635333335057637614' - # extracted from http://legacyweb-us.crackle.com/flash/ReferrerRedirect.ashx + _THUMBNAIL_TEMPLATE = 'http://images-us-am.crackle.com/%stnl_1920x1080.jpg?ts=20140107233116?c=635333335057637614' _MEDIA_FILE_SLOTS = { 'c544.flv': { 'width': 544, @@ -48,16 +54,21 @@ class CrackleIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + + config_doc = self._download_xml( + 'http://legacyweb-us.crackle.com/flash/QueryReferrer.ashx?site=16', + video_id, 'Downloading config') + item = self._download_xml( 'http://legacyweb-us.crackle.com/app/revamp/vidwallcache.aspx?flags=-1&fm=%s' % video_id, video_id).find('i') title = item.attrib['t'] - thumbnail = None subtitles = {} formats = self._extract_m3u8_formats( - 'http://content.uplynk.com/ext/%s/%s.m3u8' % (self._UPLYNK_OWNER_ID, video_id), + 'http://content.uplynk.com/ext/%s/%s.m3u8' % (config_doc.attrib['strUplynkOwnerId'], video_id), video_id, 
'mp4', m3u8_id='hls', fatal=None) + thumbnail = None path = item.attrib.get('p') if path: thumbnail = self._THUMBNAIL_TEMPLATE % path @@ -76,7 +87,7 @@ class CrackleIE(InfoExtractor): if locale not in subtitles: subtitles[locale] = [] subtitles[locale] = [{ - 'url': '%s/%s%s_%s.xml' % (self._SUBTITLE_SERVER, path, locale, v), + 'url': '%s/%s%s_%s.xml' % (config_doc.attrib['strSubtitleServer'], path, locale, v), 'ext': 'ttml', }] self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id')) @@ -85,7 +96,7 @@ class CrackleIE(InfoExtractor): 'id': video_id, 'title': title, 'description': item.attrib.get('d'), - 'duration': int(item.attrib.get('r'), 16) if item.attrib.get('r') else None, + 'duration': int(item.attrib.get('r'), 16) / 1000 if item.attrib.get('r') else None, 'series': item.attrib.get('sn'), 'season_number': int_or_none(item.attrib.get('se')), 'episode_number': int_or_none(item.attrib.get('ep')), diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 90a64303d..6d3abb52f 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -114,6 +114,21 @@ class CrunchyrollIE(CrunchyrollBaseIE): # rtmp 'skip_download': True, }, + }, { + 'url': 'http://www.crunchyroll.com/rezero-starting-life-in-another-world-/episode-5-the-morning-of-our-promise-is-still-distant-702409', + 'info_dict': { + 'id': '702409', + 'ext': 'mp4', + 'title': 'Re:ZERO -Starting Life in Another World- Episode 5 – The Morning of Our Promise Is Still Distant', + 'description': 'md5:97664de1ab24bbf77a9c01918cb7dca9', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'TV TOKYO', + 'upload_date': '20160508', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697', 'only_matching': True, @@ -336,9 +351,18 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text if video_encode_id in 
video_encode_ids: continue video_encode_ids.append(video_encode_id) + + video_file = xpath_text(stream_info, './file') + if not video_file: + continue + if video_file.startswith('http'): + formats.extend(self._extract_m3u8_formats( + video_file, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + video_url = xpath_text(stream_info, './host') - video_play_path = xpath_text(stream_info, './file') - if not video_url or not video_play_path: + if not video_url: continue metadata = stream_info.find('./metadata') format_info = { @@ -353,7 +377,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text parsed_video_url = compat_urlparse.urlparse(video_url) direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace( netloc='v.lvlt.crcdn.net', - path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_play_path.split(':')[-1]))) + path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1]))) if self._is_valid_url(direct_video_url, video_id, video_format): format_info.update({ 'url': direct_video_url, @@ -363,7 +387,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text format_info.update({ 'url': video_url, - 'play_path': video_play_path, + 'play_path': video_file, 'ext': 'flv', }) formats.append(format_info) diff --git a/youtube_dl/extractor/ctsnews.py b/youtube_dl/extractor/ctsnews.py index 1622fc844..83ca90c3b 100644 --- a/youtube_dl/extractor/ctsnews.py +++ b/youtube_dl/extractor/ctsnews.py @@ -1,13 +1,12 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import parse_iso8601, ExtractorError +from ..utils import unified_timestamp class CtsNewsIE(InfoExtractor): IE_DESC = '華視新聞' - # https connection failed (Connection reset) _VALID_URL = r'https?://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P<id>\d+)\.html' _TESTS = [{ 'url': 
'http://news.cts.com.tw/cts/international/201501/201501291578109.html', @@ -16,7 +15,7 @@ class CtsNewsIE(InfoExtractor): 'id': '201501291578109', 'ext': 'mp4', 'title': '以色列.真主黨交火 3人死亡', - 'description': 'md5:95e9b295c898b7ff294f09d450178d7d', + 'description': '以色列和黎巴嫩真主黨,爆發五年最嚴重衝突,雙方砲轟交火,兩名以軍死亡,還有一名西班牙籍的聯合國維和人...', 'timestamp': 1422528540, 'upload_date': '20150129', } @@ -28,7 +27,7 @@ class CtsNewsIE(InfoExtractor): 'id': '201309031304098', 'ext': 'mp4', 'title': '韓國31歲童顏男 貌如十多歲小孩', - 'description': 'md5:f183feeba3752b683827aab71adad584', + 'description': '越有年紀的人,越希望看起來年輕一點,而南韓卻有一位31歲的男子,看起來像是11、12歲的小孩,身...', 'thumbnail': 're:^https?://.*\.jpg$', 'timestamp': 1378205880, 'upload_date': '20130903', @@ -36,8 +35,7 @@ class CtsNewsIE(InfoExtractor): }, { # With Youtube embedded video 'url': 'http://news.cts.com.tw/cts/money/201501/201501291578003.html', - 'md5': '1d842c771dc94c8c3bca5af2cc1db9c5', - 'add_ie': ['Youtube'], + 'md5': 'e4726b2ccd70ba2c319865e28f0a91d1', 'info_dict': { 'id': 'OVbfO7d0_hQ', 'ext': 'mp4', @@ -47,42 +45,37 @@ class CtsNewsIE(InfoExtractor): 'upload_date': '20150128', 'uploader_id': 'TBSCTS', 'uploader': '中華電視公司', - } + }, + 'add_ie': ['Youtube'], }] def _real_extract(self, url): news_id = self._match_id(url) page = self._download_webpage(url, news_id) - if self._search_regex(r'(CTSPlayer2)', page, 'CTSPlayer2 identifier', default=None): - feed_url = self._html_search_regex( - r'(http://news\.cts\.com\.tw/action/mp4feed\.php\?news_id=\d+)', - page, 'feed url') - video_url = self._download_webpage( - feed_url, news_id, note='Fetching feed') + news_id = self._hidden_inputs(page).get('get_id') + + if news_id: + mp4_feed = self._download_json( + 'http://news.cts.com.tw/action/test_mp4feed.php', + news_id, note='Fetching feed', query={'news_id': news_id}) + video_url = mp4_feed['source_url'] else: self.to_screen('Not CTSPlayer video, trying Youtube...') youtube_url = self._search_regex( - r'src="(//www\.youtube\.com/embed/[^"]+)"', page, 
'youtube url', - default=None) - if not youtube_url: - raise ExtractorError('The news includes no videos!', expected=True) + r'src="(//www\.youtube\.com/embed/[^"]+)"', page, 'youtube url') - return { - '_type': 'url', - 'url': youtube_url, - 'ie_key': 'Youtube', - } + return self.url_result(youtube_url, ie='Youtube') description = self._html_search_meta('description', page) - title = self._html_search_meta('title', page) + title = self._html_search_meta('title', page, fatal=True) thumbnail = self._html_search_meta('image', page) datetime_str = self._html_search_regex( - r'(\d{4}/\d{2}/\d{2} \d{2}:\d{2})', page, 'date and time') - # Transform into ISO 8601 format with timezone info - datetime_str = datetime_str.replace('/', '-') + ':00+0800' - timestamp = parse_iso8601(datetime_str, delimiter=' ') + r'(\d{4}/\d{2}/\d{2} \d{2}:\d{2})', page, 'date and time', fatal=False) + timestamp = None + if datetime_str: + timestamp = unified_timestamp(datetime_str) - 8 * 3600 return { 'id': news_id, diff --git a/youtube_dl/extractor/ctv.py b/youtube_dl/extractor/ctv.py index 5807fbac9..a1fe86316 100644 --- a/youtube_dl/extractor/ctv.py +++ b/youtube_dl/extractor/ctv.py @@ -1,11 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor class CTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ctv\.ca/video/player\?vid=(?P<id>[0-9.]+)' + _VALID_URL = r'https?://(?:www\.)?(?P<domain>ctv|tsn|bnn|thecomedynetwork)\.ca/.*?(?:\bvid=|-vid|~|%7E)(?P<id>[0-9.]+)' _TESTS = [{ 'url': 'http://www.ctv.ca/video/player?vid=706966', 'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0', @@ -18,13 +20,27 @@ class CTVIE(InfoExtractor): 'timestamp': 1442624700, }, 'expected_warnings': ['HTTP Error 404'], + }, { + 'url': 'http://www.thecomedynetwork.ca/video/player?vid=923582', + 'only_matching': True, + }, { + 'url': 'http://www.tsn.ca/video/expectations-high-for-milos-raonic-at-us-open~939549', + 'only_matching': True, + }, { + 'url': 
'http://www.bnn.ca/video/berman-s-call-part-two-viewer-questions~939654', + 'only_matching': True, + }, { + 'url': 'http://www.ctv.ca/YourMorning/Video/S1E6-Monday-August-29-2016-vid938009', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + domain, video_id = re.match(self._VALID_URL, url).groups() + if domain == 'thecomedynetwork': + domain = 'comedy' return { '_type': 'url_transparent', 'id': video_id, - 'url': '9c9media:ctv_web:%s' % video_id, + 'url': '9c9media:%s_web:%s' % (domain, video_id), 'ie_key': 'NineCNineMedia', } diff --git a/youtube_dl/extractor/cultureunplugged.py b/youtube_dl/extractor/cultureunplugged.py index 9c764fe68..9f26fa587 100644 --- a/youtube_dl/extractor/cultureunplugged.py +++ b/youtube_dl/extractor/cultureunplugged.py @@ -1,9 +1,13 @@ from __future__ import unicode_literals import re +import time from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + HEADRequest, +) class CultureUnpluggedIE(InfoExtractor): @@ -32,6 +36,9 @@ class CultureUnpluggedIE(InfoExtractor): video_id = mobj.group('id') display_id = mobj.group('display_id') or video_id + # request setClientTimezone.php to get PHPSESSID cookie which is need to get valid json data in the next request + self._request_webpage(HEADRequest( + 'http://www.cultureunplugged.com/setClientTimezone.php?timeOffset=%d' % -(time.timezone / 3600)), display_id) movie_data = self._download_json( 'http://www.cultureunplugged.com/movie-data/cu-%s.json' % video_id, display_id) diff --git a/youtube_dl/extractor/curiositystream.py b/youtube_dl/extractor/curiositystream.py new file mode 100644 index 000000000..e3c99468c --- /dev/null +++ b/youtube_dl/extractor/curiositystream.py @@ -0,0 +1,120 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + urlencode_postdata, + compat_str, + ExtractorError, +) + + +class 
CuriosityStreamBaseIE(InfoExtractor): + _NETRC_MACHINE = 'curiositystream' + _auth_token = None + _API_BASE_URL = 'https://api.curiositystream.com/v1/' + + def _handle_errors(self, result): + error = result.get('error', {}).get('message') + if error: + if isinstance(error, dict): + error = ', '.join(error.values()) + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error), expected=True) + + def _call_api(self, path, video_id): + headers = {} + if self._auth_token: + headers['X-Auth-Token'] = self._auth_token + result = self._download_json( + self._API_BASE_URL + path, video_id, headers=headers) + self._handle_errors(result) + return result['data'] + + def _real_initialize(self): + (email, password) = self._get_login_info() + if email is None: + return + result = self._download_json( + self._API_BASE_URL + 'login', None, data=urlencode_postdata({ + 'email': email, + 'password': password, + })) + self._handle_errors(result) + self._auth_token = result['message']['auth_token'] + + def _extract_media_info(self, media): + video_id = compat_str(media['id']) + limelight_media_id = media['limelight_media_id'] + title = media['title'] + + subtitles = {} + for closed_caption in media.get('closed_captions', []): + sub_url = closed_caption.get('file') + if not sub_url: + continue + lang = closed_caption.get('code') or closed_caption.get('language') or 'en' + subtitles.setdefault(lang, []).append({ + 'url': sub_url, + }) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': 'limelight:media:' + limelight_media_id, + 'title': title, + 'description': media.get('description'), + 'thumbnail': media.get('image_large') or media.get('image_medium') or media.get('image_small'), + 'duration': int_or_none(media.get('duration')), + 'tags': media.get('tags'), + 'subtitles': subtitles, + 'ie_key': 'LimelightMedia', + } + + +class CuriosityStreamIE(CuriosityStreamBaseIE): + IE_NAME = 'curiositystream' + _VALID_URL = 
r'https?://app\.curiositystream\.com/video/(?P<id>\d+)' + _TEST = { + 'url': 'https://app.curiositystream.com/video/2', + 'md5': 'a0074c190e6cddaf86900b28d3e9ee7a', + 'info_dict': { + 'id': '2', + 'ext': 'mp4', + 'title': 'How Did You Develop The Internet?', + 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.', + 'timestamp': 1448388615, + 'upload_date': '20151124', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + media = self._call_api('media/' + video_id, video_id) + return self._extract_media_info(media) + + +class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): + IE_NAME = 'curiositystream:collection' + _VALID_URL = r'https?://app\.curiositystream\.com/collection/(?P<id>\d+)' + _TEST = { + 'url': 'https://app.curiositystream.com/collection/2', + 'info_dict': { + 'id': '2', + 'title': 'Curious Minds: The Internet', + 'description': 'How is the internet shaping our lives in the 21st Century?', + }, + 'playlist_mincount': 17, + } + + def _real_extract(self, url): + collection_id = self._match_id(url) + collection = self._call_api( + 'collections/' + collection_id, collection_id) + entries = [] + for media in collection.get('media', []): + entries.append(self._extract_media_info(media)) + return self.playlist_result( + entries, collection_id, + collection.get('title'), collection.get('description')) diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py index ebd14cb16..1ab9333b2 100644 --- a/youtube_dl/extractor/cwtv.py +++ b/youtube_dl/extractor/cwtv.py @@ -9,7 +9,7 @@ from ..utils import ( class CWTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cw(?:tv|seed)\.com/(?:shows/)?(?:[^/]+/){2}\?.*\bplay=(?P<id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})' + _VALID_URL = 
r'https?://(?:www\.)?cw(?:tv(?:pr)?|seed)\.com/(?:shows/)?(?:[^/]+/)+[^?]*\?.*\b(?:play|watch)=(?P<id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})' _TESTS = [{ 'url': 'http://cwtv.com/shows/arrow/legends-of-yesterday/?play=6b15e985-9345-4f60-baf8-56e96be57c63', 'info_dict': { @@ -28,7 +28,8 @@ class CWTVIE(InfoExtractor): 'params': { # m3u8 download 'skip_download': True, - } + }, + 'skip': 'redirect to http://cwtv.com/shows/arrow/', }, { 'url': 'http://www.cwseed.com/shows/whose-line-is-it-anyway/jeff-davis-4/?play=24282b12-ead2-42f2-95ad-26770c2c6088', 'info_dict': { @@ -44,22 +45,43 @@ class CWTVIE(InfoExtractor): 'upload_date': '20151006', 'timestamp': 1444107300, }, - 'params': { - # m3u8 download - 'skip_download': True, - } }, { 'url': 'http://cwtv.com/thecw/chroniclesofcisco/?play=8adebe35-f447-465f-ab52-e863506ff6d6', 'only_matching': True, + }, { + 'url': 'http://cwtvpr.com/the-cw/video?watch=9eee3f60-ef4e-440b-b3b2-49428ac9c54e', + 'only_matching': True, + }, { + 'url': 'http://cwtv.com/shows/arrow/legends-of-yesterday/?watch=6b15e985-9345-4f60-baf8-56e96be57c63', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'http://metaframe.digitalsmiths.tv/v2/CWtv/assets/%s/partner/132?format=json' % video_id, video_id) - - formats = self._extract_m3u8_formats( - video_data['videos']['variantplaylist']['uri'], video_id, 'mp4') + video_data = None + formats = [] + for partner in (154, 213): + vdata = self._download_json( + 'http://metaframe.digitalsmiths.tv/v2/CWtv/assets/%s/partner/%d?format=json' % (video_id, partner), video_id, fatal=False) + if not vdata: + continue + video_data = vdata + for quality, quality_data in vdata.get('videos', {}).items(): + quality_url = quality_data.get('uri') + if not quality_url: + continue + if quality == 'variantplaylist': + formats.extend(self._extract_m3u8_formats( + quality_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + else: 
+ tbr = int_or_none(quality_data.get('bitrate')) + format_id = 'http' + ('-%d' % tbr if tbr else '') + if self._is_valid_url(quality_url, video_id, format_id): + formats.append({ + 'format_id': format_id, + 'url': quality_url, + 'tbr': tbr, + }) self._sort_formats(formats) thumbnails = [{ diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py index b60a1d813..98c835bf1 100644 --- a/youtube_dl/extractor/dailymail.py +++ b/youtube_dl/extractor/dailymail.py @@ -5,19 +5,20 @@ from .common import InfoExtractor from ..utils import ( int_or_none, determine_protocol, + unescapeHTML, ) class DailyMailIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P<id>[0-9]+)' _TEST = { - 'url': 'http://www.dailymail.co.uk/video/sciencetech/video-1288527/Turn-video-impressionist-masterpiece.html', - 'md5': '2f639d446394f53f3a33658b518b6615', + 'url': 'http://www.dailymail.co.uk/video/tvshowbiz/video-1295863/The-Mountain-appears-sparkling-water-ad-Heavy-Bubbles.html', + 'md5': 'f6129624562251f628296c3a9ffde124', 'info_dict': { - 'id': '1288527', + 'id': '1295863', 'ext': 'mp4', - 'title': 'Turn any video into an impressionist masterpiece', - 'description': 'md5:88ddbcb504367987b2708bb38677c9d2', + 'title': 'The Mountain appears in sparkling water ad for \'Heavy Bubbles\'', + 'description': 'md5:a93d74b6da172dd5dc4d973e0b766a84', } } @@ -26,7 +27,7 @@ class DailyMailIE(InfoExtractor): webpage = self._download_webpage(url, video_id) video_data = self._parse_json(self._search_regex( r"data-opts='({.+?})'", webpage, 'video data'), video_id) - title = video_data['title'] + title = unescapeHTML(video_data['title']) video_sources = self._download_json(video_data.get( 'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id) @@ -55,7 +56,7 @@ class DailyMailIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'description': video_data.get('descr'), + 'description': 
unescapeHTML(video_data.get('descr')), 'thumbnail': video_data.get('poster') or video_data.get('thumbnail'), 'formats': formats, } diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 1f92823b7..496883d15 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -331,7 +331,9 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): for video_id in re.findall(r'data-xid="(.+?)"', webpage): if video_id not in video_ids: - yield self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') + yield self.url_result( + 'http://www.dailymotion.com/video/%s' % video_id, + DailymotionIE.ie_key(), video_id) video_ids.add(video_id) if re.search(self._MORE_PAGES_INDICATOR, webpage) is None: diff --git a/youtube_dl/extractor/dbtv.py b/youtube_dl/extractor/dbtv.py index caff8842e..6d880d43d 100644 --- a/youtube_dl/extractor/dbtv.py +++ b/youtube_dl/extractor/dbtv.py @@ -38,6 +38,12 @@ class DBTVIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return [url for _, url in re.findall( + r'<iframe[^>]+src=(["\'])((?:https?:)?//(?:www\.)?dbtv\.no/(?:lazy)?player/\d+.*?)\1', + webpage)] + def _real_extract(self, url): video_id, display_id = re.match(self._VALID_URL, url).groups() diff --git a/youtube_dl/extractor/discoverygo.py b/youtube_dl/extractor/discoverygo.py new file mode 100644 index 000000000..c4e83b2c3 --- /dev/null +++ b/youtube_dl/extractor/discoverygo.py @@ -0,0 +1,116 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + extract_attributes, + int_or_none, + parse_age_limit, + unescapeHTML, + ExtractorError, +) + + +class DiscoveryGoIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://(?:www\.)?(?: + discovery| + investigationdiscovery| + discoverylife| + animalplanet| + ahctv| + destinationamerica| + sciencechannel| + tlc| + velocitychannel + 
)go\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)''' + _TEST = { + 'url': 'https://www.discoverygo.com/love-at-first-kiss/kiss-first-ask-questions-later/', + 'info_dict': { + 'id': '57a33c536b66d1cd0345eeb1', + 'ext': 'mp4', + 'title': 'Kiss First, Ask Questions Later!', + 'description': 'md5:fe923ba34050eae468bffae10831cb22', + 'duration': 2579, + 'series': 'Love at First Kiss', + 'season_number': 1, + 'episode_number': 1, + 'age_limit': 14, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + container = extract_attributes( + self._search_regex( + r'(<div[^>]+class=["\']video-player-container[^>]+>)', + webpage, 'video container')) + + video = self._parse_json( + unescapeHTML(container.get('data-video') or container.get('data-json')), + display_id) + + title = video['name'] + + stream = video.get('stream') + if not stream: + if video.get('authenticated') is True: + raise ExtractorError( + 'This video is only available via cable service provider subscription that' + ' is not currently supported. 
You may want to use --cookies.', expected=True) + else: + raise ExtractorError('Unable to find stream') + STREAM_URL_SUFFIX = 'streamUrl' + formats = [] + for stream_kind in ('', 'hds'): + suffix = STREAM_URL_SUFFIX.capitalize() if stream_kind else STREAM_URL_SUFFIX + stream_url = stream.get('%s%s' % (stream_kind, suffix)) + if not stream_url: + continue + if stream_kind == '': + formats.extend(self._extract_m3u8_formats( + stream_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif stream_kind == 'hds': + formats.extend(self._extract_f4m_formats( + stream_url, display_id, f4m_id=stream_kind, fatal=False)) + self._sort_formats(formats) + + video_id = video.get('id') or display_id + description = video.get('description', {}).get('detailed') + duration = int_or_none(video.get('duration')) + + series = video.get('show', {}).get('name') + season_number = int_or_none(video.get('season', {}).get('number')) + episode_number = int_or_none(video.get('episodeNumber')) + + tags = video.get('tags') + age_limit = parse_age_limit(video.get('parental', {}).get('rating')) + + subtitles = {} + captions = stream.get('captions') + if isinstance(captions, list): + for caption in captions: + subtitle_url = caption.get('fileUrl') + if (not subtitle_url or not isinstance(subtitle_url, compat_str) or + not subtitle_url.startswith('http')): + continue + lang = caption.get('fileLang', 'en') + subtitles.setdefault(lang, []).append({'url': subtitle_url}) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'duration': duration, + 'series': series, + 'season_number': season_number, + 'episode_number': episode_number, + 'tags': tags, + 'age_limit': age_limit, + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/dotsub.py b/youtube_dl/extractor/dotsub.py index e9ca236d4..fd64d1a7f 100644 --- a/youtube_dl/extractor/dotsub.py +++ b/youtube_dl/extractor/dotsub.py @@ -10,18 
+10,18 @@ from ..utils import ( class DotsubIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dotsub\.com/view/(?P<id>[^/]+)' _TEST = { - 'url': 'http://dotsub.com/view/aed3b8b2-1889-4df5-ae63-ad85f5572f27', - 'md5': '0914d4d69605090f623b7ac329fea66e', + 'url': 'https://dotsub.com/view/9c63db2a-fa95-4838-8e6e-13deafe47f09', + 'md5': '21c7ff600f545358134fea762a6d42b6', 'info_dict': { - 'id': 'aed3b8b2-1889-4df5-ae63-ad85f5572f27', + 'id': '9c63db2a-fa95-4838-8e6e-13deafe47f09', 'ext': 'flv', - 'title': 'Pyramids of Waste (2010), AKA The Lightbulb Conspiracy - Planned obsolescence documentary', - 'description': 'md5:699a0f7f50aeec6042cb3b1db2d0d074', - 'thumbnail': 're:^https?://dotsub.com/media/aed3b8b2-1889-4df5-ae63-ad85f5572f27/p', - 'duration': 3169, - 'uploader': '4v4l0n42', - 'timestamp': 1292248482.625, - 'upload_date': '20101213', + 'title': 'MOTIVATION - "It\'s Possible" Best Inspirational Video Ever', + 'description': 'md5:41af1e273edbbdfe4e216a78b9d34ac6', + 'thumbnail': 're:^https?://dotsub.com/media/9c63db2a-fa95-4838-8e6e-13deafe47f09/p', + 'duration': 198, + 'uploader': 'liuxt', + 'timestamp': 1385778501.104, + 'upload_date': '20131130', 'view_count': int, } } diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index ce6962755..e366e17e6 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -3,9 +3,17 @@ from __future__ import unicode_literals import hashlib import time +import uuid + from .common import InfoExtractor -from ..utils import (ExtractorError, unescapeHTML) -from ..compat import (compat_str, compat_basestring) +from ..compat import ( + compat_str, + compat_urllib_parse_urlencode, +) +from ..utils import ( + ExtractorError, + unescapeHTML, +) class DouyuTVIE(InfoExtractor): @@ -21,7 +29,6 @@ class DouyuTVIE(InfoExtractor): 'description': 're:.*m7show@163\.com.*', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': '7师傅', - 'uploader_id': '431925', 'is_live': True, }, 'params': { @@ 
-37,7 +44,6 @@ class DouyuTVIE(InfoExtractor): 'description': 'md5:746a2f7a253966a06755a912f0acc0d2', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'douyu小漠', - 'uploader_id': '3769985', 'is_live': True, }, 'params': { @@ -54,7 +60,6 @@ class DouyuTVIE(InfoExtractor): 'description': 're:.*m7show@163\.com.*', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': '7师傅', - 'uploader_id': '431925', 'is_live': True, }, 'params': { @@ -65,6 +70,10 @@ class DouyuTVIE(InfoExtractor): 'only_matching': True, }] + # Decompile core.swf in webpage by ffdec "Search SWFs in memory". core.swf + # is encrypted originally, but ffdec can dump memory to get the decrypted one. + _API_KEY = 'A12Svb&%1UUmf@hC' + def _real_extract(self, url): video_id = self._match_id(url) @@ -75,74 +84,56 @@ class DouyuTVIE(InfoExtractor): room_id = self._html_search_regex( r'"room_id"\s*:\s*(\d+),', page, 'room id') - config = None - # Douyu API sometimes returns error "Unable to load the requested class: eticket_redis_cache" - # Retry with different parameters - same parameters cause same errors - for i in range(5): - prefix = 'room/%s?aid=android&client_sys=android&time=%d' % ( - room_id, int(time.time())) - auth = hashlib.md5((prefix + '1231').encode('ascii')).hexdigest() + room = self._download_json( + 'http://m.douyu.com/html5/live?roomId=%s' % room_id, video_id, + note='Downloading room info')['data'] - config_page = self._download_webpage( - 'http://www.douyutv.com/api/v1/%s&auth=%s' % (prefix, auth), - video_id) - try: - config = self._parse_json(config_page, video_id, fatal=False) - except ExtractorError: - # Wait some time before retrying to get a different time() value - self._sleep(1, video_id, msg_template='%(video_id)s: Error occurs. 
' - 'Waiting for %(timeout)s seconds before retrying') - continue - else: - break - if config is None: - raise ExtractorError('Unable to fetch API result') - - data = config['data'] - - error_code = config.get('error', 0) - if error_code is not 0: - error_desc = 'Server reported error %i' % error_code - if isinstance(data, (compat_str, compat_basestring)): - error_desc += ': ' + data - raise ExtractorError(error_desc, expected=True) - - show_status = data.get('show_status') # 1 = live, 2 = offline - if show_status == '2': + if room.get('show_status') == '2': + raise ExtractorError('Live stream is offline', expected=True) + + tt = compat_str(int(time.time() / 60)) + did = uuid.uuid4().hex.upper() + + sign_content = ''.join((room_id, did, self._API_KEY, tt)) + sign = hashlib.md5((sign_content).encode('utf-8')).hexdigest() + + flv_data = compat_urllib_parse_urlencode({ + 'cdn': 'ws', + 'rate': '0', + 'tt': tt, + 'did': did, + 'sign': sign, + }) + + video_info = self._download_json( + 'http://www.douyu.com/lapi/live/getPlay/%s' % room_id, video_id, + data=flv_data, note='Downloading video info', + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + error_code = video_info.get('error', 0) + if error_code is not 0: raise ExtractorError( - 'Live stream is offline', expected=True) + '%s reported error %i' % (self.IE_NAME, error_code), + expected=True) - base_url = data['rtmp_url'] - live_path = data['rtmp_live'] + base_url = video_info['data']['rtmp_url'] + live_path = video_info['data']['rtmp_live'] - title = self._live_title(unescapeHTML(data['room_name'])) - description = data.get('show_details') - thumbnail = data.get('room_src') + video_url = '%s/%s' % (base_url, live_path) - uploader = data.get('nickname') - uploader_id = data.get('owner_uid') - - multi_formats = data.get('rtmp_multi_bitrate') - if not isinstance(multi_formats, dict): - multi_formats = {} - multi_formats['live'] = live_path - - formats = [{ - 'url': '%s/%s' % (base_url, format_path), - 
'format_id': format_id, - 'preference': 1 if format_id == 'live' else 0, - } for format_id, format_path in multi_formats.items()] - self._sort_formats(formats) + title = self._live_title(unescapeHTML(room['room_name'])) + description = room.get('notice') + thumbnail = room.get('room_src') + uploader = room.get('nickname') return { 'id': room_id, 'display_id': video_id, + 'url': video_url, 'title': title, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, - 'uploader_id': uploader_id, - 'formats': formats, 'is_live': True, } diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py index 639f9182c..e8870c460 100644 --- a/youtube_dl/extractor/drtuber.py +++ b/youtube_dl/extractor/drtuber.py @@ -3,7 +3,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import str_to_int +from ..utils import ( + NO_DEFAULT, + str_to_int, +) class DrTuberIE(InfoExtractor): @@ -17,7 +20,6 @@ class DrTuberIE(InfoExtractor): 'ext': 'mp4', 'title': 'hot perky blonde naked golf', 'like_count': int, - 'dislike_count': int, 'comment_count': int, 'categories': ['Babe', 'Blonde', 'Erotic', 'Outdoor', 'Softcore', 'Solo'], 'thumbnail': 're:https?://.*\.jpg$', @@ -36,25 +38,29 @@ class DrTuberIE(InfoExtractor): r'<source src="([^"]+)"', webpage, 'video URL') title = self._html_search_regex( - [r'<p[^>]+class="title_substrate">([^<]+)</p>', r'<title>([^<]+) - \d+'], + (r'class="title_watch"[^>]*><p>([^<]+)<', + r'<p[^>]+class="title_substrate">([^<]+)</p>', + r'<title>([^<]+) - \d+'), webpage, 'title') thumbnail = self._html_search_regex( r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False) - def extract_count(id_, name): + def extract_count(id_, name, default=NO_DEFAULT): return str_to_int(self._html_search_regex( r'<span[^>]+(?:class|id)="%s"[^>]*>([\d,\.]+)</span>' % id_, - webpage, '%s count' % name, fatal=False)) + webpage, '%s count' % name, default=default, fatal=False)) like_count = 
extract_count('rate_likes', 'like') - dislike_count = extract_count('rate_dislikes', 'dislike') + dislike_count = extract_count('rate_dislikes', 'dislike', default=None) comment_count = extract_count('comments_count', 'comment') cats_str = self._search_regex( - r'<div[^>]+class="categories_list">(.+?)</div>', webpage, 'categories', fatal=False) - categories = [] if not cats_str else re.findall(r'<a title="([^"]+)"', cats_str) + r'<div[^>]+class="categories_list">(.+?)</div>', + webpage, 'categories', fatal=False) + categories = [] if not cats_str else re.findall( + r'<a title="([^"]+)"', cats_str) return { 'id': video_id, diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index 2d74ff855..88d096b30 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -4,26 +4,45 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( ExtractorError, + int_or_none, + float_or_none, + mimetype2ext, parse_iso8601, + remove_end, ) class DRTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dr\.dk/tv/se/(?:[^/]+/)*(?P<id>[\da-z-]+)(?:[/#?]|$)' + _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv/se|nyheder)/(?:[^/]+/)*(?P<id>[\da-z-]+)(?:[/#?]|$)' - _TEST = { - 'url': 'https://www.dr.dk/tv/se/boern/ultra/panisk-paske/panisk-paske-5', - 'md5': 'dc515a9ab50577fa14cc4e4b0265168f', + _TESTS = [{ + 'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10', + 'md5': '25e659cccc9a2ed956110a299fdf5983', 'info_dict': { - 'id': 'panisk-paske-5', + 'id': 'klassen-darlig-taber-10', 'ext': 'mp4', - 'title': 'Panisk Påske (5)', - 'description': 'md5:ca14173c5ab24cd26b0fcc074dff391c', - 'timestamp': 1426984612, - 'upload_date': '20150322', - 'duration': 1455, + 'title': 'Klassen - Dårlig taber (10)', + 'description': 'md5:815fe1b7fa656ed80580f31e8b3c79aa', + 'timestamp': 1471991907, + 'upload_date': '20160823', + 'duration': 606.84, }, - } + 'params': { + 'skip_download': True, + }, + 
}, { + 'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang', + 'md5': '2c37175c718155930f939ef59952474a', + 'info_dict': { + 'id': 'christiania-pusher-street-ryddes-drdkrjpo', + 'ext': 'mp4', + 'title': 'LIVE Christianias rydning af Pusher Street er i gang', + 'description': '- Det er det fedeste, der er sket i 20 år, fortæller christianit til DR Nyheder.', + 'timestamp': 1472800279, + 'upload_date': '20160902', + 'duration': 131.4, + }, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -35,7 +54,8 @@ class DRTVIE(InfoExtractor): 'Video %s is not available' % video_id, expected=True) video_id = self._search_regex( - r'data-(?:material-identifier|episode-slug)="([^"]+)"', + (r'data-(?:material-identifier|episode-slug)="([^"]+)"', + r'data-resource="[^>"]+mu/programcard/expanded/([^"]+)"'), webpage, 'video id') programcard = self._download_json( @@ -43,9 +63,12 @@ class DRTVIE(InfoExtractor): video_id, 'Downloading video JSON') data = programcard['Data'][0] - title = data['Title'] - description = data['Description'] - timestamp = parse_iso8601(data['CreatedTime']) + title = remove_end(self._og_search_title( + webpage, default=None), ' | TV | DR') or data['Title'] + description = self._og_search_description( + webpage, default=None) or data.get('Description') + + timestamp = parse_iso8601(data.get('CreatedTime')) thumbnail = None duration = None @@ -56,16 +79,18 @@ class DRTVIE(InfoExtractor): subtitles = {} for asset in data['Assets']: - if asset['Kind'] == 'Image': - thumbnail = asset['Uri'] - elif asset['Kind'] == 'VideoResource': - duration = asset['DurationInMilliseconds'] / 1000.0 - restricted_to_denmark = asset['RestrictedToDenmark'] - spoken_subtitles = asset['Target'] == 'SpokenSubtitles' - for link in asset['Links']: - uri = link['Uri'] - target = link['Target'] - format_id = target + if asset.get('Kind') == 'Image': + thumbnail = asset.get('Uri') + elif asset.get('Kind') == 'VideoResource': + 
duration = float_or_none(asset.get('DurationInMilliseconds'), 1000) + restricted_to_denmark = asset.get('RestrictedToDenmark') + spoken_subtitles = asset.get('Target') == 'SpokenSubtitles' + for link in asset.get('Links', []): + uri = link.get('Uri') + if not uri: + continue + target = link.get('Target') + format_id = target or '' preference = None if spoken_subtitles: preference = -1 @@ -76,8 +101,8 @@ class DRTVIE(InfoExtractor): video_id, preference, f4m_id=format_id)) elif target == 'HLS': formats.extend(self._extract_m3u8_formats( - uri, video_id, 'mp4', preference=preference, - m3u8_id=format_id)) + uri, video_id, 'mp4', entry_protocol='m3u8_native', + preference=preference, m3u8_id=format_id)) else: bitrate = link.get('Bitrate') if bitrate: @@ -85,7 +110,7 @@ class DRTVIE(InfoExtractor): formats.append({ 'url': uri, 'format_id': format_id, - 'tbr': bitrate, + 'tbr': int_or_none(bitrate), 'ext': link.get('FileFormat'), }) subtitles_list = asset.get('SubtitlesList') @@ -94,12 +119,18 @@ class DRTVIE(InfoExtractor): 'Danish': 'da', } for subs in subtitles_list: - lang = subs['Language'] - subtitles[LANGS.get(lang, lang)] = [{'url': subs['Uri'], 'ext': 'vtt'}] + if not subs.get('Uri'): + continue + lang = subs.get('Language') or 'da' + subtitles.setdefault(LANGS.get(lang, lang), []).append({ + 'url': subs['Uri'], + 'ext': mimetype2ext(subs.get('MimeType')) or 'vtt' + }) if not formats and restricted_to_denmark: - raise ExtractorError( - 'Unfortunately, DR is not allowed to show this program outside Denmark.', expected=True) + self.raise_geo_restricted( + 'Unfortunately, DR is not allowed to show this program outside Denmark.', + expected=True) self._sort_formats(formats) diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 12d28d3b9..d4dfda8cd 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -52,11 +52,24 @@ class EaglePlatformIE(InfoExtractor): @staticmethod def 
_extract_url(webpage): + # Regular iframe embedding mobj = re.search( r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1', webpage) if mobj is not None: return mobj.group('url') + # Basic usage embedding (see http://dultonmedia.github.io/eplayer/) + mobj = re.search( + r'''(?xs) + <script[^>]+ + src=(?P<q1>["\'])(?:https?:)?//(?P<host>.+?\.media\.eagleplatform\.com)/player/player\.js(?P=q1) + .+? + <div[^>]+ + class=(?P<q2>["\'])eagleplayer(?P=q2)[^>]+ + data-id=["\'](?P<id>\d+) + ''', webpage) + if mobj is not None: + return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() @staticmethod def _handle_error(response): diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py index e5e57d485..a39e9010d 100644 --- a/youtube_dl/extractor/engadget.py +++ b/youtube_dl/extractor/engadget.py @@ -4,9 +4,10 @@ from .common import InfoExtractor class EngadgetIE(InfoExtractor): - _VALID_URL = r'https?://www.engadget.com/video/(?P<id>\d+)' + _VALID_URL = r'https?://www.engadget.com/video/(?P<id>[^/?#]+)' - _TEST = { + _TESTS = [{ + # video with 5min ID 'url': 'http://www.engadget.com/video/518153925/', 'md5': 'c6820d4828a5064447a4d9fc73f312c9', 'info_dict': { @@ -15,8 +16,12 @@ class EngadgetIE(InfoExtractor): 'title': 'Samsung Galaxy Tab Pro 8.4 Review', }, 'add_ie': ['FiveMin'], - } + }, { + # video with vidible ID + 'url': 'https://www.engadget.com/video/57a28462134aa15a39f0421a/', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - return self.url_result('5min:%s' % video_id) + return self.url_result('aol-video:%s' % video_id) diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index ac5d0fe24..f3734e9f8 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -4,19 +4,23 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils 
import ( + encode_base_n, + ExtractorError, + int_or_none, parse_duration, str_to_int, ) class EpornerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\w+)/(?P<display_id>[\w-]+)' + _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\w+)(?:/(?P<display_id>[\w-]+))?' _TESTS = [{ 'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/', 'md5': '39d486f046212d8e1b911c52ab4691f8', 'info_dict': { - 'id': '95008', + 'id': 'qlDUmNsj6VS', 'display_id': 'Infamous-Tiffany-Teen-Strip-Tease-Video', 'ext': 'mp4', 'title': 'Infamous Tiffany Teen Strip Tease Video', @@ -28,34 +32,72 @@ class EpornerIE(InfoExtractor): # New (May 2016) URL layout 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0/Star-Wars-XXX-Parody/', 'only_matching': True, + }, { + 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0', + 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - display_id = mobj.group('display_id') + display_id = mobj.group('display_id') or video_id - webpage = self._download_webpage(url, display_id) - title = self._html_search_regex( - r'<title>(.*?) - EPORNER', webpage, 'title') + webpage, urlh = self._download_webpage_handle(url, display_id) - redirect_url = 'http://www.eporner.com/config5/%s' % video_id - player_code = self._download_webpage( - redirect_url, display_id, note='Downloading player config') + video_id = self._match_id(compat_str(urlh.geturl())) - sources = self._search_regex( - r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', player_code, 'sources') + hash = self._search_regex( + r'hash\s*:\s*["\']([\da-f]{32})', webpage, 'hash') + + title = self._og_search_title(webpage, default=None) or self._html_search_regex( + r'<title>(.+?) 
- EPORNER', webpage, 'title') + + # Reverse engineered from vjs.js + def calc_hash(s): + return ''.join((encode_base_n(int(s[lb:lb + 8], 16), 36) for lb in range(0, 32, 8))) + + video = self._download_json( + 'http://www.eporner.com/xhr/video/%s' % video_id, + display_id, note='Downloading video JSON', + query={ + 'hash': calc_hash(hash), + 'device': 'generic', + 'domain': 'www.eporner.com', + 'fallback': 'false', + }) + + if video.get('available') is False: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, video['message']), expected=True) + + sources = video['sources'] formats = [] - for video_url, format_id in re.findall(r'file\s*:\s*"([^"]+)",\s*label\s*:\s*"([^"]+)"', sources): - fmt = { - 'url': video_url, - 'format_id': format_id, - } - m = re.search(r'^(\d+)', format_id) - if m: - fmt['height'] = int(m.group(1)) - formats.append(fmt) + for kind, formats_dict in sources.items(): + if not isinstance(formats_dict, dict): + continue + for format_id, format_dict in formats_dict.items(): + if not isinstance(format_dict, dict): + continue + src = format_dict.get('src') + if not isinstance(src, compat_str) or not src.startswith('http'): + continue + if kind == 'hls': + formats.extend(self._extract_m3u8_formats( + src, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=kind, fatal=False)) + else: + height = int_or_none(self._search_regex( + r'(\d+)[pP]', format_id, 'height', default=None)) + fps = int_or_none(self._search_regex( + r'(\d+)fps', format_id, 'fps', default=None)) + + formats.append({ + 'url': src, + 'format_id': format_id, + 'height': height, + 'fps': fps, + }) self._sort_formats(formats) duration = parse_duration(self._html_search_meta('duration', webpage)) diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py index 66c08bec4..6d10f8e68 100644 --- a/youtube_dl/extractor/espn.py +++ b/youtube_dl/extractor/espn.py @@ -5,7 +5,7 @@ from ..utils import remove_end class ESPNIE(InfoExtractor): - _VALID_URL = 
r'https?://espn\.go\.com/(?:[^/]+/)*(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:espn\.go|(?:www\.)?espn)\.com/(?:[^/]+/)*(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://espn.go.com/video/clip?id=10365079', 'md5': '60e5d097a523e767d06479335d1bdc58', @@ -47,6 +47,9 @@ class ESPNIE(InfoExtractor): }, { 'url': 'http://espn.go.com/nba/playoffs/2015/story/_/id/12887571/john-wall-washington-wizards-no-swelling-left-hand-wrist-game-5-return', 'only_matching': True, + }, { + 'url': 'http://www.espn.com/video/clip?id=10365079', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py deleted file mode 100644 index 09ed4f2b5..000000000 --- a/youtube_dl/extractor/exfm.py +++ /dev/null @@ -1,58 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class ExfmIE(InfoExtractor): - IE_NAME = 'exfm' - IE_DESC = 'ex.fm' - _VALID_URL = r'https?://(?:www\.)?ex\.fm/song/(?P<id>[^/]+)' - _SOUNDCLOUD_URL = r'http://(?:www\.)?api\.soundcloud\.com/tracks/([^/]+)/stream' - _TESTS = [ - { - 'url': 'http://ex.fm/song/eh359', - 'md5': 'e45513df5631e6d760970b14cc0c11e7', - 'info_dict': { - 'id': '44216187', - 'ext': 'mp3', - 'title': 'Test House "Love Is Not Enough" (Extended Mix) DeadJournalist Exclusive', - 'uploader': 'deadjournalist', - 'upload_date': '20120424', - 'description': 'Test House \"Love Is Not Enough\" (Extended Mix) DeadJournalist Exclusive', - }, - 'note': 'Soundcloud song', - 'skip': 'The site is down too often', - }, - { - 'url': 'http://ex.fm/song/wddt8', - 'md5': '966bd70741ac5b8570d8e45bfaed3643', - 'info_dict': { - 'id': 'wddt8', - 'ext': 'mp3', - 'title': 'Safe and Sound', - 'uploader': 'Capital Cities', - }, - 'skip': 'The site is down too often', - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - song_id = mobj.group('id') - info_url = 'http://ex.fm/api/v3/song/%s' % song_id - info = self._download_json(info_url, 
song_id)['song'] - song_url = info['url'] - if re.match(self._SOUNDCLOUD_URL, song_url) is not None: - self.to_screen('Soundcloud song detected') - return self.url_result(song_url.replace('/stream', ''), 'Soundcloud') - return { - 'id': song_id, - 'url': song_url, - 'ext': 'mp3', - 'title': info['title'], - 'thumbnail': info['image']['large'], - 'uploader': info['artist'], - 'view_count': info['loved_count'], - } diff --git a/youtube_dl/extractor/expotv.py b/youtube_dl/extractor/expotv.py index 1585a03bb..971c918a4 100644 --- a/youtube_dl/extractor/expotv.py +++ b/youtube_dl/extractor/expotv.py @@ -1,7 +1,5 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -12,23 +10,22 @@ from ..utils import ( class ExpoTVIE(InfoExtractor): _VALID_URL = r'https?://www\.expotv\.com/videos/[^?#]*/(?P<id>[0-9]+)($|[?#])' _TEST = { - 'url': 'http://www.expotv.com/videos/reviews/1/24/LinneCardscom/17561', - 'md5': '2985e6d7a392b2f7a05e0ca350fe41d0', + 'url': 'http://www.expotv.com/videos/reviews/3/40/NYX-Butter-lipstick/667916', + 'md5': 'fe1d728c3a813ff78f595bc8b7a707a8', 'info_dict': { - 'id': '17561', + 'id': '667916', 'ext': 'mp4', - 'upload_date': '20060212', - 'title': 'My Favorite Online Scrapbook Store', - 'view_count': int, - 'description': 'You\'ll find most everything you need at this virtual store front.', - 'uploader': 'Anna T.', + 'title': 'NYX Butter Lipstick Little Susie', + 'description': 'Goes on like butter, but looks better!', 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Stephanie S.', + 'upload_date': '20150520', + 'view_count': int, } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) player_key = self._search_regex( @@ -66,7 +63,7 @@ class ExpoTVIE(InfoExtractor): fatal=False) upload_date = unified_strdate(self._search_regex( r'<h5>Reviewed on 
([0-9/.]+)</h5>', webpage, 'upload date', - fatal=False)) + fatal=False), day_first=False) return { 'id': video_id, diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1145361e9..870e10ebf 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1,7 +1,10 @@ # flake8: noqa from __future__ import unicode_literals -from .abc import ABCIE +from .abc import ( + ABCIE, + ABCIViewIE, +) from .abc7news import Abc7NewsIE from .abcnews import ( AbcNewsIE, @@ -29,6 +32,7 @@ from .aftonbladet import AftonbladetIE from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE +from .amcnetworks import AMCNetworksIE from .animeondemand import AnimeOnDemandIE from .anitube import AnitubeIE from .anysex import AnySexIE @@ -44,6 +48,7 @@ from .appletrailers import ( AppleTrailersSectionIE, ) from .archiveorg import ArchiveOrgIE +from .arkena import ArkenaIE from .ard import ( ARDIE, ARDMediathekIE, @@ -66,6 +71,12 @@ from .atttechchannel import ATTTechChannelIE from .audimedia import AudiMediaIE from .audioboom import AudioBoomIE from .audiomack import AudiomackIE, AudiomackAlbumIE +from .awaan import ( + AWAANIE, + AWAANVideoIE, + AWAANLiveIE, + AWAANSeasonIE, +) from .azubu import AzubuIE, AzubuLiveIE from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE @@ -116,9 +127,12 @@ from .carambatv import ( CarambaTVIE, CarambaTVPageIE, ) +from .cartoonnetwork import CartoonNetworkIE from .cbc import ( CBCIE, CBCPlayerIE, + CBCWatchVideoIE, + CBCWatchIE, ) from .cbs import CBSIE from .cbslocal import CBSLocalIE @@ -132,6 +146,7 @@ from .ccc import CCCIE from .cda import CDAIE from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE +from .charlierose import CharlieRoseIE from .chaturbate import ChaturbateIE from .chilloutzone import ChilloutzoneIE from .chirbit import ( @@ -156,7 +171,12 @@ from .cnn import ( ) from 
.coub import CoubIE from .collegerama import CollegeRamaIE -from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE +from .comedycentral import ( + ComedyCentralIE, + ComedyCentralShortnameIE, + ComedyCentralTVIE, + ToshIE, +) from .comcarcoff import ComCarCoffIE from .commonmistakes import CommonMistakesIE, UnicodeBOMIE from .commonprotocols import RtmpIE @@ -174,6 +194,10 @@ from .ctsnews import CtsNewsIE from .ctv import CTVIE from .ctvnews import CTVNewsIE from .cultureunplugged import CultureUnpluggedIE +from .curiositystream import ( + CuriosityStreamIE, + CuriosityStreamCollectionIE, +) from .cwtv import CWTVIE from .dailymail import DailyMailIE from .dailymotion import ( @@ -189,12 +213,6 @@ from .daum import ( DaumUserIE, ) from .dbtv import DBTVIE -from .dcn import ( - DCNIE, - DCNVideoIE, - DCNLiveIE, - DCNSeasonIE, -) from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE from .democracynow import DemocracynowIE @@ -215,6 +233,7 @@ from .dvtv import DVTVIE from .dumpert import DumpertIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE +from .discoverygo import DiscoveryGoIE from .dispeak import DigitallySpeakingIE from .dropbox import DropboxIE from .dw import ( @@ -242,13 +261,18 @@ from .espn import ESPNIE from .esri import EsriVideoIE from .europa import EuropaIE from .everyonesmixtape import EveryonesMixtapeIE -from .exfm import ExfmIE from .expotv import ExpoTVIE from .extremetube import ExtremeTubeIE from .eyedotv import EyedoTVIE -from .facebook import FacebookIE +from .facebook import ( + FacebookIE, + FacebookPluginsVideoIE, +) from .faz import FazIE -from .fc2 import FC2IE +from .fc2 import ( + FC2IE, + FC2EmbedIE, +) from .fczenit import FczenitIE from .firstpost import FirstpostIE from .firsttv import FirstTVIE @@ -263,12 +287,12 @@ from .formula1 import Formula1IE from .fourtube import FourTubeIE from .fox import FOXIE from .foxgay import FoxgayIE -from .foxnews import FoxNewsIE -from .foxsports 
import FoxSportsIE -from .franceculture import ( - FranceCultureIE, - FranceCultureEmissionIE, +from .foxnews import ( + FoxNewsIE, + FoxNewsInsiderIE, ) +from .foxsports import FoxSportsIE +from .franceculture import FranceCultureIE from .franceinter import FranceInterIE from .francetv import ( PluzzIE, @@ -283,8 +307,8 @@ from .freevideo import FreeVideoIE from .funimation import FunimationIE from .funnyordie import FunnyOrDieIE from .fusion import FusionIE +from .fxnetworks import FXNetworksIE from .gameinformer import GameInformerIE -from .gamekings import GamekingsIE from .gameone import ( GameOneIE, GameOnePlaylistIE, @@ -303,9 +327,9 @@ from .globo import ( GloboIE, GloboArticleIE, ) +from .go import GoIE from .godtube import GodTubeIE from .godtv import GodTVIE -from .goldenmoustache import GoldenMoustacheIE from .golem import GolemIE from .googledrive import GoogleDriveIE from .googleplus import GooglePlusIE @@ -320,6 +344,10 @@ from .heise import HeiseIE from .hellporno import HellPornoIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE +from .hgtv import ( + HGTVIE, + HGTVComShowIE, +) from .historicfilms import HistoricFilmsIE from .hitbox import HitboxIE, HitboxLiveIE from .hornbunny import HornBunnyIE @@ -393,6 +421,10 @@ from .kuwo import ( ) from .la7 import LA7IE from .laola1tv import Laola1TvIE +from .lcp import ( + LcpPlayIE, + LcpIE, +) from .learnr import LearnrIE from .lecture2go import Lecture2GoIE from .lemonde import LemondeIE @@ -467,11 +499,11 @@ from .motherless import MotherlessIE from .motorsport import MotorsportIE from .movieclips import MovieClipsIE from .moviezine import MoviezineIE +from .movingimage import MovingImageIE from .msn import MSNIE from .mtv import ( MTVIE, MTVServicesEmbeddedIE, - MTVIggyIE, MTVDEIE, ) from .muenchentv import MuenchenTVIE @@ -483,8 +515,9 @@ from .myvi import MyviIE from .myvideo import MyVideoIE from .myvidster import MyVidsterIE from .nationalgeographic import ( + 
NationalGeographicVideoIE, NationalGeographicIE, - NationalGeographicChannelIE, + NationalGeographicEpisodeGuideIE, ) from .naver import NaverIE from .nba import NBAIE @@ -521,9 +554,9 @@ from .nextmedia import ( NextMediaActionNewsIE, AppleDailyIE, ) -from .nextmovie import NextMovieIE from .nfb import NFBIE from .nfl import NFLIE +from .nhk import NhkVodIE from .nhl import ( NHLVideocenterIE, NHLNewsIE, @@ -535,9 +568,13 @@ from .nick import ( NickDeIE, ) from .niconico import NiconicoIE, NiconicoPlaylistIE -from .ninecninemedia import NineCNineMediaIE +from .ninecninemedia import ( + NineCNineMediaStackIE, + NineCNineMediaIE, +) from .ninegag import NineGagIE from .ninenow import NineNowIE +from .nintendo import NintendoIE from .noco import NocoIE from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE @@ -582,6 +619,7 @@ from .nytimes import ( NYTimesArticleIE, ) from .nuvid import NuvidIE +from .odatv import OdaTVIE from .odnoklassniki import OdnoklassnikiIE from .oktoberfesttv import OktoberfestTVIE from .onet import ( @@ -615,7 +653,6 @@ from .phoenix import PhoenixIE from .photobucket import PhotobucketIE from .pinkbike import PinkbikeIE from .pladform import PladformIE -from .played import PlayedIE from .playfm import PlayFMIE from .plays import PlaysTVIE from .playtvak import PlaytvakIE @@ -626,8 +663,10 @@ from .pluralsight import ( PluralsightCourseIE, ) from .podomatic import PodomaticIE +from .pokemon import PokemonIE from .polskieradio import PolskieRadioIE from .porn91 import Porn91IE +from .porncom import PornComIE from .pornhd import PornHdIE from .pornhub import ( PornHubIE, @@ -684,6 +723,7 @@ from .rockstargames import RockstarGamesIE from .roosterteeth import RoosterTeethIE from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE +from .rozhlas import RozhlasIE from .rtbf import RTBFIE from .rte import RteIE, RteRadioIE from .rtlnl import RtlNlIE @@ -743,6 +783,7 @@ from .smotri import ( ) from .snotr import 
SnotrIE from .sohu import SohuIE +from .sonyliv import SonyLIVIE from .soundcloud import ( SoundcloudIE, SoundcloudSetIE, @@ -779,9 +820,9 @@ from .srgssr import ( SRGSSRPlayIE, ) from .srmediathek import SRMediathekIE -from .ssa import SSAIE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE +from .streamable import StreamableIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE from .streetvoice import StreetVoiceIE @@ -797,8 +838,8 @@ from .tagesschau import ( TagesschauPlayerIE, TagesschauIE, ) -from .tapely import TapelyIE from .tass import TassIE +from .tbs import TBSIE from .tdslifeway import TDSLifewayIE from .teachertube import ( TeacherTubeIE, @@ -841,16 +882,11 @@ from .tnaflix import ( MovieFapIE, ) from .toggle import ToggleIE -from .thvideo import ( - THVideoIE, - THVideoPlaylistIE -) from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE from .trollvids import TrollvidsIE -from .trutube import TruTubeIE from .tube8 import Tube8IE from .tubitv import TubiTvIE from .tudou import ( @@ -881,10 +917,14 @@ from .tvc import ( from .tvigle import TvigleIE from .tvland import TVLandIE from .tvp import ( + TVPEmbedIE, TVPIE, TVPSeriesIE, ) -from .tvplay import TVPlayIE +from .tvplay import ( + TVPlayIE, + ViafreeIE, +) from .tvple import TvpleIE from .tweakers import TweakersIE from .twentyfourvideo import TwentyFourVideoIE @@ -914,8 +954,14 @@ from .udemy import ( from .udn import UDNEmbedIE from .digiteka import DigitekaIE from .unistra import UnistraIE +from .uol import UOLIE +from .uplynk import ( + UplynkIE, + UplynkPreplayIE, +) from .urort import UrortIE from .urplay import URPlayIE +from .usanetwork import USANetworkIE from .usatoday import USATodayIE from .ustream import UstreamIE, UstreamChannelIE from .ustudio import ( @@ -942,6 +988,7 @@ from .vice import ( ViceIE, ViceShowIE, ) +from .viceland import 
VicelandIE from .vidbit import VidbitIE from .viddler import ViddlerIE from .videodetective import VideoDetectiveIE @@ -995,6 +1042,7 @@ from .vk import ( ) from .vlive import VLiveIE from .vodlocker import VodlockerIE +from .vodplatform import VODPlatformIE from .voicerepublic import VoiceRepublicIE from .voxmedia import VoxMediaIE from .vporn import VpornIE @@ -1087,8 +1135,4 @@ from .youtube import ( ) from .zapiks import ZapiksIE from .zdf import ZDFIE, ZDFChannelIE -from .zingmp3 import ( - ZingMp3SongIE, - ZingMp3AlbumIE, -) -from .zippcast import ZippCastIE +from .zingmp3 import ZingMp3IE diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index 3403581fd..445f9438d 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -1,20 +1,14 @@ from __future__ import unicode_literals -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - sanitized_Request, - str_to_int, -) +from ..utils import str_to_int +from .keezmovies import KeezMoviesIE -class ExtremeTubeIE(InfoExtractor): +class ExtremeTubeIE(KeezMoviesIE): _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?P<id>[^/#?&]+)' _TESTS = [{ 'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', - 'md5': '344d0c6d50e2f16b06e49ca011d8ac69', + 'md5': '1fb9228f5e3332ec8c057d6ac36f33e0', 'info_dict': { 'id': 'music-video-14-british-euro-brit-european-cumshots-swallow-652431', 'ext': 'mp4', @@ -35,58 +29,22 @@ class ExtremeTubeIE(InfoExtractor): }] def _real_extract(self, url): - video_id = self._match_id(url) + webpage, info = self._extract_info(url) - req = sanitized_Request(url) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) + if not info['title']: + info['title'] = self._search_regex( + r'<h1[^>]+title="([^"]+)"[^>]*>', webpage, 'title') - video_title = self._html_search_regex( - r'<h1 
[^>]*?title="([^"]+)"[^>]*>', webpage, 'title') uploader = self._html_search_regex( r'Uploaded by:\s*</strong>\s*(.+?)\s*</div>', webpage, 'uploader', fatal=False) - view_count = str_to_int(self._html_search_regex( + view_count = str_to_int(self._search_regex( r'Views:\s*</strong>\s*<span>([\d,\.]+)</span>', webpage, 'view count', fatal=False)) - flash_vars = self._parse_json( - self._search_regex( - r'var\s+flashvars\s*=\s*({.+?});', webpage, 'flash vars'), - video_id) - - formats = [] - for quality_key, video_url in flash_vars.items(): - height = int_or_none(self._search_regex( - r'quality_(\d+)[pP]$', quality_key, 'height', default=None)) - if not height: - continue - f = { - 'url': video_url, - } - mobj = re.search( - r'/(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', video_url) - if mobj: - height = int(mobj.group('height')) - bitrate = int(mobj.group('bitrate')) - f.update({ - 'format_id': '%dp-%dk' % (height, bitrate), - 'height': height, - 'tbr': bitrate, - }) - else: - f.update({ - 'format_id': '%dp' % height, - 'height': height, - }) - formats.append(f) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': video_title, - 'formats': formats, + info.update({ 'uploader': uploader, 'view_count': view_count, - 'age_limit': 18, - } + }) + + return info diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index cdb093262..3a220e995 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -15,6 +15,7 @@ from ..compat import ( from ..utils import ( error_to_compat_str, ExtractorError, + int_or_none, limit_length, sanitized_Request, urlencode_postdata, @@ -27,7 +28,7 @@ class FacebookIE(InfoExtractor): _VALID_URL = r'''(?x) (?: https?:// - (?:\w+\.)?facebook\.com/ + (?:[\w-]+\.)?facebook\.com/ (?:[^#]*?\#!/)? 
(?: (?: @@ -62,6 +63,8 @@ class FacebookIE(InfoExtractor): 'ext': 'mp4', 'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam', 'uploader': 'Tennis on Facebook', + 'upload_date': '20140908', + 'timestamp': 1410199200, } }, { 'note': 'Video without discernible title', @@ -71,6 +74,8 @@ class FacebookIE(InfoExtractor): 'ext': 'mp4', 'title': 'Facebook video #274175099429670', 'uploader': 'Asif Nawab Butt', + 'upload_date': '20140506', + 'timestamp': 1399398998, }, 'expected_warnings': [ 'title' @@ -78,12 +83,14 @@ class FacebookIE(InfoExtractor): }, { 'note': 'Video with DASH manifest', 'url': 'https://www.facebook.com/video.php?v=957955867617029', - 'md5': '54706e4db4f5ad58fbad82dde1f1213f', + 'md5': 'b2c28d528273b323abe5c6ab59f0f030', 'info_dict': { 'id': '957955867617029', 'ext': 'mp4', 'title': 'When you post epic content on instagram.com/433 8 million followers, this is ...', 'uploader': 'Demy de Zeeuw', + 'upload_date': '20160110', + 'timestamp': 1452431627, }, }, { 'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570', @@ -127,6 +134,9 @@ class FacebookIE(InfoExtractor): }, { 'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/', 'only_matching': True, + }, { + 'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/', + 'only_matching': True, }] @staticmethod @@ -303,12 +313,16 @@ class FacebookIE(InfoExtractor): if not video_title: video_title = 'Facebook video #%s' % video_id uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) + timestamp = int_or_none(self._search_regex( + r'<abbr[^>]+data-utime=["\'](\d+)', webpage, + 'timestamp', default=None)) info_dict = { 'id': video_id, 'title': video_title, 'formats': formats, 'uploader': uploader, + 'timestamp': timestamp, } return webpage, info_dict @@ -337,3 +351,32 @@ class FacebookIE(InfoExtractor): self._VIDEO_PAGE_TEMPLATE % video_id, video_id, fatal_if_no_video=True) return info_dict + + 
+class FacebookPluginsVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/plugins/video\.php\?.*?\bhref=(?P<id>https.+)' + + _TESTS = [{ + 'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fgov.sg%2Fvideos%2F10154383743583686%2F&show_text=0&width=560', + 'md5': '5954e92cdfe51fe5782ae9bda7058a07', + 'info_dict': { + 'id': '10154383743583686', + 'ext': 'mp4', + 'title': 'What to do during the haze?', + 'uploader': 'Gov.sg', + 'upload_date': '20160826', + 'timestamp': 1472184808, + }, + 'add_ie': [FacebookIE.ie_key()], + }, { + 'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fvideo.php%3Fv%3D10204634152394104', + 'only_matching': True, + }, { + 'url': 'https://www.facebook.com/plugins/video.php?href=https://www.facebook.com/gov.sg/videos/10154383743583686/&show_text=0&width=560', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self.url_result( + compat_urllib_parse_unquote(self._match_id(url)), + FacebookIE.ie_key()) diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py index c7d69ff1f..c032d4d02 100644 --- a/youtube_dl/extractor/fc2.py +++ b/youtube_dl/extractor/fc2.py @@ -1,10 +1,12 @@ -#! 
-*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import hashlib +import re from .common import InfoExtractor from ..compat import ( + compat_parse_qs, compat_urllib_request, compat_urlparse, ) @@ -16,7 +18,7 @@ from ..utils import ( class FC2IE(InfoExtractor): - _VALID_URL = r'^https?://video\.fc2\.com/(?:[^/]+/)*content/(?P<id>[^/]+)' + _VALID_URL = r'^(?:https?://video\.fc2\.com/(?:[^/]+/)*content/|fc2:)(?P<id>[^/]+)' IE_NAME = 'fc2' _NETRC_MACHINE = 'fc2' _TESTS = [{ @@ -75,12 +77,17 @@ class FC2IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) self._login() - webpage = self._download_webpage(url, video_id) - self._downloader.cookiejar.clear_session_cookies() # must clear - self._login() + webpage = None + if not url.startswith('fc2:'): + webpage = self._download_webpage(url, video_id) + self._downloader.cookiejar.clear_session_cookies() # must clear + self._login() - title = self._og_search_title(webpage) - thumbnail = self._og_search_thumbnail(webpage) + title = 'FC2 video %s' % video_id + thumbnail = None + if webpage is not None: + title = self._og_search_title(webpage) + thumbnail = self._og_search_thumbnail(webpage) refer = url.replace('/content/', '/a/content/') if '/a/content/' not in url else url mimi = hashlib.md5((video_id + '_gGddgPfeaf_gzyr').encode('utf-8')).hexdigest() @@ -113,3 +120,41 @@ class FC2IE(InfoExtractor): 'ext': 'flv', 'thumbnail': thumbnail, } + + +class FC2EmbedIE(InfoExtractor): + _VALID_URL = r'https?://video\.fc2\.com/flv2\.swf\?(?P<query>.+)' + IE_NAME = 'fc2:embed' + + _TEST = { + 'url': 'http://video.fc2.com/flv2.swf?t=201404182936758512407645&i=20130316kwishtfitaknmcgd76kjd864hso93htfjcnaogz629mcgfs6rbfk0hsycma7shkf85937cbchfygd74&i=201403223kCqB3Ez&d=2625&sj=11&lang=ja&rel=1&from=11&cmt=1&tk=TlRBM09EQTNNekU9&tl=プリズン・ブレイク%20S1-01%20マイケル%20【吹替】', + 'md5': 'b8aae5334cb691bdb1193a88a6ab5d5a', + 'info_dict': { + 'id': '201403223kCqB3Ez', + 'ext': 'flv', + 'title': 
'プリズン・ブレイク S1-01 マイケル 【吹替】', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + query = compat_parse_qs(mobj.group('query')) + + video_id = query['i'][-1] + title = query.get('tl', ['FC2 video %s' % video_id])[0] + + sj = query.get('sj', [None])[0] + thumbnail = None + if sj: + # See thumbnailImagePath() in ServerConst.as of flv2.swf + thumbnail = 'http://video%s-thumbnail.fc2.com/up/pic/%s.jpg' % ( + sj, '/'.join((video_id[:6], video_id[6:8], video_id[-2], video_id[-1], video_id))) + + return { + '_type': 'url_transparent', + 'ie_key': FC2IE.ie_key(), + 'url': 'fc2:%s' % video_id, + 'title': title, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py index 88bca1007..332d12020 100644 --- a/youtube_dl/extractor/firsttv.py +++ b/youtube_dl/extractor/firsttv.py @@ -2,44 +2,40 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_xpath +from ..compat import compat_urlparse from ..utils import ( int_or_none, qualities, unified_strdate, - xpath_attr, - xpath_element, - xpath_text, - xpath_with_ns, ) class FirstTVIE(InfoExtractor): IE_NAME = '1tv' IE_DESC = 'Первый канал' - _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+p?(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P<id>[^/?#]+)' _TESTS = [{ - # single format via video_materials.json API - 'url': 'http://www.1tv.ru/prj/inprivate/vypusk/35930', - 'md5': '82a2777648acae812d58b3f5bd42882b', + # single format + 'url': 'http://www.1tv.ru/shows/naedine-so-vsemi/vypuski/gost-lyudmila-senchina-naedine-so-vsemi-vypusk-ot-12-02-2015', + 'md5': 'a1b6b60d530ebcf8daacf4565762bbaf', 'info_dict': { - 'id': '35930', + 'id': '40049', 'ext': 'mp4', 'title': 'Гость Людмила Сенчина. Наедине со всеми. 
Выпуск от 12.02.2015', - 'description': 'md5:357933adeede13b202c7c21f91b871b2', + 'description': 'md5:36a39c1d19618fec57d12efe212a8370', 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', 'upload_date': '20150212', 'duration': 2694, }, }, { - # multiple formats via video_materials.json API - 'url': 'http://www.1tv.ru/video_archive/projects/dobroeutro/p113641', + # multiple formats + 'url': 'http://www.1tv.ru/shows/dobroe-utro/pro-zdorove/vesennyaya-allergiya-dobroe-utro-fragment-vypuska-ot-07042016', 'info_dict': { - 'id': '113641', + 'id': '364746', 'ext': 'mp4', 'title': 'Весенняя аллергия. Доброе утро. Фрагмент выпуска от 07.04.2016', - 'description': 'md5:8dcebb3dded0ff20fade39087fd1fee2', + 'description': 'md5:a242eea0031fd180a4497d52640a9572', 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', 'upload_date': '20160407', 'duration': 179, @@ -48,84 +44,47 @@ class FirstTVIE(InfoExtractor): 'params': { 'skip_download': True, }, - }, { - # single format only available via ONE_ONLINE_VIDEOS.archive_single_xml API - 'url': 'http://www.1tv.ru/video_archive/series/f7552/p47038', - 'md5': '519d306c5b5669761fd8906c39dbee23', - 'info_dict': { - 'id': '47038', - 'ext': 'mp4', - 'title': '"Побег". Второй сезон. 
3 серия', - 'description': 'md5:3abf8f6b9bce88201c33e9a3d794a00b', - 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', - 'upload_date': '20120516', - 'duration': 3080, - }, - }, { - 'url': 'http://www.1tv.ru/videoarchive/9967', - 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + display_id = self._match_id(url) - # Videos with multiple formats only available via this API - video = self._download_json( - 'http://www.1tv.ru/video_materials.json?legacy_id=%s' % video_id, - video_id, fatal=False) - - description, thumbnail, upload_date, duration = [None] * 4 - - if video: - item = video[0] - title = item['title'] - quality = qualities(('ld', 'sd', 'hd', )) - formats = [{ - 'url': f['src'], - 'format_id': f.get('name'), - 'quality': quality(f.get('name')), - } for f in item['mbr'] if f.get('src')] - thumbnail = item.get('poster') - else: - # Some videos are not available via video_materials.json - video = self._download_xml( - 'http://www.1tv.ru/owa/win/ONE_ONLINE_VIDEOS.archive_single_xml?pid=%s' % video_id, - video_id) - - NS_MAP = { - 'media': 'http://search.yahoo.com/mrss/', - } - - item = xpath_element(video, './channel/item', fatal=True) - title = xpath_text(item, './title', fatal=True) - formats = [{ - 'url': content.attrib['url'], - } for content in item.findall( - compat_xpath(xpath_with_ns('./media:content', NS_MAP))) if content.attrib.get('url')] - thumbnail = xpath_attr( - item, xpath_with_ns('./media:thumbnail', NS_MAP), 'url') + webpage = self._download_webpage(url, display_id) + playlist_url = compat_urlparse.urljoin(url, self._search_regex( + r'data-playlist-url="([^"]+)', webpage, 'playlist url')) + item = self._download_json(playlist_url, display_id)[0] + video_id = item['id'] + quality = qualities(('ld', 'sd', 'hd', )) + formats = [] + for f in item.get('mbr', []): + src = f.get('src') + if not src: + continue + fname = f.get('name') + formats.append({ + 'url': src, + 'format_id': fname, + 'quality': 
quality(fname), + }) self._sort_formats(formats) - webpage = self._download_webpage(url, video_id, 'Downloading page', fatal=False) - if webpage: - title = self._html_search_regex( - (r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>', - r"'title'\s*:\s*'([^']+)'"), - webpage, 'title', default=None) or title - description = self._html_search_regex( - r'<div class="descr">\s*<div> </div>\s*<p>([^<]*)</p></div>', - webpage, 'description', default=None) or self._html_search_meta( - 'description', webpage, 'description') - thumbnail = thumbnail or self._og_search_thumbnail(webpage) - duration = int_or_none(self._html_search_meta( - 'video:duration', webpage, 'video duration', fatal=False)) - upload_date = unified_strdate(self._html_search_meta( - 'ya:ovs:upload_date', webpage, 'upload date', fatal=False)) + title = self._html_search_regex( + (r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>', + r"'title'\s*:\s*'([^']+)'"), + webpage, 'title', default=None) or item['title'] + description = self._html_search_regex( + r'<div class="descr">\s*<div> </div>\s*<p>([^<]*)</p></div>', + webpage, 'description', default=None) or self._html_search_meta( + 'description', webpage, 'description') + duration = int_or_none(self._html_search_meta( + 'video:duration', webpage, 'video duration', fatal=False)) + upload_date = unified_strdate(self._html_search_meta( + 'ya:ovs:upload_date', webpage, 'upload date', fatal=False)) return { 'id': video_id, - 'thumbnail': thumbnail, + 'thumbnail': item.get('poster') or self._og_search_thumbnail(webpage), 'title': title, 'description': description, 'upload_date': upload_date, diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py index 6b8345416..f3f876ecd 100644 --- a/youtube_dl/extractor/fivemin.py +++ b/youtube_dl/extractor/fivemin.py @@ -1,24 +1,11 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - 
compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, - compat_urlparse, -) -from ..utils import ( - ExtractorError, - parse_duration, - replace_extension, -) class FiveMinIE(InfoExtractor): IE_NAME = '5min' - _VALID_URL = r'(?:5min:(?P<id>\d+)(?::(?P<sid>\d+))?|https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?P<query>.*))' + _VALID_URL = r'(?:5min:|https?://(?:[^/]*?5min\.com/|delivery\.vidible\.tv/aol)(?:(?:Scripts/PlayerSeed\.js|playerseed/?)?\?.*?playList=)?)(?P<id>\d+)' _TESTS = [ { @@ -29,8 +16,16 @@ class FiveMinIE(InfoExtractor): 'id': '518013791', 'ext': 'mp4', 'title': 'iPad Mini with Retina Display Review', + 'description': 'iPad mini with Retina Display review', 'duration': 177, + 'uploader': 'engadget', + 'upload_date': '20131115', + 'timestamp': 1384515288, }, + 'params': { + # m3u8 download + 'skip_download': True, + } }, { # From http://on.aol.com/video/how-to-make-a-next-level-fruit-salad-518086247 @@ -44,108 +39,16 @@ class FiveMinIE(InfoExtractor): }, 'skip': 'no longer available', }, + { + 'url': 'http://embed.5min.com/518726732/', + 'only_matching': True, + }, + { + 'url': 'http://delivery.vidible.tv/aol?playList=518013791', + 'only_matching': True, + } ] - _ERRORS = { - 'ErrorVideoNotExist': 'We\'re sorry, but the video you are trying to watch does not exist.', - 'ErrorVideoNoLongerAvailable': 'We\'re sorry, but the video you are trying to watch is no longer available.', - 'ErrorVideoRejected': 'We\'re sorry, but the video you are trying to watch has been removed.', - 'ErrorVideoUserNotGeo': 'We\'re sorry, but the video you are trying to watch cannot be viewed from your current location.', - 'ErrorVideoLibraryRestriction': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.', - 'ErrorExposurePermission': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.', - } - _QUALITIES = { - 1: { - 'width': 640, - 'height': 360, - }, - 2: 
{ - 'width': 854, - 'height': 480, - }, - 4: { - 'width': 1280, - 'height': 720, - }, - 8: { - 'width': 1920, - 'height': 1080, - }, - 16: { - 'width': 640, - 'height': 360, - }, - 32: { - 'width': 854, - 'height': 480, - }, - 64: { - 'width': 1280, - 'height': 720, - }, - 128: { - 'width': 640, - 'height': 360, - }, - } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - sid = mobj.group('sid') - - if mobj.group('query'): - qs = compat_parse_qs(mobj.group('query')) - if not qs.get('playList'): - raise ExtractorError('Invalid URL', expected=True) - video_id = qs['playList'][0] - if qs.get('sid'): - sid = qs['sid'][0] - - embed_url = 'https://embed.5min.com/playerseed/?playList=%s' % video_id - if not sid: - embed_page = self._download_webpage(embed_url, video_id, - 'Downloading embed page') - sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid') - - response = self._download_json( - 'https://syn.5min.com/handlers/SenseHandler.ashx?' + - compat_urllib_parse_urlencode({ - 'func': 'GetResults', - 'playlist': video_id, - 'sid': sid, - 'isPlayerSeed': 'true', - 'url': embed_url, - }), - video_id) - if not response['success']: - raise ExtractorError( - '%s said: %s' % ( - self.IE_NAME, - self._ERRORS.get(response['errorMessage'], response['errorMessage'])), - expected=True) - info = response['binding'][0] - - formats = [] - parsed_video_url = compat_urllib_parse_urlparse(compat_parse_qs( - compat_urllib_parse_urlparse(info['EmbededURL']).query)['videoUrl'][0]) - for rendition in info['Renditions']: - if rendition['RenditionType'] == 'aac' or rendition['RenditionType'] == 'm3u8': - continue - else: - rendition_url = compat_urlparse.urlunparse(parsed_video_url._replace(path=replace_extension(parsed_video_url.path.replace('//', '/%s/' % rendition['ID']), rendition['RenditionType']))) - quality = self._QUALITIES.get(rendition['ID'], {}) - formats.append({ - 'format_id': '%s-%d' % (rendition['RenditionType'], 
rendition['ID']), - 'url': rendition_url, - 'width': quality.get('width'), - 'height': quality.get('height'), - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': info['Title'], - 'thumbnail': info.get('ThumbURL'), - 'duration': parse_duration(info.get('Duration')), - 'formats': formats, - } + video_id = self._match_id(url) + return self.url_result('aol-video:%s' % video_id) diff --git a/youtube_dl/extractor/flipagram.py b/youtube_dl/extractor/flipagram.py index acb6133ff..1902a2393 100644 --- a/youtube_dl/extractor/flipagram.py +++ b/youtube_dl/extractor/flipagram.py @@ -48,7 +48,7 @@ class FlipagramIE(InfoExtractor): flipagram = video_data['flipagram'] video = flipagram['video'] - json_ld = self._search_json_ld(webpage, video_id, default=False) + json_ld = self._search_json_ld(webpage, video_id, default={}) title = json_ld.get('title') or flipagram['captionText'] description = json_ld.get('description') or flipagram.get('captionText') diff --git a/youtube_dl/extractor/formula1.py b/youtube_dl/extractor/formula1.py index 322c41e5a..8c417ab65 100644 --- a/youtube_dl/extractor/formula1.py +++ b/youtube_dl/extractor/formula1.py @@ -5,8 +5,8 @@ from .common import InfoExtractor class Formula1IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?formula1\.com/content/fom-website/en/video/\d{4}/\d{1,2}/(?P<id>.+?)\.html' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?formula1\.com/(?:content/fom-website/)?en/video/\d{4}/\d{1,2}/(?P<id>.+?)\.html' + _TESTS = [{ 'url': 'http://www.formula1.com/content/fom-website/en/video/2016/5/Race_highlights_-_Spain_2016.html', 'md5': '8c79e54be72078b26b89e0e111c0502b', 'info_dict': { @@ -15,7 +15,10 @@ class Formula1IE(InfoExtractor): 'title': 'Race highlights - Spain 2016', }, 'add_ie': ['Ooyala'], - } + }, { + 'url': 'http://www.formula1.com/en/video/2016/5/Race_highlights_-_Spain_2016.html', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) diff --git 
a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py index fc4a5a0fb..9776c8422 100644 --- a/youtube_dl/extractor/fourtube.py +++ b/youtube_dl/extractor/fourtube.py @@ -43,14 +43,14 @@ class FourTubeIE(InfoExtractor): 'uploadDate', webpage)) thumbnail = self._html_search_meta('thumbnailUrl', webpage) uploader_id = self._html_search_regex( - r'<a class="img-avatar" href="[^"]+/channels/([^/"]+)" title="Go to [^"]+ page">', + r'<a class="item-to-subscribe" href="[^"]+/channels/([^/"]+)" title="Go to [^"]+ page">', webpage, 'uploader id', fatal=False) uploader = self._html_search_regex( - r'<a class="img-avatar" href="[^"]+/channels/[^/"]+" title="Go to ([^"]+) page">', + r'<a class="item-to-subscribe" href="[^"]+/channels/[^/"]+" title="Go to ([^"]+) page">', webpage, 'uploader', fatal=False) categories_html = self._search_regex( - r'(?s)><i class="icon icon-tag"></i>\s*Categories / Tags\s*.*?<ul class="list">(.*?)</ul>', + r'(?s)><i class="icon icon-tag"></i>\s*Categories / Tags\s*.*?<ul class="[^"]*?list[^"]*?">(.*?)</ul>', webpage, 'categories', fatal=False) categories = None if categories_html: @@ -59,10 +59,10 @@ class FourTubeIE(InfoExtractor): r'(?s)<li><a.*?>(.*?)</a>', categories_html)] view_count = str_to_int(self._search_regex( - r'<meta itemprop="interactionCount" content="UserPlays:([0-9,]+)">', + r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([0-9,]+)">', webpage, 'view count', fatal=False)) like_count = str_to_int(self._search_regex( - r'<meta itemprop="interactionCount" content="UserLikes:([0-9,]+)">', + r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserLikes:([0-9,]+)">', webpage, 'like count', fatal=False)) duration = parse_duration(self._html_search_meta('duration', webpage)) diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index 95c1abf94..9f406b17e 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -2,7 +2,10 @@ from __future__ import unicode_literals from 
.common import InfoExtractor -from ..utils import smuggle_url +from ..utils import ( + smuggle_url, + update_url_query, +) class FOXIE(InfoExtractor): @@ -29,11 +32,12 @@ class FOXIE(InfoExtractor): release_url = self._parse_json(self._search_regex( r'"fox_pdk_player"\s*:\s*({[^}]+?})', webpage, 'fox_pdk_player'), - video_id)['release_url'] + '&switch=http' + video_id)['release_url'] return { '_type': 'url_transparent', 'ie_key': 'ThePlatform', - 'url': smuggle_url(release_url, {'force_smil_url': True}), + 'url': smuggle_url(update_url_query( + release_url, {'switch': 'http'}), {'force_smil_url': True}), 'id': video_id, } diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index b04da2415..5c7acd795 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -3,11 +3,12 @@ from __future__ import unicode_literals import re from .amp import AMPIE +from .common import InfoExtractor class FoxNewsIE(AMPIE): IE_DESC = 'Fox News and Fox Business Video' - _VALID_URL = r'https?://(?P<host>video\.fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)' + _VALID_URL = r'https?://(?P<host>video\.(?:insider\.)?fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)' _TESTS = [ { 'url': 'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips', @@ -49,6 +50,11 @@ class FoxNewsIE(AMPIE): 'url': 'http://video.foxbusiness.com/v/4442309889001', 'only_matching': True, }, + { + # From http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words + 'url': 'http://video.insider.foxnews.com/v/video-embed.html?video_id=5099377331001&autoplay=true&share_url=http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words&share_title=Student%20Group:%20Saying%20%27Politically%20Correct,%27%20%27Trash%27%20and%20%27Lame%27%20Is%20Offensive&share=true', + 'only_matching': True, + }, ] def _real_extract(self, url): @@ -58,3 +64,43 
@@ class FoxNewsIE(AMPIE): 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id)) info['id'] = video_id return info + + +class FoxNewsInsiderIE(InfoExtractor): + _VALID_URL = r'https?://insider\.foxnews\.com/([^/]+/)+(?P<id>[a-z-]+)' + IE_NAME = 'foxnews:insider' + + _TEST = { + 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', + 'md5': 'a10c755e582d28120c62749b4feb4c0c', + 'info_dict': { + 'id': '5099377331001', + 'display_id': 'univ-wisconsin-student-group-pushing-silence-certain-words', + 'ext': 'mp4', + 'title': 'Student Group: Saying \'Politically Correct,\' \'Trash\' and \'Lame\' Is Offensive', + 'description': 'Is campus censorship getting out of control?', + 'timestamp': 1472168725, + 'upload_date': '20160825', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'add_ie': [FoxNewsIE.ie_key()], + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + embed_url = self._html_search_meta('embedUrl', webpage, 'embed URL') + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + + return { + '_type': 'url_transparent', + 'ie_key': FoxNewsIE.ie_key(), + 'url': embed_url, + 'display_id': display_id, + 'title': title, + 'description': description, + } diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py index e2ca96283..186da0d3b 100644 --- a/youtube_dl/extractor/franceculture.py +++ b/youtube_dl/extractor/franceculture.py @@ -2,104 +2,56 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_urlparse, -) from ..utils import ( determine_ext, - int_or_none, - ExtractorError, + unified_strdate, ) class FranceCultureIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/player/reecouter\?play=(?P<id>[0-9]+)' + _VALID_URL = 
r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TEST = { - 'url': 'http://www.franceculture.fr/player/reecouter?play=4795174', + 'url': 'http://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks', 'info_dict': { - 'id': '4795174', + 'id': 'rendez-vous-au-pays-des-geeks', + 'display_id': 'rendez-vous-au-pays-des-geeks', 'ext': 'mp3', 'title': 'Rendez-vous au pays des geeks', - 'alt_title': 'Carnet nomade | 13-14', - 'vcodec': 'none', + 'thumbnail': 're:^https?://.*\\.jpg$', 'upload_date': '20140301', - 'thumbnail': r're:^http://static\.franceculture\.fr/.*/images/player/Carnet-nomade\.jpg$', - 'description': 'startswith:Avec :Jean-Baptiste Péretié pour son documentaire sur Arte "La revanche', - 'timestamp': 1393700400, + 'vcodec': 'none', } } - def _extract_from_player(self, url, video_id): - webpage = self._download_webpage(url, video_id) + def _real_extract(self, url): + display_id = self._match_id(url) - video_path = self._search_regex( - r'<a id="player".*?href="([^"]+)"', webpage, 'video path') - video_url = compat_urlparse.urljoin(url, video_path) - timestamp = int_or_none(self._search_regex( - r'<a id="player".*?data-date="([0-9]+)"', + webpage = self._download_webpage(url, display_id) + + video_url = self._search_regex( + r'(?s)<div[^>]+class="[^"]*?title-zone-diffusion[^"]*?"[^>]*>.*?<a[^>]+href="([^"]+)"', + webpage, 'video path') + + title = self._og_search_title(webpage) + + upload_date = unified_strdate(self._search_regex( + '(?s)<div[^>]+class="date"[^>]*>.*?<span[^>]+class="inner"[^>]*>([^<]+)<', webpage, 'upload date', fatal=False)) thumbnail = self._search_regex( - r'<a id="player".*?>\s+<img src="([^"]+)"', + r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+data-pagespeed-(?:lazy|high-res)-src="([^"]+)"', webpage, 'thumbnail', fatal=False) - - display_id = self._search_regex( - r'<span class="path-diffusion">emission-(.*?)</span>', webpage, 'display_id') - - title = 
self._html_search_regex( - r'<span class="title-diffusion">(.*?)</span>', webpage, 'title') - alt_title = self._html_search_regex( - r'<span class="title">(.*?)</span>', - webpage, 'alt_title', fatal=False) - description = self._html_search_regex( - r'<span class="description">(.*?)</span>', - webpage, 'description', fatal=False) - uploader = self._html_search_regex( r'(?s)<div id="emission".*?<span class="author">(.*?)</span>', webpage, 'uploader', default=None) vcodec = 'none' if determine_ext(video_url.lower()) == 'mp3' else None return { - 'id': video_id, + 'id': display_id, + 'display_id': display_id, 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, 'vcodec': vcodec, 'uploader': uploader, - 'timestamp': timestamp, - 'title': title, - 'alt_title': alt_title, - 'thumbnail': thumbnail, - 'description': description, - 'display_id': display_id, + 'upload_date': upload_date, } - - def _real_extract(self, url): - video_id = self._match_id(url) - return self._extract_from_player(url, video_id) - - -class FranceCultureEmissionIE(FranceCultureIE): - _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emission-(?P<id>[^?#]+)' - _TEST = { - 'url': 'http://www.franceculture.fr/emission-les-carnets-de-la-creation-jean-gabriel-periot-cineaste-2015-10-13', - 'info_dict': { - 'title': 'Jean-Gabriel Périot, cinéaste', - 'alt_title': 'Les Carnets de la création', - 'id': '5093239', - 'display_id': 'les-carnets-de-la-creation-jean-gabriel-periot-cineaste-2015-10-13', - 'ext': 'mp3', - 'timestamp': 1444762500, - 'upload_date': '20151013', - 'description': 'startswith:Aujourd\'hui dans "Les carnets de la création", le cinéaste', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_path = self._html_search_regex( - r'<a class="rf-player-open".*?href="([^"]+)"', webpage, 'video path', 'no_path_player') - if video_path == 'no_path_player': - raise ExtractorError('no player : no sound in this 
page.', expected=True) - new_id = self._search_regex('play=(?P<id>[0-9]+)', video_path, 'new_id', group='id') - video_url = compat_urlparse.urljoin(url, video_path) - return self._extract_from_player(video_url, new_id) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 7653975e3..3233f66d5 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -131,7 +131,7 @@ class PluzzIE(FranceTVBaseInfoExtractor): class FranceTvInfoIE(FranceTVBaseInfoExtractor): IE_NAME = 'francetvinfo.fr' - _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/.*/(?P<title>.+)\.html' + _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<title>[^/?#&.]+)' _TESTS = [{ 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', @@ -206,6 +206,9 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'uploader_id': 'x2q2ez', }, 'add_ie': ['Dailymotion'], + }, { + 'url': 'http://france3-regions.francetvinfo.fr/limousin/emissions/jt-1213-limousin', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/fxnetworks.py b/youtube_dl/extractor/fxnetworks.py new file mode 100644 index 000000000..629897317 --- /dev/null +++ b/youtube_dl/extractor/fxnetworks.py @@ -0,0 +1,70 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .adobepass import AdobePassIE +from ..utils import ( + update_url_query, + extract_attributes, + parse_age_limit, + smuggle_url, +) + + +class FXNetworksIE(AdobePassIE): + _VALID_URL = r'https?://(?:www\.)?(?:fxnetworks|simpsonsworld)\.com/video/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.fxnetworks.com/video/719841347694', + 'md5': '1447d4722e42ebca19e5232ab93abb22', + 'info_dict': { + 'id': '719841347694', + 'ext': 'mp4', + 'title': 'Vanpage', + 'description': 'F*ck settling down. 
You\'re the Worst returns for an all new season August 31st on FXX.', + 'age_limit': 14, + 'uploader': 'NEWA-FNG-FX', + 'upload_date': '20160706', + 'timestamp': 1467844741, + }, + 'add_ie': ['ThePlatform'], + }, { + 'url': 'http://www.simpsonsworld.com/video/716094019682', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + if 'The content you are trying to access is not available in your region.' in webpage: + self.raise_geo_restricted() + video_data = extract_attributes(self._search_regex( + r'(<a.+?rel="http://link\.theplatform\.com/s/.+?</a>)', webpage, 'video data')) + player_type = self._search_regex(r'playerType\s*=\s*[\'"]([^\'"]+)', webpage, 'player type', default=None) + release_url = video_data['rel'] + title = video_data['data-title'] + rating = video_data.get('data-rating') + query = { + 'mbr': 'true', + } + if player_type == 'movies': + query.update({ + 'manifest': 'm3u', + }) + else: + query.update({ + 'switch': 'http', + }) + if video_data.get('data-req-auth') == '1': + resource = self._get_mvpd_resource( + video_data['data-channel'], title, + video_data.get('data-guid'), rating) + query['auth'] = self._extract_mvpd_auth(url, video_id, 'fx', resource) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'title': title, + 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), + 'thumbnail': video_data.get('data-large-thumb'), + 'age_limit': parse_age_limit(rating), + 'ie_key': 'ThePlatform', + } diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py deleted file mode 100644 index cbcddcb7c..000000000 --- a/youtube_dl/extractor/gamekings.py +++ /dev/null @@ -1,76 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - xpath_text, - xpath_with_ns, -) -from .youtube import YoutubeIE - - -class GamekingsIE(InfoExtractor): 
- _VALID_URL = r'https?://www\.gamekings\.nl/(?:videos|nieuws)/(?P<id>[^/]+)' - _TESTS = [{ - # YouTube embed video - 'url': 'http://www.gamekings.nl/videos/phoenix-wright-ace-attorney-dual-destinies-review/', - 'md5': '5208d3a17adeaef829a7861887cb9029', - 'info_dict': { - 'id': 'HkSQKetlGOU', - 'ext': 'mp4', - 'title': 'Phoenix Wright: Ace Attorney - Dual Destinies Review', - 'description': 'md5:db88c0e7f47e9ea50df3271b9dc72e1d', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader_id': 'UCJugRGo4STYMeFr5RoOShtQ', - 'uploader': 'Gamekings Vault', - 'upload_date': '20151123', - }, - 'add_ie': ['Youtube'], - }, { - # vimeo video - 'url': 'http://www.gamekings.nl/videos/the-legend-of-zelda-majoras-mask/', - 'md5': '12bf04dfd238e70058046937657ea68d', - 'info_dict': { - 'id': 'the-legend-of-zelda-majoras-mask', - 'ext': 'mp4', - 'title': 'The Legend of Zelda: Majora’s Mask', - 'description': 'md5:9917825fe0e9f4057601fe1e38860de3', - 'thumbnail': 're:^https?://.*\.jpg$', - }, - }, { - 'url': 'http://www.gamekings.nl/nieuws/gamekings-extra-shelly-en-david-bereiden-zich-voor-op-de-livestream/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - playlist_id = self._search_regex( - r'gogoVideo\([^,]+,\s*"([^"]+)', webpage, 'playlist id') - - # Check if a YouTube embed is used - if YoutubeIE.suitable(playlist_id): - return self.url_result(playlist_id, ie='Youtube') - - playlist = self._download_xml( - 'http://www.gamekings.tv/wp-content/themes/gk2010/rss_playlist.php?id=%s' % playlist_id, - video_id) - - NS_MAP = { - 'jwplayer': 'http://rss.jwpcdn.com/' - } - - item = playlist.find('./channel/item') - - thumbnail = xpath_text(item, xpath_with_ns('./jwplayer:image', NS_MAP), 'thumbnail') - video_url = item.find(xpath_with_ns('./jwplayer:source', NS_MAP)).get('file') - - return { - 'id': video_id, - 'url': video_url, - 'title': self._og_search_title(webpage), - 'description': 
self._og_search_description(webpage), - 'thumbnail': thumbnail, - } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index cddd1a817..24b217715 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -62,6 +62,7 @@ from .videomore import VideomoreIE from .googledrive import GoogleDriveIE from .jwplatform import JWPlatformIE from .digiteka import DigitekaIE +from .arkena import ArkenaIE from .instagram import InstagramIE from .liveleak import LiveLeakIE from .threeqsdn import ThreeQSDNIE @@ -70,6 +71,9 @@ from .vessel import VesselIE from .kaltura import KalturaIE from .eagleplatform import EaglePlatformIE from .facebook import FacebookIE +from .soundcloud import SoundcloudIE +from .vbox7 import Vbox7IE +from .dbtv import DBTVIE class GenericIE(InfoExtractor): @@ -100,7 +104,8 @@ class GenericIE(InfoExtractor): }, 'expected_warnings': [ 'URL could be a direct video link, returning it as such.' - ] + ], + 'skip': 'URL invalid', }, # Direct download with broken HEAD { @@ -264,7 +269,8 @@ class GenericIE(InfoExtractor): 'params': { # m3u8 downloads 'skip_download': True, - } + }, + 'skip': 'video gone', }, # m3u8 served with Content-Type: text/plain { @@ -279,7 +285,8 @@ class GenericIE(InfoExtractor): 'params': { # m3u8 downloads 'skip_download': True, - } + }, + 'skip': 'video gone', }, # google redirect { @@ -364,6 +371,7 @@ class GenericIE(InfoExtractor): 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. 
To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', }, 'add_ie': ['BrightcoveLegacy'], + 'skip': 'video gone', }, { 'url': 'http://www.championat.com/video/football/v/87/87499.html', @@ -417,6 +425,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'movie expired', }, # embed.ly video { @@ -444,6 +453,8 @@ class GenericIE(InfoExtractor): 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama', 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.', }, + # HEAD requests lead to endless 301, while GET is OK + 'expected_warnings': ['301'], }, # RUTV embed { @@ -473,7 +484,7 @@ class GenericIE(InfoExtractor): 'url': 'http://www.vestifinance.ru/articles/25753', 'info_dict': { 'id': '25753', - 'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"', + 'title': 'Прямые трансляции с Форума-выставки "Госзаказ-2013"', }, 'playlist': [{ 'info_dict': { @@ -518,6 +529,9 @@ class GenericIE(InfoExtractor): 'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )', }, 'playlist_mincount': 7, + # This forum does not allow <iframe> syntaxes anymore + # Now HTML tags are displayed as-is + 'skip': 'No videos on this page', }, # Embedded TED video { @@ -566,7 +580,8 @@ class GenericIE(InfoExtractor): }, 'params': { 'skip_download': 'Requires rtmpdump' - } + }, + 'skip': 'video gone', }, # francetv embed { @@ -640,6 +655,8 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Key and Peele|October 10, 2012|2|203|Liam Neesons - Uncensored', 'description': 'Two valets share their love for movie star Liam Neesons.', + 'timestamp': 1349922600, + 'upload_date': '20121011', }, }, # YouTube embed via <data-embed-url=""> @@ -781,6 +798,15 @@ class GenericIE(InfoExtractor): 'upload_date': '20141029', } }, + # Soundcloud multiple embeds + { + 'url': 
'http://www.guitarplayer.com/lessons/1014/legato-workout-one-hour-to-more-fluid-performance---tab/52809', + 'info_dict': { + 'id': '52809', + 'title': 'Guitar Essentials: Legato Workout—One-Hour to Fluid Performance | TAB + AUDIO', + }, + 'playlist_mincount': 7, + }, # Livestream embed { 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast', @@ -856,6 +882,7 @@ class GenericIE(InfoExtractor): 'description': 'md5:601cb790edd05908957dae8aaa866465', 'upload_date': '20150220', }, + 'skip': 'All The Daily Show URLs now redirect to http://www.cc.com/shows/', }, # jwplayer YouTube { @@ -1249,6 +1276,20 @@ class GenericIE(InfoExtractor): 'uploader': 'www.hudl.com', }, }, + # twitter:player:stream embed + { + 'url': 'http://www.rtl.be/info/video/589263.aspx?CategoryID=288', + 'info_dict': { + 'id': 'master', + 'ext': 'mp4', + 'title': 'Une nouvelle espèce de dinosaure découverte en Argentine', + 'uploader': 'www.rtl.be', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + }, # twitter:player embed { 'url': 'http://www.theatlantic.com/video/index/484130/what-do-black-holes-sound-like/', @@ -1328,6 +1369,44 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Vimeo'], }, + { + 'url': 'https://support.arkena.com/display/PLAY/Ways+to+embed+your+video', + 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', + 'info_dict': { + 'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', + 'ext': 'mp4', + 'title': 'Big Buck Bunny', + 'description': 'Royalty free test video', + 'timestamp': 1432816365, + 'upload_date': '20150528', + 'is_live': False, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [ArkenaIE.ie_key()], + }, + { + 'url': 
'http://nova.bg/news/view/2016/08/16/156543/%D0%BD%D0%B0-%D0%BA%D0%BE%D1%81%D1%8A%D0%BC-%D0%BE%D1%82-%D0%B2%D0%B7%D1%80%D0%B8%D0%B2-%D0%BE%D1%82%D1%86%D0%B5%D0%BF%D0%B8%D1%85%D0%B0-%D1%86%D1%8F%D0%BB-%D0%BA%D0%B2%D0%B0%D1%80%D1%82%D0%B0%D0%BB-%D0%B7%D0%B0%D1%80%D0%B0%D0%B4%D0%B8-%D0%B8%D0%B7%D1%82%D0%B8%D1%87%D0%B0%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%B3%D0%B0%D0%B7-%D0%B2-%D0%BF%D0%BB%D0%BE%D0%B2%D0%B4%D0%B8%D0%B2/', + 'info_dict': { + 'id': '1c7141f46c', + 'ext': 'mp4', + 'title': 'НА КОСЪМ ОТ ВЗРИВ: Изтичане на газ на бензиностанция в Пловдив', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [Vbox7IE.ie_key()], + }, + { + # DBTV embeds + 'url': 'http://www.dagbladet.no/2016/02/23/nyheter/nordlys/ski/troms/ver/43254897/', + 'info_dict': { + 'id': '43254897', + 'title': 'Etter ett års planlegging, klaffet endelig alt: - Jeg måtte ta en liten dans', + }, + 'playlist_mincount': 3, + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -1964,12 +2043,9 @@ class GenericIE(InfoExtractor): return self.url_result(myvi_url) # Look for embedded soundcloud player - mobj = re.search( - r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"', - webpage) - if mobj is not None: - url = unescapeHTML(mobj.group('url')) - return self.url_result(url) + soundcloud_urls = SoundcloudIE._extract_urls(webpage) + if soundcloud_urls: + return _playlist_from_matches(soundcloud_urls, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) # Look for embedded mtvservices player mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) @@ -2132,6 +2208,11 @@ class GenericIE(InfoExtractor): if digiteka_url: return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key()) + # Look for Arkena embeds + arkena_url = ArkenaIE._extract_url(webpage) + if arkena_url: + return self.url_result(arkena_url, ArkenaIE.ie_key()) + # Look for Limelight embeds mobj = 
re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage) if mobj: @@ -2160,6 +2241,14 @@ class GenericIE(InfoExtractor): return self.url_result( self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine') + # Look for VODPlatform embeds + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vod-platform\.net/[eE]mbed/.+?)\1', + webpage) + if mobj is not None: + return self.url_result( + self._proto_relative_url(unescapeHTML(mobj.group('url'))), 'VODPlatform') + # Look for Instagram embeds instagram_embed_url = InstagramIE._extract_embed_url(webpage) if instagram_embed_url is not None: @@ -2184,15 +2273,20 @@ class GenericIE(InfoExtractor): 'uploader': video_uploader, } - # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser - embed_url = self._html_search_meta('twitter:player', webpage, default=None) - if embed_url: - return self.url_result(embed_url) + # Look for VBOX7 embeds + vbox7_url = Vbox7IE._extract_url(webpage) + if vbox7_url: + return self.url_result(vbox7_url, Vbox7IE.ie_key()) + + # Look for DBTV embeds + dbtv_urls = DBTVIE._extract_urls(webpage) + if dbtv_urls: + return _playlist_from_matches(dbtv_urls, ie=DBTVIE.ie_key()) # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( - webpage, video_id, default=None, expected_type='VideoObject') - if json_ld and json_ld.get('url'): + webpage, video_id, default={}, expected_type='VideoObject') + if json_ld.get('url'): info_dict.update({ 'title': video_title or info_dict['title'], 'description': video_description, @@ -2245,6 +2339,9 @@ class GenericIE(InfoExtractor): r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage) if not found: # Try to find twitter cards info + # twitter:player:stream should be checked before twitter:player since + # it is expected to contain a raw stream (see + # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) found = 
filter_video(re.findall( r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)) if not found: @@ -2278,6 +2375,15 @@ class GenericIE(InfoExtractor): '_type': 'url', 'url': new_url, } + + if not found: + # twitter:player is a https URL to iframe player that may or may not + # be supported by youtube-dl thus this is checked the very last (see + # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) + embed_url = self._html_search_meta('twitter:player', webpage, default=None) + if embed_url: + return self.url_result(embed_url) + if not found: raise UnsupportedError(url) diff --git a/youtube_dl/extractor/glide.py b/youtube_dl/extractor/glide.py index 62ff84835..f0d951396 100644 --- a/youtube_dl/extractor/glide.py +++ b/youtube_dl/extractor/glide.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import unified_strdate class GlideIE(InfoExtractor): @@ -14,10 +13,8 @@ class GlideIE(InfoExtractor): 'info_dict': { 'id': 'UZF8zlmuQbe4mr+7dCiQ0w==', 'ext': 'mp4', - 'title': 'Damon Timm\'s Glide message', + 'title': "Damon's Glide message", 'thumbnail': 're:^https?://.*?\.cloudfront\.net/.*\.jpg$', - 'uploader': 'Damon Timm', - 'upload_date': '20140919', } } @@ -27,7 +24,8 @@ class GlideIE(InfoExtractor): webpage = self._download_webpage(url, video_id) title = self._html_search_regex( - r'<title>(.+?)', webpage, 'title') + r'(.+?)', webpage, + 'title', default=None) or self._og_search_title(webpage) video_url = self._proto_relative_url(self._search_regex( r']+src=(["\'])(?P.+?)\1', webpage, 'video URL', default=None, @@ -36,18 +34,10 @@ class GlideIE(InfoExtractor): r']+id=["\']video-thumbnail["\'][^>]+src=(["\'])(?P.+?)\1', webpage, 'thumbnail url', default=None, group='url')) or self._og_search_thumbnail(webpage) - uploader = self._search_regex( - r']+class=["\']info-name["\'][^>]*>([^<]+)', - webpage, 'uploader', fatal=False) - upload_date = 
unified_strdate(self._search_regex( - r']+class="info-date"[^>]*>([^<]+)', - webpage, 'upload date', fatal=False)) return { 'id': video_id, 'title': title, 'url': video_url, 'thumbnail': thumbnail, - 'uploader': uploader, - 'upload_date': upload_date, } diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 3de8356f6..dbacbfc61 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -396,12 +396,12 @@ class GloboIE(InfoExtractor): class GloboArticleIE(InfoExtractor): - _VALID_URL = 'https?://.+?\.globo\.com/(?:[^/]+/)*(?P[^/]+)\.html' + _VALID_URL = 'https?://.+?\.globo\.com/(?:[^/]+/)*(?P[^/]+)(?:\.html)?' _VIDEOID_REGEXES = [ r'\bdata-video-id=["\'](\d{7,})', r'\bdata-player-videosids=["\'](\d{7,})', - r'\bvideosIDs\s*:\s*["\'](\d{7,})', + r'\bvideosIDs\s*:\s*["\']?(\d{7,})', r'\bdata-id=["\'](\d{7,})', r']+\bid=["\'](\d{7,})', ] @@ -423,6 +423,9 @@ class GloboArticleIE(InfoExtractor): }, { 'url': 'http://gshow.globo.com/programas/tv-xuxa/O-Programa/noticia/2014/01/xuxa-e-junno-namoram-muuuito-em-luau-de-zeze-di-camargo-e-luciano.html', 'only_matching': True, + }, { + 'url': 'http://oglobo.globo.com/rio/a-amizade-entre-um-entregador-de-farmacia-um-piano-19946271', + 'only_matching': True, }] @classmethod diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py new file mode 100644 index 000000000..6a437c54d --- /dev/null +++ b/youtube_dl/extractor/go.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + determine_ext, + parse_age_limit, +) + + +class GoIE(InfoExtractor): + _BRANDS = { + 'abc': '001', + 'freeform': '002', + 'watchdisneychannel': '004', + 'watchdisneyjunior': '008', + 'watchdisneyxd': '009', + } + _VALID_URL = r'https?://(?:(?P%s)\.)?go\.com/.*?vdka(?P\w+)' % '|'.join(_BRANDS.keys()) + _TESTS = [{ + 'url': 'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx', 
+ 'info_dict': { + 'id': '0_g86w5onx', + 'ext': 'mp4', + 'title': 'Sneak Peek: Language Arts', + 'description': 'md5:7dcdab3b2d17e5217c953256af964e9c', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://abc.go.com/shows/after-paradise/video/most-recent/vdka3335601', + 'only_matching': True, + }] + + def _real_extract(self, url): + sub_domain, video_id = re.match(self._VALID_URL, url).groups() + video_data = self._download_json( + 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/-1/-1/%s/-1/-1.json' % (self._BRANDS[sub_domain], video_id), + video_id)['video'][0] + title = video_data['title'] + + formats = [] + for asset in video_data.get('assets', {}).get('asset', []): + asset_url = asset.get('value') + if not asset_url: + continue + format_id = asset.get('format') + ext = determine_ext(asset_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + asset_url, video_id, 'mp4', m3u8_id=format_id or 'hls', fatal=False)) + else: + formats.append({ + 'format_id': format_id, + 'url': asset_url, + 'ext': ext, + }) + self._sort_formats(formats) + + subtitles = {} + for cc in video_data.get('closedcaption', {}).get('src', []): + cc_url = cc.get('value') + if not cc_url: + continue + ext = determine_ext(cc_url) + if ext == 'xml': + ext = 'ttml' + subtitles.setdefault(cc.get('lang'), []).append({ + 'url': cc_url, + 'ext': ext, + }) + + thumbnails = [] + for thumbnail in video_data.get('thumbnails', {}).get('thumbnail', []): + thumbnail_url = thumbnail.get('value') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('longdescription') or video_data.get('description'), + 'duration': int_or_none(video_data.get('duration', {}).get('value'), 1000), + 'age_limit': 
parse_age_limit(video_data.get('tvrating', {}).get('rating')), + 'episode_number': int_or_none(video_data.get('episodenumber')), + 'series': video_data.get('show', {}).get('title'), + 'season_number': int_or_none(video_data.get('season', {}).get('num')), + 'thumbnails': thumbnails, + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/goldenmoustache.py b/youtube_dl/extractor/goldenmoustache.py deleted file mode 100644 index 0fb509724..000000000 --- a/youtube_dl/extractor/goldenmoustache.py +++ /dev/null @@ -1,48 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class GoldenMoustacheIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?goldenmoustache\.com/(?P[\w-]+)-(?P\d+)' - _TESTS = [{ - 'url': 'http://www.goldenmoustache.com/suricate-le-poker-3700/', - 'md5': '0f904432fa07da5054d6c8beb5efb51a', - 'info_dict': { - 'id': '3700', - 'ext': 'mp4', - 'title': 'Suricate - Le Poker', - 'description': 'md5:3d1f242f44f8c8cb0a106f1fd08e5dc9', - 'thumbnail': 're:^https?://.*\.jpg$', - } - }, { - 'url': 'http://www.goldenmoustache.com/le-lab-tout-effacer-mc-fly-et-carlito-55249/', - 'md5': '27f0c50fb4dd5f01dc9082fc67cd5700', - 'info_dict': { - 'id': '55249', - 'ext': 'mp4', - 'title': 'Le LAB - Tout Effacer (Mc Fly et Carlito)', - 'description': 'md5:9b7fbf11023fb2250bd4b185e3de3b2a', - 'thumbnail': 're:^https?://.*\.(?:png|jpg)$', - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_url = self._html_search_regex( - r'data-src-type="mp4" data-src="([^"]+)"', webpage, 'video URL') - title = self._html_search_regex( - r'(.*?)(?: - Golden Moustache)?', webpage, 'title') - thumbnail = self._og_search_thumbnail(webpage) - description = self._og_search_description(webpage) - - return { - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - } diff 
--git a/youtube_dl/extractor/hgtv.py b/youtube_dl/extractor/hgtv.py new file mode 100644 index 000000000..69543bff2 --- /dev/null +++ b/youtube_dl/extractor/hgtv.py @@ -0,0 +1,79 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + js_to_json, + smuggle_url, +) + + +class HGTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hgtv\.ca/[^/]+/video/(?P[^/]+)/video.html' + _TEST = { + 'url': 'http://www.hgtv.ca/homefree/video/overnight-success/video.html?v=738081859718&p=1&s=da#video', + 'md5': '', + 'info_dict': { + 'id': 'aFH__I_5FBOX', + 'ext': 'mp4', + 'title': 'Overnight Success', + 'description': 'After weeks of hard work, high stakes, breakdowns and pep talks, the final 2 contestants compete to win the ultimate dream.', + 'uploader': 'SHWM-NEW', + 'timestamp': 1470320034, + 'upload_date': '20160804', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + embed_vars = self._parse_json(self._search_regex( + r'(?s)embed_vars\s*=\s*({.*?});', + webpage, 'embed vars'), display_id, js_to_json) + return { + '_type': 'url_transparent', + 'url': smuggle_url( + 'http://link.theplatform.com/s/dtjsEC/%s?mbr=true&manifest=m3u' % embed_vars['pid'], { + 'force_smil_url': True + }), + 'series': embed_vars.get('show'), + 'season_number': int_or_none(embed_vars.get('season')), + 'episode_number': int_or_none(embed_vars.get('episode')), + 'ie_key': 'ThePlatform', + } + + +class HGTVComShowIE(InfoExtractor): + IE_NAME = 'hgtv.com:show' + _VALID_URL = r'https?://(?:www\.)?hgtv\.com/shows/[^/]+/(?P[^/?#&]+)' + _TEST = { + 'url': 'http://www.hgtv.com/shows/flip-or-flop/flip-or-flop-full-episodes-videos', + 'info_dict': { + 'id': 'flip-or-flop-full-episodes-videos', + 'title': 'Flip or Flop Full Episodes', + }, + 'playlist_mincount': 15, + } + + def 
_real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + config = self._parse_json( + self._search_regex( + r'(?s)data-module=["\']video["\'][^>]*>.*?]+type=["\']text/x-config["\'][^>]*>(.+?)(.*?)
', diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index fc0197ae1..8f7f232be 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -36,7 +36,6 @@ class InstagramIE(InfoExtractor): 'info_dict': { 'id': 'BA-pQFBG8HZ', 'ext': 'mp4', - 'uploader_id': 'britneyspears', 'title': 'Video by britneyspears', 'thumbnail': 're:^https?://.*\.jpg', 'timestamp': 1453760977, diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 472d72b4c..7c8cb21c2 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -8,7 +8,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, - sanitized_Request, + qualities, ) @@ -49,11 +49,27 @@ class IviIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', }, 'skip': 'Only works from Russia', + }, + { + # with MP4-HD720 format + 'url': 'http://www.ivi.ru/watch/146500', + 'md5': 'd63d35cdbfa1ea61a5eafec7cc523e1e', + 'info_dict': { + 'id': '146500', + 'ext': 'mp4', + 'title': 'Кукла', + 'description': 'md5:ffca9372399976a2d260a407cc74cce6', + 'duration': 5599, + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'skip': 'Only works from Russia', } ] # Sorted by quality - _KNOWN_FORMATS = ['MP4-low-mobile', 'MP4-mobile', 'FLV-lo', 'MP4-lo', 'FLV-hi', 'MP4-hi', 'MP4-SHQ'] + _KNOWN_FORMATS = ( + 'MP4-low-mobile', 'MP4-mobile', 'FLV-lo', 'MP4-lo', 'FLV-hi', 'MP4-hi', + 'MP4-SHQ', 'MP4-HD720', 'MP4-HD1080') def _real_extract(self, url): video_id = self._match_id(url) @@ -69,10 +85,9 @@ class IviIE(InfoExtractor): ] } - request = sanitized_Request( - 'http://api.digitalaccess.ru/api/json/', json.dumps(data)) video_json = self._download_json( - request, video_id, 'Downloading video JSON') + 'http://api.digitalaccess.ru/api/json/', video_id, + 'Downloading video JSON', data=json.dumps(data)) if 'error' in video_json: 
error = video_json['error'] @@ -84,11 +99,13 @@ class IviIE(InfoExtractor): result = video_json['result'] + quality = qualities(self._KNOWN_FORMATS) + formats = [{ 'url': x['url'], - 'format_id': x['content_format'], - 'preference': self._KNOWN_FORMATS.index(x['content_format']), - } for x in result['files'] if x['content_format'] in self._KNOWN_FORMATS] + 'format_id': x.get('content_format'), + 'quality': quality(x.get('content_format')), + } for x in result['files'] if x.get('url')] self._sort_formats(formats) @@ -115,7 +132,7 @@ class IviIE(InfoExtractor): webpage, 'season number', default=None)) episode_number = int_or_none(self._search_regex( - r']+itemprop="episode"[^>]*>\s*]+itemprop="episodeNumber"[^>]+content="(\d+)', + r'[^>]+itemprop="episode"[^>]*>\s*]+itemprop="episodeNumber"[^>]+content="(\d+)', webpage, 'episode number', default=None)) description = self._og_search_description(webpage, default=None) or self._html_search_meta( diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index e44e31104..ce3126943 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -4,10 +4,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( determine_ext, float_or_none, int_or_none, + mimetype2ext, ) @@ -28,74 +30,86 @@ class JWPlatformBaseIE(InfoExtractor): return self._parse_jwplayer_data( jwplayer_data, video_id, *args, **kwargs) - def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True, m3u8_id=None, rtmp_params=None): + def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, m3u8_id=None, rtmp_params=None, base_url=None): # JWPlayer backward compatibility: flattened playlists # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 if 'playlist' not in jwplayer_data: jwplayer_data = {'playlist': [jwplayer_data]} - video_data = 
jwplayer_data['playlist'][0] + entries = [] + for video_data in jwplayer_data['playlist']: + # JWPlayer backward compatibility: flattened sources + # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 + if 'sources' not in video_data: + video_data['sources'] = [video_data] - # JWPlayer backward compatibility: flattened sources - # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 - if 'sources' not in video_data: - video_data['sources'] = [video_data] + this_video_id = video_id or video_data['mediaid'] - formats = [] - for source in video_data['sources']: - source_url = self._proto_relative_url(source['file']) - source_type = source.get('type') or '' - if source_type in ('application/vnd.apple.mpegurl', 'hls') or determine_ext(source_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - source_url, video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False)) - elif source_type.startswith('audio'): - formats.append({ - 'url': source_url, - 'vcodec': 'none', - }) - else: - a_format = { - 'url': source_url, - 'width': int_or_none(source.get('width')), - 'height': int_or_none(source.get('height')), - } - if source_url.startswith('rtmp'): - a_format['ext'] = 'flv', - - # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as - # of jwplayer.flash.swf - rtmp_url_parts = re.split( - r'((?:mp4|mp3|flv):)', source_url, 1) - if len(rtmp_url_parts) == 3: - rtmp_url, prefix, play_path = rtmp_url_parts - a_format.update({ - 'url': rtmp_url, - 'play_path': prefix + play_path, - }) - if rtmp_params: - a_format.update(rtmp_params) - formats.append(a_format) - self._sort_formats(formats) - - subtitles = {} - tracks = video_data.get('tracks') - if tracks and isinstance(tracks, list): - for track in tracks: - if track.get('file') and track.get('kind') == 'captions': - subtitles.setdefault(track.get('label') or 'en', []).append({ - 'url': self._proto_relative_url(track['file']) + formats = [] + for source in 
video_data['sources']: + source_url = self._proto_relative_url(source['file']) + if base_url: + source_url = compat_urlparse.urljoin(base_url, source_url) + source_type = source.get('type') or '' + ext = mimetype2ext(source_type) or determine_ext(source_url) + if source_type == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False)) + # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 + elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'): + formats.append({ + 'url': source_url, + 'vcodec': 'none', + 'ext': ext, }) + else: + a_format = { + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + 'ext': ext, + } + if source_url.startswith('rtmp'): + a_format['ext'] = 'flv' - return { - 'id': video_id, - 'title': video_data['title'] if require_title else video_data.get('title'), - 'description': video_data.get('description'), - 'thumbnail': self._proto_relative_url(video_data.get('image')), - 'timestamp': int_or_none(video_data.get('pubdate')), - 'duration': float_or_none(jwplayer_data.get('duration')), - 'subtitles': subtitles, - 'formats': formats, - } + # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as + # of jwplayer.flash.swf + rtmp_url_parts = re.split( + r'((?:mp4|mp3|flv):)', source_url, 1) + if len(rtmp_url_parts) == 3: + rtmp_url, prefix, play_path = rtmp_url_parts + a_format.update({ + 'url': rtmp_url, + 'play_path': prefix + play_path, + }) + if rtmp_params: + a_format.update(rtmp_params) + formats.append(a_format) + self._sort_formats(formats) + + subtitles = {} + tracks = video_data.get('tracks') + if tracks and isinstance(tracks, list): + for track in tracks: + if track.get('file') and track.get('kind') == 'captions': + subtitles.setdefault(track.get('label') or 'en', []).append({ + 'url': self._proto_relative_url(track['file']) 
+ }) + + entries.append({ + 'id': this_video_id, + 'title': video_data['title'] if require_title else video_data.get('title'), + 'description': video_data.get('description'), + 'thumbnail': self._proto_relative_url(video_data.get('image')), + 'timestamp': int_or_none(video_data.get('pubdate')), + 'duration': float_or_none(jwplayer_data.get('duration')), + 'subtitles': subtitles, + 'formats': formats, + }) + if len(entries) == 1: + return entries[0] + else: + return self.playlist_result(entries) class JWPlatformIE(JWPlatformBaseIE): diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 1729f5bfb..6a8464998 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -36,6 +36,12 @@ class KalturaIE(InfoExtractor): ''' _SERVICE_URL = 'http://cdnapi.kaltura.com' _SERVICE_BASE = '/api_v3/index.php' + # See https://github.com/kaltura/server/blob/master/plugins/content/caption/base/lib/model/enums/CaptionType.php + _CAPTION_TYPES = { + 1: 'srt', + 2: 'ttml', + 3: 'vtt', + } _TESTS = [ { 'url': 'kaltura:269692:1_1jc2y3e4', @@ -62,6 +68,32 @@ class KalturaIE(InfoExtractor): { 'url': 'https://cdnapisec.kaltura.com/html5/html5lib/v2.30.2/mwEmbedFrame.php/p/1337/uiconf_id/20540612/entry_id/1_sf5ovm7u?wid=_243342', 'only_matching': True, + }, + { + # video with subtitles + 'url': 'kaltura:111032:1_cw786r8q', + 'only_matching': True, + }, + { + # video with ttml subtitles (no fileExt) + 'url': 'kaltura:1926081:0_l5ye1133', + 'info_dict': { + 'id': '0_l5ye1133', + 'ext': 'mp4', + 'title': 'What Can You Do With Python?', + 'upload_date': '20160221', + 'uploader_id': 'stork', + 'thumbnail': 're:^https?://.*/thumbnail/.*', + 'timestamp': int, + 'subtitles': { + 'en': [{ + 'ext': 'ttml', + }], + }, + }, + 'params': { + 'skip_download': True, + }, } ] @@ -117,20 +149,7 @@ class KalturaIE(InfoExtractor): return data - def _get_kaltura_signature(self, video_id, partner_id, service_url=None): - actions = [{ - 'apiVersion': '3.1', - 
'expiry': 86400, - 'format': 1, - 'service': 'session', - 'action': 'startWidgetSession', - 'widgetId': '_%s' % partner_id, - }] - return self._kaltura_api_call( - video_id, actions, service_url, note='Downloading Kaltura signature')['ks'] - def _get_video_info(self, video_id, partner_id, service_url=None): - signature = self._get_kaltura_signature(video_id, partner_id, service_url) actions = [ { 'action': 'null', @@ -138,18 +157,30 @@ class KalturaIE(InfoExtractor): 'clientTag': 'kdp:v3.8.5', 'format': 1, # JSON, 2 = XML, 3 = PHP 'service': 'multirequest', - 'ks': signature, + }, + { + 'expiry': 86400, + 'service': 'session', + 'action': 'startWidgetSession', + 'widgetId': '_%s' % partner_id, }, { 'action': 'get', 'entryId': video_id, 'service': 'baseentry', - 'version': '-1', + 'ks': '{1:result:ks}', }, { 'action': 'getbyentryid', 'entryId': video_id, 'service': 'flavorAsset', + 'ks': '{1:result:ks}', + }, + { + 'action': 'list', + 'filter:entryIdEqual': video_id, + 'service': 'caption_captionasset', + 'ks': '{1:result:ks}', }, ] return self._kaltura_api_call( @@ -161,8 +192,9 @@ class KalturaIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) partner_id, entry_id = mobj.group('partner_id', 'id') ks = None + captions = None if partner_id and entry_id: - info, flavor_assets = self._get_video_info(entry_id, partner_id, smuggled_data.get('service_url')) + _, info, flavor_assets, captions = self._get_video_info(entry_id, partner_id, smuggled_data.get('service_url')) else: path, query = mobj.group('path', 'query') if not path and not query: @@ -181,7 +213,7 @@ class KalturaIE(InfoExtractor): raise ExtractorError('Invalid URL', expected=True) if 'entry_id' in params: entry_id = params['entry_id'][0] - info, flavor_assets = self._get_video_info(entry_id, partner_id) + _, info, flavor_assets, captions = self._get_video_info(entry_id, partner_id) elif 'uiconf_id' in params and 'flashvars[referenceId]' in params: reference_id = params['flashvars[referenceId]'][0] 
webpage = self._download_webpage(url, reference_id) @@ -191,6 +223,17 @@ class KalturaIE(InfoExtractor): reference_id)['entryResult'] info, flavor_assets = entry_data['meta'], entry_data['contextData']['flavorAssets'] entry_id = info['id'] + # Unfortunately, data returned in kalturaIframePackageData lacks + # captions so we will try requesting the complete data using + # regular approach since we now know the entry_id + try: + _, info, flavor_assets, captions = self._get_video_info( + entry_id, partner_id) + except ExtractorError: + # Regular scenario failed but we already have everything + # extracted apart from captions and can process at least + # with this + pass else: raise ExtractorError('Invalid URL', expected=True) ks = params.get('flashvars[ks]', [None])[0] @@ -217,7 +260,7 @@ class KalturaIE(InfoExtractor): formats = [] for f in flavor_assets: # Continue if asset is not ready - if f['status'] != 2: + if f.get('status') != 2: continue video_url = sign_url( '%s/flavorId/%s' % (data_url, f['id'])) @@ -240,13 +283,27 @@ class KalturaIE(InfoExtractor): m3u8_url, entry_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - self._check_formats(formats, entry_id) self._sort_formats(formats) + subtitles = {} + if captions: + for caption in captions.get('objects', []): + # Continue if caption is not ready + if f.get('status') != 2: + continue + if not caption.get('id'): + continue + caption_format = int_or_none(caption.get('format')) + subtitles.setdefault(caption.get('languageCode') or caption.get('language'), []).append({ + 'url': '%s/api_v3/service/caption_captionasset/action/serve/captionAssetId/%s' % (self._SERVICE_URL, caption['id']), + 'ext': caption.get('fileExt') or self._CAPTION_TYPES.get(caption_format) or 'ttml', + }) + return { 'id': entry_id, 'title': info['name'], 'formats': formats, + 'subtitles': subtitles, 'description': clean_html(info.get('description')), 'thumbnail': info.get('thumbnailUrl'), 'duration': info.get('duration'), diff --git 
a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py index 126ca13df..588a4d0ec 100644 --- a/youtube_dl/extractor/keezmovies.py +++ b/youtube_dl/extractor/keezmovies.py @@ -3,64 +3,126 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..aes import aes_decrypt_text +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) from ..utils import ( - sanitized_Request, - url_basename, + determine_ext, + ExtractorError, + int_or_none, + str_to_int, + strip_or_none, ) class KeezMoviesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?keezmovies\.com/video/.+?(?P[0-9]+)(?:[/?&]|$)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?keezmovies\.com/video/(?:(?P[^/]+)-)?(?P\d+)' + _TESTS = [{ 'url': 'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711', 'md5': '1c1e75d22ffa53320f45eeb07bc4cdc0', 'info_dict': { 'id': '1214711', + 'display_id': 'petite-asian-lady-mai-playing-in-bathtub', 'ext': 'mp4', 'title': 'Petite Asian Lady Mai Playing In Bathtub', - 'age_limit': 18, 'thumbnail': 're:^https?://.*\.jpg$', + 'view_count': int, + 'age_limit': 18, } - } + }, { + 'url': 'http://www.keezmovies.com/video/1214711', + 'only_matching': True, + }] - def _real_extract(self, url): - video_id = self._match_id(url) + def _extract_info(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = (mobj.group('display_id') + if 'display_id' in mobj.groupdict() + else None) or mobj.group('id') - req = sanitized_Request(url) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) - - # embedded video - mobj = re.search(r'href="([^"]+)">', webpage) - if mobj: - embedded_url = mobj.group(1) - return self.url_result(embedded_url) - - video_title = self._html_search_regex( - r'

]*>([^<]+)', webpage, 'title') - flashvars = self._parse_json(self._search_regex( - r'var\s+flashvars\s*=\s*([^;]+);', webpage, 'flashvars'), video_id) + webpage = self._download_webpage( + url, display_id, headers={'Cookie': 'age_verified=1'}) formats = [] - for height in (180, 240, 480): - if flashvars.get('quality_%dp' % height): - video_url = flashvars['quality_%dp' % height] - a_format = { - 'url': video_url, - 'height': height, - 'format_id': '%dp' % height, - } - filename_parts = url_basename(video_url).split('_') - if len(filename_parts) >= 2 and re.match(r'\d+[Kk]', filename_parts[1]): - a_format['tbr'] = int(filename_parts[1][:-1]) - formats.append(a_format) + format_urls = set() - age_limit = self._rta_search(webpage) + title = None + thumbnail = None + duration = None + encrypted = False - return { + def extract_format(format_url, height=None): + if not isinstance(format_url, compat_str) or not format_url.startswith('http'): + return + if format_url in format_urls: + return + format_urls.add(format_url) + tbr = int_or_none(self._search_regex( + r'[/_](\d+)[kK][/_]', format_url, 'tbr', default=None)) + if not height: + height = int_or_none(self._search_regex( + r'[/_](\d+)[pP][/_]', format_url, 'height', default=None)) + if encrypted: + format_url = aes_decrypt_text( + video_url, title, 32).decode('utf-8') + formats.append({ + 'url': format_url, + 'format_id': '%dp' % height if height else None, + 'height': height, + 'tbr': tbr, + }) + + flashvars = self._parse_json( + self._search_regex( + r'flashvars\s*=\s*({.+?});', webpage, + 'flashvars', default='{}'), + display_id, fatal=False) + + if flashvars: + title = flashvars.get('video_title') + thumbnail = flashvars.get('image_url') + duration = int_or_none(flashvars.get('video_duration')) + encrypted = flashvars.get('encrypted') is True + for key, value in flashvars.items(): + mobj = re.search(r'quality_(\d+)[pP]', key) + if mobj: + extract_format(value, int(mobj.group(1))) + video_url = 
flashvars.get('video_url') + if video_url and determine_ext(video_url, None): + extract_format(video_url) + + video_url = self._html_search_regex( + r'flashvars\.video_url\s*=\s*(["\'])(?Phttp.+?)\1', + webpage, 'video url', default=None, group='url') + if video_url: + extract_format(compat_urllib_parse_unquote(video_url)) + + if not formats: + if 'title="This video is no longer available"' in webpage: + raise ExtractorError( + 'Video %s is no longer available' % video_id, expected=True) + + self._sort_formats(formats) + + if not title: + title = self._html_search_regex( + r']*>([^<]+)', webpage, 'title') + + return webpage, { 'id': video_id, - 'title': video_title, + 'display_id': display_id, + 'title': strip_or_none(title), + 'thumbnail': thumbnail, + 'duration': duration, + 'age_limit': 18, 'formats': formats, - 'age_limit': age_limit, - 'thumbnail': flashvars.get('image_url') } + + def _real_extract(self, url): + webpage, info = self._extract_info(url) + info['view_count'] = str_to_int(self._search_regex( + r'([\d,.]+) Views?', webpage, 'view count', fatal=False)) + return info diff --git a/youtube_dl/extractor/kickstarter.py b/youtube_dl/extractor/kickstarter.py index 9f1ade2e4..c61e78622 100644 --- a/youtube_dl/extractor/kickstarter.py +++ b/youtube_dl/extractor/kickstarter.py @@ -37,7 +37,6 @@ class KickStarterIE(InfoExtractor): 'ext': 'mp4', 'title': 'Power Drive 2000', }, - 'expected_warnings': ['OpenGraph description'], }] def _real_extract(self, url): @@ -67,6 +66,6 @@ class KickStarterIE(InfoExtractor): 'id': video_id, 'url': video_url, 'title': title, - 'description': self._og_search_description(webpage), + 'description': self._og_search_description(webpage, default=None), 'thumbnail': thumbnail, } diff --git a/youtube_dl/extractor/kusi.py b/youtube_dl/extractor/kusi.py index 12cc56e44..2e66e8cf9 100644 --- a/youtube_dl/extractor/kusi.py +++ b/youtube_dl/extractor/kusi.py @@ -18,31 +18,20 @@ from ..utils import ( class KUSIIE(InfoExtractor): _VALID_URL 
= r'https?://(?:www\.)?kusi\.com/(?Pstory/.+|video\?clipId=(?P\d+))' _TESTS = [{ - 'url': 'http://www.kusi.com/story/31183873/turko-files-case-closed-put-on-hold', - 'md5': 'f926e7684294cf8cb7bdf8858e1b3988', + 'url': 'http://www.kusi.com/story/32849881/turko-files-refused-to-help-it-aint-right', + 'md5': '4e76ce8e53660ce9697d06c0ba6fc47d', 'info_dict': { - 'id': '12203019', + 'id': '12689020', 'ext': 'mp4', - 'title': 'Turko Files: Case Closed! & Put On Hold!', - 'duration': 231.0, - 'upload_date': '20160210', - 'timestamp': 1455087571, + 'title': "Turko Files: Refused to Help, It Ain't Right!", + 'duration': 223.586, + 'upload_date': '20160826', + 'timestamp': 1472233118, 'thumbnail': 're:^https?://.*\.jpg$' }, }, { 'url': 'http://kusi.com/video?clipId=12203019', - 'info_dict': { - 'id': '12203019', - 'ext': 'mp4', - 'title': 'Turko Files: Case Closed! & Put On Hold!', - 'duration': 231.0, - 'upload_date': '20160210', - 'timestamp': 1455087571, - 'thumbnail': 're:^https?://.*\.jpg$' - }, - 'params': { - 'skip_download': True, # Same as previous one - }, + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index b1d460599..0eeb9ffeb 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( get_element_by_id, clean_html, @@ -242,8 +243,9 @@ class KuwoSingerIE(InfoExtractor): query={'artistId': artist_id, 'pn': page_num, 'rn': self.PAGE_SIZE}) return [ - self.url_result(song_url, 'Kuwo') for song_url in re.findall( - r']+class="name">]+href="(http://www\.kuwo\.cn/yinyue/\d+)', + self.url_result(compat_urlparse.urljoin(url, song_url), 'Kuwo') + for song_url in re.findall( + r']+class="name">]+href="(/yinyue/\d+)', webpage) ] diff --git a/youtube_dl/extractor/lcp.py b/youtube_dl/extractor/lcp.py new file mode 100644 
index 000000000..ade27a99e --- /dev/null +++ b/youtube_dl/extractor/lcp.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .arkena import ArkenaIE + + +class LcpPlayIE(ArkenaIE): + _VALID_URL = r'https?://play\.lcp\.fr/embed/(?P[^/]+)/(?P[^/]+)/[^/]+/[^/]+' + _TESTS = [{ + 'url': 'http://play.lcp.fr/embed/327336/131064/darkmatter/0', + 'md5': 'b8bd9298542929c06c1c15788b1f277a', + 'info_dict': { + 'id': '327336', + 'ext': 'mp4', + 'title': '327336', + 'timestamp': 1456391602, + 'upload_date': '20160225', + }, + 'params': { + 'skip_download': True, + }, + }] + + +class LcpIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?lcp\.fr/(?:[^/]+/)*(?P[^/]+)' + + _TESTS = [{ + # arkena embed + 'url': 'http://www.lcp.fr/la-politique-en-video/schwartzenberg-prg-preconise-francois-hollande-de-participer-une-primaire', + 'md5': 'b8bd9298542929c06c1c15788b1f277a', + 'info_dict': { + 'id': 'd56d03e9', + 'ext': 'mp4', + 'title': 'Schwartzenberg (PRG) préconise à François Hollande de participer à une primaire à gauche', + 'description': 'md5:96ad55009548da9dea19f4120c6c16a8', + 'timestamp': 1456488895, + 'upload_date': '20160226', + }, + 'params': { + 'skip_download': True, + }, + }, { + # dailymotion live stream + 'url': 'http://www.lcp.fr/le-direct', + 'info_dict': { + 'id': 'xji3qy', + 'ext': 'mp4', + 'title': 'La Chaine Parlementaire (LCP), Live TNT', + 'description': 'md5:5c69593f2de0f38bd9a949f2c95e870b', + 'uploader': 'LCP', + 'uploader_id': 'xbz33d', + 'timestamp': 1308923058, + 'upload_date': '20110624', + }, + 'params': { + # m3u8 live stream + 'skip_download': True, + }, + }, { + 'url': 'http://www.lcp.fr/emissions/277792-les-volontaires', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + play_url = self._search_regex( + r']+src=(["\'])(?P%s?(?:(?!\1).)*)\1' % LcpPlayIE._VALID_URL, + webpage, 
'play iframe', default=None, group='url') + + if not play_url: + return self.url_result(url, 'Generic') + + title = self._og_search_title(webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, fatal=True) + description = self._html_search_meta( + ('description', 'twitter:description'), webpage) + + return { + '_type': 'url_transparent', + 'ie_key': LcpPlayIE.ie_key(), + 'url': play_url, + 'display_id': display_id, + 'title': title, + 'description': description, + } diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py index c2b4490c4..87120ecd1 100644 --- a/youtube_dl/extractor/lifenews.py +++ b/youtube_dl/extractor/lifenews.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( determine_ext, ExtractorError, @@ -96,7 +99,7 @@ class LifeNewsIE(InfoExtractor): r']+>]+src=["\'](.+?)["\']', webpage) iframe_links = re.findall( - r']+src=["\']((?:https?:)?//embed\.life\.ru/embed/.+?)["\']', + r']+src=["\']((?:https?:)?//embed\.life\.ru/(?:embed|video)/.+?)["\']', webpage) if not video_urls and not iframe_links: @@ -164,9 +167,9 @@ class LifeNewsIE(InfoExtractor): class LifeEmbedIE(InfoExtractor): IE_NAME = 'life:embed' - _VALID_URL = r'https?://embed\.life\.ru/embed/(?P[\da-f]{32})' + _VALID_URL = r'https?://embed\.life\.ru/(?:embed|video)/(?P[\da-f]{32})' - _TEST = { + _TESTS = [{ 'url': 'http://embed.life.ru/embed/e50c2dec2867350528e2574c899b8291', 'md5': 'b889715c9e49cb1981281d0e5458fbbe', 'info_dict': { @@ -175,30 +178,57 @@ class LifeEmbedIE(InfoExtractor): 'title': 'e50c2dec2867350528e2574c899b8291', 'thumbnail': 're:http://.*\.jpg', } - } + }, { + # with 1080p + 'url': 'https://embed.life.ru/video/e50c2dec2867350528e2574c899b8291', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = 
self._download_webpage(url, video_id) + thumbnail = None formats = [] - for video_url in re.findall(r'"file"\s*:\s*"([^"]+)', webpage): - video_url = compat_urlparse.urljoin(url, video_url) - ext = determine_ext(video_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='m3u8')) - else: - formats.append({ - 'url': video_url, - 'format_id': ext, - 'preference': 1, - }) + + def extract_m3u8(manifest_url): + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='m3u8')) + + def extract_original(original_url): + formats.append({ + 'url': original_url, + 'format_id': determine_ext(original_url, None), + 'preference': 1, + }) + + playlist = self._parse_json( + self._search_regex( + r'options\s*=\s*({.+?});', webpage, 'options', default='{}'), + video_id).get('playlist', {}) + if playlist: + master = playlist.get('master') + if isinstance(master, compat_str) and determine_ext(master) == 'm3u8': + extract_m3u8(compat_urlparse.urljoin(url, master)) + original = playlist.get('original') + if isinstance(original, compat_str): + extract_original(original) + thumbnail = playlist.get('image') + + # Old rendition fallback + if not formats: + for video_url in re.findall(r'"file"\s*:\s*"([^"]+)', webpage): + video_url = compat_urlparse.urljoin(url, video_url) + if determine_ext(video_url) == 'm3u8': + extract_m3u8(video_url) + else: + extract_original(video_url) + self._sort_formats(formats) - thumbnail = self._search_regex( + thumbnail = thumbnail or self._search_regex( r'"image"\s*:\s*"([^"]+)', webpage, 'thumbnail', default=None) return { diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index 5d2c3e256..6752ffee2 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -34,14 +34,16 @@ class LimelightBaseIE(InfoExtractor): def _extract_info(self, streams, 
mobile_urls, properties): video_id = properties['media_id'] formats = [] - + urls = [] for stream in streams: stream_url = stream.get('url') - if not stream_url: + if not stream_url or stream.get('drmProtected') or stream_url in urls: continue - if '.f4m' in stream_url: + urls.append(stream_url) + ext = determine_ext(stream_url) + if ext == 'f4m': formats.extend(self._extract_f4m_formats( - stream_url, video_id, fatal=False)) + stream_url, video_id, f4m_id='hds', fatal=False)) else: fmt = { 'url': stream_url, @@ -50,13 +52,21 @@ class LimelightBaseIE(InfoExtractor): 'fps': float_or_none(stream.get('videoFrameRate')), 'width': int_or_none(stream.get('videoWidthInPixels')), 'height': int_or_none(stream.get('videoHeightInPixels')), - 'ext': determine_ext(stream_url) + 'ext': ext, } - rtmp = re.search(r'^(?Prtmpe?://[^/]+/(?P.+))/(?Pmp4:.+)$', stream_url) + rtmp = re.search(r'^(?Prtmpe?://(?P[^/]+)/(?P.+))/(?Pmp4:.+)$', stream_url) if rtmp: format_id = 'rtmp' if stream.get('videoBitRate'): format_id += '-%d' % int_or_none(stream['videoBitRate']) + http_url = 'http://%s/%s' % (rtmp.group('host').replace('csl.', 'cpl.'), rtmp.group('playpath')[4:]) + urls.append(http_url) + http_fmt = fmt.copy() + http_fmt.update({ + 'url': http_url, + 'format_id': format_id.replace('rtmp', 'http'), + }) + formats.append(http_fmt) fmt.update({ 'url': rtmp.group('url'), 'play_path': rtmp.group('playpath'), @@ -68,18 +78,24 @@ class LimelightBaseIE(InfoExtractor): for mobile_url in mobile_urls: media_url = mobile_url.get('mobileUrl') - if not media_url: - continue format_id = mobile_url.get('targetMediaPlatform') - if determine_ext(media_url) == 'm3u8': + if not media_url or format_id in ('Widevine', 'SmoothStreaming') or media_url in urls: + continue + urls.append(media_url) + ext = determine_ext(media_url) + if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( media_url, video_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False)) + elif ext == 'f4m': + 
formats.extend(self._extract_f4m_formats( + stream_url, video_id, f4m_id=format_id, fatal=False)) else: formats.append({ 'url': media_url, 'format_id': format_id, 'preference': -1, + 'ext': ext, }) self._sort_formats(formats) @@ -145,7 +161,7 @@ class LimelightMediaIE(LimelightBaseIE): 'url': 'http://link.videoplatform.limelight.com/media/?mediaId=3ffd040b522b4485b6d84effc750cd86', 'info_dict': { 'id': '3ffd040b522b4485b6d84effc750cd86', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'HaP and the HB Prince Trailer', 'description': 'md5:8005b944181778e313d95c1237ddb640', 'thumbnail': 're:^https?://.*\.jpeg$', @@ -154,27 +170,23 @@ class LimelightMediaIE(LimelightBaseIE): 'upload_date': '20090604', }, 'params': { - # rtmp download + # m3u8 download 'skip_download': True, }, }, { # video with subtitles 'url': 'limelight:media:a3e00274d4564ec4a9b29b9466432335', + 'md5': '2fa3bad9ac321e23860ca23bc2c69e3d', 'info_dict': { 'id': 'a3e00274d4564ec4a9b29b9466432335', - 'ext': 'flv', + 'ext': 'mp4', 'title': '3Play Media Overview Video', - 'description': '', 'thumbnail': 're:^https?://.*\.jpeg$', 'duration': 78.101, 'timestamp': 1338929955, 'upload_date': '20120605', 'subtitles': 'mincount:9', }, - 'params': { - # rtmp download - 'skip_download': True, - }, }, { 'url': 'https://assets.delvenetworks.com/player/loader.swf?mediaId=8018a574f08d416e95ceaccae4ba0452', 'only_matching': True, diff --git a/youtube_dl/extractor/litv.py b/youtube_dl/extractor/litv.py index 3356d015d..05c6579f1 100644 --- a/youtube_dl/extractor/litv.py +++ b/youtube_dl/extractor/litv.py @@ -14,7 +14,7 @@ from ..utils import ( class LiTVIE(InfoExtractor): - _VALID_URL = r'https?://www\.litv\.tv/vod/[^/]+/content\.do\?.*?\bid=(?P[^&]+)' + _VALID_URL = r'https?://www\.litv\.tv/(?:vod|promo)/[^/]+/(?:content\.do)?\?.*?\b(?:content_)?id=(?P[^&]+)' _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?id=%s' @@ -27,6 +27,7 @@ class LiTVIE(InfoExtractor): 'playlist_count': 50, }, { 'url': 
'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', + 'md5': '969e343d9244778cb29acec608e53640', 'info_dict': { 'id': 'VOD00041610', 'ext': 'mp4', @@ -37,7 +38,16 @@ class LiTVIE(InfoExtractor): }, 'params': { 'noplaylist': True, - 'skip_download': True, # m3u8 download + }, + 'skip': 'Georestricted to Taiwan', + }, { + 'url': 'https://www.litv.tv/promo/miyuezhuan/?content_id=VOD00044841&', + 'md5': '88322ea132f848d6e3e18b32a832b918', + 'info_dict': { + 'id': 'VOD00044841', + 'ext': 'mp4', + 'title': '芈月傳第1集 霸星芈月降世楚國', + 'description': '楚威王二年,太史令唐昧夜觀星象,發現霸星即將現世。王后得知霸星的預言後,想盡辦法不讓孩子順利出生,幸得莒姬相護化解危機。沒想到眾人期待下出生的霸星卻是位公主,楚威王對此失望至極。楚王后命人將女嬰丟棄河中,居然奇蹟似的被少司命像攔下,楚威王認為此女非同凡響,為她取名芈月。', }, 'skip': 'Georestricted to Taiwan', }] @@ -92,13 +102,18 @@ class LiTVIE(InfoExtractor): # endpoint gives the same result as the data embedded in the webpage. # If georestricted, there are no embedded data, so an extra request is # necessary to get the error code + if 'assetId' not in view_data: + view_data = self._download_json( + 'https://www.litv.tv/vod/ajax/getProgramInfo', video_id, + query={'contentId': video_id}, + headers={'Accept': 'application/json'}) video_data = self._parse_json(self._search_regex( r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);', webpage, 'video data', default='{}'), video_id) if not video_data: payload = { 'assetId': view_data['assetId'], - 'watchDevices': vod_data['watchDevices'], + 'watchDevices': view_data['watchDevices'], 'contentType': view_data['contentType'], } video_data = self._download_json( @@ -115,7 +130,8 @@ class LiTVIE(InfoExtractor): raise ExtractorError('Unexpected result from %s' % self.IE_NAME) formats = self._extract_m3u8_formats( - video_data['fullpath'], video_id, ext='mp4', m3u8_id='hls') + video_data['fullpath'], video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls') for a_format in formats: # LiTV HLS segments doesn't like compressions a_format.setdefault('http_headers', 
{})['Youtubedl-no-compression'] = True diff --git a/youtube_dl/extractor/mgtv.py b/youtube_dl/extractor/mgtv.py index d970e94ec..27bdff8b2 100644 --- a/youtube_dl/extractor/mgtv.py +++ b/youtube_dl/extractor/mgtv.py @@ -9,7 +9,7 @@ class MGTVIE(InfoExtractor): _VALID_URL = r'https?://www\.mgtv\.com/v/(?:[^/]+/)*(?P\d+)\.html' IE_DESC = '芒果TV' - _TEST = { + _TESTS = [{ 'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html', 'md5': '1bdadcf760a0b90946ca68ee9a2db41a', 'info_dict': { @@ -20,7 +20,11 @@ class MGTVIE(InfoExtractor): 'duration': 7461, 'thumbnail': 're:^https?://.*\.jpg$', }, - } + }, { + # no tbr extracted from stream_url + 'url': 'http://www.mgtv.com/v/1/1/f/3324755.html', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -41,7 +45,8 @@ class MGTVIE(InfoExtractor): def extract_format(stream_url, format_id, idx, query={}): format_info = self._download_json( stream_url, video_id, - note='Download video info for format %s' % format_id or '#%d' % idx, query=query) + note='Download video info for format %s' % (format_id or '#%d' % idx), + query=query) return { 'format_id': format_id, 'url': format_info['info'], diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py index 937ba0f28..ec1b4c4fe 100644 --- a/youtube_dl/extractor/miomio.py +++ b/youtube_dl/extractor/miomio.py @@ -25,10 +25,7 @@ class MioMioIE(InfoExtractor): 'title': '【SKY】字幕 铠武昭和VS平成 假面骑士大战FEAT战队 魔星字幕组 字幕', 'duration': 5923, }, - 'params': { - # The server provides broken file - 'skip_download': True, - } + 'skip': 'Unable to load videos', }, { 'url': 'http://www.miomio.tv/watch/cc184024/', 'info_dict': { @@ -47,16 +44,12 @@ class MioMioIE(InfoExtractor): 'skip': 'Unable to load videos', }, { # new 'h5' player - 'url': 'http://www.miomio.tv/watch/cc273295/', - 'md5': '', + 'url': 'http://www.miomio.tv/watch/cc273997/', + 'md5': '0b27a4b4495055d826813f8c3a6b2070', 'info_dict': { - 'id': '273295', + 'id': '273997', 'ext': 'mp4', - 
'title': 'アウト×デラックス 20160526', - }, - 'params': { - # intermittent HTTP 500 - 'skip_download': True, + 'title': 'マツコの知らない世界【劇的進化SP!ビニール傘&冷凍食品2016】 1_2 - 16 05 31', }, }] @@ -116,7 +109,7 @@ class MioMioIE(InfoExtractor): player_webpage = self._download_webpage( player_url, video_id, note='Downloading player webpage', headers={'Referer': url}) - entries = self._parse_html5_media_entries(player_url, player_webpage) + entries = self._parse_html5_media_entries(player_url, player_webpage, video_id) http_headers = {'Referer': player_url} else: http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path} diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py index e47c80119..e3bbe5aa8 100644 --- a/youtube_dl/extractor/mofosex.py +++ b/youtube_dl/extractor/mofosex.py @@ -1,53 +1,56 @@ from __future__ import unicode_literals -import os -import re - -from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_urlparse, +from ..utils import ( + int_or_none, + str_to_int, + unified_strdate, ) -from ..utils import sanitized_Request +from .keezmovies import KeezMoviesIE -class MofosexIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?Pmofosex\.com/videos/(?P[0-9]+)/.*?\.html)' - _TEST = { - 'url': 'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html', - 'md5': '1b2eb47ac33cc75d4a80e3026b613c5a', +class MofosexIE(KeezMoviesIE): + _VALID_URL = r'https?://(?:www\.)?mofosex\.com/videos/(?P\d+)/(?P[^/?#&.]+)\.html' + _TESTS = [{ + 'url': 'http://www.mofosex.com/videos/318131/amateur-teen-playing-and-masturbating-318131.html', + 'md5': '39a15853632b7b2e5679f92f69b78e91', 'info_dict': { - 'id': '5018', + 'id': '318131', + 'display_id': 'amateur-teen-playing-and-masturbating-318131', 'ext': 'mp4', - 'title': 'Japanese Teen Music Video', + 'title': 'amateur teen playing and masturbating', + 'thumbnail': 're:^https?://.*\.jpg$', + 'upload_date': '20121114', + 'view_count': int, + 
'like_count': int, + 'dislike_count': int, 'age_limit': 18, } - } + }, { + # This video is no longer available + 'url': 'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - url = 'http://www.' + mobj.group('url') + webpage, info = self._extract_info(url) - req = sanitized_Request(url) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) + view_count = str_to_int(self._search_regex( + r'VIEWS:\s*([\d,.]+)', webpage, 'view count', fatal=False)) + like_count = int_or_none(self._search_regex( + r'id=["\']amountLikes["\'][^>]*>(\d+)', webpage, + 'like count', fatal=False)) + dislike_count = int_or_none(self._search_regex( + r'id=["\']amountDislikes["\'][^>]*>(\d+)', webpage, + 'like count', fatal=False)) + upload_date = unified_strdate(self._html_search_regex( + r'Added:([^<]+)', webpage, 'upload date', fatal=False)) - video_title = self._html_search_regex(r'

(.+?)<', webpage, 'title') - video_url = compat_urllib_parse_unquote(self._html_search_regex(r'flashvars.video_url = \'([^\']+)', webpage, 'video_url')) - path = compat_urllib_parse_urlparse(video_url).path - extension = os.path.splitext(path)[1][1:] - format = path.split('/')[5].split('_')[:2] - format = '-'.join(format) + info.update({ + 'view_count': view_count, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'upload_date': upload_date, + 'thumbnail': self._og_search_thumbnail(webpage), + }) - age_limit = self._rta_search(webpage) - - return { - 'id': video_id, - 'title': video_title, - 'url': video_url, - 'ext': extension, - 'format': format, - 'format_id': format, - 'age_limit': age_limit, - } + return info diff --git a/youtube_dl/extractor/ssa.py b/youtube_dl/extractor/movingimage.py similarity index 65% rename from youtube_dl/extractor/ssa.py rename to youtube_dl/extractor/movingimage.py index 54d1843f2..bb789c32e 100644 --- a/youtube_dl/extractor/ssa.py +++ b/youtube_dl/extractor/movingimage.py @@ -7,22 +7,19 @@ from ..utils import ( ) -class SSAIE(InfoExtractor): - _VALID_URL = r'https?://ssa\.nls\.uk/film/(?P\d+)' +class MovingImageIE(InfoExtractor): + _VALID_URL = r'https?://movingimage\.nls\.uk/film/(?P\d+)' _TEST = { - 'url': 'http://ssa.nls.uk/film/3561', + 'url': 'http://movingimage.nls.uk/film/3561', + 'md5': '4caa05c2b38453e6f862197571a7be2f', 'info_dict': { 'id': '3561', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'SHETLAND WOOL', 'description': 'md5:c5afca6871ad59b4271e7704fe50ab04', 'duration': 900, 'thumbnail': 're:^https?://.*\.jpg$', }, - 'params': { - # rtmp download - 'skip_download': True, - }, } def _real_extract(self, url): @@ -30,10 +27,9 @@ class SSAIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - streamer = self._search_regex( - r"'streamer'\s*,\S*'(rtmp[^']+)'", webpage, 'streamer') - play_path = self._search_regex( - r"'file'\s*,\s*'([^']+)'", webpage, 'file').rpartition('.')[0] + formats = 
self._extract_m3u8_formats( + self._html_search_regex(r'file\s*:\s*"([^"]+)"', webpage, 'm3u8 manifest URL'), + video_id, ext='mp4', entry_protocol='m3u8_native') def search_field(field_name, fatal=False): return self._search_regex( @@ -44,13 +40,11 @@ class SSAIE(InfoExtractor): description = unescapeHTML(search_field('Description')) duration = parse_duration(search_field('Running time')) thumbnail = self._search_regex( - r"'image'\s*,\s*'([^']+)'", webpage, 'thumbnails', fatal=False) + r"image\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False) return { 'id': video_id, - 'url': streamer, - 'play_path': play_path, - 'ext': 'flv', + 'formats': formats, 'title': title, 'description': description, 'duration': duration, diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index dd0639589..bdda68819 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -4,7 +4,6 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse_urlencode, compat_str, compat_xpath, ) @@ -14,10 +13,13 @@ from ..utils import ( fix_xml_ampersands, float_or_none, HEADRequest, - sanitized_Request, - unescapeHTML, - url_basename, RegexNotFoundError, + sanitized_Request, + strip_or_none, + timeconvert, + unescapeHTML, + update_url_query, + url_basename, xpath_text, ) @@ -34,14 +36,19 @@ class MTVServicesInfoExtractor(InfoExtractor): def _id_from_uri(uri): return uri.split(':')[-1] - # This was originally implemented for ComedyCentral, but it also works here @staticmethod - def _transform_rtmp_url(rtmp_video_url): + def _remove_template_parameter(url): + # Remove the templates, like &device={device} + return re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', url) + + # This was originally implemented for ComedyCentral, but it also works here + @classmethod + def _transform_rtmp_url(cls, rtmp_video_url): m = re.match(r'^rtmpe?://.*?/(?Pgsp\..+?/.*)$', rtmp_video_url) if not m: - return rtmp_video_url + return {'rtmp': rtmp_video_url} base = 
'http://viacommtvstrmfs.fplive.net/' - return base + m.group('finalid') + return {'http': base + m.group('finalid')} def _get_feed_url(self, uri): return self._FEED_URL @@ -85,14 +92,14 @@ class MTVServicesInfoExtractor(InfoExtractor): rtmp_video_url = rendition.find('./src').text if rtmp_video_url.endswith('siteunavail.png'): continue - new_url = self._transform_rtmp_url(rtmp_video_url) - formats.append({ + new_urls = self._transform_rtmp_url(rtmp_video_url) + formats.extend([{ 'ext': 'flv' if new_url.startswith('rtmp') else ext, 'url': new_url, - 'format_id': rendition.get('bitrate'), + 'format_id': '-'.join(filter(None, [kind, rendition.get('bitrate')])), 'width': int(rendition.get('width')), 'height': int(rendition.get('height')), - }) + } for kind, new_url in new_urls.items()]) except (KeyError, TypeError): raise ExtractorError('Invalid rendition field.') self._sort_formats(formats) @@ -115,9 +122,7 @@ class MTVServicesInfoExtractor(InfoExtractor): video_id = self._id_from_uri(uri) self.report_extraction(video_id) content_el = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))) - mediagen_url = content_el.attrib['url'] - # Remove the templates, like &device={device} - mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url) + mediagen_url = self._remove_template_parameter(content_el.attrib['url']) if 'acceptMethods' not in mediagen_url: mediagen_url += '&' if '?' in mediagen_url else '?' 
mediagen_url += 'acceptMethods=fms' @@ -133,7 +138,9 @@ class MTVServicesInfoExtractor(InfoExtractor): message += item.text raise ExtractorError(message, expected=True) - description = xpath_text(itemdoc, 'description') + description = strip_or_none(xpath_text(itemdoc, 'description')) + + timestamp = timeconvert(xpath_text(itemdoc, 'pubDate')) title_el = None if title_el is None: @@ -167,26 +174,32 @@ class MTVServicesInfoExtractor(InfoExtractor): 'thumbnail': self._get_thumbnail_url(uri, itemdoc), 'description': description, 'duration': float_or_none(content_el.attrib.get('duration')), + 'timestamp': timestamp, } def _get_feed_query(self, uri): data = {'uri': uri} if self._LANG: data['lang'] = self._LANG - return compat_urllib_parse_urlencode(data) + return data def _get_videos_info(self, uri): video_id = self._id_from_uri(uri) feed_url = self._get_feed_url(uri) - info_url = feed_url + '?' + self._get_feed_query(uri) + info_url = update_url_query(feed_url, self._get_feed_query(uri)) return self._get_videos_info_from_url(info_url, video_id) def _get_videos_info_from_url(self, url, video_id): idoc = self._download_xml( url, video_id, 'Downloading info', transform_source=fix_xml_ampersands) + + title = xpath_text(idoc, './channel/title') + description = xpath_text(idoc, './channel/description') + return self.playlist_result( - [self._get_video_info(item) for item in idoc.findall('.//item')]) + [self._get_video_info(item) for item in idoc.findall('.//item')], + playlist_title=title, playlist_description=description) def _extract_mgid(self, webpage): try: @@ -232,6 +245,8 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): 'ext': 'mp4', 'title': 'Peter Dinklage Sums Up \'Game Of Thrones\' In 45 Seconds', 'description': '"Sexy sexy sexy, stabby stabby stabby, beautiful language," says Peter Dinklage as he tries summarizing "Game of Thrones" in under a minute.', + 'timestamp': 1400126400, + 'upload_date': '20140515', }, } @@ -244,13 +259,9 @@ class 
MTVServicesEmbeddedIE(MTVServicesInfoExtractor): def _get_feed_url(self, uri): video_id = self._id_from_uri(uri) - site_id = uri.replace(video_id, '') - config_url = ('http://media.mtvnservices.com/pmt/e1/players/{0}/' - 'context4/context5/config.xml'.format(site_id)) - config_doc = self._download_xml(config_url, video_id) - feed_node = config_doc.find('.//feed') - feed_url = feed_node.text.strip().split('?')[0] - return feed_url + config = self._download_json( + 'http://media.mtvnservices.com/pmt/e1/access/index.html?uri=%s&configtype=edge' % uri, video_id) + return self._remove_template_parameter(config['feedWithQueryParams']) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -274,6 +285,8 @@ class MTVIE(MTVServicesInfoExtractor): 'ext': 'mp4', 'title': 'Taylor Swift - "Ours (VH1 Storytellers)"', 'description': 'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.', + 'timestamp': 1352610000, + 'upload_date': '20121111', }, }, ] @@ -300,20 +313,6 @@ class MTVIE(MTVServicesInfoExtractor): return self._get_videos_info(uri) -class MTVIggyIE(MTVServicesInfoExtractor): - IE_NAME = 'mtviggy.com' - _VALID_URL = r'https?://www\.mtviggy\.com/videos/.+' - _TEST = { - 'url': 'http://www.mtviggy.com/videos/arcade-fire-behind-the-scenes-at-the-biggest-music-experiment-yet/', - 'info_dict': { - 'id': '984696', - 'ext': 'mp4', - 'title': 'Arcade Fire: Behind the Scenes at the Biggest Music Experiment Yet', - } - } - _FEED_URL = 'http://all.mtvworldverticals.com/feed-xml/' - - class MTVDEIE(MTVServicesInfoExtractor): IE_NAME = 'mtv.de' _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:artists|shows|news)/(?:[^/]+/)*(?P\d+)-[^/#?]+/*(?:[#?].*)?$' @@ -321,7 +320,7 @@ class MTVDEIE(MTVServicesInfoExtractor): 'url': 'http://www.mtv.de/artists/10571-cro/videos/61131-traum', 'info_dict': { 'id': 'music_video-a50bc5f0b3aa4b3190aa', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'MusicVideo_cro-traum', 'description': 'Cro - Traum', }, @@ -329,20 
+328,21 @@ class MTVDEIE(MTVServicesInfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'Blocked at Travis CI', }, { # mediagen URL without query (e.g. http://videos.mtvnn.com/mediagen/e865da714c166d18d6f80893195fcb97) 'url': 'http://www.mtv.de/shows/933-teen-mom-2/staffeln/5353/folgen/63565-enthullungen', 'info_dict': { 'id': 'local_playlist-f5ae778b9832cc837189', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Episode_teen-mom-2_shows_season-5_episode-1_full-episode_part1', }, 'params': { # rtmp download 'skip_download': True, }, + 'skip': 'Blocked at Travis CI', }, { - # single video in pagePlaylist with different id 'url': 'http://www.mtv.de/news/77491-mtv-movies-spotlight-pixels-teil-3', 'info_dict': { 'id': 'local_playlist-4e760566473c4c8c5344', @@ -354,6 +354,7 @@ class MTVDEIE(MTVServicesInfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'Das Video kann zur Zeit nicht abgespielt werden.', }] def _real_extract(self, url): @@ -366,11 +367,14 @@ class MTVDEIE(MTVServicesInfoExtractor): r'window\.pagePlaylist\s*=\s*(\[.+?\]);\n', webpage, 'page playlist'), video_id) + def _mrss_url(item): + return item['mrss'] + item.get('mrssvars', '') + # news pages contain single video in playlist with different id if len(playlist) == 1: - return self._get_videos_info_from_url(playlist[0]['mrss'], video_id) + return self._get_videos_info_from_url(_mrss_url(playlist[0]), video_id) for item in playlist: item_id = item.get('id') if item_id and compat_str(item_id) == video_id: - return self._get_videos_info_from_url(item['mrss'], video_id) + return self._get_videos_info_from_url(_mrss_url(item), video_id) diff --git a/youtube_dl/extractor/muenchentv.py b/youtube_dl/extractor/muenchentv.py index b4e8ad17e..d9f176136 100644 --- a/youtube_dl/extractor/muenchentv.py +++ b/youtube_dl/extractor/muenchentv.py @@ -36,7 +36,7 @@ class MuenchenTVIE(InfoExtractor): title = self._live_title(self._og_search_title(webpage)) data_js = self._search_regex( - 
r'(?s)\nplaylist:\s*(\[.*?}\]),related:', + r'(?s)\nplaylist:\s*(\[.*?}\]),', webpage, 'playlist configuration') data_json = js_to_json(data_js) data = json.loads(data_json)[0] diff --git a/youtube_dl/extractor/myvidster.py b/youtube_dl/extractor/myvidster.py index 731c24542..2117d302d 100644 --- a/youtube_dl/extractor/myvidster.py +++ b/youtube_dl/extractor/myvidster.py @@ -13,7 +13,7 @@ class MyVidsterIE(InfoExtractor): 'id': '3685814', 'title': 'md5:7d8427d6d02c4fbcef50fe269980c749', 'upload_date': '20141027', - 'uploader_id': 'utkualp', + 'uploader': 'utkualp', 'ext': 'mp4', 'age_limit': 18, }, diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index e717abb9f..1dcf27afe 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -1,16 +1,19 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor -from .theplatform import ThePlatformIE +from .adobepass import AdobePassIE from ..utils import ( smuggle_url, url_basename, update_url_query, + get_element_by_class, ) -class NationalGeographicIE(InfoExtractor): - IE_NAME = 'natgeo' +class NationalGeographicVideoIE(InfoExtractor): + IE_NAME = 'natgeo:video' _VALID_URL = r'https?://video\.nationalgeographic\.com/.*?' 
_TESTS = [ @@ -62,16 +65,16 @@ class NationalGeographicIE(InfoExtractor): } -class NationalGeographicChannelIE(ThePlatformIE): - IE_NAME = 'natgeo:channel' - _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?[^/]+/videos/(?P[^/?]+)' +class NationalGeographicIE(AdobePassIE): + IE_NAME = 'natgeo' + _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?[^/]+/(?:videos|episodes)/(?P[^/?]+)' _TESTS = [ { 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/videos/uncovering-a-universal-knowledge/', 'md5': '518c9aa655686cf81493af5cc21e2a04', 'info_dict': { - 'id': 'nB5vIAfmyllm', + 'id': 'vKInpacll2pC', 'ext': 'mp4', 'title': 'Uncovering a Universal Knowledge', 'description': 'md5:1a89148475bf931b3661fcd6ddb2ae3a', @@ -85,7 +88,7 @@ class NationalGeographicChannelIE(ThePlatformIE): 'url': 'http://channel.nationalgeographic.com/wild/destination-wild/videos/the-stunning-red-bird-of-paradise/', 'md5': 'c4912f656b4cbe58f3e000c489360989', 'info_dict': { - 'id': '3TmMv9OvGwIR', + 'id': 'Pok5lWCkiEFA', 'ext': 'mp4', 'title': 'The Stunning Red Bird of Paradise', 'description': 'md5:7bc8cd1da29686be4d17ad1230f0140c', @@ -95,6 +98,10 @@ class NationalGeographicChannelIE(ThePlatformIE): }, 'add_ie': ['ThePlatform'], }, + { + 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/episodes/the-power-of-miracles/', + 'only_matching': True, + } ] def _real_extract(self, url): @@ -112,7 +119,7 @@ class NationalGeographicChannelIE(ThePlatformIE): auth_resource_id = self._search_regex( r"video_auth_resourceId\s*=\s*'([^']+)'", webpage, 'auth resource id') - query['auth'] = self._extract_mvpd_auth(url, display_id, 'natgeo', auth_resource_id) or '' + query['auth'] = self._extract_mvpd_auth(url, display_id, 'natgeo', auth_resource_id) return { '_type': 'url_transparent', @@ -122,3 +129,40 @@ class NationalGeographicChannelIE(ThePlatformIE): {'force_smil_url': True}), 'display_id': display_id, } + + +class 
NationalGeographicEpisodeGuideIE(InfoExtractor): + IE_NAME = 'natgeo:episodeguide' + _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?(?P[^/]+)/episode-guide' + _TESTS = [ + { + 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/episode-guide/', + 'info_dict': { + 'id': 'the-story-of-god-with-morgan-freeman-season-1', + 'title': 'The Story of God with Morgan Freeman - Season 1', + }, + 'playlist_mincount': 6, + }, + { + 'url': 'http://channel.nationalgeographic.com/underworld-inc/episode-guide/?s=2', + 'info_dict': { + 'id': 'underworld-inc-season-2', + 'title': 'Underworld, Inc. - Season 2', + }, + 'playlist_mincount': 7, + }, + ] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + show = get_element_by_class('show', webpage) + selected_season = self._search_regex( + r']+class="select-seasons[^"]*".*?]*>(.*?)', + webpage, 'selected season') + entries = [ + self.url_result(self._proto_relative_url(entry_url), 'NationalGeographic') + for entry_url in re.findall('(?s)]+class="col-inner"[^>]*?>.*?]+href="([^"]+)"', webpage)] + return self.playlist_result( + entries, '%s-%s' % (display_id, selected_season.lower().replace(' ', '-')), + '%s - %s' % (show, selected_season)) diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index 6d6f69b44..0891d2772 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -4,12 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlencode, - compat_urlparse, -) from ..utils import ( ExtractorError, + int_or_none, + update_url_query, ) @@ -51,48 +49,74 @@ class NaverIE(InfoExtractor): if error: raise ExtractorError(error, expected=True) raise ExtractorError('couldn\'t extract vid and key') - vid = m_id.group(1) - key = m_id.group(2) - query = compat_urllib_parse_urlencode({'vid': vid, 
'inKey': key, }) - query_urls = compat_urllib_parse_urlencode({ - 'masterVid': vid, - 'protocol': 'p2p', - 'inKey': key, - }) - info = self._download_xml( - 'http://serviceapi.rmcnmv.naver.com/flash/videoInfo.nhn?' + query, - video_id, 'Downloading video info') - urls = self._download_xml( - 'http://serviceapi.rmcnmv.naver.com/flash/playableEncodingOption.nhn?' + query_urls, - video_id, 'Downloading video formats info') - + video_data = self._download_json( + 'http://play.rmcnmv.naver.com/vod/play/v2.0/' + m_id.group(1), + video_id, query={ + 'key': m_id.group(2), + }) + meta = video_data['meta'] + title = meta['subject'] formats = [] - for format_el in urls.findall('EncodingOptions/EncodingOption'): - domain = format_el.find('Domain').text - uri = format_el.find('uri').text - f = { - 'url': compat_urlparse.urljoin(domain, uri), - 'ext': 'mp4', - 'width': int(format_el.find('width').text), - 'height': int(format_el.find('height').text), - } - if domain.startswith('rtmp'): - # urlparse does not support custom schemes - # https://bugs.python.org/issue18828 - f.update({ - 'url': domain + uri, - 'ext': 'flv', - 'rtmp_protocol': '1', # rtmpt + + def extract_formats(streams, stream_type, query={}): + for stream in streams: + stream_url = stream.get('source') + if not stream_url: + continue + stream_url = update_url_query(stream_url, query) + encoding_option = stream.get('encodingOption', {}) + bitrate = stream.get('bitrate', {}) + formats.append({ + 'format_id': '%s_%s' % (stream.get('type') or stream_type, encoding_option.get('id') or encoding_option.get('name')), + 'url': stream_url, + 'width': int_or_none(encoding_option.get('width')), + 'height': int_or_none(encoding_option.get('height')), + 'vbr': int_or_none(bitrate.get('video')), + 'abr': int_or_none(bitrate.get('audio')), + 'filesize': int_or_none(stream.get('size')), + 'protocol': 'm3u8_native' if stream_type == 'HLS' else None, }) - formats.append(f) + + extract_formats(video_data.get('videos', {}).get('list', 
[]), 'H264') + for stream_set in video_data.get('streams', []): + query = {} + for param in stream_set.get('keys', []): + query[param['name']] = param['value'] + stream_type = stream_set.get('type') + videos = stream_set.get('videos') + if videos: + extract_formats(videos, stream_type, query) + elif stream_type == 'HLS': + stream_url = stream_set.get('source') + if not stream_url: + continue + formats.extend(self._extract_m3u8_formats( + update_url_query(stream_url, query), video_id, + 'mp4', 'm3u8_native', m3u8_id=stream_type, fatal=False)) self._sort_formats(formats) + subtitles = {} + for caption in video_data.get('captions', {}).get('list', []): + caption_url = caption.get('source') + if not caption_url: + continue + subtitles.setdefault(caption.get('language') or caption.get('locale'), []).append({ + 'url': caption_url, + }) + + upload_date = self._search_regex( + r']+class="date".*?(\d{4}\.\d{2}\.\d{2})', + webpage, 'upload date', fatal=False) + if upload_date: + upload_date = upload_date.replace('.', '') + return { 'id': video_id, - 'title': info.find('Subject').text, + 'title': title, 'formats': formats, + 'subtitles': subtitles, 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'upload_date': info.find('WriteDate').text.replace('.', ''), - 'view_count': int(info.find('PlayCount').text), + 'thumbnail': meta.get('cover', {}).get('source') or self._og_search_thumbnail(webpage), + 'view_count': int_or_none(meta.get('count')), + 'upload_date': upload_date, } diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index d896b0d04..53561961c 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -1,25 +1,20 @@ from __future__ import unicode_literals import functools -import os.path import re -from .common import InfoExtractor +from .turner import TurnerBaseIE from ..compat import ( compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( - int_or_none, 
OnDemandPagedList, - parse_duration, remove_start, - xpath_text, - xpath_attr, ) -class NBAIE(InfoExtractor): +class NBAIE(TurnerBaseIE): _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P(?:[^/]+/)+(?P[^?]*?))/?(?:/index\.html)?(?:\?.*)?$' _TESTS = [{ 'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', @@ -44,28 +39,30 @@ class NBAIE(InfoExtractor): 'url': 'http://watch.nba.com/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', 'md5': 'b2b39b81cf28615ae0c3360a3f9668c4', 'info_dict': { - 'id': '0041400301-cle-atl-recap', + 'id': 'channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', 'ext': 'mp4', 'title': 'Hawks vs. Cavaliers Game 1', 'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d', 'duration': 228, 'timestamp': 1432134543, 'upload_date': '20150520', - } + }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'http://www.nba.com/clippers/news/doc-rivers-were-not-trading-blake', 'info_dict': { - 'id': '1455672027478-Doc_Feb16_720', + 'id': 'teams/clippers/2016/02/17/1455672027478-Doc_Feb16_720.mov-297324', 'ext': 'mp4', 'title': 'Practice: Doc Rivers - 2/16/16', 'description': 'Head Coach Doc Rivers addresses the media following practice.', - 'upload_date': '20160217', + 'upload_date': '20160216', 'timestamp': 1455672000, }, 'params': { # m3u8 download 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#', 'info_dict': { @@ -80,7 +77,7 @@ class NBAIE(InfoExtractor): }, { 'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#', 'info_dict': { - 'id': 'Wigginsmp4', + 'id': 'teams/timberwolves/2014/12/12/Wigginsmp4-3462601', 'ext': 'mp4', 'title': 'Shootaround Access - Dec. 
12 | Andrew Wiggins', 'description': 'Wolves rookie Andrew Wiggins addresses the media after Friday\'s shootaround.', @@ -92,6 +89,7 @@ class NBAIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest'], }] _PAGE_SIZE = 30 @@ -145,53 +143,12 @@ class NBAIE(InfoExtractor): if path.startswith('video/teams'): path = 'video/channels/proxy/' + path[6:] - video_info = self._download_xml('http://www.nba.com/%s.xml' % path, video_id) - video_id = os.path.splitext(xpath_text(video_info, 'slug'))[0] - title = xpath_text(video_info, 'headline') - description = xpath_text(video_info, 'description') - duration = parse_duration(xpath_text(video_info, 'length')) - timestamp = int_or_none(xpath_attr(video_info, 'dateCreated', 'uts')) - - thumbnails = [] - for image in video_info.find('images'): - thumbnails.append({ - 'id': image.attrib.get('cut'), - 'url': image.text, - 'width': int_or_none(image.attrib.get('width')), - 'height': int_or_none(image.attrib.get('height')), + return self._extract_cvp_info( + 'http://www.nba.com/%s.xml' % path, video_id, { + 'default': { + 'media_src': 'http://nba.cdn.turner.com/nba/big', + }, + 'm3u8': { + 'media_src': 'http://nbavod-f.akamaihd.net', + }, }) - - formats = [] - for video_file in video_info.findall('.//file'): - video_url = video_file.text - if video_url.startswith('/'): - continue - if video_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(video_url, video_id, ext='mp4', m3u8_id='hls', fatal=False)) - elif video_url.endswith('.f4m'): - formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.1.1', video_id, f4m_id='hds', fatal=False)) - else: - key = video_file.attrib.get('bitrate') - format_info = { - 'format_id': key, - 'url': video_url, - } - mobj = re.search(r'(\d+)x(\d+)(?:_(\d+))?', key) - if mobj: - format_info.update({ - 'width': int(mobj.group(1)), - 'height': int(mobj.group(2)), - 'tbr': int_or_none(mobj.group(3)), - }) - 
formats.append(format_info) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'thumbnails': thumbnails, - 'formats': formats, - } diff --git a/youtube_dl/extractor/nextmovie.py b/youtube_dl/extractor/nextmovie.py deleted file mode 100644 index 9ccd7d774..000000000 --- a/youtube_dl/extractor/nextmovie.py +++ /dev/null @@ -1,30 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .mtv import MTVServicesInfoExtractor -from ..compat import compat_urllib_parse_urlencode - - -class NextMovieIE(MTVServicesInfoExtractor): - IE_NAME = 'nextmovie.com' - _VALID_URL = r'https?://(?:www\.)?nextmovie\.com/shows/[^/]+/\d{4}-\d{2}-\d{2}/(?P[^/?#]+)' - _FEED_URL = 'http://lite.dextr.mtvi.com/service1/dispatch.htm' - _TESTS = [{ - 'url': 'http://www.nextmovie.com/shows/exclusives/2013-03-10/mgid:uma:videolist:nextmovie.com:1715019/', - 'md5': '09a9199f2f11f10107d04fcb153218aa', - 'info_dict': { - 'id': '961726', - 'ext': 'mp4', - 'title': 'The Muppets\' Gravity', - }, - }] - - def _get_feed_query(self, uri): - return compat_urllib_parse_urlencode({ - 'feed': '1505', - 'mgid': uri, - }) - - def _real_extract(self, url): - mgid = self._match_id(url) - return self._get_videos_info(mgid) diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py new file mode 100644 index 000000000..691bdfa4e --- /dev/null +++ b/youtube_dl/extractor/nhk.py @@ -0,0 +1,50 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class NhkVodIE(InfoExtractor): + _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/en/vod/(?P.+?)\.html' + _TEST = { + # Videos available only for a limited period of time. Visit + # http://www3.nhk.or.jp/nhkworld/en/vod/ for working samples. 
+ 'url': 'http://www3.nhk.or.jp/nhkworld/en/vod/tokyofashion/20160815.html', + 'info_dict': { + 'id': 'A1bnNiNTE6nY3jLllS-BIISfcC_PpvF5', + 'ext': 'flv', + 'title': 'TOKYO FASHION EXPRESS - The Kimono as Global Fashion', + 'description': 'md5:db338ee6ce8204f415b754782f819824', + 'series': 'TOKYO FASHION EXPRESS', + 'episode': 'The Kimono as Global Fashion', + }, + 'skip': 'Videos available only for a limited period of time', + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + embed_code = self._search_regex( + r'nw_vod_ooplayer\([^,]+,\s*(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'ooyala embed code', group='id') + + title = self._search_regex( + r']+class=["\']episode-detail["\']>\s*([^<]+)', + webpage, 'title', default=None) + description = self._html_search_regex( + r'(?s)]+class=["\']description["\'][^>]*>(.+?)

', + webpage, 'description', default=None) + series = self._search_regex( + r']+class=["\']detail-top-player-title[^>]+>]+>([^<]+)', + webpage, 'series', default=None) + + return { + '_type': 'url_transparent', + 'ie_key': 'Ooyala', + 'url': 'ooyala:%s' % embed_code, + 'title': '%s - %s' % (series, title) if series and title else title, + 'description': description, + 'series': series, + 'episode': title, + } diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index 4935002d0..64730a624 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -2,11 +2,11 @@ from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor -from ..compat import compat_urllib_parse_urlencode from ..utils import update_url_query class NickIE(MTVServicesInfoExtractor): + # None of videos on the website are still alive? IE_NAME = 'nick.com' _VALID_URL = r'https?://(?:www\.)?nick(?:jr)?\.com/(?:videos/clip|[^/]+/videos)/(?P[^/?#.]+)' _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm' @@ -58,10 +58,10 @@ class NickIE(MTVServicesInfoExtractor): }] def _get_feed_query(self, uri): - return compat_urllib_parse_urlencode({ + return { 'feed': 'nick_arc_player_prime', 'mgid': uri, - }) + } def _extract_mgid(self, webpage): return self._search_regex(r'data-contenturi="([^"]+)', webpage, 'mgid') diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py index d889245ad..ec4d675e2 100644 --- a/youtube_dl/extractor/ninecninemedia.py +++ b/youtube_dl/extractor/ninecninemedia.py @@ -4,40 +4,36 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( parse_iso8601, - parse_duration, - ExtractorError + float_or_none, + ExtractorError, + int_or_none, ) -class NineCNineMediaIE(InfoExtractor): - _VALID_URL = r'9c9media:(?P[^:]+):(?P\d+)' +class NineCNineMediaBaseIE(InfoExtractor): + _API_BASE_TEMPLATE = 
'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' + + +class NineCNineMediaStackIE(NineCNineMediaBaseIE): + IE_NAME = '9c9media:stack' + _VALID_URL = r'9c9media:stack:(?P[^:]+):(?P\d+):(?P\d+):(?P\d+)' def _real_extract(self, url): - destination_code, video_id = re.match(self._VALID_URL, url).groups() - api_base_url = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' % (destination_code, video_id) - content = self._download_json(api_base_url, video_id, query={ - '$include': '[contentpackages]', - }) - title = content['Name'] - if len(content['ContentPackages']) > 1: - raise ExtractorError('multiple content packages') - content_package = content['ContentPackages'][0] - stacks_base_url = api_base_url + 'contentpackages/%s/stacks/' % content_package['Id'] - stacks = self._download_json(stacks_base_url, video_id)['Items'] - if len(stacks) > 1: - raise ExtractorError('multiple stacks') - stack = stacks[0] - stack_base_url = '%s%s/manifest.' % (stacks_base_url, stack['Id']) + destination_code, content_id, package_id, stack_id = re.match(self._VALID_URL, url).groups() + stack_base_url_template = self._API_BASE_TEMPLATE + 'contentpackages/%s/stacks/%s/manifest.' 
+ stack_base_url = stack_base_url_template % (destination_code, content_id, package_id, stack_id) + formats = [] formats.extend(self._extract_m3u8_formats( - stack_base_url + 'm3u8', video_id, 'mp4', + stack_base_url + 'm3u8', stack_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) formats.extend(self._extract_f4m_formats( - stack_base_url + 'f4m', video_id, + stack_base_url + 'f4m', stack_id, f4m_id='hds', fatal=False)) - mp4_url = self._download_webpage(stack_base_url + 'pd', video_id, fatal=False) + mp4_url = self._download_webpage(stack_base_url + 'pd', stack_id, fatal=False) if mp4_url: formats.append({ 'url': mp4_url, @@ -46,10 +42,86 @@ class NineCNineMediaIE(InfoExtractor): self._sort_formats(formats) return { - 'id': video_id, - 'title': title, - 'description': content.get('Desc') or content.get('ShortDesc'), - 'timestamp': parse_iso8601(content.get('BroadcastDateTime')), - 'duration': parse_duration(content.get('BroadcastTime')), + 'id': stack_id, 'formats': formats, } + + +class NineCNineMediaIE(NineCNineMediaBaseIE): + IE_NAME = '9c9media' + _VALID_URL = r'9c9media:(?P[^:]+):(?P\d+)' + + def _real_extract(self, url): + destination_code, content_id = re.match(self._VALID_URL, url).groups() + api_base_url = self._API_BASE_TEMPLATE % (destination_code, content_id) + content = self._download_json(api_base_url, content_id, query={ + '$include': '[Media,Season,ContentPackages]', + }) + title = content['Name'] + if len(content['ContentPackages']) > 1: + raise ExtractorError('multiple content packages') + content_package = content['ContentPackages'][0] + package_id = content_package['Id'] + content_package_url = api_base_url + 'contentpackages/%s/' % package_id + content_package = self._download_json(content_package_url, content_id) + + if content_package.get('Constraints', {}).get('Security', {}).get('Type') == 'adobe-drm': + raise ExtractorError('This video is DRM protected.', expected=True) + + stacks = self._download_json(content_package_url + 
'stacks/', package_id)['Items'] + multistacks = len(stacks) > 1 + + thumbnails = [] + for image in content.get('Images', []): + image_url = image.get('Url') + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('Width')), + 'height': int_or_none(image.get('Height')), + }) + + tags, categories = [], [] + for source_name, container in (('Tags', tags), ('Genres', categories)): + for e in content.get(source_name, []): + e_name = e.get('Name') + if not e_name: + continue + container.append(e_name) + + description = content.get('Desc') or content.get('ShortDesc') + season = content.get('Season', {}) + base_info = { + 'description': description, + 'timestamp': parse_iso8601(content.get('BroadcastDateTime')), + 'episode_number': int_or_none(content.get('Episode')), + 'season': season.get('Name'), + 'season_number': season.get('Number'), + 'season_id': season.get('Id'), + 'series': content.get('Media', {}).get('Name'), + 'tags': tags, + 'categories': categories, + } + + entries = [] + for stack in stacks: + stack_id = compat_str(stack['Id']) + entry = { + '_type': 'url_transparent', + 'url': '9c9media:stack:%s:%s:%s:%s' % (destination_code, content_id, package_id, stack_id), + 'id': stack_id, + 'title': '%s_part%s' % (title, stack['Name']) if multistacks else title, + 'duration': float_or_none(stack.get('Duration')), + 'ie_key': 'NineCNineMediaStack', + } + entry.update(base_info) + entries.append(entry) + + return { + '_type': 'multi_video', + 'id': content_id, + 'title': title, + 'description': description, + 'entries': entries, + } diff --git a/youtube_dl/extractor/nintendo.py b/youtube_dl/extractor/nintendo.py new file mode 100644 index 000000000..4b4e66b05 --- /dev/null +++ b/youtube_dl/extractor/nintendo.py @@ -0,0 +1,46 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .ooyala import OoyalaIE +from ..utils import unescapeHTML + + +class 
NintendoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?nintendo\.com/games/detail/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.nintendo.com/games/detail/yEiAzhU2eQI1KZ7wOHhngFoAHc1FpHwj', + 'info_dict': { + 'id': 'MzMmticjp0VPzO3CCj4rmFOuohEuEWoW', + 'ext': 'flv', + 'title': 'Duck Hunt Wii U VC NES - Trailer', + 'duration': 60.326, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Ooyala'], + }, { + 'url': 'http://www.nintendo.com/games/detail/tokyo-mirage-sessions-fe-wii-u', + 'info_dict': { + 'id': 'tokyo-mirage-sessions-fe-wii-u', + 'title': 'Tokyo Mirage Sessions ♯FE', + }, + 'playlist_count': 3, + }] + + def _real_extract(self, url): + page_id = self._match_id(url) + + webpage = self._download_webpage(url, page_id) + + entries = [ + OoyalaIE._build_url_result(m.group('code')) + for m in re.finditer( + r'class=(["\'])embed-video\1[^>]+data-video-code=(["\'])(?P(?:(?!\2).)+)\2', + webpage)] + + return self.playlist_result( + entries, page_id, unescapeHTML(self._og_search_title(webpage, fatal=False))) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 6ded5bd45..ed42eb301 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -14,16 +14,6 @@ from ..utils import ( class NRKBaseIE(InfoExtractor): - def _extract_formats(self, manifest_url, video_id, fatal=True): - formats = [] - formats.extend(self._extract_f4m_formats( - manifest_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', - video_id, f4m_id='hds', fatal=fatal)) - formats.extend(self._extract_m3u8_formats(manifest_url.replace( - 'akamaihd.net/z/', 'akamaihd.net/i/').replace('/manifest.f4m', '/master.m3u8'), - video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=fatal)) - return formats - def _real_extract(self, url): video_id = self._match_id(url) @@ -45,7 +35,7 @@ class NRKBaseIE(InfoExtractor): asset_url = asset.get('url') if not asset_url: continue - formats = self._extract_formats(asset_url, video_id, fatal=False) + formats = 
self._extract_akamai_formats(asset_url, video_id) if not formats: continue self._sort_formats(formats) @@ -69,7 +59,7 @@ class NRKBaseIE(InfoExtractor): if not entries: media_url = data.get('mediaUrl') if media_url: - formats = self._extract_formats(media_url, video_id) + formats = self._extract_akamai_formats(media_url, video_id) self._sort_formats(formats) duration = parse_duration(data.get('duration')) entries = [{ diff --git a/youtube_dl/extractor/ntvde.py b/youtube_dl/extractor/ntvde.py index a83e85cb8..d28a81542 100644 --- a/youtube_dl/extractor/ntvde.py +++ b/youtube_dl/extractor/ntvde.py @@ -1,6 +1,8 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( @@ -40,8 +42,8 @@ class NTVDeIE(InfoExtractor): timestamp = int_or_none(info.get('publishedDateAsUnixTimeStamp')) vdata = self._parse_json(self._search_regex( r'(?s)\$\(\s*"\#player"\s*\)\s*\.data\(\s*"player",\s*(\{.*?\})\);', - webpage, 'player data'), - video_id, transform_source=js_to_json) + webpage, 'player data'), video_id, + transform_source=lambda s: js_to_json(re.sub(r'advertising:\s*{[^}]+},', '', s))) duration = parse_duration(vdata.get('duration')) formats = [] diff --git a/youtube_dl/extractor/ntvru.py b/youtube_dl/extractor/ntvru.py index 0895d7ea4..e8702ebcd 100644 --- a/youtube_dl/extractor/ntvru.py +++ b/youtube_dl/extractor/ntvru.py @@ -11,70 +11,64 @@ from ..utils import ( class NTVRuIE(InfoExtractor): IE_NAME = 'ntv.ru' - _VALID_URL = r'https?://(?:www\.)?ntv\.ru/(?P.+)' + _VALID_URL = r'https?://(?:www\.)?ntv\.ru/(?:[^/]+/)*(?P[^/?#&]+)' - _TESTS = [ - { - 'url': 'http://www.ntv.ru/novosti/863142/', - 'md5': 'ba7ea172a91cb83eb734cad18c10e723', - 'info_dict': { - 'id': '746000', - 'ext': 'mp4', - 'title': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины', - 'description': 'Командующий Черноморским флотом провел переговоры в штабе ВМС 
Украины', - 'thumbnail': 're:^http://.*\.jpg', - 'duration': 136, - }, + _TESTS = [{ + 'url': 'http://www.ntv.ru/novosti/863142/', + 'md5': 'ba7ea172a91cb83eb734cad18c10e723', + 'info_dict': { + 'id': '746000', + 'ext': 'mp4', + 'title': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины', + 'description': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины', + 'thumbnail': 're:^http://.*\.jpg', + 'duration': 136, }, - { - 'url': 'http://www.ntv.ru/video/novosti/750370/', - 'md5': 'adecff79691b4d71e25220a191477124', - 'info_dict': { - 'id': '750370', - 'ext': 'mp4', - 'title': 'Родные пассажиров пропавшего Boeing не верят в трагический исход', - 'description': 'Родные пассажиров пропавшего Boeing не верят в трагический исход', - 'thumbnail': 're:^http://.*\.jpg', - 'duration': 172, - }, + }, { + 'url': 'http://www.ntv.ru/video/novosti/750370/', + 'md5': 'adecff79691b4d71e25220a191477124', + 'info_dict': { + 'id': '750370', + 'ext': 'mp4', + 'title': 'Родные пассажиров пропавшего Boeing не верят в трагический исход', + 'description': 'Родные пассажиров пропавшего Boeing не верят в трагический исход', + 'thumbnail': 're:^http://.*\.jpg', + 'duration': 172, }, - { - 'url': 'http://www.ntv.ru/peredacha/segodnya/m23700/o232416', - 'md5': '82dbd49b38e3af1d00df16acbeab260c', - 'info_dict': { - 'id': '747480', - 'ext': 'mp4', - 'title': '«Сегодня». 21 марта 2014 года. 16:00', - 'description': '«Сегодня». 21 марта 2014 года. 16:00', - 'thumbnail': 're:^http://.*\.jpg', - 'duration': 1496, - }, + }, { + 'url': 'http://www.ntv.ru/peredacha/segodnya/m23700/o232416', + 'md5': '82dbd49b38e3af1d00df16acbeab260c', + 'info_dict': { + 'id': '747480', + 'ext': 'mp4', + 'title': '«Сегодня». 21 марта 2014 года. 16:00', + 'description': '«Сегодня». 21 марта 2014 года. 
16:00', + 'thumbnail': 're:^http://.*\.jpg', + 'duration': 1496, }, - { - 'url': 'http://www.ntv.ru/kino/Koma_film', - 'md5': 'f825770930937aa7e5aca0dc0d29319a', - 'info_dict': { - 'id': '1007609', - 'ext': 'mp4', - 'title': 'Остросюжетный фильм «Кома»', - 'description': 'Остросюжетный фильм «Кома»', - 'thumbnail': 're:^http://.*\.jpg', - 'duration': 5592, - }, + }, { + 'url': 'http://www.ntv.ru/kino/Koma_film', + 'md5': 'f825770930937aa7e5aca0dc0d29319a', + 'info_dict': { + 'id': '1007609', + 'ext': 'mp4', + 'title': 'Остросюжетный фильм «Кома»', + 'description': 'Остросюжетный фильм «Кома»', + 'thumbnail': 're:^http://.*\.jpg', + 'duration': 5592, }, - { - 'url': 'http://www.ntv.ru/serial/Delo_vrachey/m31760/o233916/', - 'md5': '9320cd0e23f3ea59c330dc744e06ff3b', - 'info_dict': { - 'id': '751482', - 'ext': 'mp4', - 'title': '«Дело врачей»: «Деревце жизни»', - 'description': '«Дело врачей»: «Деревце жизни»', - 'thumbnail': 're:^http://.*\.jpg', - 'duration': 2590, - }, + }, { + 'url': 'http://www.ntv.ru/serial/Delo_vrachey/m31760/o233916/', + 'md5': '9320cd0e23f3ea59c330dc744e06ff3b', + 'info_dict': { + 'id': '751482', + 'ext': 'mp4', + 'title': '«Дело врачей»: «Деревце жизни»', + 'description': '«Дело врачей»: «Деревце жизни»', + 'thumbnail': 're:^http://.*\.jpg', + 'duration': 2590, }, - ] + }] _VIDEO_ID_REGEXES = [ r'http.+?)\1', webpage, 'video url', + default=None if no_video else NO_DEFAULT, group='url') + + if no_video: + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + + return { + 'id': video_id, + 'url': video_url, + 'title': remove_start(self._og_search_title(webpage), 'Video: '), + 'thumbnail': self._og_search_thumbnail(webpage), + } diff --git a/youtube_dl/extractor/onet.py b/youtube_dl/extractor/onet.py index 402d3a9f7..fc22ad5eb 100644 --- a/youtube_dl/extractor/onet.py +++ b/youtube_dl/extractor/onet.py @@ -59,11 +59,8 @@ class OnetBaseIE(InfoExtractor): # TODO: Support Microsoft Smooth Streaming continue elif ext == 
'mpd': - # TODO: Current DASH formats are broken - $Time$ pattern in - # not implemented yet - # formats.extend(self._extract_mpd_formats( - # video_url, video_id, mpd_id='dash', fatal=False)) - continue + formats.extend(self._extract_mpd_formats( + video_url, video_id, mpd_id='dash', fatal=False)) else: formats.append({ 'url': video_url, diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 6415b8fdc..03baf8e32 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -1,15 +1,14 @@ # coding: utf-8 -from __future__ import unicode_literals - -import re +from __future__ import unicode_literals, division from .common import InfoExtractor -from ..compat import compat_chr +from ..compat import ( + compat_chr, + compat_ord, +) from ..utils import ( determine_ext, - encode_base_n, ExtractorError, - mimetype2ext, ) @@ -41,90 +40,41 @@ class OpenloadIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def openload_level2_debase(m): - radix, num = int(m.group(1)) + 27, int(m.group(2)) - return '"' + encode_base_n(num, radix) + '"' - - @classmethod - def openload_level2(cls, txt): - # The function name is ǃ \u01c3 - # Using escaped unicode literals does not work in Python 3.2 - return re.sub(r'ǃ\((\d+),(\d+)\)', cls.openload_level2_debase, txt, re.UNICODE).replace('"+"', '') - - # Openload uses a variant of aadecode - # openload_decode and related functions are originally written by - # vitas@matfyz.cz and released with public domain - # See https://github.com/rg3/youtube-dl/issues/8489 - @classmethod - def openload_decode(cls, txt): - symbol_table = [ - ('_', '(゚Д゚) [゚Θ゚]'), - ('a', '(゚Д゚) [゚ω゚ノ]'), - ('b', '(゚Д゚) [゚Θ゚ノ]'), - ('c', '(゚Д゚) [\'c\']'), - ('d', '(゚Д゚) [゚ー゚ノ]'), - ('e', '(゚Д゚) [゚Д゚ノ]'), - ('f', '(゚Д゚) [1]'), - - ('o', '(゚Д゚) [\'o\']'), - ('u', '(o゚ー゚o)'), - ('c', '(゚Д゚) [\'c\']'), - - ('7', '((゚ー゚) + (o^_^o))'), - ('6', '((o^_^o) +(o^_^o) +(c^_^o))'), - ('5', '((゚ー゚) + (゚Θ゚))'), - ('4', 
'(-~3)'), - ('3', '(-~-~1)'), - ('2', '(-~1)'), - ('1', '(-~0)'), - ('0', '((c^_^o)-(c^_^o))'), - ] - delim = '(゚Д゚)[゚ε゚]+' - ret = '' - for aachar in txt.split(delim): - for val, pat in symbol_table: - aachar = aachar.replace(pat, val) - aachar = aachar.replace('+ ', '') - m = re.match(r'^\d+', aachar) - if m: - ret += compat_chr(int(m.group(0), 8)) - else: - m = re.match(r'^u([\da-f]+)', aachar) - if m: - ret += compat_chr(int(m.group(1), 16)) - return cls.openload_level2(ret) - def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage('https://openload.co/embed/%s/' % video_id, video_id) - if 'File not found' in webpage: + if 'File not found' in webpage or 'deleted by the owner' in webpage: raise ExtractorError('File not found', expected=True) - code = self._search_regex( - r'\s*

\s*]+>[^>]+\s*]+>([^<]+)', - webpage, 'JS code') + # The following decryption algorithm is written by @yokrysty and + # declared to be freely used in youtube-dl + # See https://github.com/rg3/youtube-dl/issues/10408 + enc_data = self._html_search_regex( + r']+id="hiddenurl"[^>]*>([^<]+)', webpage, 'encrypted data') - decoded = self.openload_decode(code) + video_url_chars = [] - video_url = self._search_regex( - r'return\s+"(https?://[^"]+)"', decoded, 'video URL') + for idx, c in enumerate(enc_data): + j = compat_ord(c) + if j >= 33 and j <= 126: + j = ((j + 14) % 94) + 33 + if idx == len(enc_data) - 1: + j += 1 + video_url_chars += compat_chr(j) + + video_url = 'https://openload.co/stream/%s?mime=true' % ''.join(video_url_chars) title = self._og_search_title(webpage, default=None) or self._search_regex( r']+class=["\']title["\'][^>]*>([^<]+)', webpage, 'title', default=None) or self._html_search_meta( 'description', webpage, 'title', fatal=True) - ext = mimetype2ext(self._search_regex( - r'window\.vt\s*=\s*(["\'])(?P.+?)\1', decoded, - 'mimetype', default=None, group='mimetype')) or determine_ext( - video_url, 'mp4') - return { 'id': video_id, 'title': title, - 'ext': ext, 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'url': video_url, + # Seems all videos have extensions in their titles + 'ext': determine_ext(title), } diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index ccb23e069..6ae30679a 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -137,13 +137,16 @@ class ORFTVthekIE(InfoExtractor): class ORFOE1IE(InfoExtractor): IE_NAME = 'orf:oe1' IE_DESC = 'Radio Österreich 1' - _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole.*?#\?track_id=)(?P[0-9]+)' + _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole\?.*?\btrack_id=)(?P[0-9]+)' # Audios on ORF radio are only available for 7 days, so we can't add tests. 
- _TEST = { + _TESTS = [{ 'url': 'http://oe1.orf.at/konsole?show=on_demand#?track_id=394211', 'only_matching': True, - } + }, { + 'url': 'http://oe1.orf.at/konsole?show=ondemand&track_id=443608&load_day=/programm/konsole/tag/20160726', + 'only_matching': True, + }] def _real_extract(self, url): show_id = self._match_id(url) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index f6f423597..b490ef74c 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -4,13 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_HTTPError from ..utils import ( ExtractorError, determine_ext, int_or_none, js_to_json, strip_jsonp, + strip_or_none, unified_strdate, US_RATINGS, ) @@ -201,7 +201,7 @@ class PBSIE(InfoExtractor): 'id': '2365006249', 'ext': 'mp4', 'title': 'Constitution USA with Peter Sagal - A More Perfect Union', - 'description': 'md5:36f341ae62e251b8f5bd2b754b95a071', + 'description': 'md5:31b664af3c65fd07fa460d306b837d00', 'duration': 3190, }, }, @@ -212,7 +212,7 @@ class PBSIE(InfoExtractor): 'id': '2365297690', 'ext': 'mp4', 'title': 'FRONTLINE - Losing Iraq', - 'description': 'md5:4d3eaa01f94e61b3e73704735f1196d9', + 'description': 'md5:5979a4d069b157f622d02bff62fbe654', 'duration': 5050, }, }, @@ -223,7 +223,7 @@ class PBSIE(InfoExtractor): 'id': '2201174722', 'ext': 'mp4', 'title': 'PBS NewsHour - Cyber Schools Gain Popularity, but Quality Questions Persist', - 'description': 'md5:95a19f568689d09a166dff9edada3301', + 'description': 'md5:86ab9a3d04458b876147b355788b8781', 'duration': 801, }, }, @@ -268,7 +268,7 @@ class PBSIE(InfoExtractor): 'display_id': 'player', 'ext': 'mp4', 'title': 'American Experience - Death and the Civil War, Chapter 1', - 'description': 'md5:1b80a74e0380ed2a4fb335026de1600d', + 'description': 'md5:67fa89a9402e2ee7d08f53b920674c18', 'duration': 682, 'thumbnail': 're:^https?://.*\.jpg$', }, @@ -294,13 +294,13 @@ class 
PBSIE(InfoExtractor): # "', webpage): url = self._search_regex( @@ -423,10 +428,10 @@ class PBSIE(InfoExtractor): video_id = mobj.group('id') display_id = video_id - return video_id, display_id, None + return video_id, display_id, None, description def _real_extract(self, url): - video_id, display_id, upload_date = self._extract_webpage(url) + video_id, display_id, upload_date, description = self._extract_webpage(url) if isinstance(video_id, list): entries = [self.url_result( @@ -448,17 +453,6 @@ class PBSIE(InfoExtractor): redirects.append(redirect) redirect_urls.add(redirect_url) - try: - video_info = self._download_json( - 'http://player.pbs.org/videoInfo/%s?format=json&type=partner' % video_id, - display_id, 'Downloading video info JSON') - extract_redirect_urls(video_info) - info = video_info - except ExtractorError as e: - # videoInfo API may not work for some videos - if not isinstance(e.cause, compat_HTTPError) or e.cause.code != 404: - raise - # Player pages may also serve different qualities for page in ('widget/partnerplayer', 'portalplayer'): player = self._download_webpage( @@ -511,15 +505,19 @@ class PBSIE(InfoExtractor): formats)) if http_url: for m3u8_format in m3u8_formats: - bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None) - # extract only the formats that we know that they will be available as http format. - # https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications - if not bitrate or bitrate not in ('400k', '800k', '1200k', '2500k'): + bitrate = self._search_regex(r'(\d+)k', m3u8_format['url'], 'bitrate', default=None) + # Lower qualities (150k and 192k) are not available as HTTP formats (see [1]), + # we won't try extracting them. + # Since summer 2016 higher quality formats (4500k and 6500k) are also available + # albeit they are not documented in [2]. + # 1. https://github.com/rg3/youtube-dl/commit/cbc032c8b70a038a69259378c92b4ba97b42d491#commitcomment-17313656 + # 2. 
https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications + if not bitrate or int(bitrate) < 400: continue - f_url = re.sub(r'\d+k|baseline', bitrate, http_url) + f_url = re.sub(r'\d+k|baseline', bitrate + 'k', http_url) # This may produce invalid links sometimes (e.g. # http://www.pbs.org/wgbh/frontline/film/suicide-plan) - if not self._is_valid_url(f_url, display_id, 'http-%s video' % bitrate): + if not self._is_valid_url(f_url, display_id, 'http-%sk video' % bitrate): continue f = m3u8_format.copy() f.update({ @@ -562,11 +560,14 @@ class PBSIE(InfoExtractor): if alt_title: info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + '[\s\-:]+', '', info['title']) + description = info.get('description') or info.get( + 'program', {}).get('description') or description + return { 'id': video_id, 'display_id': display_id, 'title': info['title'], - 'description': info.get('description') or info.get('program', {}).get('description'), + 'description': description, 'thumbnail': info.get('image_url'), 'duration': int_or_none(info.get('duration')), 'age_limit': age_limit, diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 75f5884a9..6c640089d 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -8,7 +8,14 @@ from ..utils import ( ) -class PeriscopeIE(InfoExtractor): +class PeriscopeBaseIE(InfoExtractor): + def _call_api(self, method, query, item_id): + return self._download_json( + 'https://api.periscope.tv/api/v2/%s' % method, + item_id, query=query) + + +class PeriscopeIE(PeriscopeBaseIE): IE_DESC = 'Periscope' IE_NAME = 'periscope' _VALID_URL = r'https?://(?:www\.)?periscope\.tv/[^/]+/(?P[^/?#]+)' @@ -34,14 +41,11 @@ class PeriscopeIE(InfoExtractor): 'only_matching': True, }] - def _call_api(self, method, value): - return self._download_json( - 'https://api.periscope.tv/api/v2/%s?broadcast_id=%s' % (method, value), value) - def _real_extract(self, url): token = 
self._match_id(url) - broadcast_data = self._call_api('getBroadcastPublic', token) + broadcast_data = self._call_api( + 'getBroadcastPublic', {'broadcast_id': token}, token) broadcast = broadcast_data['broadcast'] status = broadcast['status'] @@ -61,7 +65,8 @@ class PeriscopeIE(InfoExtractor): 'url': broadcast[image], } for image in ('image_url', 'image_url_small') if broadcast.get(image)] - stream = self._call_api('getAccessPublic', token) + stream = self._call_api( + 'getAccessPublic', {'broadcast_id': token}, token) formats = [] for format_id in ('replay', 'rtmp', 'hls', 'https_hls'): @@ -88,7 +93,7 @@ class PeriscopeIE(InfoExtractor): } -class PeriscopeUserIE(InfoExtractor): +class PeriscopeUserIE(PeriscopeBaseIE): _VALID_URL = r'https?://www\.periscope\.tv/(?P[^/]+)/?$' IE_DESC = 'Periscope user videos' IE_NAME = 'periscope:user' @@ -106,26 +111,34 @@ class PeriscopeUserIE(InfoExtractor): } def _real_extract(self, url): - user_id = self._match_id(url) + user_name = self._match_id(url) - webpage = self._download_webpage(url, user_id) + webpage = self._download_webpage(url, user_name) data_store = self._parse_json( unescapeHTML(self._search_regex( r'data-store=(["\'])(?P.+?)\1', webpage, 'data store', default='{}', group='data')), - user_id) + user_name) - user = data_store.get('User', {}).get('user', {}) - title = user.get('display_name') or user.get('username') + user = list(data_store['UserCache']['users'].values())[0]['user'] + user_id = user['id'] + session_id = data_store['SessionToken']['broadcastHistory']['token']['session_id'] + + broadcasts = self._call_api( + 'getUserBroadcastsPublic', + {'user_id': user_id, 'session_id': session_id}, + user_name)['broadcasts'] + + broadcast_ids = [ + broadcast['id'] for broadcast in broadcasts if broadcast.get('id')] + + title = user.get('display_name') or user.get('username') or user_name description = user.get('description') - broadcast_ids = (data_store.get('UserBroadcastHistory', {}).get('broadcastIds') or - 
data_store.get('BroadcastCache', {}).get('broadcastIds', [])) - entries = [ self.url_result( - 'https://www.periscope.tv/%s/%s' % (user_id, broadcast_id)) + 'https://www.periscope.tv/%s/%s' % (user_name, broadcast_id)) for broadcast_id in broadcast_ids] return self.playlist_result(entries, user_id, title, description) diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py deleted file mode 100644 index 57c875ef0..000000000 --- a/youtube_dl/extractor/played.py +++ /dev/null @@ -1,60 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import os.path - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - sanitized_Request, - urlencode_postdata, -) - - -class PlayedIE(InfoExtractor): - IE_NAME = 'played.to' - _VALID_URL = r'https?://(?:www\.)?played\.to/(?P[a-zA-Z0-9_-]+)' - - _TEST = { - 'url': 'http://played.to/j2f2sfiiukgt', - 'md5': 'c2bd75a368e82980e7257bf500c00637', - 'info_dict': { - 'id': 'j2f2sfiiukgt', - 'ext': 'flv', - 'title': 'youtube-dl_test_video.mp4', - }, - 'skip': 'Removed for copyright infringement.', # oh wow - } - - def _real_extract(self, url): - video_id = self._match_id(url) - orig_webpage = self._download_webpage(url, video_id) - - m_error = re.search( - r'(?s)Reason for deletion:.*?]*>(?P[^<]+)', orig_webpage) - if m_error: - raise ExtractorError(m_error.group('msg'), expected=True) - - data = self._hidden_inputs(orig_webpage) - - self._sleep(2, video_id) - - post = urlencode_postdata(data) - headers = { - b'Content-Type': b'application/x-www-form-urlencoded', - } - req = sanitized_Request(url, post, headers) - webpage = self._download_webpage( - req, video_id, note='Downloading video page ...') - - title = os.path.splitext(data['fname'])[0] - - video_url = self._search_regex( - r'file: "?(.+?)",', webpage, 'video URL') - - return { - 'id': video_id, - 'title': title, - 'url': video_url, - } diff --git a/youtube_dl/extractor/pluralsight.py 
b/youtube_dl/extractor/pluralsight.py index 9aab77645..ea5caefa9 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -1,9 +1,10 @@ from __future__ import unicode_literals -import re -import json -import random import collections +import json +import os +import random +import re from .common import InfoExtractor from ..compat import ( @@ -12,10 +13,11 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + float_or_none, int_or_none, parse_duration, qualities, - sanitized_Request, + srt_subtitles_timecode, urlencode_postdata, ) @@ -75,12 +77,10 @@ class PluralsightIE(PluralsightBaseIE): if not post_url.startswith('http'): post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) - request = sanitized_Request( - post_url, urlencode_postdata(login_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - response = self._download_webpage( - request, None, 'Logging in as %s' % username) + post_url, None, 'Logging in as %s' % username, + data=urlencode_postdata(login_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) error = self._search_regex( r']+class="field-validation-error"[^>]*>([^<]+)', @@ -91,6 +91,53 @@ class PluralsightIE(PluralsightBaseIE): if all(p not in response for p in ('__INITIAL_STATE__', '"currentUser"')): raise ExtractorError('Unable to log in') + def _get_subtitles(self, author, clip_id, lang, name, duration, video_id): + captions_post = { + 'a': author, + 'cn': clip_id, + 'lc': lang, + 'm': name, + } + captions = self._download_json( + '%s/training/Player/Captions' % self._API_BASE, video_id, + 'Downloading captions JSON', 'Unable to download captions JSON', + fatal=False, data=json.dumps(captions_post).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=utf-8'}) + if captions: + return { + lang: [{ + 'ext': 'json', + 'data': json.dumps(captions), + }, { + 'ext': 'srt', + 'data': self._convert_subtitles(duration, captions), + }] 
+ } + + @staticmethod + def _convert_subtitles(duration, subs): + srt = '' + for num, current in enumerate(subs): + current = subs[num] + start, text = float_or_none( + current.get('DisplayTimeOffset')), current.get('Text') + if start is None or text is None: + continue + end = duration if num == len(subs) - 1 else float_or_none( + subs[num + 1].get('DisplayTimeOffset')) + if end is None: + continue + srt += os.linesep.join( + ( + '%d' % num, + '%s --> %s' % ( + srt_subtitles_timecode(start), + srt_subtitles_timecode(end)), + text, + os.linesep, + )) + return srt + def _real_extract(self, url): qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) @@ -138,6 +185,8 @@ class PluralsightIE(PluralsightBaseIE): if not clip: raise ExtractorError('Unable to resolve clip') + title = '%s - %s' % (module['title'], clip['title']) + QUALITIES = { 'low': {'width': 640, 'height': 480}, 'medium': {'width': 848, 'height': 640}, @@ -196,13 +245,12 @@ class PluralsightIE(PluralsightBaseIE): 'mt': ext, 'q': '%dx%d' % (f['width'], f['height']), } - request = sanitized_Request( - '%s/training/Player/ViewClip' % self._API_BASE, - json.dumps(clip_post).encode('utf-8')) - request.add_header('Content-Type', 'application/json;charset=utf-8') format_id = '%s-%s' % (ext, quality) clip_url = self._download_webpage( - request, display_id, 'Downloading %s URL' % format_id, fatal=False) + '%s/training/Player/ViewClip' % self._API_BASE, display_id, + 'Downloading %s URL' % format_id, fatal=False, + data=json.dumps(clip_post).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=utf-8'}) # Pluralsight tracks multiple sequential calls to ViewClip API and start # to return 429 HTTP errors after some time (see @@ -225,18 +273,20 @@ class PluralsightIE(PluralsightBaseIE): formats.append(f) self._sort_formats(formats) - # TODO: captions - # http://www.pluralsight.com/training/Player/ViewClip + cap = true - # or - # http://www.pluralsight.com/training/Player/Captions - # { a 
= author, cn = clip_id, lc = end, m = name } + duration = int_or_none( + clip.get('duration')) or parse_duration(clip.get('formattedDuration')) + + # TODO: other languages? + subtitles = self.extract_subtitles( + author, clip_id, 'en', name, duration, display_id) return { 'id': clip.get('clipName') or clip['name'], - 'title': '%s - %s' % (module['title'], clip['title']), - 'duration': int_or_none(clip.get('duration')) or parse_duration(clip.get('formattedDuration')), + 'title': title, + 'duration': duration, 'creator': author, - 'formats': formats + 'formats': formats, + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/pokemon.py b/youtube_dl/extractor/pokemon.py new file mode 100644 index 000000000..2d87e7e70 --- /dev/null +++ b/youtube_dl/extractor/pokemon.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, +) + + +class PokemonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pokemon\.com/[a-z]{2}(?:.*?play=(?P[a-z0-9]{32})|/[^/]+/\d+_\d+-(?P[^/?#]+))' + _TESTS = [{ + 'url': 'http://www.pokemon.com/us/pokemon-episodes/19_01-from-a-to-z/?play=true', + 'md5': '9fb209ae3a569aac25de0f5afc4ee08f', + 'info_dict': { + 'id': 'd0436c00c3ce4071ac6cee8130ac54a1', + 'ext': 'mp4', + 'title': 'From A to Z!', + 'description': 'Bonnie makes a new friend, Ash runs into an old friend, and a terrifying premonition begins to unfold!', + 'timestamp': 1460478136, + 'upload_date': '20160412', + }, + 'add_id': ['LimelightMedia'] + }, { + 'url': 'http://www.pokemon.com/uk/pokemon-episodes/?play=2e8b5c761f1d4a9286165d7748c1ece2', + 'only_matching': True, + }, { + 'url': 'http://www.pokemon.com/fr/episodes-pokemon/18_09-un-hiver-inattendu/', + 'only_matching': True, + }, { + 'url': 'http://www.pokemon.com/de/pokemon-folgen/01_20-bye-bye-smettbo/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, display_id = 
re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, video_id or display_id) + video_data = extract_attributes(self._search_regex( + r'(<[^>]+data-video-id="%s"[^>]*>)' % (video_id if video_id else '[a-z0-9]{32}'), + webpage, 'video data element')) + video_id = video_data['data-video-id'] + title = video_data['data-video-title'] + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': 'limelight:media:%s' % video_id, + 'title': title, + 'description': video_data.get('data-video-summary'), + 'thumbnail': video_data.get('data-video-poster'), + 'series': 'Pokémon', + 'season_number': int_or_none(video_data.get('data-video-season')), + 'episode': title, + 'episode_number': int_or_none(video_data.get('data-video-episode')), + 'ie_key': 'LimelightMedia', + } diff --git a/youtube_dl/extractor/porncom.py b/youtube_dl/extractor/porncom.py new file mode 100644 index 000000000..d85e0294d --- /dev/null +++ b/youtube_dl/extractor/porncom.py @@ -0,0 +1,100 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + int_or_none, + js_to_json, + parse_filesize, + str_to_int, +) + + +class PornComIE(InfoExtractor): + _VALID_URL = r'https?://(?:[a-zA-Z]+\.)?porn\.com/videos/(?:(?P[^/]+)-)?(?P\d+)' + _TESTS = [{ + 'url': 'http://www.porn.com/videos/teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec-2603339', + 'md5': '3f30ce76267533cd12ba999263156de7', + 'info_dict': { + 'id': '2603339', + 'display_id': 'teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec', + 'ext': 'mp4', + 'title': 'Teen grabs a dildo and fucks her pussy live on 1hottie, I rec', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 551, + 'view_count': int, + 'age_limit': 18, + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'http://se.porn.com/videos/marsha-may-rides-seth-on-top-of-his-thick-cock-2658067', + 'only_matching': True, + }] + + def 
_real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) + + config = self._parse_json( + self._search_regex( + r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*=', + webpage, 'config', default='{}'), + display_id, transform_source=js_to_json, fatal=False) + + if config: + title = config['title'] + formats = [{ + 'url': stream['url'], + 'format_id': stream.get('id'), + 'height': int_or_none(self._search_regex( + r'^(\d+)[pP]', stream.get('id') or '', 'height', default=None)) + } for stream in config['streams'] if stream.get('url')] + thumbnail = (compat_urlparse.urljoin( + config['thumbCDN'], config['poster']) + if config.get('thumbCDN') and config.get('poster') else None) + duration = int_or_none(config.get('length')) + else: + title = self._search_regex( + (r'([^<]+)', r']*>([^<]+)'), + webpage, 'title') + formats = [{ + 'url': compat_urlparse.urljoin(url, format_url), + 'format_id': '%sp' % height, + 'height': int(height), + 'filesize_approx': parse_filesize(filesize), + } for format_url, height, filesize in re.findall( + r']+href="(/download/[^"]+)">MPEG4 (\d+)p]*>(\d+\s+[a-zA-Z]+)<', + webpage)] + thumbnail = None + duration = None + + self._sort_formats(formats) + + view_count = str_to_int(self._search_regex( + r'class=["\']views["\'][^>]*>

([\d,.]+)', webpage, + 'view count', fatal=False)) + + def extract_list(kind): + s = self._search_regex( + r'(?s)]*>%s:(.+?)

' % kind.capitalize(), + webpage, kind, fatal=False) + return re.findall(r']+>([^<]+)', s or '') + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + 'age_limit': 18, + 'categories': extract_list('categories'), + 'tags': extract_list('tags'), + } diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index d2c92531b..20976c101 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -111,7 +111,7 @@ class PornHubIE(InfoExtractor): webpage = self._download_webpage(req, video_id) error_msg = self._html_search_regex( - r'(?s)]+class=(["\']).*?\b(?:removed|userMessageSection)\b.*?\1[^>]*>(?P.+?)
', + r'(?s)]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P.+?)', webpage, 'error message', default=None, group='error') if error_msg: error_msg = re.sub(r'\s+', ' ', error_msg) diff --git a/youtube_dl/extractor/pornotube.py b/youtube_dl/extractor/pornotube.py index 5398e708b..63816c358 100644 --- a/youtube_dl/extractor/pornotube.py +++ b/youtube_dl/extractor/pornotube.py @@ -3,10 +3,7 @@ from __future__ import unicode_literals import json from .common import InfoExtractor -from ..utils import ( - int_or_none, - sanitized_Request, -) +from ..utils import int_or_none class PornotubeIE(InfoExtractor): @@ -31,59 +28,55 @@ class PornotubeIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - # Fetch origin token - js_config = self._download_webpage( - 'http://www.pornotube.com/assets/src/app/config.js', video_id, - note='Download JS config') - originAuthenticationSpaceKey = self._search_regex( - r"constant\('originAuthenticationSpaceKey',\s*'([^']+)'", - js_config, 'originAuthenticationSpaceKey') + token = self._download_json( + 'https://api.aebn.net/auth/v2/origins/authenticate', + video_id, note='Downloading token', + data=json.dumps({'credentials': 'Clip Application'}).encode('utf-8'), + headers={ + 'Content-Type': 'application/json', + 'Origin': 'http://www.pornotube.com', + })['tokenKey'] - # Fetch actual token - token_req_data = { - 'authenticationSpaceKey': originAuthenticationSpaceKey, - 'credentials': 'Clip Application', - } - token_req = sanitized_Request( - 'https://api.aebn.net/auth/v1/token/primal', - data=json.dumps(token_req_data).encode('utf-8')) - token_req.add_header('Content-Type', 'application/json') - token_req.add_header('Origin', 'http://www.pornotube.com') - token_answer = self._download_json( - token_req, video_id, note='Requesting primal token') - token = token_answer['tokenKey'] + video_url = self._download_json( + 'https://api.aebn.net/delivery/v1/clips/%s/MP4' % video_id, + 
video_id, note='Downloading delivery information', + headers={'Authorization': token})['mediaUrl'] - # Get video URL - delivery_req = sanitized_Request( - 'https://api.aebn.net/delivery/v1/clips/%s/MP4' % video_id) - delivery_req.add_header('Authorization', token) - delivery_info = self._download_json( - delivery_req, video_id, note='Downloading delivery information') - video_url = delivery_info['mediaUrl'] + FIELDS = ( + 'title', 'description', 'startSecond', 'endSecond', 'publishDate', + 'studios{name}', 'categories{name}', 'movieId', 'primaryImageNumber' + ) - # Get additional info (title etc.) - info_req = sanitized_Request( - 'https://api.aebn.net/content/v1/clips/%s?expand=' - 'title,description,primaryImageNumber,startSecond,endSecond,' - 'movie.title,movie.MovieId,movie.boxCoverFront,movie.stars,' - 'movie.studios,stars.name,studios.name,categories.name,' - 'clipActive,movieActive,publishDate,orientations' % video_id) - info_req.add_header('Authorization', token) info = self._download_json( - info_req, video_id, note='Downloading metadata') + 'https://api.aebn.net/content/v2/clips/%s?fields=%s' + % (video_id, ','.join(FIELDS)), video_id, + note='Downloading metadata', + headers={'Authorization': token}) + + if isinstance(info, list): + info = info[0] + + title = info['title'] timestamp = int_or_none(info.get('publishDate'), scale=1000) uploader = info.get('studios', [{}])[0].get('name') - movie_id = info['movie']['movieId'] - thumbnail = 'http://pic.aebn.net/dis/t/%s/%s_%08d.jpg' % ( - movie_id, movie_id, info['primaryImageNumber']) - categories = [c['name'] for c in info.get('categories')] + movie_id = info.get('movieId') + primary_image_number = info.get('primaryImageNumber') + thumbnail = None + if movie_id and primary_image_number: + thumbnail = 'http://pic.aebn.net/dis/t/%s/%s_%08d.jpg' % ( + movie_id, movie_id, primary_image_number) + start = int_or_none(info.get('startSecond')) + end = int_or_none(info.get('endSecond')) + duration = end - start if 
start and end else None + categories = [c['name'] for c in info.get('categories', []) if c.get('name')] return { 'id': video_id, 'url': video_url, - 'title': info['title'], + 'title': title, 'description': info.get('description'), + 'duration': duration, 'timestamp': timestamp, 'uploader': uploader, 'thumbnail': thumbnail, diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py index cc0416cb8..b8ac93a62 100644 --- a/youtube_dl/extractor/pyvideo.py +++ b/youtube_dl/extractor/pyvideo.py @@ -1,59 +1,72 @@ from __future__ import unicode_literals import re -import os from .common import InfoExtractor +from ..compat import compat_str +from ..utils import int_or_none class PyvideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pyvideo\.org/video/(?P\d+)/(.*)' + _VALID_URL = r'https?://(?:www\.)?pyvideo\.org/(?P[^/]+)/(?P[^/?#&.]+)' - _TESTS = [ - { - 'url': 'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes', - 'md5': '520915673e53a5c5d487c36e0c4d85b5', - 'info_dict': { - 'id': '24_4WWkSmNo', - 'ext': 'webm', - 'title': 'Become a logging expert in 30 minutes', - 'description': 'md5:9665350d466c67fb5b1598de379021f7', - 'upload_date': '20130320', - 'uploader': 'Next Day Video', - 'uploader_id': 'NextDayVideo', - }, - 'add_ie': ['Youtube'], + _TESTS = [{ + 'url': 'http://pyvideo.org/pycon-us-2013/become-a-logging-expert-in-30-minutes.html', + 'info_dict': { + 'id': 'become-a-logging-expert-in-30-minutes', }, - { - 'url': 'http://pyvideo.org/video/2542/gloriajw-spotifywitherikbernhardsson182m4v', - 'md5': '5fe1c7e0a8aa5570330784c847ff6d12', - 'info_dict': { - 'id': '2542', - 'ext': 'm4v', - 'title': 'Gloriajw-SpotifyWithErikBernhardsson182', - }, + 'playlist_count': 2, + }, { + 'url': 'http://pyvideo.org/pygotham-2012/gloriajw-spotifywitherikbernhardsson182m4v.html', + 'md5': '5fe1c7e0a8aa5570330784c847ff6d12', + 'info_dict': { + 'id': '2542', + 'ext': 'm4v', + 'title': 'Gloriajw-SpotifyWithErikBernhardsson182.m4v', }, - ] + }] 
def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + category = mobj.group('category') video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) + entries = [] - m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', webpage) - if m_youtube is not None: - return self.url_result(m_youtube.group(1), 'Youtube') + data = self._download_json( + 'https://raw.githubusercontent.com/pyvideo/data/master/%s/videos/%s.json' + % (category, video_id), video_id, fatal=False) - title = self._html_search_regex( - r'
\s*]*)?>([^>]+?)', - webpage, 'title', flags=re.DOTALL) - video_url = self._search_regex( - [r'Download.*?', webpage, 'media urls') + for m in re.finditer( + r']+href=(["\'])(?Phttp.+?)\1', media_urls): + media_url = m.group('url') + if re.match(r'https?://www\.youtube\.com/watch\?v=.*', media_url): + entries.append(self.url_result(media_url, 'Youtube')) + else: + entries.append({ + 'id': video_id, + 'url': media_url, + 'title': title, + }) - return { - 'id': video_id, - 'title': os.path.splitext(title)[0], - 'url': video_url, - } + return self.playlist_result(entries, video_id) diff --git a/youtube_dl/extractor/radiobremen.py b/youtube_dl/extractor/radiobremen.py index 0cbb15f08..19a751da0 100644 --- a/youtube_dl/extractor/radiobremen.py +++ b/youtube_dl/extractor/radiobremen.py @@ -13,15 +13,15 @@ class RadioBremenIE(InfoExtractor): IE_NAME = 'radiobremen' _TEST = { - 'url': 'http://www.radiobremen.de/mediathek/index.html?id=114720', + 'url': 'http://www.radiobremen.de/mediathek/?id=141876', 'info_dict': { - 'id': '114720', + 'id': '141876', 'ext': 'mp4', - 'duration': 1685, + 'duration': 178, 'width': 512, - 'title': 'buten un binnen vom 22. Dezember', + 'title': 'Druck auf Patrick Öztürk', 'thumbnail': 're:https?://.*\.jpg$', - 'description': 'Unter anderem mit diesen Themen: 45 Flüchtlinge sind in Worpswede angekommen +++ Freies Internet für alle: Bremer arbeiten an einem flächendeckenden W-Lan-Netzwerk +++ Aktivisten kämpfen für das Unibad +++ So war das Wetter 2014 +++', + 'description': 'Gegen den SPD-Bürgerschaftsabgeordneten Patrick Öztürk wird wegen Beihilfe zum gewerbsmäßigen Betrug ermittelt. 
Am Donnerstagabend sollte er dem Vorstand des SPD-Unterbezirks Bremerhaven dazu Rede und Antwort stehen.', }, } diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py index 7932af6ef..471928ef8 100644 --- a/youtube_dl/extractor/rbmaradio.py +++ b/youtube_dl/extractor/rbmaradio.py @@ -1,55 +1,71 @@ -# encoding: utf-8 from __future__ import unicode_literals -import json import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - ExtractorError, + clean_html, + int_or_none, + unified_timestamp, + update_url_query, ) class RBMARadioIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P[^/]+)$' + _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P[^/]+)/episodes/(?P[^/?#&]+)' _TEST = { - 'url': 'http://www.rbmaradio.com/shows/ford-lopatin-live-at-primavera-sound-2011', + 'url': 'https://www.rbmaradio.com/shows/main-stage/episodes/ford-lopatin-live-at-primavera-sound-2011', 'md5': '6bc6f9bcb18994b4c983bc3bf4384d95', 'info_dict': { 'id': 'ford-lopatin-live-at-primavera-sound-2011', 'ext': 'mp3', - 'uploader_id': 'ford-lopatin', - 'location': 'Spain', - 'description': 'Joel Ford and Daniel ’Oneohtrix Point Never’ Lopatin fly their midified pop extravaganza to Spain. 
Live at Primavera Sound 2011.', - 'uploader': 'Ford & Lopatin', - 'title': 'Live at Primavera Sound 2011', + 'title': 'Main Stage - Ford & Lopatin', + 'description': 'md5:4f340fb48426423530af5a9d87bd7b91', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 2452, + 'timestamp': 1307103164, + 'upload_date': '20110603', }, } def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('videoID') + mobj = re.match(self._VALID_URL, url) + show_id = mobj.group('show_id') + episode_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, episode_id) - json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$', - webpage, 'json data', flags=re.MULTILINE) + episode = self._parse_json( + self._search_regex( + r'__INITIAL_STATE__\s*=\s*({.+?})\s*', + webpage, 'json data'), + episode_id)['episodes'][show_id][episode_id] - try: - data = json.loads(json_data) - except ValueError as e: - raise ExtractorError('Invalid JSON: ' + str(e)) + title = episode['title'] - video_url = data['akamai_url'] + '&cbr=256' + show_title = episode.get('showTitle') + if show_title: + title = '%s - %s' % (show_title, title) + + formats = [{ + 'url': update_url_query(episode['audioURL'], query={'cbr': abr}), + 'format_id': compat_str(abr), + 'abr': abr, + 'vcodec': 'none', + } for abr in (96, 128, 256)] + + description = clean_html(episode.get('longTeaser')) + thumbnail = self._proto_relative_url(episode.get('imageURL', {}).get('landscape')) + duration = int_or_none(episode.get('duration')) + timestamp = unified_timestamp(episode.get('publishedAt')) return { - 'id': video_id, - 'url': video_url, - 'title': data['title'], - 'description': data.get('teaser_text'), - 'location': data.get('country_of_origin'), - 'uploader': data.get('host', {}).get('name'), - 'uploader_id': data.get('host', {}).get('slug'), - 'thumbnail': data.get('image', {}).get('large_url_2x'), - 'duration': data.get('duration'), + 'id': 
episode_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, } diff --git a/youtube_dl/extractor/rozhlas.py b/youtube_dl/extractor/rozhlas.py new file mode 100644 index 000000000..f8eda8dea --- /dev/null +++ b/youtube_dl/extractor/rozhlas.py @@ -0,0 +1,50 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + remove_start, +) + + +class RozhlasIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?prehravac\.rozhlas\.cz/audio/(?P[0-9]+)' + _TESTS = [{ + 'url': 'http://prehravac.rozhlas.cz/audio/3421320', + 'md5': '504c902dbc9e9a1fd50326eccf02a7e2', + 'info_dict': { + 'id': '3421320', + 'ext': 'mp3', + 'title': 'Echo Pavla Klusáka (30.06.2015 21:00)', + 'description': 'Osmdesátiny Terryho Rileyho jsou skvělou příležitostí proletět se elektronickými i akustickými díly zakladatatele minimalismu, který je aktivní už přes padesát let' + } + }, { + 'url': 'http://prehravac.rozhlas.cz/audio/3421320/embed', + 'skip_download': True, + }] + + def _real_extract(self, url): + audio_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://prehravac.rozhlas.cz/audio/%s' % audio_id, audio_id) + + title = self._html_search_regex( + r'

(.+?)

\s*]*>.*?

\s*]+id=["\']player-track', + webpage, 'title', default=None) or remove_start( + self._og_search_title(webpage), 'Radio Wave - ') + description = self._html_search_regex( + r']+title=(["\'])(?P(?:(?!\1).)+)\1[^>]*>.*?

\s*]+id=["\']player-track', + webpage, 'description', fatal=False, group='url') + duration = int_or_none(self._search_regex( + r'data-duration=["\'](\d+)', webpage, 'duration', default=None)) + + return { + 'id': audio_id, + 'url': 'http://media.rozhlas.cz/_audio/%s.mp3' % audio_id, + 'title': title, + 'description': description, + 'duration': duration, + 'vcodec': 'none', + } diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index 4d612b5e3..f0250af8a 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -14,7 +14,7 @@ class RtlNlIE(InfoExtractor): _VALID_URL = r'''(?x) https?://(?:www\.)? (?: - rtlxl\.nl/\#!/[^/]+/| + rtlxl\.nl/[^\#]*\#!/[^/]+/| rtl\.nl/system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html\b.+?\buuid= ) (?P[0-9a-f-]+)''' @@ -67,6 +67,9 @@ class RtlNlIE(InfoExtractor): }, { 'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0', 'only_matching': True, + }, { + 'url': 'http://rtlxl.nl/?_ga=1.204735956.572365465.1466978370#!/rtl-nieuws-132237/3c487912-023b-49ac-903e-2c5d79f8410f', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index d33b05f5d..34f9c4a99 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -113,6 +113,8 @@ class RTVEALaCartaIE(InfoExtractor): png = self._download_webpage(png_request, video_id, 'Downloading url information') video_url = _decrypt_url(png) if not video_url.endswith('.f4m'): + if '?' 
not in video_url: + video_url = video_url.replace('resources/', 'auth/resources/') video_url = video_url.replace('.net.rtve', '.multimedia.cdn.rtve') subtitles = None diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 6ba91f202..08ddbe3c4 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -75,7 +75,7 @@ class SafariBaseIE(InfoExtractor): class SafariIE(SafariBaseIE): IE_NAME = 'safari' IE_DESC = 'safaribooksonline.com online video' - _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/library/view/[^/]+/(?P[^/]+)/(?Ppart\d+)\.html' + _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/library/view/[^/]+/(?P[^/]+)/(?P[^/?#&]+)\.html' _TESTS = [{ 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', @@ -92,6 +92,9 @@ class SafariIE(SafariBaseIE): # non-digits in course id 'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html', 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html', + 'only_matching': True, }] def _real_extract(self, url): @@ -132,12 +135,15 @@ class SafariIE(SafariBaseIE): class SafariApiIE(SafariBaseIE): IE_NAME = 'safari:api' - _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/api/v1/book/(?P[^/]+)/chapter(?:-content)?/(?Ppart\d+)\.html' + _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/api/v1/book/(?P[^/]+)/chapter(?:-content)?/(?P[^/?#&]+)\.html' - _TEST = { + _TESTS = [{ 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', 'only_matching': True, - } + }, { + 'url': 'https://www.safaribooksonline.com/api/v1/book/9780134664057/chapter/RHCE_Introduction.html', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/sendtonews.py 
b/youtube_dl/extractor/sendtonews.py index 1c636f672..2dbe490bb 100644 --- a/youtube_dl/extractor/sendtonews.py +++ b/youtube_dl/extractor/sendtonews.py @@ -4,33 +4,43 @@ from __future__ import unicode_literals import re from .jwplatform import JWPlatformBaseIE -from ..compat import compat_parse_qs from ..utils import ( - ExtractorError, - parse_duration, + float_or_none, + parse_iso8601, + update_url_query, ) class SendtoNewsIE(JWPlatformBaseIE): - _VALID_URL = r'https?://embed\.sendtonews\.com/player/embed\.php\?(?P[^#]+)' + _VALID_URL = r'https?://embed\.sendtonews\.com/player2/embedplayer\.php\?.*\bSC=(?P[0-9A-Za-z-]+)' _TEST = { # From http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/ - 'url': 'http://embed.sendtonews.com/player/embed.php?SK=GxfCe0Zo7D&MK=175909&PK=5588&autoplay=on&sound=yes', + 'url': 'http://embed.sendtonews.com/player2/embedplayer.php?SC=GxfCe0Zo7D-175909-5588&type=single&autoplay=on&sound=YES', 'info_dict': { - 'id': 'GxfCe0Zo7D-175909-5588', - 'ext': 'mp4', - 'title': 'Recap: CLE 15, CIN 6', - 'description': '5/16/16: Indians\' bats explode for 15 runs in a win', - 'duration': 49, + 'id': 'GxfCe0Zo7D-175909-5588' }, + 'playlist_count': 9, + # test the first video only to prevent lengthy tests + 'playlist': [{ + 'info_dict': { + 'id': '198180', + 'ext': 'mp4', + 'title': 'Recap: CLE 5, LAA 4', + 'description': '8/14/16: Naquin, Almonte lead Indians in 5-4 win', + 'duration': 57.343, + 'thumbnail': 're:https?://.*\.jpg$', + 'upload_date': '20160815', + 'timestamp': 1471221961, + }, + }], 'params': { # m3u8 download 'skip_download': True, }, } - _URL_TEMPLATE = '//embed.sendtonews.com/player/embed.php?SK=%s&MK=%s&PK=%s' + _URL_TEMPLATE = '//embed.sendtonews.com/player2/embedplayer.php?SC=%s' @classmethod def _extract_url(cls, webpage): @@ -39,48 +49,41 @@ class SendtoNewsIE(JWPlatformBaseIE): .*\bSC=(?P[0-9a-zA-Z-]+).* \1>''', webpage) if mobj: - sk, mk, pk = 
mobj.group('SC').split('-') - return cls._URL_TEMPLATE % (sk, mk, pk) + sc = mobj.group('SC') + return cls._URL_TEMPLATE % sc def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - params = compat_parse_qs(mobj.group('query')) + playlist_id = self._match_id(url) - if 'SK' not in params or 'MK' not in params or 'PK' not in params: - raise ExtractorError('Invalid URL', expected=True) + data_url = update_url_query( + url.replace('embedplayer.php', 'data_read.php'), + {'cmd': 'loadInitial'}) + playlist_data = self._download_json(data_url, playlist_id) - video_id = '-'.join([params['SK'][0], params['MK'][0], params['PK'][0]]) + entries = [] + for video in playlist_data['playlistData'][0]: + info_dict = self._parse_jwplayer_data( + video['jwconfiguration'], + require_title=False, rtmp_params={'no_resume': True}) - webpage = self._download_webpage(url, video_id) + thumbnails = [] + if video.get('thumbnailUrl'): + thumbnails.append({ + 'id': 'normal', + 'url': video['thumbnailUrl'], + }) + if video.get('smThumbnailUrl'): + thumbnails.append({ + 'id': 'small', + 'url': video['smThumbnailUrl'], + }) + info_dict.update({ + 'title': video['S_headLine'], + 'description': video.get('S_fullStory'), + 'thumbnails': thumbnails, + 'duration': float_or_none(video.get('SM_length')), + 'timestamp': parse_iso8601(video.get('S_sysDate'), delimiter=' '), + }) + entries.append(info_dict) - jwplayer_data_str = self._search_regex( - r'jwplayer\("[^"]+"\)\.setup\((.+?)\);', webpage, 'JWPlayer data') - js_vars = { - 'w': 1024, - 'h': 768, - 'modeVar': 'html5', - } - for name, val in js_vars.items(): - js_val = '%d' % val if isinstance(val, int) else '"%s"' % val - jwplayer_data_str = jwplayer_data_str.replace(':%s,' % name, ':%s,' % js_val) - - info_dict = self._parse_jwplayer_data( - self._parse_json(jwplayer_data_str, video_id), - video_id, require_title=False, rtmp_params={'no_resume': True}) - - title = self._html_search_regex( - r']+class="embedTitle">([^<]+)
', webpage, 'title') - description = self._html_search_regex( - r']+class="embedSubTitle">([^<]+)', webpage, - 'description', fatal=False) - duration = parse_duration(self._html_search_regex( - r']+class="embedDetails">([0-9:]+)', webpage, - 'duration', fatal=False)) - - info_dict.update({ - 'title': title, - 'description': description, - 'duration': duration, - }) - - return info_dict + return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index e7e5f653e..d592dfeb8 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -6,7 +6,6 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, - sanitized_Request, urlencode_postdata, ) @@ -37,28 +36,33 @@ class SharedIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + + webpage, urlh = self._download_webpage_handle(url, video_id) if '>File does not exist<' in webpage: raise ExtractorError( 'Video %s does not exist' % video_id, expected=True) download_form = self._hidden_inputs(webpage) - request = sanitized_Request( - url, urlencode_postdata(download_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') video_page = self._download_webpage( - request, video_id, 'Downloading video page') + urlh.geturl(), video_id, 'Downloading video page', + data=urlencode_postdata(download_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': urlh.geturl(), + }) video_url = self._html_search_regex( - r'data-url="([^"]+)"', video_page, 'video URL') + r'data-url=(["\'])(?P(?:(?!\1).)+)\1', + video_page, 'video URL', group='url') title = base64.b64decode(self._html_search_meta( 'full:title', webpage, 'title').encode('utf-8')).decode('utf-8') filesize = int_or_none(self._html_search_meta( 'full:size', webpage, 'file size', fatal=False)) thumbnail = self._html_search_regex( - 
r'data-poster="([^"]+)"', video_page, 'thumbnail', default=None) + r'data-poster=(["\'])(?P(?:(?!\1).)+)\1', + video_page, 'thumbnail', default=None, group='url') return { 'id': video_id, diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 5c3fd0fec..114358786 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -13,20 +13,21 @@ from ..utils import ( sanitized_Request, unified_strdate, urlencode_postdata, + xpath_text, ) class SmotriIE(InfoExtractor): IE_DESC = 'Smotri.com' IE_NAME = 'smotri' - _VALID_URL = r'^https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?Pv(?P[0-9]+)[a-z0-9]{4})' + _VALID_URL = r'https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?Pv(?P[0-9]+)[a-z0-9]{4})' _NETRC_MACHINE = 'smotri' _TESTS = [ # real video id 2610366 { 'url': 'http://smotri.com/video/view/?id=v261036632ab', - 'md5': '2a7b08249e6f5636557579c368040eb9', + 'md5': '02c0dfab2102984e9c5bb585cc7cc321', 'info_dict': { 'id': 'v261036632ab', 'ext': 'mp4', @@ -174,11 +175,11 @@ class SmotriIE(InfoExtractor): if video_password: video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest() - request = sanitized_Request( - 'http://smotri.com/video/view/url/bot/', urlencode_postdata(video_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - - video = self._download_json(request, video_id, 'Downloading video JSON') + video = self._download_json( + 'http://smotri.com/video/view/url/bot/', + video_id, 'Downloading video JSON', + data=urlencode_postdata(video_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) video_url = video.get('_vidURL') or video.get('_vidURL_mp4') @@ -196,11 +197,11 @@ class SmotriIE(InfoExtractor): raise ExtractorError(msg, expected=True) title = video['title'] - thumbnail = video['_imgURL'] - upload_date = 
unified_strdate(video['added']) - uploader = video['userNick'] - uploader_id = video['userLogin'] - duration = int_or_none(video['duration']) + thumbnail = video.get('_imgURL') + upload_date = unified_strdate(video.get('added')) + uploader = video.get('userNick') + uploader_id = video.get('userLogin') + duration = int_or_none(video.get('duration')) # Video JSON does not provide enough meta data # We will extract some from the video web page instead @@ -209,7 +210,7 @@ class SmotriIE(InfoExtractor): # Warning if video is unavailable warning = self._html_search_regex( - r'
(.*?)
', webpage, + r']+class="videoUnModer"[^>]*>(.+?)', webpage, 'warning message', default=None) if warning is not None: self._downloader.report_warning( @@ -217,20 +218,22 @@ class SmotriIE(InfoExtractor): (video_id, warning)) # Adult content - if re.search('EroConfirmText">', webpage) is not None: + if 'EroConfirmText">' in webpage: self.report_age_confirmation() confirm_string = self._html_search_regex( - r'
' % video_id, + r']+href="/video/view/\?id=%s&confirm=([^"]+)"' % video_id, webpage, 'confirm string') confirm_url = webpage_url + '&confirm=%s' % confirm_string - webpage = self._download_webpage(confirm_url, video_id, 'Downloading video page (age confirmed)') + webpage = self._download_webpage( + confirm_url, video_id, + 'Downloading video page (age confirmed)') adult_content = True else: adult_content = False view_count = self._html_search_regex( - 'Общее количество просмотров.*?(\\d+)', - webpage, 'view count', fatal=False, flags=re.MULTILINE | re.DOTALL) + r'(?s)Общее количество просмотров.*?(\d+)', + webpage, 'view count', fatal=False) return { 'id': video_id, @@ -249,37 +252,33 @@ class SmotriIE(InfoExtractor): class SmotriCommunityIE(InfoExtractor): IE_DESC = 'Smotri.com community videos' IE_NAME = 'smotri:community' - _VALID_URL = r'^https?://(?:www\.)?smotri\.com/community/video/(?P[0-9A-Za-z_\'-]+)' + _VALID_URL = r'https?://(?:www\.)?smotri\.com/community/video/(?P[0-9A-Za-z_\'-]+)' _TEST = { 'url': 'http://smotri.com/community/video/kommuna', 'info_dict': { 'id': 'kommuna', - 'title': 'КПРФ', }, 'playlist_mincount': 4, } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - community_id = mobj.group('communityid') + community_id = self._match_id(url) - url = 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id - rss = self._download_xml(url, community_id, 'Downloading community RSS') + rss = self._download_xml( + 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id, + community_id, 'Downloading community RSS') - entries = [self.url_result(video_url.text, 'Smotri') - for video_url in rss.findall('./channel/item/link')] + entries = [ + self.url_result(video_url.text, SmotriIE.ie_key()) + for video_url in rss.findall('./channel/item/link')] - description_text = rss.find('./channel/description').text - community_title = self._html_search_regex( - '^Видео сообщества "([^"]+)"$', 
description_text, 'community title') - - return self.playlist_result(entries, community_id, community_title) + return self.playlist_result(entries, community_id) class SmotriUserIE(InfoExtractor): IE_DESC = 'Smotri.com user videos' IE_NAME = 'smotri:user' - _VALID_URL = r'^https?://(?:www\.)?smotri\.com/user/(?P[0-9A-Za-z_\'-]+)' + _VALID_URL = r'https?://(?:www\.)?smotri\.com/user/(?P[0-9A-Za-z_\'-]+)' _TESTS = [{ 'url': 'http://smotri.com/user/inspector', 'info_dict': { @@ -290,19 +289,19 @@ class SmotriUserIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - user_id = mobj.group('userid') + user_id = self._match_id(url) - url = 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id - rss = self._download_xml(url, user_id, 'Downloading user RSS') + rss = self._download_xml( + 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id, + user_id, 'Downloading user RSS') entries = [self.url_result(video_url.text, 'Smotri') for video_url in rss.findall('./channel/item/link')] - description_text = rss.find('./channel/description').text - user_nickname = self._html_search_regex( - '^Видео режиссера (.*)$', description_text, - 'user nickname') + description_text = xpath_text(rss, './channel/description') or '' + user_nickname = self._search_regex( + '^Видео режиссера (.+)$', description_text, + 'user nickname', fatal=False) return self.playlist_result(entries, user_id, user_nickname) @@ -310,11 +309,11 @@ class SmotriUserIE(InfoExtractor): class SmotriBroadcastIE(InfoExtractor): IE_DESC = 'Smotri.com broadcasts' IE_NAME = 'smotri:broadcast' - _VALID_URL = r'^https?://(?:www\.)?(?Psmotri\.com/live/(?P[^/]+))/?.*' + _VALID_URL = r'https?://(?:www\.)?(?Psmotri\.com/live/(?P[^/]+))/?.*' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - broadcast_id = mobj.group('broadcastid') + broadcast_id = mobj.group('id') broadcast_url = 'http://' + mobj.group('url') broadcast_page = 
self._download_webpage(broadcast_url, broadcast_id, 'Downloading broadcast page') @@ -328,7 +327,8 @@ class SmotriBroadcastIE(InfoExtractor): (username, password) = self._get_login_info() if username is None: - self.raise_login_required('Erotic broadcasts allowed only for registered users') + self.raise_login_required( + 'Erotic broadcasts allowed only for registered users') login_form = { 'login-hint53': '1', @@ -343,8 +343,9 @@ class SmotriBroadcastIE(InfoExtractor): broadcast_page = self._download_webpage( request, broadcast_id, 'Logging in and confirming age') - if re.search('>Неверный логин или пароль<', broadcast_page) is not None: - raise ExtractorError('Unable to log in: bad username or password', expected=True) + if '>Неверный логин или пароль<' in broadcast_page: + raise ExtractorError( + 'Unable to log in: bad username or password', expected=True) adult_content = True else: @@ -383,11 +384,11 @@ class SmotriBroadcastIE(InfoExtractor): broadcast_playpath = broadcast_json['_streamName'] broadcast_app = '%s/%s' % (mobj.group('app'), broadcast_json['_vidURL']) - broadcast_thumbnail = broadcast_json['_imgURL'] + broadcast_thumbnail = broadcast_json.get('_imgURL') broadcast_title = self._live_title(broadcast_json['title']) - broadcast_description = broadcast_json['description'] - broadcaster_nick = broadcast_json['nick'] - broadcaster_login = broadcast_json['login'] + broadcast_description = broadcast_json.get('description') + broadcaster_nick = broadcast_json.get('nick') + broadcaster_login = broadcast_json.get('login') rtmp_conn = 'S:%s' % uuid.uuid4().hex except KeyError: if protected_broadcast: diff --git a/youtube_dl/extractor/snotr.py b/youtube_dl/extractor/snotr.py index 0d1ab07f8..4819fe5b4 100644 --- a/youtube_dl/extractor/snotr.py +++ b/youtube_dl/extractor/snotr.py @@ -5,9 +5,9 @@ import re from .common import InfoExtractor from ..utils import ( - float_or_none, - str_to_int, parse_duration, + parse_filesize, + str_to_int, ) @@ -17,21 +17,24 @@ 
class SnotrIE(InfoExtractor): 'url': 'http://www.snotr.com/video/13708/Drone_flying_through_fireworks', 'info_dict': { 'id': '13708', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Drone flying through fireworks!', - 'duration': 247, - 'filesize_approx': 98566144, + 'duration': 248, + 'filesize_approx': 40700000, 'description': 'A drone flying through Fourth of July Fireworks', - } + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'expected_warnings': ['description'], }, { 'url': 'http://www.snotr.com/video/530/David_Letteman_-_George_W_Bush_Top_10', 'info_dict': { 'id': '530', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'David Letteman - George W. Bush Top 10', 'duration': 126, - 'filesize_approx': 8912896, + 'filesize_approx': 8500000, 'description': 'The top 10 George W. Bush moments, brought to you by David Letterman!', + 'thumbnail': 're:^https?://.*\.jpg$', } }] @@ -43,26 +46,28 @@ class SnotrIE(InfoExtractor): title = self._og_search_title(webpage) description = self._og_search_description(webpage) - video_url = 'http://cdn.videos.snotr.com/%s.flv' % video_id + info_dict = self._parse_html5_media_entries( + url, webpage, video_id, m3u8_entry_protocol='m3u8_native')[0] view_count = str_to_int(self._html_search_regex( - r'

\nViews:\n([\d,\.]+)

', + r']*>\s*]*>Views:\s*]*>([\d,\.]+)', webpage, 'view count', fatal=False)) duration = parse_duration(self._html_search_regex( - r'

\nLength:\n\s*([0-9:]+).*?

', + r']*>\s*]*>Length:\s*]*>([\d:]+)', webpage, 'duration', fatal=False)) - filesize_approx = float_or_none(self._html_search_regex( - r'

\nFilesize:\n\s*([0-9.]+)\s*megabyte

', - webpage, 'filesize', fatal=False), invscale=1024 * 1024) + filesize_approx = parse_filesize(self._html_search_regex( + r']*>\s*]*>Filesize:\s*]*>([^<]+)', + webpage, 'filesize', fatal=False)) - return { + info_dict.update({ 'id': video_id, 'description': description, 'title': title, - 'url': video_url, 'view_count': view_count, 'duration': duration, 'filesize_approx': filesize_approx, - } + }) + + return info_dict diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index 72fe66142..48e2ba2dd 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -14,10 +14,10 @@ from ..utils import ExtractorError class SohuIE(InfoExtractor): _VALID_URL = r'https?://(?Pmy\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P\d+)\.shtml.*?' + # Sohu videos give different MD5 sums on Travis CI and my machine _TESTS = [{ 'note': 'This video is available only in Mainland China', 'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super', - 'md5': '29175c8cadd8b5cc4055001e85d6b372', 'info_dict': { 'id': '382479172', 'ext': 'mp4', @@ -26,7 +26,6 @@ class SohuIE(InfoExtractor): 'skip': 'On available in China', }, { 'url': 'http://tv.sohu.com/20150305/n409385080.shtml', - 'md5': '699060e75cf58858dd47fb9c03c42cfb', 'info_dict': { 'id': '409385080', 'ext': 'mp4', @@ -34,7 +33,6 @@ class SohuIE(InfoExtractor): } }, { 'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml', - 'md5': '9bf34be48f2f4dadcb226c74127e203c', 'info_dict': { 'id': '78693464', 'ext': 'mp4', @@ -48,7 +46,6 @@ class SohuIE(InfoExtractor): 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', }, 'playlist': [{ - 'md5': 'bdbfb8f39924725e6589c146bc1883ad', 'info_dict': { 'id': '78910339_part1', 'ext': 'mp4', @@ -56,7 +53,6 @@ class SohuIE(InfoExtractor): 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', } }, { - 'md5': '3e1f46aaeb95354fd10e7fca9fc1804e', 'info_dict': { 'id': '78910339_part2', 'ext': 'mp4', @@ -64,7 +60,6 @@ class SohuIE(InfoExtractor): 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', } }, { - 'md5': 
'8407e634175fdac706766481b9443450', 'info_dict': { 'id': '78910339_part3', 'ext': 'mp4', diff --git a/youtube_dl/extractor/sonyliv.py b/youtube_dl/extractor/sonyliv.py new file mode 100644 index 000000000..accd112aa --- /dev/null +++ b/youtube_dl/extractor/sonyliv.py @@ -0,0 +1,34 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class SonyLIVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/details/[^/]+/(?P\d+)' + _TESTS = [{ + 'url': "http://www.sonyliv.com/details/episodes/5024612095001/Ep.-1---Achaari-Cheese-Toast---Bachelor's-Delight", + 'info_dict': { + 'title': "Ep. 1 - Achaari Cheese Toast - Bachelor's Delight", + 'id': '5024612095001', + 'ext': 'mp4', + 'upload_date': '20160707', + 'description': 'md5:7f28509a148d5be9d0782b4d5106410d', + 'uploader_id': '4338955589001', + 'timestamp': 1467870968, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['BrightcoveNew'], + }, { + 'url': 'http://www.sonyliv.com/details/full%20movie/4951168986001/Sei-Raat-(Bangla)', + 'only_matching': True, + }] + + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4338955589001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + brightcove_id = self._match_id(url) + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 194dabc71..9635c2b49 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -32,7 +32,7 @@ class SoundcloudIE(InfoExtractor): _VALID_URL = r'''(?x)^(?:https?://)? (?:(?:(?:www\.|m\.)?soundcloud\.com/ (?P[\w\d-]+)/ - (?!(?:tracks|sets(?:/[^/?#]+)?|reposts|likes|spotlight)/?(?:$|[?#])) + (?!(?:tracks|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#])) (?P[\w\d-]+)/? 
(?P<token>[^?]+?)?(?:[?].*)?$) |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+) @@ -119,6 +119,12 @@ class SoundcloudIE(InfoExtractor): _CLIENT_ID = '02gUJC0hH2ct1EGOcYXQIzRFU91c72Ea' _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf' + @staticmethod + def _extract_urls(webpage): + return [m.group('url') for m in re.finditer( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1', + webpage)] + def report_resolve(self, video_id): """Report information extraction.""" self.to_screen('%s: Resolving id' % video_id) @@ -259,6 +265,9 @@ class SoundcloudSetIE(SoundcloudIE): 'title': 'The Royal Concept EP', }, 'playlist_mincount': 6, + }, { + 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep/token', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index 87b650468..e2a9e45ac 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -17,6 +17,8 @@ class SouthParkIE(MTVServicesInfoExtractor): 'ext': 'mp4', 'title': 'South Park|Bat Daded', 'description': 'Randy disqualifies South Park by getting into a fight with Bat Dad.', + 'timestamp': 1112760000, + 'upload_date': '20050406', }, }] @@ -28,7 +30,12 @@ class SouthParkEsIE(SouthParkIE): _TESTS = [{ 'url': 'http://southpark.cc.com/episodios-en-espanol/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate', + 'info_dict': { + 'title': 'Cartman Consigue Una Sonda Anal', + 'description': 'Cartman Consigue Una Sonda Anal', + }, 'playlist_count': 4, + 'skip': 'Geo-restricted', }] @@ -42,17 +49,27 @@ class SouthParkDeIE(SouthParkIE): 'info_dict': { 'id': '85487c96-b3b9-4e39-9127-ad88583d9bf2', 'ext': 'mp4', - 'title': 'The Government Won\'t Respect My Privacy', + 'title': 'South Park|The Government Won\'t Respect My Privacy', 'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle 
and Craig.', + 'timestamp': 1380160800, + 'upload_date': '20130926', }, }, { # non-ASCII characters in initial URL 'url': 'http://www.southpark.de/alle-episoden/s18e09-hashtag-aufwärmen', - 'playlist_count': 4, + 'info_dict': { + 'title': 'Hashtag „Aufwärmen“', + 'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.', + }, + 'playlist_count': 3, }, { # non-ASCII characters in redirect URL 'url': 'http://www.southpark.de/alle-episoden/s18e09', - 'playlist_count': 4, + 'info_dict': { + 'title': 'Hashtag „Aufwärmen“', + 'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.', + }, + 'playlist_count': 3, }] @@ -63,7 +80,11 @@ class SouthParkNlIE(SouthParkIE): _TESTS = [{ 'url': 'http://www.southpark.nl/full-episodes/s18e06-freemium-isnt-free', - 'playlist_count': 4, + 'info_dict': { + 'title': 'Freemium Isn\'t Free', + 'description': 'Stan is addicted to the new Terrance and Phillip mobile game.', + }, + 'playlist_mincount': 3, }] @@ -74,5 +95,9 @@ class SouthParkDkIE(SouthParkIE): _TESTS = [{ 'url': 'http://www.southparkstudios.dk/full-episodes/s18e07-grounded-vindaloop', - 'playlist_count': 4, + 'info_dict': { + 'title': 'Grounded Vindaloop', + 'description': 'Butters is convinced he\'s living in a virtual reality.', + }, + 'playlist_mincount': 3, }] diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index 50433d0f6..186d22b7d 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -14,7 +14,7 @@ class SpankBangIE(InfoExtractor): 'id': '3vvn', 'ext': 'mp4', 'title': 'fantasy solo', - 'description': 'dillion harper masturbates on a bed', + 'description': 'Watch fantasy solo free HD porn video - 05 minutes - dillion harper masturbates on a bed free adult 
movies.', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'silly2587', 'age_limit': 18, @@ -44,12 +44,10 @@ class SpankBangIE(InfoExtractor): title = self._html_search_regex( r'(?s)<h1[^>]*>(.+?)</h1>', webpage, 'title') - description = self._search_regex( - r'class="desc"[^>]*>([^<]+)', - webpage, 'description', default=None) + description = self._og_search_description(webpage) thumbnail = self._og_search_thumbnail(webpage) uploader = self._search_regex( - r'class="user"[^>]*>([^<]+)', + r'class="user"[^>]*><img[^>]+>([^<]+)', webpage, 'uploader', fatal=False) age_limit = self._rta_search(webpage) diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py index 182f286df..218785ee4 100644 --- a/youtube_dl/extractor/spike.py +++ b/youtube_dl/extractor/spike.py @@ -4,26 +4,31 @@ from .mtv import MTVServicesInfoExtractor class SpikeIE(MTVServicesInfoExtractor): - _VALID_URL = r'''(?x)https?:// - (?:www\.spike\.com/(?:video-(?:clips|playlists)|(?:full-)?episodes)/.+| - m\.spike\.com/videos/video\.rbml\?id=(?P<id>[^&]+)) - ''' - _TEST = { + _VALID_URL = r'https?://(?:[^/]+\.)?spike\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' + _TESTS = [{ 'url': 'http://www.spike.com/video-clips/lhtu8m/auction-hunters-can-allen-ride-a-hundred-year-old-motorcycle', 'md5': '1a9265f32b0c375793d6c4ce45255256', 'info_dict': { 'id': 'b9c8221a-4e50-479a-b86d-3333323e38ba', 'ext': 'mp4', - 'title': 'Auction Hunters|Can Allen Ride A Hundred Year-Old Motorcycle?', + 'title': 'Auction Hunters|December 27, 2013|4|414|Can Allen Ride A Hundred Year-Old Motorcycle?', 'description': 'md5:fbed7e82ed5fad493615b3094a9499cb', + 'timestamp': 1388120400, + 'upload_date': '20131227', }, - } + }, { + 'url': 'http://www.spike.com/video-clips/lhtu8m/', + 'only_matching': True, + }, { + 'url': 'http://www.spike.com/video-clips/lhtu8m', + 'only_matching': True, + }, { + 'url': 'http://bellator.spike.com/fight/atwr7k/bellator-158-michael-page-vs-evangelista-cyborg', + 'only_matching': True, + }, { + 
'url': 'http://bellator.spike.com/video-clips/bw6k7n/bellator-158-foundations-michael-venom-page', + 'only_matching': True, + }] _FEED_URL = 'http://www.spike.com/feeds/mrss/' _MOBILE_TEMPLATE = 'http://m.spike.com/videos/video.rbml?id=%s' - - def _real_extract(self, url): - mobile_id = self._match_id(url) - if mobile_id: - url = 'http://www.spike.com/video-clips/%s' % mobile_id - return super(SpikeIE, self)._real_extract(url) diff --git a/youtube_dl/extractor/streamable.py b/youtube_dl/extractor/streamable.py new file mode 100644 index 000000000..1c61437a4 --- /dev/null +++ b/youtube_dl/extractor/streamable.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, +) + + +class StreamableIE(InfoExtractor): + _VALID_URL = r'https?://streamable\.com/(?:e/)?(?P<id>\w+)' + _TESTS = [ + { + 'url': 'https://streamable.com/dnd1', + 'md5': '3e3bc5ca088b48c2d436529b64397fef', + 'info_dict': { + 'id': 'dnd1', + 'ext': 'mp4', + 'title': 'Mikel Oiarzabal scores to make it 0-3 for La Real against Espanyol', + 'thumbnail': 're:https?://.*\.jpg$', + 'uploader': 'teabaker', + 'timestamp': 1454964157.35115, + 'upload_date': '20160208', + 'duration': 61.516, + 'view_count': int, + } + }, + # older video without bitrate, width/height, etc. 
info + { + 'url': 'https://streamable.com/moo', + 'md5': '2cf6923639b87fba3279ad0df3a64e73', + 'info_dict': { + 'id': 'moo', + 'ext': 'mp4', + 'title': '"Please don\'t eat me!"', + 'thumbnail': 're:https?://.*\.jpg$', + 'timestamp': 1426115495, + 'upload_date': '20150311', + 'duration': 12, + 'view_count': int, + } + }, + { + 'url': 'https://streamable.com/e/dnd1', + 'only_matching': True, + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + + # Note: Using the ajax API, as the public Streamable API doesn't seem + # to return video info like the title properly sometimes, and doesn't + # include info like the video duration + video = self._download_json( + 'https://streamable.com/ajax/videos/%s' % video_id, video_id) + + # Format IDs: + # 0 The video is being uploaded + # 1 The video is being processed + # 2 The video has at least one file ready + # 3 The video is unavailable due to an error + status = video.get('status') + if status != 2: + raise ExtractorError( + 'This video is currently unavailable. 
It may still be uploading or processing.', + expected=True) + + title = video.get('reddit_title') or video['title'] + + formats = [] + for key, info in video['files'].items(): + if not info.get('url'): + continue + formats.append({ + 'format_id': key, + 'url': self._proto_relative_url(info['url']), + 'width': int_or_none(info.get('width')), + 'height': int_or_none(info.get('height')), + 'filesize': int_or_none(info.get('size')), + 'fps': int_or_none(info.get('framerate')), + 'vbr': float_or_none(info.get('bitrate'), 1000) + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': self._proto_relative_url(video.get('thumbnail_url')), + 'uploader': video.get('owner', {}).get('user_name'), + 'timestamp': float_or_none(video.get('date_added')), + 'duration': float_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('plays')), + 'formats': formats + } diff --git a/youtube_dl/extractor/sunporno.py b/youtube_dl/extractor/sunporno.py index e527aa971..ef9be7926 100644 --- a/youtube_dl/extractor/sunporno.py +++ b/youtube_dl/extractor/sunporno.py @@ -12,25 +12,29 @@ from ..utils import ( class SunPornoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?sunporno\.com/videos/(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://(?:(?:www\.)?sunporno\.com/videos|embeds\.sunporno\.com/embed)/(?P<id>\d+)' + _TESTS = [{ 'url': 'http://www.sunporno.com/videos/807778/', - 'md5': '6457d3c165fd6de062b99ef6c2ff4c86', + 'md5': '507887e29033502f29dba69affeebfc9', 'info_dict': { 'id': '807778', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'md5:0a400058e8105d39e35c35e7c5184164', 'description': 'md5:a31241990e1bd3a64e72ae99afb325fb', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 302, 'age_limit': 18, } - } + }, { + 'url': 'http://embeds.sunporno.com/embed/807778', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, 
video_id) + webpage = self._download_webpage( + 'http://www.sunporno.com/videos/%s' % video_id, video_id) title = self._html_search_regex( r'<title>([^<]+)', webpage, 'title') @@ -40,7 +44,8 @@ class SunPornoIE(InfoExtractor): r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False) duration = parse_duration(self._search_regex( - r'itemprop="duration">\s*(\d+:\d+)\s*<', + (r'itemprop="duration"[^>]*>\s*(\d+:\d+)\s*<', + r'>Duration:\s*]+>\s*(\d+:\d+)\s*<'), webpage, 'duration', fatal=False)) view_count = int_or_none(self._html_search_regex( @@ -48,7 +53,7 @@ class SunPornoIE(InfoExtractor): webpage, 'view count', fatal=False)) comment_count = int_or_none(self._html_search_regex( r'(\d+) Comments?', - webpage, 'comment count', fatal=False)) + webpage, 'comment count', fatal=False, default=None)) formats = [] quality = qualities(['mp4', 'flv']) diff --git a/youtube_dl/extractor/syfy.py b/youtube_dl/extractor/syfy.py index 5ca079f88..ab8bab5cd 100644 --- a/youtube_dl/extractor/syfy.py +++ b/youtube_dl/extractor/syfy.py @@ -1,46 +1,58 @@ from __future__ import unicode_literals -import re - -from .common import InfoExtractor +from .adobepass import AdobePassIE +from ..utils import ( + update_url_query, + smuggle_url, +) -class SyfyIE(InfoExtractor): - _VALID_URL = r'https?://www\.syfy\.com/(?:videos/.+?vid:(?P[0-9]+)|(?!videos)(?P[^/]+)(?:$|[?#]))' - +class SyfyIE(AdobePassIE): + _VALID_URL = r'https?://www\.syfy\.com/(?:[^/]+/)?videos/(?P[^/?#]+)' _TESTS = [{ - 'url': 'http://www.syfy.com/videos/Robot%20Combat%20League/Behind%20the%20Scenes/vid:2631458', + 'url': 'http://www.syfy.com/theinternetruinedmylife/videos/the-internet-ruined-my-life-season-1-trailer', 'info_dict': { - 'id': 'NmqMrGnXvmO1', - 'ext': 'flv', - 'title': 'George Lucas has Advice for his Daughter', - 'description': 'Listen to what insights George Lucas give his daughter Amanda.', + 'id': '2968097', + 'ext': 'mp4', + 'title': 'The Internet Ruined My Life: Season 1 Trailer', + 'description': 'One 
tweet, one post, one click, can destroy everything.', + 'uploader': 'NBCU-MPAT', + 'upload_date': '20170113', + 'timestamp': 1484345640, + }, + 'params': { + # m3u8 download + 'skip_download': True, }, 'add_ie': ['ThePlatform'], - }, { - 'url': 'http://www.syfy.com/wilwheaton', - 'md5': '94dfa54ee3ccb63295b276da08c415f6', - 'info_dict': { - 'id': '4yoffOOXC767', - 'ext': 'flv', - 'title': 'The Wil Wheaton Project - Premiering May 27th at 10/9c.', - 'description': 'The Wil Wheaton Project premieres May 27th at 10/9c. Don\'t miss it.', - }, - 'add_ie': ['ThePlatform'], - 'skip': 'Blocked outside the US', }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_name = mobj.group('video_name') - if video_name: - generic_webpage = self._download_webpage(url, video_name) - video_id = self._search_regex( - r'', - generic_webpage, 'video ID') - url = 'http://www.syfy.com/videos/%s/%s/vid:%s' % ( - video_name, video_name, video_id) - else: - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - return self.url_result(self._og_search_video_url(webpage)) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + syfy_mpx = list(self._parse_json(self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', webpage, 'drupal settings'), + display_id)['syfy']['syfy_mpx'].values())[0] + video_id = syfy_mpx['mpxGUID'] + title = syfy_mpx['episodeTitle'] + query = { + 'mbr': 'true', + 'manifest': 'm3u', + } + if syfy_mpx.get('entitlement') == 'auth': + resource = self._get_mvpd_resource( + 'syfy', title, video_id, + syfy_mpx.get('mpxRating', 'TV-14')) + query['auth'] = self._extract_mvpd_auth( + url, video_id, 'syfy', resource) + + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': smuggle_url(update_url_query( + self._proto_relative_url(syfy_mpx['releaseURL']), query), + {'force_smil_url': True}), + 'title': title, + 'id': video_id, + 'display_id': display_id, + } 
diff --git a/youtube_dl/extractor/tapely.py b/youtube_dl/extractor/tapely.py deleted file mode 100644 index ed560bd24..000000000 --- a/youtube_dl/extractor/tapely.py +++ /dev/null @@ -1,109 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - clean_html, - ExtractorError, - float_or_none, - parse_iso8601, - sanitized_Request, -) - - -class TapelyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:tape\.ly|tapely\.com)/(?P[A-Za-z0-9\-_]+)(?:/(?P\d+))?' - _API_URL = 'http://tape.ly/showtape?id={0:}' - _S3_SONG_URL = 'http://mytape.s3.amazonaws.com/{0:}' - _SOUNDCLOUD_SONG_URL = 'http://api.soundcloud.com{0:}' - _TESTS = [ - { - 'url': 'http://tape.ly/my-grief-as-told-by-water', - 'info_dict': { - 'id': 23952, - 'title': 'my grief as told by water', - 'thumbnail': 're:^https?://.*\.png$', - 'uploader_id': 16484, - 'timestamp': 1411848286, - 'description': 'For Robin and Ponkers, whom the tides of life have taken out to sea.', - }, - 'playlist_count': 13, - }, - { - 'url': 'http://tape.ly/my-grief-as-told-by-water/1', - 'md5': '79031f459fdec6530663b854cbc5715c', - 'info_dict': { - 'id': 258464, - 'title': 'Dreaming Awake (My Brightest Diamond)', - 'ext': 'm4a', - }, - }, - { - 'url': 'https://tapely.com/my-grief-as-told-by-water', - 'only_matching': True, - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') - - playlist_url = self._API_URL.format(display_id) - request = sanitized_Request(playlist_url) - request.add_header('X-Requested-With', 'XMLHttpRequest') - request.add_header('Accept', 'application/json') - request.add_header('Referer', url) - - playlist = self._download_json(request, display_id) - - tape = playlist['tape'] - - entries = [] - for s in tape['songs']: - song = s['song'] - entry = { - 'id': song['id'], - 'duration': float_or_none(song.get('songduration'), 1000), - 'title': song['title'], - } - if 
song['source'] == 'S3': - entry.update({ - 'url': self._S3_SONG_URL.format(song['filename']), - }) - entries.append(entry) - elif song['source'] == 'YT': - self.to_screen('YouTube video detected') - yt_id = song['filename'].replace('/youtube/', '') - entry.update(self.url_result(yt_id, 'Youtube', video_id=yt_id)) - entries.append(entry) - elif song['source'] == 'SC': - self.to_screen('SoundCloud song detected') - sc_url = self._SOUNDCLOUD_SONG_URL.format(song['filename']) - entry.update(self.url_result(sc_url, 'Soundcloud')) - entries.append(entry) - else: - self.report_warning('Unknown song source: %s' % song['source']) - - if mobj.group('songnr'): - songnr = int(mobj.group('songnr')) - 1 - try: - return entries[songnr] - except IndexError: - raise ExtractorError( - 'No song with index: %s' % mobj.group('songnr'), - expected=True) - - return { - '_type': 'playlist', - 'id': tape['id'], - 'display_id': display_id, - 'title': tape['name'], - 'entries': entries, - 'thumbnail': tape.get('image_url'), - 'description': clean_html(tape.get('subtext')), - 'like_count': tape.get('likescount'), - 'uploader_id': tape.get('user_id'), - 'timestamp': parse_iso8601(tape.get('published_at')), - } diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py new file mode 100644 index 000000000..79b00e376 --- /dev/null +++ b/youtube_dl/extractor/tbs.py @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .turner import TurnerBaseIE +from ..utils import ( + extract_attributes, + ExtractorError, +) + + +class TBSIE(TurnerBaseIE): + _VALID_URL = r'https?://(?:www\.)?(?Ptbs|tntdrama)\.com/videos/(?:[^/]+/)+(?P[^/?#]+)\.html' + _TESTS = [{ + 'url': 'http://www.tbs.com/videos/people-of-earth/season-1/extras/2007318/theatrical-trailer.html', + 'md5': '9e61d680e2285066ade7199e6408b2ee', + 'info_dict': { + 'id': '2007318', + 'ext': 'mp4', + 'title': 'Theatrical Trailer', + 'description': 'Catch the latest comedy from TBS, People of Earth, 
premiering Halloween night--Monday, October 31, at 9/8c.', + } + }, { + 'url': 'http://www.tntdrama.com/videos/good-behavior/season-1/extras/1538823/you-better-run.html', + 'md5': 'ce53c6ead5e9f3280b4ad2031a6fab56', + 'info_dict': { + 'id': '1538823', + 'ext': 'mp4', + 'title': 'You Better Run', + 'description': 'Letty Raines must figure out what she\'s running toward while running away from her past. Good Behavior premieres November 15 at 9/8c.', + } + }] + + def _real_extract(self, url): + domain, display_id = re.match(self._VALID_URL, url).groups() + site = domain[:3] + webpage = self._download_webpage(url, display_id) + video_params = extract_attributes(self._search_regex(r'(<[^>]+id="page-video"[^>]*>)', webpage, 'video params')) + if video_params.get('isAuthRequired') == 'true': + raise ExtractorError( + 'This video is only available via cable service provider subscription that' + ' is not currently supported.', expected=True) + query = None + clip_id = video_params.get('clipid') + if clip_id: + query = 'id=' + clip_id + else: + query = 'titleId=' + video_params['titleid'] + return self._extract_cvp_info( + 'http://www.%s.com/service/cvpXml?%s' % (domain, query), display_id, { + 'default': { + 'media_src': 'http://ht.cdn.turner.com/%s/big' % site, + }, + 'secure': { + 'media_src': 'http://apple-secure.cdn.turner.com/%s/big' % site, + 'tokenizer_src': 'http://www.%s.com/video/processors/services/token_ipadAdobe.do' % domain, + }, + }) diff --git a/youtube_dl/extractor/telegraaf.py b/youtube_dl/extractor/telegraaf.py index 9092e9b85..58078c531 100644 --- a/youtube_dl/extractor/telegraaf.py +++ b/youtube_dl/extractor/telegraaf.py @@ -47,11 +47,10 @@ class TelegraafIE(InfoExtractor): ext = determine_ext(manifest_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - manifest_url, video_id, ext='mp4', m3u8_id='hls')) + manifest_url, video_id, ext='mp4', m3u8_id='hls', fatal=False)) elif ext == 'mpd': - # TODO: Current DASH formats are broken - $Time$ 
pattern in - # not implemented yet - continue + formats.extend(self._extract_mpd_formats( + manifest_url, video_id, mpd_id='dash', fatal=False)) else: self.report_warning('Unknown adaptive format %s' % ext) for location in locations.get('progressive', []): diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index bb3efc4ea..23067e8c6 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -6,10 +6,10 @@ import time import hmac import binascii import hashlib -import netrc from .once import OnceIE +from .adobepass import AdobePassIE from ..compat import ( compat_parse_qs, compat_urllib_parse_urlparse, @@ -25,9 +25,6 @@ from ..utils import ( xpath_with_ns, mimetype2ext, find_xpath_attr, - unescapeHTML, - urlencode_postdata, - unified_timestamp, ) default_ns = 'http://www.w3.org/2005/SMIL21/Language' @@ -76,10 +73,10 @@ class ThePlatformBaseIE(OnceIE): if isinstance(captions, list): for caption in captions: lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type') - subtitles[lang] = [{ + subtitles.setdefault(lang, []).append({ 'ext': mimetype2ext(mime), 'url': src, - }] + }) return { 'title': info['title'], @@ -96,7 +93,7 @@ class ThePlatformBaseIE(OnceIE): return self._parse_theplatform_metadata(info) -class ThePlatformIE(ThePlatformBaseIE): +class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): _VALID_URL = r'''(?x) (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P[^/]+)/ (?:(?:(?:[^/]+/)+select/)?(?Pmedia/(?:guid/\d+/)?)|(?P(?:[^/\?]+/(?:swf|config)|onsite)/select/))? 
@@ -167,7 +164,6 @@ class ThePlatformIE(ThePlatformBaseIE): 'url': 'http://player.theplatform.com/p/NnzsPC/onsite_universal/select/media/guid/2410887629/2928790?fwsitesection=nbc_the_blacklist_video_library&autoPlay=true&carouselID=137781', 'only_matching': True, }] - _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' @classmethod def _extract_urls(cls, webpage): @@ -202,96 +198,6 @@ class ThePlatformIE(ThePlatformBaseIE): sig = flags + expiration_date + checksum + str_to_hex(sig_secret) return '%s&sig=%s' % (url, sig) - def _extract_mvpd_auth(self, url, video_id, requestor_id, resource): - def xml_text(xml_str, tag): - return self._search_regex( - '<%s>(.+?)' % (tag, tag), xml_str, tag) - - mvpd_headers = { - 'ap_42': 'anonymous', - 'ap_11': 'Linux i686', - 'ap_z': 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0', - 'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0', - } - - guid = xml_text(resource, 'guid') - requestor_info = self._downloader.cache.load('mvpd', requestor_id) or {} - authn_token = requestor_info.get('authn_token') - if authn_token: - token_expires = unified_timestamp(xml_text(authn_token, 'simpleTokenExpires').replace('_GMT', '')) - if token_expires and token_expires >= time.time(): - authn_token = None - if not authn_token: - # TODO add support for other TV Providers - mso_id = 'DTV' - login_info = netrc.netrc().authenticators(mso_id) - if not login_info: - return None - - def post_form(form_page, note, data={}): - post_url = self._html_search_regex(r']+action=(["\'])(?P.+?)\1', form_page, 'post url', group='url') - return self._download_webpage( - post_url, video_id, note, data=urlencode_postdata(data or self._hidden_inputs(form_page)), headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - }) - - provider_redirect_page = self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, - 'Downloading Provider Redirect Page', query={ 
- 'noflash': 'true', - 'mso_id': mso_id, - 'requestor_id': requestor_id, - 'no_iframe': 'false', - 'domain_name': 'adobe.com', - 'redirect_url': url, - }) - provider_login_page = post_form( - provider_redirect_page, 'Downloading Provider Login Page') - mvpd_confirm_page = post_form(provider_login_page, 'Logging in', { - 'username': login_info[0], - 'password': login_info[2], - }) - post_form(mvpd_confirm_page, 'Confirming Login') - - session = self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id, - 'Retrieving Session', data=urlencode_postdata({ - '_method': 'GET', - 'requestor_id': requestor_id, - }), headers=mvpd_headers) - authn_token = unescapeHTML(xml_text(session, 'authnToken')) - requestor_info['authn_token'] = authn_token - self._downloader.cache.store('mvpd', requestor_id, requestor_info) - - authz_token = requestor_info.get(guid) - if not authz_token: - authorize = self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'authorize', video_id, - 'Retrieving Authorization Token', data=urlencode_postdata({ - 'resource_id': resource, - 'requestor_id': requestor_id, - 'authentication_token': authn_token, - 'mso_id': xml_text(authn_token, 'simpleTokenMsoID'), - 'userMeta': '1', - }), headers=mvpd_headers) - authz_token = unescapeHTML(xml_text(authorize, 'authzToken')) - requestor_info[guid] = authz_token - self._downloader.cache.store('mvpd', requestor_id, requestor_info) - - mvpd_headers.update({ - 'ap_19': xml_text(authn_token, 'simpleSamlNameID'), - 'ap_23': xml_text(authn_token, 'simpleSamlSessionIndex'), - }) - - return self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'shortAuthorize', - video_id, 'Retrieving Media Token', data=urlencode_postdata({ - 'authz_token': authz_token, - 'requestor_id': requestor_id, - 'session_guid': xml_text(authn_token, 'simpleTokenAuthenticationGuid'), - 'hashed_guid': 'false', - }), headers=mvpd_headers) - def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) diff 
--git a/youtube_dl/extractor/thestar.py b/youtube_dl/extractor/thestar.py index ba1380abc..c3f118894 100644 --- a/youtube_dl/extractor/thestar.py +++ b/youtube_dl/extractor/thestar.py @@ -2,8 +2,6 @@ from __future__ import unicode_literals from .common import InfoExtractor -from .brightcove import BrightcoveLegacyIE -from ..compat import compat_parse_qs class TheStarIE(InfoExtractor): @@ -30,6 +28,9 @@ class TheStarIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) - brightcove_id = compat_parse_qs(brightcove_legacy_url)['@videoPlayer'][0] - return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) + brightcove_id = self._search_regex( + r'mainartBrightcoveVideoId["\']?\s*:\s*["\']?(\d+)', + webpage, 'brightcove id') + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/thvideo.py b/youtube_dl/extractor/thvideo.py deleted file mode 100644 index 406f4a826..000000000 --- a/youtube_dl/extractor/thvideo.py +++ /dev/null @@ -1,84 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - unified_strdate -) - - -class THVideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?thvideo\.tv/(?:v/th|mobile\.php\?cid=)(?P[0-9]+)' - _TEST = { - 'url': 'http://thvideo.tv/v/th1987/', - 'md5': 'fa107b1f73817e325e9433505a70db50', - 'info_dict': { - 'id': '1987', - 'ext': 'mp4', - 'title': '【动画】秘封活动记录 ~ The Sealed Esoteric History.分镜稿预览', - 'display_id': 'th1987', - 'thumbnail': 'http://thvideo.tv/uploadfile/2014/0722/20140722013459856.jpg', - 'description': '社团京都幻想剧团的第一个东方二次同人动画作品「秘封活动记录 ~ The Sealed Esoteric History.」 本视频是该动画第一期的分镜草稿...', - 'upload_date': '20140722' - } - } - - def _real_extract(self, url): - 
video_id = self._match_id(url) - - # extract download link from mobile player page - webpage_player = self._download_webpage( - 'http://thvideo.tv/mobile.php?cid=%s-0' % (video_id), - video_id, note='Downloading video source page') - video_url = self._html_search_regex( - r'', webpage, - 'upload date', fatal=False)) - - return { - 'id': video_id, - 'ext': 'mp4', - 'url': video_url, - 'title': title, - 'display_id': display_id, - 'thumbnail': thumbnail, - 'description': description, - 'upload_date': upload_date - } - - -class THVideoPlaylistIE(InfoExtractor): - _VALID_URL = r'http?://(?:www\.)?thvideo\.tv/mylist(?P[0-9]+)' - _TEST = { - 'url': 'http://thvideo.tv/mylist2', - 'info_dict': { - 'id': '2', - 'title': '幻想万華鏡', - }, - 'playlist_mincount': 23, - } - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - list_title = self._html_search_regex( - r'

(.*?)[^/]+)/video(?P\d+)' - _TITLE_REGEX = r'(.+?) - TNAFlix Porn Videos' - _DESCRIPTION_REGEX = r']+name="description"[^>]+content="([^"]+)"' - _UPLOADER_REGEX = r'\s*Verified Member\s*\s*

(.+?)

' + _TITLE_REGEX = r'(.+?) - (?:TNAFlix Porn Videos|TNAFlix\.com)' + _DESCRIPTION_REGEX = r'(?s)>Description:]+>(.+?)<' + _UPLOADER_REGEX = r'\s*Verified Member\s*\s*(.+?)<' _CATEGORIES_REGEX = r'(?s)]*>Categories:(.+?)' _TESTS = [{ # anonymous uploader, no categories 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', - 'md5': '7e569419fe6d69543d01e6be22f5f7c4', + 'md5': 'ecf3498417d09216374fc5907f9c6ec0', 'info_dict': { 'id': '553878', 'display_id': 'Carmella-Decesare-striptease', @@ -211,11 +217,11 @@ class TNAFlixIE(TNAFlixNetworkBaseIE): }, { # non-anonymous uploader, categories 'url': 'https://www.tnaflix.com/teen-porn/Educational-xxx-video/video6538', - 'md5': 'fcba2636572895aba116171a899a5658', + 'md5': '0f5d4d490dbfd117b8607054248a07c0', 'info_dict': { 'id': '6538', 'display_id': 'Educational-xxx-video', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Educational xxx video', 'description': 'md5:b4fab8f88a8621c8fabd361a173fe5b8', 'thumbnail': 're:https?://.*\.jpg$', diff --git a/youtube_dl/extractor/trutube.py b/youtube_dl/extractor/trutube.py deleted file mode 100644 index d55e0c563..000000000 --- a/youtube_dl/extractor/trutube.py +++ /dev/null @@ -1,26 +0,0 @@ -from __future__ import unicode_literals - -from .nuevo import NuevoBaseIE - - -class TruTubeIE(NuevoBaseIE): - _VALID_URL = r'https?://(?:www\.)?trutube\.tv/(?:video/|nuevo/player/embed\.php\?v=)(?P\d+)' - _TESTS = [{ - 'url': 'http://trutube.tv/video/14880/Ramses-II-Proven-To-Be-A-Red-Headed-Caucasoid-', - 'md5': 'c5b6e301b0a2040b074746cbeaa26ca1', - 'info_dict': { - 'id': '14880', - 'ext': 'flv', - 'title': 'Ramses II - Proven To Be A Red Headed Caucasoid', - 'thumbnail': 're:^http:.*\.jpg$', - } - }, { - 'url': 'https://trutube.tv/nuevo/player/embed.php?v=14880', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - return self._extract_nuevo( - 'https://trutube.tv/nuevo/player/config.php?v=%s' % video_id, - video_id) diff 
--git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index 1d9271d1e..4053f6c21 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -1,18 +1,13 @@ from __future__ import unicode_literals -import re - -from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, - sanitized_Request, str_to_int, ) -from ..aes import aes_decrypt_text +from .keezmovies import KeezMoviesIE -class Tube8IE(InfoExtractor): +class Tube8IE(KeezMoviesIE): _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/)+(?P[^/]+)/(?P\d+)' _TESTS = [{ 'url': 'http://www.tube8.com/teen/kasia-music-video/229795/', @@ -33,47 +28,17 @@ class Tube8IE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') + webpage, info = self._extract_info(url) - req = sanitized_Request(url) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, display_id) + if not info['title']: + info['title'] = self._html_search_regex( + r'videoTitle\s*=\s*"([^"]+)', webpage, 'title') - flashvars = self._parse_json( - self._search_regex( - r'flashvars\s*=\s*({.+?});\r?\n', webpage, 'flashvars'), - video_id) - - formats = [] - for key, video_url in flashvars.items(): - if not isinstance(video_url, compat_str) or not video_url.startswith('http'): - continue - height = self._search_regex( - r'quality_(\d+)[pP]', key, 'height', default=None) - if not height: - continue - if flashvars.get('encrypted') is True: - video_url = aes_decrypt_text( - video_url, flashvars['video_title'], 32).decode('utf-8') - formats.append({ - 'url': video_url, - 'format_id': '%sp' % height, - 'height': int(height), - }) - self._sort_formats(formats) - - thumbnail = flashvars.get('image_url') - - title = self._html_search_regex( - r'videoTitle\s*=\s*"([^"]+)', webpage, 'title') description = self._html_search_regex( r'>Description:\s*(.+?)\s*<', 
webpage, 'description', fatal=False) uploader = self._html_search_regex( r'\s*(.+?)\s*<', webpage, 'uploader', fatal=False) - duration = int_or_none(flashvars.get('video_duration')) like_count = int_or_none(self._search_regex( r'rupVar\s*=\s*"(\d+)"', webpage, 'like count', fatal=False)) @@ -86,18 +51,13 @@ class Tube8IE(InfoExtractor): r'(\d+)', webpage, 'comment count', fatal=False)) - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, + info.update({ 'description': description, - 'thumbnail': thumbnail, 'uploader': uploader, - 'duration': duration, 'view_count': view_count, 'like_count': like_count, 'dislike_count': dislike_count, 'comment_count': comment_count, - 'age_limit': 18, - 'formats': formats, - } + }) + + return info diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py new file mode 100644 index 000000000..b59dafda6 --- /dev/null +++ b/youtube_dl/extractor/turner.py @@ -0,0 +1,178 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + xpath_text, + int_or_none, + determine_ext, + parse_duration, + xpath_attr, + update_url_query, + compat_urlparse, +) + + +class TurnerBaseIE(InfoExtractor): + def _extract_timestamp(self, video_data): + return int_or_none(xpath_attr(video_data, 'dateCreated', 'uts')) + + def _extract_cvp_info(self, data_src, video_id, path_data={}): + video_data = self._download_xml(data_src, video_id) + video_id = video_data.attrib['id'] + title = xpath_text(video_data, 'headline', fatal=True) + # rtmp_src = xpath_text(video_data, 'akamai/src') + # if rtmp_src: + # splited_rtmp_src = rtmp_src.split(',') + # if len(splited_rtmp_src) == 2: + # rtmp_src = splited_rtmp_src[1] + # aifp = xpath_text(video_data, 'akamai/aifp', default='') + + tokens = {} + urls = [] + formats = [] + rex = re.compile( + r'(?P[0-9]+)x(?P[0-9]+)(?:_(?P[0-9]+))?') + # Possible formats locations: files/file, 
files/groupFiles/files + # and maybe others + for video_file in video_data.findall('.//file'): + video_url = video_file.text.strip() + if not video_url: + continue + ext = determine_ext(video_url) + if video_url.startswith('/mp4:protected/'): + continue + # TODO Correct extraction for these files + # protected_path_data = path_data.get('protected') + # if not protected_path_data or not rtmp_src: + # continue + # protected_path = self._search_regex( + # r'/mp4:(.+)\.[a-z0-9]', video_url, 'secure path') + # auth = self._download_webpage( + # protected_path_data['tokenizer_src'], query={ + # 'path': protected_path, + # 'videoId': video_id, + # 'aifp': aifp, + # }) + # token = xpath_text(auth, 'token') + # if not token: + # continue + # video_url = rtmp_src + video_url + '?' + token + elif video_url.startswith('/secure/'): + secure_path_data = path_data.get('secure') + if not secure_path_data: + continue + video_url = secure_path_data['media_src'] + video_url + secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*' + token = tokens.get(secure_path) + if not token: + auth = self._download_xml( + secure_path_data['tokenizer_src'], video_id, query={ + 'path': secure_path, + 'videoId': video_id, + }) + token = xpath_text(auth, 'token') + if not token: + continue + tokens[secure_path] = token + video_url = video_url + '?hdnea=' + token + elif not re.match('https?://', video_url): + base_path_data = path_data.get(ext, path_data.get('default', {})) + media_src = base_path_data.get('media_src') + if not media_src: + continue + video_url = media_src + video_url + if video_url in urls: + continue + urls.append(video_url) + format_id = video_file.get('bitrate') + if ext == 'smil': + formats.extend(self._extract_smil_formats( + video_url, video_id, fatal=False)) + elif ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id=format_id or 'hls', + fatal=False) + if m3u8_formats: + # Sometimes final URLs 
inside m3u8 are unsigned, let's fix this + # ourselves + qs = compat_urlparse.urlparse(video_url).query + if qs: + query = compat_urlparse.parse_qs(qs) + for m3u8_format in m3u8_formats: + m3u8_format['url'] = update_url_query(m3u8_format['url'], query) + m3u8_format['extra_param_to_segment_url'] = qs + formats.extend(m3u8_formats) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + update_url_query(video_url, {'hdcore': '3.7.0'}), + video_id, f4m_id=format_id or 'hds', fatal=False)) + else: + f = { + 'format_id': format_id, + 'url': video_url, + 'ext': ext, + } + mobj = rex.search(format_id + video_url) + if mobj: + f.update({ + 'width': int(mobj.group('width')), + 'height': int(mobj.group('height')), + 'tbr': int_or_none(mobj.group('bitrate')), + }) + elif isinstance(format_id, compat_str): + if format_id.isdigit(): + f['tbr'] = int(format_id) + else: + mobj = re.match(r'ios_(audio|[0-9]+)$', format_id) + if mobj: + if mobj.group(1) == 'audio': + f.update({ + 'vcodec': 'none', + 'ext': 'm4a', + }) + else: + f['tbr'] = int(mobj.group(1)) + formats.append(f) + self._sort_formats(formats) + + subtitles = {} + for source in video_data.findall('closedCaptions/source'): + for track in source.findall('track'): + track_url = track.get('url') + if not isinstance(track_url, compat_str) or track_url.endswith('/big'): + continue + lang = track.get('lang') or track.get('label') or 'en' + subtitles.setdefault(lang, []).append({ + 'url': track_url, + 'ext': { + 'scc': 'scc', + 'webvtt': 'vtt', + 'smptett': 'tt', + }.get(source.get('format')) + }) + + thumbnails = [{ + 'id': image.get('cut'), + 'url': image.text, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } for image in video_data.findall('images/image')] + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'description': xpath_text(video_data, 'description'), + 'duration': 
parse_duration(xpath_text(video_data, 'length') or xpath_text(video_data, 'trt')), + 'timestamp': self._extract_timestamp(video_data), + 'upload_date': xpath_attr(video_data, 'metas', 'version'), + 'series': xpath_text(video_data, 'showTitle'), + 'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')), + 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), + } diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index 86bb7915d..f225ec684 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -8,6 +8,7 @@ from ..utils import ( determine_ext, int_or_none, float_or_none, + js_to_json, parse_iso8601, remove_end, ) @@ -54,10 +55,11 @@ class TV2IE(InfoExtractor): ext = determine_ext(video_url) if ext == 'f4m': formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id=format_id)) + video_url, video_id, f4m_id=format_id, fatal=False)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id=format_id)) + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) elif ext == 'ism' or video_url.endswith('.ism/Manifest'): pass else: @@ -105,7 +107,7 @@ class TV2ArticleIE(InfoExtractor): 'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542', 'info_dict': { 'id': '6930542', - 'title': 'Russen hetses etter pingvintyveri – innrømmer å ha åpnet luken på buret', + 'title': 'Russen hetses etter pingvintyveri - innrømmer å ha åpnet luken på buret', 'description': 'md5:339573779d3eea3542ffe12006190954', }, 'playlist_count': 2, @@ -119,9 +121,23 @@ class TV2ArticleIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) + # Old embed pattern (looks unused nowadays) + assets = re.findall(r'data-assetid=["\'](\d+)', webpage) + + if not assets: + # New embed pattern + for v in re.findall('TV2ContentboxVideo\(({.+?})\)', webpage): + video = self._parse_json( + v, playlist_id, 
transform_source=js_to_json, fatal=False) + if not video: + continue + asset = video.get('assetId') + if asset: + assets.append(asset) + entries = [ - self.url_result('http://www.tv2.no/v/%s' % video_id, 'TV2') - for video_id in re.findall(r'data-assetid="(\d+)"', webpage)] + self.url_result('http://www.tv2.no/v/%s' % asset_id, 'TV2') + for asset_id in assets] title = remove_end(self._og_search_title(webpage), ' - TV2.no') description = remove_end(self._og_search_description(webpage), ' - TV2.no') diff --git a/youtube_dl/extractor/tvland.py b/youtube_dl/extractor/tvland.py index b73279dec..cb76a2a58 100644 --- a/youtube_dl/extractor/tvland.py +++ b/youtube_dl/extractor/tvland.py @@ -9,56 +9,23 @@ class TVLandIE(MTVServicesInfoExtractor): _VALID_URL = r'https?://(?:www\.)?tvland\.com/(?:video-clips|episodes)/(?P[^/?#.]+)' _FEED_URL = 'http://www.tvland.com/feeds/mrss/' _TESTS = [{ + # Geo-restricted. Without a proxy metadata are still there. With a + # proxy it redirects to http://m.tvland.com/app/ 'url': 'http://www.tvland.com/episodes/hqhps2/everybody-loves-raymond-the-invasion-ep-048', - 'playlist': [ - { - 'md5': '227e9723b9669c05bf51098b10287aa7', - 'info_dict': { - 'id': 'bcbd3a83-3aca-4dca-809b-f78a87dcccdd', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 1 of 5', - } - }, - { - 'md5': '9fa2b764ec0e8194fb3ebb01a83df88b', - 'info_dict': { - 'id': 'f4279548-6e13-40dd-92e8-860d27289197', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 2 of 5', - } - }, - { - 'md5': 'fde4c3bccd7cc7e3576b338734153cec', - 'info_dict': { - 'id': '664e4a38-53ef-4115-9bc9-d0f789ec6334', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 3 of 5', - } - }, - { - 'md5': '247f6780cda6891f2e49b8ae2b10e017', - 'info_dict': { - 'id': '9146ecf5-b15a-4d78-879c-6679b77f4960', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 4 of 5', - } 
- }, - { - 'md5': 'fd269f33256e47bad5eb6c40de089ff6', - 'info_dict': { - 'id': '04334a2e-9a47-4214-a8c2-ae5792e2fab7', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 5 of 5', - } - } - ], + 'info_dict': { + 'description': 'md5:80973e81b916a324e05c14a3fb506d29', + 'title': 'The Invasion', + }, + 'playlist': [], }, { 'url': 'http://www.tvland.com/video-clips/zea2ev/younger-younger--hilary-duff---little-lies', 'md5': 'e2c6389401cf485df26c79c247b08713', 'info_dict': { 'id': 'b8697515-4bbe-4e01-83d5-fa705ce5fa88', 'ext': 'mp4', - 'title': 'Younger|Younger: Hilary Duff - Little Lies', - 'description': 'md5:7d192f56ca8d958645c83f0de8ef0269' + 'title': 'Younger|December 28, 2015|2|NO-EPISODE#|Younger: Hilary Duff - Little Lies', + 'description': 'md5:7d192f56ca8d958645c83f0de8ef0269', + 'upload_date': '20151228', + 'timestamp': 1451289600, }, }] diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 5070082da..2abfb7830 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -24,6 +24,7 @@ class TVPIE(InfoExtractor): 'id': '194536', 'ext': 'mp4', 'title': 'Czas honoru, I seria – odc. 
13', + 'description': 'md5:76649d2014f65c99477be17f23a4dead', }, }, { 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176', @@ -32,6 +33,16 @@ class TVPIE(InfoExtractor): 'id': '17916176', 'ext': 'mp4', 'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', + 'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', + }, + }, { + # page id is not the same as video id(#7799) + 'url': 'http://vod.tvp.pl/22704887/08122015-1500', + 'md5': 'cf6a4705dfd1489aef8deb168d6ba742', + 'info_dict': { + 'id': '22680786', + 'ext': 'mp4', + 'title': 'Wiadomości, 08.12.2015, 15:00', }, }, { 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272', @@ -53,6 +64,39 @@ class TVPIE(InfoExtractor): 'only_matching': True, }] + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + video_id = self._search_regex([ + r']+src="[^"]*?object_id=(\d+)', + "object_id\s*:\s*'(\d+)'"], webpage, 'video id') + return { + '_type': 'url_transparent', + 'url': 'tvp:' + video_id, + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage), + 'ie_key': 'TVPEmbed', + } + + +class TVPEmbedIE(InfoExtractor): + IE_NAME = 'tvp:embed' + IE_DESC = 'Telewizja Polska' + _VALID_URL = r'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P\d+)' + + _TESTS = [{ + 'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268', + 'md5': '8c9cd59d16edabf39331f93bf8a766c7', + 'info_dict': { + 'id': '22670268', + 'ext': 'mp4', + 'title': 'Panorama, 07.12.2015, 15:40', + }, + }, { + 'url': 'tvp:22670268', + 'only_matching': True, + }] + def _real_extract(self, url): video_id = self._match_id(url) @@ -89,8 +133,8 @@ class TVPIE(InfoExtractor): r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)', video_url, 'video base url', default=None) if video_url_base: - # TODO: Current DASH formats are 
broken - $Time$ pattern in - # not implemented yet + # TODO: found instead of in MPD manifest. + # It's not mentioned in MPEG-DASH standard. Figure that out. # formats.extend(self._extract_mpd_formats( # video_url_base + '.ism/video.mpd', # video_id, mpd_id='dash', fatal=False)) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 918f8f8bc..4186e82db 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -4,27 +4,42 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_HTTPError, + compat_str, + compat_urlparse, +) from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, parse_iso8601, qualities, - determine_ext, + try_get, update_url_query, - int_or_none, ) class TVPlayIE(InfoExtractor): - IE_DESC = 'TV3Play and related services' - _VALID_URL = r'''(?x)https?://(?:www\.)? - (?:tvplay(?:\.skaties)?\.lv/parraides| - (?:tv3play|play\.tv3)\.lt/programos| - tv3play(?:\.tv3)?\.ee/sisu| - tv(?:3|6|8|10)play\.se/program| - (?:(?:tv3play|viasat4play|tv6play)\.no|tv3play\.dk)/programmer| - play\.novatv\.bg/programi - )/[^/]+/(?P\d+) - ''' + IE_NAME = 'mtg' + IE_DESC = 'MTG services' + _VALID_URL = r'''(?x) + (?: + mtg:| + https?:// + (?:www\.)? + (?: + tvplay(?:\.skaties)?\.lv/parraides| + (?:tv3play|play\.tv3)\.lt/programos| + tv3play(?:\.tv3)?\.ee/sisu| + (?:tv(?:3|6|8|10)play|viafree)\.se/program| + (?:(?:tv3play|viasat4play|tv6play|viafree)\.no|(?:tv3play|viafree)\.dk)/programmer| + play\.novatv\.bg/programi + ) + /(?:[^/]+/)+ + ) + (?P\d+) + ''' _TESTS = [ { 'url': 'http://www.tvplay.lv/parraides/vinas-melo-labak/418113?autostart=true', @@ -34,6 +49,9 @@ class TVPlayIE(InfoExtractor): 'ext': 'mp4', 'title': 'Kādi ir īri? 
- Viņas melo labāk', 'description': 'Baiba apsmej īrus, kādi tie ir un ko viņi dara.', + 'series': 'Viņas melo labāk', + 'season': '2.sezona', + 'season_number': 2, 'duration': 25, 'timestamp': 1406097056, 'upload_date': '20140723', @@ -46,6 +64,10 @@ class TVPlayIE(InfoExtractor): 'ext': 'flv', 'title': 'Moterys meluoja geriau', 'description': 'md5:9aec0fc68e2cbc992d2a140bd41fa89e', + 'series': 'Moterys meluoja geriau', + 'episode_number': 47, + 'season': '1 sezonas', + 'season_number': 1, 'duration': 1330, 'timestamp': 1403769181, 'upload_date': '20140626', @@ -182,9 +204,22 @@ class TVPlayIE(InfoExtractor): 'url': 'http://tvplay.skaties.lv/parraides/vinas-melo-labak/418113?autostart=true', 'only_matching': True, }, + { + # views is null + 'url': 'http://tvplay.skaties.lv/parraides/tv3-zinas/760183', + 'only_matching': True, + }, { 'url': 'http://tv3play.tv3.ee/sisu/kodu-keset-linna/238551?autostart=true', 'only_matching': True, + }, + { + 'url': 'http://www.viafree.se/program/underhallning/i-like-radio-live/sasong-1/676869', + 'only_matching': True, + }, + { + 'url': 'mtg:418113', + 'only_matching': True, } ] @@ -192,16 +227,19 @@ class TVPlayIE(InfoExtractor): video_id = self._match_id(url) video = self._download_json( - 'http://playapi.mtgx.tv/v1/videos/%s' % video_id, video_id, 'Downloading video JSON') + 'http://playapi.mtgx.tv/v3/videos/%s' % video_id, video_id, 'Downloading video JSON') title = video['title'] - if video.get('is_geo_blocked'): - self.report_warning( - 'This content might not be available in your country due to copyright reasons') - - streams = self._download_json( - 'http://playapi.mtgx.tv/v1/videos/stream/%s' % video_id, video_id, 'Downloading streams JSON') + try: + streams = self._download_json( + 'http://playapi.mtgx.tv/v3/videos/stream/%s' % video_id, + video_id, 'Downloading streams JSON') + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + msg = 
self._parse_json(e.cause.read().decode('utf-8'), video_id) + raise ExtractorError(msg['msg'], expected=True) + raise quality = qualities(['hls', 'medium', 'high']) formats = [] @@ -226,7 +264,8 @@ class TVPlayIE(InfoExtractor): 'ext': ext, } if video_url.startswith('rtmp'): - m = re.search(r'^(?Prtmp://[^/]+/(?P[^/]+))/(?P.+)$', video_url) + m = re.search( + r'^(?Prtmp://[^/]+/(?P[^/]+))/(?P.+)$', video_url) if not m: continue fmt.update({ @@ -240,15 +279,94 @@ class TVPlayIE(InfoExtractor): 'url': video_url, }) formats.append(fmt) + + if not formats and video.get('is_geo_blocked'): + self.raise_geo_restricted( + 'This content might not be available in your country due to copyright reasons') + self._sort_formats(formats) + # TODO: webvtt in m3u8 + subtitles = {} + sami_path = video.get('sami_path') + if sami_path: + lang = self._search_regex( + r'_([a-z]{2})\.xml', sami_path, 'lang', + default=compat_urlparse.urlparse(url).netloc.rsplit('.', 1)[-1]) + subtitles[lang] = [{ + 'url': sami_path, + }] + + series = video.get('format_title') + episode_number = int_or_none(video.get('format_position', {}).get('episode')) + season = video.get('_embedded', {}).get('season', {}).get('title') + season_number = int_or_none(video.get('format_position', {}).get('season')) + return { 'id': video_id, 'title': title, 'description': video.get('description'), + 'series': series, + 'episode_number': episode_number, + 'season': season, + 'season_number': season_number, 'duration': int_or_none(video.get('duration')), 'timestamp': parse_iso8601(video.get('created_at')), - 'view_count': int_or_none(video.get('views', {}).get('total')), + 'view_count': try_get(video, lambda x: x['views']['total'], int), 'age_limit': int_or_none(video.get('age_limit', 0)), 'formats': formats, + 'subtitles': subtitles, } + + +class ViafreeIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + viafree\. 
+ (?: + (?:dk|no)/programmer| + se/program + ) + /(?:[^/]+/)+(?P[^/?#&]+) + ''' + _TESTS = [{ + 'url': 'http://www.viafree.se/program/livsstil/husraddarna/sasong-2/avsnitt-2', + 'info_dict': { + 'id': '395375', + 'ext': 'mp4', + 'title': 'Husräddarna S02E02', + 'description': 'md5:4db5c933e37db629b5a2f75dfb34829e', + 'series': 'Husräddarna', + 'season': 'Säsong 2', + 'season_number': 2, + 'duration': 2576, + 'timestamp': 1400596321, + 'upload_date': '20140520', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [TVPlayIE.ie_key()], + }, { + 'url': 'http://www.viafree.no/programmer/underholdning/det-beste-vorspielet/sesong-2/episode-1', + 'only_matching': True, + }, { + 'url': 'http://www.viafree.dk/programmer/reality/paradise-hotel/saeson-7/episode-5', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if TVPlayIE.suitable(url) else super(ViafreeIE, cls).suitable(url) + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_id = self._search_regex( + r'currentVideo["\']\s*:\s*.+?["\']id["\']\s*:\s*["\'](?P\d{6,})', + webpage, 'video id') + + return self.url_result('mtg:%s' % video_id, TVPlayIE.ie_key()) diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index 4025edf02..af92b713b 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -12,32 +12,32 @@ from ..utils import ( class TwentyFourVideoIE(InfoExtractor): IE_NAME = '24video' - _VALID_URL = r'https?://(?:www\.)?24video\.net/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?24video\.(?:net|me|xxx)/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P\d+)' - _TESTS = [ - { - 'url': 'http://www.24video.net/video/view/1044982', - 'md5': 'e09fc0901d9eaeedac872f154931deeb', - 'info_dict': { - 'id': '1044982', - 'ext': 'mp4', - 'title': 'Эротика каменного века', - 
'description': 'Как смотрели порно в каменном веке.', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'SUPERTELO', - 'duration': 31, - 'timestamp': 1275937857, - 'upload_date': '20100607', - 'age_limit': 18, - 'like_count': int, - 'dislike_count': int, - }, + _TESTS = [{ + 'url': 'http://www.24video.net/video/view/1044982', + 'md5': 'e09fc0901d9eaeedac872f154931deeb', + 'info_dict': { + 'id': '1044982', + 'ext': 'mp4', + 'title': 'Эротика каменного века', + 'description': 'Как смотрели порно в каменном веке.', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'SUPERTELO', + 'duration': 31, + 'timestamp': 1275937857, + 'upload_date': '20100607', + 'age_limit': 18, + 'like_count': int, + 'dislike_count': int, }, - { - 'url': 'http://www.24video.net/player/new24_play.swf?id=1044982', - 'only_matching': True, - } - ] + }, { + 'url': 'http://www.24video.net/player/new24_play.swf?id=1044982', + 'only_matching': True, + }, { + 'url': 'http://www.24video.me/video/view/1044982', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -64,7 +64,7 @@ class TwentyFourVideoIE(InfoExtractor): r'(\d+) просмотр', webpage, 'view count', fatal=False)) comment_count = int_or_none(self._html_search_regex( - r'
(\d+) комментари', + r']+href="#tab-comments"[^>]*>(\d+) комментари', webpage, 'comment count', fatal=False)) # Sets some cookies diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 67b1277cc..359a8859c 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -7,6 +7,7 @@ import random from .common import InfoExtractor from ..compat import ( + compat_HTTPError, compat_parse_qs, compat_str, compat_urllib_parse_urlencode, @@ -14,13 +15,13 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + clean_html, ExtractorError, int_or_none, js_to_json, orderedSet, parse_duration, parse_iso8601, - sanitized_Request, urlencode_postdata, ) @@ -42,7 +43,7 @@ class TwitchBaseIE(InfoExtractor): '%s returned error: %s - %s' % (self.IE_NAME, error, response.get('message')), expected=True) - def _download_json(self, url, video_id, note='Downloading JSON metadata'): + def _call_api(self, path, item_id, note): headers = { 'Referer': 'http://api.twitch.tv/crossdomain/receiver.html?v=2', 'X-Requested-With': 'XMLHttpRequest', @@ -50,8 +51,8 @@ class TwitchBaseIE(InfoExtractor): for cookie in self._downloader.cookiejar: if cookie.name == 'api_token': headers['Twitch-Api-Token'] = cookie.value - request = sanitized_Request(url, headers=headers) - response = super(TwitchBaseIE, self)._download_json(request, video_id, note) + response = self._download_json( + '%s/%s' % (self._API_BASE, path), item_id, note) self._handle_error(response) return response @@ -63,9 +64,17 @@ class TwitchBaseIE(InfoExtractor): if username is None: return + def fail(message): + raise ExtractorError( + 'Unable to login. 
Twitch said: %s' % message, expected=True) + login_page, handle = self._download_webpage_handle( self._LOGIN_URL, None, 'Downloading login page') + # Some TOR nodes and public proxies are blocked completely + if 'blacklist_message' in login_page: + fail(clean_html(login_page)) + login_form = self._hidden_inputs(login_page) login_form.update({ @@ -82,21 +91,24 @@ class TwitchBaseIE(InfoExtractor): if not post_url.startswith('http'): post_url = compat_urlparse.urljoin(redirect_url, post_url) - request = sanitized_Request( - post_url, urlencode_postdata(login_form)) - request.add_header('Referer', redirect_url) - response = self._download_webpage( - request, None, 'Logging in as %s' % username) + headers = {'Referer': redirect_url} - error_message = self._search_regex( - r']+class="subwindow_notice"[^>]*>([^<]+)
', - response, 'error message', default=None) - if error_message: - raise ExtractorError( - 'Unable to login. Twitch said: %s' % error_message, expected=True) + try: + response = self._download_json( + post_url, None, 'Logging in as %s' % username, + data=urlencode_postdata(login_form), + headers=headers) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + response = self._parse_json( + e.cause.read().decode('utf-8'), None) + fail(response['message']) + raise - if '>Reset your password<' in response: - self.report_warning('Twitch asks you to reset your password, go to https://secure.twitch.tv/reset/submit') + if response.get('redirect'): + self._download_webpage( + response['redirect'], None, 'Downloading login redirect page', + headers=headers) def _prefer_source(self, formats): try: @@ -109,14 +121,14 @@ class TwitchBaseIE(InfoExtractor): class TwitchItemBaseIE(TwitchBaseIE): def _download_info(self, item, item_id): - return self._extract_info(self._download_json( - '%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id, + return self._extract_info(self._call_api( + 'kraken/videos/%s%s' % (item, item_id), item_id, 'Downloading %s info JSON' % self._ITEM_TYPE)) def _extract_media(self, item_id): info = self._download_info(self._ITEM_SHORTCUT, item_id) - response = self._download_json( - '%s/api/videos/%s%s' % (self._API_BASE, self._ITEM_SHORTCUT, item_id), item_id, + response = self._call_api( + 'api/videos/%s%s' % (self._ITEM_SHORTCUT, item_id), item_id, 'Downloading %s playlist JSON' % self._ITEM_TYPE) entries = [] chunks = response['chunks'] @@ -246,8 +258,8 @@ class TwitchVodIE(TwitchItemBaseIE): item_id = self._match_id(url) info = self._download_info(self._ITEM_SHORTCUT, item_id) - access_token = self._download_json( - '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id, + access_token = self._call_api( + 'api/vods/%s/access_token' % item_id, item_id, 'Downloading %s access token' % 
self._ITEM_TYPE) formats = self._extract_m3u8_formats( @@ -275,12 +287,12 @@ class TwitchVodIE(TwitchItemBaseIE): class TwitchPlaylistBaseIE(TwitchBaseIE): - _PLAYLIST_URL = '%s/kraken/channels/%%s/videos/?offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE + _PLAYLIST_PATH = 'kraken/channels/%s/videos/?offset=%d&limit=%d' _PAGE_LIMIT = 100 def _extract_playlist(self, channel_id): - info = self._download_json( - '%s/kraken/channels/%s' % (self._API_BASE, channel_id), + info = self._call_api( + 'kraken/channels/%s' % channel_id, channel_id, 'Downloading channel info JSON') channel_name = info.get('display_name') or info.get('name') entries = [] @@ -289,8 +301,8 @@ class TwitchPlaylistBaseIE(TwitchBaseIE): broken_paging_detected = False counter_override = None for counter in itertools.count(1): - response = self._download_json( - self._PLAYLIST_URL % (channel_id, offset, limit), + response = self._call_api( + self._PLAYLIST_PATH % (channel_id, offset, limit), channel_id, 'Downloading %s videos JSON page %s' % (self._PLAYLIST_TYPE, counter_override or counter)) @@ -345,7 +357,7 @@ class TwitchProfileIE(TwitchPlaylistBaseIE): class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE): IE_NAME = 'twitch:past_broadcasts' _VALID_URL = r'%s/(?P[^/]+)/profile/past_broadcasts/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE - _PLAYLIST_URL = TwitchPlaylistBaseIE._PLAYLIST_URL + '&broadcasts=true' + _PLAYLIST_PATH = TwitchPlaylistBaseIE._PLAYLIST_PATH + '&broadcasts=true' _PLAYLIST_TYPE = 'past broadcasts' _TEST = { @@ -389,8 +401,8 @@ class TwitchStreamIE(TwitchBaseIE): def _real_extract(self, url): channel_id = self._match_id(url) - stream = self._download_json( - '%s/kraken/streams/%s' % (self._API_BASE, channel_id), channel_id, + stream = self._call_api( + 'kraken/streams/%s' % channel_id, channel_id, 'Downloading stream JSON').get('stream') # Fallback on profile extraction if stream is offline @@ -405,8 +417,8 @@ class TwitchStreamIE(TwitchBaseIE): # JSON and fallback to lowercase if it's 
not available. channel_id = stream.get('channel', {}).get('name') or channel_id.lower() - access_token = self._download_json( - '%s/api/channels/%s/access_token' % (self._API_BASE, channel_id), channel_id, + access_token = self._call_api( + 'api/channels/%s/access_token' % channel_id, channel_id, 'Downloading channel access token') query = { @@ -461,7 +473,7 @@ class TwitchClipsIE(InfoExtractor): IE_NAME = 'twitch:clips' _VALID_URL = r'https?://clips\.twitch\.tv/(?:[^/]+/)*(?P[^/?#&]+)' - _TEST = { + _TESTS = [{ 'url': 'https://clips.twitch.tv/ea/AggressiveCobraPoooound', 'md5': '761769e1eafce0ffebfb4089cb3847cd', 'info_dict': { @@ -473,7 +485,11 @@ class TwitchClipsIE(InfoExtractor): 'uploader': 'stereotype_', 'uploader_id': 'stereotype_', }, - } + }, { + # multiple formats + 'url': 'https://clips.twitch.tv/rflegendary/UninterestedBeeDAESuppy', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -485,15 +501,27 @@ class TwitchClipsIE(InfoExtractor): r'(?s)clipInfo\s*=\s*({.+?});', webpage, 'clip info'), video_id, transform_source=js_to_json) - video_url = clip['clip_video_url'] - title = clip['channel_title'] + title = clip.get('channel_title') or self._og_search_title(webpage) + + formats = [{ + 'url': option['source'], + 'format_id': option.get('quality'), + 'height': int_or_none(option.get('quality')), + } for option in clip.get('quality_options', []) if option.get('source')] + + if not formats: + formats = [{ + 'url': clip['clip_video_url'], + }] + + self._sort_formats(formats) return { 'id': video_id, - 'url': video_url, 'title': title, 'thumbnail': self._og_search_thumbnail(webpage), 'creator': clip.get('broadcaster_display_name') or clip.get('broadcaster_login'), 'uploader': clip.get('curator_login'), 'uploader_id': clip.get('curator_display_name'), + 'formats': formats, } diff --git a/youtube_dl/extractor/uol.py b/youtube_dl/extractor/uol.py new file mode 100644 index 000000000..c27c64387 --- /dev/null +++ 
b/youtube_dl/extractor/uol.py @@ -0,0 +1,128 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + parse_duration, + update_url_query, + str_or_none, +) + + +class UOLIE(InfoExtractor): + IE_NAME = 'uol.com.br' + _VALID_URL = r'https?://(?:.+?\.)?uol\.com\.br/.*?(?:(?:mediaId|v)=|view/(?:[a-z0-9]+/)?|video(?:=|/(?:\d{4}/\d{2}/\d{2}/)?))(?P\d+|[\w-]+-[A-Z0-9]+)' + _TESTS = [{ + 'url': 'http://player.mais.uol.com.br/player_video_v3.swf?mediaId=15951931', + 'md5': '25291da27dc45e0afb5718a8603d3816', + 'info_dict': { + 'id': '15951931', + 'ext': 'mp4', + 'title': 'Miss simpatia é encontrada morta', + 'description': 'md5:3f8c11a0c0556d66daf7e5b45ef823b2', + } + }, { + 'url': 'http://tvuol.uol.com.br/video/incendio-destroi-uma-das-maiores-casas-noturnas-de-londres-04024E9A3268D4C95326', + 'md5': 'e41a2fb7b7398a3a46b6af37b15c00c9', + 'info_dict': { + 'id': '15954259', + 'ext': 'mp4', + 'title': 'Incêndio destrói uma das maiores casas noturnas de Londres', + 'description': 'Em Londres, um incêndio destruiu uma das maiores boates da cidade. 
Não há informações sobre vítimas.', + } + }, { + 'url': 'http://mais.uol.com.br/static/uolplayer/index.html?mediaId=15951931', + 'only_matching': True, + }, { + 'url': 'http://mais.uol.com.br/view/15954259', + 'only_matching': True, + }, { + 'url': 'http://noticias.band.uol.com.br/brasilurgente/video/2016/08/05/15951931/miss-simpatia-e-encontrada-morta.html', + 'only_matching': True, + }, { + 'url': 'http://videos.band.uol.com.br/programa.asp?e=noticias&pr=brasil-urgente&v=15951931&t=Policia-desmonte-base-do-PCC-na-Cracolandia', + 'only_matching': True, + }, { + 'url': 'http://mais.uol.com.br/view/cphaa0gl2x8r/incendio-destroi-uma-das-maiores-casas-noturnas-de-londres-04024E9A3268D4C95326', + 'only_matching': True, + }, { + 'url': 'http://noticias.uol.com.br//videos/assistir.htm?video=rafaela-silva-inspira-criancas-no-judo-04024D983968D4C95326', + 'only_matching': True, + }, { + 'url': 'http://mais.uol.com.br/view/e0qbgxid79uv/15275470', + 'only_matching': True, + }] + + _FORMATS = { + '2': { + 'width': 640, + 'height': 360, + }, + '5': { + 'width': 1080, + 'height': 720, + }, + '6': { + 'width': 426, + 'height': 240, + }, + '7': { + 'width': 1920, + 'height': 1080, + }, + '8': { + 'width': 192, + 'height': 144, + }, + '9': { + 'width': 568, + 'height': 320, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + if not video_id.isdigit(): + embed_page = self._download_webpage('https://jsuol.com.br/c/tv/uol/embed/?params=[embed,%s]' % video_id, video_id) + video_id = self._search_regex(r'mediaId=(\d+)', embed_page, 'media id') + video_data = self._download_json( + 'http://mais.uol.com.br/apiuol/v3/player/getMedia/%s.json' % video_id, + video_id)['item'] + title = video_data['title'] + + query = { + 'ver': video_data.get('numRevision', 2), + 'r': 'http://mais.uol.com.br', + } + formats = [] + for f in video_data.get('formats', []): + f_url = f.get('url') or f.get('secureUrl') + if not f_url: + continue + format_id = str_or_none(f.get('id')) + 
fmt = { + 'format_id': format_id, + 'url': update_url_query(f_url, query), + } + fmt.update(self._FORMATS.get(format_id, {})) + formats.append(fmt) + self._sort_formats(formats) + + tags = [] + for tag in video_data.get('tags', []): + tag_description = tag.get('description') + if not tag_description: + continue + tags.append(tag_description) + + return { + 'id': video_id, + 'title': title, + 'description': clean_html(video_data.get('desMedia')), + 'thumbnail': video_data.get('thumbnail'), + 'duration': int_or_none(video_data.get('durationSeconds')) or parse_duration(video_data.get('duration')), + 'tags': tags, + 'formats': formats, + } diff --git a/youtube_dl/extractor/uplynk.py b/youtube_dl/extractor/uplynk.py new file mode 100644 index 000000000..2cd22cf8a --- /dev/null +++ b/youtube_dl/extractor/uplynk.py @@ -0,0 +1,68 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + ExtractorError, +) + + +class UplynkIE(InfoExtractor): + IE_NAME = 'uplynk' + _VALID_URL = r'https?://.*?\.uplynk\.com/(?Pext/[0-9a-f]{32}/(?P[^/?&]+)|(?P[0-9a-f]{32}))\.(?:m3u8|json)(?:.*?\bpbs=(?P[^&]+))?' 
+ _TEST = { + 'url': 'http://content.uplynk.com/e89eaf2ce9054aa89d92ddb2d817a52e.m3u8', + 'info_dict': { + 'id': 'e89eaf2ce9054aa89d92ddb2d817a52e', + 'ext': 'mp4', + 'title': '030816-kgo-530pm-solar-eclipse-vid_web.mp4', + 'uploader_id': '4413701bf5a1488db55b767f8ae9d4fa', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _extract_uplynk_info(self, uplynk_content_url): + path, external_id, video_id, session_id = re.match(UplynkIE._VALID_URL, uplynk_content_url).groups() + display_id = video_id or external_id + formats = self._extract_m3u8_formats('http://content.uplynk.com/%s.m3u8' % path, display_id, 'mp4') + if session_id: + for f in formats: + f['extra_param_to_segment_url'] = 'pbs=' + session_id + self._sort_formats(formats) + asset = self._download_json('http://content.uplynk.com/player/assetinfo/%s.json' % path, display_id) + if asset.get('error') == 1: + raise ExtractorError('% said: %s' % (self.IE_NAME, asset['msg']), expected=True) + + return { + 'id': asset['asset'], + 'title': asset['desc'], + 'thumbnail': asset.get('default_poster_url'), + 'duration': float_or_none(asset.get('duration')), + 'uploader_id': asset.get('owner'), + 'formats': formats, + } + + def _real_extract(self, url): + return self._extract_uplynk_info(url) + + +class UplynkPreplayIE(UplynkIE): + IE_NAME = 'uplynk:preplay' + _VALID_URL = r'https?://.*?\.uplynk\.com/preplay2?/(?Pext/[0-9a-f]{32}/(?P[^/?&]+)|(?P[0-9a-f]{32}))\.json' + _TEST = None + + def _real_extract(self, url): + path, external_id, video_id = re.match(self._VALID_URL, url).groups() + display_id = video_id or external_id + preplay = self._download_json(url, display_id) + content_url = 'http://content.uplynk.com/%s.m3u8' % path + session_id = preplay.get('sid') + if session_id: + content_url += '?pbs=' + session_id + return self._extract_uplynk_info(content_url) diff --git a/youtube_dl/extractor/usanetwork.py b/youtube_dl/extractor/usanetwork.py new file mode 100644 index 000000000..823340776 
--- /dev/null +++ b/youtube_dl/extractor/usanetwork.py @@ -0,0 +1,76 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .adobepass import AdobePassIE +from ..utils import ( + extract_attributes, + smuggle_url, + update_url_query, +) + + +class USANetworkIE(AdobePassIE): + _VALID_URL = r'https?://(?:www\.)?usanetwork\.com/(?:[^/]+/videos|movies)/(?P[^/?#]+)' + _TEST = { + 'url': 'http://www.usanetwork.com/mrrobot/videos/hpe-cybersecurity', + 'md5': '33c0d2ba381571b414024440d08d57fd', + 'info_dict': { + 'id': '3086229', + 'ext': 'mp4', + 'title': 'HPE Cybersecurity', + 'description': 'The more we digitize our world, the more vulnerable we are.', + 'upload_date': '20160818', + 'timestamp': 1471535460, + 'uploader': 'NBCU-USA', + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + player_params = extract_attributes(self._search_regex( + r'(]+data-usa-tve-player-container[^>]*>)', webpage, 'player params')) + video_id = player_params['data-mpx-guid'] + title = player_params['data-episode-title'] + + account_pid, path = re.search( + r'data-src="(?:https?)?//player\.theplatform\.com/p/([^/]+)/.*?/(media/guid/\d+/\d+)', + webpage).groups() + + query = { + 'mbr': 'true', + } + if player_params.get('data-is-full-episode') == '1': + query['manifest'] = 'm3u' + + if player_params.get('data-entitlement') == 'auth': + adobe_pass = {} + drupal_settings = self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings', fatal=False) + if drupal_settings: + drupal_settings = self._parse_json(drupal_settings, video_id, fatal=False) + if drupal_settings: + adobe_pass = drupal_settings.get('adobePass', {}) + resource = self._get_mvpd_resource( + adobe_pass.get('adobePassResourceId', 'usa'), + title, video_id, player_params.get('data-episode-rating', 'TV-14')) + query['auth'] = self._extract_mvpd_auth( + url, video_id, 
adobe_pass.get('adobePassRequestorId', 'usa'), resource) + + info = self._search_json_ld(webpage, video_id, default={}) + info.update({ + '_type': 'url_transparent', + 'url': smuggle_url(update_url_query( + 'http://link.theplatform.com/s/%s/%s' % (account_pid, path), + query), {'force_smil_url': True}), + 'id': video_id, + 'title': title, + 'series': player_params.get('data-show-title'), + 'episode': title, + 'ie_key': 'ThePlatform', + }) + return info diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py index dff1bb702..e17988573 100644 --- a/youtube_dl/extractor/vbox7.py +++ b/youtube_dl/extractor/vbox7.py @@ -1,18 +1,23 @@ # encoding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - ExtractorError, - sanitized_Request, - urlencode_postdata, -) +from ..utils import urlencode_postdata class Vbox7IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vbox7\.com/play:(?P[^/]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?vbox7\.com/(?:play:|emb/external\.php\?.*?\bvid=)(?P[\da-fA-F]+)' + _TESTS = [{ + 'url': 'http://vbox7.com/play:0946fff23c', + 'md5': 'a60f9ab3a3a2f013ef9a967d5f7be5bf', + 'info_dict': { + 'id': '0946fff23c', + 'ext': 'mp4', + 'title': 'Борисов: Притеснен съм за бъдещето на България', + }, + }, { 'url': 'http://vbox7.com/play:249bb972c2', 'md5': '99f65c0c9ef9b682b97313e052734c3f', 'info_dict': { @@ -20,43 +25,50 @@ class Vbox7IE(InfoExtractor): 'ext': 'mp4', 'title': 'Смях! 
Чудо - чист за секунди - Скрита камера', }, - } + 'skip': 'georestricted', + }, { + 'url': 'http://vbox7.com/emb/external.php?vid=a240d20f9c&autoplay=1', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + ']+src=(?P["\'])(?P(?:https?:)?//vbox7\.com/emb/external\.php.+?)(?P=q)', + webpage) + if mobj: + return mobj.group('url') def _real_extract(self, url): video_id = self._match_id(url) - # need to get the page 3 times for the correct jsSecretToken cookie - # which is necessary for the correct title - def get_session_id(): - redirect_page = self._download_webpage(url, video_id) - session_id_url = self._search_regex( - r'var\s*url\s*=\s*\'([^\']+)\';', redirect_page, - 'session id url') - self._download_webpage( - compat_urlparse.urljoin(url, session_id_url), video_id, - 'Getting session id') + webpage = self._download_webpage( + 'http://vbox7.com/play:%s' % video_id, video_id) - get_session_id() - get_session_id() + title = self._html_search_regex( + r'(.+?)', webpage, 'title').split('/')[0].strip() - webpage = self._download_webpage(url, video_id, - 'Downloading redirect page') + video_url = self._search_regex( + r'src\s*:\s*(["\'])(?P.+?.mp4.*?)\1', + webpage, 'video url', default=None, group='url') - title = self._html_search_regex(r'(.*)', - webpage, 'title').split('/')[0].strip() + thumbnail_url = self._og_search_thumbnail(webpage) - info_url = 'http://vbox7.com/play/magare.do' - data = urlencode_postdata({'as3': '1', 'vid': video_id}) - info_request = sanitized_Request(info_url, data) - info_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - info_response = self._download_webpage(info_request, video_id, 'Downloading info webpage') - if info_response is None: - raise ExtractorError('Unable to extract the media url') - (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&')) + if not video_url: + info_response = self._download_webpage( + 
'http://vbox7.com/play/magare.do', video_id, + 'Downloading info webpage', + data=urlencode_postdata({'as3': '1', 'vid': video_id}), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + final_url, thumbnail_url = map( + lambda x: x.split('=')[1], info_response.split('&')) + + if '/na.mp4' in video_url: + self.raise_geo_restricted() return { 'id': video_id, - 'url': final_url, + 'url': self._proto_relative_url(video_url, 'http:'), 'title': title, 'thumbnail': thumbnail_url, } diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index b11cd254c..185756301 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -8,6 +8,7 @@ from .xstream import XstreamIE from ..utils import ( ExtractorError, float_or_none, + try_get, ) @@ -129,6 +130,11 @@ class VGTVIE(XstreamIE): 'url': 'http://ap.vgtv.no/webtv#!/video/111084/de-nye-bysyklene-lettere-bedre-gir-stoerre-hjul-og-feste-til-mobil', 'only_matching': True, }, + { + # geoblocked + 'url': 'http://www.vgtv.no/#!/video/127205/inside-the-mind-of-favela-funk', + 'only_matching': True, + }, ] def _real_extract(self, url): @@ -196,6 +202,12 @@ class VGTVIE(XstreamIE): info['formats'].extend(formats) + if not info['formats']: + properties = try_get( + data, lambda x: x['streamConfiguration']['properties'], list) + if properties and 'geoblocked' in properties: + raise self.raise_geo_restricted() + self._sort_formats(info['formats']) info.update({ diff --git a/youtube_dl/extractor/viceland.py b/youtube_dl/extractor/viceland.py new file mode 100644 index 000000000..8742b607a --- /dev/null +++ b/youtube_dl/extractor/viceland.py @@ -0,0 +1,107 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import time +import hashlib +import json + +from .adobepass import AdobePassIE +from ..compat import compat_HTTPError +from ..utils import ( + int_or_none, + parse_age_limit, + str_or_none, + parse_duration, + ExtractorError, + extract_attributes, +) + + +class 
VicelandIE(AdobePassIE): + _VALID_URL = r'https?://(?:www\.)?viceland\.com/[^/]+/video/[^/]+/(?P[a-f0-9]+)' + _TEST = { + 'url': 'https://www.viceland.com/en_us/video/cyberwar-trailer/57608447973ee7705f6fbd4e', + 'info_dict': { + 'id': '57608447973ee7705f6fbd4e', + 'ext': 'mp4', + 'title': 'CYBERWAR (Trailer)', + 'description': 'Tapping into the geopolitics of hacking and surveillance, Ben Makuch travels the world to meet with hackers, government officials, and dissidents to investigate the ecosystem of cyberwarfare.', + 'age_limit': 14, + 'timestamp': 1466008539, + 'upload_date': '20160615', + 'uploader_id': '11', + 'uploader': 'Viceland', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['UplynkPreplay'], + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + watch_hub_data = extract_attributes(self._search_regex( + r'(?s)()', webpage, 'watch hub')) + video_id = watch_hub_data['vms-id'] + title = watch_hub_data['video-title'] + + query = {} + if watch_hub_data.get('video-locked') == '1': + resource = self._get_mvpd_resource( + 'VICELAND', title, video_id, + watch_hub_data.get('video-rating')) + query['tvetoken'] = self._extract_mvpd_auth(url, video_id, 'VICELAND', resource) + + # signature generation algorithm is reverse engineered from signatureGenerator in + # webpack:///../shared/~/vice-player/dist/js/vice-player.js in + # https://www.viceland.com/assets/common/js/web.vendor.bundle.js + exp = int(time.time()) + 14400 + query.update({ + 'exp': exp, + 'sign': hashlib.sha512(('%s:GET:%d' % (video_id, exp)).encode()).hexdigest(), + }) + + try: + preplay = self._download_json('https://www.viceland.com/en_us/preplay/%s' % video_id, video_id, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + error = json.loads(e.cause.read().decode()) + raise ExtractorError('%s said: %s' % (self.IE_NAME, error['details']), 
expected=True) + raise + + video_data = preplay['video'] + base = video_data['base'] + uplynk_preplay_url = preplay['preplayURL'] + episode = video_data.get('episode', {}) + channel = video_data.get('channel', {}) + + subtitles = {} + cc_url = preplay.get('ccURL') + if cc_url: + subtitles['en'] = [{ + 'url': cc_url, + }] + + return { + '_type': 'url_transparent', + 'url': uplynk_preplay_url, + 'id': video_id, + 'title': title, + 'description': base.get('body'), + 'thumbnail': watch_hub_data.get('cover-image') or watch_hub_data.get('thumbnail'), + 'duration': parse_duration(video_data.get('video_duration') or watch_hub_data.get('video-duration')), + 'timestamp': int_or_none(video_data.get('created_at')), + 'age_limit': parse_age_limit(video_data.get('video_rating')), + 'series': video_data.get('show_title') or watch_hub_data.get('show-title'), + 'episode_number': int_or_none(episode.get('episode_number') or watch_hub_data.get('episode')), + 'episode_id': str_or_none(episode.get('id') or video_data.get('episode_id')), + 'season_number': int_or_none(watch_hub_data.get('season')), + 'season_id': str_or_none(episode.get('season_id')), + 'uploader': channel.get('base', {}).get('title') or watch_hub_data.get('channel-title'), + 'uploader_id': str_or_none(channel.get('id')), + 'subtitles': subtitles, + 'ie_key': 'UplynkPreplay', + } diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index efa15e0b6..4351ac457 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -130,7 +130,7 @@ class VikiIE(VikiBaseIE): }, { # clip 'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference', - 'md5': 'feea2b1d7b3957f70886e6dfd8b8be84', + 'md5': '86c0b5dbd4d83a6611a79987cc7a1989', 'info_dict': { 'id': '1067139v', 'ext': 'mp4', @@ -156,15 +156,11 @@ class VikiIE(VikiBaseIE): 'like_count': int, 'age_limit': 13, }, - 'params': { - # m3u8 download - 'skip_download': True, - }, 'skip': 'Blocked in the US', }, { # 
episode 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', - 'md5': '1f54697dabc8f13f31bf06bb2e4de6db', + 'md5': '5fa476a902e902783ac7a4d615cdbc7a', 'info_dict': { 'id': '44699v', 'ext': 'mp4', @@ -200,7 +196,7 @@ class VikiIE(VikiBaseIE): }, { # non-English description 'url': 'http://www.viki.com/videos/158036v-love-in-magic', - 'md5': '013dc282714e22acf9447cad14ff1208', + 'md5': '1713ae35df5a521b31f6dc40730e7c9c', 'info_dict': { 'id': '158036v', 'ext': 'mp4', @@ -281,9 +277,16 @@ class VikiIE(VikiBaseIE): r'^(\d+)[pP]$', format_id, 'height', default=None)) for protocol, format_dict in stream_dict.items(): if format_id == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_dict['url'], video_id, 'mp4', 'm3u8_native', - m3u8_id='m3u8-%s' % protocol, fatal=False)) + m3u8_formats = self._extract_m3u8_formats( + format_dict['url'], video_id, 'mp4', + entry_protocol='m3u8_native', preference=-1, + m3u8_id='m3u8-%s' % protocol, fatal=False) + # Despite CODECS metadata in m3u8 all video-only formats + # are actually video+audio + for f in m3u8_formats: + if f.get('acodec') == 'none' and f.get('vcodec') != 'none': + f['acodec'] = None + formats.extend(m3u8_formats) else: formats.append({ 'url': format_dict['url'], diff --git a/youtube_dl/extractor/vimple.py b/youtube_dl/extractor/vimple.py index 92321d66e..7fd9b777b 100644 --- a/youtube_dl/extractor/vimple.py +++ b/youtube_dl/extractor/vimple.py @@ -28,23 +28,24 @@ class SprutoBaseIE(InfoExtractor): class VimpleIE(SprutoBaseIE): IE_DESC = 'Vimple - one-click video hosting' - _VALID_URL = r'https?://(?:player\.vimple\.ru/iframe|vimple\.ru)/(?P[\da-f-]{32,36})' - _TESTS = [ - { - 'url': 'http://vimple.ru/c0f6b1687dcd4000a97ebe70068039cf', - 'md5': '2e750a330ed211d3fd41821c6ad9a279', - 'info_dict': { - 'id': 'c0f6b168-7dcd-4000-a97e-be70068039cf', - 'ext': 'mp4', - 'title': 'Sunset', - 'duration': 20, - 'thumbnail': 're:https?://.*?\.jpg', - }, - }, { - 'url': 
'http://player.vimple.ru/iframe/52e1beec-1314-4a83-aeac-c61562eadbf9', - 'only_matching': True, - } - ] + _VALID_URL = r'https?://(?:player\.vimple\.(?:ru|co)/iframe|vimple\.(?:ru|co))/(?P[\da-f-]{32,36})' + _TESTS = [{ + 'url': 'http://vimple.ru/c0f6b1687dcd4000a97ebe70068039cf', + 'md5': '2e750a330ed211d3fd41821c6ad9a279', + 'info_dict': { + 'id': 'c0f6b168-7dcd-4000-a97e-be70068039cf', + 'ext': 'mp4', + 'title': 'Sunset', + 'duration': 20, + 'thumbnail': 're:https?://.*?\.jpg', + }, + }, { + 'url': 'http://player.vimple.ru/iframe/52e1beec-1314-4a83-aeac-c61562eadbf9', + 'only_matching': True, + }, { + 'url': 'http://vimple.co/04506a053f124483b8fb05ed73899f19', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 3ee66e23e..cd22df25a 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -1,6 +1,7 @@ # encoding: utf-8 from __future__ import unicode_literals +import collections import re import json import sys @@ -16,7 +17,6 @@ from ..utils import ( get_element_by_class, int_or_none, orderedSet, - parse_duration, remove_start, str_to_int, unescapeHTML, @@ -52,8 +52,9 @@ class VKBaseIE(InfoExtractor): # what actually happens. # We will workaround this VK issue by resetting the remixlhk cookie to # the first one manually. 
- cookies = url_handle.headers.get('Set-Cookie') - if cookies: + for header, cookies in url_handle.headers.items(): + if header.lower() != 'set-cookie': + continue if sys.version_info[0] >= 3: cookies = cookies.encode('iso-8859-1') cookies = cookies.decode('utf-8') @@ -61,6 +62,7 @@ class VKBaseIE(InfoExtractor): if remixlhk: value, domain = remixlhk.groups() self._set_cookie(domain, 'remixlhk', value) + break login_page = self._download_webpage( 'https://login.vk.com/?act=login', None, @@ -445,6 +447,9 @@ class VKWallPostIE(VKBaseIE): 'skip_download': True, }, }], + 'params': { + 'usenetrc': True, + }, 'skip': 'Requires vk account credentials', }, { # single YouTube embed, no leading - @@ -454,6 +459,9 @@ class VKWallPostIE(VKBaseIE): 'title': 'Sergey Gorbunov - Wall post 85155021_6319', }, 'playlist_count': 1, + 'params': { + 'usenetrc': True, + }, 'skip': 'Requires vk account credentials', }, { # wall page URL @@ -481,37 +489,41 @@ class VKWallPostIE(VKBaseIE): raise ExtractorError('VK said: %s' % error, expected=True) description = clean_html(get_element_by_class('wall_post_text', webpage)) - uploader = clean_html(get_element_by_class( - 'fw_post_author', webpage)) or self._og_search_description(webpage) + uploader = clean_html(get_element_by_class('author', webpage)) thumbnail = self._og_search_thumbnail(webpage) entries = [] - for audio in re.finditer(r'''(?sx) - ]+ - id=(?P["\'])audio_info(?P\d+_\d+).*?(?P=q1)[^>]+ - value=(?P["\'])(?Phttp.+?)(?P=q2) - .+? - ''', webpage): - audio_html = audio.group(0) - audio_id = audio.group('id') - duration = parse_duration(get_element_by_class('duration', audio_html)) - track = self._html_search_regex( - r']+id=["\']title%s[^>]*>([^<]+)' % audio_id, - audio_html, 'title', default=None) - artist = self._html_search_regex( - r'>([^<]+)
\s*&ndash', audio_html, - 'artist', default=None) - entries.append({ - 'id': audio_id, - 'url': audio.group('url'), - 'title': '%s - %s' % (artist, track) if artist and track else audio_id, - 'thumbnail': thumbnail, - 'duration': duration, - 'uploader': uploader, - 'artist': artist, - 'track': track, - }) + audio_ids = re.findall(r'data-full-id=["\'](\d+_\d+)', webpage) + if audio_ids: + al_audio = self._download_webpage( + 'https://vk.com/al_audio.php', post_id, + note='Downloading audio info', fatal=False, + data=urlencode_postdata({ + 'act': 'reload_audio', + 'al': '1', + 'ids': ','.join(audio_ids) + })) + if al_audio: + Audio = collections.namedtuple( + 'Audio', ['id', 'user_id', 'url', 'track', 'artist', 'duration']) + audios = self._parse_json( + self._search_regex( + r'(.+?)', al_audio, 'audios', default='[]'), + post_id, fatal=False, transform_source=unescapeHTML) + if isinstance(audios, list): + for audio in audios: + a = Audio._make(audio[:6]) + entries.append({ + 'id': '%s_%s' % (a.user_id, a.id), + 'url': a.url, + 'title': '%s - %s' % (a.artist, a.track) if a.artist and a.track else a.id, + 'thumbnail': thumbnail, + 'duration': a.duration, + 'uploader': uploader, + 'artist': a.artist, + 'track': a.track, + }) for video in re.finditer( r']+href=(["\'])(?P/video(?:-?[\d_]+).*?)\1', webpage): diff --git a/youtube_dl/extractor/vodplatform.py b/youtube_dl/extractor/vodplatform.py new file mode 100644 index 000000000..7bdd8b1dc --- /dev/null +++ b/youtube_dl/extractor/vodplatform.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import unescapeHTML + + +class VODPlatformIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vod-platform\.net/[eE]mbed/(?P[^/?#]+)' + _TEST = { + # from http://www.lbcgroup.tv/watch/chapter/29143/52844/%D8%A7%D9%84%D9%86%D8%B5%D8%B1%D8%A9-%D9%81%D9%8A-%D8%B6%D9%8A%D8%A7%D9%81%D8%A9-%D8%A7%D9%84%D9%80-cnn/ar + 'url': 
'http://vod-platform.net/embed/RufMcytHDolTH1MuKHY9Fw', + 'md5': '1db2b7249ce383d6be96499006e951fc', + 'info_dict': { + 'id': 'RufMcytHDolTH1MuKHY9Fw', + 'ext': 'mp4', + 'title': 'LBCi News_ النصرة في ضيافة الـ "سي.أن.أن"', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = unescapeHTML(self._og_search_title(webpage)) + hidden_inputs = self._hidden_inputs(webpage) + + base_url = self._search_regex( + '(.*/)(?:playlist.m3u8|manifest.mpd)', + hidden_inputs.get('HiddenmyhHlsLink') or hidden_inputs['HiddenmyDashLink'], + 'base url') + formats = self._extract_m3u8_formats( + base_url + 'playlist.m3u8', video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + formats.extend(self._extract_mpd_formats( + base_url + 'manifest.mpd', video_id, + mpd_id='dash', fatal=False)) + rtmp_formats = self._extract_smil_formats( + base_url + 'jwplayer.smil', video_id, fatal=False) + for rtmp_format in rtmp_formats: + rtsp_format = rtmp_format.copy() + rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) + del rtsp_format['play_path'] + del rtsp_format['ext'] + rtsp_format.update({ + 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'), + 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), + 'protocol': 'rtsp', + }) + formats.extend([rtmp_format, rtsp_format]) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': hidden_inputs.get('HiddenThumbnail') or self._og_search_thumbnail(webpage), + 'formats': formats, + } diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py index b73da5cd0..55e087bdb 100644 --- a/youtube_dl/extractor/vuclip.py +++ b/youtube_dl/extractor/vuclip.py @@ -17,12 +17,12 @@ class VuClipIE(InfoExtractor): _VALID_URL = r'https?://(?:m\.)?vuclip\.com/w\?.*?cid=(?P[0-9]+)' _TEST = { - 'url': 'http://m.vuclip.com/w?cid=922692425&fid=70295&z=1010&nvar&frm=index.html', + 'url': 
'http://m.vuclip.com/w?cid=1129900602&bu=8589892792&frm=w&z=34801&op=0&oc=843169247§ion=recommend', 'info_dict': { - 'id': '922692425', + 'id': '1129900602', 'ext': '3gp', - 'title': 'The Toy Soldiers - Hollywood Movie Trailer', - 'duration': 177, + 'title': 'Top 10 TV Convicts', + 'duration': 733, } } @@ -54,7 +54,7 @@ class VuClipIE(InfoExtractor): 'url': video_url, }] else: - formats = self._parse_html5_media_entries(url, webpage)[0]['formats'] + formats = self._parse_html5_media_entries(url, webpage, video_id)[0]['formats'] title = remove_end(self._html_search_regex( r'(.*?)-\s*Vuclip', webpage, 'title').strip(), ' - Video') diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 48fc438ed..9f1b8b4b5 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -9,6 +9,7 @@ from ..utils import ( ExtractorError, unified_strdate, HEADRequest, + int_or_none, ) @@ -30,48 +31,58 @@ class WatIE(InfoExtractor): }, { 'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html', - 'md5': 'fbc84e4378165278e743956d9c1bf16b', + 'md5': '34bdfa5ca9fd3c7eb88601b635b0424c', 'info_dict': { 'id': '11713075', 'ext': 'mp4', 'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)', - 'description': 'md5:b7a849cf16a2b733d9cd10c52906dee3', 'upload_date': '20140816', - 'duration': 2910, }, - 'skip': "Ce contenu n'est pas disponible pour l'instant.", + 'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."], }, ] + _FORMATS = ( + (200, 416, 234), + (400, 480, 270), + (600, 640, 360), + (1200, 640, 360), + (1800, 960, 540), + (2500, 1280, 720), + ) + def _real_extract(self, url): video_id = self._match_id(url) video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36)) # 'contentv4' is used in the website, but it also returns the related # videos, we don't need them - video_info = self._download_json( - 'http://www.wat.tv/interface/contentv3/' + video_id, 
video_id)['media'] + video_data = self._download_json( + 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) + video_info = video_data['media'] error_desc = video_info.get('error_desc') if error_desc: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error_desc), expected=True) + self.report_warning( + '%s returned error: %s' % (self.IE_NAME, error_desc)) chapters = video_info['chapters'] - first_chapter = chapters[0] + if chapters: + first_chapter = chapters[0] - def video_id_for_chapter(chapter): - return chapter['tc_start'].split('-')[0] + def video_id_for_chapter(chapter): + return chapter['tc_start'].split('-')[0] - if video_id_for_chapter(first_chapter) != video_id: - self.to_screen('Multipart video detected') - entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters] - return self.playlist_result(entries, video_id, video_info['title']) - # Otherwise we can continue and extract just one part, we have to use - # the video id for getting the video url + if video_id_for_chapter(first_chapter) != video_id: + self.to_screen('Multipart video detected') + entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters] + return self.playlist_result(entries, video_id, video_info['title']) + # Otherwise we can continue and extract just one part, we have to use + # the video id for getting the video url + else: + first_chapter = video_info - date_diffusion = first_chapter.get('date_diffusion') - upload_date = unified_strdate(date_diffusion) if date_diffusion else None + title = first_chapter['title'] def extract_url(path_template, url_type): req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id) @@ -83,36 +94,61 @@ class WatIE(InfoExtractor): expected=True) return red_url - m3u8_url = extract_url('ipad/%s.m3u8', 'm3u8') - http_url = extract_url('android5/%s.mp4', 'http') - formats = [] - m3u8_formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 
'm3u8_native', m3u8_id='hls') - formats.extend(m3u8_formats) - formats.extend(self._extract_f4m_formats( - m3u8_url.replace('ios.', 'web.').replace('.m3u8', '.f4m'), - video_id, f4m_id='hds', fatal=False)) - for m3u8_format in m3u8_formats: - vbr, abr = m3u8_format.get('vbr'), m3u8_format.get('abr') - if not vbr or not abr: - continue - f = m3u8_format.copy() - f.update({ - 'url': re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url), - 'format_id': f['format_id'].replace('hls', 'http'), - 'protocol': 'http', - }) - formats.append(f) - self._sort_formats(formats) + try: + http_url = extract_url('android5/%s.mp4', 'http') + m3u8_url = extract_url('ipad/%s.m3u8', 'm3u8') + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') + formats.extend(m3u8_formats) + formats.extend(self._extract_f4m_formats( + m3u8_url.replace('ios.', 'web.').replace('.m3u8', '.f4m'), + video_id, f4m_id='hds', fatal=False)) + for m3u8_format in m3u8_formats: + vbr, abr = m3u8_format.get('vbr'), m3u8_format.get('abr') + if not vbr or not abr: + continue + format_id = m3u8_format['format_id'].replace('hls', 'http') + fmt_url = re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url) + if self._is_valid_url(fmt_url, video_id, format_id): + f = m3u8_format.copy() + f.update({ + 'url': fmt_url, + 'format_id': format_id, + 'protocol': 'http', + }) + formats.append(f) + self._sort_formats(formats) + except ExtractorError: + abr = 64 + for vbr, width, height in self._FORMATS: + tbr = vbr + abr + format_id = 'http-%s' % tbr + fmt_url = 'http://dnl.adv.tf1.fr/2/USP-0x0/%s/%s/%s/ssm/%s-%s-64k.mp4' % (video_id[-4:-2], video_id[-2:], video_id, video_id, vbr) + if self._is_valid_url(fmt_url, video_id, format_id): + formats.append({ + 'format_id': format_id, + 'url': fmt_url, + 'vbr': vbr, + 'abr': abr, + 'width': width, + 'height': height, + }) + + date_diffusion = 
first_chapter.get('date_diffusion') or video_data.get('configv4', {}).get('estatS4') + upload_date = unified_strdate(date_diffusion) if date_diffusion else None + duration = None + files = video_info['files'] + if files: + duration = int_or_none(files[0].get('duration')) return { 'id': video_id, - 'title': first_chapter['title'], - 'thumbnail': first_chapter['preview'], - 'description': first_chapter['description'], - 'view_count': video_info['views'], + 'title': title, + 'thumbnail': first_chapter.get('preview'), + 'description': first_chapter.get('description'), + 'view_count': int_or_none(video_info.get('views')), 'upload_date': upload_date, - 'duration': video_info['files'][0]['duration'], + 'duration': duration, 'formats': formats, } diff --git a/youtube_dl/extractor/xiami.py b/youtube_dl/extractor/xiami.py index a6dfc4af9..86abef257 100644 --- a/youtube_dl/extractor/xiami.py +++ b/youtube_dl/extractor/xiami.py @@ -13,6 +13,7 @@ class XiamiBaseIE(InfoExtractor): webpage = super(XiamiBaseIE, self)._download_webpage(*args, **kwargs) if '>Xiami is currently not available in your country.<' in webpage: self.raise_geo_restricted('Xiami is currently not available in your country') + return webpage def _extract_track(self, track, track_id=None): title = track['title'] diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 1dfe031ca..30825daae 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -15,10 +15,10 @@ class XVideosIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?xvideos\.com/video(?P[0-9]+)(?:.*)' _TEST = { 'url': 'http://www.xvideos.com/video4588838/biker_takes_his_girl', - 'md5': '4b46ae6ea5e6e9086e714d883313c0c9', + 'md5': '14cea69fcb84db54293b1e971466c2e1', 'info_dict': { 'id': '4588838', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Biker Takes his Girl', 'age_limit': 18, } @@ -42,24 +42,24 @@ class XVideosIE(InfoExtractor): video_url = compat_urllib_parse_unquote(self._search_regex( 
r'flv_url=(.+?)&', webpage, 'video URL', default='')) if video_url: - formats.append({'url': video_url}) + formats.append({ + 'url': video_url, + 'format_id': 'flv', + }) - player_args = self._search_regex( - r'(?s)new\s+HTML5Player\((.+?)\)', webpage, ' html5 player', default=None) - if player_args: - for arg in player_args.split(','): - format_url = self._search_regex( - r'(["\'])(?Phttps?://.+?)\1', arg, 'url', - default=None, group='url') - if not format_url: - continue - ext = determine_ext(format_url) - if ext == 'mp4': - formats.append({'url': format_url}) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + for kind, _, format_url in re.findall( + r'setVideo([^(]+)\((["\'])(http.+?)\2\)', webpage): + format_id = kind.lower() + if format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + elif format_id in ('urllow', 'urlhigh'): + formats.append({ + 'url': format_url, + 'format_id': '%s-%s' % (determine_ext(format_url, 'mp4'), format_id[3:]), + 'quality': -2 if format_id.endswith('low') else None, + }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index b0679dfb7..d7a81ab8c 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -8,7 +8,6 @@ import re from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( compat_urllib_parse, - compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -17,6 +16,7 @@ from ..utils import ( ExtractorError, int_or_none, mimetype2ext, + determine_ext, ) from .brightcove import BrightcoveNewIE @@ -39,7 +39,7 @@ class YahooIE(InfoExtractor): }, { 'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html', - 'md5': 'c3466d2b6d5dd6b9f41ba9ed04c24b23', + 'md5': 
'251af144a19ebc4a033e8ba91ac726bb', 'info_dict': { 'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9', 'ext': 'mp4', @@ -50,7 +50,7 @@ class YahooIE(InfoExtractor): }, { 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed', - 'md5': '75ffabdb87c16d4ffe8c036dc4d1c136', + 'md5': '7993e572fac98e044588d0b5260f4352', 'info_dict': { 'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb', 'ext': 'mp4', @@ -61,7 +61,7 @@ class YahooIE(InfoExtractor): }, { 'url': 'https://tw.news.yahoo.com/%E6%95%A2%E5%95%8F%E5%B8%82%E9%95%B7%20%E9%BB%83%E7%A7%80%E9%9C%9C%E6%89%B9%E8%B3%B4%E6%B8%85%E5%BE%B7%20%E9%9D%9E%E5%B8%B8%E9%AB%98%E5%82%B2-034024051.html', - 'md5': '9035d38f88b1782682a3e89f985be5bb', + 'md5': '45c024bad51e63e9b6f6fad7a43a8c23', 'info_dict': { 'id': 'cac903b3-fcf4-3c14-b632-643ab541712f', 'ext': 'mp4', @@ -72,10 +72,10 @@ class YahooIE(InfoExtractor): }, { 'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html', - 'md5': '0b51660361f0e27c9789e7037ef76f4b', + 'md5': '71298482f7c64cbb7fa064e4553ff1c1', 'info_dict': { 'id': 'b3affa53-2e14-3590-852b-0e0db6cd1a58', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder', 'description': 'md5:f66c890e1490f4910a9953c941dee944', 'duration': 97, @@ -98,7 +98,7 @@ class YahooIE(InfoExtractor): 'id': '154609075', }, 'playlist': [{ - 'md5': 'f8e336c6b66f503282e5f719641d6565', + 'md5': '000887d0dc609bc3a47c974151a40fb8', 'info_dict': { 'id': 'e624c4bc-3389-34de-9dfc-025f74943409', 'ext': 'mp4', @@ -107,7 +107,7 @@ class YahooIE(InfoExtractor): 'duration': 30, }, }, { - 'md5': '958bcb90b4d6df71c56312137ee1cd5a', + 'md5': '81bc74faf10750fe36e4542f9a184c66', 'info_dict': { 'id': '1fc8ada0-718e-3abe-a450-bf31f246d1a9', 'ext': 'mp4', @@ -139,7 +139,7 @@ class YahooIE(InfoExtractor): 'skip': 'Domain name in.lifestyle.yahoo.com gone', }, { 'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html', - 'md5': 
'b17ac378b1134fa44370fb27db09a744', + 'md5': '2a9752f74cb898af5d1083ea9f661b58', 'info_dict': { 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1', 'ext': 'mp4', @@ -168,7 +168,7 @@ class YahooIE(InfoExtractor): }, { # Query result is embedded in webpage, but explicit request to video API fails with geo restriction 'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html', - 'md5': '1ddbf7c850777548438e5c4f147c7b8c', + 'md5': '4fbafb9c9b6f07aa8f870629f6671b35', 'info_dict': { 'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504', 'ext': 'mp4', @@ -196,6 +196,7 @@ class YahooIE(InfoExtractor): 'description': 'Galactic', 'title': 'Dolla Diva (feat. Maggie Koerner)', }, + 'skip': 'redirect to https://www.yahoo.com/music', }, ] @@ -213,15 +214,7 @@ class YahooIE(InfoExtractor): entries = [] iframe_urls = re.findall(r']+src="(/video/.+?-\d+\.html\?format=embed.*?)"', webpage) for idx, iframe_url in enumerate(iframe_urls): - iframepage = self._download_webpage( - host + iframe_url, display_id, - note='Downloading iframe webpage for video #%d' % idx) - items_json = self._search_regex( - r'mediaItems: (\[.+?\])$', iframepage, 'items', flags=re.MULTILINE, default=None) - if items_json: - items = json.loads(items_json) - video_id = items[0]['id'] - entries.append(self._get_info(video_id, display_id, webpage)) + entries.append(self.url_result(host + iframe_url, 'Yahoo')) if entries: return self.playlist_result(entries, page_id) @@ -246,7 +239,9 @@ class YahooIE(InfoExtractor): if config: sapi = config.get('models', {}).get('applet_model', {}).get('data', {}).get('sapi') if sapi and 'query' in sapi: - return self._extract_info(display_id, sapi, webpage) + info = self._extract_info(display_id, sapi, webpage) + self._sort_formats(info['formats']) + return info items_json = self._search_regex( r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE, @@ -292,15 +287,17 @@ class YahooIE(InfoExtractor): formats = [] for s in info['streams']: + tbr = 
int_or_none(s.get('bitrate')) format_info = { 'width': int_or_none(s.get('width')), 'height': int_or_none(s.get('height')), - 'tbr': int_or_none(s.get('bitrate')), + 'tbr': tbr, } host = s['host'] path = s['path'] if host.startswith('rtmp'): + fmt = 'rtmp' format_info.update({ 'url': host, 'play_path': path, @@ -308,14 +305,18 @@ class YahooIE(InfoExtractor): }) else: if s.get('format') == 'm3u8_playlist': - format_info['protocol'] = 'm3u8_native' - format_info['ext'] = 'mp4' + fmt = 'hls' + format_info.update({ + 'protocol': 'm3u8_native', + 'ext': 'mp4', + }) + else: + fmt = format_info['ext'] = determine_ext(path) format_url = compat_urlparse.urljoin(host, path) format_info['url'] = format_url + format_info['format_id'] = fmt + ('-%d' % tbr if tbr else '') formats.append(format_info) - self._sort_formats(formats) - closed_captions = self._html_search_regex( r'"closedcaptions":(\[[^\]]+\])', webpage, 'closed captions', default='[]') @@ -346,17 +347,25 @@ class YahooIE(InfoExtractor): def _get_info(self, video_id, display_id, webpage): region = self._search_regex( r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"', - webpage, 'region', fatal=False, default='US') - data = compat_urllib_parse_urlencode({ - 'protocol': 'http', - 'region': region.upper(), - }) - query_url = ( - 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' - '{id}?{data}'.format(id=video_id, data=data)) - query_result = self._download_json( - query_url, display_id, 'Downloading video info') - return self._extract_info(display_id, query_result, webpage) + webpage, 'region', fatal=False, default='US').upper() + formats = [] + info = {} + for fmt in ('webm', 'mp4'): + query_result = self._download_json( + 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' + video_id, + display_id, 'Downloading %s video info' % fmt, query={ + 'protocol': 'http', + 'region': region, + 'format': fmt, + }) + info = self._extract_info(display_id, query_result, webpage) + formats.extend(info['formats']) + 
formats.extend(self._extract_m3u8_formats( + 'http://video.media.yql.yahoo.com/v1/hls/%s?region=%s' % (video_id, region), + video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + info['formats'] = formats + return info class YahooSearchIE(SearchInfoExtractor): diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index b37d0eab6..fd6268ba4 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -75,6 +75,12 @@ class YandexMusicTrackIE(YandexMusicBaseIE): % storage_dir, track_id, 'Downloading track location JSON') + # Each string is now wrapped in a list, this is probably only temporarily thus + # supporting both scenarios (see https://github.com/rg3/youtube-dl/issues/10193) + for k, v in data.items(): + if v and isinstance(v, list): + data[k] = v[0] + key = hashlib.md5(('XGRlBW9FXlekgbPrRHuSiA' + data['path'][1:] + data['s']).encode('utf-8')).hexdigest() storage = storage_dir.split('.') diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index 4150b28da..b50f34e9b 100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -1,61 +1,39 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) class YouJizzIE(InfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/[^/#?]+-(?P[0-9]+)\.html(?:$|[?#])' - _TEST = { + _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]+)?-(?P[0-9]+)\.html(?:$|[?#])' + _TESTS = [{ 'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html', - 'md5': '07e15fa469ba384c7693fd246905547c', + 'md5': '78fc1901148284c69af12640e01c6310', 'info_dict': { 'id': '2189178', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Zeichentrick 1', 'age_limit': 18, } - } + }, { + 'url': 'http://www.youjizz.com/videos/-2189178.html', + 'only_matching': True, + }] def _real_extract(self, url): video_id = 
self._match_id(url) webpage = self._download_webpage(url, video_id) + # YouJizz's HTML5 player has invalid HTML + webpage = webpage.replace('"controls', '" controls') age_limit = self._rta_search(webpage) video_title = self._html_search_regex( r'\s*(.*)\s*', webpage, 'title') - embed_page_url = self._search_regex( - r'(https?://www.youjizz.com/videos/embed/[0-9]+)', - webpage, 'embed page') - webpage = self._download_webpage( - embed_page_url, video_id, note='downloading embed page') + info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0] - # Get the video URL - m_playlist = re.search(r'so.addVariable\("playlist", ?"(?P.+?)"\);', webpage) - if m_playlist is not None: - playlist_url = m_playlist.group('playlist') - playlist_page = self._download_webpage(playlist_url, video_id, - 'Downloading playlist page') - m_levels = list(re.finditer(r'[^"]+)"\)\);', - webpage, 'video URL') - - return { + info_dict.update({ 'id': video_id, - 'url': video_url, 'title': video_title, - 'ext': 'flv', - 'format': 'flv', - 'player_url': embed_page_url, 'age_limit': age_limit, - } + }) + + return info_dict diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 0df2d76ee..0265a64a7 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -35,7 +35,7 @@ class YouPornIE(InfoExtractor): 'age_limit': 18, }, }, { - # Anonymous User uploader + # Unknown uploader 'url': 'http://www.youporn.com/watch/561726/big-tits-awesome-brunette-on-amazing-webcam-show/?from=related3&al=2&from_id=561726&pos=4', 'info_dict': { 'id': '561726', @@ -44,7 +44,7 @@ class YouPornIE(InfoExtractor): 'title': 'Big Tits Awesome Brunette On amazing webcam show', 'description': 'http://sweetlivegirls.com Big Tits Awesome Brunette On amazing webcam show.mp4', 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'Anonymous User', + 'uploader': 'Unknown', 'upload_date': '20111125', 'average_rating': int, 'view_count': int, @@ -140,17 +140,17 @@ class 
YouPornIE(InfoExtractor): r'>All [Cc]omments? \(([\d,.]+)\)', webpage, 'comment count', fatal=False)) - def extract_tag_box(title): - tag_box = self._search_regex( - (r']+class=["\']tagBoxTitle["\'][^>]*>\s*%s\b.*?\s*' - ']+class=["\']tagBoxContent["\']>(.+?)') % re.escape(title), - webpage, '%s tag box' % title, default=None) + def extract_tag_box(regex, title): + tag_box = self._search_regex(regex, webpage, title, default=None) if not tag_box: return [] return re.findall(r']+href=[^>]+>([^<]+)', tag_box) - categories = extract_tag_box('Category') - tags = extract_tag_box('Tags') + categories = extract_tag_box( + r'(?s)Categories:.*?]+>(.+?)', 'categories') + tags = extract_tag_box( + r'(?s)Tags:.*?\s*]+class=["\']tagBoxContent["\'][^>]*>(.+?)', + 'tags') return { 'id': video_id, diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 49c264c3a..0bc85af74 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -53,6 +53,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' + _PASSWORD_CHALLENGE_URL = 'https://accounts.google.com/signin/challenge/sl/password' _NETRC_MACHINE = 'youtube' # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False @@ -90,38 +91,18 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if login_page is False: return - galx = self._search_regex(r'(?s)[0-9A-Za-z_-]{11})' + _VALID_URL = r'(?:https?:)?//(?:www\.)?youtube\.com/shared\?.*\bci=(?P[0-9A-Za-z_-]{11})' IE_NAME = 'youtube:shared' _TEST = { @@ -1860,6 +1865,28 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', }, 'playlist_mincout': 21, + }, { + # Playlist URL that does not actually serve a playlist + 'url': 
'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', + 'info_dict': { + 'id': 'FqZTN594JQw', + 'ext': 'webm', + 'title': "Smiley's People 01 detective, Adventure Series, Action", + 'uploader': 'STREEM', + 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', + 'upload_date': '20150526', + 'license': 'Standard YouTube License', + 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', + 'categories': ['People & Blogs'], + 'tags': list, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [YoutubeIE.ie_key()], }] def _real_initialize(self): @@ -1920,9 +1947,20 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): playlist_title = self._html_search_regex( r'(?s)

]*>\s*(.*?)\s*

', - page, 'title') + page, 'title', default=None) - return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title) + has_videos = True + + if not playlist_title: + try: + # Some playlist URLs don't actually serve a playlist (e.g. + # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4) + next(self._entries(page, playlist_id)) + except StopIteration: + has_videos = False + + return has_videos, self.playlist_result( + self._entries(page, playlist_id), playlist_id, playlist_title) def _check_download_just_video(self, url, playlist_id): # Check if it's a video-specific URL @@ -1931,9 +1969,11 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): video_id = query_dict['v'][0] if self._downloader.params.get('noplaylist'): self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - return self.url_result(video_id, 'Youtube', video_id=video_id) + return video_id, self.url_result(video_id, 'Youtube', video_id=video_id) else: self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + return video_id, None + return None, None def _real_extract(self, url): # Extract playlist id @@ -1942,7 +1982,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): raise ExtractorError('Invalid URL: %s' % url) playlist_id = mobj.group(1) or mobj.group(2) - video = self._check_download_just_video(url, playlist_id) + video_id, video = self._check_download_just_video(url, playlist_id) if video: return video @@ -1950,7 +1990,15 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): # Mixes require a custom extraction process return self._extract_mix(playlist_id) - return self._extract_playlist(playlist_id) + has_videos, playlist = self._extract_playlist(playlist_id) + if has_videos or not video_id: + return playlist + + # Some playlist URLs don't actually serve a playlist (see + # https://github.com/rg3/youtube-dl/issues/10537). 
+ # Fallback to plain video extraction if there is a video id + # along with playlist id. + return self.url_result(video_id, 'Youtube', video_id=video_id) class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): @@ -2328,10 +2376,11 @@ class YoutubeWatchLaterIE(YoutubePlaylistIE): }] def _real_extract(self, url): - video = self._check_download_just_video(url, 'WL') + _, video = self._check_download_just_video(url, 'WL') if video: return video - return self._extract_playlist('WL') + _, playlist = self._extract_playlist('WL') + return playlist class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): diff --git a/youtube_dl/extractor/zingmp3.py b/youtube_dl/extractor/zingmp3.py index 437eecb67..bd708b42c 100644 --- a/youtube_dl/extractor/zingmp3.py +++ b/youtube_dl/extractor/zingmp3.py @@ -4,13 +4,17 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + int_or_none, + update_url_query, +) class ZingMp3BaseInfoExtractor(InfoExtractor): - def _extract_item(self, item, fatal=True): - error_message = item.find('./errormessage').text + def _extract_item(self, item, page_type, fatal=True): + error_message = item.get('msg') if error_message: if not fatal: return @@ -18,25 +22,48 @@ class ZingMp3BaseInfoExtractor(InfoExtractor): '%s returned error: %s' % (self.IE_NAME, error_message), expected=True) - title = item.find('./title').text.strip() - source = item.find('./source').text - extension = item.attrib['type'] - thumbnail = item.find('./backimage').text + formats = [] + for quality, source_url in zip(item.get('qualities') or item.get('quality', []), item.get('source_list') or item.get('source', [])): + if not source_url or source_url == 'require vip': + continue + if not re.match(r'https?://', source_url): + source_url = '//' + source_url + source_url = self._proto_relative_url(source_url, 'http:') + quality_num = int_or_none(quality) + f = { + 'format_id': quality, 
+ 'url': source_url, + } + if page_type == 'video': + f.update({ + 'height': quality_num, + 'ext': 'mp4', + }) + else: + f.update({ + 'abr': quality_num, + 'ext': 'mp3', + }) + formats.append(f) + + cover = item.get('cover') return { - 'title': title, - 'url': source, - 'ext': extension, - 'thumbnail': thumbnail, + 'title': (item.get('name') or item.get('title')).strip(), + 'formats': formats, + 'thumbnail': 'http:/' + cover if cover else None, + 'artist': item.get('artist'), } - def _extract_player_xml(self, player_xml_url, id, playlist_title=None): - player_xml = self._download_xml(player_xml_url, id, 'Downloading Player XML') - items = player_xml.findall('./item') + def _extract_player_json(self, player_json_url, id, page_type, playlist_title=None): + player_json = self._download_json(player_json_url, id, 'Downloading Player JSON') + items = player_json['data'] + if 'item' in items: + items = items['item'] if len(items) == 1: # one single song - data = self._extract_item(items[0]) + data = self._extract_item(items[0], page_type) data['id'] = id return data @@ -45,7 +72,7 @@ class ZingMp3BaseInfoExtractor(InfoExtractor): entries = [] for i, item in enumerate(items, 1): - entry = self._extract_item(item, fatal=False) + entry = self._extract_item(item, page_type, fatal=False) if not entry: continue entry['id'] = '%s-%d' % (id, i) @@ -59,8 +86,8 @@ class ZingMp3BaseInfoExtractor(InfoExtractor): } -class ZingMp3SongIE(ZingMp3BaseInfoExtractor): - _VALID_URL = r'https?://mp3\.zing\.vn/bai-hat/(?P[^/]+)/(?P\w+)\.html' +class ZingMp3IE(ZingMp3BaseInfoExtractor): + _VALID_URL = r'https?://mp3\.zing\.vn/(?:bai-hat|album|playlist|video-clip)/[^/]+/(?P\w+)\.html' _TESTS = [{ 'url': 'http://mp3.zing.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html', 'md5': 'ead7ae13693b3205cbc89536a077daed', @@ -70,51 +97,47 @@ class ZingMp3SongIE(ZingMp3BaseInfoExtractor): 'ext': 'mp3', 'thumbnail': 're:^https?://.*\.jpg$', }, - }] - IE_NAME = 'zingmp3:song' - IE_DESC = 'mp3.zing.vn songs' - - 
def _real_extract(self, url): - matched = re.match(self._VALID_URL, url) - slug = matched.group('slug') - song_id = matched.group('song_id') - - webpage = self._download_webpage( - 'http://mp3.zing.vn/bai-hat/%s/%s.html' % (slug, song_id), song_id) - - player_xml_url = self._search_regex( - r'&xmlURL=(?P[^&]+)&', webpage, 'player xml url') - - return self._extract_player_xml(player_xml_url, song_id) - - -class ZingMp3AlbumIE(ZingMp3BaseInfoExtractor): - _VALID_URL = r'https?://mp3\.zing\.vn/(?:album|playlist)/(?P[^/]+)/(?P\w+)\.html' - _TESTS = [{ + }, { + 'url': 'http://mp3.zing.vn/video-clip/Let-It-Go-Frozen-OST-Sungha-Jung/ZW6BAEA0.html', + 'md5': '870295a9cd8045c0e15663565902618d', + 'info_dict': { + 'id': 'ZW6BAEA0', + 'title': 'Let It Go (Frozen OST)', + 'ext': 'mp4', + }, + }, { 'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', 'info_dict': { '_type': 'playlist', 'id': 'ZWZBWDAF', - 'title': 'Lâu Đài Tình Ái - Bằng Kiều ft. Minh Tuyết | Album 320 lossless', + 'title': 'Lâu Đài Tình Ái - Bằng Kiều,Minh Tuyết | Album 320 lossless', }, 'playlist_count': 10, + 'skip': 'removed at the request of the owner', }, { 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html', 'only_matching': True, }] - IE_NAME = 'zingmp3:album' - IE_DESC = 'mp3.zing.vn albums' + IE_NAME = 'zingmp3' + IE_DESC = 'mp3.zing.vn' def _real_extract(self, url): - matched = re.match(self._VALID_URL, url) - slug = matched.group('slug') - album_id = matched.group('album_id') + page_id = self._match_id(url) - webpage = self._download_webpage( - 'http://mp3.zing.vn/album/%s/%s.html' % (slug, album_id), album_id) - player_xml_url = self._search_regex( - r'&xmlURL=(?P[^&]+)&', webpage, 'player xml url') + webpage = self._download_webpage(url, page_id) - return self._extract_player_xml( - player_xml_url, album_id, - playlist_title=self._og_search_title(webpage)) + player_json_url = self._search_regex([ + r'data-xml="([^"]+)', + 
r'&xmlURL=([^&]+)&' + ], webpage, 'player xml url') + + playlist_title = None + page_type = self._search_regex(r'/(?:html5)?xml/([^/-]+)', player_json_url, 'page type') + if page_type == 'video': + player_json_url = update_url_query(player_json_url, {'format': 'json'}) + else: + player_json_url = player_json_url.replace('/xml/', '/html5xml/') + if page_type == 'album': + playlist_title = self._og_search_title(webpage) + + return self._extract_player_json(player_json_url, page_id, page_type, playlist_title) diff --git a/youtube_dl/extractor/zippcast.py b/youtube_dl/extractor/zippcast.py deleted file mode 100644 index de819376d..000000000 --- a/youtube_dl/extractor/zippcast.py +++ /dev/null @@ -1,94 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - str_to_int, -) - - -class ZippCastIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?zippcast\.com/(?:video/|videoview\.php\?.*\bvplay=)(?P[0-9a-zA-Z]+)' - _TESTS = [{ - # m3u8, hq direct link - 'url': 'http://www.zippcast.com/video/c9cfd5c7e44dbc29c81', - 'md5': '5ea0263b5606866c4d6cda0fc5e8c6b6', - 'info_dict': { - 'id': 'c9cfd5c7e44dbc29c81', - 'ext': 'mp4', - 'title': '[Vinesauce] Vinny - Digital Space Traveler', - 'description': 'Muted on youtube, but now uploaded in it\'s original form.', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'vinesauce', - 'view_count': int, - 'categories': ['Entertainment'], - 'tags': list, - }, - }, { - # f4m, lq ipod direct link - 'url': 'http://www.zippcast.com/video/b79c0a233e9c6581775', - 'only_matching': True, - }, { - 'url': 'http://www.zippcast.com/videoview.php?vplay=c9cfd5c7e44dbc29c81&auto=no', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'http://www.zippcast.com/video/%s' % video_id, video_id) - - formats = [] - video_url = self._search_regex( - r']+src=(["\'])(?P.+?)\1', webpage, - 
'video url', default=None, group='url') - if video_url: - formats.append({ - 'url': video_url, - 'format_id': 'http', - 'preference': 0, # direct link is almost always of worse quality - }) - src_url = self._search_regex( - r'src\s*:\s*(?:escape\()?(["\'])(?Phttp://.+?)\1', - webpage, 'src', default=None, group='url') - ext = determine_ext(src_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - src_url, video_id, f4m_id='hds', fatal=False)) - self._sort_formats(formats) - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) or self._html_search_meta( - 'description', webpage) - uploader = self._search_regex( - r']+href="https?://[^/]+/profile/[^>]+>([^<]+)', - webpage, 'uploader', fatal=False) - thumbnail = self._og_search_thumbnail(webpage) - view_count = str_to_int(self._search_regex( - r'>([\d,.]+) views!', webpage, 'view count', fatal=False)) - - categories = re.findall( - r']+href="https?://[^/]+/categories/[^"]+">([^<]+),?<', - webpage) - tags = re.findall( - r']+href="https?://[^/]+/search/tags/[^"]+">([^<]+),?<', - webpage) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'view_count': view_count, - 'categories': categories, - 'tags': tags, - 'formats': formats, - } diff --git a/youtube_dl/options.py b/youtube_dl/options.py index c4a85b2c0..56f312f57 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import os.path import optparse +import re import sys from .downloader.external import list_external_downloaders @@ -93,8 +94,18 @@ def parseOpts(overrideArguments=None): setattr(parser.values, option.dest, value.split(',')) def _hide_login_info(opts): - opts = list(opts) - for private_opt in ['-p', 
'--password', '-u', '--username', '--video-password']: + PRIVATE_OPTS = ['-p', '--password', '-u', '--username', '--video-password'] + eqre = re.compile('^(?P' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$') + + def _scrub_eq(o): + m = eqre.match(o) + if m: + return m.group('key') + '=PRIVATE' + else: + return o + + opts = list(map(_scrub_eq, opts)) + for private_opt in PRIVATE_OPTS: try: i = opts.index(private_opt) opts[i + 1] = 'PRIVATE' @@ -412,7 +423,15 @@ def parseOpts(overrideArguments=None): downloader.add_option( '--fragment-retries', dest='fragment_retries', metavar='RETRIES', default=10, - help='Number of retries for a fragment (default is %default), or "infinite" (DASH only)') + help='Number of retries for a fragment (default is %default), or "infinite" (DASH and hlsnative only)') + downloader.add_option( + '--skip-unavailable-fragments', + action='store_true', dest='skip_unavailable_fragments', default=True, + help='Skip unavailable fragments (DASH and hlsnative only)') + general.add_option( + '--abort-on-unavailable-fragment', + action='store_false', dest='skip_unavailable_fragments', + help='Abort downloading when some fragment is not available') downloader.add_option( '--buffer-size', dest='buffersize', metavar='SIZE', default='1024', @@ -488,9 +507,20 @@ def parseOpts(overrideArguments=None): dest='bidi_workaround', action='store_true', help='Work around terminals that lack bidirectional text support. 
Requires bidiv or fribidi executable in PATH') workarounds.add_option( - '--sleep-interval', metavar='SECONDS', + '--sleep-interval', '--min-sleep-interval', metavar='SECONDS', dest='sleep_interval', type=float, - help='Number of seconds to sleep before each download.') + help=( + 'Number of seconds to sleep before each download when used alone ' + 'or a lower bound of a range for randomized sleep before each download ' + '(minimum possible number of seconds to sleep) when used along with ' + '--max-sleep-interval.')) + workarounds.add_option( + '--max-sleep-interval', metavar='SECONDS', + dest='max_sleep_interval', type=float, + help=( + 'Upper bound of a range for randomized sleep before each download ' + '(maximum possible number of seconds to sleep). Must only be used ' + 'along with --min-sleep-interval.')) verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') verbosity.add_option( @@ -606,22 +636,7 @@ def parseOpts(overrideArguments=None): filesystem.add_option( '-o', '--output', dest='outtmpl', metavar='TEMPLATE', - help=('Output filename template. Use %(title)s to get the title, ' - '%(uploader)s for the uploader name, %(uploader_id)s for the uploader nickname if different, ' - '%(autonumber)s to get an automatically incremented number, ' - '%(ext)s for the filename extension, ' - '%(format)s for the format description (like "22 - 1280x720" or "HD"), ' - '%(format_id)s for the unique id of the format (like YouTube\'s itags: "137"), ' - '%(upload_date)s for the upload date (YYYYMMDD), ' - '%(extractor)s for the provider (youtube, metacafe, etc), ' - '%(id)s for the video id, ' - '%(playlist_title)s, %(playlist_id)s, or %(playlist)s (=title if present, ID otherwise) for the playlist the video is in, ' - '%(playlist_index)s for the position in the playlist. ' - '%(height)s and %(width)s for the width and height of the video format. ' - '%(resolution)s for a textual description of the resolution of the video format. 
' - '%% for a literal percent. ' - 'Use - to output to stdout. Can also be used to download to a different directory, ' - 'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .')) + help=('Output filename template, see the "OUTPUT TEMPLATE" for all the info')) filesystem.add_option( '--autonumber-size', dest='autonumber_size', metavar='NUMBER', diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index c1e9eb159..fa99b0c2a 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -363,10 +363,8 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): input_files = [filename] + sub_filenames opts = [ - '-map', '0:v', - '-c:v', 'copy', - '-map', '0:a', - '-c:a', 'copy', + '-map', '0', + '-c', 'copy', # Don't copy the existing subtitles, we may be running the # postprocessor a second time '-map', '-0:s', diff --git a/youtube_dl/postprocessor/metadatafromtitle.py b/youtube_dl/postprocessor/metadatafromtitle.py index 42377fa0f..920573da9 100644 --- a/youtube_dl/postprocessor/metadatafromtitle.py +++ b/youtube_dl/postprocessor/metadatafromtitle.py @@ -3,11 +3,6 @@ from __future__ import unicode_literals import re from .common import PostProcessor -from ..utils import PostProcessingError - - -class MetadataFromTitlePPError(PostProcessingError): - pass class MetadataFromTitlePP(PostProcessor): @@ -38,7 +33,8 @@ class MetadataFromTitlePP(PostProcessor): title = info['title'] match = re.match(self._titleregex, title) if match is None: - raise MetadataFromTitlePPError('Could not interpret title of video as "%s"' % self._titleformat) + self._downloader.to_screen('[fromtitle] Could not interpret title of video as "%s"' % self._titleformat) + return [], info for attribute, value in match.groupdict().items(): value = match.group(attribute) info[attribute] = value diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e6e0155b4..ed199c4ad 100644 --- a/youtube_dl/utils.py +++ 
b/youtube_dl/utils.py @@ -47,6 +47,7 @@ from .compat import ( compat_socket_create_connection, compat_str, compat_struct_pack, + compat_struct_unpack, compat_urllib_error, compat_urllib_parse, compat_urllib_parse_urlencode, @@ -121,6 +122,7 @@ DATE_FORMATS = ( '%Y %m %d', '%Y-%m-%d', '%Y/%m/%d', + '%Y/%m/%d %H:%M', '%Y/%m/%d %H:%M:%S', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M:%S.%f', @@ -1101,7 +1103,7 @@ def unified_timestamp(date_str, day_first=True): date_str = date_str.replace(',', ' ') - pm_delta = datetime.timedelta(hours=12 if re.search(r'(?i)PM', date_str) else 0) + pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0 timezone, date_str = extract_timezone(date_str) # Remove AM/PM + timezone @@ -1109,13 +1111,13 @@ def unified_timestamp(date_str, day_first=True): for expression in date_formats(day_first): try: - dt = datetime.datetime.strptime(date_str, expression) - timezone + pm_delta + dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta) return calendar.timegm(dt.timetuple()) except ValueError: pass timetuple = email.utils.parsedate_tz(date_str) if timetuple: - return calendar.timegm(timetuple.timetuple()) + return calendar.timegm(timetuple) + pm_delta * 3600 def determine_ext(url, default_ext='unknown_video'): @@ -1502,38 +1504,63 @@ def parse_filesize(s): _UNIT_TABLE = { 'B': 1, 'b': 1, + 'bytes': 1, 'KiB': 1024, 'KB': 1000, 'kB': 1024, 'Kb': 1000, + 'kb': 1000, + 'kilobytes': 1000, + 'kibibytes': 1024, 'MiB': 1024 ** 2, 'MB': 1000 ** 2, 'mB': 1024 ** 2, 'Mb': 1000 ** 2, + 'mb': 1000 ** 2, + 'megabytes': 1000 ** 2, + 'mebibytes': 1024 ** 2, 'GiB': 1024 ** 3, 'GB': 1000 ** 3, 'gB': 1024 ** 3, 'Gb': 1000 ** 3, + 'gb': 1000 ** 3, + 'gigabytes': 1000 ** 3, + 'gibibytes': 1024 ** 3, 'TiB': 1024 ** 4, 'TB': 1000 ** 4, 'tB': 1024 ** 4, 'Tb': 1000 ** 4, + 'tb': 1000 ** 4, + 'terabytes': 1000 ** 4, + 'tebibytes': 1024 ** 4, 'PiB': 1024 ** 5, 'PB': 1000 ** 5, 'pB': 1024 ** 5, 'Pb': 1000 ** 5, + 'pb': 1000 ** 5, + 
'petabytes': 1000 ** 5, + 'pebibytes': 1024 ** 5, 'EiB': 1024 ** 6, 'EB': 1000 ** 6, 'eB': 1024 ** 6, 'Eb': 1000 ** 6, + 'eb': 1000 ** 6, + 'exabytes': 1000 ** 6, + 'exbibytes': 1024 ** 6, 'ZiB': 1024 ** 7, 'ZB': 1000 ** 7, 'zB': 1024 ** 7, 'Zb': 1000 ** 7, + 'zb': 1000 ** 7, + 'zettabytes': 1000 ** 7, + 'zebibytes': 1024 ** 7, 'YiB': 1024 ** 8, 'YB': 1000 ** 8, 'yB': 1024 ** 8, 'Yb': 1000 ** 8, + 'yb': 1000 ** 8, + 'yottabytes': 1000 ** 8, + 'yobibytes': 1024 ** 8, } return lookup_unit_table(_UNIT_TABLE, s) @@ -1983,11 +2010,27 @@ US_RATINGS = { } +TV_PARENTAL_GUIDELINES = { + 'TV-Y': 0, + 'TV-Y7': 7, + 'TV-G': 0, + 'TV-PG': 0, + 'TV-14': 14, + 'TV-MA': 17, +} + + def parse_age_limit(s): - if s is None: + if type(s) == int: + return s if 0 <= s <= 21 else None + if not isinstance(s, compat_basestring): return None m = re.match(r'^(?P\d{1,2})\+?$', s) - return int(m.group('age')) if m else US_RATINGS.get(s) + if m: + return int(m.group('age')) + if s in US_RATINGS: + return US_RATINGS[s] + return TV_PARENTAL_GUIDELINES.get(s) def strip_jsonp(code): @@ -2012,14 +2055,14 @@ def js_to_json(code): }.get(m.group(0), m.group(0)), v[1:-1]) INTEGER_TABLE = ( - (r'^0[xX][0-9a-fA-F]+', 16), - (r'^0+[0-7]+', 8), + (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16), + (r'^(0+[0-7]+)\s*:?$', 8), ) for regex, base in INTEGER_TABLE: im = re.match(regex, v) if im: - i = int(im.group(0), base) + i = int(im.group(1), base) return '"%d":' % i if v.endswith(':') else '%d' % i return '"%s"' % v @@ -2105,7 +2148,7 @@ def mimetype2ext(mt): return ext _, _, res = mt.rpartition('/') - res = res.lower() + res = res.split(';')[0].strip().lower() return { '3gpp': '3gp', @@ -2123,7 +2166,9 @@ def mimetype2ext(mt): 'dash+xml': 'mpd', 'f4m': 'f4m', 'f4m+xml': 'f4m', + 'hds+xml': 'f4m', 'vnd.ms-sstr+xml': 'ism', + 'quicktime': 'mov', }.get(res, res) @@ -2139,7 +2184,7 @@ def parse_codecs(codecs_str): if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'): if not 
vcodec: vcodec = full_codec - elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac'): + elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'): if not acodec: acodec = full_codec else: @@ -2391,6 +2436,8 @@ def dfxp2srt(dfxp_data): def cli_option(params, command_option, param): param = params.get(param) + if param: + param = compat_str(param) return [command_option, param] if param is not None else [] @@ -2968,3 +3015,110 @@ def parse_m3u8_attributes(attrib): def urshift(val, n): return val >> n if val >= 0 else (val + 0x100000000) >> n + + +# Based on png2str() written by @gdkchan and improved by @yokrysty +# Originally posted at https://github.com/rg3/youtube-dl/issues/9706 +def decode_png(png_data): + # Reference: https://www.w3.org/TR/PNG/ + header = png_data[8:] + + if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR': + raise IOError('Not a valid PNG file.') + + int_map = {1: '>B', 2: '>H', 4: '>I'} + unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0] + + chunks = [] + + while header: + length = unpack_integer(header[:4]) + header = header[4:] + + chunk_type = header[:4] + header = header[4:] + + chunk_data = header[:length] + header = header[length:] + + header = header[4:] # Skip CRC + + chunks.append({ + 'type': chunk_type, + 'length': length, + 'data': chunk_data + }) + + ihdr = chunks[0]['data'] + + width = unpack_integer(ihdr[:4]) + height = unpack_integer(ihdr[4:8]) + + idat = b'' + + for chunk in chunks: + if chunk['type'] == b'IDAT': + idat += chunk['data'] + + if not idat: + raise IOError('Unable to read PNG data.') + + decompressed_data = bytearray(zlib.decompress(idat)) + + stride = width * 3 + pixels = [] + + def _get_pixel(idx): + x = idx % stride + y = idx // stride + return pixels[y][x] + + for y in range(height): + basePos = y * (1 + stride) + filter_type = decompressed_data[basePos] + + current_row = [] + + pixels.append(current_row) + + for x in range(stride): + color = decompressed_data[1 + 
basePos + x] + basex = y * stride + x + left = 0 + up = 0 + + if x > 2: + left = _get_pixel(basex - 3) + if y > 0: + up = _get_pixel(basex - stride) + + if filter_type == 1: # Sub + color = (color + left) & 0xff + elif filter_type == 2: # Up + color = (color + up) & 0xff + elif filter_type == 3: # Average + color = (color + ((left + up) >> 1)) & 0xff + elif filter_type == 4: # Paeth + a = left + b = up + c = 0 + + if x > 2 and y > 0: + c = _get_pixel(basex - stride - 3) + + p = a + b - c + + pa = abs(p - a) + pb = abs(p - b) + pc = abs(p - c) + + if pa <= pb and pa <= pc: + color = (color + a) & 0xff + elif pb <= pc: + color = (color + b) & 0xff + else: + color = (color + c) & 0xff + + current_row.append(color) + + return width, height, pixels diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 56f9f5986..5be8c0122 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.07.13' +__version__ = '2016.09.03'