diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index eeac09d5d..27257ee0a 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.16*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.16** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.28*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with an outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.28** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.06.16 +[debug] youtube-dl version 2016.07.28 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000..f24bb4b09 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,22 @@ +## Please follow the guide below + +- You will be asked some questions; please read them **carefully** and answer honestly +- Put an `x` into all the boxes [ ] relevant to your *pull request* (like this: [x]) +- Use the *Preview* tab to see how your *pull request* will actually look + +--- + +### Before submitting a *pull request* make sure you have: +- [ ] At least skimmed through [adding new extractor tutorial](https://github.com/rg3/youtube-dl#adding-support-for-a-new-site) and [youtube-dl coding conventions](https://github.com/rg3/youtube-dl#youtube-dl-coding-conventions) sections +- [ ] [Searched](https://github.com/rg3/youtube-dl/search?q=is%3Apr&type=Issues) the bugtracker for similar pull requests + +### What is the purpose of your *pull request*? +- [ ] Bug fix +- [ ] New extractor +- [ ] New feature + +--- + +### Description of your *pull request* and other information + +Explanation of your *pull request* in arbitrary form goes here. Please make sure the description explains the purpose and effect of your *pull request* and is worded well enough to be understood. Provide as much context and examples as possible.
diff --git a/.travis.yml b/.travis.yml index 136c339f0..c74c9cc12 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,9 +7,6 @@ python: - "3.4" - "3.5" sudo: false -install: - - bash ./devscripts/install_srelay.sh - - export PATH=$PATH:$(pwd)/tmp/srelay-0.4.8b6 script: nosetests test --verbose notifications: email: diff --git a/AUTHORS b/AUTHORS index cdf655c39..890c827a0 100644 --- a/AUTHORS +++ b/AUTHORS @@ -175,3 +175,7 @@ Tomáš Čech Déstin Reed Roman Tsiupa Artur Krysiak +Jakub Adam Wieczorek +Aleksandar Topuzović +Nehal Patel +Rob van Bekkum diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a59fac9b2..fbf0ab7e8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -97,9 +97,17 @@ If you want to add support for a new site, first of all **make sure** this site After you have ensured this site is distributing it's content legally, you can follow this quick list (assuming your service is called `yourextractor`): 1. [Fork this repository](https://github.com/rg3/youtube-dl/fork) -2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git` -3. Start a new git branch with `cd youtube-dl; git checkout -b yourextractor` +2. Check out the source code with: + + git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git + +3. Start a new git branch with + + cd youtube-dl + git checkout -b yourextractor + 4. Start with this simple template and save it to `youtube_dl/extractor/yourextractor.py`: + ```python # coding: utf-8 from __future__ import unicode_literals @@ -143,16 +151,148 @@ After you have ensured this site is distributing it's content legally, you can f 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want. -8. Keep in mind that the only mandatory fields in info dict for successful extraction process are `id`, `title` and either `url` or `formats`, i.e. these are the critical data the extraction does not make any sense without. This means that [any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L148-L252) apart from aforementioned mandatory ones should be treated **as optional** and extraction should be **tolerate** to situations when sources for these fields can potentially be unavailable (even if they always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. For example, if you have some intermediate dict `meta` that is a source of metadata and it has a key `summary` that you want to extract and put into resulting info dict as `description`, you should be ready that this key may be missing from the `meta` dict, i.e. you should extract it as `meta.get('summary')` and not `meta['summary']`. 
Similarly, you should pass `fatal=False` when extracting data from a webpage with `_search_regex/_html_search_regex`. -9. Check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. -10. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: +8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. +9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: $ git add youtube_dl/extractor/extractors.py $ git add youtube_dl/extractor/yourextractor.py $ git commit -m '[yourextractor] Add new extractor' $ git push origin yourextractor -11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. +10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. In any case, thank you very much for your contributions! +## youtube-dl coding conventions + +This section introduces guidelines for writing idiomatic, robust and future-proof extractor code. + +Extractors are very fragile by nature since they depend on the layout of the source data provided by a 3rd-party media hoster outside of your control, and this layout tends to change. As an extractor implementer your task is not only to write code that will extract media links and metadata correctly, but also to minimize the code's dependency on the source's layout changes and even to make the code foresee potential future changes and be ready for them. This is important because it allows the extractor not to break on minor layout changes, thus keeping old youtube-dl versions working. Even though such breakage is easily fixed by releasing a new version of youtube-dl with the fix incorporated, all the previous versions remain broken in all repositories and distros' packages, which may not be so prompt in fetching the update from us. Needless to say, some may never receive an update at all, as can happen with non-rolling-release distros. + +### Mandatory and optional metafields + +For extraction to work youtube-dl relies on the metadata your extractor extracts and provides to youtube-dl, expressed as an [information dictionary](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L75-L257) or simply an *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by youtube-dl: + + - `id` (media identifier) + - `title` (media title) + - `url` (media download URL) or `formats` + +In fact only the last option is technically mandatory (i.e. if you can't figure out the download location of the media, the extraction does not make any sense). But by convention youtube-dl also treats `id` and `title` as mandatory. Thus the aforementioned metafields are the critical data without which the extraction does not make any sense, and if any of them fails to be extracted the extractor is considered completely broken.
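+ +To make this concrete, here is a minimal sketch of what a `_real_extract` method might return once the mandatory fields are extracted (`video_id`, `title` and `video_url` are placeholder names assumed to have been obtained earlier): + +```python +return { + 'id': video_id, + 'title': title, + 'url': video_url, # or 'formats': formats when several variants are available +} +```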
+ +[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257) apart from the aforementioned ones is considered **optional**. That means that extraction should be **tolerant** of situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of the general-purpose mandatory fields. + +#### Example + +Say you have some source dictionary `meta` that you've fetched as JSON with an HTTP request and it has a key `summary`: + +```python +meta = self._download_json(url, video_id) +``` + +Assume at this point `meta`'s layout is: + +```python +{ + ... + "summary": "some fancy summary text", + ... +} +``` + +Assume you want to extract `summary` and put it into the resulting info dict as `description`. Since `description` is an optional metafield you should be ready for this key to be missing from the `meta` dict, so you should extract it like: + +```python +description = meta.get('summary') # correct +``` + +and not like: + +```python +description = meta['summary'] # incorrect +``` + +The latter will break the extraction process with a `KeyError` if `summary` disappears from `meta` at some later time, but with the former approach extraction will just go ahead with `description` set to `None`, which is perfectly fine (remember `None` is equivalent to the absence of data). + +Similarly, you should pass `fatal=False` when extracting optional data from a webpage with `_search_regex`, `_html_search_regex` or similar methods, for instance: + +```python +description = self._search_regex( + r'<span[^>]+id="title"[^>]*>([^<]+)<', + webpage, 'description', fatal=False) +``` + +With `fatal` set to `False`, if `_search_regex` fails to extract `description` it will emit a warning and continue extraction. + +You can also pass `default=<some fallback value>`, for example: + +```python +description = self._search_regex( + r'<span[^>]+id="title"[^>]*>([^<]+)<', + webpage, 'description', default=None) +``` + +On failure this code will silently continue the extraction with `description` set to `None`. That is useful for metafields that may or may not be present. + +### Provide fallbacks + +When extracting metadata try to cover several scenarios. For example, if `title` is present in several places/sources, try extracting it from at least some of them. This makes the extraction more future-proof in case some of the sources become unavailable. + +#### Example + +Say `meta` from the previous example has a `title` and you are about to extract it. Since `title` is a mandatory meta field you should end up with something like: + +```python +title = meta['title'] +``` + +If `title` disappears from `meta` in the future due to some changes on the hoster's side, the extraction will fail, since `title` is mandatory. That's expected. + +Assume that you have another source you can extract `title` from, for example the `og:title` HTML meta tag of a `webpage`. In this case you can provide a fallback scenario: + +```python +title = meta.get('title') or self._og_search_title(webpage) +``` + +This code will try to extract from `meta` first, and if that fails it will try extracting `og:title` from the `webpage`. + +### Make regular expressions flexible + +When using regular expressions, try to write them in a fuzzy and flexible way.
+ +#### Example + +Say you need to extract `title` from the following HTML code: + +```html +<span style="position: absolute; left: 910px; width: 90px; float: right; z-index: 9999;" class="title">some fancy title</span> +``` + +The code for that task should look similar to: + +```python +title = self._search_regex( + r'<span[^>]+class="title"[^>]*>([^<]+)', webpage, 'title') +``` + +Or even better: + +```python +title = self._search_regex( + r'<span[^>]+class=(["\'])title\1[^>]*>(?P<title>[^<]+)', + webpage, 'title', group='title') +``` + +Note how you tolerate potential changes in the `style` attribute's value or a switch from double quotes to single quotes for the `class` attribute. + +The code definitely should not look like: + +```python +title = self._search_regex( + r'<span style="position: absolute; left: 910px; width: 90px; float: right; z-index: 9999;" class="title">(.*?)</span>', + webpage, 'title', group='title') +``` + +### Use safe conversion functions + +Wrap all extracted numeric data in the safe conversion functions from `utils`: `int_or_none`, `float_or_none`. Use them for string-to-number conversions as well. + diff --git a/README.md b/README.md index f1e59542d..a9f3001a6 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ youtube-dl - download videos from youtube.com or other video platforms To install it right away for all UNIX users (Linux, OS X, etc.), type: - sudo curl -L https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl + sudo curl -L https://yt-dl.org/downloads/latest/youtube-dl -o /usr/local/bin/youtube-dl sudo chmod a+rx /usr/local/bin/youtube-dl If you do not have curl, you can alternatively use a recent wget: @@ -44,7 +44,7 @@ Or with [MacPorts](https://www.macports.org/): Alternatively, refer to the [developer instructions](#developer-instructions) for how to check out and work with the git repository. For further options, including PGP signatures, see the [youtube-dl Download Page](https://rg3.github.io/youtube-dl/download.html). # DESCRIPTION -**youtube-dl** is a small command-line program to download videos from +**youtube-dl** is a command-line program to download videos from YouTube.com and a few more sites. It requires the Python interpreter, version 2.6, 2.7, or 3.2+, and it is not platform specific. It should work on your Unix box, on Windows or on Mac OS X. It is released to the public domain, @@ -103,9 +103,9 @@ which means you can modify it, redistribute it or use it however you like. (experimental) -6, --force-ipv6 Make all connections via IPv6 (experimental) - --cn-verification-proxy URL Use this proxy to verify the IP address for - some Chinese sites. The default proxy - specified by --proxy (or none, if the + --geo-verification-proxy URL Use this proxy to verify the IP address for + some geo-restricted sites. The default + proxy specified by --proxy (or none, if the options is not present) is used for the actual downloading. (experimental) @@ -424,7 +424,7 @@ which means you can modify it, redistribute it or use it however you like. # CONFIGURATION -You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux and OS X, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`. +You can configure youtube-dl by placing any supported command line option in a configuration file.
On Linux and OS X, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`. Note that by default the configuration file may not exist, so you may need to create it yourself. For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime, use a proxy and save all videos under `Movies` directory in your home directory: ``` @@ -432,6 +432,7 @@ For example, with the following configuration file youtube-dl will always extrac --no-mtime --proxy 127.0.0.1:3128 -o ~/Movies/%(title)s.%(ext)s +# Lines starting with # are comments ``` Note that options in configuration file are just the same options aka switches used in regular command line calls thus there **must be no whitespace** after `-` or `--`, e.g. `-o` or `--proxy` but not `- o` or `-- proxy`. @@ -890,9 +891,17 @@ If you want to add support for a new site, first of all **make sure** this site After you have ensured this site is distributing it's content legally, you can follow this quick list (assuming your service is called `yourextractor`): 1. [Fork this repository](https://github.com/rg3/youtube-dl/fork) -2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git` -3. Start a new git branch with `cd youtube-dl; git checkout -b yourextractor` +2. Check out the source code with: + + git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git + +3. Start a new git branch with + + cd youtube-dl + git checkout -b yourextractor + 4. Start with this simple template and save it to `youtube_dl/extractor/yourextractor.py`: + ```python # coding: utf-8 from __future__ import unicode_literals @@ -936,19 +945,151 @@ After you have ensured this site is distributing it's content legally, you can f 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want.
This means that [any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L148-L252) apart from aforementioned mandatory ones should be treated **as optional** and extraction should be **tolerate** to situations when sources for these fields can potentially be unavailable (even if they always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. For example, if you have some intermediate dict `meta` that is a source of metadata and it has a key `summary` that you want to extract and put into resulting info dict as `description`, you should be ready that this key may be missing from the `meta` dict, i.e. you should extract it as `meta.get('summary')` and not `meta['summary']`. Similarly, you should pass `fatal=False` when extracting data from a webpage with `_search_regex/_html_search_regex`. -9. Check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. -10. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: +8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. +9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: $ git add youtube_dl/extractor/extractors.py $ git add youtube_dl/extractor/yourextractor.py $ git commit -m '[yourextractor] Add new extractor' $ git push origin yourextractor -11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. +10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. In any case, thank you very much for your contributions! +## youtube-dl coding conventions + +This section introduces guidelines for writing idiomatic, robust and future-proof extractor code. + +Extractors are very fragile by nature since they depend on the layout of the source data provided by a 3rd-party media hoster outside of your control, and this layout tends to change. As an extractor implementer your task is not only to write code that will extract media links and metadata correctly, but also to minimize the code's dependency on the source's layout changes and even to make the code foresee potential future changes and be ready for them. This is important because it allows the extractor not to break on minor layout changes, thus keeping old youtube-dl versions working. Even though such breakage is easily fixed by releasing a new version of youtube-dl with the fix incorporated, all the previous versions remain broken in all repositories and distros' packages, which may not be so prompt in fetching the update from us. Needless to say, some may never receive an update at all, as can happen with non-rolling-release distros.
+ +### Mandatory and optional metafields + +For extraction to work youtube-dl relies on the metadata your extractor extracts and provides to youtube-dl, expressed as an [information dictionary](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L75-L257) or simply an *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by youtube-dl: + + - `id` (media identifier) + - `title` (media title) + - `url` (media download URL) or `formats` + +In fact only the last option is technically mandatory (i.e. if you can't figure out the download location of the media, the extraction does not make any sense). But by convention youtube-dl also treats `id` and `title` as mandatory. Thus the aforementioned metafields are the critical data without which the extraction does not make any sense, and if any of them fails to be extracted the extractor is considered completely broken. + +[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257) apart from the aforementioned ones is considered **optional**. That means that extraction should be **tolerant** of situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of the general-purpose mandatory fields. + +#### Example + +Say you have some source dictionary `meta` that you've fetched as JSON with an HTTP request and it has a key `summary`: + +```python +meta = self._download_json(url, video_id) +``` + +Assume at this point `meta`'s layout is: + +```python +{ + ... + "summary": "some fancy summary text", + ... +} +``` + +Assume you want to extract `summary` and put it into the resulting info dict as `description`. Since `description` is an optional metafield you should be ready for this key to be missing from the `meta` dict, so you should extract it like: + +```python +description = meta.get('summary') # correct +``` + +and not like: + +```python +description = meta['summary'] # incorrect +``` + +The latter will break the extraction process with a `KeyError` if `summary` disappears from `meta` at some later time, but with the former approach extraction will just go ahead with `description` set to `None`, which is perfectly fine (remember `None` is equivalent to the absence of data). + +Similarly, you should pass `fatal=False` when extracting optional data from a webpage with `_search_regex`, `_html_search_regex` or similar methods, for instance: + +```python +description = self._search_regex( + r'<span[^>]+id="title"[^>]*>([^<]+)<', + webpage, 'description', fatal=False) +``` + +With `fatal` set to `False`, if `_search_regex` fails to extract `description` it will emit a warning and continue extraction. + +You can also pass `default=<some fallback value>`, for example: + +```python +description = self._search_regex( + r'<span[^>]+id="title"[^>]*>([^<]+)<', + webpage, 'description', default=None) +``` + +On failure this code will silently continue the extraction with `description` set to `None`. That is useful for metafields that may or may not be present. + +### Provide fallbacks + +When extracting metadata try to cover several scenarios. For example, if `title` is present in several places/sources, try extracting it from at least some of them. This makes the extraction more future-proof in case some of the sources become unavailable. + +#### Example + +Say `meta` from the previous example has a `title` and you are about to extract it.
Since `title` is a mandatory meta field you should end up with something like: + +```python +title = meta['title'] +``` + +If `title` disappears from `meta` in the future due to some changes on the hoster's side, the extraction will fail, since `title` is mandatory. That's expected. + +Assume that you have another source you can extract `title` from, for example the `og:title` HTML meta tag of a `webpage`. In this case you can provide a fallback scenario: + +```python +title = meta.get('title') or self._og_search_title(webpage) +``` + +This code will try to extract from `meta` first, and if that fails it will try extracting `og:title` from the `webpage`. + +### Make regular expressions flexible + +When using regular expressions, try to write them in a fuzzy and flexible way. + +#### Example + +Say you need to extract `title` from the following HTML code: + +```html +<span style="position: absolute; left: 910px; width: 90px; float: right; z-index: 9999;" class="title">some fancy title</span> +``` + +The code for that task should look similar to: + +```python +title = self._search_regex( + r'<span[^>]+class="title"[^>]*>([^<]+)', webpage, 'title') +``` + +Or even better: + +```python +title = self._search_regex( + r'<span[^>]+class=(["\'])title\1[^>]*>(?P<title>[^<]+)', + webpage, 'title', group='title') +``` + +Note how you tolerate potential changes in the `style` attribute's value or a switch from double quotes to single quotes for the `class` attribute. + +The code definitely should not look like: + +```python +title = self._search_regex( + r'<span style="position: absolute; left: 910px; width: 90px; float: right; z-index: 9999;" class="title">(.*?)</span>', + webpage, 'title', group='title') +``` + +### Use safe conversion functions + +Wrap all extracted numeric data in the safe conversion functions from `utils`: `int_or_none`, `float_or_none`. Use them for string-to-number conversions as well. + # EMBEDDING YOUTUBE-DL youtube-dl makes the best effort to be a good command-line program, and thus should be callable from any programming language. If you encounter any problems parsing its output, feel free to [create a report](https://github.com/rg3/youtube-dl/issues/new).
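+ +For instance, here is a minimal embedding sketch (assuming youtube-dl is installed as a Python package; the keys of the options dict roughly mirror the command-line flags, and the URL is a well-known test video): + +```python +from __future__ import unicode_literals +import youtube_dl + +ydl_opts = {} # e.g. {'outtmpl': '%(title)s.%(ext)s'} +with youtube_dl.YoutubeDL(ydl_opts) as ydl: + ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc']) +```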
diff --git a/devscripts/gh-pages/generate-download.py b/devscripts/gh-pages/generate-download.py index 392e3ba21..fcd7e1dff 100755 --- a/devscripts/gh-pages/generate-download.py +++ b/devscripts/gh-pages/generate-download.py @@ -15,13 +15,9 @@ data = urllib.request.urlopen(URL).read() with open('download.html.in', 'r', encoding='utf-8') as tmplf: template = tmplf.read() -md5sum = hashlib.md5(data).hexdigest() -sha1sum = hashlib.sha1(data).hexdigest() sha256sum = hashlib.sha256(data).hexdigest() template = template.replace('@PROGRAM_VERSION@', version) template = template.replace('@PROGRAM_URL@', URL) -template = template.replace('@PROGRAM_MD5SUM@', md5sum) -template = template.replace('@PROGRAM_SHA1SUM@', sha1sum) template = template.replace('@PROGRAM_SHA256SUM@', sha256sum) template = template.replace('@EXE_URL@', versions_info['versions'][version]['exe'][0]) template = template.replace('@EXE_SHA256SUM@', versions_info['versions'][version]['exe'][1]) diff --git a/devscripts/install_srelay.sh b/devscripts/install_srelay.sh deleted file mode 100755 index 33ce8a3f7..000000000 --- a/devscripts/install_srelay.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -mkdir -p tmp && cd tmp -wget -N http://downloads.sourceforge.net/project/socks-relay/socks-relay/srelay-0.4.8/srelay-0.4.8b6.tar.gz -tar zxvf srelay-0.4.8b6.tar.gz -cd srelay-0.4.8b6 -./configure -make diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index b5a8b9190..9a79c2bc5 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -14,15 +14,17 @@ if os.path.exists(lazy_extractors_filename): os.remove(lazy_extractors_filename) from youtube_dl.extractor import _ALL_CLASSES -from youtube_dl.extractor.common import InfoExtractor +from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor with open('devscripts/lazy_load_template.py', 'rt') as f: module_template = f.read() -module_contents = [module_template + '\n' + getsource(InfoExtractor.suitable)] +module_contents = [ + module_template + '\n' + getsource(InfoExtractor.suitable) + '\n', + 'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n'] ie_template = ''' -class {name}(LazyLoadExtractor): +class {name}({bases}): _VALID_URL = {valid_url!r} _module = '{module}' ''' @@ -34,10 +36,20 @@ make_valid_template = ''' ''' +def get_base_name(base): + if base is InfoExtractor: + return 'LazyLoadExtractor' + elif base is SearchInfoExtractor: + return 'LazyLoadSearchExtractor' + else: + return base.__name__ + + def build_lazy_ie(ie, name): valid_url = getattr(ie, '_VALID_URL', None) s = ie_template.format( name=name, + bases=', '.join(map(get_base_name, ie.__bases__)), valid_url=valid_url, module=ie.__module__) if ie.suitable.__func__ is not InfoExtractor.suitable.__func__: @@ -47,12 +59,35 @@ def build_lazy_ie(ie, name): s += make_valid_template.format(valid_url=ie._make_valid_url()) return s +# find the correct sorting and add the required base classes so that subclasses +# can be correctly created +classes = _ALL_CLASSES[:-1] +ordered_cls = [] +while classes: + for c in classes[:]: + bases = set(c.__bases__) - set((object, InfoExtractor, SearchInfoExtractor)) + stop = False + for b in bases: + if b not in classes and b not in ordered_cls: + if b.__name__ == 'GenericIE': + exit() + classes.insert(0, b) + stop = True + if stop: + break + if all(b in ordered_cls for b in bases): + ordered_cls.append(c) + classes.remove(c) + break +ordered_cls.append(_ALL_CLASSES[-1]) + names = [] -for ie in
list(sorted(_ALL_CLASSES[:-1], key=lambda cls: cls.ie_key())) + _ALL_CLASSES[-1:]: - name = ie.ie_key() + 'IE' +for ie in ordered_cls: + name = ie.__name__ src = build_lazy_ie(ie, name) module_contents.append(src) - names.append(name) + if ie in _ALL_CLASSES: + names.append(name) module_contents.append( '_ALL_CLASSES = [{0}]'.format(', '.join(names))) diff --git a/devscripts/show-downloads-statistics.py b/devscripts/show-downloads-statistics.py new file mode 100644 index 000000000..e25d28411 --- /dev/null +++ b/devscripts/show-downloads-statistics.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python +from __future__ import unicode_literals + +import itertools +import json +import os +import re +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.compat import ( + compat_print, + compat_urllib_request, +) +from youtube_dl.utils import format_bytes + + +def format_size(bytes): + return '%s (%d bytes)' % (format_bytes(bytes), bytes) + + +total_bytes = 0 + +for page in itertools.count(1): + releases = json.loads(compat_urllib_request.urlopen( + 'https://api.github.com/repos/rg3/youtube-dl/releases?page=%s' % page + ).read().decode('utf-8')) + + if not releases: + break + + for release in releases: + compat_print(release['name']) + for asset in release['assets']: + asset_name = asset['name'] + total_bytes += asset['download_count'] * asset['size'] + if all(not re.match(p, asset_name) for p in ( + r'^youtube-dl$', + r'^youtube-dl-\d{4}\.\d{2}\.\d{2}(?:\.\d+)?\.tar\.gz$', + r'^youtube-dl\.exe$')): + continue + compat_print( + ' %s size: %s downloads: %d' + % (asset_name, format_size(asset['size']), asset['download_count'])) + +compat_print('total downloads traffic: %s' % format_size(total_bytes)) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 13315f4f4..1f89b1c14 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -14,6 +14,7 @@ - **8tracks** - **91porn** - **9gag** + - **9now.com.au** - **abc.net.au** - **Abc7News** - **abcnews** @@ -45,7 +46,7 @@ - **archive.org**: archive.org videos - **ARD** - **ARD:mediathek** - - **ARD:mediathek**: Saarländischer Rundfunk + - **Arkena** - **arte.tv** - **arte.tv:+7** - **arte.tv:cinema** @@ -74,6 +75,8 @@ - **bbc**: BBC - **bbc.co.uk**: BBC iPlayer - **bbc.co.uk:article**: BBC articles + - **bbc.co.uk:iplayer:playlist** + - **bbc.co.uk:playlist** - **BeatportPro** - **Beeg** - **BehindKink** @@ -104,6 +107,8 @@ - **canalc2.tv** - **Canalplus**: canalplus.fr, piwiplus.fr and d8.tv - **Canvas** + - **CarambaTV** + - **CarambaTVPage** - **CBC** - **CBCPlayer** - **CBS** @@ -124,6 +129,7 @@ - **cliphunter** - **ClipRs** - **Clipsyndicate** + - **CloserToTruth** - **cloudtime**: CloudTime - **Cloudy** - **Clubic** @@ -136,7 +142,7 @@ - **CollegeRama** - **ComCarCoff** - **ComedyCentral** - - **ComedyCentralShows**: The Daily Show / The Colbert Report + - **ComedyCentralTV** - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED - **Coub** - **Cracked** @@ -148,6 +154,8 @@ - **CSNNE** - **CSpan**: C-SPAN - **CtsNews**: 華視新聞 + - **CTV** + - **CTVNews** - **culturebox.francetvinfo.fr** - **CultureUnplugged** - **CWTV** @@ -218,6 +226,7 @@ - **Firstpost** - **FiveTV** - **Flickr** + - **Flipagram** - **Folketinget**: Folketinget (ft.dk; Danish parliament) - **FootyRoom** - 
**Formula1** @@ -236,6 +245,7 @@ - **FreeVideo** - **Funimation** - **FunnyOrDie** + - **Fusion** - **GameInformer** - **Gamekings** - **GameOne** @@ -243,7 +253,6 @@ - **Gamersyde** - **GameSpot** - **GameStar** - - **Gametrailers** - **Gazeta** - **GDCVault** - **generic**: Generic downloader that works on some sites @@ -269,6 +278,7 @@ - **Helsinki**: helsinki.fi - **HentaiStigma** - **HistoricFilms** + - **history:topic**: History.com Topic - **hitbox** - **hitbox:live** - **HornBunny** @@ -276,6 +286,8 @@ - **HotStar** - **Howcast** - **HowStuffWorks** + - **HRTi** + - **HRTiPlaylist** - **HuffPost**: Huffington Post - **Hypem** - **Iconosquare** @@ -303,6 +315,7 @@ - **jpopsuki.tv** - **JWPlatform** - **Kaltura** + - **Kamcord** - **KanalPlay**: Kanal 5/9/11 Play - **Kankan** - **Karaoketv** @@ -322,8 +335,10 @@ - **kuwo:mv**: 酷我音乐 - MV - **kuwo:singer**: 酷我音乐 - 歌手 - **kuwo:song**: 酷我音乐 - - **la7.tv** + - **la7.it** - **Laola1Tv** + - **Lcp** + - **LcpPlay** - **Le**: 乐视网 - **Learnr** - **Lecture2Go** @@ -355,6 +370,7 @@ - **MatchTV** - **MDR**: MDR.DE and KiKA - **media.ccc.de** + - **META** - **metacafe** - **Metacritic** - **Mgoon** @@ -381,10 +397,9 @@ - **MovieFap** - **Moviezine** - **MPORA** - - **MSNBC** + - **MSN** - **MTV** - **mtv.de** - - **mtviggy.com** - **mtvservices:embedded** - **MuenchenTV**: münchen.tv - **MusicPlayOn** @@ -424,7 +439,6 @@ - **Newstube** - **NextMedia**: 蘋果日報 - **NextMediaActionNews**: 蘋果日報 - 動新聞 - - **nextmovie.com** - **nfb**: National Film Board of Canada - **nfl.com** - **nhl.com** @@ -432,8 +446,11 @@ - **nhl.com:videocenter** - **nhl.com:videocenter:category**: NHL videocenter category - **nick.com** + - **nick.de** - **niconico**: ニコニコ動画 - **NiconicoPlaylist** + - **NineCNineMedia** + - **Nintendo** - **njoy**: N-JOY - **njoy:embed** - **Noco** @@ -461,9 +478,12 @@ - **NYTimes** - **NYTimesArticle** - **ocw.mit.edu** + - **OdaTV** - **Odnoklassniki** - **OktoberfestTV** - **on.aol.com** + - **onet.tv** + - **onet.tv:channel** - **OnionStudios** - **Ooyala** - **OoyalaExternal** @@ -497,8 +517,9 @@ - **plus.google**: Google Plus - **pluzz.francetv.fr** - **podomatic** + - **PolskieRadio** - **PornHd** - - **PornHub** + - **PornHub**: PornHub and Thumbzilla - **PornHubPlaylist** - **PornHubUserVideos** - **Pornotube** @@ -516,6 +537,7 @@ - **qqmusic:singer**: QQ音乐 - 歌手 - **qqmusic:toplist**: QQ音乐 - 排行榜 - **R7** + - **R7Article** - **radio.de** - **radiobremen** - **radiocanada** @@ -536,6 +558,7 @@ - **RICE** - **RingTV** - **RockstarGames** + - **RoosterTeeth** - **RottenTomatoes** - **Roxwel** - **RTBF** @@ -548,7 +571,9 @@ - **rtve.es:alacarta**: RTVE a la carta - **rtve.es:infantil**: RTVE infantil - **rtve.es:live**: RTVE.es live streams + - **rtve.es:television** - **RTVNH** + - **Rudo** - **RUHD** - **RulePorn** - **rutube**: Rutube videos @@ -581,8 +606,10 @@ - **Shared**: shared.sx and vivo.sx - **ShareSix** - **Sina** + - **SixPlay** + - **skynewsarabia:article** - **skynewsarabia:video** - - **skynewsarabia:video** + - **SkySports** - **Slideshare** - **Slutload** - **smotri**: Smotri.com @@ -614,12 +641,14 @@ - **SportBoxEmbed** - **SportDeutschland** - **Sportschau** + - **sr:mediathek**: Saarländischer Rundfunk - **SRGSSR** - **SRGSSRPlay**: srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites - **SSA** - **stanfordoc**: Stanford Open ClassRoom - **Steam** - **Stitcher** + - **Streamable** - **streamcloud.eu** - **StreamCZ** - **StreetVoice** @@ -667,6 +696,7 @@ - **TNAFlix** - **TNAFlixNetworkEmbed** - **toggle** + - 
**Tosh**: Tosh.0 - **tou.tv** - **Toypics**: Toypics user profile - **ToypicsUser**: Toypics user profile @@ -714,6 +744,7 @@ - **UDNEmbed**: 聯合影音 - **Unistra** - **Urort**: NRK P3 Urørt + - **URPlay** - **USAToday** - **ustream** - **ustream:channel** @@ -731,6 +762,7 @@ - **vh1.com** - **Vice** - **ViceShow** + - **Vidbit** - **Viddler** - **video.google:search**: Google Video search - **video.mit.edu** @@ -770,6 +802,7 @@ - **vine:user** - **vk**: VK - **vk:uservideos**: VK - User's Videos + - **vk:wallpost** - **vlive** - **Vodlocker** - **VoiceRepublic** @@ -838,6 +871,7 @@ - **youtube:search**: YouTube.com searches - **youtube:search:date**: YouTube.com searches, newest videos first - **youtube:search_url**: YouTube.com search URLs + - **youtube:shared** - **youtube:show**: YouTube.com (multi-season) shows - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication) - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword) diff --git a/setup.py b/setup.py index c1e923f71..508b27f37 100644 --- a/setup.py +++ b/setup.py @@ -21,25 +21,37 @@ try: import py2exe except ImportError: if len(sys.argv) >= 2 and sys.argv[1] == 'py2exe': - print("Cannot import py2exe", file=sys.stderr) + print('Cannot import py2exe', file=sys.stderr) exit(1) py2exe_options = { - "bundle_files": 1, - "compressed": 1, - "optimize": 2, - "dist_dir": '.', - "dll_excludes": ['w9xpopen.exe', 'crypt32.dll'], + 'bundle_files': 1, + 'compressed': 1, + 'optimize': 2, + 'dist_dir': '.', + 'dll_excludes': ['w9xpopen.exe', 'crypt32.dll'], } +# Get the version from youtube_dl/version.py without importing the package +exec(compile(open('youtube_dl/version.py').read(), + 'youtube_dl/version.py', 'exec')) + +DESCRIPTION = 'YouTube video downloader' +LONG_DESCRIPTION = 'Command-line program to download videos from YouTube.com and other video sites' + py2exe_console = [{ - "script": "./youtube_dl/__main__.py", - "dest_base": "youtube-dl", + 'script': './youtube_dl/__main__.py', + 'dest_base': 'youtube-dl', + 'version': __version__, + 'description': DESCRIPTION, + 'comments': LONG_DESCRIPTION, + 'product_name': 'youtube-dl', + 'product_version': __version__, }] py2exe_params = { 'console': py2exe_console, - 'options': {"py2exe": py2exe_options}, + 'options': {'py2exe': py2exe_options}, 'zipfile': None } @@ -72,7 +84,7 @@ else: params['scripts'] = ['bin/youtube-dl'] class build_lazy_extractors(Command): - description = "Build the extractor lazy loading module" + description = 'Build the extractor lazy loading module' user_options = [] def initialize_options(self): @@ -87,16 +99,11 @@ class build_lazy_extractors(Command): dry_run=self.dry_run, ) -# Get the version from youtube_dl/version.py without importing the package -exec(compile(open('youtube_dl/version.py').read(), - 'youtube_dl/version.py', 'exec')) - setup( name='youtube_dl', version=__version__, - description='YouTube video downloader', - long_description='Small command-line program to download videos from' - ' YouTube.com and other video sites.', + description=DESCRIPTION, + long_description=LONG_DESCRIPTION, url='https://github.com/rg3/youtube-dl', author='Ricardo Garcia', author_email='ytdl@yt-dl.org', @@ -112,17 +119,17 @@ setup( # test_requires = ['nosetest'], classifiers=[ - "Topic :: Multimedia :: Video", - "Development Status :: 5 - Production/Stable", - "Environment :: Console", - "License :: Public Domain", - "Programming Language :: Python :: 2.6", - "Programming Language :: Python :: 2.7", - "Programming 
Language :: Python :: 3", - "Programming Language :: Python :: 3.2", - "Programming Language :: Python :: 3.3", - "Programming Language :: Python :: 3.4", - "Programming Language :: Python :: 3.5", + 'Topic :: Multimedia :: Video', + 'Development Status :: 5 - Production/Stable', + 'Environment :: Console', + 'License :: Public Domain', + 'Programming Language :: Python :: 2.6', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.2', + 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', ], cmdclass={'build_lazy_extractors': build_lazy_extractors}, diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 6404ac89f..88e8ff904 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -11,7 +11,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL from youtube_dl.extractor.common import InfoExtractor from youtube_dl.extractor import YoutubeIE, get_info_extractor -from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError +from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError class TestIE(InfoExtractor): @@ -66,6 +66,11 @@ class TestInfoExtractor(unittest.TestCase): self.assertEqual(ie._html_search_meta('d', html), '4') self.assertEqual(ie._html_search_meta('e', html), '5') self.assertEqual(ie._html_search_meta('f', html), '6') + self.assertEqual(ie._html_search_meta(('a', 'b', 'c'), html), '1') + self.assertEqual(ie._html_search_meta(('c', 'b', 'a'), html), '3') + self.assertEqual(ie._html_search_meta(('z', 'x', 'c'), html), '3') + self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True) + self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True) def test_download_json(self): uri = encode_data_uri(b'{"foo": "blah"}', 'application/json') diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index ca25025e2..0dfe25c00 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -335,6 +335,40 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], f1['format_id']) + def test_audio_only_extractor_format_selection(self): + # For extractors with incomplete formats (all formats are audio-only or + # video-only) best and worst should fallback to corresponding best/worst + # video-only or audio-only formats (as per + # https://github.com/rg3/youtube-dl/pull/5556) + formats = [ + {'format_id': 'low', 'ext': 'mp3', 'preference': 1, 'vcodec': 'none', 'url': TEST_URL}, + {'format_id': 'high', 'ext': 'mp3', 'preference': 2, 'vcodec': 'none', 'url': TEST_URL}, + ] + info_dict = _make_result(formats) + + ydl = YDL({'format': 'best'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'high') + + ydl = YDL({'format': 'worst'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'low') + + def test_format_not_available(self): + formats = [ + {'format_id': 'regular', 'ext': 'mp4', 'height': 360, 'url': TEST_URL}, + {'format_id': 'video', 'ext': 'mp4', 'height': 720, 'acodec': 'none', 'url': TEST_URL}, + ] + info_dict = _make_result(formats) + + # This must fail since complete video-audio format does not 
match filter + # and extractor does not provide incomplete only formats (i.e. only + # video-only or audio-only). + ydl = YDL({'format': 'best[height>360]'}) + self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + def test_invalid_format_specs(self): def assert_syntax_error(format_spec): ydl = YDL({'format': format_spec}) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index f5af184e6..cd1cd4b24 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -6,6 +6,7 @@ from __future__ import unicode_literals import os import sys import unittest +import collections sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -100,8 +101,6 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch(':ytsubs', ['youtube:subscriptions']) self.assertMatch(':ytsubscriptions', ['youtube:subscriptions']) self.assertMatch(':ythistory', ['youtube:history']) - self.assertMatch(':thedailyshow', ['ComedyCentralShows']) - self.assertMatch(':tds', ['ComedyCentralShows']) def test_vimeo_matching(self): self.assertMatch('https://vimeo.com/channels/tributes', ['vimeo:channel']) @@ -130,6 +129,15 @@ class TestAllURLsMatching(unittest.TestCase): 'https://screen.yahoo.com/smartwatches-latest-wearable-gadgets-163745379-cbs.html', ['Yahoo']) + def test_no_duplicated_ie_names(self): + name_accu = collections.defaultdict(list) + for ie in self.ies: + name_accu[ie.IE_NAME.lower()].append(type(ie).__name__) + for (ie_name, ie_list) in name_accu.items(): + self.assertEqual( + len(ie_list), 1, + 'Multiple extractors with the same IE_NAME "%s" (%s)' % (ie_name, ', '.join(ie_list))) + if __name__ == '__main__': unittest.main() diff --git a/test/test_compat.py b/test/test_compat.py index f5317ac3e..b57424948 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -87,6 +87,8 @@ class TestCompat(unittest.TestCase): def test_compat_shlex_split(self): self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two']) + self.assertEqual(compat_shlex_split('-option "one\ntwo" \n -flag'), ['-option', 'one\ntwo', '-flag']) + self.assertEqual(compat_shlex_split('-val 中文'), ['-val', '中文']) def test_compat_etree_fromstring(self): xml = ''' diff --git a/test/test_http.py b/test/test_http.py index 5076ced51..fdc68ccb4 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -138,27 +138,27 @@ class TestProxy(unittest.TestCase): self.proxy_thread.daemon = True self.proxy_thread.start() - self.cn_proxy = compat_http_server.HTTPServer( - ('localhost', 0), _build_proxy_handler('cn')) - self.cn_port = http_server_port(self.cn_proxy) - self.cn_proxy_thread = threading.Thread(target=self.cn_proxy.serve_forever) - self.cn_proxy_thread.daemon = True - self.cn_proxy_thread.start() + self.geo_proxy = compat_http_server.HTTPServer( + ('localhost', 0), _build_proxy_handler('geo')) + self.geo_port = http_server_port(self.geo_proxy) + self.geo_proxy_thread = threading.Thread(target=self.geo_proxy.serve_forever) + self.geo_proxy_thread.daemon = True + self.geo_proxy_thread.start() def test_proxy(self): - cn_proxy = 'localhost:{0}'.format(self.cn_port) + geo_proxy = 'localhost:{0}'.format(self.geo_port) ydl = YoutubeDL({ 'proxy': 'localhost:{0}'.format(self.port), - 'cn_verification_proxy': cn_proxy, + 'geo_verification_proxy': geo_proxy, }) url = 'http://foo.com/bar' response = ydl.urlopen(url).read().decode('utf-8') self.assertEqual(response, 'normal: {0}'.format(url)) req = compat_urllib_request.Request(url) - req.add_header('Ytdl-request-proxy', cn_proxy) + 
req.add_header('Ytdl-request-proxy', geo_proxy) response = ydl.urlopen(req).read().decode('utf-8') - self.assertEqual(response, 'cn: {0}'.format(url)) + self.assertEqual(response, 'geo: {0}'.format(url)) def test_proxy_with_idn(self): ydl = YoutubeDL({ diff --git a/test/test_utils.py b/test/test_utils.py index b7ef51f8d..2273b5a10 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -33,6 +33,7 @@ from youtube_dl.utils import ( ExtractorError, find_xpath_attr, fix_xml_ampersands, + get_element_by_class, InAdvancePagedList, intlist_to_bytes, is_html, @@ -60,11 +61,13 @@ from youtube_dl.utils import ( timeconvert, unescapeHTML, unified_strdate, + unified_timestamp, unsmuggle_url, uppercase_escape, lowercase_escape, url_basename, urlencode_postdata, + urshift, update_url_query, version_tuple, xpath_with_ns, @@ -78,6 +81,7 @@ from youtube_dl.utils import ( cli_option, cli_valueless_option, cli_bool_option, + parse_codecs, ) from youtube_dl.compat import ( compat_chr, @@ -283,8 +287,28 @@ class TestUtil(unittest.TestCase): '20150202') self.assertEqual(unified_strdate('Feb 14th 2016 5:45PM'), '20160214') self.assertEqual(unified_strdate('25-09-2014'), '20140925') + self.assertEqual(unified_strdate('27.02.2016 17:30'), '20160227') self.assertEqual(unified_strdate('UNKNOWN DATE FORMAT'), None) + def test_unified_timestamps(self): + self.assertEqual(unified_timestamp('December 21, 2010'), 1292889600) + self.assertEqual(unified_timestamp('8/7/2009'), 1247011200) + self.assertEqual(unified_timestamp('Dec 14, 2012'), 1355443200) + self.assertEqual(unified_timestamp('2012/10/11 01:56:38 +0000'), 1349920598) + self.assertEqual(unified_timestamp('1968 12 10'), -33436800) + self.assertEqual(unified_timestamp('1968-12-10'), -33436800) + self.assertEqual(unified_timestamp('28/01/2014 21:00:00 +0100'), 1390939200) + self.assertEqual( + unified_timestamp('11/26/2014 11:30:00 AM PST', day_first=False), + 1417001400) + self.assertEqual( + unified_timestamp('2/2/2015 6:47:40 PM', day_first=False), + 1422902860) + self.assertEqual(unified_timestamp('Feb 14th 2016 5:45PM'), 1455471900) + self.assertEqual(unified_timestamp('25-09-2014'), 1411603200) + self.assertEqual(unified_timestamp('27.02.2016 17:30'), 1456594200) + self.assertEqual(unified_timestamp('UNKNOWN DATE FORMAT'), None) + def test_determine_ext(self): self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4') self.assertEqual(determine_ext('http://example.com/foo/bar/?download', None), None) @@ -383,6 +407,12 @@ class TestUtil(unittest.TestCase): self.assertEqual(res_url, url) self.assertEqual(res_data, None) + smug_url = smuggle_url(url, {'a': 'b'}) + smug_smug_url = smuggle_url(smug_url, {'c': 'd'}) + res_url, res_data = unsmuggle_url(smug_smug_url) + self.assertEqual(res_url, url) + self.assertEqual(res_data, {'a': 'b', 'c': 'd'}) + def test_shell_quote(self): args = ['ffmpeg', '-i', encodeFilename('ñ€ß\'.mp4')] self.assertEqual(shell_quote(args), """ffmpeg -i 'ñ€ß'"'"'.mp4'""") @@ -579,6 +609,29 @@ class TestUtil(unittest.TestCase): limit_length('foo bar baz asd', 12).startswith('foo bar')) self.assertTrue('...' 
in limit_length('foo bar baz asd', 12)) + def test_parse_codecs(self): + self.assertEqual(parse_codecs(''), {}) + self.assertEqual(parse_codecs('avc1.77.30, mp4a.40.2'), { + 'vcodec': 'avc1.77.30', + 'acodec': 'mp4a.40.2', + }) + self.assertEqual(parse_codecs('mp4a.40.2'), { + 'vcodec': 'none', + 'acodec': 'mp4a.40.2', + }) + self.assertEqual(parse_codecs('mp4a.40.5,avc1.42001e'), { + 'vcodec': 'avc1.42001e', + 'acodec': 'mp4a.40.5', + }) + self.assertEqual(parse_codecs('avc3.640028'), { + 'vcodec': 'avc3.640028', + 'acodec': 'none', + }) + self.assertEqual(parse_codecs(', h264,,newcodec,aac'), { + 'vcodec': 'h264', + 'acodec': 'aac', + }) + def test_escape_rfc3986(self): reserved = "!*'();:@&=+$,/?#[]" unreserved = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~' @@ -959,5 +1012,17 @@ The first line self.assertRaises(ValueError, encode_base_n, 0, 70) self.assertRaises(ValueError, encode_base_n, 0, 60, custom_table) + def test_urshift(self): + self.assertEqual(urshift(3, 1), 1) + self.assertEqual(urshift(-3, 1), 2147483646) + + def test_get_element_by_class(self): + html = ''' + <span class="foo bar">nice</span> + ''' + + self.assertEqual(get_element_by_class('foo', html), 'nice') + self.assertEqual(get_element_by_class('no-such-class', html), None) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 5036289b0..6551f086f 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -5,6 +5,7 @@ from __future__ import absolute_import, unicode_literals import collections import contextlib +import copy import datetime import errno import fileinput @@ -196,8 +197,8 @@ class YoutubeDL(object): prefer_insecure: Use HTTP instead of HTTPS to retrieve information. At the moment, this is only supported by YouTube. proxy: URL of the proxy server to use - cn_verification_proxy: URL of the proxy to use for IP address verification - on Chinese sites. (Experimental) + geo_verification_proxy: URL of the proxy to use for IP address verification + on geo-restricted sites. (Experimental) socket_timeout: Time to wait for unresponsive hosts, in seconds bidi_workaround: Work around buggy terminals without bidirectional text support, using fridibi @@ -304,6 +305,11 @@ class YoutubeDL(object): self.params.update(params) self.cache = Cache(self) + if self.params.get('cn_verification_proxy') is not None: + self.report_warning('--cn-verification-proxy is deprecated. 
Use --geo-verification-proxy instead.') + if self.params.get('geo_verification_proxy') is None: + self.params['geo_verification_proxy'] = self.params['cn_verification_proxy'] + if params.get('bidi_workaround', False): try: import pty @@ -1046,9 +1052,9 @@ class YoutubeDL(object): if isinstance(selector, list): fs = [_build_selector_function(s) for s in selector] - def selector_function(formats): + def selector_function(ctx): for f in fs: - for format in f(formats): + for format in f(ctx): yield format return selector_function elif selector.type == GROUP: @@ -1056,17 +1062,17 @@ elif selector.type == PICKFIRST: fs = [_build_selector_function(s) for s in selector.selector] - def selector_function(formats): + def selector_function(ctx): for f in fs: - picked_formats = list(f(formats)) + picked_formats = list(f(ctx)) if picked_formats: return picked_formats return [] elif selector.type == SINGLE: format_spec = selector.selector - def selector_function(formats): - formats = list(formats) + def selector_function(ctx): + formats = list(ctx['formats']) if not formats: return if format_spec == 'all': @@ -1079,9 +1085,10 @@ if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] if audiovideo_formats: yield audiovideo_formats[format_idx] - # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format - elif (all(f.get('acodec') != 'none' for f in formats) or - all(f.get('vcodec') != 'none' for f in formats)): + # for extractors with incomplete formats (audio only (soundcloud) + # or video only (imgur)) we will fall back to the best/worst + # {video,audio}-only format + elif ctx['incomplete_formats']: yield formats[format_idx] elif format_spec == 'bestaudio': audio_formats = [ @@ -1155,17 +1162,18 @@ } video_selector, audio_selector = map(_build_selector_function, selector.selector) - def selector_function(formats): - formats = list(formats) - for pair in itertools.product(video_selector(formats), audio_selector(formats)): + def selector_function(ctx): + for pair in itertools.product( + video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))): yield _merge(pair) filters = [self._build_format_filter(f) for f in selector.filters] - def final_selector(formats): + def final_selector(ctx): + ctx_copy = copy.deepcopy(ctx) for _filter in filters: - formats = list(filter(_filter, formats)) - return selector_function(formats) + ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats'])) + return selector_function(ctx_copy) return final_selector stream = io.BytesIO(format_spec.encode('utf-8')) @@ -1372,7 +1380,34 @@ req_format_list.append('best') req_format = '/'.join(req_format_list) format_selector = self.build_format_selector(req_format) - formats_to_download = list(format_selector(formats)) + + # While in format selection we may need to have access to the original + # format set in order to calculate some metrics or do some processing. + # For now we need to be able to guess whether the original formats provided + # by the extractor are incomplete or not (i.e. whether the extractor provides + # only video-only or audio-only formats) for proper format selection for + # extractors with such incomplete formats (see + # https://github.com/rg3/youtube-dl/pull/5556). + # Since formats may be filtered during format selection and may not match + # the original formats, the results may be incorrect.
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 4905674ad..2b34bf9c2 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -382,6 +382,8 @@ def _real_main(argv=None): 'external_downloader_args': external_downloader_args, 'postprocessor_args': postprocessor_args, 'cn_verification_proxy': opts.cn_verification_proxy, + 'geo_verification_proxy': opts.geo_verification_proxy, + } with YoutubeDL(ydl_opts) as ydl: diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 67db1c7c6..b8aaf5a46 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import binascii @@ -2594,15 +2595,19 @@ except ImportError: # Python < 3.3 return "'" + s.replace("'", "'\"'\"'") + "'" -if sys.version_info >= (2, 7, 3): +try: + args = shlex.split('中文') + assert (isinstance(args, list) and + isinstance(args[0], compat_str) and + args[0] == '中文') compat_shlex_split = shlex.split -else: +except (AssertionError, UnicodeEncodeError): # Working around shlex issue with unicode strings on some python 2 # versions (see http://bugs.python.org/issue1548891) def compat_shlex_split(s, comments=False, posix=True): if isinstance(s, compat_str): s = s.encode('utf-8') - return shlex.split(s, comments, posix) + return list(map(lambda s: s.decode('utf-8'), shlex.split(s, comments, posix))) def compat_ord(c): diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 8f88b0241..80c21d40b 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -196,6 +196,11 @@ def build_fragments_list(boot_info): first_frag_number = fragment_run_entry_table[0]['first'] fragments_counter = itertools.count(first_frag_number) for segment, fragments_count in segment_run_table['segment_run']: + # In some live HDS streams (for example Rai), `fragments_count` is + # abnormal and causes out-of-memory errors. It's OK to change the + # number of fragments for live streams as they are updated periodically. + if fragments_count == 4294967295 and boot_info['live']: + fragments_count = 2 for _ in range(fragments_count): res.append((segment, next(fragments_counter))) @@ -329,7 +334,11 @@ class F4mFD(FragmentFD): base_url = compat_urlparse.urljoin(man_url, media.attrib['url']) bootstrap_node = doc.find(_add_ns('bootstrapInfo')) - boot_info, bootstrap_url = self._parse_bootstrap_node(bootstrap_node, base_url) + # From the Adobe F4M 3.0 spec: + # The <baseURL> element SHALL be the base URL for all relative + # (HTTP-based) URLs in the manifest. If <baseURL> is not present, said + # URLs should be relative to the location of the containing document.
+ boot_info, bootstrap_url = self._parse_bootstrap_node(bootstrap_node, man_url) live = boot_info['live'] metadata_node = media.find(_add_ns('metadata')) if metadata_node is not None: diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 54f2108e9..3b7bb3508 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -2,14 +2,24 @@ from __future__ import unicode_literals import os.path import re +import binascii +try: + from Crypto.Cipher import AES + can_decrypt_frag = True +except ImportError: + can_decrypt_frag = False from .fragment import FragmentFD from .external import FFmpegFD -from ..compat import compat_urlparse +from ..compat import ( + compat_urlparse, + compat_struct_pack, +) from ..utils import ( encodeFilename, sanitize_open, + parse_m3u8_attributes, ) @@ -21,7 +31,7 @@ class HlsFD(FragmentFD): @staticmethod def can_download(manifest): UNSUPPORTED_FEATURES = ( - r'#EXT-X-KEY:METHOD=(?!NONE)', # encrypted streams [1] + r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)', # encrypted streams [1] r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2] # Live streams heuristic does not always work (e.g. geo restricted to Germany @@ -39,7 +49,9 @@ class HlsFD(FragmentFD): # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2 # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5 ) - return all(not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES) + check_results = [not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES] + check_results.append(can_decrypt_frag or '#EXT-X-KEY:METHOD=AES-128' not in manifest) + return all(check_results) def real_download(self, filename, info_dict): man_url = info_dict['url'] @@ -57,36 +69,60 @@ class HlsFD(FragmentFD): fd.add_progress_hook(ph) return fd.real_download(filename, info_dict) - fragment_urls = [] + total_frags = 0 for line in s.splitlines(): line = line.strip() if line and not line.startswith('#'): - segment_url = ( - line - if re.match(r'^https?://', line) - else compat_urlparse.urljoin(man_url, line)) - fragment_urls.append(segment_url) - # We only download the first fragment during the test - if self.params.get('test', False): - break + total_frags += 1 ctx = { 'filename': filename, - 'total_frags': len(fragment_urls), + 'total_frags': total_frags, } self._prepare_and_start_frag_download(ctx) + i = 0 + media_sequence = 0 + decrypt_info = {'METHOD': 'NONE'} frags_filenames = [] - for i, frag_url in enumerate(fragment_urls): - frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i) - success = ctx['dl'].download(frag_filename, {'url': frag_url}) - if not success: - return False - down, frag_sanitized = sanitize_open(frag_filename, 'rb') - ctx['dest_stream'].write(down.read()) - down.close() - frags_filenames.append(frag_sanitized) + for line in s.splitlines(): + line = line.strip() + if line: + if not line.startswith('#'): + frag_url = ( + line + if re.match(r'^https?://', line) + else compat_urlparse.urljoin(man_url, line)) + frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i) + success = ctx['dl'].download(frag_filename, {'url': frag_url}) + if not success: + return False + down, frag_sanitized = sanitize_open(frag_filename, 'rb') + frag_content = down.read() + down.close() + if decrypt_info['METHOD'] == 'AES-128': + iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) + frag_content = AES.new( + decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) + 
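The decryption step above follows the standard HLS AES-128 scheme: the key is fetched from the `#EXT-X-KEY` URI, and when the tag carries no explicit `IV` attribute the IV defaults to the fragment's media sequence number encoded as a 16-byte big-endian value, which is exactly what `compat_struct_pack('>8xq', media_sequence)` builds (eight zero bytes followed by a 64-bit integer). A self-contained sketch of that scheme, assuming PyCrypto (the same `Crypto.Cipher.AES` the patch probes for) and purely illustrative key/fragment data:

```python
import struct

from Crypto.Cipher import AES  # PyCrypto, the dependency probed for above

def decrypt_hls_fragment(fragment, key, media_sequence, explicit_iv=None):
    # Default IV per the HLS draft: the media sequence number as a
    # 128-bit big-endian integer ('>8xq' = 8 zero pad bytes + 64-bit int)
    iv = explicit_iv or struct.pack('>8xq', media_sequence)
    return AES.new(key, AES.MODE_CBC, iv).decrypt(fragment)

# Illustrative values only -- a real key comes from the #EXT-X-KEY URI
# and a real fragment from the media playlist
key = b'0123456789abcdef'  # 16-byte AES-128 key
fragment = AES.new(key, AES.MODE_CBC, struct.pack('>8xq', 7)).encrypt(b'x' * 32)
assert decrypt_hls_fragment(fragment, key, media_sequence=7) == b'x' * 32
```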
ctx['dest_stream'].write(frag_content) + frags_filenames.append(frag_sanitized) + # We only download the first fragment during the test + if self.params.get('test', False): + break + i += 1 + media_sequence += 1 + elif line.startswith('#EXT-X-KEY'): + decrypt_info = parse_m3u8_attributes(line[11:]) + if decrypt_info['METHOD'] == 'AES-128': + if 'IV' in decrypt_info: + decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:]) + if not re.match(r'^https?://', decrypt_info['URI']): + decrypt_info['URI'] = compat_urlparse.urljoin( + man_url, decrypt_info['URI']) + decrypt_info['KEY'] = self.ydl.urlopen(decrypt_info['URI']).read() + elif line.startswith('#EXT-X-MEDIA-SEQUENCE'): + media_sequence = int(line[22:]) self._finish_frag_download(ctx) diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py index 8753ee2cf..5ae16fa16 100644 --- a/youtube_dl/extractor/adobetv.py +++ b/youtube_dl/extractor/adobetv.py @@ -156,7 +156,10 @@ class AdobeTVVideoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json(url + '?format=json', video_id) + webpage = self._download_webpage(url, video_id) + + video_data = self._parse_json(self._search_regex( + r'var\s+bridge\s*=\s*([^;]+);', webpage, 'bridged data'), video_id) formats = [{ 'format_id': '%s-%s' % (determine_ext(source['src']), source.get('height')), diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 1bbfe2641..8f53050c9 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -2,23 +2,137 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor +from .theplatform import ThePlatformIE from ..utils import ( smuggle_url, update_url_query, unescapeHTML, + extract_attributes, + get_element_by_attribute, +) +from ..compat import ( + compat_urlparse, ) -class AENetworksIE(InfoExtractor): +class AENetworksBaseIE(ThePlatformIE): + _THEPLATFORM_KEY = 'crazyjava' + _THEPLATFORM_SECRET = 's3cr3t' + + +class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network' - _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?P<type>[^/]+)/(?:[^/]+/)+(?P<id>[^/]+?)(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?:shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})|movies/(?P<movie_display_id>[^/]+)/full-movie)' + _TESTS = [{ + 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', + 'md5': '8ff93eb073449f151d6b90c0ae1ef0c7', + 'info_dict': { + 'id': '22253814', + 'ext': 'mp4', + 'title': 'Winter Is Coming', + 'description': 'md5:641f424b7a19d8e24f26dea22cf59d74', + 'timestamp': 1338306241, + 'upload_date': '20120529', + 'uploader': 'AENE-NEW', + }, + 'add_ie': ['ThePlatform'], + }, { + 'url': 'http://www.history.com/shows/ancient-aliens/season-1', + 'info_dict': { + 'id': '71889446852', + }, + 'playlist_mincount': 5, + }, { + 'url': 'http://www.mylifetime.com/shows/atlanta-plastic', + 'info_dict': { + 'id': 'SERIES4317', + 'title': 'Atlanta Plastic', + }, + 'playlist_mincount': 2, + }, { + 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', + 'only_matching': True + }, { + 'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8', + 'only_matching': True + }, { + 'url': 'http://www.mylifetime.com/shows/project-runway-junior/season-1/episode-6', + 'only_matching': True + }, { + 'url': 
'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie', + 'only_matching': True + }] + _DOMAIN_TO_REQUESTOR_ID = { + 'history.com': 'HISTORY', + 'aetv.com': 'AETV', + 'mylifetime.com': 'LIFETIME', + 'fyi.tv': 'FYI', + } + def _real_extract(self, url): + domain, show_path, movie_display_id = re.match(self._VALID_URL, url).groups() + display_id = show_path or movie_display_id + webpage = self._download_webpage(url, display_id) + if show_path: + url_parts = show_path.split('/') + url_parts_len = len(url_parts) + if url_parts_len == 1: + entries = [] + for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage): + entries.append(self.url_result( + compat_urlparse.urljoin(url, season_url_path), 'AENetworks')) + return self.playlist_result( + entries, self._html_search_meta('aetn:SeriesId', webpage), + self._html_search_meta('aetn:SeriesTitle', webpage)) + elif url_parts_len == 2: + entries = [] + for episode_item in re.findall(r'(?s)<div[^>]+class="[^"]*episode-item[^"]*"[^>]*>', webpage): + episode_attributes = extract_attributes(episode_item) + episode_url = compat_urlparse.urljoin( + url, episode_attributes['data-canonical']) + entries.append(self.url_result( + episode_url, 'AENetworks', + episode_attributes['data-videoid'])) + return self.playlist_result( + entries, self._html_search_meta('aetn:SeasonId', webpage)) + + query = { + 'mbr': 'true', + 'assetTypes': 'medium_video_s3' + } + video_id = self._html_search_meta('aetn:VideoID', webpage) + media_url = self._search_regex( + r"media_url\s*=\s*'([^']+)'", webpage, 'video url') + theplatform_metadata = self._download_theplatform_metadata(self._search_regex( + r'https?://link.theplatform.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) + info = self._parse_theplatform_metadata(theplatform_metadata) + if theplatform_metadata.get('AETN$isBehindWall'): + requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain] + resource = '<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/"><channel><title>%s</title><item><title>%s</title><guid>%s</guid><media:rating scheme="urn:v-chip">%s</media:rating></item></channel></rss>' % (requestor_id, theplatform_metadata['title'], theplatform_metadata['AETN$PPL_pplProgramId'], theplatform_metadata['ratings'][0]['rating']) + query['auth'] = self._extract_mvpd_auth( + url, video_id, requestor_id, resource) + info.update(self._search_json_ld(webpage, video_id, fatal=False)) + media_url = update_url_query(media_url, query) + media_url = self._sign_url(media_url, self._THEPLATFORM_KEY, self._THEPLATFORM_SECRET) + formats, subtitles = self._extract_theplatform_smil(media_url, video_id) + self._sort_formats(formats) + info.update({ + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + }) + return info + + +class HistoryTopicIE(AENetworksBaseIE): + IE_NAME = 'history:topic' + IE_DESC = 'History.com Topic' + _VALID_URL = r'https?://(?:www\.)?history\.com/topics/(?:[^/]+/)?(?P<topic_id>[^/]+)(?:/[^/]+(?:/(?P<video_display_id>[^/?#]+))?)?'
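To see how this `_VALID_URL` drives the two branches of `_real_extract` below, note what the named groups capture. This snippet is only an editor's illustration; the sample URLs are taken from the tests that follow:

```python
import re

# Same pattern as HistoryTopicIE._VALID_URL above
VALID_URL = (r'https?://(?:www\.)?history\.com/topics/(?:[^/]+/)?'
             r'(?P<topic_id>[^/]+)(?:/[^/]+(?:/(?P<video_display_id>[^/?#]+))?)?')

# Individual video page: video_display_id is captured, so the extractor
# resolves a single ThePlatform release URL
m = re.match(VALID_URL, 'http://www.history.com/topics/valentines-day'
             '/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day')
assert m.group('video_display_id') == 'bet-you-didnt-know-valentines-day'

# Topic video listing: only topic_id matches, so the extractor falls
# through to the playlist branch
m = re.match(VALID_URL,
             'http://www.history.com/topics/world-war-i/world-war-i-history/videos')
assert m.group('topic_id') == 'world-war-i-history'
assert m.group('video_display_id') is None
```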
_TESTS = [{ 'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false', 'info_dict': { - 'id': 'g12m5Gyt3fdR', + 'id': '40700995724', 'ext': 'mp4', 'title': "Bet You Didn't Know: Valentine's Day", 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', @@ -31,57 +145,61 @@ class AENetworksIE(InfoExtractor): 'skip_download': True, }, 'add_ie': ['ThePlatform'], - 'expected_warnings': ['JSON-LD'], }, { - 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', - 'md5': '8ff93eb073449f151d6b90c0ae1ef0c7', - 'info_dict': { - 'id': 'eg47EERs_JsZ', - 'ext': 'mp4', - 'title': 'Winter Is Coming', - 'description': 'md5:641f424b7a19d8e24f26dea22cf59d74', - 'timestamp': 1338306241, - 'upload_date': '20120529', - 'uploader': 'AENE-NEW', + 'url': 'http://www.history.com/topics/world-war-i/world-war-i-history/videos', + 'info_dict': + { + 'id': 'world-war-i-history', + 'title': 'World War I History', }, - 'add_ie': ['ThePlatform'], + 'playlist_mincount': 24, }, { - 'url': 'http://www.aetv.com/shows/duck-dynasty/video/inlawful-entry', - 'only_matching': True + 'url': 'http://www.history.com/topics/world-war-i-history/videos', + 'only_matching': True, }, { - 'url': 'http://www.fyi.tv/shows/tiny-house-nation/videos/207-sq-ft-minnesota-prairie-cottage', - 'only_matching': True + 'url': 'http://www.history.com/topics/world-war-i/world-war-i-history', + 'only_matching': True, }, { - 'url': 'http://www.mylifetime.com/shows/project-runway-junior/video/season-1/episode-6/superstar-clients', - 'only_matching': True + 'url': 'http://www.history.com/topics/world-war-i/world-war-i-history/speeches', + 'only_matching': True, }] - def _real_extract(self, url): - page_type, video_id = re.match(self._VALID_URL, url).groups() - - webpage = self._download_webpage(url, video_id) - - video_url_re = [ - r'data-href="[^"]*/%s"[^>]+data-release-url="([^"]+)"' % video_id, - r"media_url\s*=\s*'([^']+)'" - ] - video_url = unescapeHTML(self._search_regex(video_url_re, webpage, 'video url')) - query = {'mbr': 'true'} - if page_type == 'shows': - query['assetTypes'] = 'medium_video_s3' - if 'switch=hds' in video_url: - query['switch'] = 'hls' - - info = self._search_json_ld(webpage, video_id, fatal=False) - info.update({ + def theplatform_url_result(self, theplatform_url, video_id, query): + return { '_type': 'url_transparent', + 'id': video_id, 'url': smuggle_url( - update_url_query(video_url, query), + update_url_query(theplatform_url, query), { 'sig': { - 'key': 'crazyjava', - 'secret': 's3cr3t'}, + 'key': self._THEPLATFORM_KEY, + 'secret': self._THEPLATFORM_SECRET, + }, 'force_smil_url': True }), - }) - return info + 'ie_key': 'ThePlatform', + } + + def _real_extract(self, url): + topic_id, video_display_id = re.match(self._VALID_URL, url).groups() + if video_display_id: + webpage = self._download_webpage(url, video_display_id) + release_url, video_id = re.search(r"_videoPlayer.play\('([^']+)'\s*,\s*'[^']+'\s*,\s*'(\d+)'\)", webpage).groups() + release_url = unescapeHTML(release_url) + + return self.theplatform_url_result( + release_url, video_id, { + 'mbr': 'true', + 'switch': 'hls' + }) + else: + webpage = self._download_webpage(url, topic_id) + entries = [] + for episode_item in re.findall(r'<a.+?data-release-url="[^"]+"[^>]*>', webpage): + video_attributes = extract_attributes(episode_item) + entries.append(self.theplatform_url_result( + video_attributes['data-release-url'], video_attributes['data-id'], { + 'mbr': 'true', + 'switch': 'hls' +
})) + return self.playlist_result(entries, topic_id, get_element_by_attribute('class', 'show-title', webpage)) diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py index d548592fe..5766b4fe8 100644 --- a/youtube_dl/extractor/aftonbladet.py +++ b/youtube_dl/extractor/aftonbladet.py @@ -24,10 +24,10 @@ class AftonbladetIE(InfoExtractor): webpage = self._download_webpage(url, video_id) # find internal video meta data - meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json' + meta_url = 'http://aftonbladet-play-metadata.cdn.drvideo.aptoma.no/video/%s.json' player_config = self._parse_json(self._html_search_regex( r'data-player-config="([^"]+)"', webpage, 'player config'), video_id) - internal_meta_id = player_config['videoId'] + internal_meta_id = player_config['aptomaVideoId'] internal_meta_url = meta_url % internal_meta_id internal_meta_json = self._download_json( internal_meta_url, video_id, 'Downloading video meta data') diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index 8545681be..e8e40126b 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -5,6 +5,8 @@ from .common import InfoExtractor from ..utils import ( int_or_none, parse_iso8601, + mimetype2ext, + determine_ext, ) @@ -50,21 +52,25 @@ class AMPIE(InfoExtractor): if isinstance(media_content, dict): media_content = [media_content] for media_data in media_content: - media = media_data['@attributes'] - media_type = media['type'] - if media_type in ('video/f4m', 'application/f4m+xml'): + media = media_data.get('@attributes', {}) + media_url = media.get('url') + if not media_url: + continue + ext = mimetype2ext(media.get('type')) or determine_ext(media_url) + if ext == 'f4m': formats.extend(self._extract_f4m_formats( - media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', + media_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id, f4m_id='hds', fatal=False)) - elif media_type == 'application/x-mpegURL': + elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - media['url'], video_id, 'mp4', m3u8_id='hls', fatal=False)) + media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) else: formats.append({ 'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'), 'url': media['url'], 'tbr': int_or_none(media.get('bitrate')), 'filesize': int_or_none(media.get('fileSize')), + 'ext': ext, }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 9b01e38f5..9e28f2579 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -22,6 +22,7 @@ class AnimeOnDemandIE(InfoExtractor): _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply' _NETRC_MACHINE = 'animeondemand' _TESTS = [{ + # jap, OmU 'url': 'https://www.anime-on-demand.de/anime/161', 'info_dict': { 'id': '161', @@ -30,17 +31,21 @@ class AnimeOnDemandIE(InfoExtractor): }, 'playlist_mincount': 4, }, { - # Film wording is used instead of Episode + # Film wording is used instead of Episode, ger/jap, Dub/OmU 'url': 'https://www.anime-on-demand.de/anime/39', 'only_matching': True, }, { - # Episodes without titles + # Episodes without titles, jap, OmU 'url': 'https://www.anime-on-demand.de/anime/162', 'only_matching': True, }, { # ger/jap, Dub/OmU, account required 'url': 'https://www.anime-on-demand.de/anime/169', 'only_matching': True, + }, { + # Full length film, non-series, ger/jap, Dub/OmU, account required + 'url': 
'https://www.anime-on-demand.de/anime/185', + 'only_matching': True, + }] def _login(self): @@ -110,35 +115,12 @@ class AnimeOnDemandIE(InfoExtractor): entries = [] - for num, episode_html in enumerate(re.findall( - r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage), 1): - episodebox_title = self._search_regex( - (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1', - r'class="episodebox-title"[^>]+>(?P<title>.+?)<'), - episode_html, 'episodebox title', default=None, group='title') - if not episodebox_title: - continue - - episode_number = int(self._search_regex( - r'(?:Episode|Film)\s*(\d+)', - episodebox_title, 'episode number', default=num)) - episode_title = self._search_regex( - r'(?:Episode|Film)\s*\d+\s*-\s*(.+)', - episodebox_title, 'episode title', default=None) - - video_id = 'episode-%d' % episode_number - - common_info = { - 'id': video_id, - 'series': anime_title, - 'episode': episode_title, - 'episode_number': episode_number, - } - + def extract_info(html, video_id, num=None): + title, description = [None] * 2 formats = [] for input_ in re.findall( - r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', episode_html): + r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', html): attributes = extract_attributes(input_) playlist_urls = [] for playlist_key in ('data-playlist', 'data-otherplaylist'): @@ -161,7 +143,7 @@ class AnimeOnDemandIE(InfoExtractor): format_id_list.append(lang) if kind: format_id_list.append(kind) - if not format_id_list: + if not format_id_list and num is not None: format_id_list.append(compat_str(num)) format_id = '-'.join(format_id_list) format_note = ', '.join(filter(None, (kind, lang_note))) @@ -215,28 +197,74 @@ class AnimeOnDemandIE(InfoExtractor): }) formats.extend(file_formats) - if formats: - self._sort_formats(formats) + return { + 'title': title, + 'description': description, + 'formats': formats, + } + + def extract_entries(html, video_id, common_info, num=None): + info = extract_info(html, video_id, num) + + if info['formats']: + self._sort_formats(info['formats']) f = common_info.copy() - f.update({ - 'title': title, - 'description': description, - 'formats': formats, - }) + f.update(info) entries.append(f) - # Extract teaser only when full episode is not available - if not formats: + # Extract teaser/trailer only when full episode is not available + if not info['formats']: m = re.search( - r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>Teaser<', - episode_html) + r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>(?P<kind>Teaser|Trailer)<', + html) if m: f = common_info.copy() f.update({ - 'id': '%s-teaser' % f['id'], + 'id': '%s-%s' % (f['id'], m.group('kind').lower()), 'title': m.group('title'), 'url': compat_urlparse.urljoin(url, m.group('href')), }) entries.append(f) + def extract_episodes(html): + for num, episode_html in enumerate(re.findall( + r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', html), 1): + episodebox_title = self._search_regex( + (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1', + r'class="episodebox-title"[^>]+>(?P<title>.+?)<'), + episode_html, 'episodebox title', default=None, group='title') + if not episodebox_title: + continue + + episode_number = int(self._search_regex( + r'(?:Episode|Film)\s*(\d+)', + episodebox_title, 'episode number', default=num)) + episode_title = self._search_regex( + r'(?:Episode|Film)\s*\d+\s*-\s*(.+)', + episodebox_title, 'episode title', default=None) + + video_id = 'episode-%d' %
episode_number + + common_info = { + 'id': video_id, + 'series': anime_title, + 'episode': episode_title, + 'episode_number': episode_number, + } + + extract_entries(episode_html, video_id, common_info) + + def extract_film(html, video_id): + common_info = { + 'id': anime_id, + 'title': anime_title, + 'description': anime_description, + } + extract_entries(html, video_id, common_info) + + extract_episodes(webpage) + + if not entries: + extract_film(webpage, anime_id) + return self.playlist_result(entries, anime_id, anime_title, anime_description) diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index be40f85b4..a6801f3d4 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -7,6 +7,8 @@ from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( int_or_none, + parse_duration, + unified_strdate, ) @@ -16,7 +18,8 @@ class AppleTrailersIE(InfoExtractor): _TESTS = [{ 'url': 'http://trailers.apple.com/trailers/wb/manofsteel/', 'info_dict': { - 'id': 'manofsteel', + 'id': '5111', + 'title': 'Man of Steel', }, 'playlist': [ { @@ -70,6 +73,15 @@ class AppleTrailersIE(InfoExtractor): 'id': 'blackthorn', }, 'playlist_mincount': 2, + 'expected_warnings': ['Unable to download JSON metadata'], + }, { + # json data only available from http://trailers.apple.com/trailers/feeds/data/15881.json + 'url': 'http://trailers.apple.com/trailers/fox/kungfupanda3/', + 'info_dict': { + 'id': '15881', + 'title': 'Kung Fu Panda 3', + }, + 'playlist_mincount': 4, }, { 'url': 'http://trailers.apple.com/ca/metropole/autrui/', 'only_matching': True, @@ -85,6 +97,45 @@ class AppleTrailersIE(InfoExtractor): movie = mobj.group('movie') uploader_id = mobj.group('company') + webpage = self._download_webpage(url, movie) + film_id = self._search_regex(r"FilmId\s*=\s*'(\d+)'", webpage, 'film id') + film_data = self._download_json( + 'http://trailers.apple.com/trailers/feeds/data/%s.json' % film_id, + film_id, fatal=False) + + if film_data: + entries = [] + for clip in film_data.get('clips', []): + clip_title = clip['title'] + + formats = [] + for version, version_data in clip.get('versions', {}).items(): + for size, size_data in version_data.get('sizes', {}).items(): + src = size_data.get('src') + if not src: + continue + formats.append({ + 'format_id': '%s-%s' % (version, size), + 'url': re.sub(r'_(\d+p.mov)', r'_h\1', src), + 'width': int_or_none(size_data.get('width')), + 'height': int_or_none(size_data.get('height')), + 'language': version[:2], + }) + self._sort_formats(formats) + + entries.append({ + 'id': movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', clip_title).lower(), + 'formats': formats, + 'title': clip_title, + 'thumbnail': clip.get('screen') or clip.get('thumb'), + 'duration': parse_duration(clip.get('runtime') or clip.get('faded')), + 'upload_date': unified_strdate(clip.get('posted')), + 'uploader_id': uploader_id, + }) + + page_data = film_data.get('page', {}) + return self.playlist_result(entries, film_id, page_data.get('movie_title')) + playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc') def fix_html(s): diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 26446c2fe..07e67dd33 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -8,19 +8,19 @@ from .generic import GenericIE from ..utils import ( determine_ext, ExtractorError, - get_element_by_attribute, qualities, int_or_none, parse_duration, unified_strdate, xpath_text, + 
update_url_query, ) from ..compat import compat_etree_fromstring class ARDMediathekIE(InfoExtractor): IE_NAME = 'ARD:mediathek' - _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' + _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' _TESTS = [{ 'url': 'http://www.ardmediathek.de/tv/Dokumentation-und-Reportage/Ich-liebe-das-Leben-trotzdem/rbb-Fernsehen/Video?documentId=29582122&bcastId=3822114', @@ -35,6 +35,7 @@ class ARDMediathekIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916', 'md5': 'f4d98b10759ac06c0072bbcd1f0b9e3e', @@ -45,6 +46,7 @@ class ARDMediathekIE(InfoExtractor): 'description': 'md5:196392e79876d0ac94c94e8cdb2875f1', 'duration': 5252, }, + 'skip': 'HTTP Error 404: Not Found', }, { # audio 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', @@ -56,9 +58,22 @@ class ARDMediathekIE(InfoExtractor): 'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef', 'duration': 3240, }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', 'only_matching': True, + }, { + # audio + 'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158', + 'md5': '4e8f00631aac0395fee17368ac0e9867', + 'info_dict': { + 'id': '30796318', + 'ext': 'mp3', + 'title': 'Vor dem Fest', + 'description': 'md5:c0c1c8048514deaed2a73b3a60eecacb', + 'duration': 3287, + }, + 'skip': 'Video is no longer available', }] def _extract_media_info(self, media_info_url, webpage, video_id): @@ -114,11 +129,14 @@ class ARDMediathekIE(InfoExtractor): continue if ext == 'f4m': formats.extend(self._extract_f4m_formats( - stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', - video_id, preference=-1, f4m_id='hds', fatal=False)) + update_url_query(stream_url, { + 'hdcore': '3.1.1', + 'plugin': 'aasp-3.1.1.69.124' + }), + video_id, f4m_id='hds', fatal=False)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - stream_url, video_id, 'mp4', preference=1, m3u8_id='hls', fatal=False)) + stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) else: if server and server.startswith('rtmp'): f = { @@ -232,7 +250,8 @@ class ARDIE(InfoExtractor): 'title': 'Die Story im Ersten: Mission unter falscher Flagge', 'upload_date': '20140804', 'thumbnail': 're:^https?://.*\.jpg$', - } + }, + 'skip': 'HTTP Error 404: Not Found', } def _real_extract(self, url): @@ -274,41 +293,3 @@ class ARDIE(InfoExtractor): 'upload_date': upload_date, 'thumbnail': thumbnail, } - - -class SportschauIE(ARDMediathekIE): - IE_NAME = 'Sportschau' - _VALID_URL = r'(?P<baseurl>https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P<id>[^/#?]+))\.html' - _TESTS = [{ - 'url': 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html', - 'info_dict': { - 'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100', - 'ext': 'mp4', - 'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'Der 
ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - base_url = mobj.group('baseurl') - - webpage = self._download_webpage(url, video_id) - title = get_element_by_attribute('class', 'headline', webpage) - description = self._html_search_meta('description', webpage, 'description') - - info = self._extract_media_info( - base_url + '-mc_defaultQuality-h.json', webpage, video_id) - - info.update({ - 'title': title, - 'description': description, - }) - - return info diff --git a/youtube_dl/extractor/arkena.py b/youtube_dl/extractor/arkena.py new file mode 100644 index 000000000..d45cae301 --- /dev/null +++ b/youtube_dl/extractor/arkena.py @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + mimetype2ext, + parse_iso8601, + strip_jsonp, +) + + +class ArkenaIE(InfoExtractor): + _VALID_URL = r'https?://play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P<id>[^/]+)/[^/]+/(?P<account_id>\d+)' + _TESTS = [{ + 'url': 'https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411', + 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', + 'info_dict': { + 'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', + 'ext': 'mp4', + 'title': 'Big Buck Bunny', + 'description': 'Royalty free test video', + 'timestamp': 1432816365, + 'upload_date': '20150528', + 'is_live': False, + }, + }, { + 'url': 'https://play.arkena.com/config/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411/?callbackMethod=jQuery1111023664739129262213_1469227693893', + 'only_matching': True, + }, { + 'url': 'http://play.arkena.com/config/avp/v1/player/media/327336/darkmatter/131064/?callbackMethod=jQuery1111002221189684892677_1469227595972', + 'only_matching': True, + }, { + 'url': 'http://play.arkena.com/embed/avp/v1/player/media/327336/darkmatter/131064/', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + # See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + account_id = mobj.group('account_id') + + playlist = self._download_json( + 'https://play.arkena.com/config/avp/v2/player/media/%s/0/%s/?callbackMethod=_' + % (video_id, account_id), + video_id, transform_source=strip_jsonp)['Playlist'][0] + + media_info = playlist['MediaInfo'] + title = media_info['Title'] + media_files = playlist['MediaFiles'] + + is_live = False + formats = [] + for kind_case, kind_formats in media_files.items(): + kind = kind_case.lower() + for f in kind_formats: + f_url = f.get('Url') + if not f_url: + continue + is_live = f.get('Live') == 'true' + exts = (mimetype2ext(f.get('Type')), determine_ext(f_url, None)) + if kind == 'm3u8' or 'm3u8' in exts: + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', + entry_protocol='m3u8' if is_live else 'm3u8_native', + m3u8_id=kind, fatal=False, live=is_live)) + elif kind == 'flash' or 'f4m' in exts: + formats.extend(self._extract_f4m_formats( + f_url, 
video_id, f4m_id=kind, fatal=False)) + elif kind == 'dash' or 'mpd' in exts: + formats.extend(self._extract_mpd_formats( + f_url, video_id, mpd_id=kind, fatal=False)) + elif kind == 'silverlight': + # TODO: process when ism is supported (see + # https://github.com/rg3/youtube-dl/issues/8118) + continue + else: + tbr = float_or_none(f.get('Bitrate'), 1000) + formats.append({ + 'url': f_url, + 'format_id': '%s-%d' % (kind, tbr) if tbr else kind, + 'tbr': tbr, + }) + self._sort_formats(formats) + + description = media_info.get('Description') + video_id = media_info.get('VideoId') or video_id + timestamp = parse_iso8601(media_info.get('PublishDate')) + thumbnails = [{ + 'url': thumbnail['Url'], + 'width': int_or_none(thumbnail.get('Size')), + } for thumbnail in (media_info.get('Poster') or []) if thumbnail.get('Url')] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'is_live': is_live, + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index f40532929..e0c5c1804 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -180,11 +180,14 @@ class ArteTVBaseIE(InfoExtractor): class ArteTVPlus7IE(ArteTVBaseIE): IE_NAME = 'arte.tv:+7' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)' + _VALID_URL = r'https?://(?:(?:www|sites)\.)?arte\.tv/[^/]+/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D', 'only_matching': True, + }, { + 'url': 'http://sites.arte.tv/karambolage/de/video/karambolage-22', + 'only_matching': True, }] @classmethod @@ -240,10 +243,10 @@ class ArteTVPlus7IE(ArteTVBaseIE): return self._extract_from_json_url(json_url, video_id, lang, title=title) # Different kind of embed URL (e.g. # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) - embed_url = self._search_regex( - r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', - webpage, 'embed url', group='url') - return self.url_result(embed_url) + entries = [ + self.url_result(url) + for _, url in re.findall(r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', webpage)] + return self.playlist_result(entries) # It also uses the arte_vp_url url from the webpage to extract the information @@ -252,22 +255,17 @@ class ArteTVCreativeIE(ArteTVPlus7IE): _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ - 'url': 'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design', + 'url': 'http://creative.arte.tv/fr/episode/osmosis-episode-1', 'info_dict': { - 'id': '72176', + 'id': '057405-001-A', 'ext': 'mp4', - 'title': 'Folge 2 - Corporate Design', - 'upload_date': '20131004', + 'title': 'OSMOSIS - N\'AYEZ PLUS PEUR D\'AIMER (1)', + 'upload_date': '20150716', }, }, { 'url': 'http://creative.arte.tv/fr/Monty-Python-Reunion', - 'info_dict': { - 'id': '160676', - 'ext': 'mp4', - 'title': 'Monty Python live (mostly)', - 'description': 'Événement ! 
Quarante-cinq ans après leurs premiers succès, les légendaires Monty Python remontent sur scène.\n', - 'upload_date': '20140805', - } + 'playlist_count': 11, + 'add_ie': ['Youtube'], }, { 'url': 'http://creative.arte.tv/de/episode/agentur-amateur-4-der-erste-kunde', 'only_matching': True, @@ -349,14 +347,13 @@ class ArteTVCinemaIE(ArteTVPlus7IE): _VALID_URL = r'https?://cinema\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>.+)' _TESTS = [{ - 'url': 'http://cinema.arte.tv/de/node/38291', - 'md5': '6b275511a5107c60bacbeeda368c3aa1', + 'url': 'http://cinema.arte.tv/fr/article/les-ailes-du-desir-de-julia-reck', + 'md5': 'a5b9dd5575a11d93daf0e3f404f45438', 'info_dict': { - 'id': '055876-000_PWA12025-D', + 'id': '062494-000-A', 'ext': 'mp4', - 'title': 'Tod auf dem Nil', - 'upload_date': '20160122', - 'description': 'md5:7f749bbb77d800ef2be11d54529b96bc', + 'title': 'Film lauréat du concours web - "Les ailes du désir" de Julia Reck', + 'upload_date': '20150807', }, }] @@ -422,6 +419,7 @@ class ArteTVPlaylistIE(ArteTVBaseIE): 'info_dict': { 'id': 'PL-013263', 'title': 'Areva & Uramin', + 'description': 'md5:a1dc0312ce357c262259139cfd48c9bf', }, 'playlist_mincount': 6, }, { diff --git a/youtube_dl/extractor/azubu.py b/youtube_dl/extractor/azubu.py index efa624de1..a813eb429 100644 --- a/youtube_dl/extractor/azubu.py +++ b/youtube_dl/extractor/azubu.py @@ -46,6 +46,7 @@ class AzubuIE(InfoExtractor): 'uploader_id': 272749, 'view_count': int, }, + 'skip': 'Channel offline', }, ] @@ -56,22 +57,26 @@ class AzubuIE(InfoExtractor): 'http://www.azubu.tv/api/video/%s' % video_id, video_id)['data'] title = data['title'].strip() - description = data['description'] - thumbnail = data['thumbnail'] - view_count = data['view_count'] - uploader = data['user']['username'] - uploader_id = data['user']['id'] + description = data.get('description') + thumbnail = data.get('thumbnail') + view_count = data.get('view_count') + user = data.get('user', {}) + uploader = user.get('username') + uploader_id = user.get('id') stream_params = json.loads(data['stream_params']) - timestamp = float_or_none(stream_params['creationDate'], 1000) - duration = float_or_none(stream_params['length'], 1000) + timestamp = float_or_none(stream_params.get('creationDate'), 1000) + duration = float_or_none(stream_params.get('length'), 1000) renditions = stream_params.get('renditions') or [] video = stream_params.get('FLVFullLength') or stream_params.get('videoFullLength') if video: renditions.append(video) + if not renditions and not user.get('channel', {}).get('is_live', True): + raise ExtractorError('%s said: channel is offline.' 
% self.IE_NAME, expected=True) + formats = [{ 'url': fmt['url'], 'width': fmt['frameWidth'], diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 74c4510f9..9cb7630a1 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -31,7 +31,7 @@ class BBCCoUkIE(InfoExtractor): music/clips[/#]| radio/player/ ) - (?P<id>%s) + (?P<id>%s)(?!/(?:episodes|broadcasts|clips)) ''' % _ID_REGEX _MEDIASELECTOR_URLS = [ @@ -192,6 +192,7 @@ class BBCCoUkIE(InfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'Now it\'s really geo-restricted', }, { # compact player (https://github.com/rg3/youtube-dl/issues/8147) 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player', @@ -588,7 +589,8 @@ class BBCIE(BBCCoUkIE): 'info_dict': { 'id': '150615_telabyad_kentin_cogu', 'ext': 'mp4', - 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde", + 'title': "Tel Abyad'da IŞİD bayrağı indirildi YPG bayrağı çekildi", + 'description': 'md5:33a4805a855c9baf7115fcbde57e7025', 'timestamp': 1434397334, 'upload_date': '20150615', }, @@ -602,6 +604,7 @@ class BBCIE(BBCCoUkIE): 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw', 'ext': 'mp4', 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', + 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8', 'timestamp': 1434713142, 'upload_date': '20150619', }, @@ -698,7 +701,9 @@ class BBCIE(BBCCoUkIE): @classmethod def suitable(cls, url): - return False if BBCCoUkIE.suitable(url) or BBCCoUkArticleIE.suitable(url) else super(BBCIE, cls).suitable(url) + EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE) + return (False if any(ie.suitable(url) for ie in EXCLUDE_IE) + else super(BBCIE, cls).suitable(url)) def _extract_from_media_meta(self, media_meta, video_id): # Direct links to media in media metadata (e.g. @@ -815,8 +820,20 @@ class BBCIE(BBCCoUkIE): # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani) playlist = data_playable.get('otherSettings', {}).get('playlist', {}) if playlist: - entries.append(self._extract_from_playlist_sxml( - playlist.get('progressiveDownloadUrl'), playlist_id, timestamp)) + for key in ('progressiveDownload', 'streaming'): + playlist_url = playlist.get('%sUrl' % key) + if not playlist_url: + continue + try: + entries.append(self._extract_from_playlist_sxml( + playlist_url, playlist_id, timestamp)) + except Exception as e: + # One playlist URL may fail with 500 while, at the same time, + # another may work fine (e.g.
+ # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500: + continue + raise if entries: return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) @@ -975,3 +992,82 @@ class BBCCoUkArticleIE(InfoExtractor): r'<div[^>]+typeof="Clip"[^>]+resource="([^"]+)"', webpage)] return self.playlist_result(entries, playlist_id, title, description) + + +class BBCCoUkPlaylistBaseIE(InfoExtractor): + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result(self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key()) + for video_id in re.findall( + self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage)] + + title, description = self._extract_title_and_description(webpage) + + return self.playlist_result(entries, playlist_id, title, description) + + +class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:iplayer:playlist' + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX + _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s' + _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)' + _TESTS = [{ + 'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v', + 'info_dict': { + 'id': 'b05rcz9v', + 'title': 'The Disappearance', + 'description': 'French thriller serial about a missing teenager.', + }, + 'playlist_mincount': 6, + 'skip': 'This programme is not currently available on BBC iPlayer', + }, { + # Available for over a year unlike 30 days for most other programmes + 'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32', + 'info_dict': { + 'id': 'p02tcc32', + 'title': 'Bohemian Icons', + 'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7', + }, + 'playlist_mincount': 10, + }] + + def _extract_title_and_description(self, webpage): + title = self._search_regex(r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False) + description = self._search_regex( + r'<p[^>]+class=(["\'])subtitle\1[^>]*>(?P<value>[^<]+)</p>', + webpage, 'description', fatal=False, group='value') + return title, description + + +class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:playlist' + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P<id>%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX + _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s' + _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)' + _TESTS = [{ + 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips', + 'info_dict': { + 'id': 'b05rcz9v', + 'title': 'The Disappearance - Clips - BBC Four', + 'description': 'French thriller serial about a missing teenager.', + }, + 'playlist_mincount': 7, + }, { + 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player', + 'only_matching': True, + }] + + def _extract_title_and_description(self, webpage): + title = self._og_search_title(webpage, fatal=False) + description = self._og_search_description(webpage) + return title, description diff --git a/youtube_dl/extractor/bet.py b/youtube_dl/extractor/bet.py index 986245bf0..bd3ee2e2e 100644 --- a/youtube_dl/extractor/bet.py +++ b/youtube_dl/extractor/bet.py @@ -1,31 +1,27 @@ from __future__ import unicode_literals -from .common import InfoExtractor -from ..compat import 
compat_urllib_parse_unquote -from ..utils import ( - xpath_text, - xpath_with_ns, - int_or_none, - parse_iso8601, -) +from .mtv import MTVServicesInfoExtractor +from ..utils import unified_strdate +from ..compat import compat_urllib_parse_urlencode -class BetIE(InfoExtractor): +class BetIE(MTVServicesInfoExtractor): _VALID_URL = r'https?://(?:www\.)?bet\.com/(?:[^/]+/)+(?P<id>.+?)\.html' _TESTS = [ { 'url': 'http://www.bet.com/news/politics/2014/12/08/in-bet-exclusive-obama-talks-race-and-racism.html', 'info_dict': { - 'id': 'news/national/2014/a-conversation-with-president-obama', + 'id': '07e96bd3-8850-3051-b856-271b457f0ab8', 'display_id': 'in-bet-exclusive-obama-talks-race-and-racism', 'ext': 'flv', 'title': 'A Conversation With President Obama', - 'description': 'md5:699d0652a350cf3e491cd15cc745b5da', + 'description': 'President Obama urges persistence in confronting racism and bias.', 'duration': 1534, - 'timestamp': 1418075340, 'upload_date': '20141208', - 'uploader': 'admin', 'thumbnail': 're:(?i)^https?://.*\.jpg$', + 'subtitles': { + 'en': 'mincount:2', + } }, 'params': { # rtmp download @@ -35,16 +31,17 @@ class BetIE(InfoExtractor): { 'url': 'http://www.bet.com/video/news/national/2014/justice-for-ferguson-a-community-reacts.html', 'info_dict': { - 'id': 'news/national/2014/justice-for-ferguson-a-community-reacts', + 'id': '9f516bf1-7543-39c4-8076-dd441b459ba9', 'display_id': 'justice-for-ferguson-a-community-reacts', 'ext': 'flv', 'title': 'Justice for Ferguson: A Community Reacts', 'description': 'A BET News special.', 'duration': 1696, - 'timestamp': 1416942360, 'upload_date': '20141125', - 'uploader': 'admin', 'thumbnail': 're:(?i)^https?://.*\.jpg$', + 'subtitles': { + 'en': 'mincount:2', + } }, 'params': { # rtmp download @@ -53,57 +50,32 @@ class BetIE(InfoExtractor): } ] + _FEED_URL = "http://feeds.mtvnservices.com/od/feed/bet-mrss-player" + + def _get_feed_query(self, uri): + return compat_urllib_parse_urlencode({ + 'uuid': uri, + }) + + def _extract_mgid(self, webpage): + return self._search_regex(r'data-uri="([^"]+)', webpage, 'mgid') + def _real_extract(self, url): display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + mgid = self._extract_mgid(webpage) + videos_info = self._get_videos_info(mgid) - media_url = compat_urllib_parse_unquote(self._search_regex( - [r'mediaURL\s*:\s*"([^"]+)"', r"var\s+mrssMediaUrl\s*=\s*'([^']+)'"], - webpage, 'media URL')) + info_dict = videos_info['entries'][0] - video_id = self._search_regex( - r'/video/(.*)/_jcr_content/', media_url, 'video id') + upload_date = unified_strdate(self._html_search_meta('date', webpage)) + description = self._html_search_meta('description', webpage) - mrss = self._download_xml(media_url, display_id) - - item = mrss.find('./channel/item') - - NS_MAP = { - 'dc': 'http://purl.org/dc/elements/1.1/', - 'media': 'http://search.yahoo.com/mrss/', - 'ka': 'http://kickapps.com/karss', - } - - title = xpath_text(item, './title', 'title') - description = xpath_text( - item, './description', 'description', fatal=False) - - timestamp = parse_iso8601(xpath_text( - item, xpath_with_ns('./dc:date', NS_MAP), - 'upload date', fatal=False)) - uploader = xpath_text( - item, xpath_with_ns('./dc:creator', NS_MAP), - 'uploader', fatal=False) - - media_content = item.find( - xpath_with_ns('./media:content', NS_MAP)) - duration = int_or_none(media_content.get('duration')) - smil_url = media_content.get('url') - - thumbnail = media_content.find( - xpath_with_ns('./media:thumbnail', 
NS_MAP)).get('url') - - formats = self._extract_smil_formats(smil_url, display_id) - self._sort_formats(formats) - - return { - 'id': video_id, + info_dict.update({ 'display_id': display_id, - 'title': title, 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'uploader': uploader, - 'duration': duration, - 'formats': formats, - } + 'upload_date': upload_date, + }) + + return info_dict diff --git a/youtube_dl/extractor/bigflix.py b/youtube_dl/extractor/bigflix.py index 33762ad93..b19f35b5d 100644 --- a/youtube_dl/extractor/bigflix.py +++ b/youtube_dl/extractor/bigflix.py @@ -12,7 +12,7 @@ class BigflixIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?bigflix\.com/.+/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://www.bigflix.com/Hindi-movies/Action-movies/Singham-Returns/16537', - 'md5': 'ec76aa9b1129e2e5b301a474e54fab74', + 'md5': 'dc1b4aebb46e3a7077ecc0d9f43f61e3', 'info_dict': { 'id': '16537', 'ext': 'mp4', @@ -26,7 +26,7 @@ class BigflixIE(InfoExtractor): 'id': '16070', 'ext': 'mp4', 'title': 'Madarasapatinam', - 'description': 'md5:63b9b8ed79189c6f0418c26d9a3452ca', + 'description': 'md5:9f0470b26a4ba8e824c823b5d95c2f6b', 'formats': 'mincount:2', }, 'params': { diff --git a/youtube_dl/extractor/biobiochiletv.py b/youtube_dl/extractor/biobiochiletv.py index 133228133..7608c0a08 100644 --- a/youtube_dl/extractor/biobiochiletv.py +++ b/youtube_dl/extractor/biobiochiletv.py @@ -2,11 +2,15 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import remove_end +from ..utils import ( + ExtractorError, + remove_end, +) +from .rudo import RudoIE class BioBioChileTVIE(InfoExtractor): - _VALID_URL = r'https?://tv\.biobiochile\.cl/notas/(?:[^/]+/)+(?P<id>[^/]+)\.shtml' + _VALID_URL = r'https?://(?:tv|www)\.biobiochile\.cl/(?:notas|noticias)/(?:[^/]+/)+(?P<id>[^/]+)\.shtml' _TESTS = [{ 'url': 'http://tv.biobiochile.cl/notas/2015/10/21/sobre-camaras-y-camarillas-parlamentarias.shtml', @@ -18,6 +22,7 @@ class BioBioChileTVIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'Fernando Atria', }, + 'skip': 'URL expired and redirected to http://www.biobiochile.cl/portada/bbtv/index.html', }, { # different uploader layout 'url': 'http://tv.biobiochile.cl/notas/2016/03/18/natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades.shtml', @@ -32,6 +37,16 @@ class BioBioChileTVIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'URL expired and redirected to http://www.biobiochile.cl/portada/bbtv/index.html', + }, { + 'url': 'http://www.biobiochile.cl/noticias/bbtv/comentarios-bio-bio/2016/07/08/edecanes-del-congreso-figuras-decorativas-que-le-cuestan-muy-caro-a-los-chilenos.shtml', + 'info_dict': { + 'id': 'edecanes-del-congreso-figuras-decorativas-que-le-cuestan-muy-caro-a-los-chilenos', + 'ext': 'mp4', + 'uploader': '(none)', + 'upload_date': '20160708', + 'title': 'Edecanes del Congreso: Figuras decorativas que le cuestan muy caro a los chilenos', + }, }, { 'url': 'http://tv.biobiochile.cl/notas/2015/10/22/ninos-transexuales-de-quien-es-la-decision.shtml', 'only_matching': True, @@ -45,42 +60,22 @@ class BioBioChileTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) + rudo_url = RudoIE._extract_url(webpage) + if not rudo_url: + raise ExtractorError('No videos found') + title = remove_end(self._og_search_title(webpage), ' - BioBioChile TV') - file_url = self._search_regex( - r'loadFWPlayerVideo\([^,]+,\s*(["\'])(?P<url>.+?)\1', - webpage, 'file url', group='url') 
- - base_url = self._search_regex( - r'file\s*:\s*(["\'])(?P<url>.+?)\1\s*\+\s*fileURL', webpage, - 'base url', default='http://unlimited2-cl.digitalproserver.com/bbtv/', - group='url') - - formats = self._extract_m3u8_formats( - '%s%s/playlist.m3u8' % (base_url, file_url), video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) - f = { - 'url': '%s%s' % (base_url, file_url), - 'format_id': 'http', - 'protocol': 'http', - 'preference': 1, - } - if formats: - f_copy = formats[-1].copy() - f_copy.update(f) - f = f_copy - formats.append(f) - self._sort_formats(formats) - thumbnail = self._og_search_thumbnail(webpage) uploader = self._html_search_regex( - r'<a[^>]+href=["\']https?://busca\.biobiochile\.cl/author[^>]+>(.+?)</a>', + r'<a[^>]+href=["\']https?://(?:busca|www)\.biobiochile\.cl/(?:lista/)?(?:author|autor)[^>]+>(.+?)</a>', webpage, 'uploader', fatal=False) return { + '_type': 'url_transparent', + 'url': rudo_url, 'id': video_id, 'title': title, 'thumbnail': thumbnail, 'uploader': uploader, - 'formats': formats, } diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 11cf49851..ff0aa11b1 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -29,7 +29,8 @@ class BRIE(InfoExtractor): 'duration': 180, 'uploader': 'Reinhard Weber', 'upload_date': '20150422', - } + }, + 'skip': '404 not found', }, { 'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html', @@ -40,7 +41,8 @@ class BRIE(InfoExtractor): 'title': 'Manfred Schreiber ist tot', 'description': 'md5:b454d867f2a9fc524ebe88c3f5092d97', 'duration': 26, - } + }, + 'skip': '404 not found', }, { 'url': 'https://www.br-klassik.de/audio/peeping-tom-premierenkritik-dance-festival-muenchen-100.html', @@ -51,7 +53,8 @@ class BRIE(InfoExtractor): 'title': 'Kurzweilig und sehr bewegend', 'description': 'md5:0351996e3283d64adeb38ede91fac54e', 'duration': 296, - } + }, + 'skip': '404 not found', }, { 'url': 'http://www.br.de/radio/bayern1/service/team/videos/team-video-erdelt100.html', diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index ef560b592..aeb22be16 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -26,6 +26,8 @@ from ..utils import ( unescapeHTML, unsmuggle_url, update_url_query, + clean_html, + mimetype2ext, ) @@ -90,6 +92,7 @@ class BrightcoveLegacyIE(InfoExtractor): 'description': 'md5:363109c02998fee92ec02211bd8000df', 'uploader': 'National Ballet of Canada', }, + 'skip': 'Video gone', }, { # test flv videos served by akamaihd.net @@ -108,7 +111,7 @@ class BrightcoveLegacyIE(InfoExtractor): }, }, { - # playlist test + # playlist with 'videoList' # from http://support.brightcove.com/en/video-cloud/docs/playlist-support-single-video-players 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL', 'info_dict': { @@ -117,6 +120,15 @@ class BrightcoveLegacyIE(InfoExtractor): }, 'playlist_mincount': 7, }, + { + # playlist with 'playlistTab' (https://github.com/rg3/youtube-dl/issues/9965) + 'url': 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=AQ%7E%7E,AAABXlLMdok%7E,NJ4EoMlZ4rZdx9eU1rkMVd8EaYPBBUlg', + 'info_dict': { + 'id': '1522758701001', + 'title': 'Lesson 08', + }, + 'playlist_mincount': 10, + }, ] FLV_VCODECS = { 1: 'SORENSON', @@ -298,13 +310,19 @@ class 
BrightcoveLegacyIE(InfoExtractor): info_url, player_key, 'Downloading playlist information') json_data = json.loads(playlist_info) - if 'videoList' not in json_data: + if 'videoList' in json_data: + playlist_info = json_data['videoList'] + playlist_dto = playlist_info['mediaCollectionDTO'] + elif 'playlistTabs' in json_data: + playlist_info = json_data['playlistTabs'] + playlist_dto = playlist_info['lineupListDTO']['playlistDTOs'][0] + else: raise ExtractorError('Empty playlist') - playlist_info = json_data['videoList'] - videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']] + + videos = [self._extract_video_info(video_info) for video_info in playlist_dto['videoDTOs']] return self.playlist_result(videos, playlist_id='%s' % playlist_info['id'], - playlist_title=playlist_info['mediaCollectionDTO']['displayName']) + playlist_title=playlist_dto['displayName']) def _extract_video_info(self, video_info): video_id = compat_str(video_info['id']) @@ -528,14 +546,16 @@ class BrightcoveNewIE(InfoExtractor): formats = [] for source in json_data.get('sources', []): container = source.get('container') - source_type = source.get('type') + ext = mimetype2ext(source.get('type')) src = source.get('src') - if source_type == 'application/x-mpegURL' or container == 'M2TS': + if ext == 'ism': + continue + elif ext == 'm3u8' or container == 'M2TS': if not src: continue formats.extend(self._extract_m3u8_formats( src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - elif source_type == 'application/dash+xml': + elif ext == 'mpd': if not src: continue formats.extend(self._extract_mpd_formats(src, video_id, 'dash', fatal=False)) @@ -551,7 +571,7 @@ class BrightcoveNewIE(InfoExtractor): 'tbr': tbr, 'filesize': int_or_none(source.get('size')), 'container': container, - 'ext': container.lower(), + 'ext': ext or container.lower(), } if width == 0 and height == 0: f.update({ @@ -585,6 +605,13 @@ class BrightcoveNewIE(InfoExtractor): 'format_id': build_format_id('rtmp'), }) formats.append(f) + + errors = json_data.get('errors') + if not formats and errors: + error = errors[0] + raise ExtractorError( + error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) + self._sort_formats(formats) subtitles = {} @@ -597,7 +624,7 @@ class BrightcoveNewIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'description': json_data.get('description'), + 'description': clean_html(json_data.get('description')), 'thumbnail': json_data.get('thumbnail') or json_data.get('poster'), 'duration': float_or_none(json_data.get('duration'), 1000), 'timestamp': parse_iso8601(json_data.get('published_at')), diff --git a/youtube_dl/extractor/buzzfeed.py b/youtube_dl/extractor/buzzfeed.py index df503ecc0..75fa92d7c 100644 --- a/youtube_dl/extractor/buzzfeed.py +++ b/youtube_dl/extractor/buzzfeed.py @@ -5,6 +5,7 @@ import json import re from .common import InfoExtractor +from .facebook import FacebookIE class BuzzFeedIE(InfoExtractor): @@ -20,11 +21,11 @@ class BuzzFeedIE(InfoExtractor): 'info_dict': { 'id': 'aVCR29aE_OQ', 'ext': 'mp4', + 'title': 'Angry Ram destroys a punching bag..', + 'description': 'md5:c59533190ef23fd4458a5e8c8c872345', 'upload_date': '20141024', 'uploader_id': 'Buddhanz1', - 'description': 'He likes to stay in shape with his heavy bag, he wont stop until its on the ground\n\nFollow Angry Ram on Facebook for regular updates -\nhttps://www.facebook.com/pages/Angry-Ram/1436897249899558?ref=hl', - 'uploader': 'Buddhanz', 
- 'title': 'Angry Ram destroys a punching bag', + 'uploader': 'Angry Ram', } }] }, { @@ -41,13 +42,30 @@ class BuzzFeedIE(InfoExtractor): 'info_dict': { 'id': 'mVmBL8B-In0', 'ext': 'mp4', + 'title': 're:Munchkin the Teddy Bear gets her exercise', + 'description': 'md5:28faab95cda6e361bcff06ec12fc21d8', 'upload_date': '20141124', 'uploader_id': 'CindysMunchkin', - 'description': 're:© 2014 Munchkin the', 'uploader': 're:^Munchkin the', - 'title': 're:Munchkin the Teddy Bear gets her exercise', }, }] + }, { + 'url': 'http://www.buzzfeed.com/craigsilverman/the-most-adorable-crash-landing-ever#.eq7pX0BAmK', + 'info_dict': { + 'id': 'the-most-adorable-crash-landing-ever', + 'title': 'Watch This Baby Goose Make The Most Adorable Crash Landing', + 'description': 'This gosling knows how to stick a landing.', + }, + 'playlist': [{ + 'md5': '763ca415512f91ca62e4621086900a23', + 'info_dict': { + 'id': '971793786185728', + 'ext': 'mp4', + 'title': 'We set up crash pads so that the goslings on our roof would have a safe landi...', + 'uploader': 'Calgary Outdoor Centre-University of Calgary', + }, + }], + 'add_ie': ['Facebook'], }] def _real_extract(self, url): @@ -66,6 +84,10 @@ class BuzzFeedIE(InfoExtractor): continue entries.append(self.url_result(video['url'])) + facebook_url = FacebookIE._extract_url(webpage) + if facebook_url: + entries.append(self.url_result(facebook_url)) + return { '_type': 'playlist', 'id': playlist_id, diff --git a/youtube_dl/extractor/camdemy.py b/youtube_dl/extractor/camdemy.py index 6ffbeabd3..268c34392 100644 --- a/youtube_dl/extractor/camdemy.py +++ b/youtube_dl/extractor/camdemy.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import datetime import re from .common import InfoExtractor @@ -10,8 +9,10 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( - parse_iso8601, + clean_html, + parse_duration, str_to_int, + unified_strdate, ) @@ -26,14 +27,14 @@ class CamdemyIE(InfoExtractor): 'ext': 'mp4', 'title': 'Ch1-1 Introduction, Signals (02-23-2012)', 'thumbnail': 're:^https?://.*\.jpg$', - 'description': '', 'creator': 'ss11spring', + 'duration': 1591, 'upload_date': '20130114', - 'timestamp': 1358154556, 'view_count': int, } }, { # With non-empty description + # webpage returns "No permission or not login" 'url': 'http://www.camdemy.com/media/13885', 'md5': '4576a3bb2581f86c61044822adbd1249', 'info_dict': { @@ -41,64 +42,71 @@ class CamdemyIE(InfoExtractor): 'ext': 'mp4', 'title': 'EverCam + Camdemy QuickStart', 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'md5:050b62f71ed62928f8a35f1a41e186c9', + 'description': 'md5:2a9f989c2b153a2342acee579c6e7db6', 'creator': 'evercam', - 'upload_date': '20140620', - 'timestamp': 1403271569, + 'duration': 318, } }, { - # External source + # External source (YouTube) 'url': 'http://www.camdemy.com/media/14842', - 'md5': '50e1c3c3aa233d3d7b7daa2fa10b1cf7', 'info_dict': { 'id': '2vsYQzNIsJo', 'ext': 'mp4', + 'title': 'Excel 2013 Tutorial - How to add Password Protection', + 'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection', 'upload_date': '20130211', 'uploader': 'Hun Kim', - 'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection', 'uploader_id': 'hunkimtutorials', - 'title': 'Excel 2013 Tutorial - How to add Password Protection', - } + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): video_id = self._match_id(url) - page = self._download_webpage(url, video_id) + + webpage = 
self._download_webpage(url, video_id) src_from = self._html_search_regex( - r"<div class='srcFrom'>Source: <a title='([^']+)'", page, - 'external source', default=None) + r"class=['\"]srcFrom['\"][^>]*>Sources?(?:\s+from)?\s*:\s*<a[^>]+(?:href|title)=(['\"])(?P<url>(?:(?!\1).)+)\1", + webpage, 'external source', default=None, group='url') if src_from: return self.url_result(src_from) oembed_obj = self._download_json( 'http://www.camdemy.com/oembed/?format=json&url=' + url, video_id) + title = oembed_obj['title'] thumb_url = oembed_obj['thumbnail_url'] video_folder = compat_urlparse.urljoin(thumb_url, 'video/') file_list_doc = self._download_xml( compat_urlparse.urljoin(video_folder, 'fileList.xml'), - video_id, 'Filelist XML') + video_id, 'Downloading filelist XML') file_name = file_list_doc.find('./video/item/fileName').text video_url = compat_urlparse.urljoin(video_folder, file_name) - timestamp = parse_iso8601(self._html_search_regex( - r"<div class='title'>Posted\s*:</div>\s*<div class='value'>([^<>]+)<", - page, 'creation time', fatal=False), - delimiter=' ', timezone=datetime.timedelta(hours=8)) - view_count = str_to_int(self._html_search_regex( - r"<div class='title'>Views\s*:</div>\s*<div class='value'>([^<>]+)<", - page, 'view count', fatal=False)) + # Some URLs return "No permission or not login" in a webpage despite being + # freely available via oembed JSON URL (e.g. http://www.camdemy.com/media/13885) + upload_date = unified_strdate(self._search_regex( + r'>published on ([^<]+)<', webpage, + 'upload date', default=None)) + view_count = str_to_int(self._search_regex( + r'role=["\']viewCnt["\'][^>]*>([\d,.]+) views', + webpage, 'view count', default=None)) + description = self._html_search_meta( + 'description', webpage, default=None) or clean_html( + oembed_obj.get('description')) return { 'id': video_id, 'url': video_url, - 'title': oembed_obj['title'], + 'title': title, 'thumbnail': thumb_url, - 'description': self._html_search_meta('description', page), - 'creator': oembed_obj['author_name'], - 'duration': oembed_obj['duration'], - 'timestamp': timestamp, + 'description': description, + 'creator': oembed_obj.get('author_name'), + 'duration': parse_duration(oembed_obj.get('duration')), + 'upload_date': upload_date, 'view_count': view_count, } diff --git a/youtube_dl/extractor/carambatv.py b/youtube_dl/extractor/carambatv.py new file mode 100644 index 000000000..5797fb951 --- /dev/null +++ b/youtube_dl/extractor/carambatv.py @@ -0,0 +1,88 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + float_or_none, + int_or_none, + try_get, +) + + +class CarambaTVIE(InfoExtractor): + _VALID_URL = r'(?:carambatv:|https?://video1\.carambatv\.ru/v/)(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://video1.carambatv.ru/v/191910501', + 'md5': '2f4a81b7cfd5ab866ee2d7270cb34a2a', + 'info_dict': { + 'id': '191910501', + 'ext': 'mp4', + 'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 2678.31, + }, + }, { + 'url': 'carambatv:191910501', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'http://video1.carambatv.ru/v/%s/videoinfo.js' % video_id, + video_id) + + title = video['title'] + + base_url = video.get('video') or 'http://video1.carambatv.ru/v/%s/' % video_id + + formats = [{ + 'url': base_url + f['fn'], + 'height': int_or_none(f.get('height')), + 
'format_id': '%sp' % f['height'] if f.get('height') else None, + } for f in video['qualities'] if f.get('fn')] + self._sort_formats(formats) + + thumbnail = video.get('splash') + duration = float_or_none(try_get( + video, lambda x: x['annotations'][0]['end_time'], compat_str)) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + } + + +class CarambaTVPageIE(InfoExtractor): + _VALID_URL = r'https?://carambatv\.ru/(?:[^/]+/)+(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'http://carambatv.ru/movie/bad-comedian/razborka-v-manile/', + 'md5': '', + 'info_dict': { + 'id': '191910501', + 'ext': 'mp4', + 'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 2678.31, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_url = self._og_search_property('video:iframe', webpage, default=None) + + if not video_url: + video_id = self._search_regex( + r'(?:video_id|crmb_vuid)\s*[:=]\s*["\']?(\d+)', + webpage, 'video id') + video_url = 'carambatv:%s' % video_id + + return self.url_result(video_url, CarambaTVIE.ie_key()) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index ff663d079..a87e97140 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -4,9 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( js_to_json, smuggle_url, + try_get, ) @@ -25,8 +27,22 @@ class CBCIE(InfoExtractor): 'upload_date': '20160203', 'uploader': 'CBCC-NEW', }, + 'skip': 'Geo-restricted to Canada', }, { - # with clipId + # with clipId, feed available via tpfeed.cbc.ca and feed.theplatform.com + 'url': 'http://www.cbc.ca/22minutes/videos/22-minutes-update/22-minutes-update-episode-4', + 'md5': '162adfa070274b144f4fdc3c3b8207db', + 'info_dict': { + 'id': '2414435309', + 'ext': 'mp4', + 'title': '22 Minutes Update: What Not To Wear Quebec', + 'description': "This week's latest Canadian top political story is What Not To Wear Quebec.", + 'upload_date': '20131025', + 'uploader': 'CBCC-NEW', + 'timestamp': 1382717907, + }, + }, { + # with clipId, feed only available via tpfeed.cbc.ca 'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live', 'md5': '0274a90b51a9b4971fe005c63f592f12', 'info_dict': { @@ -64,6 +80,7 @@ class CBCIE(InfoExtractor): 'uploader': 'CBCC-NEW', }, }], + 'skip': 'Geo-restricted to Canada', }] @classmethod @@ -81,9 +98,15 @@ class CBCIE(InfoExtractor): media_id = player_info.get('mediaId') if not media_id: clip_id = player_info['clipId'] - media_id = self._download_json( - 'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id, - clip_id)['entries'][0]['id'].split('/')[-1] + feed = self._download_json( + 'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={:mpsReleases}{%s}' % clip_id, + clip_id, fatal=False) + if feed: + media_id = try_get(feed, lambda x: x['entries'][0]['guid'], compat_str) + if not media_id: + media_id = self._download_json( + 'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id, + clip_id)['entries'][0]['id'].split('/')[-1] return self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) else: entries = [self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) for media_id in 
re.findall(r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"', webpage)] @@ -104,6 +127,7 @@ class CBCPlayerIE(InfoExtractor): 'upload_date': '20160210', 'uploader': 'CBCC-NEW', }, + 'skip': 'Geo-restricted to Canada', }, { # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/ 'url': 'http://www.cbc.ca/player/play/2657631896', diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index ac2c7dced..a23173d6f 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -1,17 +1,13 @@ from __future__ import unicode_literals -import re - -from .theplatform import ThePlatformIE +from .theplatform import ThePlatformFeedIE from ..utils import ( - xpath_text, - xpath_element, int_or_none, find_xpath_attr, ) -class CBSBaseIE(ThePlatformIE): +class CBSBaseIE(ThePlatformFeedIE): def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): closed_caption_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', 'ClosedCaptionURL') return { @@ -21,9 +17,22 @@ class CBSBaseIE(ThePlatformIE): }] } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else [] + def _extract_video_info(self, filter_query, video_id): + return self._extract_feed_info( + 'dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id, lambda entry: { + 'series': entry.get('cbs$SeriesTitle'), + 'season_number': int_or_none(entry.get('cbs$SeasonNumber')), + 'episode': entry.get('cbs$EpisodeTitle'), + 'episode_number': int_or_none(entry.get('cbs$EpisodeNumber')), + }, { + 'StreamPack': { + 'manifest': 'm3u', + } + }) + class CBSIE(CBSBaseIE): - _VALID_URL = r'(?:cbs:(?P<content_id>\w+)|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P<display_id>[^/]+))' + _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)' _TESTS = [{ 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', @@ -38,25 +47,7 @@ class CBSIE(CBSBaseIE): 'upload_date': '20131127', 'uploader': 'CBSI-NEW', }, - 'params': { - # rtmp download - 'skip_download': True, - }, - '_skip': 'Blocked outside the US', - }, { - 'url': 'http://www.cbs.com/shows/liveonletterman/artist/221752/st-vincent/', - 'info_dict': { - 'id': 'WWF_5KqY3PK1', - 'display_id': 'st-vincent', - 'ext': 'flv', - 'title': 'Live on Letterman - St. Vincent', - 'description': 'Live On Letterman: St. 
Vincent in concert from New York\'s Ed Sullivan Theater on Tuesday, July 16, 2014.', - 'duration': 3221, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, + 'expected_warnings': ['Failed to download m3u8 information'], '_skip': 'Blocked outside the US', }, { 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/', @@ -68,44 +59,5 @@ class CBSIE(CBSBaseIE): TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' def _real_extract(self, url): - content_id, display_id = re.match(self._VALID_URL, url).groups() - if not content_id: - webpage = self._download_webpage(url, display_id) - content_id = self._search_regex( - [r"video\.settings\.content_id\s*=\s*'([^']+)';", r"cbsplayer\.contentId\s*=\s*'([^']+)';"], - webpage, 'content id') - items_data = self._download_xml( - 'http://can.cbs.com/thunder/player/videoPlayerService.php', - content_id, query={'partner': 'cbs', 'contentId': content_id}) - video_data = xpath_element(items_data, './/item') - title = xpath_text(video_data, 'videoTitle', 'title', True) - - subtitles = {} - formats = [] - for item in items_data.findall('.//item'): - pid = xpath_text(item, 'pid') - if not pid: - continue - tp_release_url = self.TP_RELEASE_URL_TEMPLATE % pid - if '.m3u8' in xpath_text(item, 'contentUrl', default=''): - tp_release_url += '&manifest=m3u' - tp_formats, tp_subtitles = self._extract_theplatform_smil( - tp_release_url, content_id, 'Downloading %s SMIL data' % pid) - formats.extend(tp_formats) - subtitles = self._merge_subtitles(subtitles, tp_subtitles) - self._sort_formats(formats) - - info = self.get_metadata('dJ5BDC/media/guid/2198311517/%s' % content_id, content_id) - info.update({ - 'id': content_id, - 'display_id': display_id, - 'title': title, - 'series': xpath_text(video_data, 'seriesTitle'), - 'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')), - 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), - 'duration': int_or_none(xpath_text(video_data, 'videoLength'), 1000), - 'thumbnail': xpath_text(video_data, 'previewImageURL'), - 'formats': formats, - 'subtitles': subtitles, - }) - return info + content_id = self._match_id(url) + return self._extract_video_info('byGuid=%s' % content_id, content_id) diff --git a/youtube_dl/extractor/cbsinteractive.py b/youtube_dl/extractor/cbsinteractive.py index 0011c3029..821db20b2 100644 --- a/youtube_dl/extractor/cbsinteractive.py +++ b/youtube_dl/extractor/cbsinteractive.py @@ -80,9 +80,6 @@ class CBSInteractiveIE(ThePlatformIE): media_guid_path = 'media/guid/%d/%s' % (self.MPX_ACCOUNTS[site], vdata['mpxRefId']) formats, subtitles = [], {} - if site == 'cnet': - formats, subtitles = self._extract_theplatform_smil( - self.TP_RELEASE_URL_TEMPLATE % media_guid_path, video_id) for (fkey, vid) in vdata['files'].items(): if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']: continue @@ -94,7 +91,7 @@ class CBSInteractiveIE(ThePlatformIE): subtitles = self._merge_subtitles(subtitles, tp_subtitles) self._sort_formats(formats) - info = self.get_metadata('kYEXFC/%s' % media_guid_path, video_id) + info = self._extract_theplatform_metadata('kYEXFC/%s' % media_guid_path, video_id) info.update({ 'id': video_id, 'display_id': display_id, diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 79ddc20a0..9328e3e20 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -26,13 +26,17 @@ class CBSNewsIE(CBSBaseIE): # rtmp download 'skip_download': True, }, 
+ 'skip': 'Subscribers only', }, { 'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/', 'info_dict': { - 'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack', + 'id': 'SNJBOYzXiWBOvaLsdzwH8fmtP1SCd91Y', 'ext': 'mp4', 'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack', + 'description': 'md5:4a6983e480542d8b333a947bfc64ddc7', + 'upload_date': '19700101', + 'uploader': 'CBSI-NEW', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 205, 'subtitles': { @@ -58,37 +62,15 @@ class CBSNewsIE(CBSBaseIE): webpage, 'video JSON info'), video_id) item = video_info['item'] if 'item' in video_info else video_info - title = item.get('articleTitle') or item.get('hed') - duration = item.get('duration') - thumbnail = item.get('mediaImage') or item.get('thumbnail') - - subtitles = {} - formats = [] - for format_id in ['RtmpMobileLow', 'RtmpMobileHigh', 'Hls', 'RtmpDesktop']: - pid = item.get('media' + format_id) - if not pid: - continue - release_url = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' % pid - tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % pid) - formats.extend(tp_formats) - subtitles = self._merge_subtitles(subtitles, tp_subtitles) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } + guid = item['mpxRefId'] + return self._extract_video_info('byGuid=%s' % guid, guid) class CBSNewsLiveVideoIE(InfoExtractor): IE_DESC = 'CBS News Live Videos' _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P<id>[\da-z_-]+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/', 'info_dict': { 'id': 'clinton-sanders-prepare-to-face-off-in-nh', @@ -96,7 +78,15 @@ class CBSNewsLiveVideoIE(InfoExtractor): 'title': 'Clinton, Sanders Prepare To Face Off In NH', 'duration': 334, }, - } + 'skip': 'Video gone, redirected to http://www.cbsnews.com/live/', + }, { + 'url': 'http://www.cbsnews.com/live/video/video-shows-intense-paragliding-accident/', + 'info_dict': { + 'id': 'video-shows-intense-paragliding-accident', + 'ext': 'flv', + 'title': 'Video Shows Intense Paragliding Accident', + }, + }] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py index 549ae32f3..78ca44b02 100644 --- a/youtube_dl/extractor/cbssports.py +++ b/youtube_dl/extractor/cbssports.py @@ -1,30 +1,28 @@ from __future__ import unicode_literals -import re - -from .common import InfoExtractor +from .cbs import CBSBaseIE -class CBSSportsIE(InfoExtractor): - _VALID_URL = r'https?://www\.cbssports\.com/video/player/(?P<section>[^/]+)/(?P<id>[^/]+)' +class CBSSportsIE(CBSBaseIE): + _VALID_URL = r'https?://www\.cbssports\.com/video/player/[^/]+/(?P<id>\d+)' - _TEST = { - 'url': 'http://www.cbssports.com/video/player/tennis/318462531970/0/us-open-flashbacks-1990s', + _TESTS = [{ + 'url': 'http://www.cbssports.com/video/player/videos/708337219968/0/ben-simmons-the-next-lebron?-not-so-fast', 'info_dict': { - 'id': '_d5_GbO8p1sT', - 'ext': 'flv', - 'title': 'US Open flashbacks: 1990s', - 'description': 'Bill Macatee relives the best moments in US Open history from the 1990s.', + 'id': '708337219968', + 'ext': 'mp4', + 'title': 'Ben Simmons the next LeBron? 
Not so fast', + 'description': 'md5:854294f627921baba1f4b9a990d87197', + 'timestamp': 1466293740, + 'upload_date': '20160618', + 'uploader': 'CBSI-NEW', }, - } + 'params': { + # m3u8 download + 'skip_download': True, + } + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - section = mobj.group('section') - video_id = mobj.group('id') - all_videos = self._download_json( - 'http://www.cbssports.com/data/video/player/getVideos/%s?as=json' % section, - video_id) - # The json file contains the info of all the videos in the section - video_info = next(v for v in all_videos if v['pcid'] == video_id) - return self.url_result('theplatform:%s' % video_info['pid'], 'ThePlatform') + video_id = self._match_id(url) + return self._extract_video_info('byId=%s' % video_id, video_id) diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index 19f8b397e..252c2e846 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -23,7 +23,7 @@ class CliphunterIE(InfoExtractor): (?P<id>[0-9]+)/ (?P<seo>.+?)(?:$|[#\?]) ''' - _TEST = { + _TESTS = [{ 'url': 'http://www.cliphunter.com/w/1012420/Fun_Jynx_Maze_solo', 'md5': 'b7c9bbd4eb3a226ab91093714dcaa480', 'info_dict': { @@ -32,8 +32,19 @@ class CliphunterIE(InfoExtractor): 'title': 'Fun Jynx Maze solo', 'thumbnail': 're:^https?://.*\.jpg$', 'age_limit': 18, - } - } + }, + 'skip': 'Video gone', + }, { + 'url': 'http://www.cliphunter.com/w/2019449/ShesNew__My_booty_girlfriend_Victoria_Paradices_pussy_filled_with_jizz', + 'md5': '55a723c67bfc6da6b0cfa00d55da8a27', + 'info_dict': { + 'id': '2019449', + 'ext': 'mp4', + 'title': 'ShesNew - My booty girlfriend, Victoria Paradice\'s pussy filled with jizz', + 'thumbnail': 're:^https?://.*\.jpg$', + 'age_limit': 18, + }, + }] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/cliprs.py b/youtube_dl/extractor/cliprs.py index 4f9320ea5..d55b26d59 100644 --- a/youtube_dl/extractor/cliprs.py +++ b/youtube_dl/extractor/cliprs.py @@ -1,16 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, - int_or_none, - parse_iso8601, -) +from .onet import OnetBaseIE -class ClipRsIE(InfoExtractor): +class ClipRsIE(OnetBaseIE): _VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P<id>[^/]+)/\d+' _TEST = { 'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732', @@ -27,64 +21,13 @@ class ClipRsIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) + display_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id') + mvp_id = self._search_mvp_id(webpage) - response = self._download_json( - 'http://qi.ckm.onetapi.pl/', video_id, - query={ - 'body[id]': video_id, - 'body[jsonrpc]': '2.0', - 'body[method]': 'get_asset_detail', - 'body[params][ID_Publikacji]': video_id, - 'body[params][Service]': 'www.onet.pl', - 'content-type': 'application/jsonp', - 'x-onet-app': 'player.front.onetapi.pl', - }) + info_dict = self._extract_from_id(mvp_id, webpage) + info_dict['display_id'] = display_id - error = response.get('error') - if error: - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, error['message']), expected=True) - - video = response['result'].get('0') - - formats = [] - for _, 
formats_dict in video['formats'].items(): - if not isinstance(formats_dict, dict): - continue - for format_id, format_list in formats_dict.items(): - if not isinstance(format_list, list): - continue - for f in format_list: - if not f.get('url'): - continue - formats.append({ - 'url': f['url'], - 'format_id': format_id, - 'height': int_or_none(f.get('vertical_resolution')), - 'width': int_or_none(f.get('horizontal_resolution')), - 'abr': float_or_none(f.get('audio_bitrate')), - 'vbr': float_or_none(f.get('video_bitrate')), - }) - self._sort_formats(formats) - - meta = video.get('meta', {}) - - title = self._og_search_title(webpage, default=None) or meta['title'] - description = self._og_search_description(webpage, default=None) or meta.get('description') - duration = meta.get('length') or meta.get('lenght') - timestamp = parse_iso8601(meta.get('addDate'), ' ') - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - } + return info_dict diff --git a/youtube_dl/extractor/closertotruth.py b/youtube_dl/extractor/closertotruth.py new file mode 100644 index 000000000..26243d52d --- /dev/null +++ b/youtube_dl/extractor/closertotruth.py @@ -0,0 +1,92 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class CloserToTruthIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688', + 'info_dict': { + 'id': '0_zof1ktre', + 'display_id': 'solutions-the-mind-body-problem', + 'ext': 'mov', + 'title': 'Solutions to the Mind-Body Problem?', + 'upload_date': '20140221', + 'timestamp': 1392956007, + 'uploader_id': 'CTTXML' + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://closertotruth.com/episodes/how-do-brains-work', + 'info_dict': { + 'id': '0_iuxai6g6', + 'display_id': 'how-do-brains-work', + 'ext': 'mov', + 'title': 'How do Brains Work?', + 'upload_date': '20140221', + 'timestamp': 1392956024, + 'uploader_id': 'CTTXML' + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://closertotruth.com/interviews/1725', + 'info_dict': { + 'id': '1725', + 'title': 'AyaFr-002', + }, + 'playlist_mincount': 2, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + partner_id = self._search_regex( + r'<script[^>]+src=["\'].*?\b(?:partner_id|p)/(\d+)', + webpage, 'kaltura partner_id') + + title = self._search_regex( + r'<title>(.+?)\s*\|\s*.+?', webpage, 'video title') + + select = self._search_regex( + r'(?s)]+id="select-version"[^>]*>(.+?)', + webpage, 'select version', default=None) + if select: + entry_ids = set() + entries = [] + for mobj in re.finditer( + r']+value=(["\'])(?P[0-9a-z_]+)(?:#.+?)?\1[^>]*>(?P[^<]+)', + webpage): + entry_id = mobj.group('id') + if entry_id in entry_ids: + continue + entry_ids.add(entry_id) + entries.append({ + '_type': 'url_transparent', + 'url': 'kaltura:%s:%s' % (partner_id, entry_id), + 'ie_key': 'Kaltura', + 'title': mobj.group('title'), + }) + if entries: + return self.playlist_result(entries, display_id, title) + + entry_id = self._search_regex( + r'<a[^>]+id=(["\'])embed-kaltura\1[^>]+data-kaltura=(["\'])(?P<id>[0-9a-z_]+)\2', + webpage, 'kaltura entry_id', group='id') + + return { + '_type': 'url_transparent', + 'display_id': display_id, + 'url': 'kaltura:%s:%s' % 
(partner_id, entry_id), + 'ie_key': 'Kaltura', + 'title': title + } diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py index 9a28ef354..ae5ba0015 100644 --- a/youtube_dl/extractor/cloudy.py +++ b/youtube_dl/extractor/cloudy.py @@ -6,7 +6,6 @@ import re from .common import InfoExtractor from ..compat import ( compat_parse_qs, - compat_urllib_parse_urlencode, compat_HTTPError, ) from ..utils import ( @@ -17,37 +16,26 @@ from ..utils import ( class CloudyIE(InfoExtractor): - _IE_DESC = 'cloudy.ec and videoraj.ch' + _IE_DESC = 'cloudy.ec' _VALID_URL = r'''(?x) - https?://(?:www\.)?(?P<host>cloudy\.ec|videoraj\.(?:ch|to))/ + https?://(?:www\.)?cloudy\.ec/ (?:v/|embed\.php\?id=) (?P<id>[A-Za-z0-9]+) ''' - _EMBED_URL = 'http://www.%s/embed.php?id=%s' - _API_URL = 'http://www.%s/api/player.api.php?%s' + _EMBED_URL = 'http://www.cloudy.ec/embed.php?id=%s' + _API_URL = 'http://www.cloudy.ec/api/player.api.php' _MAX_TRIES = 2 - _TESTS = [ - { - 'url': 'https://www.cloudy.ec/v/af511e2527aac', - 'md5': '5cb253ace826a42f35b4740539bedf07', - 'info_dict': { - 'id': 'af511e2527aac', - 'ext': 'flv', - 'title': 'Funny Cats and Animals Compilation june 2013', - } - }, - { - 'url': 'http://www.videoraj.to/v/47f399fd8bb60', - 'md5': '7d0f8799d91efd4eda26587421c3c3b0', - 'info_dict': { - 'id': '47f399fd8bb60', - 'ext': 'flv', - 'title': 'Burning a New iPhone 5 with Gasoline - Will it Survive?', - } + _TEST = { + 'url': 'https://www.cloudy.ec/v/af511e2527aac', + 'md5': '5cb253ace826a42f35b4740539bedf07', + 'info_dict': { + 'id': 'af511e2527aac', + 'ext': 'flv', + 'title': 'Funny Cats and Animals Compilation june 2013', } - ] + } - def _extract_video(self, video_host, video_id, file_key, error_url=None, try_num=0): + def _extract_video(self, video_id, file_key, error_url=None, try_num=0): if try_num > self._MAX_TRIES - 1: raise ExtractorError('Unable to extract video URL', expected=True) @@ -64,9 +52,8 @@ class CloudyIE(InfoExtractor): 'errorUrl': error_url, }) - data_url = self._API_URL % (video_host, compat_urllib_parse_urlencode(form)) player_data = self._download_webpage( - data_url, video_id, 'Downloading player data') + self._API_URL, video_id, 'Downloading player data', query=form) data = compat_parse_qs(player_data) try_num += 1 @@ -88,7 +75,7 @@ class CloudyIE(InfoExtractor): except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code in [404, 410]: self.report_warning('Invalid video URL, requesting another', video_id) - return self._extract_video(video_host, video_id, file_key, video_url, try_num) + return self._extract_video(video_id, file_key, video_url, try_num) return { 'id': video_id, @@ -98,14 +85,13 @@ class CloudyIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_host = mobj.group('host') video_id = mobj.group('id') - url = self._EMBED_URL % (video_host, video_id) + url = self._EMBED_URL % video_id webpage = self._download_webpage(url, video_id) file_key = self._search_regex( [r'key\s*:\s*"([^"]+)"', r'filekey\s*=\s*"([^"]+)"'], webpage, 'file_key') - return self._extract_video(video_host, video_id, file_key) + return self._extract_video(video_id, file_key) diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py index f1311b14f..f24568dcc 100644 --- a/youtube_dl/extractor/cmt.py +++ b/youtube_dl/extractor/cmt.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals + from .mtv import MTVIE +from ..utils import ExtractorError class CMTIE(MTVIE): @@ -16,7 +18,27 @@ class CMTIE(MTVIE): 
'title': 'Garth Brooks - "The Call (featuring Trisha Yearwood)"', 'description': 'Blame It All On My Roots', }, + 'skip': 'Video not available', + }, { + 'url': 'http://www.cmt.com/videos/misc/1504699/still-the-king-ep-109-in-3-minutes.jhtml#id=1739908', + 'md5': 'e61a801ca4a183a466c08bd98dccbb1c', + 'info_dict': { + 'id': '1504699', + 'ext': 'mp4', + 'title': 'Still The King Ep. 109 in 3 Minutes', + 'description': 'Relive or catch up with Still The King by watching this recap of season 1, episode 9. New episodes Sundays 9/8c.', + 'timestamp': 1469421000.0, + 'upload_date': '20160725', + }, }, { 'url': 'http://www.cmt.com/shows/party-down-south/party-down-south-ep-407-gone-girl/1738172/playlist/#id=1738172', 'only_matching': True, }] + + @classmethod + def _transform_rtmp_url(cls, rtmp_video_url): + if 'error_not_available.swf' in rtmp_video_url: + raise ExtractorError( + '%s said: video is not available' % cls.IE_NAME, expected=True) + + return super(CMTIE, cls)._transform_rtmp_url(rtmp_video_url) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 2b6aaa3aa..c76909e48 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -1,17 +1,6 @@ from __future__ import unicode_literals -import re - from .mtv import MTVServicesInfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_urlencode, -) -from ..utils import ( - ExtractorError, - float_or_none, - unified_strdate, -) class ComedyCentralIE(MTVServicesInfoExtractor): @@ -26,8 +15,10 @@ class ComedyCentralIE(MTVServicesInfoExtractor): 'info_dict': { 'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354', 'ext': 'mp4', - 'title': 'CC:Stand-Up|Greg Fitzsimmons: Life on Stage|Uncensored - Too Good of a Mother', + 'title': 'CC:Stand-Up|August 18, 2013|1|0101|Uncensored - Too Good of a Mother', 'description': 'After a certain point, breastfeeding becomes c**kblocking.', + 'timestamp': 1376798400, + 'upload_date': '20130818', }, }, { 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/interviews/6yx39d/exclusive-rand-paul-extended-interview', @@ -35,241 +26,73 @@ class ComedyCentralIE(MTVServicesInfoExtractor): }] -class ComedyCentralShowsIE(MTVServicesInfoExtractor): - IE_DESC = 'The Daily Show / The Colbert Report' - # urls can be abbreviations like :thedailyshow - # urls for episodes like: - # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day - # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news - # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524 - _VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow) - |https?://(:www\.)? - (?P<showname>thedailyshow|thecolbertreport|tosh)\.(?:cc\.)?com/ - ((?:full-)?episodes/(?:[0-9a-z]{6}/)?(?P<episode>.*)| - (?P<clip> - (?:(?:guests/[^/]+|videos|video-(?:clips|playlists)|special-editions|news-team/[^/]+)/[^/]+/(?P<videotitle>[^/?#]+)) - |(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?)) - |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)) - )| - (?P<interview> - extended-interviews/(?P<interID>[0-9a-z]+)/ - (?:playlist_tds_extended_)?(?P<interview_title>[^/?#]*?) 
- (?:/[^/?#]?|[?#]|$)))) - ''' +class ToshIE(MTVServicesInfoExtractor): + IE_DESC = 'Tosh.0' + _VALID_URL = r'^https?://tosh\.cc\.com/video-(?:clips|collections)/[^/]+/(?P<videotitle>[^/?#]+)' + _FEED_URL = 'http://tosh.cc.com/feeds/mrss' + _TESTS = [{ - 'url': 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart', - 'md5': '4e2f5cb088a83cd8cdb7756132f9739d', - 'info_dict': { - 'id': 'ab9ab3e7-5a98-4dbe-8b21-551dc0523d55', - 'ext': 'mp4', - 'upload_date': '20121213', - 'description': 'Kristen Stewart learns to let loose in "On the Road."', - 'uploader': 'thedailyshow', - 'title': 'thedailyshow kristen-stewart part 1', - } - }, { - 'url': 'http://thedailyshow.cc.com/extended-interviews/b6364d/sarah-chayes-extended-interview', - 'info_dict': { - 'id': 'sarah-chayes-extended-interview', - 'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."', - 'title': 'thedailyshow Sarah Chayes Extended Interview', - }, - 'playlist': [ - { - 'info_dict': { - 'id': '0baad492-cbec-4ec1-9e50-ad91c291127f', - 'ext': 'mp4', - 'upload_date': '20150129', - 'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."', - 'uploader': 'thedailyshow', - 'title': 'thedailyshow sarah-chayes-extended-interview part 1', - }, - }, - { - 'info_dict': { - 'id': '1e4fb91b-8ce7-4277-bd7c-98c9f1bbd283', - 'ext': 'mp4', - 'upload_date': '20150129', - 'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."', - 'uploader': 'thedailyshow', - 'title': 'thedailyshow sarah-chayes-extended-interview part 2', - }, - }, - ], - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview', - 'only_matching': True, - }, { - 'url': 'http://thecolbertreport.cc.com/videos/29w6fx/-realhumanpraise-for-fox-news', - 'only_matching': True, - }, { - 'url': 'http://thecolbertreport.cc.com/videos/gh6urb/neil-degrasse-tyson-pt--1?xrs=eml_col_031114', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/guests/michael-lewis/3efna8/exclusive---michael-lewis-extended-interview-pt--3', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/episodes/sy7yv0/april-8--2014---denis-leary', - 'only_matching': True, - }, { - 'url': 'http://thecolbertreport.cc.com/episodes/8ase07/april-8--2014---jane-goodall', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/video-playlists/npde3s/the-daily-show-19088-highlights', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/video-playlists/t6d9sg/the-daily-show-20038-highlights/be3cwo', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/special-editions/2l8fdb/special-edition---a-look-back-at-food', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/news-team/michael-che/7wnfel/we-need-to-talk-about-israel', - 'only_matching': True, - }, { 'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans', + 'info_dict': { + 'description': 'Tosh asked fans to share their summer plans.', + 'title': 'Twitter Users Share Summer Plans', + }, + 'playlist': [{ + 'md5': 
'f269e88114c1805bb6d7653fecea9e06', + 'info_dict': { + 'id': '90498ec2-ed00-11e0-aca6-0026b9414f30', + 'ext': 'mp4', + 'title': 'Tosh.0|June 9, 2077|2|211|Twitter Users Share Summer Plans', + 'description': 'Tosh asked fans to share their summer plans.', + 'thumbnail': 're:^https?://.*\.jpg', + # It's really reported to be published on year 2077 + 'upload_date': '20770610', + 'timestamp': 3390510600, + 'subtitles': { + 'en': 'mincount:3', + }, + }, + }] + }, { + 'url': 'http://tosh.cc.com/video-collections/x2iz7k/just-plain-foul/m5q4fp', 'only_matching': True, }] - _available_formats = ['3500', '2200', '1700', '1200', '750', '400'] + @classmethod + def _transform_rtmp_url(cls, rtmp_video_url): + new_urls = super(ToshIE, cls)._transform_rtmp_url(rtmp_video_url) + new_urls['rtmp'] = rtmp_video_url.replace('viacomccstrm', 'viacommtvstrm') + return new_urls - _video_extensions = { - '3500': 'mp4', - '2200': 'mp4', - '1700': 'mp4', - '1200': 'mp4', - '750': 'mp4', - '400': 'mp4', - } - _video_dimensions = { - '3500': (1280, 720), - '2200': (960, 540), - '1700': (768, 432), - '1200': (640, 360), - '750': (512, 288), - '400': (384, 216), - } + +class ComedyCentralTVIE(MTVServicesInfoExtractor): + _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/(?:staffeln|shows)/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.comedycentral.tv/staffeln/7436-the-mindy-project-staffel-4', + 'info_dict': { + 'id': 'local_playlist-f99b626bdfe13568579a', + 'ext': 'flv', + 'title': 'Episode_the-mindy-project_shows_season-4_episode-3_full-episode_part1', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + 'url': 'http://www.comedycentral.tv/shows/1074-workaholics', + 'only_matching': True, + }, { + 'url': 'http://www.comedycentral.tv/shows/1727-the-mindy-project/bonus', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + video_id = self._match_id(url) - if mobj.group('shortname'): - return self.url_result('http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes') + webpage = self._download_webpage(url, video_id) - if mobj.group('clip'): - if mobj.group('videotitle'): - epTitle = mobj.group('videotitle') - elif mobj.group('showname') == 'thedailyshow': - epTitle = mobj.group('tdstitle') - else: - epTitle = mobj.group('cntitle') - dlNewest = False - elif mobj.group('interview'): - epTitle = mobj.group('interview_title') - dlNewest = False - else: - dlNewest = not mobj.group('episode') - if dlNewest: - epTitle = mobj.group('showname') - else: - epTitle = mobj.group('episode') - show_name = mobj.group('showname') + mrss_url = self._search_regex( + r'data-mrss=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'mrss url', group='url') - webpage, htmlHandle = self._download_webpage_handle(url, epTitle) - if dlNewest: - url = htmlHandle.geturl() - mobj = re.match(self._VALID_URL, url, re.VERBOSE) - if mobj is None: - raise ExtractorError('Invalid redirected URL: ' + url) - if mobj.group('episode') == '': - raise ExtractorError('Redirected URL is still not specific: ' + url) - epTitle = (mobj.group('episode') or mobj.group('videotitle')).rpartition('/')[-1] - - mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage) - if len(mMovieParams) == 0: - # The Colbert Report embeds the information in a without - # a URL prefix; so extract the alternate reference - # and then add the URL prefix manually. 
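# Editor's sketch (not part of the upstream patch): the new ComedyCentralTVIE
# above locates its MRSS feed through a data-mrss attribute and lets
# MTVServicesInfoExtractor resolve it, replacing the flash-URL scraping being
# deleted here. The regex is the one added in this diff; the sample markup is
# invented for illustration.
#
# import re
#
# page = '<div class="video-player" data-mrss="http://www.comedycentral.tv/feeds/mrss?uri=mgid:arc:video:x">'
# mrss_url = re.search(
#     r'data-mrss=(["\'])(?P<url>(?:(?!\1).)+)\1', page).group('url')
# mrss_url == 'http://www.comedycentral.tv/feeds/mrss?uri=mgid:arc:video:x'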
- - altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video|playlist).*?:.*?)"', webpage) - if len(altMovieParams) == 0: - raise ExtractorError('unable to find Flash URL in webpage ' + url) - else: - mMovieParams = [('http://media.mtvnservices.com/' + altMovieParams[0], altMovieParams[0])] - - uri = mMovieParams[0][1] - # Correct cc.com in uri - uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.com', uri) - - index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse_urlencode({'uri': uri})) - idoc = self._download_xml( - index_url, epTitle, - 'Downloading show index', 'Unable to download episode index') - - title = idoc.find('./channel/title').text - description = idoc.find('./channel/description').text - - entries = [] - item_els = idoc.findall('.//item') - for part_num, itemEl in enumerate(item_els): - upload_date = unified_strdate(itemEl.findall('./pubDate')[0].text) - thumbnail = itemEl.find('.//{http://search.yahoo.com/mrss/}thumbnail').attrib.get('url') - - content = itemEl.find('.//{http://search.yahoo.com/mrss/}content') - duration = float_or_none(content.attrib.get('duration')) - mediagen_url = content.attrib['url'] - guid = itemEl.find('./guid').text.rpartition(':')[-1] - - cdoc = self._download_xml( - mediagen_url, epTitle, - 'Downloading configuration for segment %d / %d' % (part_num + 1, len(item_els))) - - turls = [] - for rendition in cdoc.findall('.//rendition'): - finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text) - turls.append(finfo) - - formats = [] - for format, rtmp_video_url in turls: - w, h = self._video_dimensions.get(format, (None, None)) - formats.append({ - 'format_id': 'vhttp-%s' % format, - 'url': self._transform_rtmp_url(rtmp_video_url), - 'ext': self._video_extensions.get(format, 'mp4'), - 'height': h, - 'width': w, - }) - formats.append({ - 'format_id': 'rtmp-%s' % format, - 'url': rtmp_video_url.replace('viacomccstrm', 'viacommtvstrm'), - 'ext': self._video_extensions.get(format, 'mp4'), - 'height': h, - 'width': w, - }) - self._sort_formats(formats) - - subtitles = self._extract_subtitles(cdoc, guid) - - virtual_id = show_name + ' ' + epTitle + ' part ' + compat_str(part_num + 1) - entries.append({ - 'id': guid, - 'title': virtual_id, - 'formats': formats, - 'uploader': show_name, - 'upload_date': upload_date, - 'duration': duration, - 'thumbnail': thumbnail, - 'description': description, - 'subtitles': subtitles, - }) - - return { - '_type': 'playlist', - 'id': epTitle, - 'entries': entries, - 'title': show_name + ' ' + title, - 'description': description, - } + return self._get_videos_info_from_url(mrss_url, video_id) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index bfd432160..53c28f016 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -44,6 +44,7 @@ from ..utils import ( sanitized_Request, unescapeHTML, unified_strdate, + unified_timestamp, url_basename, xpath_element, xpath_text, @@ -53,6 +54,9 @@ from ..utils import ( mimetype2ext, update_Request, update_url_query, + parse_m3u8_attributes, + extract_attributes, + parse_codecs, ) @@ -160,6 +164,7 @@ class InfoExtractor(object): * "height" (optional, int) * "resolution" (optional, string "{width}x{height"}, deprecated) + * "filesize" (optional, int) thumbnail: Full URL to a video thumbnail image. description: Full video description. uploader: Full name of the video uploader. 
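# Editor's sketch (not part of the upstream patch): the docstring hunk above
# adds an optional "filesize" key to the per-thumbnail dicts an extractor may
# return. A minimal thumbnails entry using the documented fields would look
# like this (values invented; filesize is presumably a byte count):
#
# thumbnails = [{
#     'url': 'https://example.com/thumb-640x360.jpg',
#     'width': 640,        # optional, int
#     'height': 360,       # optional, int
#     'filesize': 10240,   # optional, int -- the field added in this hunk
# }]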
@@ -748,10 +753,12 @@ class InfoExtractor(object): return self._og_search_property('url', html, **kargs) def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): + if not isinstance(name, (list, tuple)): + name = [name] if display_name is None: - display_name = name + display_name = name[0] return self._html_search_regex( - self._meta_regex(name), + [self._meta_regex(n) for n in name], html, display_name, fatal=fatal, group='content', **kwargs) def _dc_search_uploader(self, html): @@ -800,15 +807,17 @@ class InfoExtractor(object): return self._html_search_meta('twitter:player', html, 'twitter card player') - def _search_json_ld(self, html, video_id, **kwargs): + def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): json_ld = self._search_regex( r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>', html, 'JSON-LD', group='json_ld', **kwargs) if not json_ld: return {} - return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True)) + return self._json_ld( + json_ld, video_id, fatal=kwargs.get('fatal', True), + expected_type=expected_type) - def _json_ld(self, json_ld, video_id, fatal=True): + def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): if isinstance(json_ld, compat_str): json_ld = self._parse_json(json_ld, video_id, fatal=fatal) if not json_ld: @@ -816,6 +825,8 @@ class InfoExtractor(object): info = {} if json_ld.get('@context') == 'http://schema.org': item_type = json_ld.get('@type') + if expected_type is not None and expected_type != item_type: + return info if item_type == 'TVEpisode': info.update({ 'episode': unescapeHTML(json_ld.get('name')), @@ -834,6 +845,19 @@ class InfoExtractor(object): 'title': unescapeHTML(json_ld.get('headline')), 'description': unescapeHTML(json_ld.get('articleBody')), }) + elif item_type == 'VideoObject': + info.update({ + 'url': json_ld.get('contentUrl'), + 'title': unescapeHTML(json_ld.get('name')), + 'description': unescapeHTML(json_ld.get('description')), + 'thumbnail': json_ld.get('thumbnailUrl'), + 'duration': parse_duration(json_ld.get('duration')), + 'timestamp': unified_timestamp(json_ld.get('uploadDate')), + 'filesize': float_or_none(json_ld.get('contentSize')), + 'tbr': int_or_none(json_ld.get('bitrate')), + 'width': int_or_none(json_ld.get('width')), + 'height': int_or_none(json_ld.get('height')), + }) return dict((k, v) for k, v in info.items() if v is not None) @staticmethod @@ -875,7 +899,11 @@ class InfoExtractor(object): f['ext'] = determine_ext(f['url']) if isinstance(field_preference, (list, tuple)): - return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference) + return tuple( + f.get(field) + if f.get(field) is not None + else ('' if field == 'format_id' else -1) + for field in field_preference) preference = f.get('preference') if preference is None: @@ -1150,23 +1178,11 @@ class InfoExtractor(object): }] last_info = None last_media = None - kv_rex = re.compile( - r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)') for line in m3u8_doc.splitlines(): if line.startswith('#EXT-X-STREAM-INF:'): - last_info = {} - for m in kv_rex.finditer(line): - v = m.group('val') - if v.startswith('"'): - v = v[1:-1] - last_info[m.group('key')] = v + last_info = parse_m3u8_attributes(line) elif line.startswith('#EXT-X-MEDIA:'): - last_media = {} - for m in kv_rex.finditer(line): - v = m.group('val') - if v.startswith('"'): - v = v[1:-1] - last_media[m.group('key')] = v + last_media = parse_m3u8_attributes(line) 
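# Editor's sketch (not part of the upstream patch): the hunk above replaces
# the inlined kv_rex attribute parsing with utils.parse_m3u8_attributes().
# utils.py is outside this diff; the function below simply re-packages the
# deleted kv_rex logic, so it should behave the same on the attribute lists
# carried by #EXT-X-STREAM-INF and #EXT-X-MEDIA lines.
#
# import re
#
# def parse_m3u8_attributes_sketch(line):
#     info = {}
#     for mobj in re.finditer(
#             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', line):
#         val = mobj.group('val')
#         if val.startswith('"'):
#             val = val[1:-1]  # strip the surrounding quotes
#         info[mobj.group('key')] = val
#     return info
#
# parse_m3u8_attributes_sketch(
#     '#EXT-X-STREAM-INF:BANDWIDTH=1280000,CODECS="avc1.4d401e,mp4a.40.2"')
# -> {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401e,mp4a.40.2'}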
elif line.startswith('#') or not line.strip(): continue else: @@ -1191,6 +1207,7 @@ class InfoExtractor(object): 'url': format_url(line.strip()), 'tbr': tbr, 'ext': ext, + 'fps': float_or_none(last_info.get('FRAME-RATE')), 'protocol': entry_protocol, 'preference': preference, } @@ -1199,24 +1216,17 @@ class InfoExtractor(object): width_str, height_str = resolution.split('x') f['width'] = int(width_str) f['height'] = int(height_str) - codecs = last_info.get('CODECS') - if codecs: - vcodec, acodec = [None] * 2 - va_codecs = codecs.split(',') - if len(va_codecs) == 1: - # Audio only entries usually come with single codec and - # no resolution. For more robustness we also check it to - # be mp4 audio. - if not resolution and va_codecs[0].startswith('mp4a'): - vcodec, acodec = 'none', va_codecs[0] - else: - vcodec = va_codecs[0] - else: - vcodec, acodec = va_codecs[:2] + # Unified Streaming Platform + mobj = re.search( + r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url']) + if mobj: + abr, vbr = mobj.groups() + abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000) f.update({ - 'acodec': acodec, - 'vcodec': vcodec, + 'vbr': vbr, + 'abr': abr, }) + f.update(parse_codecs(last_info.get('CODECS'))) if last_media is not None: f['m3u8_media'] = last_media last_media = None @@ -1471,6 +1481,13 @@ class InfoExtractor(object): compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict) def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}): + """ + Parse formats from MPD manifest. + References: + 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E), + http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip + 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP + """ if mpd_doc.get('type') == 'dynamic': return [] @@ -1503,8 +1520,16 @@ class InfoExtractor(object): s_e = segment_timeline.findall(_add_ns('S')) if s_e: ms_info['total_number'] = 0 + ms_info['s'] = [] for s in s_e: - ms_info['total_number'] += 1 + int(s.get('r', '0')) + r = int(s.get('r', 0)) + ms_info['total_number'] += 1 + r + ms_info['s'].append({ + 't': int(s.get('t', 0)), + # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60]) + 'd': int(s.attrib['d']), + 'r': r, + }) else: timescale = segment_template.get('timescale') if timescale: @@ -1541,7 +1566,7 @@ class InfoExtractor(object): continue representation_attrib = adaptation_set.attrib.copy() representation_attrib.update(representation.attrib) - # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory + # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory mime_type = representation_attrib['mimeType'] content_type = mime_type.split('/')[0] if content_type == 'text': @@ -1585,16 +1610,40 @@ class InfoExtractor(object): representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) media_template = representation_ms_info['media_template'] media_template = media_template.replace('$RepresentationID$', representation_id) - media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template) - media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template) + media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template) + media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template) media_template.replace('$$', '$') - representation_ms_info['segment_urls'] = [ - media_template % { - 'Number': segment_number, - 
'Bandwidth': representation_attrib.get('bandwidth')} - for segment_number in range( - representation_ms_info['start_number'], - representation_ms_info['total_number'] + representation_ms_info['start_number'])] + + # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ + # can't be used at the same time + if '%(Number' in media_template: + representation_ms_info['segment_urls'] = [ + media_template % { + 'Number': segment_number, + 'Bandwidth': representation_attrib.get('bandwidth'), + } + for segment_number in range( + representation_ms_info['start_number'], + representation_ms_info['total_number'] + representation_ms_info['start_number'])] + else: + representation_ms_info['segment_urls'] = [] + segment_time = 0 + + def add_segment_url(): + representation_ms_info['segment_urls'].append( + media_template % { + 'Time': segment_time, + 'Bandwidth': representation_attrib.get('bandwidth'), + } + ) + + for num, s in enumerate(representation_ms_info['s']): + segment_time = s.get('t') or segment_time + add_segment_url() + for r in range(s.get('r', 0)): + segment_time += s['d'] + add_segment_url() + segment_time += s['d'] if 'segment_urls' in representation_ms_info: f.update({ 'segment_urls': representation_ms_info['segment_urls'], @@ -1621,6 +1670,62 @@ class InfoExtractor(object): self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats + def _parse_html5_media_entries(self, base_url, webpage): + def absolute_url(video_url): + return compat_urlparse.urljoin(base_url, video_url) + + def parse_content_type(content_type): + if not content_type: + return {} + ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type) + if ctr: + mimetype, codecs = ctr.groups() + f = parse_codecs(codecs) + f['ext'] = mimetype2ext(mimetype) + return f + return {} + + entries = [] + for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage): + media_info = { + 'formats': [], + 'subtitles': {}, + } + media_attributes = extract_attributes(media_tag) + src = media_attributes.get('src') + if src: + media_info['formats'].append({ + 'url': absolute_url(src), + 'vcodec': 'none' if media_type == 'audio' else None, + }) + media_info['thumbnail'] = media_attributes.get('poster') + if media_content: + for source_tag in re.findall(r'<source[^>]+>', media_content): + source_attributes = extract_attributes(source_tag) + src = source_attributes.get('src') + if not src: + continue + f = parse_content_type(source_attributes.get('type')) + f.update({ + 'url': absolute_url(src), + 'vcodec': 'none' if media_type == 'audio' else None, + }) + media_info['formats'].append(f) + for track_tag in re.findall(r'<track[^>]+>', media_content): + track_attributes = extract_attributes(track_tag) + kind = track_attributes.get('kind') + if not kind or kind == 'subtitles': + src = track_attributes.get('src') + if not src: + continue + lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label') + media_info['subtitles'].setdefault(lang, []).append({ + 'url': absolute_url(src), + }) + if media_info['formats']: + entries.append(media_info) + return entries + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() @@ -1681,7 +1786,7 @@ class InfoExtractor(object): any_restricted = False for tc in self.get_testcases(include_onlymatching=False): - if 'playlist' in tc: + if tc.get('playlist', []): tc = tc['playlist'][0] is_restricted = 
age_restricted( tc.get('info_dict', {}).get('age_limit'), age_limit) @@ -1734,6 +1839,13 @@ class InfoExtractor(object): def _mark_watched(self, *args, **kwargs): raise NotImplementedError('This method must be implemented by subclasses') + def geo_verification_headers(self): + headers = {} + geo_verification_proxy = self._downloader.params.get('geo_verification_proxy') + if geo_verification_proxy: + headers['Ytdl-request-proxy'] = geo_verification_proxy + return headers + class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 84b36f44c..7e5d4f227 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -51,8 +51,11 @@ class CSpanIE(InfoExtractor): 'url': 'http://www.c-span.org/video/?104517-1/immigration-reforms-needed-protect-skilled-american-workers', 'info_dict': { 'id': 'judiciary031715', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Immigration Reforms Needed to Protect Skilled American Workers', + }, + 'params': { + 'skip_download': True, # m3u8 downloads } }] diff --git a/youtube_dl/extractor/ctv.py b/youtube_dl/extractor/ctv.py new file mode 100644 index 000000000..5807fbac9 --- /dev/null +++ b/youtube_dl/extractor/ctv.py @@ -0,0 +1,30 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class CTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ctv\.ca/video/player\?vid=(?P<id>[0-9.]+)' + _TESTS = [{ + 'url': 'http://www.ctv.ca/video/player?vid=706966', + 'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0', + 'info_dict': { + 'id': '706966', + 'ext': 'mp4', + 'title': 'Larry Day and Richard Jutras on the TIFF red carpet of \'Stonewall\'', + 'description': 'etalk catches up with Larry Day and Richard Jutras on the TIFF red carpet of "Stonewall”.', + 'upload_date': '20150919', + 'timestamp': 1442624700, + }, + 'expected_warnings': ['HTTP Error 404'], + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': '9c9media:ctv_web:%s' % video_id, + 'ie_key': 'NineCNineMedia', + } diff --git a/youtube_dl/extractor/ctvnews.py b/youtube_dl/extractor/ctvnews.py new file mode 100644 index 000000000..1023b6130 --- /dev/null +++ b/youtube_dl/extractor/ctvnews.py @@ -0,0 +1,65 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import orderedSet + + +class CTVNewsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ctvnews\.ca/(?:video\?(?:clip|playlist|bin)Id=|.*?)(?P<id>[0-9.]+)' + _TESTS = [{ + 'url': 'http://www.ctvnews.ca/video?clipId=901995', + 'md5': '10deb320dc0ccb8d01d34d12fc2ea672', + 'info_dict': { + 'id': '901995', + 'ext': 'mp4', + 'title': 'Extended: \'That person cannot be me\' Johnson says', + 'description': 'md5:958dd3b4f5bbbf0ed4d045c790d89285', + 'timestamp': 1467286284, + 'upload_date': '20160630', + } + }, { + 'url': 'http://www.ctvnews.ca/video?playlistId=1.2966224', + 'info_dict': + { + 'id': '1.2966224', + }, + 'playlist_mincount': 19, + }, { + 'url': 'http://www.ctvnews.ca/video?binId=1.2876780', + 'info_dict': + { + 'id': '1.2876780', + }, + 'playlist_mincount': 100, + }, { + 'url': 'http://www.ctvnews.ca/1.810401', + 'only_matching': True, + }, { + 'url': 'http://www.ctvnews.ca/canadiens-send-p-k-subban-to-nashville-in-blockbuster-trade-1.2967231', + 'only_matching': True, + }] + + def _real_extract(self, url): + page_id = self._match_id(url) + + def 
ninecninemedia_url_result(clip_id): + return { + '_type': 'url_transparent', + 'id': clip_id, + 'url': '9c9media:ctvnews_web:%s' % clip_id, + 'ie_key': 'NineCNineMedia', + } + + if page_id.isdigit(): + return ninecninemedia_url_result(page_id) + else: + webpage = self._download_webpage('http://www.ctvnews.ca/%s' % page_id, page_id, query={ + 'ot': 'example.AjaxPageLayout.ot', + 'maxItemsPerPage': 1000000, + }) + entries = [ninecninemedia_url_result(clip_id) for clip_id in orderedSet( + re.findall(r'clip\.id\s*=\s*(\d+);', webpage))] + return self.playlist_result(entries, page_id) diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py index b60a1d813..98c835bf1 100644 --- a/youtube_dl/extractor/dailymail.py +++ b/youtube_dl/extractor/dailymail.py @@ -5,19 +5,20 @@ from .common import InfoExtractor from ..utils import ( int_or_none, determine_protocol, + unescapeHTML, ) class DailyMailIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P<id>[0-9]+)' _TEST = { - 'url': 'http://www.dailymail.co.uk/video/sciencetech/video-1288527/Turn-video-impressionist-masterpiece.html', - 'md5': '2f639d446394f53f3a33658b518b6615', + 'url': 'http://www.dailymail.co.uk/video/tvshowbiz/video-1295863/The-Mountain-appears-sparkling-water-ad-Heavy-Bubbles.html', + 'md5': 'f6129624562251f628296c3a9ffde124', 'info_dict': { - 'id': '1288527', + 'id': '1295863', 'ext': 'mp4', - 'title': 'Turn any video into an impressionist masterpiece', - 'description': 'md5:88ddbcb504367987b2708bb38677c9d2', + 'title': 'The Mountain appears in sparkling water ad for \'Heavy Bubbles\'', + 'description': 'md5:a93d74b6da172dd5dc4d973e0b766a84', } } @@ -26,7 +27,7 @@ class DailyMailIE(InfoExtractor): webpage = self._download_webpage(url, video_id) video_data = self._parse_json(self._search_regex( r"data-opts='({.+?})'", webpage, 'video data'), video_id) - title = video_data['title'] + title = unescapeHTML(video_data['title']) video_sources = self._download_json(video_data.get( 'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id) @@ -55,7 +56,7 @@ class DailyMailIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'description': video_data.get('descr'), + 'description': unescapeHTML(video_data.get('descr')), 'thumbnail': video_data.get('poster') or video_data.get('thumbnail'), 'formats': formats, } diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 2e6226ea0..1f92823b7 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -16,6 +16,7 @@ from ..utils import ( sanitized_Request, str_to_int, unescapeHTML, + mimetype2ext, ) @@ -111,6 +112,13 @@ class DailymotionIE(DailymotionBaseInfoExtractor): } ] + @staticmethod + def _extract_urls(webpage): + # Look for embedded Dailymotion player + matches = re.findall( + r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage) + return list(map(lambda m: unescapeHTML(m[1]), matches)) + def _real_extract(self, url): video_id = self._match_id(url) @@ -153,18 +161,19 @@ class DailymotionIE(DailymotionBaseInfoExtractor): type_ = media.get('type') if type_ == 'application/vnd.lumberjack.manifest': continue - ext = determine_ext(media_url) - if type_ == 'application/x-mpegURL' or ext == 'm3u8': + ext = mimetype2ext(type_) or determine_ext(media_url) + if ext == 'm3u8': 
formats.extend(self._extract_m3u8_formats( media_url, video_id, 'mp4', preference=-1, m3u8_id='hls', fatal=False)) - elif type_ == 'application/f4m' or ext == 'f4m': + elif ext == 'f4m': formats.extend(self._extract_f4m_formats( media_url, video_id, preference=-1, f4m_id='hds', fatal=False)) else: f = { 'url': media_url, 'format_id': 'http-%s' % quality, + 'ext': ext, } m = re.search(r'H264-(?P<width>\d+)x(?P<height>\d+)', media_url) if m: diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index 86024a745..b5c310ccb 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -66,22 +66,32 @@ class DaumIE(InfoExtractor): 'view_count': int, 'comment_count': int, }, + }, { + # Requires dte_type=WEB (#9972) + 'url': 'http://tvpot.daum.net/v/s3794Uf1NZeZ1qMpGpeqeRU', + 'md5': 'a8917742069a4dd442516b86e7d66529', + 'info_dict': { + 'id': 's3794Uf1NZeZ1qMpGpeqeRU', + 'ext': 'mp4', + 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny) [쇼! 음악중심] 508회 20160611', + 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\n\n[쇼! 음악중심] 20160611, 507회', + 'upload_date': '20160611', + }, }] def _real_extract(self, url): video_id = compat_urllib_parse_unquote(self._match_id(url)) - query = compat_urllib_parse_urlencode({'vid': video_id}) movie_data = self._download_json( - 'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json?' + query, - video_id, 'Downloading video formats info') + 'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json', + video_id, 'Downloading video formats info', query={'vid': video_id, 'dte_type': 'WEB'}) # For urls like http://m.tvpot.daum.net/v/65139429, where the video_id is really a clipid if not movie_data.get('output_list', {}).get('output_list') and re.match(r'^\d+$', video_id): return self.url_result('http://tvpot.daum.net/clip/ClipView.do?clipid=%s' % video_id) info = self._download_xml( - 'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id, - 'Downloading video info') + 'http://tvpot.daum.net/clip/ClipInfoXml.do', video_id, + 'Downloading video info', query={'vid': video_id}) formats = [] for format_el in movie_data['output_list']['output_list']: diff --git a/youtube_dl/extractor/dbtv.py b/youtube_dl/extractor/dbtv.py index 133cdc50b..caff8842e 100644 --- a/youtube_dl/extractor/dbtv.py +++ b/youtube_dl/extractor/dbtv.py @@ -4,78 +4,47 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - float_or_none, - int_or_none, - clean_html, -) class DBTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dbtv\.no/(?:(?:lazyplayer|player)/)?(?P<id>[0-9]+)(?:#(?P<display_id>.+))?' + _VALID_URL = r'https?://(?:www\.)?dbtv\.no/(?:[^/]+/)?(?P<id>[0-9]+)(?:#(?P<display_id>.+))?' 
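# Illustrative sketch (not part of the patch, runnable standalone): the relaxed
# DBTV pattern above accepts any single path component before the numeric id,
# so the old lazyplayer/player whitelist is no longer needed. The URLs are the
# ones exercised by the tests below.
import re

DBTV_VALID_URL = r'https?://(?:www\.)?dbtv\.no/(?:[^/]+/)?(?P<id>[0-9]+)(?:#(?P<display_id>.+))?'
for candidate_url in (
        'http://dbtv.no/3649835190001',
        'http://www.dbtv.no/lazyplayer/4631135248001',
        'http://dbtv.no/vice/5000634109001',
        'http://dbtv.no/filmtrailer/3359293614001'):
    assert re.match(DBTV_VALID_URL, candidate_url) is not None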
_TESTS = [{ 'url': 'http://dbtv.no/3649835190001#Skulle_teste_ut_fornøyelsespark,_men_kollegaen_var_bare_opptatt_av_bikinikroppen', - 'md5': 'b89953ed25dacb6edb3ef6c6f430f8bc', + 'md5': '2e24f67936517b143a234b4cadf792ec', 'info_dict': { - 'id': '33100', + 'id': '3649835190001', 'display_id': 'Skulle_teste_ut_fornøyelsespark,_men_kollegaen_var_bare_opptatt_av_bikinikroppen', 'ext': 'mp4', 'title': 'Skulle teste ut fornøyelsespark, men kollegaen var bare opptatt av bikinikroppen', 'description': 'md5:1504a54606c4dde3e4e61fc97aa857e0', - 'thumbnail': 're:https?://.*\.jpg$', - 'timestamp': 1404039863.438, + 'thumbnail': 're:https?://.*\.jpg', + 'timestamp': 1404039863, 'upload_date': '20140629', 'duration': 69.544, - 'view_count': int, - 'categories': list, - } + 'uploader_id': '1027729757001', + }, + 'add_ie': ['BrightcoveNew'] }, { 'url': 'http://dbtv.no/3649835190001', 'only_matching': True, }, { 'url': 'http://www.dbtv.no/lazyplayer/4631135248001', 'only_matching': True, + }, { + 'url': 'http://dbtv.no/vice/5000634109001', + 'only_matching': True, + }, { + 'url': 'http://dbtv.no/filmtrailer/3359293614001', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id - - data = self._download_json( - 'http://api.dbtv.no/discovery/%s' % video_id, display_id) - - video = data['playlist'][0] - - formats = [{ - 'url': f['URL'], - 'vcodec': f.get('container'), - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), - 'vbr': float_or_none(f.get('rate'), 1000), - 'filesize': int_or_none(f.get('size')), - } for f in video['renditions'] if 'URL' in f] - - if not formats: - for url_key, format_id in [('URL', 'mp4'), ('HLSURL', 'hls')]: - if url_key in video: - formats.append({ - 'url': video[url_key], - 'format_id': format_id, - }) - - self._sort_formats(formats) + video_id, display_id = re.match(self._VALID_URL, url).groups() return { - 'id': compat_str(video['id']), + '_type': 'url_transparent', + 'url': 'http://players.brightcove.net/1027729757001/default_default/index.html?videoId=%s' % video_id, + 'id': video_id, 'display_id': display_id, - 'title': video['title'], - 'description': clean_html(video['desc']), - 'thumbnail': video.get('splash') or video.get('thumb'), - 'timestamp': float_or_none(video.get('publishedAt'), 1000), - 'duration': float_or_none(video.get('length'), 1000), - 'view_count': int_or_none(video.get('views')), - 'categories': video.get('tags'), - 'formats': formats, + 'ie_key': 'BrightcoveNew', } diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 5deff5f30..b8542820a 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -20,7 +20,7 @@ from ..utils import ( class DCNIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?' + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?' 
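# Illustrative sketch (not part of the patch) of the url_transparent pattern
# used by CTVIE, CTVNewsIE and DBTVIE above: extraction is delegated to the
# extractor named by ie_key, and any fields set in this dict (id, display_id,
# ...) take precedence over what the delegate extracts. The Brightcove account
# id is the one from the DBTV change above; the helper name is hypothetical.
def brightcove_url_result(video_id, display_id):
    return {
        '_type': 'url_transparent',
        'ie_key': 'BrightcoveNew',
        'url': 'http://players.brightcove.net/1027729757001/default_default/index.html?videoId=%s' % video_id,
        # These override the corresponding fields returned by BrightcoveNew:
        'id': video_id,
        'display_id': display_id,
    }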
def _real_extract(self, url): show_id, video_id, season_id = re.match(self._VALID_URL, url).groups() @@ -55,30 +55,30 @@ class DCNBaseIE(InfoExtractor): 'is_live': is_live, } - def _extract_video_formats(self, webpage, video_id, entry_protocol): + def _extract_video_formats(self, webpage, video_id, m3u8_entry_protocol): formats = [] - m3u8_url = self._html_search_regex( - r'file\s*:\s*"([^"]+)', webpage, 'm3u8 url', fatal=False) - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=None)) - - rtsp_url = self._search_regex( - r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False) - if rtsp_url: - formats.append({ - 'url': rtsp_url, - 'format_id': 'rtsp', - }) - + format_url_base = 'http' + self._html_search_regex( + [ + r'file\s*:\s*"https?(://[^"]+)/playlist.m3u8', + r'<a[^>]+href="rtsp(://[^"]+)"' + ], webpage, 'format url') + formats.extend(self._extract_mpd_formats( + format_url_base + '/manifest.mpd', + video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_m3u8_formats( + format_url_base + '/playlist.m3u8', video_id, 'mp4', + m3u8_entry_protocol, m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + format_url_base + '/manifest.f4m', + video_id, f4m_id='hds', fatal=False)) self._sort_formats(formats) return formats class DCNVideoIE(DCNBaseIE): IE_NAME = 'dcn:video' - _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/[^/]+|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?(?:video(?:/[^/]+)?|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)' + _TESTS = [{ 'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375', 'info_dict': { @@ -94,7 +94,10 @@ class DCNVideoIE(DCNBaseIE): # m3u8 download 'skip_download': True, }, - } + }, { + 'url': 'http://awaan.ae/video/26723981/%D8%AF%D8%A7%D8%B1-%D8%A7%D9%84%D8%B3%D9%84%D8%A7%D9%85:-%D8%AE%D9%8A%D8%B1-%D8%AF%D9%88%D8%B1-%D8%A7%D9%84%D8%A3%D9%86%D8%B5%D8%A7%D8%B1', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -120,7 +123,7 @@ class DCNVideoIE(DCNBaseIE): class DCNLiveIE(DCNBaseIE): IE_NAME = 'dcn:live' - _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?live/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?live/(?P<id>\d+)' def _real_extract(self, url): channel_id = self._match_id(url) @@ -147,7 +150,7 @@ class DCNLiveIE(DCNBaseIE): class DCNSeasonIE(InfoExtractor): IE_NAME = 'dcn:season' - _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))' + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))' _TEST = { 'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A', 'info_dict': diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 0040e70d4..908c9e514 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -17,8 +17,12 @@ class DreiSatIE(ZDFIE): 'ext': 'mp4', 'title': 'Waidmannsheil', 'description': 'md5:cce00ca1d70e21425e72c86a98a56817', - 'uploader': '3sat', + 'uploader': 'SCHWEIZWEIT', + 'uploader_id': '100000210', 'upload_date': '20140913' + }, + 'params': { + 'skip_download': True, # m3u8 
downloads } }, { diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 113a4966f..12d28d3b9 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -50,6 +50,14 @@ class EaglePlatformIE(InfoExtractor): 'skip': 'Georestricted', }] + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1', + webpage) + if mobj is not None: + return mobj.group('url') + @staticmethod def _handle_error(response): status = int_or_none(response.get('status', 200)) diff --git a/youtube_dl/extractor/ellentv.py b/youtube_dl/extractor/ellentv.py index 4c8190d68..74bbc5c51 100644 --- a/youtube_dl/extractor/ellentv.py +++ b/youtube_dl/extractor/ellentv.py @@ -6,12 +6,13 @@ import json from .common import InfoExtractor from ..utils import ( ExtractorError, + NO_DEFAULT, ) class EllenTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?:ellentv|ellentube)\.com/videos/(?P<id>[a-z0-9_-]+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.ellentv.com/videos/0-ipq1gsai/', 'md5': '4294cf98bc165f218aaa0b89e0fd8042', 'info_dict': { @@ -22,24 +23,47 @@ class EllenTVIE(InfoExtractor): 'timestamp': 1428035648, 'upload_date': '20150403', 'uploader_id': 'batchUser', - } - } + }, + }, { + # not available via http://widgets.ellentube.com/ + 'url': 'http://www.ellentv.com/videos/1-szkgu2m2/', + 'info_dict': { + 'id': '1_szkgu2m2', + 'ext': 'flv', + 'title': "Ellen's Amazingly Talented Audience", + 'description': 'md5:86ff1e376ff0d717d7171590e273f0a5', + 'timestamp': 1255140900, + 'upload_date': '20091010', + 'uploader_id': 'ellenkaltura@gmail.com', + }, + 'params': { + 'skip_download': True, + }, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'http://widgets.ellentube.com/videos/%s' % video_id, - video_id) + URLS = ('http://widgets.ellentube.com/videos/%s' % video_id, url) - partner_id = self._search_regex( - r"var\s+partnerId\s*=\s*'([^']+)", webpage, 'partner id') + for num, url_ in enumerate(URLS, 1): + webpage = self._download_webpage( + url_, video_id, fatal=num == len(URLS)) - kaltura_id = self._search_regex( - [r'id="kaltura_player_([^"]+)"', - r"_wb_entry_id\s*:\s*'([^']+)", - r'data-kaltura-entry-id="([^"]+)'], - webpage, 'kaltura id') + default = NO_DEFAULT if num == len(URLS) else None + + partner_id = self._search_regex( + r"var\s+partnerId\s*=\s*'([^']+)", webpage, 'partner id', + default=default) + + kaltura_id = self._search_regex( + [r'id="kaltura_player_([^"]+)"', + r"_wb_entry_id\s*:\s*'([^']+)", + r'data-kaltura-entry-id="([^"]+)'], + webpage, 'kaltura id', default=default) + + if partner_id and kaltura_id: + break return self.url_result('kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura') diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index ac5d0fe24..f3734e9f8 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -4,19 +4,23 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( + encode_base_n, + ExtractorError, + int_or_none, parse_duration, str_to_int, ) class EpornerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\w+)/(?P<display_id>[\w-]+)' + _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\w+)(?:/(?P<display_id>[\w-]+))?' 
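# Illustrative sketch (not part of the patch) of the NO_DEFAULT trick used by
# EllenTVIE above: _search_regex raises on a miss when default is NO_DEFAULT
# and returns the default silently otherwise, so only the last candidate
# source is fatal. The helper below is a hypothetical standalone restatement.
from youtube_dl.utils import NO_DEFAULT

def search_candidates(candidates, search):
    # search(candidate, default) mimics _search_regex: on a miss it returns
    # default, unless default is NO_DEFAULT, in which case it raises.
    for num, candidate in enumerate(candidates, 1):
        result = search(candidate, NO_DEFAULT if num == len(candidates) else None)
        if result is not None:
            return result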
_TESTS = [{ 'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/', 'md5': '39d486f046212d8e1b911c52ab4691f8', 'info_dict': { - 'id': '95008', + 'id': 'qlDUmNsj6VS', 'display_id': 'Infamous-Tiffany-Teen-Strip-Tease-Video', 'ext': 'mp4', 'title': 'Infamous Tiffany Teen Strip Tease Video', @@ -28,34 +32,72 @@ class EpornerIE(InfoExtractor): # New (May 2016) URL layout 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0/Star-Wars-XXX-Parody/', 'only_matching': True, + }, { + 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0', + 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - display_id = mobj.group('display_id') + display_id = mobj.group('display_id') or video_id - webpage = self._download_webpage(url, display_id) - title = self._html_search_regex( - r'<title>(.*?) - EPORNER', webpage, 'title') + webpage, urlh = self._download_webpage_handle(url, display_id) - redirect_url = 'http://www.eporner.com/config5/%s' % video_id - player_code = self._download_webpage( - redirect_url, display_id, note='Downloading player config') + video_id = self._match_id(compat_str(urlh.geturl())) - sources = self._search_regex( - r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', player_code, 'sources') + hash = self._search_regex( + r'hash\s*:\s*["\']([\da-f]{32})', webpage, 'hash') + + title = self._og_search_title(webpage, default=None) or self._html_search_regex( + r'<title>(.+?) - EPORNER', webpage, 'title') + + # Reverse engineered from vjs.js + def calc_hash(s): + return ''.join((encode_base_n(int(s[lb:lb + 8], 16), 36) for lb in range(0, 32, 8))) + + video = self._download_json( + 'http://www.eporner.com/xhr/video/%s' % video_id, + display_id, note='Downloading video JSON', + query={ + 'hash': calc_hash(hash), + 'device': 'generic', + 'domain': 'www.eporner.com', + 'fallback': 'false', + }) + + if video.get('available') is False: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, video['message']), expected=True) + + sources = video['sources'] formats = [] - for video_url, format_id in re.findall(r'file\s*:\s*"([^"]+)",\s*label\s*:\s*"([^"]+)"', sources): - fmt = { - 'url': video_url, - 'format_id': format_id, - } - m = re.search(r'^(\d+)', format_id) - if m: - fmt['height'] = int(m.group(1)) - formats.append(fmt) + for kind, formats_dict in sources.items(): + if not isinstance(formats_dict, dict): + continue + for format_id, format_dict in formats_dict.items(): + if not isinstance(format_dict, dict): + continue + src = format_dict.get('src') + if not isinstance(src, compat_str) or not src.startswith('http'): + continue + if kind == 'hls': + formats.extend(self._extract_m3u8_formats( + src, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=kind, fatal=False)) + else: + height = int_or_none(self._search_regex( + r'(\d+)[pP]', format_id, 'height', default=None)) + fps = int_or_none(self._search_regex( + r'(\d+)fps', format_id, 'fps', default=None)) + + formats.append({ + 'url': src, + 'format_id': format_id, + 'height': height, + 'fps': fps, + }) self._sort_formats(formats) duration = parse_duration(self._html_search_meta('duration', webpage)) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 5fce9f47a..53fab1a31 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -20,7 +20,10 @@ from .adobetv import ( AdobeTVVideoIE, ) from .adultswim import AdultSwimIE -from .aenetworks import AENetworksIE +from .aenetworks import ( 
+ AENetworksIE, + HistoryTopicIE, +) from .afreecatv import AfreecaTVIE from .aftonbladet import AftonbladetIE from .airmozilla import AirMozillaIE @@ -41,10 +44,10 @@ from .appletrailers import ( AppleTrailersSectionIE, ) from .archiveorg import ArchiveOrgIE +from .arkena import ArkenaIE from .ard import ( ARDIE, ARDMediathekIE, - SportschauIE, ) from .arte import ( ArteTvIE, @@ -71,6 +74,8 @@ from .bandcamp import BandcampIE, BandcampAlbumIE from .bbc import ( BBCCoUkIE, BBCCoUkArticleIE, + BBCCoUkIPlayerPlaylistIE, + BBCCoUkPlaylistIE, BBCIE, ) from .beeg import BeegIE @@ -108,6 +113,10 @@ from .camwithher import CamWithHerIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .canvas import CanvasIE +from .carambatv import ( + CarambaTVIE, + CarambaTVPageIE, +) from .cbc import ( CBCIE, CBCPlayerIE, @@ -131,10 +140,11 @@ from .chirbit import ( ChirbitProfileIE, ) from .cinchcast import CinchcastIE -from .cliprs import ClipRsIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE +from .cliprs import ClipRsIE from .clipsyndicate import ClipsyndicateIE +from .closertotruth import CloserToTruthIE from .cloudy import CloudyIE from .clubic import ClubicIE from .clyp import ClypIE @@ -147,7 +157,11 @@ from .cnn import ( ) from .coub import CoubIE from .collegerama import CollegeRamaIE -from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE +from .comedycentral import ( + ComedyCentralIE, + ComedyCentralTVIE, + ToshIE, +) from .comcarcoff import ComCarCoffIE from .commonmistakes import CommonMistakesIE, UnicodeBOMIE from .commonprotocols import RtmpIE @@ -162,6 +176,8 @@ from .crunchyroll import ( ) from .cspan import CSpanIE from .ctsnews import CtsNewsIE +from .ctv import CTVIE +from .ctvnews import CTVNewsIE from .cultureunplugged import CultureUnpluggedIE from .cwtv import CWTVIE from .dailymail import DailyMailIE @@ -245,6 +261,7 @@ from .fivemin import FiveMinIE from .fivetv import FiveTVIE from .fktv import FKTVIE from .flickr import FlickrIE +from .flipagram import FlipagramIE from .folketinget import FolketingetIE from .footyroom import FootyRoomIE from .formula1 import Formula1IE @@ -270,6 +287,7 @@ from .freespeech import FreespeechIE from .freevideo import FreeVideoIE from .funimation import FunimationIE from .funnyordie import FunnyOrDieIE +from .fusion import FusionIE from .gameinformer import GameInformerIE from .gamekings import GamekingsIE from .gameone import ( @@ -279,7 +297,6 @@ from .gameone import ( from .gamersyde import GamersydeIE from .gamespot import GameSpotIE from .gamestar import GameStarIE -from .gametrailers import GametrailersIE from .gazeta import GazetaIE from .gdcvault import GDCVaultIE from .generic import GenericIE @@ -315,6 +332,10 @@ from .hotnewhiphop import HotNewHipHopIE from .hotstar import HotStarIE from .howcast import HowcastIE from .howstuffworks import HowStuffWorksIE +from .hrti import ( + HRTiIE, + HRTiPlaylistIE, +) from .huffpost import HuffPostIE from .hypem import HypemIE from .iconosquare import IconosquareIE @@ -353,6 +374,7 @@ from .jove import JoveIE from .jwplatform import JWPlatformIE from .jpopsukitv import JpopsukiIE from .kaltura import KalturaIE +from .kamcord import KamcordIE from .kanalplay import KanalPlayIE from .kankan import KankanIE from .karaoketv import KaraoketvIE @@ -376,6 +398,10 @@ from .kuwo import ( ) from .la7 import LA7IE from .laola1tv import Laola1TvIE +from .lcp import ( + LcpPlayIE, + LcpIE, +) from .learnr import LearnrIE from .lecture2go import Lecture2GoIE 
from .lemonde import LemondeIE @@ -417,6 +443,7 @@ from .makerschannel import MakersChannelIE from .makertv import MakerTVIE from .matchtv import MatchTVIE from .mdr import MDRIE +from .meta import METAIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mgoon import MgoonIE @@ -449,10 +476,10 @@ from .motherless import MotherlessIE from .motorsport import MotorsportIE from .movieclips import MovieClipsIE from .moviezine import MoviezineIE +from .msn import MSNIE from .mtv import ( MTVIE, MTVServicesEmbeddedIE, - MTVIggyIE, MTVDEIE, ) from .muenchentv import MuenchenTVIE @@ -475,7 +502,6 @@ from .nbc import ( NBCNewsIE, NBCSportsIE, NBCSportsVPlayerIE, - MSNBCIE, ) from .ndr import ( NDRIE, @@ -503,7 +529,6 @@ from .nextmedia import ( NextMediaActionNewsIE, AppleDailyIE, ) -from .nextmovie import NextMovieIE from .nfb import NFBIE from .nfl import NFLIE from .nhl import ( @@ -512,9 +537,15 @@ from .nhl import ( NHLVideocenterCategoryIE, NHLIE, ) -from .nick import NickIE +from .nick import ( + NickIE, + NickDeIE, +) from .niconico import NiconicoIE, NiconicoPlaylistIE +from .ninecninemedia import NineCNineMediaIE from .ninegag import NineGagIE +from .ninenow import NineNowIE +from .nintendo import NintendoIE from .noco import NocoIE from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE @@ -559,8 +590,13 @@ from .nytimes import ( NYTimesArticleIE, ) from .nuvid import NuvidIE +from .odatv import OdaTVIE from .odnoklassniki import OdnoklassnikiIE from .oktoberfesttv import OktoberfestTVIE +from .onet import ( + OnetIE, + OnetChannelIE, +) from .onionstudios import OnionStudiosIE from .ooyala import ( OoyalaIE, @@ -599,6 +635,7 @@ from .pluralsight import ( PluralsightCourseIE, ) from .podomatic import PodomaticIE +from .polskieradio import PolskieRadioIE from .porn91 import Porn91IE from .pornhd import PornHdIE from .pornhub import ( @@ -622,7 +659,10 @@ from .qqmusic import ( QQMusicToplistIE, QQMusicPlaylistIE, ) -from .r7 import R7IE +from .r7 import ( + R7IE, + R7ArticleIE, +) from .radiocanada import ( RadioCanadaIE, RadioCanadaAudioVideoIE, @@ -650,6 +690,7 @@ from .rice import RICEIE from .ringtv import RingTVIE from .ro220 import Ro220IE from .rockstargames import RockstarGamesIE +from .roosterteeth import RoosterTeethIE from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rtbf import RTBFIE @@ -658,8 +699,9 @@ from .rtlnl import RtlNlIE from .rtl2 import RTL2IE from .rtp import RTPIE from .rts import RTSIE -from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE +from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVETelevisionIE from .rtvnh import RTVNHIE +from .rudo import RudoIE from .ruhd import RUHDIE from .ruleporn import RulePornIE from .rutube import ( @@ -694,10 +736,12 @@ from .shahid import ShahidIE from .shared import SharedIE from .sharesix import ShareSixIE from .sina import SinaIE +from .sixplay import SixPlayIE from .skynewsarabia import ( SkyNewsArabiaIE, SkyNewsArabiaArticleIE, ) +from .skysports import SkySportsIE from .slideshare import SlideshareIE from .slutload import SlutloadIE from .smotri import ( @@ -738,6 +782,7 @@ from .sportbox import ( SportBoxEmbedIE, ) from .sportdeutschland import SportDeutschlandIE +from .sportschau import SportschauIE from .srgssr import ( SRGSSRIE, SRGSSRPlayIE, @@ -746,6 +791,7 @@ from .srmediathek import SRMediathekIE from .ssa import SSAIE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE +from
.streamable import StreamableIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE from .streetvoice import StreetVoiceIE @@ -878,6 +924,7 @@ from .udn import UDNEmbedIE from .digiteka import DigitekaIE from .unistra import UnistraIE from .urort import UrortIE +from .urplay import URPlayIE from .usatoday import USATodayIE from .ustream import UstreamIE, UstreamChannelIE from .ustudio import ( @@ -904,6 +951,7 @@ from .vice import ( ViceIE, ViceShowIE, ) +from .vidbit import VidbitIE from .viddler import ViddlerIE from .videodetective import VideoDetectiveIE from .videofyme import VideofyMeIE @@ -952,6 +1000,7 @@ from .viki import ( from .vk import ( VKIE, VKUserVideosIE, + VKWallPostIE, ) from .vlive import VLiveIE from .vodlocker import VodlockerIE @@ -1037,6 +1086,7 @@ from .youtube import ( YoutubeSearchDateIE, YoutubeSearchIE, YoutubeSearchURLIE, + YoutubeSharedVideoIE, YoutubeShowIE, YoutubeSubscriptionsIE, YoutubeTruncatedIDIE, diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index f5bbd39d2..0fb781a73 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -27,7 +27,7 @@ class FacebookIE(InfoExtractor): _VALID_URL = r'''(?x) (?: https?:// - (?:\w+\.)?facebook\.com/ + (?:[\w-]+\.)?facebook\.com/ (?:[^#]*?\#!/)? (?: (?: @@ -127,8 +127,26 @@ class FacebookIE(InfoExtractor): }, { 'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/', 'only_matching': True, + }, { + 'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/', + 'only_matching': True, }] + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage) + if mobj is not None: + return mobj.group('url') + + # Facebook API embed + # see https://developers.facebook.com/docs/plugins/embedded-video-player + mobj = re.search(r'''(?x)<div[^>]+ + class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+ + data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage) + if mobj is not None: + return mobj.group('url') + def _login(self): (useremail, password) = self._get_login_info() if useremail is None: @@ -204,12 +222,25 @@ class FacebookIE(InfoExtractor): BEFORE = '{swf.addParam(param[0], param[1]);});' AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});' - m = re.search(re.escape(BEFORE) + '(?:\n|\\\\n)(.*?)' + re.escape(AFTER), webpage) - if m: - swf_params = m.group(1).replace('\\\\', '\\').replace('\\"', '"') + PATTERN = re.escape(BEFORE) + '(?:\n|\\\\n)(.*?)' + re.escape(AFTER) + + for m in re.findall(PATTERN, webpage): + swf_params = m.replace('\\\\', '\\').replace('\\"', '"') data = dict(json.loads(swf_params)) params_raw = compat_urllib_parse_unquote(data['params']) - video_data = json.loads(params_raw)['video_data'] + video_data_candidate = json.loads(params_raw)['video_data'] + for _, f in video_data_candidate.items(): + if not f: + continue + if isinstance(f, dict): + f = [f] + if not isinstance(f, list): + continue + if f[0].get('video_id') == video_id: + video_data = video_data_candidate + break + if video_data: + break def video_data_list2dict(video_data): ret = {} @@ -239,6 +270,8 @@ class FacebookIE(InfoExtractor): formats = [] for format_id, f in video_data.items(): + if f and isinstance(f, dict): + f = [f] if not f or not isinstance(f, list): continue for quality in ('sd', 'hd'): diff --git a/youtube_dl/extractor/flipagram.py 
b/youtube_dl/extractor/flipagram.py new file mode 100644 index 000000000..acb6133ff --- /dev/null +++ b/youtube_dl/extractor/flipagram.py @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + float_or_none, + try_get, + unified_timestamp, +) + + +class FlipagramIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?flipagram\.com/f/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://flipagram.com/f/nyvTSJMKId', + 'md5': '888dcf08b7ea671381f00fab74692755', + 'info_dict': { + 'id': 'nyvTSJMKId', + 'ext': 'mp4', + 'title': 'Flipagram by sjuria101 featuring Midnight Memories by One Direction', + 'description': 'md5:d55e32edc55261cae96a41fa85ff630e', + 'duration': 35.571, + 'timestamp': 1461244995, + 'upload_date': '20160421', + 'uploader': 'kitty juria', + 'uploader_id': 'sjuria101', + 'creator': 'kitty juria', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'comments': list, + 'formats': 'mincount:2', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_data = self._parse_json( + self._search_regex( + r'window\.reactH2O\s*=\s*({.+});', webpage, 'video data'), + video_id) + + flipagram = video_data['flipagram'] + video = flipagram['video'] + + json_ld = self._search_json_ld(webpage, video_id, default=False) + title = json_ld.get('title') or flipagram['captionText'] + description = json_ld.get('description') or flipagram.get('captionText') + + formats = [{ + 'url': video['url'], + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), + 'filesize': int_or_none(video_data.get('size')), + }] + + preview_url = try_get( + flipagram, lambda x: x['music']['track']['previewUrl'], compat_str) + if preview_url: + formats.append({ + 'url': preview_url, + 'ext': 'm4a', + 'vcodec': 'none', + }) + + self._sort_formats(formats) + + counts = flipagram.get('counts', {}) + user = flipagram.get('user', {}) + video_data = flipagram.get('video', {}) + + thumbnails = [{ + 'url': self._proto_relative_url(cover['url']), + 'width': int_or_none(cover.get('width')), + 'height': int_or_none(cover.get('height')), + 'filesize': int_or_none(cover.get('size')), + } for cover in flipagram.get('covers', []) if cover.get('url')] + + # Note that this only retrieves comments that are initially loaded. + # For videos with large amounts of comments, most won't be retrieved.
+ comments = [] + for comment in video_data.get('comments', {}).get(video_id, {}).get('items', []): + text = comment.get('comment') + if not text or not isinstance(text, list): + continue + comments.append({ + 'author': comment.get('user', {}).get('name'), + 'author_id': comment.get('user', {}).get('username'), + 'id': comment.get('id'), + 'text': text[0], + 'timestamp': unified_timestamp(comment.get('created')), + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': float_or_none(flipagram.get('duration'), 1000), + 'thumbnails': thumbnails, + 'timestamp': unified_timestamp(flipagram.get('iso8601Created')), + 'uploader': user.get('name'), + 'uploader_id': user.get('username'), + 'creator': user.get('name'), + 'view_count': int_or_none(counts.get('plays')), + 'like_count': int_or_none(counts.get('likes')), + 'repost_count': int_or_none(counts.get('reflips')), + 'comment_count': int_or_none(counts.get('comments')), + 'comments': comments, + 'formats': formats, + } diff --git a/youtube_dl/extractor/foxsports.py b/youtube_dl/extractor/foxsports.py index df7665176..a3bb98377 100644 --- a/youtube_dl/extractor/foxsports.py +++ b/youtube_dl/extractor/foxsports.py @@ -1,7 +1,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import smuggle_url +from ..utils import ( + smuggle_url, + update_url_query, +) class FoxSportsIE(InfoExtractor): @@ -9,11 +12,15 @@ class FoxSportsIE(InfoExtractor): _TEST = { 'url': 'http://www.foxsports.com/video?vid=432609859715', + 'md5': 'b49050e955bebe32c301972e4012ac17', 'info_dict': { - 'id': 'gA0bHB3Ladz3', - 'ext': 'flv', + 'id': 'i0qKWsk3qJaM', + 'ext': 'mp4', 'title': 'Courtney Lee on going up 2-0 in series vs. Blazers', 'description': 'Courtney Lee talks about Memphis being focused.', + 'upload_date': '20150423', + 'timestamp': 1429761109, + 'uploader': 'NEWA-FNG-FOXSPORTS', }, 'add_ie': ['ThePlatform'], } @@ -28,5 +35,8 @@ class FoxSportsIE(InfoExtractor): r"data-player-config='([^']+)'", webpage, 'data player config'), video_id) - return self.url_result(smuggle_url( - config['releaseURL'] + '&manifest=f4m', {'force_smil_url': True})) + return self.url_result(smuggle_url(update_url_query( + config['releaseURL'], { + 'mbr': 'true', + 'switch': 'http', + }), {'force_smil_url': True})) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index ad94e31f3..7653975e3 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -14,7 +14,10 @@ from ..utils import ( parse_duration, determine_ext, ) -from .dailymotion import DailymotionCloudIE +from .dailymotion import ( + DailymotionIE, + DailymotionCloudIE, +) class FranceTVBaseInfoExtractor(InfoExtractor): @@ -188,6 +191,21 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'params': { 'skip_download': True, }, + }, { + # Dailymotion embed + 'url': 'http://www.francetvinfo.fr/politique/notre-dame-des-landes/video-sur-france-inter-cecile-duflot-denonce-le-regard-meprisant-de-patrick-cohen_1520091.html', + 'md5': 'ee7f1828f25a648addc90cb2687b1f12', + 'info_dict': { + 'id': 'x4iiko0', + 'ext': 'mp4', + 'title': 'NDDL, référendum, Brexit : Cécile Duflot répond à Patrick Cohen', + 'description': 'Au lendemain de la victoire du "oui" au référendum sur l\'aéroport de Notre-Dame-des-Landes, l\'ancienne ministre écologiste est l\'invitée de Patrick Cohen. 
Plus d\'info : https://www.franceinter.fr/emissions/le-7-9/le-7-9-27-juin-2016', + 'timestamp': 1467011958, + 'upload_date': '20160627', + 'uploader': 'France Inter', + 'uploader_id': 'x2q2ez', + }, + 'add_ie': ['Dailymotion'], }] def _real_extract(self, url): @@ -197,7 +215,13 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage) if dmcloud_url: - return self.url_result(dmcloud_url, 'DailymotionCloud') + return self.url_result(dmcloud_url, DailymotionCloudIE.ie_key()) + + dailymotion_urls = DailymotionIE._extract_urls(webpage) + if dailymotion_urls: + return self.playlist_result([ + self.url_result(dailymotion_url, DailymotionIE.ie_key()) + for dailymotion_url in dailymotion_urls]) video_id, catalogue = self._search_regex( (r'id-video=([^@]+@[^"]+)', diff --git a/youtube_dl/extractor/fusion.py b/youtube_dl/extractor/fusion.py new file mode 100644 index 000000000..b4ab4cbb7 --- /dev/null +++ b/youtube_dl/extractor/fusion.py @@ -0,0 +1,35 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from .ooyala import OoyalaIE + + +class FusionIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?fusion\.net/video/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://fusion.net/video/201781/u-s-and-panamanian-forces-work-together-to-stop-a-vessel-smuggling-drugs/', + 'info_dict': { + 'id': 'ZpcWNoMTE6x6uVIIWYpHh0qQDjxBuq5P', + 'ext': 'mp4', + 'title': 'U.S. and Panamanian forces work together to stop a vessel smuggling drugs', + 'description': 'md5:0cc84a9943c064c0f46b128b41b1b0d7', + 'duration': 140.0, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Ooyala'], + }, { + 'url': 'http://fusion.net/video/201781', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + ooyala_code = self._search_regex( + r'data-video-id=(["\'])(?P<code>.+?)\1', + webpage, 'ooyala code', group='code') + + return OoyalaIE._build_url_result(ooyala_code) diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 4ffdd7515..4e859e09a 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -1,19 +1,19 @@ from __future__ import unicode_literals import re -import json -from .common import InfoExtractor +from .once import OnceIE from ..compat import ( compat_urllib_parse_unquote, - compat_urlparse, ) from ..utils import ( unescapeHTML, + url_basename, + dict_get, ) -class GameSpotIE(InfoExtractor): +class GameSpotIE(OnceIE): _VALID_URL = r'https?://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?' 
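# Illustrative sketch (not part of the patch) of the quote-matching idiom used
# by FusionIE above: the opening quote is captured, and the backreference \1
# forces the closing quote to be the same character, so both "..." and '...'
# are accepted. The markup below is made up; the id is from the Fusion test.
import re

mobj = re.search(
    r'data-video-id=(["\'])(?P<code>.+?)\1',
    '<div data-video-id="ZpcWNoMTE6x6uVIIWYpHh0qQDjxBuq5P"></div>')
assert mobj.group('code') == 'ZpcWNoMTE6x6uVIIWYpHh0qQDjxBuq5P'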
_TESTS = [{ 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/', @@ -28,10 +28,13 @@ class GameSpotIE(InfoExtractor): 'url': 'http://www.gamespot.com/videos/the-witcher-3-wild-hunt-xbox-one-now-playing/2300-6424837/', 'info_dict': { 'id': 'gs-2300-6424837', - 'ext': 'flv', - 'title': 'The Witcher 3: Wild Hunt [Xbox ONE] - Now Playing', + 'ext': 'mp4', + 'title': 'Now Playing - The Witcher 3: Wild Hunt', 'description': 'Join us as we take a look at the early hours of The Witcher 3: Wild Hunt and more.', }, + 'params': { + 'skip_download': True, # m3u8 downloads + }, }] def _real_extract(self, url): @@ -39,29 +42,73 @@ class GameSpotIE(InfoExtractor): webpage = self._download_webpage(url, page_id) data_video_json = self._search_regex( r'data-video=["\'](.*?)["\']', webpage, 'data video') - data_video = json.loads(unescapeHTML(data_video_json)) + data_video = self._parse_json(unescapeHTML(data_video_json), page_id) streams = data_video['videoStreams'] + manifest_url = None formats = [] f4m_url = streams.get('f4m_stream') - if f4m_url is not None: - # Transform the manifest url to a link to the mp4 files - # they are used in mobile devices. - f4m_path = compat_urlparse.urlparse(f4m_url).path - QUALITIES_RE = r'((,\d+)+,?)' - qualities = self._search_regex(QUALITIES_RE, f4m_path, 'qualities').strip(',').split(',') - http_path = f4m_path[1:].split('/', 1)[1] - http_template = re.sub(QUALITIES_RE, r'%s', http_path) - http_template = http_template.replace('.csmil/manifest.f4m', '') - http_template = compat_urlparse.urljoin( - 'http://video.gamespotcdn.com/', http_template) - for q in qualities: - formats.append({ - 'url': http_template % q, - 'ext': 'mp4', - 'format_id': q, - }) - else: + if f4m_url: + manifest_url = f4m_url + formats.extend(self._extract_f4m_formats( + f4m_url + '?hdcore=3.7.0', page_id, f4m_id='hds', fatal=False)) + m3u8_url = streams.get('m3u8_stream') + if m3u8_url: + manifest_url = m3u8_url + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, page_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) + formats.extend(m3u8_formats) + progressive_url = dict_get( + streams, ('progressive_hd', 'progressive_high', 'progressive_low')) + if progressive_url and manifest_url: + qualities_basename = self._search_regex( + '/([^/]+)\.csmil/', + manifest_url, 'qualities basename', default=None) + if qualities_basename: + QUALITIES_RE = r'((,\d+)+,?)' + qualities = self._search_regex( + QUALITIES_RE, qualities_basename, + 'qualities', default=None) + if qualities: + qualities = list(map(lambda q: int(q), qualities.strip(',').split(','))) + qualities.sort() + http_template = re.sub(QUALITIES_RE, r'%d', qualities_basename) + http_url_basename = url_basename(progressive_url) + if m3u8_formats: + self._sort_formats(m3u8_formats) + m3u8_formats = list(filter( + lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + m3u8_formats)) + if len(qualities) == len(m3u8_formats): + for q, m3u8_format in zip(qualities, m3u8_formats): + f = m3u8_format.copy() + f.update({ + 'url': progressive_url.replace( + http_url_basename, http_template % q), + 'format_id': f['format_id'].replace('hls', 'http'), + 'protocol': 'http', + }) + formats.append(f) + else: + for q in qualities: + formats.append({ + 'url': progressive_url.replace( + http_url_basename, http_template % q), + 'ext': 'mp4', + 'format_id': 'http-%d' % q, + 'tbr': q, + }) + + onceux_json = self._search_regex( + r'data-onceux-options=["\'](.*?)["\']', webpage, 'data video', default=None) + 
if onceux_json: + onceux_url = self._parse_json(unescapeHTML(onceux_json), page_id).get('metadataUri') + if onceux_url: + formats.extend(self._extract_once_formats(re.sub( + r'https?://[^/]+', 'http://once.unicornmedia.com', onceux_url).replace('ads/vmap/', ''))) + + if not formats: for quality in ['sd', 'hd']: # It's actually a link to a flv file flv_url = streams.get('f4m_{0}'.format(quality)) @@ -71,6 +118,7 @@ class GameSpotIE(InfoExtractor): 'ext': 'flv', 'format_id': quality, }) + self._sort_formats(formats) return { 'id': data_video['guid'], diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py deleted file mode 100644 index 1e7948ab8..000000000 --- a/youtube_dl/extractor/gametrailers.py +++ /dev/null @@ -1,62 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_age_limit, - url_basename, -) - - -class GametrailersIE(InfoExtractor): - _VALID_URL = r'https?://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)' - - _TEST = { - 'url': 'http://www.gametrailers.com/videos/view/gametrailers-com/116437-Just-Cause-3-Review', - 'md5': 'f28c4efa0bdfaf9b760f6507955b6a6a', - 'info_dict': { - 'id': '2983958', - 'ext': 'mp4', - 'display_id': '116437-Just-Cause-3-Review', - 'title': 'Just Cause 3 - Review', - 'description': 'It\'s a lot of fun to shoot at things and then watch them explode in Just Cause 3, but should there be more to the experience than that?', - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - title = self._html_search_regex( - r'<title>(.+?)\|', webpage, 'title').strip() - embed_url = self._proto_relative_url( - self._search_regex( - r'src=\'(//embed.gametrailers.com/embed/[^\']+)\'', webpage, - 'embed url'), - scheme='http:') - video_id = url_basename(embed_url) - embed_page = self._download_webpage(embed_url, video_id) - embed_vars_json = self._search_regex( - r'(?s)var embedVars = (\{.*?\})\s*</script>', embed_page, - 'embed vars') - info = self._parse_json(embed_vars_json, video_id) - - formats = [] - for media in info['media']: - if media['mediaPurpose'] == 'play': - formats.append({ - 'url': media['uri'], - 'height': media['height'], - 'width:': media['width'], - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'thumbnail': info.get('thumbUri'), - 'description': self._og_search_description(webpage), - 'duration': int_or_none(info.get('videoLengthInSeconds')), - 'age_limit': parse_age_limit(info.get('audienceRating')), - } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 4aa24061c..5364f0b19 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -49,7 +49,10 @@ from .pornhub import PornHubIE from .xhamster import XHamsterEmbedIE from .tnaflix import TNAFlixNetworkEmbedIE from .vimeo import VimeoIE -from .dailymotion import DailymotionCloudIE +from .dailymotion import ( + DailymotionIE, + DailymotionCloudIE, +) from .onionstudios import OnionStudiosIE from .viewlift import ViewLiftEmbedIE from .screenwavemedia import ScreenwaveMediaIE @@ -59,11 +62,16 @@ from .videomore import VideomoreIE from .googledrive import GoogleDriveIE from .jwplatform import JWPlatformIE from .digiteka import DigitekaIE +from .arkena import ArkenaIE from .instagram import InstagramIE from .liveleak import LiveLeakIE from .threeqsdn import ThreeQSDNIE from 
.theplatform import ThePlatformIE from .vessel import VesselIE +from .kaltura import KalturaIE +from .eagleplatform import EaglePlatformIE +from .facebook import FacebookIE +from .soundcloud import SoundcloudIE class GenericIE(InfoExtractor): @@ -467,7 +475,7 @@ class GenericIE(InfoExtractor): 'url': 'http://www.vestifinance.ru/articles/25753', 'info_dict': { 'id': '25753', - 'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"', + 'title': 'Прямые трансляции с Форума-выставки "Госзаказ-2013"', }, 'playlist': [{ 'info_dict': { @@ -634,6 +642,8 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Key and Peele|October 10, 2012|2|203|Liam Neesons - Uncensored', 'description': 'Two valets share their love for movie star Liam Neesons.', + 'timestamp': 1349922600, + 'upload_date': '20121011', }, }, # YouTube embed via <data-embed-url=""> @@ -775,6 +785,15 @@ class GenericIE(InfoExtractor): 'upload_date': '20141029', } }, + # Soundcloud multiple embeds + { + 'url': 'http://www.guitarplayer.com/lessons/1014/legato-workout-one-hour-to-more-fluid-performance---tab/52809', + 'info_dict': { + 'id': '52809', + 'title': 'Guitar Essentials: Legato Workout—One-Hour to Fluid Performance | TAB + AUDIO', + }, + 'playlist_mincount': 7, + }, # Livestream embed { 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast', @@ -850,6 +869,7 @@ class GenericIE(InfoExtractor): 'description': 'md5:601cb790edd05908957dae8aaa866465', 'upload_date': '20150220', }, + 'skip': 'All The Daily Show URLs now redirect to http://www.cc.com/shows/', }, # jwplayer YouTube { @@ -920,6 +940,24 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Kaltura'], }, + { + # Kaltura embedded via quoted entry_id + 'url': 'https://www.oreilly.com/ideas/my-cloud-makes-pretty-pictures', + 'info_dict': { + 'id': '0_utuok90b', + 'ext': 'mp4', + 'title': '06_matthew_brender_raj_dutt', + 'timestamp': 1466638791, + 'upload_date': '20160622', + }, + 'add_ie': ['Kaltura'], + 'expected_warnings': [ + 'Could not send HEAD request' + ], + 'params': { + 'skip_download': True, + } + }, # Eagle.Platform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', @@ -1091,12 +1129,17 @@ class GenericIE(InfoExtractor): # Dailymotion Cloud video { 'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910', - 'md5': '49444254273501a64675a7e68c502681', + 'md5': 'dcaf23ad0c67a256f4278bce6e0bae38', 'info_dict': { - 'id': '5585de919473990de4bee11b', + 'id': 'x2uy8t3', 'ext': 'mp4', - 'title': 'Le débat', + 'title': 'Sauvons les abeilles ! 
- Le débat', + 'description': 'md5:d9082128b1c5277987825d684939ca26', 'thumbnail': 're:^https?://.*\.jpe?g$', + 'timestamp': 1434970506, + 'upload_date': '20150622', + 'uploader': 'Public Sénat', + 'uploader_id': 'xa9gza', } }, # OnionStudios embed @@ -1220,6 +1263,133 @@ class GenericIE(InfoExtractor): 'uploader': 'www.hudl.com', }, }, + # twitter:player:stream embed + { + 'url': 'http://www.rtl.be/info/video/589263.aspx?CategoryID=288', + 'info_dict': { + 'id': 'master', + 'ext': 'mp4', + 'title': 'Une nouvelle espèce de dinosaure découverte en Argentine', + 'uploader': 'www.rtl.be', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + }, + # twitter:player embed + { + 'url': 'http://www.theatlantic.com/video/index/484130/what-do-black-holes-sound-like/', + 'md5': 'a3e0df96369831de324f0778e126653c', + 'info_dict': { + 'id': '4909620399001', + 'ext': 'mp4', + 'title': 'What Do Black Holes Sound Like?', + 'description': 'what do black holes sound like', + 'upload_date': '20160524', + 'uploader_id': '29913724001', + 'timestamp': 1464107587, + 'uploader': 'TheAtlantic', + }, + 'add_ie': ['BrightcoveLegacy'], + }, + # Facebook <iframe> embed + { + 'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html', + 'md5': 'fbcde74f534176ecb015849146dd3aee', + 'info_dict': { + 'id': '599637780109885', + 'ext': 'mp4', + 'title': 'Facebook video #599637780109885', + }, + }, + # Facebook API embed + { + 'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/', + 'md5': 'a47372ee61b39a7b90287094d447d94e', + 'info_dict': { + 'id': '10153467542406923', + 'ext': 'mp4', + 'title': 'Facebook video #10153467542406923', + }, + }, + # Wordpress "YouTube Video Importer" plugin + { + 'url': 'http://www.lothype.com/blue-devils-drumline-stanford-lot-2016/', + 'md5': 'd16797741b560b485194eddda8121b48', + 'info_dict': { + 'id': 'HNTXWDXV9Is', + 'ext': 'mp4', + 'title': 'Blue Devils Drumline Stanford lot 2016', + 'upload_date': '20160627', + 'uploader_id': 'GENOCIDE8GENERAL10', + 'uploader': 'cylus cyrus', + }, + }, + { + # video stored on custom kaltura server + 'url': 'http://www.expansion.com/multimedia/videos.html?media=EQcM30NHIPv', + 'md5': '537617d06e64dfed891fa1593c4b30cc', + 'info_dict': { + 'id': '0_1iotm5bh', + 'ext': 'mp4', + 'title': 'Elecciones británicas: 5 lecciones para Rajoy', + 'description': 'md5:435a89d68b9760b92ce67ed227055f16', + 'uploader_id': 'videos.expansion@el-mundo.net', + 'upload_date': '20150429', + 'timestamp': 1430303472, + }, + 'add_ie': ['Kaltura'], + }, + { + # Non-standard Vimeo embed + 'url': 'https://openclassrooms.com/courses/understanding-the-web', + 'md5': '64d86f1c7d369afd9a78b38cbb88d80a', + 'info_dict': { + 'id': '148867247', + 'ext': 'mp4', + 'title': 'Understanding the web - Teaser', + 'description': 'This is "Understanding the web - Teaser" by openclassrooms on Vimeo, the home for high quality videos and the people who love them.', + 'upload_date': '20151214', + 'uploader': 'OpenClassrooms', + 'uploader_id': 'openclassrooms', + }, + 'add_ie': ['Vimeo'], + }, + { + 'url': 'https://support.arkena.com/display/PLAY/Ways+to+embed+your+video', + 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', + 'info_dict': { + 'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', + 'ext': 'mp4', + 'title': 'Big Buck Bunny', + 'description': 'Royalty free test video', + 'timestamp': 1432816365, + 'upload_date': '20150528', + 'is_live': False, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [ArkenaIE.ie_key()], + }, + # { + # # TODO: 
find another test + # # http://schema.org/VideoObject + # 'url': 'https://flipagram.com/f/nyvTSJMKId', + # 'md5': '888dcf08b7ea671381f00fab74692755', + # 'info_dict': { + # 'id': 'nyvTSJMKId', + # 'ext': 'mp4', + # 'title': 'Flipagram by sjuria101 featuring Midnight Memories by One Direction', + # 'description': '#love for cats.', + # 'timestamp': 1461244995, + # 'upload_date': '20160421', + # }, + # 'params': { + # 'force_generic_extractor': True, + # }, + # } ] def report_following_redirect(self, new_url): @@ -1576,12 +1746,16 @@ class GenericIE(InfoExtractor): if matches: return _playlist_from_matches(matches, lambda m: unescapeHTML(m)) - # Look for embedded Dailymotion player - matches = re.findall( - r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage) + # Look for Wordpress "YouTube Video Importer" plugin + matches = re.findall(r'''(?x)<div[^>]+ + class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ + data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) if matches: - return _playlist_from_matches( - matches, lambda m: unescapeHTML(m[1])) + return _playlist_from_matches(matches, lambda m: m[-1]) + + matches = DailymotionIE._extract_urls(webpage) + if matches: + return _playlist_from_matches(matches) # Look for embedded Dailymotion playlist player (#3822) m = re.search( @@ -1718,10 +1892,9 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url')) # Look for embedded Facebook player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Facebook') + facebook_url = FacebookIE._extract_url(webpage) + if facebook_url is not None: + return self.url_result(facebook_url, 'Facebook') # Look for embedded VK player mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage) @@ -1836,12 +2009,9 @@ class GenericIE(InfoExtractor): return self.url_result(myvi_url) # Look for embedded soundcloud player - mobj = re.search( - r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"', - webpage) - if mobj is not None: - url = unescapeHTML(mobj.group('url')) - return self.url_result(url) + soundcloud_urls = SoundcloudIE._extract_urls(webpage) + if soundcloud_urls: + return _playlist_from_matches(soundcloud_urls, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) # Look for embedded mtvservices player mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) @@ -1903,18 +2073,14 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url'), 'Zapiks') # Look for Kaltura embeds - mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?(?P<q1>['\"])wid(?P=q1)\s*:\s*(?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*?(?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s*(?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4),", webpage) or - re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage)) - if mobj is not None: - return self.url_result(smuggle_url( - 'kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), - {'source_url': url}), 'Kaltura') + kaltura_url = KalturaIE._extract_url(webpage) + if kaltura_url: + return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key()) # Look for 
Eagle.Platform embeds - mobj = re.search( - r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'EaglePlatform') + eagleplatform_url = EaglePlatformIE._extract_url(webpage) + if eagleplatform_url: + return self.url_result(eagleplatform_url, EaglePlatformIE.ie_key()) # Look for ClipYou (uses Eagle.Platform) embeds mobj = re.search( @@ -2008,6 +2174,11 @@ class GenericIE(InfoExtractor): if digiteka_url: return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key()) + # Look for Arkena embeds + arkena_url = ArkenaIE._extract_url(webpage) + if arkena_url: + return self.url_result(arkena_url, ArkenaIE.ie_key()) + # Look for Limelight embeds mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage) if mobj: @@ -2060,6 +2231,19 @@ class GenericIE(InfoExtractor): 'uploader': video_uploader, } + # Looking for http://schema.org/VideoObject + json_ld = self._search_json_ld( + webpage, video_id, default=None, expected_type='VideoObject') + if json_ld and json_ld.get('url'): + info_dict.update({ + 'title': video_title or info_dict['title'], + 'description': video_description, + 'thumbnail': video_thumbnail, + 'age_limit': age_limit + }) + info_dict.update(json_ld) + return info_dict + def check_video(vurl): if YoutubeIE.suitable(vurl): return True @@ -2103,6 +2287,9 @@ class GenericIE(InfoExtractor): r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage) if not found: # Try to find twitter cards info + # twitter:player:stream should be checked before twitter:player since + # it is expected to contain a raw stream (see + # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) found = filter_video(re.findall( r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)) if not found: @@ -2136,6 +2323,15 @@ class GenericIE(InfoExtractor): '_type': 'url', 'url': new_url, } + + if not found: + # twitter:player is a https URL to iframe player that may or may not + # be supported by youtube-dl thus this is checked the very last (see + # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) + embed_url = self._html_search_meta('twitter:player', webpage, default=None) + if embed_url: + return self.url_result(embed_url) + if not found: raise UnsupportedError(url) diff --git a/youtube_dl/extractor/hrti.py b/youtube_dl/extractor/hrti.py new file mode 100644 index 000000000..656ce6d05 --- /dev/null +++ b/youtube_dl/extractor/hrti.py @@ -0,0 +1,202 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + parse_age_limit, + sanitized_Request, + try_get, +) + + +class HRTiBaseIE(InfoExtractor): + """ + Base Information Extractor for Croatian Radiotelevision + video on demand site https://hrti.hrt.hr + Reverse engineered from the JavaScript app in app.min.js + """ + _NETRC_MACHINE = 'hrti' + + _APP_LANGUAGE = 'hr' + _APP_VERSION = '1.1' + _APP_PUBLICATION_ID = 'all_in_one' + _API_URL = 'http://clientapi.hrt.hr/client_api.php/config/identify/format/json' + + def _initialize_api(self): + init_data = { + 'application_publication_id': self._APP_PUBLICATION_ID + } + + uuid = self._download_json( + self._API_URL, None, note='Downloading uuid', + errnote='Unable to download uuid', + 
data=json.dumps(init_data).encode('utf-8'))['uuid'] + + app_data = { + 'uuid': uuid, + 'application_publication_id': self._APP_PUBLICATION_ID, + 'application_version': self._APP_VERSION + } + + req = sanitized_Request(self._API_URL, data=json.dumps(app_data).encode('utf-8')) + req.get_method = lambda: 'PUT' + + resources = self._download_json( + req, None, note='Downloading session information', + errnote='Unable to download session information') + + self._session_id = resources['session_id'] + + modules = resources['modules'] + + self._search_url = modules['vod_catalog']['resources']['search']['uri'].format( + language=self._APP_LANGUAGE, + application_id=self._APP_PUBLICATION_ID) + + self._login_url = (modules['user']['resources']['login']['uri'] + + '/format/json').format(session_id=self._session_id) + + self._logout_url = modules['user']['resources']['logout']['uri'] + + def _login(self): + (username, password) = self._get_login_info() + # TODO: figure out authentication with cookies + if username is None or password is None: + self.raise_login_required() + + auth_data = { + 'username': username, + 'password': password, + } + + try: + auth_info = self._download_json( + self._login_url, None, note='Logging in', errnote='Unable to log in', + data=json.dumps(auth_data).encode('utf-8')) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 406: + auth_info = self._parse_json(e.cause.read().encode('utf-8'), None) + else: + raise + + error_message = auth_info.get('error', {}).get('message') + if error_message: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error_message), + expected=True) + + self._token = auth_info['secure_streaming_token'] + + def _real_initialize(self): + self._initialize_api() + self._login() + + +class HRTiIE(HRTiBaseIE): + _VALID_URL = r'''(?x) + (?: + hrti:(?P<short_id>[0-9]+)| + https?:// + hrti\.hrt\.hr/\#/video/show/(?P<id>[0-9]+)/(?P<display_id>[^/]+)? 
+ ) + ''' + _TESTS = [{ + 'url': 'https://hrti.hrt.hr/#/video/show/2181385/republika-dokumentarna-serija-16-hd', + 'info_dict': { + 'id': '2181385', + 'display_id': 'republika-dokumentarna-serija-16-hd', + 'ext': 'mp4', + 'title': 'REPUBLIKA, dokumentarna serija (1/6) (HD)', + 'description': 'md5:48af85f620e8e0e1df4096270568544f', + 'duration': 2922, + 'view_count': int, + 'average_rating': int, + 'episode_number': int, + 'season_number': int, + 'age_limit': 12, + }, + 'skip': 'Requires account credentials', + }, { + 'url': 'https://hrti.hrt.hr/#/video/show/2181385/', + 'only_matching': True, + }, { + 'url': 'hrti:2181385', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('short_id') or mobj.group('id') + display_id = mobj.group('display_id') or video_id + + video = self._download_json( + '%s/video_id/%s/format/json' % (self._search_url, video_id), + display_id, 'Downloading video metadata JSON')['video'][0] + + title_info = video['title'] + title = title_info['title_long'] + + movie = video['video_assets']['movie'][0] + m3u8_url = movie['url'].format(TOKEN=self._token) + formats = self._extract_m3u8_formats( + m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + + description = clean_html(title_info.get('summary_long')) + age_limit = parse_age_limit(video.get('parental_control', {}).get('rating')) + view_count = int_or_none(video.get('views')) + average_rating = int_or_none(video.get('user_rating')) + duration = int_or_none(movie.get('duration')) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'average_rating': average_rating, + 'age_limit': age_limit, + 'formats': formats, + } + + +class HRTiPlaylistIE(HRTiBaseIE): + _VALID_URL = r'https?://hrti.hrt.hr/#/video/list/category/(?P<id>[0-9]+)/(?P<display_id>[^/]+)?' 
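+    # Category pages are resolved through the same JSON search API as single
+    # videos; each listed video is re-dispatched via the hrti:<id> shorthand
+    # matched by HRTiIE above.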
+ _TESTS = [{ + 'url': 'https://hrti.hrt.hr/#/video/list/category/212/ekumena', + 'info_dict': { + 'id': '212', + 'title': 'ekumena', + }, + 'playlist_mincount': 8, + 'skip': 'Requires account credentials', + }, { + 'url': 'https://hrti.hrt.hr/#/video/list/category/212/', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + category_id = mobj.group('id') + display_id = mobj.group('display_id') or category_id + + response = self._download_json( + '%s/category_id/%s/format/json' % (self._search_url, category_id), + display_id, 'Downloading video metadata JSON') + + video_ids = try_get( + response, lambda x: x['video_listings'][0]['alternatives'][0]['list'], + list) or [video['id'] for video in response.get('videos', []) if video.get('id')] + + entries = [self.url_result('hrti:%s' % video_id) for video_id in video_ids] + + return self.playlist_result(entries, category_id, display_id) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index fc0197ae1..8f7f232be 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -36,7 +36,6 @@ class InstagramIE(InfoExtractor): 'info_dict': { 'id': 'BA-pQFBG8HZ', 'ext': 'mp4', - 'uploader_id': 'britneyspears', 'title': 'Video by britneyspears', 'thumbnail': 're:^https?://.*\.jpg', 'timestamp': 1453760977, diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index ddcb3c916..01c7b3042 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -3,28 +3,22 @@ from __future__ import unicode_literals import hashlib import itertools -import math -import os -import random import re import time -import uuid from .common import InfoExtractor from ..compat import ( - compat_parse_qs, compat_str, compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, ) from ..utils import ( + clean_html, decode_packed_codes, + get_element_by_id, + get_element_by_attribute, ExtractorError, ohdave_rsa_encrypt, remove_start, - sanitized_Request, - urlencode_postdata, - url_basename, ) @@ -171,70 +165,21 @@ class IqiyiIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.iqiyi.com/v_19rrojlavg.html', - 'md5': '2cb594dc2781e6c941a110d8f358118b', + # MD5 checksum differs on my machine and Travis CI 'info_dict': { 'id': '9c1fb1b99d192b21c559e5a1a2cb3c73', + 'ext': 'mp4', 'title': '美国德州空中惊现奇异云团 酷似UFO', - 'ext': 'f4v', } }, { 'url': 'http://www.iqiyi.com/v_19rrhnnclk.html', + 'md5': '667171934041350c5de3f5015f7f1152', 'info_dict': { 'id': 'e3f585b550a280af23c98b6cb2be19fb', - 'title': '名侦探柯南第752集', - }, - 'playlist': [{ - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part1', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part2', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part3', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part4', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part5', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part6', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part7', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part8', - 'ext': 'f4v', - 
'title': '名侦探柯南第752集', - }, - }], - 'params': { - 'skip_download': True, + 'ext': 'mp4', + 'title': '名侦探柯南 国语版:第752集 迫近灰原秘密的黑影 下篇', }, + 'skip': 'Geo-restricted to China', }, { 'url': 'http://www.iqiyi.com/w_19rt6o8t9p.html', 'only_matching': True, @@ -250,22 +195,10 @@ class IqiyiIE(InfoExtractor): 'url': 'http://www.iqiyi.com/v_19rrny4w8w.html', 'info_dict': { 'id': 'f3cf468b39dddb30d676f89a91200dc1', + 'ext': 'mp4', 'title': '泰坦尼克号', }, - 'playlist': [{ - 'info_dict': { - 'id': 'f3cf468b39dddb30d676f89a91200dc1_part1', - 'ext': 'f4v', - 'title': '泰坦尼克号', - }, - }, { - 'info_dict': { - 'id': 'f3cf468b39dddb30d676f89a91200dc1_part2', - 'ext': 'f4v', - 'title': '泰坦尼克号', - }, - }], - 'expected_warnings': ['Needs a VIP account for full video'], + 'skip': 'Geo-restricted to China', }, { 'url': 'http://www.iqiyi.com/a_19rrhb8ce1.html', 'info_dict': { @@ -278,20 +211,15 @@ class IqiyiIE(InfoExtractor): 'only_matching': True, }] - _FORMATS_MAP = [ - ('1', 'h6'), - ('2', 'h5'), - ('3', 'h4'), - ('4', 'h3'), - ('5', 'h2'), - ('10', 'h1'), - ] - - AUTH_API_ERRORS = { - # No preview available (不允许试看鉴权失败) - 'Q00505': 'This video requires a VIP account', - # End of preview time (试看结束鉴权失败) - 'Q00506': 'Needs a VIP account for full video', + _FORMATS_MAP = { + '96': 1, # 216p, 240p + '1': 2, # 336p, 360p + '2': 3, # 480p, 504p + '21': 4, # 504p + '4': 5, # 720p + '17': 5, # 720p + '5': 6, # 1072p, 1080p + '18': 7, # 1080p } def _real_initialize(self): @@ -352,177 +280,23 @@ class IqiyiIE(InfoExtractor): return True - def _authenticate_vip_video(self, api_video_url, video_id, tvid, _uuid, do_report_warning): - auth_params = { - # version and platform hard-coded in com/qiyi/player/core/model/remote/AuthenticationRemote.as - 'version': '2.0', - 'platform': 'b6c13e26323c537d', - 'aid': tvid, + def get_raw_data(self, tvid, video_id): + tm = int(time.time() * 1000) + + key = 'd5fb4bd9d50c4be6948c97edd7254b0e' + sc = md5_text(compat_str(tm) + key + tvid) + params = { 'tvid': tvid, - 'uid': '', - 'deviceId': _uuid, - 'playType': 'main', # XXX: always main? 
- 'filename': os.path.splitext(url_basename(api_video_url))[0], - } - - qd_items = compat_parse_qs(compat_urllib_parse_urlparse(api_video_url).query) - for key, val in qd_items.items(): - auth_params[key] = val[0] - - auth_req = sanitized_Request( - 'http://api.vip.iqiyi.com/services/ckn.action', - urlencode_postdata(auth_params)) - # iQiyi server throws HTTP 405 error without the following header - auth_req.add_header('Content-Type', 'application/x-www-form-urlencoded') - auth_result = self._download_json( - auth_req, video_id, - note='Downloading video authentication JSON', - errnote='Unable to download video authentication JSON') - - code = auth_result.get('code') - msg = self.AUTH_API_ERRORS.get(code) or auth_result.get('msg') or code - if code == 'Q00506': - if do_report_warning: - self.report_warning(msg) - return False - if 'data' not in auth_result: - if msg is not None: - raise ExtractorError('%s said: %s' % (self.IE_NAME, msg), expected=True) - raise ExtractorError('Unexpected error from Iqiyi auth API') - - return auth_result['data'] - - def construct_video_urls(self, data, video_id, _uuid, tvid): - def do_xor(x, y): - a = y % 3 - if a == 1: - return x ^ 121 - if a == 2: - return x ^ 72 - return x ^ 103 - - def get_encode_code(l): - a = 0 - b = l.split('-') - c = len(b) - s = '' - for i in range(c - 1, -1, -1): - a = do_xor(int(b[c - i - 1], 16), i) - s += chr(a) - return s[::-1] - - def get_path_key(x, format_id, segment_index): - mg = ')(*&^flash@#$%a' - tm = self._download_json( - 'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id, - note='Download path key of segment %d for format %s' % (segment_index + 1, format_id) - )['t'] - t = str(int(math.floor(int(tm) / (600.0)))) - return md5_text(t + mg + x) - - video_urls_dict = {} - need_vip_warning_report = True - for format_item in data['vp']['tkl'][0]['vs']: - if 0 < int(format_item['bid']) <= 10: - format_id = self.get_format(format_item['bid']) - else: - continue - - video_urls = [] - - video_urls_info = format_item['fs'] - if not format_item['fs'][0]['l'].startswith('/'): - t = get_encode_code(format_item['fs'][0]['l']) - if t.endswith('mp4'): - video_urls_info = format_item['flvs'] - - for segment_index, segment in enumerate(video_urls_info): - vl = segment['l'] - if not vl.startswith('/'): - vl = get_encode_code(vl) - is_vip_video = '/vip/' in vl - filesize = segment['b'] - base_url = data['vp']['du'].split('/') - if not is_vip_video: - key = get_path_key( - vl.split('/')[-1].split('.')[0], format_id, segment_index) - base_url.insert(-1, key) - base_url = '/'.join(base_url) - param = { - 'su': _uuid, - 'qyid': uuid.uuid4().hex, - 'client': '', - 'z': '', - 'bt': '', - 'ct': '', - 'tn': str(int(time.time())) - } - api_video_url = base_url + vl - if is_vip_video: - api_video_url = api_video_url.replace('.f4v', '.hml') - auth_result = self._authenticate_vip_video( - api_video_url, video_id, tvid, _uuid, need_vip_warning_report) - if auth_result is False: - need_vip_warning_report = False - break - param.update({ - 't': auth_result['t'], - # cid is hard-coded in com/qiyi/player/core/player/RuntimeData.as - 'cid': 'afbe8fd3d73448c9', - 'vid': video_id, - 'QY00001': auth_result['u'], - }) - api_video_url += '?' if '?' 
not in api_video_url else '&' - api_video_url += compat_urllib_parse_urlencode(param) - js = self._download_json( - api_video_url, video_id, - note='Download video info of segment %d for format %s' % (segment_index + 1, format_id)) - video_url = js['l'] - video_urls.append( - (video_url, filesize)) - - video_urls_dict[format_id] = video_urls - return video_urls_dict - - def get_format(self, bid): - matched_format_ids = [_format_id for _bid, _format_id in self._FORMATS_MAP if _bid == str(bid)] - return matched_format_ids[0] if len(matched_format_ids) else None - - def get_bid(self, format_id): - matched_bids = [_bid for _bid, _format_id in self._FORMATS_MAP if _format_id == format_id] - return matched_bids[0] if len(matched_bids) else None - - def get_raw_data(self, tvid, video_id, enc_key, _uuid): - tm = str(int(time.time())) - tail = tm + tvid - param = { - 'key': 'fvip', - 'src': md5_text('youtube-dl'), - 'tvId': tvid, 'vid': video_id, - 'vinfo': 1, - 'tm': tm, - 'enc': md5_text(enc_key + tail), - 'qyid': _uuid, - 'tn': random.random(), - # In iQiyi's flash player, um is set to 1 if there's a logged user - # Some 1080P formats are only available with a logged user. - # Here force um=1 to trick the iQiyi server - 'um': 1, - 'authkey': md5_text(md5_text('') + tail), - 'k_tag': 1, + 'src': '76f90cbd92f94a2e925d83e8ccd22cb7', + 'sc': sc, + 't': tm, } - api_url = 'http://cache.video.qiyi.com/vms' + '?' + \ - compat_urllib_parse_urlencode(param) - raw_data = self._download_json(api_url, video_id) - return raw_data - - def get_enc_key(self, video_id): - # TODO: automatic key extraction - # last update at 2016-01-22 for Zombie::bite - enc_key = '4a1caba4b4465345366f28da7c117d20' - return enc_key + return self._download_json( + 'http://cache.m.iqiyi.com/jp/tmts/%s/%s/' % (tvid, video_id), + video_id, transform_source=lambda s: remove_start(s, 'var tvInfoJs='), + query=params, headers=self.geo_verification_headers()) def _extract_playlist(self, webpage): PAGE_SIZE = 50 @@ -571,58 +345,41 @@ class IqiyiIE(InfoExtractor): r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid') video_id = self._search_regex( r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id') - _uuid = uuid.uuid4().hex - enc_key = self.get_enc_key(video_id) + formats = [] + for _ in range(5): + raw_data = self.get_raw_data(tvid, video_id) - raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid) + if raw_data['code'] != 'A00000': + if raw_data['code'] == 'A00111': + self.raise_geo_restricted() + raise ExtractorError('Unable to load data. Error code: ' + raw_data['code']) - if raw_data['code'] != 'A000000': - raise ExtractorError('Unable to load data. 
Error code: ' + raw_data['code']) + data = raw_data['data'] - data = raw_data['data'] + for stream in data['vidl']: + if 'm3utx' not in stream: + continue + vd = compat_str(stream['vd']) + formats.append({ + 'url': stream['m3utx'], + 'format_id': vd, + 'ext': 'mp4', + 'preference': self._FORMATS_MAP.get(vd, -1), + 'protocol': 'm3u8_native', + }) - title = data['vi']['vn'] + if formats: + break - # generate video_urls_dict - video_urls_dict = self.construct_video_urls( - data, video_id, _uuid, tvid) + self._sleep(5, video_id) - # construct info - entries = [] - for format_id in video_urls_dict: - video_urls = video_urls_dict[format_id] - for i, video_url_info in enumerate(video_urls): - if len(entries) < i + 1: - entries.append({'formats': []}) - entries[i]['formats'].append( - { - 'url': video_url_info[0], - 'filesize': video_url_info[-1], - 'format_id': format_id, - 'preference': int(self.get_bid(format_id)) - } - ) + self._sort_formats(formats) + title = (get_element_by_id('widget-videotitle', webpage) or + clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage))) - for i in range(len(entries)): - self._sort_formats(entries[i]['formats']) - entries[i].update( - { - 'id': '%s_part%d' % (video_id, i + 1), - 'title': title, - } - ) - - if len(entries) > 1: - info = { - '_type': 'multi_video', - 'id': video_id, - 'title': title, - 'entries': entries, - } - else: - info = entries[0] - info['id'] = video_id - info['title'] = title - - return info + return { + 'id': video_id, + 'title': title, + 'formats': formats, + } diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index a65697ff5..1729f5bfb 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -6,7 +6,6 @@ import base64 from .common import InfoExtractor from ..compat import ( - compat_urllib_parse_urlencode, compat_urlparse, compat_parse_qs, ) @@ -15,6 +14,7 @@ from ..utils import ( ExtractorError, int_or_none, unsmuggle_url, + smuggle_url, ) @@ -34,7 +34,8 @@ class KalturaIE(InfoExtractor): )(?:/(?P<path>[^?]+))?(?:\?(?P<query>.*))? ) ''' - _API_BASE = 'http://cdnapi.kaltura.com/api_v3/index.php?' + _SERVICE_URL = 'http://cdnapi.kaltura.com' + _SERVICE_BASE = '/api_v3/index.php' _TESTS = [ { 'url': 'kaltura:269692:1_1jc2y3e4', @@ -64,16 +65,50 @@ class KalturaIE(InfoExtractor): } ] - def _kaltura_api_call(self, video_id, actions, *args, **kwargs): + @staticmethod + def _extract_url(webpage): + mobj = ( + re.search( + r"""(?xs) + kWidget\.(?:thumb)?[Ee]mbed\( + \{.*? + (?P<q1>['\"])wid(?P=q1)\s*:\s* + (?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*? + (?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s* + (?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4), + """, webpage) or + re.search( + r'''(?xs) + (?P<q1>["\']) + (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*? + (?P=q1).*? 
+ (?: + entry_?[Ii]d| + (?P<q2>["\'])entry_?[Ii]d(?P=q2) + )\s*:\s* + (?P<q3>["\'])(?P<id>.+?)(?P=q3) + ''', webpage)) + if mobj: + embed_info = mobj.groupdict() + url = 'kaltura:%(partner_id)s:%(id)s' % embed_info + escaped_pid = re.escape(embed_info['partner_id']) + service_url = re.search( + r'<script[^>]+src=["\']((?:https?:)?//.+?)/p/%s/sp/%s00/embedIframeJs' % (escaped_pid, escaped_pid), + webpage) + if service_url: + url = smuggle_url(url, {'service_url': service_url.group(1)}) + return url + + def _kaltura_api_call(self, video_id, actions, service_url=None, *args, **kwargs): params = actions[0] if len(actions) > 1: for i, a in enumerate(actions[1:], start=1): for k, v in a.items(): params['%d:%s' % (i, k)] = v - query = compat_urllib_parse_urlencode(params) - url = self._API_BASE + query - data = self._download_json(url, video_id, *args, **kwargs) + data = self._download_json( + (service_url or self._SERVICE_URL) + self._SERVICE_BASE, + video_id, query=params, *args, **kwargs) status = data if len(actions) == 1 else data[0] if status.get('objectType') == 'KalturaAPIException': @@ -82,7 +117,7 @@ class KalturaIE(InfoExtractor): return data - def _get_kaltura_signature(self, video_id, partner_id): + def _get_kaltura_signature(self, video_id, partner_id, service_url=None): actions = [{ 'apiVersion': '3.1', 'expiry': 86400, @@ -92,10 +127,10 @@ class KalturaIE(InfoExtractor): 'widgetId': '_%s' % partner_id, }] return self._kaltura_api_call( - video_id, actions, note='Downloading Kaltura signature')['ks'] + video_id, actions, service_url, note='Downloading Kaltura signature')['ks'] - def _get_video_info(self, video_id, partner_id): - signature = self._get_kaltura_signature(video_id, partner_id) + def _get_video_info(self, video_id, partner_id, service_url=None): + signature = self._get_kaltura_signature(video_id, partner_id, service_url) actions = [ { 'action': 'null', @@ -118,7 +153,7 @@ class KalturaIE(InfoExtractor): }, ] return self._kaltura_api_call( - video_id, actions, note='Downloading video info JSON') + video_id, actions, service_url, note='Downloading video info JSON') def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -127,7 +162,7 @@ class KalturaIE(InfoExtractor): partner_id, entry_id = mobj.group('partner_id', 'id') ks = None if partner_id and entry_id: - info, flavor_assets = self._get_video_info(entry_id, partner_id) + info, flavor_assets = self._get_video_info(entry_id, partner_id, smuggled_data.get('service_url')) else: path, query = mobj.group('path', 'query') if not path and not query: @@ -175,12 +210,17 @@ class KalturaIE(InfoExtractor): unsigned_url += '?referrer=%s' % referrer return unsigned_url + data_url = info['dataUrl'] + if '/flvclipper/' in data_url: + data_url = re.sub(r'/flvclipper/.*', '/serveFlavor', data_url) + formats = [] for f in flavor_assets: # Continue if asset is not ready if f['status'] != 2: continue - video_url = sign_url('%s/flavorId/%s' % (info['dataUrl'], f['id'])) + video_url = sign_url( + '%s/flavorId/%s' % (data_url, f['id'])) formats.append({ 'format_id': '%(fileExt)s-%(bitrate)s' % f, 'ext': f.get('fileExt'), @@ -193,9 +233,12 @@ class KalturaIE(InfoExtractor): 'width': int_or_none(f.get('width')), 'url': video_url, }) - m3u8_url = sign_url(info['dataUrl'].replace('format/url', 'format/applehttp')) - formats.extend(self._extract_m3u8_formats( - m3u8_url, entry_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + if '/playManifest/' in data_url: + m3u8_url = sign_url(data_url.replace( + 'format/url', 
'format/applehttp')) + formats.extend(self._extract_m3u8_formats( + m3u8_url, entry_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) self._check_formats(formats, entry_id) self._sort_formats(formats) diff --git a/youtube_dl/extractor/kamcord.py b/youtube_dl/extractor/kamcord.py new file mode 100644 index 000000000..b50120d98 --- /dev/null +++ b/youtube_dl/extractor/kamcord.py @@ -0,0 +1,71 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + qualities, +) + + +class KamcordIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?kamcord\.com/v/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.kamcord.com/v/hNYRduDgWb4', + 'md5': 'c3180e8a9cfac2e86e1b88cb8751b54c', + 'info_dict': { + 'id': 'hNYRduDgWb4', + 'ext': 'mp4', + 'title': 'Drinking Madness', + 'uploader': 'jacksfilms', + 'uploader_id': '3044562', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video = self._parse_json( + self._search_regex( + r'window\.__props\s*=\s*({.+?});?(?:\n|\s*</script)', + webpage, 'video'), + video_id)['video'] + + title = video['title'] + + formats = self._extract_m3u8_formats( + video['play']['hls'], video_id, 'mp4', entry_protocol='m3u8_native') + self._sort_formats(formats) + + uploader = video.get('user', {}).get('username') + uploader_id = video.get('user', {}).get('id') + + view_count = int_or_none(video.get('viewCount')) + like_count = int_or_none(video.get('heartCount')) + comment_count = int_or_none(video.get('messageCount')) + + preference_key = qualities(('small', 'medium', 'large')) + + thumbnails = [{ + 'url': thumbnail_url, + 'id': thumbnail_id, + 'preference': preference_key(thumbnail_id), + } for thumbnail_id, thumbnail_url in (video.get('thumbnail') or {}).items() + if isinstance(thumbnail_id, compat_str) and isinstance(thumbnail_url, compat_str)] + + return { + 'id': video_id, + 'title': title, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'view_count': view_count, + 'like_count': like_count, + 'comment_count': comment_count, + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 0221fb919..b1d460599 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -26,11 +26,6 @@ class KuwoBaseIE(InfoExtractor): def _get_formats(self, song_id, tolerate_ip_deny=False): formats = [] for file_format in self._FORMATS: - headers = {} - cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') - if cn_verification_proxy: - headers['Ytdl-request-proxy'] = cn_verification_proxy - query = { 'format': file_format['ext'], 'br': file_format.get('br', ''), @@ -42,7 +37,7 @@ class KuwoBaseIE(InfoExtractor): song_url = self._download_webpage( 'http://antiserver.kuwo.cn/anti.s', song_id, note='Download %s url info' % file_format['format'], - query=query, headers=headers, + query=query, headers=self.geo_verification_headers(), ) if song_url == 'IPDeny' and not tolerate_ip_deny: diff --git a/youtube_dl/extractor/la7.py b/youtube_dl/extractor/la7.py index b08f6e3c9..da5a5de4a 100644 --- a/youtube_dl/extractor/la7.py +++ b/youtube_dl/extractor/la7.py @@ -1,60 +1,65 @@ +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - parse_duration, + js_to_json, + smuggle_url, ) class 
LA7IE(InfoExtractor): - IE_NAME = 'la7.tv' - _VALID_URL = r'''(?x) - https?://(?:www\.)?la7\.tv/ - (?: - richplayer/\?assetid=| - \?contentId= - ) - (?P<id>[0-9]+)''' + IE_NAME = 'la7.it' + _VALID_URL = r'''(?x)(https?://)?(?: + (?:www\.)?la7\.it/([^/]+)/(?:rivedila7|video)/| + tg\.la7\.it/repliche-tgla7\?id= + )(?P<id>.+)''' - _TEST = { - 'url': 'http://www.la7.tv/richplayer/?assetid=50355319', - 'md5': 'ec7d1f0224d20ba293ab56cf2259651f', + _TESTS = [{ + # 'src' is a plain URL + 'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722', + 'md5': '8b613ffc0c4bf9b9e377169fc19c214c', 'info_dict': { - 'id': '50355319', + 'id': 'inccool8-02-10-2015-163722', 'ext': 'mp4', - 'title': 'IL DIVO', - 'description': 'Un film di Paolo Sorrentino con Toni Servillo, Anna Bonaiuto, Giulio Bosetti e Flavio Bucci', - 'duration': 6254, + 'title': 'Inc.Cool8', + 'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico', + 'thumbnail': 're:^https?://.*', + 'uploader_id': 'kdla7pillole@iltrovatore.it', + 'timestamp': 1443814869, + 'upload_date': '20151002', }, - 'skip': 'Blocked in the US', - } + }, { + # 'src' is a dictionary + 'url': 'http://tg.la7.it/repliche-tgla7?id=189080', + 'md5': '6b0d8888d286e39870208dfeceaf456b', + 'info_dict': { + 'id': '189080', + 'ext': 'mp4', + 'title': 'TG LA7', + }, + }, { + 'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - xml_url = 'http://www.la7.tv/repliche/content/index.php?contentId=%s' % video_id - doc = self._download_xml(xml_url, video_id) - video_title = doc.find('title').text - description = doc.find('description').text - duration = parse_duration(doc.find('duration').text) - thumbnail = doc.find('img').text - view_count = int(doc.find('views').text) + webpage = self._download_webpage(url, video_id) - prefix = doc.find('.//fqdn').text.strip().replace('auto:', 'http:') - - formats = [{ - 'format': vnode.find('quality').text, - 'tbr': int(vnode.find('quality').text), - 'url': vnode.find('fms').text.strip().replace('mp4:', prefix), - } for vnode in doc.findall('.//videos/video')] - self._sort_formats(formats) + player_data = self._parse_json( + self._search_regex(r'videoLa7\(({[^;]+})\);', webpage, 'player data'), + video_id, transform_source=js_to_json) return { + '_type': 'url_transparent', + 'url': smuggle_url('kaltura:103:%s' % player_data['vid'], { + 'service_url': 'http://kdam.iltrovatore.it', + }), 'id': video_id, - 'title': video_title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'view_count': view_count, + 'title': player_data['title'], + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': player_data.get('poster'), + 'ie_key': 'Kaltura', } diff --git a/youtube_dl/extractor/lcp.py b/youtube_dl/extractor/lcp.py new file mode 100644 index 000000000..ade27a99e --- /dev/null +++ b/youtube_dl/extractor/lcp.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .arkena import ArkenaIE + + +class LcpPlayIE(ArkenaIE): + _VALID_URL = r'https?://play\.lcp\.fr/embed/(?P<id>[^/]+)/(?P<account_id>[^/]+)/[^/]+/[^/]+' + _TESTS = [{ + 'url': 'http://play.lcp.fr/embed/327336/131064/darkmatter/0', + 'md5': 'b8bd9298542929c06c1c15788b1f277a', + 'info_dict': { + 'id': '327336', + 'ext': 'mp4', 
+ 'title': '327336', + 'timestamp': 1456391602, + 'upload_date': '20160225', + }, + 'params': { + 'skip_download': True, + }, + }] + + +class LcpIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?lcp\.fr/(?:[^/]+/)*(?P<id>[^/]+)' + + _TESTS = [{ + # arkena embed + 'url': 'http://www.lcp.fr/la-politique-en-video/schwartzenberg-prg-preconise-francois-hollande-de-participer-une-primaire', + 'md5': 'b8bd9298542929c06c1c15788b1f277a', + 'info_dict': { + 'id': 'd56d03e9', + 'ext': 'mp4', + 'title': 'Schwartzenberg (PRG) préconise à François Hollande de participer à une primaire à gauche', + 'description': 'md5:96ad55009548da9dea19f4120c6c16a8', + 'timestamp': 1456488895, + 'upload_date': '20160226', + }, + 'params': { + 'skip_download': True, + }, + }, { + # dailymotion live stream + 'url': 'http://www.lcp.fr/le-direct', + 'info_dict': { + 'id': 'xji3qy', + 'ext': 'mp4', + 'title': 'La Chaine Parlementaire (LCP), Live TNT', + 'description': 'md5:5c69593f2de0f38bd9a949f2c95e870b', + 'uploader': 'LCP', + 'uploader_id': 'xbz33d', + 'timestamp': 1308923058, + 'upload_date': '20110624', + }, + 'params': { + # m3u8 live stream + 'skip_download': True, + }, + }, { + 'url': 'http://www.lcp.fr/emissions/277792-les-volontaires', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + play_url = self._search_regex( + r'<iframe[^>]+src=(["\'])(?P<url>%s?(?:(?!\1).)*)\1' % LcpPlayIE._VALID_URL, + webpage, 'play iframe', default=None, group='url') + + if not play_url: + return self.url_result(url, 'Generic') + + title = self._og_search_title(webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, fatal=True) + description = self._html_search_meta( + ('description', 'twitter:description'), webpage) + + return { + '_type': 'url_transparent', + 'ie_key': LcpPlayIE.ie_key(), + 'url': play_url, + 'display_id': display_id, + 'title': title, + 'description': description, + } diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index 63f581cd9..e9cc9aa59 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -20,9 +20,10 @@ from ..utils import ( int_or_none, orderedSet, parse_iso8601, - sanitized_Request, str_or_none, url_basename, + urshift, + update_url_query, ) @@ -74,15 +75,11 @@ class LeIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def urshift(val, n): - return val >> n if val >= 0 else (val + 0x100000000) >> n - # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf def ror(self, param1, param2): _loc3_ = 0 while _loc3_ < param2: - param1 = self.urshift(param1, 1) + ((param1 & 1) << 31) + param1 = urshift(param1, 1) + ((param1 & 1) << 31) _loc3_ += 1 return param1 @@ -93,6 +90,10 @@ class LeIE(InfoExtractor): _loc3_ = self.ror(_loc3_, _loc2_ % 17) return _loc3_ + # reversed from http://jstatic.letvcdn.com/sdk/player.js + def get_mms_key(self, time): + return self.ror(time, 8) ^ 185025305 + # see M3U8Encryption class in KLetvPlayer.swf @staticmethod def decrypt_m3u8(encrypted_data): @@ -113,28 +114,7 @@ class LeIE(InfoExtractor): return bytes(_loc7_) - def _real_extract(self, url): - media_id = self._match_id(url) - page = self._download_webpage(url, media_id) - params = { - 'id': media_id, - 'platid': 1, - 'splatid': 101, - 'format': 1, - 'tkey': self.calc_time_key(int(time.time())), - 'domain': 'www.le.com' - } - play_json_req = sanitized_Request( - 
'http://api.le.com/mms/out/video/playJson?' + compat_urllib_parse_urlencode(params) - ) - cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') - if cn_verification_proxy: - play_json_req.add_header('Ytdl-request-proxy', cn_verification_proxy) - - play_json = self._download_json( - play_json_req, - media_id, 'Downloading playJson data') - + def _check_errors(self, play_json): # Check for errors playstatus = play_json['playstatus'] if playstatus['status'] == 0: @@ -145,43 +125,99 @@ class LeIE(InfoExtractor): msg = 'Generic error. flag = %d' % flag raise ExtractorError(msg, expected=True) - playurl = play_json['playurl'] + def _real_extract(self, url): + media_id = self._match_id(url) + page = self._download_webpage(url, media_id) - formats = ['350', '1000', '1300', '720p', '1080p'] - dispatch = playurl['dispatch'] + play_json_h5 = self._download_json( + 'http://api.le.com/mms/out/video/playJsonH5', + media_id, 'Downloading html5 playJson data', query={ + 'id': media_id, + 'platid': 3, + 'splatid': 304, + 'format': 1, + 'tkey': self.get_mms_key(int(time.time())), + 'domain': 'www.le.com', + 'tss': 'no', + }, + headers=self.geo_verification_headers()) + self._check_errors(play_json_h5) - urls = [] - for format_id in formats: - if format_id in dispatch: - media_url = playurl['domain'][0] + dispatch[format_id][0] - media_url += '&' + compat_urllib_parse_urlencode({ - 'm3v': 1, + play_json_flash = self._download_json( + 'http://api.le.com/mms/out/video/playJson', + media_id, 'Downloading flash playJson data', query={ + 'id': media_id, + 'platid': 1, + 'splatid': 101, + 'format': 1, + 'tkey': self.calc_time_key(int(time.time())), + 'domain': 'www.le.com', + }, + headers=self.geo_verification_headers()) + self._check_errors(play_json_flash) + + def get_h5_urls(media_url, format_id): + location = self._download_json( + media_url, media_id, + 'Download JSON metadata for format %s' % format_id, query={ 'format': 1, 'expect': 3, - 'rateid': format_id, - }) + 'tss': 'no', + })['location'] - nodes_data = self._download_json( - media_url, media_id, - 'Download JSON metadata for format %s' % format_id) + return { + 'http': update_url_query(location, {'tss': 'no'}), + 'hls': update_url_query(location, {'tss': 'ios'}), + } - req = self._request_webpage( - nodes_data['nodelist'][0]['location'], media_id, - note='Downloading m3u8 information for format %s' % format_id) + def get_flash_urls(media_url, format_id): + media_url += '&' + compat_urllib_parse_urlencode({ + 'm3v': 1, + 'format': 1, + 'expect': 3, + 'rateid': format_id, + }) - m3u8_data = self.decrypt_m3u8(req.read()) + nodes_data = self._download_json( + media_url, media_id, + 'Download JSON metadata for format %s' % format_id) - url_info_dict = { - 'url': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'), - 'ext': determine_ext(dispatch[format_id][1]), - 'format_id': format_id, - 'protocol': 'm3u8', - } + req = self._request_webpage( + nodes_data['nodelist'][0]['location'], media_id, + note='Downloading m3u8 information for format %s' % format_id) - if format_id[-1:] == 'p': - url_info_dict['height'] = int_or_none(format_id[:-1]) + m3u8_data = self.decrypt_m3u8(req.read()) - urls.append(url_info_dict) + return { + 'hls': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'), + } + + extracted_formats = [] + formats = [] + for play_json, get_urls in ((play_json_h5, get_h5_urls), (play_json_flash, get_flash_urls)): + playurl = play_json['playurl'] + play_domain = playurl['domain'][0] + + for format_id, 
format_data in playurl.get('dispatch', []).items(): + if format_id in extracted_formats: + continue + extracted_formats.append(format_id) + + media_url = play_domain + format_data[0] + for protocol, format_url in get_urls(media_url, format_id).items(): + f = { + 'url': format_url, + 'ext': determine_ext(format_data[1]), + 'format_id': '%s-%s' % (protocol, format_id), + 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', + 'quality': int_or_none(format_id), + } + + if format_id[-1:] == 'p': + f['height'] = int_or_none(format_id[:-1]) + + formats.append(f) + self._sort_formats(formats, ('height', 'quality', 'format_id')) publish_time = parse_iso8601(self._html_search_regex( r'发布时间 ([^<>]+) ', page, 'publish time', default=None), @@ -190,7 +226,7 @@ class LeIE(InfoExtractor): return { 'id': media_id, - 'formats': urls, + 'formats': formats, 'title': playurl['title'], 'thumbnail': playurl['pic'], 'description': description, diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 2d5040032..a98c4c530 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -100,7 +100,7 @@ class LyndaIE(LyndaBaseIE): _TESTS = [{ 'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', - 'md5': 'ecfc6862da89489161fb9cd5f5a6fac1', + # md5 is unstable 'info_dict': { 'id': '114408', 'ext': 'mp4', diff --git a/youtube_dl/extractor/m6.py b/youtube_dl/extractor/m6.py index d5945ad66..39d2742c8 100644 --- a/youtube_dl/extractor/m6.py +++ b/youtube_dl/extractor/m6.py @@ -1,8 +1,6 @@ # encoding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor @@ -23,34 +21,5 @@ class M6IE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - rss = self._download_xml('http://ws.m6.fr/v1/video/info/m6/bonus/%s' % video_id, video_id, - 'Downloading video RSS') - - title = rss.find('./channel/item/title').text - description = rss.find('./channel/item/description').text - thumbnail = rss.find('./channel/item/visuel_clip_big').text - duration = int(rss.find('./channel/item/duration').text) - view_count = int(rss.find('./channel/item/nombre_vues').text) - - formats = [] - for format_id in ['lq', 'sd', 'hq', 'hd']: - video_url = rss.find('./channel/item/url_video_%s' % format_id) - if video_url is None: - continue - formats.append({ - 'url': video_url.text, - 'format_id': format_id, - }) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'view_count': view_count, - 'formats': formats, - } + video_id = self._match_id(url) + return self.url_result('6play:%s' % video_id, 'SixPlay', video_id) diff --git a/youtube_dl/extractor/meta.py b/youtube_dl/extractor/meta.py new file mode 100644 index 000000000..cdb46e163 --- /dev/null +++ b/youtube_dl/extractor/meta.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .pladform import PladformIE +from ..utils import ( + unescapeHTML, + int_or_none, + ExtractorError, +) + + +class METAIE(InfoExtractor): + _VALID_URL = r'https?://video\.meta\.ua/(?:iframe/)?(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://video.meta.ua/5502115.video', + 'md5': '71b6f3ee274bef16f1ab410f7f56b476', + 'info_dict': { + 'id': '5502115', + 'ext': 'mp4', + 'title': 'Sony Xperia Z camera test [HQ]', + 'description': 'Xperia Z shoots video in FullHD HDR.', + 'uploader_id': 'nomobile', + 
'uploader': 'CHЁZA.TV',
+            'upload_date': '20130211',
+        },
+        'add_ie': ['Youtube'],
+    }, {
+        'url': 'http://video.meta.ua/iframe/5502115',
+        'only_matching': True,
+    }, {
+        # pladform embed
+        'url': 'http://video.meta.ua/7121015.video',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        st_html5 = self._search_regex(
+            r"st_html5\s*=\s*'#([^']+)'", webpage, 'uppod html5 st', default=None)
+
+        if st_html5:
+            # uppod st decryption algorithm is reverse engineered from function un(s) at uppod.js
+            json_str = ''
+            for i in range(0, len(st_html5), 3):
+                json_str += '&#%s;' % st_html5[i:i + 3]
+            uppod_data = self._parse_json(unescapeHTML(json_str), video_id)
+            error = uppod_data.get('customnotfound')
+            if error:
+                raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
+
+            video_url = uppod_data['file']
+            info = {
+                'id': video_id,
+                'url': video_url,
+                'title': uppod_data.get('comment') or self._og_search_title(webpage),
+                'description': self._og_search_description(webpage, default=None),
+                'thumbnail': uppod_data.get('poster') or self._og_search_thumbnail(webpage),
+                'duration': int_or_none(self._og_search_property(
+                    'video:duration', webpage, default=None)),
+            }
+            if 'youtube.com/' in video_url:
+                info.update({
+                    '_type': 'url_transparent',
+                    'ie_key': 'Youtube',
+                })
+            return info
+
+        pladform_url = PladformIE._extract_url(webpage)
+        if pladform_url:
+            return self.url_result(pladform_url)
diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py
index b6f00cc25..e6e7659a1 100644
--- a/youtube_dl/extractor/metacafe.py
+++ b/youtube_dl/extractor/metacafe.py
@@ -11,13 +11,14 @@ from ..utils import (
     determine_ext,
     ExtractorError,
     int_or_none,
-    sanitized_Request,
     urlencode_postdata,
+    get_element_by_attribute,
+    mimetype2ext,
 )
 
 
 class MetacafeIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
+    _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/(?P<video_id>[^/]+)/(?P<display_id>[^/?#]+)'
     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
     IE_NAME = 'metacafe'
@@ -47,6 +48,7 @@ class MetacafeIE(InfoExtractor):
             'uploader': 'ign',
             'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',
         },
+        'skip': 'Page is temporarily unavailable.',
     },
     # AnyClip video
     {
@@ -55,8 +57,8 @@
             'id': 'an-dVVXnuY7Jh77J',
             'ext': 'mp4',
             'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3',
-            'uploader': 'anyclip',
-            'description': 'md5:38c711dd98f5bb87acf973d573442e67',
+            'uploader': 'AnyClip',
+            'description': 'md5:cbef0460d31e3807f6feb4e7a5952e5b',
         },
     },
     # age-restricted video
@@ -110,28 +112,25 @@
     def report_disclaimer(self):
         self.to_screen('Retrieving disclaimer')
 
-    def _real_initialize(self):
+    def _confirm_age(self):
         # Retrieve disclaimer
         self.report_disclaimer()
         self._download_webpage(self._DISCLAIMER, None, False, 'Unable to retrieve disclaimer')
 
         # Confirm age
-        disclaimer_form = {
-            'filters': '0',
-            'submit': "Continue - I'm over 18",
-        }
-        request = sanitized_Request(self._FILTER_POST, urlencode_postdata(disclaimer_form))
-        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
         self.report_age_confirmation()
-        self._download_webpage(request, None, False, 'Unable to confirm age')
+
self._download_webpage( + self._FILTER_POST, None, False, 'Unable to confirm age', + data=urlencode_postdata({ + 'filters': '0', + 'submit': "Continue - I'm over 18", + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) def _real_extract(self, url): # Extract id and simplified title from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - - video_id = mobj.group(1) + video_id, display_id = re.match(self._VALID_URL, url).groups() # the video may come from an external site m_external = re.match('^(\w{2})-(.*)$', video_id) @@ -144,15 +143,24 @@ class MetacafeIE(InfoExtractor): if prefix == 'cb': return self.url_result('theplatform:%s' % ext_id, 'ThePlatform') - # Retrieve video webpage to extract further information - req = sanitized_Request('http://www.metacafe.com/watch/%s/' % video_id) + # self._confirm_age() # AnyClip videos require the flashversion cookie so that we get the link # to the mp4 file - mobj_an = re.match(r'^an-(.*?)$', video_id) - if mobj_an: - req.headers['Cookie'] = 'flashVersion=0;' - webpage = self._download_webpage(req, video_id) + headers = {} + if video_id.startswith('an-'): + headers['Cookie'] = 'flashVersion=0;' + + # Retrieve video webpage to extract further information + webpage = self._download_webpage(url, video_id, headers=headers) + + error = get_element_by_attribute( + 'class', 'notfound-page-title', webpage) + if error: + raise ExtractorError(error, expected=True) + + video_title = self._html_search_meta( + ['og:title', 'twitter:title'], webpage, 'title', default=None) or self._search_regex(r'<h1>(.*?)</h1>', webpage, 'title') # Extract URL, uploader and title from webpage self.report_extraction(video_id) @@ -216,20 +224,40 @@ class MetacafeIE(InfoExtractor): 'player_url': player_url, 'ext': play_path.partition(':')[0], }) + if video_url is None: + flashvars = self._parse_json(self._search_regex( + r'flashvars\s*=\s*({.*});', webpage, 'flashvars', + default=None), video_id, fatal=False) + if flashvars: + video_url = [] + for source in flashvars.get('sources'): + source_url = source.get('src') + if not source_url: + continue + ext = mimetype2ext(source.get('type')) or determine_ext(source_url) + if ext == 'm3u8': + video_url.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + video_url.append({ + 'url': source_url, + 'ext': ext, + }) if video_url is None: raise ExtractorError('Unsupported video type') - video_title = self._html_search_regex( - r'(?im)<title>(.*) - Video', webpage, 'title') - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) + description = self._html_search_meta( + ['og:description', 'twitter:description', 'description'], + webpage, 'title', fatal=False) + thumbnail = self._html_search_meta( + ['og:image', 'twitter:image'], webpage, 'title', fatal=False) video_uploader = self._html_search_regex( r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);', webpage, 'uploader nickname', fatal=False) duration = int_or_none( - self._html_search_meta('video:duration', webpage)) - + self._html_search_meta('video:duration', webpage, default=None)) age_limit = ( 18 if re.search(r'(?:"contentRating":|"rating",)"restricted"', webpage) @@ -242,10 +270,11 @@ class MetacafeIE(InfoExtractor): 'url': video_url, 'ext': video_ext, }] - self._sort_formats(formats) + return { 'id': video_id, + 'display_id': display_id, 
'description': description,
         'uploader': video_uploader,
         'title': video_title,
diff --git a/youtube_dl/extractor/mgtv.py b/youtube_dl/extractor/mgtv.py
index 9fbc74f5d..27bdff8b2 100644
--- a/youtube_dl/extractor/mgtv.py
+++ b/youtube_dl/extractor/mgtv.py
@@ -9,7 +9,7 @@ class MGTVIE(InfoExtractor):
     _VALID_URL = r'https?://www\.mgtv\.com/v/(?:[^/]+/)*(?P<id>\d+)\.html'
     IE_DESC = '芒果TV'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html',
         'md5': '1bdadcf760a0b90946ca68ee9a2db41a',
         'info_dict': {
@@ -20,13 +20,18 @@
             'duration': 7461,
             'thumbnail': 're:^https?://.*\.jpg$',
         },
-    }
+    }, {
+        # no tbr extracted from stream_url
+        'url': 'http://www.mgtv.com/v/1/1/f/3324755.html',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
         api_data = self._download_json(
             'http://v.api.mgtv.com/player/video', video_id,
-            query={'video_id': video_id})['data']
+            query={'video_id': video_id},
+            headers=self.geo_verification_headers())['data']
         info = api_data['info']
 
         formats = []
@@ -40,7 +45,8 @@
         def extract_format(stream_url, format_id, idx, query={}):
             format_info = self._download_json(
                 stream_url, video_id,
-                note='Download video info for format %s' % format_id or '#%d' % idx, query=query)
+                note='Download video info for format %s' % (format_id or '#%d' % idx),
+                query=query)
             return {
                 'format_id': format_id,
                 'url': format_info['info'],
diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py
index 170ebd9eb..937ba0f28 100644
--- a/youtube_dl/extractor/miomio.py
+++ b/youtube_dl/extractor/miomio.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import random
 
 from .common import InfoExtractor
+from ..compat import compat_urlparse
 from ..utils import (
     xpath_text,
     int_or_none,
@@ -18,13 +19,16 @@ class MioMioIE(InfoExtractor):
     _TESTS = [{
         # "type=video" in flashvars
         'url': 'http://www.miomio.tv/watch/cc88912/',
-        'md5': '317a5f7f6b544ce8419b784ca8edae65',
         'info_dict': {
             'id': '88912',
             'ext': 'flv',
             'title': '【SKY】字幕 铠武昭和VS平成 假面骑士大战FEAT战队 魔星字幕组 字幕',
             'duration': 5923,
         },
+        'params': {
+            # The server provides broken file
+            'skip_download': True,
+        }
     }, {
         'url': 'http://www.miomio.tv/watch/cc184024/',
         'info_dict': {
             'id': '43729',
             'title': '《动漫同人插画绘制》',
         },
         'playlist_mincount': 86,
-        'skip': 'This video takes time too long for retrieving the URL',
+        'skip': 'Unable to load videos',
     }, {
         'url': 'http://www.miomio.tv/watch/cc173113/',
         'info_dict': {
             'id': '173113',
             'title': 'The New Macbook 2015 上手试玩与简评'
         },
         'playlist_mincount': 2,
+        'skip': 'Unable to load videos',
+    }, {
+        # new 'h5' player
+        'url': 'http://www.miomio.tv/watch/cc273295/',
+        'md5': '',
+        'info_dict': {
+            'id': '273295',
+            'ext': 'mp4',
+            'title': 'アウト×デラックス 20160526',
+        },
+        'params': {
+            # intermittent HTTP 500
+            'skip_download': True,
+        },
     }]
 
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        title = self._html_search_meta(
-            'description', webpage, 'title', fatal=True)
-
-        mioplayer_path = self._search_regex(
-            r'src="(/mioplayer/[^"]+)"', webpage, 'ref_path')
-
-        http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path}
-
+    def _extract_mioplayer(self, webpage, video_id, title, http_headers):
         xml_config = self._search_regex(
             r'flashvars="type=(?:sina|video)&(.+?)&',
             webpage, 'xml config')
@@ -92,10 +99,34 @@
                 'http_headers': http_headers,
             })
 
+        return entries
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_meta(
+            'description', webpage, 'title', fatal=True)
+
+        mioplayer_path = self._search_regex(
+            r'src="(/mioplayer(?:_h5)?/[^"]+)"', webpage, 'ref_path')
+
+        if '_h5' in mioplayer_path:
+            player_url = compat_urlparse.urljoin(url, mioplayer_path)
+            player_webpage = self._download_webpage(
+                player_url, video_id,
+                note='Downloading player webpage', headers={'Referer': url})
+            entries = self._parse_html5_media_entries(player_url, player_webpage)
+            http_headers = {'Referer': player_url}
+        else:
+            http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path}
+            entries = self._extract_mioplayer(webpage, video_id, title, http_headers)
+
         if len(entries) == 1:
             segment = entries[0]
             segment['id'] = video_id
             segment['title'] = title
+            segment['http_headers'] = http_headers
             return segment
 
         return {
diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py
index 3589c223d..cd169f361 100644
--- a/youtube_dl/extractor/mitele.py
+++ b/youtube_dl/extractor/mitele.py
@@ -1,5 +1,8 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse_urlencode,
@@ -8,82 +11,137 @@ from ..compat import (
 from ..utils import (
     get_element_by_attribute,
     int_or_none,
+    remove_start,
+    extract_attributes,
+    determine_ext,
 )
 
 
-class MiTeleIE(InfoExtractor):
-    IE_DESC = 'mitele.es'
-    _VALID_URL = r'https?://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/'
+class MiTeleBaseIE(InfoExtractor):
+    def _get_player_info(self, url, webpage):
+        player_data = extract_attributes(self._search_regex(
+            r'(?s)(<ms-video-player.+?</ms-video-player>)',
+            webpage, 'ms video player'))
+        video_id = player_data['data-media-id']
+        config_url = compat_urlparse.urljoin(url, player_data['data-config'])
+        config = self._download_json(
+            config_url, video_id, 'Downloading config JSON')
+        mmc_url = config['services']['mmc']
 
-    _TEST = {
+        duration = None
+        formats = []
+        for m_url in (mmc_url, mmc_url.replace('/flash.json', '/html5.json')):
+            mmc = self._download_json(
+                m_url, video_id, 'Downloading mmc JSON')
+            if not duration:
+                duration = int_or_none(mmc.get('duration'))
+            for location in mmc['locations']:
+                gat = self._proto_relative_url(location.get('gat'), 'http:')
+                bas = location.get('bas')
+                loc = location.get('loc')
+                ogn = location.get('ogn')
+                if None in (gat, bas, loc, ogn):
+                    continue
+                token_data = {
+                    'bas': bas,
+                    'icd': loc,
+                    'ogn': ogn,
+                    'sta': '0',
+                }
+                media = self._download_json(
+                    '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)),
+                    video_id, 'Downloading %s JSON' % location['loc'])
+                file_ = media.get('file')
+                if not file_:
+                    continue
+                ext = determine_ext(file_)
+                if ext == 'f4m':
+                    formats.extend(self._extract_f4m_formats(
+                        file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
+                        video_id, f4m_id='hds', fatal=False))
+                elif ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        file_, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'thumbnail': player_data.get('data-poster') or config.get('poster', {}).get('imageUrl'),
+            'duration': duration,
+        }
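+
+# MiTeleBaseIE only resolves the player itself: _get_player_info() returns the
+# stream-level fields (id, formats, thumbnail, duration), and a subclass such
+# as MiTeleIE below fetches the page, calls it and layers page-level metadata
+# (title, series/season/episode) on top of the returned dict.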
+            player_url = compat_urlparse.urljoin(url, mioplayer_path)
+            player_webpage = self._download_webpage(
+                player_url, video_id,
+                note='Downloading player webpage', headers={'Referer': url})
+            entries = self._parse_html5_media_entries(player_url, player_webpage)
+            http_headers = {'Referer': player_url}
+        else:
+            http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path}
+            entries = self._extract_mioplayer(webpage, video_id, title, http_headers)
+
         if len(entries) == 1:
             segment = entries[0]
             segment['id'] = video_id
             segment['title'] = title
+            segment['http_headers'] = http_headers
             return segment
 
         return {
diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py
index 3589c223d..cd169f361 100644
--- a/youtube_dl/extractor/mitele.py
+++ b/youtube_dl/extractor/mitele.py
@@ -1,5 +1,8 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse_urlencode,
@@ -8,82 +11,137 @@ from ..compat import (
 from ..utils import (
     get_element_by_attribute,
     int_or_none,
+    remove_start,
+    extract_attributes,
+    determine_ext,
 )
 
 
-class MiTeleIE(InfoExtractor):
-    IE_DESC = 'mitele.es'
-    _VALID_URL = r'https?://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/'
+class MiTeleBaseIE(InfoExtractor):
+    def _get_player_info(self, url, webpage):
+        player_data = extract_attributes(self._search_regex(
+            r'(?s)(<ms-video-player.+?</ms-video-player>)',
+            webpage, 'ms video player'))
+        video_id = player_data['data-media-id']
+        config_url = compat_urlparse.urljoin(url, player_data['data-config'])
+        config = self._download_json(
+            config_url, video_id, 'Downloading config JSON')
+        mmc_url = config['services']['mmc']
 
-    _TEST = {
+        duration = None
+        formats = []
+        for m_url in (mmc_url, mmc_url.replace('/flash.json', '/html5.json')):
+            mmc = self._download_json(
+                m_url, video_id, 'Downloading mmc JSON')
+            if not duration:
+                duration = int_or_none(mmc.get('duration'))
+            for location in mmc['locations']:
+                gat = self._proto_relative_url(location.get('gat'), 'http:')
+                bas = location.get('bas')
+                loc = location.get('loc')
+                ogn = location.get('ogn')
+                if None in (gat, bas, loc, ogn):
+                    continue
+                token_data = {
+                    'bas': bas,
+                    'icd': loc,
+                    'ogn': ogn,
+                    'sta': '0',
+                }
+                media = self._download_json(
+                    '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)),
+                    video_id, 'Downloading %s JSON' % location['loc'])
+                file_ = media.get('file')
+                if not file_:
+                    continue
+                ext = determine_ext(file_)
+                if ext == 'f4m':
+                    formats.extend(self._extract_f4m_formats(
+                        file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
+                        video_id, f4m_id='hds', fatal=False))
+                elif ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        file_, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'thumbnail': player_data.get('data-poster') or config.get('poster', {}).get('imageUrl'),
+            'duration': duration,
+        }
+
+
+class MiTeleIE(MiTeleBaseIE):
+    IE_DESC = 'mitele.es'
+    _VALID_URL = r'https?://www\.mitele\.es/(?:[^/]+/){3}(?P<id>[^/]+)/'
+
+    _TESTS = [{
         'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/',
+        # MD5 is unstable
         'info_dict': {
             'id': '0NF1jJnxS1Wu3pHrmvFyw2',
             'display_id': 'programa-144',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Tor, la web invisible',
             'description': 'md5:3b6fce7eaa41b2d97358726378d9369f',
+            'series': 'Diario de',
+            'season': 'La redacción',
+            'episode': 'Programa 144',
             'thumbnail': 're:(?i)^https?://.*\.jpg$',
             'duration': 2913,
         },
-    }
+    }, {
+        # no explicit title
+        'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/temporada-6/programa-226/',
+        'info_dict': {
+            'id': 'eLZSwoEd1S3pVyUm8lc6F',
+            'display_id': 'programa-226',
+            'ext': 'mp4',
+            'title': 'Cuarto Milenio - Temporada 6 - Programa 226',
+            'description': 'md5:50daf9fadefa4e62d9fc866d0c015701',
+            'series': 'Cuarto Milenio',
+            'season': 'Temporada 6',
+            'episode': 'Programa 226',
+            'thumbnail': 're:(?i)^https?://.*\.jpg$',
+            'duration': 7312,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
 
         webpage = self._download_webpage(url, display_id)
 
-        config_url = self._search_regex(
-            r'data-config\s*=\s*"([^"]+)"', webpage, 'data config url')
-        config_url = compat_urlparse.urljoin(url, config_url)
-
-        config = self._download_json(
-            config_url, display_id, 'Downloading config JSON')
-
-        mmc = self._download_json(
-            config['services']['mmc'], display_id, 'Downloading mmc JSON')
-
-        formats = []
-        for location in mmc['locations']:
-            gat = self._proto_relative_url(location.get('gat'), 'http:')
-            bas = location.get('bas')
-            loc = location.get('loc')
-            ogn = location.get('ogn')
-            if None in (gat, bas, loc, ogn):
-                continue
-            token_data = {
-                'bas': bas,
-                'icd': loc,
-                'ogn': ogn,
-                'sta': '0',
-            }
-            media = self._download_json(
-                '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)),
-                display_id, 'Downloading %s JSON' % location['loc'])
-            file_ = media.get('file')
-            if not file_:
-                continue
-            formats.extend(self._extract_f4m_formats(
-                file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
-                display_id, f4m_id=loc))
-        self._sort_formats(formats)
+        info = self._get_player_info(url, webpage)
 
         title = self._search_regex(
-            r'class="Destacado-text"[^>]*>\s*<strong>([^<]+)</strong>', webpage, 'title')
+            r'class="Destacado-text"[^>]*>\s*<strong>([^<]+)</strong>',
+            webpage, 'title', default=None)
 
-        video_id = self._search_regex(
-            r'data-media-id\s*=\s*"([^"]+)"', webpage,
-            'data media id', default=None) or display_id
-        thumbnail = config.get('poster', {}).get('imageUrl')
-        duration = int_or_none(mmc.get('duration'))
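+        # series, season and episode are rendered as three consecutive <span>s in the Destacado header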
+        mobj = re.search(r'''(?sx)
+                            class="Destacado-text"[^>]*>.*?<h1>\s*
+                            <span>(?P<series>[^<]+)</span>\s*
+                            <span>(?P<season>[^<]+)</span>\s*
+                            <span>(?P<episode>[^<]+)</span>''', webpage)
+        series, season, episode = mobj.groups() if mobj else [None] * 3
 
-        return {
-            'id': video_id,
+        if not title:
+            if mobj:
+                title = '%s - %s - %s' % (series, season, episode)
+            else:
+                title = remove_start(self._search_regex(
+                    r'<title>([^<]+)</title>', webpage, 'title'), 'Ver online ')
+
+        info.update({
             'display_id': display_id,
             'title': title,
             'description': get_element_by_attribute('class', 'text', webpage),
-            'thumbnail': thumbnail,
-            'duration': duration,
-            'formats': formats,
-        }
+            'series': series,
+            'season': season,
+            'episode': episode,
+        })
+        return info
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index 483f6925f..560fe188b 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -102,11 +102,11 @@ class MixcloudIE(InfoExtractor):
         description = self._og_search_description(webpage)
         like_count = parse_count(self._search_regex(
             r'\bbutton-favorite[^>]+>.*?<span[^>]+class=["\']toggle-number[^>]+>\s*([^<]+)',
-            webpage, 'like count', fatal=False))
+            webpage, 'like count', default=None))
         view_count = str_to_int(self._search_regex(
             [r'([0-9,.]+)'],
-            webpage, 'play count', fatal=False))
+            webpage, 'play count', default=None))
 
         return {
             'id': track_id,
diff --git a/youtube_dl/extractor/msn.py b/youtube_dl/extractor/msn.py
new file mode 100644
index 000000000..1ec8e0f50
--- /dev/null
+++ b/youtube_dl/extractor/msn.py
@@ -0,0 +1,122 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    unescapeHTML,
+)
+
+
+class MSNIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?msn\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/[a-z]{2}-(?P<id>[\da-zA-Z]+)'
+    _TESTS = [{
+        'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/criminal-minds-shemar-moore-shares-a-touching-goodbye-message/vp-BBqQYNE',
+        'md5': '8442f66c116cbab1ff7098f986983458',
+        'info_dict': {
+            'id': 'BBqQYNE',
+            'display_id': 'criminal-minds-shemar-moore-shares-a-touching-goodbye-message',
+            'ext': 'mp4',
+            'title': 'Criminal Minds - Shemar Moore Shares A Touching Goodbye Message',
+            'description': 'md5:e8e89b897b222eb33a6b5067a8f1bc25',
+            'duration': 104,
+            'uploader': 'CBS Entertainment',
+            'uploader_id': 'IT0X5aoJ6bJgYerJXSDCgFmYPB1__54v',
+        },
+    }, {
+        'url': 'http://www.msn.com/en-ae/news/offbeat/meet-the-nine-year-old-self-made-millionaire/ar-BBt6ZKf',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.msn.com/en-ae/video/watch/obama-a-lot-of-people-will-be-disappointed/vi-AAhxUMH',
+        'only_matching': True,
+    }, {
+        # geo restricted
+        'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/the-first-fart-makes-you-laugh-the-last-fart-makes-you-cry/vp-AAhzIBU',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.msn.com/en-ae/entertainment/bollywood/watch-how-salman-khan-reacted-when-asked-if-he-would-apologize-for-his-‘raped-woman’-comment/vi-AAhvzW6',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id, display_id = mobj.group('id', 'display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        video = self._parse_json(
+            self._search_regex(
+                r'data-metadata\s*=\s*(["\'])(?P<data>.+?)\1',
+                webpage, 'video data', default='{}', group='data'),
+            display_id, transform_source=unescapeHTML)
+
+        if not video:
+            error = unescapeHTML(self._search_regex(
+                r'data-error=(["\'])(?P<error>.+?)\1',
+                webpage, 'error', group='error'))
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
+
+        title = video['title']
+
+        formats = []
+        for file_ in video.get('videoFiles', []):
+            format_url = file_.get('url')
+            if not format_url:
+                continue
+            ext = determine_ext(format_url)
+            # .ism is not yet supported (see
+            # https://github.com/rg3/youtube-dl/issues/8118)
+            if ext == 'ism':
+                continue
+            if 'm3u8' in format_url:
+                # m3u8_native should not be used here until
+                # https://github.com/rg3/youtube-dl/issues/9913 is fixed
+                m3u8_formats = self._extract_m3u8_formats(
+                    format_url, display_id, 'mp4',
+                    m3u8_id='hls', fatal=False)
+                # Despite metadata in m3u8 all video+audio formats are
+                # actually video-only (no audio)
+                for f in m3u8_formats:
+                    if f.get('acodec') != 'none' and f.get('vcodec') != 'none':
+                        f['acodec'] = 'none'
+                formats.extend(m3u8_formats)
+            else:
+                formats.append({
+                    'url': format_url,
+                    'ext': 'mp4',
+                    'format_id': 'http',
+                    'width': int_or_none(file_.get('width')),
+                    'height': int_or_none(file_.get('height')),
+                })
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for file_ in video.get('files', []):
+            format_url = file_.get('url')
+            format_code = file_.get('formatCode')
+            if not format_url or not format_code:
+                continue
+            if compat_str(format_code) == '3100':
+                subtitles.setdefault(file_.get('culture', 'en'), []).append({
+                    'ext': determine_ext(format_url, 'ttml'),
+                    'url': format_url,
+                })
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': video.get('description'),
+            'thumbnail': video.get('headlineImage', {}).get('url'),
+            'duration': int_or_none(video.get('durationSecs')),
+            'uploader': video.get('sourceFriendly'),
+            'uploader_id': video.get('providerId'),
+            'creator': video.get('creator'),
+            'subtitles': subtitles,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index 640ee3d93..2f455680e 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -6,6 +6,7 @@ from .common import InfoExtractor
 from ..compat import (
     compat_urllib_parse_urlencode,
     compat_str,
+    compat_xpath,
 )
 from ..utils import (
     ExtractorError,
@@ -14,6 +15,8 @@ from ..utils import (
     float_or_none,
     HEADRequest,
     sanitized_Request,
+    strip_or_none,
+    timeconvert,
     unescapeHTML,
     url_basename,
     RegexNotFoundError,
@@ -34,13 +37,13 @@ class MTVServicesInfoExtractor(InfoExtractor):
         return uri.split(':')[-1]
 
     # This was originally implemented for ComedyCentral, but it also works here
-    @staticmethod
-    def _transform_rtmp_url(rtmp_video_url):
+    @classmethod
+    def _transform_rtmp_url(cls, rtmp_video_url):
         m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url)
         if not m:
-            return rtmp_video_url
+            return {'rtmp': rtmp_video_url}
         base = 'http://viacommtvstrmfs.fplive.net/'
-        return base + m.group('finalid')
+        return {'http': base + m.group('finalid')}
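+    # now returns a {delivery kind: url} dict (e.g. {'http': ...}) so callers can emit one format per kind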
 
     def _get_feed_url(self, uri):
         return self._FEED_URL
@@ -84,13 +87,14 @@ class MTVServicesInfoExtractor(InfoExtractor):
                     rtmp_video_url = rendition.find('./src').text
                     if rtmp_video_url.endswith('siteunavail.png'):
                         continue
-                    formats.append({
-                        'ext': ext,
-                        'url': self._transform_rtmp_url(rtmp_video_url),
-                        'format_id': rendition.get('bitrate'),
+                    new_urls = self._transform_rtmp_url(rtmp_video_url)
+                    formats.extend([{
+                        'ext': 'flv' if new_url.startswith('rtmp') else ext,
+                        'url': new_url,
+                        'format_id': '-'.join(filter(None, [kind, rendition.get('bitrate')])),
                         'width': int(rendition.get('width')),
                         'height': int(rendition.get('height')),
-                    })
+                    } for kind, new_url in new_urls.items()])
                 except (KeyError, TypeError):
                     raise ExtractorError('Invalid rendition field.')
         self._sort_formats(formats)
@@ -131,7 +135,9 @@ class MTVServicesInfoExtractor(InfoExtractor):
                 message += item.text
             raise ExtractorError(message, expected=True)
 
-        description = xpath_text(itemdoc, 'description')
+        description = strip_or_none(xpath_text(itemdoc, 'description'))
+
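+        # pubDate is an RFC 2822 date; timeconvert() from utils turns it into a unix timestamp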
+        timestamp = timeconvert(xpath_text(itemdoc, 'pubDate'))
 
         title_el = None
         if title_el is None:
             title_el = find_xpath_attr(
                 itemdoc, './/{http://search.yahoo.com/mrss/}category',
                 'scheme', 'urn:mtvn:video_title')
         if title_el is None:
-            title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title')
+            title_el = itemdoc.find(compat_xpath('.//{http://search.yahoo.com/mrss/}title'))
         if title_el is None:
-            title_el = itemdoc.find('.//title') or itemdoc.find('./title')
+            title_el = itemdoc.find(compat_xpath('.//title'))
             if title_el.text is None:
                 title_el = None
@@ -165,6 +171,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
             'thumbnail': self._get_thumbnail_url(uri, itemdoc),
             'description': description,
             'duration': float_or_none(content_el.attrib.get('duration')),
+            'timestamp': timestamp,
         }
 
     def _get_feed_query(self, uri):
@@ -183,8 +190,13 @@ class MTVServicesInfoExtractor(InfoExtractor):
         idoc = self._download_xml(
             url, video_id,
             'Downloading info', transform_source=fix_xml_ampersands)
+
+        title = xpath_text(idoc, './channel/title')
+        description = xpath_text(idoc, './channel/description')
+
         return self.playlist_result(
-            [self._get_video_info(item) for item in idoc.findall('.//item')])
+            [self._get_video_info(item) for item in idoc.findall('.//item')],
+            playlist_title=title, playlist_description=description)
 
     def _extract_mgid(self, webpage):
         try:
@@ -230,6 +242,8 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):
             'ext': 'mp4',
             'title': 'Peter Dinklage Sums Up \'Game Of Thrones\' In 45 Seconds',
             'description': '"Sexy sexy sexy, stabby stabby stabby, beautiful language," says Peter Dinklage as he tries summarizing "Game of Thrones" in under a minute.',
+            'timestamp': 1400126400,
+            'upload_date': '20140515',
         },
     }
 
@@ -272,6 +286,8 @@ class MTVIE(MTVServicesInfoExtractor):
                 'ext': 'mp4',
                 'title': 'Taylor Swift - "Ours (VH1 Storytellers)"',
                 'description': 'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
+                'timestamp': 1352610000,
+                'upload_date': '20121111',
             },
         },
     ]
@@ -298,20 +314,6 @@ class MTVIE(MTVServicesInfoExtractor):
         return self._get_videos_info(uri)
 
 
-class MTVIggyIE(MTVServicesInfoExtractor):
-    IE_NAME = 'mtviggy.com'
-    _VALID_URL = r'https?://www\.mtviggy\.com/videos/.+'
-    _TEST = {
-        'url': 'http://www.mtviggy.com/videos/arcade-fire-behind-the-scenes-at-the-biggest-music-experiment-yet/',
-        'info_dict': {
-            'id': '984696',
-            'ext': 'mp4',
-            'title': 'Arcade Fire: Behind the Scenes at the Biggest Music Experiment Yet',
-        }
-    }
-    _FEED_URL = 'http://all.mtvworldverticals.com/feed-xml/'
-
-
 class MTVDEIE(MTVServicesInfoExtractor):
     IE_NAME = 'mtv.de'
     _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:artists|shows|news)/(?:[^/]+/)*(?P<id>\d+)-[^/#?]+/*(?:[#?].*)?$'
@@ -319,7 +321,7 @@ class MTVDEIE(MTVServicesInfoExtractor):
         'url': 'http://www.mtv.de/artists/10571-cro/videos/61131-traum',
         'info_dict': {
             'id': 'music_video-a50bc5f0b3aa4b3190aa',
-            'ext': 'mp4',
+            'ext': 'flv',
             'title': 'MusicVideo_cro-traum',
             'description': 'Cro - Traum',
         },
         'params': {
             # rtmp download
             'skip_download': True,
         },
+        'skip': 'Blocked at Travis CI',
     }, {
         # mediagen URL without query (e.g. http://videos.mtvnn.com/mediagen/e865da714c166d18d6f80893195fcb97)
         'url': 'http://www.mtv.de/shows/933-teen-mom-2/staffeln/5353/folgen/63565-enthullungen',
         'info_dict': {
             'id': 'local_playlist-f5ae778b9832cc837189',
-            'ext': 'mp4',
+            'ext': 'flv',
             'title': 'Episode_teen-mom-2_shows_season-5_episode-1_full-episode_part1',
         },
         'params': {
             # rtmp download
             'skip_download': True,
         },
+        'skip': 'Blocked at Travis CI',
     }, {
-        # single video in pagePlaylist with different id
         'url': 'http://www.mtv.de/news/77491-mtv-movies-spotlight-pixels-teil-3',
         'info_dict': {
             'id': 'local_playlist-4e760566473c4c8c5344',
@@ -352,6 +355,7 @@ class MTVDEIE(MTVServicesInfoExtractor):
             # rtmp download
             'skip_download': True,
         },
+        'skip': 'Das Video kann zur Zeit nicht abgespielt werden.',
     }]
 
     def _real_extract(self, url):
@@ -364,11 +368,14 @@ class MTVDEIE(MTVServicesInfoExtractor):
                 r'window\.pagePlaylist\s*=\s*(\[.+?\]);\n', webpage, 'page playlist'), video_id)
 
+        def _mrss_url(item):
+            return item['mrss'] + item.get('mrssvars', '')
+
+        # news pages contain single video in playlist with different id
         if len(playlist) == 1:
-            return self._get_videos_info_from_url(playlist[0]['mrss'], video_id)
+            return self._get_videos_info_from_url(_mrss_url(playlist[0]), video_id)
         for item in playlist:
             item_id = item.get('id')
             if item_id and compat_str(item_id) == video_id:
-                return self._get_videos_info_from_url(item['mrss'], video_id)
+                return self._get_videos_info_from_url(_mrss_url(item), video_id)
diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py
index 722518663..e717abb9f 100644
--- a/youtube_dl/extractor/nationalgeographic.py
+++ b/youtube_dl/extractor/nationalgeographic.py
@@ -1,6 +1,7 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
+from .theplatform import ThePlatformIE
 from ..utils import (
     smuggle_url,
     url_basename,
@@ -61,7 +62,7 @@ class NationalGeographicIE(InfoExtractor):
     }
 
 
-class NationalGeographicChannelIE(InfoExtractor):
+class NationalGeographicChannelIE(ThePlatformIE):
     IE_NAME = 'natgeo:channel'
     _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?[^/]+/videos/(?P<id>[^/?]+)'
 
@@ -102,12 +103,22 @@ class NationalGeographicChannelIE(InfoExtractor):
         release_url = self._search_regex(
             r'video_auth_playlist_url\s*=\s*"([^"]+)"',
             webpage, 'release url')
+        query = {
+            'mbr': 'true',
+            'switch': 'http',
+        }
+        is_auth = self._search_regex(r'video_is_auth\s*=\s*"([^"]+)"', webpage, 'is auth', fatal=False)
+        if is_auth == 'auth':
+            auth_resource_id = self._search_regex(
+                r"video_auth_resourceId\s*=\s*'([^']+)'",
+                webpage, 'auth resource id')
+            query['auth'] = self._extract_mvpd_auth(url, display_id, 'natgeo', auth_resource_id) or ''
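+            # _extract_mvpd_auth() is inherited from ThePlatformIE (base class switched above)
+            # and performs the Adobe Pass TV-provider sign-in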
 
         return {
             '_type': 'url_transparent',
             'ie_key': 'ThePlatform',
             'url': smuggle_url(
-                update_url_query(release_url, {'mbr': 'true', 'switch': 'http'}),
+                update_url_query(release_url, query),
                 {'force_smil_url': True}),
             'display_id': display_id,
         }
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index 6b7da1149..f694e210b 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -9,10 +9,6 @@ from ..utils import (
     lowercase_escape,
     smuggle_url,
     unescapeHTML,
-    update_url_query,
-    int_or_none,
-    HEADRequest,
-    parse_iso8601,
 )
 
 
@@ -192,9 +188,9 @@ class CSNNEIE(InfoExtractor):
 
 
 class NBCNewsIE(ThePlatformIE):
-    _VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today)\.com/
+    _VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/
         (?:video/.+?/(?P<id>\d+)|
-        ([^/]+/)*(?P<display_id>[^/?]+))
+        ([^/]+/)*(?:.*-)?(?P<mpx_id>[^/?]+))
         '''
 
     _TESTS = [
@@ -216,13 +212,16 @@ class NBCNewsIE(ThePlatformIE):
                 'ext': 'mp4',
                 'title': 'How Twitter Reacted To The Snowden Interview',
                 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64',
+                'uploader': 'NBCU-NEWS',
+                'timestamp': 1401363060,
+                'upload_date': '20140529',
             },
         },
        {
            'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156',
            'md5': 'fdbf39ab73a72df5896b6234ff98518a',
            'info_dict': {
-                'id': 'Wjf9EDR3A_60',
+                'id': '529953347624',
                'ext': 'mp4',
                'title': 'FULL EPISODE: Family Business',
                'description': 'md5:757988edbaae9d7be1d585eb5d55cc04',
@@ -237,6 +236,9 @@ class NBCNewsIE(ThePlatformIE):
                'ext': 'mp4',
                'title': 'Nightly News with Brian Williams Full Broadcast (February 4)',
                'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5',
+                'timestamp': 1423104900,
+                'uploader': 'NBCU-NEWS',
+                'upload_date': '20150205',
            },
        },
        {
@@ -245,10 +247,12 @@ class NBCNewsIE(ThePlatformIE):
            'info_dict': {
                'id': '529953347624',
                'ext': 'mp4',
-                'title': 'Volkswagen U.S. Chief: We \'Totally Screwed Up\'',
-                'description': 'md5:d22d1281a24f22ea0880741bb4dd6301',
+                'title': 'Volkswagen U.S. Chief:\xa0 We Have Totally Screwed Up',
+                'description': 'md5:c8be487b2d80ff0594c005add88d8351',
+                'upload_date': '20150922',
+                'timestamp': 1442917800,
+                'uploader': 'NBCU-NEWS',
            },
-            'expected_warnings': ['http-6000 is not available']
        },
        {
            'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788',
@@ -260,6 +264,22 @@ class NBCNewsIE(ThePlatformIE):
                'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1',
                'upload_date': '20160420',
                'timestamp': 1461152093,
+                'uploader': 'NBCU-NEWS',
+            },
+        },
+        {
+            'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924',
+            'md5': '6d236bf4f3dddc226633ce6e2c3f814d',
+            'info_dict': {
+                'id': '314487875924',
+                'ext': 'mp4',
+                'title': 'The chaotic GOP immigration vote',
+                'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.',
+                'thumbnail': 're:^https?://.*\.jpg$',
+                'timestamp': 1406937606,
+                'upload_date': '20140802',
+                'uploader': 'NBCU-NEWS',
+                'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'],
            },
        },
        {
@@ -290,105 +310,28 @@ class NBCNewsIE(ThePlatformIE):
             }
         else:
             # "feature" and "nightly-news" pages use theplatform.com
-            display_id = mobj.group('display_id')
-            webpage = self._download_webpage(url, display_id)
-            info = None
-            bootstrap_json = self._search_regex(
-                [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$',
-                 r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"'],
-                webpage, 'bootstrap json', default=None)
-            bootstrap = self._parse_json(
-                bootstrap_json, display_id, transform_source=unescapeHTML)
-            if 'results' in bootstrap:
-                info = bootstrap['results'][0]['video']
-            elif 'video' in bootstrap:
-                info = bootstrap['video']
-            else:
-                info = bootstrap
-            video_id = info['mpxId']
-            title = info['title']
-
-            subtitles = {}
-            caption_links = info.get('captionLinks')
-            if caption_links:
-                for (sub_key, sub_ext) in (('smpte-tt', 'ttml'), ('web-vtt', 'vtt'), ('srt', 'srt')):
-                    sub_url = caption_links.get(sub_key)
-                    if sub_url:
-                        subtitles.setdefault('en', []).append({
-                            'url': sub_url,
-                            'ext': sub_ext,
-                        })
-
-            formats = []
-            for video_asset in info['videoAssets']:
-                video_url = video_asset.get('publicUrl')
-                if not video_url:
-                    continue
-                container = video_asset.get('format')
-                asset_type = video_asset.get('assetType') or ''
-                if container == 'ISM' or asset_type == 'FireTV-Once':
-                    continue
-                elif asset_type == 'OnceURL':
-                    tp_formats, tp_subtitles = self._extract_theplatform_smil(
-                        video_url, video_id)
-                    formats.extend(tp_formats)
-                    subtitles = self._merge_subtitles(subtitles, tp_subtitles)
+            video_id = mobj.group('mpx_id')
+            if not video_id.isdigit():
+                webpage = self._download_webpage(url, video_id)
+                info = None
+                bootstrap_json = self._search_regex(
+                    [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$',
+                     r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"'],
+                    webpage, 'bootstrap json', default=None)
+                bootstrap = self._parse_json(
+                    bootstrap_json, video_id, transform_source=unescapeHTML)
+                if 'results' in bootstrap:
+                    info = bootstrap['results'][0]['video']
+                elif 'video' in bootstrap:
+                    info = bootstrap['video']
                 else:
-                    tbr = int_or_none(video_asset.get('bitRate') or video_asset.get('bitrate'), 1000)
-                    format_id = 'http%s' % ('-%d' % tbr if tbr else '')
-                    video_url = update_url_query(
-                        video_url, {'format': 'redirect'})
-                    # resolve the url so that we can check availability and detect the correct extension
-                    head = self._request_webpage(
-                        HEADRequest(video_url), video_id,
-                        'Checking %s url' % format_id,
-                        '%s is not available' % format_id,
-                        fatal=False)
-                    if head:
-                        video_url = head.geturl()
-                    formats.append({
-                        'format_id': format_id,
-                        'url': video_url,
-                        'width': int_or_none(video_asset.get('width')),
-                        'height': int_or_none(video_asset.get('height')),
-                        'tbr': tbr,
-                        'container': video_asset.get('format'),
-                    })
-            self._sort_formats(formats)
+                    info = bootstrap
+                video_id = info['mpxId']
 
             return {
+                '_type': 'url_transparent',
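+                # url_transparent defers to ThePlatformFeed and merges its result with the fields below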
                 'id': video_id,
-                'title': title,
-                'description': info.get('description'),
-                'thumbnail': info.get('thumbnail'),
-                'duration': int_or_none(info.get('duration')),
-                'timestamp': parse_iso8601(info.get('pubDate') or info.get('pub_date')),
-                'formats': formats,
-                'subtitles': subtitles,
+                # http://feed.theplatform.com/f/2E2eJC/nbcnews also works
+                'url': 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byId=%s' % video_id,
+                'ie_key': 'ThePlatformFeed',
             }
-
-
-class MSNBCIE(InfoExtractor):
-    # https URLs redirect to corresponding http ones
-    _VALID_URL = r'https?://www\.msnbc\.com/[^/]+/watch/(?P<id>[^/]+)'
-    _TEST = {
-        'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924',
-        'md5': '6d236bf4f3dddc226633ce6e2c3f814d',
-        'info_dict': {
-            'id': 'n_hayes_Aimm_140801_272214',
-            'ext': 'mp4',
-            'title': 'The chaotic GOP immigration vote',
-            'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.',
-            'thumbnail': 're:^https?://.*\.jpg$',
-            'timestamp': 1406937606,
-            'upload_date': '20140802',
-            'uploader': 'NBCU-NEWS',
-            'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'],
-        },
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-        embed_url = self._html_search_meta('embedURL', webpage)
-        return self.url_result(embed_url)
diff --git a/youtube_dl/extractor/nextmovie.py b/youtube_dl/extractor/nextmovie.py
deleted file mode 100644
index 9ccd7d774..000000000
--- a/youtube_dl/extractor/nextmovie.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .mtv import MTVServicesInfoExtractor
-from ..compat import compat_urllib_parse_urlencode
-
-
-class NextMovieIE(MTVServicesInfoExtractor):
-    IE_NAME = 'nextmovie.com'
-    _VALID_URL = r'https?://(?:www\.)?nextmovie\.com/shows/[^/]+/\d{4}-\d{2}-\d{2}/(?P<id>[^/?#]+)'
-    _FEED_URL = 'http://lite.dextr.mtvi.com/service1/dispatch.htm'
-    _TESTS = [{
-        'url': 'http://www.nextmovie.com/shows/exclusives/2013-03-10/mgid:uma:videolist:nextmovie.com:1715019/',
-        'md5': '09a9199f2f11f10107d04fcb153218aa',
-        'info_dict': {
-            'id': '961726',
-            'ext': 'mp4',
-            'title': 'The Muppets\' Gravity',
-        },
-    }]
-
-    def _get_feed_query(self, uri):
-        return compat_urllib_parse_urlencode({
-            'feed': '1505',
-            'mgid': uri,
-        })
-
-    def _real_extract(self, url):
-        mgid = self._match_id(url)
-        return self._get_videos_info(mgid)
diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py
index ce065f2b0..9c54846e1 100644
--- a/youtube_dl/extractor/nick.py
+++ b/youtube_dl/extractor/nick.py
@@ -3,11 +3,13 @@ from __future__ import unicode_literals
 
 from .mtv import MTVServicesInfoExtractor
 from ..compat import compat_urllib_parse_urlencode
+from ..utils import update_url_query
 
 
 class NickIE(MTVServicesInfoExtractor):
+    # None of videos on the website are still alive?
     IE_NAME = 'nick.com'
-    _VALID_URL = r'https?://(?:www\.)?nick\.com/videos/clip/(?P<id>[^/?#.]+)'
+    _VALID_URL = r'https?://(?:www\.)?nick(?:jr)?\.com/(?:videos/clip|[^/]+/videos)/(?P<id>[^/?#.]+)'
     _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm'
     _TESTS = [{
         'url': 'http://www.nick.com/videos/clip/alvinnn-and-the-chipmunks-112-full-episode.html',
@@ -51,6 +53,9 @@ class NickIE(MTVServicesInfoExtractor):
                 }
             },
         ],
+    }, {
+        'url': 'http://www.nickjr.com/paw-patrol/videos/pups-save-a-goldrush-s3-ep302-full-episode/',
+        'only_matching': True,
     }]
 
     def _get_feed_query(self, uri):
@@ -61,3 +66,26 @@ class NickIE(MTVServicesInfoExtractor):
 
     def _extract_mgid(self, webpage):
         return self._search_regex(r'data-contenturi="([^"]+)', webpage, 'mgid')
+
+
+class NickDeIE(MTVServicesInfoExtractor):
+    IE_NAME = 'nick.de'
+    _VALID_URL = r'https?://(?:www\.)?nick\.de/(?:playlist|shows)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://www.nick.de/playlist/3773-top-videos/videos/episode/17306-zu-wasser-und-zu-land-rauchende-erdnusse',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.nick.de/shows/342-icarly',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        mrss_url = update_url_query(self._search_regex(
+            r'data-mrss=(["\'])(?P<url>http.+?)\1', webpage, 'mrss url', group='url'),
+            {'siteKey': 'nick.de'})
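+        # the mrss feed is keyed per site, hence the explicit siteKey=nick.de parameter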
+
+        return self._get_videos_info_from_url(mrss_url, video_id)
diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py
new file mode 100644
index 000000000..d889245ad
--- /dev/null
+++ b/youtube_dl/extractor/ninecninemedia.py
@@ -0,0 +1,55 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_iso8601,
+    parse_duration,
+    ExtractorError
+)
+
+
+class NineCNineMediaIE(InfoExtractor):
+    _VALID_URL = r'9c9media:(?P<destination_code>[^:]+):(?P<id>\d+)'
+
+    def _real_extract(self, url):
+        destination_code, video_id = re.match(self._VALID_URL, url).groups()
+        api_base_url = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' % (destination_code, video_id)
+        content = self._download_json(api_base_url, video_id, query={
+            '$include': '[contentpackages]',
+        })
+        title = content['Name']
+        if len(content['ContentPackages']) > 1:
+            raise ExtractorError('multiple content packages')
+        content_package = content['ContentPackages'][0]
+        stacks_base_url = api_base_url + 'contentpackages/%s/stacks/' % content_package['Id']
+        stacks = self._download_json(stacks_base_url, video_id)['Items']
+        if len(stacks) > 1:
+            raise ExtractorError('multiple stacks')
+        stack = stacks[0]
+        stack_base_url = '%s%s/manifest.' % (stacks_base_url, stack['Id'])
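+        # appending 'm3u8', 'f4m' or 'pd' to the manifest base yields HLS, HDS and progressive URLs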
+        formats = []
+        formats.extend(self._extract_m3u8_formats(
+            stack_base_url + 'm3u8', video_id, 'mp4',
+            'm3u8_native', m3u8_id='hls', fatal=False))
+        formats.extend(self._extract_f4m_formats(
+            stack_base_url + 'f4m', video_id,
+            f4m_id='hds', fatal=False))
+        mp4_url = self._download_webpage(stack_base_url + 'pd', video_id, fatal=False)
+        if mp4_url:
+            formats.append({
+                'url': mp4_url,
+                'format_id': 'mp4',
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': content.get('Desc') or content.get('ShortDesc'),
+            'timestamp': parse_iso8601(content.get('BroadcastDateTime')),
+            'duration': parse_duration(content.get('BroadcastTime')),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/ninenow.py b/youtube_dl/extractor/ninenow.py
new file mode 100644
index 000000000..faa577237
--- /dev/null
+++ b/youtube_dl/extractor/ninenow.py
@@ -0,0 +1,72 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    float_or_none,
+    ExtractorError,
+)
+
+
+class NineNowIE(InfoExtractor):
+    IE_NAME = '9now.com.au'
+    _VALID_URL = r'https?://(?:www\.)?9now\.com\.au/(?:[^/]+/){2}(?P<id>[^/?#]+)'
+    _TESTS = [{
+        # clip
+        'url': 'https://www.9now.com.au/afl-footy-show/2016/clip-ciql02091000g0hp5oktrnytc',
+        'md5': '17cf47d63ec9323e562c9957a968b565',
+        'info_dict': {
+            'id': '16801',
+            'ext': 'mp4',
+            'title': 'St. Kilda\'s Joey Montagna on the potential for a player\'s strike',
+            'description': 'Is a boycott of the NAB Cup "on the table"?',
+            'uploader_id': '4460760524001',
+            'upload_date': '20160713',
+            'timestamp': 1468421266,
+        },
+        'skip': 'Only available in Australia',
+    }, {
+        # episode
+        'url': 'https://www.9now.com.au/afl-footy-show/2016/episode-19',
+        'only_matching': True,
+    }, {
+        # DRM protected
+        'url': 'https://www.9now.com.au/andrew-marrs-history-of-the-world/season-1/episode-1',
+        'only_matching': True,
+    }]
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4460760524001/default_default/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        page_data = self._parse_json(self._search_regex(
+            r'window\.__data\s*=\s*({.*?});', webpage,
+            'page data'), display_id)
+        common_data = page_data.get('episode', {}).get('episode') or page_data.get('clip', {}).get('clip')
+        video_data = common_data['video']
+
+        if video_data.get('drm'):
+            raise ExtractorError('This video is DRM protected.', expected=True)
+
+        brightcove_id = video_data.get('brightcoveId') or 'ref:' + video_data['referenceId']
+        video_id = compat_str(video_data.get('id') or brightcove_id)
+        title = common_data['name']
+
+        thumbnails = [{
+            'id': thumbnail_id,
+            'url': thumbnail_url,
+            'width': int_or_none(thumbnail_id[1:])
+        } for thumbnail_id, thumbnail_url in common_data.get('image', {}).get('sizes', {}).items()]
+
+        return {
+            '_type': 'url_transparent',
+            'url': self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
+            'id': video_id,
+            'title': title,
+            'description': common_data.get('description'),
+            'duration': float_or_none(video_data.get('duration'), 1000),
+            'thumbnails': thumbnails,
+            'ie_key': 'BrightcoveNew',
+        }
diff --git a/youtube_dl/extractor/nintendo.py b/youtube_dl/extractor/nintendo.py
new file mode 100644
index 000000000..4b4e66b05
--- /dev/null
+++ b/youtube_dl/extractor/nintendo.py
@@ -0,0 +1,46 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .ooyala import OoyalaIE
+from ..utils import unescapeHTML
+
+
+class NintendoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?nintendo\.com/games/detail/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://www.nintendo.com/games/detail/yEiAzhU2eQI1KZ7wOHhngFoAHc1FpHwj',
+        'info_dict': {
+            'id': 'MzMmticjp0VPzO3CCj4rmFOuohEuEWoW',
+            'ext': 'flv',
+            'title': 'Duck Hunt Wii U VC NES - Trailer',
+            'duration': 60.326,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': ['Ooyala'],
+    }, {
+        'url': 'http://www.nintendo.com/games/detail/tokyo-mirage-sessions-fe-wii-u',
+        'info_dict': {
+            'id': 'tokyo-mirage-sessions-fe-wii-u',
+            'title': 'Tokyo Mirage Sessions ♯FE',
+        },
+        'playlist_count': 3,
+    }]
+
+    def _real_extract(self, url):
+        page_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, page_id)
+
+        entries = [
+            OoyalaIE._build_url_result(m.group('code'))
+            for m in re.finditer(
+                r'class=(["\'])embed-video\1[^>]+data-video-code=(["\'])(?P<code>(?:(?!\2).)+)\2',
+                webpage)]
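+        # every embed-video element carries an Ooyala embed code; defer each to OoyalaIE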
+
+        return self.playlist_result(
+            entries, page_id, unescapeHTML(self._og_search_title(webpage, fatal=False)))
diff --git a/youtube_dl/extractor/odatv.py b/youtube_dl/extractor/odatv.py
new file mode 100644
index 000000000..314527f98
--- /dev/null
+++ b/youtube_dl/extractor/odatv.py
@@ -0,0 +1,50 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    NO_DEFAULT,
+    remove_start
+)
+
+
+class OdaTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?odatv\.com/(?:mob|vid)_video\.php\?.*\bid=(?P<id>[^&]+)'
+    _TESTS = [{
+        'url': 'http://odatv.com/vid_video.php?id=8E388',
+        'md5': 'dc61d052f205c9bf2da3545691485154',
+        'info_dict': {
+            'id': '8E388',
+            'ext': 'mp4',
+            'title': 'Artık Davutoğlu ile devam edemeyiz'
+        }
+    }, {
+        # mobile URL
+        'url': 'http://odatv.com/mob_video.php?id=8E388',
+        'only_matching': True,
+    }, {
+        # no video
+        'url': 'http://odatv.com/mob_video.php?id=8E900',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        no_video = 'NO VIDEO!' in webpage
+
+        video_url = self._search_regex(
+            r'mp4\s*:\s*(["\'])(?P<url>http.+?)\1', webpage, 'video url',
+            default=None if no_video else NO_DEFAULT, group='url')
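+        # NO_DEFAULT keeps the search fatal when the page is expected to contain a video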
+
+        if no_video:
+            raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': remove_start(self._og_search_title(webpage), 'Video: '),
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
diff --git a/youtube_dl/extractor/onet.py b/youtube_dl/extractor/onet.py
new file mode 100644
index 000000000..fc22ad5eb
--- /dev/null
+++ b/youtube_dl/extractor/onet.py
@@ -0,0 +1,169 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    float_or_none,
+    get_element_by_class,
+    int_or_none,
+    js_to_json,
+    parse_iso8601,
+    remove_start,
+    strip_or_none,
+    url_basename,
+)
+
+
+class OnetBaseIE(InfoExtractor):
+    def _search_mvp_id(self, webpage):
+        return self._search_regex(
+            r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id')
+
+    def _extract_from_id(self, video_id, webpage):
+        response = self._download_json(
+            'http://qi.ckm.onetapi.pl/', video_id,
+            query={
+                'body[id]': video_id,
+                'body[jsonrpc]': '2.0',
+                'body[method]': 'get_asset_detail',
+                'body[params][ID_Publikacji]': video_id,
+                'body[params][Service]': 'www.onet.pl',
+                'content-type': 'application/jsonp',
+                'x-onet-app': 'player.front.onetapi.pl',
+            })
+
+        error = response.get('error')
+        if error:
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, error['message']), expected=True)
+
+        video = response['result'].get('0')
+
+        formats = []
+        for _, formats_dict in video['formats'].items():
+            if not isinstance(formats_dict, dict):
+                continue
+            for format_id, format_list in formats_dict.items():
+                if not isinstance(format_list, list):
+                    continue
+                for f in format_list:
+                    video_url = f.get('url')
+                    if not video_url:
+                        continue
+                    ext = determine_ext(video_url)
+                    if format_id == 'ism':
+                        # TODO: Support Microsoft Smooth Streaming
+                        continue
+                    elif ext == 'mpd':
+                        formats.extend(self._extract_mpd_formats(
+                            video_url, video_id, mpd_id='dash', fatal=False))
+                    else:
+                        formats.append({
+                            'url': video_url,
+                            'format_id': format_id,
+                            'height': int_or_none(f.get('vertical_resolution')),
+                            'width': int_or_none(f.get('horizontal_resolution')),
+                            'abr': float_or_none(f.get('audio_bitrate')),
+                            'vbr': float_or_none(f.get('video_bitrate')),
+                        })
+        self._sort_formats(formats)
+
+        meta = video.get('meta', {})
+
+        title = self._og_search_title(webpage, default=None) or meta['title']
+        description = self._og_search_description(webpage, default=None) or meta.get('description')
+        duration = meta.get('length') or meta.get('lenght')
+        timestamp = parse_iso8601(meta.get('addDate'), ' ')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'timestamp': timestamp,
+            'formats': formats,
+        }
+
+
+class OnetIE(OnetBaseIE):
+    _VALID_URL = 'https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/(?P<display_id>[0-9a-z-]+)/(?P<id>[0-9a-z]+)'
+    IE_NAME = 'onet.tv'
+
+    _TEST = {
+        'url': 'http://onet.tv/k/openerfestival/open-er-festival-2016-najdziwniejsze-wymagania-gwiazd/qbpyqc',
+        'md5': 'e3ffbf47590032ac3f27249204173d50',
+        'info_dict': {
+            'id': 'qbpyqc',
+            'display_id': 'open-er-festival-2016-najdziwniejsze-wymagania-gwiazd',
+            'ext': 'mp4',
+            'title': 'Open\'er Festival 2016: najdziwniejsze wymagania gwiazd',
+            'description': 'Trzy samochody, których nigdy nie użyto, prywatne spa, hotel dekorowany czarnym suknem czy nielegalne używki. Organizatorzy koncertów i festiwali muszą stawać przed nie lada wyzwaniem zapraszając gwia...',
+            'upload_date': '20160705',
+            'timestamp': 1467721580,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id, video_id = mobj.group('display_id', 'id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        mvp_id = self._search_mvp_id(webpage)
+
+        info_dict = self._extract_from_id(mvp_id, webpage)
+        info_dict.update({
+            'id': video_id,
+            'display_id': display_id,
+        })
+
+        return info_dict
+
+
+class OnetChannelIE(OnetBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/(?P<id>[a-z]+)(?:[?#]|$)'
+    IE_NAME = 'onet.tv:channel'
+
+    _TEST = {
+        'url': 'http://onet.tv/k/openerfestival',
+        'info_dict': {
+            'id': 'openerfestival',
+            'title': 'Open\'er Festival Live',
+            'description': 'Dziękujemy, że oglądaliście transmisje. Zobaczcie nasze relacje i wywiady z artystami.',
+        },
+        'playlist_mincount': 46,
+    }
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, channel_id)
+
+        current_clip_info = self._parse_json(self._search_regex(
+            r'var\s+currentClip\s*=\s*({[^}]+})', webpage, 'video info'), channel_id,
+            transform_source=lambda s: js_to_json(re.sub(r'\'\s*\+\s*\'', '', s)))
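+        # currentClip is assembled client-side via JS string concatenation; join the fragments before js_to_json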
+        video_id = remove_start(current_clip_info['ckmId'], 'mvp:')
+        video_name = url_basename(current_clip_info['url'])
+
+        if self._downloader.params.get('noplaylist'):
+            self.to_screen(
+                'Downloading just video %s because of --no-playlist' % video_name)
+            return self._extract_from_id(video_id, webpage)
+
+        self.to_screen(
+            'Downloading channel %s - add --no-playlist to just download video %s' % (
+                channel_id, video_name))
+        matches = re.findall(
+            r'<a[^>]+href=[\'"](https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/[0-9a-z-]+/[0-9a-z]+)',
+            webpage)
+        entries = [
+            self.url_result(video_link, OnetIE.ie_key())
+            for video_link in matches]
+
+        channel_title = strip_or_none(get_element_by_class('o_channelName', webpage))
+        channel_description = strip_or_none(get_element_by_class('o_channelDesc', webpage))
+        return self.playlist_result(entries, channel_id, channel_title, channel_description)
diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py
index d7b13a0f1..6fb1a3fcc 100644
--- a/youtube_dl/extractor/onionstudios.py
+++ b/youtube_dl/extractor/onionstudios.py
@@ -7,6 +7,8 @@ from .common import InfoExtractor
 from ..utils import (
     determine_ext,
     int_or_none,
+    float_or_none,
+    mimetype2ext,
 )
 
 
@@ -15,15 +17,14 @@ class OnionStudiosIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937',
-        'md5': 'd4851405d31adfadf71cd7a487b765bb',
+        'md5': 'e49f947c105b8a78a675a0ee1bddedfe',
         'info_dict': {
             'id': '2937',
             'ext': 'mp4',
             'title': 'Hannibal charges forward, stops for a cocktail',
-            'description': 'md5:e786add7f280b7f0fe237b64cc73df76',
             'thumbnail': 're:^https?://.*\.jpg$',
             'uploader': 'The A.V. Club',
-            'uploader_id': 'TheAVClub',
+            'uploader_id': 'the-av-club',
         },
     }, {
         'url': 'http://www.onionstudios.com/embed?id=2855&autoplay=true',
@@ -40,50 +41,38 @@ class OnionStudiosIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(
-            'http://www.onionstudios.com/embed?id=%s' % video_id, video_id)
+        video_data = self._download_json(
+            'http://www.onionstudios.com/video/%s.json' % video_id, video_id)
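+        # metadata now comes from a plain JSON endpoint instead of scraping the embed page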
+
+        title = video_data['title']
 
         formats = []
-        for src in re.findall(r'<source[^>]+src="([^"]+)"', webpage):
-            ext = determine_ext(src)
+        for source in video_data.get('sources', []):
+            source_url = source.get('url')
+            if not source_url:
+                continue
+            ext = mimetype2ext(source.get('content_type')) or determine_ext(source_url)
             if ext == 'm3u8':
                 formats.extend(self._extract_m3u8_formats(
-                    src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+                    source_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
             else:
-                height = int_or_none(self._search_regex(
-                    r'/(\d+)\.%s' % ext, src, 'height', default=None))
+                tbr = int_or_none(source.get('bitrate'))
                 formats.append({
-                    'format_id': ext + ('-%sp' % height if height else ''),
-                    'url': src,
-                    'height': height,
+                    'format_id': ext + ('-%d' % tbr if tbr else ''),
+                    'url': source_url,
+                    'width': int_or_none(source.get('width')),
+                    'tbr': tbr,
                     'ext': ext,
-                    'preference': 1,
                 })
         self._sort_formats(formats)
 
-        title = self._search_regex(
-            r'share_title\s*=\s*(["\'])(?P<title>[^\1]+?)\1',
-            webpage, 'title', group='title')
-        description = self._search_regex(
-            r'share_description\s*=\s*(["\'])(?P<description>[^\'"]+?)\1',
-            webpage, 'description', default=None, group='description')
-        thumbnail = self._search_regex(
-            r'poster\s*=\s*(["\'])(?P<thumbnail>[^\1]+?)\1',
-            webpage, 'thumbnail', default=False, group='thumbnail')
-
-        uploader_id = self._search_regex(
-            r'twitter_handle\s*=\s*(["\'])(?P<uploader_id>[^\1]+?)\1',
-            webpage, 'uploader id', fatal=False, group='uploader_id')
-        uploader = self._search_regex(
-            r'window\.channelName\s*=\s*(["\'])Embedded:(?P<uploader>[^\1]+?)\1',
-            webpage, 'uploader', default=False, group='uploader')
-
         return {
             'id': video_id,
             'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'uploader': uploader,
-            'uploader_id': uploader_id,
+            'thumbnail': video_data.get('poster_url'),
+            'uploader': video_data.get('channel_name'),
+            'uploader_id': video_data.get('channel_slug'),
+            'duration': float_or_none(video_data.get('duration'), 1000),
+            'tags': video_data.get('tags'),
             'formats': formats,
         }
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py
index 4e3864f0d..6ae30679a 100644
--- a/youtube_dl/extractor/orf.py
+++ b/youtube_dl/extractor/orf.py
@@ -40,16 +40,16 @@ class ORFTVthekIE(InfoExtractor):
         'skip': 'Blocked outside of Austria / Germany',
     }, {
         'url': 'http://tvthek.orf.at/topic/Im-Wandel-der-Zeit/8002126/Best-of-Ingrid-Thurnher/7982256',
-        'playlist': [{
-            'md5': '68f543909aea49d621dfc7703a11cfaf',
-            'info_dict': {
-                'id': '7982259',
-                'ext': 'mp4',
-                'title': 'Best of Ingrid Thurnher',
-                'upload_date': '20140527',
-                'description': 'Viele Jahre war Ingrid Thurnher das "Gesicht" der ZIB 2. Vor ihrem Wechsel zur ZIB 2 im jahr 1995 moderierte sie unter anderem "Land und Leute", "Österreich-Bild" und "Niederösterreich heute".',
-            }
-        }],
+        'info_dict': {
+            'id': '7982259',
+            'ext': 'mp4',
+            'title': 'Best of Ingrid Thurnher',
+            'upload_date': '20140527',
+            'description': 'Viele Jahre war Ingrid Thurnher das "Gesicht" der ZIB 2. Vor ihrem Wechsel zur ZIB 2 im Jahr 1995 moderierte sie unter anderem "Land und Leute", "Österreich-Bild" und "Niederösterreich heute".',
+        },
+        'params': {
+            'skip_download': True,  # rtsp downloads
+        },
         '_skip': 'Blocked outside of Austria / Germany',
     }]
@@ -137,13 +137,16 @@ class ORFOE1IE(InfoExtractor):
     IE_NAME = 'orf:oe1'
     IE_DESC = 'Radio Österreich 1'
-    _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole.*?#\?track_id=)(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole\?.*?\btrack_id=)(?P<id>[0-9]+)'
 
     # Audios on ORF radio are only available for 7 days, so we can't add tests.
-    _TEST = {
+    _TESTS = [{
         'url': 'http://oe1.orf.at/konsole?show=on_demand#?track_id=394211',
         'only_matching': True,
-    }
+    }, {
+        'url': 'http://oe1.orf.at/konsole?show=ondemand&track_id=443608&load_day=/programm/konsole/tag/20160726',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         show_id = self._match_id(url)
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index 81918ac6e..f6f423597 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -516,9 +516,14 @@ class PBSIE(InfoExtractor):
                 # https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications
                 if not bitrate or bitrate not in ('400k', '800k', '1200k', '2500k'):
                     continue
+                f_url = re.sub(r'\d+k|baseline', bitrate, http_url)
+                # This may produce invalid links sometimes (e.g.
+                # http://www.pbs.org/wgbh/frontline/film/suicide-plan)
+                if not self._is_valid_url(f_url, display_id, 'http-%s video' % bitrate):
+                    continue
                 f = m3u8_format.copy()
                 f.update({
-                    'url': re.sub(r'\d+k|baseline', bitrate, http_url),
+                    'url': f_url,
                     'format_id': m3u8_format['format_id'].replace('hls', 'http'),
                     'protocol': 'http',
                 })
diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py
index c23b314e7..75f5884a9 100644
--- a/youtube_dl/extractor/periscope.py
+++ b/youtube_dl/extractor/periscope.py
@@ -120,9 +120,12 @@ class PeriscopeUserIE(InfoExtractor):
 
         title = user.get('display_name') or user.get('username')
         description = user.get('description')
 
+        broadcast_ids = (data_store.get('UserBroadcastHistory', {}).get('broadcastIds') or
+                         data_store.get('BroadcastCache', {}).get('broadcastIds', []))
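+        # newer profile pages keep the ids in BroadcastCache rather than UserBroadcastHistory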
+
         entries = [
             self.url_result(
-                'https://www.periscope.tv/%s/%s' % (user_id, broadcast['id']))
-            for broadcast in data_store.get('UserBroadcastHistory', {}).get('broadcasts', [])]
+                'https://www.periscope.tv/%s/%s' % (user_id, broadcast_id))
+            for broadcast_id in broadcast_ids]
 
         return self.playlist_result(entries, user_id, title, description)
diff --git a/youtube_dl/extractor/pladform.py b/youtube_dl/extractor/pladform.py
index bc559d1df..77e1211d6 100644
--- a/youtube_dl/extractor/pladform.py
+++ b/youtube_dl/extractor/pladform.py
@@ -49,7 +49,7 @@ class PladformIE(InfoExtractor):
     @staticmethod
     def _extract_url(webpage):
         mobj = re.search(
-            r'<iframe[^>]+src="(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)"', webpage)
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1', webpage)
         if mobj:
             return mobj.group('url')
diff --git a/youtube_dl/extractor/playvid.py b/youtube_dl/extractor/playvid.py
index 2eb4fd96d..78d219299 100644
--- a/youtube_dl/extractor/playvid.py
+++ b/youtube_dl/extractor/playvid.py
@@ -15,7 +15,7 @@ from ..utils import (
 class PlayvidIE(InfoExtractor):
     _VALID_URL = r'https?://www\.playvid\.com/watch(\?v=|/)(?P<id>.+?)(?:#|$)'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.playvid.com/watch/RnmBNgtrrJu',
         'md5': 'ffa2f6b2119af359f544388d8c01eb6c',
         'info_dict': {
@@ -24,8 +24,19 @@ class PlayvidIE(InfoExtractor):
             'title': 'md5:9256d01c6317e3f703848b5906880dc8',
             'duration': 82,
             'age_limit': 18,
-        }
-    }
+        },
+        'skip': 'Video removed due to ToS',
+    }, {
+        'url': 'http://www.playvid.com/watch/hwb0GpNkzgH',
+        'md5': '39d49df503ad7b8f23a4432cbf046477',
+        'info_dict': {
+            'id': 'hwb0GpNkzgH',
+            'ext': 'mp4',
+            'title': 'Ellen Euro Cutie Blond Takes a Sexy Survey Get Facial in The Park',
+            'age_limit': 18,
+            'thumbnail': 're:^https?://.*\.jpg$',
+        },
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/polskieradio.py b/youtube_dl/extractor/polskieradio.py
new file mode 100644
index 000000000..f559b899f
--- /dev/null
+++ b/youtube_dl/extractor/polskieradio.py
@@ -0,0 +1,99 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urllib_parse_unquote,
+)
+from ..utils import (
+    int_or_none,
+    strip_or_none,
+    unified_timestamp,
+)
+
+
+class PolskieRadioIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie',
+        'info_dict': {
+            'id': '1587943',
+            'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie',
+            'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5',
+        },
+        'playlist': [{
+            'md5': '2984ee6ce9046d91fc233bc1a864a09a',
+            'info_dict': {
+                'id': '1540576',
+                'ext': 'mp3',
+                'title': 'md5:d4623290d4ac983bf924061c75c23a0d',
+                'timestamp': 1456594200,
+                'upload_date': '20160227',
+                'duration': 2364,
+                'thumbnail': 're:^https?://static\.prsa\.pl/images/.*\.jpg$'
+            },
+        }],
+    }, {
+        'url': 'http://www.polskieradio.pl/265/5217/Artykul/1635803,Euro-2016-nie-ma-miejsca-na-blad-Polacy-graja-ze-Szwajcaria-o-cwiercfinal',
+        'info_dict': {
+            'id': '1635803',
+            'title': 'Euro 2016: nie ma miejsca na błąd. Polacy grają ze Szwajcarią o ćwierćfinał',
+            'description': 'md5:01cb7d0cad58664095d72b51a1ebada2',
+        },
+        'playlist_mincount': 12,
+    }, {
+        'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943',
+        'only_matching': True,
+    }, {
+        # with mp4 video
+        'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        content = self._search_regex(
+            r'(?s)<div[^>]+class="audio atarticle"[^>]*>(.+?)<script>',
+            webpage, 'content')
+
+        timestamp = unified_timestamp(self._html_search_regex(
+            r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>',
+            webpage, 'timestamp', fatal=False))
+
+        thumbnail_url = self._og_search_thumbnail(webpage)
+
+        entries = []
+
+        media_urls = set()
+
+        for data_media in re.findall(r'<[^>]+data-media=({[^>]+})', content):
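+            # each playable item embeds its metadata as a JSON blob in a data-media attribute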
+            media = self._parse_json(data_media, playlist_id, fatal=False)
+            if not media.get('file') or not media.get('desc'):
+                continue
+            media_url = self._proto_relative_url(media['file'], 'http:')
+            if media_url in media_urls:
+                continue
+            media_urls.add(media_url)
+            entries.append({
+                'id': compat_str(media['id']),
+                'url': media_url,
+                'title': compat_urllib_parse_unquote(media['desc']),
+                'duration': int_or_none(media.get('length')),
+                'vcodec': 'none' if media.get('provider') == 'audio' else None,
+                'timestamp': timestamp,
+                'thumbnail': thumbnail_url
+            })
+
+        title = self._og_search_title(webpage).strip()
+        description = strip_or_none(self._og_search_description(webpage))
+
+        return self.playlist_result(entries, playlist_id, title, description)
diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py
index 39b53ecf6..8df12eec0 100644
--- a/youtube_dl/extractor/pornhd.py
+++ b/youtube_dl/extractor/pornhd.py
@@ -1,19 +1,32 @@
 from __future__ import unicode_literals
 
 import re
-import json
 
 from .common import InfoExtractor
 from ..utils import (
+    ExtractorError,
     int_or_none,
     js_to_json,
-    qualities,
 )
 
 
 class PornHdIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?'
-    _TEST = {
+    _TESTS = [{
+        'url': 'http://www.pornhd.com/videos/9864/selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video',
+        'md5': 'c8b964b1f0a4b5f7f28ae3a5c9f86ad5',
+        'info_dict': {
+            'id': '9864',
+            'display_id': 'selfie-restroom-masturbation-fun-with-chubby-cutie-hd-porn-video',
+            'ext': 'mp4',
+            'title': 'Restroom selfie masturbation',
+            'description': 'md5:3748420395e03e31ac96857a8f125b2b',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'view_count': int,
+            'age_limit': 18,
+        }
+    }, {
+        # removed video
         'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
         'md5': '956b8ca569f7f4d8ec563e2c41598441',
         'info_dict': {
@@ -25,8 +38,9 @@ class PornHdIE(InfoExtractor):
             'thumbnail': 're:^https?://.*\.jpg',
             'view_count': int,
             'age_limit': 18,
-        }
-    }
+        },
+        'skip': 'Not available anymore',
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -38,28 +52,38 @@ class PornHdIE(InfoExtractor):
         title = self._html_search_regex(
             [r'<span[^>]+class=["\']video-name["\'][^>]*>([^<]+)',
              r'<title>(.+?) - .*?[Pp]ornHD.*?</title>'], webpage, 'title')
-        description = self._html_search_regex(
-            r'<div class="description">([^<]+)</div>', webpage, 'description', fatal=False)
-        view_count = int_or_none(self._html_search_regex(
-            r'(\d+) views\s*</span>', webpage, 'view count', fatal=False))
-        thumbnail = self._search_regex(
-            r"'poster'\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False)
-        quality = qualities(['sd', 'hd'])
-        sources = json.loads(js_to_json(self._search_regex(
+
+        sources = self._parse_json(js_to_json(self._search_regex(
             r"(?s)'sources'\s*:\s*(\{.+?\})\s*\}[;,)]",
-            webpage, 'sources')))
+            webpage, 'sources', default='{}')), video_id)
+
+        if not sources:
+            message = self._html_search_regex(
+                r'(?s)<(div|p)[^>]+class="no-video"[^>]*>(?P<value>.+?)</\1',
+                webpage, 'error message', group='value')
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True)
+
+        description = self._html_search_regex(
+            r'<(div|p)[^>]+class="description"[^>]*>(?P<value>[^<]+)</\1',
+            webpage, 'description', fatal=False, group='value')
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
 class PornHubIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)'
+    IE_DESC = 'PornHub and Thumbzilla'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)|
+                            (?:www\.)?thumbzilla\.com/video/
+                        )
+                        (?P<id>[0-9a-z]+)
+                    '''
     _TESTS = [{
         'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
         'md5': '1e19b41231a02eba417839222ac9d58e',
@@ -63,8 +71,24 @@ class PornHubIE(InfoExtractor):
         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
         'only_matching': True,
     }, {
+        # removed at the request of cam4.com
         'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
         'only_matching': True,
+    }, {
+        # removed at the request of the copyright owner
+        'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
+        'only_matching': True,
+    }, {
+        # removed by uploader
+        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
+        'only_matching': True,
+    }, {
+        # private video
+        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
+        'only_matching': True,
     }]
 
     @classmethod
@@ -87,8 +111,8 @@ class PornHubIE(InfoExtractor):
         webpage = self._download_webpage(req, video_id)
 
         error_msg = self._html_search_regex(
-            r'(?s)<div class="userMessageSection[^"]*".*?>(.*?)</div>',
-            webpage, 'error message', default=None)
+            r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
+            webpage, 'error message', default=None, group='error')
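+        # matches both the old userMessageSection container and the newer "removed" notice,
+        # wherever the class appears in the quoted attribute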
compat_urllib_parse_urlencode({ - 'access_token': access_token, - 'client_id': client_id, - 'client_location': client_location, - 'client_name': client_name, - })) - - sources = self._download_json(sources_api_url, clip_id, 'Downloading sources JSON') + sources = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id, + clip_id, 'Downloading sources JSON', query={ + 'access_token': access_token, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': client_name, + }) server_id = sources['server_id'] - client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id, - client_location, source_ids_str, g, client_name]) - .encode('utf-8')).hexdigest() - - url_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url?%s' % (clip_id, compat_urllib_parse_urlencode({ - 'access_token': access_token, - 'client_id': client_id, - 'client_location': client_location, - 'client_name': client_name, - 'server_id': server_id, - 'source_ids': source_ids_str, - })) - - urls = self._download_json(url_api_url, clip_id, 'Downloading urls JSON') - title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title') - description = self._html_search_regex(self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False) - thumbnail = self._og_search_thumbnail(webpage) - - upload_date = unified_strdate(self._html_search_regex( - self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None)) - - formats = [] - - urls_sources = urls['sources'] - if isinstance(urls_sources, dict): - urls_sources = urls_sources.values() def fix_bitrate(bitrate): bitrate = int_or_none(bitrate) @@ -298,37 +275,73 @@ class ProSiebenSat1IE(InfoExtractor): return None return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate - for source in urls_sources: - protocol = source['protocol'] - source_url = source['url'] - if protocol == 'rtmp' or protocol == 'rtmpe': - mobj = re.search(r'^(?Prtmpe?://[^/]+)/(?P.+)$', source_url) - if not mobj: + formats = [] + for source_id in source_ids: + client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id, client_location, source_id, g, client_name]).encode('utf-8')).hexdigest() + urls = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id, + clip_id, 'Downloading urls JSON', fatal=False, query={ + 'access_token': access_token, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': client_name, + 'server_id': server_id, + 'source_ids': source_id, + }) + if not urls: + continue + if urls.get('status_code') != 0: + raise ExtractorError('This video is unavailable', expected=True) + urls_sources = urls['sources'] + if isinstance(urls_sources, dict): + urls_sources = urls_sources.values() + for source in urls_sources: + source_url = source.get('url') + if not source_url: continue - path = mobj.group('path') - mp4colon_index = path.rfind('mp4:') - app = path[:mp4colon_index] - play_path = path[mp4colon_index:] - formats.append({ - 'url': '%s/%s' % (mobj.group('url'), app), - 'app': app, - 'play_path': play_path, - 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', - 'page_url': 'http://www.prosieben.de', - 'vbr': fix_bitrate(source['bitrate']), - 'ext': 'mp4', - 'format_id': '%s_%s' % (source['cdn'], source['bitrate']), - }) - elif 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': - formats.extend(self._extract_f4m_formats(source_url, clip_id)) - else: - formats.append({ - 'url': 
source_url, - 'vbr': fix_bitrate(source['bitrate']), - }) - + protocol = source.get('protocol') + mimetype = source.get('mimetype') + if mimetype == 'application/f4m+xml' or 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': + formats.extend(self._extract_f4m_formats( + source_url, clip_id, f4m_id='hds', fatal=False)) + elif mimetype == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats( + source_url, clip_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + tbr = fix_bitrate(source['bitrate']) + if protocol in ('rtmp', 'rtmpe'): + mobj = re.search(r'^(?Prtmpe?://[^/]+)/(?P.+)$', source_url) + if not mobj: + continue + path = mobj.group('path') + mp4colon_index = path.rfind('mp4:') + app = path[:mp4colon_index] + play_path = path[mp4colon_index:] + formats.append({ + 'url': '%s/%s' % (mobj.group('url'), app), + 'app': app, + 'play_path': play_path, + 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', + 'page_url': 'http://www.prosieben.de', + 'tbr': tbr, + 'ext': 'flv', + 'format_id': 'rtmp%s' % ('-%d' % tbr if tbr else ''), + }) + else: + formats.append({ + 'url': source_url, + 'tbr': tbr, + 'format_id': 'http%s' % ('-%d' % tbr if tbr else ''), + }) self._sort_formats(formats) + description = self._html_search_regex( + self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) + upload_date = unified_strdate(self._html_search_regex( + self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None)) + return { 'id': clip_id, 'title': title, diff --git a/youtube_dl/extractor/r7.py b/youtube_dl/extractor/r7.py index 976c8feec..069dbfaed 100644 --- a/youtube_dl/extractor/r7.py +++ b/youtube_dl/extractor/r7.py @@ -2,22 +2,19 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ( - js_to_json, - unescapeHTML, - int_or_none, -) +from ..utils import int_or_none class R7IE(InfoExtractor): - _VALID_URL = r'''(?x)https?:// + _VALID_URL = r'''(?x) + https?:// (?: (?:[a-zA-Z]+)\.r7\.com(?:/[^/]+)+/idmedia/| noticias\.r7\.com(?:/[^/]+)+/[^/]+-| player\.r7\.com/video/i/ ) (?P[\da-f]{24}) - ''' + ''' _TESTS = [{ 'url': 'http://videos.r7.com/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-/idmedia/54e7050b0cf2ff57e0279389.html', 'md5': '403c4e393617e8e8ddc748978ee8efde', @@ -25,6 +22,7 @@ class R7IE(InfoExtractor): 'id': '54e7050b0cf2ff57e0279389', 'ext': 'mp4', 'title': 'Policiais humilham suspeito à beira da morte: "Morre com dignidade"', + 'description': 'md5:01812008664be76a6479aa58ec865b72', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 98, 'like_count': int, @@ -44,45 +42,72 @@ class R7IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'http://player.r7.com/video/i/%s' % video_id, video_id) + video = self._download_json( + 'http://player-api.r7.com/video/i/%s' % video_id, video_id) - item = self._parse_json(js_to_json(self._search_regex( - r'(?s)var\s+item\s*=\s*({.+?});', webpage, 'player')), video_id) - - title = unescapeHTML(item['title']) - thumbnail = item.get('init', {}).get('thumbUri') - duration = None - - statistics = item.get('statistics', {}) - like_count = int_or_none(statistics.get('likes')) - view_count = int_or_none(statistics.get('views')) + title = video['title'] formats = [] - for format_key, format_dict in item['playlist'][0].items(): - src = format_dict.get('src') - if not src: - continue - format_id = 
format_dict.get('format') or format_key - if duration is None: - duration = format_dict.get('duration') - if '.f4m' in src: - formats.extend(self._extract_f4m_formats(src, video_id, preference=-1)) - elif src.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(src, video_id, 'mp4', preference=-2)) - else: - formats.append({ - 'url': src, - 'format_id': format_id, - }) + media_url_hls = video.get('media_url_hls') + if media_url_hls: + formats.extend(self._extract_m3u8_formats( + media_url_hls, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + media_url = video.get('media_url') + if media_url: + f = { + 'url': media_url, + 'format_id': 'http', + } + # m3u8 format always matches the http format, let's copy metadata from + # one to another + m3u8_formats = list(filter( + lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + formats)) + if len(m3u8_formats) == 1: + f_copy = m3u8_formats[0].copy() + f_copy.update(f) + f_copy['protocol'] = 'http' + f = f_copy + formats.append(f) self._sort_formats(formats) + description = video.get('description') + thumbnail = video.get('thumb') + duration = int_or_none(video.get('media_duration')) + like_count = int_or_none(video.get('likes')) + view_count = int_or_none(video.get('views')) + return { 'id': video_id, 'title': title, + 'description': description, 'thumbnail': thumbnail, 'duration': duration, 'like_count': like_count, 'view_count': view_count, 'formats': formats, } + + +class R7ArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:[a-zA-Z]+)\.r7\.com/(?:[^/]+/)+[^/?#&]+-(?P\d+)' + _TEST = { + 'url': 'http://tv.r7.com/record-play/balanco-geral/videos/policiais-humilham-suspeito-a-beira-da-morte-morre-com-dignidade-16102015', + 'only_matching': True, + } + + @classmethod + def suitable(cls, url): + return False if R7IE.suitable(url) else super(R7ArticleIE, cls).suitable(url) + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + r']+(?:id=["\']player-|class=["\']embed["\'][^>]+id=["\'])([\da-f]{24})', + webpage, 'video id') + + return self.url_result('http://player.r7.com/video/i/%s' % video_id, R7IE.ie_key()) diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py index 4f05bbddc..8ec402646 100644 --- a/youtube_dl/extractor/radiocanada.py +++ b/youtube_dl/extractor/radiocanada.py @@ -12,6 +12,7 @@ from ..utils import ( unified_strdate, xpath_element, ExtractorError, + determine_protocol, ) @@ -22,13 +23,13 @@ class RadioCanadaIE(InfoExtractor): 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272', 'info_dict': { 'id': '7184272', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Le parcours du tireur capté sur vidéo', 'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa', 'upload_date': '20141023', }, 'params': { - # rtmp download + # m3u8 download 'skip_download': True, }, } @@ -36,11 +37,14 @@ class RadioCanadaIE(InfoExtractor): def _real_extract(self, url): app_code, video_id = re.match(self._VALID_URL, url).groups() + device_types = ['ipad', 'android'] + if app_code != 'toutv': + device_types.append('flash') + formats = [] - # TODO: extract m3u8 and f4m formats - # m3u8 formats can be extracted using ipad device_type return 403 error code when ffmpeg try to download segements + # TODO: extract f4m formats # f4m formats can be extracted using flashhd device_type but they produce 
unplayable file - for device_type in ('flash',): + for device_type in device_types: v_data = self._download_xml( 'http://api.radio-canada.ca/validationMedia/v1/Validation.ashx', video_id, note='Downloading %s XML' % device_type, query={ @@ -52,7 +56,7 @@ class RadioCanadaIE(InfoExtractor): # paysJ391wsHjbOJwvCs26toz and bypasslock are used to bypass geo-restriction 'paysJ391wsHjbOJwvCs26toz': 'CA', 'bypasslock': 'NZt5K62gRqfc', - }) + }, fatal=False) v_url = xpath_text(v_data, 'url') if not v_url: continue @@ -64,7 +68,8 @@ class RadioCanadaIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( v_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) elif ext == 'f4m': - formats.extend(self._extract_f4m_formats(v_url, video_id, f4m_id='hds', fatal=False)) + formats.extend(self._extract_f4m_formats( + v_url, video_id, f4m_id='hds', fatal=False)) else: ext = determine_ext(v_url) bitrates = xpath_element(v_data, 'bitrates') @@ -72,15 +77,28 @@ class RadioCanadaIE(InfoExtractor): tbr = int_or_none(url_e.get('bitrate')) if not tbr: continue + f_url = re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url) + protocol = determine_protocol({'url': f_url}) formats.append({ - 'format_id': 'rtmp-%d' % tbr, - 'url': re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url), - 'ext': 'flv', - 'protocol': 'rtmp', + 'format_id': '%s-%d' % (protocol, tbr), + 'url': f_url, + 'ext': 'flv' if protocol == 'rtmp' else ext, + 'protocol': protocol, 'width': int_or_none(url_e.get('width')), 'height': int_or_none(url_e.get('height')), 'tbr': tbr, }) + if protocol == 'rtsp': + base_url = self._search_regex( + r'rtsp://([^?]+)', f_url, 'base url', default=None) + if base_url: + base_url = 'http://' + base_url + formats.extend(self._extract_m3u8_formats( + base_url + '/playlist.m3u8', video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + base_url + '/manifest.f4m', video_id, + f4m_id='hds', fatal=False)) self._sort_formats(formats) metadata = self._download_xml( @@ -115,13 +133,13 @@ class RadioCanadaAudioVideoIE(InfoExtractor): 'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam', 'info_dict': { 'id': '7527184', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Barack Obama au Vietnam', 'description': 'Les États-Unis lèvent l\'embargo sur la vente d\'armes qui datait de la guerre du Vietnam', 'upload_date': '20160523', }, 'params': { - # rtmp download + # m3u8 download 'skip_download': True, }, } diff --git a/youtube_dl/extractor/radiojavan.py b/youtube_dl/extractor/radiojavan.py index 884c28420..ec4fa6e60 100644 --- a/youtube_dl/extractor/radiojavan.py +++ b/youtube_dl/extractor/radiojavan.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import( +from ..utils import ( unified_strdate, str_to_int, ) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index e36ce1aa1..dc640b1bc 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -1,47 +1,141 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_urlparse, -) +from ..compat import compat_urlparse from ..utils import ( - ExtractorError, determine_ext, + ExtractorError, + find_xpath_attr, + fix_xml_ampersands, + int_or_none, parse_duration, unified_strdate, - int_or_none, + update_url_query, xpath_text, ) -class RaiTVIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' +class RaiBaseIE(InfoExtractor): + def _extract_relinker_formats(self, relinker_url, video_id): + formats = [] + + for platform in ('mon', 'flash', 'native'): + relinker = self._download_xml( + relinker_url, video_id, + note='Downloading XML metadata for platform %s' % platform, + transform_source=fix_xml_ampersands, + query={'output': 45, 'pl': platform}, + headers=self.geo_verification_headers()) + + media_url = find_xpath_attr(relinker, './url', 'type', 'content').text + if media_url == 'http://download.rai.it/video_no_available.mp4': + self.raise_geo_restricted() + + ext = determine_ext(media_url) + if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'): + continue + + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + media_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'f4m': + manifest_url = update_url_query( + media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'), + {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'}) + formats.extend(self._extract_f4m_formats( + manifest_url, video_id, f4m_id='hds', fatal=False)) + else: + bitrate = int_or_none(xpath_text(relinker, 'bitrate')) + formats.append({ + 'url': media_url, + 'tbr': bitrate if bitrate > 0 else None, + 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', + }) + + return formats + + def _extract_from_content_id(self, content_id, base_url): + media = self._download_json( + 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id, + content_id, 'Downloading video JSON') + + thumbnails = [] + for image_type in ('image', 'image_medium', 'image_300'): + thumbnail_url = media.get(image_type) + if thumbnail_url: + thumbnails.append({ + 'url': compat_urlparse.urljoin(base_url, thumbnail_url), + }) + + formats = [] + media_type = media['type'] + if 'Audio' in media_type: + formats.append({ + 'format_id': media.get('formatoAudio'), + 'url': media['audioUrl'], + 'ext': media.get('formatoAudio'), + }) + elif 'Video' in media_type: + formats.extend(self._extract_relinker_formats(media['mediaUri'], content_id)) + self._sort_formats(formats) + else: + raise ExtractorError('not a media file') + + subtitles = {} + captions = media.get('subtitlesUrl') + if captions: + STL_EXT = '.stl' + SRT_EXT = '.srt' + if captions.endswith(STL_EXT): + captions = captions[:-len(STL_EXT)] + SRT_EXT + subtitles['it'] = [{ + 'ext': 'srt', + 'url': captions, + }] + + return { + 'id': content_id, + 'title': media['name'], + 'description': media.get('desc'), + 'thumbnails': thumbnails, + 'uploader': media.get('author'), + 'upload_date': unified_strdate(media.get('date')), + 'duration': parse_duration(media.get('length')), + 'formats': formats, + 'subtitles': subtitles, + } + + +class RaiTVIE(RaiBaseIE): + _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+(?:media|ondemand)/.+?-(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' _TESTS = [ { 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', - 'md5': '96382709b61dd64a6b88e0f791e6df4c', + 'md5': '8970abf8caf8aef4696e7b1f2adfc696', 'info_dict': { 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Report del 07/04/2014', 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', 'upload_date': '20140407', 
'duration': 6160, + 'thumbnail': 're:^https?://.*\.jpg$', } }, { + # no m3u8 stream 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', - 'md5': 'd9751b78eac9710d62c2447b224dea39', + # HDS download, MD5 is unstable 'info_dict': { 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', 'ext': 'flv', 'title': 'TG PRIMO TEMPO', 'upload_date': '20140612', 'duration': 1758, + 'thumbnail': 're:^https?://.*\.jpg$', }, + 'skip': 'Geo-restricted to Italy', }, { 'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html', @@ -67,127 +161,70 @@ class RaiTVIE(InfoExtractor): }, { 'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html', - 'md5': '496ab63e420574447f70d02578333437', + 'md5': 'e57493e1cb8bc7c564663f363b171847', 'info_dict': { 'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Il Candidato - Primo episodio: "Le Primarie"', 'description': 'md5:364b604f7db50594678f483353164fb8', 'upload_date': '20140923', 'duration': 386, + 'thumbnail': 're:^https?://.*\.jpg$', } }, ] def _real_extract(self, url): video_id = self._match_id(url) - media = self._download_json( - 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % video_id, - video_id, 'Downloading video JSON') - thumbnails = [] - for image_type in ('image', 'image_medium', 'image_300'): - thumbnail_url = media.get(image_type) - if thumbnail_url: - thumbnails.append({ - 'url': thumbnail_url, - }) - - subtitles = [] - formats = [] - media_type = media['type'] - if 'Audio' in media_type: - formats.append({ - 'format_id': media.get('formatoAudio'), - 'url': media['audioUrl'], - 'ext': media.get('formatoAudio'), - }) - elif 'Video' in media_type: - def fix_xml(xml): - return xml.replace(' tag elementi', '').replace('>/', ' 0 else None, - 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', - }) - elif content_type.startswith('image/'): - thumbnails.append({ - 'url': media_url, - }) - - self._sort_formats(formats) - - if has_subtitle: - webpage = self._download_webpage(url, video_id) - subtitles = self._get_subtitles(video_id, webpage) - else: - raise ExtractorError('not a media file') - - return { - 'id': video_id, - 'title': media['name'], - 'description': media.get('desc'), - 'thumbnails': thumbnails, - 'uploader': media.get('author'), - 'upload_date': unified_strdate(media.get('date')), - 'duration': parse_duration(media.get('length')), - 'formats': formats, - 'subtitles': subtitles, - } - - def _get_subtitles(self, video_id, webpage): - subtitles = {} - m = re.search(r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' _TESTS = [ { 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', - 'md5': 'e0e7a8a131e249d1aa0ebf270d1d8db7', + 'md5': '2dd727e61114e1ee9c47f0da6914e178', 'info_dict': { 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Il pacco', 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', 'upload_date': '20141221', }, - } + }, + { + # Direct relinker URL + 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', + # HDS live stream, MD5 is unstable + 'info_dict': { + 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', + 'ext': 'flv', + 'title': 'EuroNews', + }, + 'skip': 'Geo-restricted to 
Italy', + }, + { + # Embedded content item ID + 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', + 'md5': '84c1135ce960e8822ae63cec34441d63', + 'info_dict': { + 'id': '0960e765-62c8-474a-ac4b-7eb3e2be39c8', + 'ext': 'mp4', + 'title': 'TG1 ore 20:00 del 02/07/2016', + 'upload_date': '20160702', + }, + }, + { + 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', + # HDS live stream, MD5 is unstable + 'info_dict': { + 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', + 'ext': 'flv', + 'title': 'La diretta di Rainews24', + }, + }, ] @classmethod @@ -201,7 +238,30 @@ class RaiIE(InfoExtractor): iframe_url = self._search_regex( [r']+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', r'drawMediaRaiTV\(["\'](.+?)["\']'], - webpage, 'iframe') - if not iframe_url.startswith('http'): - iframe_url = compat_urlparse.urljoin(url, iframe_url) - return self.url_result(iframe_url) + webpage, 'iframe', default=None) + if iframe_url: + if not iframe_url.startswith('http'): + iframe_url = compat_urlparse.urljoin(url, iframe_url) + return self.url_result(iframe_url) + + content_item_id = self._search_regex( + r'initEdizione\((?P[\'"])ContentItem-(?P[^\'"]+)(?P=q1)', + webpage, 'content item ID', group='content_id', default=None) + if content_item_id: + return self._extract_from_content_id(content_item_id, url) + + relinker_url = compat_urlparse.urljoin(url, self._search_regex( + r'(?:var\s+videoURL|mediaInfo\.mediaUri)\s*=\s*(?P[\'"])(?P(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)', + webpage, 'relinker URL', group='url')) + formats = self._extract_relinker_formats(relinker_url, video_id) + self._sort_formats(formats) + + title = self._search_regex( + r'var\s+videoTitolo\s*=\s*([\'"])(?P[^\'"]+)\1', + webpage, 'title', group='title', default=None) or self._og_search_title(webpage) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + } diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py index 796adfdf9..bf200ea4d 100644 --- a/youtube_dl/extractor/rds.py +++ b/youtube_dl/extractor/rds.py @@ -1,23 +1,23 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( parse_duration, parse_iso8601, + js_to_json, ) +from ..compat import compat_str class RDSIE(InfoExtractor): IE_DESC = 'RDS.ca' - _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<display_id>[^/]+)-(?P<id>\d+\.\d+)' + _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<id>[^/]+)-\d+\.\d+' _TESTS = [{ 'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799', 'info_dict': { - 'id': '3.1132799', + 'id': '604333', 'display_id': 'fowler-jr-prend-la-direction-de-jacksonville', 'ext': 'mp4', 'title': 'Fowler Jr. 
prend la direction de Jacksonville', @@ -33,22 +33,17 @@ class RDSIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') + display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - # TODO: extract f4m from 9c9media.com - video_url = self._search_regex( - r'<span[^>]+itemprop="contentURL"[^>]+content="([^"]+)"', - webpage, 'video url') - - title = self._og_search_title(webpage) or self._html_search_meta( + item = self._parse_json(self._search_regex(r'(?s)itemToPush\s*=\s*({.+?});', webpage, 'item'), display_id, js_to_json) + video_id = compat_str(item['id']) + title = item.get('title') or self._og_search_title(webpage) or self._html_search_meta( 'title', webpage, 'title', fatal=True) description = self._og_search_description(webpage) or self._html_search_meta( 'description', webpage, 'description') - thumbnail = self._og_search_thumbnail(webpage) or self._search_regex( + thumbnail = item.get('urlImageBig') or self._og_search_thumbnail(webpage) or self._search_regex( [r'<link[^>]+itemprop="thumbnailUrl"[^>]+href="([^"]+)"', r'<span[^>]+itemprop="thumbnailUrl"[^>]+content="([^"]+)"'], webpage, 'thumbnail', fatal=False) @@ -61,13 +56,15 @@ class RDSIE(InfoExtractor): age_limit = self._family_friendly_search(webpage) return { + '_type': 'url_transparent', 'id': video_id, 'display_id': display_id, - 'url': video_url, + 'url': '9c9media:rds_web:%s' % video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'timestamp': timestamp, 'duration': duration, 'age_limit': age_limit, + 'ie_key': 'NineCNineMedia', } diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py new file mode 100644 index 000000000..f5b2f560c --- /dev/null +++ b/youtube_dl/extractor/roosterteeth.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + strip_or_none, + unescapeHTML, + urlencode_postdata, +) + + +class RoosterTeethIE(InfoExtractor): + _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/episode/(?P<id>[^/?#&]+)' + _LOGIN_URL = 'https://roosterteeth.com/login' + _NETRC_MACHINE = 'roosterteeth' + _TESTS = [{ + 'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'md5': 'e2bd7764732d785ef797700a2489f212', + 'info_dict': { + 'id': '26576', + 'display_id': 'million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'ext': 'mp4', + 'title': 'Million Dollars, But...: Million Dollars, But... The Game Announcement', + 'description': 'md5:0cc3b21986d54ed815f5faeccd9a9ca5', + 'thumbnail': 're:^https?://.*\.png$', + 'series': 'Million Dollars, But...', + 'episode': 'Million Dollars, But... 
The Game Announcement', + 'comment_count': int, + }, + }, { + 'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31', + 'only_matching': True, + }, { + 'url': 'http://funhaus.roosterteeth.com/episode/funhaus-shorts-2016-austin-sucks-funhaus-shorts', + 'only_matching': True, + }, { + 'url': 'http://screwattack.roosterteeth.com/episode/death-battle-season-3-mewtwo-vs-shadow', + 'only_matching': True, + }, { + 'url': 'http://theknow.roosterteeth.com/episode/the-know-game-news-season-1-boring-steam-sales-are-better', + 'only_matching': True, + }, { + # only available for FIRST members + 'url': 'http://roosterteeth.com/episode/rt-docs-the-world-s-greatest-head-massage-the-world-s-greatest-head-massage-an-asmr-journey-part-one', + 'only_matching': True, + }] + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, + note='Downloading login page', + errnote='Unable to download login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password, + }) + + login_request = self._download_webpage( + self._LOGIN_URL, None, + note='Logging in as %s' % username, + data=urlencode_postdata(login_form), + headers={ + 'Referer': self._LOGIN_URL, + }) + + if not any(re.search(p, login_request) for p in ( + r'href=["\']https?://(?:www\.)?roosterteeth\.com/logout"', + r'>Sign Out<')): + error = self._html_search_regex( + r'(?s)<div[^>]+class=(["\']).*?\balert-danger\b.*?\1[^>]*>(?:\s*<button[^>]*>.*?</button>)?(?P<error>.+?)</div>', + login_request, 'alert', default=None, group='error') + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + def _real_initialize(self): + self._login() + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + episode = strip_or_none(unescapeHTML(self._search_regex( + (r'videoTitle\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1', + r'<title>(?P<title>[^<]+)'), webpage, 'title', + default=None, group='title'))) + + title = strip_or_none(self._og_search_title( + webpage, default=None)) or episode + + m3u8_url = self._search_regex( + r'file\s*:\s*(["\'])(?Phttp.+?\.m3u8.*?)\1', + webpage, 'm3u8 url', default=None, group='url') + + if not m3u8_url: + if re.search(r']+class=["\']non-sponsor', webpage): + self.raise_login_required( + '%s is only available for FIRST members' % display_id) + + if re.search(r']+class=["\']golive-gate', webpage): + self.raise_login_required('%s is not available yet' % display_id) + + raise ExtractorError('Unable to extract m3u8 URL') + + formats = self._extract_m3u8_formats( + m3u8_url, display_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + description = strip_or_none(self._og_search_description(webpage)) + thumbnail = self._proto_relative_url(self._og_search_thumbnail(webpage)) + + series = self._search_regex( + (r'
<h2>More ([^<]+)</h2>
', r']+>See All ([^<]+) Videos<'), + webpage, 'series', fatal=False) + + comment_count = int_or_none(self._search_regex( + r'>Comments \((\d+)\)<', webpage, + 'comment count', fatal=False)) + + video_id = self._search_regex( + (r'containerId\s*=\s*["\']episode-(\d+)\1', + r'\d+).shtml' + + _TEST = { + 'url': 'http://www.rtve.es/television/20160628/revolucion-del-movil/1364141.shtml', + 'info_dict': { + 'id': '3069778', + 'ext': 'mp4', + 'title': 'Documentos TV - La revolución del móvil', + 'duration': 3496.948, + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + + alacarta_url = self._search_regex( + r'data-location="alacarta_videos"[^<]+url":"(http://www\.rtve\.es/alacarta.+?)&', + webpage, 'alacarta url', default=None) + if alacarta_url is None: + raise ExtractorError( + 'The webpage doesn\'t contain any video', expected=True) + + return self.url_result(alacarta_url, ie=RTVEALaCartaIE.ie_key()) diff --git a/youtube_dl/extractor/rtvnh.py b/youtube_dl/extractor/rtvnh.py index 4896d09d6..f6454c6b0 100644 --- a/youtube_dl/extractor/rtvnh.py +++ b/youtube_dl/extractor/rtvnh.py @@ -9,7 +9,7 @@ class RTVNHIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rtvnh\.nl/video/(?P[0-9]+)' _TEST = { 'url': 'http://www.rtvnh.nl/video/131946', - 'md5': '6e1d0ab079e2a00b6161442d3ceacfc1', + 'md5': 'cdbec9f44550763c8afc96050fa747dc', 'info_dict': { 'id': '131946', 'ext': 'mp4', @@ -29,15 +29,29 @@ class RTVNHIE(InfoExtractor): raise ExtractorError( '%s returned error code %d' % (self.IE_NAME, status), expected=True) - formats = self._extract_smil_formats( - 'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id, fatal=False) + formats = [] + rtmp_formats = self._extract_smil_formats( + 'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id) + formats.extend(rtmp_formats) - for item in meta['source']['fb']: - if item.get('type') == 'hls': - formats.extend(self._extract_m3u8_formats( - item['file'], video_id, ext='mp4', entry_protocol='m3u8_native')) - elif item.get('type') == '': - formats.append({'url': item['file']}) + for rtmp_format in rtmp_formats: + rtmp_url = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) + rtsp_format = rtmp_format.copy() + del rtsp_format['play_path'] + del rtsp_format['ext'] + rtsp_format.update({ + 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), + 'url': rtmp_url.replace('rtmp://', 'rtsp://'), + 'protocol': 'rtsp', + }) + formats.append(rtsp_format) + http_base_url = rtmp_url.replace('rtmp://', 'http://') + formats.extend(self._extract_m3u8_formats( + http_base_url + '/playlist.m3u8', video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + http_base_url + '/manifest.f4m', + video_id, f4m_id='hds', fatal=False)) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/rudo.py b/youtube_dl/extractor/rudo.py new file mode 100644 index 000000000..38366b784 --- /dev/null +++ b/youtube_dl/extractor/rudo.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .jwplatform import JWPlatformBaseIE +from ..utils import ( + js_to_json, + get_element_by_class, + unified_strdate, +) + + +class RudoIE(JWPlatformBaseIE): + _VALID_URL = r'https?://rudo\.video/vod/(?P[0-9a-zA-Z]+)' + + _TEST = { + 'url': 'http://rudo.video/vod/oTzw0MGnyG', + 'md5': '2a03a5b32dd90a04c83b6d391cf7b415', + 'info_dict': { + 'id': 'oTzw0MGnyG', + 'ext': 'mp4', + 
'title': 'Comentario Tomás Mosciatti', + 'upload_date': '20160617', + }, + } + + @classmethod + def _extract_url(self, webpage): + mobj = re.search( + ']+src=(?P[\'"])(?P(?:https?:)?//rudo\.video/vod/[0-9a-zA-Z]+)(?P=q1)', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id, encoding='iso-8859-1') + + jwplayer_data = self._parse_json(self._search_regex( + r'(?s)playerInstance\.setup\(({.+?})\)', webpage, 'jwplayer data'), video_id, + transform_source=lambda s: js_to_json(re.sub(r'encodeURI\([^)]+\)', '""', s))) + + info_dict = self._parse_jwplayer_data( + jwplayer_data, video_id, require_title=False, m3u8_id='hls') + + info_dict.update({ + 'title': self._og_search_title(webpage), + 'upload_date': unified_strdate(get_element_by_class('date', webpage)), + }) + + return info_dict diff --git a/youtube_dl/extractor/sandia.py b/youtube_dl/extractor/sandia.py index 759898a49..96e43af84 100644 --- a/youtube_dl/extractor/sandia.py +++ b/youtube_dl/extractor/sandia.py @@ -1,18 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals -import itertools import json -import re from .common import InfoExtractor -from ..compat import compat_urlparse from ..utils import ( int_or_none, - js_to_json, mimetype2ext, - sanitized_Request, - unified_strdate, ) @@ -27,7 +21,8 @@ class SandiaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Xyce Software Training - Section 1', 'description': 're:(?s)SAND Number: SAND 2013-7800.{200,}', - 'upload_date': '20120904', + 'upload_date': '20120409', + 'timestamp': 1333983600, 'duration': 7794, } } @@ -35,81 +30,36 @@ class SandiaIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - req = sanitized_Request(url) - req.add_header('Cookie', 'MediasitePlayerCaps=ClientPlugins=4') - webpage = self._download_webpage(req, video_id) + presentation_data = self._download_json( + 'http://digitalops.sandia.gov/Mediasite/PlayerService/PlayerService.svc/json/GetPlayerOptions', + video_id, data=json.dumps({ + 'getPlayerOptionsRequest': { + 'ResourceId': video_id, + 'QueryString': '', + } + }), headers={ + 'Content-Type': 'application/json; charset=utf-8', + })['d']['Presentation'] - js_path = self._search_regex( - r'' % video_id, + r'window\.POST_DATA\s*=\s*({.+?});\s*', webpage, 'vine data'), video_id) + data = data[list(data.keys())[0]] + formats = [{ 'format_id': '%(format)s-%(rate)s' % f, 'vcodec': f.get('format'), @@ -109,6 +115,7 @@ class VineIE(InfoExtractor): 'upload_date': unified_strdate(data.get('created')), 'uploader': username, 'uploader_id': data.get('userIdStr'), + 'view_count': int_or_none(data.get('loops', {}).get('count')), 'like_count': int_or_none(data.get('likes', {}).get('count')), 'comment_count': int_or_none(data.get('comments', {}).get('count')), 'repost_count': int_or_none(data.get('reposts', {}).get('count')), diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 79c819bc3..3ee66e23e 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -3,14 +3,21 @@ from __future__ import unicode_literals import re import json +import sys from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( + clean_html, ExtractorError, + get_element_by_class, int_or_none, orderedSet, - sanitized_Request, + parse_duration, + remove_start, str_to_int, unescapeHTML, unified_strdate, @@ -20,26 +27,72 @@ from .vimeo 
import VimeoIE from .pladform import PladformIE -class VKIE(InfoExtractor): +class VKBaseIE(InfoExtractor): + _NETRC_MACHINE = 'vk' + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_page, url_handle = self._download_webpage_handle( + 'https://vk.com', None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'email': username.encode('cp1251'), + 'pass': password.encode('cp1251'), + }) + + # https://new.vk.com/ serves two same remixlhk cookies in Set-Cookie header + # and expects the first one to be set rather than second (see + # https://github.com/rg3/youtube-dl/issues/9841#issuecomment-227871201). + # As of RFC6265 the newer one cookie should be set into cookie store + # what actually happens. + # We will workaround this VK issue by resetting the remixlhk cookie to + # the first one manually. + cookies = url_handle.headers.get('Set-Cookie') + if cookies: + if sys.version_info[0] >= 3: + cookies = cookies.encode('iso-8859-1') + cookies = cookies.decode('utf-8') + remixlhk = re.search(r'remixlhk=(.+?);.*?\bdomain=(.+?)(?:[,;]|$)', cookies) + if remixlhk: + value, domain = remixlhk.groups() + self._set_cookie(domain, 'remixlhk', value) + + login_page = self._download_webpage( + 'https://login.vk.com/?act=login', None, + note='Logging in as %s' % username, + data=urlencode_postdata(login_form)) + + if re.search(r'onLoginFailed', login_page): + raise ExtractorError( + 'Unable to login, incorrect username and/or password', expected=True) + + def _real_initialize(self): + self._login() + + +class VKIE(VKBaseIE): IE_NAME = 'vk' IE_DESC = 'VK' _VALID_URL = r'''(?x) https?:// (?: (?: - (?:m\.)?vk\.com/video_| + (?:(?:m|new)\.)?vk\.com/video_| (?:www\.)?daxab.com/ ) ext\.php\?(?P.*?\boid=(?P-?\d+).*?\bid=(?P\d+).*)| (?: - (?:m\.)?vk\.com/(?:.+?\?.*?z=)?video| + (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?video| (?:www\.)?daxab.com/embed/ ) (?P-?\d+_\d+)(?:.*\blist=(?P[\da-f]+))? 
) ''' - _NETRC_MACHINE = 'vk' - _TESTS = [ { 'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', @@ -182,37 +235,13 @@ class VKIE(InfoExtractor): # pladform embed 'url': 'https://vk.com/video-76116461_171554880', 'only_matching': True, + }, + { + 'url': 'http://new.vk.com/video205387401_165548505', + 'only_matching': True, } ] - def _login(self): - (username, password) = self._get_login_info() - if username is None: - return - - login_page = self._download_webpage( - 'https://vk.com', None, 'Downloading login page') - - login_form = self._hidden_inputs(login_page) - - login_form.update({ - 'email': username.encode('cp1251'), - 'pass': password.encode('cp1251'), - }) - - request = sanitized_Request( - 'https://login.vk.com/?act=login', - urlencode_postdata(login_form)) - login_page = self._download_webpage( - request, None, note='Logging in as %s' % username) - - if re.search(r'onLoginFailed', login_page): - raise ExtractorError( - 'Unable to login, incorrect username and/or password', expected=True) - - def _real_initialize(self): - self._login() - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') @@ -336,10 +365,10 @@ class VKIE(InfoExtractor): } -class VKUserVideosIE(InfoExtractor): +class VKUserVideosIE(VKBaseIE): IE_NAME = 'vk:uservideos' IE_DESC = "VK - User's Videos" - _VALID_URL = r'https?://vk\.com/videos(?P-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)' + _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/videos(?P-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)' _TEMPLATE_URL = 'https://vk.com/videos' _TESTS = [{ 'url': 'http://vk.com/videos205387401', @@ -354,6 +383,12 @@ class VKUserVideosIE(InfoExtractor): }, { 'url': 'http://vk.com/videos-97664626?section=all', 'only_matching': True, + }, { + 'url': 'http://m.vk.com/videos205387401', + 'only_matching': True, + }, { + 'url': 'http://new.vk.com/videos205387401', + 'only_matching': True, }] def _real_extract(self, url): @@ -371,3 +406,121 @@ class VKUserVideosIE(InfoExtractor): webpage, 'title', default=page_id)) return self.playlist_result(entries, page_id, title) + + +class VKWallPostIE(VKBaseIE): + IE_NAME = 'vk:wallpost' + _VALID_URL = r'https?://(?:(?:(?:(?:m|new)\.)?vk\.com/(?:[^?]+\?.*\bw=)?wall(?P-?\d+_\d+)))' + _TESTS = [{ + # public page URL, audio playlist + 'url': 'https://vk.com/bs.official?w=wall-23538238_35', + 'info_dict': { + 'id': '23538238_35', + 'title': 'Black Shadow - Wall post 23538238_35', + 'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c', + }, + 'playlist': [{ + 'md5': '5ba93864ec5b85f7ce19a9af4af080f6', + 'info_dict': { + 'id': '135220665_111806521', + 'ext': 'mp3', + 'title': 'Black Shadow - Слепое Верование', + 'duration': 370, + 'uploader': 'Black Shadow', + 'artist': 'Black Shadow', + 'track': 'Слепое Верование', + }, + }, { + 'md5': '4cc7e804579122b17ea95af7834c9233', + 'info_dict': { + 'id': '135220665_111802303', + 'ext': 'mp3', + 'title': 'Black Shadow - Война - Негасимое Бездны Пламя!', + 'duration': 423, + 'uploader': 'Black Shadow', + 'artist': 'Black Shadow', + 'track': 'Война - Негасимое Бездны Пламя!', + }, + 'params': { + 'skip_download': True, + }, + }], + 'skip': 'Requires vk account credentials', + }, { + # single YouTube embed, no leading - + 'url': 'https://vk.com/wall85155021_6319', + 'info_dict': { + 'id': '85155021_6319', + 'title': 'Sergey Gorbunov - Wall post 85155021_6319', + }, + 'playlist_count': 1, + 'skip': 'Requires vk account credentials', + }, { + # wall page URL + 'url': 'https://vk.com/wall-23538238_35', + 
'only_matching': True, + }, { + # mobile wall page URL + 'url': 'https://m.vk.com/wall-23538238_35', + 'only_matching': True, + }] + + def _real_extract(self, url): + post_id = self._match_id(url) + + wall_url = 'https://vk.com/wall%s' % post_id + + post_id = remove_start(post_id, '-') + + webpage = self._download_webpage(wall_url, post_id) + + error = self._html_search_regex( + r'>Error\s*]+class=["\']body["\'][^>]*>([^<]+)', + webpage, 'error', default=None) + if error: + raise ExtractorError('VK said: %s' % error, expected=True) + + description = clean_html(get_element_by_class('wall_post_text', webpage)) + uploader = clean_html(get_element_by_class( + 'fw_post_author', webpage)) or self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + + entries = [] + + for audio in re.finditer(r'''(?sx) + ]+ + id=(?P["\'])audio_info(?P\d+_\d+).*?(?P=q1)[^>]+ + value=(?P["\'])(?Phttp.+?)(?P=q2) + .+? + ''', webpage): + audio_html = audio.group(0) + audio_id = audio.group('id') + duration = parse_duration(get_element_by_class('duration', audio_html)) + track = self._html_search_regex( + r']+id=["\']title%s[^>]*>([^<]+)' % audio_id, + audio_html, 'title', default=None) + artist = self._html_search_regex( + r'>([^<]+)\s*&ndash', audio_html, + 'artist', default=None) + entries.append({ + 'id': audio_id, + 'url': audio.group('url'), + 'title': '%s - %s' % (artist, track) if artist and track else audio_id, + 'thumbnail': thumbnail, + 'duration': duration, + 'uploader': uploader, + 'artist': artist, + 'track': track, + }) + + for video in re.finditer( + r']+href=(["\'])(?P/video(?:-?[\d_]+).*?)\1', webpage): + entries.append(self.url_result( + compat_urlparse.urljoin(url, video.group('url')), VKIE.ie_key())) + + title = 'Wall post %s' % post_id + + return self.playlist_result( + orderedSet(entries), post_id, + '%s - %s' % (uploader, title) if uploader else title, + description) diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py index 8e35f24e8..bec7ab327 100644 --- a/youtube_dl/extractor/vrt.py +++ b/youtube_dl/extractor/vrt.py @@ -25,7 +25,8 @@ class VRTIE(InfoExtractor): 'timestamp': 1414271750.949, 'upload_date': '20141025', 'duration': 929, - } + }, + 'skip': 'HTTP Error 404: Not Found', }, # sporza.be { @@ -39,7 +40,8 @@ class VRTIE(InfoExtractor): 'timestamp': 1413835980.560, 'upload_date': '20141020', 'duration': 3238, - } + }, + 'skip': 'HTTP Error 404: Not Found', }, # cobra.be { @@ -53,16 +55,39 @@ class VRTIE(InfoExtractor): 'timestamp': 1413967500.494, 'upload_date': '20141022', 'duration': 661, - } + }, + 'skip': 'HTTP Error 404: Not Found', }, { # YouTube video 'url': 'http://deredactie.be/cm/vrtnieuws/videozone/nieuws/cultuurenmedia/1.2622957', - 'only_matching': True, + 'md5': 'b8b93da1df1cea6c8556255a796b7d61', + 'info_dict': { + 'id': 'Wji-BZ0oCwg', + 'ext': 'mp4', + 'title': 'ROGUE ONE: A STAR WARS STORY Official Teaser Trailer', + 'description': 'md5:8e468944dce15567a786a67f74262583', + 'uploader': 'Star Wars', + 'uploader_id': 'starwars', + 'upload_date': '20160407', + }, + 'add_ie': ['Youtube'], }, { 'url': 'http://cobra.canvas.be/cm/cobra/videozone/rubriek/film-videozone/1.2377055', - 'only_matching': True, + 'md5': '', + 'info_dict': { + 'id': '2377055', + 'ext': 'mp4', + 'title': 'Cafe Derby', + 'description': 'Lenny Van Wesemael debuteert met de langspeelfilm Café Derby. 
Een waar gebeurd maar ook verzonnen verhaal.', + 'upload_date': '20150626', + 'timestamp': 1435305240.769, + }, + 'params': { + # m3u8 download + 'skip_download': True, + } } ] @@ -98,6 +123,32 @@ class VRTIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( src, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + src.replace('playlist.m3u8', 'manifest.f4m'), + video_id, f4m_id='hds', fatal=False)) + if 'data-video-geoblocking="true"' not in webpage: + rtmp_formats = self._extract_smil_formats( + src.replace('playlist.m3u8', 'jwplayer.smil'), + video_id, fatal=False) + formats.extend(rtmp_formats) + for rtmp_format in rtmp_formats: + rtmp_format_c = rtmp_format.copy() + rtmp_format_c['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) + del rtmp_format_c['play_path'] + del rtmp_format_c['ext'] + http_format = rtmp_format_c.copy() + http_format.update({ + 'url': rtmp_format_c['url'].replace('rtmp://', 'http://').replace('vod.', 'download.').replace('/_definst_/', '/').replace('mp4:', ''), + 'format_id': rtmp_format['format_id'].replace('rtmp', 'http'), + 'protocol': 'http', + }) + rtsp_format = rtmp_format_c.copy() + rtsp_format.update({ + 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'), + 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), + 'protocol': 'rtsp', + }) + formats.extend([http_format, rtsp_format]) else: formats.extend(self._extract_f4m_formats( '%s/manifest.f4m' % src, video_id, f4m_id='hds', fatal=False)) diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py index eaa888f00..b73da5cd0 100644 --- a/youtube_dl/extractor/vuclip.py +++ b/youtube_dl/extractor/vuclip.py @@ -9,7 +9,7 @@ from ..compat import ( from ..utils import ( ExtractorError, parse_duration, - qualities, + remove_end, ) @@ -22,7 +22,7 @@ class VuClipIE(InfoExtractor): 'id': '922692425', 'ext': '3gp', 'title': 'The Toy Soldiers - Hollywood Movie Trailer', - 'duration': 180, + 'duration': 177, } } @@ -46,34 +46,21 @@ class VuClipIE(InfoExtractor): '%s said: %s' % (self.IE_NAME, error_msg), expected=True) # These clowns alternate between two page types - links_code = self._search_regex( - r'''(?xs) - (?: - | - \s* - ) - (.*?) 
- (?: - - ) - ''', webpage, 'links') - title = self._html_search_regex( - r'(.*?)-\s*Vuclip', webpage, 'title').strip() + video_url = self._search_regex( + r']+href="([^"]+)"[^>]*>]+src="[^"]*/play\.gif', + webpage, 'video URL', default=None) + if video_url: + formats = [{ + 'url': video_url, + }] + else: + formats = self._parse_html5_media_entries(url, webpage)[0]['formats'] - quality_order = qualities(['Reg', 'Hi']) - formats = [] - for url, q in re.findall( - r'[^"]+)".*?>(?:]*>)?(?P[^<]+)(?:)?', links_code): - format_id = compat_urllib_parse_urlparse(url).scheme + '-' + q - formats.append({ - 'format_id': format_id, - 'url': url, - 'quality': quality_order(q), - }) - self._sort_formats(formats) + title = remove_end(self._html_search_regex( + r'(.*?)-\s*Vuclip', webpage, 'title').strip(), ' - Video') - duration = parse_duration(self._search_regex( - r'\(([0-9:]+)\)', webpage, 'duration', fatal=False)) + duration = parse_duration(self._html_search_regex( + r'[(>]([0-9]+:[0-9]+)(?:]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"', + webpage, 'media link', default=None, flags=re.MULTILINE) + + if not json_metadata: + return + + media_link_obj = self._parse_json(json_metadata, display_id, + transform_source=js_to_json) + jsonp_url = media_link_obj['mediaObj']['url'] + + metadata = self._download_json( + jsonp_url, 'metadata', transform_source=strip_jsonp) + + metadata_tracker_data = metadata['trackerData'] + metadata_media_resource = metadata['mediaResource'] + + formats = [] + + # check if the metadata contains a direct URL to a file + for kind, media_resource in metadata_media_resource.items(): + if kind not in ('dflt', 'alt'): + continue + + for tag_name, medium_url in media_resource.items(): + if tag_name not in ('videoURL', 'audioURL'): + continue + + ext = determine_ext(medium_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + medium_url, display_id, 'mp4', 'm3u8_native', + m3u8_id='hls')) + elif ext == 'f4m': + manifest_url = update_url_query( + medium_url, {'hdcore': '3.2.0', 'plugin': 'aasp-3.2.0.77.18'}) + formats.extend(self._extract_f4m_formats( + manifest_url, display_id, f4m_id='hds', fatal=False)) + elif ext == 'smil': + formats.extend(self._extract_smil_formats( + medium_url, 'stream', fatal=False)) + else: + a_format = { + 'url': medium_url + } + if ext == 'unknown_video': + urlh = self._request_webpage( + medium_url, display_id, note='Determining extension') + ext = urlhandle_detect_ext(urlh) + a_format['ext'] = ext + formats.append(a_format) + + self._sort_formats(formats) + + subtitles = {} + caption_url = metadata_media_resource.get('captionURL') + if caption_url: + subtitles['de'] = [{ + 'url': caption_url, + 'ext': 'ttml', + }] + + title = metadata_tracker_data['trackerClipTitle'] + + return { + 'id': metadata_tracker_data.get('trackerClipId', display_id), + 'display_id': display_id, + 'title': title, + 'alt_title': metadata_tracker_data.get('trackerClipSubcategory'), + 'formats': formats, + 'subtitles': subtitles, + 'upload_date': unified_strdate(metadata_tracker_data.get('trackerClipAirTime')), + } + + +class WDRIE(WDRBaseIE): _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5' _PAGE_REGEX = r'/(?:mediathek/)?[^/]+/(?P[^/]+)/(?P.+)\.html' _VALID_URL = r'(?Phttps?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL @@ -91,10 +171,10 @@ class WDRIE(InfoExtractor): }, { 'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/achterbahn.php5', - # HDS download, MD5 is unstable + 'md5': 
'803138901f6368ee497b4d195bb164f2', 'info_dict': { 'id': 'mdb-186083', - 'ext': 'flv', + 'ext': 'mp4', 'upload_date': '20130919', 'title': 'Sachgeschichte - Achterbahn ', 'description': '- Die Sendung mit der Maus -', @@ -120,14 +200,9 @@ class WDRIE(InfoExtractor): display_id = mobj.group('display_id') webpage = self._download_webpage(url, display_id) - # for wdr.de the data-extension is in a tag with the class "mediaLink" - # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn" - # for wdrmaus its in a link to the page in a multiline "videoLink"-tag - json_metadata = self._html_search_regex( - r'class=(?:"(?:mediaLink|wdrrPlayerPlayBtn)\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"', - webpage, 'media link', default=None, flags=re.MULTILINE) + info_dict = self._extract_wdr_video(webpage, display_id) - if not json_metadata: + if not info_dict: entries = [ self.url_result(page_url + href[0], 'WDR') for href in re.findall( @@ -140,86 +215,22 @@ class WDRIE(InfoExtractor): raise ExtractorError('No downloadable streams found', expected=True) - media_link_obj = self._parse_json(json_metadata, display_id, - transform_source=js_to_json) - jsonp_url = media_link_obj['mediaObj']['url'] - - metadata = self._download_json( - jsonp_url, 'metadata', transform_source=strip_jsonp) - - metadata_tracker_data = metadata['trackerData'] - metadata_media_resource = metadata['mediaResource'] - - formats = [] - - # check if the metadata contains a direct URL to a file - for kind, media_resource in metadata_media_resource.items(): - if kind not in ('dflt', 'alt'): - continue - - for tag_name, medium_url in media_resource.items(): - if tag_name not in ('videoURL', 'audioURL'): - continue - - ext = determine_ext(medium_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - medium_url, display_id, 'mp4', 'm3u8_native', - m3u8_id='hls')) - elif ext == 'f4m': - manifest_url = update_url_query( - medium_url, {'hdcore': '3.2.0', 'plugin': 'aasp-3.2.0.77.18'}) - formats.extend(self._extract_f4m_formats( - manifest_url, display_id, f4m_id='hds', fatal=False)) - elif ext == 'smil': - formats.extend(self._extract_smil_formats( - medium_url, 'stream', fatal=False)) - else: - a_format = { - 'url': medium_url - } - if ext == 'unknown_video': - urlh = self._request_webpage( - medium_url, display_id, note='Determining extension') - ext = urlhandle_detect_ext(urlh) - a_format['ext'] = ext - formats.append(a_format) - - self._sort_formats(formats) - - subtitles = {} - caption_url = metadata_media_resource.get('captionURL') - if caption_url: - subtitles['de'] = [{ - 'url': caption_url, - 'ext': 'ttml', - }] - - title = metadata_tracker_data.get('trackerClipTitle') is_live = url_type == 'live' if is_live: - title = self._live_title(title) - upload_date = None - elif 'trackerClipAirTime' in metadata_tracker_data: - upload_date = metadata_tracker_data['trackerClipAirTime'] - else: - upload_date = self._html_search_meta('DC.Date', webpage, 'upload date') + info_dict.update({ + 'title': self._live_title(info_dict['title']), + 'upload_date': None, + }) + elif 'upload_date' not in info_dict: + info_dict['upload_date'] = unified_strdate(self._html_search_meta('DC.Date', webpage, 'upload date')) - if upload_date: - upload_date = unified_strdate(upload_date) - - return { - 'id': metadata_tracker_data.get('trackerClipId', display_id), - 'display_id': display_id, - 'title': title, - 'alt_title': metadata_tracker_data.get('trackerClipSubcategory'), - 'formats': formats, - 
'upload_date': upload_date,
+        info_dict.update({
             'description': self._html_search_meta('Description', webpage),
             'is_live': is_live,
-            'subtitles': subtitles,
-        }
+        })
+
+        return info_dict
 
 
 class WDRMobileIE(InfoExtractor):
diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py
index 5a41f8ffa..bcb140305 100644
--- a/youtube_dl/extractor/xnxx.py
+++ b/youtube_dl/extractor/xnxx.py
@@ -6,17 +6,23 @@ from ..compat import compat_urllib_parse_unquote
 
 
 class XNXXIE(InfoExtractor):
-    _VALID_URL = r'^https?://(?:video|www)\.xnxx\.com/video(?P<id>[0-9]+)/(.*)'
-    _TEST = {
-        'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_',
-        'md5': '0831677e2b4761795f68d417e0b7b445',
+    _VALID_URL = r'https?://(?:video|www)\.xnxx\.com/video-?(?P<id>[0-9a-z]+)/'
+    _TESTS = [{
+        'url': 'http://www.xnxx.com/video-55awb78/skyrim_test_video',
+        'md5': 'ef7ecee5af78f8b03dca2cf31341d3a0',
         'info_dict': {
-            'id': '1135332',
+            'id': '55awb78',
             'ext': 'flv',
-            'title': 'lida » Naked Funny Actress (5)',
+            'title': 'Skyrim Test Video',
             'age_limit': 18,
-        }
-    }
+        },
+    }, {
+        'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.xnxx.com/video-55awb78/',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py
index 4075b8a4f..83bc1fef2 100644
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -4,17 +4,23 @@ import itertools
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
 from ..utils import (
     int_or_none,
     orderedSet,
+    parse_duration,
     sanitized_Request,
     str_to_int,
 )
 
 
 class XTubeIE(InfoExtractor):
-    _VALID_URL = r'(?:xtube:|https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?P<display_id>[^/]+)-))(?P<id>[^/?&#]+)'
+    _VALID_URL = r'''(?x)
+                        (?:
+                            xtube:|
+                            https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?P<display_id>[^/]+)-)
+                        )
+                        (?P<id>[^/?&#]+)
+                    '''
 
     _TESTS = [{
         # old URL schema
@@ -27,6 +33,8 @@ class XTubeIE(InfoExtractor):
             'description': 'contains:an ET kind of thing',
             'uploader': 'greenshowers',
             'duration': 450,
+            'view_count': int,
+            'comment_count': int,
             'age_limit': 18,
         }
     }, {
@@ -51,21 +59,30 @@
         req.add_header('Cookie', 'age_verified=1; cookiesAccepted=1')
         webpage = self._download_webpage(req, display_id)
 
-        flashvars = self._parse_json(
-            self._search_regex(
-                r'xt\.playerOps\s*=\s*({.+?});', webpage, 'player ops'),
-            video_id)['flashvars']
+        sources = self._parse_json(self._search_regex(
+            r'sources\s*:\s*({.+?}),', webpage, 'sources'), video_id)
 
-        title = flashvars.get('title') or self._search_regex(
-            r'<h1>([^<]+)</h1>', webpage, 'title')
-        video_url = compat_urllib_parse_unquote(flashvars['video_url'])
-        duration = int_or_none(flashvars.get('video_duration'))
+        formats = []
+        for format_id, format_url in sources.items():
+            formats.append({
+                'url': format_url,
+                'format_id': format_id,
+                'height': int_or_none(format_id),
+            })
+        self._sort_formats(formats)
 
-        uploader = self._search_regex(
-            r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"',
-            webpage, 'uploader', fatal=False)
+        title = self._search_regex(
+            (r'<h1>(?P<title>[^<]+)</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'),
+            webpage, 'title', group='title')
         description = self._search_regex(
             r'</h1>\s*<p>([^<]+)', webpage, 'description', fatal=False)
+        uploader = self._search_regex(
+            (r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"',
+             r'<span[^>]+class="nickname"[^>]*>([^<]+)'),
+            webpage, 'uploader', fatal=False)
+        duration = parse_duration(self._search_regex(
+            r'<dt>Runtime:</dt>\s*<dd>([^<]+)</dd>',
+            webpage, 'duration', fatal=False))
         view_count = str_to_int(self._search_regex(
             r'<dt>Views:</dt>\s*<dd>([\d,\.]+)</dd>',
             webpage, 'view count', fatal=False))
@@ -76,7 +93,6 @@ class XTubeIE(InfoExtractor):
         return {
             'id': video_id,
             'display_id': display_id,
-            'url': video_url,
             'title': title,
             'description': description,
             'uploader': uploader,
@@ -84,6 +100,7 @@ class XTubeIE(InfoExtractor):
             'view_count': view_count,
             'comment_count': comment_count,
             'age_limit': 18,
+            'formats': formats,
         }
diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py
index 0be8932ad..a66daee46 100644
--- a/youtube_dl/extractor/xuite.py
+++ b/youtube_dl/extractor/xuite.py
@@ -67,6 +67,20 @@ class XuiteIE(InfoExtractor):
             'categories': ['電玩動漫'],
         },
         'skip': 'Video removed',
+    }, {
+        # Video with encoded media id
+        # from http://forgetfulbc.blogspot.com/2016/06/date.html
+        'url': 'http://vlog.xuite.net/embed/cE1xbENoLTI3NDQ3MzM2LmZsdg==?ar=0&as=0',
+        'info_dict': {
+            'id': 'cE1xbENoLTI3NDQ3MzM2LmZsdg==',
+            'ext': 'mp4',
+            'title': '男女平權只是口號?專家解釋約會時男生是否該幫女生付錢 (中字)',
+            'description': 'md5:f0abdcb69df300f522a5442ef3146f2a',
+            'timestamp': 1466160960,
+            'upload_date': '20160617',
+            'uploader': 'B.C. & Lowy',
+            'uploader_id': '232279340',
+        },
     }, {
         'url': 'http://vlog.xuite.net/play/S1dDUjdyLTMyOTc3NjcuZmx2/%E5%AD%AB%E7%87%95%E5%A7%BF-%E7%9C%BC%E6%B7%9A%E6%88%90%E8%A9%A9',
         'only_matching': True,
@@ -80,10 +94,9 @@
     def base64_encode_utf8(data):
         return base64.b64encode(data.encode('utf-8')).decode('utf-8')
 
-    def _extract_flv_config(self, media_id):
-        base64_media_id = self.base64_encode_utf8(media_id)
+    def _extract_flv_config(self, encoded_media_id):
         flv_config = self._download_xml(
-            'http://vlog.xuite.net/flash/player?media=%s' % base64_media_id,
+            'http://vlog.xuite.net/flash/player?media=%s' % encoded_media_id,
             'flv config')
         prop_dict = {}
         for prop in flv_config.findall('./property'):
@@ -108,9 +121,14 @@
                 '%s returned error: %s' % (self.IE_NAME, error_msg),
                 expected=True)
 
-        video_id = self._html_search_regex(
-            r'data-mediaid="(\d+)"', webpage, 'media id')
-        flv_config = self._extract_flv_config(video_id)
+        encoded_media_id = self._search_regex(
+            r'attributes\.name\s*=\s*"([^"]+)"', webpage,
+            'encoded media id', default=None)
+        if encoded_media_id is None:
+            video_id = self._html_search_regex(
+                r'data-mediaid="(\d+)"', webpage, 'media id')
+            encoded_media_id = self.base64_encode_utf8(video_id)
+        flv_config = self._extract_flv_config(encoded_media_id)
 
         FORMATS = {
             'audio': 'mp3',
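The Xuite hunk above now prefers a pre-encoded media id taken straight from the player markup (`attributes.name`) and only falls back to base64-encoding the numeric `data-mediaid`, reusing the extractor's own `base64_encode_utf8` helper. A minimal sketch of that fallback path; the media id value here is made up for illustration:

```python
import base64


def base64_encode_utf8(data):
    # Same logic as XuiteIE.base64_encode_utf8 in the hunk above.
    return base64.b64encode(data.encode('utf-8')).decode('utf-8')


media_id = '27447336'  # hypothetical value scraped from data-mediaid="..."
config_url = 'http://vlog.xuite.net/flash/player?media=%s' % base64_encode_utf8(media_id)
print(config_url)  # the player config endpoint queried by _extract_flv_config()
```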
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index 927a964a4..b0679dfb7 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -19,6 +19,7 @@ from ..utils import (
     mimetype2ext,
 )
 
+from .brightcove import BrightcoveNewIE
 from .nbc import NBCSportsVPlayerIE
 
 
@@ -227,7 +228,12 @@ class YahooIE(InfoExtractor):
         # Look for NBCSports iframes
         nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
         if nbc_sports_url:
-            return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
+            return self.url_result(nbc_sports_url, NBCSportsVPlayerIE.ie_key())
+
+        # Look for Brightcove New Studio embeds
+        bc_url = BrightcoveNewIE._extract_url(webpage)
+        if bc_url:
+            return self.url_result(bc_url, BrightcoveNewIE.ie_key())
 
         # Query result is often embedded in webpage as JSON. Sometimes explicit requests
         # to video API results in a failure with geo restriction reason therefore using
diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py
index 4150b28da..31e2f9263 100644
--- a/youtube_dl/extractor/youjizz.py
+++ b/youtube_dl/extractor/youjizz.py
@@ -9,8 +9,8 @@ from ..utils import (
 
 
 class YouJizzIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/[^/#?]+-(?P<id>[0-9]+)\.html(?:$|[?#])'
-    _TEST = {
+    _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]+)?-(?P<id>[0-9]+)\.html(?:$|[?#])'
+    _TESTS = [{
         'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html',
         'md5': '07e15fa469ba384c7693fd246905547c',
         'info_dict': {
@@ -19,7 +19,10 @@
             'title': 'Zeichentrick 1',
             'age_limit': 18,
         }
-    }
+    }, {
+        'url': 'http://www.youjizz.com/videos/-2189178.html',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py
index 147608ebe..e37f237c7 100644
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -16,7 +16,6 @@ from ..compat import (
 from ..utils import (
     ExtractorError,
     get_element_by_attribute,
-    sanitized_Request,
 )
 
 
@@ -218,14 +217,10 @@ class YoukuIE(InfoExtractor):
         headers = {
             'Referer': req_url,
         }
+        headers.update(self.geo_verification_headers())
         self._set_cookie('youku.com', 'xreferrer', 'http://www.youku.com')
-        req = sanitized_Request(req_url, headers=headers)
 
-        cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
-        if cn_verification_proxy:
-            req.add_header('Ytdl-request-proxy', cn_verification_proxy)
-
-        raw_data = self._download_json(req, video_id, note=note)
+        raw_data = self._download_json(req_url, video_id, note=note, headers=headers)
 
         return raw_data['data']
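The Youku hunk drops the hand-rolled `Ytdl-request-proxy` header in favour of `geo_verification_headers()`. That helper lives in `youtube_dl/extractor/common.py` and is not part of this diff; as of this release it reduces to roughly the following sketch:

```python
def geo_verification_headers(self):
    # Sketch of the InfoExtractor helper assumed by the Youku change: it maps
    # the new --geo-verification-proxy option onto the internal
    # Ytdl-request-proxy header that the downloader understands.
    headers = {}
    geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
    if geo_verification_proxy:
        headers['Ytdl-request-proxy'] = geo_verification_proxy
    return headers
```

Extractors then merge these headers into individual requests, as `_download_json(req_url, ..., headers=headers)` does above.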
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 00dd602ff..268080ba6 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -53,6 +53,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
     """Provide base functions for Youtube extractors"""
     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
     _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
+    _PASSWORD_CHALLENGE_URL = 'https://accounts.google.com/signin/challenge/sl/password'
     _NETRC_MACHINE = 'youtube'
     # If True it will raise an error if no login info is provided
     _LOGIN_REQUIRED = False
@@ -116,12 +117,10 @@
             'hl': 'en_US',
         }
 
-        login_data = urlencode_postdata(login_form_strs)
-
-        req = sanitized_Request(self._LOGIN_URL, login_data)
         login_results = self._download_webpage(
-            req, None,
-            note='Logging in', errnote='unable to log in', fatal=False)
+            self._PASSWORD_CHALLENGE_URL, None,
+            note='Logging in', errnote='unable to log in', fatal=False,
+            data=urlencode_postdata(login_form_strs))
         if login_results is False:
             return False
@@ -137,7 +136,7 @@
 
         # Two-Factor
         # TODO add SMS and phone call support - these require making a request and then prompting the user
-        if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None:
+        if re.search(r'(?i)<form[^>]+id="challenge"', login_results) is not None:
             tfa_code = self._get_tfa_info('2-step verification code')
 
             if not tfa_code:
@@ -165,17 +164,17 @@
         if tfa_results is False:
             return False
 
-        if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None:
+        if re.search(r'(?i)<form[^>]+id="challenge"', tfa_results) is not None:
             self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
             return False
-        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
+        if re.search(r'(?i)<form[^>]+id="gaia_loginform"', tfa_results) is not None:
             self._downloader.report_warning('unable to log in - did the page structure change?')
             return False
         if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
             self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
             return False
 
-        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
+        if re.search(r'(?i)<form[^>]+id="gaia_loginform"', login_results) is not None:
             self._downloader.report_warning('unable to log in: bad username or password')
             return False
         return True
@@ -501,6 +500,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'youtube_include_dash_manifest': True,
                 'format': '141',
             },
+            'skip': 'format 141 not served anymore',
         },
         # DASH manifest with encrypted signature
         {
@@ -517,7 +517,7 @@
             },
             'params': {
                 'youtube_include_dash_manifest': True,
-                'format': '141',
+                'format': '141/bestaudio[ext=m4a]',
             },
         },
         # JS player signature function name containing $
         {
@@ -537,7 +537,7 @@
             },
             'params': {
                 'youtube_include_dash_manifest': True,
-                'format': '141',
+                'format': '141/bestaudio[ext=m4a]',
             },
         },
         # Controversy video
@@ -618,7 +618,7 @@
             'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/olympic',
             'license': 'Standard YouTube License',
             'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
-            'uploader': 'Olympics',
+            'uploader': 'Olympic',
             'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
         },
         'params': {
@@ -671,7 +671,7 @@
             'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
             'uploader': 'dorappi2000',
             'license': 'Standard YouTube License',
-            'formats': 'mincount:33',
+            'formats': 'mincount:32',
         },
     },
     # DASH manifest with segment_list
     {
@@ -691,7 +691,8 @@
         'params': {
             'youtube_include_dash_manifest': True,
             'format': '135',  # bestvideo
-        }
+        },
+        'skip': 'This live event has ended.',
     },
     {
         # Multifeed videos (multiple cameras), URL is for Main Camera
@@ -762,6 +763,7 @@
             'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30',
         },
         'playlist_count': 2,
+        'skip': 'Not multifeed anymore',
     },
     {
         'url': 'http://vid.plus/FlRa-iH7PGw',
@@ -814,6 +816,7 @@
         'params': {
             'skip_download': True,
         },
+        'skip': 'This video does not exist.',
     },
     {
         # Video licensed under Creative Commons
@@ -854,6 +857,11 @@
     {
         'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY',
         'only_matching': True,
+    },
+    {
+        # YouTube Red paid video
(https://github.com/rg3/youtube-dl/issues/10059) + 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo', + 'only_matching': True, } ] @@ -1331,7 +1339,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:[a-zA-Z-]+="[^"]*"\s+)*? (?:title|href)="([^"]+)"\s+ (?:[a-zA-Z-]+="[^"]*"\s+)*? - class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*> + class="[^"]*"[^>]*> [^<]+\.{3}\s* </a> ''', r'\1', video_description) @@ -1726,6 +1734,39 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } +class YoutubeSharedVideoIE(InfoExtractor): + _VALID_URL = r'(?:https?:)?//(?:www\.)?youtube\.com/shared\?.*\bci=(?P<id>[0-9A-Za-z_-]{11})' + IE_NAME = 'youtube:shared' + + _TEST = { + 'url': 'https://www.youtube.com/shared?ci=1nEzmT-M4fU', + 'info_dict': { + 'id': 'uPDB5I9wfp8', + 'ext': 'webm', + 'title': 'Pocoyo: 90 minutos de episódios completos Português para crianças - PARTE 3', + 'description': 'md5:d9e4d9346a2dfff4c7dc4c8cec0f546d', + 'upload_date': '20160219', + 'uploader': 'Pocoyo - Português (BR)', + 'uploader_id': 'PocoyoBrazil', + }, + 'add_ie': ['Youtube'], + 'params': { + # There are already too many Youtube downloads + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + real_video_id = self._html_search_meta( + 'videoId', webpage, 'YouTube video id', fatal=True) + + return self.url_result(real_video_id, YoutubeIE.ie_key()) + + class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com playlists' _VALID_URL = r"""(?x)(?: @@ -1941,10 +1982,13 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url) else super(YoutubeChannelIE, cls).suitable(url)) + def _build_template_url(self, url, channel_id): + return self._TEMPLATE_URL % channel_id + def _real_extract(self, url): channel_id = self._match_id(url) - url = self._TEMPLATE_URL % channel_id + url = self._build_template_url(url, channel_id) # Channel by page listing is restricted to 35 pages of 30 items, i.e. 
1050 videos total (see #5778) # Workaround by extracting as a playlist if managed to obtain channel playlist URL @@ -1958,9 +2002,13 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): channel_playlist_id = self._html_search_meta( 'channelId', channel_page, 'channel id', default=None) if not channel_playlist_id: - channel_playlist_id = self._search_regex( - r'data-(?:channel-external-|yt)id="([^"]+)"', - channel_page, 'channel id', default=None) + channel_url = self._html_search_meta( + ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'), + channel_page, 'channel url', default=None) + if channel_url: + channel_playlist_id = self._search_regex( + r'vnd\.youtube://user/([0-9A-Za-z_-]+)', + channel_url, 'channel id', default=None) if channel_playlist_id and channel_playlist_id.startswith('UC'): playlist_id = 'UU' + channel_playlist_id[2:] return self.url_result( @@ -1983,20 +2031,39 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): for video_id, video_title in self.extract_videos_from_page(channel_page)] return self.playlist_result(entries, channel_id) + try: + next(self._entries(channel_page, channel_id)) + except StopIteration: + alert_message = self._html_search_regex( + r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>', + channel_page, 'alert', default=None, group='alert') + if alert_message: + raise ExtractorError('Youtube said: %s' % alert_message, expected=True) + return self.playlist_result(self._entries(channel_page, channel_id), channel_id) class YoutubeUserIE(YoutubeChannelIE): IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:user/|c/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' - _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos' + _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' + _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos' IE_NAME = 'youtube:user' _TESTS = [{ 'url': 'https://www.youtube.com/user/TheLinuxFoundation', 'playlist_mincount': 320, 'info_dict': { - 'title': 'TheLinuxFoundation', + 'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ', + 'title': 'Uploads from The Linux Foundation', + } + }, { + # Only available via https://www.youtube.com/c/12minuteathlete/videos + # but not https://www.youtube.com/user/12minuteathlete/videos + 'url': 'https://www.youtube.com/c/12minuteathlete/videos', + 'playlist_mincount': 249, + 'info_dict': { + 'id': 'UUVjM-zV6_opMDx7WYxnjZiQ', + 'title': 'Uploads from 12 Minute Athlete', } }, { 'url': 'ytuser:phihag', @@ -2004,6 +2071,13 @@ class YoutubeUserIE(YoutubeChannelIE): }, { 'url': 'https://www.youtube.com/c/gametrailers', 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/gametrailers', + 'only_matching': True, + }, { + # This channel is not available. 
+        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
+        'only_matching': True,
     }]
 
     @classmethod
@@ -2016,6 +2090,10 @@ class YoutubeUserIE(YoutubeChannelIE):
         else:
             return super(YoutubeUserIE, cls).suitable(url)
 
+    def _build_template_url(self, url, channel_id):
+        mobj = re.match(self._VALID_URL, url)
+        return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id'))
+
 
 class YoutubeLiveIE(YoutubeBaseInfoExtractor):
     IE_DESC = 'YouTube.com live streams'
diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py
index a7440c582..9737f7002 100644
--- a/youtube_dl/jsinterp.py
+++ b/youtube_dl/jsinterp.py
@@ -232,7 +232,7 @@ class JSInterpreter(object):
     def extract_function(self, funcname):
         func_m = re.search(
             r'''(?x)
-                (?:function\s+%s|[{;,]%s\s*=\s*function|var\s+%s\s*=\s*function)\s*
+                (?:function\s+%s|[{;,]\s*%s\s*=\s*function|var\s+%s\s*=\s*function)\s*
                 \((?P<args>[^)]*)\)\s*
                 \{(?P<code>[^}]+)\}''' % (
                 re.escape(funcname), re.escape(funcname), re.escape(funcname)),
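The jsinterp change only inserts `\s*` after the `[{;,]` separator, so player code that puts whitespace between the separator and the function name (for example `; foo = function(...)`) is now matched as well. For context, a small sketch of how `extract_function` is typically used; the JavaScript snippet is invented for illustration:

```python
from youtube_dl.jsinterp import JSInterpreter

# '; double = function' (note the space) is only found thanks to the new \s*
jsi = JSInterpreter('var a = 1; double = function(x){return x + x;}')
double = jsi.extract_function('double')
print(double([21]))  # extracted functions take their arguments as a list -> 42
```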
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index 99ce4131f..c4a85b2c0 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -26,9 +26,11 @@ def parseOpts(overrideArguments=None):
         except IOError:
             return default  # silently skip if file is not present
         try:
-            res = []
-            for l in optionf:
-                res += compat_shlex_split(l, comments=True)
+            # FIXME: https://github.com/rg3/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
+            contents = optionf.read()
+            if sys.version_info < (3,):
+                contents = contents.decode(preferredencoding())
+            res = compat_shlex_split(contents, comments=True)
         finally:
             optionf.close()
         return res
@@ -211,11 +213,16 @@
         action='store_const', const='::', dest='source_address',
         help='Make all connections via IPv6 (experimental)',
     )
+    network.add_option(
+        '--geo-verification-proxy',
+        dest='geo_verification_proxy', default=None, metavar='URL',
+        help='Use this proxy to verify the IP address for some geo-restricted sites. '
+        'The default proxy specified by --proxy (or none, if the option is not present) is used for the actual downloading. (experimental)'
+    )
     network.add_option(
         '--cn-verification-proxy',
         dest='cn_verification_proxy', default=None, metavar='URL',
-        help='Use this proxy to verify the IP address for some Chinese sites. '
-        'The default proxy specified by --proxy (or none, if the options is not present) is used for the actual downloading. (experimental)'
+        help=optparse.SUPPRESS_HELP,
     )
 
     selection = optparse.OptionGroup(parser, 'Video Selection')
@@ -809,11 +816,11 @@ def parseOpts(overrideArguments=None):
         system_conf = []
         user_conf = []
     else:
-        system_conf = compat_conf(_readOptions('/etc/youtube-dl.conf'))
+        system_conf = _readOptions('/etc/youtube-dl.conf')
         if '--ignore-config' in system_conf:
             user_conf = []
         else:
-            user_conf = compat_conf(_readUserConf())
+            user_conf = _readUserConf()
     argv = system_conf + user_conf + command_line_conf
 
     opts, args = parser.parse_args(argv)
diff --git a/youtube_dl/socks.py b/youtube_dl/socks.py
index fd49d7435..104807242 100644
--- a/youtube_dl/socks.py
+++ b/youtube_dl/socks.py
@@ -76,7 +76,7 @@ class Socks4Error(ProxyError):
 
     CODES = {
         91: 'request rejected or failed',
-        92: 'request rejected becasue SOCKS server cannot connect to identd on the client',
+        92: 'request rejected because SOCKS server cannot connect to identd on the client',
         93: 'request rejected because the client program and identd report different user-ids'
     }
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 82f67f6cd..f5cd6819b 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -110,6 +110,49 @@ ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                         itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                         'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
 
+DATE_FORMATS = (
+    '%d %B %Y',
+    '%d %b %Y',
+    '%B %d %Y',
+    '%b %d %Y',
+    '%b %dst %Y %I:%M',
+    '%b %dnd %Y %I:%M',
+    '%b %dth %Y %I:%M',
+    '%Y %m %d',
+    '%Y-%m-%d',
+    '%Y/%m/%d',
+    '%Y/%m/%d %H:%M:%S',
+    '%Y-%m-%d %H:%M:%S',
+    '%Y-%m-%d %H:%M:%S.%f',
+    '%d.%m.%Y %H:%M',
+    '%d.%m.%Y %H.%M',
+    '%Y-%m-%dT%H:%M:%SZ',
+    '%Y-%m-%dT%H:%M:%S.%fZ',
+    '%Y-%m-%dT%H:%M:%S.%f0Z',
+    '%Y-%m-%dT%H:%M:%S',
+    '%Y-%m-%dT%H:%M:%S.%f',
+    '%Y-%m-%dT%H:%M',
+)
+
+DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
+DATE_FORMATS_DAY_FIRST.extend([
+    '%d-%m-%Y',
+    '%d.%m.%Y',
+    '%d.%m.%y',
+    '%d/%m/%Y',
+    '%d/%m/%y',
+    '%d/%m/%Y %H:%M:%S',
+])
+
+DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
+DATE_FORMATS_MONTH_FIRST.extend([
+    '%m-%d-%Y',
+    '%m.%d.%Y',
+    '%m/%d/%Y',
+    '%m/%d/%y',
+    '%m/%d/%Y %H:%M:%S',
+])
+
 
 def preferredencoding():
     """Get preferred encoding.
@@ -267,9 +310,17 @@ def get_element_by_id(id, html):
     return get_element_by_attribute('id', id, html)
 
 
-def get_element_by_attribute(attribute, value, html):
+def get_element_by_class(class_name, html):
+    return get_element_by_attribute(
+        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
+        html, escape_value=False)
+
+
+def get_element_by_attribute(attribute, value, html, escape_value=True):
     """Return the content of the tag with the specified attribute in the passed HTML document"""
 
+    value = re.escape(value) if escape_value else value
+
     m = re.search(r'''(?xs)
         <([a-zA-Z0-9:._-]+)
          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
@@ -278,7 +329,7 @@
         \s*>
         (?P<content>.*?)
</\1> - ''' % (re.escape(attribute), re.escape(value)), html) + ''' % (re.escape(attribute), value), html) if not m: return None @@ -975,6 +1026,24 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): https_response = http_response +def extract_timezone(date_str): + m = re.search( + r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)', + date_str) + if not m: + timezone = datetime.timedelta() + else: + date_str = date_str[:-len(m.group('tz'))] + if not m.group('sign'): + timezone = datetime.timedelta() + else: + sign = 1 if m.group('sign') == '+' else -1 + timezone = datetime.timedelta( + hours=sign * int(m.group('hours')), + minutes=sign * int(m.group('minutes'))) + return timezone, date_str + + def parse_iso8601(date_str, delimiter='T', timezone=None): """ Return a UNIX timestamp from the given date """ @@ -984,20 +1053,8 @@ def parse_iso8601(date_str, delimiter='T', timezone=None): date_str = re.sub(r'\.[0-9]+', '', date_str) if timezone is None: - m = re.search( - r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)', - date_str) - if not m: - timezone = datetime.timedelta() - else: - date_str = date_str[:-len(m.group(0))] - if not m.group('sign'): - timezone = datetime.timedelta() - else: - sign = 1 if m.group('sign') == '+' else -1 - timezone = datetime.timedelta( - hours=sign * int(m.group('hours')), - minutes=sign * int(m.group('minutes'))) + timezone, date_str = extract_timezone(date_str) + try: date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) dt = datetime.datetime.strptime(date_str, date_format) - timezone @@ -1006,6 +1063,10 @@ def parse_iso8601(date_str, delimiter='T', timezone=None): pass +def date_formats(day_first=True): + return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST + + def unified_strdate(date_str, day_first=True): """Return a string with the date in the format YYYYMMDD""" @@ -1014,53 +1075,11 @@ def unified_strdate(date_str, day_first=True): upload_date = None # Replace commas date_str = date_str.replace(',', ' ') - # %z (UTC offset) is only supported in python>=3.2 - if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str): - date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str) # Remove AM/PM + timezone date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) + _, date_str = extract_timezone(date_str) - format_expressions = [ - '%d %B %Y', - '%d %b %Y', - '%B %d %Y', - '%b %d %Y', - '%b %dst %Y %I:%M', - '%b %dnd %Y %I:%M', - '%b %dth %Y %I:%M', - '%Y %m %d', - '%Y-%m-%d', - '%Y/%m/%d', - '%Y/%m/%d %H:%M:%S', - '%Y-%m-%d %H:%M:%S', - '%Y-%m-%d %H:%M:%S.%f', - '%d.%m.%Y %H:%M', - '%d.%m.%Y %H.%M', - '%Y-%m-%dT%H:%M:%SZ', - '%Y-%m-%dT%H:%M:%S.%fZ', - '%Y-%m-%dT%H:%M:%S.%f0Z', - '%Y-%m-%dT%H:%M:%S', - '%Y-%m-%dT%H:%M:%S.%f', - '%Y-%m-%dT%H:%M', - ] - if day_first: - format_expressions.extend([ - '%d-%m-%Y', - '%d.%m.%Y', - '%d.%m.%y', - '%d/%m/%Y', - '%d/%m/%y', - '%d/%m/%Y %H:%M:%S', - ]) - else: - format_expressions.extend([ - '%m-%d-%Y', - '%m.%d.%Y', - '%m/%d/%Y', - '%m/%d/%y', - '%m/%d/%Y %H:%M:%S', - ]) - for expression in format_expressions: + for expression in date_formats(day_first): try: upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') except ValueError: @@ -1076,6 +1095,29 @@ def unified_strdate(date_str, day_first=True): return compat_str(upload_date) +def unified_timestamp(date_str, day_first=True): + if date_str is None: + return None + + date_str = date_str.replace(',', ' ') + + pm_delta = 
datetime.timedelta(hours=12 if re.search(r'(?i)PM', date_str) else 0)
+    timezone, date_str = extract_timezone(date_str)
+
+    # Remove AM/PM + timezone
+    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
+
+    for expression in date_formats(day_first):
+        try:
+            dt = datetime.datetime.strptime(date_str, expression) - timezone + pm_delta
+            return calendar.timegm(dt.timetuple())
+        except ValueError:
+            pass
+    timetuple = email.utils.parsedate_tz(date_str)
+    if timetuple:
+        return calendar.timegm(timetuple.timetuple())
+
+
 def determine_ext(url, default_ext='unknown_video'):
     if url is None:
         return default_ext
@@ -1410,6 +1452,8 @@ def shell_quote(args):
 
 def smuggle_url(url, data):
     """ Pass additional data in a URL for internal use. """
 
+    url, idata = unsmuggle_url(url, {})
+    data.update(idata)
     sdata = compat_urllib_parse_urlencode(
         {'__youtubedl_smuggle': json.dumps(data)})
     return url + '#' + sdata
@@ -1591,6 +1635,11 @@ class HEADRequest(compat_urllib_request.Request):
         return 'HEAD'
 
 
+class PUTRequest(compat_urllib_request.Request):
+    def get_method(self):
+        return 'PUT'
+
+
 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
     if get_attr:
         if v is not None:
@@ -1626,6 +1675,10 @@ def float_or_none(v, scale=1, invscale=1, default=None):
         return default
 
 
+def strip_or_none(v):
+    return None if v is None else v.strip()
+
+
 def parse_duration(s):
     if not isinstance(s, compat_basestring):
         return None
@@ -1882,7 +1935,13 @@ def update_Request(req, url=None, data=None, headers={}, query={}):
     req_headers.update(headers)
     req_data = data or req.data
     req_url = update_url_query(url or req.get_full_url(), query)
-    req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
+    req_get_method = req.get_method()
+    if req_get_method == 'HEAD':
+        req_type = HEADRequest
+    elif req_get_method == 'PUT':
+        req_type = PUTRequest
+    else:
+        req_type = compat_urllib_request.Request
     new_req = req_type(
         req_url, data=req_data, headers=req_headers,
         origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
@@ -2046,6 +2105,7 @@ def mimetype2ext(mt):
         return ext
 
     _, _, res = mt.rpartition('/')
+    res = res.lower()
 
     return {
         '3gpp': '3gp',
@@ -2057,9 +2117,53 @@
         'x-flv': 'flv',
         'x-mp4-fragmented': 'mp4',
         'x-ms-wmv': 'wmv',
+        'mpegurl': 'm3u8',
+        'x-mpegurl': 'm3u8',
+        'vnd.apple.mpegurl': 'm3u8',
+        'dash+xml': 'mpd',
+        'f4m': 'f4m',
+        'f4m+xml': 'f4m',
+        'hds+xml': 'f4m',
+        'vnd.ms-sstr+xml': 'ism',
     }.get(res, res)
 
 
+def parse_codecs(codecs_str):
+    # http://tools.ietf.org/html/rfc6381
+    if not codecs_str:
+        return {}
+    splited_codecs = list(filter(None, map(
+        lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
+    vcodec, acodec = None, None
+    for full_codec in splited_codecs:
+        codec = full_codec.split('.')[0]
+        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
+            if not vcodec:
+                vcodec = full_codec
+        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac'):
+            if not acodec:
+                acodec = full_codec
+        else:
+            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
+    if not vcodec and not acodec:
+        if len(splited_codecs) == 2:
+            return {
+                'vcodec': vcodec,
+                'acodec': acodec,
+            }
+        elif len(splited_codecs) == 1:
+            return {
+                'vcodec': 'none',
+                'acodec': vcodec,
+            }
+    else:
+        return {
+            'vcodec': vcodec or 'none',
+            'acodec': acodec or 'none',
+        }
+    return {}
+
+
 def urlhandle_detect_ext(url_handle):
     getheader = url_handle.headers.get
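A couple of hand-checked examples of what the new `parse_codecs()` returns for typical RFC 6381 codec strings, assuming the implementation above:

```python
from youtube_dl.utils import parse_codecs

print(parse_codecs('avc1.64001f, mp4a.40.2'))
# {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2'}

# A single audio-only codec is normalized to an explicit 'none' video codec:
print(parse_codecs('mp4a.40.2'))
# {'vcodec': 'none', 'acodec': 'mp4a.40.2'}
```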
@@ -2852,3 +2956,16 @@ def decode_packed_codes(code):
     return re.sub(
         r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
         obfucasted_code)
+
+
+def parse_m3u8_attributes(attrib):
+    info = {}
+    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
+        if val.startswith('"'):
+            val = val[1:-1]
+        info[key] = val
+    return info
+
+
+def urshift(val, n):
+    return val >> n if val >= 0 else (val + 0x100000000) >> n
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 52619cae8..2cfa406d9 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2016.06.16'
+__version__ = '2016.07.28'
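Two quick sanity checks for other helpers introduced in this release, `unified_timestamp()` and `parse_m3u8_attributes()`; the expected values below were worked out by hand from the implementations above:

```python
from youtube_dl.utils import parse_m3u8_attributes, unified_timestamp

print(unified_timestamp('2016-07-28T00:00:00Z'))
# 1469664000, i.e. the release date parsed as UTC via calendar.timegm

print(parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.64001f,mp4a.40.2"'))
# {'BANDWIDTH': '1280000', 'CODECS': 'avc1.64001f,mp4a.40.2'}
```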