diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md deleted file mode 100644 index ed3e0a157..000000000 --- a/.github/ISSUE_TEMPLATE.md +++ /dev/null @@ -1,61 +0,0 @@ -## Please follow the guide below - -- You will be asked some questions and requested to provide some information, please read them **carefully** and answer honestly -- Put an `x` into all the boxes [ ] relevant to your *issue* (like this: `[x]`) -- Use the *Preview* tab to see what your issue will actually look like - ---- - -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.09.26*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.09.26** - -### Before submitting an *issue* make sure you have: -- [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections -- [ ] [Searched](https://github.com/rg3/youtube-dl/search?type=Issues) the bugtracker for similar issues including closed ones -- [ ] Checked that provided video/audio/playlist URLs (if any) are alive and playable in a browser - -### What is the purpose of your *issue*? -- [ ] Bug report (encountered problems with youtube-dl) -- [ ] Site support request (request for adding support for a new site) -- [ ] Feature request (request for a new functionality) -- [ ] Question -- [ ] Other - ---- - -### The following sections concretize particular purposed issues, you can erase any section (the contents between triple ---) not applicable to your *issue* - ---- - -### If the purpose of this *issue* is a *bug report*, *site support request* or you are not completely sure provide the full verbose output as follows: - -Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl -v `), copy the **whole** output and insert it here. It should look similar to one below (replace it with **your** log inserted between triple ```): - -``` -[debug] System config: [] -[debug] User config: [] -[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] -[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.09.26 -[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 -[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 -[debug] Proxy map: {} -... - -``` - ---- - -### If the purpose of this *issue* is a *site support request* please provide all kinds of example URLs support for which should be included (replace following example URLs by **yours**): -- Single video: https://www.youtube.com/watch?v=BaW_jenozKc -- Single video: https://youtu.be/BaW_jenozKc -- Playlist: https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc - -Note that **youtube-dl does not support sites dedicated to [copyright infringement](https://github.com/rg3/youtube-dl#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free)**. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - ---- - -### Description of your *issue*, suggested solution and other information - -Explanation of your *issue* in arbitrary form goes here. Please make sure the [description is worded well enough to be understood](https://github.com/rg3/youtube-dl#is-the-description-of-the-issue-itself-sufficient). Provide as much context and examples as possible. -If work on your *issue* requires account credentials please provide them or explain how one can obtain them. diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md new file mode 100644 index 000000000..0c24155e6 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -0,0 +1,63 @@ +--- +name: Broken site support +about: Report broken or misfunctioning site +title: '' +--- + + + + +## Checklist + + + +- [ ] I'm reporting a broken site support +- [ ] I've verified that I'm running youtube-dl version **2019.09.12.1** +- [ ] I've checked that all provided URLs are alive and playable in a browser +- [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped +- [ ] I've searched the bugtracker for similar issues including closed ones + + +## Verbose log + + + +``` +PASTE VERBOSE LOG HERE +``` + + +## Description + + + +WRITE DESCRIPTION HERE diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md new file mode 100644 index 000000000..9babe0360 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -0,0 +1,54 @@ +--- +name: Site support request +about: Request support for a new site +title: '' +labels: 'site-support-request' +--- + + + + +## Checklist + + + +- [ ] I'm reporting a new site support request +- [ ] I've verified that I'm running youtube-dl version **2019.09.12.1** +- [ ] I've checked that all provided URLs are alive and playable in a browser +- [ ] I've checked that none of provided URLs violate any copyrights +- [ ] I've searched the bugtracker for similar site support requests including closed ones + + +## Example URLs + + + +- Single video: https://www.youtube.com/watch?v=BaW_jenozKc +- Single video: https://youtu.be/BaW_jenozKc +- Playlist: https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc + + +## Description + + + +WRITE DESCRIPTION HERE diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md new file mode 100644 index 000000000..72322fe26 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -0,0 +1,37 @@ +--- +name: Site feature request +about: Request a new functionality for a site +title: '' +--- + + + + +## Checklist + + + +- [ ] I'm reporting a site feature request +- [ ] I've verified that I'm running youtube-dl version **2019.09.12.1** +- [ ] I've searched the bugtracker for similar site feature requests including closed ones + + +## Description + + + +WRITE DESCRIPTION HERE diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md new file mode 100644 index 000000000..da7f2cf93 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -0,0 +1,65 @@ +--- +name: Bug report +about: Report a bug unrelated to any particular site or extractor +title: '' +--- + + + + +## Checklist + + + +- [ ] I'm reporting a broken site support issue +- [ ] I've verified that I'm running youtube-dl version **2019.09.12.1** +- [ ] I've checked that all provided URLs are alive and playable in a browser +- [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped +- [ ] I've searched the bugtracker for similar bug reports including closed ones +- [ ] I've read bugs section in FAQ + + +## Verbose log + + + +``` +PASTE VERBOSE LOG HERE +``` + + +## Description + + + +WRITE DESCRIPTION HERE diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md new file mode 100644 index 000000000..d41022b9f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -0,0 +1,38 @@ +--- +name: Feature request +about: Request a new functionality unrelated to any particular site or extractor +title: '' +labels: 'request' +--- + + + + +## Checklist + + + +- [ ] I'm reporting a feature request +- [ ] I've verified that I'm running youtube-dl version **2019.09.12.1** +- [ ] I've searched the bugtracker for similar feature requests including closed ones + + +## Description + + + +WRITE DESCRIPTION HERE diff --git a/.github/ISSUE_TEMPLATE/6_question.md b/.github/ISSUE_TEMPLATE/6_question.md new file mode 100644 index 000000000..1fd7cd5dc --- /dev/null +++ b/.github/ISSUE_TEMPLATE/6_question.md @@ -0,0 +1,38 @@ +--- +name: Ask question +about: Ask youtube-dl related question +title: '' +labels: 'question' +--- + + + + +## Checklist + + + +- [ ] I'm asking a question +- [ ] I've looked through the README and FAQ for similar questions +- [ ] I've searched the bugtracker for similar questions including closed ones + + +## Question + + + +WRITE QUESTION HERE diff --git a/.github/ISSUE_TEMPLATE_tmpl.md b/.github/ISSUE_TEMPLATE_tmpl.md deleted file mode 100644 index 8edbd5a0f..000000000 --- a/.github/ISSUE_TEMPLATE_tmpl.md +++ /dev/null @@ -1,61 +0,0 @@ -## Please follow the guide below - -- You will be asked some questions and requested to provide some information, please read them **carefully** and answer honestly -- Put an `x` into all the boxes [ ] relevant to your *issue* (like this: `[x]`) -- Use the *Preview* tab to see what your issue will actually look like - ---- - -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *%(version)s*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **%(version)s** - -### Before submitting an *issue* make sure you have: -- [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections -- [ ] [Searched](https://github.com/rg3/youtube-dl/search?type=Issues) the bugtracker for similar issues including closed ones -- [ ] Checked that provided video/audio/playlist URLs (if any) are alive and playable in a browser - -### What is the purpose of your *issue*? -- [ ] Bug report (encountered problems with youtube-dl) -- [ ] Site support request (request for adding support for a new site) -- [ ] Feature request (request for a new functionality) -- [ ] Question -- [ ] Other - ---- - -### The following sections concretize particular purposed issues, you can erase any section (the contents between triple ---) not applicable to your *issue* - ---- - -### If the purpose of this *issue* is a *bug report*, *site support request* or you are not completely sure provide the full verbose output as follows: - -Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl -v `), copy the **whole** output and insert it here. It should look similar to one below (replace it with **your** log inserted between triple ```): - -``` -[debug] System config: [] -[debug] User config: [] -[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] -[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version %(version)s -[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 -[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 -[debug] Proxy map: {} -... - -``` - ---- - -### If the purpose of this *issue* is a *site support request* please provide all kinds of example URLs support for which should be included (replace following example URLs by **yours**): -- Single video: https://www.youtube.com/watch?v=BaW_jenozKc -- Single video: https://youtu.be/BaW_jenozKc -- Playlist: https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc - -Note that **youtube-dl does not support sites dedicated to [copyright infringement](https://github.com/rg3/youtube-dl#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free)**. In order for site support request to be accepted all provided example URLs should not violate any copyrights. - ---- - -### Description of your *issue*, suggested solution and other information - -Explanation of your *issue* in arbitrary form goes here. Please make sure the [description is worded well enough to be understood](https://github.com/rg3/youtube-dl#is-the-description-of-the-issue-itself-sufficient). Provide as much context and examples as possible. -If work on your *issue* requires account credentials please provide them or explain how one can obtain them. diff --git a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.md b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.md new file mode 100644 index 000000000..c7600d5b5 --- /dev/null +++ b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.md @@ -0,0 +1,63 @@ +--- +name: Broken site support +about: Report broken or misfunctioning site +title: '' +--- + + + + +## Checklist + + + +- [ ] I'm reporting a broken site support +- [ ] I've verified that I'm running youtube-dl version **%(version)s** +- [ ] I've checked that all provided URLs are alive and playable in a browser +- [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped +- [ ] I've searched the bugtracker for similar issues including closed ones + + +## Verbose log + + + +``` +PASTE VERBOSE LOG HERE +``` + + +## Description + + + +WRITE DESCRIPTION HERE diff --git a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md new file mode 100644 index 000000000..d4988e639 --- /dev/null +++ b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md @@ -0,0 +1,54 @@ +--- +name: Site support request +about: Request support for a new site +title: '' +labels: 'site-support-request' +--- + + + + +## Checklist + + + +- [ ] I'm reporting a new site support request +- [ ] I've verified that I'm running youtube-dl version **%(version)s** +- [ ] I've checked that all provided URLs are alive and playable in a browser +- [ ] I've checked that none of provided URLs violate any copyrights +- [ ] I've searched the bugtracker for similar site support requests including closed ones + + +## Example URLs + + + +- Single video: https://www.youtube.com/watch?v=BaW_jenozKc +- Single video: https://youtu.be/BaW_jenozKc +- Playlist: https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc + + +## Description + + + +WRITE DESCRIPTION HERE diff --git a/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md b/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md new file mode 100644 index 000000000..65f0a32f3 --- /dev/null +++ b/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md @@ -0,0 +1,37 @@ +--- +name: Site feature request +about: Request a new functionality for a site +title: '' +--- + + + + +## Checklist + + + +- [ ] I'm reporting a site feature request +- [ ] I've verified that I'm running youtube-dl version **%(version)s** +- [ ] I've searched the bugtracker for similar site feature requests including closed ones + + +## Description + + + +WRITE DESCRIPTION HERE diff --git a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.md b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.md new file mode 100644 index 000000000..41fb14b72 --- /dev/null +++ b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.md @@ -0,0 +1,65 @@ +--- +name: Bug report +about: Report a bug unrelated to any particular site or extractor +title: '' +--- + + + + +## Checklist + + + +- [ ] I'm reporting a broken site support issue +- [ ] I've verified that I'm running youtube-dl version **%(version)s** +- [ ] I've checked that all provided URLs are alive and playable in a browser +- [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped +- [ ] I've searched the bugtracker for similar bug reports including closed ones +- [ ] I've read bugs section in FAQ + + +## Verbose log + + + +``` +PASTE VERBOSE LOG HERE +``` + + +## Description + + + +WRITE DESCRIPTION HERE diff --git a/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.md b/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.md new file mode 100644 index 000000000..b3431a7f0 --- /dev/null +++ b/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.md @@ -0,0 +1,38 @@ +--- +name: Feature request +about: Request a new functionality unrelated to any particular site or extractor +title: '' +labels: 'request' +--- + + + + +## Checklist + + + +- [ ] I'm reporting a feature request +- [ ] I've verified that I'm running youtube-dl version **%(version)s** +- [ ] I've searched the bugtracker for similar feature requests including closed ones + + +## Description + + + +WRITE DESCRIPTION HERE diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index ba4ca7553..e69b907d8 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -7,8 +7,8 @@ --- ### Before submitting a *pull request* make sure you have: -- [ ] At least skimmed through [adding new extractor tutorial](https://github.com/rg3/youtube-dl#adding-support-for-a-new-site) and [youtube-dl coding conventions](https://github.com/rg3/youtube-dl#youtube-dl-coding-conventions) sections -- [ ] [Searched](https://github.com/rg3/youtube-dl/search?q=is%3Apr&type=Issues) the bugtracker for similar pull requests +- [ ] At least skimmed through [adding new extractor tutorial](https://github.com/ytdl-org/youtube-dl#adding-support-for-a-new-site) and [youtube-dl coding conventions](https://github.com/ytdl-org/youtube-dl#youtube-dl-coding-conventions) sections +- [ ] [Searched](https://github.com/ytdl-org/youtube-dl/search?q=is%3Apr&type=Issues) the bugtracker for similar pull requests - [ ] Checked the code with [flake8](https://pypi.python.org/pypi/flake8) ### In order to be accepted and merged into youtube-dl each piece of code must be in public domain or released under [Unlicense](http://unlicense.org/). Check one of the following options: diff --git a/.travis.yml b/.travis.yml index 92f326860..6d16c2955 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,12 +9,24 @@ python: - "3.6" - "pypy" - "pypy3" -sudo: false +dist: trusty env: - YTDL_TEST_SET=core - YTDL_TEST_SET=download matrix: include: + - python: 3.7 + dist: xenial + env: YTDL_TEST_SET=core + - python: 3.7 + dist: xenial + env: YTDL_TEST_SET=download + - python: 3.8-dev + dist: xenial + env: YTDL_TEST_SET=core + - python: 3.8-dev + dist: xenial + env: YTDL_TEST_SET=download - env: JYTHON=true; YTDL_TEST_SET=core - env: JYTHON=true; YTDL_TEST_SET=download fast_finish: true diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 333acee80..ac759ddc4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -42,11 +42,11 @@ Before reporting any issue, type `youtube-dl -U`. This should report that you're ### Is the issue already documented? -Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or browse the [GitHub Issues](https://github.com/rg3/youtube-dl/search?type=Issues) of this repository. If there is an issue, feel free to write something along the lines of "This affects me as well, with version 2015.01.01. Here is some more information on the issue: ...". While some issues may be old, a new post into them often spurs rapid activity. +Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or browse the [GitHub Issues](https://github.com/ytdl-org/youtube-dl/search?type=Issues) of this repository. If there is an issue, feel free to write something along the lines of "This affects me as well, with version 2015.01.01. Here is some more information on the issue: ...". While some issues may be old, a new post into them often spurs rapid activity. ### Why are existing options not enough? -Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/rg3/youtube-dl/blob/master/README.md#options). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem. +Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/ytdl-org/youtube-dl/blob/master/README.md#options). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem. ### Is there enough context in your bug report? @@ -70,7 +70,7 @@ It may sound strange, but some bug reports we receive are completely unrelated t # DEVELOPER INSTRUCTIONS -Most users do not need to build youtube-dl and can [download the builds](https://rg3.github.io/youtube-dl/download.html) or get them from their distribution. +Most users do not need to build youtube-dl and can [download the builds](https://ytdl-org.github.io/youtube-dl/download.html) or get them from their distribution. To run youtube-dl as a developer, you don't need to build anything either. Simply execute @@ -98,7 +98,7 @@ If you want to add support for a new site, first of all **make sure** this site After you have ensured this site is distributing its content legally, you can follow this quick list (assuming your service is called `yourextractor`): -1. [Fork this repository](https://github.com/rg3/youtube-dl/fork) +1. [Fork this repository](https://github.com/ytdl-org/youtube-dl/fork) 2. Check out the source code with: git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git @@ -150,18 +150,22 @@ After you have ensured this site is distributing its content legally, you can fo # TODO more properties (see youtube_dl/extractor/common.py) } ``` -5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). +5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. -7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want. -8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. -9. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: +7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want. +8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](http://flake8.pycqa.org/en/latest/index.html#quickstart): + + $ flake8 youtube_dl/extractor/yourextractor.py + +9. Make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. +10. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: $ git add youtube_dl/extractor/extractors.py $ git add youtube_dl/extractor/yourextractor.py $ git commit -m '[yourextractor] Add new extractor' $ git push origin yourextractor -10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. +11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. In any case, thank you very much for your contributions! @@ -173,7 +177,7 @@ Extractors are very fragile by nature since they depend on the layout of the sou ### Mandatory and optional metafields -For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by an [information dictionary](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L75-L257) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by youtube-dl: +For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by an [information dictionary](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by youtube-dl: - `id` (media identifier) - `title` (media title) @@ -181,7 +185,7 @@ For extraction to work youtube-dl relies on metadata your extractor extracts and In fact only the last option is technically mandatory (i.e. if you can't figure out the download location of the media the extraction does not make any sense). But by convention youtube-dl also treats `id` and `title` as mandatory. Thus the aforementioned metafields are the critical data that the extraction does not make any sense without and if any of them fail to be extracted then the extractor is considered completely broken. -[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. +[Any field](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L188-L303) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. #### Example @@ -257,11 +261,33 @@ title = meta.get('title') or self._og_search_title(webpage) This code will try to extract from `meta` first and if it fails it will try extracting `og:title` from a `webpage`. -### Make regular expressions flexible +### Regular expressions -When using regular expressions try to write them fuzzy and flexible. +#### Don't capture groups you don't use + +Capturing group must be an indication that it's used somewhere in the code. Any group that is not used must be non capturing. + +##### Example + +Don't capture id attribute name here since you can't use it for anything anyway. + +Correct: + +```python +r'(?:id|ID)=(?P\d+)' +``` + +Incorrect: +```python +r'(id|ID)=(?P\d+)' +``` + + +#### Make regular expressions relaxed and flexible + +When using regular expressions try to write them fuzzy, relaxed and flexible, skipping insignificant parts that are more likely to change, allowing both single and double quotes for quoted values and so on. -#### Example +##### Example Say you need to extract `title` from the following HTML code: @@ -294,7 +320,115 @@ title = self._search_regex( webpage, 'title', group='title') ``` -### Use safe conversion functions +### Long lines policy -Wrap all extracted numeric data into safe functions from `utils`: `int_or_none`, `float_or_none`. Use them for string to number conversions as well. +There is a soft limit to keep lines of code under 80 characters long. This means it should be respected if possible and if it does not make readability and code maintenance worse. + +For example, you should **never** split long string literals like URLs or some other often copied entities over multiple lines to fit this limit: + +Correct: + +```python +'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4' +``` + +Incorrect: + +```python +'https://www.youtube.com/watch?v=FqZTN594JQw&list=' +'PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4' +``` + +### Inline values + +Extracting variables is acceptable for reducing code duplication and improving readability of complex expressions. However, you should avoid extracting variables used only once and moving them to opposite parts of the extractor file, which makes reading the linear flow difficult. + +#### Example + +Correct: + +```python +title = self._html_search_regex(r'([^<]+)', webpage, 'title') +``` + +Incorrect: + +```python +TITLE_RE = r'([^<]+)' +# ...some lines of code... +title = self._html_search_regex(TITLE_RE, webpage, 'title') +``` + +### Collapse fallbacks + +Multiple fallback values can quickly become unwieldy. Collapse multiple fallback values into a single expression via a list of patterns. + +#### Example + +Good: + +```python +description = self._html_search_meta( + ['og:description', 'description', 'twitter:description'], + webpage, 'description', default=None) +``` + +Unwieldy: + +```python +description = ( + self._og_search_description(webpage, default=None) + or self._html_search_meta('description', webpage, default=None) + or self._html_search_meta('twitter:description', webpage, default=None)) +``` + +Methods supporting list of patterns are: `_search_regex`, `_html_search_regex`, `_og_search_property`, `_html_search_meta`. + +### Trailing parentheses + +Always move trailing parentheses after the last argument. + +#### Example + +Correct: + +```python + lambda x: x['ResultSet']['Result'][0]['VideoUrlSet']['VideoUrl'], + list) +``` + +Incorrect: + +```python + lambda x: x['ResultSet']['Result'][0]['VideoUrlSet']['VideoUrl'], + list, +) +``` + +### Use convenience conversion and parsing functions + +Wrap all extracted numeric data into safe functions from [`youtube_dl/utils.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py): `int_or_none`, `float_or_none`. Use them for string to number conversions as well. + +Use `url_or_none` for safe URL processing. + +Use `try_get` for safe metadata extraction from parsed JSON. + +Use `unified_strdate` for uniform `upload_date` or any `YYYYMMDD` meta field extraction, `unified_timestamp` for uniform `timestamp` extraction, `parse_filesize` for `filesize` extraction, `parse_count` for count meta fields extraction, `parse_resolution`, `parse_duration` for `duration` extraction, `parse_age_limit` for `age_limit` extraction. + +Explore [`youtube_dl/utils.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py) for more useful convenience functions. + +#### More examples + +##### Safely extract optional description from parsed JSON +```python +description = try_get(response, lambda x: x['result']['video'][0]['summary'], compat_str) +``` + +##### Safely extract more optional metadata +```python +video = try_get(response, lambda x: x['result']['video'][0], dict) or {} +description = video.get('summary') +duration = float_or_none(video.get('durationMs'), scale=1000) +view_count = int_or_none(video.get('views')) +``` diff --git a/ChangeLog b/ChangeLog index 241712037..2aba02065 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,863 @@ +version 2019.09.12.1 + +Extractors +* [youtube] Remove quality and tbr for itag 43 (#22372) + + +version 2019.09.12 + +Extractors +* [youtube] Quick extraction tempfix (#22367, #22163) + + +version 2019.09.01 + +Core ++ [extractor/generic] Add support for squarespace embeds (#21294, #21802, + #21859) ++ [downloader/external] Respect mtime option for aria2c (#22242) + +Extractors ++ [xhamster:user] Add support for user pages (#16330, #18454) ++ [xhamster] Add support for more domains ++ [verystream] Add support for woof.tube (#22217) ++ [dailymotion] Add support for lequipe.fr (#21328, #22152) ++ [openload] Add support for oload.vip (#22205) ++ [bbccouk] Extend URL regular expression (#19200) ++ [youtube] Add support for invidious.nixnet.xyz and yt.elukerio.org (#22223) +* [safari] Fix authentication (#22161, #22184) +* [usanetwork] Fix extraction (#22105) ++ [einthusan] Add support for einthusan.ca (#22171) +* [youtube] Improve unavailable message extraction (#22117) ++ [piksel] Extract subtitles (#20506) + + +version 2019.08.13 + +Core +* [downloader/fragment] Fix ETA calculation of resumed download (#21992) +* [YoutubeDL] Check annotations availability (#18582) + +Extractors +* [youtube:playlist] Improve flat extraction (#21927) +* [youtube] Fix annotations extraction (#22045) ++ [discovery] Extract series meta field (#21808) +* [youtube] Improve error detection (#16445) +* [vimeo] Fix album extraction (#1933, #15704, #15855, #18967, #21986) ++ [roosterteeth] Add support for watch URLs +* [discovery] Limit video data by show slug (#21980) + + +version 2019.08.02 + +Extractors ++ [tvigle] Add support for HLS and DASH formats (#21967) +* [tvigle] Fix extraction (#21967) ++ [yandexvideo] Add support for DASH formats (#21971) +* [discovery] Use API call for video data extraction (#21808) ++ [mgtv] Extract format_note (#21881) +* [tvn24] Fix metadata extraction (#21833, #21834) +* [dlive] Relax URL regular expression (#21909) ++ [openload] Add support for oload.best (#21913) +* [youtube] Improve metadata extraction for age gate content (#21943) + + +version 2019.07.30 + +Extractors +* [youtube] Fix and improve title and description extraction (#21934) + + +version 2019.07.27 + +Extractors ++ [yahoo:japannews] Add support for yahoo.co.jp (#21698, #21265) ++ [discovery] Add support go.discovery.com URLs +* [youtube:playlist] Relax video regular expression (#21844) +* [generic] Restrict --default-search schemeless URLs detection pattern + (#21842) +* [vrv] Fix CMS signing query extraction (#21809) + + +version 2019.07.16 + +Extractors ++ [asiancrush] Add support for yuyutv.com, midnightpulp.com and cocoro.tv + (#21281, #21290) +* [kaltura] Check source format URL (#21290) +* [ctsnews] Fix YouTube embeds extraction (#21678) ++ [einthusan] Add support for einthusan.com (#21748, #21775) ++ [youtube] Add support for invidious.mastodon.host (#21777) ++ [gfycat] Extend URL regular expression (#21779, #21780) +* [youtube] Restrict is_live extraction (#21782) + + +version 2019.07.14 + +Extractors +* [porn91] Fix extraction (#21312) ++ [yandexmusic] Extract track number and disk number (#21421) ++ [yandexmusic] Add support for multi disk albums (#21420, #21421) +* [lynda] Handle missing subtitles (#20490, #20513) ++ [youtube] Add more invidious instances to URL regular expression (#21694) +* [twitter] Improve uploader id extraction (#21705) +* [spankbang] Fix and improve metadata extraction +* [spankbang] Fix extraction (#21763, #21764) ++ [dlive] Add support for dlive.tv (#18080) ++ [livejournal] Add support for livejournal.com (#21526) +* [roosterteeth] Fix free episode extraction (#16094) +* [dbtv] Fix extraction +* [bellator] Fix extraction +- [rudo] Remove extractor (#18430, #18474) +* [facebook] Fallback to twitter:image meta for thumbnail extraction (#21224) +* [bleacherreport] Fix Bleacher Report CMS extraction +* [espn] Fix fivethirtyeight.com extraction +* [5tv] Relax video URL regular expression and support https URLs +* [youtube] Fix is_live extraction (#21734) +* [youtube] Fix authentication (#11270) + + +version 2019.07.12 + +Core ++ [adobepass] Add support for AT&T U-verse (mso ATT) (#13938, #21016) + +Extractors ++ [mgtv] Pass Referer HTTP header for format URLs (#21726) ++ [beeg] Add support for api/v6 v2 URLs without t argument (#21701) +* [voxmedia:volume] Improvevox embed extraction (#16846) +* [funnyordie] Move extraction to VoxMedia extractor (#16846) +* [gameinformer] Fix extraction (#8895, #15363, #17206) +* [funk] Fix extraction (#17915) +* [packtpub] Relax lesson URL regular expression (#21695) +* [packtpub] Fix extraction (#21268) +* [philharmoniedeparis] Relax URL regular expression (#21672) +* [peertube] Detect embed URLs in generic extraction (#21666) +* [mixer:vod] Relax URL regular expression (#21657, #21658) ++ [lecturio] Add support id based URLs (#21630) ++ [go] Add site info for disneynow (#21613) +* [ted] Restrict info regular expression (#21631) +* [twitch:vod] Actualize m3u8 URL (#21538, #21607) +* [vzaar] Fix videos with empty title (#21606) +* [tvland] Fix extraction (#21384) +* [arte] Clean extractor (#15583, #21614) + + +version 2019.07.02 + +Core ++ [utils] Introduce random_user_agent and use as default User-Agent (#21546) + +Extractors ++ [vevo] Add support for embed.vevo.com URLs (#21565) ++ [openload] Add support for oload.biz (#21574) +* [xiami] Update API base URL (#21575) +* [yourporn] Fix extraction (#21585) ++ [acast] Add support for URLs with episode id (#21444) ++ [dailymotion] Add support for DM.player embeds +* [soundcloud] Update client id + + +version 2019.06.27 + +Extractors ++ [go] Add support for disneynow.com (#21528) +* [mixer:vod] Relax URL regular expression (#21531, #21536) +* [drtv] Relax URL regular expression +* [fusion] Fix extraction (#17775, #21269) +- [nfb] Remove extractor (#21518) ++ [beeg] Add support for api/v6 v2 URLs (#21511) ++ [brightcove:new] Add support for playlists (#21331) ++ [openload] Add support for oload.life (#21495) +* [vimeo:channel,group] Make title extraction non fatal +* [vimeo:likes] Implement extrator in terms of channel extractor (#21493) ++ [pornhub] Add support for more paged video sources ++ [pornhub] Add support for downloading single pages and search pages (#15570) +* [pornhub] Rework extractors (#11922, #16078, #17454, #17936) ++ [youtube] Add another signature function pattern +* [tf1] Fix extraction (#21365, #21372) +* [crunchyroll] Move Accept-Language workaround to video extractor since + it causes playlists not to list any videos +* [crunchyroll:playlist] Fix and relax title extraction (#21291, #21443) + + +version 2019.06.21 + +Core +* [utils] Restrict parse_codecs and add theora as known vcodec (#21381) + +Extractors +* [youtube] Update signature function patterns (#21469, #21476) +* [youtube] Make --write-annotations non fatal (#21452) ++ [sixplay] Add support for rtlmost.hu (#21405) +* [youtube] Hardcode codec metadata for av01 video only formats (#21381) +* [toutv] Update client key (#21370) ++ [biqle] Add support for new embed domain +* [cbs] Improve DRM protected videos detection (#21339) + + +version 2019.06.08 + +Core +* [downloader/common] Improve rate limit (#21301) +* [utils] Improve strip_or_none +* [extractor/common] Strip src attribute for HTML5 entries code (#18485, + #21169) + +Extractors +* [ted] Fix playlist extraction (#20844, #21032) +* [vlive:playlist] Fix video extraction when no playlist is found (#20590) ++ [vlive] Add CH+ support (#16887, #21209) ++ [openload] Add support for oload.website (#21329) ++ [tvnow] Extract HD formats (#21201) ++ [redbulltv] Add support for rrn:content URLs (#21297) +* [youtube] Fix average rating extraction (#21304) ++ [bitchute] Extract HTML5 formats (#21306) +* [cbsnews] Fix extraction (#9659, #15397) +* [vvvvid] Relax URL regular expression (#21299) ++ [prosiebensat1] Add support for new API (#21272) ++ [vrv] Extract adaptive_hls formats (#21243) +* [viki] Switch to HTTPS (#21001) +* [LiveLeak] Check if the original videos exist (#21206, #21208) +* [rtp] Fix extraction (#15099) +* [youtube] Improve DRM protected videos detection (#1774) ++ [srgssrplay] Add support for popupvideoplayer URLs (#21155) ++ [24video] Add support for porno.24video.net (#21194) ++ [24video] Add support for 24video.site (#21193) +- [pornflip] Remove extractor +- [criterion] Remove extractor (#21195) +* [pornhub] Use HTTPS (#21061) +* [bitchute] Fix uploader extraction (#21076) +* [streamcloud] Reduce waiting time to 6 seconds (#21092) +- [novamov] Remove extractors (#21077) ++ [openload] Add support for oload.press (#21135) +* [vivo] Fix extraction (#18906, #19217) + + +version 2019.05.20 + +Core ++ [extractor/common] Move workaround for applying first Set-Cookie header + into a separate _apply_first_set_cookie_header method + +Extractors +* [safari] Fix authentication (#21090) +* [vk] Use _apply_first_set_cookie_header +* [vrt] Fix extraction (#20527) ++ [canvas] Add support for vrtnieuws and sporza site ids and extract + AES HLS formats ++ [vrv] Extract captions (#19238) +* [tele5] Improve video id extraction +* [tele5] Relax URL regular expression (#21020, #21063) +* [svtplay] Update API URL (#21075) ++ [yahoo:gyao] Add X-User-Agent header to dam proxy requests (#21071) + + +version 2019.05.11 + +Core +* [utils] Transliterate "þ" as "th" (#20897) + +Extractors ++ [cloudflarestream] Add support for videodelivery.net (#21049) ++ [byutv] Add support for DVR videos (#20574, #20676) ++ [gfycat] Add support for URLs with tags (#20696, #20731) ++ [openload] Add support for verystream.com (#20701, #20967) +* [youtube] Use sp field value for signature field name (#18841, #18927, + #21028) ++ [yahoo:gyao] Extend URL regular expression (#21008) +* [youtube] Fix channel id extraction (#20982, #21003) ++ [sky] Add support for news.sky.com (#13055) ++ [youtube:entrylistbase] Retry on 5xx HTTP errors (#20965) ++ [francetvinfo] Extend video id extraction (#20619, #20740) +* [4tube] Update token hosts (#20918) +* [hotstar] Move to API v2 (#20931) +* [fox] Fix API error handling under python 2 (#20925) ++ [redbulltv] Extend URL regular expression (#20922) + + +version 2019.04.30 + +Extractors +* [openload] Use real Chrome versions (#20902) +- [youtube] Remove info el for get_video_info request +* [youtube] Improve extraction robustness +- [dramafever] Remove extractor (#20868) +* [adn] Fix subtitle extraction (#12724) ++ [ccc] Extract creator (#20355) ++ [ccc:playlist] Add support for media.ccc.de playlists (#14601, #20355) ++ [sverigesradio] Add support for sverigesradio.se (#18635) ++ [cinemax] Add support for cinemax.com +* [sixplay] Try extracting non-DRM protected manifests (#20849) ++ [youtube] Extract Youtube Music Auto-generated metadata (#20599, #20742) +- [wrzuta] Remove extractor (#20684, #20801) +* [twitch] Prefer source format (#20850) ++ [twitcasting] Add support for private videos (#20843) +* [reddit] Validate thumbnail URL (#20030) +* [yandexmusic] Fix track URL extraction (#20820) + + +version 2019.04.24 + +Extractors +* [youtube] Fix extraction (#20758, #20759, #20761, #20762, #20764, #20766, + #20767, #20769, #20771, #20768, #20770) +* [toutv] Fix extraction and extract series info (#20757) ++ [vrv] Add support for movie listings (#19229) ++ [youtube] Print error when no data is available (#20737) ++ [soundcloud] Add support for new rendition and improve extraction (#20699) ++ [ooyala] Add support for geo verification proxy ++ [nrl] Add support for nrl.com (#15991) ++ [vimeo] Extract live archive source format (#19144) ++ [vimeo] Add support for live streams and improve info extraction (#19144) ++ [ntvcojp] Add support for cu.ntv.co.jp ++ [nhk] Extract RTMPT format ++ [nhk] Add support for audio URLs ++ [udemy] Add another course id extraction pattern (#20491) ++ [openload] Add support for oload.services (#20691) ++ [openload] Add support for openloed.co (#20691, #20693) +* [bravotv] Fix extraction (#19213) + + +version 2019.04.17 + +Extractors +* [openload] Randomize User-Agent (closes #20688) ++ [openload] Add support for oladblock domains (#20471) +* [adn] Fix subtitle extraction (#12724) ++ [aol] Add support for localized websites ++ [yahoo] Add support GYAO episode URLs ++ [yahoo] Add support for streaming.yahoo.co.jp (#5811, #7098) ++ [yahoo] Add support for gyao.yahoo.co.jp +* [aenetworks] Fix history topic extraction and extract more formats ++ [cbs] Extract smpte and vtt subtitles ++ [streamango] Add support for streamcherry.com (#20592) ++ [yourporn] Add support for sxyprn.com (#20646) +* [mgtv] Fix extraction (#20650) +* [linkedin:learning] Use urljoin for form action URL (#20431) ++ [gdc] Add support for kaltura embeds (#20575) +* [dispeak] Improve mp4 bitrate extraction +* [kaltura] Sanitize embed URLs +* [jwplatfom] Do not match manifest URLs (#20596) +* [aol] Restrict URL regular expression and improve format extraction ++ [tiktok] Add support for new URL schema (#20573) ++ [stv:player] Add support for player.stv.tv (#20586) + + +version 2019.04.07 + +Core ++ [downloader/external] Pass rtmp_conn to ffmpeg + +Extractors ++ [ruutu] Add support for audio podcasts (#20473, #20545) ++ [xvideos] Extract all thumbnails (#20432) ++ [platzi] Add support for platzi.com (#20562) +* [dvtv] Fix extraction (#18514, #19174) ++ [vrv] Add basic support for individual movie links (#19229) ++ [bfi:player] Add support for player.bfi.org.uk (#19235) +* [hbo] Fix extraction and extract subtitles (#14629, #13709) +* [youtube] Extract srv[1-3] subtitle formats (#20566) +* [adultswim] Fix extraction (#18025) +* [teamcoco] Fix extraction and add suport for subdomains (#17099, #20339) +* [adn] Fix subtitle compatibility with ffmpeg +* [adn] Fix extraction and add support for positioning styles (#20549) +* [vk] Use unique video id (#17848) +* [newstube] Fix extraction +* [rtl2] Actualize extraction ++ [adobeconnect] Add support for adobeconnect.com (#20283) ++ [gaia] Add support for authentication (#14605) ++ [mediasite] Add support for dashed ids and named catalogs (#20531) + + +version 2019.04.01 + +Core +* [utils] Improve int_or_none and float_or_none (#20403) +* Check for valid --min-sleep-interval when --max-sleep-interval is specified + (#20435) + +Extractors ++ [weibo] Extend URL regular expression (#20496) ++ [xhamster] Add support for xhamster.one (#20508) ++ [mediasite] Add support for catalogs (#20507) ++ [teamtreehouse] Add support for teamtreehouse.com (#9836) ++ [ina] Add support for audio URLs +* [ina] Improve extraction +* [cwtv] Fix episode number extraction (#20461) +* [npo] Improve DRM detection ++ [pornhub] Add support for DASH formats (#20403) +* [svtplay] Update API endpoint (#20430) + + +version 2019.03.18 + +Core +* [extractor/common] Improve HTML5 entries extraction ++ [utils] Introduce parse_bitrate +* [update] Hide update URLs behind redirect +* [extractor/common] Fix url meta field for unfragmented DASH formats (#20346) + +Extractors ++ [yandexvideo] Add extractor +* [openload] Improve embed detection ++ [corus] Add support for bigbrothercanada.ca (#20357) ++ [orf:radio] Extract series (#20012) ++ [cbc:watch] Add support for gem.cbc.ca (#20251, #20359) +- [anysex] Remove extractor (#19279) ++ [ciscolive] Add support for new URL schema (#20320, #20351) ++ [youtube] Add support for invidiou.sh (#20309) +- [anitube] Remove extractor (#20334) +- [ruleporn] Remove extractor (#15344, #20324) +* [npr] Fix extraction (#10793, #13440) +* [biqle] Fix extraction (#11471, #15313) +* [viddler] Modernize +* [moevideo] Fix extraction +* [primesharetv] Remove extractor +* [hypem] Modernize and extract more metadata (#15320) +* [veoh] Fix extraction +* [escapist] Modernize +- [videomega] Remove extractor (#10108) ++ [beeg] Add support for beeg.porn (#20306) +* [vimeo:review] Improve config url extraction and extract original format + (#20305) +* [fox] Detect geo restriction and authentication errors (#20208) + + +version 2019.03.09 + +Core +* [extractor/common] Use compat_etree_Element ++ [compat] Introduce compat_etree_Element +* [extractor/common] Fallback url to base URL for DASH formats +* [extractor/common] Do not fail on invalid data while parsing F4M manifest + in non fatal mode +* [extractor/common] Return MPD manifest as format's url meta field (#20242) +* [utils] Strip #HttpOnly_ prefix from cookies files (#20219) + +Extractors +* [francetv:site] Relax video id regular expression (#20268) +* [toutv] Detect invalid login error +* [toutv] Fix authentication (#20261) ++ [urplay] Extract timestamp (#20235) ++ [openload] Add support for oload.space (#20246) +* [facebook] Improve uploader extraction (#20250) +* [bbc] Use compat_etree_Element +* [crunchyroll] Use compat_etree_Element +* [npo] Improve ISM extraction +* [rai] Improve extraction (#20253) +* [paramountnetwork] Fix mgid extraction (#20241) +* [libsyn] Improve extraction (#20229) ++ [youtube] Add more invidious instances to URL regular expression (#20228) +* [spankbang] Fix extraction (#20023) +* [espn] Extend URL regular expression (#20013) +* [sixplay] Handle videos with empty assets (#20016) ++ [vimeo] Add support for Vimeo Pro portfolio protected videos (#20070) + + +version 2019.03.01 + +Core ++ [downloader/external] Add support for rate limit and retries for wget +* [downloader/external] Fix infinite retries for curl (#19303) + +Extractors +* [npo] Fix extraction (#20084) +* [francetv:site] Extend video id regex (#20029, #20071) ++ [periscope] Extract width and height (#20015) +* [servus] Fix extraction (#19297) +* [bbccouk] Make subtitles non fatal (#19651) +* [metacafe] Fix family filter bypass (#19287) + + +version 2019.02.18 + +Extractors +* [tvp:website] Fix and improve extraction ++ [tvp] Detect unavailable videos +* [tvp] Fix description extraction and make thumbnail optional ++ [linuxacademy] Add support for linuxacademy.com (#12207) +* [bilibili] Update keys (#19233) +* [udemy] Extend URL regular expressions (#14330, #15883) +* [udemy] Update User-Agent and detect captcha (#14713, #15839, #18126) +* [noovo] Fix extraction (#19230) +* [rai] Relax URL regular expression (#19232) ++ [vshare] Pass Referer to download request (#19205, #19221) ++ [openload] Add support for oload.live (#19222) +* [imgur] Use video id as title fallback (#18590) ++ [twitch] Add new source format detection approach (#19193) +* [tvplayhome] Fix video id extraction (#19190) +* [tvplayhome] Fix episode metadata extraction (#19190) +* [rutube:embed] Fix extraction (#19163) ++ [rutube:embed] Add support private videos (#19163) ++ [soundcloud] Extract more metadata ++ [trunews] Add support for trunews.com (#19153) ++ [linkedin:learning] Extract chapter_number and chapter_id (#19162) + + +version 2019.02.08 + +Core +* [utils] Improve JSON-LD regular expression (#18058) +* [YoutubeDL] Fallback to ie_key of matching extractor while making + download archive id when no explicit ie_key is provided (#19022) + +Extractors ++ [malltv] Add support for mall.tv (#18058, #17856) ++ [spankbang:playlist] Add support for playlists (#19145) +* [spankbang] Extend URL regular expression +* [trutv] Fix extraction (#17336) +* [toutv] Fix authentication (#16398, #18700) +* [pornhub] Fix tags and categories extraction (#13720, #19135) +* [pornhd] Fix formats extraction ++ [pornhd] Extract like count (#19123, #19125) +* [radiocanada] Switch to the new media requests (#19115) ++ [teachable] Add support for courses.workitdaily.com (#18871) +- [vporn] Remove extractor (#16276) ++ [soundcloud:pagedplaylist] Add ie and title to entries (#19022, #19086) ++ [drtuber] Extract duration (#19078) +* [soundcloud] Fix paged playlists extraction, add support for albums and update client id +* [soundcloud] Update client id +* [drtv] Improve preference (#19079) ++ [openload] Add support for openload.pw and oload.pw (#18930) ++ [openload] Add support for oload.info (#19073) +* [crackle] Authorize media detail request (#16931) + + +version 2019.01.30.1 + +Core +* [postprocessor/ffmpeg] Fix avconv processing broken in #19025 (#19067) + + +version 2019.01.30 + +Core +* [postprocessor/ffmpeg] Do not copy Apple TV chapter tracks while embedding + subtitles (#19024, #19042) +* [postprocessor/ffmpeg] Disable "Last message repeated" messages (#19025) + +Extractors +* [yourporn] Fix extraction and extract duration (#18815, #18852, #19061) +* [drtv] Improve extraction (#19039) + + Add support for EncryptedUri videos + + Extract more metadata + * Fix subtitles extraction ++ [fox] Add support for locked videos using cookies (#19060) +* [fox] Fix extraction for free videos (#19060) ++ [zattoo] Add support for tv.salt.ch (#19059) + + +version 2019.01.27 + +Core ++ [extractor/common] Extract season in _json_ld +* [postprocessor/ffmpeg] Fallback to ffmpeg/avconv for audio codec detection + (#681) + +Extractors +* [vice] Fix extraction for locked videos (#16248) ++ [wakanim] Detect DRM protected videos ++ [wakanim] Add support for wakanim.tv (#14374) +* [usatoday] Fix extraction for videos with custom brightcove partner id + (#18990) +* [drtv] Fix extraction (#18989) +* [nhk] Extend URL regular expression (#18968) +* [go] Fix Adobe Pass requests for Disney Now (#18901) ++ [openload] Add support for oload.club (#18969) + + +version 2019.01.24 + +Core +* [YoutubeDL] Fix negation for string operators in format selection (#18961) + + +version 2019.01.23 + +Core +* [utils] Fix urljoin for paths with non-http(s) schemes +* [extractor/common] Improve jwplayer relative URL handling (#18892) ++ [YoutubeDL] Add negation support for string comparisons in format selection + expressions (#18600, #18805) +* [extractor/common] Improve HLS video-only format detection (#18923) + +Extractors +* [crunchyroll] Extend URL regular expression (#18955) +* [pornhub] Bypass scrape detection (#4822, #5930, #7074, #10175, #12722, + #17197, #18338 #18842, #18899) ++ [vrv] Add support for authentication (#14307) +* [videomore:season] Fix extraction +* [videomore] Improve extraction (#18908) ++ [tnaflix] Pass Referer in metadata request (#18925) +* [radiocanada] Relax DRM check (#18608, #18609) +* [vimeo] Fix video password verification for videos protected by + Referer HTTP header ++ [hketv] Add support for hkedcity.net (#18696) ++ [streamango] Add support for fruithosts.net (#18710) ++ [instagram] Add support for tags (#18757) ++ [odnoklassniki] Detect paid videos (#18876) +* [ted] Correct acodec for HTTP formats (#18923) +* [cartoonnetwork] Fix extraction (#15664, #17224) +* [vimeo] Fix extraction for password protected player URLs (#18889) + + +version 2019.01.17 + +Extractors +* [youtube] Extend JS player signature function name regular expressions + (#18890, #18891, #18893) + + +version 2019.01.16 + +Core ++ [test/helper] Add support for maxcount and count collection len checkers +* [downloader/hls] Fix uplynk ad skipping (#18824) +* [postprocessor/ffmpeg] Improve ffmpeg version parsing (#18813) + +Extractors +* [youtube] Skip unsupported adaptive stream type (#18804) ++ [youtube] Extract DASH formats from player response (#18804) +* [funimation] Fix extraction (#14089) +* [skylinewebcams] Fix extraction (#18853) ++ [curiositystream] Add support for non app URLs ++ [bitchute] Check formats (#18833) +* [wistia] Extend URL regular expression (#18823) ++ [playplustv] Add support for playplus.com (#18789) + + +version 2019.01.10 + +Core +* [extractor/common] Use episode name as title in _json_ld ++ [extractor/common] Add support for movies in _json_ld +* [postprocessor/ffmpeg] Embed subtitles with non-standard language codes + (#18765) ++ [utils] Add language codes replaced in 1989 revision of ISO 639 + to ISO639Utils (#18765) + +Extractors +* [youtube] Extract live HLS URL from player response (#18799) ++ [outsidetv] Add support for outsidetv.com (#18774) +* [jwplatform] Use JW Platform Delivery API V2 and add support for more URLs ++ [fox] Add support National Geographic (#17985, #15333, #14698) ++ [playplustv] Add support for playplus.tv (#18789) +* [globo] Set GLBID cookie manually (#17346) ++ [gaia] Add support for gaia.com (#14605) +* [youporn] Fix title and description extraction (#18748) ++ [hungama] Add support for hungama.com (#17402, #18771) +* [dtube] Fix extraction (#18741) +* [tvnow] Fix and rework extractors and prepare for a switch to the new API + (#17245, #18499) +* [carambatv:page] Fix extraction (#18739) + + +version 2019.01.02 + +Extractors +* [discovery] Use geo verification headers (#17838) ++ [packtpub] Add support for subscription.packtpub.com (#18718) +* [yourporn] Fix extraction (#18583) ++ [acast:channel] Add support for play.acast.com (#18587) ++ [extractors] Add missing age limits (#18621) ++ [rmcdecouverte] Add support for live stream +* [rmcdecouverte] Bypass geo restriction +* [rmcdecouverte] Update URL regular expression (#18595, 18697) +* [manyvids] Fix extraction (#18604, #18614) +* [bitchute] Fix extraction (#18567) + + +version 2018.12.31 + +Extractors ++ [bbc] Add support for another embed pattern (#18643) ++ [npo:live] Add support for npostart.nl (#18644) +* [beeg] Fix extraction (#18610, #18626) +* [youtube] Unescape HTML for series (#18641) ++ [youtube] Extract more format metadata +* [youtube] Detect DRM protected videos (#1774) +* [youtube] Relax HTML5 player regular expressions (#18465, #18466) +* [youtube] Extend HTML5 player regular expression (#17516) ++ [liveleak] Add support for another embed type and restore original + format extraction ++ [crackle] Extract ISM and HTTP formats ++ [twitter] Pass Referer with card request (#18579) +* [mediasite] Extend URL regular expression (#18558) ++ [lecturio] Add support for lecturio.de (#18562) ++ [discovery] Add support for Scripps Networks watch domains (#17947) + + +version 2018.12.17 + +Extractors +* [ard:beta] Improve geo restricted videos extraction +* [ard:beta] Fix subtitles extraction +* [ard:beta] Improve extraction robustness +* [ard:beta] Relax URL regular expression (#18441) +* [acast] Add support for embed.acast.com and play.acast.com (#18483) +* [iprima] Relax URL regular expression (#18515, #18540) +* [vrv] Fix initial state extraction (#18553) +* [youtube] Fix mark watched (#18546) ++ [safari] Add support for learning.oreilly.com (#18510) +* [youtube] Fix multifeed extraction (#18531) +* [lecturio] Improve subtitles extraction (#18488) +* [uol] Fix format URL extraction (#18480) ++ [ard:mediathek] Add support for classic.ardmediathek.de (#18473) + + +version 2018.12.09 + +Core +* [YoutubeDL] Keep session cookies in cookie file between runs +* [YoutubeDL] Recognize session cookies with expired set to 0 (#12929) + +Extractors ++ [teachable] Add support for teachable platform sites (#5451, #18150, #18272) ++ [aenetworks] Add support for historyvault.com (#18460) +* [imgur] Improve gallery and album detection and extraction (#9133, #16577, + #17223, #18404) +* [iprima] Relax URL regular expression (#18453) +* [hotstar] Fix video data extraction (#18386) +* [ard:mediathek] Fix title and description extraction (#18349, #18371) +* [xvideos] Switch to HTTPS (#18422, #18427) ++ [lecturio] Add support for lecturio.com (#18405) ++ [nrktv:series] Add support for extra materials +* [nrktv:season,series] Fix extraction (#17159, #17258) +* [nrktv] Relax URL regular expression (#18304, #18387) +* [yourporn] Fix extraction (#18424, #18425) +* [tbs] Fix info extraction (#18403) ++ [gamespot] Add support for review URLs + + +version 2018.12.03 + +Core +* [utils] Fix random_birthday to generate existing dates only (#18284) + +Extractors ++ [tiktok] Add support for tiktok.com (#18108, #18135) +* [pornhub] Use actual URL host for requests (#18359) +* [lynda] Fix authentication (#18158, #18217) +* [gfycat] Update API endpoint (#18333, #18343) ++ [hotstar] Add support for alternative app state layout (#18320) +* [azmedien] Fix extraction (#18334, #18336) ++ [vimeo] Add support for VHX (Vimeo OTT) (#14835) +* [joj] Fix extraction (#18280, #18281) ++ [wistia] Add support for fast.wistia.com (#18287) + + +version 2018.11.23 + +Core ++ [setup.py] Add more relevant classifiers + +Extractors +* [mixcloud] Fallback to hardcoded decryption key (#18016) +* [nbc:news] Fix article extraction (#16194) +* [foxsports] Fix extraction (#17543) +* [loc] Relax regular expression and improve formats extraction ++ [ciscolive] Add support for ciscolive.cisco.com (#17984) +* [nzz] Relax kaltura regex (#18228) +* [sixplay] Fix formats extraction +* [bitchute] Improve title extraction +* [kaltura] Limit requested MediaEntry fields ++ [americastestkitchen] Add support for zype embeds (#18225) ++ [pornhub] Add pornhub.net alias +* [nova:embed] Fix extraction (#18222) + + +version 2018.11.18 + +Extractors ++ [wwe] Extract subtitles ++ [wwe] Add support for playlistst (#14781) ++ [wwe] Add support for wwe.com (#14781, #17450) +* [vk] Detect geo restriction (#17767) +* [openload] Use original host during extraction (#18211) +* [atvat] Fix extraction (#18041) ++ [rte] Add support for new API endpoint (#18206) +* [tnaflixnetwork:embed] Fix extraction (#18205) +* [picarto] Use API and add token support (#16518) ++ [zype] Add support for player.zype.com (#18143) +* [vivo] Fix extraction (#18139) +* [ruutu] Update API endpoint (#18138) + + +version 2018.11.07 + +Extractors ++ [youtube] Add another JS signature function name regex (#18091, #18093, + #18094) +* [facebook] Fix tahoe request (#17171) +* [cliphunter] Fix extraction (#18083) ++ [youtube:playlist] Add support for invidio.us (#18077) +* [zattoo] Arrange API hosts for derived extractors (#18035) ++ [youtube] Add fallback metadata extraction from videoDetails (#18052) + + +version 2018.11.03 + +Core +* [extractor/common] Ensure response handle is not prematurely closed before + it can be read if it matches expected_status (#17195, #17846, #17447) + +Extractors +* [laola1tv:embed] Set correct stream access URL scheme (#16341) ++ [ehftv] Add support for ehftv.com (#15408) +* [azmedien] Adopt to major site redesign (#17745, #17746) ++ [twitcasting] Add support for twitcasting.tv (#17981) +* [orf:tvthek] Fix extraction (#17737, #17956, #18024) ++ [openload] Add support for oload.fun (#18045) +* [njpwworld] Fix authentication (#17427) ++ [linkedin:learning] Add support for linkedin.com/learning (#13545) +* [theplatform] Improve error detection (#13222) +* [cnbc] Simplify extraction (#14280, #17110) ++ [cbnc] Add support for new URL schema (#14193) +* [aparat] Improve extraction and extract more metadata (#17445, #18008) +* [aparat] Fix extraction + + +version 2018.10.29 + +Core ++ [extractor/common] Add validation for JSON-LD URLs + +Extractors ++ [sportbox] Add support for matchtv.ru +* [sportbox] Fix extraction (#17978) +* [screencast] Fix extraction (#14590, #14617, #17990) ++ [openload] Add support for oload.icu ++ [ivi] Add support for ivi.tv +* [crunchyroll] Improve extraction failsafeness (#17991) +* [dailymail] Fix formats extraction (#17976) +* [viewster] Reduce format requests +* [cwtv] Handle API errors (#17905) ++ [rutube] Use geo verification headers (#17897) ++ [brightcove:legacy] Add fallbacks to brightcove:new (#13912) +- [tv3] Remove extractor (#10461, #15339) +* [ted] Fix extraction for HTTP and RTMP formats (#5941, #17572, #17894) ++ [openload] Add support for oload.cc (#17823) ++ [patreon] Extract post_file URL (#17792) +* [patreon] Fix extraction (#14502, #10471) + + +version 2018.10.05 + +Extractors +* [pluralsight] Improve authentication (#17762) +* [dailymotion] Fix extraction (#17699) +* [crunchyroll] Switch to HTTPS for RpcApi (#17749) ++ [philharmoniedeparis] Add support for pad.philharmoniedeparis.fr (#17705) +* [philharmoniedeparis] Fix extraction (#17705) ++ [jamendo] Add support for licensing.jamendo.com (#17724) ++ [openload] Add support for oload.cloud (#17710) +* [pluralsight] Fix subtitles extraction (#17726, #17728) ++ [vimeo] Add another config regular expression (#17690) +* [spike] Fix Paramount Network extraction (#17677) +* [hotstar] Fix extraction (#14694, #14931, #17637) + + version 2018.09.26 Extractors diff --git a/Makefile b/Makefile index 4a62f44bc..3e17365b8 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites clean: - rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part* *.ytdl *.info.json *.mp4 *.m4a *.flv *.mp3 *.avi *.mkv *.webm *.3gp *.wav *.ape *.swf *.jpg *.png CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe + rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part* *.ytdl *.info.json *.mp4 *.m4a *.flv *.mp3 *.avi *.mkv *.webm *.3gp *.wav *.ape *.swf *.jpg *.png CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe find . -name "*.pyc" -delete find . -name "*.class" -delete @@ -78,8 +78,12 @@ README.md: youtube_dl/*.py youtube_dl/*/*.py CONTRIBUTING.md: README.md $(PYTHON) devscripts/make_contributing.py README.md CONTRIBUTING.md -.github/ISSUE_TEMPLATE.md: devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl.md youtube_dl/version.py - $(PYTHON) devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl.md .github/ISSUE_TEMPLATE.md +issuetemplates: devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl/1_broken_site.md .github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md .github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md .github/ISSUE_TEMPLATE_tmpl/4_bug_report.md .github/ISSUE_TEMPLATE_tmpl/5_feature_request.md youtube_dl/version.py + $(PYTHON) devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl/1_broken_site.md .github/ISSUE_TEMPLATE/1_broken_site.md + $(PYTHON) devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md .github/ISSUE_TEMPLATE/2_site_support_request.md + $(PYTHON) devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md .github/ISSUE_TEMPLATE/3_site_feature_request.md + $(PYTHON) devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl/4_bug_report.md .github/ISSUE_TEMPLATE/4_bug_report.md + $(PYTHON) devscripts/make_issue_template.py .github/ISSUE_TEMPLATE_tmpl/5_feature_request.md .github/ISSUE_TEMPLATE/5_feature_request.md supportedsites: $(PYTHON) devscripts/make_supportedsites.py docs/supportedsites.md diff --git a/README.md b/README.md index fdd115c9b..c39b13616 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![Build Status](https://travis-ci.org/rg3/youtube-dl.svg?branch=master)](https://travis-ci.org/rg3/youtube-dl) +[![Build Status](https://travis-ci.org/ytdl-org/youtube-dl.svg?branch=master)](https://travis-ci.org/ytdl-org/youtube-dl) youtube-dl - download videos from youtube.com or other video platforms @@ -43,7 +43,7 @@ Or with [MacPorts](https://www.macports.org/): sudo port install youtube-dl -Alternatively, refer to the [developer instructions](#developer-instructions) for how to check out and work with the git repository. For further options, including PGP signatures, see the [youtube-dl Download Page](https://rg3.github.io/youtube-dl/download.html). +Alternatively, refer to the [developer instructions](#developer-instructions) for how to check out and work with the git repository. For further options, including PGP signatures, see the [youtube-dl Download Page](https://ytdl-org.github.io/youtube-dl/download.html). # DESCRIPTION **youtube-dl** is a command-line program to download videos from YouTube.com and a few more sites. It requires the Python interpreter, version 2.6, 2.7, or 3.2+, and it is not platform specific. It should work on your Unix box, on Windows or on macOS. It is released to the public domain, which means you can modify it, redistribute it or use it however you like. @@ -496,7 +496,7 @@ The `-o` option allows users to indicate a template for the output file names. **tl;dr:** [navigate me to examples](#output-template-examples). -The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "https://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [python string formatting operations](https://docs.python.org/2/library/stdtypes.html#string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by a formatting operations. Allowed names along with sequence type are: +The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "https://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [python string formatting operations](https://docs.python.org/2/library/stdtypes.html#string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by formatting operations. Allowed names along with sequence type are: - `id` (string): Video identifier - `title` (string): Video title @@ -642,6 +642,7 @@ The simplest case is requesting a specific format, for example with `-f 22` you You can also use a file extension (currently `3gp`, `aac`, `flv`, `m4a`, `mp3`, `mp4`, `ogg`, `wav`, `webm` are supported) to download the best quality format of a particular file extension served as a single file, e.g. `-f webm` will download the best quality format with the `webm` extension served as a single file. You can also use special names to select particular edge case formats: + - `best`: Select the best quality format represented by a single file with video and audio. - `worst`: Select the worst quality format represented by a single file with video and audio. - `bestvideo`: Select the best quality video-only format (e.g. DASH video). May not be available. @@ -658,6 +659,7 @@ If you want to download several formats of the same video use a comma as a separ You can also filter the video formats by putting a condition in brackets, as in `-f "best[height=720]"` (or `-f "[filesize>10M]"`). The following numeric meta fields can be used with comparisons `<`, `<=`, `>`, `>=`, `=` (equals), `!=` (not equals): + - `filesize`: The number of bytes, if known in advance - `width`: Width of the video, if known - `height`: Height of the video, if known @@ -667,7 +669,8 @@ The following numeric meta fields can be used with comparisons `<`, `<=`, `>`, ` - `asr`: Audio sampling rate in Hertz - `fps`: Frame rate -Also filtering work for comparisons `=` (equals), `!=` (not equals), `^=` (begins with), `$=` (ends with), `*=` (contains) and following string meta fields: +Also filtering work for comparisons `=` (equals), `^=` (starts with), `$=` (ends with), `*=` (contains) and following string meta fields: + - `ext`: File extension - `acodec`: Name of the audio codec in use - `vcodec`: Name of the video codec in use @@ -675,6 +678,8 @@ Also filtering work for comparisons `=` (equals), `!=` (not equals), `^=` (begin - `protocol`: The protocol that will be used for the actual download, lower-case (`http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `mms`, `f4m`, `ism`, `http_dash_segments`, `m3u8`, or `m3u8_native`) - `format_id`: A short description of the format +Any string comparison may be prefixed with negation `!` in order to produce an opposite comparison, e.g. `!*=` (does not contain). + Note that none of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular extractor, i.e. the metadata offered by the video hoster. Formats for which the value is not known are excluded unless you put a question mark (`?`) after the operator. You can combine format filters, so `-f "[height <=? 720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. @@ -683,7 +688,7 @@ You can merge the video and audio of two formats into a single file using `-f \d+)' +``` + +Incorrect: +```python +r'(id|ID)=(?P\d+)' +``` + + +#### Make regular expressions relaxed and flexible + +When using regular expressions try to write them fuzzy, relaxed and flexible, skipping insignificant parts that are more likely to change, allowing both single and double quotes for quoted values and so on. -#### Example +##### Example Say you need to extract `title` from the following HTML code: @@ -1166,13 +1197,121 @@ title = self._search_regex( webpage, 'title', group='title') ``` -### Use safe conversion functions +### Long lines policy -Wrap all extracted numeric data into safe functions from `utils`: `int_or_none`, `float_or_none`. Use them for string to number conversions as well. +There is a soft limit to keep lines of code under 80 characters long. This means it should be respected if possible and if it does not make readability and code maintenance worse. + +For example, you should **never** split long string literals like URLs or some other often copied entities over multiple lines to fit this limit: + +Correct: + +```python +'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4' +``` + +Incorrect: + +```python +'https://www.youtube.com/watch?v=FqZTN594JQw&list=' +'PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4' +``` + +### Inline values + +Extracting variables is acceptable for reducing code duplication and improving readability of complex expressions. However, you should avoid extracting variables used only once and moving them to opposite parts of the extractor file, which makes reading the linear flow difficult. + +#### Example + +Correct: + +```python +title = self._html_search_regex(r'([^<]+)', webpage, 'title') +``` + +Incorrect: + +```python +TITLE_RE = r'([^<]+)' +# ...some lines of code... +title = self._html_search_regex(TITLE_RE, webpage, 'title') +``` + +### Collapse fallbacks + +Multiple fallback values can quickly become unwieldy. Collapse multiple fallback values into a single expression via a list of patterns. + +#### Example + +Good: + +```python +description = self._html_search_meta( + ['og:description', 'description', 'twitter:description'], + webpage, 'description', default=None) +``` + +Unwieldy: + +```python +description = ( + self._og_search_description(webpage, default=None) + or self._html_search_meta('description', webpage, default=None) + or self._html_search_meta('twitter:description', webpage, default=None)) +``` + +Methods supporting list of patterns are: `_search_regex`, `_html_search_regex`, `_og_search_property`, `_html_search_meta`. + +### Trailing parentheses + +Always move trailing parentheses after the last argument. + +#### Example + +Correct: + +```python + lambda x: x['ResultSet']['Result'][0]['VideoUrlSet']['VideoUrl'], + list) +``` + +Incorrect: + +```python + lambda x: x['ResultSet']['Result'][0]['VideoUrlSet']['VideoUrl'], + list, +) +``` + +### Use convenience conversion and parsing functions + +Wrap all extracted numeric data into safe functions from [`youtube_dl/utils.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py): `int_or_none`, `float_or_none`. Use them for string to number conversions as well. + +Use `url_or_none` for safe URL processing. + +Use `try_get` for safe metadata extraction from parsed JSON. + +Use `unified_strdate` for uniform `upload_date` or any `YYYYMMDD` meta field extraction, `unified_timestamp` for uniform `timestamp` extraction, `parse_filesize` for `filesize` extraction, `parse_count` for count meta fields extraction, `parse_resolution`, `parse_duration` for `duration` extraction, `parse_age_limit` for `age_limit` extraction. + +Explore [`youtube_dl/utils.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py) for more useful convenience functions. + +#### More examples + +##### Safely extract optional description from parsed JSON +```python +description = try_get(response, lambda x: x['result']['video'][0]['summary'], compat_str) +``` + +##### Safely extract more optional metadata +```python +video = try_get(response, lambda x: x['result']['video'][0], dict) or {} +description = video.get('summary') +duration = float_or_none(video.get('durationMs'), scale=1000) +view_count = int_or_none(video.get('views')) +``` # EMBEDDING YOUTUBE-DL -youtube-dl makes the best effort to be a good command-line program, and thus should be callable from any programming language. If you encounter any problems parsing its output, feel free to [create a report](https://github.com/rg3/youtube-dl/issues/new). +youtube-dl makes the best effort to be a good command-line program, and thus should be callable from any programming language. If you encounter any problems parsing its output, feel free to [create a report](https://github.com/ytdl-org/youtube-dl/issues/new). From a Python program, you can embed youtube-dl in a more powerful fashion, like this: @@ -1185,7 +1324,7 @@ with youtube_dl.YoutubeDL(ydl_opts) as ydl: ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc']) ``` -Most likely, you'll want to use various options. For a list of options available, have a look at [`youtube_dl/YoutubeDL.py`](https://github.com/rg3/youtube-dl/blob/3e4cedf9e8cd3157df2457df7274d0c842421945/youtube_dl/YoutubeDL.py#L137-L312). For a start, if you want to intercept youtube-dl's output, set a `logger` object. +Most likely, you'll want to use various options. For a list of options available, have a look at [`youtube_dl/YoutubeDL.py`](https://github.com/ytdl-org/youtube-dl/blob/3e4cedf9e8cd3157df2457df7274d0c842421945/youtube_dl/YoutubeDL.py#L137-L312). For a start, if you want to intercept youtube-dl's output, set a `logger` object. Here's a more complete example of a program that outputs only errors (and a short message after the download is finished), and downloads/converts the video to an mp3 file: @@ -1226,7 +1365,7 @@ with youtube_dl.YoutubeDL(ydl_opts) as ydl: # BUGS -Bugs and suggestions should be reported at: . Unless you were prompted to or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel [#youtube-dl](irc://chat.freenode.net/#youtube-dl) on freenode ([webchat](https://webchat.freenode.net/?randomnick=1&channels=youtube-dl)). +Bugs and suggestions should be reported at: . Unless you were prompted to or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel [#youtube-dl](irc://chat.freenode.net/#youtube-dl) on freenode ([webchat](https://webchat.freenode.net/?randomnick=1&channels=youtube-dl)). **Please include the full output of youtube-dl when run with `-v`**, i.e. **add** `-v` flag to **your command line**, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. It should look similar to this: ``` @@ -1272,11 +1411,11 @@ Before reporting any issue, type `youtube-dl -U`. This should report that you're ### Is the issue already documented? -Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or browse the [GitHub Issues](https://github.com/rg3/youtube-dl/search?type=Issues) of this repository. If there is an issue, feel free to write something along the lines of "This affects me as well, with version 2015.01.01. Here is some more information on the issue: ...". While some issues may be old, a new post into them often spurs rapid activity. +Make sure that someone has not already opened the issue you're trying to open. Search at the top of the window or browse the [GitHub Issues](https://github.com/ytdl-org/youtube-dl/search?type=Issues) of this repository. If there is an issue, feel free to write something along the lines of "This affects me as well, with version 2015.01.01. Here is some more information on the issue: ...". While some issues may be old, a new post into them often spurs rapid activity. ### Why are existing options not enough? -Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/rg3/youtube-dl/blob/master/README.md#options). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem. +Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/ytdl-org/youtube-dl/blob/master/README.md#options). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem. ### Is there enough context in your bug report? diff --git a/devscripts/buildserver.py b/devscripts/buildserver.py index 1344b4d87..4a4295ba9 100644 --- a/devscripts/buildserver.py +++ b/devscripts/buildserver.py @@ -322,7 +322,7 @@ class GITBuilder(GITInfoBuilder): class YoutubeDLBuilder(object): - authorizedUsers = ['fraca7', 'phihag', 'rg3', 'FiloSottile'] + authorizedUsers = ['fraca7', 'phihag', 'rg3', 'FiloSottile', 'ytdl-org'] def __init__(self, **kwargs): if self.repoName != 'youtube-dl': diff --git a/devscripts/check-porn.py b/devscripts/check-porn.py index 72b2ee422..740f04de0 100644 --- a/devscripts/check-porn.py +++ b/devscripts/check-porn.py @@ -45,12 +45,12 @@ for test in gettestcases(): RESULT = ('.' + domain + '\n' in LIST or '\n' + domain + '\n' in LIST) - if RESULT and ('info_dict' not in test or 'age_limit' not in test['info_dict'] or - test['info_dict']['age_limit'] != 18): + if RESULT and ('info_dict' not in test or 'age_limit' not in test['info_dict'] + or test['info_dict']['age_limit'] != 18): print('\nPotential missing age_limit check: {0}'.format(test['name'])) - elif not RESULT and ('info_dict' in test and 'age_limit' in test['info_dict'] and - test['info_dict']['age_limit'] == 18): + elif not RESULT and ('info_dict' in test and 'age_limit' in test['info_dict'] + and test['info_dict']['age_limit'] == 18): print('\nPotential false negative: {0}'.format(test['name'])) else: diff --git a/devscripts/create-github-release.py b/devscripts/create-github-release.py index 30716ad8e..428111b3f 100644 --- a/devscripts/create-github-release.py +++ b/devscripts/create-github-release.py @@ -27,8 +27,8 @@ from youtube_dl.utils import ( class GitHubReleaser(object): - _API_URL = 'https://api.github.com/repos/rg3/youtube-dl/releases' - _UPLOADS_URL = 'https://uploads.github.com/repos/rg3/youtube-dl/releases/%s/assets?name=%s' + _API_URL = 'https://api.github.com/repos/ytdl-org/youtube-dl/releases' + _UPLOADS_URL = 'https://uploads.github.com/repos/ytdl-org/youtube-dl/releases/%s/assets?name=%s' _NETRC_MACHINE = 'github.com' def __init__(self, debuglevel=0): diff --git a/devscripts/gh-pages/update-feed.py b/devscripts/gh-pages/update-feed.py index e93eb60fb..506a62377 100755 --- a/devscripts/gh-pages/update-feed.py +++ b/devscripts/gh-pages/update-feed.py @@ -10,7 +10,7 @@ import textwrap atom_template = textwrap.dedent("""\ - + youtube-dl releases https://yt-dl.org/feed/youtube-dl-updates-feed @TIMESTAMP@ @@ -21,7 +21,7 @@ entry_template = textwrap.dedent(""" https://yt-dl.org/feed/youtube-dl-updates-feed/youtube-dl-@VERSION@ New version @VERSION@ - +
Downloads available at https://yt-dl.org/downloads/@VERSION@/ diff --git a/devscripts/release.sh b/devscripts/release.sh index 4db5def5d..f2411c927 100755 --- a/devscripts/release.sh +++ b/devscripts/release.sh @@ -78,8 +78,8 @@ sed -i "s/__version__ = '.*'/__version__ = '$version'/" youtube_dl/version.py sed -i "s//$version/" ChangeLog /bin/echo -e "\n### Committing documentation, templates and youtube_dl/version.py..." -make README.md CONTRIBUTING.md .github/ISSUE_TEMPLATE.md supportedsites -git add README.md CONTRIBUTING.md .github/ISSUE_TEMPLATE.md docs/supportedsites.md youtube_dl/version.py ChangeLog +make README.md CONTRIBUTING.md issuetemplates supportedsites +git add README.md CONTRIBUTING.md .github/ISSUE_TEMPLATE/1_broken_site.md .github/ISSUE_TEMPLATE/2_site_support_request.md .github/ISSUE_TEMPLATE/3_site_feature_request.md .github/ISSUE_TEMPLATE/4_bug_report.md .github/ISSUE_TEMPLATE/5_feature_request.md .github/ISSUE_TEMPLATE/6_question.md docs/supportedsites.md youtube_dl/version.py ChangeLog git commit $gpg_sign_commits -m "release $version" /bin/echo -e "\n### Now tagging, signing and pushing..." @@ -96,7 +96,7 @@ git push origin "$version" REV=$(git rev-parse HEAD) make youtube-dl youtube-dl.tar.gz read -p "VM running? (y/n) " -n 1 -wget "http://$buildserver/build/rg3/youtube-dl/youtube-dl.exe?rev=$REV" -O youtube-dl.exe +wget "http://$buildserver/build/ytdl-org/youtube-dl/youtube-dl.exe?rev=$REV" -O youtube-dl.exe mkdir -p "build/$version" mv youtube-dl youtube-dl.exe "build/$version" mv youtube-dl.tar.gz "build/$version/youtube-dl-$version.tar.gz" diff --git a/devscripts/show-downloads-statistics.py b/devscripts/show-downloads-statistics.py index e25d28411..6c8d1cc2d 100644 --- a/devscripts/show-downloads-statistics.py +++ b/devscripts/show-downloads-statistics.py @@ -24,7 +24,7 @@ total_bytes = 0 for page in itertools.count(1): releases = json.loads(compat_urllib_request.urlopen( - 'https://api.github.com/repos/rg3/youtube-dl/releases?page=%s' % page + 'https://api.github.com/repos/ytdl-org/youtube-dl/releases?page=%s' % page ).read().decode('utf-8')) if not releases: diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 736ab6da7..18bddc138 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -28,12 +28,13 @@ - **acast:channel** - **AddAnime** - **ADN**: Anime Digital Network + - **AdobeConnect** - **AdobeTV** - **AdobeTVChannel** - **AdobeTVShow** - **AdobeTVVideo** - **AdultSwim** - - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network + - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault - **afreecatv**: afreecatv.com - **AirMozilla** - **AliExpressLive** @@ -44,9 +45,8 @@ - **AmericasTestKitchen** - **anderetijden**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **AnimeOnDemand** - - **anitube.se** - **Anvato** - - **AnySex** + - **aol.com** - **APA** - **Aparat** - **AppleConnect** @@ -58,16 +58,8 @@ - **ARD:mediathek** - **ARDBetaMediathek** - **Arkena** - - **arte.tv** - **arte.tv:+7** - - **arte.tv:cinema** - - **arte.tv:concert** - - **arte.tv:creative** - - **arte.tv:ddc** - **arte.tv:embed** - - **arte.tv:future** - - **arte.tv:info** - - **arte.tv:magazine** - **arte.tv:playlist** - **AsianCrush** - **AsianCrushPlaylist** @@ -78,14 +70,11 @@ - **AudioBoom** - **audiomack** - **audiomack:album** - - **auroravid**: AuroraVid - **AWAAN** - **awaan:live** - **awaan:season** - **awaan:video** - **AZMedien**: AZ Medien videos - - **AZMedienPlaylist**: AZ Medien playlists - - **AZMedienShowPlaylist**: AZ Medien show playlists - **BaiduVideo**: 百度视频 - **bambuser** - **bambuser:channel** @@ -105,6 +94,7 @@ - **Bellator** - **BellMedia** - **Bet** + - **bfi:player** - **Bigflix** - **Bild**: Bild.de - **BiliBili** @@ -151,6 +141,7 @@ - **CBSInteractive** - **CBSLocal** - **cbsnews**: CBS News + - **cbsnews:embed** - **cbsnews:livevideo**: CBS News Live Videos - **CBSSports** - **CCMA** @@ -165,6 +156,9 @@ - **chirbit** - **chirbit:profile** - **Cinchcast** + - **Cinemax** + - **CiscoLiveSearch** + - **CiscoLiveSession** - **CJSW** - **cliphunter** - **Clippit** @@ -172,12 +166,12 @@ - **Clipsyndicate** - **CloserToTruth** - **CloudflareStream** - - **cloudtime**: CloudTime - **Cloudy** - **Clubic** - **Clyp** - **cmt.com** - **CNBC** + - **CNBCVideo** - **CNN** - **CNNArticle** - **CNNBlogs** @@ -191,7 +185,6 @@ - **Coub** - **Cracked** - **Crackle** - - **Criterion** - **CrooksAndLiars** - **crunchyroll** - **crunchyroll:playlist** @@ -199,6 +192,7 @@ - **CSpan**: C-SPAN - **CtsNews**: 華視新聞 - **CTVNews** + - **cu.ntv.co.jp**: Nippon Television Network - **Culturebox** - **CultureUnplugged** - **curiositystream** @@ -229,13 +223,13 @@ - **DiscoveryNetworksDe** - **DiscoveryVR** - **Disney** + - **dlive:stream** + - **dlive:vod** - **Dotsub** - **DouyuShow** - **DouyuTV**: 斗鱼 - **DPlay** - **DPlayIt** - - **dramafever** - - **dramafever:series** - **DRBonanza** - **Dropbox** - **DrTuber** @@ -251,6 +245,7 @@ - **EchoMsk** - **egghead:course**: egghead.io course - **egghead:lesson**: egghead.io lesson + - **ehftv** - **eHow** - **EinsUndEinsTV** - **Einthusan** @@ -312,12 +307,11 @@ - **FrontendMastersCourse** - **FrontendMastersLesson** - **Funimation** - - **FunkChannel** - - **FunkMix** - - **FunnyOrDie** + - **Funk** - **Fusion** - **Fux** - **FXNetworks** + - **Gaia** - **GameInformer** - **GameOne** - **gameone:playlist** @@ -344,7 +338,6 @@ - **Groupon** - **Hark** - **hbo** - - **hbo:episode** - **HearThisAt** - **Heise** - **HellPorno** @@ -358,9 +351,10 @@ - **hitbox** - **hitbox:live** - **HitRecord** + - **hketv**: 香港教育局教育電視 (HKETV) Educational Television, Hong Kong Educational Bureau - **HornBunny** - **HotNewHipHop** - - **HotStar** + - **hotstar** - **hotstar:playlist** - **Howcast** - **HowStuffWorks** @@ -368,18 +362,22 @@ - **HRTiPlaylist** - **Huajiao**: 花椒直播 - **HuffPost**: Huffington Post + - **Hungama** + - **HungamaSong** - **Hypem** - **Iconosquare** - **ign.com** - **imdb**: Internet Movie Database trailers - **imdb:list**: Internet Movie Database lists - **Imgur** - - **ImgurAlbum** + - **imgur:album** + - **imgur:gallery** - **Ina** - **Inc** - **IndavideoEmbed** - **InfoQ** - **Instagram** + - **instagram:tag**: Instagram hashtag search - **instagram:user**: Instagram user profile - **Internazionale** - **InternetVideoArchive** @@ -433,6 +431,9 @@ - **Le**: 乐视网 - **Learnr** - **Lecture2Go** + - **Lecturio** + - **LecturioCourse** + - **LecturioDeCourse** - **LEGO** - **Lemonde** - **Lenta** @@ -445,7 +446,11 @@ - **limelight:channel** - **limelight:channel_list** - **LineTV** + - **linkedin:learning** + - **linkedin:learning:course** + - **LinuxAcademy** - **LiTV** + - **LiveJournal** - **LiveLeak** - **LiveLeakEmbed** - **livestream** @@ -463,6 +468,7 @@ - **mailru:music**: Музыка@Mail.Ru - **mailru:music:search**: Музыка@Mail.Ru - **MakerTV** + - **MallTV** - **mangomolo:live** - **mangomolo:video** - **ManyVids** @@ -472,9 +478,12 @@ - **MatchTV** - **MDR**: MDR.DE and KiKA - **media.ccc.de** + - **media.ccc.de:lists** - **Medialaan** - **Mediaset** - **Mediasite** + - **MediasiteCatalog** + - **MediasiteNamedCatalog** - **Medici** - **megaphone.fm**: megaphone.fm embedded players - **Meipai**: 美拍 @@ -532,9 +541,8 @@ - **MyviEmbed** - **MyVisionTV** - **n-tv.de** - - **natgeo** - - **natgeo:episodeguide** - **natgeo:video** + - **NationalGeographicTV** - **Naver** - **NBA** - **NBC** @@ -566,7 +574,6 @@ - **NextTV**: 壹電視 - **Nexx** - **NexxEmbed** - - **nfb**: National Film Board of Canada - **nfl.com** - **NhkVod** - **nhl.com** @@ -592,7 +599,6 @@ - **nowness** - **nowness:playlist** - **nowness:series** - - **nowvideo**: NowVideo - **Noz** - **npo**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **npo.nl:live** @@ -608,6 +614,7 @@ - **NRKTVEpisodes** - **NRKTVSeason** - **NRKTVSeries** + - **NRLTV** - **ntv.ru** - **Nuvid** - **NYTimes** @@ -617,7 +624,6 @@ - **OdaTV** - **Odnoklassniki** - **OktoberfestTV** - - **on.aol.com** - **OnDemandKorea** - **onet.pl** - **onet.tv** @@ -634,6 +640,7 @@ - **orf:oe1**: Radio Österreich 1 - **orf:tvthek**: ORF TVthek - **OsnatelTV** + - **OutsideTV** - **PacktPub** - **PacktPubCourse** - **PandaTV**: 熊猫TV @@ -657,7 +664,10 @@ - **Piksel** - **Pinkbike** - **Pladform** + - **Platzi** + - **PlatziCourse** - **play.fm** + - **PlayPlusTV** - **PlaysTV** - **Playtvak**: Playtvak.cz, iDNES.cz and Lidovky.cz - **Playvid** @@ -672,17 +682,16 @@ - **PopcornTV** - **PornCom** - **PornerBros** - - **PornFlip** - **PornHd** - **PornHub**: PornHub and Thumbzilla - - **PornHubPlaylist** - - **PornHubUserVideos** + - **PornHubPagedVideoList** + - **PornHubUser** + - **PornHubUserVideosUpload** - **Pornotube** - **PornoVoisines** - **PornoXO** - **PornTube** - **PressTV** - - **PrimeShareTV** - **PromptFile** - **prosiebensat1**: ProSiebenSat.1 Digital - **puhutv** @@ -702,7 +711,7 @@ - **radio.de** - **radiobremen** - **radiocanada** - - **RadioCanadaAudioVideo** + - **radiocanada:audiovideo** - **radiofrance** - **RadioJavan** - **Rai** @@ -714,6 +723,7 @@ - **RBMARadio** - **RDS**: RDS.ca - **RedBullTV** + - **RedBullTVRrnContent** - **Reddit** - **RedditR** - **RedTube** @@ -747,9 +757,7 @@ - **rtve.es:television** - **RTVNH** - **RTVS** - - **Rudo** - **RUHD** - - **RulePorn** - **rutube**: Rutube videos - **rutube:channel**: Rutube channels - **rutube:embed**: Rutube embedded videos @@ -763,6 +771,7 @@ - **safari:api** - **safari:course**: safaribooksonline.com online courses - **SAKTV** + - **SaltTV** - **Sapo**: SAPO Vídeos - **savefrom.net** - **SBS**: sbs.com.au @@ -785,6 +794,7 @@ - **ShowRoomLive** - **Sina** - **SkylineWebcams** + - **SkyNews** - **skynewsarabia:article** - **skynewsarabia:video** - **SkySports** @@ -812,13 +822,14 @@ - **southpark.nl** - **southparkstudios.dk** - **SpankBang** + - **SpankBangPlaylist** - **Spankwire** - **Spiegel** - **Spiegel:Article**: Articles on spiegel.de - **Spiegeltv** - **sport.francetvinfo.fr** - **Sport5** - - **SportBoxEmbed** + - **SportBox** - **SportDeutschland** - **SpringboardPlatform** - **Sprout** @@ -834,7 +845,10 @@ - **StreamCZ** - **StreetVoice** - **StretchInternet** + - **stv:player** - **SunPorno** + - **sverigesradio:episode** + - **sverigesradio:publication** - **SVT** - **SVTPage** - **SVTPlay**: SVT Play and Öppet arkiv @@ -849,10 +863,13 @@ - **TastyTrade** - **TBS** - **TDSLifeway** + - **Teachable** + - **TeachableCourse** - **teachertube**: teachertube.com videos - **teachertube:user:collection**: teachertube.com user and collection videos - **TeachingChannel** - **Teamcoco** + - **TeamTreeHouse** - **TechTalks** - **techtv.mit.edu** - **ted** @@ -871,7 +888,6 @@ - **TF1** - **TFO** - **TheIntercept** - - **theoperaplatform** - **ThePlatform** - **ThePlatformFeed** - **TheScene** @@ -881,6 +897,8 @@ - **ThisAmericanLife** - **ThisAV** - **ThisOldHouse** + - **TikTok** + - **TikTokUser** - **tinypic**: tinypic.com videos - **TMZ** - **TMZArticle** @@ -894,6 +912,7 @@ - **ToypicsUser**: Toypics user profile - **TrailerAddict** (Currently broken) - **Trilulilu** + - **TruNews** - **TruTV** - **Tube8** - **TubiTv** @@ -909,7 +928,6 @@ - **TV2** - **tv2.hu** - **TV2Article** - - **TV3** - **TV4**: tv4.se and tv4play.se - **TV5MondePlus**: TV5MONDE+ - **TVA** @@ -923,7 +941,9 @@ - **TVNet** - **TVNoe** - **TVNow** - - **TVNowList** + - **TVNowAnnual** + - **TVNowNew** + - **TVNowSeason** - **TVNowShow** - **tvp**: Telewizja Polska - **tvp:embed**: Telewizja Polska @@ -931,6 +951,7 @@ - **TVPlayer** - **TVPlayHome** - **Tweakers** + - **TwitCasting** - **twitch:chapter** - **twitch:clips** - **twitch:profile** @@ -955,8 +976,6 @@ - **uol.com.br** - **uplynk** - **uplynk:preplay** - - **Upskill** - - **UpskillCourse** - **Urort**: NRK P3 Urørt - **URPlay** - **USANetwork** @@ -969,12 +988,14 @@ - **Vbox7** - **VeeHD** - **Veoh** + - **verystream** - **Vessel** - **Vesti**: Вести.Ru - **Vevo** - **VevoPlaylist** - **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet - **vh1.com** + - **vhx:embed** - **Viafree** - **vice** - **vice:article** @@ -986,13 +1007,11 @@ - **video.mit.edu** - **VideoDetective** - **videofy.me** - - **VideoMega** - **videomore** - **videomore:season** - **videomore:video** - **VideoPremium** - **VideoPress** - - **videoweed**: VideoWeed - **Vidio** - **VidLii** - **vidme** @@ -1037,10 +1056,9 @@ - **Voot** - **VoxMedia** - **VoxMediaVolume** - - **Vporn** - **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **Vrak** - - **VRT**: deredactie.be, sporza.be, cobra.be and cobra.canvas.be + - **VRT**: VRT NWS, Flanders News, Flandern Info and Sporza - **VrtNU**: VrtNU.be - **vrv** - **vrv:series** @@ -1051,6 +1069,7 @@ - **VVVVID** - **VyboryMos** - **Vzaar** + - **Wakanim** - **Walla** - **WalyTV** - **washingtonpost** @@ -1069,20 +1088,19 @@ - **Weibo** - **WeiboMobile** - **WeiqiTV**: WQTV - - **wholecloud**: WholeCloud - **Wimp** - **Wistia** - **wnl**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **WorldStarHipHop** - - **wrzuta.pl** - - **wrzuta.pl:playlist** - **WSJ**: Wall Street Journal - **WSJArticle** + - **WWE** - **XBef** - **XboxClips** - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE, Vid ABC, VidBom, vidlo, RapidVideo.TV, FastVideo.me - **XHamster** - **XHamsterEmbed** + - **XHamsterUser** - **xiami:album**: 虾米音乐 - 专辑 - **xiami:artist**: 虾米音乐 - 歌手 - **xiami:collection**: 虾米音乐 - 精选集 @@ -1098,10 +1116,14 @@ - **XVideos** - **XXXYMovies** - **Yahoo**: Yahoo screen and movies + - **yahoo:gyao** + - **yahoo:gyao:player** + - **yahoo:japannews**: Yahoo! Japan News - **YandexDisk** - **yandexmusic:album**: Яндекс.Музыка - Альбом - **yandexmusic:playlist**: Яндекс.Музыка - Плейлист - **yandexmusic:track**: Яндекс.Музыка - Трек + - **YandexVideo** - **YapFiles** - **YesJapan** - **yinyuetai:video**: 音悦Tai @@ -1137,3 +1159,4 @@ - **ZDF** - **ZDFChannel** - **zingmp3**: mp3.zing.vn + - **Zype** diff --git a/setup.cfg b/setup.cfg index af9a554c6..da78a9c47 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,4 +3,4 @@ universal = True [flake8] exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,devscripts/lazy_load_template.py,devscripts/make_issue_template.py,setup.py,build,.git,venv -ignore = E402,E501,E731,E741 +ignore = E402,E501,E731,E741,W503 diff --git a/setup.py b/setup.py index 7dbb5805f..af68b485e 100644 --- a/setup.py +++ b/setup.py @@ -104,7 +104,7 @@ setup( version=__version__, description=DESCRIPTION, long_description=LONG_DESCRIPTION, - url='https://github.com/rg3/youtube-dl', + url='https://github.com/ytdl-org/youtube-dl', author='Ricardo Garcia', author_email='ytdl@yt-dl.org', maintainer='Sergey M.', @@ -124,6 +124,8 @@ setup( 'Development Status :: 5 - Production/Stable', 'Environment :: Console', 'License :: Public Domain', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', @@ -132,6 +134,13 @@ setup( 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: Implementation', + 'Programming Language :: Python :: Implementation :: CPython', + 'Programming Language :: Python :: Implementation :: IronPython', + 'Programming Language :: Python :: Implementation :: Jython', + 'Programming Language :: Python :: Implementation :: PyPy', ], cmdclass={'build_lazy_extractors': build_lazy_extractors}, diff --git a/test/helper.py b/test/helper.py index dfee217a9..e62aab11e 100644 --- a/test/helper.py +++ b/test/helper.py @@ -7,6 +7,7 @@ import json import os.path import re import types +import ssl import sys import youtube_dl.extractor @@ -152,15 +153,27 @@ def expect_value(self, got, expected, field): isinstance(got, compat_str), 'Expected field %s to be a unicode object, but got value %r of type %r' % (field, got, type(got))) got = 'md5:' + md5(got) - elif isinstance(expected, compat_str) and expected.startswith('mincount:'): + elif isinstance(expected, compat_str) and re.match(r'^(?:min|max)?count:\d+', expected): self.assertTrue( isinstance(got, (list, dict)), 'Expected field %s to be a list or a dict, but it is of type %s' % ( field, type(got).__name__)) - expected_num = int(expected.partition(':')[2]) - assertGreaterEqual( + op, _, expected_num = expected.partition(':') + expected_num = int(expected_num) + if op == 'mincount': + assert_func = assertGreaterEqual + msg_tmpl = 'Expected %d items in field %s, but only got %d' + elif op == 'maxcount': + assert_func = assertLessEqual + msg_tmpl = 'Expected maximum %d items in field %s, but got %d' + elif op == 'count': + assert_func = assertEqual + msg_tmpl = 'Expected exactly %d items in field %s, but got %d' + else: + assert False + assert_func( self, len(got), expected_num, - 'Expected %d items in field %s, but only got %d' % (expected_num, field, len(got))) + msg_tmpl % (expected_num, field, len(got))) return self.assertEqual( expected, got, @@ -236,6 +249,20 @@ def assertGreaterEqual(self, got, expected, msg=None): self.assertTrue(got >= expected, msg) +def assertLessEqual(self, got, expected, msg=None): + if not (got <= expected): + if msg is None: + msg = '%r not less than or equal to %r' % (got, expected) + self.assertTrue(got <= expected, msg) + + +def assertEqual(self, got, expected, msg=None): + if not (got == expected): + if msg is None: + msg = '%r not equal to %r' % (got, expected) + self.assertTrue(got == expected, msg) + + def expect_warnings(ydl, warnings_re): real_warning = ydl.report_warning @@ -244,3 +271,12 @@ def expect_warnings(ydl, warnings_re): real_warning(w) ydl.report_warning = _report_warning + + +def http_server_port(httpd): + if os.name == 'java' and isinstance(httpd.socket, ssl.SSLSocket): + # In Jython SSLSocket is not a subclass of socket.socket + sock = httpd.socket.sock + else: + sock = httpd.socket + return sock.getsockname()[1] diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 4833396a5..71f6608fe 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -9,11 +9,30 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL, expect_dict, expect_value -from youtube_dl.compat import compat_etree_fromstring +from test.helper import FakeYDL, expect_dict, expect_value, http_server_port +from youtube_dl.compat import compat_etree_fromstring, compat_http_server from youtube_dl.extractor.common import InfoExtractor from youtube_dl.extractor import YoutubeIE, get_info_extractor from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError +import threading + + +TEAPOT_RESPONSE_STATUS = 418 +TEAPOT_RESPONSE_BODY = "

418 I'm a teapot

" + + +class InfoExtractorTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): + def log_message(self, format, *args): + pass + + def do_GET(self): + if self.path == '/teapot': + self.send_response(TEAPOT_RESPONSE_STATUS) + self.send_header('Content-Type', 'text/html; charset=utf-8') + self.end_headers() + self.wfile.write(TEAPOT_RESPONSE_BODY.encode()) + else: + assert False class TestIE(InfoExtractor): @@ -42,6 +61,7 @@ class TestInfoExtractor(unittest.TestCase): + ''' self.assertEqual(ie._og_search_title(html), 'Foo') self.assertEqual(ie._og_search_description(html), 'Some video\'s description ') @@ -50,6 +70,7 @@ class TestInfoExtractor(unittest.TestCase): self.assertEqual(ie._og_search_property('foobar', html), 'Foo') self.assertEqual(ie._og_search_property('test1', html), 'foo > < bar') self.assertEqual(ie._og_search_property('test2', html), 'foo >//< bar') + self.assertEqual(ie._og_search_property('test3', html), 'Ill-formatted opengraph') self.assertEqual(ie._og_search_property(('test0', 'test1'), html), 'foo > < bar') self.assertRaises(RegexNotFoundError, ie._og_search_property, 'test0', html, None, fatal=True) self.assertRaises(RegexNotFoundError, ie._og_search_property, ('test0', 'test00'), html, None, fatal=True) @@ -86,6 +107,184 @@ class TestInfoExtractor(unittest.TestCase): self.assertRaises(ExtractorError, self.ie._download_json, uri, None) self.assertEqual(self.ie._download_json(uri, None, fatal=False), None) + def test_parse_html5_media_entries(self): + # from https://www.r18.com/ + # with kpbs in label + expect_dict( + self, + self.ie._parse_html5_media_entries( + 'https://www.r18.com/', + r''' + + ''', None)[0], + { + 'formats': [{ + 'url': 'https://awscc3001.r18.com/litevideo/freepv/m/mgm/mgmr105/mgmr105_sm_w.mp4', + 'ext': 'mp4', + 'format_id': '300kbps', + 'height': 240, + 'tbr': 300, + }, { + 'url': 'https://awscc3001.r18.com/litevideo/freepv/m/mgm/mgmr105/mgmr105_dm_w.mp4', + 'ext': 'mp4', + 'format_id': '1000kbps', + 'height': 480, + 'tbr': 1000, + }, { + 'url': 'https://awscc3001.r18.com/litevideo/freepv/m/mgm/mgmr105/mgmr105_dmb_w.mp4', + 'ext': 'mp4', + 'format_id': '1500kbps', + 'height': 740, + 'tbr': 1500, + }], + 'thumbnail': '//pics.r18.com/digital/amateur/mgmr105/mgmr105jp.jpg' + }) + + # from https://www.csfd.cz/ + # with width and height + expect_dict( + self, + self.ie._parse_html5_media_entries( + 'https://www.csfd.cz/', + r''' + + ''', None)[0], + { + 'formats': [{ + 'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327358_eac647.mp4', + 'ext': 'mp4', + 'width': 640, + 'height': 360, + }, { + 'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327360_3d2646.mp4', + 'ext': 'mp4', + 'width': 1280, + 'height': 720, + }, { + 'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327356_91f258.mp4', + 'ext': 'mp4', + 'width': 1920, + 'height': 1080, + }, { + 'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327359_962b4a.webm', + 'ext': 'webm', + 'width': 640, + 'height': 360, + }, { + 'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327361_6feee0.webm', + 'ext': 'webm', + 'width': 1280, + 'height': 720, + }, { + 'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327357_8ab472.webm', + 'ext': 'webm', + 'width': 1920, + 'height': 1080, + }], + 'subtitles': { + 'cs': [{'url': 'https://video.csfd.cz/files/subtitles/163/344/163344115_4c388b.srt'}] + }, + 'thumbnail': 'https://img.csfd.cz/files/images/film/video/preview/163/344/163344118_748d20.png?h360' + }) + + # from https://tamasha.com/v/Kkdjw + # with height in label + expect_dict( + self, + self.ie._parse_html5_media_entries( + 'https://tamasha.com/v/Kkdjw', + r''' + + ''', None)[0], + { + 'formats': [{ + 'url': 'https://s-v2.tamasha.com/statics/videos_file/19/8f/Kkdjw_198feff8577d0057536e905cce1fb61438dd64e0_n_240.mp4', + }, { + 'url': 'https://s-v2.tamasha.com/statics/videos_file/19/8f/Kkdjw_198feff8577d0057536e905cce1fb61438dd64e0_n_240.mp4', + 'ext': 'mp4', + 'format_id': '240p', + 'height': 240, + }, { + 'url': 'https://s-v2.tamasha.com/statics/videos_file/20/00/Kkdjw_200041c66f657fc967db464d156eafbc1ed9fe6f_n_144.mp4', + 'ext': 'mp4', + 'format_id': '144p', + 'height': 144, + }] + }) + + # from https://www.directvnow.com + # with data-src + expect_dict( + self, + self.ie._parse_html5_media_entries( + 'https://www.directvnow.com', + r''' + + ''', None)[0], + { + 'formats': [{ + 'ext': 'mp4', + 'url': 'https://cdn.directv.com/content/dam/dtv/prod/website_directvnow-international/videos/DTVN_hdr_HBO_v3.mp4', + }] + }) + + # from https://www.directvnow.com + # with data-src + expect_dict( + self, + self.ie._parse_html5_media_entries( + 'https://www.directvnow.com', + r''' + + ''', None)[0], + { + 'formats': [{ + 'url': 'https://cdn.directv.com/content/dam/dtv/prod/website_directvnow-international/videos/DTVN_hdr_HBO_v3.mp4', + 'ext': 'mp4', + }] + }) + + # from https://www.klarna.com/uk/ + # with data-video-src + expect_dict( + self, + self.ie._parse_html5_media_entries( + 'https://www.directvnow.com', + r''' + + ''', None)[0], + { + 'formats': [{ + 'url': 'https://www.klarna.com/uk/wp-content/uploads/sites/11/2019/01/KL062_Smooth3_0_DogWalking_5s_920x080_.mp4', + 'ext': 'mp4', + }], + }) + def test_extract_jwplayer_data_realworld(self): # from http://www.suffolk.edu/sjc/ expect_dict( @@ -180,7 +379,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ def test_parse_m3u8_formats(self): _TEST_CASES = [ ( - # https://github.com/rg3/youtube-dl/issues/11507 + # https://github.com/ytdl-org/youtube-dl/issues/11507 # http://pluzz.francetv.fr/videos/le_ministere.html 'pluzz_francetv_11507', 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais', @@ -242,7 +441,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ }] ), ( - # https://github.com/rg3/youtube-dl/issues/11995 + # https://github.com/ytdl-org/youtube-dl/issues/11995 # http://teamcoco.com/video/clueless-gamer-super-bowl-for-honor 'teamcoco_11995', 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8', @@ -316,7 +515,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ }] ), ( - # https://github.com/rg3/youtube-dl/issues/12211 + # https://github.com/ytdl-org/youtube-dl/issues/12211 # http://video.toggle.sg/en/series/whoopie-s-world/ep3/478601 'toggle_mobile_12211', 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8', @@ -478,7 +677,64 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'width': 1280, 'height': 720, }] - ) + ), + ( + # https://github.com/ytdl-org/youtube-dl/issues/18923 + # https://www.ted.com/talks/boris_hesser_a_grassroots_healthcare_revolution_in_africa + 'ted_18923', + 'http://hls.ted.com/talks/31241.m3u8', + [{ + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '600k-Audio', + 'vcodec': 'none', + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '68', + 'vcodec': 'none', + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/64k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '163', + 'acodec': 'none', + 'width': 320, + 'height': 180, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/180k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '481', + 'acodec': 'none', + 'width': 512, + 'height': 288, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/320k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '769', + 'acodec': 'none', + 'width': 512, + 'height': 288, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/450k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '984', + 'acodec': 'none', + 'width': 512, + 'height': 288, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/600k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '1255', + 'acodec': 'none', + 'width': 640, + 'height': 360, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/950k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '1693', + 'acodec': 'none', + 'width': 853, + 'height': 480, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/1500k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '2462', + 'acodec': 'none', + 'width': 1280, + 'height': 720, + }] + ), ] for m3u8_file, m3u8_url, expected_formats in _TEST_CASES: @@ -492,11 +748,12 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ def test_parse_mpd_formats(self): _TEST_CASES = [ ( - # https://github.com/rg3/youtube-dl/issues/13919 + # https://github.com/ytdl-org/youtube-dl/issues/13919 # Also tests duplicate representation ids, see - # https://github.com/rg3/youtube-dl/issues/15111 + # https://github.com/ytdl-org/youtube-dl/issues/15111 'float_duration', - 'http://unknown/manifest.mpd', + 'http://unknown/manifest.mpd', # mpd_url + None, # mpd_base_url [{ 'manifest_url': 'http://unknown/manifest.mpd', 'ext': 'm4a', @@ -574,9 +831,10 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'height': 1080, }] ), ( - # https://github.com/rg3/youtube-dl/pull/14844 + # https://github.com/ytdl-org/youtube-dl/pull/14844 'urls_only', - 'http://unknown/manifest.mpd', + 'http://unknown/manifest.mpd', # mpd_url + None, # mpd_base_url [{ 'manifest_url': 'http://unknown/manifest.mpd', 'ext': 'mp4', @@ -655,22 +913,68 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'width': 1920, 'height': 1080, }] + ), ( + # https://github.com/ytdl-org/youtube-dl/issues/20346 + # Media considered unfragmented even though it contains + # Initialization tag + 'unfragmented', + 'https://v.redd.it/hw1x7rcg7zl21/DASHPlaylist.mpd', # mpd_url + 'https://v.redd.it/hw1x7rcg7zl21', # mpd_base_url + [{ + 'url': 'https://v.redd.it/hw1x7rcg7zl21/audio', + 'manifest_url': 'https://v.redd.it/hw1x7rcg7zl21/DASHPlaylist.mpd', + 'ext': 'm4a', + 'format_id': 'AUDIO-1', + 'format_note': 'DASH audio', + 'container': 'm4a_dash', + 'acodec': 'mp4a.40.2', + 'vcodec': 'none', + 'tbr': 129.87, + 'asr': 48000, + + }, { + 'url': 'https://v.redd.it/hw1x7rcg7zl21/DASH_240', + 'manifest_url': 'https://v.redd.it/hw1x7rcg7zl21/DASHPlaylist.mpd', + 'ext': 'mp4', + 'format_id': 'VIDEO-2', + 'format_note': 'DASH video', + 'container': 'mp4_dash', + 'acodec': 'none', + 'vcodec': 'avc1.4d401e', + 'tbr': 608.0, + 'width': 240, + 'height': 240, + 'fps': 30, + }, { + 'url': 'https://v.redd.it/hw1x7rcg7zl21/DASH_360', + 'manifest_url': 'https://v.redd.it/hw1x7rcg7zl21/DASHPlaylist.mpd', + 'ext': 'mp4', + 'format_id': 'VIDEO-1', + 'format_note': 'DASH video', + 'container': 'mp4_dash', + 'acodec': 'none', + 'vcodec': 'avc1.4d401e', + 'tbr': 804.261, + 'width': 360, + 'height': 360, + 'fps': 30, + }] ) ] - for mpd_file, mpd_url, expected_formats in _TEST_CASES: + for mpd_file, mpd_url, mpd_base_url, expected_formats in _TEST_CASES: with io.open('./test/testdata/mpd/%s.mpd' % mpd_file, mode='r', encoding='utf-8') as f: formats = self.ie._parse_mpd_formats( compat_etree_fromstring(f.read().encode('utf-8')), - mpd_url=mpd_url) + mpd_base_url=mpd_base_url, mpd_url=mpd_url) self.ie._sort_formats(formats) expect_value(self, formats, expected_formats, None) def test_parse_f4m_formats(self): _TEST_CASES = [ ( - # https://github.com/rg3/youtube-dl/issues/14660 + # https://github.com/ytdl-org/youtube-dl/issues/14660 'custom_base_url', 'http://api.new.livestream.com/accounts/6115179/events/6764928/videos/144884262.f4m', [{ @@ -743,6 +1047,25 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ for i in range(len(entries)): expect_dict(self, entries[i], expected_entries[i]) + def test_response_with_expected_status_returns_content(self): + # Checks for mitigations against the effects of + # that affect Python 3.4.1+, which + # manifest as `_download_webpage`, `_download_xml`, `_download_json`, + # or the underlying `_download_webpage_handle` returning no content + # when a response matches `expected_status`. + + httpd = compat_http_server.HTTPServer( + ('127.0.0.1', 0), InfoExtractorTestRequestHandler) + port = http_server_port(httpd) + server_thread = threading.Thread(target=httpd.serve_forever) + server_thread.daemon = True + server_thread.start() + + (content, urlh) = self.ie._download_webpage_handle( + 'http://127.0.0.1:%d/teapot' % port, None, + expected_status=TEAPOT_RESPONSE_STATUS) + self.assertEqual(content, TEAPOT_RESPONSE_BODY) + if __name__ == '__main__': unittest.main() diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index f0f5a8470..ce9666171 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -239,6 +239,76 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'vid-vcodec-dot') + def test_format_selection_string_ops(self): + formats = [ + {'format_id': 'abc-cba', 'ext': 'mp4', 'url': TEST_URL}, + {'format_id': 'zxc-cxz', 'ext': 'webm', 'url': TEST_URL}, + ] + info_dict = _make_result(formats) + + # equals (=) + ydl = YDL({'format': '[format_id=abc-cba]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'abc-cba') + + # does not equal (!=) + ydl = YDL({'format': '[format_id!=abc-cba]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'zxc-cxz') + + ydl = YDL({'format': '[format_id!=abc-cba][format_id!=zxc-cxz]'}) + self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + + # starts with (^=) + ydl = YDL({'format': '[format_id^=abc]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'abc-cba') + + # does not start with (!^=) + ydl = YDL({'format': '[format_id!^=abc]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'zxc-cxz') + + ydl = YDL({'format': '[format_id!^=abc][format_id!^=zxc]'}) + self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + + # ends with ($=) + ydl = YDL({'format': '[format_id$=cba]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'abc-cba') + + # does not end with (!$=) + ydl = YDL({'format': '[format_id!$=cba]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'zxc-cxz') + + ydl = YDL({'format': '[format_id!$=cba][format_id!$=cxz]'}) + self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + + # contains (*=) + ydl = YDL({'format': '[format_id*=bc-cb]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'abc-cba') + + # does not contain (!*=) + ydl = YDL({'format': '[format_id!*=bc-cb]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'zxc-cxz') + + ydl = YDL({'format': '[format_id!*=abc][format_id!*=zxc]'}) + self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + + ydl = YDL({'format': '[format_id!*=-]'}) + self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + def test_youtube_format_selection(self): order = [ '38', '37', '46', '22', '45', '35', '44', '18', '34', '43', '6', '5', '17', '36', '13', @@ -341,7 +411,7 @@ class TestFormatSelection(unittest.TestCase): # For extractors with incomplete formats (all formats are audio-only or # video-only) best and worst should fallback to corresponding best/worst # video-only or audio-only formats (as per - # https://github.com/rg3/youtube-dl/pull/5556) + # https://github.com/ytdl-org/youtube-dl/pull/5556) formats = [ {'format_id': 'low', 'ext': 'mp3', 'preference': 1, 'vcodec': 'none', 'url': TEST_URL}, {'format_id': 'high', 'ext': 'mp3', 'preference': 2, 'vcodec': 'none', 'url': TEST_URL}, @@ -372,7 +442,7 @@ class TestFormatSelection(unittest.TestCase): self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) def test_format_selection_issue_10083(self): - # See https://github.com/rg3/youtube-dl/issues/10083 + # See https://github.com/ytdl-org/youtube-dl/issues/10083 formats = [ {'format_id': 'regular', 'height': 360, 'url': TEST_URL}, {'format_id': 'video', 'height': 720, 'acodec': 'none', 'url': TEST_URL}, @@ -783,7 +853,7 @@ class TestYoutubeDL(unittest.TestCase): self.assertEqual(result, [2, 3, 4]) def test_urlopen_no_file_protocol(self): - # see https://github.com/rg3/youtube-dl/issues/8227 + # see https://github.com/ytdl-org/youtube-dl/issues/8227 ydl = YDL() self.assertRaises(compat_urllib_error.URLError, ydl.urlopen, 'file:///etc/passwd') diff --git a/test/test_YoutubeDLCookieJar.py b/test/test_YoutubeDLCookieJar.py new file mode 100644 index 000000000..f959798de --- /dev/null +++ b/test/test_YoutubeDLCookieJar.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python +# coding: utf-8 + +from __future__ import unicode_literals + +import os +import re +import sys +import tempfile +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.utils import YoutubeDLCookieJar + + +class TestYoutubeDLCookieJar(unittest.TestCase): + def test_keep_session_cookies(self): + cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/session_cookies.txt') + cookiejar.load(ignore_discard=True, ignore_expires=True) + tf = tempfile.NamedTemporaryFile(delete=False) + try: + cookiejar.save(filename=tf.name, ignore_discard=True, ignore_expires=True) + temp = tf.read().decode('utf-8') + self.assertTrue(re.search( + r'www\.foobar\.foobar\s+FALSE\s+/\s+TRUE\s+0\s+YoutubeDLExpiresEmpty\s+YoutubeDLExpiresEmptyValue', temp)) + self.assertTrue(re.search( + r'www\.foobar\.foobar\s+FALSE\s+/\s+TRUE\s+0\s+YoutubeDLExpires0\s+YoutubeDLExpires0Value', temp)) + finally: + tf.close() + os.remove(tf.name) + + def test_strip_httponly_prefix(self): + cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/httponly_cookies.txt') + cookiejar.load(ignore_discard=True, ignore_expires=True) + + def assert_cookie_has_value(key): + self.assertEqual(cookiejar._cookies['www.foobar.foobar']['/'][key].value, key + '_VALUE') + + assert_cookie_has_value('HTTPONLY_COOKIE') + assert_cookie_has_value('JS_ACCESSIBLE_COOKIE') + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_aes.py b/test/test_aes.py index 78a28751b..cc89fb6ab 100644 --- a/test/test_aes.py +++ b/test/test_aes.py @@ -44,16 +44,16 @@ class TestAES(unittest.TestCase): def test_decrypt_text(self): password = intlist_to_bytes(self.key).decode('utf-8') encrypted = base64.b64encode( - intlist_to_bytes(self.iv[:8]) + - b'\x17\x15\x93\xab\x8d\x80V\xcdV\xe0\t\xcdo\xc2\xa5\xd8ksM\r\xe27N\xae' + intlist_to_bytes(self.iv[:8]) + + b'\x17\x15\x93\xab\x8d\x80V\xcdV\xe0\t\xcdo\xc2\xa5\xd8ksM\r\xe27N\xae' ).decode('utf-8') decrypted = (aes_decrypt_text(encrypted, password, 16)) self.assertEqual(decrypted, self.secret_msg) password = intlist_to_bytes(self.key).decode('utf-8') encrypted = base64.b64encode( - intlist_to_bytes(self.iv[:8]) + - b'\x0b\xe6\xa4\xd9z\x0e\xb8\xb9\xd0\xd4i_\x85\x1d\x99\x98_\xe5\x80\xe7.\xbf\xa5\x83' + intlist_to_bytes(self.iv[:8]) + + b'\x0b\xe6\xa4\xd9z\x0e\xb8\xb9\xd0\xd4i_\x85\x1d\x99\x98_\xe5\x80\xe7.\xbf\xa5\x83' ).decode('utf-8') decrypted = (aes_decrypt_text(encrypted, password, 32)) self.assertEqual(decrypted, self.secret_msg) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index cd1cd4b24..465ce0050 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -110,7 +110,7 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('https://vimeo.com/user7108434/videos', ['vimeo:user']) self.assertMatch('https://vimeo.com/user21297594/review/75524534/3c257a1b5d', ['vimeo:review']) - # https://github.com/rg3/youtube-dl/issues/1930 + # https://github.com/ytdl-org/youtube-dl/issues/1930 def test_soundcloud_not_matching_sets(self): self.assertMatch('http://soundcloud.com/floex/sets/gone-ep', ['soundcloud:set']) @@ -119,12 +119,12 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('http://tatianamaslanydaily.tumblr.com/post/54196191430', ['Tumblr']) def test_pbs(self): - # https://github.com/rg3/youtube-dl/issues/2350 + # https://github.com/ytdl-org/youtube-dl/issues/2350 self.assertMatch('http://video.pbs.org/viralplayer/2365173446/', ['pbs']) self.assertMatch('http://video.pbs.org/widget/partnerplayer/980042464/', ['pbs']) def test_yahoo_https(self): - # https://github.com/rg3/youtube-dl/issues/2701 + # https://github.com/ytdl-org/youtube-dl/issues/2701 self.assertMatch( 'https://screen.yahoo.com/smartwatches-latest-wearable-gadgets-163745379-cbs.html', ['Yahoo']) diff --git a/test/test_compat.py b/test/test_compat.py index d6c54e135..86ff389fd 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -13,6 +13,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.compat import ( compat_getenv, compat_setenv, + compat_etree_Element, compat_etree_fromstring, compat_expanduser, compat_shlex_split, @@ -39,7 +40,7 @@ class TestCompat(unittest.TestCase): def test_compat_expanduser(self): old_home = os.environ.get('HOME') - test_str = 'C:\Documents and Settings\тест\Application Data' + test_str = r'C:\Documents and Settings\тест\Application Data' compat_setenv('HOME', test_str) self.assertEqual(compat_expanduser('~'), test_str) compat_setenv('HOME', old_home or '') @@ -90,6 +91,12 @@ class TestCompat(unittest.TestCase): self.assertEqual(compat_shlex_split('-option "one\ntwo" \n -flag'), ['-option', 'one\ntwo', '-flag']) self.assertEqual(compat_shlex_split('-val 中文'), ['-val', '中文']) + def test_compat_etree_Element(self): + try: + compat_etree_Element.items + except AttributeError: + self.fail('compat_etree_Element is not a type') + def test_compat_etree_fromstring(self): xml = ''' diff --git a/test/test_downloader_http.py b/test/test_downloader_http.py index 5cf2bf1a5..750472281 100644 --- a/test/test_downloader_http.py +++ b/test/test_downloader_http.py @@ -9,26 +9,16 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import try_rm +from test.helper import http_server_port, try_rm from youtube_dl import YoutubeDL from youtube_dl.compat import compat_http_server from youtube_dl.downloader.http import HttpFD from youtube_dl.utils import encodeFilename -import ssl import threading TEST_DIR = os.path.dirname(os.path.abspath(__file__)) -def http_server_port(httpd): - if os.name == 'java' and isinstance(httpd.socket, ssl.SSLSocket): - # In Jython SSLSocket is not a subclass of socket.socket - sock = httpd.socket.sock - else: - sock = httpd.socket - return sock.getsockname()[1] - - TEST_SIZE = 10 * 1024 diff --git a/test/test_http.py b/test/test_http.py index 409fec9c8..3ee0a5dda 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -8,6 +8,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from test.helper import http_server_port from youtube_dl import YoutubeDL from youtube_dl.compat import compat_http_server, compat_urllib_request import ssl @@ -16,15 +17,6 @@ import threading TEST_DIR = os.path.dirname(os.path.abspath(__file__)) -def http_server_port(httpd): - if os.name == 'java' and isinstance(httpd.socket, ssl.SSLSocket): - # In Jython SSLSocket is not a subclass of socket.socket - sock = httpd.socket.sock - else: - sock = httpd.socket - return sock.getsockname()[1] - - class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): def log_message(self, format, *args): pass diff --git a/test/test_postprocessors.py b/test/test_postprocessors.py index addb69d6f..4209d1d9a 100644 --- a/test/test_postprocessors.py +++ b/test/test_postprocessors.py @@ -14,4 +14,4 @@ from youtube_dl.postprocessor import MetadataFromTitlePP class TestMetadataFromTitle(unittest.TestCase): def test_format_to_regex(self): pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s') - self.assertEqual(pp._titleregex, '(?P.+)\ \-\ (?P<artist>.+)') + self.assertEqual(pp._titleregex, r'(?P<title>.+)\ \-\ (?P<artist>.+)') diff --git a/test/test_swfinterp.py b/test/test_swfinterp.py index f1e899819..9f18055e6 100644 --- a/test/test_swfinterp.py +++ b/test/test_swfinterp.py @@ -34,8 +34,8 @@ def _make_testfunc(testfile): def test_func(self): as_file = os.path.join(TEST_DIR, testfile) swf_file = os.path.join(TEST_DIR, test_id + '.swf') - if ((not os.path.exists(swf_file)) or - os.path.getmtime(swf_file) < os.path.getmtime(as_file)): + if ((not os.path.exists(swf_file)) + or os.path.getmtime(swf_file) < os.path.getmtime(as_file)): # Recompile try: subprocess.check_call([ diff --git a/test/test_utils.py b/test/test_utils.py index 9e28e008f..659c6ece5 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -33,11 +33,13 @@ from youtube_dl.utils import ( ExtractorError, find_xpath_attr, fix_xml_ampersands, + float_or_none, get_element_by_class, get_element_by_attribute, get_elements_by_class, get_elements_by_attribute, InAdvancePagedList, + int_or_none, intlist_to_bytes, is_html, js_to_json, @@ -55,6 +57,7 @@ from youtube_dl.utils import ( parse_count, parse_iso8601, parse_resolution, + parse_bitrate, pkcs1pad, read_batch_urls, sanitize_filename, @@ -70,6 +73,7 @@ from youtube_dl.utils import ( smuggle_url, str_to_int, strip_jsonp, + strip_or_none, timeconvert, unescapeHTML, unified_strdate, @@ -180,7 +184,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(sanitize_filename( 'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ', restricted=True), - 'AAAAAAAECEEEEIIIIDNOOOOOOOOEUUUUUYPssaaaaaaaeceeeeiiiionooooooooeuuuuuypy') + 'AAAAAAAECEEEEIIIIDNOOOOOOOOEUUUUUYTHssaaaaaaaeceeeeiiiionooooooooeuuuuuythy') def test_sanitize_ids(self): self.assertEqual(sanitize_filename('_n_cd26wFpw', is_id=True), '_n_cd26wFpw') @@ -467,6 +471,21 @@ class TestUtil(unittest.TestCase): shell_quote(args), """ffmpeg -i 'ñ€ß'"'"'.mp4'""" if compat_os_name != 'nt' else '''ffmpeg -i "ñ€ß'.mp4"''') + def test_float_or_none(self): + self.assertEqual(float_or_none('42.42'), 42.42) + self.assertEqual(float_or_none('42'), 42.0) + self.assertEqual(float_or_none(''), None) + self.assertEqual(float_or_none(None), None) + self.assertEqual(float_or_none([]), None) + self.assertEqual(float_or_none(set()), None) + + def test_int_or_none(self): + self.assertEqual(int_or_none('42'), 42) + self.assertEqual(int_or_none(''), None) + self.assertEqual(int_or_none(None), None) + self.assertEqual(int_or_none([]), None) + self.assertEqual(int_or_none(set()), None) + def test_str_to_int(self): self.assertEqual(str_to_int('123,456'), 123456) self.assertEqual(str_to_int('123.456'), 123456) @@ -507,6 +526,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(urljoin('http://foo.de/', ''), None) self.assertEqual(urljoin('http://foo.de/', ['foobar']), None) self.assertEqual(urljoin('http://foo.de/a/b/c.txt', '.././../d.txt'), 'http://foo.de/d.txt') + self.assertEqual(urljoin('http://foo.de/a/b/c.txt', 'rtmp://foo.de'), 'rtmp://foo.de') + self.assertEqual(urljoin(None, 'rtmp://foo.de'), 'rtmp://foo.de') def test_url_or_none(self): self.assertEqual(url_or_none(None), None) @@ -732,6 +753,18 @@ class TestUtil(unittest.TestCase): d = json.loads(stripped) self.assertEqual(d, {'status': 'success'}) + def test_strip_or_none(self): + self.assertEqual(strip_or_none(' abc'), 'abc') + self.assertEqual(strip_or_none('abc '), 'abc') + self.assertEqual(strip_or_none(' abc '), 'abc') + self.assertEqual(strip_or_none('\tabc\t'), 'abc') + self.assertEqual(strip_or_none('\n\tabc\n\t'), 'abc') + self.assertEqual(strip_or_none('abc'), 'abc') + self.assertEqual(strip_or_none(''), '') + self.assertEqual(strip_or_none(None), None) + self.assertEqual(strip_or_none(42), None) + self.assertEqual(strip_or_none([]), None) + def test_uppercase_escape(self): self.assertEqual(uppercase_escape('aä'), 'aä') self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐') @@ -789,6 +822,15 @@ class TestUtil(unittest.TestCase): 'vcodec': 'av01.0.05M.08', 'acodec': 'none', }) + self.assertEqual(parse_codecs('theora, vorbis'), { + 'vcodec': 'theora', + 'acodec': 'vorbis', + }) + self.assertEqual(parse_codecs('unknownvcodec, unknownacodec'), { + 'vcodec': 'unknownvcodec', + 'acodec': 'unknownacodec', + }) + self.assertEqual(parse_codecs('unknown'), {}) def test_escape_rfc3986(self): reserved = "!*'();:@&=+$,/?#[]" @@ -1028,6 +1070,13 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_resolution('4k'), {'height': 2160}) self.assertEqual(parse_resolution('8K'), {'height': 4320}) + def test_parse_bitrate(self): + self.assertEqual(parse_bitrate(None), None) + self.assertEqual(parse_bitrate(''), None) + self.assertEqual(parse_bitrate('300kbps'), 300) + self.assertEqual(parse_bitrate('1500kbps'), 1500) + self.assertEqual(parse_bitrate('300 kbps'), 300) + def test_version_tuple(self): self.assertEqual(version_tuple('1'), (1,)) self.assertEqual(version_tuple('10.23.344'), (10, 23, 344)) diff --git a/test/testdata/cookies/httponly_cookies.txt b/test/testdata/cookies/httponly_cookies.txt new file mode 100644 index 000000000..c46541d6b --- /dev/null +++ b/test/testdata/cookies/httponly_cookies.txt @@ -0,0 +1,6 @@ +# Netscape HTTP Cookie File +# http://curl.haxx.se/rfc/cookie_spec.html +# This is a generated file! Do not edit. + +#HttpOnly_www.foobar.foobar FALSE / TRUE 2147483647 HTTPONLY_COOKIE HTTPONLY_COOKIE_VALUE +www.foobar.foobar FALSE / TRUE 2147483647 JS_ACCESSIBLE_COOKIE JS_ACCESSIBLE_COOKIE_VALUE diff --git a/test/testdata/cookies/session_cookies.txt b/test/testdata/cookies/session_cookies.txt new file mode 100644 index 000000000..f6996f031 --- /dev/null +++ b/test/testdata/cookies/session_cookies.txt @@ -0,0 +1,6 @@ +# Netscape HTTP Cookie File +# http://curl.haxx.se/rfc/cookie_spec.html +# This is a generated file! Do not edit. + +www.foobar.foobar FALSE / TRUE YoutubeDLExpiresEmpty YoutubeDLExpiresEmptyValue +www.foobar.foobar FALSE / TRUE 0 YoutubeDLExpires0 YoutubeDLExpires0Value diff --git a/test/testdata/m3u8/ted_18923.m3u8 b/test/testdata/m3u8/ted_18923.m3u8 new file mode 100644 index 000000000..52a27118b --- /dev/null +++ b/test/testdata/m3u8/ted_18923.m3u8 @@ -0,0 +1,28 @@ +#EXTM3U +#EXT-X-VERSION:4 +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=1255659,PROGRAM-ID=1,CODECS="avc1.42c01e,mp4a.40.2",RESOLUTION=640x360 +/videos/BorisHesser_2018S/video/600k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=163154,PROGRAM-ID=1,CODECS="avc1.42c00c,mp4a.40.2",RESOLUTION=320x180 +/videos/BorisHesser_2018S/video/64k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=481701,PROGRAM-ID=1,CODECS="avc1.42c015,mp4a.40.2",RESOLUTION=512x288 +/videos/BorisHesser_2018S/video/180k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=769968,PROGRAM-ID=1,CODECS="avc1.42c015,mp4a.40.2",RESOLUTION=512x288 +/videos/BorisHesser_2018S/video/320k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=984037,PROGRAM-ID=1,CODECS="avc1.42c015,mp4a.40.2",RESOLUTION=512x288 +/videos/BorisHesser_2018S/video/450k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=1693925,PROGRAM-ID=1,CODECS="avc1.4d401f,mp4a.40.2",RESOLUTION=853x480 +/videos/BorisHesser_2018S/video/950k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=2462469,PROGRAM-ID=1,CODECS="avc1.640028,mp4a.40.2",RESOLUTION=1280x720 +/videos/BorisHesser_2018S/video/1500k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=68101,PROGRAM-ID=1,CODECS="mp4a.40.2",DEFAULT=YES +/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b + +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=74298,PROGRAM-ID=1,CODECS="avc1.42c00c",RESOLUTION=320x180,URI="/videos/BorisHesser_2018S/video/64k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=216200,PROGRAM-ID=1,CODECS="avc1.42c015",RESOLUTION=512x288,URI="/videos/BorisHesser_2018S/video/180k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=304717,PROGRAM-ID=1,CODECS="avc1.42c015",RESOLUTION=512x288,URI="/videos/BorisHesser_2018S/video/320k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=350933,PROGRAM-ID=1,CODECS="avc1.42c015",RESOLUTION=512x288,URI="/videos/BorisHesser_2018S/video/450k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=495850,PROGRAM-ID=1,CODECS="avc1.42c01e",RESOLUTION=640x360,URI="/videos/BorisHesser_2018S/video/600k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=810750,PROGRAM-ID=1,CODECS="avc1.4d401f",RESOLUTION=853x480,URI="/videos/BorisHesser_2018S/video/950k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=1273700,PROGRAM-ID=1,CODECS="avc1.640028",RESOLUTION=1280x720,URI="/videos/BorisHesser_2018S/video/1500k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" + +#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="600k",LANGUAGE="en",NAME="Audio",AUTOSELECT=YES,DEFAULT=YES,URI="/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b",BANDWIDTH=614400 diff --git a/test/testdata/mpd/unfragmented.mpd b/test/testdata/mpd/unfragmented.mpd new file mode 100644 index 000000000..5a3720be7 --- /dev/null +++ b/test/testdata/mpd/unfragmented.mpd @@ -0,0 +1,28 @@ +<?xml version="1.0" encoding="UTF-8" standalone="yes"?> +<MPD mediaPresentationDuration="PT54.915S" minBufferTime="PT1.500S" profiles="urn:mpeg:dash:profile:isoff-on-demand:2011" type="static" xmlns="urn:mpeg:dash:schema:mpd:2011"> + <Period duration="PT54.915S"> + <AdaptationSet segmentAlignment="true" subsegmentAlignment="true" subsegmentStartsWithSAP="1"> + <Representation bandwidth="804261" codecs="avc1.4d401e" frameRate="30" height="360" id="VIDEO-1" mimeType="video/mp4" startWithSAP="1" width="360"> + <BaseURL>DASH_360</BaseURL> + <SegmentBase indexRange="915-1114" indexRangeExact="true"> + <Initialization range="0-914"/> + </SegmentBase> + </Representation> + <Representation bandwidth="608000" codecs="avc1.4d401e" frameRate="30" height="240" id="VIDEO-2" mimeType="video/mp4" startWithSAP="1" width="240"> + <BaseURL>DASH_240</BaseURL> + <SegmentBase indexRange="913-1112" indexRangeExact="true"> + <Initialization range="0-912"/> + </SegmentBase> + </Representation> + </AdaptationSet> + <AdaptationSet> + <Representation audioSamplingRate="48000" bandwidth="129870" codecs="mp4a.40.2" id="AUDIO-1" mimeType="audio/mp4" startWithSAP="1"> + <AudioChannelConfiguration schemeIdUri="urn:mpeg:dash:23003:3:audio_channel_configuration:2011" value="2"/> + <BaseURL>audio</BaseURL> + <SegmentBase indexRange="832-1007" indexRangeExact="true"> + <Initialization range="0-831"/> + </SegmentBase> + </Representation> + </AdaptationSet> + </Period> +</MPD> diff --git a/youtube-dl.plugin.zsh b/youtube-dl.plugin.zsh index 4edab5214..17ab1341a 100644 --- a/youtube-dl.plugin.zsh +++ b/youtube-dl.plugin.zsh @@ -7,7 +7,7 @@ # https://github.com/zsh-users/antigen # Install youtube-dl: -# antigen bundle rg3/youtube-dl +# antigen bundle ytdl-org/youtube-dl # Bundles installed by antigen are available for use immediately. # Update youtube-dl (and all other antigen bundles): diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 38ba43a97..6a44bc7ba 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -82,12 +82,14 @@ from .utils import ( sanitize_url, sanitized_Request, std_headers, + str_or_none, subtitles_filename, UnavailableVideoError, url_basename, version_tuple, write_json_file, write_string, + YoutubeDLCookieJar, YoutubeDLCookieProcessor, YoutubeDLHandler, ) @@ -307,6 +309,8 @@ class YoutubeDL(object): The following options are used by the post processors: prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available, otherwise prefer ffmpeg. + ffmpeg_location: Location of the ffmpeg/avconv binary; either the path + to the binary or its containing directory. postprocessor_args: A list of additional command-line arguments for the postprocessor. @@ -396,9 +400,9 @@ class YoutubeDL(object): else: raise - if (sys.platform != 'win32' and - sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and - not params.get('restrictfilenames', False)): + if (sys.platform != 'win32' + and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] + and not params.get('restrictfilenames', False)): # Unicode filesystem API will throw errors (#1474, #13027) self.report_warning( 'Assuming --restrict-filenames since file system encoding ' @@ -436,9 +440,9 @@ class YoutubeDL(object): if re.match(r'^-[0-9A-Za-z_-]{10}$', a)] if idxs: correct_argv = ( - ['youtube-dl'] + - [a for i, a in enumerate(argv) if i not in idxs] + - ['--'] + [argv[i] for i in idxs] + ['youtube-dl'] + + [a for i, a in enumerate(argv) if i not in idxs] + + ['--'] + [argv[i] for i in idxs] ) self.report_warning( 'Long argument string detected. ' @@ -558,7 +562,7 @@ class YoutubeDL(object): self.restore_console_title() if self.params.get('cookiefile') is not None: - self.cookiejar.save() + self.cookiejar.save(ignore_discard=True, ignore_expires=True) def trouble(self, message=None, tb=None): """Determine action to take when a download problem appears. @@ -846,8 +850,8 @@ class YoutubeDL(object): if result_type in ('url', 'url_transparent'): ie_result['url'] = sanitize_url(ie_result['url']) extract_flat = self.params.get('extract_flat', False) - if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or - extract_flat is True): + if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) + or extract_flat is True): if self.params.get('forcejson', False): self.to_stdout(json.dumps(ie_result)) return ie_result @@ -887,7 +891,7 @@ class YoutubeDL(object): # url_transparent. In such cases outer metadata (from ie_result) # should be propagated to inner one (info). For this to happen # _type of info should be overridden with url_transparent. This - # fixes issue from https://github.com/rg3/youtube-dl/pull/11163. + # fixes issue from https://github.com/ytdl-org/youtube-dl/pull/11163. if new_result.get('_type') == 'url': new_result['_type'] = 'url_transparent' @@ -1062,21 +1066,24 @@ class YoutubeDL(object): if not m: STR_OPERATORS = { '=': operator.eq, - '!=': operator.ne, '^=': lambda attr, value: attr.startswith(value), '$=': lambda attr, value: attr.endswith(value), '*=': lambda attr, value: value in attr, } str_operator_rex = re.compile(r'''(?x) \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id) - \s*(?P<op>%s)(?P<none_inclusive>\s*\?)? + \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)? \s*(?P<value>[a-zA-Z0-9._-]+) \s*$ ''' % '|'.join(map(re.escape, STR_OPERATORS.keys()))) m = str_operator_rex.search(filter_spec) if m: comparison_value = m.group('value') - op = STR_OPERATORS[m.group('op')] + str_op = STR_OPERATORS[m.group('op')] + if m.group('negation'): + op = lambda attr, value: not str_op(attr, value) + else: + op = str_op if not m: raise ValueError('Invalid filter specification %r' % filter_spec) @@ -1601,7 +1608,7 @@ class YoutubeDL(object): # by extractor are incomplete or not (i.e. whether extractor provides only # video-only or audio-only formats) for proper formats selection for # extractors with such incomplete formats (see - # https://github.com/rg3/youtube-dl/pull/5556). + # https://github.com/ytdl-org/youtube-dl/pull/5556). # Since formats may be filtered during format selection and may not match # the original formats the results may be incorrect. Thus original formats # or pre-calculated metrics should be passed to format selection routines @@ -1609,12 +1616,12 @@ class YoutubeDL(object): # We will pass a context object containing all necessary additional data # instead of just formats. # This fixes incorrect format selection issue (see - # https://github.com/rg3/youtube-dl/issues/10083). + # https://github.com/ytdl-org/youtube-dl/issues/10083). incomplete_formats = ( # All formats are video-only or - all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) or + all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) # all formats are audio-only - all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)) + or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)) ctx = { 'formats': formats, @@ -1776,6 +1783,8 @@ class YoutubeDL(object): annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext')) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)): self.to_screen('[info] Video annotations are already present') + elif not info_dict.get('annotations'): + self.report_warning('There are no annotations to write.') else: try: self.to_screen('[info] Writing video annotations to: ' + annofn) @@ -1805,7 +1814,7 @@ class YoutubeDL(object): if sub_info.get('data') is not None: try: # Use newline='' to prevent conversion of newline characters - # See https://github.com/rg3/youtube-dl/issues/10268 + # See https://github.com/ytdl-org/youtube-dl/issues/10268 with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile: subfile.write(sub_info['data']) except (OSError, IOError): @@ -1940,8 +1949,8 @@ class YoutubeDL(object): else: assert fixup_policy in ('ignore', 'never') - if (info_dict.get('requested_formats') is None and - info_dict.get('container') == 'm4a_dash'): + if (info_dict.get('requested_formats') is None + and info_dict.get('container') == 'm4a_dash'): if fixup_policy == 'warn': self.report_warning( '%s: writing DASH m4a. ' @@ -1960,9 +1969,9 @@ class YoutubeDL(object): else: assert fixup_policy in ('ignore', 'never') - if (info_dict.get('protocol') == 'm3u8_native' or - info_dict.get('protocol') == 'm3u8' and - self.params.get('hls_prefer_native')): + if (info_dict.get('protocol') == 'm3u8_native' + or info_dict.get('protocol') == 'm3u8' + and self.params.get('hls_prefer_native')): if fixup_policy == 'warn': self.report_warning('%s: malformed AAC bitstream detected.' % ( info_dict['id'])) @@ -1988,10 +1997,10 @@ class YoutubeDL(object): def download(self, url_list): """Download a given list of URLs.""" outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) - if (len(url_list) > 1 and - outtmpl != '-' and - '%' not in outtmpl and - self.params.get('max_downloads') != 1): + if (len(url_list) > 1 + and outtmpl != '-' + and '%' not in outtmpl + and self.params.get('max_downloads') != 1): raise SameFileError(outtmpl) for url in url_list: @@ -2056,15 +2065,24 @@ class YoutubeDL(object): self.report_warning('Unable to remove downloaded original file') def _make_archive_id(self, info_dict): + video_id = info_dict.get('id') + if not video_id: + return # Future-proof against any change in case # and backwards compatibility with prior versions - extractor = info_dict.get('extractor_key') + extractor = info_dict.get('extractor_key') or info_dict.get('ie_key') # key in a playlist if extractor is None: - if 'id' in info_dict: - extractor = info_dict.get('ie_key') # key in a playlist - if extractor is None: - return None # Incomplete video information - return extractor.lower() + ' ' + info_dict['id'] + url = str_or_none(info_dict.get('url')) + if not url: + return + # Try to find matching extractor for the URL and take its ie_key + for ie in self._ies: + if ie.suitable(url): + extractor = ie.ie_key() + break + else: + return + return extractor.lower() + ' ' + video_id def in_download_archive(self, info_dict): fn = self.params.get('download_archive') @@ -2072,7 +2090,7 @@ class YoutubeDL(object): return False vid_id = self._make_archive_id(info_dict) - if vid_id is None: + if not vid_id: return False # Incomplete video information try: @@ -2127,8 +2145,8 @@ class YoutubeDL(object): if res: res += ', ' res += '%s container' % fdict['container'] - if (fdict.get('vcodec') is not None and - fdict.get('vcodec') != 'none'): + if (fdict.get('vcodec') is not None + and fdict.get('vcodec') != 'none'): if res: res += ', ' res += fdict['vcodec'] @@ -2215,7 +2233,7 @@ class YoutubeDL(object): return if type('') is not compat_str: - # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326) + # Python 2.6 on SLES11 SP1 (https://github.com/ytdl-org/youtube-dl/issues/3326) self.report_warning( 'Your Python is broken! Update to a newer and supported version') @@ -2297,10 +2315,9 @@ class YoutubeDL(object): self.cookiejar = compat_cookiejar.CookieJar() else: opts_cookiefile = expand_path(opts_cookiefile) - self.cookiejar = compat_cookiejar.MozillaCookieJar( - opts_cookiefile) + self.cookiejar = YoutubeDLCookieJar(opts_cookiefile) if os.access(opts_cookiefile, os.R_OK): - self.cookiejar.load() + self.cookiejar.load(ignore_discard=True, ignore_expires=True) cookie_processor = YoutubeDLCookieProcessor(self.cookiejar) if opts_proxy is not None: @@ -2310,7 +2327,7 @@ class YoutubeDL(object): proxies = {'http': opts_proxy, 'https': opts_proxy} else: proxies = compat_urllib_request.getproxies() - # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805) + # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805) if 'http' in proxies and 'https' not in proxies: proxies['https'] = proxies['http'] proxy_handler = PerRequestProxyHandler(proxies) @@ -2323,7 +2340,7 @@ class YoutubeDL(object): # When passing our own FileHandler instance, build_opener won't add the # default FileHandler and allows us to disable the file protocol, which # can be used for malicious purposes (see - # https://github.com/rg3/youtube-dl/issues/8227) + # https://github.com/ytdl-org/youtube-dl/issues/8227) file_handler = compat_urllib_request.FileHandler() def file_open(*args, **kwargs): @@ -2335,7 +2352,7 @@ class YoutubeDL(object): # Delete the default user-agent header, which would otherwise apply in # cases where our custom HTTP handler doesn't come into play - # (See https://github.com/rg3/youtube-dl/issues/1309 for details) + # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details) opener.addheaders = [] self._opener = opener diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index ba435ea42..9a659fc65 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -48,7 +48,7 @@ from .YoutubeDL import YoutubeDL def _real_main(argv=None): # Compatibility fixes for Windows if sys.platform == 'win32': - # https://github.com/rg3/youtube-dl/issues/820 + # https://github.com/ytdl-org/youtube-dl/issues/820 codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None) workaround_optparse_bug9161() @@ -94,7 +94,7 @@ def _real_main(argv=None): if opts.verbose: write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n') except IOError: - sys.exit('ERROR: batch file could not be read') + sys.exit('ERROR: batch file %s could not be read' % opts.batchfile) all_urls = batch_urls + [url.strip() for url in args] # batch_urls are already striped in read_batch_urls _enc = preferredencoding() all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls] @@ -166,6 +166,8 @@ def _real_main(argv=None): if opts.max_sleep_interval is not None: if opts.max_sleep_interval < 0: parser.error('max sleep interval must be positive or 0') + if opts.sleep_interval is None: + parser.error('min sleep interval must be specified, use --min-sleep-interval') if opts.max_sleep_interval < opts.sleep_interval: parser.error('max sleep interval must be greater than or equal to min sleep interval') else: @@ -228,14 +230,14 @@ def _real_main(argv=None): if opts.allsubtitles and not opts.writeautomaticsub: opts.writesubtitles = True - outtmpl = ((opts.outtmpl is not None and opts.outtmpl) or - (opts.format == '-1' and opts.usetitle and '%(title)s-%(id)s-%(format)s.%(ext)s') or - (opts.format == '-1' and '%(id)s-%(format)s.%(ext)s') or - (opts.usetitle and opts.autonumber and '%(autonumber)s-%(title)s-%(id)s.%(ext)s') or - (opts.usetitle and '%(title)s-%(id)s.%(ext)s') or - (opts.useid and '%(id)s.%(ext)s') or - (opts.autonumber and '%(autonumber)s-%(id)s.%(ext)s') or - DEFAULT_OUTTMPL) + outtmpl = ((opts.outtmpl is not None and opts.outtmpl) + or (opts.format == '-1' and opts.usetitle and '%(title)s-%(id)s-%(format)s.%(ext)s') + or (opts.format == '-1' and '%(id)s-%(format)s.%(ext)s') + or (opts.usetitle and opts.autonumber and '%(autonumber)s-%(title)s-%(id)s.%(ext)s') + or (opts.usetitle and '%(title)s-%(id)s.%(ext)s') + or (opts.useid and '%(id)s.%(ext)s') + or (opts.autonumber and '%(autonumber)s-%(id)s.%(ext)s') + or DEFAULT_OUTTMPL) if not os.path.splitext(outtmpl)[1] and opts.extractaudio: parser.error('Cannot download a video and extract audio into the same' ' file! Use "{0}.%(ext)s" instead of "{0}" as the output' diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 7b770340f..c75ab131b 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2364,7 +2364,7 @@ except ImportError: # Python 2 # HACK: The following are the correct unquote_to_bytes, unquote and unquote_plus # implementations from cpython 3.4.3's stdlib. Python 2's version - # is apparently broken (see https://github.com/rg3/youtube-dl/pull/6244) + # is apparently broken (see https://github.com/ytdl-org/youtube-dl/pull/6244) def compat_urllib_parse_unquote_to_bytes(string): """unquote_to_bytes('abc%20def') -> b'abc def'.""" @@ -2508,6 +2508,15 @@ class _TreeBuilder(etree.TreeBuilder): pass +try: + # xml.etree.ElementTree.Element is a method in Python <=2.6 and + # the following will crash with: + # TypeError: isinstance() arg 2 must be a class, type, or tuple of classes and types + isinstance(None, xml.etree.ElementTree.Element) + from xml.etree.ElementTree import Element as compat_etree_Element +except TypeError: # Python <=2.6 + from xml.etree.ElementTree import _ElementInterface as compat_etree_Element + if sys.version_info[0] >= 3: def compat_etree_fromstring(text): return etree.XML(text, parser=etree.XMLParser(target=_TreeBuilder())) @@ -2640,9 +2649,9 @@ else: try: args = shlex.split('中文') - assert (isinstance(args, list) and - isinstance(args[0], compat_str) and - args[0] == '中文') + assert (isinstance(args, list) + and isinstance(args[0], compat_str) + and args[0] == '中文') compat_shlex_split = shlex.split except (AssertionError, UnicodeEncodeError): # Working around shlex issue with unicode strings on some python 2 @@ -2819,7 +2828,7 @@ else: compat_socket_create_connection = socket.create_connection -# Fix https://github.com/rg3/youtube-dl/issues/4223 +# Fix https://github.com/ytdl-org/youtube-dl/issues/4223 # See http://bugs.python.org/issue9161 for what is broken def workaround_optparse_bug9161(): op = optparse.OptionParser() @@ -2944,7 +2953,7 @@ if platform.python_implementation() == 'PyPy' and sys.pypy_version_info < (5, 4, # PyPy2 prior to version 5.4.0 expects byte strings as Windows function # names, see the original PyPy issue [1] and the youtube-dl one [2]. # 1. https://bitbucket.org/pypy/pypy/issues/2360/windows-ctypescdll-typeerror-function-name - # 2. https://github.com/rg3/youtube-dl/pull/4392 + # 2. https://github.com/ytdl-org/youtube-dl/pull/4392 def compat_ctypes_WINFUNCTYPE(*args, **kwargs): real = ctypes.WINFUNCTYPE(*args, **kwargs) @@ -2969,6 +2978,7 @@ __all__ = [ 'compat_cookiejar', 'compat_cookies', 'compat_ctypes_WINFUNCTYPE', + 'compat_etree_Element', 'compat_etree_fromstring', 'compat_etree_register_namespace', 'compat_expanduser', diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 5979833c0..1cdba89cd 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -176,7 +176,9 @@ class FileDownloader(object): return speed = float(byte_counter) / elapsed if speed > rate_limit: - time.sleep(max((byte_counter // rate_limit) - elapsed, 0)) + sleep_time = float(byte_counter) / rate_limit - elapsed + if sleep_time > 0: + time.sleep(sleep_time) def temp_name(self, filename): """Returns a temporary filename for the given filename.""" @@ -330,15 +332,15 @@ class FileDownloader(object): """ nooverwrites_and_exists = ( - self.params.get('nooverwrites', False) and - os.path.exists(encodeFilename(filename)) + self.params.get('nooverwrites', False) + and os.path.exists(encodeFilename(filename)) ) if not hasattr(filename, 'write'): continuedl_and_exists = ( - self.params.get('continuedl', True) and - os.path.isfile(encodeFilename(filename)) and - not self.params.get('nopart', False) + self.params.get('continuedl', True) + and os.path.isfile(encodeFilename(filename)) + and not self.params.get('nopart', False) ) # Check file already present diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index eaa7adf7c..c6d674bc6 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -53,7 +53,7 @@ class DashSegmentsFD(FragmentFD): except compat_urllib_error.HTTPError as err: # YouTube may often return 404 HTTP error for a fragment causing the # whole download to fail. However if the same fragment is immediately - # retried with the same request data this usually succeeds (1-2 attemps + # retried with the same request data this usually succeeds (1-2 attempts # is usually enough) thus allowing to download the whole file successfully. # To be future-proof we will retry all fragments that fail with any # HTTP error. diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 958d00aac..c31f8910a 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -121,7 +121,11 @@ class CurlFD(ExternalFD): cmd += self._valueless_option('--silent', 'noprogress') cmd += self._valueless_option('--verbose', 'verbose') cmd += self._option('--limit-rate', 'ratelimit') - cmd += self._option('--retry', 'retries') + retry = self._option('--retry', 'retries') + if len(retry) == 2: + if retry[1] in ('inf', 'infinite'): + retry[1] = '2147483647' + cmd += retry cmd += self._option('--max-filesize', 'max_filesize') cmd += self._option('--interface', 'source_address') cmd += self._option('--proxy', 'proxy') @@ -160,6 +164,12 @@ class WgetFD(ExternalFD): cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies'] for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] + cmd += self._option('--limit-rate', 'ratelimit') + retry = self._option('--tries', 'retries') + if len(retry) == 2: + if retry[1] in ('inf', 'infinite'): + retry[1] = '0' + cmd += retry cmd += self._option('--bind-address', 'source_address') cmd += self._option('--proxy', 'proxy') cmd += self._valueless_option('--no-check-certificate', 'nocheckcertificate') @@ -184,6 +194,7 @@ class Aria2cFD(ExternalFD): cmd += self._option('--interface', 'source_address') cmd += self._option('--all-proxy', 'proxy') cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=') + cmd += self._bool_option('--remote-time', 'updatetime', 'true', 'false', '=') cmd += ['--', info_dict['url']] return cmd @@ -229,7 +240,7 @@ class FFmpegFD(ExternalFD): # setting -seekable prevents ffmpeg from guessing if the server # supports seeking(by adding the header `Range: bytes=0-`), which # can cause problems in some cases - # https://github.com/rg3/youtube-dl/issues/11800#issuecomment-275037127 + # https://github.com/ytdl-org/youtube-dl/issues/11800#issuecomment-275037127 # http://trac.ffmpeg.org/ticket/6125#comment:10 args += ['-seekable', '1' if seekable else '0'] @@ -279,6 +290,7 @@ class FFmpegFD(ExternalFD): tc_url = info_dict.get('tc_url') flash_version = info_dict.get('flash_version') live = info_dict.get('rtmp_live', False) + conn = info_dict.get('rtmp_conn') if player_url is not None: args += ['-rtmp_swfverify', player_url] if page_url is not None: @@ -293,6 +305,11 @@ class FFmpegFD(ExternalFD): args += ['-rtmp_flashver', flash_version] if live: args += ['-rtmp_live', 'live'] + if isinstance(conn, list): + for entry in conn: + args += ['-rtmp_conn', entry] + elif isinstance(conn, compat_str): + args += ['-rtmp_conn', conn] args += ['-i', url, '-c', 'copy'] @@ -324,7 +341,7 @@ class FFmpegFD(ExternalFD): # mp4 file couldn't be played, but if we ask ffmpeg to quit it # produces a file that is playable (this is mostly useful for live # streams). Note that Windows is not affected and produces playable - # files (see https://github.com/rg3/youtube-dl/issues/8300). + # files (see https://github.com/ytdl-org/youtube-dl/issues/8300). if sys.platform != 'win32': proc.communicate(b'q') raise diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 15e71be9a..8dd3c2eeb 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -238,8 +238,8 @@ def write_metadata_tag(stream, metadata): def remove_encrypted_media(media): - return list(filter(lambda e: 'drmAdditionalHeaderId' not in e.attrib and - 'drmAdditionalHeaderSetId' not in e.attrib, + return list(filter(lambda e: 'drmAdditionalHeaderId' not in e.attrib + and 'drmAdditionalHeaderSetId' not in e.attrib, media)) @@ -267,8 +267,8 @@ class F4mFD(FragmentFD): media = doc.findall(_add_ns('media')) if not media: self.report_error('No media found') - for e in (doc.findall(_add_ns('drmAdditionalHeader')) + - doc.findall(_add_ns('drmAdditionalHeaderSet'))): + for e in (doc.findall(_add_ns('drmAdditionalHeader')) + + doc.findall(_add_ns('drmAdditionalHeaderSet'))): # If id attribute is missing it's valid for all media nodes # without drmAdditionalHeaderId or drmAdditionalHeaderSetId attribute if 'id' not in e.attrib: @@ -324,8 +324,8 @@ class F4mFD(FragmentFD): urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) man_url = urlh.geturl() # Some manifests may be malformed, e.g. prosiebensat1 generated manifests - # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244 - # and https://github.com/rg3/youtube-dl/issues/7823) + # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244 + # and https://github.com/ytdl-org/youtube-dl/issues/7823) manifest = fix_xml_ampersands(urlh.read().decode('utf-8', 'ignore')).strip() doc = compat_etree_fromstring(manifest) @@ -409,7 +409,7 @@ class F4mFD(FragmentFD): # In tests, segments may be truncated, and thus # FlvReader may not be able to parse the whole # chunk. If so, write the segment as is - # See https://github.com/rg3/youtube-dl/issues/9214 + # See https://github.com/ytdl-org/youtube-dl/issues/9214 dest_stream.write(down_data) break raise diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index 917f6dc01..02f35459e 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -190,12 +190,13 @@ class FragmentFD(FileDownloader): }) def _start_frag_download(self, ctx): + resume_len = ctx['complete_frags_downloaded_bytes'] total_frags = ctx['total_frags'] # This dict stores the download progress, it's updated by the progress # hook state = { 'status': 'downloading', - 'downloaded_bytes': ctx['complete_frags_downloaded_bytes'], + 'downloaded_bytes': resume_len, 'fragment_index': ctx['fragment_index'], 'fragment_count': total_frags, 'filename': ctx['filename'], @@ -219,8 +220,8 @@ class FragmentFD(FileDownloader): frag_total_bytes = s.get('total_bytes') or 0 if not ctx['live']: estimated_size = ( - (ctx['complete_frags_downloaded_bytes'] + frag_total_bytes) / - (state['fragment_index'] + 1) * total_frags) + (ctx['complete_frags_downloaded_bytes'] + frag_total_bytes) + / (state['fragment_index'] + 1) * total_frags) state['total_bytes_estimate'] = estimated_size if s['status'] == 'finished': @@ -234,8 +235,8 @@ class FragmentFD(FileDownloader): state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes'] if not ctx['live']: state['eta'] = self.calc_eta( - start, time_now, estimated_size, - state['downloaded_bytes']) + start, time_now, estimated_size - resume_len, + state['downloaded_bytes'] - resume_len) state['speed'] = s.get('speed') or ctx.get('speed') ctx['speed'] = state['speed'] ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index fd304527e..b59aad73f 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -75,9 +75,13 @@ class HlsFD(FragmentFD): fd.add_progress_hook(ph) return fd.real_download(filename, info_dict) - def is_ad_fragment(s): - return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in s or - s.startswith('#UPLYNK-SEGMENT') and s.endswith(',ad')) + def is_ad_fragment_start(s): + return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in s + or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',ad')) + + def is_ad_fragment_end(s): + return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=master' in s + or s.startswith('#UPLYNK-SEGMENT') and s.endswith(',segment')) media_frags = 0 ad_frags = 0 @@ -87,12 +91,13 @@ class HlsFD(FragmentFD): if not line: continue if line.startswith('#'): - if is_ad_fragment(line): - ad_frags += 1 + if is_ad_fragment_start(line): ad_frag_next = True + elif is_ad_fragment_end(line): + ad_frag_next = False continue if ad_frag_next: - ad_frag_next = False + ad_frags += 1 continue media_frags += 1 @@ -123,7 +128,6 @@ class HlsFD(FragmentFD): if line: if not line.startswith('#'): if ad_frag_next: - ad_frag_next = False continue frag_index += 1 if frag_index <= ctx['fragment_index']: @@ -148,8 +152,8 @@ class HlsFD(FragmentFD): except compat_urllib_error.HTTPError as err: # Unavailable (possibly temporary) fragments may be served. # First we try to retry then either skip or abort. - # See https://github.com/rg3/youtube-dl/issues/10165, - # https://github.com/rg3/youtube-dl/issues/10448). + # See https://github.com/ytdl-org/youtube-dl/issues/10165, + # https://github.com/ytdl-org/youtube-dl/issues/10448). count += 1 if count <= fragment_retries: self.report_retry_fragment(err, frag_index, count, fragment_retries) @@ -196,8 +200,10 @@ class HlsFD(FragmentFD): 'start': sub_range_start, 'end': sub_range_start + int(splitted_byte_range[0]), } - elif is_ad_fragment(line): + elif is_ad_fragment_start(line): ad_frag_next = True + elif is_ad_fragment_end(line): + ad_frag_next = False self._finish_frag_download(ctx) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 5b1e96013..3c72ea18b 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -46,8 +46,8 @@ class HttpFD(FileDownloader): is_test = self.params.get('test', False) chunk_size = self._TEST_FILE_SIZE if is_test else ( - info_dict.get('downloader_options', {}).get('http_chunk_size') or - self.params.get('http_chunk_size') or 0) + info_dict.get('downloader_options', {}).get('http_chunk_size') + or self.params.get('http_chunk_size') or 0) ctx.open_mode = 'wb' ctx.resume_len = 0 @@ -111,7 +111,7 @@ class HttpFD(FileDownloader): # to match the value of requested Range HTTP header. This is due to a webservers # that don't support resuming and serve a whole file with no Content-Range # set in response despite of requested Range (see - # https://github.com/rg3/youtube-dl/issues/6057#issuecomment-126129799) + # https://github.com/ytdl-org/youtube-dl/issues/6057#issuecomment-126129799) if has_range: content_range = ctx.data.headers.get('Content-Range') if content_range: @@ -123,11 +123,11 @@ class HttpFD(FileDownloader): content_len = int_or_none(content_range_m.group(3)) accept_content_len = ( # Non-chunked download - not ctx.chunk_size or + not ctx.chunk_size # Chunked download and requested piece or # its part is promised to be served - content_range_end == range_end or - content_len < range_end) + or content_range_end == range_end + or content_len < range_end) if accept_content_len: ctx.data_len = content_len return @@ -152,8 +152,8 @@ class HttpFD(FileDownloader): raise else: # Examine the reported length - if (content_length is not None and - (ctx.resume_len - 100 < int(content_length) < ctx.resume_len + 100)): + if (content_length is not None + and (ctx.resume_len - 100 < int(content_length) < ctx.resume_len + 100)): # The file had already been fully downloaded. # Explanation to the above condition: in issue #175 it was revealed that # YouTube sometimes adds or removes a few bytes from the end of the file, diff --git a/youtube_dl/downloader/ism.py b/youtube_dl/downloader/ism.py index 063fcf444..1ca666b4a 100644 --- a/youtube_dl/downloader/ism.py +++ b/youtube_dl/downloader/ism.py @@ -146,7 +146,7 @@ def write_piff_header(stream, params): sps, pps = codec_private_data.split(u32.pack(1))[1:] avcc_payload = u8.pack(1) # configuration version avcc_payload += sps[1:4] # avc profile indication + profile compatibility + avc level indication - avcc_payload += u8.pack(0xfc | (params.get('nal_unit_length_field', 4) - 1)) # complete represenation (1) + reserved (11111) + length size minus one + avcc_payload += u8.pack(0xfc | (params.get('nal_unit_length_field', 4) - 1)) # complete representation (1) + reserved (11111) + length size minus one avcc_payload += u8.pack(1) # reserved (0) + number of sps (0000001) avcc_payload += u16.pack(len(sps)) avcc_payload += sps diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py index cd29aca77..8b407bf9c 100644 --- a/youtube_dl/extractor/abcnews.py +++ b/youtube_dl/extractor/abcnews.py @@ -15,10 +15,13 @@ class AbcNewsVideoIE(AMPIE): IE_NAME = 'abcnews:video' _VALID_URL = r'''(?x) https?:// - abcnews\.go\.com/ (?: - [^/]+/video/(?P<display_id>[0-9a-z-]+)-| - video/embed\?.*?\bid= + abcnews\.go\.com/ + (?: + [^/]+/video/(?P<display_id>[0-9a-z-]+)-| + video/embed\?.*?\bid= + )| + fivethirtyeight\.abcnews\.go\.com/video/embed/\d+/ ) (?P<id>\d+) ''' diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index 6d846ea7a..b17c792d2 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -7,6 +7,7 @@ import functools from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + clean_html, float_or_none, int_or_none, try_get, @@ -17,27 +18,17 @@ from ..utils import ( class ACastIE(InfoExtractor): IE_NAME = 'acast' - _VALID_URL = r'https?://(?:www\.)?acast\.com/(?P<channel>[^/]+)/(?P<id>[^/#?]+)' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:(?:embed|www)\.)?acast\.com/| + play\.acast\.com/s/ + ) + (?P<channel>[^/]+)/(?P<id>[^/#?]+) + ''' _TESTS = [{ - # test with one bling - 'url': 'https://www.acast.com/condenasttraveler/-where-are-you-taipei-101-taiwan', - 'md5': 'ada3de5a1e3a2a381327d749854788bb', - 'info_dict': { - 'id': '57de3baa-4bb0-487e-9418-2692c1277a34', - 'ext': 'mp3', - 'title': '"Where Are You?": Taipei 101, Taiwan', - 'description': 'md5:a0b4ef3634e63866b542e5b1199a1a0e', - 'timestamp': 1196172000, - 'upload_date': '20071127', - 'duration': 211, - 'creator': 'Concierge', - 'series': 'Condé Nast Traveler Podcast', - 'episode': '"Where Are You?": Taipei 101, Taiwan', - } - }, { - # test with multiple blings 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', - 'md5': 'a02393c74f3bdb1801c3ec2695577ce0', + 'md5': '16d936099ec5ca2d5869e3a813ee8dc4', 'info_dict': { 'id': '2a92b283-1a75-4ad8-8396-499c641de0d9', 'ext': 'mp3', @@ -50,28 +41,43 @@ class ACastIE(InfoExtractor): 'series': 'Spår', 'episode': '2. Raggarmordet - Röster ur det förflutna', } + }, { + 'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015', + 'only_matching': True, + }, { + 'url': 'https://play.acast.com/s/rattegangspodden/s04e09-styckmordet-i-helenelund-del-22', + 'only_matching': True, + }, { + 'url': 'https://play.acast.com/s/sparpodcast/2a92b283-1a75-4ad8-8396-499c641de0d9', + 'only_matching': True, }] def _real_extract(self, url): channel, display_id = re.match(self._VALID_URL, url).groups() s = self._download_json( - 'https://play-api.acast.com/stitch/%s/%s' % (channel, display_id), - display_id)['result'] + 'https://feeder.acast.com/api/v1/shows/%s/episodes/%s' % (channel, display_id), + display_id) media_url = s['url'] + if re.search(r'[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}', display_id): + episode_url = s.get('episodeUrl') + if episode_url: + display_id = episode_url + else: + channel, display_id = re.match(self._VALID_URL, s['link']).groups() cast_data = self._download_json( 'https://play-api.acast.com/splash/%s/%s' % (channel, display_id), display_id)['result'] e = cast_data['episode'] - title = e['name'] + title = e.get('name') or s['title'] return { 'id': compat_str(e['id']), 'display_id': display_id, 'url': media_url, 'title': title, - 'description': e.get('description') or e.get('summary'), + 'description': e.get('summary') or clean_html(e.get('description') or s.get('description')), 'thumbnail': e.get('image'), - 'timestamp': unified_timestamp(e.get('publishingDate')), - 'duration': float_or_none(s.get('duration') or e.get('duration')), + 'timestamp': unified_timestamp(e.get('publishingDate') or s.get('publishDate')), + 'duration': float_or_none(e.get('duration') or s.get('duration')), 'filesize': int_or_none(e.get('contentLength')), 'creator': try_get(cast_data, lambda x: x['show']['author'], compat_str), 'series': try_get(cast_data, lambda x: x['show']['name'], compat_str), @@ -83,17 +89,27 @@ class ACastIE(InfoExtractor): class ACastChannelIE(InfoExtractor): IE_NAME = 'acast:channel' - _VALID_URL = r'https?://(?:www\.)?acast\.com/(?P<id>[^/#?]+)' - _TEST = { - 'url': 'https://www.acast.com/condenasttraveler', + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?acast\.com/| + play\.acast\.com/s/ + ) + (?P<id>[^/#?]+) + ''' + _TESTS = [{ + 'url': 'https://www.acast.com/todayinfocus', 'info_dict': { - 'id': '50544219-29bb-499e-a083-6087f4cb7797', - 'title': 'Condé Nast Traveler Podcast', - 'description': 'md5:98646dee22a5b386626ae31866638fbd', + 'id': '4efc5294-5385-4847-98bd-519799ce5786', + 'title': 'Today in Focus', + 'description': 'md5:9ba5564de5ce897faeb12963f4537a64', }, - 'playlist_mincount': 20, - } - _API_BASE_URL = 'https://www.acast.com/api/' + 'playlist_mincount': 35, + }, { + 'url': 'http://play.acast.com/s/ft-banking-weekly', + 'only_matching': True, + }] + _API_BASE_URL = 'https://play.acast.com/api/' _PAGE_SIZE = 10 @classmethod @@ -106,7 +122,7 @@ class ACastChannelIE(InfoExtractor): channel_slug, note='Download page %d of channel data' % page) for cast in casts: yield self.url_result( - 'https://www.acast.com/%s/%s' % (channel_slug, cast['url']), + 'https://play.acast.com/s/%s/%s' % (channel_slug, cast['url']), 'ACast', cast['id']) def _real_extract(self, url): diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index 9f8a71262..5e7c0724e 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -59,9 +59,9 @@ class AddAnimeIE(InfoExtractor): parsed_url = compat_urllib_parse_urlparse(url) av_val = av_res + len(parsed_url.netloc) confirm_url = ( - parsed_url.scheme + '://' + parsed_url.netloc + - action + '?' + - compat_urllib_parse_urlencode({ + parsed_url.scheme + '://' + parsed_url.netloc + + action + '?' + + compat_urllib_parse_urlencode({ 'jschl_vc': vc, 'jschl_answer': compat_str(av_val)})) self._download_webpage( confirm_url, video_id, diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py index 1eb99c39a..c95ad2173 100644 --- a/youtube_dl/extractor/adn.py +++ b/youtube_dl/extractor/adn.py @@ -21,7 +21,6 @@ from ..utils import ( intlist_to_bytes, long_to_bytes, pkcs1pad, - srt_subtitles_timecode, strip_or_none, urljoin, ) @@ -42,6 +41,18 @@ class ADNIE(InfoExtractor): } _BASE_URL = 'http://animedigitalnetwork.fr' _RSA_KEY = (0xc35ae1e4356b65a73b551493da94b8cb443491c0aa092a357a5aee57ffc14dda85326f42d716e539a34542a0d3f363adf16c5ec222d713d5997194030ee2e4f0d1fb328c01a81cf6868c090d50de8e169c6b13d1675b9eeed1cbc51e1fffca9b38af07f37abd790924cd3bee59d0257cfda4fe5f3f0534877e21ce5821447d1b, 65537) + _POS_ALIGN_MAP = { + 'start': 1, + 'end': 3, + } + _LINE_ALIGN_MAP = { + 'middle': 8, + 'end': 4, + } + + @staticmethod + def _ass_subtitles_timecode(seconds): + return '%01d:%02d:%02d.%02d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 100) def _get_subtitles(self, sub_path, video_id): if not sub_path: @@ -49,14 +60,20 @@ class ADNIE(InfoExtractor): enc_subtitles = self._download_webpage( urljoin(self._BASE_URL, sub_path), - video_id, fatal=False) + video_id, 'Downloading subtitles location', fatal=False) or '{}' + subtitle_location = (self._parse_json(enc_subtitles, video_id, fatal=False) or {}).get('location') + if subtitle_location: + enc_subtitles = self._download_webpage( + urljoin(self._BASE_URL, subtitle_location), + video_id, 'Downloading subtitles data', fatal=False, + headers={'Origin': 'https://animedigitalnetwork.fr'}) if not enc_subtitles: return None # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js dec_subtitles = intlist_to_bytes(aes_cbc_decrypt( bytes_to_intlist(compat_b64decode(enc_subtitles[24:])), - bytes_to_intlist(binascii.unhexlify(self._K + '9032ad7083106400')), + bytes_to_intlist(binascii.unhexlify(self._K + '4b8ef13ec1872730')), bytes_to_intlist(compat_b64decode(enc_subtitles[:24])) )) subtitles_json = self._parse_json( @@ -67,23 +84,27 @@ class ADNIE(InfoExtractor): subtitles = {} for sub_lang, sub in subtitles_json.items(): - srt = '' - for num, current in enumerate(sub): - start, end, text = ( + ssa = '''[Script Info] +ScriptType:V4.00 +[V4 Styles] +Format: Name,Fontname,Fontsize,PrimaryColour,SecondaryColour,TertiaryColour,BackColour,Bold,Italic,BorderStyle,Outline,Shadow,Alignment,MarginL,MarginR,MarginV,AlphaLevel,Encoding +Style: Default,Arial,18,16777215,16777215,16777215,0,-1,0,1,1,0,2,20,20,20,0,0 +[Events] +Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' + for current in sub: + start, end, text, line_align, position_align = ( float_or_none(current.get('startTime')), float_or_none(current.get('endTime')), - current.get('text')) + current.get('text'), current.get('lineAlign'), + current.get('positionAlign')) if start is None or end is None or text is None: continue - srt += os.linesep.join( - ( - '%d' % num, - '%s --> %s' % ( - srt_subtitles_timecode(start), - srt_subtitles_timecode(end)), - text, - os.linesep, - )) + alignment = self._POS_ALIGN_MAP.get(position_align, 2) + self._LINE_ALIGN_MAP.get(line_align, 0) + ssa += os.linesep + 'Dialogue: Marked=0,%s,%s,Default,,0,0,0,,%s%s' % ( + self._ass_subtitles_timecode(start), + self._ass_subtitles_timecode(end), + '{\\a%d}' % alignment if alignment != 2 else '', + text.replace('\n', '\\N').replace('<i>', '{\\i1}').replace('</i>', '{\\i0}')) if sub_lang == 'vostf': sub_lang = 'fr' @@ -91,8 +112,8 @@ class ADNIE(InfoExtractor): 'ext': 'json', 'data': json.dumps(sub), }, { - 'ext': 'srt', - 'data': srt, + 'ext': 'ssa', + 'data': ssa, }]) return subtitles @@ -100,7 +121,15 @@ class ADNIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) player_config = self._parse_json(self._search_regex( - r'playerConfig\s*=\s*({.+});', webpage, 'player config'), video_id) + r'playerConfig\s*=\s*({.+});', webpage, + 'player config', default='{}'), video_id, fatal=False) + if not player_config: + config_url = urljoin(self._BASE_URL, self._search_regex( + r'(?:id="player"|class="[^"]*adn-player-container[^"]*")[^>]+data-url="([^"]+)"', + webpage, 'config url')) + player_config = self._download_json( + config_url, video_id, + 'Downloading player config JSON metadata')['player'] video_info = {} video_info_str = self._search_regex( @@ -129,12 +158,15 @@ class ADNIE(InfoExtractor): encrypted_message = long_to_bytes(pow(bytes_to_long(padded_message), e, n)) authorization = base64.b64encode(encrypted_message).decode() links_data = self._download_json( - urljoin(self._BASE_URL, links_url), video_id, headers={ + urljoin(self._BASE_URL, links_url), video_id, + 'Downloading links JSON metadata', headers={ 'Authorization': 'Bearer ' + authorization, }) links = links_data.get('links') or {} metas = metas or links_data.get('meta') or {} - sub_path = (sub_path or links_data.get('subtitles')) + '&token=' + token + sub_path = sub_path or links_data.get('subtitles') or \ + 'index.php?option=com_vodapi&task=subtitles.getJSON&format=json&id=' + video_id + sub_path += '&token=' + token error = links_data.get('error') title = metas.get('title') or video_info['title'] @@ -142,9 +174,11 @@ class ADNIE(InfoExtractor): for format_id, qualities in links.items(): if not isinstance(qualities, dict): continue - for load_balancer_url in qualities.values(): + for quality, load_balancer_url in qualities.items(): load_balancer_data = self._download_json( - load_balancer_url, video_id, fatal=False) or {} + load_balancer_url, video_id, + 'Downloading %s %s JSON metadata' % (format_id, quality), + fatal=False) or {} m3u8_url = load_balancer_data.get('location') if not m3u8_url: continue diff --git a/youtube_dl/extractor/adobeconnect.py b/youtube_dl/extractor/adobeconnect.py new file mode 100644 index 000000000..728549eb9 --- /dev/null +++ b/youtube_dl/extractor/adobeconnect.py @@ -0,0 +1,37 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) + + +class AdobeConnectIE(InfoExtractor): + _VALID_URL = r'https?://\w+\.adobeconnect\.com/(?P<id>[\w-]+)' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex(r'<title>(.+?)', webpage, 'title') + qs = compat_parse_qs(self._search_regex(r"swfUrl\s*=\s*'([^']+)'", webpage, 'swf url').split('?')[1]) + is_live = qs.get('isLive', ['false'])[0] == 'true' + formats = [] + for con_string in qs['conStrings'][0].split(','): + formats.append({ + 'format_id': con_string.split('://')[0], + 'app': compat_urlparse.quote('?' + con_string.split('?')[1] + 'flvplayerapp/' + qs['appInstance'][0]), + 'ext': 'flv', + 'play_path': 'mp4:' + qs['streamName'][0], + 'rtmp_conn': 'S:' + qs['ticket'][0], + 'rtmp_live': is_live, + 'url': con_string, + }) + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'formats': formats, + 'is_live': is_live, + } diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py index 1cf2dcbf3..38dca1b0a 100644 --- a/youtube_dl/extractor/adobepass.py +++ b/youtube_dl/extractor/adobepass.py @@ -25,6 +25,11 @@ MSO_INFO = { 'username_field': 'username', 'password_field': 'password', }, + 'ATT': { + 'name': 'AT&T U-verse', + 'username_field': 'userid', + 'password_field': 'password', + }, 'ATTOTT': { 'name': 'DIRECTV NOW', 'username_field': 'email', diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 88c96a950..8d1d9ac7d 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -1,13 +1,19 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .turner import TurnerBaseIE from ..utils import ( + determine_ext, + float_or_none, int_or_none, + mimetype2ext, + parse_age_limit, + parse_iso8601, strip_or_none, - url_or_none, + try_get, ) @@ -21,8 +27,8 @@ class AdultSwimIE(TurnerBaseIE): 'ext': 'mp4', 'title': 'Rick and Morty - Pilot', 'description': 'Rick moves in with his daughter\'s family and establishes himself as a bad influence on his grandson, Morty.', - 'timestamp': 1493267400, - 'upload_date': '20170427', + 'timestamp': 1543294800, + 'upload_date': '20181127', }, 'params': { # m3u8 download @@ -43,6 +49,7 @@ class AdultSwimIE(TurnerBaseIE): # m3u8 download 'skip_download': True, }, + 'skip': '404 Not Found', }, { 'url': 'http://www.adultswim.com/videos/decker/inside-decker-a-new-hero/', 'info_dict': { @@ -61,9 +68,9 @@ class AdultSwimIE(TurnerBaseIE): }, { 'url': 'http://www.adultswim.com/videos/attack-on-titan', 'info_dict': { - 'id': 'b7A69dzfRzuaXIECdxW8XQ', + 'id': 'attack-on-titan', 'title': 'Attack on Titan', - 'description': 'md5:6c8e003ea0777b47013e894767f5e114', + 'description': 'md5:41caa9416906d90711e31dc00cb7db7e', }, 'playlist_mincount': 12, }, { @@ -78,83 +85,118 @@ class AdultSwimIE(TurnerBaseIE): # m3u8 download 'skip_download': True, }, + 'skip': '404 Not Found', }] def _real_extract(self, url): show_path, episode_path = re.match(self._VALID_URL, url).groups() display_id = episode_path or show_path - webpage = self._download_webpage(url, display_id) - initial_data = self._parse_json(self._search_regex( - r'AS_INITIAL_DATA(?:__)?\s*=\s*({.+?});', - webpage, 'initial data'), display_id) - - is_stream = show_path == 'streams' - if is_stream: - if not episode_path: - episode_path = 'live-stream' - - video_data = next(stream for stream_path, stream in initial_data['streams'].items() if stream_path == episode_path) - video_id = video_data.get('stream') - - if not video_id: - entries = [] - for episode in video_data.get('archiveEpisodes', []): - episode_url = url_or_none(episode.get('url')) - if not episode_url: - continue - entries.append(self.url_result( - episode_url, 'AdultSwim', episode.get('id'))) - return self.playlist_result( - entries, video_data.get('id'), video_data.get('title'), - strip_or_none(video_data.get('description'))) + query = '''query { + getShowBySlug(slug:"%s") { + %%s + } +}''' % show_path + if episode_path: + query = query % '''title + getVideoBySlug(slug:"%s") { + _id + auth + description + duration + episodeNumber + launchDate + mediaID + seasonNumber + poster + title + tvRating + }''' % episode_path + ['getVideoBySlug'] else: - show_data = initial_data['show'] + query = query % '''metaDescription + title + videos(first:1000,sort:["episode_number"]) { + edges { + node { + _id + slug + } + } + }''' + show_data = self._download_json( + 'https://www.adultswim.com/api/search', display_id, + data=json.dumps({'query': query}).encode(), + headers={'Content-Type': 'application/json'})['data']['getShowBySlug'] + if episode_path: + video_data = show_data['getVideoBySlug'] + video_id = video_data['_id'] + episode_title = title = video_data['title'] + series = show_data.get('title') + if series: + title = '%s - %s' % (series, title) + info = { + 'id': video_id, + 'title': title, + 'description': strip_or_none(video_data.get('description')), + 'duration': float_or_none(video_data.get('duration')), + 'formats': [], + 'subtitles': {}, + 'age_limit': parse_age_limit(video_data.get('tvRating')), + 'thumbnail': video_data.get('poster'), + 'timestamp': parse_iso8601(video_data.get('launchDate')), + 'series': series, + 'season_number': int_or_none(video_data.get('seasonNumber')), + 'episode': episode_title, + 'episode_number': int_or_none(video_data.get('episodeNumber')), + } - if not episode_path: - entries = [] - for video in show_data.get('videos', []): - slug = video.get('slug') - if not slug: + auth = video_data.get('auth') + media_id = video_data.get('mediaID') + if media_id: + info.update(self._extract_ngtv_info(media_id, { + # CDN_TOKEN_APP_ID from: + # https://d2gg02c3xr550i.cloudfront.net/assets/asvp.e9c8bef24322d060ef87.bundle.js + 'appId': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhcHBJZCI6ImFzLXR2ZS1kZXNrdG9wLXB0enQ2bSIsInByb2R1Y3QiOiJ0dmUiLCJuZXR3b3JrIjoiYXMiLCJwbGF0Zm9ybSI6ImRlc2t0b3AiLCJpYXQiOjE1MzI3MDIyNzl9.BzSCk-WYOZ2GMCIaeVb8zWnzhlgnXuJTCu0jGp_VaZE', + }, { + 'url': url, + 'site_name': 'AdultSwim', + 'auth_required': auth, + })) + + if not auth: + extract_data = self._download_json( + 'https://www.adultswim.com/api/shows/v1/videos/' + video_id, + video_id, query={'fields': 'stream'}, fatal=False) or {} + assets = try_get(extract_data, lambda x: x['data']['video']['stream']['assets'], list) or [] + for asset in assets: + asset_url = asset.get('url') + if not asset_url: continue - entries.append(self.url_result( - 'http://adultswim.com/videos/%s/%s' % (show_path, slug), - 'AdultSwim', video.get('id'))) - return self.playlist_result( - entries, show_data.get('id'), show_data.get('title'), - strip_or_none(show_data.get('metadata', {}).get('description'))) + ext = determine_ext(asset_url, mimetype2ext(asset.get('mime_type'))) + if ext == 'm3u8': + info['formats'].extend(self._extract_m3u8_formats( + asset_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + elif ext == 'f4m': + continue + # info['formats'].extend(self._extract_f4m_formats( + # asset_url, video_id, f4m_id='hds', fatal=False)) + elif ext in ('scc', 'ttml', 'vtt'): + info['subtitles'].setdefault('en', []).append({ + 'url': asset_url, + }) + self._sort_formats(info['formats']) - video_data = show_data['sluggedVideo'] - video_id = video_data['id'] - - info = self._extract_cvp_info( - 'http://www.adultswim.com/videos/api/v0/assets?platform=desktop&id=' + video_id, - video_id, { - 'secure': { - 'media_src': 'http://androidhls-secure.cdn.turner.com/adultswim/big', - 'tokenizer_src': 'http://www.adultswim.com/astv/mvpd/processors/services/token_ipadAdobe.do', - }, - }, { - 'url': url, - 'site_name': 'AdultSwim', - 'auth_required': video_data.get('auth'), - }) - - info.update({ - 'id': video_id, - 'display_id': display_id, - 'description': info.get('description') or strip_or_none(video_data.get('description')), - }) - if not is_stream: - info.update({ - 'duration': info.get('duration') or int_or_none(video_data.get('duration')), - 'timestamp': info.get('timestamp') or int_or_none(video_data.get('launch_date')), - 'season_number': info.get('season_number') or int_or_none(video_data.get('season_number')), - 'episode': info['title'], - 'episode_number': info.get('episode_number') or int_or_none(video_data.get('episode_number')), - }) - - info['series'] = video_data.get('collection_title') or info.get('series') - if info['series'] and info['series'] != info['title']: - info['title'] = '%s - %s' % (info['series'], info['title']) - - return info + return info + else: + entries = [] + for edge in show_data.get('videos', {}).get('edges', []): + video = edge.get('node') or {} + slug = video.get('slug') + if not slug: + continue + entries.append(self.url_result( + 'http://adultswim.com/videos/%s/%s' % (show_path, slug), + 'AdultSwim', video.get('_id'))) + return self.playlist_result( + entries, show_path, show_data.get('title'), + strip_or_none(show_data.get('metaDescription'))) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 398e56ea3..611b948f5 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -1,14 +1,15 @@ +# coding: utf-8 from __future__ import unicode_literals import re from .theplatform import ThePlatformIE from ..utils import ( + extract_attributes, + ExtractorError, + int_or_none, smuggle_url, update_url_query, - unescapeHTML, - extract_attributes, - get_element_by_attribute, ) from ..compat import ( compat_urlparse, @@ -19,35 +20,76 @@ class AENetworksBaseIE(ThePlatformIE): _THEPLATFORM_KEY = 'crazyjava' _THEPLATFORM_SECRET = 's3cr3t' + def _extract_aen_smil(self, smil_url, video_id, auth=None): + query = {'mbr': 'true'} + if auth: + query['auth'] = auth + TP_SMIL_QUERY = [{ + 'assetTypes': 'high_video_ak', + 'switch': 'hls_high_ak' + }, { + 'assetTypes': 'high_video_s3' + }, { + 'assetTypes': 'high_video_s3', + 'switch': 'hls_ingest_fastly' + }] + formats = [] + subtitles = {} + last_e = None + for q in TP_SMIL_QUERY: + q.update(query) + m_url = update_url_query(smil_url, q) + m_url = self._sign_url(m_url, self._THEPLATFORM_KEY, self._THEPLATFORM_SECRET) + try: + tp_formats, tp_subtitles = self._extract_theplatform_smil( + m_url, video_id, 'Downloading %s SMIL data' % (q.get('switch') or q['assetTypes'])) + except ExtractorError as e: + last_e = e + continue + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + if last_e and not formats: + raise last_e + self._sort_formats(formats) + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + } + class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' - IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network' + IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault' _VALID_URL = r'''(?x) https?:// (?:www\.)? (?P - (?:history|aetv|mylifetime|lifetimemovieclub)\.com| + (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| fyi\.tv )/ (?: shows/(?P[^/]+(?:/[^/]+){0,2})| movies/(?P[^/]+)(?:/full-movie)?| - specials/(?P[^/]+)/full-special + specials/(?P[^/]+)/(?:full-special|preview-)| + collections/[^/]+/(?P[^/]+) ) ''' _TESTS = [{ 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', - 'md5': 'a97a65f7e823ae10e9244bc5433d5fe6', 'info_dict': { 'id': '22253814', 'ext': 'mp4', - 'title': 'Winter Is Coming', + 'title': 'Winter is Coming', 'description': 'md5:641f424b7a19d8e24f26dea22cf59d74', 'timestamp': 1338306241, 'upload_date': '20120529', 'uploader': 'AENE-NEW', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, 'add_ie': ['ThePlatform'], }, { 'url': 'http://www.history.com/shows/ancient-aliens/season-1', @@ -80,6 +122,12 @@ class AENetworksIE(AENetworksBaseIE): }, { 'url': 'http://www.history.com/specials/sniper-into-the-kill-zone/full-special', 'only_matching': True + }, { + 'url': 'https://www.historyvault.com/collections/america-the-story-of-us/westward', + 'only_matching': True + }, { + 'url': 'https://www.aetv.com/specials/hunting-jonbenets-killer-the-untold-story/preview-hunting-jonbenets-killer-the-untold-story', + 'only_matching': True }] _DOMAIN_TO_REQUESTOR_ID = { 'history.com': 'HISTORY', @@ -90,9 +138,9 @@ class AENetworksIE(AENetworksBaseIE): } def _real_extract(self, url): - domain, show_path, movie_display_id, special_display_id = re.match(self._VALID_URL, url).groups() - display_id = show_path or movie_display_id or special_display_id - webpage = self._download_webpage(url, display_id) + domain, show_path, movie_display_id, special_display_id, collection_display_id = re.match(self._VALID_URL, url).groups() + display_id = show_path or movie_display_id or special_display_id or collection_display_id + webpage = self._download_webpage(url, display_id, headers=self.geo_verification_headers()) if show_path: url_parts = show_path.split('/') url_parts_len = len(url_parts) @@ -120,11 +168,6 @@ class AENetworksIE(AENetworksBaseIE): return self.playlist_result( entries, self._html_search_meta('aetn:SeasonId', webpage)) - query = { - 'mbr': 'true', - 'assetTypes': 'high_video_ak', - 'switch': 'hls_high_ak', - } video_id = self._html_search_meta('aetn:VideoID', webpage) media_url = self._search_regex( [r"media_url\s*=\s*'(?P[^']+)'", @@ -134,64 +177,39 @@ class AENetworksIE(AENetworksBaseIE): theplatform_metadata = self._download_theplatform_metadata(self._search_regex( r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) info = self._parse_theplatform_metadata(theplatform_metadata) + auth = None if theplatform_metadata.get('AETN$isBehindWall'): requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain] resource = self._get_mvpd_resource( requestor_id, theplatform_metadata['title'], theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), theplatform_metadata['ratings'][0]['rating']) - query['auth'] = self._extract_mvpd_auth( + auth = self._extract_mvpd_auth( url, video_id, requestor_id, resource) info.update(self._search_json_ld(webpage, video_id, fatal=False)) - media_url = update_url_query(media_url, query) - media_url = self._sign_url(media_url, self._THEPLATFORM_KEY, self._THEPLATFORM_SECRET) - formats, subtitles = self._extract_theplatform_smil(media_url, video_id) - self._sort_formats(formats) - info.update({ - 'id': video_id, - 'formats': formats, - 'subtitles': subtitles, - }) + info.update(self._extract_aen_smil(media_url, video_id, auth)) return info class HistoryTopicIE(AENetworksBaseIE): IE_NAME = 'history:topic' IE_DESC = 'History.com Topic' - _VALID_URL = r'https?://(?:www\.)?history\.com/topics/(?:[^/]+/)?(?P[^/]+)(?:/[^/]+(?:/(?P[^/?#]+))?)?' + _VALID_URL = r'https?://(?:www\.)?history\.com/topics/[^/]+/(?P[\w+-]+?)-video' _TESTS = [{ - 'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false', + 'url': 'https://www.history.com/topics/valentines-day/history-of-valentines-day-video', 'info_dict': { 'id': '40700995724', 'ext': 'mp4', - 'title': "Bet You Didn't Know: Valentine's Day", + 'title': "History of Valentine’s Day", 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', 'timestamp': 1375819729, 'upload_date': '20130806', - 'uploader': 'AENE-NEW', }, 'params': { # m3u8 download 'skip_download': True, }, 'add_ie': ['ThePlatform'], - }, { - 'url': 'http://www.history.com/topics/world-war-i/world-war-i-history/videos', - 'info_dict': - { - 'id': 'world-war-i-history', - 'title': 'World War I History', - }, - 'playlist_mincount': 23, - }, { - 'url': 'http://www.history.com/topics/world-war-i-history/videos', - 'only_matching': True, - }, { - 'url': 'http://www.history.com/topics/world-war-i/world-war-i-history', - 'only_matching': True, - }, { - 'url': 'http://www.history.com/topics/world-war-i/world-war-i-history/speeches', - 'only_matching': True, }] def theplatform_url_result(self, theplatform_url, video_id, query): @@ -211,27 +229,19 @@ class HistoryTopicIE(AENetworksBaseIE): } def _real_extract(self, url): - topic_id, video_display_id = re.match(self._VALID_URL, url).groups() - if video_display_id: - webpage = self._download_webpage(url, video_display_id) - release_url, video_id = re.search(r"_videoPlayer.play\('([^']+)'\s*,\s*'[^']+'\s*,\s*'(\d+)'\)", webpage).groups() - release_url = unescapeHTML(release_url) - - return self.theplatform_url_result( - release_url, video_id, { - 'mbr': 'true', - 'switch': 'hls', - 'assetTypes': 'high_video_ak', - }) - else: - webpage = self._download_webpage(url, topic_id) - entries = [] - for episode_item in re.findall(r']*>', webpage): - video_attributes = extract_attributes(episode_item) - entries.append(self.theplatform_url_result( - video_attributes['data-release-url'], video_attributes['data-id'], { - 'mbr': 'true', - 'switch': 'hls', - 'assetTypes': 'high_video_ak', - })) - return self.playlist_result(entries, topic_id, get_element_by_attribute('class', 'show-title', webpage)) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + r']+src="[^"]+\btpid=(\d+)', webpage, 'tpid') + result = self._download_json( + 'https://feeds.video.aetnd.com/api/v2/history/videos', + video_id, query={'filter[id]': video_id})['results'][0] + title = result['title'] + info = self._extract_aen_smil(result['publicUrl'], video_id) + info.update({ + 'title': title, + 'description': result.get('description'), + 'duration': int_or_none(result.get('duration')), + 'timestamp': int_or_none(result.get('added'), 1000), + }) + return info diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py index 01736872d..8b32aa886 100644 --- a/youtube_dl/extractor/americastestkitchen.py +++ b/youtube_dl/extractor/americastestkitchen.py @@ -43,10 +43,6 @@ class AmericasTestKitchenIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - partner_id = self._search_regex( - r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)', - webpage, 'kaltura partner id') - video_data = self._parse_json( self._search_regex( r'window\.__INITIAL_STATE__\s*=\s*({.+?})\s*;\s*', @@ -58,7 +54,18 @@ class AmericasTestKitchenIE(InfoExtractor): (lambda x: x['episodeDetail']['content']['data'], lambda x: x['videoDetail']['content']['data']), dict) ep_meta = ep_data.get('full_video', {}) - external_id = ep_data.get('external_id') or ep_meta['external_id'] + + zype_id = ep_meta.get('zype_id') + if zype_id: + embed_url = 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % zype_id + ie_key = 'Zype' + else: + partner_id = self._search_regex( + r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)', + webpage, 'kaltura partner id') + external_id = ep_data.get('external_id') or ep_meta['external_id'] + embed_url = 'kaltura:%s:%s' % (partner_id, external_id) + ie_key = 'Kaltura' title = ep_data.get('title') or ep_meta.get('title') description = clean_html(ep_meta.get('episode_description') or ep_data.get( @@ -72,8 +79,8 @@ class AmericasTestKitchenIE(InfoExtractor): return { '_type': 'url_transparent', - 'url': 'kaltura:%s:%s' % (partner_id, external_id), - 'ie_key': 'Kaltura', + 'url': embed_url, + 'ie_key': ie_key, 'title': title, 'description': description, 'thumbnail': thumbnail, diff --git a/youtube_dl/extractor/anitube.py b/youtube_dl/extractor/anitube.py deleted file mode 100644 index 2fd912da4..000000000 --- a/youtube_dl/extractor/anitube.py +++ /dev/null @@ -1,30 +0,0 @@ -from __future__ import unicode_literals - -from .nuevo import NuevoBaseIE - - -class AnitubeIE(NuevoBaseIE): - IE_NAME = 'anitube.se' - _VALID_URL = r'https?://(?:www\.)?anitube\.se/video/(?P\d+)' - - _TEST = { - 'url': 'http://www.anitube.se/video/36621', - 'md5': '59d0eeae28ea0bc8c05e7af429998d43', - 'info_dict': { - 'id': '36621', - 'ext': 'mp4', - 'title': 'Recorder to Randoseru 01', - 'duration': 180.19, - }, - 'skip': 'Blocked in the US', - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - key = self._search_regex( - r'src=["\']https?://[^/]+/embed/([A-Za-z0-9_-]+)', webpage, 'key') - - return self._extract_nuevo( - 'http://www.anitube.se/nuevo/econfig.php?key=%s' % key, video_id) diff --git a/youtube_dl/extractor/anysex.py b/youtube_dl/extractor/anysex.py deleted file mode 100644 index ad86d6e58..000000000 --- a/youtube_dl/extractor/anysex.py +++ /dev/null @@ -1,61 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - parse_duration, - int_or_none, -) - - -class AnySexIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?anysex\.com/(?P\d+)' - _TEST = { - 'url': 'http://anysex.com/156592/', - 'md5': '023e9fbb7f7987f5529a394c34ad3d3d', - 'info_dict': { - 'id': '156592', - 'ext': 'mp4', - 'title': 'Busty and sexy blondie in her bikini strips for you', - 'description': 'md5:de9e418178e2931c10b62966474e1383', - 'categories': ['Erotic'], - 'duration': 270, - 'age_limit': 18, - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - webpage = self._download_webpage(url, video_id) - - video_url = self._html_search_regex(r"video_url\s*:\s*'([^']+)'", webpage, 'video URL') - - title = self._html_search_regex(r'(.*?)', webpage, 'title') - description = self._html_search_regex( - r'
]*>([^<]+)
', webpage, 'description', fatal=False) - thumbnail = self._html_search_regex( - r'preview_url\s*:\s*\'(.*?)\'', webpage, 'thumbnail', fatal=False) - - categories = re.findall( - r'([^<]+)', webpage) - - duration = parse_duration(self._search_regex( - r'Duration: (?:)?(\d+:\d+)', webpage, 'duration', fatal=False)) - view_count = int_or_none(self._html_search_regex( - r'Views: (\d+)', webpage, 'view count', fatal=False)) - - return { - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'categories': categories, - 'duration': duration, - 'view_count': view_count, - 'age_limit': 18, - } diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index cb9279193..e87994a6a 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -4,6 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( ExtractorError, int_or_none, @@ -12,12 +16,12 @@ from ..utils import ( class AolIE(InfoExtractor): - IE_NAME = 'on.aol.com' - _VALID_URL = r'(?:aol-video:|https?://(?:(?:www|on)\.)?aol\.com/(?:[^/]+/)*(?:[^/?#&]+-)?)(?P[^/?#&]+)' + IE_NAME = 'aol.com' + _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P[0-9a-f]+)' _TESTS = [{ # video with 5min ID - 'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img', + 'url': 'https://www.aol.com/video/view/u-s--official-warns-of-largest-ever-irs-phone-scam/518167793/', 'md5': '18ef68f48740e86ae94b98da815eec42', 'info_dict': { 'id': '518167793', @@ -34,7 +38,7 @@ class AolIE(InfoExtractor): } }, { # video with vidible ID - 'url': 'http://www.aol.com/video/view/netflix-is-raising-rates/5707d6b8e4b090497b04f706/', + 'url': 'https://www.aol.com/video/view/netflix-is-raising-rates/5707d6b8e4b090497b04f706/', 'info_dict': { 'id': '5707d6b8e4b090497b04f706', 'ext': 'mp4', @@ -49,17 +53,29 @@ class AolIE(InfoExtractor): 'skip_download': True, } }, { - 'url': 'http://on.aol.com/partners/abc-551438d309eab105804dbfe8/sneak-peek-was-haley-really-framed-570eaebee4b0448640a5c944', + 'url': 'https://www.aol.com/video/view/park-bench-season-2-trailer/559a1b9be4b0c3bfad3357a7/', 'only_matching': True, }, { - 'url': 'http://on.aol.com/shows/park-bench-shw518173474-559a1b9be4b0c3bfad3357a7?context=SH:SHW518173474:PL4327:1460619712763', - 'only_matching': True, - }, { - 'url': 'http://on.aol.com/video/519442220', + 'url': 'https://www.aol.com/video/view/donald-trump-spokeswoman-tones-down-megyn-kelly-attacks/519442220/', 'only_matching': True, }, { 'url': 'aol-video:5707d6b8e4b090497b04f706', 'only_matching': True, + }, { + 'url': 'https://www.aol.com/video/playlist/PL8245/5ca79d19d21f1a04035db606/', + 'only_matching': True, + }, { + 'url': 'https://www.aol.ca/video/view/u-s-woman-s-family-arrested-for-murder-first-pinned-on-panhandler-police/5c7ccf45bc03931fa04b2fe1/', + 'only_matching': True, + }, { + 'url': 'https://www.aol.co.uk/video/view/-one-dead-and-22-hurt-in-bus-crash-/5cb3a6f3d21f1a072b457347/', + 'only_matching': True, + }, { + 'url': 'https://www.aol.de/video/view/eva-braun-privataufnahmen-von-hitlers-geliebter-werden-digitalisiert/5cb2d49de98ab54c113d3d5d/', + 'only_matching': True, + }, { + 'url': 'https://www.aol.jp/video/playlist/5a28e936a1334d000137da0c/5a28f3151e642219fde19831/', + 'only_matching': True, }] def _real_extract(self, url): @@ -73,7 +89,7 @@ class AolIE(InfoExtractor): video_data = response['data'] formats = [] - m3u8_url = video_data.get('videoMasterPlaylist') + m3u8_url = url_or_none(video_data.get('videoMasterPlaylist')) if m3u8_url: formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) @@ -96,6 +112,12 @@ class AolIE(InfoExtractor): 'width': int(mobj.group(1)), 'height': int(mobj.group(2)), }) + else: + qs = compat_parse_qs(compat_urllib_parse_urlparse(video_url).query) + f.update({ + 'width': int_or_none(qs.get('w', [None])[0]), + 'height': int_or_none(qs.get('h', [None])[0]), + }) formats.append(f) self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id')) diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index 6eb8bbb6e..883dcee7a 100644 --- a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, + merge_dicts, mimetype2ext, url_or_none, ) @@ -12,59 +13,83 @@ from ..utils import ( class AparatIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P[a-zA-Z0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.aparat.com/v/wP8On', 'md5': '131aca2e14fe7c4dcb3c4877ba300c89', 'info_dict': { 'id': 'wP8On', 'ext': 'mp4', 'title': 'تیم گلکسی 11 - زومیت', - 'age_limit': 0, + 'description': 'md5:096bdabcdcc4569f2b8a5e903a3b3028', + 'duration': 231, + 'timestamp': 1387394859, + 'upload_date': '20131218', + 'view_count': int, }, - # 'skip': 'Extremely unreliable', - } + }, { + # multiple formats + 'url': 'https://www.aparat.com/v/8dflw/', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - # Note: There is an easier-to-parse configuration at - # http://www.aparat.com/video/video/config/videohash/%video_id - # but the URL in there does not work - webpage = self._download_webpage( - 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, - video_id) + # Provides more metadata + webpage = self._download_webpage(url, video_id, fatal=False) - title = self._search_regex(r'\s+title:\s*"([^"]+)"', webpage, 'title') + if not webpage: + # Note: There is an easier-to-parse configuration at + # http://www.aparat.com/video/video/config/videohash/%video_id + # but the URL in there does not work + webpage = self._download_webpage( + 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, + video_id) - file_list = self._parse_json( + options = self._parse_json( self._search_regex( - r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage, - 'file list'), + r'options\s*=\s*JSON\.parse\(\s*(["\'])(?P(?:(?!\1).)+)\1\s*\)', + webpage, 'options', group='value'), video_id) + player = options['plugins']['sabaPlayerPlugin'] + formats = [] - for item in file_list[0]: - file_url = url_or_none(item.get('file')) - if not file_url: - continue - ext = mimetype2ext(item.get('type')) - label = item.get('label') - formats.append({ - 'url': file_url, - 'ext': ext, - 'format_id': label or ext, - 'height': int_or_none(self._search_regex( - r'(\d+)[pP]', label or '', 'height', default=None)), - }) - self._sort_formats(formats) + for sources in player['multiSRC']: + for item in sources: + if not isinstance(item, dict): + continue + file_url = url_or_none(item.get('src')) + if not file_url: + continue + item_type = item.get('type') + if item_type == 'application/vnd.apple.mpegurl': + formats.extend(self._extract_m3u8_formats( + file_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + else: + ext = mimetype2ext(item.get('type')) + label = item.get('label') + formats.append({ + 'url': file_url, + 'ext': ext, + 'format_id': 'http-%s' % (label or ext), + 'height': int_or_none(self._search_regex( + r'(\d+)[pP]', label or '', 'height', + default=None)), + }) + self._sort_formats( + formats, field_preference=('height', 'width', 'tbr', 'format_id')) - thumbnail = self._search_regex( - r'image:\s*"([^"]+)"', webpage, 'thumbnail', fatal=False) + info = self._search_json_ld(webpage, video_id, default={}) - return { + if not info.get('title'): + info['title'] = player['title'] + + return merge_dicts(info, { 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'age_limit': self._family_friendly_search(webpage), + 'thumbnail': url_or_none(options.get('poster')), + 'duration': int_or_none(player.get('duration')), 'formats': formats, - } + }) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 6bf8f61eb..8adae4644 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -8,20 +8,23 @@ from .generic import GenericIE from ..utils import ( determine_ext, ExtractorError, - qualities, int_or_none, parse_duration, + qualities, + str_or_none, + try_get, unified_strdate, - xpath_text, + unified_timestamp, update_url_query, url_or_none, + xpath_text, ) from ..compat import compat_etree_fromstring class ARDMediathekIE(InfoExtractor): IE_NAME = 'ARD:mediathek' - _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' + _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' _TESTS = [{ # available till 26.07.2022 @@ -51,8 +54,15 @@ class ARDMediathekIE(InfoExtractor): # audio 'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158', 'only_matching': True, + }, { + 'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url) + def _extract_media_info(self, media_info_url, webpage, video_id): media_info = self._download_json( media_info_url, video_id, 'Downloading media JSON') @@ -173,13 +183,18 @@ class ARDMediathekIE(InfoExtractor): title = self._html_search_regex( [r'(.*?)', r'', - r'

(.*?)

'], + r'

(.*?)

', + r']*>(.*?)'], webpage, 'title') description = self._html_search_meta( 'dcterms.abstract', webpage, 'description', default=None) if description is None: description = self._html_search_meta( - 'description', webpage, 'meta description') + 'description', webpage, 'meta description', default=None) + if description is None: + description = self._html_search_regex( + r'(.+?)

', + webpage, 'teaser text', default=None) # Thumbnail is sometimes not present. # It is in the mobile version, but that seems to use a different URL @@ -288,7 +303,7 @@ class ARDIE(InfoExtractor): class ARDBetaMediathekIE(InfoExtractor): - _VALID_URL = r'https://beta\.ardmediathek\.de/[a-z]+/player/(?P[a-zA-Z0-9]+)/(?P[^/?#]+)' + _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/[^/]+/(?:player|live)/(?P[a-zA-Z0-9]+)(?:/(?P[^/?#]+))?' _TESTS = [{ 'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita', 'md5': '2d02d996156ea3c397cfc5036b5d7f8f', @@ -302,12 +317,18 @@ class ARDBetaMediathekIE(InfoExtractor): 'upload_date': '20180826', 'ext': 'mp4', }, + }, { + 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/', + 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg', + 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('video_id') - display_id = mobj.group('display_id') + display_id = mobj.group('display_id') or video_id webpage = self._download_webpage(url, display_id) data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json') @@ -318,43 +339,62 @@ class ARDBetaMediathekIE(InfoExtractor): 'display_id': display_id, } formats = [] + subtitles = {} + geoblocked = False for widget in data.values(): - if widget.get('_geoblocked'): - raise ExtractorError('This video is not available due to geoblocking', expected=True) - + if widget.get('_geoblocked') is True: + geoblocked = True if '_duration' in widget: - res['duration'] = widget['_duration'] + res['duration'] = int_or_none(widget['_duration']) if 'clipTitle' in widget: res['title'] = widget['clipTitle'] if '_previewImage' in widget: res['thumbnail'] = widget['_previewImage'] if 'broadcastedOn' in widget: - res['upload_date'] = unified_strdate(widget['broadcastedOn']) + res['timestamp'] = unified_timestamp(widget['broadcastedOn']) if 'synopsis' in widget: res['description'] = widget['synopsis'] - if '_subtitleUrl' in widget: - res['subtitles'] = {'de': [{ + subtitle_url = url_or_none(widget.get('_subtitleUrl')) + if subtitle_url: + subtitles.setdefault('de', []).append({ 'ext': 'ttml', - 'url': widget['_subtitleUrl'], - }]} + 'url': subtitle_url, + }) if '_quality' in widget: - format_url = widget['_stream']['json'][0] - - if format_url.endswith('.f4m'): + format_url = url_or_none(try_get( + widget, lambda x: x['_stream']['json'][0])) + if not format_url: + continue + ext = determine_ext(format_url) + if ext == 'f4m': formats.extend(self._extract_f4m_formats( format_url + '?hdcore=3.11.0', video_id, f4m_id='hds', fatal=False)) - elif format_url.endswith('m3u8'): + elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + format_url, video_id, 'mp4', m3u8_id='hls', + fatal=False)) else: + # HTTP formats are not available when geoblocked is True, + # other formats are fine though + if geoblocked: + continue + quality = str_or_none(widget.get('_quality')) formats.append({ - 'format_id': 'http-' + widget['_quality'], + 'format_id': ('http-' + quality) if quality else 'http', 'url': format_url, 'preference': 10, # Plain HTTP, that's nice }) + if not formats and geoblocked: + self.raise_geo_restricted( + msg='This video is not available due to geoblocking', + countries=['DE']) + self._sort_formats(formats) - res['formats'] = formats + res.update({ + 'subtitles': subtitles, + 'formats': formats, + }) return res diff --git a/youtube_dl/extractor/arkena.py b/youtube_dl/extractor/arkena.py index 4495ddbb0..854f58767 100644 --- a/youtube_dl/extractor/arkena.py +++ b/youtube_dl/extractor/arkena.py @@ -103,7 +103,7 @@ class ArkenaIE(InfoExtractor): f_url, video_id, mpd_id=kind, fatal=False)) elif kind == 'silverlight': # TODO: process when ism is supported (see - # https://github.com/rg3/youtube-dl/issues/8118) + # https://github.com/ytdl-org/youtube-dl/issues/8118) continue else: tbr = float_or_none(f.get('Bitrate'), 1000) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index ffc321821..2bd3bfe8a 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -4,17 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_str, - compat_urllib_parse_urlparse, -) +from ..compat import compat_str from ..utils import ( ExtractorError, - find_xpath_attr, - get_element_by_attribute, int_or_none, - NO_DEFAULT, qualities, try_get, unified_strdate, @@ -25,59 +18,7 @@ from ..utils import ( # add tests. -class ArteTvIE(InfoExtractor): - _VALID_URL = r'https?://videos\.arte\.tv/(?Pfr|de|en|es)/.*-(?P.*?)\.html' - IE_NAME = 'arte.tv' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - lang = mobj.group('lang') - video_id = mobj.group('id') - - ref_xml_url = url.replace('/videos/', '/do_delegate/videos/') - ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml') - ref_xml_doc = self._download_xml( - ref_xml_url, video_id, note='Downloading metadata') - config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang) - config_xml_url = config_node.attrib['ref'] - config = self._download_xml( - config_xml_url, video_id, note='Downloading configuration') - - formats = [{ - 'format_id': q.attrib['quality'], - # The playpath starts at 'mp4:', if we don't manually - # split the url, rtmpdump will incorrectly parse them - 'url': q.text.split('mp4:', 1)[0], - 'play_path': 'mp4:' + q.text.split('mp4:', 1)[1], - 'ext': 'flv', - 'quality': 2 if q.attrib['quality'] == 'hd' else 1, - } for q in config.findall('./urls/url')] - self._sort_formats(formats) - - title = config.find('.//name').text - thumbnail = config.find('.//firstThumbnailUrl').text - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - } - - class ArteTVBaseIE(InfoExtractor): - @classmethod - def _extract_url_info(cls, url): - mobj = re.match(cls._VALID_URL, url) - lang = mobj.group('lang') - query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) - if 'vid' in query: - video_id = query['vid'][0] - else: - # This is not a real id, it can be for example AJT for the news - # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal - video_id = mobj.group('id') - return video_id, lang - def _extract_from_json_url(self, json_url, video_id, lang, title=None): info = self._download_json(json_url, video_id) player_info = info['videoJsonPlayer'] @@ -108,13 +49,15 @@ class ArteTVBaseIE(InfoExtractor): 'upload_date': unified_strdate(upload_date_str), 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), } - qfunc = qualities(['HQ', 'MQ', 'EQ', 'SQ']) + qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ']) LANGS = { 'fr': 'F', 'de': 'A', 'en': 'E[ANG]', 'es': 'E[ESP]', + 'it': 'E[ITA]', + 'pl': 'E[POL]', } langcode = LANGS.get(lang, lang) @@ -126,8 +69,8 @@ class ArteTVBaseIE(InfoExtractor): l = re.escape(langcode) # Language preference from most to least priority - # Reference: section 5.6.3 of - # http://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-05.pdf + # Reference: section 6.8 of + # https://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-07-1.pdf PREFERENCES = ( # original version in requested language, without subtitles r'VO{0}$'.format(l), @@ -193,274 +136,59 @@ class ArteTVBaseIE(InfoExtractor): class ArteTVPlus7IE(ArteTVBaseIE): IE_NAME = 'arte.tv:+7' - _VALID_URL = r'https?://(?:(?:www|sites)\.)?arte\.tv/(?:[^/]+/)?(?Pfr|de|en|es)/(?:videos/)?(?:[^/]+/)*(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?Pfr|de|en|es|it|pl)/videos/(?P\d{6}-\d{3}-[AF])' _TESTS = [{ - 'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D', - 'only_matching': True, - }, { - 'url': 'http://sites.arte.tv/karambolage/de/video/karambolage-22', - 'only_matching': True, - }, { - 'url': 'http://www.arte.tv/de/videos/048696-000-A/der-kluge-bauch-unser-zweites-gehirn', - 'only_matching': True, + 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', + 'info_dict': { + 'id': '088501-000-A', + 'ext': 'mp4', + 'title': 'Mexico: Stealing Petrol to Survive', + 'upload_date': '20190628', + }, }] - @classmethod - def suitable(cls, url): - return False if ArteTVPlaylistIE.suitable(url) else super(ArteTVPlus7IE, cls).suitable(url) - def _real_extract(self, url): - video_id, lang = self._extract_url_info(url) - webpage = self._download_webpage(url, video_id) - return self._extract_from_webpage(webpage, video_id, lang) - - def _extract_from_webpage(self, webpage, video_id, lang): - patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']') - ids = (video_id, '') - # some pages contain multiple videos (like - # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D), - # so we first try to look for json URLs that contain the video id from - # the 'vid' parameter. - patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates] - json_url = self._html_search_regex( - patterns, webpage, 'json vp url', default=None) - if not json_url: - def find_iframe_url(webpage, default=NO_DEFAULT): - return self._html_search_regex( - r']+src=(["\'])(?P.+\bjson_url=.+?)\1', - webpage, 'iframe url', group='url', default=default) - - iframe_url = find_iframe_url(webpage, None) - if not iframe_url: - embed_url = self._html_search_regex( - r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None) - if embed_url: - player = self._download_json( - embed_url, video_id, 'Downloading player page') - iframe_url = find_iframe_url(player['html']) - # en and es URLs produce react-based pages with different layout (e.g. - # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world) - if not iframe_url: - program = self._search_regex( - r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n', - webpage, 'program', default=None) - if program: - embed_html = self._parse_json(program, video_id) - if embed_html: - iframe_url = find_iframe_url(embed_html['embed_html']) - if iframe_url: - json_url = compat_parse_qs( - compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] - if json_url: - title = self._search_regex( - r']+title=(["\'])(?P.+?)\1', - webpage, 'title', default=None, group='title') - return self._extract_from_json_url(json_url, video_id, lang, title=title) - # Different kind of embed URL (e.g. - # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) - entries = [ - self.url_result(url) - for _, url in re.findall(r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', webpage)] - return self.playlist_result(entries) - - -# It also uses the arte_vp_url url from the webpage to extract the information -class ArteTVCreativeIE(ArteTVPlus7IE): - IE_NAME = 'arte.tv:creative' - _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)' - - _TESTS = [{ - 'url': 'http://creative.arte.tv/fr/episode/osmosis-episode-1', - 'info_dict': { - 'id': '057405-001-A', - 'ext': 'mp4', - 'title': 'OSMOSIS - N\'AYEZ PLUS PEUR D\'AIMER (1)', - 'upload_date': '20150716', - }, - }, { - 'url': 'http://creative.arte.tv/fr/Monty-Python-Reunion', - 'playlist_count': 11, - 'add_ie': ['Youtube'], - }, { - 'url': 'http://creative.arte.tv/de/episode/agentur-amateur-4-der-erste-kunde', - 'only_matching': True, - }] - - -class ArteTVInfoIE(ArteTVPlus7IE): - IE_NAME = 'arte.tv:info' - _VALID_URL = r'https?://info\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)' - - _TESTS = [{ - 'url': 'http://info.arte.tv/fr/service-civique-un-cache-misere', - 'info_dict': { - 'id': '067528-000-A', - 'ext': 'mp4', - 'title': 'Service civique, un cache misère ?', - 'upload_date': '20160403', - }, - }] - - -class ArteTVFutureIE(ArteTVPlus7IE): - IE_NAME = 'arte.tv:future' - _VALID_URL = r'https?://future\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)' - - _TESTS = [{ - 'url': 'http://future.arte.tv/fr/info-sciences/les-ecrevisses-aussi-sont-anxieuses', - 'info_dict': { - 'id': '050940-028-A', - 'ext': 'mp4', - 'title': 'Les écrevisses aussi peuvent être anxieuses', - 'upload_date': '20140902', - }, - }, { - 'url': 'http://future.arte.tv/fr/la-science-est-elle-responsable', - 'only_matching': True, - }] - - -class ArteTVDDCIE(ArteTVPlus7IE): - IE_NAME = 'arte.tv:ddc' - _VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>[^/?#&]+)' - - _TESTS = [] - - def _real_extract(self, url): - video_id, lang = self._extract_url_info(url) - if lang == 'folge': - lang = 'de' - elif lang == 'emission': - lang = 'fr' - webpage = self._download_webpage(url, video_id) - scriptElement = get_element_by_attribute('class', 'visu_video_block', webpage) - script_url = self._html_search_regex(r'src="(.*?)"', scriptElement, 'script url') - javascriptPlayerGenerator = self._download_webpage(script_url, video_id, 'Download javascript player generator') - json_url = self._search_regex(r"json_url=(.*)&rendering_place.*", javascriptPlayerGenerator, 'json url') - return self._extract_from_json_url(json_url, video_id, lang) - - -class ArteTVConcertIE(ArteTVPlus7IE): - IE_NAME = 'arte.tv:concert' - _VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)' - - _TESTS = [{ - 'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde', - 'md5': '9ea035b7bd69696b67aa2ccaaa218161', - 'info_dict': { - 'id': '186', - 'ext': 'mp4', - 'title': 'The Notwist im Pariser Konzertclub "Divan du Monde"', - 'upload_date': '20140128', - 'description': 'md5:486eb08f991552ade77439fe6d82c305', - }, - }] - - -class ArteTVCinemaIE(ArteTVPlus7IE): - IE_NAME = 'arte.tv:cinema' - _VALID_URL = r'https?://cinema\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>.+)' - - _TESTS = [{ - 'url': 'http://cinema.arte.tv/fr/article/les-ailes-du-desir-de-julia-reck', - 'md5': 'a5b9dd5575a11d93daf0e3f404f45438', - 'info_dict': { - 'id': '062494-000-A', - 'ext': 'mp4', - 'title': 'Film lauréat du concours web - "Les ailes du désir" de Julia Reck', - 'upload_date': '20150807', - }, - }] - - -class ArteTVMagazineIE(ArteTVPlus7IE): - IE_NAME = 'arte.tv:magazine' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/magazine/[^/]+/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)' - - _TESTS = [{ - # Embedded via <iframe src="http://www.arte.tv/arte_vp/index.php?json_url=..." - 'url': 'http://www.arte.tv/magazine/trepalium/fr/entretien-avec-le-realisateur-vincent-lannoo-trepalium', - 'md5': '2a9369bcccf847d1c741e51416299f25', - 'info_dict': { - 'id': '065965-000-A', - 'ext': 'mp4', - 'title': 'Trepalium - Extrait Ep.01', - 'upload_date': '20160121', - }, - }, { - # Embedded via <iframe src="http://www.arte.tv/guide/fr/embed/054813-004-A/medium" - 'url': 'http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium', - 'md5': 'fedc64fc7a946110fe311634e79782ca', - 'info_dict': { - 'id': '054813-004_PLUS7-F', - 'ext': 'mp4', - 'title': 'Trepalium (4/6)', - 'description': 'md5:10057003c34d54e95350be4f9b05cb40', - 'upload_date': '20160218', - }, - }, { - 'url': 'http://www.arte.tv/magazine/metropolis/de/frank-woeste-german-paris-metropolis', - 'only_matching': True, - }] + lang, video_id = re.match(self._VALID_URL, url).groups() + return self._extract_from_json_url( + 'https://api.arte.tv/api/player/v1/config/%s/%s' % (lang, video_id), + video_id, lang) class ArteTVEmbedIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:embed' _VALID_URL = r'''(?x) - http://www\.arte\.tv - /(?:playerv2/embed|arte_vp/index)\.php\?json_url= + https://www\.arte\.tv + /player/v3/index\.php\?json_url= (?P<json_url> - http://arte\.tv/papi/tvguide/videos/stream/player/ - (?P<lang>[^/]+)/(?P<id>[^/]+)[^&]* + https?://api\.arte\.tv/api/player/v1/config/ + (?P<lang>[^/]+)/(?P<id>\d{6}-\d{3}-[AF]) ) ''' _TESTS = [] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - lang = mobj.group('lang') - json_url = mobj.group('json_url') + json_url, lang, video_id = re.match(self._VALID_URL, url).groups() return self._extract_from_json_url(json_url, video_id, lang) -class TheOperaPlatformIE(ArteTVPlus7IE): - IE_NAME = 'theoperaplatform' - _VALID_URL = r'https?://(?:www\.)?theoperaplatform\.eu/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)' - - _TESTS = [{ - 'url': 'http://www.theoperaplatform.eu/de/opera/verdi-otello', - 'md5': '970655901fa2e82e04c00b955e9afe7b', - 'info_dict': { - 'id': '060338-009-A', - 'ext': 'mp4', - 'title': 'Verdi - OTELLO', - 'upload_date': '20160927', - }, - }] - - class ArteTVPlaylistIE(ArteTVBaseIE): IE_NAME = 'arte.tv:playlist' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/[^#]*#collection/(?P<id>PL-\d+)' + _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>RC-\d{6})' _TESTS = [{ - 'url': 'http://www.arte.tv/guide/de/plus7/?country=DE#collection/PL-013263/ARTETV', + 'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/', 'info_dict': { - 'id': 'PL-013263', - 'title': 'Areva & Uramin', - 'description': 'md5:a1dc0312ce357c262259139cfd48c9bf', + 'id': 'RC-016954', + 'title': 'Earn a Living', + 'description': 'md5:d322c55011514b3a7241f7fb80d494c2', }, 'playlist_mincount': 6, - }, { - 'url': 'http://www.arte.tv/guide/de/playlists?country=DE#collection/PL-013190/ARTETV', - 'only_matching': True, }] def _real_extract(self, url): - playlist_id, lang = self._extract_url_info(url) + lang, playlist_id = re.match(self._VALID_URL, url).groups() collection = self._download_json( 'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos' % (lang, playlist_id), playlist_id) diff --git a/youtube_dl/extractor/asiancrush.py b/youtube_dl/extractor/asiancrush.py index 6d71c5ad5..0348e680c 100644 --- a/youtube_dl/extractor/asiancrush.py +++ b/youtube_dl/extractor/asiancrush.py @@ -5,14 +5,12 @@ import re from .common import InfoExtractor from .kaltura import KalturaIE -from ..utils import ( - extract_attributes, - remove_end, -) +from ..utils import extract_attributes class AsianCrushIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?asiancrush\.com/video/(?:[^/]+/)?0+(?P<id>\d+)v\b' + _VALID_URL_BASE = r'https?://(?:www\.)?(?P<host>(?:(?:asiancrush|yuyutv|midnightpulp)\.com|cocoro\.tv))' + _VALID_URL = r'%s/video/(?:[^/]+/)?0+(?P<id>\d+)v\b' % _VALID_URL_BASE _TESTS = [{ 'url': 'https://www.asiancrush.com/video/012869v/women-who-flirt/', 'md5': 'c3b740e48d0ba002a42c0b72857beae6', @@ -20,7 +18,7 @@ class AsianCrushIE(InfoExtractor): 'id': '1_y4tmjm5r', 'ext': 'mp4', 'title': 'Women Who Flirt', - 'description': 'md5:3db14e9186197857e7063522cb89a805', + 'description': 'md5:7e986615808bcfb11756eb503a751487', 'timestamp': 1496936429, 'upload_date': '20170608', 'uploader_id': 'craig@crifkin.com', @@ -28,10 +26,27 @@ class AsianCrushIE(InfoExtractor): }, { 'url': 'https://www.asiancrush.com/video/she-was-pretty/011886v-pretty-episode-3/', 'only_matching': True, + }, { + 'url': 'https://www.yuyutv.com/video/013886v/the-act-of-killing/', + 'only_matching': True, + }, { + 'url': 'https://www.yuyutv.com/video/peep-show/013922v-warring-factions/', + 'only_matching': True, + }, { + 'url': 'https://www.midnightpulp.com/video/010400v/drifters/', + 'only_matching': True, + }, { + 'url': 'https://www.midnightpulp.com/video/mononoke/016378v-zashikiwarashi-part-1/', + 'only_matching': True, + }, { + 'url': 'https://www.cocoro.tv/video/the-wonderful-wizard-of-oz/008878v-the-wonderful-wizard-of-oz-ep01/', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) @@ -51,7 +66,7 @@ class AsianCrushIE(InfoExtractor): r'\bentry_id["\']\s*:\s*["\'](\d+)', webpage, 'entry id') player = self._download_webpage( - 'https://api.asiancrush.com/embeddedVideoPlayer', video_id, + 'https://api.%s/embeddedVideoPlayer' % host, video_id, query={'id': entry_id}) kaltura_id = self._search_regex( @@ -63,15 +78,23 @@ class AsianCrushIE(InfoExtractor): r'/p(?:artner_id)?/(\d+)', player, 'partner id', default='513551') - return self.url_result( - 'kaltura:%s:%s' % (partner_id, kaltura_id), - ie=KalturaIE.ie_key(), video_id=kaltura_id, - video_title=title) + description = self._html_search_regex( + r'(?s)<div[^>]+\bclass=["\']description["\'][^>]*>(.+?)</div>', + webpage, 'description', fatal=False) + + return { + '_type': 'url_transparent', + 'url': 'kaltura:%s:%s' % (partner_id, kaltura_id), + 'ie_key': KalturaIE.ie_key(), + 'id': video_id, + 'title': title, + 'description': description, + } class AsianCrushPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?asiancrush\.com/series/0+(?P<id>\d+)s\b' - _TEST = { + _VALID_URL = r'%s/series/0+(?P<id>\d+)s\b' % AsianCrushIE._VALID_URL_BASE + _TESTS = [{ 'url': 'https://www.asiancrush.com/series/012481s/scholar-walks-night/', 'info_dict': { 'id': '12481', @@ -79,7 +102,16 @@ class AsianCrushPlaylistIE(InfoExtractor): 'description': 'md5:7addd7c5132a09fd4741152d96cce886', }, 'playlist_count': 20, - } + }, { + 'url': 'https://www.yuyutv.com/series/013920s/peep-show/', + 'only_matching': True, + }, { + 'url': 'https://www.midnightpulp.com/series/016375s/mononoke/', + 'only_matching': True, + }, { + 'url': 'https://www.cocoro.tv/series/008549s/the-wonderful-wizard-of-oz/', + 'only_matching': True, + }] def _real_extract(self, url): playlist_id = self._match_id(url) @@ -96,15 +128,15 @@ class AsianCrushPlaylistIE(InfoExtractor): entries.append(self.url_result( mobj.group('url'), ie=AsianCrushIE.ie_key())) - title = remove_end( - self._html_search_regex( - r'(?s)<h1\b[^>]\bid=["\']movieTitle[^>]+>(.+?)</h1>', webpage, - 'title', default=None) or self._og_search_title( - webpage, default=None) or self._html_search_meta( - 'twitter:title', webpage, 'title', - default=None) or self._search_regex( - r'<title>([^<]+)', webpage, 'title', fatal=False), - ' | AsianCrush') + title = self._html_search_regex( + r'(?s)]\bid=["\']movieTitle[^>]+>(.+?)', webpage, + 'title', default=None) or self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', + default=None) or self._search_regex( + r'([^<]+)', webpage, 'title', fatal=False) + if title: + title = re.sub(r'\s*\|\s*.+?$', '', title) description = self._og_search_description( webpage, default=None) or self._html_search_meta( diff --git a/youtube_dl/extractor/atvat.py b/youtube_dl/extractor/atvat.py index 1584d53fc..95e572d70 100644 --- a/youtube_dl/extractor/atvat.py +++ b/youtube_dl/extractor/atvat.py @@ -28,8 +28,10 @@ class ATVAtIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) video_data = self._parse_json(unescapeHTML(self._search_regex( - r'class="[^"]*jsb_video/FlashPlayer[^"]*"[^>]+data-jsb="([^"]+)"', - webpage, 'player data')), display_id)['config']['initial_video'] + [r'flashPlayerOptions\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', + r'class="[^"]*jsb_video/FlashPlayer[^"]*"[^>]+data-jsb="(?P[^"]+)"'], + webpage, 'player data', group='json')), + display_id)['config']['initial_video'] video_id = video_data['id'] video_title = video_data['title'] diff --git a/youtube_dl/extractor/audiomack.py b/youtube_dl/extractor/audiomack.py index 62049b921..cc7771354 100644 --- a/youtube_dl/extractor/audiomack.py +++ b/youtube_dl/extractor/audiomack.py @@ -62,7 +62,7 @@ class AudiomackIE(InfoExtractor): # Audiomack wraps a lot of soundcloud tracks in their branded wrapper # if so, pass the work off to the soundcloud extractor if SoundcloudIE.suitable(api_response['url']): - return {'_type': 'url', 'url': api_response['url'], 'ie_key': 'Soundcloud'} + return self.url_result(api_response['url'], SoundcloudIE.ie_key()) return { 'id': compat_str(api_response.get('id', album_url_tag)), diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py index 68f26e2ca..fcbdc71b9 100644 --- a/youtube_dl/extractor/azmedien.py +++ b/youtube_dl/extractor/azmedien.py @@ -1,213 +1,86 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor from .kaltura import KalturaIE -from ..utils import ( - get_element_by_class, - get_element_by_id, - strip_or_none, - urljoin, -) -class AZMedienBaseIE(InfoExtractor): - def _kaltura_video(self, partner_id, entry_id): - return self.url_result( - 'kaltura:%s:%s' % (partner_id, entry_id), ie=KalturaIE.ie_key(), - video_id=entry_id) - - -class AZMedienIE(AZMedienBaseIE): +class AZMedienIE(InfoExtractor): IE_DESC = 'AZ Medien videos' _VALID_URL = r'''(?x) https?:// (?:www\.)? - (?: + (?P telezueri\.ch| telebaern\.tv| telem1\.ch )/ - [0-9]+-show-[^/\#]+ - (?: - /[0-9]+-episode-[^/\#]+ - (?: - /[0-9]+-segment-(?:[^/\#]+\#)?| - \# - )| - \# + [^/]+/ + (?P + [^/]+-(?P\d+) ) - (?P[^\#]+) + (?: + \#video= + (?P + [_0-9a-z]+ + ) + )? ''' _TESTS = [{ - # URL with 'segment' - 'url': 'http://www.telezueri.ch/62-show-zuerinews/13772-episode-sonntag-18-dezember-2016/32419-segment-massenabweisungen-beim-hiltl-club-wegen-pelzboom', + 'url': 'https://www.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569', 'info_dict': { - 'id': '1_2444peh4', + 'id': '1_anruz3wy', 'ext': 'mp4', - 'title': 'Massenabweisungen beim Hiltl Club wegen Pelzboom', - 'description': 'md5:9ea9dd1b159ad65b36ddcf7f0d7c76a8', - 'uploader_id': 'TeleZ?ri', - 'upload_date': '20161218', - 'timestamp': 1482084490, + 'title': 'Bundesrats-Vakanzen / EU-Rahmenabkommen', + 'uploader_id': 'TVOnline', + 'upload_date': '20180930', + 'timestamp': 1538328802, }, 'params': { 'skip_download': True, }, }, { - # URL with 'segment' and fragment: - 'url': 'http://www.telebaern.tv/118-show-news/14240-episode-dienstag-17-januar-2017/33666-segment-achtung-gefahr#zu-wenig-pflegerinnen-und-pfleger', - 'only_matching': True - }, { - # URL with 'episode' and fragment: - 'url': 'http://www.telem1.ch/47-show-sonntalk/13986-episode-soldaten-fuer-grenzschutz-energiestrategie-obama-bilanz#soldaten-fuer-grenzschutz-energiestrategie-obama-bilanz', - 'only_matching': True - }, { - # URL with 'show' and fragment: - 'url': 'http://www.telezueri.ch/66-show-sonntalk#burka-plakate-trump-putin-china-besuch', + 'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1', 'only_matching': True }] - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - partner_id = self._search_regex( - r']+src=["\'](?:https?:)?//(?:[^/]+\.)?kaltura\.com(?:/[^/]+)*/(?:p|partner_id)/([0-9]+)', - webpage, 'kaltura partner id') - entry_id = self._html_search_regex( - r']+data-id=(["\'])(?P(?:(?!\1).)+)\1[^>]+data-slug=["\']%s' - % re.escape(video_id), webpage, 'kaltura entry id', group='id') - - return self._kaltura_video(partner_id, entry_id) - - -class AZMedienPlaylistIE(AZMedienBaseIE): - IE_DESC = 'AZ Medien playlists' - _VALID_URL = r'''(?x) - https?:// - (?:www\.)? - (?: - telezueri\.ch| - telebaern\.tv| - telem1\.ch - )/ - (?P[0-9]+- - (?: - show| - topic| - themen - )-[^/\#]+ - (?: - /[0-9]+-episode-[^/\#]+ - )? - )$ - ''' - - _TESTS = [{ - # URL with 'episode' - 'url': 'http://www.telebaern.tv/118-show-news/13735-episode-donnerstag-15-dezember-2016', - 'info_dict': { - 'id': '118-show-news/13735-episode-donnerstag-15-dezember-2016', - 'title': 'News - Donnerstag, 15. Dezember 2016', - }, - 'playlist_count': 9, - }, { - # URL with 'themen' - 'url': 'http://www.telem1.ch/258-themen-tele-m1-classics', - 'info_dict': { - 'id': '258-themen-tele-m1-classics', - 'title': 'Tele M1 Classics', - }, - 'playlist_mincount': 15, - }, { - # URL with 'topic', contains nested playlists - 'url': 'http://www.telezueri.ch/219-topic-aera-trump-hat-offiziell-begonnen', - 'only_matching': True, - }, { - # URL with 'show' only - 'url': 'http://www.telezueri.ch/86-show-talktaeglich', - 'only_matching': True - }] + _PARTNER_ID = '1719221' def _real_extract(self, url): - show_id = self._match_id(url) - webpage = self._download_webpage(url, show_id) + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + video_id = mobj.group('id') + entry_id = mobj.group('kaltura_id') - entries = [] + if not entry_id: + api_url = 'https://www.%s/api/pub/gql/%s' % (host, host.split('.')[0]) + payload = { + 'query': '''query VideoContext($articleId: ID!) { + article: node(id: $articleId) { + ... on Article { + mainAssetRelation { + asset { + ... on VideoAsset { + kalturaId + } + } + } + } + } + }''', + 'variables': {'articleId': 'Article:%s' % mobj.group('article_id')}, + } + json_data = self._download_json( + api_url, video_id, headers={ + 'Content-Type': 'application/json', + }, + data=json.dumps(payload).encode()) + entry_id = json_data['data']['article']['mainAssetRelation']['asset']['kalturaId'] - partner_id = self._search_regex( - r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)', - webpage, 'kaltura partner id', default=None) - - if partner_id: - entries = [ - self._kaltura_video(partner_id, m.group('id')) - for m in re.finditer( - r'data-id=(["\'])(?P(?:(?!\1).)+)\1', webpage)] - - if not entries: - entries = [ - self.url_result(m.group('url'), ie=AZMedienIE.ie_key()) - for m in re.finditer( - r']+data-real=(["\'])(?Phttp.+?)\1', webpage)] - - if not entries: - entries = [ - # May contain nested playlists (e.g. [1]) thus no explicit - # ie_key - # 1. http://www.telezueri.ch/219-topic-aera-trump-hat-offiziell-begonnen) - self.url_result(urljoin(url, m.group('url'))) - for m in re.finditer( - r']+name=[^>]+href=(["\'])(?P/.+?)\1', webpage)] - - title = self._search_regex( - r'episodeShareTitle\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'title', - default=strip_or_none(get_element_by_id( - 'video-title', webpage)), group='title') - - return self.playlist_result(entries, show_id, title) - - -class AZMedienShowPlaylistIE(AZMedienBaseIE): - IE_DESC = 'AZ Medien show playlists' - _VALID_URL = r'''(?x) - https?:// - (?:www\.)? - (?: - telezueri\.ch| - telebaern\.tv| - telem1\.ch - )/ - (?: - all-episodes| - alle-episoden - )/ - (?P<id>[^/?#&]+) - ''' - - _TEST = { - 'url': 'http://www.telezueri.ch/all-episodes/astrotalk', - 'info_dict': { - 'id': 'astrotalk', - 'title': 'TeleZüri: AstroTalk - alle episoden', - 'description': 'md5:4c0f7e7d741d906004266e295ceb4a26', - }, - 'playlist_mincount': 13, - } - - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - episodes = get_element_by_class('search-mobile-box', webpage) - entries = [self.url_result( - urljoin(url, m.group('url'))) for m in re.finditer( - r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', episodes)] - title = self._og_search_title(webpage, fatal=False) - description = self._og_search_description(webpage) - return self.playlist_result(entries, playlist_id, title, description) + return self.url_result( + 'kaltura:%s:%s' % (self._PARTNER_ID, entry_id), + ie=KalturaIE.ie_key(), video_id=entry_id) diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py index 34f1b3d83..4400ff9c1 100644 --- a/youtube_dl/extractor/bambuser.py +++ b/youtube_dl/extractor/bambuser.py @@ -23,7 +23,7 @@ class BambuserIE(InfoExtractor): _TEST = { 'url': 'http://bambuser.com/v/4050584', - # MD5 seems to be flaky, see https://travis-ci.org/rg3/youtube-dl/jobs/14051016#L388 + # MD5 seems to be flaky, see https://travis-ci.org/ytdl-org/youtube-dl/jobs/14051016#L388 # 'md5': 'fba8f7693e48fd4e8641b3fd5539a641', 'info_dict': { 'id': '4050584', @@ -38,7 +38,7 @@ class BambuserIE(InfoExtractor): }, 'params': { # It doesn't respect the 'Range' header, it would download the whole video - # caused the travis builds to fail: https://travis-ci.org/rg3/youtube-dl/jobs/14493845#L59 + # caused the travis builds to fail: https://travis-ci.org/ytdl-org/youtube-dl/jobs/14493845#L59 'skip_download': True, }, } diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index abcfa301d..901c5a54f 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -1,8 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals -import re import itertools +import re from .common import InfoExtractor from ..utils import ( @@ -17,10 +17,12 @@ from ..utils import ( parse_iso8601, try_get, unescapeHTML, + url_or_none, urlencode_postdata, urljoin, ) from ..compat import ( + compat_etree_Element, compat_HTTPError, compat_urlparse, ) @@ -38,6 +40,7 @@ class BBCCoUkIE(InfoExtractor): iplayer(?:/[^/]+)?/(?:episode/|playlist/)| music/(?:clips|audiovideo/popular)[/#]| radio/player/| + sounds/play/| events/[^/]+/play/[^/]+/ ) (?P<id>%s)(?!/(?:episodes|broadcasts|clips)) @@ -68,7 +71,7 @@ class BBCCoUkIE(InfoExtractor): 'info_dict': { 'id': 'b039d07m', 'ext': 'flv', - 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4', + 'title': 'Kaleidoscope, Leonard Cohen', 'description': 'The Canadian poet and songwriter reflects on his musical career.', }, 'params': { @@ -206,7 +209,7 @@ class BBCCoUkIE(InfoExtractor): }, 'skip': 'Now it\'s really geo-restricted', }, { - # compact player (https://github.com/rg3/youtube-dl/issues/8147) + # compact player (https://github.com/ytdl-org/youtube-dl/issues/8147) 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player', 'info_dict': { 'id': 'p028bfkj', @@ -218,6 +221,20 @@ class BBCCoUkIE(InfoExtractor): # rtmp download 'skip_download': True, }, + }, { + 'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb', + 'note': 'Audio', + 'info_dict': { + 'id': 'm0007jz9', + 'ext': 'mp4', + 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra', + 'description': "Live BBC Proms. West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.", + 'duration': 9840, + }, + 'params': { + # rtmp download + 'skip_download': True, + } }, { 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4', 'only_matching': True, @@ -310,7 +327,13 @@ class BBCCoUkIE(InfoExtractor): def _get_subtitles(self, media, programme_id): subtitles = {} for connection in self._extract_connections(media): - captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions') + cc_url = url_or_none(connection.get('href')) + if not cc_url: + continue + captions = self._download_xml( + cc_url, programme_id, 'Downloading captions', fatal=False) + if not isinstance(captions, compat_etree_Element): + continue lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') subtitles[lang] = [ { @@ -601,7 +624,7 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.com/news/world-europe-32668511', 'info_dict': { 'id': 'world-europe-32668511', - 'title': 'Russia stages massive WW2 parade despite Western boycott', + 'title': 'Russia stages massive WW2 parade', 'description': 'md5:00ff61976f6081841f759a08bf78cc9c', }, 'playlist_count': 2, @@ -795,6 +818,15 @@ class BBCIE(BBCCoUkIE): 'uploader': 'Radio 3', 'uploader_id': 'bbc_radio_three', }, + }, { + 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227', + 'info_dict': { + 'id': 'p06w9tws', + 'ext': 'mp4', + 'title': 'md5:2fabf12a726603193a2879a055f72514', + 'description': 'Learn English words and phrases from this story', + }, + 'add_ie': [BBCCoUkIE.ie_key()], }] @classmethod @@ -945,6 +977,15 @@ class BBCIE(BBCCoUkIE): if entries: return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) + # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227 + group_id = self._search_regex( + r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX, + webpage, 'group id', default=None) + if playlist_id: + return self.url_result( + 'https://www.bbc.co.uk/programmes/%s' % group_id, + ie=BBCCoUkIE.ie_key()) + # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) programme_id = self._search_regex( [r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX, diff --git a/youtube_dl/extractor/beampro.py b/youtube_dl/extractor/beampro.py index 2eaec1ab4..86abdae00 100644 --- a/youtube_dl/extractor/beampro.py +++ b/youtube_dl/extractor/beampro.py @@ -99,8 +99,8 @@ class BeamProLiveIE(BeamProBaseIE): class BeamProVodIE(BeamProBaseIE): IE_NAME = 'Mixer:vod' - _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/[^/?#&]+\?.*?\bvod=(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/[^/?#&]+\?.*?\bvod=(?P<id>[^?#&]+)' + _TESTS = [{ 'url': 'https://mixer.com/willow8714?vod=2259830', 'md5': 'b2431e6e8347dc92ebafb565d368b76b', 'info_dict': { @@ -119,7 +119,13 @@ class BeamProVodIE(BeamProBaseIE): 'params': { 'skip_download': True, }, - } + }, { + 'url': 'https://mixer.com/streamer?vod=IxFno1rqC0S_XJ1a2yGgNw', + 'only_matching': True, + }, { + 'url': 'https://mixer.com/streamer?vod=Rh3LY0VAqkGpEQUe2pN-ig', + 'only_matching': True, + }] @staticmethod def _extract_format(vod, vod_type): diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index bf22a41b7..5788d13ba 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -2,20 +2,19 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( - compat_chr, - compat_ord, - compat_urllib_parse_unquote, + compat_str, + compat_urlparse, ) from ..utils import ( int_or_none, - parse_iso8601, - urljoin, + unified_timestamp, ) class BeegIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?beeg\.com/(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?beeg\.(?:com|porn(?:/video)?)/(?P<id>\d+)' + _TESTS = [{ + # api/v6 v1 'url': 'http://beeg.com/5416503', 'md5': 'a1a1b1a8bc70a89e49ccfd113aed0820', 'info_dict': { @@ -29,76 +28,53 @@ class BeegIE(InfoExtractor): 'tags': list, 'age_limit': 18, } - } + }, { + # api/v6 v2 + 'url': 'https://beeg.com/1941093077?t=911-1391', + 'only_matching': True, + }, { + # api/v6 v2 w/o t + 'url': 'https://beeg.com/1277207756', + 'only_matching': True, + }, { + 'url': 'https://beeg.porn/video/5416503', + 'only_matching': True, + }, { + 'url': 'https://beeg.porn/5416503', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - cpl_url = self._search_regex( - r'<script[^>]+src=(["\'])(?P<url>(?:/static|(?:https?:)?//static\.beeg\.com)/cpl/\d+\.js.*?)\1', - webpage, 'cpl', default=None, group='url') + beeg_version = self._search_regex( + r'beeg_version\s*=\s*([\da-zA-Z_-]+)', webpage, 'beeg version', + default='1546225636701') - cpl_url = urljoin(url, cpl_url) - - beeg_version, beeg_salt = [None] * 2 - - if cpl_url: - cpl = self._download_webpage( - self._proto_relative_url(cpl_url), video_id, - 'Downloading cpl JS', fatal=False) - if cpl: - beeg_version = int_or_none(self._search_regex( - r'beeg_version\s*=\s*([^\b]+)', cpl, - 'beeg version', default=None)) or self._search_regex( - r'/(\d+)\.js', cpl_url, 'beeg version', default=None) - beeg_salt = self._search_regex( - r'beeg_salt\s*=\s*(["\'])(?P<beeg_salt>.+?)\1', cpl, 'beeg salt', - default=None, group='beeg_salt') - - beeg_version = beeg_version or '2185' - beeg_salt = beeg_salt or 'pmweAkq8lAYKdfWcFCUj0yoVgoPlinamH5UE1CB3H' + if len(video_id) >= 10: + query = { + 'v': 2, + } + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + t = qs.get('t', [''])[0].split('-') + if len(t) > 1: + query.update({ + 's': t[0], + 'e': t[1], + }) + else: + query = {'v': 1} for api_path in ('', 'api.'): video = self._download_json( 'https://%sbeeg.com/api/v6/%s/video/%s' % (api_path, beeg_version, video_id), video_id, - fatal=api_path == 'api.') + fatal=api_path == 'api.', query=query) if video: break - def split(o, e): - def cut(s, x): - n.append(s[:x]) - return s[x:] - n = [] - r = len(o) % e - if r > 0: - o = cut(o, r) - while len(o) > e: - o = cut(o, e) - n.append(o) - return n - - def decrypt_key(key): - # Reverse engineered from http://static.beeg.com/cpl/1738.js - a = beeg_salt - e = compat_urllib_parse_unquote(key) - o = ''.join([ - compat_chr(compat_ord(e[n]) - compat_ord(a[n % len(a)]) % 21) - for n in range(len(e))]) - return ''.join(split(o, 3)[::-1]) - - def decrypt_url(encrypted_url): - encrypted_url = self._proto_relative_url( - encrypted_url.replace('{DATA_MARKERS}', ''), 'https:') - key = self._search_regex( - r'/key=(.*?)%2Cend=', encrypted_url, 'key', default=None) - if not key: - return encrypted_url - return encrypted_url.replace(key, decrypt_key(key)) - formats = [] for format_id, video_url in video.items(): if not video_url: @@ -108,18 +84,20 @@ class BeegIE(InfoExtractor): if not height: continue formats.append({ - 'url': decrypt_url(video_url), + 'url': self._proto_relative_url( + video_url.replace('{DATA_MARKERS}', 'data=pc_XX__%s_0' % beeg_version), 'https:'), 'format_id': format_id, 'height': int(height), }) self._sort_formats(formats) title = video['title'] - video_id = video.get('id') or video_id + video_id = compat_str(video.get('id') or video_id) display_id = video.get('code') description = video.get('desc') + series = video.get('ps_name') - timestamp = parse_iso8601(video.get('date'), ' ') + timestamp = unified_timestamp(video.get('date')) duration = int_or_none(video.get('duration')) tags = [tag.strip() for tag in video['tags'].split(',')] if video.get('tags') else None @@ -129,6 +107,7 @@ class BeegIE(InfoExtractor): 'display_id': display_id, 'title': title, 'description': description, + 'series': series, 'timestamp': timestamp, 'duration': duration, 'tags': tags, diff --git a/youtube_dl/extractor/bfi.py b/youtube_dl/extractor/bfi.py new file mode 100644 index 000000000..60c8944b5 --- /dev/null +++ b/youtube_dl/extractor/bfi.py @@ -0,0 +1,37 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import extract_attributes + + +class BFIPlayerIE(InfoExtractor): + IE_NAME = 'bfi:player' + _VALID_URL = r'https?://player\.bfi\.org\.uk/[^/]+/film/watch-(?P<id>[\w-]+)-online' + _TEST = { + 'url': 'https://player.bfi.org.uk/free/film/watch-computer-doctor-1974-online', + 'md5': 'e8783ebd8e061ec4bc6e9501ed547de8', + 'info_dict': { + 'id': 'htNnhlZjE60C9VySkQEIBtU-cNV1Xx63', + 'ext': 'mp4', + 'title': 'Computer Doctor', + 'description': 'md5:fb6c240d40c4dbe40428bdd62f78203b', + }, + 'skip': 'BFI Player films cannot be played outside of the UK', + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + entries = [] + for player_el in re.findall(r'(?s)<[^>]+class="player"[^>]*>', webpage): + player_attr = extract_attributes(player_el) + ooyala_id = player_attr.get('data-video-id') + if not ooyala_id: + continue + entries.append(self.url_result( + 'ooyala:' + ooyala_id, 'Ooyala', + ooyala_id, player_attr.get('data-label'))) + return self.playlist_result(entries) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 4d6b051fe..3746671d3 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -93,8 +93,8 @@ class BiliBiliIE(InfoExtractor): }] }] - _APP_KEY = '84956560bc028eb7' - _BILIBILI_KEY = '94aba54af9065f71de72f5508f1cd42e' + _APP_KEY = 'iVGUTjsxvpLeuDCf' + _BILIBILI_KEY = 'aHRmhWMLkdeMuILqORnYZocwMBpMEOdt' def _report_error(self, result): if 'message' in result: diff --git a/youtube_dl/extractor/biobiochiletv.py b/youtube_dl/extractor/biobiochiletv.py index b92031c8a..dc86c57c5 100644 --- a/youtube_dl/extractor/biobiochiletv.py +++ b/youtube_dl/extractor/biobiochiletv.py @@ -6,7 +6,6 @@ from ..utils import ( ExtractorError, remove_end, ) -from .rudo import RudoIE class BioBioChileTVIE(InfoExtractor): @@ -41,11 +40,15 @@ class BioBioChileTVIE(InfoExtractor): }, { 'url': 'http://www.biobiochile.cl/noticias/bbtv/comentarios-bio-bio/2016/07/08/edecanes-del-congreso-figuras-decorativas-que-le-cuestan-muy-caro-a-los-chilenos.shtml', 'info_dict': { - 'id': 'edecanes-del-congreso-figuras-decorativas-que-le-cuestan-muy-caro-a-los-chilenos', + 'id': 'b4xd0LK3SK', 'ext': 'mp4', - 'uploader': '(none)', - 'upload_date': '20160708', - 'title': 'Edecanes del Congreso: Figuras decorativas que le cuestan muy caro a los chilenos', + # TODO: fix url_transparent information overriding + # 'uploader': 'Juan Pablo Echenique', + 'title': 'Comentario Oscar Cáceres', + }, + 'params': { + # empty m3u8 manifest + 'skip_download': True, }, }, { 'url': 'http://tv.biobiochile.cl/notas/2015/10/22/ninos-transexuales-de-quien-es-la-decision.shtml', @@ -60,7 +63,9 @@ class BioBioChileTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - rudo_url = RudoIE._extract_url(webpage) + rudo_url = self._search_regex( + r'<iframe[^>]+src=(?P<q1>[\'"])(?P<url>(?:https?:)?//rudo\.video/vod/[0-9a-zA-Z]+)(?P=q1)', + webpage, 'embed URL', None, group='url') if not rudo_url: raise ExtractorError('No videos found') @@ -68,7 +73,7 @@ class BioBioChileTVIE(InfoExtractor): thumbnail = self._og_search_thumbnail(webpage) uploader = self._html_search_regex( - r'<a[^>]+href=["\']https?://(?:busca|www)\.biobiochile\.cl/(?:lista/)?(?:author|autor)[^>]+>(.+?)</a>', + r'<a[^>]+href=["\'](?:https?://(?:busca|www)\.biobiochile\.cl)?/(?:lista/)?(?:author|autor)[^>]+>(.+?)</a>', webpage, 'uploader', fatal=False) return { diff --git a/youtube_dl/extractor/biqle.py b/youtube_dl/extractor/biqle.py index beaebfd2a..af21e3ee5 100644 --- a/youtube_dl/extractor/biqle.py +++ b/youtube_dl/extractor/biqle.py @@ -2,39 +2,96 @@ from __future__ import unicode_literals from .common import InfoExtractor +from .vk import VKIE +from ..utils import ( + HEADRequest, + int_or_none, +) class BIQLEIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?biqle\.(?:com|org|ru)/watch/(?P<id>-?\d+_\d+)' _TESTS = [{ - 'url': 'http://www.biqle.ru/watch/847655_160197695', - 'md5': 'ad5f746a874ccded7b8f211aeea96637', + # Youtube embed + 'url': 'https://biqle.ru/watch/-115995369_456239081', + 'md5': '97af5a06ee4c29bbf9c001bdb1cf5c06', 'info_dict': { - 'id': '160197695', + 'id': '8v4f-avW-VI', 'ext': 'mp4', - 'title': 'Foo Fighters - The Pretender (Live at Wembley Stadium)', - 'uploader': 'Andrey Rogozin', - 'upload_date': '20110605', - } + 'title': "PASSE-PARTOUT - L'ete c'est fait pour jouer", + 'description': 'Passe-Partout', + 'uploader_id': 'mrsimpsonstef3', + 'uploader': 'Phanolito', + 'upload_date': '20120822', + }, }, { - 'url': 'https://biqle.org/watch/-44781847_168547604', + 'url': 'http://biqle.org/watch/-44781847_168547604', 'md5': '7f24e72af1db0edf7c1aaba513174f97', 'info_dict': { - 'id': '168547604', + 'id': '-44781847_168547604', 'ext': 'mp4', 'title': 'Ребенок в шоке от автоматической мойки', + 'timestamp': 1396633454, 'uploader': 'Dmitry Kotov', + 'upload_date': '20140404', + 'uploader_id': '47850140', }, - 'skip': ' This video was marked as adult. Embedding adult videos on external sites is prohibited.', }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) embed_url = self._proto_relative_url(self._search_regex( - r'<iframe.+?src="((?:http:)?//daxab\.com/[^"]+)".*?></iframe>', webpage, 'embed url')) + r'<iframe.+?src="((?:https?:)?//(?:daxab\.com|dxb\.to|[^/]+/player)/[^"]+)".*?></iframe>', + webpage, 'embed url')) + if VKIE.suitable(embed_url): + return self.url_result(embed_url, VKIE.ie_key(), video_id) + + self._request_webpage( + HEADRequest(embed_url), video_id, headers={'Referer': url}) + video_id, sig, _, access_token = self._get_cookies(embed_url)['video_ext'].value.split('%3A') + item = self._download_json( + 'https://api.vk.com/method/video.get', video_id, + headers={'User-Agent': 'okhttp/3.4.1'}, query={ + 'access_token': access_token, + 'sig': sig, + 'v': 5.44, + 'videos': video_id, + })['response']['items'][0] + title = item['title'] + + formats = [] + for f_id, f_url in item.get('files', {}).items(): + if f_id == 'external': + return self.url_result(f_url) + ext, height = f_id.split('_') + formats.append({ + 'format_id': height + 'p', + 'url': f_url, + 'height': int_or_none(height), + 'ext': ext, + }) + self._sort_formats(formats) + + thumbnails = [] + for k, v in item.items(): + if k.startswith('photo_') and v: + width = k.replace('photo_', '') + thumbnails.append({ + 'id': width, + 'url': v, + 'width': int_or_none(width), + }) return { - '_type': 'url_transparent', - 'url': embed_url, + 'id': video_id, + 'title': title, + 'formats': formats, + 'comment_count': int_or_none(item.get('comments')), + 'description': item.get('description'), + 'duration': int_or_none(item.get('duration')), + 'thumbnails': thumbnails, + 'timestamp': int_or_none(item.get('date')), + 'uploader': item.get('owner_id'), + 'view_count': int_or_none(item.get('views')), } diff --git a/youtube_dl/extractor/bitchute.py b/youtube_dl/extractor/bitchute.py index 446a1ab19..430663fbf 100644 --- a/youtube_dl/extractor/bitchute.py +++ b/youtube_dl/extractor/bitchute.py @@ -5,7 +5,10 @@ import itertools import re from .common import InfoExtractor -from ..utils import urlencode_postdata +from ..utils import ( + orderedSet, + urlencode_postdata, +) class BitChuteIE(InfoExtractor): @@ -37,16 +40,27 @@ class BitChuteIE(InfoExtractor): 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36', }) - title = self._search_regex( + title = self._html_search_regex( (r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'<title>([^<]+)'), webpage, 'title', default=None) or self._html_search_meta( 'description', webpage, 'title', default=None) or self._og_search_description(webpage) + format_urls = [] + for mobj in re.finditer( + r'addWebSeed\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage): + format_urls.append(mobj.group('url')) + format_urls.extend(re.findall(r'as=(https?://[^&"\']+)', webpage)) + formats = [ - {'url': mobj.group('url')} - for mobj in re.finditer( - r'addWebSeed\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage)] + {'url': format_url} + for format_url in orderedSet(format_urls)] + + if not formats: + formats = self._parse_html5_media_entries( + url, webpage, video_id)[0]['formats'] + + self._check_formats(formats, video_id) self._sort_formats(formats) description = self._html_search_regex( @@ -56,8 +70,9 @@ class BitChuteIE(InfoExtractor): webpage, default=None) or self._html_search_meta( 'twitter:image:src', webpage, 'thumbnail') uploader = self._html_search_regex( - r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>', webpage, - 'uploader', fatal=False) + (r'(?s)<div class=["\']channel-banner.*?<p\b[^>]+\bclass=["\']name[^>]+>(.+?)</p>', + r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>'), + webpage, 'uploader', fatal=False) return { 'id': video_id, diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py index e829974ff..dc60224d0 100644 --- a/youtube_dl/extractor/bleacherreport.py +++ b/youtube_dl/extractor/bleacherreport.py @@ -71,7 +71,7 @@ class BleacherReportIE(InfoExtractor): video = article_data.get('video') if video: video_type = video['type'] - if video_type == 'cms.bleacherreport.com': + if video_type in ('cms.bleacherreport.com', 'vid.bleacherreport.com'): info['url'] = 'http://bleacherreport.com/video_embed?id=%s' % video['id'] elif video_type == 'ooyala.com': info['url'] = 'ooyala:%s' % video['id'] @@ -87,9 +87,9 @@ class BleacherReportIE(InfoExtractor): class BleacherReportCMSIE(AMPIE): - _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36})' + _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36}|\d{5})' _TESTS = [{ - 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1', + 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1&library=video-cms', 'md5': '2e4b0a997f9228ffa31fada5c53d1ed1', 'info_dict': { 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', @@ -101,6 +101,6 @@ class BleacherReportCMSIE(AMPIE): def _real_extract(self, url): video_id = self._match_id(url) - info = self._extract_feed_info('http://cms.bleacherreport.com/media/items/%s/akamai.json' % video_id) + info = self._extract_feed_info('http://vid.bleacherreport.com/videos/%s.akamai' % video_id) info['id'] = video_id return info diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py index 3b8eabe8f..db5e12b21 100644 --- a/youtube_dl/extractor/blinkx.py +++ b/youtube_dl/extractor/blinkx.py @@ -32,8 +32,8 @@ class BlinkxIE(InfoExtractor): video_id = self._match_id(url) display_id = video_id[:8] - api_url = ('https://apib4.blinkx.com/api.php?action=play_video&' + - 'video=%s' % video_id) + api_url = ('https://apib4.blinkx.com/api.php?action=play_video&' + + 'video=%s' % video_id) data_json = self._download_webpage(api_url, display_id) data = json.loads(data_json)['api']['results'][0] duration = None diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py index a25d500e4..b9715df00 100644 --- a/youtube_dl/extractor/bravotv.py +++ b/youtube_dl/extractor/bravotv.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .adobepass import AdobePassIE from ..utils import ( smuggle_url, @@ -12,16 +14,16 @@ from ..utils import ( class BravoTVIE(AdobePassIE): _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+(?P<id>[^/?#]+)' _TESTS = [{ - 'url': 'http://www.bravotv.com/last-chance-kitchen/season-5/videos/lck-ep-12-fishy-finale', - 'md5': '9086d0b7ef0ea2aabc4781d75f4e5863', + 'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is', + 'md5': 'e34684cfea2a96cd2ee1ef3a60909de9', 'info_dict': { - 'id': 'zHyk1_HU_mPy', + 'id': 'epL0pmK1kQlT', 'ext': 'mp4', - 'title': 'LCK Ep 12: Fishy Finale', - 'description': 'S13/E12: Two eliminated chefs have just 12 minutes to cook up a delicious fish dish.', + 'title': 'The Top Chef Season 16 Winner Is...', + 'description': 'Find out who takes the title of Top Chef!', 'uploader': 'NBCU-BRAV', - 'upload_date': '20160302', - 'timestamp': 1456945320, + 'upload_date': '20190314', + 'timestamp': 1552591860, } }, { 'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', @@ -32,30 +34,38 @@ class BravoTVIE(AdobePassIE): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) settings = self._parse_json(self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', webpage, 'drupal settings'), + r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})</script>', webpage, 'drupal settings'), display_id) info = {} query = { 'mbr': 'true', } account_pid, release_pid = [None] * 2 - tve = settings.get('sharedTVE') + tve = settings.get('ls_tve') if tve: query['manifest'] = 'm3u' - account_pid = 'HNK2IC' - release_pid = tve['release_pid'] + mobj = re.search(r'<[^>]+id="pdk-player"[^>]+data-url=["\']?(?:https?:)?//player\.theplatform\.com/p/([^/]+)/(?:[^/]+/)*select/([^?#&"\']+)', webpage) + if mobj: + account_pid, tp_path = mobj.groups() + release_pid = tp_path.strip('/').split('/')[-1] + else: + account_pid = 'HNK2IC' + tp_path = release_pid = tve['release_pid'] if tve.get('entitlement') == 'auth': - adobe_pass = settings.get('adobePass', {}) + adobe_pass = settings.get('tve_adobe_auth', {}) resource = self._get_mvpd_resource( adobe_pass.get('adobePassResourceId', 'bravo'), tve['title'], release_pid, tve.get('rating')) query['auth'] = self._extract_mvpd_auth( url, release_pid, adobe_pass.get('adobePassRequestorId', 'bravo'), resource) else: - shared_playlist = settings['shared_playlist'] + shared_playlist = settings['ls_playlist'] account_pid = shared_playlist['account_pid'] metadata = shared_playlist['video_metadata'][shared_playlist['default_clip']] - release_pid = metadata['release_pid'] + tp_path = release_pid = metadata.get('release_pid') + if not release_pid: + release_pid = metadata['guid'] + tp_path = 'media/guid/2140479951/' + release_pid info.update({ 'title': metadata['title'], 'description': metadata.get('description'), @@ -67,7 +77,7 @@ class BravoTVIE(AdobePassIE): '_type': 'url_transparent', 'id': release_pid, 'url': smuggle_url(update_url_query( - 'http://link.theplatform.com/s/%s/%s' % (account_pid, release_pid), + 'http://link.theplatform.com/s/%s/%s' % (account_pid, tp_path), query), {'force_smil_url': True}), 'ie_key': 'ThePlatform', }) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 14f9a14ed..8e2f7217a 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -1,22 +1,21 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 import re -import json +import struct from .common import InfoExtractor from .adobepass import AdobePassIE from ..compat import ( compat_etree_fromstring, compat_parse_qs, - compat_str, compat_urllib_parse_urlparse, compat_urlparse, compat_xml_parse_error, compat_HTTPError, ) from ..utils import ( - determine_ext, ExtractorError, extract_attributes, find_xpath_attr, @@ -25,18 +24,19 @@ from ..utils import ( js_to_json, int_or_none, parse_iso8601, + smuggle_url, unescapeHTML, unsmuggle_url, update_url_query, clean_html, mimetype2ext, + UnsupportedError, ) class BrightcoveLegacyIE(InfoExtractor): IE_NAME = 'brightcove:legacy' _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P<query>.*)' - _FEDERATED_URL = 'http://c.brightcove.com/services/viewer/htmlFederated' _TESTS = [ { @@ -53,7 +53,8 @@ class BrightcoveLegacyIE(InfoExtractor): 'timestamp': 1368213670, 'upload_date': '20130510', 'uploader_id': '1589608506001', - } + }, + 'skip': 'The player has been deactivated by the content owner', }, { # From http://medianetwork.oracle.com/video/player/1785452137001 @@ -68,6 +69,7 @@ class BrightcoveLegacyIE(InfoExtractor): 'upload_date': '20120814', 'uploader_id': '1460825906', }, + 'skip': 'video not playable', }, { # From http://mashable.com/2013/10/26/thermoelectric-bracelet-lets-you-control-your-body-temperature/ @@ -77,7 +79,7 @@ class BrightcoveLegacyIE(InfoExtractor): 'ext': 'mp4', 'title': 'This Bracelet Acts as a Personal Thermostat', 'description': 'md5:547b78c64f4112766ccf4e151c20b6a0', - 'uploader': 'Mashable', + # 'uploader': 'Mashable', 'timestamp': 1382041798, 'upload_date': '20131017', 'uploader_id': '1130468786001', @@ -122,15 +124,17 @@ class BrightcoveLegacyIE(InfoExtractor): 'id': '3550319591001', }, 'playlist_mincount': 7, + 'skip': 'Unsupported URL', }, { - # playlist with 'playlistTab' (https://github.com/rg3/youtube-dl/issues/9965) + # playlist with 'playlistTab' (https://github.com/ytdl-org/youtube-dl/issues/9965) 'url': 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=AQ%7E%7E,AAABXlLMdok%7E,NJ4EoMlZ4rZdx9eU1rkMVd8EaYPBBUlg', 'info_dict': { 'id': '1522758701001', 'title': 'Lesson 08', }, 'playlist_mincount': 10, + 'skip': 'Unsupported URL', }, { # playerID inferred from bcpid @@ -139,12 +143,6 @@ class BrightcoveLegacyIE(InfoExtractor): 'only_matching': True, # Tested in GenericIE } ] - FLV_VCODECS = { - 1: 'SORENSON', - 2: 'ON2', - 3: 'H264', - 4: 'VP8', - } @classmethod def _build_brighcove_url(cls, object_str): @@ -153,10 +151,10 @@ class BrightcoveLegacyIE(InfoExtractor): <object class="BrightcoveExperience">{params}</object> """ - # Fix up some stupid HTML, see https://github.com/rg3/youtube-dl/issues/1553 + # Fix up some stupid HTML, see https://github.com/ytdl-org/youtube-dl/issues/1553 object_str = re.sub(r'(<param(?:\s+[a-zA-Z0-9_]+="[^"]*")*)>', lambda m: m.group(1) + '/>', object_str) - # Fix up some stupid XML, see https://github.com/rg3/youtube-dl/issues/1608 + # Fix up some stupid XML, see https://github.com/ytdl-org/youtube-dl/issues/1608 object_str = object_str.replace('<--', '<!--') # remove namespace to simplify extraction object_str = re.sub(r'(<object[^>]*)(xmlns=".*?")', r'\1', object_str) @@ -236,7 +234,8 @@ class BrightcoveLegacyIE(InfoExtractor): @classmethod def _make_brightcove_url(cls, params): - return update_url_query(cls._FEDERATED_URL, params) + return update_url_query( + 'http://c.brightcove.com/services/viewer/htmlFederated', params) @classmethod def _extract_brightcove_url(cls, webpage): @@ -295,163 +294,50 @@ class BrightcoveLegacyIE(InfoExtractor): videoPlayer = query.get('@videoPlayer') if videoPlayer: # We set the original url as the default 'Referer' header - referer = smuggled_data.get('Referer', url) + referer = query.get('linkBaseURL', [None])[0] or smuggled_data.get('Referer', url) + video_id = videoPlayer[0] if 'playerID' not in query: mobj = re.search(r'/bcpid(\d+)', url) if mobj is not None: query['playerID'] = [mobj.group(1)] - return self._get_video_info( - videoPlayer[0], query, referer=referer) - elif 'playerKey' in query: - player_key = query['playerKey'] - return self._get_playlist_info(player_key[0]) - else: - raise ExtractorError( - 'Cannot find playerKey= variable. Did you forget quotes in a shell invocation?', - expected=True) - - def _get_video_info(self, video_id, query, referer=None): - headers = {} - linkBase = query.get('linkBaseURL') - if linkBase is not None: - referer = linkBase[0] - if referer is not None: - headers['Referer'] = referer - webpage = self._download_webpage(self._FEDERATED_URL, video_id, headers=headers, query=query) - - error_msg = self._html_search_regex( - r"<h1>We're sorry.</h1>([\s\n]*<p>.*?</p>)+", webpage, - 'error message', default=None) - if error_msg is not None: - raise ExtractorError( - 'brightcove said: %s' % error_msg, expected=True) - - self.report_extraction(video_id) - info = self._search_regex(r'var experienceJSON = ({.*});', webpage, 'json') - info = json.loads(info)['data'] - video_info = info['programmedContent']['videoPlayer']['mediaDTO'] - video_info['_youtubedl_adServerURL'] = info.get('adServerURL') - - return self._extract_video_info(video_info) - - def _get_playlist_info(self, player_key): - info_url = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s' % player_key - playlist_info = self._download_webpage( - info_url, player_key, 'Downloading playlist information') - - json_data = json.loads(playlist_info) - if 'videoList' in json_data: - playlist_info = json_data['videoList'] - playlist_dto = playlist_info['mediaCollectionDTO'] - elif 'playlistTabs' in json_data: - playlist_info = json_data['playlistTabs'] - playlist_dto = playlist_info['lineupListDTO']['playlistDTOs'][0] - else: - raise ExtractorError('Empty playlist') - - videos = [self._extract_video_info(video_info) for video_info in playlist_dto['videoDTOs']] - - return self.playlist_result(videos, playlist_id='%s' % playlist_info['id'], - playlist_title=playlist_dto['displayName']) - - def _extract_video_info(self, video_info): - video_id = compat_str(video_info['id']) - publisher_id = video_info.get('publisherId') - info = { - 'id': video_id, - 'title': video_info['displayName'].strip(), - 'description': video_info.get('shortDescription'), - 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), - 'uploader': video_info.get('publisherName'), - 'uploader_id': compat_str(publisher_id) if publisher_id else None, - 'duration': float_or_none(video_info.get('length'), 1000), - 'timestamp': int_or_none(video_info.get('creationDate'), 1000), - } - - renditions = video_info.get('renditions', []) + video_info.get('IOSRenditions', []) - if renditions: - formats = [] - for rend in renditions: - url = rend['defaultURL'] - if not url: - continue - ext = None - if rend['remote']: - url_comp = compat_urllib_parse_urlparse(url) - if url_comp.path.endswith('.m3u8'): - formats.extend( - self._extract_m3u8_formats( - url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - continue - elif 'akamaihd.net' in url_comp.netloc: - # This type of renditions are served through - # akamaihd.net, but they don't use f4m manifests - url = url.replace('control/', '') + '?&v=3.3.0&fp=13&r=FEEFJ&g=RTSJIMBMPFPB' - ext = 'flv' - if ext is None: - ext = determine_ext(url) - tbr = int_or_none(rend.get('encodingRate'), 1000) - a_format = { - 'format_id': 'http%s' % ('-%s' % tbr if tbr else ''), - 'url': url, - 'ext': ext, - 'filesize': int_or_none(rend.get('size')) or None, - 'tbr': tbr, - } - if rend.get('audioOnly'): - a_format.update({ - 'vcodec': 'none', - }) + publisher_id = query.get('publisherId') + if publisher_id and publisher_id[0].isdigit(): + publisher_id = publisher_id[0] + if not publisher_id: + player_key = query.get('playerKey') + if player_key and ',' in player_key[0]: + player_key = player_key[0] else: - a_format.update({ - 'height': int_or_none(rend.get('frameHeight')), - 'width': int_or_none(rend.get('frameWidth')), - 'vcodec': rend.get('videoCodec'), - }) - - # m3u8 manifests with remote == false are media playlists - # Not calling _extract_m3u8_formats here to save network traffic - if ext == 'm3u8': - a_format.update({ - 'format_id': 'hls%s' % ('-%s' % tbr if tbr else ''), - 'ext': 'mp4', - 'protocol': 'm3u8_native', - }) - - formats.append(a_format) - self._sort_formats(formats) - info['formats'] = formats - elif video_info.get('FLVFullLengthURL') is not None: - info.update({ - 'url': video_info['FLVFullLengthURL'], - 'vcodec': self.FLV_VCODECS.get(video_info.get('FLVFullCodec')), - 'filesize': int_or_none(video_info.get('FLVFullSize')), - }) - - if self._downloader.params.get('include_ads', False): - adServerURL = video_info.get('_youtubedl_adServerURL') - if adServerURL: - ad_info = { - '_type': 'url', - 'url': adServerURL, - } - if 'url' in info: - return { - '_type': 'playlist', - 'title': info['title'], - 'entries': [ad_info, info], - } - else: - return ad_info - - if 'url' not in info and not info.get('formats'): - raise ExtractorError('Unable to extract video url for %s' % video_id) - return info + player_id = query.get('playerID') + if player_id and player_id[0].isdigit(): + headers = {} + if referer: + headers['Referer'] = referer + player_page = self._download_webpage( + 'http://link.brightcove.com/services/player/bcpid' + player_id[0], + video_id, headers=headers, fatal=False) + if player_page: + player_key = self._search_regex( + r'<param\s+name="playerKey"\s+value="([\w~,-]+)"', + player_page, 'player key', fatal=False) + if player_key: + enc_pub_id = player_key.split(',')[1].replace('~', '=') + publisher_id = struct.unpack('>Q', base64.urlsafe_b64decode(enc_pub_id))[0] + if publisher_id: + brightcove_new_url = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (publisher_id, video_id) + if referer: + brightcove_new_url = smuggle_url(brightcove_new_url, {'referrer': referer}) + return self.url_result(brightcove_new_url, BrightcoveNewIE.ie_key(), video_id) + # TODO: figure out if it's possible to extract playlistId from playerKey + # elif 'playerKey' in query: + # player_key = query['playerKey'] + # return self._get_playlist_info(player_key[0]) + raise UnsupportedError(url) class BrightcoveNewIE(AdobePassIE): IE_NAME = 'brightcove:new' - _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>\d+|ref:[^&]+)' + _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*(?P<content_type>video|playlist)Id=(?P<video_id>\d+|ref:[^&]+)' _TESTS = [{ 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', 'md5': 'c8100925723840d4b0d243f7025703be', @@ -484,6 +370,21 @@ class BrightcoveNewIE(AdobePassIE): # m3u8 download 'skip_download': True, } + }, { + # playlist stream + 'url': 'https://players.brightcove.net/1752604059001/S13cJdUBz_default/index.html?playlistId=5718313430001', + 'info_dict': { + 'id': '5718313430001', + 'title': 'No Audio Playlist', + }, + 'playlist_count': 7, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'http://players.brightcove.net/5690807595001/HyZNerRl7_default/index.html?playlistId=5743160747001', + 'only_matching': True, }, { # ref: prefixed video id 'url': 'http://players.brightcove.net/3910869709001/21519b5c-4b3b-4363-accb-bdc8f358f823_default/index.html?videoId=ref:7069442', @@ -683,7 +584,7 @@ class BrightcoveNewIE(AdobePassIE): 'ip_blocks': smuggled_data.get('geo_ip_blocks'), }) - account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() + account_id, player_id, embed, content_type, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage( 'http://players.brightcove.net/%s/%s_%s/index.min.js' @@ -704,7 +605,7 @@ class BrightcoveNewIE(AdobePassIE): r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1', webpage, 'policy key', group='pk') - api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id) + api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id) headers = { 'Accept': 'application/json;pk=%s' % policy_key, } @@ -739,5 +640,12 @@ class BrightcoveNewIE(AdobePassIE): 'tveToken': tve_token, }) + if content_type == 'playlist': + return self.playlist_result( + [self._parse_brightcove_metadata(vid, vid.get('id'), headers) + for vid in json_data.get('videos', []) if vid.get('id')], + json_data.get('id'), json_data.get('name'), + json_data.get('description')) + return self._parse_brightcove_metadata( json_data, video_id, headers=headers) diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py index 4bf4efe1f..562c83af9 100644 --- a/youtube_dl/extractor/byutv.py +++ b/youtube_dl/extractor/byutv.py @@ -3,11 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import parse_duration class BYUtvIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?byutv\.org/(?:watch|player)/(?!event/)(?P<id>[0-9a-f-]+)(?:/(?P<display_id>[^/?#&]+))?' _TESTS = [{ + # ooyalaVOD 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5', 'info_dict': { 'id': 'ZvanRocTpW-G5_yZFeltTAMv6jxOU9KH', @@ -22,6 +24,20 @@ class BYUtvIE(InfoExtractor): 'skip_download': True, }, 'add_ie': ['Ooyala'], + }, { + # dvr + 'url': 'https://www.byutv.org/player/8f1dab9b-b243-47c8-b525-3e2d021a3451/byu-softball-pacific-vs-byu-41219---game-2', + 'info_dict': { + 'id': '8f1dab9b-b243-47c8-b525-3e2d021a3451', + 'display_id': 'byu-softball-pacific-vs-byu-41219---game-2', + 'ext': 'mp4', + 'title': 'Pacific vs. BYU (4/12/19)', + 'description': 'md5:1ac7b57cb9a78015910a4834790ce1f3', + 'duration': 11645, + }, + 'params': { + 'skip_download': True + }, }, { 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d', 'only_matching': True, @@ -35,24 +51,42 @@ class BYUtvIE(InfoExtractor): video_id = mobj.group('id') display_id = mobj.group('display_id') or video_id - ep = self._download_json( - 'https://api.byutv.org/api3/catalog/getvideosforcontent', video_id, - query={ + info = self._download_json( + 'https://api.byutv.org/api3/catalog/getvideosforcontent', + display_id, query={ 'contentid': video_id, 'channel': 'byutv', 'x-byutv-context': 'web$US', }, headers={ 'x-byutv-context': 'web$US', 'x-byutv-platformkey': 'xsaaw9c7y5', - })['ooyalaVOD'] + }) + ep = info.get('ooyalaVOD') + if ep: + return { + '_type': 'url_transparent', + 'ie_key': 'Ooyala', + 'url': 'ooyala:%s' % ep['providerId'], + 'id': video_id, + 'display_id': display_id, + 'title': ep.get('title'), + 'description': ep.get('description'), + 'thumbnail': ep.get('imageThumbnail'), + } + + ep = info['dvr'] + title = ep['title'] + formats = self._extract_m3u8_formats( + ep['videoUrl'], video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) return { - '_type': 'url_transparent', - 'ie_key': 'Ooyala', - 'url': 'ooyala:%s' % ep['providerId'], 'id': video_id, 'display_id': display_id, - 'title': ep.get('title'), + 'title': title, 'description': ep.get('description'), 'thumbnail': ep.get('imageThumbnail'), + 'duration': parse_duration(ep.get('length')), + 'formats': formats, } diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py index 79350817f..1eb81b75e 100644 --- a/youtube_dl/extractor/cammodels.py +++ b/youtube_dl/extractor/cammodels.py @@ -14,6 +14,7 @@ class CamModelsIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.cammodels.com/cam/AutumnKnight/', 'only_matching': True, + 'age_limit': 18 }] def _real_extract(self, url): @@ -93,4 +94,5 @@ class CamModelsIE(InfoExtractor): 'title': self._live_title(user_id), 'is_live': True, 'formats': formats, + 'age_limit': 18 } diff --git a/youtube_dl/extractor/camtube.py b/youtube_dl/extractor/camtube.py index c7d40f849..b3be3bdcf 100644 --- a/youtube_dl/extractor/camtube.py +++ b/youtube_dl/extractor/camtube.py @@ -20,6 +20,7 @@ class CamTubeIE(InfoExtractor): 'duration': 1274, 'timestamp': 1528018608, 'upload_date': '20180603', + 'age_limit': 18 }, 'params': { 'skip_download': True, @@ -66,4 +67,5 @@ class CamTubeIE(InfoExtractor): 'like_count': like_count, 'creator': creator, 'formats': formats, + 'age_limit': 18 } diff --git a/youtube_dl/extractor/camwithher.py b/youtube_dl/extractor/camwithher.py index afbc5ea26..bbc5205fd 100644 --- a/youtube_dl/extractor/camwithher.py +++ b/youtube_dl/extractor/camwithher.py @@ -25,6 +25,7 @@ class CamWithHerIE(InfoExtractor): 'comment_count': int, 'uploader': 'MileenaK', 'upload_date': '20160322', + 'age_limit': 18, }, 'params': { 'skip_download': True, @@ -84,4 +85,5 @@ class CamWithHerIE(InfoExtractor): 'comment_count': comment_count, 'uploader': uploader, 'upload_date': upload_date, + 'age_limit': 18 } diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py index 174fd9e2b..c506bc5dd 100644 --- a/youtube_dl/extractor/canvas.py +++ b/youtube_dl/extractor/canvas.py @@ -17,7 +17,7 @@ from ..utils import ( class CanvasIE(InfoExtractor): - _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrtvideo)/assets/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 'md5': '90139b746a0a9bd7bb631283f6e2a64e', @@ -35,6 +35,10 @@ class CanvasIE(InfoExtractor): 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', 'only_matching': True, }] + _HLS_ENTRY_PROTOCOLS_MAP = { + 'HLS': 'm3u8_native', + 'HLS_AES': 'm3u8', + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -52,9 +56,9 @@ class CanvasIE(InfoExtractor): format_url, format_type = target.get('url'), target.get('type') if not format_url or not format_type: continue - if format_type == 'HLS': + if format_type in self._HLS_ENTRY_PROTOCOLS_MAP: formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', + format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type], m3u8_id=format_type, fatal=False)) elif format_type == 'HDS': formats.extend(self._extract_f4m_formats( diff --git a/youtube_dl/extractor/carambatv.py b/youtube_dl/extractor/carambatv.py index 9ba909a91..b57b86af7 100644 --- a/youtube_dl/extractor/carambatv.py +++ b/youtube_dl/extractor/carambatv.py @@ -82,6 +82,12 @@ class CarambaTVPageIE(InfoExtractor): webpage = self._download_webpage(url, video_id) videomore_url = VideomoreIE._extract_url(webpage) + if not videomore_url: + videomore_id = self._search_regex( + r'getVMCode\s*\(\s*["\']?(\d+)', webpage, 'videomore id', + default=None) + if videomore_id: + videomore_url = 'videomore:%s' % videomore_id if videomore_url: title = self._og_search_title(webpage) return { diff --git a/youtube_dl/extractor/cartoonnetwork.py b/youtube_dl/extractor/cartoonnetwork.py index 6aeebd7b3..48b33617f 100644 --- a/youtube_dl/extractor/cartoonnetwork.py +++ b/youtube_dl/extractor/cartoonnetwork.py @@ -1,20 +1,19 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .turner import TurnerBaseIE +from ..utils import int_or_none class CartoonNetworkIE(TurnerBaseIE): _VALID_URL = r'https?://(?:www\.)?cartoonnetwork\.com/video/(?:[^/]+/)+(?P<id>[^/?#]+)-(?:clip|episode)\.html' _TEST = { - 'url': 'http://www.cartoonnetwork.com/video/teen-titans-go/starfire-the-cat-lady-clip.html', + 'url': 'https://www.cartoonnetwork.com/video/ben-10/how-to-draw-upgrade-episode.html', 'info_dict': { - 'id': '8a250ab04ed07e6c014ef3f1e2f9016c', + 'id': '6e3375097f63874ebccec7ef677c1c3845fa850e', 'ext': 'mp4', - 'title': 'Starfire the Cat Lady', - 'description': 'Robin decides to become a cat so that Starfire will finally love him.', + 'title': 'How to Draw Upgrade', + 'description': 'md5:2061d83776db7e8be4879684eefe8c0f', }, 'params': { # m3u8 download @@ -25,18 +24,39 @@ class CartoonNetworkIE(TurnerBaseIE): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - id_type, video_id = re.search(r"_cnglobal\.cvp(Video|Title)Id\s*=\s*'([^']+)';", webpage).groups() - query = ('id' if id_type == 'Video' else 'titleId') + '=' + video_id - return self._extract_cvp_info( - 'http://www.cartoonnetwork.com/video-seo-svc/episodeservices/getCvpPlaylist?networkName=CN2&' + query, video_id, { - 'secure': { - 'media_src': 'http://androidhls-secure.cdn.turner.com/toon/big', - 'tokenizer_src': 'https://token.vgtf.net/token/token_mobile', - }, - }, { + + def find_field(global_re, name, content_re=None, value_re='[^"]+', fatal=False): + metadata_re = '' + if content_re: + metadata_re = r'|video_metadata\.content_' + content_re + return self._search_regex( + r'(?:_cnglobal\.currentVideo\.%s%s)\s*=\s*"(%s)";' % (global_re, metadata_re, value_re), + webpage, name, fatal=fatal) + + media_id = find_field('mediaId', 'media id', 'id', '[0-9a-f]{40}', True) + title = find_field('episodeTitle', 'title', '(?:episodeName|name)', fatal=True) + + info = self._extract_ngtv_info( + media_id, {'networkId': 'cartoonnetwork'}, { 'url': url, 'site_name': 'CartoonNetwork', - 'auth_required': self._search_regex( - r'_cnglobal\.cvpFullOrPreviewAuth\s*=\s*(true|false);', - webpage, 'auth required', default='false') == 'true', + 'auth_required': find_field('authType', 'auth type') != 'unauth', }) + + series = find_field( + 'propertyName', 'series', 'showName') or self._html_search_meta('partOfSeries', webpage) + info.update({ + 'id': media_id, + 'display_id': display_id, + 'title': title, + 'description': self._html_search_meta('description', webpage), + 'series': series, + 'episode': title, + }) + + for field in ('season', 'episode'): + field_name = field + 'Number' + info[field + '_number'] = int_or_none(find_field( + field_name, field + ' number', value_re=r'\d+') or self._html_search_meta(field_name, webpage)) + + return info diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 43f95c739..751a3a8f2 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -360,7 +360,7 @@ class CBCWatchVideoIE(CBCWatchBaseIE): class CBCWatchIE(CBCWatchBaseIE): IE_NAME = 'cbc.ca:watch' - _VALID_URL = r'https?://watch\.cbc\.ca/(?:[^/]+/)+(?P<id>[0-9a-f-]+)' + _VALID_URL = r'https?://(?:gem|watch)\.cbc\.ca/(?:[^/]+/)+(?P<id>[0-9a-f-]+)' _TESTS = [{ # geo-restricted to Canada, bypassable 'url': 'http://watch.cbc.ca/doc-zone/season-6/customer-disservice/38e815a-009e3ab12e4', @@ -386,6 +386,9 @@ class CBCWatchIE(CBCWatchBaseIE): 'description': 'Arthur, the sweetest 8-year-old aardvark, and his pals solve all kinds of problems with humour, kindness and teamwork.', }, 'playlist_mincount': 30, + }, { + 'url': 'https://gem.cbc.ca/media/this-hour-has-22-minutes/season-26/episode-20/38e815a-0108c6c6a42', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 1799d63ea..4a19a73d2 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -13,13 +13,17 @@ from ..utils import ( class CBSBaseIE(ThePlatformFeedIE): def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): - closed_caption_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', 'ClosedCaptionURL') - return { - 'en': [{ - 'ext': 'ttml', - 'url': closed_caption_e.attrib['value'], - }] - } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else [] + subtitles = {} + for k, ext in [('sMPTE-TTCCURL', 'tt'), ('ClosedCaptionURL', 'ttml'), ('webVTTCaptionURL', 'vtt')]: + cc_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', k) + if cc_e is not None: + cc_url = cc_e.get('value') + if cc_url: + subtitles.setdefault(subtitles_lang, []).append({ + 'ext': ext, + 'url': cc_url, + }) + return subtitles class CBSIE(CBSBaseIE): @@ -65,7 +69,7 @@ class CBSIE(CBSBaseIE): last_e = None for item in items_data.findall('.//item'): asset_type = xpath_text(item, 'assetType') - if not asset_type or asset_type in asset_types or asset_type in ('HLS_FPS', 'DASH_CENC'): + if not asset_type or asset_type in asset_types or 'HLS_FPS' in asset_type or 'DASH_CENC' in asset_type: continue asset_types.append(asset_type) query = { diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 51df15fac..345debcf0 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -1,40 +1,62 @@ # coding: utf-8 from __future__ import unicode_literals +import re +import zlib + from .common import InfoExtractor from .cbs import CBSIE +from ..compat import ( + compat_b64decode, + compat_urllib_parse_unquote, +) from ..utils import ( parse_duration, ) +class CBSNewsEmbedIE(CBSIE): + IE_NAME = 'cbsnews:embed' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/embed/video[^#]*#(?P<id>.+)' + _TESTS = [{ + 'url': 'https://www.cbsnews.com/embed/video/?v=1.c9b5b61492913d6660db0b2f03579ef25e86307a#1Vb7b9s2EP5XBAHbT6Gt98PAMKTJ0se6LVjWYWtdGBR1stlIpEBSTtwi%2F%2FvuJNkNhmHdGxgM2NL57vjd6zt%2B8PngdN%2Fyg79qeGvhzN%2FLGrS%2F%2BuBLB531V28%2B%2BO7Qg7%2Fy97r2z3xZ42NW8yLhDbA0S0KWlHnIijwKWJBHZZnHBa8Cgbpdf%2F89NM9Hi9fXifhpr8sr%2FlP848tn%2BTdXycX25zh4cdX%2FvHl6PmmPqnWQv9w8Ed%2B9GjYRim07bFEqdG%2BZVHuwTm65A7bVRrYtR5lAyMox7pigF6W4k%2By91mjspGsJ%2BwVae4%2BsvdnaO1p73HkXs%2FVisUDTGm7R8IcdnOROeq%2B19qT1amhA1VJtPenoTUgrtfKc9m7Rq8dP7nnjwOB7wg7ADdNt7VX64DWAWlKhPtmDEq22g4GF99x6Dk9E8OSsankHXqPNKDxC%2FdK7MLKTircTDgsI3mmj4OBdSq64dy7fd1x577RU1rt4cvMtOaulFYOd%2FLewRWvDO9lIgXFpZSnkZmjbv5SxKTPoQXClFbpsf%2Fhbbpzs0IB3vb8KkyzJQ%2BywOAgCrMpgRrz%2BKk4fvb7kFbR4XJCu0gAdtNO7woCwZTu%2BBUs9bam%2Fds71drVerpeisgrubLjAB4nnOSkWQnfr5W6o1ku5Xpr1MgrCbL0M0vUyDtfLLK15WiYp47xKWSLyjFVpwVmVJSLIoCjSOFkv3W7oKsVliwZJcB9nwXpZ5GEQQwY8jNKqKCBrgjTLeFxgdCIpazojDgnRtn43J6kG7nZ6cAbxh0EeFFk4%2B1u867cY5u4344n%2FxXjCqAjucdTHgLKojNKmSfO8KRsOFY%2FzKEYCKEJBzv90QA9nfm9gL%2BHulaFqUkz9ULUYxl62B3U%2FRVNLA8IhggaPycOoBuwOCESciDQVSSUgiOMsROB%2FhKfwCKOzEk%2B4k6rWd4uuT%2FwTDz7K7t3d3WLO8ISD95jSPQbayBacthbz86XVgxHwhex5zawzgDOmtp%2F3GPcXn0VXHdSS029%2Fj99UC%2FwJUvyKQ%2FzKyixIEVlYJOn4RxxuaH43Ty9fbJ5OObykHH435XAzJTHeOF4hhEUXD8URe%2FQ%2FBT%2BMpf8d5GN02Ox%2FfiGsl7TA7POu1xZ5%2BbTzcAVKMe48mqcC21hkacVEVScM26liVVBnrKkC4CLKyzAvHu0lhEaTKMFwI3a4SN9MsrfYzdBLq2vkwRD1gVviLT8kY9h2CHH6Y%2Bix6609weFtey4ESp60WtyeWMy%2BsmBuhsoKIyuoT%2Bq2R%2FrW5qi3g%2FvzS2j40DoixDP8%2BKP0yUdpXJ4l6Vla%2Bg9vce%2BC4yM5YlUcbA%2F0jLKdpmTwvsdN5z88nAIe08%2F0HgxeG1iv%2B6Hlhjh7uiW0SDzYNI92L401uha3JKYk268UVRzdOzNQvAaJqoXzAc80dAV440NZ1WVVAAMRYQ2KrGJFmDUsq8saWSnjvIj8t78y%2FRa3JRnbHVfyFpfwoDiGpPgjzekyUiKNlU3OMlwuLMmzgvEojllYVE2Z1HhImvsnk%2BuhusTEoB21PAtSFodeFK3iYhXEH9WOG2%2FkOE833sfeG%2Ff5cfHtEFNXgYes0%2FXj7aGivUgJ9XpusCtoNcNYVVnJVrrDo0OmJAutHCpuZul4W9lLcfy7BnuLPT02%2ByXsCTk%2B9zhzswIN04YueNSK%2BPtM0jS88QdLqSLJDTLsuGZJNolm2yO0PXh3UPnz9Ix5bfIAqxPjvETQsDCEiPG4QbqNyhBZISxybLnZYCrW5H3Axp690%2F0BJdXtDZ5ITuM4xj3f4oUHGzc5JeJmZKpp%2FjwKh4wMV%2FV1yx3emLoR0MwbG4K%2F%2BZgVep3PnzXGDHZ6a3i%2Fk%2BJrONDN13%2Bnq6tBTYk4o7cLGhBtqCC4KwacGHpEVuoH5JNro%2FE6JfE6d5RydbiR76k%2BW5wioDHBIjw1euhHjUGRB0y5A97KoaPx6MlL%2BwgboUVtUFRI%2FLemgTpdtF59ii7pab08kuPcfWzs0l%2FRI5takWnFpka0zOgWRtYcuf9aIxZMxlwr6IiGpsb6j2DQUXPl%2FimXI599Ev7fWjoPD78A', + 'only_matching': True, + }] + + def _real_extract(self, url): + item = self._parse_json(zlib.decompress(compat_b64decode( + compat_urllib_parse_unquote(self._match_id(url))), + -zlib.MAX_WBITS), None)['video']['items'][0] + return self._extract_video_info(item['mpxRefId'], 'cbsnews') + + class CBSNewsIE(CBSIE): IE_NAME = 'cbsnews' IE_DESC = 'CBS News' - _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P<id>[\da-z_-]+)' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|video)/(?P<id>[\da-z_-]+)' _TESTS = [ { # 60 minutes 'url': 'http://www.cbsnews.com/news/artificial-intelligence-positioned-to-be-a-game-changer/', 'info_dict': { - 'id': '_B6Ga3VJrI4iQNKsir_cdFo9Re_YJHE_', - 'ext': 'mp4', - 'title': 'Artificial Intelligence', - 'description': 'md5:8818145f9974431e0fb58a1b8d69613c', + 'id': 'Y_nf_aEg6WwO9OLAq0MpKaPgfnBUxfW4', + 'ext': 'flv', + 'title': 'Artificial Intelligence, real-life applications', + 'description': 'md5:a7aaf27f1b4777244de8b0b442289304', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1606, + 'duration': 317, 'uploader': 'CBSI-NEW', - 'timestamp': 1498431900, - 'upload_date': '20170625', + 'timestamp': 1476046464, + 'upload_date': '20161009', }, 'params': { - # m3u8 download + # rtmp download 'skip_download': True, }, }, { - 'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/', + 'url': 'https://www.cbsnews.com/video/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/', 'info_dict': { 'id': 'SNJBOYzXiWBOvaLsdzwH8fmtP1SCd91Y', 'ext': 'mp4', @@ -60,37 +82,29 @@ class CBSNewsIE(CBSIE): # 48 hours 'url': 'http://www.cbsnews.com/news/maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved/', 'info_dict': { - 'id': 'QpM5BJjBVEAUFi7ydR9LusS69DPLqPJ1', - 'ext': 'mp4', 'title': 'Cold as Ice', - 'description': 'Can a childhood memory of a friend\'s murder solve a 1957 cold case? "48 Hours" correspondent Erin Moriarty has the latest.', - 'upload_date': '20170604', - 'timestamp': 1496538000, - 'uploader': 'CBSI-NEW', - }, - 'params': { - 'skip_download': True, + 'description': 'Can a childhood memory solve the 1957 murder of 7-year-old Maria Ridulph?', }, + 'playlist_mincount': 7, }, ] def _real_extract(self, url): - video_id = self._match_id(url) + display_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, display_id) - video_info = self._parse_json(self._html_search_regex( - r'(?:<ul class="media-list items" id="media-related-items"[^>]*><li data-video-info|<div id="cbsNewsVideoPlayer" data-video-player-options)=\'({.+?})\'', - webpage, 'video JSON info', default='{}'), video_id, fatal=False) - - if video_info: - item = video_info['item'] if 'item' in video_info else video_info - else: - state = self._parse_json(self._search_regex( - r'data-cbsvideoui-options=(["\'])(?P<json>{.+?})\1', webpage, - 'playlist JSON info', group='json'), video_id)['state'] - item = state['playlist'][state['pid']] + entries = [] + for embed_url in re.findall(r'<iframe[^>]+data-src="(https?://(?:www\.)?cbsnews\.com/embed/video/[^#]*#[^"]+)"', webpage): + entries.append(self.url_result(embed_url, CBSNewsEmbedIE.ie_key())) + if entries: + return self.playlist_result( + entries, playlist_title=self._html_search_meta(['og:title', 'twitter:title'], webpage), + playlist_description=self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage)) + item = self._parse_json(self._html_search_regex( + r'CBSNEWS\.defaultPayload\s*=\s*({.+})', + webpage, 'video JSON info'), display_id)['items'][0] return self._extract_video_info(item['mpxRefId'], 'cbsnews') diff --git a/youtube_dl/extractor/ccc.py b/youtube_dl/extractor/ccc.py index 734702144..36e6dff72 100644 --- a/youtube_dl/extractor/ccc.py +++ b/youtube_dl/extractor/ccc.py @@ -1,9 +1,12 @@ +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, parse_iso8601, + try_get, + url_or_none, ) @@ -18,11 +21,13 @@ class CCCIE(InfoExtractor): 'id': '1839', 'ext': 'mp4', 'title': 'Introduction to Processor Design', + 'creator': 'byterazor', 'description': 'md5:df55f6d073d4ceae55aae6f2fd98a0ac', 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20131228', 'timestamp': 1388188800, 'duration': 3710, + 'tags': list, } }, { 'url': 'https://media.ccc.de/v/32c3-7368-shopshifting#download', @@ -68,6 +73,7 @@ class CCCIE(InfoExtractor): 'id': event_id, 'display_id': display_id, 'title': event_data['title'], + 'creator': try_get(event_data, lambda x: ', '.join(x['persons'])), 'description': event_data.get('description'), 'thumbnail': event_data.get('thumb_url'), 'timestamp': parse_iso8601(event_data.get('date')), @@ -75,3 +81,31 @@ class CCCIE(InfoExtractor): 'tags': event_data.get('tags'), 'formats': formats, } + + +class CCCPlaylistIE(InfoExtractor): + IE_NAME = 'media.ccc.de:lists' + _VALID_URL = r'https?://(?:www\.)?media\.ccc\.de/c/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://media.ccc.de/c/30c3', + 'info_dict': { + 'title': '30C3', + 'id': '30c3', + }, + 'playlist_count': 135, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url).lower() + + conf = self._download_json( + 'https://media.ccc.de/public/conferences/' + playlist_id, + playlist_id) + + entries = [] + for e in conf['events']: + event_url = url_or_none(e.get('frontend_link')) + if event_url: + entries.append(self.url_result(event_url, ie=CCCIE.ie_key())) + + return self.playlist_result(entries, playlist_id, conf.get('title')) diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 46380430f..1ec58f7d8 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -155,7 +155,7 @@ class CeskaTelevizeIE(InfoExtractor): stream_formats = self._extract_mpd_formats( stream_url, playlist_id, mpd_id='dash-%s' % format_id, fatal=False) - # See https://github.com/rg3/youtube-dl/issues/12119#issuecomment-280037031 + # See https://github.com/ytdl-org/youtube-dl/issues/12119#issuecomment-280037031 if format_id == 'audioDescription': for f in stream_formats: f['source_preference'] = -10 diff --git a/youtube_dl/extractor/cinemax.py b/youtube_dl/extractor/cinemax.py new file mode 100644 index 000000000..7f89d33de --- /dev/null +++ b/youtube_dl/extractor/cinemax.py @@ -0,0 +1,29 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .hbo import HBOBaseIE + + +class CinemaxIE(HBOBaseIE): + _VALID_URL = r'https?://(?:www\.)?cinemax\.com/(?P<path>[^/]+/video/[0-9a-z-]+-(?P<id>\d+))' + _TESTS = [{ + 'url': 'https://www.cinemax.com/warrior/video/s1-ep-1-recap-20126903', + 'md5': '82e0734bba8aa7ef526c9dd00cf35a05', + 'info_dict': { + 'id': '20126903', + 'ext': 'mp4', + 'title': 'S1 Ep 1: Recap', + }, + 'expected_warnings': ['Unknown MIME type application/mp4 in DASH manifest'], + }, { + 'url': 'https://www.cinemax.com/warrior/video/s1-ep-1-recap-20126903.embed', + 'only_matching': True, + }] + + def _real_extract(self, url): + path, video_id = re.match(self._VALID_URL, url).groups() + info = self._extract_info('https://www.cinemax.com/%s.xml' % path, video_id) + info['id'] = video_id + return info diff --git a/youtube_dl/extractor/ciscolive.py b/youtube_dl/extractor/ciscolive.py new file mode 100644 index 000000000..da404e4dc --- /dev/null +++ b/youtube_dl/extractor/ciscolive.py @@ -0,0 +1,151 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) +from ..utils import ( + clean_html, + float_or_none, + int_or_none, + try_get, + urlencode_postdata, +) + + +class CiscoLiveBaseIE(InfoExtractor): + # These appear to be constant across all Cisco Live presentations + # and are not tied to any user session or event + RAINFOCUS_API_URL = 'https://events.rainfocus.com/api/%s' + RAINFOCUS_API_PROFILE_ID = 'Na3vqYdAlJFSxhYTYQGuMbpafMqftalz' + RAINFOCUS_WIDGET_ID = 'n6l4Lo05R8fiy3RpUBm447dZN8uNWoye' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5647924234001/SyK2FdqjM_default/index.html?videoId=%s' + + HEADERS = { + 'Origin': 'https://ciscolive.cisco.com', + 'rfApiProfileId': RAINFOCUS_API_PROFILE_ID, + 'rfWidgetId': RAINFOCUS_WIDGET_ID, + } + + def _call_api(self, ep, rf_id, query, referrer, note=None): + headers = self.HEADERS.copy() + headers['Referer'] = referrer + return self._download_json( + self.RAINFOCUS_API_URL % ep, rf_id, note=note, + data=urlencode_postdata(query), headers=headers) + + def _parse_rf_item(self, rf_item): + event_name = rf_item.get('eventName') + title = rf_item['title'] + description = clean_html(rf_item.get('abstract')) + presenter_name = try_get(rf_item, lambda x: x['participants'][0]['fullName']) + bc_id = rf_item['videos'][0]['url'] + bc_url = self.BRIGHTCOVE_URL_TEMPLATE % bc_id + duration = float_or_none(try_get(rf_item, lambda x: x['times'][0]['length'])) + location = try_get(rf_item, lambda x: x['times'][0]['room']) + + if duration: + duration = duration * 60 + + return { + '_type': 'url_transparent', + 'url': bc_url, + 'ie_key': 'BrightcoveNew', + 'title': title, + 'description': description, + 'duration': duration, + 'creator': presenter_name, + 'location': location, + 'series': event_name, + } + + +class CiscoLiveSessionIE(CiscoLiveBaseIE): + _VALID_URL = r'https?://(?:www\.)?ciscolive(?:\.cisco)?\.com/[^#]*#/session/(?P<id>[^/?&]+)' + _TESTS = [{ + 'url': 'https://ciscolive.cisco.com/on-demand-library/?#/session/1423353499155001FoSs', + 'md5': 'c98acf395ed9c9f766941c70f5352e22', + 'info_dict': { + 'id': '5803694304001', + 'ext': 'mp4', + 'title': '13 Smart Automations to Monitor Your Cisco IOS Network', + 'description': 'md5:ec4a436019e09a918dec17714803f7cc', + 'timestamp': 1530305395, + 'upload_date': '20180629', + 'uploader_id': '5647924234001', + 'location': '16B Mezz.', + }, + }, { + 'url': 'https://www.ciscolive.com/global/on-demand-library.html?search.event=ciscoliveemea2019#/session/15361595531500013WOU', + 'only_matching': True, + }, { + 'url': 'https://www.ciscolive.com/global/on-demand-library.html?#/session/1490051371645001kNaS', + 'only_matching': True, + }] + + def _real_extract(self, url): + rf_id = self._match_id(url) + rf_result = self._call_api('session', rf_id, {'id': rf_id}, url) + return self._parse_rf_item(rf_result['items'][0]) + + +class CiscoLiveSearchIE(CiscoLiveBaseIE): + _VALID_URL = r'https?://(?:www\.)?ciscolive(?:\.cisco)?\.com/(?:global/)?on-demand-library(?:\.html|/)' + _TESTS = [{ + 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.event=ciscoliveus2018&search.technicallevel=scpsSkillLevel_aintroductory&search.focus=scpsSessionFocus_designAndDeployment#/', + 'info_dict': { + 'title': 'Search query', + }, + 'playlist_count': 5, + }, { + 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.technology=scpsTechnology_applicationDevelopment&search.technology=scpsTechnology_ipv6&search.focus=scpsSessionFocus_troubleshootingTroubleshooting#/', + 'only_matching': True, + }, { + 'url': 'https://www.ciscolive.com/global/on-demand-library.html?search.technicallevel=scpsSkillLevel_aintroductory&search.event=ciscoliveemea2019&search.technology=scpsTechnology_dataCenter&search.focus=scpsSessionFocus_bestPractices#/', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if CiscoLiveSessionIE.suitable(url) else super(CiscoLiveSearchIE, cls).suitable(url) + + @staticmethod + def _check_bc_id_exists(rf_item): + return int_or_none(try_get(rf_item, lambda x: x['videos'][0]['url'])) is not None + + def _entries(self, query, url): + query['size'] = 50 + query['from'] = 0 + for page_num in itertools.count(1): + results = self._call_api( + 'search', None, query, url, + 'Downloading search JSON page %d' % page_num) + sl = try_get(results, lambda x: x['sectionList'][0], dict) + if sl: + results = sl + items = results.get('items') + if not items or not isinstance(items, list): + break + for item in items: + if not isinstance(item, dict): + continue + if not self._check_bc_id_exists(item): + continue + yield self._parse_rf_item(item) + size = int_or_none(results.get('size')) + if size is not None: + query['size'] = size + total = int_or_none(results.get('total')) + if total is not None and query['from'] + query['size'] > total: + break + query['from'] += query['size'] + + def _real_extract(self, url): + query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + query['type'] = 'session' + return self.playlist_result( + self._entries(query, url), playlist_title='Search query') diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index ab651d1c8..f2ca7a337 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -1,19 +1,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none - - -_translation_table = { - 'a': 'h', 'd': 'e', 'e': 'v', 'f': 'o', 'g': 'f', 'i': 'd', 'l': 'n', - 'm': 'a', 'n': 'm', 'p': 'u', 'q': 't', 'r': 's', 'v': 'p', 'x': 'r', - 'y': 'l', 'z': 'i', - '$': ':', '&': '.', '(': '=', '^': '&', '=': '/', -} - - -def _decode(s): - return ''.join(_translation_table.get(c, c) for c in s) +from ..utils import ( + int_or_none, + url_or_none, +) class CliphunterIE(InfoExtractor): @@ -60,14 +51,14 @@ class CliphunterIE(InfoExtractor): formats = [] for format_id, f in gexo_files.items(): - video_url = f.get('url') + video_url = url_or_none(f.get('url')) if not video_url: continue fmt = f.get('fmt') height = f.get('h') format_id = '%s_%sp' % (fmt, height) if fmt and height else format_id formats.append({ - 'url': _decode(video_url), + 'url': video_url, 'format_id': format_id, 'width': int_or_none(f.get('w')), 'height': int_or_none(height), diff --git a/youtube_dl/extractor/cloudflarestream.py b/youtube_dl/extractor/cloudflarestream.py index e6d92cca2..8ff2c6531 100644 --- a/youtube_dl/extractor/cloudflarestream.py +++ b/youtube_dl/extractor/cloudflarestream.py @@ -10,8 +10,8 @@ class CloudflareStreamIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:watch\.)?cloudflarestream\.com/| - embed\.cloudflarestream\.com/embed/[^/]+\.js\?.*?\bvideo= + (?:watch\.)?(?:cloudflarestream\.com|videodelivery\.net)/| + embed\.(?:cloudflarestream\.com|videodelivery\.net)/embed/[^/]+\.js\?.*?\bvideo= ) (?P<id>[\da-f]+) ''' @@ -31,6 +31,9 @@ class CloudflareStreamIE(InfoExtractor): }, { 'url': 'https://cloudflarestream.com/31c9291ab41fac05471db4e73aa11717/manifest/video.mpd', 'only_matching': True, + }, { + 'url': 'https://embed.videodelivery.net/embed/r4xu.fla9.latest.js?video=81d80727f3022488598f68d323c1ad5e', + 'only_matching': True, }] @staticmethod @@ -38,7 +41,7 @@ class CloudflareStreamIE(InfoExtractor): return [ mobj.group('url') for mobj in re.finditer( - r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.cloudflarestream\.com/embed/[^/]+\.js\?.*?\bvideo=[\da-f]+?.*?)\1', + r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.(?:cloudflarestream\.com|videodelivery\.net)/embed/[^/]+\.js\?.*?\bvideo=[\da-f]+?.*?)\1', webpage)] def _real_extract(self, url): diff --git a/youtube_dl/extractor/cnbc.py b/youtube_dl/extractor/cnbc.py index d354d9f95..6889b0f40 100644 --- a/youtube_dl/extractor/cnbc.py +++ b/youtube_dl/extractor/cnbc.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals + from .common import InfoExtractor from ..utils import smuggle_url @@ -34,3 +35,32 @@ class CNBCIE(InfoExtractor): {'force_smil_url': True}), 'id': video_id, } + + +class CNBCVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/]+/)+(?P<id>[^./?#&]+)' + _TEST = { + 'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html', + 'info_dict': { + 'id': '7000031301', + 'ext': 'mp4', + 'title': "Trump: I don't necessarily agree with raising rates", + 'description': 'md5:878d8f0b4ebb5bb1dda3514b91b49de3', + 'timestamp': 1531958400, + 'upload_date': '20180719', + 'uploader': 'NBCU-CNBC', + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + r'content_id["\']\s*:\s*["\'](\d+)', webpage, display_id, + 'video id') + return self.url_result( + 'http://video.cnbc.com/gallery/?video=%s' % video_id, + CNBCIE.ie_key()) diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 5fc311f53..774b71055 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -119,11 +119,7 @@ class CNNBlogsIE(InfoExtractor): def _real_extract(self, url): webpage = self._download_webpage(url, url_basename(url)) cnn_url = self._html_search_regex(r'data-url="(.+?)"', webpage, 'cnn url') - return { - '_type': 'url', - 'url': cnn_url, - 'ie_key': CNNIE.ie_key(), - } + return self.url_result(cnn_url, CNNIE.ie_key()) class CNNArticleIE(InfoExtractor): @@ -145,8 +141,4 @@ class CNNArticleIE(InfoExtractor): def _real_extract(self, url): webpage = self._download_webpage(url, url_basename(url)) cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url') - return { - '_type': 'url', - 'url': 'http://cnn.com/video/?/video/' + cnn_url, - 'ie_key': CNNIE.ie_key(), - } + return self.url_result('http://cnn.com/video/?/video/' + cnn_url, CNNIE.ie_key()) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 2dbf81e6e..859786617 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -17,6 +17,7 @@ import math from ..compat import ( compat_cookiejar, compat_cookies, + compat_etree_Element, compat_etree_fromstring, compat_getpass, compat_integer_types, @@ -43,6 +44,7 @@ from ..utils import ( compiled_regex_type, determine_ext, determine_protocol, + dict_get, error_to_compat_str, ExtractorError, extract_attributes, @@ -55,13 +57,17 @@ from ..utils import ( JSON_LD_RE, mimetype2ext, orderedSet, + parse_bitrate, parse_codecs, parse_duration, parse_iso8601, parse_m3u8_attributes, + parse_resolution, RegexNotFoundError, sanitized_Request, sanitize_filename, + str_or_none, + strip_or_none, unescapeHTML, unified_strdate, unified_timestamp, @@ -69,6 +75,7 @@ from ..utils import ( update_url_query, urljoin, url_basename, + url_or_none, xpath_element, xpath_text, xpath_with_ns, @@ -101,10 +108,26 @@ class InfoExtractor(object): from worst to best quality. Potential fields: - * url Mandatory. The URL of the video file + * url The mandatory URL representing the media: + for plain file media - HTTP URL of this file, + for RTMP - RTMP URL, + for HLS - URL of the M3U8 media playlist, + for HDS - URL of the F4M manifest, + for DASH + - HTTP URL to plain file media (in case of + unfragmented media) + - URL of the MPD manifest or base URL + representing the media if MPD manifest + is parsed from a string (in case of + fragmented media) + for MSS - URL of the ISM manifest. * manifest_url The URL of the manifest file in case of - fragmented media (DASH, hls, hds) + fragmented media: + for HLS - URL of the M3U8 master playlist, + for HDS - URL of the F4M manifest, + for DASH - URL of the MPD manifest, + for MSS - URL of the ISM manifest. * ext Will be calculated from URL if missing * format A human-readable description of the format ("mp4 container with h264/opus"). @@ -197,7 +220,7 @@ class InfoExtractor(object): * "preference" (optional, int) - quality of the image * "width" (optional, int) * "height" (optional, int) - * "resolution" (optional, string "{width}x{height"}, + * "resolution" (optional, string "{width}x{height}", deprecated) * "filesize" (optional, int) thumbnail: Full URL to a video thumbnail image. @@ -520,11 +543,11 @@ class InfoExtractor(object): raise ExtractorError('An extractor error has occurred.', cause=e) def __maybe_fake_ip_and_retry(self, countries): - if (not self._downloader.params.get('geo_bypass_country', None) and - self._GEO_BYPASS and - self._downloader.params.get('geo_bypass', True) and - not self._x_forwarded_for_ip and - countries): + if (not self._downloader.params.get('geo_bypass_country', None) + and self._GEO_BYPASS + and self._downloader.params.get('geo_bypass', True) + and not self._x_forwarded_for_ip + and countries): country_code = random.choice(countries) self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) if self._x_forwarded_for_ip: @@ -605,6 +628,11 @@ class InfoExtractor(object): except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: if isinstance(err, compat_urllib_error.HTTPError): if self.__can_accept_status_code(err, expected_status): + # Retain reference to error to prevent file object from + # being closed before it can be read. Works around the + # effects of <https://bugs.python.org/issue15002> + # introduced in Python 3.4.1. + err.fp._error = err return err.fp if errnote is False: @@ -655,8 +683,8 @@ class InfoExtractor(object): def __check_blocked(self, content): first_block = content[:512] - if ('<title>Access to this site is blocked' in content and - 'Websense' in first_block): + if ('Access to this site is blocked' in content + and 'Websense' in first_block): msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' blocked_iframe = self._html_search_regex( r'' + embed_url = KalturaIE._extract_url(start_page) + if embed_url: + embed_url = smuggle_url(embed_url, {'source_url': url}) + ie_key = 'Kaltura' + else: + PLAYER_REGEX = r'', - start_page, 'xml filename', default=None) - if xml_name is None: - # Fallback to the older format xml_name = self._html_search_regex( - r'', start_page, 'xml filename') + embed_url = '%s/xml/%s' % (xml_root, xml_name) + ie_key = 'DigitallySpeaking' return { '_type': 'url_transparent', 'id': video_id, 'display_id': display_id, - 'url': '%s/xml/%s' % (xml_root, xml_name), - 'ie_key': 'DigitallySpeaking', + 'url': embed_url, + 'ie_key': ie_key, } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 2a48667f0..d1725d98b 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -47,7 +47,7 @@ from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE from .tvc import TVCIE -from .sportbox import SportBoxEmbedIE +from .sportbox import SportBoxIE from .smotri import SmotriIE from .myvi import MyviIE from .condenast import CondeNastIE @@ -89,7 +89,10 @@ from .piksel import PikselIE from .videa import VideaIE from .twentymin import TwentyMinutenIE from .ustream import UstreamIE -from .openload import OpenloadIE +from .openload import ( + OpenloadIE, + VerystreamIE, +) from .videopress import VideoPressIE from .rutube import RutubeIE from .limelight import LimelightBaseIE @@ -109,11 +112,13 @@ from .vice import ViceIE from .xfileshare import XFileShareIE from .cloudflarestream import CloudflareStreamIE from .peertube import PeerTubeIE +from .teachable import TeachableIE from .indavideo import IndavideoEmbedIE from .apa import APAIE from .foxnews import FoxNewsIE from .viqeo import ViqeoIE from .expressen import ExpressenIE +from .zype import ZypeIE class GenericIE(InfoExtractor): @@ -428,7 +433,7 @@ class GenericIE(InfoExtractor): }, }, { - # https://github.com/rg3/youtube-dl/issues/2253 + # https://github.com/ytdl-org/youtube-dl/issues/2253 'url': 'http://bcove.me/i6nfkrc3', 'md5': '0ba9446db037002366bab3b3eb30c88c', 'info_dict': { @@ -453,7 +458,7 @@ class GenericIE(InfoExtractor): }, }, { - # https://github.com/rg3/youtube-dl/issues/3541 + # https://github.com/ytdl-org/youtube-dl/issues/3541 'add_ie': ['BrightcoveLegacy'], 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', 'info_dict': { @@ -917,7 +922,7 @@ class GenericIE(InfoExtractor): } }, # Multiple brightcove videos - # https://github.com/rg3/youtube-dl/issues/2283 + # https://github.com/ytdl-org/youtube-dl/issues/2283 { 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html', 'info_dict': { @@ -2070,6 +2075,36 @@ class GenericIE(InfoExtractor): }, 'playlist_count': 6, }, + { + # Squarespace video embed, 2019-08-28 + 'url': 'http://ootboxford.com', + 'info_dict': { + 'id': 'Tc7b_JGdZfw', + 'title': 'Out of the Blue, at Childish Things 10', + 'ext': 'mp4', + 'description': 'md5:a83d0026666cf5ee970f8bd1cfd69c7f', + 'uploader_id': 'helendouglashouse', + 'uploader': 'Helen & Douglas House', + 'upload_date': '20140328', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # Zype embed + 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', + 'info_dict': { + 'id': '5b400b834b32992a310622b9', + 'ext': 'mp4', + 'title': 'Smoky Barbecue Favorites', + 'thumbnail': r're:^https?://.*\.jpe?g', + }, + 'add_ie': [ZypeIE.ie_key()], + 'params': { + 'skip_download': True, + }, + }, { # videojs embed 'url': 'https://video.sibnet.ru/shell.php?videoid=3422904', @@ -2085,6 +2120,23 @@ class GenericIE(InfoExtractor): }, 'expected_warnings': ['Failed to download MPD manifest'], }, + { + # DailyMotion embed with DM.player + 'url': 'https://www.beinsports.com/us/copa-del-rey/video/the-locker-room-valencia-beat-barca-in-copa/1203804', + 'info_dict': { + 'id': 'k6aKkGHd9FJs4mtJN39', + 'ext': 'mp4', + 'title': 'The Locker Room: Valencia Beat Barca In Copa del Rey Final', + 'description': 'This video is private.', + 'uploader_id': 'x1jf30l', + 'uploader': 'beIN SPORTS USA', + 'upload_date': '20190528', + 'timestamp': 1559062971, + }, + 'params': { + 'skip_download': True, + }, + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -2181,10 +2233,7 @@ class GenericIE(InfoExtractor): def _real_extract(self, url): if url.startswith('//'): - return { - '_type': 'url', - 'url': self.http_scheme() + url, - } + return self.url_result(self.http_scheme() + url) parsed_url = compat_urlparse.urlparse(url) if not parsed_url.scheme: @@ -2193,7 +2242,7 @@ class GenericIE(InfoExtractor): default_search = 'fixup_error' if default_search in ('auto', 'auto_warning', 'fixup_error'): - if '/' in url: + if re.match(r'^[^\s/]+\.[^\s/]+/', url): self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http') return self.url_result('http://' + url) elif default_search != 'fixup_error': @@ -2358,10 +2407,16 @@ class GenericIE(InfoExtractor): return camtasia_res # Sometimes embedded video player is hidden behind percent encoding - # (e.g. https://github.com/rg3/youtube-dl/issues/2448) + # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448) # Unescaping the whole page allows to handle those cases in a generic way webpage = compat_urllib_parse_unquote(webpage) + # Unescape squarespace embeds to be detected by generic extractor, + # see https://github.com/ytdl-org/youtube-dl/issues/21294 + webpage = re.sub( + r']+class=[^>]*?\bsqs-video-wrapper\b[^>]*>', + lambda x: unescapeHTML(x.group(0)), webpage) + # it's tempting to parse this further, but you would # have to take into account all the variations like # Video Title - Site Name @@ -2533,11 +2588,11 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url')) # Look for Ooyala videos - mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P[^"&]+)', webpage) or - re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P.{32})[\'"]', webpage) or - re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P.{32})[\'"]', webpage) or - re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P.{32})[\'"]\)', webpage) or - re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P.{32})[\'"]', webpage)) + mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P[^"&]+)', webpage) + or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P.{32})[\'"]', webpage) + or re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P.{32})[\'"]', webpage) + or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P.{32})[\'"]\)', webpage) + or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P.{32})[\'"]', webpage)) if mobj is not None: embed_token = self._search_regex( r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)', @@ -2567,19 +2622,6 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group(1), 'Mpora') - # Look for embedded NovaMov-based player - mobj = re.search( - r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\']) - (?Phttp://(?:(?:embed|www)\.)? - (?:novamov\.com| - nowvideo\.(?:ch|sx|eu|at|ag|co)| - videoweed\.(?:es|com)| - movshare\.(?:net|sx|ag)| - divxstage\.(?:eu|net|ch|co|at|ag)) - /embed\.php.+?)\1''', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - # Look for embedded Facebook player facebook_urls = FacebookIE._extract_urls(webpage) if facebook_urls: @@ -2636,9 +2678,9 @@ class GenericIE(InfoExtractor): return self.url_result(tvc_url, 'TVC') # Look for embedded SportBox player - sportbox_urls = SportBoxEmbedIE._extract_urls(webpage) + sportbox_urls = SportBoxIE._extract_urls(webpage) if sportbox_urls: - return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie='SportBoxEmbed') + return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key()) # Look for embedded XHamster player xhamster_urls = XHamsterEmbedIE._extract_urls(webpage) @@ -3004,6 +3046,12 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( openload_urls, video_id, video_title, ie=OpenloadIE.ie_key()) + # Look for Verystream embeds + verystream_urls = VerystreamIE._extract_urls(webpage) + if verystream_urls: + return self.playlist_from_matches( + verystream_urls, video_id, video_title, ie=VerystreamIE.ie_key()) + # Look for VideoPress embeds videopress_urls = VideoPressIE._extract_urls(webpage) if videopress_urls: @@ -3097,6 +3145,10 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) + teachable_url = TeachableIE._extract_url(webpage, url) + if teachable_url: + return self.url_result(teachable_url) + indavideo_urls = IndavideoEmbedIE._extract_urls(webpage) if indavideo_urls: return self.playlist_from_matches( @@ -3129,6 +3181,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( expressen_urls, video_id, video_title, ie=ExpressenIE.ie_key()) + zype_urls = ZypeIE._extract_urls(webpage) + if zype_urls: + return self.playlist_from_matches( + zype_urls, video_id, video_title, ie=ZypeIE.ie_key()) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: @@ -3155,7 +3212,7 @@ class GenericIE(InfoExtractor): jwplayer_data, video_id, require_title=False, base_url=url) return merge_dicts(info, info_dict) except ExtractorError: - # See https://github.com/rg3/youtube-dl/pull/16735 + # See https://github.com/ytdl-org/youtube-dl/pull/16735 pass # Video.js embed @@ -3190,8 +3247,8 @@ class GenericIE(InfoExtractor): else: formats.append({ 'url': src, - 'ext': (mimetype2ext(src_type) or - ext if ext in KNOWN_EXTENSIONS else 'mp4'), + 'ext': (mimetype2ext(src_type) + or ext if ext in KNOWN_EXTENSIONS else 'mp4'), }) if formats: self._sort_formats(formats) diff --git a/youtube_dl/extractor/gfycat.py b/youtube_dl/extractor/gfycat.py index a0670b645..bbe3cb283 100644 --- a/youtube_dl/extractor/gfycat.py +++ b/youtube_dl/extractor/gfycat.py @@ -11,7 +11,7 @@ from ..utils import ( class GfycatIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gfycat\.com/(?:ifr/|gifs/detail/)?(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?gfycat\.com/(?:ru/|ifr/|gifs/detail/)?(?P[^-/?#]+)' _TESTS = [{ 'url': 'http://gfycat.com/DeadlyDecisiveGermanpinscher', 'info_dict': { @@ -44,16 +44,22 @@ class GfycatIE(InfoExtractor): 'categories': list, 'age_limit': 0, } + }, { + 'url': 'https://gfycat.com/ru/RemarkableDrearyAmurstarfish', + 'only_matching': True }, { 'url': 'https://gfycat.com/gifs/detail/UnconsciousLankyIvorygull', 'only_matching': True + }, { + 'url': 'https://gfycat.com/acceptablehappygoluckyharborporpoise-baseball', + 'only_matching': True }] def _real_extract(self, url): video_id = self._match_id(url) gfy = self._download_json( - 'http://gfycat.com/cajax/get/%s' % video_id, + 'https://api.gfycat.com/v1/gfycats/%s' % video_id, video_id, 'Downloading video info') if 'error' in gfy: raise ExtractorError('Gfycat said: ' + gfy['error'], expected=True) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index c2140c362..fb8f7679b 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -72,7 +72,7 @@ class GloboIE(InfoExtractor): return try: - self._download_json( + glb_id = (self._download_json( 'https://login.globo.com/api/authentication', None, data=json.dumps({ 'payload': { 'email': email, @@ -81,7 +81,9 @@ class GloboIE(InfoExtractor): }, }).encode(), headers={ 'Content-Type': 'application/json; charset=utf-8', - }) + }) or {}).get('glbId') + if glb_id: + self._set_cookie('.globo.com', 'GLBID', glb_id) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: resp = self._parse_json(e.cause.read(), None) diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index ec9dd6e3a..03e48f4ea 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -25,18 +25,22 @@ class GoIE(AdobePassIE): }, 'watchdisneychannel': { 'brand': '004', - 'requestor_id': 'Disney', + 'resource_id': 'Disney', }, 'watchdisneyjunior': { 'brand': '008', - 'requestor_id': 'DisneyJunior', + 'resource_id': 'DisneyJunior', }, 'watchdisneyxd': { 'brand': '009', - 'requestor_id': 'DisneyXD', + 'resource_id': 'DisneyXD', + }, + 'disneynow': { + 'brand': '011', + 'resource_id': 'Disney', } } - _VALID_URL = r'https?://(?:(?P%s)\.)?go\.com/(?:(?:[^/]+/)*(?Pvdka\w+)|(?:[^/]+/)*(?P[^/?#]+))'\ + _VALID_URL = r'https?://(?:(?:(?P%s)\.)?go|(?Pdisneynow))\.com/(?:(?:[^/]+/)*(?Pvdka\w+)|(?:[^/]+/)*(?P[^/?#]+))'\ % '|'.join(list(_SITE_INFO.keys()) + ['disneynow']) _TESTS = [{ 'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643', @@ -71,6 +75,9 @@ class GoIE(AdobePassIE): # brand 008 'url': 'http://disneynow.go.com/shows/minnies-bow-toons/video/happy-campers/vdka4872013', 'only_matching': True, + }, { + 'url': 'https://disneynow.com/shows/minnies-bow-toons/video/happy-campers/vdka4872013', + 'only_matching': True, }] def _extract_videos(self, brand, video_id='-1', show_id='-1'): @@ -80,7 +87,9 @@ class GoIE(AdobePassIE): display_id)['video'] def _real_extract(self, url): - sub_domain, video_id, display_id = re.match(self._VALID_URL, url).groups() + mobj = re.match(self._VALID_URL, url) + sub_domain = mobj.group('sub_domain') or mobj.group('sub_domain_2') + video_id, display_id = mobj.group('id', 'display_id') site_info = self._SITE_INFO.get(sub_domain, {}) brand = site_info.get('brand') if not video_id or not site_info: @@ -89,7 +98,7 @@ class GoIE(AdobePassIE): # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood r'data-video-id=["\']*(VDKA\w+)', webpage, 'video id', - default=None) + default=video_id) if not site_info: brand = self._search_regex( (r'data-brand=\s*["\']\s*(\d+)', @@ -130,8 +139,8 @@ class GoIE(AdobePassIE): 'device': '001', } if video_data.get('accesslevel') == '1': - requestor_id = site_info['requestor_id'] - resource = self._get_mvpd_resource( + requestor_id = site_info.get('requestor_id', 'DisneyChannels') + resource = site_info.get('resource_id') or self._get_mvpd_resource( requestor_id, title, video_id, None) auth = self._extract_mvpd_auth( url, video_id, requestor_id, resource) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index 3bf462d63..589e4d5c3 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -36,7 +36,7 @@ class GoogleDriveIE(InfoExtractor): } }, { # video can't be watched anonymously due to view count limit reached, - # but can be downloaded (see https://github.com/rg3/youtube-dl/issues/14046) + # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046) 'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view', 'md5': 'bfbd670d03a470bb1e6d4a257adec12e', 'info_dict': { diff --git a/youtube_dl/extractor/hbo.py b/youtube_dl/extractor/hbo.py index 859ad5429..68df748f5 100644 --- a/youtube_dl/extractor/hbo.py +++ b/youtube_dl/extractor/hbo.py @@ -4,12 +4,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( xpath_text, xpath_element, int_or_none, parse_duration, + urljoin, ) @@ -53,10 +53,13 @@ class HBOBaseIE(InfoExtractor): }, } - def _extract_from_id(self, video_id): - video_data = self._download_xml( - 'http://render.lv3.hbo.com/data/content/global/videos/data/%s.xml' % video_id, video_id) - title = xpath_text(video_data, 'title', 'title', True) + def _extract_info(self, url, display_id): + video_data = self._download_xml(url, display_id) + video_id = xpath_text(video_data, 'id', fatal=True) + episode_title = title = xpath_text(video_data, 'title', fatal=True) + series = xpath_text(video_data, 'program') + if series: + title = '%s - %s' % (series, title) formats = [] for source in xpath_element(video_data, 'videos', 'sources', True): @@ -128,68 +131,45 @@ class HBOBaseIE(InfoExtractor): 'width': width, }) + subtitles = None + caption_url = xpath_text(video_data, 'captionUrl') + if caption_url: + subtitles = { + 'en': [{ + 'url': caption_url, + 'ext': 'ttml' + }], + } + return { 'id': video_id, 'title': title, 'duration': parse_duration(xpath_text(video_data, 'duration/tv14')), + 'series': series, + 'episode': episode_title, 'formats': formats, 'thumbnails': thumbnails, + 'subtitles': subtitles, } class HBOIE(HBOBaseIE): IE_NAME = 'hbo' - _VALID_URL = r'https?://(?:www\.)?hbo\.com/video/video\.html\?.*vid=(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?hbo\.com/(?:video|embed)(?:/[^/]+)*/(?P[^/?#]+)' _TEST = { - 'url': 'http://www.hbo.com/video/video.html?autoplay=true&g=u&vid=1437839', - 'md5': '2c6a6bc1222c7e91cb3334dad1746e5a', + 'url': 'https://www.hbo.com/video/game-of-thrones/seasons/season-8/videos/trailer', + 'md5': '8126210656f433c452a21367f9ad85b3', 'info_dict': { - 'id': '1437839', + 'id': '22113301', 'ext': 'mp4', - 'title': 'Ep. 64 Clip: Encryption', - 'thumbnail': r're:https?://.*\.jpg$', - 'duration': 1072, - } + 'title': 'Game of Thrones - Trailer', + }, + 'expected_warnings': ['Unknown MIME type application/mp4 in DASH manifest'], } def _real_extract(self, url): - video_id = self._match_id(url) - return self._extract_from_id(video_id) - - -class HBOEpisodeIE(HBOBaseIE): - IE_NAME = 'hbo:episode' - _VALID_URL = r'https?://(?:www\.)?hbo\.com/(?P(?!video)(?:(?:[^/]+/)+video|watch-free-episodes)/(?P[0-9a-z-]+))(?:\.html)?' - - _TESTS = [{ - 'url': 'http://www.hbo.com/girls/episodes/5/52-i-love-you-baby/video/ep-52-inside-the-episode.html?autoplay=true', - 'md5': '61ead79b9c0dfa8d3d4b07ef4ac556fb', - 'info_dict': { - 'id': '1439518', - 'display_id': 'ep-52-inside-the-episode', - 'ext': 'mp4', - 'title': 'Ep. 52: Inside the Episode', - 'thumbnail': r're:https?://.*\.jpg$', - 'duration': 240, - }, - }, { - 'url': 'http://www.hbo.com/game-of-thrones/about/video/season-5-invitation-to-the-set.html?autoplay=true', - 'only_matching': True, - }, { - 'url': 'http://www.hbo.com/watch-free-episodes/last-week-tonight-with-john-oliver', - 'only_matching': True, - }] - - def _real_extract(self, url): - path, display_id = re.match(self._VALID_URL, url).groups() - - content = self._download_json( - 'http://www.hbo.com/api/content/' + path, display_id)['content'] - - video_id = compat_str((content.get('parsed', {}).get( - 'common:FullBleedVideo', {}) or content['selectedEpisode'])['videoId']) - - info_dict = self._extract_from_id(video_id) - info_dict['display_id'] = display_id - - return info_dict + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + location_path = self._parse_json(self._html_search_regex( + r'data-state="({.+?})"', webpage, 'state'), display_id)['video']['locationUrl'] + return self._extract_info(urljoin(url, location_path), display_id) diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index 5c03780a3..d8a2f9d76 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -155,8 +155,8 @@ class HeiseIE(InfoExtractor): 'id': video_id, 'title': title, 'description': description, - 'thumbnail': (xpath_text(doc, './/{http://rss.jwpcdn.com/}image') or - self._og_search_thumbnail(webpage)), + 'thumbnail': (xpath_text(doc, './/{http://rss.jwpcdn.com/}image') + or self._og_search_thumbnail(webpage)), 'timestamp': parse_iso8601( self._html_search_meta('date', webpage)), 'formats': formats, diff --git a/youtube_dl/extractor/hitbox.py b/youtube_dl/extractor/hitbox.py index 1d905dc81..3e5ff2685 100644 --- a/youtube_dl/extractor/hitbox.py +++ b/youtube_dl/extractor/hitbox.py @@ -58,8 +58,8 @@ class HitboxIE(InfoExtractor): title = video_meta.get('media_status') alt_title = video_meta.get('media_title') description = clean_html( - video_meta.get('media_description') or - video_meta.get('media_description_md')) + video_meta.get('media_description') + or video_meta.get('media_description_md')) duration = float_or_none(video_meta.get('media_duration')) uploader = video_meta.get('media_user_name') views = int_or_none(video_meta.get('media_views')) diff --git a/youtube_dl/extractor/hitrecord.py b/youtube_dl/extractor/hitrecord.py index 01a6946d0..fd5dc2935 100644 --- a/youtube_dl/extractor/hitrecord.py +++ b/youtube_dl/extractor/hitrecord.py @@ -47,8 +47,8 @@ class HitRecordIE(InfoExtractor): tags = [ t['text'] for t in tags_list - if isinstance(t, dict) and t.get('text') and - isinstance(t['text'], compat_str)] + if isinstance(t, dict) and t.get('text') + and isinstance(t['text'], compat_str)] return { 'id': video_id, diff --git a/youtube_dl/extractor/hketv.py b/youtube_dl/extractor/hketv.py new file mode 100644 index 000000000..1f3502b90 --- /dev/null +++ b/youtube_dl/extractor/hketv.py @@ -0,0 +1,191 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + merge_dicts, + parse_count, + str_or_none, + try_get, + unified_strdate, + urlencode_postdata, + urljoin, +) + + +class HKETVIE(InfoExtractor): + IE_NAME = 'hketv' + IE_DESC = '香港教育局教育電視 (HKETV) Educational Television, Hong Kong Educational Bureau' + _GEO_BYPASS = False + _GEO_COUNTRIES = ['HK'] + _VALID_URL = r'https?://(?:www\.)?hkedcity\.net/etv/resource/(?P[0-9]+)' + _TESTS = [{ + 'url': 'https://www.hkedcity.net/etv/resource/2932360618', + 'md5': 'f193712f5f7abb208ddef3c5ea6ed0b7', + 'info_dict': { + 'id': '2932360618', + 'ext': 'mp4', + 'title': '喜閱一生(共享閱讀樂) (中、英文字幕可供選擇)', + 'description': 'md5:d5286d05219ef50e0613311cbe96e560', + 'upload_date': '20181024', + 'duration': 900, + 'subtitles': 'count:2', + }, + 'skip': 'Geo restricted to HK', + }, { + 'url': 'https://www.hkedcity.net/etv/resource/972641418', + 'md5': '1ed494c1c6cf7866a8290edad9b07dc9', + 'info_dict': { + 'id': '972641418', + 'ext': 'mp4', + 'title': '衣冠楚楚 (天使系列之一)', + 'description': 'md5:10bb3d659421e74f58e5db5691627b0f', + 'upload_date': '20070109', + 'duration': 907, + 'subtitles': {}, + }, + 'params': { + 'geo_verification_proxy': '', + }, + 'skip': 'Geo restricted to HK', + }] + + _CC_LANGS = { + '中文(繁體中文)': 'zh-Hant', + '中文(简体中文)': 'zh-Hans', + 'English': 'en', + 'Bahasa Indonesia': 'id', + '\u0939\u093f\u0928\u094d\u0926\u0940': 'hi', + '\u0928\u0947\u092a\u093e\u0932\u0940': 'ne', + 'Tagalog': 'tl', + '\u0e44\u0e17\u0e22': 'th', + '\u0627\u0631\u062f\u0648': 'ur', + } + _FORMAT_HEIGHTS = { + 'SD': 360, + 'HD': 720, + } + _APPS_BASE_URL = 'https://apps.hkedcity.net' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = ( + self._html_search_meta( + ('ed_title', 'search.ed_title'), webpage, default=None) + or self._search_regex( + r'data-favorite_title_(?:eng|chi)=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'title', default=None, group='url') + or self._html_search_regex( + r'

([^<]+)

', webpage, 'title', default=None) + or self._og_search_title(webpage) + ) + + file_id = self._search_regex( + r'post_var\[["\']file_id["\']\s*\]\s*=\s*(.+?);', + webpage, 'file ID') + curr_url = self._search_regex( + r'post_var\[["\']curr_url["\']\s*\]\s*=\s*"(.+?)";', + webpage, 'curr URL') + data = { + 'action': 'get_info', + 'curr_url': curr_url, + 'file_id': file_id, + 'video_url': file_id, + } + + response = self._download_json( + self._APPS_BASE_URL + '/media/play/handler.php', video_id, + data=urlencode_postdata(data), + headers=merge_dicts({ + 'Content-Type': 'application/x-www-form-urlencoded'}, + self.geo_verification_headers())) + + result = response['result'] + + if not response.get('success') or not response.get('access'): + error = clean_html(response.get('access_err_msg')) + if 'Video streaming is not available in your country' in error: + self.raise_geo_restricted( + msg=error, countries=self._GEO_COUNTRIES) + else: + raise ExtractorError(error, expected=True) + + formats = [] + + width = int_or_none(result.get('width')) + height = int_or_none(result.get('height')) + + playlist0 = result['playlist'][0] + for fmt in playlist0['sources']: + file_url = urljoin(self._APPS_BASE_URL, fmt.get('file')) + if not file_url: + continue + # If we ever wanted to provide the final resolved URL that + # does not require cookies, albeit with a shorter lifespan: + # urlh = self._downloader.urlopen(file_url) + # resolved_url = urlh.geturl() + label = fmt.get('label') + h = self._FORMAT_HEIGHTS.get(label) + w = h * width // height if h and width and height else None + formats.append({ + 'format_id': label, + 'ext': fmt.get('type'), + 'url': file_url, + 'width': w, + 'height': h, + }) + self._sort_formats(formats) + + subtitles = {} + tracks = try_get(playlist0, lambda x: x['tracks'], list) or [] + for track in tracks: + if not isinstance(track, dict): + continue + track_kind = str_or_none(track.get('kind')) + if not track_kind or not isinstance(track_kind, compat_str): + continue + if track_kind.lower() not in ('captions', 'subtitles'): + continue + track_url = urljoin(self._APPS_BASE_URL, track.get('file')) + if not track_url: + continue + track_label = track.get('label') + subtitles.setdefault(self._CC_LANGS.get( + track_label, track_label), []).append({ + 'url': self._proto_relative_url(track_url), + 'ext': 'srt', + }) + + # Likes + emotion = self._download_json( + 'https://emocounter.hkedcity.net/handler.php', video_id, + data=urlencode_postdata({ + 'action': 'get_emotion', + 'data[bucket_id]': 'etv', + 'data[identifier]': video_id, + }), + headers={'Content-Type': 'application/x-www-form-urlencoded'}, + fatal=False) or {} + like_count = int_or_none(try_get( + emotion, lambda x: x['data']['emotion_data'][0]['count'])) + + return { + 'id': video_id, + 'title': title, + 'description': self._html_search_meta( + 'description', webpage, fatal=False), + 'upload_date': unified_strdate(self._html_search_meta( + 'ed_date', webpage, fatal=False), day_first=False), + 'duration': int_or_none(result.get('length')), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': urljoin(self._APPS_BASE_URL, result.get('image')), + 'view_count': parse_count(result.get('view_count')), + 'like_count': like_count, + } diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py index 354ac00dc..f9f7c5a64 100644 --- a/youtube_dl/extractor/hotstar.py +++ b/youtube_dl/extractor/hotstar.py @@ -3,45 +3,67 @@ from __future__ import unicode_literals import hashlib import hmac +import re import time +import uuid from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( determine_ext, ExtractorError, int_or_none, + str_or_none, + try_get, + url_or_none, ) class HotStarBaseIE(InfoExtractor): _AKAMAI_ENCRYPTION_KEY = b'\x05\xfc\x1a\x01\xca\xc9\x4b\xc4\x12\xfc\x53\x12\x07\x75\xf9\xee' - def _call_api(self, path, video_id, query_name='contentId'): + def _call_api_impl(self, path, video_id, query): st = int(time.time()) exp = st + 6000 auth = 'st=%d~exp=%d~acl=/*' % (st, exp) auth += '~hmac=' + hmac.new(self._AKAMAI_ENCRYPTION_KEY, auth.encode(), hashlib.sha256).hexdigest() response = self._download_json( - 'https://api.hotstar.com/' + path, - video_id, headers={ + 'https://api.hotstar.com/' + path, video_id, headers={ 'hotstarauth': auth, 'x-country-code': 'IN', 'x-platform-code': 'JIO', - }, query={ - query_name: video_id, - 'tas': 10000, - }) + }, query=query) if response['statusCode'] != 'OK': raise ExtractorError( response['body']['message'], expected=True) return response['body']['results'] + def _call_api(self, path, video_id, query_name='contentId'): + return self._call_api_impl(path, video_id, { + query_name: video_id, + 'tas': 10000, + }) + + def _call_api_v2(self, path, video_id): + return self._call_api_impl( + '%s/in/contents/%s' % (path, video_id), video_id, { + 'desiredConfig': 'encryption:plain;ladder:phone,tv;package:hls,dash', + 'client': 'mweb', + 'clientVersion': '6.18.0', + 'deviceId': compat_str(uuid.uuid4()), + 'osName': 'Windows', + 'osVersion': '10', + }) + class HotStarIE(HotStarBaseIE): IE_NAME = 'hotstar' _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:.+?[/-])?(?P\d{10})' _TESTS = [{ + # contentData 'url': 'https://www.hotstar.com/can-you-not-spread-rumours/1000076273', 'info_dict': { 'id': '1000076273', @@ -56,12 +78,20 @@ class HotStarIE(HotStarBaseIE): # m3u8 download 'skip_download': True, } + }, { + # contentDetail + 'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157', + 'only_matching': True, }, { 'url': 'http://www.hotstar.com/sports/cricket/rajitha-sizzles-on-debut-with-329/2001477583', 'only_matching': True, }, { 'url': 'http://www.hotstar.com/1000000515', 'only_matching': True, + }, { + # only available via api v2 + 'url': 'https://www.hotstar.com/tv/ek-bhram-sarvagun-sampanna/s-2116/janhvi-targets-suman/1000234847', + 'only_matching': True, }] _GEO_BYPASS = False @@ -72,7 +102,16 @@ class HotStarIE(HotStarBaseIE): app_state = self._parse_json(self._search_regex( r'', webpage, 'app state'), video_id) - video_data = list(app_state.values())[0]['initialState']['contentData']['content'] + video_data = {} + getters = list( + lambda x, k=k: x['initialState']['content%s' % k]['content'] + for k in ('Data', 'Detail') + ) + for v in app_state.values(): + content = try_get(v, getters, dict) + if content and content.get('contentId') == video_id: + video_data = content + break title = video_data['title'] @@ -80,26 +119,43 @@ class HotStarIE(HotStarBaseIE): raise ExtractorError('This video is DRM protected.', expected=True) formats = [] - format_data = self._call_api('h/v1/play', video_id)['item'] - format_url = format_data['playbackUrl'] - ext = determine_ext(format_url) - if ext == 'm3u8': + geo_restricted = False + playback_sets = self._call_api_v2('h/v2/play', video_id)['playBackSets'] + for playback_set in playback_sets: + if not isinstance(playback_set, dict): + continue + format_url = url_or_none(playback_set.get('playbackUrl')) + if not format_url: + continue + format_url = re.sub( + r'(?<=//staragvod)(\d)', r'web\1', format_url) + tags = str_or_none(playback_set.get('tagsCombination')) or '' + if tags and 'encryption:plain' not in tags: + continue + ext = determine_ext(format_url) try: - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id='hls')) + if 'package:hls' in tags or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls')) + elif 'package:dash' in tags or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash')) + elif ext == 'f4m': + # produce broken files + pass + else: + formats.append({ + 'url': format_url, + 'width': int_or_none(playback_set.get('width')), + 'height': int_or_none(playback_set.get('height')), + }) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - self.raise_geo_restricted(countries=['IN']) - raise - elif ext == 'f4m': - # produce broken files - pass - else: - formats.append({ - 'url': format_url, - 'width': int_or_none(format_data.get('width')), - 'height': int_or_none(format_data.get('height')), - }) + geo_restricted = True + continue + if not formats and geo_restricted: + self.raise_geo_restricted(countries=['IN']) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/hrti.py b/youtube_dl/extractor/hrti.py index 9ba1aa703..23f7b1fc9 100644 --- a/youtube_dl/extractor/hrti.py +++ b/youtube_dl/extractor/hrti.py @@ -60,8 +60,8 @@ class HRTiBaseIE(InfoExtractor): language=self._APP_LANGUAGE, application_id=self._APP_PUBLICATION_ID) - self._login_url = (modules['user']['resources']['login']['uri'] + - '/format/json').format(session_id=self._session_id) + self._login_url = (modules['user']['resources']['login']['uri'] + + '/format/json').format(session_id=self._session_id) self._logout_url = modules['user']['resources']['logout']['uri'] diff --git a/youtube_dl/extractor/hungama.py b/youtube_dl/extractor/hungama.py new file mode 100644 index 000000000..3fdaac5b6 --- /dev/null +++ b/youtube_dl/extractor/hungama.py @@ -0,0 +1,117 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + urlencode_postdata, +) + + +class HungamaIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?hungama\.com/ + (?: + (?:video|movie)/[^/]+/| + tv-show/(?:[^/]+/){2}\d+/episode/[^/]+/ + ) + (?P\d+) + ''' + _TESTS = [{ + 'url': 'http://www.hungama.com/video/krishna-chants/39349649/', + 'md5': 'a845a6d1ebd08d80c1035126d49bd6a0', + 'info_dict': { + 'id': '2931166', + 'ext': 'mp4', + 'title': 'Lucky Ali - Kitni Haseen Zindagi', + 'track': 'Kitni Haseen Zindagi', + 'artist': 'Lucky Ali', + 'album': 'Aks', + 'release_year': 2000, + } + }, { + 'url': 'https://www.hungama.com/movie/kahaani-2/44129919/', + 'only_matching': True, + }, { + 'url': 'https://www.hungama.com/tv-show/padded-ki-pushup/season-1/44139461/episode/ep-02-training-sasu-pathlaag-karing/44139503/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + info = self._search_json_ld(webpage, video_id) + + m3u8_url = self._download_json( + 'https://www.hungama.com/index.php', video_id, + data=urlencode_postdata({'content_id': video_id}), headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'X-Requested-With': 'XMLHttpRequest', + }, query={ + 'c': 'common', + 'm': 'get_video_mdn_url', + })['stream_url'] + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + + info.update({ + 'id': video_id, + 'formats': formats, + }) + return info + + +class HungamaSongIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hungama\.com/song/[^/]+/(?P\d+)' + _TEST = { + 'url': 'https://www.hungama.com/song/kitni-haseen-zindagi/2931166/', + 'md5': 'a845a6d1ebd08d80c1035126d49bd6a0', + 'info_dict': { + 'id': '2931166', + 'ext': 'mp4', + 'title': 'Lucky Ali - Kitni Haseen Zindagi', + 'track': 'Kitni Haseen Zindagi', + 'artist': 'Lucky Ali', + 'album': 'Aks', + 'release_year': 2000, + } + } + + def _real_extract(self, url): + audio_id = self._match_id(url) + + data = self._download_json( + 'https://www.hungama.com/audio-player-data/track/%s' % audio_id, + audio_id, query={'_country': 'IN'})[0] + + track = data['song_name'] + artist = data.get('singer_name') + + m3u8_url = self._download_json( + data.get('file') or data['preview_link'], + audio_id)['response']['media_url'] + + formats = self._extract_m3u8_formats( + m3u8_url, audio_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + + title = '%s - %s' % (artist, track) if artist else track + thumbnail = data.get('img_src') or data.get('album_image') + + return { + 'id': audio_id, + 'title': title, + 'thumbnail': thumbnail, + 'track': track, + 'artist': artist, + 'album': data.get('album_name'), + 'release_year': int_or_none(data.get('date')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/hypem.py b/youtube_dl/extractor/hypem.py index f7c913054..9ca28d632 100644 --- a/youtube_dl/extractor/hypem.py +++ b/youtube_dl/extractor/hypem.py @@ -1,18 +1,11 @@ from __future__ import unicode_literals -import json -import time - from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlencode -from ..utils import ( - ExtractorError, - sanitized_Request, -) +from ..utils import int_or_none class HypemIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?hypem\.com/track/(?P[^/]+)/' + _VALID_URL = r'https?://(?:www\.)?hypem\.com/track/(?P[0-9a-z]{5})' _TEST = { 'url': 'http://hypem.com/track/1v6ga/BODYWORK+-+TAME', 'md5': 'b9cc91b5af8995e9f0c1cee04c575828', @@ -21,41 +14,36 @@ class HypemIE(InfoExtractor): 'ext': 'mp3', 'title': 'Tame', 'uploader': 'BODYWORK', + 'timestamp': 1371810457, + 'upload_date': '20130621', } } def _real_extract(self, url): track_id = self._match_id(url) - data = {'ax': 1, 'ts': time.time()} - request = sanitized_Request(url + '?' + compat_urllib_parse_urlencode(data)) - response, urlh = self._download_webpage_handle( - request, track_id, 'Downloading webpage with the url') + response = self._download_webpage(url, track_id) - html_tracks = self._html_search_regex( - r'(?ms)', - response, 'tracks') - try: - track_list = json.loads(html_tracks) - track = track_list['tracks'][0] - except ValueError: - raise ExtractorError('Hypemachine contained invalid JSON.') + track = self._parse_json(self._html_search_regex( + r'(?s)(.+?)', + response, 'tracks'), track_id)['tracks'][0] - key = track['key'] track_id = track['id'] title = track['song'] - request = sanitized_Request( - 'http://hypem.com/serve/source/%s/%s' % (track_id, key), - '', {'Content-Type': 'application/json'}) - song_data = self._download_json(request, track_id, 'Downloading metadata') - final_url = song_data['url'] - artist = track.get('artist') + final_url = self._download_json( + 'http://hypem.com/serve/source/%s/%s' % (track_id, track['key']), + track_id, 'Downloading metadata', headers={ + 'Content-Type': 'application/json' + })['url'] return { 'id': track_id, 'url': final_url, 'ext': 'mp3', 'title': title, - 'uploader': artist, + 'uploader': track.get('artist'), + 'duration': int_or_none(track.get('time')), + 'timestamp': int_or_none(track.get('ts')), + 'track': title, } diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index ecc958a17..a5ba03efa 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -12,7 +12,7 @@ from ..utils import ( class ImgurIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|(?:topic|r)/[^/]+)/)?(?P[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z0-9]+)?$' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|(?:t(?:opic)?|r)/[^/]+)/)(?P[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'https://i.imgur.com/A61SaA1.gifv', @@ -20,38 +20,23 @@ class ImgurIE(InfoExtractor): 'id': 'A61SaA1', 'ext': 'mp4', 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 'Imgur: The magic of the Internet', }, }, { 'url': 'https://imgur.com/A61SaA1', - 'info_dict': { - 'id': 'A61SaA1', - 'ext': 'mp4', - 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 'Imgur: The magic of the Internet', - }, - }, { - 'url': 'https://imgur.com/gallery/YcAQlkx', - 'info_dict': { - 'id': 'YcAQlkx', - 'ext': 'mp4', - 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', - } - }, { - 'url': 'http://imgur.com/topic/Funny/N8rOudd', - 'only_matching': True, - }, { - 'url': 'http://imgur.com/r/aww/VQcQPhM', 'only_matching': True, }, { 'url': 'https://i.imgur.com/crGpqCV.mp4', 'only_matching': True, + }, { + # no title + 'url': 'https://i.imgur.com/jxBXAMC.gifv', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - gifv_url = 'https://i.imgur.com/{id}.gifv'.format(id=video_id) - webpage = self._download_webpage(gifv_url, video_id) + webpage = self._download_webpage( + 'https://i.imgur.com/{id}.gifv'.format(id=video_id), video_id) width = int_or_none(self._og_search_property( 'video:width', webpage, default=None)) @@ -72,7 +57,6 @@ class ImgurIE(InfoExtractor): 'format_id': m.group('type').partition('/')[2], 'url': self._proto_relative_url(m.group('src')), 'ext': mimetype2ext(m.group('type')), - 'acodec': 'none', 'width': width, 'height': height, 'http_headers': { @@ -107,44 +91,64 @@ class ImgurIE(InfoExtractor): return { 'id': video_id, 'formats': formats, - 'description': self._og_search_description(webpage, default=None), - 'title': self._og_search_title(webpage), + 'title': self._og_search_title(webpage, default=video_id), } -class ImgurAlbumIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:a|gallery|topic/[^/]+)/)?(?P[a-zA-Z0-9]{5})(?:[/?#&]+)?$' +class ImgurGalleryIE(InfoExtractor): + IE_NAME = 'imgur:gallery' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/]+)/(?P[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'http://imgur.com/gallery/Q95ko', 'info_dict': { 'id': 'Q95ko', + 'title': 'Adding faces make every GIF better', }, 'playlist_count': 25, }, { - 'url': 'http://imgur.com/a/j6Orj', + 'url': 'http://imgur.com/topic/Aww/ll5Vk', 'only_matching': True, }, { - 'url': 'http://imgur.com/topic/Aww/ll5Vk', + 'url': 'https://imgur.com/gallery/YcAQlkx', + 'info_dict': { + 'id': 'YcAQlkx', + 'ext': 'mp4', + 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', + } + }, { + 'url': 'http://imgur.com/topic/Funny/N8rOudd', + 'only_matching': True, + }, { + 'url': 'http://imgur.com/r/aww/VQcQPhM', 'only_matching': True, }] def _real_extract(self, url): - album_id = self._match_id(url) + gallery_id = self._match_id(url) - album_images = self._download_json( - 'http://imgur.com/gallery/%s/album_images/hit.json?all=true' % album_id, - album_id, fatal=False) + data = self._download_json( + 'https://imgur.com/gallery/%s.json' % gallery_id, + gallery_id)['data']['image'] - if album_images: - data = album_images.get('data') - if data and isinstance(data, dict): - images = data.get('images') - if images and isinstance(images, list): - entries = [ - self.url_result('http://imgur.com/%s' % image['hash']) - for image in images if image.get('hash')] - return self.playlist_result(entries, album_id) + if data.get('is_album'): + entries = [ + self.url_result('http://imgur.com/%s' % image['hash'], ImgurIE.ie_key(), image['hash']) + for image in data['album_images']['images'] if image.get('hash')] + return self.playlist_result(entries, gallery_id, data.get('title'), data.get('description')) - # Fallback to single video - return self.url_result('http://imgur.com/%s' % album_id, ImgurIE.ie_key()) + return self.url_result('http://imgur.com/%s' % gallery_id, ImgurIE.ie_key(), gallery_id) + + +class ImgurAlbumIE(ImgurGalleryIE): + IE_NAME = 'imgur:album' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?P[a-zA-Z0-9]+)' + + _TESTS = [{ + 'url': 'http://imgur.com/a/j6Orj', + 'info_dict': { + 'id': 'j6Orj', + 'title': 'A Literary Analysis of "Star Wars: The Force Awakens"', + }, + 'playlist_count': 12, + }] diff --git a/youtube_dl/extractor/ina.py b/youtube_dl/extractor/ina.py index 9544ff9d4..12695af27 100644 --- a/youtube_dl/extractor/ina.py +++ b/youtube_dl/extractor/ina.py @@ -1,36 +1,83 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + strip_or_none, + xpath_attr, + xpath_text, +) class InaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ina\.fr/video/(?PI?[A-Z0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?ina\.fr/(?:video|audio)/(?P[A-Z0-9_]+)' + _TESTS = [{ 'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', 'md5': 'a667021bf2b41f8dc6049479d9bb38a3', 'info_dict': { 'id': 'I12055569', 'ext': 'mp4', 'title': 'François Hollande "Je crois que c\'est clair"', + 'description': 'md5:3f09eb072a06cb286b8f7e4f77109663', } - } + }, { + 'url': 'https://www.ina.fr/video/S806544_001/don-d-organes-des-avancees-mais-d-importants-besoins-video.html', + 'only_matching': True, + }, { + 'url': 'https://www.ina.fr/audio/P16173408', + 'only_matching': True, + }, { + 'url': 'https://www.ina.fr/video/P16173408-video.html', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + video_id = self._match_id(url) + info_doc = self._download_xml( + 'http://player.ina.fr/notices/%s.mrss' % video_id, video_id) + item = info_doc.find('channel/item') + title = xpath_text(item, 'title', fatal=True) + media_ns_xpath = lambda x: self._xpath_ns(x, 'http://search.yahoo.com/mrss/') + content = item.find(media_ns_xpath('content')) - video_id = mobj.group('id') - mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id - info_doc = self._download_xml(mrss_url, video_id) + get_furl = lambda x: xpath_attr(content, media_ns_xpath(x), 'url') + formats = [] + for q, w, h in (('bq', 400, 300), ('mq', 512, 384), ('hq', 768, 576)): + q_url = get_furl(q) + if not q_url: + continue + formats.append({ + 'format_id': q, + 'url': q_url, + 'width': w, + 'height': h, + }) + if not formats: + furl = get_furl('player') or content.attrib['url'] + ext = determine_ext(furl) + formats = [{ + 'url': furl, + 'vcodec': 'none' if ext == 'mp3' else None, + 'ext': ext, + }] - self.report_extraction(video_id) - - video_url = info_doc.find('.//{http://search.yahoo.com/mrss/}player').attrib['url'] + thumbnails = [] + for thumbnail in content.findall(media_ns_xpath('thumbnail')): + thumbnail_url = thumbnail.get('url') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'height': int_or_none(thumbnail.get('height')), + 'width': int_or_none(thumbnail.get('width')), + }) return { 'id': video_id, - 'url': video_url, - 'title': info_doc.find('.//title').text, + 'formats': formats, + 'title': title, + 'description': strip_or_none(xpath_text(item, 'description')), + 'thumbnails': thumbnails, } diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index 391c2f5d0..18249cf9b 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -122,9 +122,9 @@ class InfoQIE(BokeCCBaseIE): formats = self._extract_bokecc_formats(webpage, video_id) else: formats = ( - self._extract_rtmp_video(webpage) + - self._extract_http_video(webpage) + - self._extract_http_audio(webpage, video_id)) + self._extract_rtmp_video(webpage) + + self._extract_http_video(webpage) + + self._extract_http_audio(webpage, video_id)) self._sort_formats(formats) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 7e0e838f0..b061850a1 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -22,7 +22,7 @@ from ..utils import ( class InstagramIE(InfoExtractor): - _VALID_URL = r'(?Phttps?://(?:www\.)?instagram\.com/p/(?P[^/?#&]+))' + _VALID_URL = r'(?Phttps?://(?:www\.)?instagram\.com/(?:p|tv)/(?P[^/?#&]+))' _TESTS = [{ 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', 'md5': '0d2da106a9d2631273e192b372806516', @@ -92,6 +92,9 @@ class InstagramIE(InfoExtractor): }, { 'url': 'http://instagram.com/p/9o6LshA7zy/embed/', 'only_matching': True, + }, { + 'url': 'https://www.instagram.com/tv/aye83DjauH/', + 'only_matching': True, }] @staticmethod @@ -227,44 +230,37 @@ class InstagramIE(InfoExtractor): } -class InstagramUserIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P[^/]{2,})/?(?:$|[?#])' - IE_DESC = 'Instagram user profile' - IE_NAME = 'instagram:user' - _TEST = { - 'url': 'https://instagram.com/porsche', - 'info_dict': { - 'id': 'porsche', - 'title': 'porsche', - }, - 'playlist_count': 5, - 'params': { - 'extract_flat': True, - 'skip_download': True, - 'playlistend': 5, - } - } +class InstagramPlaylistIE(InfoExtractor): + # A superclass for handling any kind of query based on GraphQL which + # results in a playlist. - _gis_tmpl = None + _gis_tmpl = None # used to cache GIS request type - def _entries(self, data): + def _parse_graphql(self, webpage, item_id): + # Reads a webpage and returns its GraphQL data. + return self._parse_json( + self._search_regex( + r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'), + item_id) + + def _extract_graphql(self, data, url): + # Parses GraphQL queries containing videos and generates a playlist. def get_count(suffix): return int_or_none(try_get( node, lambda x: x['edge_media_' + suffix]['count'])) - uploader_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id'] + uploader_id = self._match_id(url) csrf_token = data['config']['csrf_token'] rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8' - self._set_cookie('instagram.com', 'ig_pr', '1') - cursor = '' for page_num in itertools.count(1): - variables = json.dumps({ - 'id': uploader_id, + variables = { 'first': 12, 'after': cursor, - }) + } + variables.update(self._query_vars_for(data)) + variables = json.dumps(variables) if self._gis_tmpl: gis_tmpls = [self._gis_tmpl] @@ -276,21 +272,26 @@ class InstagramUserIE(InfoExtractor): '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']), ] + # try all of the ways to generate a GIS query, and not only use the + # first one that works, but cache it for future requests for gis_tmpl in gis_tmpls: try: - media = self._download_json( + json_data = self._download_json( 'https://www.instagram.com/graphql/query/', uploader_id, 'Downloading JSON page %d' % page_num, headers={ 'X-Requested-With': 'XMLHttpRequest', 'X-Instagram-GIS': hashlib.md5( ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest(), }, query={ - 'query_hash': '42323d64886122307be10013ad2dcc44', + 'query_hash': self._QUERY_HASH, 'variables': variables, - })['data']['user']['edge_owner_to_timeline_media'] + }) + media = self._parse_timeline_from(json_data) self._gis_tmpl = gis_tmpl break except ExtractorError as e: + # if it's an error caused by a bad query, and there are + # more GIS templates to try, ignore it and keep trying if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: if gis_tmpl != gis_tmpls[-1]: continue @@ -348,14 +349,80 @@ class InstagramUserIE(InfoExtractor): break def _real_extract(self, url): - username = self._match_id(url) + user_or_tag = self._match_id(url) + webpage = self._download_webpage(url, user_or_tag) + data = self._parse_graphql(webpage, user_or_tag) - webpage = self._download_webpage(url, username) - - data = self._parse_json( - self._search_regex( - r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'), - username) + self._set_cookie('instagram.com', 'ig_pr', '1') return self.playlist_result( - self._entries(data), username, username) + self._extract_graphql(data, url), user_or_tag, user_or_tag) + + +class InstagramUserIE(InstagramPlaylistIE): + _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P[^/]{2,})/?(?:$|[?#])' + IE_DESC = 'Instagram user profile' + IE_NAME = 'instagram:user' + _TEST = { + 'url': 'https://instagram.com/porsche', + 'info_dict': { + 'id': 'porsche', + 'title': 'porsche', + }, + 'playlist_count': 5, + 'params': { + 'extract_flat': True, + 'skip_download': True, + 'playlistend': 5, + } + } + + _QUERY_HASH = '42323d64886122307be10013ad2dcc44', + + @staticmethod + def _parse_timeline_from(data): + # extracts the media timeline data from a GraphQL result + return data['data']['user']['edge_owner_to_timeline_media'] + + @staticmethod + def _query_vars_for(data): + # returns a dictionary of variables to add to the timeline query based + # on the GraphQL of the original page + return { + 'id': data['entry_data']['ProfilePage'][0]['graphql']['user']['id'] + } + + +class InstagramTagIE(InstagramPlaylistIE): + _VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P[^/]+)' + IE_DESC = 'Instagram hashtag search' + IE_NAME = 'instagram:tag' + _TEST = { + 'url': 'https://instagram.com/explore/tags/lolcats', + 'info_dict': { + 'id': 'lolcats', + 'title': 'lolcats', + }, + 'playlist_count': 50, + 'params': { + 'extract_flat': True, + 'skip_download': True, + 'playlistend': 50, + } + } + + _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314', + + @staticmethod + def _parse_timeline_from(data): + # extracts the media timeline data from a GraphQL result + return data['data']['hashtag']['edge_hashtag_to_media'] + + @staticmethod + def _query_vars_for(data): + # returns a dictionary of variables to add to the timeline query based + # on the GraphQL of the original page + return { + 'tag_name': + data['entry_data']['TagPage'][0]['graphql']['hashtag']['name'] + } diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index 1d58d6e85..11bbeb592 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -12,7 +12,7 @@ from ..utils import ( class IPrimaIE(InfoExtractor): - _VALID_URL = r'https?://(?:play|prima)\.iprima\.cz/(?:.+/)?(?P[^?#]+)' + _VALID_URL = r'https?://(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P[^/?#&]+)' _GEO_BYPASS = False _TESTS = [{ @@ -41,6 +41,24 @@ class IPrimaIE(InfoExtractor): # iframe prima.iprima.cz 'url': 'https://prima.iprima.cz/porady/jak-se-stavi-sen/rodina-rathousova-praha', 'only_matching': True, + }, { + 'url': 'http://www.iprima.cz/filmy/desne-rande', + 'only_matching': True, + }, { + 'url': 'https://zoom.iprima.cz/10-nejvetsich-tajemstvi-zahad/posvatna-mista-a-stavby', + 'only_matching': True, + }, { + 'url': 'https://krimi.iprima.cz/mraz-0/sebevrazdy', + 'only_matching': True, + }, { + 'url': 'https://cool.iprima.cz/derava-silnice-nevadi', + 'only_matching': True, + }, { + 'url': 'https://love.iprima.cz/laska-az-za-hrob/slib-dany-bratrovi', + 'only_matching': True, + }, { + 'url': 'https://autosalon.iprima.cz/motorsport/7-epizoda-1', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 4b081bd46..cd11aa70f 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -383,9 +383,9 @@ class IqiyiIE(InfoExtractor): self._sleep(5, video_id) self._sort_formats(formats) - title = (get_element_by_id('widget-videotitle', webpage) or - clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage)) or - self._html_search_regex(r']+data-videochanged-title="word"[^>]*>([^<]+)', webpage, 'title')) + title = (get_element_by_id('widget-videotitle', webpage) + or clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage)) + or self._html_search_regex(r']+data-videochanged-title="word"[^>]*>([^<]+)', webpage, 'title')) return { 'id': video_id, diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index de65b6bb4..ad2f4eca5 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -77,10 +77,10 @@ class ITVIE(InfoExtractor): return etree.SubElement(element, _add_ns(name)) production_id = ( - params.get('data-video-autoplay-id') or - '%s#001' % ( - params.get('data-video-episode-id') or - video_id.replace('a', '/'))) + params.get('data-video-autoplay-id') + or '%s#001' % ( + params.get('data-video-episode-id') + or video_id.replace('a', '/'))) req_env = etree.Element(_add_ns('soapenv:Envelope')) _add_sub_element(req_env, 'soapenv:Header') diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index cb51cef2d..86c014b07 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -15,7 +15,7 @@ from ..utils import ( class IviIE(InfoExtractor): IE_DESC = 'ivi.ru' IE_NAME = 'ivi' - _VALID_URL = r'https?://(?:www\.)?ivi\.ru/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?ivi\.(?:ru|tv)/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P\d+)' _GEO_BYPASS = False _GEO_COUNTRIES = ['RU'] @@ -65,7 +65,11 @@ class IviIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', }, 'skip': 'Only works from Russia', - } + }, + { + 'url': 'https://www.ivi.tv/watch/33560/', + 'only_matching': True, + }, ] # Sorted by quality diff --git a/youtube_dl/extractor/joj.py b/youtube_dl/extractor/joj.py index d9f8dbfd2..62b28e980 100644 --- a/youtube_dl/extractor/joj.py +++ b/youtube_dl/extractor/joj.py @@ -61,7 +61,7 @@ class JojIE(InfoExtractor): bitrates = self._parse_json( self._search_regex( - r'(?s)bitrates\s*=\s*({.+?});', webpage, 'bitrates', + r'(?s)(?:src|bitrates)\s*=\s*({.+?});', webpage, 'bitrates', default='{}'), video_id, transform_source=js_to_json, fatal=False) diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index 63d0dc998..647b905f1 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -7,8 +7,8 @@ from .common import InfoExtractor class JWPlatformIE(InfoExtractor): - _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P[a-zA-Z0-9]{8})' - _TEST = { + _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview|video)s|jw6|v2/media)/|jwplatform:)(?P[a-zA-Z0-9]{8})' + _TESTS = [{ 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', 'md5': 'fa8899fa601eb7c83a64e9d568bdf325', 'info_dict': { @@ -19,7 +19,10 @@ class JWPlatformIE(InfoExtractor): 'upload_date': '20081127', 'timestamp': 1227796140, } - } + }, { + 'url': 'https://cdn.jwplayer.com/players/nPripu9l-ALJ3XQCI.js', + 'only_matching': True, + }] @staticmethod def _extract_url(webpage): @@ -34,5 +37,5 @@ class JWPlatformIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id) + json_data = self._download_json('https://cdn.jwplayer.com/v2/media/' + video_id, video_id) return self._parse_jwplayer_data(json_data, video_id) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 04f68fce4..0a733424c 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -103,6 +103,11 @@ class KalturaIE(InfoExtractor): { 'url': 'https://www.kaltura.com:443/index.php/extwidget/preview/partner_id/1770401/uiconf_id/37307382/entry_id/0_58u8kme7/embed/iframe?&flashvars[streamerType]=auto', 'only_matching': True, + }, + { + # unavailable source format + 'url': 'kaltura:513551:1_66x4rg7o', + 'only_matching': True, } ] @@ -118,8 +123,8 @@ class KalturaIE(InfoExtractor): (?P['"])_?(?P(?:(?!(?P=q2)).)+)(?P=q2),.*? (?P['"])entry_?[Ii]d(?P=q3)\s*:\s* (?P['"])(?P(?:(?!(?P=q4)).)+)(?P=q4)(?:,|\s*\}) - """, webpage) or - re.search( + """, webpage) + or re.search( r'''(?xs) (?P["']) (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com(?::\d+)?/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P\d+)(?:(?!(?P=q1)).)* @@ -132,8 +137,8 @@ class KalturaIE(InfoExtractor): \[\s*(?P["'])entry_?[Ii]d(?P=q2_1)\s*\]\s*=\s* ) (?P["'])(?P(?:(?!(?P=q3)).)+)(?P=q3) - ''', webpage) or - re.search( + ''', webpage) + or re.search( r'''(?xs) <(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P["']) (?:https?:)?//(?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P\d+) @@ -145,6 +150,8 @@ class KalturaIE(InfoExtractor): ) if mobj: embed_info = mobj.groupdict() + for k, v in embed_info.items(): + embed_info[k] = v.strip() url = 'kaltura:%(partner_id)s:%(id)s' % embed_info escaped_pid = re.escape(embed_info['partner_id']) service_url = re.search( @@ -192,6 +199,8 @@ class KalturaIE(InfoExtractor): 'entryId': video_id, 'service': 'baseentry', 'ks': '{1:result:ks}', + 'responseProfile:fields': 'createdAt,dataUrl,duration,name,plays,thumbnailUrl,userId', + 'responseProfile:type': 1, }, { 'action': 'getbyentryid', @@ -302,12 +311,17 @@ class KalturaIE(InfoExtractor): f['fileExt'] = 'mp4' video_url = sign_url( '%s/flavorId/%s' % (data_url, f['id'])) + format_id = '%(fileExt)s-%(bitrate)s' % f + # Source format may not be available (e.g. kaltura:513551:1_66x4rg7o) + if f.get('isOriginal') is True and not self._is_valid_url( + video_url, entry_id, format_id): + continue # audio-only has no videoCodecId (e.g. kaltura:1926081:0_c03e1b5g # -f mp4-56) vcodec = 'none' if 'videoCodecId' not in f and f.get( 'frameRate') == 0 else f.get('videoCodecId') formats.append({ - 'format_id': '%(fileExt)s-%(bitrate)s' % f, + 'format_id': format_id, 'ext': f.get('fileExt'), 'tbr': int_or_none(f['bitrate']), 'fps': int_or_none(f.get('frameRate')), diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py index f236a2f78..7b291e0a0 100644 --- a/youtube_dl/extractor/karrierevideos.py +++ b/youtube_dl/extractor/karrierevideos.py @@ -47,8 +47,8 @@ class KarriereVideosIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - title = (self._html_search_meta('title', webpage, default=None) or - self._search_regex(r'

([^<]+)

', webpage, 'video title')) + title = (self._html_search_meta('title', webpage, default=None) + or self._search_regex(r'

([^<]+)

', webpage, 'video title')) video_id = self._search_regex( r'/config/video/(.+?)\.xml', webpage, 'video id') diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 63e10125e..cc5b2a1c1 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -215,7 +215,7 @@ class KuwoSingerIE(InfoExtractor): 'title': 'Ali', }, 'playlist_mincount': 95, - 'skip': 'Regularly stalls travis build', # See https://travis-ci.org/rg3/youtube-dl/jobs/78878540 + 'skip': 'Regularly stalls travis build', # See https://travis-ci.org/ytdl-org/youtube-dl/jobs/78878540 }] PAGE_SIZE = 15 diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index c7f813370..fa217365a 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import json +import re from .common import InfoExtractor from ..utils import ( @@ -32,7 +33,8 @@ class Laola1TvEmbedIE(InfoExtractor): def _extract_token_url(self, stream_access_url, video_id, data): return self._download_json( - stream_access_url, video_id, headers={ + self._proto_relative_url(stream_access_url, 'https:'), video_id, + headers={ 'Content-Type': 'application/json', }, data=json.dumps(data).encode())['data']['stream-access'][0] @@ -119,9 +121,59 @@ class Laola1TvEmbedIE(InfoExtractor): } -class Laola1TvIE(Laola1TvEmbedIE): +class Laola1TvBaseIE(Laola1TvEmbedIE): + def _extract_video(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + if 'Dieser Livestream ist bereits beendet.' in webpage: + raise ExtractorError('This live stream has already finished.', expected=True) + + conf = self._parse_json(self._search_regex( + r'(?s)conf\s*=\s*({.+?});', webpage, 'conf'), + display_id, + transform_source=lambda s: js_to_json(re.sub(r'shareurl:.+,', '', s))) + video_id = conf['videoid'] + + config = self._download_json(conf['configUrl'], video_id, query={ + 'videoid': video_id, + 'partnerid': conf['partnerid'], + 'language': conf.get('language', ''), + 'portal': conf.get('portalid', ''), + }) + error = config.get('error') + if error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + video_data = config['video'] + title = video_data['title'] + is_live = video_data.get('isLivestream') and video_data.get('isLive') + meta = video_data.get('metaInformation') + sports = meta.get('sports') + categories = sports.split(',') if sports else [] + + token_url = self._extract_token_url( + video_data['streamAccess'], video_id, + video_data['abo']['required']) + + formats = self._extract_formats(token_url, video_id) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': self._live_title(title) if is_live else title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('image'), + 'categories': categories, + 'formats': formats, + 'is_live': is_live, + } + + +class Laola1TvIE(Laola1TvBaseIE): IE_NAME = 'laola1tv' _VALID_URL = r'https?://(?:www\.)?laola1\.tv/[a-z]+-[a-z]+/[^/]+/(?P[^/?#&]+)' + _TESTS = [{ 'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie/227883.html', 'info_dict': { @@ -169,52 +221,30 @@ class Laola1TvIE(Laola1TvEmbedIE): }] def _real_extract(self, url): - display_id = self._match_id(url) + return self._extract_video(url) - webpage = self._download_webpage(url, display_id) - if 'Dieser Livestream ist bereits beendet.' in webpage: - raise ExtractorError('This live stream has already finished.', expected=True) +class EHFTVIE(Laola1TvBaseIE): + IE_NAME = 'ehftv' + _VALID_URL = r'https?://(?:www\.)?ehftv\.com/[a-z]+(?:-[a-z]+)?/[^/]+/(?P[^/?#&]+)' - conf = self._parse_json(self._search_regex( - r'(?s)conf\s*=\s*({.+?});', webpage, 'conf'), - display_id, js_to_json) + _TESTS = [{ + 'url': 'https://www.ehftv.com/int/video/paris-saint-germain-handball-pge-vive-kielce/1166761', + 'info_dict': { + 'id': '1166761', + 'display_id': 'paris-saint-germain-handball-pge-vive-kielce', + 'ext': 'mp4', + 'title': 'Paris Saint-Germain Handball - PGE Vive Kielce', + 'is_live': False, + 'categories': ['Handball'], + }, + 'params': { + 'skip_download': True, + }, + }] - video_id = conf['videoid'] - - config = self._download_json(conf['configUrl'], video_id, query={ - 'videoid': video_id, - 'partnerid': conf['partnerid'], - 'language': conf.get('language', ''), - 'portal': conf.get('portalid', ''), - }) - error = config.get('error') - if error: - raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) - - video_data = config['video'] - title = video_data['title'] - is_live = video_data.get('isLivestream') and video_data.get('isLive') - meta = video_data.get('metaInformation') - sports = meta.get('sports') - categories = sports.split(',') if sports else [] - - token_url = self._extract_token_url( - video_data['streamAccess'], video_id, - video_data['abo']['required']) - - formats = self._extract_formats(token_url, video_id) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': self._live_title(title) if is_live else title, - 'description': video_data.get('description'), - 'thumbnail': video_data.get('image'), - 'categories': categories, - 'formats': formats, - 'is_live': is_live, - } + def _real_extract(self, url): + return self._extract_video(url) class ITTFIE(InfoExtractor): diff --git a/youtube_dl/extractor/lecturio.py b/youtube_dl/extractor/lecturio.py new file mode 100644 index 000000000..6ed7da4ab --- /dev/null +++ b/youtube_dl/extractor/lecturio.py @@ -0,0 +1,244 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + clean_html, + determine_ext, + ExtractorError, + float_or_none, + int_or_none, + str_or_none, + url_or_none, + urlencode_postdata, + urljoin, +) + + +class LecturioBaseIE(InfoExtractor): + _API_BASE_URL = 'https://app.lecturio.com/api/en/latest/html5/' + _LOGIN_URL = 'https://app.lecturio.com/en/login' + _NETRC_MACHINE = 'lecturio' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + # Sets some cookies + _, urlh = self._download_webpage_handle( + self._LOGIN_URL, None, 'Downloading login popup') + + def is_logged(url_handle): + return self._LOGIN_URL not in compat_str(url_handle.geturl()) + + # Already logged in + if is_logged(urlh): + return + + login_form = { + 'signin[email]': username, + 'signin[password]': password, + 'signin[remember]': 'on', + } + + response, urlh = self._download_webpage_handle( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(login_form)) + + # Logged in successfully + if is_logged(urlh): + return + + errors = self._html_search_regex( + r'(?s)]+class=["\']error_list[^>]+>(.+?)', response, + 'errors', default=None) + if errors: + raise ExtractorError('Unable to login: %s' % errors, expected=True) + raise ExtractorError('Unable to log in') + + +class LecturioIE(LecturioBaseIE): + _VALID_URL = r'''(?x) + https:// + (?: + app\.lecturio\.com/([^/]+/(?P[^/?#&]+)\.lecture|(?:\#/)?lecture/c/\d+/(?P\d+))| + (?:www\.)?lecturio\.de/[^/]+/(?P[^/?#&]+)\.vortrag + ) + ''' + _TESTS = [{ + 'url': 'https://app.lecturio.com/medical-courses/important-concepts-and-terms-introduction-to-microbiology.lecture#tab/videos', + 'md5': '9a42cf1d8282a6311bf7211bbde26fde', + 'info_dict': { + 'id': '39634', + 'ext': 'mp4', + 'title': 'Important Concepts and Terms — Introduction to Microbiology', + }, + 'skip': 'Requires lecturio account credentials', + }, { + 'url': 'https://www.lecturio.de/jura/oeffentliches-recht-staatsexamen.vortrag', + 'only_matching': True, + }, { + 'url': 'https://app.lecturio.com/#/lecture/c/6434/39634', + 'only_matching': True, + }] + + _CC_LANGS = { + 'Arabic': 'ar', + 'Bulgarian': 'bg', + 'German': 'de', + 'English': 'en', + 'Spanish': 'es', + 'Persian': 'fa', + 'French': 'fr', + 'Japanese': 'ja', + 'Polish': 'pl', + 'Pashto': 'ps', + 'Russian': 'ru', + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + nt = mobj.group('nt') or mobj.group('nt_de') + lecture_id = mobj.group('id') + display_id = nt or lecture_id + api_path = 'lectures/' + lecture_id if lecture_id else 'lecture/' + nt + '.json' + video = self._download_json( + self._API_BASE_URL + api_path, display_id) + title = video['title'].strip() + if not lecture_id: + pid = video.get('productId') or video.get('uid') + if pid: + spid = pid.split('_') + if spid and len(spid) == 2: + lecture_id = spid[1] + + formats = [] + for format_ in video['content']['media']: + if not isinstance(format_, dict): + continue + file_ = format_.get('file') + if not file_: + continue + ext = determine_ext(file_) + if ext == 'smil': + # smil contains only broken RTMP formats anyway + continue + file_url = url_or_none(file_) + if not file_url: + continue + label = str_or_none(format_.get('label')) + filesize = int_or_none(format_.get('fileSize')) + f = { + 'url': file_url, + 'format_id': label, + 'filesize': float_or_none(filesize, invscale=1000) + } + if label: + mobj = re.match(r'(\d+)p\s*\(([^)]+)\)', label) + if mobj: + f.update({ + 'format_id': mobj.group(2), + 'height': int(mobj.group(1)), + }) + formats.append(f) + self._sort_formats(formats) + + subtitles = {} + automatic_captions = {} + captions = video.get('captions') or [] + for cc in captions: + cc_url = cc.get('url') + if not cc_url: + continue + cc_label = cc.get('translatedCode') + lang = cc.get('languageCode') or self._search_regex( + r'/([a-z]{2})_', cc_url, 'lang', + default=cc_label.split()[0] if cc_label else 'en') + original_lang = self._search_regex( + r'/[a-z]{2}_([a-z]{2})_', cc_url, 'original lang', + default=None) + sub_dict = (automatic_captions + if 'auto-translated' in cc_label or original_lang + else subtitles) + sub_dict.setdefault(self._CC_LANGS.get(lang, lang), []).append({ + 'url': cc_url, + }) + + return { + 'id': lecture_id or nt, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'automatic_captions': automatic_captions, + } + + +class LecturioCourseIE(LecturioBaseIE): + _VALID_URL = r'https://app\.lecturio\.com/(?:[^/]+/(?P[^/?#&]+)\.course|(?:#/)?course/c/(?P\d+))' + _TESTS = [{ + 'url': 'https://app.lecturio.com/medical-courses/microbiology-introduction.course#/', + 'info_dict': { + 'id': 'microbiology-introduction', + 'title': 'Microbiology: Introduction', + 'description': 'md5:13da8500c25880c6016ae1e6d78c386a', + }, + 'playlist_count': 45, + 'skip': 'Requires lecturio account credentials', + }, { + 'url': 'https://app.lecturio.com/#/course/c/6434', + 'only_matching': True, + }] + + def _real_extract(self, url): + nt, course_id = re.match(self._VALID_URL, url).groups() + display_id = nt or course_id + api_path = 'courses/' + course_id if course_id else 'course/content/' + nt + '.json' + course = self._download_json( + self._API_BASE_URL + api_path, display_id) + entries = [] + for lecture in course.get('lectures', []): + lecture_id = str_or_none(lecture.get('id')) + lecture_url = lecture.get('url') + if lecture_url: + lecture_url = urljoin(url, lecture_url) + else: + lecture_url = 'https://app.lecturio.com/#/lecture/c/%s/%s' % (course_id, lecture_id) + entries.append(self.url_result( + lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id)) + return self.playlist_result( + entries, display_id, course.get('title'), + clean_html(course.get('description'))) + + +class LecturioDeCourseIE(LecturioBaseIE): + _VALID_URL = r'https://(?:www\.)?lecturio\.de/[^/]+/(?P[^/?#&]+)\.kurs' + _TEST = { + 'url': 'https://www.lecturio.de/jura/grundrechte.kurs', + 'only_matching': True, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + entries = [] + for mobj in re.finditer( + r'(?s)]+\bdata-lecture-id=["\'](?P\d+).+?\bhref=(["\'])(?P(?:(?!\2).)+\.vortrag)\b[^>]+>', + webpage): + lecture_url = urljoin(url, mobj.group('url')) + lecture_id = mobj.group('id') + entries.append(self.url_result( + lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id)) + + title = self._search_regex( + r']*>([^<]+)', webpage, 'title', default=None) + + return self.playlist_result(entries, display_id, title) diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index 8dd1ce0d0..7dc0ad794 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -326,7 +326,7 @@ class LetvCloudIE(InfoExtractor): elif play_json.get('code'): raise ExtractorError('Letv cloud returned error %d' % play_json['code'], expected=True) else: - raise ExtractorError('Letv cloud returned an unknwon error') + raise ExtractorError('Letv cloud returned an unknown error') def b64decode(s): return compat_b64decode(s).decode('utf-8') diff --git a/youtube_dl/extractor/libraryofcongress.py b/youtube_dl/extractor/libraryofcongress.py index 40295a30b..03f205144 100644 --- a/youtube_dl/extractor/libraryofcongress.py +++ b/youtube_dl/extractor/libraryofcongress.py @@ -16,16 +16,15 @@ from ..utils import ( class LibraryOfCongressIE(InfoExtractor): IE_NAME = 'loc' IE_DESC = 'Library of Congress' - _VALID_URL = r'https?://(?:www\.)?loc\.gov/(?:item/|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?loc\.gov/(?:item/|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P[0-9a-z_.]+)' _TESTS = [{ # embedded via
.+?)\1', r']+id=(["\'])uuid-(?P.+?)\1', r']+data-uuid=(["\'])(?P.+?)\1', - r'mediaObjectId\s*:\s*(["\'])(?P.+?)\1'), + r'mediaObjectId\s*:\s*(["\'])(?P.+?)\1', + r'data-tab="share-media-(?P[0-9A-F]{32})"'), webpage, 'media id', group='id') data = self._download_json( 'https://media.loc.gov/services/v1/media?id=%s&context=json' % media_id, - video_id)['mediaObject'] + media_id)['mediaObject'] derivative = data['derivatives'][0] media_url = derivative['derivativeUrl'] @@ -89,25 +95,29 @@ class LibraryOfCongressIE(InfoExtractor): if ext not in ('mp4', 'mp3'): media_url += '.mp4' if is_video else '.mp3' - if 'vod/mp4:' in media_url: - formats = [{ - 'url': media_url.replace('vod/mp4:', 'hls-vod/media/') + '.m3u8', + formats = [] + if '/vod/mp4:' in media_url: + formats.append({ + 'url': media_url.replace('/vod/mp4:', '/hls-vod/media/') + '.m3u8', 'format_id': 'hls', 'ext': 'mp4', 'protocol': 'm3u8_native', 'quality': 1, - }] - elif 'vod/mp3:' in media_url: - formats = [{ - 'url': media_url.replace('vod/mp3:', ''), - 'vcodec': 'none', - }] + }) + http_format = { + 'url': re.sub(r'(://[^/]+/)(?:[^/]+/)*(?:mp4|mp3):', r'\1', media_url), + 'format_id': 'http', + 'quality': 1, + } + if not is_video: + http_format['vcodec'] = 'none' + formats.append(http_format) download_urls = set() for m in re.finditer( r']+value=(["\'])(?P.+?)\1[^>]+data-file-download=[^>]+>\s*(?P.+?)(?:(?: |\s+)\((?P.+?)\))?\s*<', webpage): format_id = m.group('id').lower() - if format_id == 'gif': + if format_id in ('gif', 'jpeg'): continue download_url = m.group('url') if download_url in download_urls: diff --git a/youtube_dl/extractor/libsyn.py b/youtube_dl/extractor/libsyn.py index f7311f483..2cf444258 100644 --- a/youtube_dl/extractor/libsyn.py +++ b/youtube_dl/extractor/libsyn.py @@ -1,12 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -import json import re from .common import InfoExtractor from ..utils import ( + clean_html, + get_element_by_class, parse_duration, + strip_or_none, unified_strdate, ) @@ -21,7 +23,9 @@ class LibsynIE(InfoExtractor): 'id': '6385796', 'ext': 'mp3', 'title': "Champion Minded - Developing a Growth Mindset", - 'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.', + # description fetched using another request: + # http://html5-player.libsyn.com/embed/getitemdetails?item_id=6385796 + # 'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.', 'upload_date': '20180320', 'thumbnail': 're:^https?://.*', }, @@ -38,22 +42,36 @@ class LibsynIE(InfoExtractor): }] def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('id') - url = m.group('mainurl') + url, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, video_id) - podcast_title = self._search_regex( - r'

([^<]+)

', webpage, 'podcast title', default=None) - if podcast_title: - podcast_title = podcast_title.strip() - episode_title = self._search_regex( - r'(?:
|

)([^<]+)(.+?)'], + webpage, 'episode title') + episode_title = episode_title.strip() + + podcast_title = strip_or_none(clean_html(self._search_regex( + r'

([^<]+)

', webpage, 'podcast title', + default=None) or get_element_by_class('podcast-title', webpage))) title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title + formats = [] + for k, format_id in (('media_url_libsyn', 'libsyn'), ('media_url', 'main'), ('download_link', 'download')): + f_url = data.get(k) + if not f_url: + continue + formats.append({ + 'url': f_url, + 'format_id': format_id, + }) + description = self._html_search_regex( r'(.+?)

', webpage, 'description', default=None) @@ -61,27 +79,15 @@ class LibsynIE(InfoExtractor): # Strip non-breaking and normal spaces description = description.replace('\u00A0', ' ').strip() release_date = unified_strdate(self._search_regex( - r'
Released: ([^<]+)<', webpage, 'release date', fatal=False)) - - data_json = self._search_regex(r'var\s+playlistItem\s*=\s*(\{.*?\});\n', webpage, 'JSON data block') - data = json.loads(data_json) - - formats = [{ - 'url': data['media_url'], - 'format_id': 'main', - }, { - 'url': data['media_url_libsyn'], - 'format_id': 'libsyn', - }] - thumbnail = data.get('thumbnail_url') - duration = parse_duration(data.get('duration')) + r'
Released: ([^<]+)<', + webpage, 'release date', default=None) or data.get('release_date')) return { 'id': video_id, 'title': title, 'description': description, - 'thumbnail': thumbnail, + 'thumbnail': data.get('thumbnail_url'), 'upload_date': release_date, - 'duration': duration, + 'duration': parse_duration(data.get('duration')), 'formats': formats, } diff --git a/youtube_dl/extractor/linkedin.py b/youtube_dl/extractor/linkedin.py new file mode 100644 index 000000000..26fc703d1 --- /dev/null +++ b/youtube_dl/extractor/linkedin.py @@ -0,0 +1,182 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + urlencode_postdata, + urljoin, +) + + +class LinkedInLearningBaseIE(InfoExtractor): + _NETRC_MACHINE = 'linkedin' + _LOGIN_URL = 'https://www.linkedin.com/uas/login?trk=learning' + + def _call_api(self, course_slug, fields, video_slug=None, resolution=None): + query = { + 'courseSlug': course_slug, + 'fields': fields, + 'q': 'slugs', + } + sub = '' + if video_slug: + query.update({ + 'videoSlug': video_slug, + 'resolution': '_%s' % resolution, + }) + sub = ' %dp' % resolution + api_url = 'https://www.linkedin.com/learning-api/detailedCourses' + return self._download_json( + api_url, video_slug, 'Downloading%s JSON metadata' % sub, headers={ + 'Csrf-Token': self._get_cookies(api_url)['JSESSIONID'].value, + }, query=query)['elements'][0] + + def _get_urn_id(self, video_data): + urn = video_data.get('urn') + if urn: + mobj = re.search(r'urn:li:lyndaCourse:\d+,(\d+)', urn) + if mobj: + return mobj.group(1) + + def _get_video_id(self, video_data, course_slug, video_slug): + return self._get_urn_id(video_data) or '%s/%s' % (course_slug, video_slug) + + def _real_initialize(self): + email, password = self._get_login_info() + if email is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + action_url = urljoin(self._LOGIN_URL, self._search_regex( + r']+action=(["\'])(?P.+?)\1', login_page, 'post url', + default='https://www.linkedin.com/uas/login-submit', group='url')) + data = self._hidden_inputs(login_page) + data.update({ + 'session_key': email, + 'session_password': password, + }) + login_submit_page = self._download_webpage( + action_url, None, 'Logging in', + data=urlencode_postdata(data)) + error = self._search_regex( + r']+class="error"[^>]*>\s*(.+?)\s*', + login_submit_page, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + + +class LinkedInLearningIE(LinkedInLearningBaseIE): + IE_NAME = 'linkedin:learning' + _VALID_URL = r'https?://(?:www\.)?linkedin\.com/learning/(?P[^/]+)/(?P[^/?#]+)' + _TEST = { + 'url': 'https://www.linkedin.com/learning/programming-foundations-fundamentals/welcome?autoplay=true', + 'md5': 'a1d74422ff0d5e66a792deb996693167', + 'info_dict': { + 'id': '90426', + 'ext': 'mp4', + 'title': 'Welcome', + 'timestamp': 1430396150.82, + 'upload_date': '20150430', + }, + } + + def _real_extract(self, url): + course_slug, video_slug = re.match(self._VALID_URL, url).groups() + + video_data = None + formats = [] + for width, height in ((640, 360), (960, 540), (1280, 720)): + video_data = self._call_api( + course_slug, 'selectedVideo', video_slug, height)['selectedVideo'] + + video_url_data = video_data.get('url') or {} + progressive_url = video_url_data.get('progressiveUrl') + if progressive_url: + formats.append({ + 'format_id': 'progressive-%dp' % height, + 'url': progressive_url, + 'height': height, + 'width': width, + 'source_preference': 1, + }) + + title = video_data['title'] + + audio_url = video_data.get('audio', {}).get('progressiveUrl') + if audio_url: + formats.append({ + 'abr': 64, + 'ext': 'm4a', + 'format_id': 'audio', + 'url': audio_url, + 'vcodec': 'none', + }) + + streaming_url = video_url_data.get('streamingUrl') + if streaming_url: + formats.extend(self._extract_m3u8_formats( + streaming_url, video_slug, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + + self._sort_formats(formats, ('width', 'height', 'source_preference', 'tbr', 'abr')) + + return { + 'id': self._get_video_id(video_data, course_slug, video_slug), + 'title': title, + 'formats': formats, + 'thumbnail': video_data.get('defaultThumbnail'), + 'timestamp': float_or_none(video_data.get('publishedOn'), 1000), + 'duration': int_or_none(video_data.get('durationInSeconds')), + } + + +class LinkedInLearningCourseIE(LinkedInLearningBaseIE): + IE_NAME = 'linkedin:learning:course' + _VALID_URL = r'https?://(?:www\.)?linkedin\.com/learning/(?P[^/?#]+)' + _TEST = { + 'url': 'https://www.linkedin.com/learning/programming-foundations-fundamentals', + 'info_dict': { + 'id': 'programming-foundations-fundamentals', + 'title': 'Programming Foundations: Fundamentals', + 'description': 'md5:76e580b017694eb89dc8e8923fff5c86', + }, + 'playlist_mincount': 61, + } + + @classmethod + def suitable(cls, url): + return False if LinkedInLearningIE.suitable(url) else super(LinkedInLearningCourseIE, cls).suitable(url) + + def _real_extract(self, url): + course_slug = self._match_id(url) + course_data = self._call_api(course_slug, 'chapters,description,title') + + entries = [] + for chapter_number, chapter in enumerate(course_data.get('chapters', []), 1): + chapter_title = chapter.get('title') + chapter_id = self._get_urn_id(chapter) + for video in chapter.get('videos', []): + video_slug = video.get('slug') + if not video_slug: + continue + entries.append({ + '_type': 'url_transparent', + 'id': self._get_video_id(video, course_slug, video_slug), + 'title': video.get('title'), + 'url': 'https://www.linkedin.com/learning/%s/%s' % (course_slug, video_slug), + 'chapter': chapter_title, + 'chapter_number': chapter_number, + 'chapter_id': chapter_id, + 'ie_key': LinkedInLearningIE.ie_key(), + }) + + return self.playlist_result( + entries, course_slug, + course_data.get('title'), + course_data.get('description')) diff --git a/youtube_dl/extractor/linuxacademy.py b/youtube_dl/extractor/linuxacademy.py new file mode 100644 index 000000000..a78c6556e --- /dev/null +++ b/youtube_dl/extractor/linuxacademy.py @@ -0,0 +1,174 @@ +from __future__ import unicode_literals + +import json +import random +import re + +from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_HTTPError, + compat_str, +) +from ..utils import ( + ExtractorError, + orderedSet, + unescapeHTML, + urlencode_postdata, + urljoin, +) + + +class LinuxAcademyIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?linuxacademy\.com/cp/ + (?: + courses/lesson/course/(?P\d+)/lesson/(?P\d+)| + modules/view/id/(?P\d+) + ) + ''' + _TESTS = [{ + 'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2/module/154', + 'info_dict': { + 'id': '1498-2', + 'ext': 'mp4', + 'title': "Introduction to the Practitioner's Brief", + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires Linux Academy account credentials', + }, { + 'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2', + 'only_matching': True, + }, { + 'url': 'https://linuxacademy.com/cp/modules/view/id/154', + 'info_dict': { + 'id': '154', + 'title': 'AWS Certified Cloud Practitioner', + 'description': 'md5:039db7e60e4aac9cf43630e0a75fa834', + }, + 'playlist_count': 41, + 'skip': 'Requires Linux Academy account credentials', + }] + + _AUTHORIZE_URL = 'https://login.linuxacademy.com/authorize' + _ORIGIN_URL = 'https://linuxacademy.com' + _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx' + _NETRC_MACHINE = 'linuxacademy' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + def random_string(): + return ''.join([ + random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~') + for _ in range(32)]) + + webpage, urlh = self._download_webpage_handle( + self._AUTHORIZE_URL, None, 'Downloading authorize page', query={ + 'client_id': self._CLIENT_ID, + 'response_type': 'token id_token', + 'redirect_uri': self._ORIGIN_URL, + 'scope': 'openid email user_impersonation profile', + 'audience': self._ORIGIN_URL, + 'state': random_string(), + 'nonce': random_string(), + }) + + login_data = self._parse_json( + self._search_regex( + r'atob\(\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'login info', group='value'), None, + transform_source=lambda x: compat_b64decode(x).decode('utf-8') + )['extraParams'] + + login_data.update({ + 'client_id': self._CLIENT_ID, + 'redirect_uri': self._ORIGIN_URL, + 'tenant': 'lacausers', + 'connection': 'Username-Password-Authentication', + 'username': username, + 'password': password, + 'sso': 'true', + }) + + login_state_url = compat_str(urlh.geturl()) + + try: + login_page = self._download_webpage( + 'https://login.linuxacademy.com/usernamepassword/login', None, + 'Downloading login page', data=json.dumps(login_data).encode(), + headers={ + 'Content-Type': 'application/json', + 'Origin': 'https://login.linuxacademy.com', + 'Referer': login_state_url, + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + error = self._parse_json(e.cause.read(), None) + message = error.get('description') or error['code'] + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, message), expected=True) + raise + + callback_page, urlh = self._download_webpage_handle( + 'https://login.linuxacademy.com/login/callback', None, + 'Downloading callback page', + data=urlencode_postdata(self._hidden_inputs(login_page)), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Origin': 'https://login.linuxacademy.com', + 'Referer': login_state_url, + }) + + access_token = self._search_regex( + r'access_token=([^=&]+)', compat_str(urlh.geturl()), + 'access token') + + self._download_webpage( + 'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s' + % access_token, None, 'Downloading token validation page') + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + chapter_id, lecture_id, course_id = mobj.group('chapter_id', 'lesson_id', 'course_id') + item_id = course_id if course_id else '%s-%s' % (chapter_id, lecture_id) + + webpage = self._download_webpage(url, item_id) + + # course path + if course_id: + entries = [ + self.url_result( + urljoin(url, lesson_url), ie=LinuxAcademyIE.ie_key()) + for lesson_url in orderedSet(re.findall( + r']+\bhref=["\'](/cp/courses/lesson/course/\d+/lesson/\d+/module/\d+)', + webpage))] + title = unescapeHTML(self._html_search_regex( + (r'class=["\']course-title["\'][^>]*>(?P[^<]+)', + r'var\s+title\s*=\s*(["\'])(?P(?:(?!\1).)+)\1'), + webpage, 'title', default=None, group='value')) + description = unescapeHTML(self._html_search_regex( + r'var\s+description\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'description', default=None, group='value')) + return self.playlist_result(entries, course_id, title, description) + + # single video path + info = self._extract_jwplayer_data( + webpage, item_id, require_title=False, m3u8_id='hls',) + title = self._search_regex( + (r'>Lecture\s*:\s*(?P[^<]+)', + r'lessonName\s*=\s*(["\'])(?P(?:(?!\1).)+)\1'), webpage, + 'title', group='value') + info.update({ + 'id': item_id, + 'title': title, + }) + return info diff --git a/youtube_dl/extractor/livejournal.py b/youtube_dl/extractor/livejournal.py new file mode 100644 index 000000000..3a9f4553f --- /dev/null +++ b/youtube_dl/extractor/livejournal.py @@ -0,0 +1,42 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import int_or_none + + +class LiveJournalIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^.]+\.)?livejournal\.com/video/album/\d+.+?\bid=(?P\d+)' + _TEST = { + 'url': 'https://andrei-bt.livejournal.com/video/album/407/?mode=view&id=51272', + 'md5': 'adaf018388572ced8a6f301ace49d4b2', + 'info_dict': { + 'id': '1263729', + 'ext': 'mp4', + 'title': 'Истребители против БПЛА', + 'upload_date': '20190624', + 'timestamp': 1561406715, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + record = self._parse_json(self._search_regex( + r'Site\.page\s*=\s*({.+?});', webpage, + 'page data'), video_id)['video']['record'] + storage_id = compat_str(record['storageid']) + title = record.get('name') + if title: + # remove filename extension(.mp4, .mov, etc...) + title = title.rsplit('.', 1)[0] + return { + '_type': 'url_transparent', + 'id': video_id, + 'title': title, + 'thumbnail': record.get('thumbnail'), + 'timestamp': int_or_none(record.get('timecreate')), + 'url': 'eagleplatform:vc.videos.livejournal.com:' + storage_id, + 'ie_key': 'EaglePlatform', + } diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 26671753c..4ac437c8b 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -44,7 +44,7 @@ class LiveLeakIE(InfoExtractor): }, 'skip': 'Video is dead', }, { - # Covers https://github.com/rg3/youtube-dl/pull/5983 + # Covers https://github.com/ytdl-org/youtube-dl/pull/5983 # Multiple resolutions 'url': 'http://www.liveleak.com/view?i=801_1409392012', 'md5': 'c3a449dbaca5c0d1825caecd52a57d7b', @@ -57,7 +57,7 @@ class LiveLeakIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$' } }, { - # Covers https://github.com/rg3/youtube-dl/pull/10664#issuecomment-247439521 + # Covers https://github.com/ytdl-org/youtube-dl/pull/10664#issuecomment-247439521 'url': 'http://m.liveleak.com/view?i=763_1473349649', 'add_ie': ['Youtube'], 'info_dict': { @@ -82,12 +82,16 @@ class LiveLeakIE(InfoExtractor): }, { 'url': 'https://www.liveleak.com/view?t=HvHi_1523016227', 'only_matching': True, + }, { + # No original video + 'url': 'https://www.liveleak.com/view?t=C26ZZ_1558612804', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage): return re.findall( - r']+src="(https?://(?:\w+\.)?liveleak\.com/ll_embed\?[^"]*[if]=[\w_]+[^"]+)"', + r']+src="(https?://(?:\w+\.)?liveleak\.com/ll_embed\?[^"]*[ift]=[\w_]+[^"]+)"', webpage) def _real_extract(self, url): @@ -120,13 +124,29 @@ class LiveLeakIE(InfoExtractor): } for idx, info_dict in enumerate(entries): + formats = [] for a_format in info_dict['formats']: if not a_format.get('height'): a_format['height'] = int_or_none(self._search_regex( r'([0-9]+)p\.mp4', a_format['url'], 'height label', default=None)) + formats.append(a_format) - self._sort_formats(info_dict['formats']) + # Removing '.*.mp4' gives the raw video, which is essentially + # the same video without the LiveLeak logo at the top (see + # https://github.com/ytdl-org/youtube-dl/pull/4768) + orig_url = re.sub(r'\.mp4\.[^.]+', '', a_format['url']) + if a_format['url'] != orig_url: + format_id = a_format.get('format_id') + format_id = 'original' + ('-' + format_id if format_id else '') + if self._is_valid_url(orig_url, video_id, format_id): + formats.append({ + 'format_id': format_id, + 'url': orig_url, + 'preference': 1, + }) + self._sort_formats(formats) + info_dict['formats'] = formats # Don't append entry ID for one-video pages to keep backward compatibility if len(entries) > 1: @@ -146,7 +166,7 @@ class LiveLeakIE(InfoExtractor): class LiveLeakEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?liveleak\.com/ll_embed\?.*?\b(?P[if])=(?P[\w_]+)' + _VALID_URL = r'https?://(?:www\.)?liveleak\.com/ll_embed\?.*?\b(?P[ift])=(?P[\w_]+)' # See generic.py for actual test cases _TESTS = [{ @@ -158,15 +178,14 @@ class LiveLeakEmbedIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - kind, video_id = mobj.group('kind', 'id') + kind, video_id = re.match(self._VALID_URL, url).groups() if kind == 'f': webpage = self._download_webpage(url, video_id) liveleak_url = self._search_regex( - r'logourl\s*:\s*(?P[\'"])(?P%s)(?P=q1)' % LiveLeakIE._VALID_URL, + r'(?:logourl\s*:\s*|window\.open\()(?P[\'"])(?P%s)(?P=q1)' % LiveLeakIE._VALID_URL, webpage, 'LiveLeak URL', group='url') - elif kind == 'i': - liveleak_url = 'http://www.liveleak.com/view?i=%s' % video_id + else: + liveleak_url = 'http://www.liveleak.com/view?%s=%s' % (kind, video_id) return self.url_result(liveleak_url, ie=LiveLeakIE.ie_key()) diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index c4776bbf3..e55b1a202 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -363,7 +363,4 @@ class LivestreamShortenerIE(InfoExtractor): id = mobj.group('id') webpage = self._download_webpage(url, id) - return { - '_type': 'url', - 'url': self._og_search_url(webpage), - } + return self.url_result(self._og_search_url(webpage)) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 4ba61cd8a..b3d8653d0 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -15,7 +15,7 @@ from ..utils import ( class LyndaBaseIE(InfoExtractor): - _SIGNIN_URL = 'https://www.lynda.com/signin' + _SIGNIN_URL = 'https://www.lynda.com/signin/lynda' _PASSWORD_URL = 'https://www.lynda.com/signin/password' _USER_URL = 'https://www.lynda.com/signin/user' _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.' @@ -117,6 +117,10 @@ class LyndaIE(LyndaBaseIE): }, { 'url': 'https://www.lynda.com/de/Graphic-Design-tutorials/Willkommen-Grundlagen-guten-Gestaltung/393570/393572-4.html', 'only_matching': True, + }, { + # Status="NotFound", Message="Transcript not found" + 'url': 'https://www.lynda.com/ASP-NET-tutorials/What-you-should-know/5034180/2811512-4.html', + 'only_matching': True, }] def _raise_unavailable(self, video_id): @@ -247,12 +251,17 @@ class LyndaIE(LyndaBaseIE): def _get_subtitles(self, video_id): url = 'https://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id - subs = self._download_json(url, None, False) + subs = self._download_webpage( + url, video_id, 'Downloading subtitles JSON', fatal=False) + if not subs or 'Status="NotFound"' in subs: + return {} + subs = self._parse_json(subs, video_id, fatal=False) + if not subs: + return {} fixed_subs = self._fix_subtitles(subs) if fixed_subs: return {'en': [{'ext': 'srt', 'data': fixed_subs}]} - else: - return {} + return {} class LyndaCourseIE(LyndaBaseIE): diff --git a/youtube_dl/extractor/malltv.py b/youtube_dl/extractor/malltv.py new file mode 100644 index 000000000..e13c2e11a --- /dev/null +++ b/youtube_dl/extractor/malltv.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import merge_dicts + + +class MallTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mall\.tv/(?:[^/]+/)*(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.mall.tv/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', + 'md5': '1c4a37f080e1f3023103a7b43458e518', + 'info_dict': { + 'id': 't0zzt0', + 'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', + 'ext': 'mp4', + 'title': '18 miliard pro neziskovky. Opravdu jsou sportovci nebo Člověk v tísni pijavice?', + 'description': 'md5:25fc0ec42a72ba602b602c683fa29deb', + 'duration': 216, + 'timestamp': 1538870400, + 'upload_date': '20181007', + 'view_count': int, + } + }, { + 'url': 'https://www.mall.tv/kdo-to-plati/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage( + url, display_id, headers=self.geo_verification_headers()) + + SOURCE_RE = r'(]+\bsrc=(?:(["\'])(?:(?!\2).)+|[^\s]+)/(?P[\da-z]+)/index)\b' + video_id = self._search_regex( + SOURCE_RE, webpage, 'video id', group='id') + + media = self._parse_html5_media_entries( + url, re.sub(SOURCE_RE, r'\1.m3u8', webpage), video_id, + m3u8_id='hls', m3u8_entry_protocol='m3u8_native')[0] + + info = self._search_json_ld(webpage, video_id, default={}) + + return merge_dicts(media, info, { + 'id': video_id, + 'display_id': display_id, + 'title': self._og_search_title(webpage, default=None) or display_id, + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + }) diff --git a/youtube_dl/extractor/manyvids.py b/youtube_dl/extractor/manyvids.py index b94b3c2ab..e8d7163e4 100644 --- a/youtube_dl/extractor/manyvids.py +++ b/youtube_dl/extractor/manyvids.py @@ -2,12 +2,18 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + determine_ext, + int_or_none, + str_to_int, + urlencode_postdata, +) class ManyVidsIE(InfoExtractor): _VALID_URL = r'(?i)https?://(?:www\.)?manyvids\.com/video/(?P\d+)' - _TEST = { + _TESTS = [{ + # preview video 'url': 'https://www.manyvids.com/Video/133957/everthing-about-me/', 'md5': '03f11bb21c52dd12a05be21a5c7dcc97', 'info_dict': { @@ -17,7 +23,18 @@ class ManyVidsIE(InfoExtractor): 'view_count': int, 'like_count': int, }, - } + }, { + # full video + 'url': 'https://www.manyvids.com/Video/935718/MY-FACE-REVEAL/', + 'md5': 'f3e8f7086409e9b470e2643edb96bdcc', + 'info_dict': { + 'id': '935718', + 'ext': 'mp4', + 'title': 'MY FACE REVEAL', + 'view_count': int, + 'like_count': int, + }, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -28,12 +45,41 @@ class ManyVidsIE(InfoExtractor): r'data-(?:video-filepath|meta-video)\s*=s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video URL', group='url') - title = '%s (Preview)' % self._html_search_regex( - r']+class="m-a-0"[^>]*>([^<]+)', webpage, 'title') + title = self._html_search_regex( + (r']+class=["\']item-title[^>]+>([^<]+)', + r']+class=["\']h2 m-0["\'][^>]*>([^<]+)'), + webpage, 'title', default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', fatal=True) + + if any(p in webpage for p in ('preview_videos', '_preview.mp4')): + title += ' (Preview)' + + mv_token = self._search_regex( + r'data-mvtoken=(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'mv token', default=None, group='value') + + if mv_token: + # Sets some cookies + self._download_webpage( + 'https://www.manyvids.com/includes/ajax_repository/you_had_me_at_hello.php', + video_id, fatal=False, data=urlencode_postdata({ + 'mvtoken': mv_token, + 'vid': video_id, + }), headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest' + }) + + if determine_ext(video_url) == 'm3u8': + formats = self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + else: + formats = [{'url': video_url}] like_count = int_or_none(self._search_regex( r'data-likes=["\'](\d+)', webpage, 'like count', default=None)) - view_count = int_or_none(self._html_search_regex( + view_count = str_to_int(self._html_search_regex( r'(?s)]+class="views-wrapper"[^>]*>(.+?)[0-9a-f]{32,34})(?P\?[^#]+|)' + _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/(?:default|livebroadcast)/Presentation)/(?P%s)(?P\?[^#]+|)' % _ID_RE _TESTS = [ { 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d', @@ -84,6 +89,19 @@ class MediasiteIE(InfoExtractor): 'timestamp': 1333983600, 'duration': 7794, } + }, + { + 'url': 'https://collegerama.tudelft.nl/Mediasite/Showcase/livebroadcast/Presentation/ada7020854f743c49fbb45c9ec7dbb351d', + 'only_matching': True, + }, + { + 'url': 'https://mediasite.ntnu.no/Mediasite/Showcase/default/Presentation/7d8b913259334b688986e970fae6fcb31d', + 'only_matching': True, + }, + { + # dashed id + 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271-681e-4f19-9af3-c60d1f82869b1d', + 'only_matching': True, } ] @@ -101,7 +119,7 @@ class MediasiteIE(InfoExtractor): return [ unescapeHTML(mobj.group('url')) for mobj in re.finditer( - r'(?xi)]+\bsrc=(["\'])(?P(?:(?:https?:)?//[^/]+)?/Mediasite/Play/[0-9a-f]{32,34}(?:\?.*?)?)\1', + r'(?xi)]+\bsrc=(["\'])(?P(?:(?:https?:)?//[^/]+)?/Mediasite/Play/%s(?:\?.*?)?)\1' % _ID_RE, webpage)] def _real_extract(self, url): @@ -213,3 +231,136 @@ class MediasiteIE(InfoExtractor): 'formats': formats, 'thumbnails': thumbnails, } + + +class MediasiteCatalogIE(InfoExtractor): + _VALID_URL = r'''(?xi) + (?Phttps?://[^/]+/Mediasite) + /Catalog/Full/ + (?P{0}) + (?: + /(?P{0}) + /(?P{0}) + )? + '''.format(_ID_RE) + _TESTS = [{ + 'url': 'http://events7.mediasite.com/Mediasite/Catalog/Full/631f9e48530d454381549f955d08c75e21', + 'info_dict': { + 'id': '631f9e48530d454381549f955d08c75e21', + 'title': 'WCET Summit: Adaptive Learning in Higher Ed: Improving Outcomes Dynamically', + }, + 'playlist_count': 6, + 'expected_warnings': ['is not a supported codec'], + }, { + # with CurrentFolderId and RootDynamicFolderId + 'url': 'https://medaudio.medicine.iu.edu/Mediasite/Catalog/Full/9518c4a6c5cf4993b21cbd53e828a92521/97a9db45f7ab47428c77cd2ed74bb98f14/9518c4a6c5cf4993b21cbd53e828a92521', + 'info_dict': { + 'id': '9518c4a6c5cf4993b21cbd53e828a92521', + 'title': 'IUSM Family and Friends Sessions', + }, + 'playlist_count': 2, + }, { + 'url': 'http://uipsyc.mediasite.com/mediasite/Catalog/Full/d5d79287c75243c58c50fef50174ec1b21', + 'only_matching': True, + }, { + # no AntiForgeryToken + 'url': 'https://live.libraries.psu.edu/Mediasite/Catalog/Full/8376d4b24dd1457ea3bfe4cf9163feda21', + 'only_matching': True, + }, { + 'url': 'https://medaudio.medicine.iu.edu/Mediasite/Catalog/Full/9518c4a6c5cf4993b21cbd53e828a92521/97a9db45f7ab47428c77cd2ed74bb98f14/9518c4a6c5cf4993b21cbd53e828a92521', + 'only_matching': True, + }, { + # dashed id + 'url': 'http://events7.mediasite.com/Mediasite/Catalog/Full/631f9e48-530d-4543-8154-9f955d08c75e', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + mediasite_url = mobj.group('url') + catalog_id = mobj.group('catalog_id') + current_folder_id = mobj.group('current_folder_id') or catalog_id + root_dynamic_folder_id = mobj.group('root_dynamic_folder_id') + + webpage = self._download_webpage(url, catalog_id) + + # AntiForgeryToken is optional (e.g. [1]) + # 1. https://live.libraries.psu.edu/Mediasite/Catalog/Full/8376d4b24dd1457ea3bfe4cf9163feda21 + anti_forgery_token = self._search_regex( + r'AntiForgeryToken\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'anti forgery token', default=None, group='value') + if anti_forgery_token: + anti_forgery_header = self._search_regex( + r'AntiForgeryHeaderName\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'anti forgery header name', + default='X-SOFO-AntiForgeryHeader', group='value') + + data = { + 'IsViewPage': True, + 'IsNewFolder': True, + 'AuthTicket': None, + 'CatalogId': catalog_id, + 'CurrentFolderId': current_folder_id, + 'RootDynamicFolderId': root_dynamic_folder_id, + 'ItemsPerPage': 1000, + 'PageIndex': 0, + 'PermissionMask': 'Execute', + 'CatalogSearchType': 'SearchInFolder', + 'SortBy': 'Date', + 'SortDirection': 'Descending', + 'StartDate': None, + 'EndDate': None, + 'StatusFilterList': None, + 'PreviewKey': None, + 'Tags': [], + } + + headers = { + 'Content-Type': 'application/json; charset=UTF-8', + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + } + if anti_forgery_token: + headers[anti_forgery_header] = anti_forgery_token + + catalog = self._download_json( + '%s/Catalog/Data/GetPresentationsForFolder' % mediasite_url, + catalog_id, data=json.dumps(data).encode(), headers=headers) + + entries = [] + for video in catalog['PresentationDetailsList']: + if not isinstance(video, dict): + continue + video_id = str_or_none(video.get('Id')) + if not video_id: + continue + entries.append(self.url_result( + '%s/Play/%s' % (mediasite_url, video_id), + ie=MediasiteIE.ie_key(), video_id=video_id)) + + title = try_get( + catalog, lambda x: x['CurrentFolder']['Name'], compat_str) + + return self.playlist_result(entries, catalog_id, title,) + + +class MediasiteNamedCatalogIE(InfoExtractor): + _VALID_URL = r'(?xi)(?Phttps?://[^/]+/Mediasite)/Catalog/catalogs/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://msite.misis.ru/Mediasite/Catalog/catalogs/2016-industrial-management-skriabin-o-o', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + mediasite_url = mobj.group('url') + catalog_name = mobj.group('catalog_name') + + webpage = self._download_webpage(url, catalog_name) + + catalog_id = self._search_regex( + r'CatalogId\s*:\s*["\'](%s)' % _ID_RE, webpage, 'catalog id') + + return self.url_result( + '%s/Catalog/Full/%s' % (mediasite_url, catalog_id), + ie=MediasiteCatalogIE.ie_key(), video_id=catalog_id) diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 28f59f63c..9e92416d1 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -1,12 +1,13 @@ from __future__ import unicode_literals +import json import re from .common import InfoExtractor from ..compat import ( compat_parse_qs, + compat_urllib_parse, compat_urllib_parse_unquote, - compat_urllib_parse_urlencode, ) from ..utils import ( determine_ext, @@ -144,7 +145,7 @@ class MetacafeIE(InfoExtractor): headers = { # Disable family filter - 'Cookie': 'user=%s; ' % compat_urllib_parse_urlencode({'ffilter': False}) + 'Cookie': 'user=%s; ' % compat_urllib_parse.quote(json.dumps({'ffilter': False})) } # AnyClip videos require the flashversion cookie so that we get the link diff --git a/youtube_dl/extractor/mgtv.py b/youtube_dl/extractor/mgtv.py index d53d96aae..71fc3ec56 100644 --- a/youtube_dl/extractor/mgtv.py +++ b/youtube_dl/extractor/mgtv.py @@ -1,22 +1,32 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 +import time +import uuid + from .common import InfoExtractor -from ..compat import compat_str -from ..utils import int_or_none +from ..compat import ( + compat_HTTPError, + compat_str, +) +from ..utils import ( + ExtractorError, + int_or_none, +) class MGTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P\d+)\.html' IE_DESC = '芒果TV' + _GEO_COUNTRIES = ['CN'] _TESTS = [{ 'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html', - 'md5': 'b1ffc0fc163152acf6beaa81832c9ee7', 'info_dict': { 'id': '3116640', 'ext': 'mp4', - 'title': '我是歌手第四季双年巅峰会:韩红李玟“双王”领军对抗', + 'title': '我是歌手 第四季', 'description': '我是歌手第四季双年巅峰会', 'duration': 7461, 'thumbnail': r're:^https?://.*\.jpg$', @@ -28,16 +38,30 @@ class MGTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - api_data = self._download_json( - 'http://pcweb.api.mgtv.com/player/video', video_id, - query={'video_id': video_id}, - headers=self.geo_verification_headers())['data'] + try: + api_data = self._download_json( + 'https://pcweb.api.mgtv.com/player/video', video_id, query={ + 'tk2': base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1], + 'video_id': video_id, + }, headers=self.geo_verification_headers())['data'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + error = self._parse_json(e.cause.read().decode(), None) + if error.get('code') == 40005: + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + raise ExtractorError(error['msg'], expected=True) + raise info = api_data['info'] title = info['title'].strip() - stream_domain = api_data['stream_domain'][0] + stream_data = self._download_json( + 'https://pcweb.api.mgtv.com/player/getSource', video_id, query={ + 'pm2': api_data['atc']['pm2'], + 'video_id': video_id, + }, headers=self.geo_verification_headers())['data'] + stream_domain = stream_data['stream_domain'][0] formats = [] - for idx, stream in enumerate(api_data['stream']): + for idx, stream in enumerate(stream_data['stream']): stream_path = stream.get('url') if not stream_path: continue @@ -47,7 +71,7 @@ class MGTVIE(InfoExtractor): format_url = format_data.get('info') if not format_url: continue - tbr = int_or_none(self._search_regex( + tbr = int_or_none(stream.get('filebitrate') or self._search_regex( r'_(\d+)_mp4/', format_url, 'tbr', default=None)) formats.append({ 'format_id': compat_str(tbr or idx), @@ -55,6 +79,10 @@ class MGTVIE(InfoExtractor): 'ext': 'mp4', 'tbr': tbr, 'protocol': 'm3u8_native', + 'http_headers': { + 'Referer': url, + }, + 'format_note': stream.get('name'), }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index b7bccb504..bf5353ef9 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -161,11 +161,17 @@ class MixcloudIE(InfoExtractor): stream_info = info_json['streamInfo'] formats = [] + def decrypt_url(f_url): + for k in (key, 'IFYOUWANTTHEARTISTSTOGETPAIDDONOTDOWNLOADFROMMIXCLOUD'): + decrypted_url = self._decrypt_xor_cipher(k, f_url) + if re.search(r'^https?://[0-9A-Za-z.]+/[0-9A-Za-z/.?=&_-]+$', decrypted_url): + return decrypted_url + for url_key in ('url', 'hlsUrl', 'dashUrl'): format_url = stream_info.get(url_key) if not format_url: continue - decrypted = self._decrypt_xor_cipher(key, compat_b64decode(format_url)) + decrypted = decrypt_url(compat_b64decode(format_url)) if not decrypted: continue if url_key == 'hlsUrl': diff --git a/youtube_dl/extractor/moevideo.py b/youtube_dl/extractor/moevideo.py index 44bcc4982..eb9b4ce7c 100644 --- a/youtube_dl/extractor/moevideo.py +++ b/youtube_dl/extractor/moevideo.py @@ -1,15 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals -import json import re from .common import InfoExtractor from ..utils import ( - ExtractorError, + clean_html, int_or_none, - sanitized_Request, - urlencode_postdata, ) @@ -17,8 +14,8 @@ class MoeVideoIE(InfoExtractor): IE_DESC = 'LetitBit video services: moevideo.net, playreplay.net and videochart.net' _VALID_URL = r'''(?x) https?://(?P(?:www\.)? - (?:(?:moevideo|playreplay|videochart)\.net))/ - (?:video|framevideo)/(?P[0-9]+\.[0-9A-Za-z]+)''' + (?:(?:moevideo|playreplay|videochart)\.net|thesame\.tv))/ + (?:video|framevideo|embed)/(?P[0-9a-z]+\.[0-9A-Za-z]+)''' _API_URL = 'http://api.letitbit.net/' _API_KEY = 'tVL0gjqo5' _TESTS = [ @@ -57,58 +54,26 @@ class MoeVideoIE(InfoExtractor): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + host, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage( - 'http://%s/video/%s' % (mobj.group('host'), video_id), + 'http://%s/video/%s' % (host, video_id), video_id, 'Downloading webpage') title = self._og_search_title(webpage) - thumbnail = self._og_search_thumbnail(webpage) - description = self._og_search_description(webpage) - r = [ - self._API_KEY, - [ - 'preview/flv_link', - { - 'uid': video_id, - }, - ], - ] - r_json = json.dumps(r) - post = urlencode_postdata({'r': r_json}) - req = sanitized_Request(self._API_URL, post) - req.add_header('Content-type', 'application/x-www-form-urlencoded') - - response = self._download_json(req, video_id) - if response['status'] != 'OK': - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, response['data']), - expected=True - ) - item = response['data'][0] - video_url = item['link'] - duration = int_or_none(item['length']) - width = int_or_none(item['width']) - height = int_or_none(item['height']) - filesize = int_or_none(item['convert_size']) - - formats = [{ - 'format_id': 'sd', - 'http_headers': {'Range': 'bytes=0-'}, # Required to download - 'url': video_url, - 'width': width, - 'height': height, - 'filesize': filesize, - }] + embed_webpage = self._download_webpage( + 'http://%s/embed/%s' % (host, video_id), + video_id, 'Downloading embed webpage') + video = self._parse_json(self._search_regex( + r'mvplayer\("#player"\s*,\s*({.+})', + embed_webpage, 'mvplayer'), video_id)['video'] return { 'id': video_id, 'title': title, - 'thumbnail': thumbnail, - 'description': description, - 'duration': duration, - 'formats': formats, + 'thumbnail': video.get('poster') or self._og_search_thumbnail(webpage), + 'description': clean_html(self._og_search_description(webpage)), + 'duration': int_or_none(self._og_search_property('video:duration', webpage)), + 'url': video['ourUrl'], } diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index d4bd273b6..43fd70f11 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -80,8 +80,8 @@ class MotherlessIE(InfoExtractor): video_url = (self._html_search_regex( (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', r'fileurl\s*=\s*(["\'])(?P(?:(?!\1).)+)\1'), - webpage, 'video URL', default=None, group='url') or - 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) + webpage, 'video URL', default=None, group='url') + or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) age_limit = self._rta_search(webpage) view_count = str_to_int(self._html_search_regex( r'Views\s+([^<]+)<', diff --git a/youtube_dl/extractor/msn.py b/youtube_dl/extractor/msn.py index 650731fdc..0460cf4d5 100644 --- a/youtube_dl/extractor/msn.py +++ b/youtube_dl/extractor/msn.py @@ -70,7 +70,7 @@ class MSNIE(InfoExtractor): continue if 'm3u8' in format_url: # m3u8_native should not be used here until - # https://github.com/rg3/youtube-dl/issues/9913 is fixed + # https://github.com/ytdl-org/youtube-dl/issues/9913 is fixed m3u8_formats = self._extract_m3u8_formats( format_url, display_id, 'mp4', m3u8_id='hls', fatal=False) diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 4d2ee6408..ee12e2b47 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -1,15 +1,10 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from .adobepass import AdobePassIE -from .theplatform import ThePlatformIE +from .fox import FOXIE from ..utils import ( smuggle_url, url_basename, - update_url_query, - get_element_by_class, ) @@ -66,130 +61,22 @@ class NationalGeographicVideoIE(InfoExtractor): } -class NationalGeographicIE(ThePlatformIE, AdobePassIE): - IE_NAME = 'natgeo' - _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:(?:(?:wild/)?[^/]+/)?(?:videos|episodes)|u)/(?P[^/?]+)' - - _TESTS = [ - { - 'url': 'http://channel.nationalgeographic.com/u/kdi9Ld0PN2molUUIMSBGxoeDhD729KRjQcnxtetilWPMevo8ZwUBIDuPR0Q3D2LVaTsk0MPRkRWDB8ZhqWVeyoxfsZZm36yRp1j-zPfsHEyI_EgAeFY/', - 'md5': '518c9aa655686cf81493af5cc21e2a04', - 'info_dict': { - 'id': 'vKInpacll2pC', - 'ext': 'mp4', - 'title': 'Uncovering a Universal Knowledge', - 'description': 'md5:1a89148475bf931b3661fcd6ddb2ae3a', - 'timestamp': 1458680907, - 'upload_date': '20160322', - 'uploader': 'NEWA-FNG-NGTV', - }, - 'add_ie': ['ThePlatform'], +class NationalGeographicTVIE(FOXIE): + _VALID_URL = r'https?://(?:www\.)?nationalgeographic\.com/tv/watch/(?P[\da-fA-F]+)' + _TESTS = [{ + 'url': 'https://www.nationalgeographic.com/tv/watch/6a875e6e734b479beda26438c9f21138/', + 'info_dict': { + 'id': '6a875e6e734b479beda26438c9f21138', + 'ext': 'mp4', + 'title': 'Why Nat Geo? Valley of the Boom', + 'description': 'The lives of prominent figures in the tech world, including their friendships, rivalries, victories and failures.', + 'timestamp': 1542662458, + 'upload_date': '20181119', + 'age_limit': 14, }, - { - 'url': 'http://channel.nationalgeographic.com/u/kdvOstqYaBY-vSBPyYgAZRUL4sWUJ5XUUPEhc7ISyBHqoIO4_dzfY3K6EjHIC0hmFXoQ7Cpzm6RkET7S3oMlm6CFnrQwSUwo/', - 'md5': 'c4912f656b4cbe58f3e000c489360989', - 'info_dict': { - 'id': 'Pok5lWCkiEFA', - 'ext': 'mp4', - 'title': 'The Stunning Red Bird of Paradise', - 'description': 'md5:7bc8cd1da29686be4d17ad1230f0140c', - 'timestamp': 1459362152, - 'upload_date': '20160330', - 'uploader': 'NEWA-FNG-NGTV', - }, - 'add_ie': ['ThePlatform'], + 'params': { + 'skip_download': True, }, - { - 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/episodes/the-power-of-miracles/', - 'only_matching': True, - }, - { - 'url': 'http://channel.nationalgeographic.com/videos/treasures-rediscovered/', - 'only_matching': True, - }, - { - 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/videos/uncovering-a-universal-knowledge/', - 'only_matching': True, - }, - { - 'url': 'http://channel.nationalgeographic.com/wild/destination-wild/videos/the-stunning-red-bird-of-paradise/', - 'only_matching': True, - } - ] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - release_url = self._search_regex( - r'video_auth_playlist_url\s*=\s*"([^"]+)"', - webpage, 'release url') - theplatform_path = self._search_regex(r'https?://link\.theplatform\.com/s/([^?]+)', release_url, 'theplatform path') - video_id = theplatform_path.split('/')[-1] - query = { - 'mbr': 'true', - } - is_auth = self._search_regex(r'video_is_auth\s*=\s*"([^"]+)"', webpage, 'is auth', fatal=False) - if is_auth == 'auth': - auth_resource_id = self._search_regex( - r"video_auth_resourceId\s*=\s*'([^']+)'", - webpage, 'auth resource id') - query['auth'] = self._extract_mvpd_auth(url, video_id, 'natgeo', auth_resource_id) - - formats = [] - subtitles = {} - for key, value in (('switch', 'http'), ('manifest', 'm3u')): - tp_query = query.copy() - tp_query.update({ - key: value, - }) - tp_formats, tp_subtitles = self._extract_theplatform_smil( - update_url_query(release_url, tp_query), video_id, 'Downloading %s SMIL data' % value) - formats.extend(tp_formats) - subtitles = self._merge_subtitles(subtitles, tp_subtitles) - self._sort_formats(formats) - - info = self._extract_theplatform_metadata(theplatform_path, display_id) - info.update({ - 'id': video_id, - 'formats': formats, - 'subtitles': subtitles, - 'display_id': display_id, - }) - return info - - -class NationalGeographicEpisodeGuideIE(InfoExtractor): - IE_NAME = 'natgeo:episodeguide' - _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?(?P[^/]+)/episode-guide' - _TESTS = [ - { - 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/episode-guide/', - 'info_dict': { - 'id': 'the-story-of-god-with-morgan-freeman-season-1', - 'title': 'The Story of God with Morgan Freeman - Season 1', - }, - 'playlist_mincount': 6, - }, - { - 'url': 'http://channel.nationalgeographic.com/underworld-inc/episode-guide/?s=2', - 'info_dict': { - 'id': 'underworld-inc-season-2', - 'title': 'Underworld, Inc. - Season 2', - }, - 'playlist_mincount': 7, - }, - ] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - show = get_element_by_class('show', webpage) - selected_season = self._search_regex( - r']+class="select-seasons[^"]*".*?]*>(.*?)', - webpage, 'selected season') - entries = [ - self.url_result(self._proto_relative_url(entry_url), 'NationalGeographic') - for entry_url in re.findall('(?s)]+class="col-inner"[^>]*?>.*?]+href="([^"]+)"', webpage)] - return self.playlist_result( - entries, '%s-%s' % (display_id, selected_season.lower().replace(' ', '-')), - '%s - %s' % (show, selected_season)) + }] + _HOME_PAGE_URL = 'https://www.nationalgeographic.com/tv/' + _API_KEY = '238bb0a0c2aba67922c48709ce0c06fd' diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 765c46fd2..3282f84ee 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -9,10 +9,8 @@ from .theplatform import ThePlatformIE from .adobepass import AdobePassIE from ..compat import compat_urllib_parse_unquote from ..utils import ( - find_xpath_attr, smuggle_url, try_get, - unescapeHTML, update_url_query, int_or_none, ) @@ -269,27 +267,14 @@ class CSNNEIE(InfoExtractor): class NBCNewsIE(ThePlatformIE): - _VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/ - (?:video/.+?/(?P\d+)| - ([^/]+/)*(?:.*-)?(?P[^/?]+)) - ''' + _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P[^/?]+)' _TESTS = [ - { - 'url': 'http://www.nbcnews.com/video/nbc-news/52753292', - 'md5': '47abaac93c6eaf9ad37ee6c4463a5179', - 'info_dict': { - 'id': '52753292', - 'ext': 'flv', - 'title': 'Crew emerges after four-month Mars food study', - 'description': 'md5:24e632ffac72b35f8b67a12d1b6ddfc1', - }, - }, { 'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880', 'md5': 'af1adfa51312291a017720403826bb64', 'info_dict': { - 'id': 'p_tweet_snow_140529', + 'id': '269389891880', 'ext': 'mp4', 'title': 'How Twitter Reacted To The Snowden Interview', 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', @@ -313,7 +298,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844', 'md5': '73135a2e0ef819107bbb55a5a9b2a802', 'info_dict': { - 'id': 'nn_netcast_150204', + 'id': '394064451844', 'ext': 'mp4', 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)', 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', @@ -326,7 +311,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456', 'md5': 'a49e173825e5fcd15c13fc297fced39d', 'info_dict': { - 'id': 'x_lon_vwhorn_150922', + 'id': '529953347624', 'ext': 'mp4', 'title': 'Volkswagen U.S. Chief:\xa0 We Have Totally Screwed Up', 'description': 'md5:c8be487b2d80ff0594c005add88d8351', @@ -339,7 +324,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788', 'md5': '118d7ca3f0bea6534f119c68ef539f71', 'info_dict': { - 'id': 'tdy_al_space_160420', + 'id': '669831235788', 'ext': 'mp4', 'title': 'See the aurora borealis from space in stunning new NASA video', 'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1', @@ -352,7 +337,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924', 'md5': '6d236bf4f3dddc226633ce6e2c3f814d', 'info_dict': { - 'id': 'n_hayes_Aimm_140801_272214', + 'id': '314487875924', 'ext': 'mp4', 'title': 'The chaotic GOP immigration vote', 'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.', @@ -374,60 +359,22 @@ class NBCNewsIE(ThePlatformIE): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - if video_id is not None: - all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id) - info = all_info.find('video') - - return { - 'id': video_id, - 'title': info.find('headline').text, - 'ext': 'flv', - 'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text, - 'description': info.find('caption').text, - 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text, - } - else: - # "feature" and "nightly-news" pages use theplatform.com - video_id = mobj.group('mpx_id') + video_id = self._match_id(url) + if not video_id.isdigit(): webpage = self._download_webpage(url, video_id) - filter_param = 'byId' - bootstrap_json = self._search_regex( - [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$', - r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"', - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);'], - webpage, 'bootstrap json', default=None) - if bootstrap_json: - bootstrap = self._parse_json( - bootstrap_json, video_id, transform_source=unescapeHTML) + data = self._parse_json(self._search_regex( + r'window\.__data\s*=\s*({.+});', webpage, + 'bootstrap json'), video_id) + video_id = data['article']['content'][0]['primaryMedia']['video']['mpxMetadata']['id'] - info = None - if 'results' in bootstrap: - info = bootstrap['results'][0]['video'] - elif 'video' in bootstrap: - info = bootstrap['video'] - elif 'msnbcVideoInfo' in bootstrap: - info = bootstrap['msnbcVideoInfo']['meta'] - elif 'msnbcThePlatform' in bootstrap: - info = bootstrap['msnbcThePlatform']['videoPlayer']['video'] - else: - info = bootstrap - - if 'guid' in info: - video_id = info['guid'] - filter_param = 'byGuid' - elif 'mpxId' in info: - video_id = info['mpxId'] - - return { - '_type': 'url_transparent', - 'id': video_id, - # http://feed.theplatform.com/f/2E2eJC/nbcnews also works - 'url': update_url_query('http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews', {filter_param: video_id}), - 'ie_key': 'ThePlatformFeed', - } + return { + '_type': 'url_transparent', + 'id': video_id, + # http://feed.theplatform.com/f/2E2eJC/nbcnews also works + 'url': update_url_query('http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews', {'byId': video_id}), + 'ie_key': 'ThePlatformFeed', + } class NBCOlympicsIE(InfoExtractor): diff --git a/youtube_dl/extractor/ndtv.py b/youtube_dl/extractor/ndtv.py index ddec89f2c..bc3eb9160 100644 --- a/youtube_dl/extractor/ndtv.py +++ b/youtube_dl/extractor/ndtv.py @@ -84,8 +84,8 @@ class NDTVIE(InfoExtractor): # '__title' does not contain extra words such as sub-site name, "Video" etc. title = compat_urllib_parse_unquote_plus( - self._search_regex(r"__title\s*=\s*'([^']+)'", webpage, 'title', default=None) or - self._og_search_title(webpage)) + self._search_regex(r"__title\s*=\s*'([^']+)'", webpage, 'title', default=None) + or self._og_search_title(webpage)) filename = self._search_regex( r"(?:__)?filename\s*[:=]\s*'([^']+)'", webpage, 'video filename') diff --git a/youtube_dl/extractor/newstube.py b/youtube_dl/extractor/newstube.py index e3f35f1d8..dab4aec44 100644 --- a/youtube_dl/extractor/newstube.py +++ b/youtube_dl/extractor/newstube.py @@ -1,12 +1,17 @@ # coding: utf-8 from __future__ import unicode_literals -import re +import base64 +import hashlib from .common import InfoExtractor +from ..aes import aes_cbc_decrypt from ..utils import ( - ExtractorError, + bytes_to_intlist, int_or_none, + intlist_to_bytes, + parse_codecs, + parse_duration, ) @@ -14,7 +19,7 @@ class NewstubeIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?newstube\.ru/media/(?P.+)' _TEST = { 'url': 'http://www.newstube.ru/media/telekanal-cnn-peremestil-gorod-slavyansk-v-krym', - 'md5': '801eef0c2a9f4089fa04e4fe3533abdc', + 'md5': '9d10320ad473444352f72f746ccb8b8c', 'info_dict': { 'id': '728e0ef2-e187-4012-bac0-5a081fdcb1f6', 'ext': 'mp4', @@ -25,84 +30,45 @@ class NewstubeIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) - page = self._download_webpage(url, video_id, 'Downloading page') + page = self._download_webpage(url, video_id) + title = self._html_search_meta(['og:title', 'twitter:title'], page, fatal=True) video_guid = self._html_search_regex( - r'([^<>]+)', page, 'news title', default=None) or - self._html_search_meta('description', page, 'news title')) + return (self._html_search_regex(r'

([^<>]+)

', page, 'news title', default=None) + or self._html_search_meta('description', page, 'news title')) def _fetch_thumbnail(self, page): return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False) diff --git a/youtube_dl/extractor/nfb.py b/youtube_dl/extractor/nfb.py deleted file mode 100644 index adcc636bc..000000000 --- a/youtube_dl/extractor/nfb.py +++ /dev/null @@ -1,112 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - clean_html, - determine_ext, - int_or_none, - qualities, - urlencode_postdata, - xpath_text, -) - - -class NFBIE(InfoExtractor): - IE_NAME = 'nfb' - IE_DESC = 'National Film Board of Canada' - _VALID_URL = r'https?://(?:www\.)?(?:nfb|onf)\.ca/film/(?P[\da-z_-]+)' - - _TEST = { - 'url': 'https://www.nfb.ca/film/qallunaat_why_white_people_are_funny', - 'info_dict': { - 'id': 'qallunaat_why_white_people_are_funny', - 'ext': 'flv', - 'title': 'Qallunaat! Why White People Are Funny ', - 'description': 'md5:6b8e32dde3abf91e58857b174916620c', - 'duration': 3128, - 'creator': 'Mark Sandiford', - 'uploader': 'Mark Sandiford', - }, - 'params': { - # rtmp download - 'skip_download': True, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - config = self._download_xml( - 'https://www.nfb.ca/film/%s/player_config' % video_id, - video_id, 'Downloading player config XML', - data=urlencode_postdata({'getConfig': 'true'}), - headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - 'X-NFB-Referer': 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf' - }) - - title, description, thumbnail, duration, uploader, author = [None] * 6 - thumbnails, formats = [[]] * 2 - subtitles = {} - - for media in config.findall('./player/stream/media'): - if media.get('type') == 'posterImage': - quality_key = qualities(('low', 'high')) - thumbnails = [] - for asset in media.findall('assets/asset'): - asset_url = xpath_text(asset, 'default/url', default=None) - if not asset_url: - continue - quality = asset.get('quality') - thumbnails.append({ - 'url': asset_url, - 'id': quality, - 'preference': quality_key(quality), - }) - elif media.get('type') == 'video': - title = xpath_text(media, 'title', fatal=True) - for asset in media.findall('assets/asset'): - quality = asset.get('quality') - height = int_or_none(self._search_regex( - r'^(\d+)[pP]$', quality or '', 'height', default=None)) - for node in asset: - streamer = xpath_text(node, 'streamerURI', default=None) - if not streamer: - continue - play_path = xpath_text(node, 'url', default=None) - if not play_path: - continue - formats.append({ - 'url': streamer, - 'app': streamer.split('/', 3)[3], - 'play_path': play_path, - 'rtmp_live': False, - 'ext': 'flv', - 'format_id': '%s-%s' % (node.tag, quality) if quality else node.tag, - 'height': height, - }) - self._sort_formats(formats) - description = clean_html(xpath_text(media, 'description')) - uploader = xpath_text(media, 'author') - duration = int_or_none(media.get('duration')) - for subtitle in media.findall('./subtitles/subtitle'): - subtitle_url = xpath_text(subtitle, 'url', default=None) - if not subtitle_url: - continue - lang = xpath_text(subtitle, 'lang', default='en') - subtitles.setdefault(lang, []).append({ - 'url': subtitle_url, - 'ext': (subtitle.get('format') or determine_ext(subtitle_url)).lower(), - }) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnails': thumbnails, - 'duration': duration, - 'creator': uploader, - 'uploader': uploader, - 'formats': formats, - 'subtitles': subtitles, - } diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index 5c8cd76dc..241412f98 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -1,51 +1,81 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..utils import ExtractorError class NhkVodIE(InfoExtractor): - _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/en/vod/(?P[^/]+/[^/?#&]+)' - _TEST = { - # Videos available only for a limited period of time. Visit - # http://www3.nhk.or.jp/nhkworld/en/vod/ for working samples. - 'url': 'http://www3.nhk.or.jp/nhkworld/en/vod/tokyofashion/20160815', - 'info_dict': { - 'id': 'A1bnNiNTE6nY3jLllS-BIISfcC_PpvF5', - 'ext': 'flv', - 'title': 'TOKYO FASHION EXPRESS - The Kimono as Global Fashion', - 'description': 'md5:db338ee6ce8204f415b754782f819824', - 'series': 'TOKYO FASHION EXPRESS', - 'episode': 'The Kimono as Global Fashion', - }, - 'skip': 'Videos available only for a limited period of time', - } - _API_URL = 'http://api.nhk.or.jp/nhkworld/vodesdlist/v1/all/all/all.json?apikey=EJfK8jdS57GqlupFgAfAAwr573q01y6k' + _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P[a-z]{2})/ondemand/(?Pvideo|audio)/(?P\d{7}|[a-z]+-\d{8}-\d+)' + # Content available only for a limited period of time. Visit + # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. + _TESTS = [{ + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/', + 'only_matching': True, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/plugin-20190404-1/', + 'only_matching': True, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/', + 'only_matching': True, + }] + _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sodesdlist/v7/episode/%s/%s/all%s.json' def _real_extract(self, url): - video_id = self._match_id(url) - - data = self._download_json(self._API_URL, video_id) - - try: - episode = next( - e for e in data['data']['episodes'] - if e.get('url') and video_id in e['url']) - except StopIteration: - raise ExtractorError('Unable to find episode') - - embed_code = episode['vod_id'] + lang, m_type, episode_id = re.match(self._VALID_URL, url).groups() + if episode_id.isdigit(): + episode_id = episode_id[:4] + '-' + episode_id[4:] + is_video = m_type == 'video' + episode = self._download_json( + self._API_URL_TEMPLATE % ('v' if is_video else 'r', episode_id, lang, '/all' if is_video else ''), + episode_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'][0] title = episode.get('sub_title_clean') or episode['sub_title'] - description = episode.get('description_clean') or episode.get('description') - series = episode.get('title_clean') or episode.get('title') - return { - '_type': 'url_transparent', - 'ie_key': 'Ooyala', - 'url': 'ooyala:%s' % embed_code, + def get_clean_field(key): + return episode.get(key + '_clean') or episode.get(key) + + series = get_clean_field('title') + + thumbnails = [] + for s, w, h in [('', 640, 360), ('_l', 1280, 720)]: + img_path = episode.get('image' + s) + if not img_path: + continue + thumbnails.append({ + 'id': '%dp' % h, + 'height': h, + 'width': w, + 'url': 'https://www3.nhk.or.jp' + img_path, + }) + + info = { + 'id': episode_id + '-' + lang, 'title': '%s - %s' % (series, title) if series and title else title, - 'description': description, + 'description': get_clean_field('description'), + 'thumbnails': thumbnails, 'series': series, 'episode': title, } + if is_video: + info.update({ + '_type': 'url_transparent', + 'ie_key': 'Ooyala', + 'url': 'ooyala:' + episode['vod_id'], + }) + else: + audio = episode['audio'] + audio_path = audio['audio'] + info['formats'] = self._extract_m3u8_formats( + 'https://nhks-vh.akamaihd.net/i%s/master.m3u8' % audio_path, + episode_id, 'm4a', m3u8_id='hls', fatal=False) + for proto in ('rtmpt', 'rtmp'): + info['formats'].append({ + 'ext': 'flv', + 'format_id': proto, + 'url': '%s://flv.nhk.or.jp/ondemand/mp4:flv%s' % (proto, audio_path), + 'vcodec': 'none', + }) + for f in info['formats']: + f['language'] = lang + return info diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py index cf440f713..eddfe1f37 100644 --- a/youtube_dl/extractor/nhl.py +++ b/youtube_dl/extractor/nhl.py @@ -108,7 +108,7 @@ class NHLIE(NHLBaseIE): 'timestamp': 1454544904, }, }, { - # Some m3u8 URLs are invalid (https://github.com/rg3/youtube-dl/issues/10713) + # Some m3u8 URLs are invalid (https://github.com/ytdl-org/youtube-dl/issues/10713) 'url': 'https://www.nhl.com/predators/video/poile-laviolette-on-subban-trade/t-277437416/c-44315003', 'md5': '50b2bb47f405121484dda3ccbea25459', 'info_dict': { diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index 5e34d776b..2e8b302ac 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -85,7 +85,8 @@ class NickBrIE(MTVServicesInfoExtractor): https?:// (?: (?P(?:www\.)?nickjr|mundonick\.uol)\.com\.br| - (?:www\.)?nickjr\.[a-z]{2} + (?:www\.)?nickjr\.[a-z]{2}| + (?:www\.)?nickelodeonjunior\.fr ) /(?:programas/)?[^/]+/videos/(?:episodios/)?(?P[^/?\#.]+) ''' @@ -101,6 +102,9 @@ class NickBrIE(MTVServicesInfoExtractor): }, { 'url': 'http://www.nickjr.de/blaze-und-die-monster-maschinen/videos/f6caaf8f-e4e8-4cc1-b489-9380d6dcd059/', 'only_matching': True, + }, { + 'url': 'http://www.nickelodeonjunior.fr/paw-patrol-la-pat-patrouille/videos/episode-401-entier-paw-patrol/', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 76b412ff1..eb07ca776 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -369,14 +369,14 @@ class NiconicoIE(InfoExtractor): video_detail = watch_api_data.get('videoDetail', {}) thumbnail = ( - get_video_info(['thumbnail_url', 'thumbnailURL']) or - self._html_search_meta('image', webpage, 'thumbnail', default=None) or - video_detail.get('thumbnail')) + get_video_info(['thumbnail_url', 'thumbnailURL']) + or self._html_search_meta('image', webpage, 'thumbnail', default=None) + or video_detail.get('thumbnail')) description = get_video_info('description') - timestamp = (parse_iso8601(get_video_info('first_retrieve')) or - unified_timestamp(get_video_info('postedDateTime'))) + timestamp = (parse_iso8601(get_video_info('first_retrieve')) + or unified_timestamp(get_video_info('postedDateTime'))) if not timestamp: match = self._html_search_meta('datePublished', webpage, 'date published', default=None) if match: @@ -395,9 +395,9 @@ class NiconicoIE(InfoExtractor): view_count = int_or_none(match.replace(',', '')) view_count = view_count or video_detail.get('viewCount') - comment_count = (int_or_none(get_video_info('comment_num')) or - video_detail.get('commentCount') or - try_get(api_data, lambda x: x['thread']['commentCount'])) + comment_count = (int_or_none(get_video_info('comment_num')) + or video_detail.get('commentCount') + or try_get(api_data, lambda x: x['thread']['commentCount'])) if not comment_count: match = self._html_search_regex( r'>Comments: ]*>([^<]+)', @@ -406,11 +406,11 @@ class NiconicoIE(InfoExtractor): comment_count = int_or_none(match.replace(',', '')) duration = (parse_duration( - get_video_info('length') or - self._html_search_meta( - 'video:duration', webpage, 'video duration', default=None)) or - video_detail.get('length') or - get_video_info('duration')) + get_video_info('length') + or self._html_search_meta( + 'video:duration', webpage, 'video duration', default=None)) + or video_detail.get('length') + or get_video_info('duration')) webpage_url = get_video_info('watch_url') or url diff --git a/youtube_dl/extractor/ninenow.py b/youtube_dl/extractor/ninenow.py index f32f530f7..6157dc7c1 100644 --- a/youtube_dl/extractor/ninenow.py +++ b/youtube_dl/extractor/ninenow.py @@ -45,7 +45,11 @@ class NineNowIE(InfoExtractor): webpage = self._download_webpage(url, display_id) page_data = self._parse_json(self._search_regex( r'window\.__data\s*=\s*({.*?});', webpage, - 'page data'), display_id) + 'page data', default='{}'), display_id, fatal=False) + if not page_data: + page_data = self._parse_json(self._parse_json(self._search_regex( + r'window\.__data\s*=\s*JSON\.parse\s*\(\s*(".+?")\s*\)\s*;', + webpage, 'page data'), display_id), display_id) for kind in ('episode', 'clip'): current_key = page_data.get(kind, {}).get( diff --git a/youtube_dl/extractor/njpwworld.py b/youtube_dl/extractor/njpwworld.py index febef097a..025c5d249 100644 --- a/youtube_dl/extractor/njpwworld.py +++ b/youtube_dl/extractor/njpwworld.py @@ -31,6 +31,8 @@ class NJPWWorldIE(InfoExtractor): 'skip': 'Requires login', } + _LOGIN_URL = 'https://front.njpwworld.com/auth/login' + def _real_initialize(self): self._login() @@ -40,13 +42,17 @@ class NJPWWorldIE(InfoExtractor): if not username: return True + # Setup session (will set necessary cookies) + self._request_webpage( + 'https://njpwworld.com/', None, note='Setting up session') + webpage, urlh = self._download_webpage_handle( - 'https://njpwworld.com/auth/login', None, + self._LOGIN_URL, None, note='Logging in', errnote='Unable to login', data=urlencode_postdata({'login_id': username, 'pw': password}), - headers={'Referer': 'https://njpwworld.com/auth'}) + headers={'Referer': 'https://front.njpwworld.com/auth'}) # /auth/login will return 302 for successful logins - if urlh.geturl() == 'https://njpwworld.com/auth/login': + if urlh.geturl() == self._LOGIN_URL: self.report_warning('unable to login') return False diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index 58b371ed7..30df905af 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -115,7 +115,7 @@ class NocoIE(InfoExtractor): # Timestamp adjustment offset between server time and local time # must be calculated in order to use timestamps closest to server's - # in all API requests (see https://github.com/rg3/youtube-dl/issues/7864) + # in all API requests (see https://github.com/ytdl-org/youtube-dl/issues/7864) webpage = self._download_webpage(url, video_id) player_url = self._search_regex( diff --git a/youtube_dl/extractor/noovo.py b/youtube_dl/extractor/noovo.py index 974de3c3e..b40770d07 100644 --- a/youtube_dl/extractor/noovo.py +++ b/youtube_dl/extractor/noovo.py @@ -57,7 +57,8 @@ class NoovoIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - bc_url = BrightcoveNewIE._extract_url(self, webpage) + brightcove_id = self._search_regex( + r'data-video-id=["\'](\d+)', webpage, 'brightcove id') data = self._parse_json( self._search_regex( @@ -89,7 +90,10 @@ class NoovoIE(InfoExtractor): return { '_type': 'url_transparent', 'ie_key': BrightcoveNewIE.ie_key(), - 'url': smuggle_url(bc_url, {'geo_countries': ['CA']}), + 'url': smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + {'geo_countries': ['CA']}), + 'id': brightcove_id, 'title': title, 'description': description, 'series': series, diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 80186ec50..901f44b54 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -35,7 +35,7 @@ class NovaEmbedIE(InfoExtractor): bitrates = self._parse_json( self._search_regex( - r'(?s)bitrates\s*=\s*({.+?})\s*;', webpage, 'formats'), + r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'), video_id, transform_source=js_to_json) QUALITIES = ('lq', 'mq', 'hq', 'hd') diff --git a/youtube_dl/extractor/novamov.py b/youtube_dl/extractor/novamov.py deleted file mode 100644 index 829c71960..000000000 --- a/youtube_dl/extractor/novamov.py +++ /dev/null @@ -1,212 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - ExtractorError, - NO_DEFAULT, - sanitized_Request, - urlencode_postdata, -) - - -class NovaMovIE(InfoExtractor): - IE_NAME = 'novamov' - IE_DESC = 'NovaMov' - - _VALID_URL_TEMPLATE = r'''(?x) - http:// - (?: - (?:www\.)?%(host)s/(?:file|video|mobile/\#/videos)/| - (?:(?:embed|www)\.)%(host)s/embed(?:\.php|/)?\?(?:.*?&)?\bv= - ) - (?P[a-z\d]{13}) - ''' - _VALID_URL = _VALID_URL_TEMPLATE % {'host': r'novamov\.com'} - - _HOST = 'www.novamov.com' - - _FILE_DELETED_REGEX = r'This file no longer exists on our servers!' - _FILEKEY_REGEX = r'flashvars\.filekey=(?P"?[^"]+"?);' - _TITLE_REGEX = r'(?s)
\s*

([^<]+)

' - _DESCRIPTION_REGEX = r'(?s)
\s*

[^<]+

([^<]+)

' - _URL_TEMPLATE = 'http://%s/video/%s' - - _TEST = None - - def _check_existence(self, webpage, video_id): - if re.search(self._FILE_DELETED_REGEX, webpage) is not None: - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - - def _real_extract(self, url): - video_id = self._match_id(url) - - url = self._URL_TEMPLATE % (self._HOST, video_id) - - webpage = self._download_webpage( - url, video_id, 'Downloading video page') - - self._check_existence(webpage, video_id) - - def extract_filekey(default=NO_DEFAULT): - filekey = self._search_regex( - self._FILEKEY_REGEX, webpage, 'filekey', default=default) - if filekey is not default and (filekey[0] != '"' or filekey[-1] != '"'): - return self._search_regex( - r'var\s+%s\s*=\s*"([^"]+)"' % re.escape(filekey), webpage, 'filekey', default=default) - else: - return filekey - - filekey = extract_filekey(default=None) - - if not filekey: - fields = self._hidden_inputs(webpage) - post_url = self._search_regex( - r']+action=(["\'])(?P.+?)\1', webpage, - 'post url', default=url, group='url') - if not post_url.startswith('http'): - post_url = compat_urlparse.urljoin(url, post_url) - request = sanitized_Request( - post_url, urlencode_postdata(fields)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - request.add_header('Referer', post_url) - webpage = self._download_webpage( - request, video_id, 'Downloading continue to the video page') - self._check_existence(webpage, video_id) - - filekey = extract_filekey() - - title = self._html_search_regex(self._TITLE_REGEX, webpage, 'title') - description = self._html_search_regex(self._DESCRIPTION_REGEX, webpage, 'description', default='', fatal=False) - - api_response = self._download_webpage( - 'http://%s/api/player.api.php?key=%s&file=%s' % (self._HOST, filekey, video_id), video_id, - 'Downloading video api response') - - response = compat_urlparse.parse_qs(api_response) - - if 'error_msg' in response: - raise ExtractorError('%s returned error: %s' % (self.IE_NAME, response['error_msg'][0]), expected=True) - - video_url = response['url'][0] - - return { - 'id': video_id, - 'url': video_url, - 'title': title, - 'description': description - } - - -class WholeCloudIE(NovaMovIE): - IE_NAME = 'wholecloud' - IE_DESC = 'WholeCloud' - - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': r'(?:wholecloud\.net|movshare\.(?:net|sx|ag))'} - - _HOST = 'www.wholecloud.net' - - _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' - _TITLE_REGEX = r'Title: ([^<]+)

' - _DESCRIPTION_REGEX = r'Description: ([^<]+)

' - - _TEST = { - 'url': 'http://www.wholecloud.net/video/559e28be54d96', - 'md5': 'abd31a2132947262c50429e1d16c1bfd', - 'info_dict': { - 'id': '559e28be54d96', - 'ext': 'flv', - 'title': 'dissapeared image', - 'description': 'optical illusion dissapeared image magic illusion', - } - } - - -class NowVideoIE(NovaMovIE): - IE_NAME = 'nowvideo' - IE_DESC = 'NowVideo' - - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': r'nowvideo\.(?:to|ch|ec|sx|eu|at|ag|co|li)'} - - _HOST = 'www.nowvideo.to' - - _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' - _TITLE_REGEX = r'

([^<]+)

' - _DESCRIPTION_REGEX = r'\s*

([^<]+)

' - - _TEST = { - 'url': 'http://www.nowvideo.sx/video/f1d6fce9a968b', - 'md5': '12c82cad4f2084881d8bc60ee29df092', - 'info_dict': { - 'id': 'f1d6fce9a968b', - 'ext': 'flv', - 'title': 'youtubedl test video BaWjenozKc', - 'description': 'Description', - }, - } - - -class VideoWeedIE(NovaMovIE): - IE_NAME = 'videoweed' - IE_DESC = 'VideoWeed' - - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': r'videoweed\.(?:es|com)'} - - _HOST = 'www.videoweed.es' - - _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' - _TITLE_REGEX = r'

([^<]+)

' - _URL_TEMPLATE = 'http://%s/file/%s' - - _TEST = { - 'url': 'http://www.videoweed.es/file/b42178afbea14', - 'md5': 'abd31a2132947262c50429e1d16c1bfd', - 'info_dict': { - 'id': 'b42178afbea14', - 'ext': 'flv', - 'title': 'optical illusion dissapeared image magic illusion', - 'description': '' - }, - } - - -class CloudTimeIE(NovaMovIE): - IE_NAME = 'cloudtime' - IE_DESC = 'CloudTime' - - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': r'cloudtime\.to'} - - _HOST = 'www.cloudtime.to' - - _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' - _TITLE_REGEX = r']+class=["\']video_det["\'][^>]*>\s*([^<]+)' - - _TEST = None - - -class AuroraVidIE(NovaMovIE): - IE_NAME = 'auroravid' - IE_DESC = 'AuroraVid' - - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': r'auroravid\.to'} - - _HOST = 'www.auroravid.to' - - _FILE_DELETED_REGEX = r'This file no longer exists on our servers!<' - - _TESTS = [{ - 'url': 'http://www.auroravid.to/video/4rurhn9x446jj', - 'md5': '7205f346a52bbeba427603ba10d4b935', - 'info_dict': { - 'id': '4rurhn9x446jj', - 'ext': 'flv', - 'title': 'search engine optimization', - 'description': 'search engine optimization is used to rank the web page in the google search engine' - }, - 'skip': '"Invalid token" errors abound (in web interface as well as youtube-dl, there is nothing we can do about it.)' - }, { - 'url': 'http://www.auroravid.to/embed/?v=4rurhn9x446jj', - 'only_matching': True, - }] diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index c2cb85a73..e525ad928 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -12,11 +12,16 @@ from ..utils import ( ExtractorError, fix_xml_ampersands, int_or_none, + merge_dicts, orderedSet, parse_duration, qualities, + str_or_none, strip_jsonp, unified_strdate, + unified_timestamp, + url_or_none, + urlencode_postdata, ) @@ -176,9 +181,122 @@ class NPOIE(NPOBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - return self._get_info(video_id) + return self._get_info(url, video_id) or self._get_old_info(video_id) - def _get_info(self, video_id): + def _get_info(self, url, video_id): + token = self._download_json( + 'https://www.npostart.nl/api/token', video_id, + 'Downloading token', headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + })['token'] + + player = self._download_json( + 'https://www.npostart.nl/player/%s' % video_id, video_id, + 'Downloading player JSON', data=urlencode_postdata({ + 'autoplay': 0, + 'share': 1, + 'pageUrl': url, + 'hasAdConsent': 0, + '_token': token, + })) + + player_token = player['token'] + + drm = False + format_urls = set() + formats = [] + for profile in ('hls', 'dash-widevine', 'dash-playready', 'smooth'): + streams = self._download_json( + 'https://start-player.npo.nl/video/%s/streams' % video_id, + video_id, 'Downloading %s profile JSON' % profile, fatal=False, + query={ + 'profile': profile, + 'quality': 'npo', + 'tokenId': player_token, + 'streamType': 'broadcast', + }) + if not streams: + continue + stream = streams.get('stream') + if not isinstance(stream, dict): + continue + stream_url = url_or_none(stream.get('src')) + if not stream_url or stream_url in format_urls: + continue + format_urls.add(stream_url) + if stream.get('protection') is not None or stream.get('keySystemOptions') is not None: + drm = True + continue + stream_type = stream.get('type') + stream_ext = determine_ext(stream_url) + if stream_type == 'application/dash+xml' or stream_ext == 'mpd': + formats.extend(self._extract_mpd_formats( + stream_url, video_id, mpd_id='dash', fatal=False)) + elif stream_type == 'application/vnd.apple.mpegurl' or stream_ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + elif re.search(r'\.isml?/Manifest', stream_url): + formats.extend(self._extract_ism_formats( + stream_url, video_id, ism_id='mss', fatal=False)) + else: + formats.append({ + 'url': stream_url, + }) + + if not formats: + if drm: + raise ExtractorError('This video is DRM protected.', expected=True) + return + + self._sort_formats(formats) + + info = { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } + + embed_url = url_or_none(player.get('embedUrl')) + if embed_url: + webpage = self._download_webpage( + embed_url, video_id, 'Downloading embed page', fatal=False) + if webpage: + video = self._parse_json( + self._search_regex( + r'\bvideo\s*=\s*({.+?})\s*;', webpage, 'video', + default='{}'), video_id) + if video: + title = video.get('episodeTitle') + subtitles = {} + subtitles_list = video.get('subtitles') + if isinstance(subtitles_list, list): + for cc in subtitles_list: + cc_url = url_or_none(cc.get('src')) + if not cc_url: + continue + lang = str_or_none(cc.get('language')) or 'nl' + subtitles.setdefault(lang, []).append({ + 'url': cc_url, + }) + return merge_dicts({ + 'title': title, + 'description': video.get('description'), + 'thumbnail': url_or_none( + video.get('still_image_url') or video.get('orig_image_url')), + 'duration': int_or_none(video.get('duration')), + 'timestamp': unified_timestamp(video.get('broadcastDate')), + 'creator': video.get('channel'), + 'series': video.get('title'), + 'episode': title, + 'episode_number': int_or_none(video.get('episodeNumber')), + 'subtitles': subtitles, + }, info) + + return info + + def _get_old_info(self, video_id): metadata = self._download_json( 'http://e.omroep.nl/metadata/%s' % video_id, video_id, @@ -280,7 +398,7 @@ class NPOIE(NPOBaseIE): # JSON else: video_url = stream_info.get('url') - if not video_url or video_url in urls: + if not video_url or 'vodnotavailable.' in video_url or video_url in urls: continue urls.add(video_url) if determine_ext(video_url) == 'm3u8': @@ -363,7 +481,7 @@ class NPOIE(NPOBaseIE): class NPOLiveIE(NPOBaseIE): IE_NAME = 'npo.nl:live' - _VALID_URL = r'https?://(?:www\.)?npo\.nl/live(?:/(?P[^/?#&]+))?' + _VALID_URL = r'https?://(?:www\.)?npo(?:start)?\.nl/live(?:/(?P[^/?#&]+))?' _TESTS = [{ 'url': 'http://www.npo.nl/live/npo-1', @@ -380,6 +498,9 @@ class NPOLiveIE(NPOBaseIE): }, { 'url': 'http://www.npo.nl/live', 'only_matching': True, + }, { + 'url': 'https://www.npostart.nl/live/npo-1', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/npr.py b/youtube_dl/extractor/npr.py index 1777aa10b..a5e8baa7e 100644 --- a/youtube_dl/extractor/npr.py +++ b/youtube_dl/extractor/npr.py @@ -1,7 +1,6 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlencode from ..utils import ( int_or_none, qualities, @@ -9,16 +8,16 @@ from ..utils import ( class NprIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?npr\.org/player/v2/mediaPlayer\.html\?.*\bid=(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?npr\.org/(?:sections/[^/]+/)?\d{4}/\d{2}/\d{2}/(?P\d+)' _TESTS = [{ - 'url': 'http://www.npr.org/player/v2/mediaPlayer.html?id=449974205', + 'url': 'https://www.npr.org/sections/allsongs/2015/10/21/449974205/new-music-from-beach-house-chairlift-cmj-discoveries-and-more', 'info_dict': { 'id': '449974205', 'title': 'New Music From Beach House, Chairlift, CMJ Discoveries And More' }, 'playlist_count': 7, }, { - 'url': 'http://www.npr.org/player/v2/mediaPlayer.html?action=1&t=1&islist=false&id=446928052&m=446929930&live=1', + 'url': 'https://www.npr.org/sections/deceptivecadence/2015/10/09/446928052/music-from-the-shadows-ancient-armenian-hymns-and-piano-jazz', 'info_dict': { 'id': '446928052', 'title': "Songs We Love: Tigran Hamasyan, 'Your Mercy is Boundless'" @@ -32,30 +31,46 @@ class NprIE(InfoExtractor): 'duration': 402, }, }], + }, { + # mutlimedia, not media title + 'url': 'https://www.npr.org/2017/06/19/533198237/tigers-jaw-tiny-desk-concert', + 'info_dict': { + 'id': '533198237', + 'title': 'Tigers Jaw: Tiny Desk Concert', + }, + 'playlist': [{ + 'md5': '12fa60cb2d3ed932f53609d4aeceabf1', + 'info_dict': { + 'id': '533201718', + 'ext': 'mp4', + 'title': 'Tigers Jaw: Tiny Desk Concert', + 'duration': 402, + }, + }], + 'expected_warnings': ['Failed to download m3u8 information'], }] def _real_extract(self, url): playlist_id = self._match_id(url) - config = self._download_json( - 'http://api.npr.org/query?%s' % compat_urllib_parse_urlencode({ + story = self._download_json( + 'http://api.npr.org/query', playlist_id, query={ 'id': playlist_id, - 'fields': 'titles,audio,show', + 'fields': 'audio,multimedia,title', 'format': 'json', 'apiKey': 'MDAzMzQ2MjAyMDEyMzk4MTU1MDg3ZmM3MQ010', - }), playlist_id) + })['list']['story'][0] + playlist_title = story.get('title', {}).get('$text') - story = config['list']['story'][0] - - KNOWN_FORMATS = ('threegp', 'mp4', 'mp3') + KNOWN_FORMATS = ('threegp', 'm3u8', 'smil', 'mp4', 'mp3') quality = qualities(KNOWN_FORMATS) entries = [] - for audio in story.get('audio', []): - title = audio.get('title', {}).get('$text') - duration = int_or_none(audio.get('duration', {}).get('$text')) + for media in story.get('audio', []) + story.get('multimedia', []): + media_id = media['id'] + formats = [] - for format_id, formats_entry in audio.get('format', {}).items(): + for format_id, formats_entry in media.get('format', {}).items(): if not formats_entry: continue if isinstance(formats_entry, list): @@ -64,19 +79,30 @@ class NprIE(InfoExtractor): if not format_url: continue if format_id in KNOWN_FORMATS: - formats.append({ - 'url': format_url, - 'format_id': format_id, - 'ext': formats_entry.get('type'), - 'quality': quality(format_id), - }) + if format_id == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, media_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif format_id == 'smil': + smil_formats = self._extract_smil_formats( + format_url, media_id, transform_source=lambda s: s.replace( + 'rtmp://flash.npr.org/ondemand/', 'https://ondemand.npr.org/')) + self._check_formats(smil_formats, media_id) + formats.extend(smil_formats) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'quality': quality(format_id), + }) self._sort_formats(formats) + entries.append({ - 'id': audio['id'], - 'title': title, - 'duration': duration, + 'id': media_id, + 'title': media.get('title', {}).get('$text') or playlist_title, + 'thumbnail': media.get('altImageUrl', {}).get('$text'), + 'duration': int_or_none(media.get('duration', {}).get('$text')), 'formats': formats, }) - playlist_title = story.get('title', {}).get('$text') return self.playlist_result(entries, playlist_id, playlist_title) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index a231735fb..5f43e692f 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -45,8 +45,8 @@ class NRKBaseIE(InfoExtractor): entries = [] conviva = data.get('convivaStatistics') or {} - live = (data.get('mediaElementType') == 'Live' or - data.get('isLive') is True or conviva.get('isLive')) + live = (data.get('mediaElementType') == 'Live' + or data.get('isLive') is True or conviva.get('isLive')) def make_title(t): return self._live_title(t) if live else t @@ -211,13 +211,13 @@ class NRKIE(NRKBaseIE): _TESTS = [{ # video 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': '2f7f6eeb2aacdd99885f355428715cfa', + 'md5': '706f34cdf1322577589e369e522b50ef', 'info_dict': { 'id': '150533', 'ext': 'mp4', 'title': 'Dompap og andre fugler i Piip-Show', 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', - 'duration': 263, + 'duration': 262, } }, { # audio @@ -248,7 +248,7 @@ class NRKTVIE(NRKBaseIE): _VALID_URL = r'''(?x) https?:// (?:tv|radio)\.nrk(?:super)?\.no/ - (?:serie/[^/]+|program)/ + (?:serie(?:/[^/]+){1,2}|program)/ (?![Ee]pisodes)%s (?:/\d{2}-\d{2}-\d{4})? (?:\#del=(?P\d+))? @@ -256,14 +256,14 @@ class NRKTVIE(NRKBaseIE): _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') _TESTS = [{ 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', - 'md5': '4e9ca6629f09e588ed240fb11619922a', + 'md5': '9a167e54d04671eb6317a37b7bc8a280', 'info_dict': { 'id': 'MUHH48000314AA', 'ext': 'mp4', 'title': '20 spørsmål 23.05.2014', 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', 'duration': 1741, - 'series': '20 spørsmål - TV', + 'series': '20 spørsmål', 'episode': '23.05.2014', }, }, { @@ -301,7 +301,7 @@ class NRKTVIE(NRKBaseIE): 'id': 'MSPO40010515AH', 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)', - 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', 'duration': 772, 'series': 'Tour de Ski', 'episode': '06.01.2015', @@ -314,7 +314,7 @@ class NRKTVIE(NRKBaseIE): 'id': 'MSPO40010515BH', 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)', - 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', 'duration': 6175, 'series': 'Tour de Ski', 'episode': '06.01.2015', @@ -326,7 +326,7 @@ class NRKTVIE(NRKBaseIE): 'info_dict': { 'id': 'MSPO40010515', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', - 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', }, 'expected_warnings': ['Video is geo restricted'], }, { @@ -362,6 +362,9 @@ class NRKTVIE(NRKBaseIE): }, { 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller', + 'only_matching': True, }] @@ -403,21 +406,35 @@ class NRKTVSerieBaseIE(InfoExtractor): def _extract_series(self, webpage, display_id, fatal=True): config = self._parse_json( self._search_regex( - r'({.+?})\s*,\s*"[^"]+"\s*\)\s*', webpage, 'config', - default='{}' if not fatal else NO_DEFAULT), + (r'INITIAL_DATA_*\s*=\s*({.+?})\s*;', + r'({.+?})\s*,\s*"[^"]+"\s*\)\s*'), + webpage, 'config', default='{}' if not fatal else NO_DEFAULT), display_id, fatal=False) if not config: return - return try_get(config, lambda x: x['series'], dict) + return try_get( + config, + (lambda x: x['initialState']['series'], lambda x: x['series']), + dict) + + def _extract_seasons(self, seasons): + if not isinstance(seasons, list): + return [] + entries = [] + for season in seasons: + entries.extend(self._extract_episodes(season)) + return entries def _extract_episodes(self, season): - entries = [] if not isinstance(season, dict): - return entries - episodes = season.get('episodes') - if not isinstance(episodes, list): - return entries - for episode in episodes: + return [] + return self._extract_entries(season.get('episodes')) + + def _extract_entries(self, entry_list): + if not isinstance(entry_list, list): + return [] + entries = [] + for episode in entry_list: nrk_id = episode.get('prfId') if not nrk_id or not isinstance(nrk_id, compat_str): continue @@ -462,7 +479,7 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P[^/]+)' _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P\d+)' _TESTS = [{ - # new layout + # new layout, seasons 'url': 'https://tv.nrk.no/serie/backstage', 'info_dict': { 'id': 'backstage', @@ -471,20 +488,21 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): }, 'playlist_mincount': 60, }, { - # old layout + # new layout, instalments 'url': 'https://tv.nrk.no/serie/groenn-glede', 'info_dict': { 'id': 'groenn-glede', 'title': 'Grønn glede', 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', }, - 'playlist_mincount': 9, + 'playlist_mincount': 10, }, { - 'url': 'http://tv.nrksuper.no/serie/labyrint', + # old layout + 'url': 'https://tv.nrksuper.no/serie/labyrint', 'info_dict': { 'id': 'labyrint', 'title': 'Labyrint', - 'description': 'md5:58afd450974c89e27d5a19212eee7115', + 'description': 'md5:318b597330fdac5959247c9b69fdb1ec', }, 'playlist_mincount': 3, }, { @@ -517,11 +535,12 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): description = try_get( series, lambda x: x['titles']['subtitle'], compat_str) entries = [] - for season in series['seasons']: - entries.extend(self._extract_episodes(season)) + entries.extend(self._extract_seasons(series.get('seasons'))) + entries.extend(self._extract_entries(series.get('instalments'))) + entries.extend(self._extract_episodes(series.get('extraMaterial'))) return self.playlist_result(entries, series_id, title, description) - # Old layout (e.g. https://tv.nrk.no/serie/groenn-glede) + # Old layout (e.g. https://tv.nrksuper.no/serie/labyrint) entries = [ self.url_result( 'https://tv.nrk.no/program/Episodes/{series}/{season}'.format( @@ -533,6 +552,9 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): 'seriestitle', webpage, 'title', default=None) or self._og_search_title( webpage, fatal=False) + if title: + title = self._search_regex( + r'NRK (?:Super )?TV\s*[-–]\s*(.+)', title, 'title', default=title) description = self._html_search_meta( 'series_description', webpage, @@ -593,7 +615,7 @@ class NRKPlaylistIE(NRKPlaylistBaseIE): 'title': 'Rivertonprisen til Karin Fossum', 'description': 'Første kvinne på 15 år til å vinne krimlitteraturprisen.', }, - 'playlist_count': 5, + 'playlist_count': 2, }] def _extract_title(self, webpage): diff --git a/youtube_dl/extractor/nrl.py b/youtube_dl/extractor/nrl.py new file mode 100644 index 000000000..798b91e04 --- /dev/null +++ b/youtube_dl/extractor/nrl.py @@ -0,0 +1,30 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class NRLTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?nrl\.com/tv(/[^/]+)*/(?P[^/?&#]+)' + _TEST = { + 'url': 'https://www.nrl.com/tv/news/match-highlights-titans-v-knights-862805/', + 'info_dict': { + 'id': 'YyNnFuaDE6kPJqlDhG4CGQ_w89mKTau4', + 'ext': 'mp4', + 'title': 'Match Highlights: Titans v Knights', + }, + 'params': { + # m3u8 download + 'skip_download': True, + 'format': 'bestvideo', + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + q_data = self._parse_json(self._search_regex( + r"(?s)q-data='({.+?})'", webpage, 'player data'), display_id) + ooyala_id = q_data['videoId'] + return self.url_result( + 'ooyala:' + ooyala_id, 'Ooyala', ooyala_id, q_data.get('title')) diff --git a/youtube_dl/extractor/ntvcojp.py b/youtube_dl/extractor/ntvcojp.py new file mode 100644 index 000000000..0c8221b22 --- /dev/null +++ b/youtube_dl/extractor/ntvcojp.py @@ -0,0 +1,49 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + js_to_json, + smuggle_url, +) + + +class NTVCoJpCUIE(InfoExtractor): + IE_NAME = 'cu.ntv.co.jp' + IE_DESC = 'Nippon Television Network' + _VALID_URL = r'https?://cu\.ntv\.co\.jp/(?!program)(?P[^/?&#]+)' + _TEST = { + 'url': 'https://cu.ntv.co.jp/televiva-chill-gohan_181031/', + 'info_dict': { + 'id': '5978891207001', + 'ext': 'mp4', + 'title': '桜エビと炒り卵がポイント! 「中華風 エビチリおにぎり」──『美虎』五十嵐美幸', + 'upload_date': '20181213', + 'description': 'md5:211b52f4fd60f3e0e72b68b0c6ba52a9', + 'uploader_id': '3855502814001', + 'timestamp': 1544669941, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + player_config = self._parse_json(self._search_regex( + r'(?s)PLAYER_CONFIG\s*=\s*({.+?})', + webpage, 'player config'), display_id, js_to_json) + video_id = player_config['videoId'] + account_id = player_config.get('account') or '3855502814001' + return { + '_type': 'url_transparent', + 'id': video_id, + 'display_id': display_id, + 'title': self._search_regex(r']+class="title"[^>]*>([^<]+)', webpage, 'title').strip(), + 'description': self._html_search_meta(['description', 'og:description'], webpage), + 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (account_id, video_id), {'geo_countries': ['JP']}), + 'ie_key': 'BrightcoveNew', + } diff --git a/youtube_dl/extractor/nzz.py b/youtube_dl/extractor/nzz.py index 2d352f53f..61ee77adb 100644 --- a/youtube_dl/extractor/nzz.py +++ b/youtube_dl/extractor/nzz.py @@ -11,20 +11,27 @@ from ..utils import ( class NZZIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?nzz\.ch/(?:[^/]+/)*[^/?#]+-ld\.(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.nzz.ch/zuerich/gymizyte/gymizyte-schreiben-schueler-heute-noch-diktate-ld.9153', 'info_dict': { 'id': '9153', }, 'playlist_mincount': 6, - } + }, { + 'url': 'https://www.nzz.ch/video/nzz-standpunkte/cvp-auf-der-suche-nach-dem-mass-der-mitte-ld.1368112', + 'info_dict': { + 'id': '1368112', + }, + 'playlist_count': 1, + }] def _real_extract(self, url): page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) entries = [] - for player_element in re.findall(r'(<[^>]+class="kalturaPlayer"[^>]*>)', webpage): + for player_element in re.findall( + r'(<[^>]+class="kalturaPlayer[^"]*"[^>]*>)', webpage): player_params = extract_attributes(player_element) if player_params.get('data-type') not in ('kaltura_singleArticle',): self.report_warning('Unsupported player type') diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 190d8af4d..114b93c07 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -115,6 +115,10 @@ class OdnoklassnikiIE(InfoExtractor): }, { 'url': 'https://m.ok.ru/dk?st.cmd=movieLayer&st.discId=863789452017&st.retLoc=friend&st.rtu=%2Fdk%3Fst.cmd%3DfriendMovies%26st.mode%3Down%26st.mrkId%3D%257B%2522uploadedMovieMarker%2522%253A%257B%2522marker%2522%253A%25221519410114503%2522%252C%2522hasMore%2522%253Atrue%257D%252C%2522sharedMovieMarker%2522%253A%257B%2522marker%2522%253Anull%252C%2522hasMore%2522%253Afalse%257D%257D%26st.friendId%3D561722190321%26st.frwd%3Don%26_prevCmd%3DfriendMovies%26tkn%3D7257&st.discType=MOVIE&st.mvId=863789452017&_prevCmd=friendMovies&tkn=3648#lst#', 'only_matching': True, + }, { + # Paid video + 'url': 'https://ok.ru/video/954886983203', + 'only_matching': True, }] def _real_extract(self, url): @@ -244,6 +248,11 @@ class OdnoklassnikiIE(InfoExtractor): 'ext': 'flv', }) + if not formats: + payment_info = metadata.get('paymentInfo') + if payment_info: + raise ExtractorError('This video is paid, subscribe to download it', expected=True) + self._sort_formats(formats) info['formats'] = formats diff --git a/youtube_dl/extractor/once.py b/youtube_dl/extractor/once.py index 8ae5fadd8..3e44b7829 100644 --- a/youtube_dl/extractor/once.py +++ b/youtube_dl/extractor/once.py @@ -21,7 +21,7 @@ class OnceIE(InfoExtractor): progressive_formats = [] for adaptive_format in formats: # Prevent advertisement from embedding into m3u8 playlist (see - # https://github.com/rg3/youtube-dl/issues/8893#issuecomment-199912684) + # https://github.com/ytdl-org/youtube-dl/issues/8893#issuecomment-199912684) adaptive_format['url'] = re.sub( r'\badsegmentlength=\d+', r'adsegmentlength=0', adaptive_format['url']) rendition_id = self._search_regex( diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index ad8bf03f8..995b24d1b 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -31,12 +31,12 @@ class OoyalaBaseIE(InfoExtractor): title = metadata['title'] auth_data = self._download_json( - self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code) + - compat_urllib_parse_urlencode({ + self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code) + + compat_urllib_parse_urlencode({ 'domain': domain, 'supportedFormats': supportedformats or 'mp4,rtmp,m3u8,hds,dash,smooth', 'embedToken': embed_token, - }), video_id) + }), video_id, headers=self.geo_verification_headers()) cur_auth_data = auth_data['authorization_data'][embed_code] diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index dc01b6346..679eaf6c3 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -42,9 +42,9 @@ def cookie_to_dict(cookie): if cookie.discard is not None: cookie_dict['discard'] = cookie.discard try: - if (cookie.has_nonstandard_attr('httpOnly') or - cookie.has_nonstandard_attr('httponly') or - cookie.has_nonstandard_attr('HttpOnly')): + if (cookie.has_nonstandard_attr('httpOnly') + or cookie.has_nonstandard_attr('httponly') + or cookie.has_nonstandard_attr('HttpOnly')): cookie_dict['httponly'] = True except TypeError: pass @@ -243,8 +243,26 @@ class PhantomJSwrapper(object): class OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz|win|download|cloud))/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' - + _DOMAINS = r''' + (?: + openload\.(?:co|io|link|pw)| + oload\.(?:tv|best|biz|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|press|pw|life|live|space|services|website|vip)| + oladblock\.(?:services|xyz|me)|openloed\.co + ) + ''' + _VALID_URL = r'''(?x) + https?:// + (?P + (?:www\.)? + %s + )/ + (?:f|embed)/ + (?P[a-zA-Z0-9-_]+) + ''' % _DOMAINS + _EMBED_WORD = 'embed' + _STREAM_WORD = 'f' + _REDIR_WORD = 'stream' + _URL_IDS = ('streamurl', 'streamuri', 'streamurj') _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', 'md5': 'bf1c059b004ebc7a256f89408e65c36e', @@ -314,29 +332,92 @@ class OpenloadIE(InfoExtractor): # Its title has not got its extension but url has it 'url': 'https://oload.download/f/N4Otkw39VCw/Tomb.Raider.2018.HDRip.XviD.AC3-EVO.avi.mp4', 'only_matching': True, + }, { + 'url': 'https://oload.cc/embed/5NEAbI2BDSk', + 'only_matching': True, + }, { + 'url': 'https://oload.icu/f/-_i4y_F_Hs8', + 'only_matching': True, + }, { + 'url': 'https://oload.fun/f/gb6G1H4sHXY', + 'only_matching': True, + }, { + 'url': 'https://oload.club/f/Nr1L-aZ2dbQ', + 'only_matching': True, + }, { + 'url': 'https://oload.info/f/5NEAbI2BDSk', + 'only_matching': True, + }, { + 'url': 'https://openload.pw/f/WyKgK8s94N0', + 'only_matching': True, + }, { + 'url': 'https://oload.pw/f/WyKgK8s94N0', + 'only_matching': True, + }, { + 'url': 'https://oload.live/f/-Z58UZ-GR4M', + 'only_matching': True, + }, { + 'url': 'https://oload.space/f/IY4eZSst3u8/', + 'only_matching': True, + }, { + 'url': 'https://oload.services/embed/bs1NWj1dCag/', + 'only_matching': True, + }, { + 'url': 'https://oload.press/embed/drTBl1aOTvk/', + 'only_matching': True, + }, { + 'url': 'https://oload.website/embed/drTBl1aOTvk/', + 'only_matching': True, + }, { + 'url': 'https://oload.life/embed/oOzZjNPw9Dc/', + 'only_matching': True, + }, { + 'url': 'https://oload.biz/f/bEk3Gp8ARr4/', + 'only_matching': True, + }, { + 'url': 'https://oload.best/embed/kkz9JgVZeWc/', + 'only_matching': True, + }, { + 'url': 'https://oladblock.services/f/b8NWEgkqNLI/', + 'only_matching': True, + }, { + 'url': 'https://oladblock.xyz/f/b8NWEgkqNLI/', + 'only_matching': True, + }, { + 'url': 'https://oladblock.me/f/b8NWEgkqNLI/', + 'only_matching': True, + }, { + 'url': 'https://openloed.co/f/b8NWEgkqNLI/', + 'only_matching': True, + }, { + 'url': 'https://oload.vip/f/kUEfGclsU9o', + 'only_matching': True, }] - _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' - - @staticmethod - def _extract_urls(webpage): + @classmethod + def _extract_urls(cls, webpage): return re.findall( - r']+src=["\']((?:https?://)?(?:openload\.(?:co|io)|oload\.tv)/embed/[a-zA-Z0-9-_]+)', - webpage) + r'(?x)]+src=["\']((?:https?://)?%s/%s/[a-zA-Z0-9-_]+)' + % (cls._DOMAINS, cls._EMBED_WORD), webpage) + + def _extract_decrypted_page(self, page_url, webpage, video_id): + phantom = PhantomJSwrapper(self, required_version='2.0') + webpage, _ = phantom.get(page_url, html=webpage, video_id=video_id) + return webpage def _real_extract(self, url): - video_id = self._match_id(url) - url_pattern = 'https://openload.co/%%s/%s/' % video_id - headers = { - 'User-Agent': self._USER_AGENT, - } + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + video_id = mobj.group('id') - for path in ('embed', 'f'): + url_pattern = 'https://%s/%%s/%s/' % (host, video_id) + + for path in (self._EMBED_WORD, self._STREAM_WORD): page_url = url_pattern % path - last = path == 'f' + last = path == self._STREAM_WORD webpage = self._download_webpage( page_url, video_id, 'Downloading %s webpage' % path, - headers=headers, fatal=last) + fatal=last) if not webpage: continue if 'File not found' in webpage or 'deleted by the owner' in webpage: @@ -345,21 +426,20 @@ class OpenloadIE(InfoExtractor): raise ExtractorError('File not found', expected=True, video_id=video_id) break - phantom = PhantomJSwrapper(self, required_version='2.0') - webpage, _ = phantom.get(page_url, html=webpage, video_id=video_id, headers=headers) - - decoded_id = (get_element_by_id('streamurl', webpage) or - get_element_by_id('streamuri', webpage) or - get_element_by_id('streamurj', webpage) or - self._search_regex( - (r'>\s*([\w-]+~\d{10,}~\d+\.\d+\.0\.0~[\w-]+)\s*<', - r'>\s*([\w~-]+~\d+\.\d+\.\d+\.\d+~[\w~-]+)', - r'>\s*([\w-]+~\d{10,}~(?:[a-f\d]+:){2}:~[\w-]+)\s*<', - r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)\s*<', - r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)'), webpage, - 'stream URL')) - - video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id + webpage = self._extract_decrypted_page(page_url, webpage, video_id) + for element_id in self._URL_IDS: + decoded_id = get_element_by_id(element_id, webpage) + if decoded_id: + break + if not decoded_id: + decoded_id = self._search_regex( + (r'>\s*([\w-]+~\d{10,}~\d+\.\d+\.0\.0~[\w-]+)\s*<', + r'>\s*([\w~-]+~\d+\.\d+\.\d+\.\d+~[\w~-]+)', + r'>\s*([\w-]+~\d{10,}~(?:[a-f\d]+:){2}:~[\w-]+)\s*<', + r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)\s*<', + r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)'), webpage, + 'stream URL') + video_url = 'https://%s/%s/%s?mime=true' % (host, self._REDIR_WORD, decoded_id) title = self._og_search_title(webpage, default=None) or self._search_regex( r']+class=["\']title["\'][^>]*>([^<]+)', webpage, @@ -370,13 +450,46 @@ class OpenloadIE(InfoExtractor): entry = entries[0] if entries else {} subtitles = entry.get('subtitles') - info_dict = { + return { 'id': video_id, 'title': title, 'thumbnail': entry.get('thumbnail') or self._og_search_thumbnail(webpage, default=None), 'url': video_url, 'ext': determine_ext(title, None) or determine_ext(url, 'mp4'), 'subtitles': subtitles, - 'http_headers': headers, } - return info_dict + + +class VerystreamIE(OpenloadIE): + IE_NAME = 'verystream' + + _DOMAINS = r'(?:verystream\.com|woof\.tube)' + _VALID_URL = r'''(?x) + https?:// + (?P + (?:www\.)? + %s + )/ + (?:stream|e)/ + (?P[a-zA-Z0-9-_]+) + ''' % _DOMAINS + _EMBED_WORD = 'e' + _STREAM_WORD = 'stream' + _REDIR_WORD = 'gettoken' + _URL_IDS = ('videolink', ) + _TESTS = [{ + 'url': 'https://verystream.com/stream/c1GWQ9ngBBx/', + 'md5': 'd3e8c5628ccb9970b65fd65269886795', + 'info_dict': { + 'id': 'c1GWQ9ngBBx', + 'ext': 'mp4', + 'title': 'Big Buck Bunny.mp4', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, { + 'url': 'https://verystream.com/e/c1GWQ9ngBBx/', + 'only_matching': True, + }] + + def _extract_decrypted_page(self, page_url, webpage, video_id): + return webpage # for Verystream, the webpage is already decrypted diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index c1fb580ca..499be0029 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -15,6 +15,7 @@ from ..utils import ( strip_jsonp, unescapeHTML, unified_strdate, + url_or_none, ) @@ -68,26 +69,35 @@ class ORFTVthekIE(InfoExtractor): webpage, 'playlist', group='json'), playlist_id, transform_source=unescapeHTML)['playlist']['videos'] - def quality_to_int(s): - m = re.search('([0-9]+)', s) - if m is None: - return -1 - return int(m.group(1)) - entries = [] for sd in data_jsb: video_id, title = sd.get('id'), sd.get('title') if not video_id or not title: continue video_id = compat_str(video_id) - formats = [{ - 'preference': -10 if fd['delivery'] == 'hls' else None, - 'format_id': '%s-%s-%s' % ( - fd['delivery'], fd['quality'], fd['quality_string']), - 'url': fd['src'], - 'protocol': fd['protocol'], - 'quality': quality_to_int(fd['quality']), - } for fd in sd['sources']] + formats = [] + for fd in sd['sources']: + src = url_or_none(fd.get('src')) + if not src: + continue + format_id_list = [] + for key in ('delivery', 'quality', 'quality_string'): + value = fd.get(key) + if value: + format_id_list.append(value) + format_id = '-'.join(format_id_list) + if determine_ext(fd['src']) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + fd['src'], video_id, 'mp4', m3u8_id=format_id)) + elif determine_ext(fd['src']) == 'f4m': + formats.extend(self._extract_f4m_formats( + fd['src'], video_id, f4m_id=format_id)) + else: + formats.append({ + 'format_id': format_id, + 'url': src, + 'protocol': fd.get('protocol'), + }) # Check for geoblocking. # There is a property is_geoprotection, but that's always false @@ -166,7 +176,8 @@ class ORFRadioIE(InfoExtractor): 'description': subtitle, 'duration': (info['end'] - info['start']) / 1000, 'timestamp': info['start'] / 1000, - 'ext': 'mp3' + 'ext': 'mp3', + 'series': data.get('programTitle') } entries = [extract_entry_dict(t, data['title'], data['subtitle']) for t in data['streams']] diff --git a/youtube_dl/extractor/outsidetv.py b/youtube_dl/extractor/outsidetv.py new file mode 100644 index 000000000..c5333b08c --- /dev/null +++ b/youtube_dl/extractor/outsidetv.py @@ -0,0 +1,28 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class OutsideTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?outsidetv\.com/(?:[^/]+/)*?play/[a-zA-Z0-9]{8}/\d+/\d+/(?P[a-zA-Z0-9]{8})' + _TESTS = [{ + 'url': 'http://www.outsidetv.com/category/snow/play/ZjQYboH6/1/10/Hdg0jukV/4', + 'md5': '192d968fedc10b2f70ec31865ffba0da', + 'info_dict': { + 'id': 'Hdg0jukV', + 'ext': 'mp4', + 'title': 'Home - Jackson Ep 1 | Arbor Snowboards', + 'description': 'md5:41a12e94f3db3ca253b04bb1e8d8f4cd', + 'upload_date': '20181225', + 'timestamp': 1545742800, + } + }, { + 'url': 'http://www.outsidetv.com/home/play/ZjQYboH6/1/10/Hdg0jukV/4', + 'only_matching': True, + }] + + def _real_extract(self, url): + jw_media_id = self._match_id(url) + return self.url_result( + 'jwplatform:' + jw_media_id, 'JWPlatform', jw_media_id) diff --git a/youtube_dl/extractor/packtpub.py b/youtube_dl/extractor/packtpub.py index 56a2a1083..11ad3b3b8 100644 --- a/youtube_dl/extractor/packtpub.py +++ b/youtube_dl/extractor/packtpub.py @@ -5,28 +5,29 @@ import re from .common import InfoExtractor from ..compat import ( - compat_str, + # compat_str, compat_HTTPError, ) from ..utils import ( clean_html, ExtractorError, - remove_end, + # remove_end, + str_or_none, strip_or_none, unified_timestamp, - urljoin, + # urljoin, ) class PacktPubBaseIE(InfoExtractor): - _PACKT_BASE = 'https://www.packtpub.com' - _MAPT_REST = '%s/mapt-rest' % _PACKT_BASE + # _PACKT_BASE = 'https://www.packtpub.com' + _STATIC_PRODUCTS_BASE = 'https://static.packt-cdn.com/products/' class PacktPubIE(PacktPubBaseIE): - _VALID_URL = r'https?://(?:www\.)?packtpub\.com/mapt/video/[^/]+/(?P\d+)/(?P\d+)/(?P\d+)' + _VALID_URL = r'https?://(?:(?:www\.)?packtpub\.com/mapt|subscription\.packtpub\.com)/video/[^/]+/(?P\d+)/(?P[^/]+)/(?P[^/]+)(?:/(?P[^/?&#]+))?' - _TEST = { + _TESTS = [{ 'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215/20528/20530/Project+Intro', 'md5': '1e74bd6cfd45d7d07666f4684ef58f70', 'info_dict': { @@ -37,7 +38,13 @@ class PacktPubIE(PacktPubBaseIE): 'timestamp': 1490918400, 'upload_date': '20170331', }, - } + }, { + 'url': 'https://subscription.packtpub.com/video/web_development/9781787122215/20528/20530/project-intro', + 'only_matching': True, + }, { + 'url': 'https://subscription.packtpub.com/video/programming/9781838988906/p1/video1_1/business-card-project', + 'only_matching': True, + }] _NETRC_MACHINE = 'packtpub' _TOKEN = None @@ -47,9 +54,9 @@ class PacktPubIE(PacktPubBaseIE): return try: self._TOKEN = self._download_json( - self._MAPT_REST + '/users/tokens', None, + 'https://services.packtpub.com/auth-v1/users/tokens', None, 'Downloading Authorization Token', data=json.dumps({ - 'email': username, + 'username': username, 'password': password, }).encode())['data']['access'] except ExtractorError as e: @@ -58,67 +65,57 @@ class PacktPubIE(PacktPubBaseIE): raise ExtractorError(message, expected=True) raise - def _handle_error(self, response): - if response.get('status') != 'success': - raise ExtractorError( - '% said: %s' % (self.IE_NAME, response['message']), - expected=True) - - def _download_json(self, *args, **kwargs): - response = super(PacktPubIE, self)._download_json(*args, **kwargs) - self._handle_error(response) - return response - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - course_id, chapter_id, video_id = mobj.group( - 'course_id', 'chapter_id', 'id') + course_id, chapter_id, video_id, display_id = re.match(self._VALID_URL, url).groups() headers = {} if self._TOKEN: headers['Authorization'] = 'Bearer ' + self._TOKEN - video = self._download_json( - '%s/users/me/products/%s/chapters/%s/sections/%s' - % (self._MAPT_REST, course_id, chapter_id, video_id), video_id, - 'Downloading JSON video', headers=headers)['data'] + try: + video_url = self._download_json( + 'https://services.packtpub.com/products-v1/products/%s/%s/%s' % (course_id, chapter_id, video_id), video_id, + 'Downloading JSON video', headers=headers)['data'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + self.raise_login_required('This video is locked') + raise - content = video.get('content') - if not content: - self.raise_login_required('This video is locked') + # TODO: find a better way to avoid duplicating course requests + # metadata = self._download_json( + # '%s/products/%s/chapters/%s/sections/%s/metadata' + # % (self._MAPT_REST, course_id, chapter_id, video_id), + # video_id)['data'] - video_url = content['file'] - - metadata = self._download_json( - '%s/products/%s/chapters/%s/sections/%s/metadata' - % (self._MAPT_REST, course_id, chapter_id, video_id), - video_id)['data'] - - title = metadata['pageTitle'] - course_title = metadata.get('title') - if course_title: - title = remove_end(title, ' - %s' % course_title) - timestamp = unified_timestamp(metadata.get('publicationDate')) - thumbnail = urljoin(self._PACKT_BASE, metadata.get('filepath')) + # title = metadata['pageTitle'] + # course_title = metadata.get('title') + # if course_title: + # title = remove_end(title, ' - %s' % course_title) + # timestamp = unified_timestamp(metadata.get('publicationDate')) + # thumbnail = urljoin(self._PACKT_BASE, metadata.get('filepath')) return { 'id': video_id, 'url': video_url, - 'title': title, - 'thumbnail': thumbnail, - 'timestamp': timestamp, + 'title': display_id or video_id, # title, + # 'thumbnail': thumbnail, + # 'timestamp': timestamp, } class PacktPubCourseIE(PacktPubBaseIE): - _VALID_URL = r'(?Phttps?://(?:www\.)?packtpub\.com/mapt/video/[^/]+/(?P\d+))' - _TEST = { + _VALID_URL = r'(?Phttps?://(?:(?:www\.)?packtpub\.com/mapt|subscription\.packtpub\.com)/video/[^/]+/(?P\d+))' + _TESTS = [{ 'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215', 'info_dict': { 'id': '9781787122215', 'title': 'Learn Nodejs by building 12 projects [Video]', + 'description': 'md5:489da8d953f416e51927b60a1c7db0aa', }, 'playlist_count': 90, - } + }, { + 'url': 'https://subscription.packtpub.com/video/web_development/9781787122215', + 'only_matching': True, + }] @classmethod def suitable(cls, url): @@ -130,35 +127,38 @@ class PacktPubCourseIE(PacktPubBaseIE): url, course_id = mobj.group('url', 'id') course = self._download_json( - '%s/products/%s/metadata' % (self._MAPT_REST, course_id), - course_id)['data'] + self._STATIC_PRODUCTS_BASE + '%s/toc' % course_id, course_id) + metadata = self._download_json( + self._STATIC_PRODUCTS_BASE + '%s/summary' % course_id, + course_id, fatal=False) or {} entries = [] - for chapter_num, chapter in enumerate(course['tableOfContents'], 1): - if chapter.get('type') != 'chapter': - continue - children = chapter.get('children') - if not isinstance(children, list): + for chapter_num, chapter in enumerate(course['chapters'], 1): + chapter_id = str_or_none(chapter.get('id')) + sections = chapter.get('sections') + if not chapter_id or not isinstance(sections, list): continue chapter_info = { 'chapter': chapter.get('title'), 'chapter_number': chapter_num, - 'chapter_id': chapter.get('id'), + 'chapter_id': chapter_id, } - for section in children: - if section.get('type') != 'section': - continue - section_url = section.get('seoUrl') - if not isinstance(section_url, compat_str): + for section in sections: + section_id = str_or_none(section.get('id')) + if not section_id or section.get('contentType') != 'video': continue entry = { '_type': 'url_transparent', - 'url': urljoin(url + '/', section_url), + 'url': '/'.join([url, chapter_id, section_id]), 'title': strip_or_none(section.get('title')), 'description': clean_html(section.get('summary')), + 'thumbnail': metadata.get('coverImage'), + 'timestamp': unified_timestamp(metadata.get('publicationDate')), 'ie_key': PacktPubIE.ie_key(), } entry.update(chapter_info) entries.append(entry) - return self.playlist_result(entries, course_id, course.get('title')) + return self.playlist_result( + entries, course_id, metadata.get('title'), + clean_html(metadata.get('about'))) diff --git a/youtube_dl/extractor/pandatv.py b/youtube_dl/extractor/pandatv.py index 13a2e7efc..4219802d5 100644 --- a/youtube_dl/extractor/pandatv.py +++ b/youtube_dl/extractor/pandatv.py @@ -36,7 +36,7 @@ class PandaTVIE(InfoExtractor): 'https://www.panda.tv/api_room_v2?roomid=%s' % video_id, video_id) error_code = config.get('errno', 0) - if error_code is not 0: + if error_code != 0: raise ExtractorError( '%s returned error %s: %s' % (self.IE_NAME, error_code, config['errmsg']), diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py index 9eb027679..426dd8121 100644 --- a/youtube_dl/extractor/patreon.py +++ b/youtube_dl/extractor/patreon.py @@ -2,52 +2,63 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import js_to_json +from ..utils import ( + clean_html, + determine_ext, + int_or_none, + parse_iso8601, +) class PatreonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?patreon\.com/creation\?hid=(?P[^&#]+)' - _TESTS = [ - { - 'url': 'http://www.patreon.com/creation?hid=743933', - 'md5': 'e25505eec1053a6e6813b8ed369875cc', - 'info_dict': { - 'id': '743933', - 'ext': 'mp3', - 'title': 'Episode 166: David Smalley of Dogma Debate', - 'uploader': 'Cognitive Dissonance Podcast', - 'thumbnail': 're:^https?://.*$', - }, + _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?:creation\?hid=|posts/(?:[\w-]+-)?)(?P\d+)' + _TESTS = [{ + 'url': 'http://www.patreon.com/creation?hid=743933', + 'md5': 'e25505eec1053a6e6813b8ed369875cc', + 'info_dict': { + 'id': '743933', + 'ext': 'mp3', + 'title': 'Episode 166: David Smalley of Dogma Debate', + 'description': 'md5:713b08b772cd6271b9f3906683cfacdf', + 'uploader': 'Cognitive Dissonance Podcast', + 'thumbnail': 're:^https?://.*$', + 'timestamp': 1406473987, + 'upload_date': '20140727', }, - { - 'url': 'http://www.patreon.com/creation?hid=754133', - 'md5': '3eb09345bf44bf60451b8b0b81759d0a', - 'info_dict': { - 'id': '754133', - 'ext': 'mp3', - 'title': 'CD 167 Extra', - 'uploader': 'Cognitive Dissonance Podcast', - 'thumbnail': 're:^https?://.*$', - }, + }, { + 'url': 'http://www.patreon.com/creation?hid=754133', + 'md5': '3eb09345bf44bf60451b8b0b81759d0a', + 'info_dict': { + 'id': '754133', + 'ext': 'mp3', + 'title': 'CD 167 Extra', + 'uploader': 'Cognitive Dissonance Podcast', + 'thumbnail': 're:^https?://.*$', }, - { - 'url': 'https://www.patreon.com/creation?hid=1682498', - 'info_dict': { - 'id': 'SU4fj_aEMVw', - 'ext': 'mp4', - 'title': 'I\'m on Patreon!', - 'uploader': 'TraciJHines', - 'thumbnail': 're:^https?://.*$', - 'upload_date': '20150211', - 'description': 'md5:c5a706b1f687817a3de09db1eb93acd4', - 'uploader_id': 'TraciJHines', - }, - 'params': { - 'noplaylist': True, - 'skip_download': True, - } + 'skip': 'Patron-only content', + }, { + 'url': 'https://www.patreon.com/creation?hid=1682498', + 'info_dict': { + 'id': 'SU4fj_aEMVw', + 'ext': 'mp4', + 'title': 'I\'m on Patreon!', + 'uploader': 'TraciJHines', + 'thumbnail': 're:^https?://.*$', + 'upload_date': '20150211', + 'description': 'md5:c5a706b1f687817a3de09db1eb93acd4', + 'uploader_id': 'TraciJHines', + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, } - ] + }, { + 'url': 'https://www.patreon.com/posts/episode-166-of-743933', + 'only_matching': True, + }, { + 'url': 'https://www.patreon.com/posts/743933', + 'only_matching': True, + }] # Currently Patreon exposes download URL via hidden CSS, so login is not # needed. Keeping this commented for when this inevitably changes. @@ -78,38 +89,48 @@ class PatreonIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - title = self._og_search_title(webpage).strip() - - attach_fn = self._html_search_regex( - r'
', - webpage, 'attachment URL', default=None) - embed = self._html_search_regex( - r']+id="watchCreation"[^>]*>\s*]+src="([^"]+)"', - webpage, 'embedded URL', default=None) - - if attach_fn is not None: - video_url = 'http://www.patreon.com' + attach_fn - thumbnail = self._og_search_thumbnail(webpage) - uploader = self._html_search_regex( - r'(.*?) is creating', webpage, 'uploader') - elif embed is not None: - return self.url_result(embed) - else: - playlist = self._parse_json(self._search_regex( - r'(?s)new\s+jPlayerPlaylist\(\s*\{\s*[^}]*},\s*(\[.*?,?\s*\])', - webpage, 'playlist JSON'), - video_id, transform_source=js_to_json) - data = playlist[0] - video_url = self._proto_relative_url(data['mp3']) - thumbnail = self._proto_relative_url(data.get('cover')) - uploader = data.get('artist') - - return { + post = self._download_json( + 'https://www.patreon.com/api/posts/' + video_id, video_id) + attributes = post['data']['attributes'] + title = attributes['title'].strip() + image = attributes.get('image') or {} + info = { 'id': video_id, - 'url': video_url, - 'ext': 'mp3', 'title': title, - 'uploader': uploader, - 'thumbnail': thumbnail, + 'description': clean_html(attributes.get('content')), + 'thumbnail': image.get('large_url') or image.get('url'), + 'timestamp': parse_iso8601(attributes.get('published_at')), + 'like_count': int_or_none(attributes.get('like_count')), + 'comment_count': int_or_none(attributes.get('comment_count')), } + + def add_file(file_data): + file_url = file_data.get('url') + if file_url: + info.update({ + 'url': file_url, + 'ext': determine_ext(file_data.get('name'), 'mp3'), + }) + + for i in post.get('included', []): + i_type = i.get('type') + if i_type == 'attachment': + add_file(i.get('attributes') or {}) + elif i_type == 'user': + user_attributes = i.get('attributes') + if user_attributes: + info.update({ + 'uploader': user_attributes.get('full_name'), + 'uploader_url': user_attributes.get('url'), + }) + + if not info.get('url'): + add_file(attributes.get('post_file') or {}) + + if not info.get('url'): + info.update({ + '_type': 'url', + 'url': attributes['embed']['url'], + }) + + return info diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 80340f595..4dbe661be 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -305,7 +305,7 @@ class PBSIE(InfoExtractor): { # Video embedded in iframe containing angle brackets as attribute's value (e.g. # "