youtube-dl/youtube_dl/extractor/vier.py

# coding: utf-8
from __future__ import unicode_literals

import re
import itertools

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    urlencode_postdata,
)


class VierIE(InfoExtractor):
    """Extractor for single videos hosted on vier.be and vijf.be.

    Two URL shapes are accepted (see ``_VALID_URL``):

    * ``/<program>/videos/<display_id>[/<numeric id>]``
    * ``/video/v3/embed/<numeric id>``

    When account credentials are configured, a login is attempted before
    the video page is fetched. If the fetched page still shows the login
    form, the embed page for the numeric id is used instead.
    """

    IE_NAME = 'vier'
    IE_DESC = 'vier.be and vijf.be'
    _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))'
    _NETRC_MACHINE = 'vier'
    _TESTS = [{
        'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129',
        'info_dict': {
            'id': '16129',
            'display_id': 'het-wordt-warm-de-moestuin',
            'ext': 'mp4',
            'title': 'Het wordt warm in De Moestuin',
            'description': 'De vele uren werk eisen hun tol. Wim droomt van assistentie...',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
        # 'skip': 'Requires account credentials',
    }, {
        'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614',
        'info_dict': {
            'id': '2561614',
            'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas',
            'ext': 'mp4',
            'title': 'EXTRA: Temptation Island hosts moeten kiezen tussen onmogelijke dilemma\'s',
            'description': 'Het spel is simpel: Annelien Coorevits en Rick Brandsteder krijgen telkens 2 dilemma\'s voorgeschoteld en ze MOETEN een keuze maken.',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839',
        'info_dict': {
            'id': '2674839',
            'display_id': 'jani-gaat-naar-tokio-aflevering-4',
            'ext': 'mp4',
            'title': 'Jani gaat naar Tokio - Aflevering 4',
            'description': 'Bekijk hier de volledige vierde aflevering van het 2de seizoen van Jani gaat...',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
        'skip': 'Requires account credentials',
    }, {
        'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839',
        'info_dict': {
            'id': '2674839',
            'display_id': 'jani-gaat-naar-tokio-aflevering-4',
            'ext': 'mp4',
            'title': 'jani-gaat-naar-tokio-aflevering-4',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
        'expected_warnings': ['Log in to extract metadata'],
    }, {
        'url': 'http://www.vier.be/planb/videos/mieren-herders-van-de-bladluizen',
        'only_matching': True,
    }, {
        'url': 'http://www.vier.be/video/v3/embed/16129',
        'only_matching': True,
    }]

    def _real_initialize(self):
        """Start out not logged in; the login itself is deferred.

        The login URL depends on the site (vier or vijf), which is only
        known once a URL is being extracted, so ``_real_extract`` triggers
        ``_login`` lazily.
        """
        self._logged_in = False

    def _login(self, site):
        """Try to log in on ``www.<site>.be`` with the configured credentials.

        Does nothing when no username or no password is available. On a
        login error found in the response page a warning is reported and
        ``self._logged_in`` is left untouched; otherwise it is set to True.
        """
        username, password = self._get_login_info()
        if username is None or password is None:
            return

        login_page = self._download_webpage(
            'http://www.%s.be/user/login' % site,
            None, note='Logging in', errnote='Unable to log in',
            data=urlencode_postdata({
                'form_id': 'user_login',
                'name': username,
                'pass': password,
            }),
            headers={'Content-Type': 'application/x-www-form-urlencoded'})

        # The site reports a failed login inside an error message box of
        # the returned page; its text is surfaced to the user as a warning.
        login_error = self._html_search_regex(
            r'(?s)<div class="messages error">\s*<div>\s*<h2.+?</h2>(.+?)<',
            login_page, 'login error', default=None)
        if login_error:
            self.report_warning('Unable to log in: %s' % login_error)
        else:
            self._logged_in = True

    def _real_extract(self, url):
        """Extract formats and metadata for a single video URL.

        Returns an info dict with ``id``, ``display_id``, ``title``,
        ``description``, ``thumbnail`` and ``formats``.

        Raises ExtractorError when the page requires a login and the URL
        carries no numeric id that could be used for the embed fallback.
        """
        mobj = re.match(self._VALID_URL, url)
        embed_id = mobj.group('embed_id')
        display_id = mobj.group('display_id') or embed_id
        # None for page URLs that omit the trailing numeric id.
        video_id = mobj.group('id') or embed_id
        site = mobj.group('site')

        if not self._logged_in:
            self._login(site)

        webpage = self._download_webpage(url, display_id)

        if r'id="user-login"' in webpage:
            # The page shows the login form instead of the video. The embed
            # page is used as a fallback, but it is addressed by numeric id;
            # without one the request would go to ".../embed/None".
            if not video_id:
                raise ExtractorError(
                    'Login required and the URL contains no numeric video id '
                    'to fall back to the embed page; use --username and '
                    '--password or --netrc to provide account credentials',
                    expected=True)
            self.report_warning(
                'Log in to extract metadata', video_id=display_id)
            webpage = self._download_webpage(
                'http://www.%s.be/video/v3/embed/%s' % (site, video_id),
                display_id)

        # Each value may appear either as a data-* attribute or as a JSON
        # key in the page; the URL-derived values serve as defaults where
        # one exists. A missing filename is fatal (no default given).
        video_id = self._search_regex(
            [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'],
            webpage, 'video id', default=video_id)
        application = self._search_regex(
            [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'],
            webpage, 'application', default=site + '_vod')
        filename = self._search_regex(
            [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'],
            webpage, 'filename')

        playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename)
        formats = self._extract_wowza_formats(playlist_url, display_id, skip_protocols=['dash'])
        self._sort_formats(formats)

        # On the embed fallback page OpenGraph tags may be absent, hence
        # the defaults (title falls back to the display id).
        title = self._og_search_title(webpage, default=display_id)
        description = self._og_search_description(webpage, default=None)
        thumbnail = self._og_search_thumbnail(webpage, default=None)

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'formats': formats,
        }


class VierVideosIE(InfoExtractor):
    """Playlist extractor for the ``/<program>/videos`` listings on vier.be
    and vijf.be.

    Without a ``page`` query parameter all listing pages are walked,
    starting at page 0. With ``page=N`` only that single page is extracted
    and the playlist id becomes ``<program>-page<N>``.
    """

    IE_NAME = 'vier:videos'
    _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)'
    _TESTS = [{
        'url': 'http://www.vier.be/demoestuin/videos',
        'info_dict': {
            'id': 'demoestuin',
        },
        'playlist_mincount': 153,
    }, {
        'url': 'http://www.vijf.be/temptationisland/videos',
        'info_dict': {
            'id': 'temptationisland',
        },
        'playlist_mincount': 159,
    }, {
        'url': 'http://www.vier.be/demoestuin/videos?page=6',
        'info_dict': {
            'id': 'demoestuin-page6',
        },
        'playlist_mincount': 20,
    }, {
        'url': 'http://www.vier.be/demoestuin/videos?page=7',
        'info_dict': {
            'id': 'demoestuin-page7',
        },
        'playlist_mincount': 13,
    }]

    def _real_extract(self, url):
        """Collect the video page URLs of a program listing as a playlist.

        Returns a playlist result whose entries are URL results delegated
        to the 'Vier' extractor.
        """
        mobj = re.match(self._VALID_URL, url)
        program = mobj.group('program')
        site = mobj.group('site')

        page_id = mobj.group('page')
        # Remember whether a page was requested explicitly *before* the
        # int conversion: "page=0" becomes 0, which is falsy, and must
        # still count as a single-page request.
        single_page = page_id is not None
        if single_page:
            page_id = int(page_id)
            start_page = page_id
            playlist_id = '%s-page%d' % (program, page_id)
        else:
            start_page = 0
            playlist_id = program

        entries = []
        for current_page_id in itertools.count(start_page):
            current_page = self._download_webpage(
                'http://www.%s.be/%s/videos?page=%d' % (site, program, current_page_id),
                program,
                # Pages are zero-based in the URL but shown one-based.
                'Downloading page %d' % (current_page_id + 1))
            page_entries = [
                self.url_result('http://www.' + site + '.be' + video_url, 'Vier')
                for video_url in re.findall(
                    r'<h[23]><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)]
            entries.extend(page_entries)
            # Stop after the one requested page, or once the listing no
            # longer contains the '>Meer<' marker that signals another page.
            if single_page or '>Meer<' not in current_page:
                break

        return self.playlist_result(entries, playlist_id)
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 2015-01-02 22:15:40 +08:00			`# coding: utf-8`
			`from __future__ import unicode_literals`
[vier] Add new extractor 2015-01-02 20:13:18 +08:00
			`import re`
[vier:videos] Fix extraction with old approach (Closes #6806) 2015-09-10 01:59:17 +08:00			`import itertools`
[vier] Add new extractor 2015-01-02 20:13:18 +08:00
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 2015-01-02 22:15:40 +08:00			`from .common import InfoExtractor`
[vier] Improve extraction + Add support for authentication * Bypass authentication when no credentials provded * Improve extraction robustness 2017-05-15 22:46:55 +08:00			`from ..utils import (`
			`ExtractorError,`
			`urlencode_postdata,`
			`)`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 2015-01-02 22:15:40 +08:00

			`class VierIE(InfoExtractor):`
			`IE_NAME = 'vier'`
[vier] Add IE_DESC 2017-04-08 23:43:29 +08:00			`IE_DESC = 'vier.be and vijf.be'`
[vier] Add support for vijf.be vier.be and vijf.be run on the same CMS and are property of the same company, so the same extractor can be used for both of them. 2017-03-05 00:47:19 +08:00			`_VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))'`
[vier] Improve extraction + Add support for authentication * Bypass authentication when no credentials provded * Improve extraction robustness 2017-05-15 22:46:55 +08:00			`_NETRC_MACHINE = 'vier'`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 2015-01-02 22:15:40 +08:00			`_TESTS = [{`
			`'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129',`
			`'info_dict': {`
			`'id': '16129',`
			`'display_id': 'het-wordt-warm-de-moestuin',`
			`'ext': 'mp4',`
			`'title': 'Het wordt warm in De Moestuin',`
			`'description': 'De vele uren werk eisen hun tol. Wim droomt van assistentie...',`
			`},`
			`'params': {`
			`# m3u8 download`
			`'skip_download': True,`
			`},`
[vier] Improve extraction + Add support for authentication * Bypass authentication when no credentials provded * Improve extraction robustness 2017-05-15 22:46:55 +08:00			`# 'skip': 'Requires account credentials',`
[vier] Add support for vijf.be vier.be and vijf.be run on the same CMS and are property of the same company, so the same extractor can be used for both of them. 2017-03-05 00:47:19 +08:00			`}, {`
			`'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614',`
			`'info_dict': {`
			`'id': '2561614',`
			`'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas',`
			`'ext': 'mp4',`
[vier] Improve extraction + Add support for authentication * Bypass authentication when no credentials provded * Improve extraction robustness 2017-05-15 22:46:55 +08:00			`'title': 'EXTRA: Temptation Island hosts moeten kiezen tussen onmogelijke dilemma\'s',`
[vier] Add support for vijf.be vier.be and vijf.be run on the same CMS and are property of the same company, so the same extractor can be used for both of them. 2017-03-05 00:47:19 +08:00			`'description': 'Het spel is simpel: Annelien Coorevits en Rick Brandsteder krijgen telkens 2 dilemma\'s voorgeschoteld en ze MOETEN een keuze maken.',`
			`},`
			`'params': {`
			`# m3u8 download`
			`'skip_download': True,`
			`},`
[vier] Improve extraction + Add support for authentication * Bypass authentication when no credentials provded * Improve extraction robustness 2017-05-15 22:46:55 +08:00			`}, {`
			`'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839',`
			`'info_dict': {`
			`'id': '2674839',`
			`'display_id': 'jani-gaat-naar-tokio-aflevering-4',`
			`'ext': 'mp4',`
			`'title': 'Jani gaat naar Tokio - Aflevering 4',`
			`'description': 'Bekijk hier de volledige vierde aflevering van het 2de seizoen van Jani gaat...',`
			`},`
			`'params': {`
			`# m3u8 download`
			`'skip_download': True,`
			`},`
			`'skip': 'Requires account credentials',`
			`}, {`
			`'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839',`
			`'info_dict': {`
			`'id': '2674839',`
			`'display_id': 'jani-gaat-naar-tokio-aflevering-4',`
			`'ext': 'mp4',`
			`'title': 'jani-gaat-naar-tokio-aflevering-4',`
			`},`
			`'params': {`
			`# m3u8 download`
			`'skip_download': True,`
			`},`
			`'expected_warnings': ['Log in to extract metadata'],`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 2015-01-02 22:15:40 +08:00			`}, {`
			`'url': 'http://www.vier.be/planb/videos/mieren-herders-van-de-bladluizen',`
			`'only_matching': True,`
			`}, {`
			`'url': 'http://www.vier.be/video/v3/embed/16129',`
			`'only_matching': True,`
			`}]`

[vier] Improve extraction + Add support for authentication * Bypass authentication when no credentials provded * Improve extraction robustness 2017-05-15 22:46:55 +08:00			`def _real_initialize(self):`
			`self._logged_in = False`

			`def _login(self, site):`
			`username, password = self._get_login_info()`
			`if username is None or password is None:`
			`return`

			`login_page = self._download_webpage(`
			`'http://www.%s.be/user/login' % site,`
			`None, note='Logging in', errnote='Unable to log in',`
			`data=urlencode_postdata({`
			`'form_id': 'user_login',`
			`'name': username,`
			`'pass': password,`
			`}),`
			`headers={'Content-Type': 'application/x-www-form-urlencoded'})`

			`login_error = self._html_search_regex(`
			`r'(?s)<div class="messages error">\s*<div>\s*<h2.+?</h2>(.+?)<',`
			`login_page, 'login error', default=None)`
			`if login_error:`
			`self.report_warning('Unable to log in: %s' % login_error)`
			`else:`
			`self._logged_in = True`

[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 2015-01-02 22:15:40 +08:00			`def _real_extract(self, url):`
			`mobj = re.match(self._VALID_URL, url)`
			`embed_id = mobj.group('embed_id')`
			`display_id = mobj.group('display_id') or embed_id`
[vier] Improve extraction + Add support for authentication * Bypass authentication when no credentials provded * Improve extraction robustness 2017-05-15 22:46:55 +08:00			`video_id = mobj.group('id') or embed_id`
[vier] Add support for vijf.be vier.be and vijf.be run on the same CMS and are property of the same company, so the same extractor can be used for both of them. 2017-03-05 00:47:19 +08:00			`site = mobj.group('site')`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 2015-01-02 22:15:40 +08:00
[vier] Improve extraction + Add support for authentication * Bypass authentication when no credentials provded * Improve extraction robustness 2017-05-15 22:46:55 +08:00			`if not self._logged_in:`
			`self._login(site)`

[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 2015-01-02 22:15:40 +08:00			`webpage = self._download_webpage(url, display_id)`

[vier] Improve extraction + Add support for authentication * Bypass authentication when no credentials provded * Improve extraction robustness 2017-05-15 22:46:55 +08:00			`if r'id="user-login"' in webpage:`
			`self.report_warning(`
			`'Log in to extract metadata', video_id=display_id)`
			`webpage = self._download_webpage(`
			`'http://www.%s.be/video/v3/embed/%s' % (site, video_id),`
			`display_id)`

[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 2015-01-02 22:15:40 +08:00			`video_id = self._search_regex(`
[vier] Fix extraction 2015-05-18 23:43:54 +08:00			`[r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'],`
[vier] Improve extraction + Add support for authentication * Bypass authentication when no credentials provded * Improve extraction robustness 2017-05-15 22:46:55 +08:00			`webpage, 'video id', default=video_id)`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 2015-01-02 22:15:40 +08:00			`application = self._search_regex(`
[vier] Fix extraction 2015-05-18 23:43:54 +08:00			`[r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'],`
[vier] Add support for vijf.be vier.be and vijf.be run on the same CMS and are property of the same company, so the same extractor can be used for both of them. 2017-03-05 00:47:19 +08:00			`webpage, 'application', default=site + '_vod')`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 2015-01-02 22:15:40 +08:00			`filename = self._search_regex(`
[vier] Fix extraction 2015-05-18 23:43:54 +08:00			`[r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'],`
			`webpage, 'filename')`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 2015-01-02 22:15:40 +08:00
[common] add helper method for Wowza Streaming Engine format extraction 2016-09-17 02:30:38 +08:00			`playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename)`
[extractor/common] try to extract non smil wowza mpd manifests 2016-10-19 21:57:12 +08:00			`formats = self._extract_wowza_formats(playlist_url, display_id, skip_protocols=['dash'])`
Remove _sort_formats from _extract_*_formats methods Now _sort_formats should be called explicitly. _sort_formats has been added to all the necessary places in code. Closes #8051 2016-03-27 09:03:08 +08:00			`self._sort_formats(formats)`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 2015-01-02 22:15:40 +08:00
			`title = self._og_search_title(webpage, default=display_id)`
			`description = self._og_search_description(webpage, default=None)`
			`thumbnail = self._og_search_thumbnail(webpage, default=None)`

			`return {`
			`'id': video_id,`
			`'display_id': display_id,`
			`'title': title,`
			`'description': description,`
			`'thumbnail': thumbnail,`
			`'formats': formats,`
[vier] Add new extractor 2015-01-02 20:13:18 +08:00			`}`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 2015-01-02 22:15:40 +08:00

			`class VierVideosIE(InfoExtractor):`
			`IE_NAME = 'vier:videos'`
[vier] Add support for vijf.be vier.be and vijf.be run on the same CMS and are property of the same company, so the same extractor can be used for both of them. 2017-03-05 00:47:19 +08:00			`_VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)'`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 2015-01-02 22:15:40 +08:00			`_TESTS = [{`
			`'url': 'http://www.vier.be/demoestuin/videos',`
			`'info_dict': {`
			`'id': 'demoestuin',`
			`},`
			`'playlist_mincount': 153,`
[vier] Add support for vijf.be vier.be and vijf.be run on the same CMS and are property of the same company, so the same extractor can be used for both of them. 2017-03-05 00:47:19 +08:00			`}, {`
			`'url': 'http://www.vijf.be/temptationisland/videos',`
			`'info_dict': {`
			`'id': 'temptationisland',`
			`},`
			`'playlist_mincount': 159,`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 2015-01-02 22:15:40 +08:00			`}, {`
			`'url': 'http://www.vier.be/demoestuin/videos?page=6',`
			`'info_dict': {`
			`'id': 'demoestuin-page6',`
			`},`
			`'playlist_mincount': 20,`
			`}, {`
			`'url': 'http://www.vier.be/demoestuin/videos?page=7',`
			`'info_dict': {`
			`'id': 'demoestuin-page7',`
			`},`
			`'playlist_mincount': 13,`
			`}]`

			`def _real_extract(self, url):`
			`mobj = re.match(self._VALID_URL, url)`
			`program = mobj.group('program')`
[vier] Add support for vijf.be vier.be and vijf.be run on the same CMS and are property of the same company, so the same extractor can be used for both of them. 2017-03-05 00:47:19 +08:00			`site = mobj.group('site')`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 2015-01-02 22:15:40 +08:00
			`page_id = mobj.group('page')`
			`if page_id:`
			`page_id = int(page_id)`
			`start_page = page_id`
			`playlist_id = '%s-page%d' % (program, page_id)`
			`else:`
			`start_page = 0`
			`playlist_id = program`

			`entries = []`
[vier:videos] Fix extraction with old approach (Closes #6806) 2015-09-10 01:59:17 +08:00			`for current_page_id in itertools.count(start_page):`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 2015-01-02 22:15:40 +08:00			`current_page = self._download_webpage(`
[vier] Add support for vijf.be vier.be and vijf.be run on the same CMS and are property of the same company, so the same extractor can be used for both of them. 2017-03-05 00:47:19 +08:00			`'http://www.%s.be/%s/videos?page=%d' % (site, program, current_page_id),`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 2015-01-02 22:15:40 +08:00			`program,`
[vier:videos] Fix extraction with old approach (Closes #6806) 2015-09-10 01:59:17 +08:00			`'Downloading page %d' % (current_page_id + 1))`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 2015-01-02 22:15:40 +08:00			`page_entries = [`
[vier] Add support for vijf.be vier.be and vijf.be run on the same CMS and are property of the same company, so the same extractor can be used for both of them. 2017-03-05 00:47:19 +08:00			`self.url_result('http://www.' + site + '.be' + video_url, 'Vier')`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 2015-01-02 22:15:40 +08:00			`for video_url in re.findall(`
[vier] Add support for vijf.be vier.be and vijf.be run on the same CMS and are property of the same company, so the same extractor can be used for both of them. 2017-03-05 00:47:19 +08:00			`r'<h[23]><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)]`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 2015-01-02 22:15:40 +08:00			`entries.extend(page_entries)`
[vier:videos] Fix extraction with old approach (Closes #6806) 2015-09-10 01:59:17 +08:00			`if page_id or '>Meer<' not in current_page:`
			`break`
[vier] Simplify, add support for more URL formats, extract all playlist pages when page is not specified 2015-01-02 22:15:40 +08:00
			`return self.playlist_result(entries, playlist_id)`