# youtube-dl/youtube_dl/extractor/niconico.py

# encoding: utf-8
from __future__ import unicode_literals

import re
import json

from .common import InfoExtractor
from ..compat import (
    compat_urllib_parse,
    compat_urllib_request,
    compat_urlparse,
)
from ..utils import (
    ExtractorError,
    int_or_none,
    parse_duration,
    unified_strdate,
)


class NiconicoIE(InfoExtractor):
    """Information extractor for single nicovideo.jp watch pages.

    Metadata is read from the ``getthumbinfo`` XML API. The media URL is
    taken from the ``getflv`` API when a login succeeded during
    initialization, and from the external ``thumb_watch`` player otherwise.
    """
    IE_NAME = 'niconico'
    IE_DESC = 'ニコニコ動画'

    _TESTS = [{
        'url': 'http://www.nicovideo.jp/watch/sm22312215',
        'md5': 'd1a75c0823e2f629128c43e1212760f9',
        'info_dict': {
            'id': 'sm22312215',
            'ext': 'mp4',
            'title': 'Big Buck Bunny',
            'uploader': 'takuya0301',
            'uploader_id': '2698420',
            'upload_date': '20131123',
            'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
            'duration': 33,
        },
        'params': {
            'username': 'ydl.niconico@gmail.com',
            'password': 'youtube-dl',
        },
    }, {
        'url': 'http://www.nicovideo.jp/watch/nm14296458',
        'md5': '8db08e0158457cf852a31519fceea5bc',
        'info_dict': {
            'id': 'nm14296458',
            'ext': 'swf',
            'title': '【鏡音リン】Dance on media【オリジナル】take2!',
            'description': 'md5:',
            'uploader': 'りょうた',
            'uploader_id': '18822557',
            'upload_date': '20110429',
            'duration': 209,
        },
        'params': {
            'username': 'ydl.niconico@gmail.com',
            'password': 'youtube-dl',
        },
    }]

    _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)'
    _NETRC_MACHINE = 'niconico'
    # Set to True by _login() once a login attempt succeeded; _real_extract()
    # uses it to decide which flv info endpoint to query
    _AUTHENTICATED = False

    def _real_initialize(self):
        """Attempt to log in before any extraction takes place."""
        self._login()

    def _login(self):
        """Log in with the credentials returned by _get_login_info().

        Returns True when no username is available (nothing to do) or when
        the login succeeded. Returns False, after reporting a warning, when
        the login response contains the "Log in error" heading.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if not username:
            return True

        # Log in
        login_form_strs = {
            'mail': username,
            'password': password,
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8')
        request = compat_urllib_request.Request(
            'https://secure.nicovideo.jp/secure/login', login_data)
        login_results = self._download_webpage(
            request, None, note='Logging in', errnote='Unable to log in')
        if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        # Successful login
        self._AUTHENTICATED = True
        return True

    def _real_extract(self, url):
        """Return the info dict for the video referenced by *url*.

        Raises ExtractorError when the flv info response marks the video as
        deleted or does not contain a media URL.
        """
        video_id = self._match_id(url)

        # Get video webpage. We are not actually interested in it, but need
        # the cookies in order to be able to download the info webpage
        self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id)

        video_info = self._download_xml(
            'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,
            note='Downloading video info page')

        if self._AUTHENTICATED:
            # Get flv info
            flv_info_webpage = self._download_webpage(
                'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1',
                video_id, 'Downloading flv info')
        else:
            # Get external player info
            ext_player_info = self._download_webpage(
                'http://ext.nicovideo.jp/thumb_watch/' + video_id, video_id)
            thumb_play_key = self._search_regex(
                r'\'thumbPlayKey\'\s*:\s*\'(.*?)\'', ext_player_info, 'thumbPlayKey')

            # Get flv info. The POST body has to be bytes: Python 3 refuses
            # to send a text body, so encode it just like the login form
            flv_info_data = compat_urllib_parse.urlencode({
                'k': thumb_play_key,
                'v': video_id
            }).encode('utf-8')
            flv_info_request = compat_urllib_request.Request(
                'http://ext.nicovideo.jp/thumb_watch', flv_info_data,
                {'Content-Type': 'application/x-www-form-urlencoded'})
            flv_info_webpage = self._download_webpage(
                flv_info_request, video_id,
                note='Downloading flv info', errnote='Unable to download flv info')

        if 'deleted=' in flv_info_webpage:
            raise ExtractorError('The video has been deleted.',
                                 expected=True)
        # The flv info response is a query string; parse_qs drops fields
        # with blank values, so an absent or empty url must be checked for
        # explicitly instead of surfacing as a bare KeyError
        flv_info = compat_urlparse.parse_qs(flv_info_webpage)
        if 'url' not in flv_info:
            raise ExtractorError('Unable to find video URL')
        video_real_url = flv_info['url'][0]

        # Start extracting information
        title = video_info.find('.//title').text
        extension = video_info.find('.//movie_type').text
        video_format = extension.upper()
        thumbnail = video_info.find('.//thumbnail_url').text
        description = video_info.find('.//description').text
        # Only the part of first_retrieve before the '+' is passed on for
        # date parsing
        upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0])
        view_count = int_or_none(video_info.find('.//view_counter').text)
        comment_count = int_or_none(video_info.find('.//comment_num').text)
        duration = parse_duration(video_info.find('.//length').text)
        webpage_url = video_info.find('.//watch_url').text

        # Channel fields take precedence over user fields when both exist
        if video_info.find('.//ch_id') is not None:
            uploader_id = video_info.find('.//ch_id').text
            uploader = video_info.find('.//ch_name').text
        elif video_info.find('.//user_id') is not None:
            uploader_id = video_info.find('.//user_id').text
            uploader = video_info.find('.//user_nickname').text
        else:
            uploader_id = uploader = None

        return {
            'id': video_id,
            'url': video_real_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description,
            'uploader': uploader,
            'upload_date': upload_date,
            'uploader_id': uploader_id,
            'view_count': view_count,
            'comment_count': comment_count,
            'duration': duration,
            'webpage_url': webpage_url,
        }


class NiconicoPlaylistIE(InfoExtractor):
    """Information extractor for nicovideo.jp "mylist" pages."""
    _VALID_URL = r'https?://www\.nicovideo\.jp/mylist/(?P<id>\d+)'

    _TEST = {
        'url': 'http://www.nicovideo.jp/mylist/27411728',
        'info_dict': {
            'id': '27411728',
            'title': 'AKB48のオールナイトニッポン',
        },
        'playlist_mincount': 225,
    }

    def _real_extract(self, url):
        """Return a playlist dict whose entries point at the watch pages.

        Each entry is a 'url'-type result delegated to NiconicoIE.
        """
        mylist_id = self._match_id(url)
        page = self._download_webpage(url, mylist_id)

        # The items are embedded in the page as a JSON array handed to
        # Mylist.preload()
        preload_json = self._search_regex(
            r'Mylist\.preload\(\d+, (\[.*\])\);', page, 'entries')

        video_refs = []
        for item in json.loads(preload_json):
            video_refs.append({
                '_type': 'url',
                'ie_key': NiconicoIE.ie_key(),
                'url': 'http://www.nicovideo.jp/watch/%s' % item['item_data']['video_id'],
            })

        # Looked up only after the entries have been built, so a failure in
        # the entry data is reported first
        playlist_title = self._search_regex(r'\s+name: "(.*?)"', page, 'title')

        return {
            '_type': 'playlist',
            'title': playlist_title,
            'id': mylist_id,
            'entries': video_refs,
        }
Add support for niconico 2013-11-23 17:19:44 +08:00			`# encoding: utf-8`
[niconico] Modernize 2014-03-28 04:01:09 +08:00			`from __future__ import unicode_literals`
Add support for niconico 2013-11-23 17:19:44 +08:00
			`import re`
[niconico] Add extractor for playlists (closes #4043) 2014-10-29 18:04:48 +08:00			`import json`
Add support for niconico 2013-11-23 17:19:44 +08:00
			`from .common import InfoExtractor`
Fix imports and general cleanup · Import from compat what comes from compat. Yes, some names are available in utils too, but that's an implementation detail. · Use _match_id consistently whenever possible · Fix some outdated tests · Use consistent valid URL (always match the whole protocol, no ^ at start required) · Use modern test definitions 2014-12-13 19:24:42 +08:00			`from ..compat import (`
Add support for niconico 2013-11-23 17:19:44 +08:00			`compat_urllib_parse,`
			`compat_urllib_request,`
			`compat_urlparse,`
Fix imports and general cleanup · Import from compat what comes from compat. Yes, some names are available in utils too, but that's an implementation detail. · Use _match_id consistently whenever possible · Fix some outdated tests · Use consistent valid URL (always match the whole protocol, no ^ at start required) · Use modern test definitions 2014-12-13 19:24:42 +08:00			`)`
			`from ..utils import (`
[niconico] Catch deleted videos (closes #4064) 2014-11-06 02:52:34 +08:00			`ExtractorError,`
Fix imports and general cleanup · Import from compat what comes from compat. Yes, some names are available in utils too, but that's an implementation detail. · Use _match_id consistently whenever possible · Fix some outdated tests · Use consistent valid URL (always match the whole protocol, no ^ at start required) · Use modern test definitions 2014-12-13 19:24:42 +08:00			`int_or_none,`
			`parse_duration,`
			`unified_strdate,`
Add support for niconico 2013-11-23 17:19:44 +08:00			`)`

[niconico] Simplify and make work with old Python versions The website requires SSLv3, otherwise it just times out during SSL negotiation. 2013-11-24 13:37:14 +08:00
Add support for niconico 2013-11-23 17:19:44 +08:00			`class NiconicoIE(InfoExtractor):`
[niconico] Modernize 2014-03-28 04:01:09 +08:00			`IE_NAME = 'niconico'`
			`IE_DESC = 'ニコニコ動画'`
Add support for niconico 2013-11-23 17:19:44 +08:00
[niconico] Add nm video test 2015-03-20 22:53:14 +08:00			`_TESTS = [{`
[niconico] Modernize 2014-03-28 04:01:09 +08:00			`'url': 'http://www.nicovideo.jp/watch/sm22312215',`
			`'md5': 'd1a75c0823e2f629128c43e1212760f9',`
			`'info_dict': {`
			`'id': 'sm22312215',`
			`'ext': 'mp4',`
			`'title': 'Big Buck Bunny',`
			`'uploader': 'takuya0301',`
			`'uploader_id': '2698420',`
			`'upload_date': '20131123',`
			`'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',`
[niconico] Extract more metadata and simplify (Closes #3181) 2014-07-04 23:05:46 +08:00			`'duration': 33,`
Add support for niconico 2013-11-23 17:19:44 +08:00			`},`
[niconico] Modernize 2014-03-28 04:01:09 +08:00			`'params': {`
			`'username': 'ydl.niconico@gmail.com',`
			`'password': 'youtube-dl',`
Add support for niconico 2013-11-23 17:19:44 +08:00			`},`
[niconico] Add nm video test 2015-03-20 22:53:14 +08:00			`}, {`
			`'url': 'http://www.nicovideo.jp/watch/nm14296458',`
			`'md5': '8db08e0158457cf852a31519fceea5bc',`
			`'info_dict': {`
			`'id': 'nm14296458',`
			`'ext': 'swf',`
			`'title': '【鏡音リン】Dance on media【オリジナル】take2!',`
			`'description': 'md5:',`
			`'uploader': 'りょうた',`
			`'uploader_id': '18822557',`
			`'upload_date': '20110429',`
			`'duration': 209,`
			`},`
			`'params': {`
			`'username': 'ydl.niconico@gmail.com',`
			`'password': 'youtube-dl',`
			`},`
			`}]`
Add support for niconico 2013-11-23 17:19:44 +08:00
[niconico] Use '_match_id' 2015-03-10 01:12:41 +08:00			`_VALID_URL = r'https?://(?:www\.\|secure\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)'`
Add support for niconico 2013-11-23 17:19:44 +08:00			`_NETRC_MACHINE = 'niconico'`
[niconico] Fix ignored --netrc flag See issue #3753 2014-10-13 04:18:42 +08:00			`# Determine whether the downloader used authentication to download video`
			`_AUTHENTICATED = False`
Add support for niconico 2013-11-23 17:19:44 +08:00
			`def _real_initialize(self):`
[niconico] Fix ignored --netrc flag See issue #3753 2014-10-13 04:18:42 +08:00			`self._login()`
Add support for niconico 2013-11-23 17:19:44 +08:00
			`def _login(self):`
			`(username, password) = self._get_login_info()`
[niconico] Fix ignored --netrc flag See issue #3753 2014-10-13 04:18:42 +08:00			`# No authentication to be performed`
			`if not username:`
			`return True`
Add support for niconico 2013-11-23 17:19:44 +08:00
			`# Log in`
			`login_form_strs = {`
[niconico] Modernize 2014-03-28 04:01:09 +08:00			`'mail': username,`
			`'password': password,`
Add support for niconico 2013-11-23 17:19:44 +08:00			`}`
			`# Convert to UTF-8 before urlencode because Python 2.x's urlencode`
			`# chokes on unicode`
[niconico] Modernize 2014-03-28 04:01:09 +08:00			`login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())`
[niconico] Simplify and make work with old Python versions The website requires SSLv3, otherwise it just times out during SSL negotiation. 2013-11-24 13:37:14 +08:00			`login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8')`
			`request = compat_urllib_request.Request(`
[niconico] Modernize 2014-03-28 04:01:09 +08:00			`'https://secure.nicovideo.jp/secure/login', login_data)`
[niconico] Simplify and make work with old Python versions The website requires SSLv3, otherwise it just times out during SSL negotiation. 2013-11-24 13:37:14 +08:00			`login_results = self._download_webpage(`
[niconico] Modernize 2014-03-28 04:01:09 +08:00			`request, None, note='Logging in', errnote='Unable to log in')`
[niconico] Simplify and make work with old Python versions The website requires SSLv3, otherwise it just times out during SSL negotiation. 2013-11-24 13:37:14 +08:00			`if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None:`
[niconico] Modernize 2014-03-28 04:01:09 +08:00			`self._downloader.report_warning('unable to log in: bad username or password')`
Add support for niconico 2013-11-23 17:19:44 +08:00			`return False`
[niconico] Fix ignored --netrc flag See issue #3753 2014-10-13 04:18:42 +08:00			`# Successful login`
			`self._AUTHENTICATED = True`
Add support for niconico 2013-11-23 17:19:44 +08:00			`return True`

			`def _real_extract(self, url):`
[niconico] Use '_match_id' 2015-03-10 01:12:41 +08:00			`video_id = self._match_id(url)`
Add support for niconico 2013-11-23 17:19:44 +08:00
[niconico] Clarify download 2013-11-24 13:53:50 +08:00			`# Get video webpage. We are not actually interested in it, but need`
			`# the cookies in order to be able to download the info webpage`
			`self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id)`
Add support for niconico 2013-11-23 17:19:44 +08:00
Use the new '_download_xml' helper in more extractors 2013-11-27 01:48:52 +08:00			`video_info = self._download_xml(`
[niconico] Simplify and make work with old Python versions The website requires SSLv3, otherwise it just times out during SSL negotiation. 2013-11-24 13:37:14 +08:00			`'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,`
[niconico] Modernize 2014-03-28 04:01:09 +08:00			`note='Downloading video info page')`
Add support for niconico 2013-11-23 17:19:44 +08:00
[niconico] Fix ignored --netrc flag See issue #3753 2014-10-13 04:18:42 +08:00			`if self._AUTHENTICATED:`
[niconico] Download without authentication 2014-07-02 01:32:54 +08:00			`# Get flv info`
			`flv_info_webpage = self._download_webpage(`
fix nm video DL issue when logged in 2015-03-18 21:19:55 +08:00			`'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1',`
[niconico] Download without authentication 2014-07-02 01:32:54 +08:00			`video_id, 'Downloading flv info')`
			`else:`
			`# Get external player info`
			`ext_player_info = self._download_webpage(`
			`'http://ext.nicovideo.jp/thumb_watch/' + video_id, video_id)`
			`thumb_play_key = self._search_regex(`
			`r'\'thumbPlayKey\'\s:\s\'(.*?)\'', ext_player_info, 'thumbPlayKey')`
fix nm video DL issue when logged in 2015-03-18 21:24:17 +08:00
[niconico] Download without authentication 2014-07-02 01:32:54 +08:00			`# Get flv info`
			`flv_info_data = compat_urllib_parse.urlencode({`
			`'k': thumb_play_key,`
			`'v': video_id`
			`})`
			`flv_info_request = compat_urllib_request.Request(`
			`'http://ext.nicovideo.jp/thumb_watch', flv_info_data,`
			`{'Content-Type': 'application/x-www-form-urlencoded'})`
			`flv_info_webpage = self._download_webpage(`
			`flv_info_request, video_id,`
			`note='Downloading flv info', errnote='Unable to download flv info')`

[niconico] Catch deleted videos (closes #4064) 2014-11-06 02:52:34 +08:00			`if 'deleted=' in flv_info_webpage:`
			`raise ExtractorError('The video has been deleted.',`
PEP8: applied even more rules 2014-11-24 04:39:15 +08:00			`expected=True)`
[niconico] Simplify and make work with old Python versions The website requires SSLv3, otherwise it just times out during SSL negotiation. 2013-11-24 13:37:14 +08:00			`video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0]`
Add support for niconico 2013-11-23 17:19:44 +08:00
			`# Start extracting information`
[niconico] Extract more metadata and simplify (Closes #3181) 2014-07-04 23:05:46 +08:00			`title = video_info.find('.//title').text`
			`extension = video_info.find('.//movie_type').text`
			`video_format = extension.upper()`
			`thumbnail = video_info.find('.//thumbnail_url').text`
			`description = video_info.find('.//description').text`
			`upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0])`
			`view_count = int_or_none(video_info.find('.//view_counter').text)`
			`comment_count = int_or_none(video_info.find('.//comment_num').text)`
			`duration = parse_duration(video_info.find('.//length').text)`
			`webpage_url = video_info.find('.//watch_url').text`

[niconico] Add support for channel video 2014-07-02 02:13:12 +08:00			`if video_info.find('.//ch_id') is not None:`
[niconico] Extract more metadata and simplify (Closes #3181) 2014-07-04 23:05:46 +08:00			`uploader_id = video_info.find('.//ch_id').text`
			`uploader = video_info.find('.//ch_name').text`
[niconico] Add support for channel video 2014-07-02 02:13:12 +08:00			`elif video_info.find('.//user_id') is not None:`
[niconico] Extract more metadata and simplify (Closes #3181) 2014-07-04 23:05:46 +08:00			`uploader_id = video_info.find('.//user_id').text`
			`uploader = video_info.find('.//user_nickname').text`
			`else:`
			`uploader_id = uploader = None`
Add support for niconico 2013-11-23 17:19:44 +08:00
			`return {`
[niconico] Modernize 2014-03-28 04:01:09 +08:00			`'id': video_id,`
			`'url': video_real_url,`
[niconico] Extract more metadata and simplify (Closes #3181) 2014-07-04 23:05:46 +08:00			`'title': title,`
			`'ext': extension,`
[niconico] Modernize 2014-03-28 04:01:09 +08:00			`'format': video_format,`
[niconico] Extract more metadata and simplify (Closes #3181) 2014-07-04 23:05:46 +08:00			`'thumbnail': thumbnail,`
			`'description': description,`
			`'uploader': uploader,`
			`'upload_date': upload_date,`
			`'uploader_id': uploader_id,`
			`'view_count': view_count,`
			`'comment_count': comment_count,`
			`'duration': duration,`
			`'webpage_url': webpage_url,`
Add support for niconico 2013-11-23 17:19:44 +08:00			`}`
[niconico] Add extractor for playlists (closes #4043) 2014-10-29 18:04:48 +08:00

			`class NiconicoPlaylistIE(InfoExtractor):`
			`_VALID_URL = r'https?://www\.nicovideo\.jp/mylist/(?P<id>\d+)'`

			`_TEST = {`
			`'url': 'http://www.nicovideo.jp/mylist/27411728',`
			`'info_dict': {`
			`'id': '27411728',`
			`'title': 'AKB48のオールナイトニッポン',`
			`},`
			`'playlist_mincount': 225,`
			`}`

			`def _real_extract(self, url):`
			`list_id = self._match_id(url)`
			`webpage = self._download_webpage(url, list_id)`

			`entries_json = self._search_regex(r'Mylist\.preload\(\d+, (\[.*\])\);',`
PEP8: applied even more rules 2014-11-24 04:39:15 +08:00			`webpage, 'entries')`
[niconico] Add extractor for playlists (closes #4043) 2014-10-29 18:04:48 +08:00			`entries = json.loads(entries_json)`
			`entries = [{`
			`'_type': 'url',`
			`'ie_key': NiconicoIE.ie_key(),`
[niconico:playlist] Use the same video url the webpage uses (closes #4133) 2014-11-08 21:53:23 +08:00			`'url': ('http://www.nicovideo.jp/watch/%s' %`
PEP8: applied even more rules 2014-11-24 04:39:15 +08:00			`entry['item_data']['video_id']),`
[niconico] Add extractor for playlists (closes #4043) 2014-10-29 18:04:48 +08:00			`} for entry in entries]`

			`return {`
			`'_type': 'playlist',`
			`'title': self._search_regex(r'\s+name: "(.*?)"', webpage, 'title'),`
			`'id': list_id,`
			`'entries': entries,`
			`}`