youtube-dl/youtube_dl/extractor/facebook.py

from __future__ import unicode_literals

import json
import re
import socket

from .common import InfoExtractor
from ..compat import (
    compat_http_client,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_request,
)
from ..utils import (
    ExtractorError,
    int_or_none,
    limit_length,
    urlencode_postdata,
    get_element_by_id,
    clean_html,
)


class FacebookIE(InfoExtractor):
    """Extractor for videos hosted on facebook.com.

    Accepts the legacy ``video.php``/``photo.php``/``video/embed`` query-string
    URLs as well as the ``<page>/videos/<id>`` path form, optionally logs in
    with the configured credentials, and returns the SD/HD sources embedded
    in the video page.
    """

    # The numeric video id is captured in the named group ``id``.
    _VALID_URL = r'''(?x)
        https?://(?:\w+\.)?facebook\.com/
        (?:[^#]*?\#!/)?
        (?:
            (?:video/video\.php|photo\.php|video\.php|video/embed)\?(?:.*?)
            (?:v|video_id)=|
            [^/]+/videos/(?:[^/]+/)?
        )
        (?P<id>[0-9]+)
        (?:.*)'''
    _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
    _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = 'facebook'
    _TESTS = [{
        'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf',
        'md5': '6a40d33c0eccbb1af76cf0485a052659',
        'info_dict': {
            'id': '637842556329505',
            'ext': 'mp4',
            'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam',
            'uploader': 'Tennis on Facebook',
        }
    }, {
        'note': 'Video without discernible title',
        'url': 'https://www.facebook.com/video.php?v=274175099429670',
        'info_dict': {
            'id': '274175099429670',
            'ext': 'mp4',
            'title': 'Facebook video #274175099429670',
            'uploader': 'Asif Nawab Butt',
        },
        'expected_warnings': [
            'title'
        ]
    }, {
        'url': 'https://www.facebook.com/video.php?v=10204634152394104',
        'only_matching': True,
    }, {
        'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf',
        'only_matching': True,
    }, {
        'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater',
        'only_matching': True,
    }]

    def _login(self):
        """Log in with the configured credentials, if there are any.

        Returns immediately when no user e-mail is configured.  A rejected
        login, an unconfirmed checkpoint and the network errors listed in the
        ``except`` clause are reported as warnings instead of being raised.
        The ``lsd``/``lgnrnd`` lookups on the login page are made without a
        default, so presumably a changed login page still raises -- confirm
        against ``InfoExtractor._search_regex``.
        """
        (useremail, password) = self._get_login_info()
        if useremail is None:
            return

        # Fetch the login form first: it carries the hidden tokens that have
        # to be posted back together with the credentials.
        login_page_req = compat_urllib_request.Request(self._LOGIN_URL)
        login_page_req.add_header('Cookie', 'locale=en_US')
        login_page = self._download_webpage(login_page_req, None,
                                            note='Downloading login page',
                                            errnote='Unable to download login page')
        lsd = self._search_regex(
            r'<input type="hidden" name="lsd" value="([^"]*)"',
            login_page, 'lsd')
        lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, 'lgnrnd')

        login_form = {
            'email': useremail,
            'pass': password,
            'lsd': lsd,
            'lgnrnd': lgnrnd,
            'next': 'http://facebook.com/home.php',
            'default_persistent': '0',
            'legacy_return': '1',
            'timezone': '-60',
            'trynum': '1',
        }
        request = compat_urllib_request.Request(self._LOGIN_URL, urlencode_postdata(login_form))
        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        try:
            login_results = self._download_webpage(request, None,
                                                   note='Logging in', errnote='unable to fetch login page')
            # Being served the login form again means the credentials were
            # not accepted.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning('unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return

            # The checkpoint tokens are only present when Facebook asks for
            # an extra confirmation step.  Look them up non-fatally so that a
            # login that needs no confirmation does not fail on a missing
            # field.
            fb_dtsg = self._search_regex(
                r'name="fb_dtsg" value="(.+?)"', login_results, 'fb_dtsg',
                default=None)
            h = self._search_regex(
                r'name="h"\s+(?:\w+="[^"]+"\s+)*?value="([^"]+)"', login_results, 'h',
                default=None)
            if not fb_dtsg or not h:
                # No checkpoint form in the response: nothing to confirm.
                return

            check_form = {
                'fb_dtsg': fb_dtsg,
                'h': h,
                'name_action_selected': 'dont_save',
            }
            check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, urlencode_postdata(check_form))
            check_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
            check_response = self._download_webpage(check_req, None,
                                                    note='Confirming login')
            # The submit button still being there means the checkpoint was
            # not cleared by our request.
            if re.search(r'id="checkpointSubmitButton"', check_response) is not None:
                self._downloader.report_warning('Unable to confirm login, you have to login in your brower and authorize the login.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning('unable to log in: %s' % compat_str(err))
            return

    def _real_initialize(self):
        """Attempt the login before any extraction takes place."""
        self._login()

    def _real_extract(self, url):
        """Return the info dict for the video referenced by ``url``.

        Raises ExtractorError when the embedded player data cannot be found
        (flagged ``expected`` when the page carries an interstitial message)
        or when the data lists neither an SD nor an HD source.
        """
        video_id = self._match_id(url)
        # Always fetch the canonical page, whatever URL form was given.
        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player variables are a JSON array sitting between these two
        # JavaScript fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
            if m_msg is not None:
                # Pass on the page's own explanation; expected=True marks the
                # failure as an expected one.
                raise ExtractorError(
                    'The video is not available, Facebook said: "%s"' % m_msg.group(1),
                    expected=True)
            raise ExtractorError('Cannot parse data')
        # The array is turned into a mapping; its 'params' entry is
        # URL-quoted JSON.
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse_unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]

        # Collected in this order: 'sd' first, then 'hd'.
        formats = []
        for quality in ['sd', 'hd']:
            src = video_data.get('%s_src' % quality)
            if src is not None:
                formats.append({
                    'format_id': quality,
                    'url': src,
                })
        if not formats:
            raise ExtractorError('Cannot find video formats')

        # Title fallbacks: page header, then the photo caption (cut to 80
        # characters), then a generic title built from the id.
        video_title = self._html_search_regex(
            r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title',
            default=None)
        if not video_title:
            video_title = self._html_search_regex(
                r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',
                webpage, 'alternative title', fatal=False)
            video_title = limit_length(video_title, 80)
        if not video_title:
            video_title = 'Facebook video #%s' % video_id
        uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))

        return {
            'id': video_id,
            'title': video_title,
            'formats': formats,
            # Read with .get(): these keys may be absent from video_data.
            'duration': int_or_none(video_data.get('video_duration')),
            'thumbnail': video_data.get('thumbnail_src'),
            'uploader': uploader,
        }
[facebook] Modernize 2014-03-04 10:36:54 +08:00			`from __future__ import unicode_literals`

Move Facebook into its own file 2013-06-24 02:59:45 +08:00			`import json`
			`import re`
			`import socket`

			`from .common import InfoExtractor`
[util] Move compatibility functions out of util utils is large enough without these compatibility functions. Everything that is present in newer versions of Python (i.e. with dev Python it's just an import) goes into compat.py . Everything else (i.e. youtube-dl-specific helpers) goes into utils.py . 2014-11-02 18:23:40 +08:00			`from ..compat import (`
Move Facebook into its own file 2013-06-24 02:59:45 +08:00			`compat_http_client,`
			`compat_str,`
			`compat_urllib_error,`
[facebook] Use compat_urllib_parse_unquote 2015-07-18 01:37:56 +08:00			`compat_urllib_parse_unquote,`
Move Facebook into its own file 2013-06-24 02:59:45 +08:00			`compat_urllib_request,`
[util] Move compatibility functions out of util utils is large enough without these compatibility functions. Everything that is present in newer versions of Python (i.e. with dev Python it's just an import) goes into compat.py . Everything else (i.e. youtube-dl-specific helpers) goes into utils.py . 2014-11-02 18:23:40 +08:00			`)`
			`from ..utils import (`
Move Facebook into its own file 2013-06-24 02:59:45 +08:00			`ExtractorError,`
[facebook] Make thumbnail and duration optional Fixes #4425. Looks like both properties aren't given to us anymore. For now, just fall back to not returning them. 2014-12-10 22:18:34 +08:00			`int_or_none,`
[facebook] Fix support for untitled videos (Fixes #3757) 2014-09-15 21:10:24 +08:00			`limit_length,`
[facebook] Make thumbnail and duration optional Fixes #4425. Looks like both properties aren't given to us anymore. For now, just fall back to not returning them. 2014-12-10 22:18:34 +08:00			`urlencode_postdata,`
[facebook] extract uploader 2015-08-03 05:52:12 +08:00			`get_element_by_id,`
			`clean_html,`
Move Facebook into its own file 2013-06-24 02:59:45 +08:00			`)`


			`class FacebookIE(InfoExtractor):`
[facebook] Add support for embeds Example URL: http://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html 2014-01-22 01:10:14 +08:00			`_VALID_URL = r'''(?x)`
[facebook] Modernize 2014-03-10 01:42:44 +08:00			`https?://(?:\w+\.)?facebook\.com/`
[facebook] Allow '?' before '#!' (fixes #3477) 2014-08-10 17:55:24 +08:00			`(?:[^#]*?\#!/)?`
[facebook] Extend _VALID_URL 2015-04-17 00:08:52 +08:00			`(?:`
			`(?:video/video\.php\|photo\.php\|video\.php\|video/embed)\?(?:.*?)`
			`(?:v\|video_id)=\|`
[facebook] Extend _VALID_URL take 2 (#5120) 2015-04-18 16:08:24 +08:00			`[^/]+/videos/(?:[^/]+/)?`
[facebook] Extend _VALID_URL 2015-04-17 00:08:52 +08:00			`)`
			`(?P<id>[0-9]+)`
[facebook] Add support for embeds Example URL: http://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html 2014-01-22 01:10:14 +08:00			`(?:.*)'''`
[facebook] Fix the login process (fixes #1244) 2013-10-27 19:07:58 +08:00			`_LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'`
			`_CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'`
Move Facebook into its own file 2013-06-24 02:59:45 +08:00			`_NETRC_MACHINE = 'facebook'`
[facebook] Modernize 2014-03-04 10:36:54 +08:00			`IE_NAME = 'facebook'`
[facebook] Match video.php URLs 2014-08-27 17:08:47 +08:00			`_TESTS = [{`
[facebook] Fix and caption if title is empty 2014-09-13 15:01:57 +08:00			`'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf',`
			`'md5': '6a40d33c0eccbb1af76cf0485a052659',`
[facebook] Modernize 2014-03-04 10:36:54 +08:00			`'info_dict': {`
[facebook] Fix and caption if title is empty 2014-09-13 15:01:57 +08:00			`'id': '637842556329505',`
[facebook] Modernize 2014-03-04 10:36:54 +08:00			`'ext': 'mp4',`
[facebook] Fix test case 2014-09-29 11:19:56 +08:00			`'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam',`
[facebook] add uploader value to the tests 2015-08-03 07:09:21 +08:00			`'uploader': 'Tennis on Facebook',`
Move tests to the IE definitions 2013-06-28 02:46:46 +08:00			`}`
[facebook] Fix support for untitled videos (Fixes #3757) 2014-09-15 21:10:24 +08:00			`}, {`
			`'note': 'Video without discernible title',`
			`'url': 'https://www.facebook.com/video.php?v=274175099429670',`
			`'info_dict': {`
			`'id': '274175099429670',`
			`'ext': 'mp4',`
			`'title': 'Facebook video #274175099429670',`
[facebook] add uploader value to the tests 2015-08-03 07:09:21 +08:00			`'uploader': 'Asif Nawab Butt',`
[facebook] Move the title extraction warning below (fixes #5820) 2015-05-26 13:37:15 +08:00			`},`
			`'expected_warnings': [`
			`'title'`
			`]`
[facebook] Match video.php URLs 2014-08-27 17:08:47 +08:00			`}, {`
			`'url': 'https://www.facebook.com/video.php?v=10204634152394104',`
			`'only_matching': True,`
[facebook] Extend _VALID_URL 2015-04-17 00:08:52 +08:00			`}, {`
			`'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf',`
			`'only_matching': True,`
[facebook] Extend _VALID_URL take 2 (#5120) 2015-04-18 16:08:24 +08:00			`}, {`
			`'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater',`
			`'only_matching': True,`
[facebook] Match video.php URLs 2014-08-27 17:08:47 +08:00			`}]`
Move Facebook into its own file 2013-06-24 02:59:45 +08:00
[facebook] Fix the login process (fixes #1244) 2013-10-27 19:07:58 +08:00			`def _login(self):`
			`(useremail, password) = self._get_login_info()`
Move Facebook into its own file 2013-06-24 02:59:45 +08:00			`if useremail is None:`
			`return`

[facebook] Fix the login process (fixes #1244) 2013-10-27 19:07:58 +08:00			`login_page_req = compat_urllib_request.Request(self._LOGIN_URL)`
			`login_page_req.add_header('Cookie', 'locale=en_US')`
[facebook] Fix login process It was broken and didn't work in python 3. And use `_download_webpage` instead of `compat_urllib_request.urlopen`. 2014-03-07 22:25:33 +08:00			`login_page = self._download_webpage(login_page_req, None,`
PEP8: applied even more rules 2014-11-24 04:39:15 +08:00			`note='Downloading login page',`
			`errnote='Unable to download login page')`
[facebook] Fix login detection (#2505) 2014-03-04 10:39:04 +08:00			`lsd = self._search_regex(`
[facebook] Correct regexp 2014-03-04 10:39:45 +08:00			`r'<input type="hidden" name="lsd" value="([^"]*)"',`
[facebook] Fix login detection (#2505) 2014-03-04 10:39:04 +08:00			`login_page, 'lsd')`
[facebook] Modernize 2014-03-04 10:36:54 +08:00			`lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, 'lgnrnd')`
[facebook] Fix the login process (fixes #1244) 2013-10-27 19:07:58 +08:00
Move Facebook into its own file 2013-06-24 02:59:45 +08:00			`login_form = {`
			`'email': useremail,`
			`'pass': password,`
[facebook] Fix the login process (fixes #1244) 2013-10-27 19:07:58 +08:00			`'lsd': lsd,`
			`'lgnrnd': lgnrnd,`
			`'next': 'http://facebook.com/home.php',`
			`'default_persistent': '0',`
			`'legacy_return': '1',`
			`'timezone': '-60',`
			`'trynum': '1',`
Fix all PEP8 issues except E501 2014-11-24 05:21:46 +08:00			`}`
[facebook] Fix login process It was broken and didn't work in python 3. And use `_download_webpage` instead of `compat_urllib_request.urlopen`. 2014-03-07 22:25:33 +08:00			`request = compat_urllib_request.Request(self._LOGIN_URL, urlencode_postdata(login_form))`
[facebook] Fix the login process (fixes #1244) 2013-10-27 19:07:58 +08:00			`request.add_header('Content-Type', 'application/x-www-form-urlencoded')`
Move Facebook into its own file 2013-06-24 02:59:45 +08:00			`try:`
[facebook] Fix login process It was broken and didn't work in python 3. And use `_download_webpage` instead of `compat_urllib_request.urlopen`. 2014-03-07 22:25:33 +08:00			`login_results = self._download_webpage(request, None,`
PEP8: applied even more rules 2014-11-24 04:39:15 +08:00			`note='Logging in', errnote='unable to fetch login page')`
Move Facebook into its own file 2013-06-24 02:59:45 +08:00			`if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:`
[facebook] Modernize 2014-03-04 10:36:54 +08:00			`self._downloader.report_warning('unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')`
Move Facebook into its own file 2013-06-24 02:59:45 +08:00			`return`
[facebook] Fix the login process (fixes #1244) 2013-10-27 19:07:58 +08:00
			`check_form = {`
[facebook] Fix login process It was broken and didn't work in python 3. And use `_download_webpage` instead of `compat_urllib_request.urlopen`. 2014-03-07 22:25:33 +08:00			`'fb_dtsg': self._search_regex(r'name="fb_dtsg" value="(.+?)"', login_results, 'fb_dtsg'),`
[facebook] Fix login (Fixes #3667) 2014-09-03 15:49:05 +08:00			`'h': self._search_regex(`
			`r'name="h"\s+(?:\w+="[^"]+"\s+)*?value="([^"]+)"', login_results, 'h'),`
[facebook] Fix the login process (fixes #1244) 2013-10-27 19:07:58 +08:00			`'name_action_selected': 'dont_save',`
			`}`
[facebook] Fix login process It was broken and didn't work in python 3. And use `_download_webpage` instead of `compat_urllib_request.urlopen`. 2014-03-07 22:25:33 +08:00			`check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, urlencode_postdata(check_form))`
[facebook] Fix the login process (fixes #1244) 2013-10-27 19:07:58 +08:00			`check_req.add_header('Content-Type', 'application/x-www-form-urlencoded')`
[facebook] Fix login process It was broken and didn't work in python 3. And use `_download_webpage` instead of `compat_urllib_request.urlopen`. 2014-03-07 22:25:33 +08:00			`check_response = self._download_webpage(check_req, None,`
PEP8: applied even more rules 2014-11-24 04:39:15 +08:00			`note='Confirming login')`
[facebook] Fix the login process (fixes #1244) 2013-10-27 19:07:58 +08:00			`if re.search(r'id="checkpointSubmitButton"', check_response) is not None:`
[facebook] Modernize 2014-03-04 10:36:54 +08:00			`self._downloader.report_warning('Unable to confirm login, you have to login in your brower and authorize the login.')`
Move Facebook into its own file 2013-06-24 02:59:45 +08:00			`except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:`
[facebook] Modernize 2014-03-04 10:36:54 +08:00			`self._downloader.report_warning('unable to log in: %s' % compat_str(err))`
Move Facebook into its own file 2013-06-24 02:59:45 +08:00			`return`

[facebook] Fix the login process (fixes #1244) 2013-10-27 19:07:58 +08:00			`def _real_initialize(self):`
			`self._login()`

Move Facebook into its own file 2013-06-24 02:59:45 +08:00			`def _real_extract(self, url):`
[facebook] Make thumbnail and duration optional Fixes #4425. Looks like both properties aren't given to us anymore. For now, just fall back to not returning them. 2014-12-10 22:18:34 +08:00			`video_id = self._match_id(url)`
Move Facebook into its own file 2013-06-24 02:59:45 +08:00			`url = 'https://www.facebook.com/video/video.php?v=%s' % video_id`
			`webpage = self._download_webpage(url, video_id)`

			`BEFORE = '{swf.addParam(param[0], param[1]);});\n'`
			`AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'`
			`m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)`
			`if not m:`
[facebook] Report a more meaningful message if the video cannot be accessed (closes #1658) 2013-10-27 19:09:46 +08:00			`m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)`
			`if m_msg is not None:`
[facebook] Don't recommend to report the issue if the video is private. 2013-10-27 19:13:55 +08:00			`raise ExtractorError(`
[facebook] Modernize 2014-03-04 10:36:54 +08:00			`'The video is not available, Facebook said: "%s"' % m_msg.group(1),`
[facebook] Don't recommend to report the issue if the video is private. 2013-10-27 19:13:55 +08:00			`expected=True)`
[facebook] Report a more meaningful message if the video cannot be accessed (closes #1658) 2013-10-27 19:09:46 +08:00			`else:`
[facebook] Modernize 2014-03-04 10:36:54 +08:00			`raise ExtractorError('Cannot parse data')`
Move Facebook into its own file 2013-06-24 02:59:45 +08:00			`data = dict(json.loads(m.group(1)))`
[facebook] Use compat_urllib_parse_unquote 2015-07-18 01:37:56 +08:00			`params_raw = compat_urllib_parse_unquote(data['params'])`
Move Facebook into its own file 2013-06-24 02:59:45 +08:00			`params = json.loads(params_raw)`
			`video_data = params['video_data'][0]`
[facebook] Extract all the formats (closes #5037) 2015-02-24 01:54:15 +08:00
			`formats = []`
			`for quality in ['sd', 'hd']:`
			`src = video_data.get('%s_src' % quality)`
			`if src is not None:`
			`formats.append({`
			`'format_id': quality,`
			`'url': src,`
			`})`
			`if not formats:`
			`raise ExtractorError('Cannot find video formats')`
Move Facebook into its own file 2013-06-24 02:59:45 +08:00
[facebook] Allow untitled videos (Fixes #1484) 2013-09-23 17:24:10 +08:00			`video_title = self._html_search_regex(`
[facebook] Improve title regex (Closes #5816) 2015-05-27 23:25:07 +08:00			`r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title',`
[facebook] Move the title extraction warning below (fixes #5820) 2015-05-26 13:37:15 +08:00			`default=None)`
[facebook] Fix and caption if title is empty 2014-09-13 15:01:57 +08:00			`if not video_title:`
			`video_title = self._html_search_regex(`
			`r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',`
[facebook] Move the title extraction warning below (fixes #5820) 2015-05-26 13:37:15 +08:00			`webpage, 'alternative title', fatal=False)`
[facebook] Fix support for untitled videos (Fixes #3757) 2014-09-15 21:10:24 +08:00			`video_title = limit_length(video_title, 80)`
[facebook] Fix and caption if title is empty 2014-09-13 15:01:57 +08:00			`if not video_title:`
			`video_title = 'Facebook video #%s' % video_id`
[facebook] extract uploader 2015-08-03 05:52:12 +08:00			`uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage))`
Move Facebook into its own file 2013-06-24 02:59:45 +08:00
[facebook] Modernize 2014-03-10 01:42:44 +08:00			`return {`
Move Facebook into its own file 2013-06-24 02:59:45 +08:00			`'id': video_id,`
			`'title': video_title,`
[facebook] Extract all the formats (closes #5037) 2015-02-24 01:54:15 +08:00			`'formats': formats,`
[facebook] Make thumbnail and duration optional Fixes #4425. Looks like both properties aren't given to us anymore. For now, just fall back to not returning them. 2014-12-10 22:18:34 +08:00			`'duration': int_or_none(video_data.get('video_duration')),`
			`'thumbnail': video_data.get('thumbnail_src'),`
[facebook] extract uploader 2015-08-03 05:52:12 +08:00			`'uploader': uploader,`
Move Facebook into its own file 2013-06-24 02:59:45 +08:00			`}`