youtube-dl/youtube_dl/extractor/photobucket.py

from __future__ import unicode_literals

import json
import re

from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote


class PhotobucketIE(InfoExtractor):
    """Information extractor for videos hosted on photobucket.com.

    The video id and the container extension are taken from the page URL
    itself (see ``_VALID_URL``); everything else comes from the JSON blob
    the page hands to ``Pb.Data.Shared.put(Pb.Data.Shared.MEDIA, ...)``.
    """

    _VALID_URL = r'https?://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    _TEST = {
        'url': 'http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0',
        'md5': '7dabfb92b0a31f6c16cebc0f8e60ff99',
        'info_dict': {
            'id': 'zpsc0c3b9fa',
            'ext': 'mp4',
            'timestamp': 1367669341,
            'upload_date': '20130504',
            'uploader': 'rachaneronas',
            'title': 'Tired of Link Building? Try BacklinkMyDomain.com!',
        }
    }

    def _real_extract(self, url):
        """Build the info dict for the Photobucket video page at *url*.

        Returns a dict with the keys ``id``, ``url``, ``uploader``,
        ``timestamp``, ``title``, ``ext`` and ``thumbnail``.  The optional
        fields (``uploader``, ``timestamp``, ``thumbnail``) are ``None``
        when the page's media JSON does not provide them.

        Raises:
            KeyError: if the media JSON lacks ``title`` or
                ``linkcodes``/``html``, which are required to produce a
                usable result.
            ValueError: if the captured media blob is not valid JSON.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_extension = mobj.group('ext')

        webpage = self._download_webpage(url, video_id)

        # The page embeds all media metadata as one JSON argument to
        # Pb.Data.Shared.put(); capture and decode that argument.
        self.report_extraction(video_id)
        info_json = self._search_regex(
            r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (.*?)\);',
            webpage, 'info json')
        info = json.loads(info_json)

        # The direct media link is only present, percent-encoded, as the
        # ``file=`` parameter inside the HTML embed code.
        # A separate name is used so the ``url`` argument (the page URL)
        # is not silently replaced by the media URL.
        video_url = compat_urllib_parse_unquote(self._html_search_regex(
            r'file=(.+\.mp4)', info['linkcodes']['html'], 'url'))

        return {
            'id': video_id,
            'url': video_url,
            # Optional metadata: a missing key must not abort an
            # otherwise successful extraction, hence .get().
            'uploader': info.get('username'),
            'timestamp': info.get('creationDate'),
            'title': info['title'],
            'ext': video_extension,
            'thumbnail': info.get('thumbUrl'),
        }
[photobucket] Modernize and remove the old extraction code 2014-03-10 02:36:46 +08:00			`from __future__ import unicode_literals`

Move Photobucket into its own file 2013-06-24 02:12:18 +08:00			`import json`
			`import re`

			`from .common import InfoExtractor`
[photobucket] Use compat_urllib_parse_unquote 2015-07-18 01:45:55 +08:00			`from ..compat import compat_urllib_parse_unquote`
Move Photobucket into its own file 2013-06-24 02:12:18 +08:00

			`class PhotobucketIE(InfoExtractor):`
Add support for https for all extractors as preventive and future-proof measure 2016-03-21 23:36:32 +08:00			`_VALID_URL = r'https?://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'`
Move tests to the IE definitions 2013-06-28 02:46:46 +08:00			`_TEST = {`
[photobucket] More unicode literals 2014-03-10 03:24:19 +08:00			`'url': 'http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0',`
			`'md5': '7dabfb92b0a31f6c16cebc0f8e60ff99',`
			`'info_dict': {`
Fix imports and general cleanup · Import from compat what comes from compat. Yes, some names are available in utils too, but that's an implementation detail. · Use _match_id consistently whenever possible · Fix some outdated tests · Use consistent valid URL (always match the whole protocol, no ^ at start required) · Use modern test definitions 2014-12-13 19:24:42 +08:00			`'id': 'zpsc0c3b9fa',`
			`'ext': 'mp4',`
test_download works for photobucket after this change 2014-05-16 23:43:03 +08:00			`'timestamp': 1367669341,`
[photobucket] Modernize and remove the old extraction code 2014-03-10 02:36:46 +08:00			`'upload_date': '20130504',`
			`'uploader': 'rachaneronas',`
			`'title': 'Tired of Link Building? Try BacklinkMyDomain.com!',`
Move tests to the IE definitions 2013-06-28 02:46:46 +08:00			`}`
			`}`
Move Photobucket into its own file 2013-06-24 02:12:18 +08:00
			`def _real_extract(self, url):`
			`mobj = re.match(self._VALID_URL, url)`
			`video_id = mobj.group('id')`
			`video_extension = mobj.group('ext')`

			`webpage = self._download_webpage(url, video_id)`

			`# Extract URL, uploader, and title from webpage`
			`self.report_extraction(video_id)`
[photobucket] Modernize and remove the old extraction code 2014-03-10 02:36:46 +08:00			`info_json = self._search_regex(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (.*?)\);',`
PEP8: applied even more rules 2014-11-24 04:39:15 +08:00			`webpage, 'info json')`
[photobucket] Modernize and remove the old extraction code 2014-03-10 02:36:46 +08:00			`info = json.loads(info_json)`
[photobucket] Use compat_urllib_parse_unquote 2015-07-18 01:45:55 +08:00			`url = compat_urllib_parse_unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url'))`
[photobucket] Modernize and remove the old extraction code 2014-03-10 02:36:46 +08:00			`return {`
			`'id': video_id,`
test_download works for photobucket after this change 2014-05-16 23:43:03 +08:00			`'url': url,`
[photobucket] More unicode literals 2014-03-10 03:24:19 +08:00			`'uploader': info['username'],`
test_download works for photobucket after this change 2014-05-16 23:43:03 +08:00			`'timestamp': info['creationDate'],`
[photobucket] More unicode literals 2014-03-10 03:24:19 +08:00			`'title': info['title'],`
[photobucket] Modernize and remove the old extraction code 2014-03-10 02:36:46 +08:00			`'ext': video_extension,`
[photobucket] More unicode literals 2014-03-10 03:24:19 +08:00			`'thumbnail': info['thumbUrl'],`
[photobucket] Modernize and remove the old extraction code 2014-03-10 02:36:46 +08:00			`}`