youtube-dl/youtube_dl/extractor/slideshare.py

from __future__ import unicode_literals

import re
import json

from .common import InfoExtractor
from ..compat import (
    compat_urlparse,
)
from ..utils import (
    ExtractorError,
    get_element_by_id,
)


class SlideshareIE(InfoExtractor):
    """Extractor for video slideshows hosted on slideshare.net."""

    # The path component after the uploader is captured as ``title``; it is
    # only used as a display identifier while downloading the page.
    _VALID_URL = r'https?://(?:www\.)?slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)'

    _TEST = {
        'url': 'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity',
        'info_dict': {
            'id': '25665706',
            'ext': 'mp4',
            'title': 'Managing Scale and Complexity',
            'description': 'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix.',
        },
    }

    def _real_extract(self, url):
        """Build the info dict for the slideshow page at *url*.

        The page embeds a JSON object passed to
        ``$.extend(slideshare_object, ...)``; the video location and the
        slideshow metadata are read from that object.

        Returns a dict with the keys ``_type``, ``id``, ``title``, ``ext``,
        ``url``, ``thumbnail`` and ``description``. ``thumbnail`` and
        ``description`` are optional and are ``None`` when unavailable.

        Raises ExtractorError (marked as expected) when the slideshow type
        is anything other than ``'video'``.
        """
        mobj = re.match(self._VALID_URL, url)
        page_title = mobj.group('title')
        webpage = self._download_webpage(url, page_title)
        slideshare_obj = self._search_regex(
            r'\$\.extend\(slideshare_object,\s*(\{.*?\})\);',
            webpage, 'slideshare object')
        info = json.loads(slideshare_obj)
        slideshow = info['slideshow']
        if slideshow['type'] != 'video':
            raise ExtractorError('Webpage type is "%s": only video extraction is supported for Slideshare' % slideshow['type'], expected=True)

        # The media URL is the bucket base joined with '<doc>-SD.<ext>'.
        jsplayer = info['jsplayer']
        ext = jsplayer['video_extension']
        video_url = compat_urlparse.urljoin(
            jsplayer['video_bucket'], info['doc'] + '-SD.' + ext)

        # Prefer the dedicated description element; fall back to the
        # itemprop="description" paragraph without failing when absent.
        description = get_element_by_id('slideshow-description-paragraph', webpage) or self._html_search_regex(
            r'(?s)<p[^>]+itemprop="description"[^>]*>(.+?)</p>', webpage,
            'description', fatal=False)

        return {
            '_type': 'video',
            'id': slideshow['id'],
            'title': slideshow['title'],
            'ext': ext,
            'url': video_url,
            # The thumbnail is optional metadata: a missing key must not
            # abort an otherwise successful extraction.
            'thumbnail': slideshow.get('pin_image_url'),
            'description': description.strip() if description else None,
        }
[slideshare] Fix description extraction and modernize The ‘og:description’ property doesn’t contain the full description 2014-02-09 21:22:56 +08:00			`from __future__ import unicode_literals`

Add an extractor for Slideshare (closes #1400) 2013-09-10 17:19:58 +08:00			`import re`
			`import json`

			`from .common import InfoExtractor`
Fix imports and general cleanup · Import from compat what comes from compat. Yes, some names are available in utils too, but that's an implementation detail. · Use _match_id consistently whenever possible · Fix some outdated tests · Use consistent valid URL (always match the whole protocol, no ^ at start required) · Use modern test definitions 2014-12-13 19:24:42 +08:00			`from ..compat import (`
Add an extractor for Slideshare (closes #1400) 2013-09-10 17:19:58 +08:00			`compat_urlparse,`
Fix imports and general cleanup · Import from compat what comes from compat. Yes, some names are available in utils too, but that's an implementation detail. · Use _match_id consistently whenever possible · Fix some outdated tests · Use consistent valid URL (always match the whole protocol, no ^ at start required) · Use modern test definitions 2014-12-13 19:24:42 +08:00			`)`
			`from ..utils import (`
Add an extractor for Slideshare (closes #1400) 2013-09-10 17:19:58 +08:00			`ExtractorError,`
[slideshare] fix description extraction 2016-07-05 19:01:04 +08:00			`get_element_by_id,`
Add an extractor for Slideshare (closes #1400) 2013-09-10 17:19:58 +08:00			`)`


			`class SlideshareIE(InfoExtractor):`
Improve some _VALID_URLs 2016-09-08 19:29:05 +08:00			`_VALID_URL = r'https?://(?:www\.)?slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)'`
Add an extractor for Slideshare (closes #1400) 2013-09-10 17:19:58 +08:00
			`_TEST = {`
[slideshare] Fix description extraction and modernize The ‘og:description’ property doesn’t contain the full description 2014-02-09 21:22:56 +08:00			`'url': 'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity',`
			`'info_dict': {`
			`'id': '25665706',`
			`'ext': 'mp4',`
			`'title': 'Managing Scale and Complexity',`
			`'description': 'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix.',`
Add an extractor for Slideshare (closes #1400) 2013-09-10 17:19:58 +08:00			`},`
			`}`

			`def _real_extract(self, url):`
			`mobj = re.match(self._VALID_URL, url)`
			`page_title = mobj.group('title')`
			`webpage = self._download_webpage(url, page_title)`
			`slideshare_obj = self._search_regex(`
[slideshare] Fix extraction (#5279) 2015-03-26 23:46:20 +08:00			`r'\$\.extend\(slideshare_object,\s*(\{.*?\})\);',`
[slideshare] Fix description extraction and modernize The ‘og:description’ property doesn’t contain the full description 2014-02-09 21:22:56 +08:00			`webpage, 'slideshare object')`
Add an extractor for Slideshare (closes #1400) 2013-09-10 17:19:58 +08:00			`info = json.loads(slideshare_obj)`
[slideshare] Fix description extraction and modernize The ‘og:description’ property doesn’t contain the full description 2014-02-09 21:22:56 +08:00			`if info['slideshow']['type'] != 'video':`
			`raise ExtractorError('Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True)`
Add an extractor for Slideshare (closes #1400) 2013-09-10 17:19:58 +08:00
			`doc = info['doc']`
			`bucket = info['jsplayer']['video_bucket']`
			`ext = info['jsplayer']['video_extension']`
			`video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext)`
[slideshare] fix description extraction 2016-07-05 19:01:04 +08:00			`description = get_element_by_id('slideshow-description-paragraph', webpage) or self._html_search_regex(`
[slideshare] Fix extraction 2015-01-01 02:26:19 +08:00			`r'(?s)<p[^>]+itemprop="description"[^>]*>(.+?)</p>', webpage,`
[slideshare] Fix description 2014-04-11 08:19:15 +08:00			`'description', fatal=False)`
Add an extractor for Slideshare (closes #1400) 2013-09-10 17:19:58 +08:00
			`return {`
			`'_type': 'video',`
			`'id': info['slideshow']['id'],`
			`'title': info['slideshow']['title'],`
			`'ext': ext,`
			`'url': video_url,`
			`'thumbnail': info['slideshow']['pin_image_url'],`
[slideshare] fix description extraction 2016-07-05 19:01:04 +08:00			`'description': description.strip() if description else None,`
Add an extractor for Slideshare (closes #1400) 2013-09-10 17:19:58 +08:00			`}`