youtube-dl/youtube_dl/extractor/googlesearch.py

from __future__ import unicode_literals

import itertools
import re

from .common import SearchInfoExtractor
from ..compat import (
    compat_urllib_parse,
)


class GoogleSearchIE(SearchInfoExtractor):
    """Search extractor that scrapes Google Video result pages.

    Result pages are fetched one after another and every hit that has a
    matching video thumbnail element is collected as a plain URL entry.
    """

    IE_DESC = 'Google Video search'
    _MAX_RESULTS = 1000
    IE_NAME = 'video.google:search'
    _SEARCH_KEY = 'gvsearch'
    _TEST = {
        'url': 'gvsearch15:python language',
        'info_dict': {
            'id': 'python language',
            'title': 'python language',
        },
        'playlist_count': 15,
    }

    def _get_n_results(self, query, n):
        """Build a playlist with at most ``n`` search hits for ``query``.

        Pages are requested with a ``start`` offset that grows by 10 per
        page. Fetching stops once at least ``n`` entries have been gathered
        or the downloaded page carries no ``id="pnnext"`` marker.

        Returns a dict with ``_type`` set to ``'playlist'``, ``id`` and
        ``title`` both set to ``query``, and ``entries`` holding the
        collected ``url`` entries truncated to ``n`` items.
        """
        found = []

        for page_index in itertools.count():
            page_url = (
                'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en'
                % (compat_urllib_parse.quote_plus(query), page_index * 10))
            webpage = self._download_webpage(
                page_url, 'gvsearch:' + query,
                note='Downloading result page ' + str(page_index + 1))

            hits = re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage)
            for position, hit in enumerate(hits, 1):
                # A hit without its numbered thumbnail element is treated as
                # a playlist and left out.
                if re.search(r'id="vidthumb%d"' % position, webpage):
                    found.append({
                        '_type': 'url',
                        'url': hit.group(1)
                    })

            # Keep paging only while more results are wanted and the page
            # advertises a successor.
            if len(found) < n and re.search(r'id="pnnext"', webpage):
                continue

            return {
                '_type': 'playlist',
                'id': query,
                'title': query,
                'entries': found[:n],
            }
[googlesearch] Fix start, and skip playlists (Fixes #2329) 2014-02-06 10:29:10 +08:00			`from __future__ import unicode_literals`

Move GoogleSearchIE into its own file 2013-06-24 02:32:49 +08:00			`import itertools`
			`import re`

			`from .common import SearchInfoExtractor`
Fix imports and general cleanup · Import from compat what comes from compat. Yes, some names are available in utils too, but that's an implementation detail. · Use _match_id consistently whenever possible · Fix some outdated tests · Use consistent valid URL (always match the whole protocol, no ^ at start required) · Use modern test definitions 2014-12-13 19:24:42 +08:00			`from ..compat import (`
Move GoogleSearchIE into its own file 2013-06-24 02:32:49 +08:00			`compat_urllib_parse,`
			`)`


			`class GoogleSearchIE(SearchInfoExtractor):`
[googlesearch] Fix start, and skip playlists (Fixes #2329) 2014-02-06 10:29:10 +08:00			`IE_DESC = 'Google Video search'`
Move GoogleSearchIE into its own file 2013-06-24 02:32:49 +08:00			`_MAX_RESULTS = 1000`
[googlesearch] Fix start, and skip playlists (Fixes #2329) 2014-02-06 10:29:10 +08:00			`IE_NAME = 'video.google:search'`
Move GoogleSearchIE into its own file 2013-06-24 02:32:49 +08:00			`_SEARCH_KEY = 'gvsearch'`
[googlesearch] Move test to extractor 2014-08-25 23:02:52 +08:00			`_TEST = {`
			`'url': 'gvsearch15:python language',`
			`'info_dict': {`
			`'id': 'python language',`
			`'title': 'python language',`
			`},`
			`'playlist_count': 15,`
			`}`
Move GoogleSearchIE into its own file 2013-06-24 02:32:49 +08:00
			`def _get_n_results(self, query, n):`
			`"""Get a specified number of results for a query"""`

[googlesearch] Fix start, and skip playlists (Fixes #2329) 2014-02-06 10:29:10 +08:00			`entries = []`
Move GoogleSearchIE into its own file 2013-06-24 02:32:49 +08:00			`res = {`
			`'_type': 'playlist',`
			`'id': query,`
[googlesearch] Fix start, and skip playlists (Fixes #2329) 2014-02-06 10:29:10 +08:00			`'title': query,`
Move GoogleSearchIE into its own file 2013-06-24 02:32:49 +08:00			`}`

[googlesearch] Fix start, and skip playlists (Fixes #2329) 2014-02-06 10:29:10 +08:00			`for pagenum in itertools.count():`
			`result_url = (`
			`'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en'`
			`% (compat_urllib_parse.quote_plus(query), pagenum * 10))`

			`webpage = self._download_webpage(`
			`result_url, 'gvsearch:' + query,`
			`note='Downloading result page ' + str(pagenum + 1))`

			`for hit_idx, mobj in enumerate(re.finditer(`
			`r'<h3 class="r"><a href="([^"]+)"', webpage)):`

			`# Skip playlists`
			`if not re.search(r'id="vidthumb%d"' % (hit_idx + 1), webpage):`
			`continue`
Move GoogleSearchIE into its own file 2013-06-24 02:32:49 +08:00
[googlesearch] Fix start, and skip playlists (Fixes #2329) 2014-02-06 10:29:10 +08:00			`entries.append({`
Move GoogleSearchIE into its own file 2013-06-24 02:32:49 +08:00			`'_type': 'url',`
			`'url': mobj.group(1)`
[googlesearch] Fix start, and skip playlists (Fixes #2329) 2014-02-06 10:29:10 +08:00			`})`
Move GoogleSearchIE into its own file 2013-06-24 02:32:49 +08:00
[googlesearch] Fix next page indicator check 2014-03-13 23:52:13 +08:00			`if (len(entries) >= n) or not re.search(r'id="pnnext"', webpage):`
[googlesearch] Fix start, and skip playlists (Fixes #2329) 2014-02-06 10:29:10 +08:00			`res['entries'] = entries[:n]`
Move GoogleSearchIE into its own file 2013-06-24 02:32:49 +08:00			`return res`