Add an extractor for orf.at (closes #1346)

Make find_xpath_attr also accept numbers in the value
2024-11-20 20:42:52 +08:00 · 2013-08-29 19:16:07 +02:00 · 2013-08-29 19:16:07 +02:00 · 545434670b
commit 545434670b
parent 54fda45bac
3 changed files with 67 additions and 1 deletions
--- a/youtube_dl/extractor/init.py
+++ b/youtube_dl/extractor/init.py
@ -59,6 +59,7 @@ from .myvideo import MyVideoIE
 from .nba import NBAIE
 from .nbc import NBCNewsIE
 from .ooyala import OoyalaIE
 from .orf import ORFIE
 from .pbs import PBSIE
 from .photobucket import PhotobucketIE
 from .pornotube import PornotubeIE
--- a/youtube_dl/extractor/orf.py
+++ b/youtube_dl/extractor/orf.py
@ -0,0 +1,65 @@
 import re
 import xml.etree.ElementTree
 import json
 from .common import InfoExtractor
 from ..utils import (
    compat_urlparse,
    ExtractorError,
    find_xpath_attr,
 )
 class ORFIE(InfoExtractor):
    _VALID_URL = r'https?://tvthek.orf.at/(programs/.+?/episodes|topics/.+?)/(?P<id>\d+)'
    _TEST = {
        u'url': u'http://tvthek.orf.at/programs/1171769-Wetter-ZIB/episodes/6557323-Wetter',
        u'file': u'6566957.flv',
        u'info_dict': {
            u'title': u'Wetter',
            u'description': u'Christa Kummer, Marcus Wadsak und Kollegen  präsentieren abwechselnd ihre täglichen Wetterprognosen für Österreich.\r \r Mehr Wetter unter wetter.ORF.at',
        },
        u'params': {
            # It uses rtmp
            u'skip_download': True,
        }
    }
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        playlist_id = mobj.group('id')
        webpage = self._download_webpage(url, playlist_id)
        flash_xml = self._search_regex('ORF.flashXML = \'(.+?)\'', webpage, u'flash xml')
        flash_xml = compat_urlparse.parse_qs('xml='+flash_xml)['xml'][0]
        flash_config = xml.etree.ElementTree.fromstring(flash_xml.encode('utf-8'))
        playlist_json = self._search_regex(r'playlist\': \'(\[.*?\])\'', webpage, u'playlist').replace(r'\"','"')
        playlist = json.loads(playlist_json)
        videos = []
        ns = '{http://tempuri.org/XMLSchema.xsd}'
        xpath = '%(ns)sPlaylist/%(ns)sItems/%(ns)sItem' % {'ns': ns}
        webpage_description = self._og_search_description(webpage)
        for (i, (item, info)) in enumerate(zip(flash_config.findall(xpath), playlist), 1):
            # Get best quality url
            rtmp_url = None
            for q in ['Q6A', 'Q4A', 'Q1A']:
                video_url = find_xpath_attr(item, '%sVideoUrl' % ns, 'quality', q)
                if video_url is not None:
                    rtmp_url = video_url.text
                    break
            if rtmp_url is None:
                raise ExtractorError(u'Couldn\'t get video url: %s' % info['id'])
            description = self._html_search_regex(
                r'id="playlist_entry_%s".*?<p>(.*?)</p>' % i, webpage,
                u'description', default=webpage_description, flags=re.DOTALL)
            videos.append({
                '_type': 'video',
                'id': info['id'],
                'title': info['title'],
                'url': rtmp_url,
                'ext': 'flv',
                'description': description,
                })
        return videos
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@ -213,7 +213,7 @@ if sys.version_info >= (2,7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z]+$', key)
-        assert re.match(r'^[a-zA-Z@\s]*$', val)
+        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
        expr = xpath + u"[@%s='%s']" % (key, val)
        return node.find(expr)
 else: