1
0
mirror of https://github.com/l1ving/youtube-dl synced 2025-02-09 22:43:11 +08:00

[iTunes] Add new extractor

This commit is contained in:
TRox1972 2016-05-23 18:53:12 +02:00
parent 4a12077855
commit 16000e0799
2 changed files with 60 additions and 0 deletions

View File

@ -337,6 +337,7 @@ from .internetvideoarchive import InternetVideoArchiveIE
from .iprima import IPrimaIE from .iprima import IPrimaIE
from .iqiyi import IqiyiIE from .iqiyi import IqiyiIE
from .ir90tv import Ir90TvIE from .ir90tv import Ir90TvIE
from .itunes import iTunesIE
from .ivi import ( from .ivi import (
IviIE, IviIE,
IviCompilationIE IviCompilationIE

View File

@ -0,0 +1,59 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
extract_attributes,
int_or_none,
unescapeHTML,
unified_strdate,
)
class iTunesIE(InfoExtractor):
    """Playlist extractor for collection pages on itunes.apple.com.

    The page is scraped for ``<tr ... role="row" ...>`` track rows, whose
    attributes carry the per-track metadata and preview URL, and for the
    ``__desc_popup_d_N`` JavaScript objects that hold each track's
    description and release date.
    """

    # NOTE(review): the display_id group is optional, but the '/' right
    # after it is not, so a URL without a display id only matches when it
    # contains an empty path segment ('.../section//id123'). Left unchanged
    # because widening the pattern alters URL routing -- confirm intent.
    _VALID_URL = r'https?://itunes\.apple\.com/[a-z]{2}/[a-z0-9-]+/(?P<display_id>[a-z0-9-]+)?/(?:id)?(?P<id>[0-9]+)'
    _TEST = {
        'url': 'https://itunes.apple.com/us/itunes-u/uc-davis-symphony-orchestra/id403834767',
        'info_dict': {
            'id': '403834767',
            'title': 'UC Davis Symphony Orchestra & University Chorus',
        },
        'playlist_count': 31,
    }

    def _real_extract(self, url):
        """Download the collection page and return its tracks as a playlist.

        Rows that lack an id or a preview URL are skipped instead of
        aborting the extraction; a missing or unparsable description blob
        only costs that track its description and release date.
        """
        mobj = re.match(self._VALID_URL, url)
        playlist_id, display_id = mobj.group('id', 'display_id')
        if not display_id:
            display_id = playlist_id

        webpage = self._download_webpage(url, display_id)

        # One JavaScript object literal per track, in page order.
        video_infos = re.findall(
            r'var\s+__desc_popup_d_\d+\s*=\s*({[^><]+});', webpage)
        # Opening tags of the track table rows.
        html_entries = re.findall(r'<tr\s+[^>]*role="row"[^>]+>', webpage)

        entries = []
        for idx, html_entry in enumerate(html_entries):
            entry = extract_attributes(html_entry)

            track_id = entry.get('adam-id')
            media_url = (
                entry.get('audio-preview-url') or
                entry.get('video-preview-url'))
            # A row without an id or a media URL cannot be downloaded
            # (e.g. a non-track row that happens to carry role="row").
            if not track_id or not media_url:
                continue

            # Description blobs are paired with rows by position; the page
            # may carry fewer blobs than rows, and the blob is optional
            # metadata, so neither case is allowed to be fatal.
            video_info = {}
            if idx < len(video_infos):
                video_info = self._parse_json(
                    video_infos[idx], display_id, fatal=False) or {}

            preview_title = entry.get('preview-title')

            entries.append({
                'id': track_id,
                # Fall back to the id so the mandatory title is never empty.
                'title': preview_title or track_id,
                'description': video_info.get('description'),
                'url': media_url,
                'duration': int_or_none(entry.get('duration')),
                'release_date': unified_strdate(video_info.get('release_date')),
                'track': unescapeHTML(preview_title),
                'track_number': int_or_none(entry.get('row-number')),
                'track_id': track_id,
                'artist': unescapeHTML(entry.get('preview-artist')),
                'album': unescapeHTML(entry.get('preview-album')),
            })

        # Non-greedy so that a second </h1> on the same line is not swallowed.
        title = self._html_search_regex(
            r'<h1>(.+?)</h1>', webpage, 'title')

        return self.playlist_result(entries, playlist_id, title)