[iTunes] Add new extractor

The extractor only works for free content, like most podcasts, i.e. it does not download 30-second previews of paid songs.
2025-01-24 04:53:06 +08:00 · 2017-09-14 14:34:19 +02:00 · 2017-09-14 14:34:19 +02:00 · 82260ae1f8
commit 82260ae1f8
parent 757984af90
2 changed files with 59 additions and 0 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -465,6 +465,7 @@ from .internetvideoarchive import InternetVideoArchiveIE
 from .iprima import IPrimaIE
 from .iqiyi import IqiyiIE
 from .ir90tv import Ir90TvIE
+from .itunes import iTunesIE
 from .itv import ITVIE
 from .ivi import (
    IviIE,
--- a/youtube_dl/extractor/itunes.py
+++ b/youtube_dl/extractor/itunes.py
@ -0,0 +1,58 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+from ..utils import (
+    extract_attributes,
+    int_or_none,
+    unescapeHTML,
+    unified_strdate,
+)
+
+
+class iTunesIE(InfoExtractor):
+    _VALID_URL = r'https?://itunes\.apple\.com/[a-z]{2}?/?[a-z0-9-]+/?(?P<display_id>[a-z0-9-]+)?/(?:id)?(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'https://itunes.apple.com/us/itunes-u/uc-davis-symphony-orchestra/id403834767',
+        'info_dict': {
+            'id': '403834767',
+            'title': 'UC Davis Symphony Orchestra & University Chorus',
+        },
+        'playlist_count': 31,
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        playlist_id, display_id = mobj.group('id', 'display_id')
+        if not display_id:
+            display_id = playlist_id
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_infos = re.findall(r'var\s+__desc_popup_d_\d+\s*=\s*({[^><]+});', webpage)
+        html_entries = re.findall(r'<tr\s+[^>]*role="row"[^>]+>', webpage)
+
+        entries = []
+        for idx, html_entry in enumerate(html_entries):
+            video_info = self._parse_json(video_infos[idx], display_id)
+            entry = extract_attributes(html_entry)
+            entries.append({
+                'id': entry['adam-id'],
+                'title': entry['preview-title'],
+                'description': video_info.get('description'),
+                'url': entry.get('audio-preview-url', entry.get('video-preview-url')),
+                'duration': int_or_none(entry.get('duration')),
+                'release_date': unified_strdate(video_info.get('release_date')),
+                'track': unescapeHTML(entry.get('preview-title')),
+                'track_number': int_or_none(entry.get('row-number')),
+                'track_id': entry.get('adam-id'),
+                'artist': unescapeHTML(entry.get('preview-artist')),
+                'album': unescapeHTML(entry.get('preview-album')),
+            })
+
+        title = self._html_search_regex(r'<h1>(.+)</h1>', webpage, 'title')
+
+        return self.playlist_result(entries, playlist_id, title)