From 471f398772782ff73542c7fa1490c9ba31eed884 Mon Sep 17 00:00:00 2001 From: rr- Date: Wed, 30 Aug 2017 12:01:19 +0200 Subject: [PATCH] [bandcamp] Extract more metadata --- test/helper.py | 2 +- youtube_dl/extractor/bandcamp.py | 80 +++++++++++++++++++++++++------- 2 files changed, 65 insertions(+), 17 deletions(-) diff --git a/test/helper.py b/test/helper.py index dfee217a9..87f4f7612 100644 --- a/test/helper.py +++ b/test/helper.py @@ -186,7 +186,7 @@ def expect_info_dict(self, got_dict, expected_dict): # Are checkable fields missing from the test case definition? test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value)) for key, value in got_dict.items() - if value and key in ('id', 'title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location', 'age_limit')) + if value and key in ('id', 'title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location', 'age_limit', 'album', 'artist', 'track', 'track_number', 'release_year', 'release_date')) missing_keys = set(test_info_dict.keys()) - set(expected_dict.keys()) if missing_keys: def _repr(v): diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index be41bd5a2..a73cb48f7 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -30,8 +30,11 @@ class BandcampIE(InfoExtractor): 'info_dict': { 'id': '1812978515', 'ext': 'mp3', - 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'track': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'title': "youtube-dl \\ - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", 'duration': 9.8485, + 'uploader': 'youtube-dl \\', + 'artist': 'youtube-dl \\', }, '_skip': 'There is a limit of 200 free downloads / month for the test song' }, { @@ -40,8 +43,29 @@ class BandcampIE(InfoExtractor): 'info_dict': { 'id': '2650410135', 'ext': 'aiff', - 'title': 'Ben Prunty - Lanius (Battle)', + 'album': 'FTL: Advanced Edition Soundtrack', + 'artist': 'Ben Prunty', 'uploader': 'Ben Prunty', + 'release_date': '20140403', + 'release_year': 2014, + 'track_number': 1, + 'track': 'Lanius (Battle)', + 'title': 'Ben Prunty - Lanius (Battle)', + }, + }, { + 'url': 'https://billbaxter.bandcamp.com/track/drone-city-pt-3-3', + 'md5': 'e8e24365cb38ff841b4e5df014f988ed', + 'info_dict': { + 'id': '3755531036', + 'ext': 'mp3', + 'album': 'Drone City', + 'artist': 'The ambient drones of Bill Baxter', + 'uploader': 'The ambient drones of Bill Baxter', + 'release_date': '20160326', + 'release_year': 2016, + 'track_number': 3, + 'track': 'Drone City, Pt. 3', + 'title': 'The ambient drones of Bill Baxter - Drone City, Pt. 3', }, }] @@ -51,11 +75,25 @@ class BandcampIE(InfoExtractor): webpage = self._download_webpage(url, title) thumbnail = self._html_search_meta('og:image', webpage, default=None) m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) + m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage) + json_code = m_trackinfo.group(1) if m_trackinfo else None + data = json.loads(json_code)[0] if json_code else None + + match = re.search(r'album_title\s*:\s*"([^"]+)"', webpage) + album_title = match.group(1) if match else None + + match = re.search(r'artist\s*:\s*"([^"]+)"', webpage) + artist = match.group(1) if match else None + + match = re.search(r'album_release_date\s*:\s*"([^"]+)"', webpage) + release_date = unified_strdate(match.group(1)) if match else None + release_year = int(release_date[0:4]) if release_date else None + + track = data['title'] if data else None + title = '%s - %s' % (artist, track) if artist else track + if not m_download: - m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage) - if m_trackinfo: - json_code = m_trackinfo.group(1) - data = json.loads(json_code)[0] + if data: track_id = compat_str(data['id']) if not data.get('file'): @@ -77,7 +115,15 @@ class BandcampIE(InfoExtractor): return { 'id': track_id, - 'title': data['title'], + 'album': album_title, + 'uploader': artist, + 'artist': artist, + 'track_id': track_id, + 'track_number': data.get('track_num'), + 'release_date': release_date, + 'release_year': release_year, + 'track': track, + 'title': title, 'thumbnail': thumbnail, 'formats': formats, 'duration': float_or_none(data.get('duration')), @@ -99,13 +145,9 @@ class BandcampIE(InfoExtractor): 'blob', group='blob'), video_id, transform_source=unescapeHTML) - info = blob['digital_items'][0] + digital_items = blob['digital_items'][0] - downloads = info['downloads'] - track = info['title'] - - artist = info.get('artist') - title = '%s - %s' % (artist, track) if artist else track + downloads = digital_items['downloads'] download_formats = {} for f in blob['download_formats']: @@ -146,10 +188,16 @@ class BandcampIE(InfoExtractor): return { 'id': video_id, - 'title': title, - 'thumbnail': info.get('thumb_url') or thumbnail, - 'uploader': info.get('artist'), + 'album': album_title, + 'uploader': artist, 'artist': artist, + 'track_id': video_id, + 'track_number': data.get('track_num'), + 'release_date': release_date, + 'release_year': release_year, + 'track': track, + 'title': title, + 'thumbnail': digital_items.get('thumb_url') or thumbnail, 'track': track, 'formats': formats, }