1
0
mirror of https://github.com/l1ving/youtube-dl synced 2025-02-05 19:35:14 +08:00

[neteasemusic] Add subtitles for NetEaseMusic

1. Use .lrc format lyrics as subtitles if available.
2. Remove the 3rd digit after dot in NetEase's time tag to fit LRC
format standard for time tag.
3. Update lyrics_expr so that an empty string after a time tag is
accepted as a valid LRC line (music players use this to hide the previous item's text).
4. Add new regular expressions that match only the text of a line with
multiple time tags (as found in translations), so that the extra time tags are
not treated as part of the lyrics.

This will return at most 2 lyrics: one for the original text and one
for the translated text, if it exists.

Also add two extra tests that cover:
1. multiple time tags in one line,
2. multiple time tags in one line where the time needs to be fixed to
hundredths of a second rather
than milliseconds.
This commit is contained in:
Qijiang Fan 2015-09-26 21:51:08 +08:00
parent 46b4070f3f
commit 55ffed5e02

View File

@ -66,6 +66,7 @@ class NetEaseMusicIE(NetEaseMusicBaseIE):
IE_DESC = '网易云音乐' IE_DESC = '网易云音乐'
_VALID_URL = r'https?://music\.163\.com/(#/)?song\?id=(?P<id>[0-9]+)' _VALID_URL = r'https?://music\.163\.com/(#/)?song\?id=(?P<id>[0-9]+)'
_TESTS = [{ _TESTS = [{
'note': 'origin + translated lyrics, with time tag need to be fixed',
'url': 'http://music.163.com/#/song?id=32102397', 'url': 'http://music.163.com/#/song?id=32102397',
'md5': 'f2e97280e6345c74ba9d5677dd5dcb45', 'md5': 'f2e97280e6345c74ba9d5677dd5dcb45',
'info_dict': { 'info_dict': {
@ -75,7 +76,10 @@ class NetEaseMusicIE(NetEaseMusicBaseIE):
'creator': 'Taylor Swift / Kendrick Lamar', 'creator': 'Taylor Swift / Kendrick Lamar',
'upload_date': '20150517', 'upload_date': '20150517',
'timestamp': 1431878400, 'timestamp': 1431878400,
'description': 'md5:a10a54589c2860300d02e1de821eb2ef', 'subtitles': {
'origin': [{'ext': 'lrc', 'data': 'md5:eb9ae90502b435de7d9e99fc7602adb4'}],
'translated': [{'ext': 'lrc', 'data': 'md5:cadca69fdfb7b679d273cc01a518f7dd'}],
}
}, },
}, { }, {
'note': 'No lyrics translation.', 'note': 'No lyrics translation.',
@ -87,7 +91,9 @@ class NetEaseMusicIE(NetEaseMusicBaseIE):
'creator': '周杰伦', 'creator': '周杰伦',
'upload_date': '20141225', 'upload_date': '20141225',
'timestamp': 1419523200, 'timestamp': 1419523200,
'description': 'md5:a4d8d89f44656af206b7b2555c0bce6c', 'subtitles': {
'origin': [{'ext': 'lrc', 'data': 'md5:a1766edaa6dbc85357f0ae9feabc867b'}],
}
}, },
}, { }, {
'note': 'No lyrics.', 'note': 'No lyrics.',
@ -99,6 +105,7 @@ class NetEaseMusicIE(NetEaseMusicBaseIE):
'creator': 'Dustin O\'Halloran', 'creator': 'Dustin O\'Halloran',
'upload_date': '20080211', 'upload_date': '20080211',
'timestamp': 1202745600, 'timestamp': 1202745600,
'subtitles': {}
}, },
}, { }, {
'note': 'Has translated name.', 'note': 'Has translated name.',
@ -108,30 +115,88 @@ class NetEaseMusicIE(NetEaseMusicBaseIE):
'ext': 'mp3', 'ext': 'mp3',
'title': '소원을 말해봐 (Genie)', 'title': '소원을 말해봐 (Genie)',
'creator': '少女时代', 'creator': '少女时代',
'description': 'md5:79d99cc560e4ca97e0c4d86800ee4184',
'upload_date': '20100127', 'upload_date': '20100127',
'timestamp': 1264608000, 'timestamp': 1264608000,
'alt_title': '说出愿望吧(Genie)', 'alt_title': '说出愿望吧(Genie)',
'subtitles': {
'origin': [{'ext': 'lrc', 'data': 'md5:8d5782f92bb275b9a6acd01e9ffd12b9'}],
'translated': [{'ext': 'lrc', 'data': 'md5:d8270e3375fd305f92b18ad78585cabb'}],
}
}
}, {
'note': 'some lines with multiple time tag',
'url': 'http://music.163.com/#/song?id=4926366',
'info_dict': {
'id': '4926366',
'ext': 'mp3',
'title': 'sweet&sweet holiday',
'timestamp': 1306252800,
'upload_date': '20110524',
'subtitles': {
'origin': [{'ext': 'lrc', 'data': 'md5:9971db3f0361b0b66d47ba5c95bbff35'}],
'translated': [{'ext': 'lrc', 'data': 'md5:ce5f9eef13ae4948b01bbb015fc507e1'}],
}
}
}, {
'note': 'some lines with multiple time tag and time need to be fixed',
'url': 'http://music.163.com/#/song?id=22826396',
'info_dict': {
'id': '22826396',
'ext': 'mp3',
'title': 'God knows...',
'timestamp': 1306252800,
'upload_date': '20110524',
'subtitles': {
'origin': [{'ext': 'lrc', 'data': 'md5:5ca2952ed8974f2c28beb1c0f89e2ab5'}],
'translated': [{'ext': 'lrc', 'data': 'md5:5ed0d4adb337594f927674d351ba6626'}],
}
} }
}] }]
def _fix_timestamp(self, timestamp):
# Netease returns timestamp use 2 or 3 digits for less than
# 1 second metrics
# While standard LRC requires exact 2 digits
# remvoe the 3rd digit if the there're three
match_expr = r'\[([0-9]{2}:[0-9]{2}\.[0-9]{2})(.*)\]'
match_result = re.match(match_expr, timestamp)
if match_result:
# This must match as match_expr is exact the first group of lyrics_expr
return '[' + match_result.group(1) + ']'
else:
# return a valid timestamp to avoid other applications' error
return '[00:00.00]'
def _fix_lyric_timestamp(self, line):
text = line
corrected_times = ''
lyrics_expr = r'(\[[0-9]{2}:[0-9]{2}\.[0-9]{2,}\])([^\n]*)'
# Handle time tag with one or more time in one line.
while True:
m = re.match(lyrics_expr, text)
if m:
corrected_times += self._fix_timestamp(m.group(1))
text = m.group(2)
else:
break
return corrected_times + text
def _process_lyrics(self, lyrics_info): def _process_lyrics(self, lyrics_info):
original = lyrics_info.get('lrc', {}).get('lyric') original = lyrics_info.get('lrc', {}).get('lyric')
translated = lyrics_info.get('tlyric', {}).get('lyric') translated = lyrics_info.get('tlyric', {}).get('lyric')
if not translated: if not translated:
return original translated = ''
if not original:
original = ''
lyrics_expr = r'(\[[0-9]{2}:[0-9]{2}\.[0-9]{2,}\])([^\n]+)' lyrics_expr = r'\[[0-9]{2}:[0-9]{2}\.[0-9]{2,}\][^\n]*'
original_ts_texts = re.findall(lyrics_expr, original) original_ts_texts = re.findall(lyrics_expr, original)
translation_ts_dict = dict( translated_ts_texts = re.findall(lyrics_expr, translated)
(time_stamp, text) for time_stamp, text in re.findall(lyrics_expr, translated) gen_lyrics = lambda texts: '\n'.join(map(self._fix_lyric_timestamp, texts)) + '\n' if texts else ''
) lyrics_original = gen_lyrics(original_ts_texts)
lyrics = '\n'.join([ lyrics_translated = gen_lyrics(translated_ts_texts)
'%s%s / %s' % (time_stamp, text, translation_ts_dict.get(time_stamp, '')) return lyrics_original, lyrics_translated
for time_stamp, text in original_ts_texts
])
return lyrics
def _real_extract(self, url): def _real_extract(self, url):
song_id = self._match_id(url) song_id = self._match_id(url)
@ -150,13 +215,13 @@ class NetEaseMusicIE(NetEaseMusicBaseIE):
lyrics_info = self.query_api( lyrics_info = self.query_api(
'song/lyric?id=%s&lv=-1&tv=-1' % song_id, 'song/lyric?id=%s&lv=-1&tv=-1' % song_id,
song_id, 'Downloading lyrics data') song_id, 'Downloading lyrics data')
lyrics = self._process_lyrics(lyrics_info) lyrics_original, lyrics_translated = self._process_lyrics(lyrics_info)
alt_title = None alt_title = None
if info.get('transNames'): if info.get('transNames'):
alt_title = '/'.join(info.get('transNames')) alt_title = '/'.join(info.get('transNames'))
return { ret = {
'id': song_id, 'id': song_id,
'title': info['name'], 'title': info['name'],
'alt_title': alt_title, 'alt_title': alt_title,
@ -164,10 +229,21 @@ class NetEaseMusicIE(NetEaseMusicBaseIE):
'timestamp': self.convert_milliseconds(info.get('album', {}).get('publishTime')), 'timestamp': self.convert_milliseconds(info.get('album', {}).get('publishTime')),
'thumbnail': info.get('album', {}).get('picUrl'), 'thumbnail': info.get('album', {}).get('picUrl'),
'duration': self.convert_milliseconds(info.get('duration', 0)), 'duration': self.convert_milliseconds(info.get('duration', 0)),
'description': lyrics, 'description': '',
'subtitles': {},
'formats': formats, 'formats': formats,
} }
def update_lyrics(info_dict, key, content):
if content:
info_dict['subtitles'][key] = [{
'ext': 'lrc',
'data': content
}]
update_lyrics(ret, 'origin', lyrics_original)
update_lyrics(ret, 'translated', lyrics_translated)
return ret
class NetEaseMusicAlbumIE(NetEaseMusicBaseIE): class NetEaseMusicAlbumIE(NetEaseMusicBaseIE):
IE_NAME = 'netease:album' IE_NAME = 'netease:album'