1
0
mirror of https://github.com/l1ving/youtube-dl synced 2025-03-28 09:47:14 +08:00

Correct Video ID if base64 is wrong

This commit is contained in:
Jeremy Mahieu 2020-04-29 21:49:17 +02:00
parent 2468a6fa64
commit 56359a6db7

View File

@ -10,6 +10,7 @@ import random
import re import re
import time import time
import traceback import traceback
import base64
from .common import InfoExtractor, SearchInfoExtractor from .common import InfoExtractor, SearchInfoExtractor
from ..jsinterp import JSInterpreter from ..jsinterp import JSInterpreter
@ -1609,12 +1610,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return urls[0] if urls else None return urls[0] if urls else None
@classmethod
def extract_id(cls, self, url=None):
    """Return the canonical video id found in a URL.

    Two call forms are accepted so that callers written against either
    signature keep working:

    * ``extract_id(url)`` - the one-argument form; nothing is printed.
    * ``extract_id(extractor, url)`` - ``extractor`` must provide
      ``to_screen(message)``; it is used to report a corrected id.

    Raises ExtractorError when ``url`` does not match ``cls._VALID_URL``.
    """
    if url is None:
        # One-argument call: the only positional argument is the URL and
        # there is no extractor instance to report to.
        self, url = None, self
    mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
    if mobj is None:
        raise ExtractorError('Invalid URL: %s' % url)
    video_id = mobj.group(2)
    # video_id is treated as unpadded URL-safe base64. The unused low bits
    # of the last character are ignored when decoding, so several spellings
    # decode to the same bytes. Re-encoding the decoded bytes yields the
    # canonical spelling (per the original change, non-canonical ids do not
    # always work, e.g. for age-gated videos).
    try:
        # Restore the padding that decoding requires; work on ASCII bytes.
        padded = (video_id + '=' * (-len(video_id) % 4)).encode('ascii')
        decoded_bytes = base64.urlsafe_b64decode(padded)
    except (TypeError, ValueError):
        # Not decodable as base64: leave the id exactly as it was matched.
        return video_id
    real_video_id = base64.urlsafe_b64encode(decoded_bytes).decode('ascii').rstrip('=')
    if len(real_video_id) != len(video_id):
        # The decoder discarded characters outside the alphabet, so the
        # re-encoded text is not a respelling of the same id.
        return video_id
    if real_video_id != video_id and self is not None:
        self.to_screen('Detected wrong video id %s, trying corrected id %s' % (video_id, real_video_id))
    return real_video_id
@staticmethod @staticmethod
def _extract_chapters(description, duration): def _extract_chapters(description, duration):
@ -1674,7 +1687,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
mobj = re.search(self._NEXT_URL_RE, url) mobj = re.search(self._NEXT_URL_RE, url)
if mobj: if mobj:
url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/') url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/')
video_id = self.extract_id(url) video_id = self.extract_id(self, url)
# Get video webpage # Get video webpage
url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id