diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 64d1fa251..f67c95db6 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -856,9 +856,15 @@ from .porncom import PornComIE
 from .pornhd import PornHdIE
 from .pornhub import (
     PornHubIE,
-    PornHubUserIE,
-    PornHubPagedVideoListIE,
-    PornHubUserVideosUploadIE,
+    PornHubProfileIE,
+    PornHubProfileVideosIE,
+    PornHubPlaylistIE,
+)
+from .pornhubpremium import (
+    PornHubPremiumIE,
+    PornHubPremiumProfileIE,
+    PornHubPremiumProfileVideosIE,
+    PornHubPremiumPlaylistIE,
 )
 from .pornotube import PornotubeIE
 from .pornovoisines import PornoVoisinesIE
@@ -1129,6 +1135,7 @@ from .thisamericanlife import ThisAmericanLifeIE
 from .thisav import ThisAVIE
 from .thisoldhouse import ThisOldHouseIE
 from .threeqsdn import ThreeQSDNIE
+from .thumbzilla import ThumbzillaIE
 from .tiktok import (
     TikTokIE,
     TikTokUserIE,
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index b8f65af7c..2640d2b14 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -7,256 +7,301 @@ import operator
 import re
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_HTTPError,
-    compat_str,
-    compat_urllib_request,
-)
-from .openload import PhantomJSwrapper
-from ..utils import (
-    determine_ext,
-    ExtractorError,
-    int_or_none,
-    NO_DEFAULT,
-    orderedSet,
-    remove_quotes,
-    str_to_int,
-    url_or_none,
-)
+from ..compat import compat_HTTPError
+from ..compat import compat_str
+from ..utils import ExtractorError
+from ..utils import NO_DEFAULT
+from ..utils import determine_ext
+from ..utils import get_element_by_id
+from ..utils import int_or_none
+from ..utils import orderedSet
+from ..utils import remove_quotes
+from ..utils import str_to_int
+from ..utils import url_or_none
+from ..utils import urlencode_postdata
+
+
+def _get_page(url, default=1):
+    """Returns the value of 'page' from the query string, or default."""
+    mobj = re.search(r'page=(\d+)', url)
+    return int_or_none(mobj.group(1), default=default) if mobj else default
+
+
+def _get_pkey(url, default=None):
+    """Returns the value of 'pkey' from the query string, or default."""
+    mobj = re.search(r'pkey=(\d+)', url)
+    return mobj.group(1) if mobj else default
+
+
+def _has_more(webpage):
+    """Returns True if webpage is a paged result and has more pages."""
+    if 'page_next' in webpage:
+        return True
+    if 'moreDataBtn' in webpage:
+        return True
+    if 'scrollLazyload' in webpage:
+        return True
+    return False
 
 
 class PornHubBaseIE(InfoExtractor):
-    def _download_webpage_handle(self, *args, **kwargs):
-        def dl(*args, **kwargs):
-            return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs)
+    """
+    PornHubBaseIE is the base class responsible for extracting videos from PornHub sites
+    like PornHub and PornHub Premium.
+    """
 
-        webpage, urlh = dl(*args, **kwargs)
+    _HOST = None  # Must be redefined in subclasses.
+    _VALID_URL = None  # Must be redefined in subclasses.
 
-        if any(re.search(p, webpage) for p in (
-                r'<body\b[^>]+\bonload=["\']go\(\)',
-                r'document\.cookie\s*=\s*["\']RNKEY=',
-                r'document\.location\.reload\(true\)')):
-            url_or_request = args[0]
-            url = (url_or_request.get_full_url()
-                   if isinstance(url_or_request, compat_urllib_request.Request)
-                   else url_or_request)
-            phantom = PhantomJSwrapper(self, required_version='2.0')
-            phantom.get(url, html=webpage)
-            webpage, urlh = dl(*args, **kwargs)
+    def _login(self):
+        """Must be redefined in subclasses."""
+        raise NotImplementedError()
 
-        return webpage, urlh
+    def _set_cookies(self):
+        self._set_cookie(self._HOST, 'age_verified', '1')
+        self._set_cookie(self._HOST, 'platform', 'pc')
 
+    def _real_initialize(self):
+        self._set_cookies()
+        self._login()
 
-class PornHubIE(PornHubBaseIE):
-    IE_DESC = 'PornHub and Thumbzilla'
-    _VALID_URL = r'''(?x)
-                    https?://
-                    (?:
-                        (?:[^/]+\.)?(?P