From 8636f424288295142d061d1ea1b59e3cb5caafb2 Mon Sep 17 00:00:00 2001
From: Throaway
Date: Mon, 20 Mar 2017 16:29:39 -0700
Subject: [PATCH] Fix issue #12470 - Parse out encoded PH video URLs

---
 youtube_dl/extractor/pornhub.py | 48 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index 9b413590a..eb316ad14 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -129,9 +129,51 @@ class PornHubIE(InfoExtractor):
 
         tv_webpage = dl_webpage('tv')
 
-        video_url = self._search_regex(
-            r'<video[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//.+?)\1', tv_webpage,
-            'video url', group='url')
+        # The video URL is assembled in JavaScript: a run of "var x=...;"
+        # assignments whose values are double-quoted strings and earlier
+        # variables joined with "+", possibly interleaved with /* */ comments.
+        encoded_url = self._search_regex(
+            r'(var.*mediastring.*)', tv_webpage, 'encoded url')
+        js_vars = {}
+
+        def parse_js_value(inp):
+            """Evaluate a JS string expression and return its value.
+
+            Supports double-quoted literals, names already stored in js_vars
+            and "+" concatenation of those; raises ValueError otherwise.
+            """
+            # Non-greedy, so comments that contain "*" are removed as well
+            inp = re.sub(r'/\*.*?\*/', '', inp)
+
+            # NOTE: splits on every "+", including one inside a string literal
+            if '+' in inp:
+                return ''.join(
+                    parse_js_value(part) for part in inp.split('+'))
+
+            inp = inp.strip()
+            if inp in js_vars:
+                return js_vars[inp]
+
+            # Not a known variable, so it has to be a quoted string
+            if len(inp) < 2 or inp[0] != '"' or inp[-1] != '"':
+                raise ValueError('Unsupported JS value: %r' % inp)
+            return inp[1:-1]
+
+        for assn in encoded_url.split(';'):
+            assn = assn.strip()
+            if not assn:
+                continue
+
+            if not assn.startswith('var ') or '=' not in assn:
+                raise ValueError('Unsupported JS statement: %r' % assn)
+            vname, value = assn[len('var '):].split('=', 1)
+            # Strip the name so "var a = ..." is stored as "a", the form
+            # parse_js_value looks up
+            js_vars[vname.strip()] = parse_js_value(value)
+
+        if 'mediastring' not in js_vars:
+            raise ValueError('No mediastring assignment found')
+        video_url = js_vars['mediastring']
 
         title = self._search_regex(
             r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None)