From 300fc5a0536c20059a413fc925b901251a69d40e Mon Sep 17 00:00:00 2001 From: JChris246 <43832407+JChris246@users.noreply.github.com> Date: Sat, 2 Feb 2019 13:49:00 -0400 Subject: [PATCH 1/9] fixed category parsing --- youtube_dl/extractor/pornhub.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index be93d5d48..1804ba15d 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -307,10 +307,13 @@ class PornHubIE(PornHubBaseIE): r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P{[^}]+})', webpage, 'page parameters', group='data', default='{}'), video_id, transform_source=js_to_json, fatal=False) - tags = categories = None + tags = None if page_params: tags = page_params.get('tags', '').split(',') - categories = page_params.get('categories', '').split(',') + + categories = [] + for mobj in re.finditer(r']+Category[^>]*>([^<]+)', webpage): + categories.append(mobj.group(1)) return { 'id': video_id, From 77020f033b0ea43972e4e205153dc19fb18f0478 Mon Sep 17 00:00:00 2001 From: JChris246 <43832407+JChris246@users.noreply.github.com> Date: Sat, 2 Feb 2019 14:46:18 -0400 Subject: [PATCH 2/9] made requested changes changed regex for categories fixed parsing for tags --- youtube_dl/extractor/pornhub.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 1804ba15d..4312db656 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -16,7 +16,6 @@ from .openload import PhantomJSwrapper from ..utils import ( ExtractorError, int_or_none, - js_to_json, orderedSet, remove_quotes, str_to_int, @@ -303,17 +302,15 @@ class PornHubIE(PornHubBaseIE): comment_count = self._extract_count( r'All Comments\s*\(([\d,.]+)\)', webpage, 'comment') - page_params = self._parse_json(self._search_regex( - r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P{[^}]+})', - webpage, 'page parameters', group='data', default='{}'), - video_id, transform_source=js_to_json, fatal=False) - tags = None - if page_params: - tags = page_params.get('tags', '').split(',') - categories = [] - for mobj in re.finditer(r']+Category[^>]*>([^<]+)', webpage): - categories.append(mobj.group(1)) + cat_div = re.search(r'
\s+Categories: \s+([^\n]+)', webpage) + for a in re.finditer(r']+Category[^>]*>([^<]+)', cat_div.group(1)): + categories.append(a.group(1)) + + tags = [] + tag_div = re.search(r'
\s+Tags: \s+([^\n]+)', webpage) + for a in re.finditer(r']+>([^<]+)', tag_div.group(1)): + tags.append(a.group(1)) return { 'id': video_id, From eb6f5ea0b071e5d6e7947bdd3c610e097521e442 Mon Sep 17 00:00:00 2001 From: JChris246 <43832407+JChris246@users.noreply.github.com> Date: Sat, 2 Feb 2019 15:14:12 -0400 Subject: [PATCH 3/9] Fixed potential break when div not found --- youtube_dl/extractor/pornhub.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 4312db656..8bf4d9f62 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -159,6 +159,12 @@ class PornHubIE(PornHubBaseIE): return str_to_int(self._search_regex( pattern, webpage, '%s count' % name, fatal=False)) + def _get_text(self, str): + l = [] + for a in re.finditer(r']+>([^<]+)', str): + l.append(a.group(1)) + return l + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) host = mobj.group('host') or 'pornhub.com' @@ -302,15 +308,17 @@ class PornHubIE(PornHubBaseIE): comment_count = self._extract_count( r'All Comments\s*\(([\d,.]+)\)', webpage, 'comment') - categories = [] - cat_div = re.search(r'
\s+Categories: \s+([^\n]+)', webpage) - for a in re.finditer(r']+Category[^>]*>([^<]+)', cat_div.group(1)): - categories.append(a.group(1)) + div = re.search(r'
\s+[^\n]+\s+([^\n]+)', webpage) + if div: + categories = self._get_text(div.group(1)) + else: + categories = None - tags = [] - tag_div = re.search(r'
\s+Tags: \s+([^\n]+)', webpage) - for a in re.finditer(r']+>([^<]+)', tag_div.group(1)): - tags.append(a.group(1)) + div = re.search(r'
\s+Tags: \s+([^\n]+)', webpage) + if div: + tags = self._get_text(div.group(1)) + else: + tags = None return { 'id': video_id, From 32f6c118cb16caac01905f4c1bee661d6b6f48f2 Mon Sep 17 00:00:00 2001 From: JChris246 <43832407+JChris246@users.noreply.github.com> Date: Sat, 2 Feb 2019 15:20:14 -0400 Subject: [PATCH 4/9] improved function to grab categories and tags --- youtube_dl/extractor/pornhub.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 8bf4d9f62..1a2f07345 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -159,10 +159,12 @@ class PornHubIE(PornHubBaseIE): return str_to_int(self._search_regex( pattern, webpage, '%s count' % name, fatal=False)) - def _get_text(self, str): + def _get_text(self, str, page): l = [] - for a in re.finditer(r']+>([^<]+)', str): - l.append(a.group(1)) + div = re.search(r'
\s+[^\n]+\s+([^\n]+)', page) + if div: + for a in re.finditer(r']+>([^<]+)', div.group(1)): + l.append(a.group(1)) return l def _real_extract(self, url): @@ -308,17 +310,8 @@ class PornHubIE(PornHubBaseIE): comment_count = self._extract_count( r'All Comments\s*\(([\d,.]+)\)', webpage, 'comment') - div = re.search(r'
\s+[^\n]+\s+([^\n]+)', webpage) - if div: - categories = self._get_text(div.group(1)) - else: - categories = None - - div = re.search(r'
\s+Tags: \s+([^\n]+)', webpage) - if div: - tags = self._get_text(div.group(1)) - else: - tags = None + categories = self._get_text("categoriesWrapper", webpage) + tags = self._get_text("tagsWrapper", webpage) return { 'id': video_id, From 6102ae5602d10d466c2d3bb5a90065e64aa29054 Mon Sep 17 00:00:00 2001 From: JChris246 <43832407+JChris246@users.noreply.github.com> Date: Sat, 2 Feb 2019 15:27:16 -0400 Subject: [PATCH 5/9] Add files via upload --- youtube_dl/extractor/pornhub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 1a2f07345..bb1054406 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -161,7 +161,7 @@ class PornHubIE(PornHubBaseIE): def _get_text(self, str, page): l = [] - div = re.search(r'
\s+[^\n]+\s+([^\n]+)', page) + div = re.search(r'
\s+[^\n]+\s+([^\n]+)', page) if div: for a in re.finditer(r']+>([^<]+)', div.group(1)): l.append(a.group(1)) From 8089536058c208d402c747a7ee8ad86dd154feb9 Mon Sep 17 00:00:00 2001 From: JChris246 <43832407+JChris246@users.noreply.github.com> Date: Sat, 2 Feb 2019 15:39:39 -0400 Subject: [PATCH 6/9] optimised code --- youtube_dl/extractor/pornhub.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index bb1054406..7c350df4a 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -159,13 +159,13 @@ class PornHubIE(PornHubBaseIE): return str_to_int(self._search_regex( pattern, webpage, '%s count' % name, fatal=False)) - def _get_text(self, str, page): - l = [] - div = re.search(r'
\s+[^\n]+\s+([^\n]+)', page) + @staticmethod + def _get_text(class_name, page): + div = re.search(r'
\s+[^\n]+\s+([^\n]+)', page) if div: - for a in re.finditer(r']+>([^<]+)', div.group(1)): - l.append(a.group(1)) - return l + return [a.group(1) for a in re.finditer(r']+>([^<]+)', div.group(1))] + else: + return [] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From 54f503a83d2fd23d836c1d14a00a3caed7e9680a Mon Sep 17 00:00:00 2001 From: JChris246 <43832407+JChris246@users.noreply.github.com> Date: Sat, 2 Feb 2019 15:58:19 -0400 Subject: [PATCH 7/9] replaced finditer to findall --- youtube_dl/extractor/pornhub.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 7c350df4a..e9209d701 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -161,9 +161,9 @@ class PornHubIE(PornHubBaseIE): @staticmethod def _get_text(class_name, page): - div = re.search(r'
\s+[^\n]+\s+([^\n]+)', page) + div = re.findall(r'
\s+[^\n]+\s+([^\n]+)', page) if div: - return [a.group(1) for a in re.finditer(r']+>([^<]+)', div.group(1))] + return [a for a in re.findall(r']+>([^<]+)', div[0])] else: return [] From ee3a27d0365a22e623a43f9344988da252c532c5 Mon Sep 17 00:00:00 2001 From: JChris246 <43832407+JChris246@users.noreply.github.com> Date: Sat, 2 Feb 2019 16:33:54 -0400 Subject: [PATCH 8/9] made requested changes --- youtube_dl/extractor/pornhub.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index e9209d701..e16c10f97 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -159,14 +159,6 @@ class PornHubIE(PornHubBaseIE): return str_to_int(self._search_regex( pattern, webpage, '%s count' % name, fatal=False)) - @staticmethod - def _get_text(class_name, page): - div = re.findall(r'
\s+[^\n]+\s+([^\n]+)', page) - if div: - return [a for a in re.findall(r']+>([^<]+)', div[0])] - else: - return [] - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) host = mobj.group('host') or 'pornhub.com' @@ -310,8 +302,14 @@ class PornHubIE(PornHubBaseIE): comment_count = self._extract_count( r'All Comments\s*\(([\d,.]+)\)', webpage, 'comment') - categories = self._get_text("categoriesWrapper", webpage) - tags = self._get_text("tagsWrapper", webpage) + def _get_text(class_name, page): + div = re.search( + r'
\s+[^\n]+\s+([^\n]+)\s+[^\n]+\s+
', page) + if div: + return [a for a in re.findall(r'
]+>([^<]+)', div.group(1))] + + categories = _get_text('categoriesWrapper', webpage) + tags = _get_text('tagsWrapper', webpage) return { 'id': video_id, From 4c125b818166b532523c32260b23378d54dbbec8 Mon Sep 17 00:00:00 2001 From: JChris246 <43832407+JChris246@users.noreply.github.com> Date: Sat, 2 Feb 2019 17:10:39 -0400 Subject: [PATCH 9/9] made requested changes fixed div regex for tags and categories changed function name --- youtube_dl/extractor/pornhub.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index e16c10f97..428324ef0 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -302,14 +302,17 @@ class PornHubIE(PornHubBaseIE): comment_count = self._extract_count( r'All Comments\s*\(([\d,.]+)\)', webpage, 'comment') - def _get_text(class_name, page): - div = re.search( - r'
\s+[^\n]+\s+([^\n]+)\s+[^\n]+\s+
', page) + def _get_items(class_name): + div = self._search_regex( + r'
([\S\s]+?)
', + webpage, class_name, default=None) if div: - return [a for a in re.findall(r'
]+>([^<]+)', div.group(1))] + return [a for a in re.findall(r']+>([^<]+)', div)] + else: + return None - categories = _get_text('categoriesWrapper', webpage) - tags = _get_text('tagsWrapper', webpage) + categories = _get_items('categoriesWrapper') + tags = _get_items('tagsWrapper') return { 'id': video_id,