From 16797933bbf089b3a51c973abbd5a25741d162b6 Mon Sep 17 00:00:00 2001 From: motophil Date: Wed, 11 Jan 2017 20:21:04 +0100 Subject: [PATCH 01/11] [gaskrank] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/gaskrank.py | 47 ++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 youtube_dl/extractor/gaskrank.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 5ba8efb0e..937356d9a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -330,6 +330,7 @@ from .gameone import ( from .gamersyde import GamersydeIE from .gamespot import GameSpotIE from .gamestar import GameStarIE +from .gaskrank import GaskrankIE from .gazeta import GazetaIE from .gdcvault import GDCVaultIE from .generic import GenericIE diff --git a/youtube_dl/extractor/gaskrank.py b/youtube_dl/extractor/gaskrank.py new file mode 100644 index 000000000..3804b0833 --- /dev/null +++ b/youtube_dl/extractor/gaskrank.py @@ -0,0 +1,47 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from re import search +from re import findall +from .common import InfoExtractor + + +class GaskrankIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gaskrank\.tv/tv(?:/[^/]+)+/(?P[^/]+).htm' + _TEST = { + 'url': 'http://www.gaskrank.tv/tv/motorrad-fun/strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden.htm', + 'md5': '1ae88dbac97887d85ebd1157a95fc4f9', + 'info_dict': { + 'id': '201601/26955', + 'ext': 'mp4', + 'title': 'Strike! Einparken können nur Männer - Flurschaden hält sich in Grenzen *lol*', + 'thumbnail': r're:^https?://.*\.jpg$', + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex(r'https?://movies.gaskrank.tv/([^-]*?).mp4', webpage, 'video id') + playlist = self._search_regex(r'playlist:\s*\[\s*{\s*([\s\S]*(?!}\s*]))', webpage, 'video id') + entries = findall(r'[0-9]+:\s*{[\s\S]*?}', playlist) + formats = [] + for entry in entries: + format = dict() + format['url'] = search(r'src:[\s]*\"([^\"]*)\"', entry).group(1) + format['format_id'] = search(r'([0-9]+):\s*{[\s\S]*?}', entry).group(1) + format['quality'] = search(r'quality:[\s]*\"([^\"]*)\"', entry).group(1) + format['resolution'] = format['quality'] + formats.append(format) + title = self._html_search_regex(r'movieName: *\'([^\']*)\'', webpage, 'title') + thumbnail = self._html_search_regex(r'poster: *\'([^\']*)\'', webpage, 'thumbnail') + categories = search(r'https?://(?:www\.)?gaskrank\.tv/tv(?:/([^/]+))+/[^/]+.htm', url).group(1) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'categories': categories, + 'display_id': display_id, + } From 5de648f0199f8249efaf187e70a0194867ac0a5c Mon Sep 17 00:00:00 2001 From: motophil Date: Fri, 20 Jan 2017 19:36:21 +0100 Subject: [PATCH 02/11] [gaskrank] Add new extractor - fixes as requested --- youtube_dl/extractor/gaskrank.py | 34 +++++++++++++++++--------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/gaskrank.py b/youtube_dl/extractor/gaskrank.py index 3804b0833..db8e9be93 100644 --- a/youtube_dl/extractor/gaskrank.py +++ b/youtube_dl/extractor/gaskrank.py @@ -1,16 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -from re import search -from re import findall +from re import sub from .common import InfoExtractor - +from ..utils import js_to_json class GaskrankIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?gaskrank\.tv/tv(?:/[^/]+)+/(?P[^/]+).htm' _TEST = { 'url': 'http://www.gaskrank.tv/tv/motorrad-fun/strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden.htm', - 'md5': '1ae88dbac97887d85ebd1157a95fc4f9', + 'md5': '200e28a405f6919b914a83f8adfc5739', 'info_dict': { 'id': '201601/26955', 'ext': 'mp4', @@ -20,22 +19,25 @@ class GaskrankIE(InfoExtractor): } def _real_extract(self, url): + def fix_json(code): + return sub(r'}[\s]*?,[\s]*?}', r'}}', js_to_json(code)) + display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) video_id = self._search_regex(r'https?://movies.gaskrank.tv/([^-]*?).mp4', webpage, 'video id') - playlist = self._search_regex(r'playlist:\s*\[\s*{\s*([\s\S]*(?!}\s*]))', webpage, 'video id') - entries = findall(r'[0-9]+:\s*{[\s\S]*?}', playlist) + categories = self._search_regex(r'https?://(?:www\.)?gaskrank\.tv/tv(?:/([^/]+))+/[^/]+.htm', url, 'categories') + title = self._search_regex(r'movieName[\s\S]*?\'([^\']*?)\'', webpage, 'config', default='{}') + thumbnail = self._search_regex(r'poster[\s\S]*?\'([^\']*?)\'', webpage, 'config', default='{}') + playlist = self._parse_json( + self._search_regex(r'playlist:[\s\S]*?\[([\s\S]*?)]', webpage, 'config', default='{}'), + video_id, transform_source=fix_json, fatal=False) formats = [] - for entry in entries: - format = dict() - format['url'] = search(r'src:[\s]*\"([^\"]*)\"', entry).group(1) - format['format_id'] = search(r'([0-9]+):\s*{[\s\S]*?}', entry).group(1) - format['quality'] = search(r'quality:[\s]*\"([^\"]*)\"', entry).group(1) - format['resolution'] = format['quality'] - formats.append(format) - title = self._html_search_regex(r'movieName: *\'([^\']*)\'', webpage, 'title') - thumbnail = self._html_search_regex(r'poster: *\'([^\']*)\'', webpage, 'thumbnail') - categories = search(r'https?://(?:www\.)?gaskrank\.tv/tv(?:/([^/]+))+/[^/]+.htm', url).group(1) + for key in playlist: + formats.append({ + 'url': playlist[key]['src'], + 'format_id': key, + 'quality': playlist[key]['quality'], + 'resolution': playlist[key]['quality']}) return { 'id': video_id, From 274f73b9df0c9366b2fccd5140aceeaad80361fa Mon Sep 17 00:00:00 2001 From: motophil Date: Fri, 20 Jan 2017 19:46:40 +0100 Subject: [PATCH 03/11] [gaskrank] Add new extractor - style fix --- youtube_dl/extractor/gaskrank.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/gaskrank.py b/youtube_dl/extractor/gaskrank.py index db8e9be93..c40d35772 100644 --- a/youtube_dl/extractor/gaskrank.py +++ b/youtube_dl/extractor/gaskrank.py @@ -5,6 +5,7 @@ from re import sub from .common import InfoExtractor from ..utils import js_to_json + class GaskrankIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?gaskrank\.tv/tv(?:/[^/]+)+/(?P[^/]+).htm' _TEST = { From 015e4edad0b551b984302e11a4f93106c4896c65 Mon Sep 17 00:00:00 2001 From: motophil Date: Sat, 21 Jan 2017 13:47:52 +0100 Subject: [PATCH 04/11] [Gaskrank] Add new extractor - requested fixes --- youtube_dl/extractor/gaskrank.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/gaskrank.py b/youtube_dl/extractor/gaskrank.py index c40d35772..d4e7cf5f0 100644 --- a/youtube_dl/extractor/gaskrank.py +++ b/youtube_dl/extractor/gaskrank.py @@ -1,13 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals -from re import sub +import re from .common import InfoExtractor from ..utils import js_to_json class GaskrankIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gaskrank\.tv/tv(?:/[^/]+)+/(?P[^/]+).htm' + _VALID_URL = r'https?://(?:www\.)?gaskrank\.tv/tv(?:/[^/]+)+/(?P[^/]+)\.htm' _TEST = { 'url': 'http://www.gaskrank.tv/tv/motorrad-fun/strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden.htm', 'md5': '200e28a405f6919b914a83f8adfc5739', @@ -21,24 +21,24 @@ class GaskrankIE(InfoExtractor): def _real_extract(self, url): def fix_json(code): - return sub(r'}[\s]*?,[\s]*?}', r'}}', js_to_json(code)) + return re.sub(r'}[\s]*?,[\s]*?}', r'}}', js_to_json(code)) display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id = self._search_regex(r'https?://movies.gaskrank.tv/([^-]*?).mp4', webpage, 'video id') - categories = self._search_regex(r'https?://(?:www\.)?gaskrank\.tv/tv(?:/([^/]+))+/[^/]+.htm', url, 'categories') - title = self._search_regex(r'movieName[\s\S]*?\'([^\']*?)\'', webpage, 'config', default='{}') - thumbnail = self._search_regex(r'poster[\s\S]*?\'([^\']*?)\'', webpage, 'config', default='{}') + video_id = self._search_regex(r'https?://movies\.gaskrank\.tv/([^-]*?)\.mp4', webpage, 'video id') + categories = self._search_regex(r'https?://(?:www\.)?gaskrank\.tv/tv(?:/([^/]+))+/[^/]+\.htm', url, 'categories', default=None) + title = self._search_regex(r'movieName[^\']*?\'([^\']*?)\'', webpage, 'title') + thumbnail = self._search_regex(r'poster[^\']*?\'([^\']*?)\'', webpage, 'thumbnail', default=None) playlist = self._parse_json( - self._search_regex(r'playlist:[\s\S]*?\[([\s\S]*?)]', webpage, 'config', default='{}'), + self._search_regex(r'playlist:[\s\S]*?\[([\s\S]*?)]', webpage, 'playlist', default='{}'), video_id, transform_source=fix_json, fatal=False) formats = [] - for key in playlist: + for key in sorted(playlist): formats.append({ 'url': playlist[key]['src'], 'format_id': key, - 'quality': playlist[key]['quality'], - 'resolution': playlist[key]['quality']}) + 'quality': playlist[key].get('quality'), + 'resolution': playlist[key].get('quality')}) return { 'id': video_id, From b7c7d29085f3d42cb6ad5bcefae37da7a7f54e33 Mon Sep 17 00:00:00 2001 From: motophil Date: Sun, 22 Jan 2017 02:34:39 +0100 Subject: [PATCH 05/11] [Gaskrank] Add new extractor - fix md5 checksum --- youtube_dl/extractor/gaskrank.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gaskrank.py b/youtube_dl/extractor/gaskrank.py index d4e7cf5f0..3fcd56ca0 100644 --- a/youtube_dl/extractor/gaskrank.py +++ b/youtube_dl/extractor/gaskrank.py @@ -10,7 +10,7 @@ class GaskrankIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?gaskrank\.tv/tv(?:/[^/]+)+/(?P[^/]+)\.htm' _TEST = { 'url': 'http://www.gaskrank.tv/tv/motorrad-fun/strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden.htm', - 'md5': '200e28a405f6919b914a83f8adfc5739', + 'md5': '1ae88dbac97887d85ebd1157a95fc4f9', 'info_dict': { 'id': '201601/26955', 'ext': 'mp4', From 2c88df8d543297daaf5126a8b1cb3ec62434de7b Mon Sep 17 00:00:00 2001 From: motophil Date: Sun, 22 Jan 2017 18:07:08 +0100 Subject: [PATCH 06/11] [gaskrank] Add new extractor - more requested fixes --- youtube_dl/extractor/gaskrank.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/gaskrank.py b/youtube_dl/extractor/gaskrank.py index 3fcd56ca0..aa99f8dd0 100644 --- a/youtube_dl/extractor/gaskrank.py +++ b/youtube_dl/extractor/gaskrank.py @@ -7,7 +7,7 @@ from ..utils import js_to_json class GaskrankIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gaskrank\.tv/tv(?:/[^/]+)+/(?P[^/]+)\.htm' + _VALID_URL = r'https?://(?:www\.)?gaskrank\.tv/tv/(?P[^/]+)/(?P[^/]+)\.html?' _TEST = { 'url': 'http://www.gaskrank.tv/tv/motorrad-fun/strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden.htm', 'md5': '1ae88dbac97887d85ebd1157a95fc4f9', @@ -16,6 +16,8 @@ class GaskrankIE(InfoExtractor): 'ext': 'mp4', 'title': 'Strike! Einparken können nur Männer - Flurschaden hält sich in Grenzen *lol*', 'thumbnail': r're:^https?://.*\.jpg$', + 'categories': ['motorrad-fun'], + 'display_id': 'strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden', } } @@ -25,20 +27,20 @@ class GaskrankIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id = self._search_regex(r'https?://movies\.gaskrank\.tv/([^-]*?)\.mp4', webpage, 'video id') - categories = self._search_regex(r'https?://(?:www\.)?gaskrank\.tv/tv(?:/([^/]+))+/[^/]+\.htm', url, 'categories', default=None) - title = self._search_regex(r'movieName[^\']*?\'([^\']*?)\'', webpage, 'title') - thumbnail = self._search_regex(r'poster[^\']*?\'([^\']*?)\'', webpage, 'thumbnail', default=None) + categories = [re.match(self._VALID_URL, url).group('categories')] + title = self._search_regex(r'movieName\s*:\s*\'([^\']*)\'', webpage, 'title') + thumbnail = self._search_regex(r'poster\s*:\s*\'([^\']*)\'', webpage, 'thumbnail', default=None) playlist = self._parse_json( - self._search_regex(r'playlist:[\s\S]*?\[([\s\S]*?)]', webpage, 'playlist', default='{}'), - video_id, transform_source=fix_json, fatal=False) + self._search_regex(r'playlist\s*:\s*\[([^\]]*)\]', webpage, 'playlist', default='{}'), + display_id, transform_source=fix_json, fatal=False) + video_id = self._search_regex(r'https?://movies\.gaskrank\.tv/([^-]*?)(-[^\.]*)?\.mp4', playlist.get('0').get('src'), 'video id') formats = [] - for key in sorted(playlist): + for key in playlist: formats.append({ 'url': playlist[key]['src'], 'format_id': key, - 'quality': playlist[key].get('quality'), - 'resolution': playlist[key].get('quality')}) + 'quality': playlist[key].get('quality')}) + self._sort_formats(formats, field_preference=['format_id']) return { 'id': video_id, From c08326ccdf467acf534315922f876724c66a3ab9 Mon Sep 17 00:00:00 2001 From: motophil Date: Sun, 22 Jan 2017 19:51:40 +0100 Subject: [PATCH 07/11] [Gaskrank] Add new extractor - fixed all but one quantified code issues --- youtube_dl/extractor/gaskrank.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gaskrank.py b/youtube_dl/extractor/gaskrank.py index aa99f8dd0..8ece658bf 100644 --- a/youtube_dl/extractor/gaskrank.py +++ b/youtube_dl/extractor/gaskrank.py @@ -7,6 +7,7 @@ from ..utils import js_to_json class GaskrankIE(InfoExtractor): + """InfoExtractor for gaskrank.tv""" _VALID_URL = r'https?://(?:www\.)?gaskrank\.tv/tv/(?P[^/]+)/(?P[^/]+)\.html?' _TEST = { 'url': 'http://www.gaskrank.tv/tv/motorrad-fun/strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden.htm', @@ -22,8 +23,10 @@ class GaskrankIE(InfoExtractor): } def _real_extract(self, url): + """extract information from gaskrank.tv""" def fix_json(code): - return re.sub(r'}[\s]*?,[\s]*?}', r'}}', js_to_json(code)) + """Removes trailing comma in json: {{},} --> {{}}""" + return re.sub(r',[\s]*?}', r'}', js_to_json(code)) display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) From 7c79b050780b204c354b52351d2f885e911bc621 Mon Sep 17 00:00:00 2001 From: motophil Date: Mon, 23 Jan 2017 23:13:21 +0100 Subject: [PATCH 08/11] [Gaskrank] add new extractor - more fields extracted, added second test --- youtube_dl/extractor/gaskrank.py | 70 +++++++++++++++++++++++++++----- 1 file changed, 59 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/gaskrank.py b/youtube_dl/extractor/gaskrank.py index 8ece658bf..d3ce4df2f 100644 --- a/youtube_dl/extractor/gaskrank.py +++ b/youtube_dl/extractor/gaskrank.py @@ -9,18 +9,40 @@ from ..utils import js_to_json class GaskrankIE(InfoExtractor): """InfoExtractor for gaskrank.tv""" _VALID_URL = r'https?://(?:www\.)?gaskrank\.tv/tv/(?P[^/]+)/(?P[^/]+)\.html?' - _TEST = { - 'url': 'http://www.gaskrank.tv/tv/motorrad-fun/strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden.htm', - 'md5': '1ae88dbac97887d85ebd1157a95fc4f9', - 'info_dict': { - 'id': '201601/26955', - 'ext': 'mp4', - 'title': 'Strike! Einparken können nur Männer - Flurschaden hält sich in Grenzen *lol*', - 'thumbnail': r're:^https?://.*\.jpg$', - 'categories': ['motorrad-fun'], - 'display_id': 'strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden', + _TESTS = [ + { + 'url': 'http://www.gaskrank.tv/tv/motorrad-fun/strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden.htm', + 'md5': '1ae88dbac97887d85ebd1157a95fc4f9', + 'info_dict': { + 'id': '201601/26955', + 'ext': 'mp4', + 'title': 'Strike! Einparken können nur Männer - Flurschaden hält sich in Grenzen *lol*', + 'thumbnail': r're:^https?://.*\.jpg$', + 'categories': ['motorrad-fun'], + 'display_id': 'strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden', + 'uploader_id': 'Bikefun', + 'upload_date': '20170110', + 'uploader_url': None, + 'tags': ['honkj', 'strike', 'idiot', 'depp', 'fahrschule', 'einparken', 'parken', 'autounfall', 'unfall', 'flurschaden', 'crash', 'accident', 'fail', 'rueckwaerts', 'umfaller'], + } + }, + { + 'url': 'http://www.gaskrank.tv/tv/racing/isle-of-man-tt-2011-michael-du-15920.htm', + 'md5': 'c33ee32c711bc6c8224bfcbe62b23095', + 'info_dict': { + 'id': '201106/15920', + 'ext': 'mp4', + 'title': 'Isle of Man - Michael Dunlop vs Guy Martin - schwindelig kucken', + 'thumbnail': r're:^https?://.*\.jpg$', + 'categories': ['racing'], + 'display_id': 'isle-of-man-tt-2011-michael-du-15920', + 'uploader_id': 'IOM', + 'upload_date': '20160506', + 'uploader_url': 'www.iomtt.com', + 'tags': ['schwindelig', 'iom', 'isle of man', 'guy martin', 'glen helen', 'michael dunlop', 'tt 2011', 'attacke'], + } } - } + ] def _real_extract(self, url): """extract information from gaskrank.tv""" @@ -33,10 +55,30 @@ class GaskrankIE(InfoExtractor): categories = [re.match(self._VALID_URL, url).group('categories')] title = self._search_regex(r'movieName\s*:\s*\'([^\']*)\'', webpage, 'title') thumbnail = self._search_regex(r'poster\s*:\s*\'([^\']*)\'', webpage, 'thumbnail', default=None) + + mobj = re.search(r'Video von:\s*(?P[^|]*?)\s*\|\s*vom:\s*(?P[0-9][0-9]).(?P[0-9][0-9]).(?P[0-9][0-9][0-9][0-9])', webpage) + uploader_id = mobj.groupdict().get('uploader_id') + upload_date = mobj.groupdict().get('upload_date_year') + mobj.groupdict().get('upload_date_month') + mobj.groupdict().get('upload_date_day') + if len(upload_date) != 8: + upload_date = None + + uploader_url = self._search_regex(r'Homepage:\s*<[^>]*?>(?P[^<]*)', webpage, 'uploader_url', default=None) + tags = re.findall(r'/tv/tags/[^/]*?/\"\s*>(?P[^<]*?)<', webpage) + + view_count = self._search_regex(r'class\s*=\s*\"gkRight\"(?:[^>]*>\s*<[^>]*)*icon-eye-open(?:[^>]*>\s*<[^>]*)*>\s*(?P[0-9\.]*)', webpage, 'view_count') + if view_count: + view_count = int(view_count.replace('.', '')) + + average_rating = self._search_regex(r'itemprop\s*=\s*\"ratingValue\"\s*>\s*(?P[0-9,]*)', webpage, 'average_rating') + if average_rating: + average_rating = float(average_rating.replace(',', '.')) + playlist = self._parse_json( self._search_regex(r'playlist\s*:\s*\[([^\]]*)\]', webpage, 'playlist', default='{}'), display_id, transform_source=fix_json, fatal=False) + video_id = self._search_regex(r'https?://movies\.gaskrank\.tv/([^-]*?)(-[^\.]*)?\.mp4', playlist.get('0').get('src'), 'video id') + formats = [] for key in playlist: formats.append({ @@ -52,4 +94,10 @@ class GaskrankIE(InfoExtractor): 'thumbnail': thumbnail, 'categories': categories, 'display_id': display_id, + 'uploader_id': uploader_id, + 'upload_date': upload_date, + 'uploader_url': uploader_url, + 'tags': tags, + 'view_count': view_count, + 'average_rating': average_rating, } From fb71f438cda581aa9f8d73123dd270512202215d Mon Sep 17 00:00:00 2001 From: motophil Date: Wed, 1 Feb 2017 12:15:00 +0100 Subject: [PATCH 09/11] [Gaskrank] Add new extractor - requested fixes. --- youtube_dl/extractor/gaskrank.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/gaskrank.py b/youtube_dl/extractor/gaskrank.py index d3ce4df2f..1c0732da5 100644 --- a/youtube_dl/extractor/gaskrank.py +++ b/youtube_dl/extractor/gaskrank.py @@ -57,10 +57,11 @@ class GaskrankIE(InfoExtractor): thumbnail = self._search_regex(r'poster\s*:\s*\'([^\']*)\'', webpage, 'thumbnail', default=None) mobj = re.search(r'Video von:\s*(?P[^|]*?)\s*\|\s*vom:\s*(?P[0-9][0-9]).(?P[0-9][0-9]).(?P[0-9][0-9][0-9][0-9])', webpage) - uploader_id = mobj.groupdict().get('uploader_id') - upload_date = mobj.groupdict().get('upload_date_year') + mobj.groupdict().get('upload_date_month') + mobj.groupdict().get('upload_date_day') - if len(upload_date) != 8: - upload_date = None + if mobj is not None: + uploader_id = mobj.groupdict().get('uploader_id') + upload_date = mobj.groupdict().get('upload_date_year') + mobj.groupdict().get('upload_date_month') + mobj.groupdict().get('upload_date_day') + if len(upload_date) != 8: + upload_date = None uploader_url = self._search_regex(r'Homepage:\s*<[^>]*?>(?P[^<]*)', webpage, 'uploader_url', default=None) tags = re.findall(r'/tv/tags/[^/]*?/\"\s*>(?P[^<]*?)<', webpage) From bac2ef6b5a75c3efb54c1e43d2e120295847802a Mon Sep 17 00:00:00 2001 From: motophil Date: Fri, 3 Feb 2017 23:52:20 +0100 Subject: [PATCH 10/11] [Gaskrank] Add new extractor - requested changes. --- youtube_dl/extractor/gaskrank.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/gaskrank.py b/youtube_dl/extractor/gaskrank.py index 1c0732da5..cf80670d6 100644 --- a/youtube_dl/extractor/gaskrank.py +++ b/youtube_dl/extractor/gaskrank.py @@ -3,7 +3,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import float_or_none +from ..utils import int_or_none from ..utils import js_to_json +from ..utils import unified_strdate class GaskrankIE(InfoExtractor): @@ -23,7 +26,6 @@ class GaskrankIE(InfoExtractor): 'uploader_id': 'Bikefun', 'upload_date': '20170110', 'uploader_url': None, - 'tags': ['honkj', 'strike', 'idiot', 'depp', 'fahrschule', 'einparken', 'parken', 'autounfall', 'unfall', 'flurschaden', 'crash', 'accident', 'fail', 'rueckwaerts', 'umfaller'], } }, { @@ -39,7 +41,6 @@ class GaskrankIE(InfoExtractor): 'uploader_id': 'IOM', 'upload_date': '20160506', 'uploader_url': 'www.iomtt.com', - 'tags': ['schwindelig', 'iom', 'isle of man', 'guy martin', 'glen helen', 'michael dunlop', 'tt 2011', 'attacke'], } } ] @@ -48,7 +49,7 @@ class GaskrankIE(InfoExtractor): """extract information from gaskrank.tv""" def fix_json(code): """Removes trailing comma in json: {{},} --> {{}}""" - return re.sub(r',[\s]*?}', r'}', js_to_json(code)) + return re.sub(r',[\s]*}', r'}', js_to_json(code)) display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) @@ -56,23 +57,21 @@ class GaskrankIE(InfoExtractor): title = self._search_regex(r'movieName\s*:\s*\'([^\']*)\'', webpage, 'title') thumbnail = self._search_regex(r'poster\s*:\s*\'([^\']*)\'', webpage, 'thumbnail', default=None) - mobj = re.search(r'Video von:\s*(?P[^|]*?)\s*\|\s*vom:\s*(?P[0-9][0-9]).(?P[0-9][0-9]).(?P[0-9][0-9][0-9][0-9])', webpage) + mobj = re.search(r'Video von:\s*(?P[^|]*?)\s*\|\s*vom:\s*(?P[0-9][0-9]\.[0-9][0-9]\.[0-9][0-9][0-9][0-9])', webpage) if mobj is not None: uploader_id = mobj.groupdict().get('uploader_id') - upload_date = mobj.groupdict().get('upload_date_year') + mobj.groupdict().get('upload_date_month') + mobj.groupdict().get('upload_date_day') - if len(upload_date) != 8: - upload_date = None + upload_date = unified_strdate(mobj.groupdict().get('upload_date')) - uploader_url = self._search_regex(r'Homepage:\s*<[^>]*?>(?P[^<]*)', webpage, 'uploader_url', default=None) - tags = re.findall(r'/tv/tags/[^/]*?/\"\s*>(?P[^<]*?)<', webpage) + uploader_url = self._search_regex(r'Homepage:\s*<[^>]*>(?P[^<]*)', webpage, 'uploader_url', default=None) + tags = re.findall(r'/tv/tags/[^/]+/"\s*>(?P[^<]*?)<', webpage) - view_count = self._search_regex(r'class\s*=\s*\"gkRight\"(?:[^>]*>\s*<[^>]*)*icon-eye-open(?:[^>]*>\s*<[^>]*)*>\s*(?P[0-9\.]*)', webpage, 'view_count') + view_count = self._search_regex(r'class\s*=\s*"gkRight"(?:[^>]*>\s*<[^>]*)*icon-eye-open(?:[^>]*>\s*<[^>]*)*>\s*(?P[0-9\.]*)', webpage, 'view_count', default=None) if view_count: - view_count = int(view_count.replace('.', '')) + view_count = int_or_none(view_count.replace('.', '')) - average_rating = self._search_regex(r'itemprop\s*=\s*\"ratingValue\"\s*>\s*(?P[0-9,]*)', webpage, 'average_rating') + average_rating = self._search_regex(r'itemprop\s*=\s*"ratingValue"[^>]*>\s*(?P[0-9,]+)', webpage, 'average_rating') if average_rating: - average_rating = float(average_rating.replace(',', '.')) + average_rating = float_or_none(average_rating.replace(',', '.')) playlist = self._parse_json( self._search_regex(r'playlist\s*:\s*\[([^\]]*)\]', webpage, 'playlist', default='{}'), From 9be420c61c567966c60e7c39555aaa5195fe2a1a Mon Sep 17 00:00:00 2001 From: motophil Date: Sun, 5 Feb 2017 16:45:57 +0100 Subject: [PATCH 11/11] [Gaskrank] Add new extractor - final(?) fixes. --- youtube_dl/extractor/gaskrank.py | 38 +++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/gaskrank.py b/youtube_dl/extractor/gaskrank.py index cf80670d6..972b47bf2 100644 --- a/youtube_dl/extractor/gaskrank.py +++ b/youtube_dl/extractor/gaskrank.py @@ -49,35 +49,53 @@ class GaskrankIE(InfoExtractor): """extract information from gaskrank.tv""" def fix_json(code): """Removes trailing comma in json: {{},} --> {{}}""" - return re.sub(r',[\s]*}', r'}', js_to_json(code)) + return re.sub(r',\s*}', r'}', js_to_json(code)) display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) categories = [re.match(self._VALID_URL, url).group('categories')] - title = self._search_regex(r'movieName\s*:\s*\'([^\']*)\'', webpage, 'title') - thumbnail = self._search_regex(r'poster\s*:\s*\'([^\']*)\'', webpage, 'thumbnail', default=None) + title = self._search_regex( + r'movieName\s*:\s*\'([^\']*)\'', + webpage, 'title') + thumbnail = self._search_regex( + r'poster\s*:\s*\'([^\']*)\'', + webpage, 'thumbnail', default=None) - mobj = re.search(r'Video von:\s*(?P[^|]*?)\s*\|\s*vom:\s*(?P[0-9][0-9]\.[0-9][0-9]\.[0-9][0-9][0-9][0-9])', webpage) + mobj = re.search( + r'Video von:\s*(?P[^|]*?)\s*\|\s*vom:\s*(?P[0-9][0-9]\.[0-9][0-9]\.[0-9][0-9][0-9][0-9])', + webpage) if mobj is not None: uploader_id = mobj.groupdict().get('uploader_id') upload_date = unified_strdate(mobj.groupdict().get('upload_date')) - uploader_url = self._search_regex(r'Homepage:\s*<[^>]*>(?P[^<]*)', webpage, 'uploader_url', default=None) - tags = re.findall(r'/tv/tags/[^/]+/"\s*>(?P[^<]*?)<', webpage) + uploader_url = self._search_regex( + r'Homepage:\s*<[^>]*>(?P[^<]*)', + webpage, 'uploader_url', default=None) + tags = re.findall( + r'/tv/tags/[^/]+/"\s*>(?P[^<]*?)<', + webpage) - view_count = self._search_regex(r'class\s*=\s*"gkRight"(?:[^>]*>\s*<[^>]*)*icon-eye-open(?:[^>]*>\s*<[^>]*)*>\s*(?P[0-9\.]*)', webpage, 'view_count', default=None) + view_count = self._search_regex( + r'class\s*=\s*"gkRight"(?:[^>]*>\s*<[^>]*)*icon-eye-open(?:[^>]*>\s*<[^>]*)*>\s*(?P[0-9\.]*)', + webpage, 'view_count', default=None) if view_count: view_count = int_or_none(view_count.replace('.', '')) - average_rating = self._search_regex(r'itemprop\s*=\s*"ratingValue"[^>]*>\s*(?P[0-9,]+)', webpage, 'average_rating') + average_rating = self._search_regex( + r'itemprop\s*=\s*"ratingValue"[^>]*>\s*(?P[0-9,]+)', + webpage, 'average_rating') if average_rating: average_rating = float_or_none(average_rating.replace(',', '.')) playlist = self._parse_json( - self._search_regex(r'playlist\s*:\s*\[([^\]]*)\]', webpage, 'playlist', default='{}'), + self._search_regex( + r'playlist\s*:\s*\[([^\]]*)\]', + webpage, 'playlist', default='{}'), display_id, transform_source=fix_json, fatal=False) - video_id = self._search_regex(r'https?://movies\.gaskrank\.tv/([^-]*?)(-[^\.]*)?\.mp4', playlist.get('0').get('src'), 'video id') + video_id = self._search_regex( + r'https?://movies\.gaskrank\.tv/([^-]*?)(-[^\.]*)?\.mp4', + playlist.get('0').get('src'), 'video id') formats = [] for key in playlist: