From 9d334aedc8e7e0497991cf13f40df9625efd8497 Mon Sep 17 00:00:00 2001
From: Mathias Rav
Date: Sat, 3 Jan 2015 09:05:34 +0100
Subject: [PATCH] [WatchTvSeries] Add new extractor for watch-tv-series.to

---
 youtube_dl/extractor/__init__.py        |   4 +
 youtube_dl/extractor/watch_tv_series.py | 108 ++++++++++++++++++++++++
 2 files changed, 112 insertions(+)
 create mode 100644 youtube_dl/extractor/watch_tv_series.py

diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 9ccd1b32e..a2ffdb613 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -504,6 +504,10 @@ from .vulture import VultureIE
 from .walla import WallaIE
 from .washingtonpost import WashingtonPostIE
 from .wat import WatIE
+from .watch_tv_series import (
+    WatchTvSeriesSeasonIE,
+    WatchTvSeriesEpisodeIE,
+)
 from .wayofthemaster import WayOfTheMasterIE
 from .wdr import (
     WDRIE,
diff --git a/youtube_dl/extractor/watch_tv_series.py b/youtube_dl/extractor/watch_tv_series.py
new file mode 100644
index 000000000..3d18fc2dd
--- /dev/null
+++ b/youtube_dl/extractor/watch_tv_series.py
@@ -0,0 +1,108 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import clean_html, ExtractorError
+
+try:
+    import microdata
+except ImportError:
+    microdata = None
+
+
+class WatchTvSeriesSeasonIE(InfoExtractor):
+    domain_base = 'http://watch-tv-series.to/'
+    _VALID_URL = domain_base + r'(?P<path>season-(?P<season>\d+)/[a-z]+)'
+
+    def extract_microdata(self):
+        item, = microdata.get_items(self.content)
+        name = item.name
+        playlist_title = '%s - Season %s' % (name, self.season)
+        playlist_description = item.description
+
+        season = item.season
+        episodes = sorted(
+            season.get_all('episode'), key=lambda ep: ep.datepublished)
+
+        entries = [
+            self.url_result(self.domain_base + ep.url.lstrip('/'),
+                            'WatchTvSeriesEpisode')
+            for ep in episodes
+        ]
+
+        return self.playlist_result(
+            entries, playlist_id=item.url, playlist_title=playlist_title,
+            playlist_description=playlist_description)
+
+    def extract_strings(self):
+        name = self._search_regex(
+            r'itemprop="name"[^>]*>([^<]+)',
+            self.content, 'show name')
+
+        playlist_title = '%s - Season %s' % (name, self.season)
+
+        data = {}
+        entries = []
+
+        # This parsing is sensitive to the order of HTML attributes in the
+        # content of the page, but it works for now.
+        itemprop_regex = r'itemprop="([a-z]+)"(?: content="([^"]+)"|>([^<]+))'
+
+        for mobj in re.finditer(itemprop_regex, self.content):
+            key = mobj.group(1)
+            value = mobj.group(2) or mobj.group(3)
+            value = clean_html(value).strip()
+            data[key] = value
+            if key == 'name' and data.get('url', '').startswith('/episode'):
+                url = self.domain_base + data['url'].lstrip('/')
+                entries.append(self.url_result(url, 'WatchTvSeriesEpisode'))
+
+        # Episodes are displayed with the latest first,
+        # but we want to retrieve the earliest first.
+        entries.reverse()
+
+        return self.playlist_result(
+            entries, playlist_id=self.path, playlist_title=playlist_title)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        self.path = mobj.group('path')
+        self.season = mobj.group('season')
+        self.content = self._download_webpage(url, self.path)
+        if microdata:
+            return self.extract_microdata()
+        else:
+            return self.extract_strings()
+
+
+class WatchTvSeriesEpisodeIE(InfoExtractor):
+    domain_base = 'http://watch-tv-series.to/'
+    _VALID_URL = r'http://watch-tv-series\.to/(?P<path>episode/[a-z_0-9]+\.html)'
+
+    def _real_extract(self, url):
+        # There is microdata on this page, but we don't need it,
+        # since we will return a url_result anyway.
+
+        mobj = re.match(self._VALID_URL, url)
+        path = mobj.group('path')
+
+        content = self._download_webpage(url, path)
+
+        movshare_regex = (
+            r'href="/([^"]+)" class="buttonlink" title="movshare.net"')
+
+        mobj = re.search(movshare_regex, content)
+        if mobj is None:
+            raise ExtractorError(
+                'Unable to extract movshare.net link. ' +
+                'No other video sites are supported.',
+                expected=True)
+
+        external_link = self._download_webpage(
+            self.domain_base + mobj.group(1), path + '[external]')
+
+        mobj = re.search(
+            r'http://www\.movshare\.net/video/[a-z0-9]+', external_link)
+
+        return self.url_result(mobj.group(0))
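
A minimal sketch of how the two extractors could be exercised through
youtube-dl's Python API once the patch is applied. The season URL below is a
hypothetical example matching the 'season-(?P<season>\d+)/[a-z]+' path shape,
not a verified page on watch-tv-series.to:

    from __future__ import print_function, unicode_literals

    import youtube_dl

    # WatchTvSeriesSeasonIE returns a playlist of url_result entries, one per
    # episode; youtube-dl then resolves each entry via WatchTvSeriesEpisodeIE,
    # which in turn points at a movshare.net video URL.
    ydl_opts = {'skip_download': True, 'quiet': True}
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(
            'http://watch-tv-series.to/season-1/example',  # hypothetical URL
            download=False)
        for entry in info.get('entries', []):
            print(entry.get('title'), entry.get('webpage_url'))

The same season page is handled whether or not the optional microdata package
is installed; extract_strings() is only the fallback used when the import
fails, so the sketch above does not depend on it.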