From 69ed8da71871a6e262480100c12a02e9d0b9b237 Mon Sep 17 00:00:00 2001
From: Vladimir K Urushev
Date: Wed, 17 May 2017 05:43:51 +0300
Subject: [PATCH] [asbook] Add new extractor

---
 youtube_dl/extractor/asbook.py     | 109 +++++++++++++++++++++++++++++
 youtube_dl/extractor/extractors.py |   1 +
 2 files changed, 110 insertions(+)
 create mode 100644 youtube_dl/extractor/asbook.py

diff --git a/youtube_dl/extractor/asbook.py b/youtube_dl/extractor/asbook.py
new file mode 100644
index 000000000..713f1db2a
--- /dev/null
+++ b/youtube_dl/extractor/asbook.py
@@ -0,0 +1,109 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    decode_packed_codes,
+    encode_base_n,
+    ExtractorError,
+    PACKED_CODES_RE,
+)
+
+
+class AsBookIE(InfoExtractor):
+    """Extractor for audiobook playlists published on asbook.net."""
+
+    # Page heading that carries the book title; the attributes of the <h1>
+    # tag are intentionally left unconstrained.
+    _VIDEO_RE = r'<h1[^>]*>(?P<title>.+?)</h1>'
+    _VALID_URL = r'https?://asbook\.net/(?P<section>abooks|radioshow|inyaz)/(?P<subsection>\S+)/(?P<id>\S+)\.html'
+    _TEST = {
+        'url': 'http://asbook.net/abooks/fantastic/8904-grad-obrechennyy-boris-i-arkadiy-strugackie.html',
+        'md5': 'ab3220ba94ed5bafa7fd796588198862',
+        'info_dict': {
+            'id': 'Град обреченный - 1',
+            'ext': 'mp3',
+            'title': '"Град обреченный" Аркадий и Борис Стругацкие',
+            'upload_date': '20160216',
+        }
+    }
+
+    def _real_extract(self, url):
+        """Return a playlist with one entry per audio file of the book.
+
+        Raises ExtractorError when no playlist URL can be found in the page
+        or when the playlist it points to is empty.
+        """
+        book_id = self._match_id(url)
+        page = self._download_webpage(url, book_id)
+
+        # The playlist URL is hidden in one of the packed scripts of the page
+        json_url = None
+        for mobj in re.finditer(PACKED_CODES_RE, page):
+            packed_data = mobj.group(0).replace('\\\'', '\'')
+            json_url = self._search_regex(
+                r"json_url='(?P<json_url>\S+)';",
+                self.decode_packed_codes(packed_data),
+                'json_url', default=None)
+            if json_url:
+                break
+
+        if not json_url:
+            raise ExtractorError(
+                'Could not get information about audiobook', expected=True)
+
+        playlist = self._download_json(json_url, book_id).get('playlist')
+        if not playlist:
+            raise ExtractorError(
+                'Audiobook playlist is empty', expected=True)
+
+        title = self._search_regex(
+            self._VIDEO_RE, page, 'title', default=None, group='title')
+        # Fall back to the label of the first track when there is no heading
+        title = (title or '').strip() or playlist[0].get('comment')
+
+        return self.playlist_result(
+            self._entries(playlist, title), book_id, title)
+
+    @staticmethod
+    def decode_packed_codes(code):
+        """Unpack a packed script and return the decoded source text.
+
+        This mirrors utils.decode_packed_codes but tolerates words that are
+        missing from the symbol table (e.g. Cyrillic text): they are kept
+        unchanged. Returns an empty string if code holds no packed data.
+        """
+        mobj = re.search(PACKED_CODES_RE, code)
+        if mobj is None:
+            return ''
+        obfuscated_code, base, count, symbols = mobj.groups()
+        base = int(base)
+        symbols = symbols.split('|')
+
+        symbol_table = {}
+        for index in range(int(count)):
+            key = encode_base_n(index, base)
+            symbol_table[key] = symbols[index] or key
+
+        return re.sub(
+            r'\b(\w+)\b',
+            lambda m: symbol_table.get(m.group(0), m.group(0)),
+            obfuscated_code)
+
+    def _entries(self, playlist, playlist_title):
+        """Yield one url_transparent entry per playlist item with a file."""
+        for item in playlist:
+            file_url = item.get('file')
+            if not file_url:
+                continue
+            yield {
+                '_type': 'url_transparent',
+                'url': file_url,
+                'ie_key': None,
+                # The track label is used as id instead of the file name
+                'id': item.get('comment'),
+                # Every track carries the title of the whole book
+                'title': playlist_title,
+            }
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index ed603eb29..64abc9d70 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -295,6 +295,7 @@ from .ehow import EHowIE
 from .eighttracks import EightTracksIE
 from .einthusan import EinthusanIE
 from .eitb import EitbIE
+from .asbook import AsBookIE
 from .ellentv import (
     EllenTVIE,
     EllenTVClipsIE,