[asbook] Add new extractor

2025-01-24 05:32:52 +08:00 · 2017-05-17 05:43:51 +03:00 · 2017-05-17 05:43:51 +03:00 · 69ed8da718
commit 69ed8da718
parent 6f76679804
2 changed files with 90 additions and 0 deletions
--- a/youtube_dl/extractor/asbook.py
+++ b/youtube_dl/extractor/asbook.py
@ -0,0 +1,89 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    decode_packed_codes,
+    PACKED_CODES_RE, encode_base_n, ExtractorError)
+
+
+class AsBookIE(InfoExtractor):
+    """Extractor for audiobook playlists published on asbook.net.
+
+    The book page embeds packed (p.a.c.k.e.r-style) scripts; one of them
+    assigns ``json_url``, which points at a JSON document whose ``playlist``
+    list describes the individual audio files (``file`` and ``comment``).
+    """
+
+    _VIDEO_RE = r'<h1 class="b-maintitle">(?P<title>.+)</h1>'
+    # The dot in front of "html" is escaped so only a literal '.' matches
+    _VALID_URL = r'http://asbook\.net/(?P<section>abooks|radioshow|inyaz)/(?P<subsection>\S+)/(?P<id>\S+)\.html'
+    _TEST = {
+        'url': 'http://asbook.net/abooks/fantastic/8904-grad-obrechennyy-boris-i-arkadiy-strugackie.html',
+        'md5': 'ab3220ba94ed5bafa7fd796588198862',
+        'info_dict': {
+            'id': 'Град обреченный - 1',
+            'ext': 'mp3',
+            'title': '"Град обреченный" Аркадий и Борис Стругацкие',
+            'upload_date': '20160216',
+        }
+    }
+
+    def _real_extract(self, url):
+        """Build a playlist result with one entry per item of the book.
+
+        Raises ExtractorError when no packed script on the page exposes
+        ``json_url`` or when the downloaded JSON has no playlist items.
+        """
+        book_id = self._match_id(url)
+
+        page = self._download_webpage(url, book_id)
+
+        # Decode every packed script until one of them reveals json_url
+        json_url = None
+        for mobj in re.finditer(PACKED_CODES_RE, page):
+            # Turn escaped \' back into plain quotes before decoding
+            packed_data = mobj.group(0).replace('\\\'', '\'')
+            text = self.decode_packed_codes(packed_data)
+            json_url = self._search_regex(r"json_url='(?P<json_url>\S+)';",
+                                          text, 'json_url', default=None)
+            if json_url is not None:
+                break
+
+        if not json_url:
+            raise ExtractorError('Could not get information about audiobook',
+                                 expected=True)
+
+        # Only the first matching heading is used as the title
+        title_mobj = re.search(self._VIDEO_RE, page)
+        title = title_mobj.group('title').strip() if title_mobj else None
+
+        playlist = self._download_json(json_url, book_id).get('playlist')
+
+        # Without items there is nothing to download and no fallback title,
+        # so fail with a readable message instead of a TypeError/IndexError
+        if not playlist:
+            raise ExtractorError('Audiobook playlist is missing or empty',
+                                 expected=True)
+
+        if not title:
+            title = playlist[0]['comment']
+
+        return self.playlist_result(self._entries(playlist, title),
+                                    book_id, title)
+
+    @staticmethod
+    def decode_packed_codes(code):
+        """Decode a packed script found by PACKED_CODES_RE.
+
+        This mirrors utils.decode_packed_codes, except that words which are
+        not present in the symbol table (e.g. Cyrillic text, which ``\\w``
+        matches for unicode strings on Python 3) are kept unchanged rather
+        than being looked up unconditionally.
+
+        ``code`` must contain a match for PACKED_CODES_RE; the decoded
+        script text is returned.
+        """
+        mobj = re.search(PACKED_CODES_RE, code)
+        obfuscated_code, base, count, symbols = mobj.groups()
+        base = int(base)
+        count = int(count)
+        symbols = symbols.split('|')
+        symbol_table = {}
+
+        # Map the base-N spelling of every index to its symbol; an empty
+        # symbol means the token stands for itself
+        while count:
+            count -= 1
+            base_n_count = encode_base_n(count, base)
+            symbol_table[base_n_count] = symbols[count] or base_n_count
+
+        def replace_symbol(word_mobj):
+            word = word_mobj.group(0)
+            # A callable returning None makes re.sub drop the match, so
+            # fall back to the word itself when it is not a packed token
+            return symbol_table.get(word, word)
+
+        return re.sub(r'\b(\w+)\b', replace_symbol, obfuscated_code)
+
+    def _entries(self, playlist, playlist_title):
+        """Yield one url_transparent entry per playlist item.
+
+        Each item must provide ``file`` (used as the entry URL) and
+        ``comment`` (used as the entry id); every entry carries the
+        playlist title.
+        """
+        for item in playlist:
+            info = {'_type': 'url_transparent',
+                    'url': item['file'],
+                    'ie_key': None,
+                    'id': item['comment'],  # instead filename
+                    'title': playlist_title  # item['comment']
+                    }
+
+            yield info
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -295,6 +295,7 @@ from .ehow import EHowIE
 from .eighttracks import EightTracksIE
 from .einthusan import EinthusanIE
 from .eitb import EitbIE
+from .asbook import AsBookIE
 from .ellentv import (
    EllenTVIE,
    EllenTVClipsIE,