From a30b01348fd354550824c5594a58948c50cc0e9f Mon Sep 17 00:00:00 2001
From: scil <scil.zc@gmail.com>
Date: Mon, 6 Nov 2017 23:28:28 +0800
Subject: [PATCH 1/8] [ximalaya_extractor] Add new extractor

---
 youtube_dl/extractor/extractors.py |   4 +
 youtube_dl/extractor/ximalaya.py   | 184 +++++++++++++++++++++++++++++
 2 files changed, 188 insertions(+)
 create mode 100644 youtube_dl/extractor/ximalaya.py
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index ecb33bc9e..3585da6e3 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1308,6 +1308,10 @@ from .xiami import (
     XiamiArtistIE,
     XiamiCollectionIE
 )
+from .ximalaya import (
+    XimalayaIE,
+    XimalayaAlbumIE
+)
 from .xminus import XMinusIE
 from .xnxx import XNXXIE
 from .xstream import XstreamIE
diff --git a/youtube_dl/extractor/ximalaya.py b/youtube_dl/extractor/ximalaya.py
new file mode 100644
index 000000000..daeae4524
--- /dev/null
+++ b/youtube_dl/extractor/ximalaya.py
@@ -0,0 +1,184 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+import itertools
+
+from ..compat import (
+    compat_str,
+)
+
+from .common import InfoExtractor
+
+class XimalayaBaseIE(InfoExtractor):
+     _GEO_COUNTRIES = ['CN']
+
+class XimalayaIE(XimalayaBaseIE):
+    IE_NAME = 'ximalaya'
+    IE_DESC = 'ximalaya.com'
+    _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P<uid>[0-9]+)/sound/(?P<id>[0-9]+)/?'
+    _USER_URL_FORMAT = 'http://www.ximalaya.com/zhubo/%i/'
+    _TESTS = [
+        {
+            'url': 'http://www.ximalaya.com/61425525/sound/47740352/',
+            # 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)',
+            'info_dict': {
+                'id': '47740352',
+                'ext': 'm4a',
+                'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白',
+                'description': 'contains:孤帆远影碧空尽，惟见长江天际流。',
+                'uploader': '小彬彬爱听书',
+                'uploader_id': 61425525,
+                'view_count': int,
+                'like_count': int,
+            }
+        },
+        {
+            'url': 'http://m.ximalaya.com/61425525/sound/47740352/',
+            'info_dict': {
+                'id': '47740352',
+                'ext': 'm4a',
+                'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白',
+                'description': 'contains:孤帆远影碧空尽，惟见长江天际流。',
+                'uploader': '小彬彬爱听书',
+                'uploader_id': 61425525,
+            }
+        },
+    ]
+
+    def _real_extract(self, url):
+
+        is_m = 'm.ximalaya' in url
+
+        audio_id = self._match_id(url)
+        webpage = self._download_webpage(url, audio_id,
+                                         note='Download sound page for %s'% audio_id ,
+                                         errnote='Unable to get sound page')
+
+        audio_info_file = 'http://m.ximalaya.com/tracks/%s.json' % audio_id
+        audio_info = self._download_json(audio_info_file, audio_id,
+                                         'Downloading info json %s' % audio_info_file,
+                                         'Unable to download info file', fatal=True)
+
+        formats = []
+        for bps, k in (('24k', 'play_path_32'), ('64k', 'play_path_64')):
+            if audio_info.get(k):
+                formats.append({
+                    'format_id': bps,
+                    'url': audio_info[k],
+                    'ext': 'm4a',
+                })
+
+        # cover pics kyes like: cover_url', 'cover_url_142'
+        thumbnails = [{'name': k, 'url': audio_info.get(k)} for k in audio_info.keys() if k.startswith('cover_url')]
+
+        audio_uploader_id = audio_info.get('uid')
+
+        if is_m:
+            intro = re.search(r'(?s)<section class=["\']content[^>]+>(.+)</section>'
+                              , webpage)
+        else:
+            intro = re.search(r'(?s)<div class="rich_intro"[^>]*>(.+?</article>)',
+                              webpage)
+
+        if intro:
+            audio_description = intro.group(1).strip()
+        else:
+            audio_description_file = 'http://www.ximalaya.com/sounds/%s/rich_intro' % audio_id
+            audio_description = self._download_webpage(audio_description_file, audio_id,
+                                                       note='Downloading description file %s' % audio_description_file,
+                                                       errnote='Unable to download descrip file, try to parse web page',
+                                                       fatal=False)
+            audio_description = audio_description.strip()
+
+        return {
+            'id': audio_id,
+            'uploader': audio_info.get('nickname'),
+            'uploader_id': audio_uploader_id,
+            'uploader_url': self._USER_URL_FORMAT % audio_uploader_id,
+            'title': audio_info.get('title'),
+            'thumbnails': thumbnails,
+            'description': audio_description,
+            'categories': audio_info.get('category_title'),
+            'duration': audio_info.get('duration'),
+            'view_count': audio_info.get('play_count'),
+            'like_count': audio_info.get('favorites_count'),
+            'formats': formats,
+        }
+
+
+class XimalayaAlbumIE(XimalayaBaseIE):
+    IE_NAME = 'ximalaya.com:album'
+    IE_DESC = 'ximalaya album'
+    _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P<uid>[0-9]+)/album/(?P<id>[0-9]+)/?'
+    _TEMPLATE_URL = 'http://www.ximalaya.com/%s/album/%s/'
+    _BASE_URL_TEMPL = 'http://www.ximalaya.com%s'
+    _TESTS = [{
+        'url': 'http://www.ximalaya.com/61425525/album/5534601/',
+        'info_dict': {
+            'title': 'contains:唐诗三百首（含赏析）',
+            'id': '5534601',
+        },
+        'playlist_count': 312,
+    }, {
+        'url': 'http://m.ximalaya.com/61425525/album/5534601',
+        'info_dict': {
+            'title': 'contains:唐诗三百首（含赏析）',
+            'id': '5534601',
+        },
+        'playlist_count': 312,
+    },
+    ]
+
+    def _real_extract(self, url):
+        uid, playlist_id = self._match_uid_an_id(url)
+        assert uid.isdecimal()
+        webpage = self._download_webpage(self._TEMPLATE_URL % (uid, playlist_id), playlist_id,
+                                         note='Download album page for %s' % playlist_id,
+                                         errnote='Unable to get album info'
+                                         )
+
+        mobj = re.search(r'detailContent_title(?:[^>]+)?><h1(?:[^>]+)?>([^<]+)</h1>', webpage)
+        title = mobj.group(1) if mobj else self._meta_regex('title')
+
+        return self.playlist_result(self._entries(webpage, playlist_id, uid), playlist_id, title)
+
+    def _entries(self, page, playlist_id, uid):
+        html = page
+        for page_num in itertools.count(1):
+            for entry in self._process_page(html, uid):
+                yield entry
+
+            mobj = re.search(r'<a href=(["\'])(?P<more>[^\'"]+)\1'
+                             r'[^>]+rel=(["\'])next\3', html)
+            if not mobj:
+                break
+
+            next_url = self._BASE_URL_TEMPL % mobj['more']
+            self.report_download_webpage(next_url)
+            html = self._download_webpage(next_url, playlist_id)
+            if not html.strip():
+                # Some webpages show a "Load more" button but they don't
+                # have more videos
+                break
+
+    def _process_page(self, html, uid):
+        find_from = html.index('album_soundlist')
+        for mobj in re.finditer(r'<a[^>]+?href="(?P<url>/' +
+                                        uid +
+                                        r'/sound/(?P<id>\d+)/?)"[^>]+?title="(?P<title>[^>]+)">',
+                                html[find_from:]):
+            if 'url' in mobj.groupdict():
+                yield self.url_result(self._BASE_URL_TEMPL % mobj.group('url'),
+                                      'Ximalaya',
+                                      mobj.group('id'),
+                                      mobj.group('title'))
+
+    @classmethod
+    def _match_uid_an_id(cls, url):
+        if '_VALID_URL_RE' not in cls.__dict__:
+            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
+        m = cls._VALID_URL_RE.match(url)
+        assert m
+        return compat_str(m.group('uid')), compat_str(m.group('id'))

From 513ec05236ef81500c5105aaf3ccf7a7d6daef26 Mon Sep 17 00:00:00 2001
From: scil <scil.zc@gmail.com>
Date: Mon, 6 Nov 2017 23:56:21 +0800
Subject: [PATCH 2/8] format changes according to flake8

---
 youtube_dl/extractor/ximalaya.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/youtube_dl/extractor/ximalaya.py b/youtube_dl/extractor/ximalaya.py
index daeae4524..4208c6f72 100644
--- a/youtube_dl/extractor/ximalaya.py
+++ b/youtube_dl/extractor/ximalaya.py
@@ -11,8 +11,10 @@ from ..compat import (
 
 from .common import InfoExtractor
 
+
 class XimalayaBaseIE(InfoExtractor):
-     _GEO_COUNTRIES = ['CN']
+    _GEO_COUNTRIES = ['CN']
+
 
 class XimalayaIE(XimalayaBaseIE):
     IE_NAME = 'ximalaya'
@@ -53,7 +55,7 @@ class XimalayaIE(XimalayaBaseIE):
 
         audio_id = self._match_id(url)
         webpage = self._download_webpage(url, audio_id,
-                                         note='Download sound page for %s'% audio_id ,
+                                         note='Download sound page for %s' % audio_id,
                                          errnote='Unable to get sound page')
 
         audio_info_file = 'http://m.ximalaya.com/tracks/%s.json' % audio_id
@@ -76,11 +78,9 @@ class XimalayaIE(XimalayaBaseIE):
         audio_uploader_id = audio_info.get('uid')
 
         if is_m:
-            intro = re.search(r'(?s)<section class=["\']content[^>]+>(.+)</section>'
-                              , webpage)
+            intro = re.search(r'(?s)<section class=["\']content[^>]+>(.+)</section>', webpage)
         else:
-            intro = re.search(r'(?s)<div class="rich_intro"[^>]*>(.+?</article>)',
-                              webpage)
+            intro = re.search(r'(?s)<div class="rich_intro"[^>]*>(.+?</article>)', webpage)
 
         if intro:
             audio_description = intro.group(1).strip()
@@ -155,8 +155,8 @@ class XimalayaAlbumIE(XimalayaBaseIE):
             if not mobj:
                 break
 
-            next_url = self._BASE_URL_TEMPL % mobj['more']
-            self.report_download_webpage(next_url)
+            next_url = self._BASE_URL_TEMPL % mobj.group('more')
+            self.report_download_webpage('%d %s' % (page_num, next_url))
             html = self._download_webpage(next_url, playlist_id)
             if not html.strip():
                 # Some webpages show a "Load more" button but they don't
@@ -166,8 +166,8 @@ class XimalayaAlbumIE(XimalayaBaseIE):
     def _process_page(self, html, uid):
         find_from = html.index('album_soundlist')
         for mobj in re.finditer(r'<a[^>]+?href="(?P<url>/' +
-                                        uid +
-                                        r'/sound/(?P<id>\d+)/?)"[^>]+?title="(?P<title>[^>]+)">',
+                                uid +
+                                r'/sound/(?P<id>\d+)/?)"[^>]+?title="(?P<title>[^>]+)">',
                                 html[find_from:]):
             if 'url' in mobj.groupdict():
                 yield self.url_result(self._BASE_URL_TEMPL % mobj.group('url'),

From 714abc3278f76ff138612926a40792e6fcee7b13 Mon Sep 17 00:00:00 2001
From: scil <scil.zc@gmail.com>
Date: Thu, 16 Nov 2017 23:23:40 +0800
Subject: [PATCH 3/8] changes according to review by @yan12125 at GitHub pull
 #14687

---
 youtube_dl/extractor/ximalaya.py | 40 ++++++++++++--------------------
 1 file changed, 15 insertions(+), 25 deletions(-)

diff --git a/youtube_dl/extractor/ximalaya.py b/youtube_dl/extractor/ximalaya.py
index 4208c6f72..79891d5ab 100644
--- a/youtube_dl/extractor/ximalaya.py
+++ b/youtube_dl/extractor/ximalaya.py
@@ -1,13 +1,9 @@
 # coding: utf-8
+
 from __future__ import unicode_literals
 
-import re
-
 import itertools
-
-from ..compat import (
-    compat_str,
-)
+import re
 
 from .common import InfoExtractor
 
@@ -24,7 +20,6 @@ class XimalayaIE(XimalayaBaseIE):
     _TESTS = [
         {
             'url': 'http://www.ximalaya.com/61425525/sound/47740352/',
-            # 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)',
             'info_dict': {
                 'id': '47740352',
                 'ext': 'm4a',
@@ -61,7 +56,7 @@ class XimalayaIE(XimalayaBaseIE):
         audio_info_file = 'http://m.ximalaya.com/tracks/%s.json' % audio_id
         audio_info = self._download_json(audio_info_file, audio_id,
                                          'Downloading info json %s' % audio_info_file,
-                                         'Unable to download info file', fatal=True)
+                                         'Unable to download info file')
 
         formats = []
         for bps, k in (('24k', 'play_path_32'), ('64k', 'play_path_64')):
@@ -78,9 +73,9 @@ class XimalayaIE(XimalayaBaseIE):
         audio_uploader_id = audio_info.get('uid')
 
         if is_m:
-            intro = re.search(r'(?s)<section class=["\']content[^>]+>(.+)</section>', webpage)
+            intro = re.search(r'(?s)<section\s+class=["\']content[^>]+>(.+)</section>', webpage)
         else:
-            intro = re.search(r'(?s)<div class="rich_intro"[^>]*>(.+?</article>)', webpage)
+            intro = re.search(r'(?s)<div\s+class=["\']rich_intro[^>]*>(.+?</article>)', webpage)
 
         if intro:
             audio_description = intro.group(1).strip()
@@ -90,14 +85,14 @@ class XimalayaIE(XimalayaBaseIE):
                                                        note='Downloading description file %s' % audio_description_file,
                                                        errnote='Unable to download descrip file, try to parse web page',
                                                        fatal=False)
-            audio_description = audio_description.strip()
+            audio_description = audio_description.strip() if audio_description else None
 
         return {
             'id': audio_id,
             'uploader': audio_info.get('nickname'),
             'uploader_id': audio_uploader_id,
             'uploader_url': self._USER_URL_FORMAT % audio_uploader_id,
-            'title': audio_info.get('title'),
+            'title': audio_info['title'],
             'thumbnails': thumbnails,
             'description': audio_description,
             'categories': audio_info.get('category_title'),
@@ -114,6 +109,7 @@ class XimalayaAlbumIE(XimalayaBaseIE):
     _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P<uid>[0-9]+)/album/(?P<id>[0-9]+)/?'
     _TEMPLATE_URL = 'http://www.ximalaya.com/%s/album/%s/'
     _BASE_URL_TEMPL = 'http://www.ximalaya.com%s'
+    _LIST_VIDEO_RE = r'<a[^>]+?href="(?P<url>/%d/sound/(?P<id>\d+)/?)"[^>]+?title="(?P<title>[^>]+)">'
     _TESTS = [{
         'url': 'http://www.ximalaya.com/61425525/album/5534601/',
         'info_dict': {
@@ -132,15 +128,15 @@ class XimalayaAlbumIE(XimalayaBaseIE):
     ]
 
     def _real_extract(self, url):
-        uid, playlist_id = self._match_uid_an_id(url)
-        assert uid.isdecimal()
+        mobj = re.match(self._VALID_URL, url)
+        uid, playlist_id = mobj.group('uid'), mobj.group('id')
+
         webpage = self._download_webpage(self._TEMPLATE_URL % (uid, playlist_id), playlist_id,
                                          note='Download album page for %s' % playlist_id,
-                                         errnote='Unable to get album info'
-                                         )
+                                         errnote='Unable to get album info')
 
         mobj = re.search(r'detailContent_title(?:[^>]+)?><h1(?:[^>]+)?>([^<]+)</h1>', webpage)
-        title = mobj.group(1) if mobj else self._meta_regex('title')
+        title = mobj.group(1) if mobj else self._html_search_meta('title')
 
         return self.playlist_result(self._entries(webpage, playlist_id, uid), playlist_id, title)
 
@@ -150,7 +146,7 @@ class XimalayaAlbumIE(XimalayaBaseIE):
             for entry in self._process_page(html, uid):
                 yield entry
 
-            mobj = re.search(r'<a href=(["\'])(?P<more>[^\'"]+)\1'
+            mobj = re.search(r'<a\s+href=(["\'])(?P<more>[^\'"]+)\1'
                              r'[^>]+rel=(["\'])next\3', html)
             if not mobj:
                 break
@@ -165,10 +161,7 @@ class XimalayaAlbumIE(XimalayaBaseIE):
 
     def _process_page(self, html, uid):
         find_from = html.index('album_soundlist')
-        for mobj in re.finditer(r'<a[^>]+?href="(?P<url>/' +
-                                uid +
-                                r'/sound/(?P<id>\d+)/?)"[^>]+?title="(?P<title>[^>]+)">',
-                                html[find_from:]):
+        for mobj in re.finditer(self._LIST_VIDEO_RE % uid, html[find_from:]):
             if 'url' in mobj.groupdict():
                 yield self.url_result(self._BASE_URL_TEMPL % mobj.group('url'),
                                       'Ximalaya',
@@ -179,6 +172,3 @@ class XimalayaAlbumIE(XimalayaBaseIE):
     def _match_uid_an_id(cls, url):
         if '_VALID_URL_RE' not in cls.__dict__:
             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
-        m = cls._VALID_URL_RE.match(url)
-        assert m
-        return compat_str(m.group('uid')), compat_str(m.group('id'))

From 0c78b03f288fde4d00d87264df5b330461acd42c Mon Sep 17 00:00:00 2001
From: scil <scil.zc@gmail.com>
Date: Thu, 16 Nov 2017 23:31:25 +0800
Subject: [PATCH 4/8] change %d to %s in a template string

---
 youtube_dl/extractor/ximalaya.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/ximalaya.py b/youtube_dl/extractor/ximalaya.py
index 79891d5ab..d79537d86 100644
--- a/youtube_dl/extractor/ximalaya.py
+++ b/youtube_dl/extractor/ximalaya.py
@@ -109,7 +109,7 @@ class XimalayaAlbumIE(XimalayaBaseIE):
     _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P<uid>[0-9]+)/album/(?P<id>[0-9]+)/?'
     _TEMPLATE_URL = 'http://www.ximalaya.com/%s/album/%s/'
     _BASE_URL_TEMPL = 'http://www.ximalaya.com%s'
-    _LIST_VIDEO_RE = r'<a[^>]+?href="(?P<url>/%d/sound/(?P<id>\d+)/?)"[^>]+?title="(?P<title>[^>]+)">'
+    _LIST_VIDEO_RE = r'<a[^>]+?href="(?P<url>/%s/sound/(?P<id>\d+)/?)"[^>]+?title="(?P<title>[^>]+)">'
     _TESTS = [{
         'url': 'http://www.ximalaya.com/61425525/album/5534601/',
         'info_dict': {

From 76cf93b6e4e8298f15b324960a80e32e1aabb8e4 Mon Sep 17 00:00:00 2001
From: scil <scil.zc@gmail.com>
Date: Tue, 28 Nov 2017 23:18:43 +0800
Subject: [PATCH 5/8] second changes according to review by @yan12125 at GitHub
 pull #14687

---
 youtube_dl/extractor/ximalaya.py | 154 ++++++++++++++++++++++---------
 1 file changed, 108 insertions(+), 46 deletions(-)

diff --git a/youtube_dl/extractor/ximalaya.py b/youtube_dl/extractor/ximalaya.py
index d79537d86..7fc8e47cc 100644
--- a/youtube_dl/extractor/ximalaya.py
+++ b/youtube_dl/extractor/ximalaya.py
@@ -14,19 +14,34 @@ class XimalayaBaseIE(InfoExtractor):
 
 class XimalayaIE(XimalayaBaseIE):
     IE_NAME = 'ximalaya'
-    IE_DESC = 'ximalaya.com'
-    _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P<uid>[0-9]+)/sound/(?P<id>[0-9]+)/?'
-    _USER_URL_FORMAT = 'http://www.ximalaya.com/zhubo/%i/'
+    IE_DESC = '喜马拉雅FM'
+    _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P<uid>[0-9]+)/sound/(?P<id>[0-9]+)'
+    _USER_URL_FORMAT = '%s://www.ximalaya.com/zhubo/%i/'
     _TESTS = [
         {
             'url': 'http://www.ximalaya.com/61425525/sound/47740352/',
             'info_dict': {
                 'id': '47740352',
                 'ext': 'm4a',
-                'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白',
-                'description': 'contains:孤帆远影碧空尽，惟见长江天际流。',
                 'uploader': '小彬彬爱听书',
                 'uploader_id': 61425525,
+                'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/',
+                'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白',
+                'description': "contains:《送孟浩然之广陵》\n作者：李白\n故人西辞黄鹤楼，烟花三月下扬州。\n孤帆远影碧空尽，惟见长江天际流。",
+                'thumbnails': [
+                    {
+                        'name': 'cover_url',
+                        'url': 'http://fdfs.xmcdn.com/group18/M03/7F/61/wKgJJVgCaNzAIs-KAAC415nhLZs762.jpg',
+                    },
+                    {
+                        'name': 'cover_url_142',
+                        'url': 'http://fdfs.xmcdn.com/group18/M03/7F/61/wKgJJVgCaNzAIs-KAAC415nhLZs762_web_large.jpg',
+                        'width': 180,
+                        'height': 180
+                    }
+                ],
+                'categories': ['renwen', '人文'],
+                'duration': 93,
                 'view_count': int,
                 'like_count': int,
             }
@@ -36,10 +51,56 @@ class XimalayaIE(XimalayaBaseIE):
             'info_dict': {
                 'id': '47740352',
                 'ext': 'm4a',
-                'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白',
-                'description': 'contains:孤帆远影碧空尽，惟见长江天际流。',
                 'uploader': '小彬彬爱听书',
                 'uploader_id': 61425525,
+                'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/',
+                'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白',
+                'description': 'contains:孤帆远影碧空尽，惟见长江天际流。',
+                'thumbnails': [
+                    {
+                        'name': 'cover_url',
+                        'url': 'http://fdfs.xmcdn.com/group18/M03/7F/61/wKgJJVgCaNzAIs-KAAC415nhLZs762.jpg',
+                    },
+                    {
+                        'name': 'cover_url_142',
+                        'url': 'http://fdfs.xmcdn.com/group18/M03/7F/61/wKgJJVgCaNzAIs-KAAC415nhLZs762_web_large.jpg',
+                        'width': 180,
+                        'height': 180
+                    }
+                ],
+                'categories': ['renwen', '人文'],
+                'duration': 93,
+                'view_count': int,
+                'like_count': int,
+            }
+        },
+        {
+            'url': 'https://www.ximalaya.com/11045267/sound/15705996/',
+            'info_dict': {
+                'id': '15705996',
+                'ext': 'm4a',
+                'uploader': '李延隆老师',
+                'uploader_id': 11045267,
+                'uploader_url': 'https://www.ximalaya.com/zhubo/11045267/',
+                'title': 'Lesson 1 Excuse me!',
+                'description': "contains:Listen to the tape then answer\xa0this question. Whose handbag is it?\n"
+                               "听录音，然后回答问题，这是谁的手袋？",
+                'thumbnails': [
+                    {
+                        'name': 'cover_url',
+                        'url': 'http://fdfs.xmcdn.com/group11/M07/5A/F9/wKgDa1c2gfyzJFVqAAFKjzSOBug106.jpg',
+                    },
+                    {
+                        'name': 'cover_url_142',
+                        'url': 'http://fdfs.xmcdn.com/group11/M07/5A/F9/wKgDa1c2gfyzJFVqAAFKjzSOBug106_web_large.jpg',
+                        'width': 180,
+                        'height': 180
+                    }
+                ],
+                'categories': ['train', '外语'],
+                'duration': 40,
+                'view_count': int,
+                'like_count': int,
             }
         },
     ]
@@ -47,13 +108,14 @@ class XimalayaIE(XimalayaBaseIE):
     def _real_extract(self, url):
 
         is_m = 'm.ximalaya' in url
+        scheme = 'https' if url.startswith('https') else 'http'
 
         audio_id = self._match_id(url)
         webpage = self._download_webpage(url, audio_id,
                                          note='Download sound page for %s' % audio_id,
                                          errnote='Unable to get sound page')
 
-        audio_info_file = 'http://m.ximalaya.com/tracks/%s.json' % audio_id
+        audio_info_file = '%s://m.ximalaya.com/tracks/%s.json' % (scheme, audio_id)
         audio_info = self._download_json(audio_info_file, audio_id,
                                          'Downloading info json %s' % audio_info_file,
                                          'Unable to download info file')
@@ -64,23 +126,29 @@ class XimalayaIE(XimalayaBaseIE):
                 formats.append({
                     'format_id': bps,
                     'url': audio_info[k],
-                    'ext': 'm4a',
                 })
 
-        # cover pics kyes like: cover_url', 'cover_url_142'
-        thumbnails = [{'name': k, 'url': audio_info.get(k)} for k in audio_info.keys() if k.startswith('cover_url')]
+        thumbnails = []
+        for k in audio_info.keys():
+            # cover pics kyes like: cover_url', 'cover_url_142'
+            if k.startswith('cover_url'):
+                thumbnail = {'name': k, 'url': audio_info[k]}
+                if k == 'cover_url_142':
+                    thumbnail['width'] = 180
+                    thumbnail['height'] = 180
+                thumbnails.append(thumbnail)
 
         audio_uploader_id = audio_info.get('uid')
 
         if is_m:
-            intro = re.search(r'(?s)<section\s+class=["\']content[^>]+>(.+)</section>', webpage)
+            audio_description= self._html_search_regex(r'(?s)<section\s+class=["\']content[^>]+>(.+?)</section>',
+                                            webpage, 'audio_description', fatal=False, group=1)
         else:
-            intro = re.search(r'(?s)<div\s+class=["\']rich_intro[^>]*>(.+?</article>)', webpage)
+            audio_description= self._html_search_regex(r'(?s)<div\s+class=["\']rich_intro[^>]*>(.+?</article>)',
+                                            webpage, 'audio_description', fatal=False, group=1)
 
-        if intro:
-            audio_description = intro.group(1).strip()
-        else:
-            audio_description_file = 'http://www.ximalaya.com/sounds/%s/rich_intro' % audio_id
+        if not audio_description:
+            audio_description_file = '%s://www.ximalaya.com/sounds/%s/rich_intro' % (scheme, audio_id)
             audio_description = self._download_webpage(audio_description_file, audio_id,
                                                        note='Downloading description file %s' % audio_description_file,
                                                        errnote='Unable to download descrip file, try to parse web page',
@@ -91,11 +159,11 @@ class XimalayaIE(XimalayaBaseIE):
             'id': audio_id,
             'uploader': audio_info.get('nickname'),
             'uploader_id': audio_uploader_id,
-            'uploader_url': self._USER_URL_FORMAT % audio_uploader_id,
+            'uploader_url': self._USER_URL_FORMAT % (scheme, audio_uploader_id),
             'title': audio_info['title'],
             'thumbnails': thumbnails,
             'description': audio_description,
-            'categories': audio_info.get('category_title'),
+            'categories': list((audio_info.get('category_name'), audio_info.get('category_title'))),
             'duration': audio_info.get('duration'),
             'view_count': audio_info.get('play_count'),
             'like_count': audio_info.get('favorites_count'),
@@ -104,11 +172,11 @@ class XimalayaIE(XimalayaBaseIE):
 
 
 class XimalayaAlbumIE(XimalayaBaseIE):
-    IE_NAME = 'ximalaya.com:album'
-    IE_DESC = 'ximalaya album'
-    _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P<uid>[0-9]+)/album/(?P<id>[0-9]+)/?'
-    _TEMPLATE_URL = 'http://www.ximalaya.com/%s/album/%s/'
-    _BASE_URL_TEMPL = 'http://www.ximalaya.com%s'
+    IE_NAME = 'ximalaya:album'
+    IE_DESC = '喜马拉雅FM 专辑'
+    _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P<uid>[0-9]+)/album/(?P<id>[0-9]+)'
+    _TEMPLATE_URL = '%s://www.ximalaya.com/%s/album/%s/'
+    _BASE_URL_TEMPL = '%s://www.ximalaya.com%s'
     _LIST_VIDEO_RE = r'<a[^>]+?href="(?P<url>/%s/sound/(?P<id>\d+)/?)"[^>]+?title="(?P<title>[^>]+)">'
     _TESTS = [{
         'url': 'http://www.ximalaya.com/61425525/album/5534601/',
@@ -128,15 +196,18 @@ class XimalayaAlbumIE(XimalayaBaseIE):
     ]
 
     def _real_extract(self, url):
+        self.scheme = scheme = 'https' if url.startswith('https') else 'http'
+
         mobj = re.match(self._VALID_URL, url)
         uid, playlist_id = mobj.group('uid'), mobj.group('id')
 
-        webpage = self._download_webpage(self._TEMPLATE_URL % (uid, playlist_id), playlist_id,
+        webpage = self._download_webpage(self._TEMPLATE_URL % (scheme, uid, playlist_id), playlist_id,
                                          note='Download album page for %s' % playlist_id,
                                          errnote='Unable to get album info')
 
-        mobj = re.search(r'detailContent_title(?:[^>]+)?><h1(?:[^>]+)?>([^<]+)</h1>', webpage)
-        title = mobj.group(1) if mobj else self._html_search_meta('title')
+        title = self._html_search_regex(r'detailContent_title(?:[^>]+)?><h1(?:[^>]+)?>([^<]+)</h1>',
+                                        webpage, 'title',fatal=False, group=1)
+        title = title or self._html_search_meta('title')
 
         return self.playlist_result(self._entries(webpage, playlist_id, uid), playlist_id, title)
 
@@ -146,29 +217,20 @@ class XimalayaAlbumIE(XimalayaBaseIE):
             for entry in self._process_page(html, uid):
                 yield entry
 
-            mobj = re.search(r'<a\s+href=(["\'])(?P<more>[^\'"]+)\1'
-                             r'[^>]+rel=(["\'])next\3', html)
-            if not mobj:
+            next_url = self._search_regex(r'<a\s+href=(["\'])(?P<more>[\S]+)\1[^>]+rel=(["\'])next\3',
+                                          html, 'list_next_url', default=None, group='more')
+            if not next_url:
                 break
 
-            next_url = self._BASE_URL_TEMPL % mobj.group('more')
-            self.report_download_webpage('%d %s' % (page_num, next_url))
-            html = self._download_webpage(next_url, playlist_id)
-            if not html.strip():
-                # Some webpages show a "Load more" button but they don't
-                # have more videos
-                break
+            next_full_url = self._BASE_URL_TEMPL % (self.scheme, next_url)
+            html = self._download_webpage(next_full_url, playlist_id)
 
     def _process_page(self, html, uid):
         find_from = html.index('album_soundlist')
         for mobj in re.finditer(self._LIST_VIDEO_RE % uid, html[find_from:]):
-            if 'url' in mobj.groupdict():
-                yield self.url_result(self._BASE_URL_TEMPL % mobj.group('url'),
-                                      'Ximalaya',
-                                      mobj.group('id'),
-                                      mobj.group('title'))
+            yield self.url_result(self._BASE_URL_TEMPL % (self.scheme, mobj.group('url')),
+                                  XimalayaIE.ie_key(),
+                                  mobj.group('id'),
+                                  mobj.group('title'))
+
 
-    @classmethod
-    def _match_uid_an_id(cls, url):
-        if '_VALID_URL_RE' not in cls.__dict__:
-            cls._VALID_URL_RE = re.compile(cls._VALID_URL)

From 9cfbb04ba28e1a05c2b9a8d4cc36e37cddf9349f Mon Sep 17 00:00:00 2001
From: scil <scil.zc@gmail.com>
Date: Tue, 28 Nov 2017 23:36:25 +0800
Subject: [PATCH 6/8] improve TESTS about contains

---
 youtube_dl/extractor/ximalaya.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/youtube_dl/extractor/ximalaya.py b/youtube_dl/extractor/ximalaya.py
index 7fc8e47cc..a0d32f121 100644
--- a/youtube_dl/extractor/ximalaya.py
+++ b/youtube_dl/extractor/ximalaya.py
@@ -55,7 +55,7 @@ class XimalayaIE(XimalayaBaseIE):
                 'uploader_id': 61425525,
                 'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/',
                 'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白',
-                'description': 'contains:孤帆远影碧空尽，惟见长江天际流。',
+                'description': "contains:《送孟浩然之广陵》\n作者：李白\n故人西辞黄鹤楼，烟花三月下扬州。\n孤帆远影碧空尽，惟见长江天际流。",
                 'thumbnails': [
                     {
                         'name': 'cover_url',
@@ -181,14 +181,14 @@ class XimalayaAlbumIE(XimalayaBaseIE):
     _TESTS = [{
         'url': 'http://www.ximalaya.com/61425525/album/5534601/',
         'info_dict': {
-            'title': 'contains:唐诗三百首（含赏析）',
+            'title': '唐诗三百首（含赏析）',
             'id': '5534601',
         },
         'playlist_count': 312,
     }, {
         'url': 'http://m.ximalaya.com/61425525/album/5534601',
         'info_dict': {
-            'title': 'contains:唐诗三百首（含赏析）',
+            'title': '唐诗三百首（含赏析）',
             'id': '5534601',
         },
         'playlist_count': 312,
@@ -207,7 +207,6 @@ class XimalayaAlbumIE(XimalayaBaseIE):
 
         title = self._html_search_regex(r'detailContent_title(?:[^>]+)?><h1(?:[^>]+)?>([^<]+)</h1>',
                                         webpage, 'title',fatal=False, group=1)
-        title = title or self._html_search_meta('title')
 
         return self.playlist_result(self._entries(webpage, playlist_id, uid), playlist_id, title)
 

From d3ebb04895ba6d3dd80831201fca7d5defc32f79 Mon Sep 17 00:00:00 2001
From: scil <scil.zc@gmail.com>
Date: Fri, 29 Dec 2017 23:57:01 +0800
Subject: [PATCH 7/8] changes according to third review by @yan12125 at github
 pull #1468

---
 youtube_dl/extractor/ximalaya.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/youtube_dl/extractor/ximalaya.py b/youtube_dl/extractor/ximalaya.py
index a0d32f121..d66790ab9 100644
--- a/youtube_dl/extractor/ximalaya.py
+++ b/youtube_dl/extractor/ximalaya.py
@@ -31,11 +31,11 @@ class XimalayaIE(XimalayaBaseIE):
                 'thumbnails': [
                     {
                         'name': 'cover_url',
-                        'url': 'http://fdfs.xmcdn.com/group18/M03/7F/61/wKgJJVgCaNzAIs-KAAC415nhLZs762.jpg',
+                        'url': r're:^https?://.*\.jpg$',
                     },
                     {
                         'name': 'cover_url_142',
-                        'url': 'http://fdfs.xmcdn.com/group18/M03/7F/61/wKgJJVgCaNzAIs-KAAC415nhLZs762_web_large.jpg',
+                        'url': r're:^https?://.*\.jpg$',
                         'width': 180,
                         'height': 180
                     }
@@ -59,11 +59,15 @@ class XimalayaIE(XimalayaBaseIE):
                 'thumbnails': [
                     {
                         'name': 'cover_url',
-                        'url': 'http://fdfs.xmcdn.com/group18/M03/7F/61/wKgJJVgCaNzAIs-KAAC415nhLZs762.jpg',
+                        'url': r're:^https?://.*\.jpg$',
                     },
                     {
                         'name': 'cover_url_142',
-                        'url': 'http://fdfs.xmcdn.com/group18/M03/7F/61/wKgJJVgCaNzAIs-KAAC415nhLZs762_web_large.jpg',
+                        'url': r're:^https?://.*\.jpg$',
+                    },
+                    {
+                        'name': 'cover_url_142',
+                        'url': r're:^https?://.*\.jpg$',
                         'width': 180,
                         'height': 180
                     }
@@ -88,11 +92,11 @@ class XimalayaIE(XimalayaBaseIE):
                 'thumbnails': [
                     {
                         'name': 'cover_url',
-                        'url': 'http://fdfs.xmcdn.com/group11/M07/5A/F9/wKgDa1c2gfyzJFVqAAFKjzSOBug106.jpg',
+                        'url': r're:^https?://.*\.jpg$',
                     },
                     {
                         'name': 'cover_url_142',
-                        'url': 'http://fdfs.xmcdn.com/group11/M07/5A/F9/wKgDa1c2gfyzJFVqAAFKjzSOBug106_web_large.jpg',
+                        'url': r're:^https?://.*\.jpg$',
                         'width': 180,
                         'height': 180
                     }
@@ -142,10 +146,10 @@ class XimalayaIE(XimalayaBaseIE):
 
         if is_m:
             audio_description= self._html_search_regex(r'(?s)<section\s+class=["\']content[^>]+>(.+?)</section>',
-                                            webpage, 'audio_description', fatal=False, group=1)
+                                            webpage, 'audio_description', fatal=False)
         else:
             audio_description= self._html_search_regex(r'(?s)<div\s+class=["\']rich_intro[^>]*>(.+?</article>)',
-                                            webpage, 'audio_description', fatal=False, group=1)
+                                            webpage, 'audio_description', fatal=False)
 
         if not audio_description:
             audio_description_file = '%s://www.ximalaya.com/sounds/%s/rich_intro' % (scheme, audio_id)
@@ -205,8 +209,8 @@ class XimalayaAlbumIE(XimalayaBaseIE):
                                          note='Download album page for %s' % playlist_id,
                                          errnote='Unable to get album info')
 
-        title = self._html_search_regex(r'detailContent_title(?:[^>]+)?><h1(?:[^>]+)?>([^<]+)</h1>',
-                                        webpage, 'title',fatal=False, group=1)
+        title = self._html_search_regex(r'detailContent_title[^>]*><h1(?:[^>]+)?>([^<]+)</h1>',
+                                        webpage, 'title',fatal=False)
 
         return self.playlist_result(self._entries(webpage, playlist_id, uid), playlist_id, title)
 

From 3c4c17a7edb74f1955888597db05f7f17816084c Mon Sep 17 00:00:00 2001
From: scil <scil.zc@gmail.com>
Date: Wed, 10 Jan 2018 23:45:56 +0800
Subject: [PATCH 8/8] fourth changes according to fourth review by @yan12125 at
 github pull #1468

---
 youtube_dl/extractor/ximalaya.py | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/youtube_dl/extractor/ximalaya.py b/youtube_dl/extractor/ximalaya.py
index d66790ab9..a912e54b8 100644
--- a/youtube_dl/extractor/ximalaya.py
+++ b/youtube_dl/extractor/ximalaya.py
@@ -61,10 +61,6 @@ class XimalayaIE(XimalayaBaseIE):
                         'name': 'cover_url',
                         'url': r're:^https?://.*\.jpg$',
                     },
-                    {
-                        'name': 'cover_url_142',
-                        'url': r're:^https?://.*\.jpg$',
-                    },
                     {
                         'name': 'cover_url_142',
                         'url': r're:^https?://.*\.jpg$',
@@ -145,17 +141,17 @@ class XimalayaIE(XimalayaBaseIE):
         audio_uploader_id = audio_info.get('uid')
 
         if is_m:
-            audio_description= self._html_search_regex(r'(?s)<section\s+class=["\']content[^>]+>(.+?)</section>',
-                                            webpage, 'audio_description', fatal=False)
+            audio_description = self._html_search_regex(r'(?s)<section\s+class=["\']content[^>]+>(.+?)</section>',
+                                                        webpage, 'audio_description', fatal=False)
         else:
-            audio_description= self._html_search_regex(r'(?s)<div\s+class=["\']rich_intro[^>]*>(.+?</article>)',
-                                            webpage, 'audio_description', fatal=False)
+            audio_description = self._html_search_regex(r'(?s)<div\s+class=["\']rich_intro[^>]*>(.+?</article>)',
+                                                        webpage, 'audio_description', fatal=False)
 
         if not audio_description:
             audio_description_file = '%s://www.ximalaya.com/sounds/%s/rich_intro' % (scheme, audio_id)
             audio_description = self._download_webpage(audio_description_file, audio_id,
                                                        note='Downloading description file %s' % audio_description_file,
-                                                       errnote='Unable to download descrip file, try to parse web page',
+                                                       errnote='Unable to download descrip file',
                                                        fatal=False)
             audio_description = audio_description.strip() if audio_description else None
 
@@ -163,11 +159,11 @@ class XimalayaIE(XimalayaBaseIE):
             'id': audio_id,
             'uploader': audio_info.get('nickname'),
             'uploader_id': audio_uploader_id,
-            'uploader_url': self._USER_URL_FORMAT % (scheme, audio_uploader_id),
+            'uploader_url': self._USER_URL_FORMAT % (scheme, audio_uploader_id) if audio_uploader_id else None,
             'title': audio_info['title'],
             'thumbnails': thumbnails,
             'description': audio_description,
-            'categories': list((audio_info.get('category_name'), audio_info.get('category_title'))),
+            'categories': list(filter(None, (audio_info.get('category_name'), audio_info.get('category_title')))),
             'duration': audio_info.get('duration'),
             'view_count': audio_info.get('play_count'),
             'like_count': audio_info.get('favorites_count'),
@@ -210,7 +206,7 @@ class XimalayaAlbumIE(XimalayaBaseIE):
                                          errnote='Unable to get album info')
 
         title = self._html_search_regex(r'detailContent_title[^>]*><h1(?:[^>]+)?>([^<]+)</h1>',
-                                        webpage, 'title',fatal=False)
+                                        webpage, 'title', fatal=False)
 
         return self.playlist_result(self._entries(webpage, playlist_id, uid), playlist_id, title)
 
@@ -235,5 +231,3 @@ class XimalayaAlbumIE(XimalayaBaseIE):
                                   XimalayaIE.ie_key(),
                                   mobj.group('id'),
                                   mobj.group('title'))
-
-