From a7ca0f930355782f60f0b75fa05c09e90814d1b0 Mon Sep 17 00:00:00 2001 From: rubyist Date: Thu, 27 Feb 2020 18:52:48 -0800 Subject: [PATCH 1/5] Add initial extractor for Matter Online --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/matter.py | 48 ++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 youtube_dl/extractor/matter.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 64d1fa251..8fd10bed8 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -601,6 +601,7 @@ from .markiza import ( ) from .massengeschmacktv import MassengeschmackTVIE from .matchtv import MatchTVIE +from .matter import MatterIE from .mdr import MDRIE from .mediaset import MediasetIE from .mediasite import ( diff --git a/youtube_dl/extractor/matter.py b/youtube_dl/extractor/matter.py new file mode 100644 index 000000000..ec427625b --- /dev/null +++ b/youtube_dl/extractor/matter.py @@ -0,0 +1,48 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class MatterIE(InfoExtractor): + """ + InfoExtractor for Matter Music + + This class should be used to handle tracks. Another class (TODO) will be + used to implement playlists or other content. + """ + _VALID_URL = r'https?://app.matter.online/tracks/(?P\d+)/?' + _TESTS = { + # TODO: Implement + } + + def _real_extract(self, url): + track_id = self._match_id(url) + webpage = self._download_webpage( + "https://api.matter.online/api/v1/open-graph/tracks/%s/embedded" % track_id, track_id + ) + + author = self._html_search_regex( + r'([^<]+)', + webpage, "author" + ) + title = self._html_search_regex( + r'([^<]+)', + webpage, "title" + ) + download_url = self._html_search_regex( + r'', + webpage, "download_url" + ) + artwork = self._html_search_regex( + r'style="background: url\((https://matter-production.s3.amazonaws.com/images/[^\.]+\.[^\)]+)\)', + webpage, "artwork" + ) + + return { + 'id': track_id, + 'url': download_url, + 'title': title, + 'uploader': author, + 'thumbnail': artwork, + } From 8c5c97a0d36e37d552c56ff03cda0fea2651e5d7 Mon Sep 17 00:00:00 2001 From: rubyist Date: Thu, 27 Feb 2020 19:06:16 -0800 Subject: [PATCH 2/5] Be a little less specific about what an artist username looks like --- youtube_dl/extractor/matter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/matter.py b/youtube_dl/extractor/matter.py index ec427625b..1d2db7451 100644 --- a/youtube_dl/extractor/matter.py +++ b/youtube_dl/extractor/matter.py @@ -23,7 +23,7 @@ class MatterIE(InfoExtractor): ) author = self._html_search_regex( - r'([^<]+)', + r'([^<]+)', webpage, "author" ) title = self._html_search_regex( From b5879f6e4466a8573ce58ffc39d3f7b541d9c98a Mon Sep 17 00:00:00 2001 From: rubyist Date: Thu, 27 Feb 2020 19:10:31 -0800 Subject: [PATCH 3/5] Don't use _html_search_regex when there's no html to filter out --- youtube_dl/extractor/matter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/matter.py b/youtube_dl/extractor/matter.py index 1d2db7451..fb47bbff0 100644 --- a/youtube_dl/extractor/matter.py +++ b/youtube_dl/extractor/matter.py @@ -22,19 +22,19 @@ class MatterIE(InfoExtractor): "https://api.matter.online/api/v1/open-graph/tracks/%s/embedded" % track_id, track_id ) - author = self._html_search_regex( + author = self._search_regex( r'([^<]+)', webpage, "author" ) - title = self._html_search_regex( + title = self._search_regex( r'([^<]+)', webpage, "title" ) - download_url = self._html_search_regex( + download_url = self._search_regex( r'', webpage, "download_url" ) - artwork = self._html_search_regex( + artwork = self._search_regex( r'style="background: url\((https://matter-production.s3.amazonaws.com/images/[^\.]+\.[^\)]+)\)', webpage, "artwork" ) From c1020cf113592b26057ef10dae245f2f93b08a59 Mon Sep 17 00:00:00 2001 From: rubyist Date: Thu, 27 Feb 2020 19:42:51 -0800 Subject: [PATCH 4/5] Added tests for Matter extractor --- youtube_dl/extractor/matter.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/matter.py b/youtube_dl/extractor/matter.py index fb47bbff0..a0e96d278 100644 --- a/youtube_dl/extractor/matter.py +++ b/youtube_dl/extractor/matter.py @@ -12,9 +12,23 @@ class MatterIE(InfoExtractor): used to implement playlists or other content. """ _VALID_URL = r'https?://app.matter.online/tracks/(?P\d+)/?' - _TESTS = { - # TODO: Implement - } + _TESTS = [{ + 'url': 'https://app.matter.online/tracks/12866', + 'info_dict': { + 'id': '12866', + 'ext': 'mp3', + 'title': 'Beautiful type beat', + 'uploader': 'internet user', + }, + }, { + 'url': 'https://app.matter.online/tracks/18891', + 'info_dict': { + 'id': '18891', + 'ext': 'mp3', + 'title': 'starstruck', + 'uploader': 'iwi.', + } + }] def _real_extract(self, url): track_id = self._match_id(url) From 19ba4ec21833fccd3bb6f3b441a6afa24418a6ee Mon Sep 17 00:00:00 2001 From: rubyist Date: Mon, 2 Mar 2020 17:48:22 -0800 Subject: [PATCH 5/5] Remove long lines, relax reqs on optional fields, and simplify regexes. --- youtube_dl/extractor/matter.py | 41 +++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/matter.py b/youtube_dl/extractor/matter.py index a0e96d278..2295cafa5 100644 --- a/youtube_dl/extractor/matter.py +++ b/youtube_dl/extractor/matter.py @@ -32,31 +32,40 @@ class MatterIE(InfoExtractor): def _real_extract(self, url): track_id = self._match_id(url) - webpage = self._download_webpage( - "https://api.matter.online/api/v1/open-graph/tracks/%s/embedded" % track_id, track_id - ) - author = self._search_regex( - r'([^<]+)', - webpage, "author" - ) + # Fetch page with metadata and download URLs. + api = "https://api.matter.online/api/v1/open-graph/tracks/%s/embedded" + webpage = self._download_webpage(api % track_id, track_id) + + # Extract required fields title = self._search_regex( - r'([^<]+)', + r'tracks/\d+" target="[^"]+">([^<]+)', webpage, "title" ) download_url = self._search_regex( - r'', + r'(https://[^/]+/audios/[^\.]+\.[^"]+)"/>', webpage, "download_url" ) - artwork = self._search_regex( - r'style="background: url\((https://matter-production.s3.amazonaws.com/images/[^\.]+\.[^\)]+)\)', - webpage, "artwork" - ) - return { + extracted = { 'id': track_id, 'url': download_url, 'title': title, - 'uploader': author, - 'thumbnail': artwork, } + + # Extract optional fields + author = self._search_regex( + r'artists/[^"]+" target="[^"]+">([^<]+)', + webpage, "author", fatal=False + ) + artwork = self._search_regex( + r'(https://[^/]+/images/[^\.]+\.[^\)]+)\)', + webpage, "artwork", fatal=False + ) + + if artwork: + extracted['thumbnail'] = artwork + if author: + extracted['uploader'] = author + + return extracted