From 41f3835b53763a179a50ec7d5cec191b429dedfd Mon Sep 17 00:00:00 2001 From: Avichai Date: Wed, 22 Apr 2020 18:04:54 +0300 Subject: [PATCH 01/10] trying to get video-date using the tahoe --- youtube_dl/extractor/facebook.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index a85005caa..8aaf2a9a5 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -362,6 +362,15 @@ class FacebookIE(InfoExtractor): video_data = extract_from_jsmods_instances(server_js_data) tahoe_data = FacebookTahoeData(self, webpage, video_id) + if not video_data: + tahoe_js_data = self._parse_json( + self._search_regex( + r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data.primary, + 'tahoe js data', default='{}'), + video_id, fatal=False) + + video_data = extract_from_jsmods_instances(tahoe_js_data) + if not video_data: if not fatal_if_no_video: return webpage, False @@ -372,15 +381,6 @@ class FacebookIE(InfoExtractor): expected=True) elif '>You must log in to continue' in webpage: self.raise_login_required() - # Video info not in first request, do a secondary request using - # tahoe player specific URL - tahoe_js_data = self._parse_json( - self._search_regex( - r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data.primary, - 'tahoe js data', default='{}'), - video_id, fatal=False) - - video_data = extract_from_jsmods_instances(tahoe_js_data) if not video_data : if self._search_regex(r'newsFeedStream.*?

(.*?)<\/span><\/h1>', webpage, "video_title") is not None: From 57c28f98b34723fbd7cdbed76818801c663576dc Mon Sep 17 00:00:00 2001 From: shiran Date: Wed, 22 Apr 2020 13:22:47 -0700 Subject: [PATCH 02/10] Added regex options for view count and uploader id --- youtube_dl/extractor/facebook.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 8aaf2a9a5..7ead5e58f 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -463,10 +463,10 @@ class FacebookIE(InfoExtractor): uploader_id = self._search_regex( r'ownerid:"([\d]+)', webpage, 'uploader_id', default=None) or self._search_regex( - r'[\'\"]ownerid[\'\"]\s*:\s*[\'\"](\d+)[\'\"]', tahoe_data.secondary, + r'[\'\"]ownerid[\'\"]\s*:\s*[\'\"](\d+)[\'\"]',tahoe_data.secondary, 'uploader_id', default=None) or \ - self._search_regex(r'\\\"page_id\\\"\s*:\s*\\\"(\d+)\\\"', tahoe_data.secondary, 'uploader_id', fatal=False) - + self._search_regex(r'\\\"page_id\\\"\s*:\s*\\\"(\d+)\\\"', tahoe_data.secondary, 'uploader_id', fatal=False) or \ + self._search_regex(r'content_owner_id_new\\":\\"(\d+)\\"', tahoe_data.secondary, 'uploader_id', fatal=False) thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage) if is_live: @@ -579,6 +579,10 @@ class FacebookIE(InfoExtractor): if values: return values[-1] + values = re.findall(r'seen_by_count":\"(\d+)\"', tahoe_data.secondary) + if values: + return values[-1] + def _real_extract(self, url): video_id = self._match_id(url) From db400f388e2498fa8861953a591572e97066eefb Mon Sep 17 00:00:00 2001 From: bhodaya Date: Tue, 5 May 2020 13:06:08 +0300 Subject: [PATCH 03/10] download video --- test/ci/test_tiktok.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/ci/test_tiktok.py b/test/ci/test_tiktok.py index 0aad8e420..3f248ac56 100644 --- a/test/ci/test_tiktok.py +++ b/test/ci/test_tiktok.py @@ -33,9 +33,9 @@ class TikTokTestYoutubeDl(unittest.TestCase): params = {} ydl = youtube_dl.YoutubeDL(params) info = ydl.extract_info(url, download=True) - self.assertTrue(os.path.exists("Imagine lebron freaking out over something you did! #foryou #ballislife #lebron #nba-6783617809113943301.mp.4")) - os.remove("Imagine lebron freaking out over something you did! #foryou #ballislife #lebron #nba-6783617809113943301.mp.4") - + file_name="Imagine lebron freaking out over something you did! #foryou #ballislife #lebron #nba-6783617809113943301.mp.4" + self.assertTrue(os.path.exists(file_name)) + os.remove(file_name) if __name__ == '__main__': From d9de9d55954dbca43ec69ebc8df4729f37c05d9d Mon Sep 17 00:00:00 2001 From: bhodaya Date: Tue, 5 May 2020 14:16:48 +0300 Subject: [PATCH 04/10] download video --- test/ci/test_tiktok.py | 4 ++-- youtube_dl/extractor/tiktok.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/ci/test_tiktok.py b/test/ci/test_tiktok.py index 3f248ac56..cda730059 100644 --- a/test/ci/test_tiktok.py +++ b/test/ci/test_tiktok.py @@ -11,7 +11,7 @@ class TikTokTestYoutubeDl(unittest.TestCase): ydl = youtube_dl.YoutubeDL(params) info = ydl.extract_info(url, download=False) self.assertEquals(info['id'], '6807126376001441030') - self.assertEquals(info['url'], 'https://www.tiktok.com/@oriangaon/video/6807126376001441030') + self.assertEquals(info['webpage_url'], 'https://www.tiktok.com/@oriangaon/video/6807126376001441030') self.assertEquals(info['title'], '#foryou #foyou Mmmmm....,,') self.assertEquals(info['uploader'], 'Oriangaon') self.assertEquals(info['timestamp'], 1584907616) @@ -25,7 +25,7 @@ class TikTokTestYoutubeDl(unittest.TestCase): self.assertGreaterEqual(info['share_count'], 109) self.assertGreaterEqual(info['comment_count'], 40) self.assertEquals(info['duration'], 10) - self.assertEquals(info['ext'], 'mp.4') + self.assertEquals(info['ext'], 'mp4') self.assertGreater(len(info['embed_code']),0) def test_download_video(self): diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index 023b40ab6..4c4377504 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -109,9 +109,9 @@ class TikTokIE(TikTokBaseIE): 'subtitles': subtitles, 'comment_count': comment_count, 'duration': duration, - 'ext':'mp.4', + 'ext':'mp4', 'embed_code': embed_code, - 'format': format + 'formats': format } return info_dict From e3f32c957b09112e1f8818a78d5296990d041572 Mon Sep 17 00:00:00 2001 From: bhodaya Date: Wed, 6 May 2020 12:32:25 +0300 Subject: [PATCH 05/10] download video ext fix --- youtube_dl/extractor/tiktok.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index 4c4377504..f18526ab8 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -21,8 +21,8 @@ class TikTokBaseIE(InfoExtractor): for format in format_urls: formats.append({ 'url': format, - 'ext': 'mp4', 'height': height, + 'ext': 'mp4', 'width': width, }) self._sort_formats(formats) @@ -109,7 +109,6 @@ class TikTokIE(TikTokBaseIE): 'subtitles': subtitles, 'comment_count': comment_count, 'duration': duration, - 'ext':'mp4', 'embed_code': embed_code, 'formats': format } From 9b21d732f1226bf3e00a9e686f6f6a920364ff9b Mon Sep 17 00:00:00 2001 From: bhodaya Date: Wed, 6 May 2020 12:59:15 +0300 Subject: [PATCH 06/10] download video ext fix --- test/ci/test_tiktok.py | 2 +- youtube_dl/extractor/tiktok.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/ci/test_tiktok.py b/test/ci/test_tiktok.py index cda730059..7fd7f8f93 100644 --- a/test/ci/test_tiktok.py +++ b/test/ci/test_tiktok.py @@ -33,7 +33,7 @@ class TikTokTestYoutubeDl(unittest.TestCase): params = {} ydl = youtube_dl.YoutubeDL(params) info = ydl.extract_info(url, download=True) - file_name="Imagine lebron freaking out over something you did! #foryou #ballislife #lebron #nba-6783617809113943301.mp.4" + file_name="Imagine lebron freaking out over something you did! #foryou #ballislife #lebron #nba-6783617809113943301.mp4" self.assertTrue(os.path.exists(file_name)) os.remove(file_name) diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index f18526ab8..0dbf230ac 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -22,8 +22,8 @@ class TikTokBaseIE(InfoExtractor): formats.append({ 'url': format, 'height': height, - 'ext': 'mp4', 'width': width, + 'ext': 'mp4', }) self._sort_formats(formats) return { From 3d42ba1a267ce8d732e18f788d7c52f1728b6f75 Mon Sep 17 00:00:00 2001 From: vshiran <64166845+vshiran@users.noreply.github.com> Date: Thu, 14 May 2020 23:17:54 -0700 Subject: [PATCH 07/10] Added broadcaster id retrieval for twitch clip (#313) --- youtube_dl/extractor/twitch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index df4c39f1c..279fa3f9b 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -705,6 +705,7 @@ class TwitchClipsIE(TwitchBaseIE): clip(slug: "%s") { broadcaster { displayName + id } createdAt curator { @@ -774,6 +775,7 @@ class TwitchClipsIE(TwitchBaseIE): 'timestamp': unified_timestamp(clip.get('createdAt')), 'thumbnails': thumbnails, 'creator': try_get(clip, lambda x: x['broadcaster']['displayName'], compat_str), + 'creator_id': try_get(clip, lambda x: x['broadcaster']['id'], compat_str), 'uploader': try_get(clip, lambda x: x['curator']['displayName'], compat_str), 'uploader_id': try_get(clip, lambda x: x['curator']['id'], compat_str), } From 94482416de09b738f1b02cc500e192927e7b6d4c Mon Sep 17 00:00:00 2001 From: bhodaya Date: Wed, 20 May 2020 10:59:03 +0300 Subject: [PATCH 08/10] missing metadate fix --- test/ci/test_tiktok.py | 5 +++-- youtube_dl/extractor/tiktok.py | 14 +++++++++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/test/ci/test_tiktok.py b/test/ci/test_tiktok.py index 7fd7f8f93..3f75cb202 100644 --- a/test/ci/test_tiktok.py +++ b/test/ci/test_tiktok.py @@ -15,8 +15,7 @@ class TikTokTestYoutubeDl(unittest.TestCase): self.assertEquals(info['title'], '#foryou #foyou Mmmmm....,,') self.assertEquals(info['uploader'], 'Oriangaon') self.assertEquals(info['timestamp'], 1584907616) - self.assertEquals(info['thumbnail'], - 'https://p16-va-default.akamaized.net/obj/tos-maliva-p-0068/d1a8fbd3e42dda3a1baa01ee9edad289') + self.assertTrue(info['thumbnail']) self.assertGreaterEqual(info['view_count'], 79864) self.assertEquals(info['uploader_id'], '6772113344733955077') self.assertFalse(info['is_live']) @@ -27,6 +26,8 @@ class TikTokTestYoutubeDl(unittest.TestCase): self.assertEquals(info['duration'], 10) self.assertEquals(info['ext'], 'mp4') self.assertGreater(len(info['embed_code']),0) + self.assertGreaterEqual(info['author_followers'], 1357) + self.assertEqual(info['uploader_url'], "https://www.tiktok.com/@oriangaon") def test_download_video(self): url = 'https://www.tiktok.com/@ballislife/video/6783617809113943301' diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index 0dbf230ac..17aad8ef3 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -74,6 +74,8 @@ class TikTokIE(TikTokBaseIE): ast_le = ast.literal_eval(json_data_encode) data_dict = json.loads(ast_le) + author_followers = data_dict['props']['pageProps']['videoData']['authorStats']['followerCount'] + item_info = data_dict['props']['pageProps']['videoData']['itemInfos'] timestamp = int(item_info['createTime']) shares = item_info['shareCount'] @@ -82,17 +84,17 @@ class TikTokIE(TikTokBaseIE): provider_id = item_info['authorId'] comments_count = item_info['commentCount'] likes_count = item_info['diggCount'] - - entry=self._extract_aweme(data_dict) + author_url = json_api['author_url'] + entry = self._extract_aweme(data_dict) return self.info_dict(video_id, str(url), json_api['title'], json_api['author_name'], timestamp, json_api['thumbnail_url'], - views, provider_id, False, 'not_live', likes_count, shares, '', comments_count, duration, json_api['html'], entry['formats']) + views, provider_id, False, 'not_live', likes_count, shares, '', comments_count, duration, json_api['html'], entry['formats'], author_url, author_followers) def info_dict(self, video_id, url, video_title, uploader, timestamp, thumbnail, view_count, uploader_id, is_live, live_status - , likes_count, shares_count, subtitles, comment_count, duration, embed_code, format): + , likes_count, shares_count, subtitles, comment_count, duration, embed_code, format, author_url, author_followers): info_dict = { 'id': video_id, 'url': url, @@ -110,7 +112,9 @@ class TikTokIE(TikTokBaseIE): 'comment_count': comment_count, 'duration': duration, 'embed_code': embed_code, - 'formats': format + 'formats': format, + 'uploader_url': author_url, + 'author_followers': author_followers } return info_dict From d01fd34533ed6712944313f6ae98d61b5342f9da Mon Sep 17 00:00:00 2001 From: bhodaya Date: Wed, 20 May 2020 11:04:01 +0300 Subject: [PATCH 09/10] missing metadate fix --- test/ci/test_tiktok.py | 2 +- youtube_dl/extractor/tiktok.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/ci/test_tiktok.py b/test/ci/test_tiktok.py index 3f75cb202..eb8fa8b31 100644 --- a/test/ci/test_tiktok.py +++ b/test/ci/test_tiktok.py @@ -26,7 +26,7 @@ class TikTokTestYoutubeDl(unittest.TestCase): self.assertEquals(info['duration'], 10) self.assertEquals(info['ext'], 'mp4') self.assertGreater(len(info['embed_code']),0) - self.assertGreaterEqual(info['author_followers'], 1357) + self.assertGreaterEqual(info['uploader_like_count'], 1357) self.assertEqual(info['uploader_url'], "https://www.tiktok.com/@oriangaon") def test_download_video(self): diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index 17aad8ef3..aecadd05a 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -114,7 +114,7 @@ class TikTokIE(TikTokBaseIE): 'embed_code': embed_code, 'formats': format, 'uploader_url': author_url, - 'author_followers': author_followers + 'uploader_like_count': author_followers } return info_dict From 64b596f672c0d1f896e7d8b393b2cd64b0ef2160 Mon Sep 17 00:00:00 2001 From: bhodaya Date: Sun, 24 May 2020 18:48:34 +0300 Subject: [PATCH 10/10] missing metadate fix --- youtube_dl/extractor/facebook.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 7ead5e58f..6311f663c 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import datetime import re import socket @@ -25,10 +26,12 @@ from ..utils import ( try_get, urlencode_postdata, update_url_query, - lowercase_escape + lowercase_escape, + parse_iso8601 ) + class FacebookIE(InfoExtractor): _VALID_URL = r'''(?x) (?: @@ -451,14 +454,20 @@ class FacebookIE(InfoExtractor): self._search_regex(r'ownerName"\s*:\s*"([^"]+)"', webpage, 'uploader', default=None) or \ self._og_search_title(webpage, default=None) + if webpage.find('Paid Partnership'): + timestamp = self._search_regex( + r'datePublished":"(.+?)"', webpage, + 'timestamp', default=None) + timestamp = parse_iso8601(timestamp) + else: + timestamp = int_or_none(self._search_regex( + r'data-utime=\\\"(\d+)\\\"', tahoe_data.secondary, + 'timestamp', default=None) or self._search_regex( + r']+data-utime=["\'](\d+)', webpage, + 'timestamp', default=None)) or int_or_none(self._search_regex( + r'publish_time":([\d]+)', webpage, + 'timestamp', default=None)) - timestamp = int_or_none(self._search_regex( - r'data-utime=\\\"(\d+)\\\"', tahoe_data.secondary, - 'timestamp', default=None) or self._search_regex( - r']+data-utime=["\'](\d+)', webpage, - 'timestamp', default=None)) or int_or_none(self._search_regex( - r'publish_time":([\d]+)', webpage, - 'timestamp', default=None)) uploader_id = self._search_regex( r'ownerid:"([\d]+)', webpage, @@ -631,7 +640,6 @@ class FacebookIE(InfoExtractor): video_title = 'Facebook video #%s' % video_id return video_title - class FacebookTahoeData: def __init__(self, extractor, page, video_id): self._page = page