[tiktok] add support (#3061 #4177 #5646 #6878 #6708)

* Add TikTok photo support #3061 #4177 * Address linting errors * Fix more test failures * Forgot to update category names in tests * Looking into re issue * Follow default yt-dlp output template * Fix format string error on 3.5 * Support downloading videos and audio Respond to comments Improve archiving and file naming * Forgot to update supportedsites.md * Support user profiles * Fix indentation * Prevent matching with more than one TikTok extractor * Fix TikTok regex * Support TikTok profile avatars * Fix supportedsites.md * TikTok: Ignore no formats error In my limited experience, this doesn't mean that gallery-dl can't download the photo post (but this could mean that you can't download the audio) * Fix error reporting message * TikTok: Support more URL formats vt.tiktok.com www.tiktok.com/t/ * TikTok: Only download avatar when extracting user profile * TikTok: Document profile avatar limitation * TikTok: Add support for www.tiktokv.com/share links * Address Share -> Sharepost issue * TikTok: Export post's creation date in JSON (ISO 8601) * [tiktok] update * [tiktok] update 'vmpost' handling just perform a HEAD request and handle its response * [tiktok] build URLs from post IDs instead of reusing unchanged input URLs * [tiktok] combine 'post' and 'sharepost' extractors * [tiktok] update default filenames put 'id' and 'num' first to ensure better file order * [tiktok] improve ytdl usage - speed up extraction by passing '"extract_flat": True' - pass more user options and cookies - pre-define 'TikTokUser' extractor usage * [tiktok] Add _COOKIES entry to AUTH_MAP * [tiktok] Always download user avatars * [tiktok] Add more documentation to supportedsites.md * [tiktok] Address review comments --------- Co-authored-by: Mike Fährmann <[email protected]>
mikf · Feb 25, 2025 · daac2c6 · daac2c6
1 parent a9853cd
commit daac2c6
Show file tree

Hide file tree

Showing 5 changed files with 521 additions and 1 deletion.
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
@@ -1,6 +1,7 @@
 # Supported Sites
 
 <!-- auto-generated by scripts/supportedsites.py -->
+
 Consider all listed sites to potentially be NSFW.
 
 <table>
@@ -937,6 +938,16 @@ Consider all listed sites to potentially be NSFW.
     <td>Galleries</td>
     <td></td>
 </tr>
+<tr>
+    <td>TikTok</td>
+    <td>https://www.tiktok.com/</td>
+    <td>Posts, User Profiles, VM Posts</td>
+    <td><a href="https://github.com/mikf/gallery-dl#cookies">Cookies</a>
+    <br /><span title="Pass `-o videos` to download photos only, and
+    `-o tiktok-range=a-yt-dlp-range` to tell yt-dlp to only extract XYZ links
+    when scraping a user profile. Also note that profile avatars will only be
+    downloaded when downloading a User Profile.">Hover for more info</span></td>
+</tr>
 <tr>
     <td>TMOHentai</td>
     <td>https://tmohentai.com/</td>

diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
@@ -171,6 +171,7 @@
     "tapas",
     "tcbscans",
     "telegraph",
+    "tiktok",
     "tmohentai",
     "toyhouse",
     "tsumino",

diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py
@@ -0,0 +1,242 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.tiktok.com/"""
+
+from .common import Extractor, Message
+from .. import text, util, ytdl, exception
+
+# Scheme-optional URL prefix matching the "tiktok.com" and "tiktokv.com"
+# hosts, with or without a leading "www."; the extractor patterns in this
+# module append their path expressions to it.
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?tiktokv?\.com"
+
+
+class TiktokExtractor(Extractor):
+    """Base class for TikTok extractors"""
+    category = "tiktok"
+    directory_fmt = ("{category}", "{user}")
+    filename_fmt = (
+        "{id}{num:?_//>02} {title[b:150]}{img_id:? [/]/}.{extension}")
+    archive_fmt = "{id}_{num}_{img_id}"
+    root = "https://www.tiktok.com"
+    cookies_domain = ".tiktok.com"
+
+    def avatar(self):
+        return ""
+
+    def items(self):
+        videos = self.config("videos", True)
+        # We assume that all of the URLs served by urls() come from the same
+        # author.
+        downloaded_avatar = not self.avatar()
+
+        for tiktok_url in self.urls():
+            tiktok_url = self._sanitize_url(tiktok_url)
+            data = self._extract_rehydration_data(tiktok_url)
+            if "webapp.video-detail" not in data:
+                # Only /video/ links result in the video-detail dict we need.
+                # Try again using that form of link.
+                tiktok_url = self._sanitize_url(
+                    data["seo.abtest"]["canonical"])
+                data = self._extract_rehydration_data(tiktok_url)
+            video_detail = data["webapp.video-detail"]
+
+            if not self._check_status_code(video_detail, tiktok_url):
+                continue
+
+            post = video_detail["itemInfo"]["itemStruct"]
+            author = post["author"]
+            post["user"] = user = author["uniqueId"]
+            post["date"] = text.parse_timestamp(post["createTime"])
+            original_title = title = post["desc"]
+            if not title:
+                title = "TikTok photo #{}".format(post["id"])
+
+            if not downloaded_avatar:
+                avatar_url = author["avatarLarger"]
+                avatar = self._generate_avatar(
+                    avatar_url, post, user, author["id"])
+                yield Message.Directory, avatar
+                yield Message.Url, avatar_url, avatar
+                downloaded_avatar = True
+
+            yield Message.Directory, post
+            if "imagePost" in post:
+                img_list = post["imagePost"]["images"]
+                for i, img in enumerate(img_list, 1):
+                    url = img["imageURL"]["urlList"][0]
+                    text.nameext_from_url(url, post)
+                    post.update({
+                        "type"  : "image",
+                        "image" : img,
+                        "title" : title,
+                        "num"   : i,
+                        "img_id": post["filename"].partition("~")[0],
+                        "width" : img["imageWidth"],
+                        "height": img["imageHeight"],
+                    })
+                    yield Message.Url, url, post
+
+            elif videos:
+                if not original_title:
+                    title = "TikTok video #{}".format(post["id"])
+
+            else:
+                self.log.info("%s: Skipping post", tiktok_url)
+
+            if videos:
+                post.update({
+                    "type"      : "video",
+                    "image"     : None,
+                    "filename"  : "",
+                    "extension" : "mp4",
+                    "title"     : title,
+                    "num"       : 0,
+                    "img_id"    : "",
+                    "width"     : 0,
+                    "height"    : 0,
+                })
+                yield Message.Url, "ytdl:" + tiktok_url, post
+
+        # If we couldn't download the avatar because the given user has no
+        # posts, we'll need to make a separate request for the user's page
+        # and download the avatar that way.
+        if not downloaded_avatar:
+            user_name = self.avatar()
+            profile_url = "https://www.tiktok.com/@{}".format(user_name)
+            data = self._extract_rehydration_data(profile_url)
+            data = data["webapp.user-detail"]["userInfo"]["user"]
+            data["user"] = user_name
+            avatar_url = data["avatarLarger"]
+            avatar = self._generate_avatar(
+                avatar_url, data, user_name, data["id"])
+            yield Message.Directory, avatar
+            yield Message.Url, avatar_url, avatar
+
+    def _generate_avatar(self, avatar_url, data, user_name, user_id):
+        avatar = text.nameext_from_url(avatar_url, data.copy())
+        avatar.update({
+            "type"  : "avatar",
+            "title" : "@" + user_name,
+            "id"    : user_id,
+            "img_id": avatar["filename"].partition("~")[0],
+            "num"   : 0,
+        })
+        return avatar
+
+    def _sanitize_url(self, url):
+        return text.ensure_http_scheme(url.replace("/photo/", "/video/", 1))
+
+    def _extract_rehydration_data(self, url):
+        html = self.request(url).text
+        data = text.extr(
+            html, '<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" '
+            'type="application/json">', '</script>')
+        return util.json_loads(data)["__DEFAULT_SCOPE__"]
+
+    def _check_status_code(self, detail, url):
+        status = detail.get("statusCode")
+        if not status:
+            return True
+
+        if status == 10222:
+            self.log.error("%s: Login required to access this post", url)
+        elif status == 10204:
+            self.log.error("%s: Requested post not available", url)
+        elif status == 10231:
+            self.log.error("%s: Region locked - Try downloading with a"
+                           "VPN/proxy connection", url)
+        else:
+            self.log.error(
+                "%s: Received unknown error code %s ('%s')",
+                url, status, detail.get("statusMsg") or "")
+        return False
+
+
+class TiktokPostExtractor(TiktokExtractor):
+    """Extract a single video or photo TikTok link"""
+    subcategory = "post"
+    pattern = BASE_PATTERN + r"/(?:@([\w_.-]*)|share)/(?:phot|vide)o/(\d+)"
+    example = "https://www.tiktok.com/@USER/photo/1234567890"
+
+    def urls(self):
+        user, post_id = self.groups
+        url = "{}/@{}/video/{}".format(self.root, user or "", post_id)
+        return (url,)
+
+
+class TiktokVmpostExtractor(TiktokExtractor):
+    """Extract a single video or photo TikTok VM link"""
+    subcategory = "vmpost"
+    pattern = (r"(?:https?://)?(?:"
+               r"(?:v[mt]\.)?tiktok\.com|(?:www\.)?tiktok\.com/t"
+               r")/(?!@)([^/?#]+)")
+    example = "https://vm.tiktok.com/1a2B3c4E5"
+
+    def items(self):
+        url = text.ensure_http_scheme(self.url)
+        headers = {"User-Agent": "facebookexternalhit/1.1"}
+
+        response = self.request(url, headers=headers, method="HEAD",
+                                allow_redirects=False, notfound="post")
+
+        url = response.headers.get("Location")
+        if not url or len(url) <= 28:
+            # https://www.tiktok.com/?_r=1
+            raise exception.NotFoundError("post")
+
+        data = {"_extractor": TiktokPostExtractor}
+        yield Message.Queue, url.partition("?")[0], data
+
+
+class TiktokUserExtractor(TiktokExtractor):
+    """Extract a TikTok user's profile"""
+    subcategory = "user"
+    pattern = BASE_PATTERN + r"/@([\w_.-]+)/?(?:$|\?|#)"
+    example = "https://www.tiktok.com/@USER"
+
+    def urls(self):
+        """Attempt to use yt-dlp/youtube-dl to extract links from a
+        user's page"""
+
+        try:
+            module = ytdl.import_module(self.config("module"))
+        except (ImportError, SyntaxError) as exc:
+            self.log.error("Cannot import module '%s'",
+                           getattr(exc, "name", ""))
+            self.log.debug("", exc_info=exc)
+            raise exception.ExtractionError("yt-dlp or youtube-dl is required "
+                                            "for this feature!")
+        extr_opts = {
+            "extract_flat"           : True,
+            "ignore_no_formats_error": True,
+        }
+        user_opts = {
+            "retries"                : self._retries,
+            "socket_timeout"         : self._timeout,
+            "nocheckcertificate"     : not self._verify,
+            "playlist_items"         : str(self.config("tiktok-range", "")),
+        }
+        if self._proxies:
+            user_opts["proxy"] = self._proxies.get("http")
+
+        ytdl_instance = ytdl.construct_YoutubeDL(
+            module, self, user_opts, extr_opts)
+
+        # transfer cookies to ytdl
+        if self.cookies:
+            set_cookie = ytdl_instance.cookiejar.set_cookie
+            for cookie in self.cookies:
+                set_cookie(cookie)
+
+        with ytdl_instance as ydl:
+            info_dict = ydl._YoutubeDL__extract_info(
+                "{}/@{}".format(self.root, self.groups[0]),
+                ydl.get_info_extractor("TikTokUser"),
+                False, {}, True)
+            # This should include video and photo posts in /video/ URL form.
+            return [video["url"] for video in info_dict["entries"]]
+
+    def avatar(self):
+        return self.groups[0]
diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
@@ -145,10 +145,11 @@
     "tbib"           : "The Big ImageBoard",
     "tcbscans"       : "TCB Scans",
     "tco"            : "Twitter t.co",
-    "tmohentai"      : "TMOHentai",
     "thatpervert"    : "ThatPervert",
     "thebarchive"    : "The /b/ Archive",
     "thecollection"  : "The /co/llection",
+    "tiktok"         : "TikTok",
+    "tmohentai"      : "TMOHentai",
     "tumblrgallery"  : "TumblrGallery",
     "vanillarock"    : "もえぴりあ",
     "vidyart2"       : "/v/idyart2",
@@ -339,6 +340,9 @@
     "steamgriddb": {
         "asset": "Individual Assets",
     },
+    "tiktok": {
+        "vmpost": "VM Posts",
+    },
     "tumblr": {
         "day": "Days",
     },
@@ -408,6 +412,11 @@
               '#extractor-wallhaven-api-key">API Key</a>')
 _APIKEY_WY = ('<a href="https://gdl-org.github.io/docs/configuration.html'
               '#extractor-weasyl-api-key">API Key</a>')
+# HTML snippet appended to the TikTok entry in AUTH_MAP: a line break
+# followed by a <span> whose 'title' attribute carries the usage notes.
+_TIKTOK_MORE_INFO = '''
+    <br /><span title="Pass `-o videos` to download photos only, and
+    `-o tiktok-range=a-yt-dlp-range` to tell yt-dlp to only extract XYZ links
+    when scraping a user profile. Also note that profile avatars will only be
+    downloaded when downloading a User Profile.">Hover for more info</span>'''
 
 AUTH_MAP = {
     "aibooru"        : "Supported",
@@ -457,6 +466,7 @@
     "smugmug"        : _OAUTH,
     "subscribestar"  : "Supported",
     "tapas"          : "Supported",
+    "tiktok"         : _COOKIES + _TIKTOK_MORE_INFO,
     "tsumino"        : "Supported",
     "tumblr"         : _OAUTH,
     "twitter"        : "Supported",