Skip to content

Commit

Permalink
[tiktok] add support (#3061 #4177 #5646 #6878 #6708)
Browse files Browse the repository at this point in the history
* Add TikTok photo support

#3061
#4177

* Address linting errors

* Fix more test failures

* Forgot to update category names in tests

* Looking into re issue

* Follow default yt-dlp output template

* Fix format string error on 3.5

* Support downloading videos and audio

Respond to comments
Improve archiving and file naming

* Forgot to update supportedsites.md

* Support user profiles

* Fix indentation

* Prevent matching with more than one TikTok extractor

* Fix TikTok regex

* Support TikTok profile avatars

* Fix supportedsites.md

* TikTok: Ignore no formats error

In my limited experience, this doesn't mean that gallery-dl can't download the photo post (but this could mean that you can't download the audio)

* Fix error reporting message

* TikTok: Support more URL formats

vt.tiktok.com
www.tiktok.com/t/

* TikTok: Only download avatar when extracting user profile

* TikTok: Document profile avatar limitation

* TikTok: Add support for www.tiktokv.com/share links

* Address Share -> Sharepost issue

* TikTok: Export post's creation date in JSON (ISO 8601)

* [tiktok] update

* [tiktok] update 'vmpost' handling

just perform a HEAD request and handle its response

* [tiktok] build URLs from post IDs

instead of reusing unchanged input URLs

* [tiktok] combine 'post' and 'sharepost' extractors

* [tiktok] update default filenames

put 'id' and 'num' first to ensure better file order

* [tiktok] improve ytdl usage

- speed up extraction by passing '"extract_flat": True'
- pass more user options and cookies
- pre-define 'TikTokUser' extractor usage

* [tiktok] Add _COOKIES entry to AUTH_MAP

* [tiktok] Always download user avatars

* [tiktok] Add more documentation to supportedsites.md

* [tiktok] Address review comments

---------

Co-authored-by: Mike Fährmann <[email protected]>
  • Loading branch information
CasualYT31 and mikf authored Feb 25, 2025
1 parent a9853cd commit daac2c6
Show file tree
Hide file tree
Showing 5 changed files with 521 additions and 1 deletion.
11 changes: 11 additions & 0 deletions docs/supportedsites.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Supported Sites

<!-- auto-generated by scripts/supportedsites.py -->

Consider all listed sites to potentially be NSFW.

<table>
Expand Down Expand Up @@ -937,6 +938,16 @@ Consider all listed sites to potentially be NSFW.
<td>Galleries</td>
<td></td>
</tr>
<tr>
<td>TikTok</td>
<td>https://www.tiktok.com/</td>
<td>Posts, User Profiles, VM Posts</td>
<td><a href="https://github.com/mikf/gallery-dl#cookies">Cookies</a>
<br /><span title="Pass `-o videos` to download photos only, and
`-o tiktok-range=a-yt-dlp-range` to tell yt-dlp to only extract XYZ links
when scraping a user profile. Also note that profile avatars will only be
downloaded when downloading a User Profile.">Hover for more info</span></td>
</tr>
<tr>
<td>TMOHentai</td>
<td>https://tmohentai.com/</td>
Expand Down
1 change: 1 addition & 0 deletions gallery_dl/extractor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@
"tapas",
"tcbscans",
"telegraph",
"tiktok",
"tmohentai",
"toyhouse",
"tsumino",
Expand Down
242 changes: 242 additions & 0 deletions gallery_dl/extractor/tiktok.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://www.tiktok.com/"""

from .common import Extractor, Message
from .. import text, util, ytdl, exception

# Shared URL prefix for the post and user extractors: an optional scheme and
# optional "www." followed by either "tiktok.com" or "tiktokv.com".
BASE_PATTERN = r"(?:https?://)?(?:www\.)?tiktokv?\.com"


class TiktokExtractor(Extractor):
    """Base class for TikTok extractors

    items() iterates over the post URLs returned by self.urls(), which is
    not defined here and has to be supplied by a subclass, and yields
    Directory/Url messages for each post's images and/or video.
    """
    category = "tiktok"
    directory_fmt = ("{category}", "{user}")
    filename_fmt = (
        "{id}{num:?_//>02} {title[b:150]}{img_id:? [/]/}.{extension}")
    archive_fmt = "{id}_{num}_{img_id}"
    root = "https://www.tiktok.com"
    cookies_domain = ".tiktok.com"

    def avatar(self):
        """Return the name of the user whose avatar should be downloaded

        A falsy return value (the default) disables avatar downloads.
        """
        return ""

    def items(self):
        """Yield Directory and Url messages for every URL from urls()

        Photo posts yield one Url message per image. When the 'videos'
        option is enabled (the default), every post additionally yields
        a 'ytdl:' URL, for photo posts as well as video posts.
        Posts whose status code signals an error are logged and skipped.
        """
        videos = self.config("videos", True)
        # We assume that all of the URLs served by urls() come from the same
        # author.
        downloaded_avatar = not self.avatar()

        for tiktok_url in self.urls():
            tiktok_url = self._sanitize_url(tiktok_url)
            data = self._extract_rehydration_data(tiktok_url)
            if "webapp.video-detail" not in data:
                # Only /video/ links result in the video-detail dict we need.
                # Try again using that form of link.
                tiktok_url = self._sanitize_url(
                    data["seo.abtest"]["canonical"])
                data = self._extract_rehydration_data(tiktok_url)
            video_detail = data["webapp.video-detail"]

            if not self._check_status_code(video_detail, tiktok_url):
                continue

            post = video_detail["itemInfo"]["itemStruct"]
            author = post["author"]
            post["user"] = user = author["uniqueId"]
            post["date"] = text.parse_timestamp(post["createTime"])
            # Remember the raw description so that the video branch below
            # can tell whether 'title' is only a generated placeholder.
            original_title = title = post["desc"]
            if not title:
                title = "TikTok photo #{}".format(post["id"])

            if not downloaded_avatar:
                avatar_url = author["avatarLarger"]
                avatar = self._generate_avatar(
                    avatar_url, post, user, author["id"])
                yield Message.Directory, avatar
                yield Message.Url, avatar_url, avatar
                downloaded_avatar = True

            yield Message.Directory, post
            if "imagePost" in post:
                img_list = post["imagePost"]["images"]
                for i, img in enumerate(img_list, 1):
                    url = img["imageURL"]["urlList"][0]
                    text.nameext_from_url(url, post)
                    post.update({
                        "type"  : "image",
                        "image" : img,
                        "title" : title,
                        "num"   : i,
                        "img_id": post["filename"].partition("~")[0],
                        "width" : img["imageWidth"],
                        "height": img["imageHeight"],
                    })
                    yield Message.Url, url, post

            elif videos:
                if not original_title:
                    title = "TikTok video #{}".format(post["id"])

            else:
                self.log.info("%s: Skipping post", tiktok_url)

            if videos:
                # Runs for photo posts too, handing the post URL to ytdl
                # after the images have been yielded.
                post.update({
                    "type"      : "video",
                    "image"     : None,
                    "filename"  : "",
                    "extension" : "mp4",
                    "title"     : title,
                    "num"       : 0,
                    "img_id"    : "",
                    "width"     : 0,
                    "height"    : 0,
                })
                yield Message.Url, "ytdl:" + tiktok_url, post

        # If we couldn't download the avatar because the given user has no
        # posts, we'll need to make a separate request for the user's page
        # and download the avatar that way.
        if not downloaded_avatar:
            user_name = self.avatar()
            profile_url = "{}/@{}".format(self.root, user_name)
            data = self._extract_rehydration_data(profile_url)
            data = data["webapp.user-detail"]["userInfo"]["user"]
            data["user"] = user_name
            avatar_url = data["avatarLarger"]
            avatar = self._generate_avatar(
                avatar_url, data, user_name, data["id"])
            yield Message.Directory, avatar
            yield Message.Url, avatar_url, avatar

    def _generate_avatar(self, avatar_url, data, user_name, user_id):
        """Build the metadata dict for a profile avatar download

        Works on a copy of 'data', so the caller's dict is not modified.
        'img_id' is the part of the 'filename' entry before the first '~'
        (the entry is expected to be set by text.nameext_from_url()).
        """
        avatar = text.nameext_from_url(avatar_url, data.copy())
        avatar.update({
            "type"  : "avatar",
            "title" : "@" + user_name,
            "id"    : user_id,
            "img_id": avatar["filename"].partition("~")[0],
            "num"   : 0,
        })
        return avatar

    def _sanitize_url(self, url):
        """Rewrite the first '/photo/' in 'url' to '/video/' and make sure
        the result has an HTTP scheme"""
        return text.ensure_http_scheme(url.replace("/photo/", "/video/", 1))

    def _extract_rehydration_data(self, url):
        """Request 'url' and return the '__DEFAULT_SCOPE__' object from the
        JSON embedded in its '__UNIVERSAL_DATA_FOR_REHYDRATION__' script"""
        html = self.request(url).text
        data = text.extr(
            html, '<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" '
            'type="application/json">', '</script>')
        return util.json_loads(data)["__DEFAULT_SCOPE__"]

    def _check_status_code(self, detail, url):
        """Return True if 'detail' carries no (or a falsy) 'statusCode'

        Otherwise log an error message for the given status code and
        return False.
        """
        status = detail.get("statusCode")
        if not status:
            return True

        if status == 10222:
            self.log.error("%s: Login required to access this post", url)
        elif status == 10204:
            self.log.error("%s: Requested post not available", url)
        elif status == 10231:
            # The two literals are concatenated by the parser; the trailing
            # space keeps "a" and "VPN/proxy" apart in the final message.
            self.log.error("%s: Region locked - Try downloading with a "
                           "VPN/proxy connection", url)
        else:
            self.log.error(
                "%s: Received unknown error code %s ('%s')",
                url, status, detail.get("statusMsg") or "")
        return False


class TiktokPostExtractor(TiktokExtractor):
    """Extractor for a single TikTok video or photo post"""
    subcategory = "post"
    pattern = BASE_PATTERN + r"/(?:@([\w_.-]*)|share)/(?:phot|vide)o/(\d+)"
    example = "https://www.tiktok.com/@USER/photo/1234567890"

    def urls(self):
        """Return a 1-tuple holding the '/video/' URL built from the
        matched username and post ID"""
        username, post_id = self.groups
        # The username group does not take part in the match for '/share/'
        # links; fall back to an empty name in that case.
        if not username:
            username = ""
        return ("{}/@{}/video/{}".format(self.root, username, post_id),)


class TiktokVmpostExtractor(TiktokExtractor):
    """Extractor for shortened TikTok links (vm./vt. hosts and /t/ paths)"""
    subcategory = "vmpost"
    pattern = (r"(?:https?://)?(?:"
               r"(?:v[mt]\.)?tiktok\.com|(?:www\.)?tiktok\.com/t"
               r")/(?!@)([^/?#]+)")
    example = "https://vm.tiktok.com/1a2B3c4E5"

    def items(self):
        """Send a HEAD request for the short link and queue the URL from
        its 'Location' header for TiktokPostExtractor"""
        short_url = text.ensure_http_scheme(self.url)
        response = self.request(
            short_url,
            headers={"User-Agent": "facebookexternalhit/1.1"},
            method="HEAD",
            allow_redirects=False,
            notfound="post")

        target = response.headers.get("Location")
        # No redirect, or one no longer than the 28 characters of
        # https://www.tiktok.com/?_r=1, is reported as a missing post.
        if not target or len(target) <= 28:
            raise exception.NotFoundError("post")

        # Strip the query string before handing the URL on.
        post_url = target.partition("?")[0]
        yield Message.Queue, post_url, {"_extractor": TiktokPostExtractor}


class TiktokUserExtractor(TiktokExtractor):
    """Extractor for the posts of a TikTok user profile"""
    subcategory = "user"
    pattern = BASE_PATTERN + r"/@([\w_.-]+)/?(?:$|\?|#)"
    example = "https://www.tiktok.com/@USER"

    def avatar(self):
        """Return the username matched from the profile URL"""
        return self.groups[0]

    def urls(self):
        """Return the post URLs that yt-dlp/youtube-dl extracts from the
        user's profile page

        Raises exception.ExtractionError if the ytdl module cannot be
        imported.
        """
        try:
            module = ytdl.import_module(self.config("module"))
        except (ImportError, SyntaxError) as exc:
            self.log.error("Cannot import module '%s'",
                           getattr(exc, "name", ""))
            self.log.debug("", exc_info=exc)
            raise exception.ExtractionError("yt-dlp or youtube-dl is required "
                                            "for this feature!")

        # Options taken from this extractor's own settings; the
        # 'tiktok-range' option is handed to ytdl as 'playlist_items'.
        user_opts = {
            "retries"           : self._retries,
            "socket_timeout"    : self._timeout,
            "nocheckcertificate": not self._verify,
            "playlist_items"    : str(self.config("tiktok-range", "")),
        }
        if self._proxies:
            user_opts["proxy"] = self._proxies.get("http")

        # Fixed options that are passed regardless of user configuration.
        extr_opts = {
            "extract_flat"           : True,
            "ignore_no_formats_error": True,
        }

        ytdl_instance = ytdl.construct_YoutubeDL(
            module, self, user_opts, extr_opts)

        # Copy this extractor's cookies into the ytdl cookie jar.
        if self.cookies:
            jar = ytdl_instance.cookiejar
            for cookie in self.cookies:
                jar.set_cookie(cookie)

        with ytdl_instance as ydl:
            # Name-mangled access to the private __extract_info() method,
            # called with the "TikTokUser" info extractor selected up front.
            info_extractor = ydl.get_info_extractor("TikTokUser")
            profile = ydl._YoutubeDL__extract_info(
                "{}/@{}".format(self.root, self.groups[0]),
                info_extractor,
                False, {}, True)
            # This should include video and photo posts in /video/ URL form.
            return [entry["url"] for entry in profile["entries"]]
12 changes: 11 additions & 1 deletion scripts/supportedsites.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,10 +145,11 @@
"tbib" : "The Big ImageBoard",
"tcbscans" : "TCB Scans",
"tco" : "Twitter t.co",
"tmohentai" : "TMOHentai",
"thatpervert" : "ThatPervert",
"thebarchive" : "The /b/ Archive",
"thecollection" : "The /co/llection",
"tiktok" : "TikTok",
"tmohentai" : "TMOHentai",
"tumblrgallery" : "TumblrGallery",
"vanillarock" : "もえぴりあ",
"vidyart2" : "/v/idyart2",
Expand Down Expand Up @@ -339,6 +340,9 @@
"steamgriddb": {
"asset": "Individual Assets",
},
"tiktok": {
"vmpost": "VM Posts",
},
"tumblr": {
"day": "Days",
},
Expand Down Expand Up @@ -408,6 +412,11 @@
'#extractor-wallhaven-api-key">API Key</a>')
_APIKEY_WY = ('<a href="https://gdl-org.github.io/docs/configuration.html'
'#extractor-weasyl-api-key">API Key</a>')
# HTML snippet with a hover tooltip describing the TikTok extractor's options;
# appended to the cookies link for the "tiktok" entry in AUTH_MAP.
_TIKTOK_MORE_INFO = '''
<br /><span title="Pass `-o videos` to download photos only, and
`-o tiktok-range=a-yt-dlp-range` to tell yt-dlp to only extract XYZ links
when scraping a user profile. Also note that profile avatars will only be
downloaded when downloading a User Profile.">Hover for more info</span>'''

AUTH_MAP = {
"aibooru" : "Supported",
Expand Down Expand Up @@ -457,6 +466,7 @@
"smugmug" : _OAUTH,
"subscribestar" : "Supported",
"tapas" : "Supported",
"tiktok" : _COOKIES + _TIKTOK_MORE_INFO,
"tsumino" : "Supported",
"tumblr" : _OAUTH,
"twitter" : "Supported",
Expand Down
Loading

0 comments on commit daac2c6

Please sign in to comment.