Skip to content

Commit

Permalink
[tumblrgallery] improve 'id' extraction (#2115)
Browse files Browse the repository at this point in the history
  • Loading branch information
mikf committed Dec 19, 2021
1 parent ac80474 commit 0bd7607
Showing 1 changed file with 41 additions and 51 deletions.
92 changes: 41 additions & 51 deletions gallery_dl/extractor/tumblrgallery.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,20 @@ class TumblrgalleryExtractor(GalleryExtractor):
directory_fmt = ("{category}", "{gallery_id} {title}")
root = "https://tumblrgallery.xyz"

@staticmethod
def _urls_from_page(page):
    """Yield the link targets of all 'report' anchors found in 'page'.

    Each result is the text between the opening marker
    '<div class="report"> <a class="xx-co-me" href="' and the next
    double quote, as produced by text.extract_iter().
    """
    begin = '<div class="report"> <a class="xx-co-me" href="'
    end = '"'
    return text.extract_iter(page, begin, end)

@staticmethod
def _data_from_url(url):
    """Return a metadata dict with the 'id' derived from a file URL.

    The "filename" value reported by text.nameext_from_url() is split
    on underscores. The second segment is used as the ID, unless it is
    the literal "inline", in which case the third segment is used.
    When the needed segment does not exist, the whole filename is the ID.
    """
    stem = text.nameext_from_url(url)["filename"]
    pieces = stem.split("_")
    try:
        ident = pieces[1]
        if ident == "inline":
            # e.g. 'tumblr_inline_<id>_...': the ID follows the marker
            ident = pieces[2]
    except IndexError:
        # too few '_'-separated segments; fall back to the full name
        ident = stem
    return {"id": ident}


class TumblrgalleryTumblrblogExtractor(TumblrgalleryExtractor):
"""Extractor for Tumblrblog on tumblrgallery.xyz"""
Expand All @@ -39,25 +53,17 @@ def metadata(self, page):
def images(self, _):
page_num = 1
while True:
response = self.request(
"{}/tumblrblog/gallery/{}/{}.html"
.format(self.root, self.gallery_id, page_num),
allow_redirects=False, fatal=False,
)
if response.status_code != 200:
url = "{}/tumblrblog/gallery/{}/{}.html".format(
self.root, self.gallery_id, page_num)
response = self.request(url, allow_redirects=False, fatal=False)

if response.status_code >= 300:
return

page = response.text
for url in self._urls_from_page(response.text):
yield url, self._data_from_url(url)
page_num += 1

urls = text.extract_iter(
page, '<div class="report"> <a class="xx-co-me" href="', '"')

for image_src in urls:
yield image_src, {
"id": text.extract(image_src, "tumblr_", "_")[0]
}


class TumblrgalleryPostExtractor(TumblrgalleryExtractor):
"""Extractor for Posts on tumblrgallery.xyz"""
Expand All @@ -82,14 +88,8 @@ def metadata(self, page):
}

def images(self, page):
urls = text.extract_iter(
page, '<div class="report"> <a class="xx-co-me" href="', '"')

for image_src in urls:
yield image_src, {
"id": text.extract(image_src, "tumblr_", "_")[0] or
text.nameext_from_url(image_src)["filename"]
}
for url in self._urls_from_page(page):
yield url, self._data_from_url(url)


class TumblrgallerySearchExtractor(TumblrgalleryExtractor):
Expand All @@ -112,36 +112,26 @@ def metadata(self, page):
def images(self, _):
page_num = 1
while True:
response = self.request(
"{}/s.php?q={}&page={}"
.format(self.root, self.search_term, page_num),
allow_redirects=False
)
if response.status_code != 200:
url = "{}/s.php?q={}&page={}".format(
self.root, self.search_term, page_num)
response = self.request(url, allow_redirects=False, fatal=False)

if response.status_code >= 300:
return

page = response.text
page_num += 1

gallery_ids = list(text.extract_iter(
page,
'<div class="title"><a href="post/',
'.html'
))

for gallery_id in gallery_ids:
post_page = self.request(
"{}/post/{}.html"
.format(self.root, gallery_id),
allow_redirects=False
).text
for image_src in TumblrgalleryPostExtractor.images(
self, post_page
):
image_src[1]["title"] = text.remove_html(
text.unescape(
text.extract(post_page, "<title>", "</title>")[0]
)
).replace("_", "-")
image_src[1]["gallery_id"] = gallery_id
yield image_src
for gallery_id in text.extract_iter(
page, '<div class="title"><a href="post/', '.html'):

url = "{}/post/{}.html".format(self.root, gallery_id)
post_page = self.request(url).text

for url in self._urls_from_page(post_page):
data = self._data_from_url(url)
data["gallery_id"] = gallery_id
data["title"] = text.remove_html(text.unescape(
text.extract(post_page, "<title>", "</title>")[0]
)).replace("_", "-")
yield url, data

6 comments on commit 0bd7607

@Hrxn
Copy link
Contributor

@Hrxn Hrxn commented on 0bd7607 Dec 21, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Uh.. does someone know where this issue went?

@mikf
Copy link
Owner Author

@mikf mikf commented on 0bd7607 Dec 21, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Deleted by the creator, same as his other issue (#2112).

@Hrxn
Copy link
Contributor

@Hrxn Hrxn commented on 0bd7607 Dec 22, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's strange. Usually, you can close your own issues, but not outright delete them. Unless your GitHub account might have some special privileges that I am not aware of.
Anyway, I think that sounds unlikely.

@mikf
Copy link
Owner Author

@mikf mikf commented on 0bd7607 Dec 23, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right, that is strange. Stranger still is that they are both back and available again. The user account for dajotim937 was also gone and gave a 404, but now it's back and says "Joined 8 days ago". I also didn't get a notification for his third issue #2132. Must have been some bug with GitHub.

@dajotim937
Copy link

@dajotim937 dajotim937 commented on 0bd7607 Dec 23, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I used a temporary email address for the account because I wanted to report a couple of issues and then delete the account, but GitHub's system marked my account as "spammy" and hid it from the public. I wrote to support, changed my email to a gmail.com address, and they restored my account status to normal. That's why it was "deleted" and is now available again.
I created the third issue while the account still had the "spammy" status, which is why you didn't get a notification. I actually saw a notification message, and it said that my issue and comments were available to the repo administration, or something like that. So that's why you couldn't see my issues.

@Hrxn
Copy link
Contributor

@Hrxn Hrxn commented on 0bd7607 Dec 24, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, okay. Well, thanks for clearing that up @dajotim937 😄

Please sign in to comment.