Skip to content

Commit

Permalink
Add -t/--text option to the reddit posts command
Browse files Browse the repository at this point in the history
  • Loading branch information
jpontoire committed Dec 6, 2024
1 parent 8fb9cf0 commit a88ac13
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 7 deletions.
5 changes: 5 additions & 0 deletions minet/cli/reddit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@
"help": "Number of posts to retrieve.",
"type": int,
},
{
"flags": ["-t", "--text"],
"help": "Retrieve the text of the post. Note that it will require one request per post.",
"action": "store_true",
}
],
)

Expand Down
14 changes: 10 additions & 4 deletions minet/cli/reddit/posts.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
title="Scraping posts",
unit="groups",
nested=True,
sub_unit="posts",
sub_unit="subreddits",
)
def action(cli_args, enricher, loading_bar):
scraper = RedditScraper()
Expand All @@ -26,12 +26,18 @@ def action(cli_args, enricher, loading_bar):
with loading_bar.step(url):
try:
if cli_args.number:
posts = scraper.get_posts(url, cli_args.number)
if cli_args.text:
posts = scraper.get_posts(url, True, cli_args.number)
else:
posts = scraper.get_posts(url, False, cli_args.number)
else:
posts = scraper.get_posts(url)
if cli_args.text:
posts = scraper.get_posts(url, True)
else:
posts = scraper.get_posts(url, False)
except :
loading_bar.print(
"problème"
"the script could not complete normally on line %i" % (i)
)
continue

Expand Down
16 changes: 13 additions & 3 deletions minet/reddit/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def reddit_request(url, pool_manager):
sleep(1)
response = request(url, pool_manager=pool_manager)
soup = response.soup()
if response.status == 404 or soup.scrape("p[id='noresults']"):
if response.status == 404 or (soup.scrape("p[id='noresults']") and not soup.scrape("div[class='commentarea']")):
print("invalid url!")
return
remaining_requests = float(response.headers["x-ratelimit-remaining"])
Expand All @@ -49,7 +49,7 @@ class RedditScraper(object):
def __init__(self):
self.pool_manager = create_pool_manager()

def get_posts(self, url, nb_post=25):
def get_posts(self, url: str, add_text: bool, nb_post=25):
list_posts = []
nb_pages = ceil(int(nb_post) / 25)
old_url = get_old_url(get_url_from_subreddit(url))
Expand Down Expand Up @@ -81,12 +81,22 @@ def get_posts(self, url, nb_post=25):
upvote = post.select_one("div[class='score unvoted']").get_text()
published_date = post.scrape_one("time", "datetime")
link = post.scrape_one("a[class*='title']", "href")
if add_text:
text_response = reddit_request(post_url, self.pool_manager)
text_soup = text_response.soup()
try_content = text_soup.select_one("div[id='siteTable'] div[class^='usertext']")
if try_content:
content = try_content.get_text()
else:
content = ""
else:
content = ""

data = RedditPost(
title=title,
url=post_url,
author=author,
author_text=None,
author_text=content,
upvote=upvote,
number_comments=n_comments,
published_date=published_date,
Expand Down

0 comments on commit a88ac13

Please sign in to comment.