Skip to content

Commit

Permalink
Adding user_posts function
Browse files Browse the repository at this point in the history
  • Loading branch information
jpontoire committed Dec 20, 2024
1 parent bc901cb commit e40d672
Show file tree
Hide file tree
Showing 4 changed files with 138 additions and 3 deletions.
28 changes: 28 additions & 0 deletions minet/cli/reddit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,33 @@
],
)

# CLI declaration for the `minet reddit user_posts` subcommand: scrapes the
# posts submitted by one or more reddit users, read from a column of input
# urls/ids, with an optional cap on the number of posts per user.
REDDIT_USER_POSTS_SUBCOMMAND = command(
    "user_posts",
    "minet.cli.reddit.user_posts",
    title="Minet Reddit User Posts Command",
    description="""
        Retrieve reddit posts from a user link.
    """,
    epilog="""
        Example:

        . Searching posts from the user page of u/random_user:
            $ minet reddit user_posts https://www.reddit.com/user/random_user/submitted/ > random_user_posts.csv
    """,
    variadic_input={
        "dummy_column": "user",
        "item_label": "user url, user shortcode or user id",
        "item_label_plural": "user urls, user shortcodes or user ids",
    },
    arguments=[
        {
            "flags": ["-n", "--number"],
            "help": "Number of posts to retrieve.",
            "type": int,
        },
    ],
)


REDDIT_COMMAND = command(
"reddit",
Expand All @@ -79,5 +106,6 @@
subcommands=[
REDDIT_POSTS_SUBCOMMAND,
REDDIT_COMMENTS_SUBCOMMAND,
REDDIT_USER_POSTS_SUBCOMMAND
],
)
46 changes: 46 additions & 0 deletions minet/cli/reddit/user_posts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# =============================================================================
# Minet Reddit User Posts CLI Action
# =============================================================================
#
# Logic of the `rd user_posts` action.
#
from minet.cli.utils import with_enricher_and_loading_bar
from minet.reddit.scraper import RedditScraper
from minet.reddit.types import RedditUserPost
from minet.reddit.exceptions import RedditInvalidTargetError


@with_enricher_and_loading_bar(
    headers=RedditUserPost,
    title="Scraping user posts",
    unit="groups",
    nested=True,
    sub_unit="user",
)
def action(cli_args, enricher, loading_bar):
    """
    Scrape the posts of each reddit user url found in the input column and
    write one enriched output row per post.

    For every input cell, fetches the user's posts via RedditScraper,
    optionally capped by the -n/--number CLI option. Lines whose url cannot
    be resolved (RedditInvalidTargetError) are reported and skipped rather
    than aborting the whole run.
    """
    scraper = RedditScraper()

    for i, row, url in enricher.enumerate_cells(
        cli_args.column, with_rows=True, start=1
    ):
        with loading_bar.step(url):
            try:
                # The original four-way branch on cli_args.text called the
                # scraper identically in every branch, and no --text flag is
                # declared for this subcommand, so reading cli_args.text
                # would raise AttributeError. Only --number matters here.
                if cli_args.number:
                    posts = scraper.get_user_posts(url, cli_args.number)
                else:
                    posts = scraper.get_user_posts(url)
            except RedditInvalidTargetError:
                loading_bar.print(
                    "the script could not complete normally on line %i" % i
                )
                continue

            for post in posts:
                loading_bar.nested_advance()
                enricher.writerow(row, post)
54 changes: 52 additions & 2 deletions minet/reddit/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from math import ceil
from ural import get_domain_name, urlpathsplit, is_url
from time import sleep
from minet.reddit.types import RedditPost, RedditComment
from minet.reddit.types import RedditPost, RedditComment, RedditUserPost
from minet.reddit.exceptions import RedditInvalidTargetError
import re
from urllib.parse import urljoin
Expand Down Expand Up @@ -145,7 +145,7 @@ def get_posts(self, url: str, add_text: bool, nb_post=25):
url=post_url,
author=author,
author_text=content,
upvote=upvote,
points=upvote,
number_comments=n_comments,
published_date=published_date,
link=resolve_relative_url(link),
Expand Down Expand Up @@ -220,3 +220,53 @@ def get_comments(self, url: str, all):
)
if data.id != "":
yield data

def get_user_posts(self, url: str, nb = 25):
nb_pages = ceil(int(nb) / 25)
n_crawled = 0
old_url = get_old_url(url)
for _ in range(nb_pages):
if n_crawled == int(nb):
break
response = reddit_request(old_url, self.pool_manager)
soup = response.soup()
posts = soup.select("div[id^='thing_t3_']")
for post in posts:
sub = post.scrape_one("a[class*='subreddit']", "href")
title = post.scrape_one("a[class^='title']")
points = post.scrape_one("div[class='score unvoted']")
post_url = post.scrape_one("a[class^='bylink comment']", "href")
nb_comments = post.scrape_one("a[class^='bylink comment']")
match = re.match(r"(\d+)\s+comments", nb_comments)
if match:
nb_comments = int(match.group(1))
else:
nb_comments = 0
link = post.scrape_one("a[class^='title']", "href")
published_date = post.scrape("time", "datetime")

data = RedditUserPost(
title=title,
url=post_url,
points=points,
number_comments=nb_comments,
published_date=published_date,
link=link,
subreddit=sub
)

yield data
n_crawled += 1
old_url = soup.scrape("span[class='next-button'] a", "href")[0]










    # TODO(review): unfinished stub committed with this change — it only
    # resolves the old-reddit url and never requests or parses anything;
    # `nb` is currently unused. Complete before exposing a CLI command for it.
    def get_user_comments(self, url: str, nb = 25):
        old_url = get_old_url(url)
13 changes: 12 additions & 1 deletion minet/reddit/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class RedditPost(TabularRecord):
url: str
author: str
author_text: Optional[str]
upvote: int
points: int
number_comments: int
published_date: str
link: Optional[str]
Expand All @@ -25,3 +25,14 @@ class RedditComment(TabularRecord):
points: int
published_date: str
comment: str


@dataclass
class RedditUserPost(TabularRecord):
    """A single post scraped from a reddit user's posts page."""

    title: str
    # Permalink to the post's comments page on reddit.
    url: str
    # NOTE(review): typed int, but get_user_posts passes the raw scraped
    # score string straight through — confirm conversion happens upstream.
    points: int
    number_comments: int
    published_date: str
    # Target the post links to (external url or the post itself for self posts).
    link: str
    subreddit: str

0 comments on commit e40d672

Please sign in to comment.