Skip to content

Commit

Permalink
Adding user_posts function
Browse files Browse the repository at this point in the history
  • Loading branch information
jpontoire committed Dec 20, 2024
1 parent bc901cb commit e40d672
Show file tree
Hide file tree
Showing 4 changed files with 138 additions and 3 deletions.
28 changes: 28 additions & 0 deletions minet/cli/reddit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,33 @@
],
)

# CLI declaration for the `minet reddit user_posts` subcommand: scrapes the
# posts submitted by one or more reddit users, read from a column of input
# urls/ids, with an optional cap on the number of posts per user.
REDDIT_USER_POSTS_SUBCOMMAND = command(
    "user_posts",
    "minet.cli.reddit.user_posts",
    title="Minet Reddit User Posts Command",
    description="""
        Retrieve reddit posts from a user link.
    """,
    epilog="""
        Example:

        . Searching posts from the user page of u/random_user:
            $ minet reddit user_posts https://www.reddit.com/user/random_user/submitted/ > random_user_posts.csv
    """,
    variadic_input={
        "dummy_column": "user",
        "item_label": "user url, user shortcode or user id",
        "item_label_plural": "user urls, user shortcodes or user ids",
    },
    arguments=[
        {
            "flags": ["-n", "--number"],
            "help": "Number of posts to retrieve.",
            "type": int,
        },
    ],
)


REDDIT_COMMAND = command(
"reddit",
Expand All @@ -79,5 +106,6 @@
subcommands=[
REDDIT_POSTS_SUBCOMMAND,
REDDIT_COMMENTS_SUBCOMMAND,
REDDIT_USER_POSTS_SUBCOMMAND
],
)
46 changes: 46 additions & 0 deletions minet/cli/reddit/user_posts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# =============================================================================
# Minet Reddit User Posts CLI Action
# =============================================================================
#
# Logic of the `rd user_posts` action.
#
from minet.cli.utils import with_enricher_and_loading_bar
from minet.reddit.scraper import RedditScraper
from minet.reddit.types import RedditUserPost
from minet.reddit.exceptions import RedditInvalidTargetError


@with_enricher_and_loading_bar(
    headers=RedditUserPost,
    title="Scraping user posts",
    unit="groups",
    nested=True,
    sub_unit="user",
)
def action(cli_args, enricher, loading_bar):
    """
    Scrape the posts of each reddit user url found in the input column and
    write one enriched output row per post.

    For every input cell, fetches the user's posts via RedditScraper,
    optionally capped by the -n/--number CLI option. Lines whose url cannot
    be resolved (RedditInvalidTargetError) are reported and skipped rather
    than aborting the whole run.
    """
    scraper = RedditScraper()

    for i, row, url in enricher.enumerate_cells(
        cli_args.column, with_rows=True, start=1
    ):
        with loading_bar.step(url):
            try:
                # The original four-way branch on cli_args.text called the
                # scraper identically in every branch, and no --text flag is
                # declared for this subcommand, so reading cli_args.text
                # would raise AttributeError. Only --number matters here.
                if cli_args.number:
                    posts = scraper.get_user_posts(url, cli_args.number)
                else:
                    posts = scraper.get_user_posts(url)
            except RedditInvalidTargetError:
                loading_bar.print(
                    "the script could not complete normally on line %i" % i
                )
                continue

            for post in posts:
                loading_bar.nested_advance()
                enricher.writerow(row, post)
54 changes: 52 additions & 2 deletions minet/reddit/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from math import ceil
from ural import get_domain_name, urlpathsplit, is_url
from time import sleep
from minet.reddit.types import RedditPost, RedditComment
from minet.reddit.types import RedditPost, RedditComment, RedditUserPost
from minet.reddit.exceptions import RedditInvalidTargetError
import re
from urllib.parse import urljoin
Expand Down Expand Up @@ -145,7 +145,7 @@ def get_posts(self, url: str, add_text: bool, nb_post=25):
url=post_url,
author=author,
author_text=content,
upvote=upvote,
points=upvote,
number_comments=n_comments,
published_date=published_date,
link=resolve_relative_url(link),
Expand Down Expand Up @@ -220,3 +220,53 @@ def get_comments(self, url: str, all):
)
if data.id != "":
yield data

def get_user_posts(self, url: str, nb = 25):
nb_pages = ceil(int(nb) / 25)
n_crawled = 0
old_url = get_old_url(url)
for _ in range(nb_pages):
if n_crawled == int(nb):
break
response = reddit_request(old_url, self.pool_manager)
soup = response.soup()
posts = soup.select("div[id^='thing_t3_']")
for post in posts:
sub = post.scrape_one("a[class*='subreddit']", "href")
title = post.scrape_one("a[class^='title']")
points = post.scrape_one("div[class='score unvoted']")
post_url = post.scrape_one("a[class^='bylink comment']", "href")
nb_comments = post.scrape_one("a[class^='bylink comment']")
match = re.match(r"(\d+)\s+comments", nb_comments)
if match:
nb_comments = int(match.group(1))
else:
nb_comments = 0
link = post.scrape_one("a[class^='title']", "href")
published_date = post.scrape("time", "datetime")

data = RedditUserPost(
title=title,
url=post_url,
points=points,
number_comments=nb_comments,
published_date=published_date,
link=link,
subreddit=sub
)

yield data
n_crawled += 1
old_url = soup.scrape("span[class='next-button'] a", "href")[0]










    # TODO(review): unfinished stub committed with this change — it only
    # resolves the old-reddit url and never requests or parses anything;
    # `nb` is currently unused. Complete before exposing a CLI command for it.
    def get_user_comments(self, url: str, nb = 25):
        old_url = get_old_url(url)
13 changes: 12 additions & 1 deletion minet/reddit/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class RedditPost(TabularRecord):
url: str
author: str
author_text: Optional[str]
upvote: int
points: int
number_comments: int
published_date: str
link: Optional[str]
Expand All @@ -25,3 +25,14 @@ class RedditComment(TabularRecord):
points: int
published_date: str
comment: str


@dataclass
class RedditUserPost(TabularRecord):
    """A single post scraped from a reddit user's posts page."""

    title: str
    # Permalink to the post's comments page on reddit.
    url: str
    # NOTE(review): typed int, but get_user_posts passes the raw scraped
    # score string straight through — confirm conversion happens upstream.
    points: int
    number_comments: int
    published_date: str
    # Target the post links to (external url or the post itself for self posts).
    link: str
    subreddit: str

0 comments on commit e40d672

Please sign in to comment.