First version of reddit comments
jpontoire committed Dec 9, 2024
1 parent a49a818 commit ef116eb
Showing 5 changed files with 186 additions and 3 deletions.
29 changes: 29 additions & 0 deletions minet/cli/reddit/__init__.py
@@ -39,6 +39,34 @@
],
)

REDDIT_COMMENTS_SUBCOMMAND = command(
"comments",
"minet.cli.reddit.comments",
title="Minet Reddit Comments Command",
description="""
        Retrieve comments from a reddit post link.

        Note that, by default, it only retrieves the comments displayed on the page. To get all of them, use -A, --all, but be aware that this requires one request per comment and that you can only make 100 requests per 10 minutes.
""",
epilog="""
Example:
        . Scraping comments from a reddit post:
$ minet reddit comments https://www.reddit.com/r/france/comments/... > r_france_comments.csv
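        . Retrieving all the comments, not only those displayed on the page:
            $ minet reddit comments -A https://www.reddit.com/r/france/comments/... > r_france_comments.csv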
""",
variadic_input={
"dummy_column": "post",
"item_label": "post url, post shortcode or post id",
"item_label_plural": "posts urls, posts shortcodes or posts ids",
},
arguments=[
{
"flags": ["-A", "--all"],
"help": "Retrieve all comments.",
"action": "store_true",
},
],
)


REDDIT_COMMAND = command(
"reddit",
@@ -50,5 +78,6 @@
""",
subcommands=[
REDDIT_POSTS_SUBCOMMAND,
REDDIT_COMMENTS_SUBCOMMAND,
],
)
41 changes: 41 additions & 0 deletions minet/cli/reddit/comments.py
@@ -0,0 +1,41 @@
# =============================================================================
# Minet Reddit Comments CLI Action
# =============================================================================
#
# Logic of the `rd comments` action.
#
from minet.cli.utils import with_enricher_and_loading_bar
from minet.reddit.scraper import RedditScraper
from minet.reddit.types import RedditComment
from minet.reddit.exceptions import RedditInvalidTargetError


@with_enricher_and_loading_bar(
headers=RedditComment,
title="Scraping comments",
unit="groups",
nested=True,
sub_unit="comments",
)
def action(cli_args, enricher, loading_bar):
scraper = RedditScraper()

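    # enumerate_cells yields (1-based line index, input row, cell value)
    # for the given column; the index feeds the error message below.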
for i, row, url in enricher.enumerate_cells(
cli_args.column, with_rows=True, start=1
):
with loading_bar.step(url):
try:
                comments = scraper.get_comments(url, cli_args.all)
except RedditInvalidTargetError:
loading_bar.print(
"the script could not complete normally on line %i" % (i)
)
continue

for comment in comments:
loading_bar.nested_advance()
enricher.writerow(row, comment)
2 changes: 1 addition & 1 deletion minet/cli/reddit/posts.py
@@ -15,7 +15,7 @@
title="Scraping posts",
unit="groups",
nested=True,
sub_unit="subreddits",
sub_unit="posts",
)
def action(cli_args, enricher, loading_bar):
scraper = RedditScraper()
106 changes: 104 additions & 2 deletions minet/reddit/scraper.py
@@ -2,9 +2,14 @@
from math import ceil
from ural import get_domain_name, urlpathsplit, is_url
from time import sleep
from minet.reddit.types import RedditPost
from minet.reddit.types import RedditPost, RedditComment
from minet.reddit.exceptions import RedditInvalidTargetError
import re
from urllib.parse import urljoin


def resolve_relative_url(path):
return urljoin("https://old.reddit.com", path)
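# For instance, resolve_relative_url("/r/france/comments/abc/") returns
# "https://old.reddit.com/r/france/comments/abc/" (hypothetical path).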


def get_old_url(url):
@@ -47,10 +52,47 @@ def reddit_request(url, pool_manager):
return response


def extract_t1_ids(text):
pattern = r"t1_(\w+)"
return [match.group(1) for match in re.finditer(pattern, text)]
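# The "load more comments" links carry the ids of hidden children in their
# onclick payload, which looks roughly like (hypothetical sample):
#   return morechildren(this, 't3_post', 't1_abc123,t1_def456', ...)
# on which extract_t1_ids returns ['abc123', 'def456'].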


def get_current_id(com):
    # A comment's id lives either in its `id` attribute (of the form
    # "thing_t1_xxx") or, failing that, in the penultimate segment of
    # its `data-permalink`.
    current_id = com.get("id")
    if current_id:
        current_id = current_id.split("_")[-1]
    else:
        current_id = com.get("data-permalink").split("/")[-2]
    return current_id


class RedditScraper(object):
def __init__(self):
self.pool_manager = create_pool_manager()

    def get_childs_l500(self, url, list_comments, parent_id):
        # Fetch a comment page (old Reddit renders at most ~500 comments
        # per request, hence the name) and push every direct child node
        # onto the work list, tagged with its parent id.
        response = reddit_request(url, self.pool_manager)
        soup = response.soup()
        comments = soup.select("div[class='commentarea']>div>div[class*='comment']")
        for com in comments:
            child = com.find("div", class_="child")
            if child.text != "":
                child = child.find("div")
                child_com = child.find_all(
                    "div",
                    class_=lambda x: x
                    and (
                        "comment" in x
                        or "deleted comment" in x
                        or "morerecursion" in x
                        or "morechildren" in x
                    ),
                    recursive=False,
                )
                for ele in child_com:
                    list_comments.append((parent_id, ele))
        return list_comments

def get_posts(self, url: str, add_text: bool, nb_post=25):
list_posts = []
nb_pages = ceil(int(nb_post) / 25)
@@ -82,6 +124,8 @@ def get_posts(self, url: str, add_text: bool, nb_post=25):
try_author = post.select_one("a[class*='author']")
author = try_author.get_text() if try_author else "Deleted"
upvote = post.select_one("div[class='score unvoted']").get_text()
                if upvote == "•":  # old Reddit renders a bullet when the score is hidden
                    upvote = ""
published_date = post.scrape_one("time", "datetime")
link = post.scrape_one("a[class*='title']", "href")
if add_text:
@@ -105,10 +149,68 @@ def get_posts(self, url: str, add_text: bool, nb_post=25):
upvote=upvote,
number_comments=n_comments,
published_date=published_date,
link=link,
link=resolve_relative_url(link),
)

list_posts.append(data)
n_crawled += 1
old_url = soup.scrape("span[class='next-button'] a", "href")[0]
        return list_posts


    def get_comments(self, url: str, all: bool):
        # Iterative traversal of the comment tree: a work list of
        # (parent_id, node) pairs, seeded with the comments rendered on
        # the post page (?limit=500 is the most old Reddit serves at once).
        list_return = []
        m_comments = []
        old_url = get_old_url(url)
        url_limit = old_url + "?limit=500"
        response = reddit_request(url_limit, self.pool_manager)
        soup = response.soup()
        first_comments = soup.select(
            "div[class='commentarea']>div>div[class*='comment']"
        )
        for ele in first_comments:
            m_comments.append((None, ele))
        while m_comments:
            parent, com = m_comments.pop()
            current_id = get_current_id(com)
            if "morerecursion" in com.get("class") and all:
                # "continue this thread" stub: fetch the linked page and
                # resume the traversal from there.
                url_rec = f"https://old.reddit.com{com.scrape_one('a', 'href')}"
                m_comments = self.get_childs_l500(url_rec, m_comments, parent)
            elif "morechildren" in com.get("class") and all:
                # "load more comments" stub: the ids of the hidden children
                # only appear in the link's onclick attribute.
                a = com.select_one("a")
                onclick = a["onclick"]
                id_list = extract_t1_ids(onclick)
                for id in id_list:
                    comment_url = f"{old_url}{id}"
                    m_comments = self.get_childs_l500(
                        comment_url, m_comments, current_id
                    )
            else:
                # Regular comment: queue its direct children, then emit it.
                child = com.find("div", class_="child")
                if child.text != "":
                    child = child.find("div")
                    if all:
                        child_com = child.find_all(
                            "div",
                            class_=lambda x: x
                            and (
                                "comment" in x
                                or "deleted comment" in x
                                or "morerecursion" in x
                                or "morechildren" in x
                            ),
                            recursive=False,
                        )
                    else:
                        child_com = child.find_all(
                            "div",
                            class_=lambda x: x
                            and ("comment" in x or "deleted comment" in x),
                            recursive=False,
                        )
                    for ele in child_com:
                        m_comments.append((current_id, ele))
                data = RedditComment(
                    id=current_id,
                    parent=parent,
                    comment=com.scrape_one("div[class='md']:not(div.child a)"),
                )
                # Deleted or stub nodes can yield an empty id; skip those rows.
                if data.id != "":
                    list_return.append(data)
        return list_return
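For reference, a minimal sketch of driving this scraper outside the CLI, using only the API added in this commit (the post URL is a placeholder):

from minet.reddit.scraper import RedditScraper

scraper = RedditScraper()

# False mirrors the CLI default: only comments already rendered on the page;
# True also expands "load more comments"/"continue this thread" stubs.
for comment in scraper.get_comments(
    "https://www.reddit.com/r/france/comments/...", False
):
    print(comment.id, comment.parent, comment.comment)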
11 changes: 11 additions & 0 deletions minet/reddit/types.py
@@ -14,3 +14,14 @@ class RedditPost(TabularRecord):
number_comments: int
published_date: str
link: Optional[str]


@dataclass
class RedditComment(TabularRecord):
# url: str
# author: str
id: str
    parent: Optional[str]  # None for top-level comments
# points: Optional[str]
# published_date: str
comment: str
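# Passed as `headers=RedditComment` to the CLI action's enricher, so each
# scraped comment becomes a CSV row with the columns: id, parent, comment.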
