First version of reddit comments
jpontoire committed Dec 9, 2024
1 parent a49a818 commit ef116eb
Showing 5 changed files with 186 additions and 3 deletions.
29 changes: 29 additions & 0 deletions minet/cli/reddit/__init__.py
@@ -39,6 +39,34 @@
],
)

REDDIT_COMMENTS_SUBCOMMAND = command(
"comments",
"minet.cli.reddit.comments",
title="Minet Reddit Comments Command",
description="""
        Retrieve comments from a reddit post link.

        Note that, by default, it only retrieves the comments displayed on the page. To get all of them, use -A, --all, but be aware that this requires one request per comment and that you can only make 100 requests per 10 minutes.
""",
epilog="""
Example:
        . Scraping comments from a reddit post:
$ minet reddit comments https://www.reddit.com/r/france/comments/... > r_france_comments.csv
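        . Retrieving all the comments, not only those displayed on the page:
            $ minet reddit comments -A https://www.reddit.com/r/france/comments/... > r_france_comments.csv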
""",
variadic_input={
"dummy_column": "post",
"item_label": "post url, post shortcode or post id",
"item_label_plural": "posts urls, posts shortcodes or posts ids",
},
arguments=[
{
"flags": ["-A", "--all"],
"help": "Retrieve all comments.",
"action": "store_true",
},
],
)


REDDIT_COMMAND = command(
"reddit",
@@ -50,5 +78,6 @@
""",
subcommands=[
REDDIT_POSTS_SUBCOMMAND,
REDDIT_COMMENTS_SUBCOMMAND,
],
)
41 changes: 41 additions & 0 deletions minet/cli/reddit/comments.py
@@ -0,0 +1,41 @@
# =============================================================================
# Minet Reddit Comments CLI Action
# =============================================================================
#
# Logic of the `rd comments` action.
#
from minet.cli.utils import with_enricher_and_loading_bar
from minet.reddit.scraper import RedditScraper
from minet.reddit.types import RedditComment
from minet.reddit.exceptions import RedditInvalidTargetError


@with_enricher_and_loading_bar(
headers=RedditComment,
title="Scraping comments",
unit="groups",
nested=True,
sub_unit="comments",
)
def action(cli_args, enricher, loading_bar):
scraper = RedditScraper()

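    # enumerate_cells yields (1-based line index, input row, cell value)
    # for the given column; the index feeds the error message below.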
for i, row, url in enricher.enumerate_cells(
cli_args.column, with_rows=True, start=1
):
with loading_bar.step(url):
try:
                comments = scraper.get_comments(url, cli_args.all)
except RedditInvalidTargetError:
loading_bar.print(
"the script could not complete normally on line %i" % (i)
)
continue

for comment in comments:
loading_bar.nested_advance()
enricher.writerow(row, comment)
2 changes: 1 addition & 1 deletion minet/cli/reddit/posts.py
@@ -15,7 +15,7 @@
title="Scraping posts",
unit="groups",
nested=True,
sub_unit="subreddits",
sub_unit="posts",
)
def action(cli_args, enricher, loading_bar):
scraper = RedditScraper()
106 changes: 104 additions & 2 deletions minet/reddit/scraper.py
@@ -2,9 +2,14 @@
from math import ceil
from ural import get_domain_name, urlpathsplit, is_url
from time import sleep
from minet.reddit.types import RedditPost
from minet.reddit.types import RedditPost, RedditComment
from minet.reddit.exceptions import RedditInvalidTargetError
import re
from urllib.parse import urljoin


def resolve_relative_url(path):
return urljoin("https://old.reddit.com", path)
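# For instance, resolve_relative_url("/r/france/comments/abc/") returns
# "https://old.reddit.com/r/france/comments/abc/" (hypothetical path).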


def get_old_url(url):
@@ -47,10 +52,47 @@ def reddit_request(url, pool_manager):
return response


def extract_t1_ids(text):
pattern = r"t1_(\w+)"
return [match.group(1) for match in re.finditer(pattern, text)]
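# The "load more comments" links carry the ids of hidden children in their
# onclick payload, which looks roughly like (hypothetical sample):
#   return morechildren(this, 't3_post', 't1_abc123,t1_def456', ...)
# on which extract_t1_ids returns ['abc123', 'def456'].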


def get_current_id(com):
    # A comment's id lives either in its `id` attribute (of the form
    # "thing_t1_xxx") or, failing that, in the penultimate segment of
    # its `data-permalink`.
    current_id = com.get("id")
    if current_id:
        current_id = current_id.split("_")[-1]
    else:
        current_id = com.get("data-permalink").split("/")[-2]
    return current_id


class RedditScraper(object):
def __init__(self):
self.pool_manager = create_pool_manager()

    def get_childs_l500(self, url, list_comments, parent_id):
        # Fetch a comment page (old Reddit renders at most ~500 comments
        # per request, hence the name) and push every direct child node
        # onto the work list, tagged with its parent id.
        response = reddit_request(url, self.pool_manager)
        soup = response.soup()
        comments = soup.select("div[class='commentarea']>div>div[class*='comment']")
        for com in comments:
            child = com.find("div", class_="child")
            if child.text != "":
                child = child.find("div")
                child_com = child.find_all(
                    "div",
                    class_=lambda x: x
                    and (
                        "comment" in x
                        or "deleted comment" in x
                        or "morerecursion" in x
                        or "morechildren" in x
                    ),
                    recursive=False,
                )
                for ele in child_com:
                    list_comments.append((parent_id, ele))
        return list_comments

def get_posts(self, url: str, add_text: bool, nb_post=25):
list_posts = []
nb_pages = ceil(int(nb_post) / 25)
@@ -82,6 +124,8 @@ def get_posts(self, url: str, add_text: bool, nb_post=25):
try_author = post.select_one("a[class*='author']")
author = try_author.get_text() if try_author else "Deleted"
upvote = post.select_one("div[class='score unvoted']").get_text()
                if upvote == "•":  # old Reddit renders a bullet when the score is hidden
                    upvote = ""
published_date = post.scrape_one("time", "datetime")
link = post.scrape_one("a[class*='title']", "href")
if add_text:
@@ -105,10 +149,68 @@ def get_posts(self, url: str, add_text: bool, nb_post=25):
upvote=upvote,
number_comments=n_comments,
published_date=published_date,
link=link,
link=resolve_relative_url(link),
)

list_posts.append(data)
n_crawled += 1
old_url = soup.scrape("span[class='next-button'] a", "href")[0]
        return list_posts


    def get_comments(self, url: str, all: bool):
        # Iterative traversal of the comment tree: a work list of
        # (parent_id, node) pairs, seeded with the comments rendered on
        # the post page (?limit=500 is the most old Reddit serves at once).
        list_return = []
        m_comments = []
        old_url = get_old_url(url)
        url_limit = old_url + "?limit=500"
        response = reddit_request(url_limit, self.pool_manager)
        soup = response.soup()
        first_comments = soup.select(
            "div[class='commentarea']>div>div[class*='comment']"
        )
        for ele in first_comments:
            m_comments.append((None, ele))
        while m_comments:
            parent, com = m_comments.pop()
            current_id = get_current_id(com)
            if "morerecursion" in com.get("class") and all:
                # "continue this thread" stub: fetch the linked page and
                # resume the traversal from there.
                url_rec = f"https://old.reddit.com{com.scrape_one('a', 'href')}"
                m_comments = self.get_childs_l500(url_rec, m_comments, parent)
            elif "morechildren" in com.get("class") and all:
                # "load more comments" stub: the ids of the hidden children
                # only appear in the link's onclick attribute.
                a = com.select_one("a")
                onclick = a["onclick"]
                id_list = extract_t1_ids(onclick)
                for id in id_list:
                    comment_url = f"{old_url}{id}"
                    m_comments = self.get_childs_l500(
                        comment_url, m_comments, current_id
                    )
            else:
                # Regular comment: queue its direct children, then emit it.
                child = com.find("div", class_="child")
                if child.text != "":
                    child = child.find("div")
                    if all:
                        child_com = child.find_all(
                            "div",
                            class_=lambda x: x
                            and (
                                "comment" in x
                                or "deleted comment" in x
                                or "morerecursion" in x
                                or "morechildren" in x
                            ),
                            recursive=False,
                        )
                    else:
                        child_com = child.find_all(
                            "div",
                            class_=lambda x: x
                            and ("comment" in x or "deleted comment" in x),
                            recursive=False,
                        )
                    for ele in child_com:
                        m_comments.append((current_id, ele))
                data = RedditComment(
                    id=current_id,
                    parent=parent,
                    comment=com.scrape_one("div[class='md']:not(div.child a)"),
                )
                # Deleted or stub nodes can yield an empty id; skip those rows.
                if data.id != "":
                    list_return.append(data)
        return list_return
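For reference, a minimal sketch of driving this scraper outside the CLI, using only the API added in this commit (the post URL is a placeholder):

from minet.reddit.scraper import RedditScraper

scraper = RedditScraper()

# False mirrors the CLI default: only comments already rendered on the page;
# True also expands "load more comments"/"continue this thread" stubs.
for comment in scraper.get_comments(
    "https://www.reddit.com/r/france/comments/...", False
):
    print(comment.id, comment.parent, comment.comment)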
11 changes: 11 additions & 0 deletions minet/reddit/types.py
@@ -14,3 +14,14 @@ class RedditPost(TabularRecord):
number_comments: int
published_date: str
link: Optional[str]


@dataclass
class RedditComment(TabularRecord):
# url: str
# author: str
id: str
    parent: Optional[str]  # None for top-level comments
# points: Optional[str]
# published_date: str
comment: str
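# Passed as `headers=RedditComment` to the CLI action's enricher, so each
# scraped comment becomes a CSV row with the columns: id, parent, comment.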
