From e9811b930914d402d834c5f9d4bbb795b0b8e0fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Romagn=C3=A9?= Date: Mon, 3 Feb 2025 20:39:31 +0100 Subject: [PATCH 1/2] Add feature to extract players URL in fbref --- soccerdata/fbref.py | 39 ++++++++++++++++++++++++++++++++++++++- tests/test_FBref.py | 25 +++++++++++++++++++++---- 2 files changed, 59 insertions(+), 5 deletions(-) diff --git a/soccerdata/fbref.py b/soccerdata/fbref.py index cf4a0e2..a771295 100644 --- a/soccerdata/fbref.py +++ b/soccerdata/fbref.py @@ -533,7 +533,35 @@ def read_team_match_stats( # noqa: C901 .loc[self.leagues] ) - def read_player_season_stats(self, stat_type: str = "standard") -> pd.DataFrame: + def _extract_players_url(self, tree: etree.ElementTree) -> dict: + """Extract players profile URL from the parsed HTML tree.""" + player_urls = {} + + # The table is often inside a comment + comments = tree.xpath("//comment()") + for comment in comments: + if "div_stats" in comment.text: + parser = etree.HTMLParser(recover=True) + table_tree = etree.fromstring(comment.text, parser) + + for player_elem in table_tree.xpath("//td[@data-stat='player']/a"): + player_name = player_elem.text + player_url = player_elem.get("href") + if player_name and player_url: + player_urls[player_name] = f"https://fbref.com{player_url}" + return player_urls + + # If not inside a comment, try normal extraction + for player_elem in tree.xpath("//td[@data-stat='player']/a"): + player_name = player_elem.text + player_url = player_elem.get("href") + if player_name and player_url: + player_urls[player_name] = f"https://fbref.com{player_url}" + return player_urls + + def read_player_season_stats( # noqa: C901 + self, stat_type: str = "standard", extract_players_url: bool = False + ) -> pd.DataFrame: """Retrieve players from the datasource for the selected leagues and seasons. The following stat types are available: @@ -553,6 +581,8 @@ def read_player_season_stats(self, stat_type: str = "standard") -> pd.DataFrame: ---------- stat_type :str Type of stats to retrieve. + extract_players_url :bool + If True, the URL to player profiles will be extracted. Raises ------ @@ -633,6 +663,13 @@ def read_player_season_stats(self, stat_type: str = "standard") -> pd.DataFrame: df_table[("Unnamed: league", "league")] = lkey df_table[("Unnamed: season", "season")] = skey df_table = _fix_nation_col(df_table) + + if extract_players_url: + player_links = self._extract_players_url(tree) + df_table["player_url"] = df_table[("Unnamed: 1_level_0", "Player")].map( + player_links + ) + players.append(df_table) # return dataframe diff --git a/tests/test_FBref.py b/tests/test_FBref.py index a4bab4a..36eb974 100644 --- a/tests/test_FBref.py +++ b/tests/test_FBref.py @@ -92,6 +92,10 @@ def test_read_player_season_stats(fbref_ligue1: FBref, stat_type: str) -> None: assert isinstance(fbref_ligue1.read_player_season_stats(stat_type), pd.DataFrame) +def test_read_player_season_stats_with_player_url(fbref_ligue1: FBref) -> None: + assert isinstance(fbref_ligue1.read_player_season_stats("standard", True), pd.DataFrame) + + def test_read_schedule(fbref_ligue1: FBref) -> None: assert isinstance(fbref_ligue1.read_schedule(), pd.DataFrame) @@ -110,7 +114,8 @@ def test_read_schedule(fbref_ligue1: FBref) -> None: ) def test_read_player_match_stats(fbref_ligue1: FBref, stat_type: str) -> None: assert isinstance( - fbref_ligue1.read_player_match_stats(stat_type, match_id="796787da"), pd.DataFrame + fbref_ligue1.read_player_match_stats(stat_type, match_id="796787da"), + pd.DataFrame, ) @@ -143,17 +148,29 @@ def test_read_lineup(fbref_ligue1: FBref) -> None: def test_concat() -> None: df1 = pd.DataFrame( columns=pd.MultiIndex.from_tuples( - [("Unnamed: a", "player"), ("Performance", "Goals"), ("Performance", "Assists")] + [ + ("Unnamed: a", "player"), + ("Performance", "Goals"), + ("Performance", "Assists"), + ] ) ) df2 = pd.DataFrame( columns=pd.MultiIndex.from_tuples( - [("Unnamed: a", "player"), ("Unnamed: b", "Goals"), ("Performance", "Assists")] + [ + ("Unnamed: a", "player"), + ("Unnamed: b", "Goals"), + ("Performance", "Assists"), + ] ) ) df3 = pd.DataFrame( columns=pd.MultiIndex.from_tuples( - [("Unnamed: a", "player"), ("Goals", "Unnamed: b"), ("Performance", "Assists")] + [ + ("Unnamed: a", "player"), + ("Goals", "Unnamed: b"), + ("Performance", "Assists"), + ] ) ) res = _concat([df1, df2, df3], key=["player"]) From 5ee4e54be6230d8b0fe5da5748391f478fe56ecc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Romagn=C3=A9?= Date: Fri, 14 Feb 2025 14:27:14 +0100 Subject: [PATCH 2/2] Extract player ID from fbref --- soccerdata/fbref.py | 55 +++++++++++++-------------------------------- tests/test_FBref.py | 4 ---- 2 files changed, 15 insertions(+), 44 deletions(-) diff --git a/soccerdata/fbref.py b/soccerdata/fbref.py index a771295..ced8030 100644 --- a/soccerdata/fbref.py +++ b/soccerdata/fbref.py @@ -533,35 +533,7 @@ def read_team_match_stats( # noqa: C901 .loc[self.leagues] ) - def _extract_players_url(self, tree: etree.ElementTree) -> dict: - """Extract players profile URL from the parsed HTML tree.""" - player_urls = {} - - # The table is often inside a comment - comments = tree.xpath("//comment()") - for comment in comments: - if "div_stats" in comment.text: - parser = etree.HTMLParser(recover=True) - table_tree = etree.fromstring(comment.text, parser) - - for player_elem in table_tree.xpath("//td[@data-stat='player']/a"): - player_name = player_elem.text - player_url = player_elem.get("href") - if player_name and player_url: - player_urls[player_name] = f"https://fbref.com{player_url}" - return player_urls - - # If not inside a comment, try normal extraction - for player_elem in tree.xpath("//td[@data-stat='player']/a"): - player_name = player_elem.text - player_url = player_elem.get("href") - if player_name and player_url: - player_urls[player_name] = f"https://fbref.com{player_url}" - return player_urls - - def read_player_season_stats( # noqa: C901 - self, stat_type: str = "standard", extract_players_url: bool = False - ) -> pd.DataFrame: + def read_player_season_stats(self, stat_type: str = "standard") -> pd.DataFrame: """Retrieve players from the datasource for the selected leagues and seasons. The following stat types are available: @@ -581,8 +553,6 @@ def read_player_season_stats( # noqa: C901 ---------- stat_type :str Type of stats to retrieve. - extract_players_url :bool - If True, the URL to player profiles will be extracted. Raises ------ @@ -647,7 +617,7 @@ def read_player_season_stats( # noqa: C901 for elem in tree.xpath("//td[@data-stat='comp_level']//span"): elem.getparent().remove(elem) if big_five: - df_table = _parse_table(tree) + df_table = _parse_table(tree, with_player_id=True) df_table[("Unnamed: league", "league")] = ( df_table.xs("Comp", axis=1, level=1).squeeze().map(BIG_FIVE_DICT) ) @@ -659,17 +629,11 @@ def read_player_season_stats( # noqa: C901 (html_table,) = etree.fromstring(el.text, parser).xpath( f"//table[contains(@id, 'stats_{stat_type}')]" ) - df_table = _parse_table(html_table) + df_table = _parse_table(html_table, with_player_id=True) df_table[("Unnamed: league", "league")] = lkey df_table[("Unnamed: season", "season")] = skey df_table = _fix_nation_col(df_table) - if extract_players_url: - player_links = self._extract_players_url(tree) - df_table["player_url"] = df_table[("Unnamed: 1_level_0", "Player")].map( - player_links - ) - players.append(df_table) # return dataframe @@ -1176,13 +1140,15 @@ def read_shot_events( ) -def _parse_table(html_table: html.HtmlElement) -> pd.DataFrame: +def _parse_table(html_table: html.HtmlElement, with_player_id: bool = False) -> pd.DataFrame: """Parse HTML table into a dataframe. Parameters ---------- html_table : lxml.html.HtmlElement HTML table to clean up. + with_player_id : bool + If True, will extract player IDs. Returns ------- @@ -1199,6 +1165,15 @@ def _parse_table(html_table: html.HtmlElement) -> pd.DataFrame: elem.getparent().remove(elem) # parse HTML to dataframe (df_table,) = pd.read_html(html.tostring(html_table), flavor="lxml") + + if with_player_id: + player_ids = [ + elem.get("data-append-csv") for elem in html_table.xpath("//td[@data-append-csv]") + ] + df_table["player_id"] = player_ids + + return df_table.convert_dtypes() + return df_table.convert_dtypes() diff --git a/tests/test_FBref.py b/tests/test_FBref.py index 36eb974..31f4828 100644 --- a/tests/test_FBref.py +++ b/tests/test_FBref.py @@ -92,10 +92,6 @@ def test_read_player_season_stats(fbref_ligue1: FBref, stat_type: str) -> None: assert isinstance(fbref_ligue1.read_player_season_stats(stat_type), pd.DataFrame) -def test_read_player_season_stats_with_player_url(fbref_ligue1: FBref) -> None: - assert isinstance(fbref_ligue1.read_player_season_stats("standard", True), pd.DataFrame) - - def test_read_schedule(fbref_ligue1: FBref) -> None: assert isinstance(fbref_ligue1.read_schedule(), pd.DataFrame)