From e9811b930914d402d834c5f9d4bbb795b0b8e0fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C3=ABl=20Romagn=C3=A9?= <michael.romagne@sanofi.com>
Date: Mon, 3 Feb 2025 20:39:31 +0100
Subject: [PATCH 1/2] Add feature to extract player profile URLs from FBref

---
 soccerdata/fbref.py | 39 ++++++++++++++++++++++++++++++++++++++-
 tests/test_FBref.py | 25 +++++++++++++++++++++----
 2 files changed, 59 insertions(+), 5 deletions(-)

diff --git a/soccerdata/fbref.py b/soccerdata/fbref.py
index cf4a0e2..a771295 100644
--- a/soccerdata/fbref.py
+++ b/soccerdata/fbref.py
@@ -533,7 +533,35 @@ def read_team_match_stats(  # noqa: C901
             .loc[self.leagues]
         )
 
-    def read_player_season_stats(self, stat_type: str = "standard") -> pd.DataFrame:
+    def _extract_players_url(self, tree: etree.ElementTree) -> dict:
+        """Extract players profile URL from the parsed HTML tree."""
+        player_urls = {}
+
+        # The table is often inside a comment
+        comments = tree.xpath("//comment()")
+        for comment in comments:
+            if "div_stats" in comment.text:
+                parser = etree.HTMLParser(recover=True)
+                table_tree = etree.fromstring(comment.text, parser)
+
+                for player_elem in table_tree.xpath("//td[@data-stat='player']/a"):
+                    player_name = player_elem.text
+                    player_url = player_elem.get("href")
+                    if player_name and player_url:
+                        player_urls[player_name] = f"https://fbref.com{player_url}"
+                return player_urls
+
+        # If not inside a comment, try normal extraction
+        for player_elem in tree.xpath("//td[@data-stat='player']/a"):
+            player_name = player_elem.text
+            player_url = player_elem.get("href")
+            if player_name and player_url:
+                player_urls[player_name] = f"https://fbref.com{player_url}"
+        return player_urls
+
+    def read_player_season_stats(  # noqa: C901
+        self, stat_type: str = "standard", extract_players_url: bool = False
+    ) -> pd.DataFrame:
         """Retrieve players from the datasource for the selected leagues and seasons.
 
         The following stat types are available:
@@ -553,6 +581,8 @@ def read_player_season_stats(self, stat_type: str = "standard") -> pd.DataFrame:
         ----------
         stat_type :str
             Type of stats to retrieve.
+        extract_players_url :bool
+            If True, the URL to player profiles will be extracted.
 
         Raises
         ------
@@ -633,6 +663,13 @@ def read_player_season_stats(self, stat_type: str = "standard") -> pd.DataFrame:
                 df_table[("Unnamed: league", "league")] = lkey
                 df_table[("Unnamed: season", "season")] = skey
             df_table = _fix_nation_col(df_table)
+
+            if extract_players_url:
+                player_links = self._extract_players_url(tree)
+                df_table["player_url"] = df_table[("Unnamed: 1_level_0", "Player")].map(
+                    player_links
+                )
+
             players.append(df_table)
 
         # return dataframe
diff --git a/tests/test_FBref.py b/tests/test_FBref.py
index a4bab4a..36eb974 100644
--- a/tests/test_FBref.py
+++ b/tests/test_FBref.py
@@ -92,6 +92,10 @@ def test_read_player_season_stats(fbref_ligue1: FBref, stat_type: str) -> None:
     assert isinstance(fbref_ligue1.read_player_season_stats(stat_type), pd.DataFrame)
 
 
+def test_read_player_season_stats_with_player_url(fbref_ligue1: FBref) -> None:
+    assert isinstance(fbref_ligue1.read_player_season_stats("standard", True), pd.DataFrame)
+
+
 def test_read_schedule(fbref_ligue1: FBref) -> None:
     assert isinstance(fbref_ligue1.read_schedule(), pd.DataFrame)
 
@@ -110,7 +114,8 @@ def test_read_schedule(fbref_ligue1: FBref) -> None:
 )
 def test_read_player_match_stats(fbref_ligue1: FBref, stat_type: str) -> None:
     assert isinstance(
-        fbref_ligue1.read_player_match_stats(stat_type, match_id="796787da"), pd.DataFrame
+        fbref_ligue1.read_player_match_stats(stat_type, match_id="796787da"),
+        pd.DataFrame,
     )
 
 
@@ -143,17 +148,29 @@ def test_read_lineup(fbref_ligue1: FBref) -> None:
 def test_concat() -> None:
     df1 = pd.DataFrame(
         columns=pd.MultiIndex.from_tuples(
-            [("Unnamed: a", "player"), ("Performance", "Goals"), ("Performance", "Assists")]
+            [
+                ("Unnamed: a", "player"),
+                ("Performance", "Goals"),
+                ("Performance", "Assists"),
+            ]
         )
     )
     df2 = pd.DataFrame(
         columns=pd.MultiIndex.from_tuples(
-            [("Unnamed: a", "player"), ("Unnamed: b", "Goals"), ("Performance", "Assists")]
+            [
+                ("Unnamed: a", "player"),
+                ("Unnamed: b", "Goals"),
+                ("Performance", "Assists"),
+            ]
         )
     )
     df3 = pd.DataFrame(
         columns=pd.MultiIndex.from_tuples(
-            [("Unnamed: a", "player"), ("Goals", "Unnamed: b"), ("Performance", "Assists")]
+            [
+                ("Unnamed: a", "player"),
+                ("Goals", "Unnamed: b"),
+                ("Performance", "Assists"),
+            ]
         )
     )
     res = _concat([df1, df2, df3], key=["player"])

From 5ee4e54be6230d8b0fe5da5748391f478fe56ecc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C3=ABl=20Romagn=C3=A9?= <michael.romagne@sanofi.com>
Date: Fri, 14 Feb 2025 14:27:14 +0100
Subject: [PATCH 2/2] Extract player IDs from FBref instead of profile URLs

---
 soccerdata/fbref.py | 55 +++++++++++++--------------------------------
 tests/test_FBref.py |  4 ----
 2 files changed, 15 insertions(+), 44 deletions(-)

diff --git a/soccerdata/fbref.py b/soccerdata/fbref.py
index a771295..ced8030 100644
--- a/soccerdata/fbref.py
+++ b/soccerdata/fbref.py
@@ -533,35 +533,7 @@ def read_team_match_stats(  # noqa: C901
             .loc[self.leagues]
         )
 
-    def _extract_players_url(self, tree: etree.ElementTree) -> dict:
-        """Extract players profile URL from the parsed HTML tree."""
-        player_urls = {}
-
-        # The table is often inside a comment
-        comments = tree.xpath("//comment()")
-        for comment in comments:
-            if "div_stats" in comment.text:
-                parser = etree.HTMLParser(recover=True)
-                table_tree = etree.fromstring(comment.text, parser)
-
-                for player_elem in table_tree.xpath("//td[@data-stat='player']/a"):
-                    player_name = player_elem.text
-                    player_url = player_elem.get("href")
-                    if player_name and player_url:
-                        player_urls[player_name] = f"https://fbref.com{player_url}"
-                return player_urls
-
-        # If not inside a comment, try normal extraction
-        for player_elem in tree.xpath("//td[@data-stat='player']/a"):
-            player_name = player_elem.text
-            player_url = player_elem.get("href")
-            if player_name and player_url:
-                player_urls[player_name] = f"https://fbref.com{player_url}"
-        return player_urls
-
-    def read_player_season_stats(  # noqa: C901
-        self, stat_type: str = "standard", extract_players_url: bool = False
-    ) -> pd.DataFrame:
+    def read_player_season_stats(self, stat_type: str = "standard") -> pd.DataFrame:
         """Retrieve players from the datasource for the selected leagues and seasons.
 
         The following stat types are available:
@@ -581,8 +553,6 @@ def read_player_season_stats(  # noqa: C901
         ----------
         stat_type :str
             Type of stats to retrieve.
-        extract_players_url :bool
-            If True, the URL to player profiles will be extracted.
 
         Raises
         ------
@@ -647,7 +617,7 @@ def read_player_season_stats(  # noqa: C901
             for elem in tree.xpath("//td[@data-stat='comp_level']//span"):
                 elem.getparent().remove(elem)
             if big_five:
-                df_table = _parse_table(tree)
+                df_table = _parse_table(tree, with_player_id=True)
                 df_table[("Unnamed: league", "league")] = (
                     df_table.xs("Comp", axis=1, level=1).squeeze().map(BIG_FIVE_DICT)
                 )
@@ -659,17 +629,11 @@ def read_player_season_stats(  # noqa: C901
                 (html_table,) = etree.fromstring(el.text, parser).xpath(
                     f"//table[contains(@id, 'stats_{stat_type}')]"
                 )
-                df_table = _parse_table(html_table)
+                df_table = _parse_table(html_table, with_player_id=True)
                 df_table[("Unnamed: league", "league")] = lkey
                 df_table[("Unnamed: season", "season")] = skey
             df_table = _fix_nation_col(df_table)
 
-            if extract_players_url:
-                player_links = self._extract_players_url(tree)
-                df_table["player_url"] = df_table[("Unnamed: 1_level_0", "Player")].map(
-                    player_links
-                )
-
             players.append(df_table)
 
         # return dataframe
@@ -1176,13 +1140,15 @@ def read_shot_events(
         )
 
 
-def _parse_table(html_table: html.HtmlElement) -> pd.DataFrame:
+def _parse_table(html_table: html.HtmlElement, with_player_id: bool = False) -> pd.DataFrame:
     """Parse HTML table into a dataframe.
 
     Parameters
     ----------
     html_table : lxml.html.HtmlElement
         HTML table to clean up.
+    with_player_id : bool
+        If True, will extract player IDs.
 
     Returns
     -------
@@ -1199,6 +1165,15 @@ def _parse_table(html_table: html.HtmlElement) -> pd.DataFrame:
         elem.getparent().remove(elem)
     # parse HTML to dataframe
     (df_table,) = pd.read_html(html.tostring(html_table), flavor="lxml")
+
+    if with_player_id:
+        # The value of each cell's "data-append-csv" attribute is used as the player ID.
+        # The path is relative (".//") so only cells inside html_table are matched.
+        player_ids = [
+            elem.get("data-append-csv") for elem in html_table.xpath(".//td[@data-append-csv]")
+        ]
+        df_table["player_id"] = player_ids
+
     return df_table.convert_dtypes()
 
 
diff --git a/tests/test_FBref.py b/tests/test_FBref.py
index 36eb974..31f4828 100644
--- a/tests/test_FBref.py
+++ b/tests/test_FBref.py
@@ -92,10 +92,6 @@ def test_read_player_season_stats(fbref_ligue1: FBref, stat_type: str) -> None:
     assert isinstance(fbref_ligue1.read_player_season_stats(stat_type), pd.DataFrame)
 
 
-def test_read_player_season_stats_with_player_url(fbref_ligue1: FBref) -> None:
-    assert isinstance(fbref_ligue1.read_player_season_stats("standard", True), pd.DataFrame)
-
-
 def test_read_schedule(fbref_ligue1: FBref) -> None:
     assert isinstance(fbref_ligue1.read_schedule(), pd.DataFrame)