diff --git a/soccerdata/fbref.py b/soccerdata/fbref.py index cf4a0e2..ced8030 100644 --- a/soccerdata/fbref.py +++ b/soccerdata/fbref.py @@ -617,7 +617,7 @@ def read_player_season_stats(self, stat_type: str = "standard") -> pd.DataFrame: for elem in tree.xpath("//td[@data-stat='comp_level']//span"): elem.getparent().remove(elem) if big_five: - df_table = _parse_table(tree) + df_table = _parse_table(tree, with_player_id=True) df_table[("Unnamed: league", "league")] = ( df_table.xs("Comp", axis=1, level=1).squeeze().map(BIG_FIVE_DICT) ) @@ -629,10 +629,11 @@ def read_player_season_stats(self, stat_type: str = "standard") -> pd.DataFrame: (html_table,) = etree.fromstring(el.text, parser).xpath( f"//table[contains(@id, 'stats_{stat_type}')]" ) - df_table = _parse_table(html_table) + df_table = _parse_table(html_table, with_player_id=True) df_table[("Unnamed: league", "league")] = lkey df_table[("Unnamed: season", "season")] = skey df_table = _fix_nation_col(df_table) + players.append(df_table) # return dataframe @@ -1139,13 +1140,15 @@ def read_shot_events( ) -def _parse_table(html_table: html.HtmlElement) -> pd.DataFrame: +def _parse_table(html_table: html.HtmlElement, with_player_id: bool = False) -> pd.DataFrame: """Parse HTML table into a dataframe. Parameters ---------- html_table : lxml.html.HtmlElement HTML table to clean up. + with_player_id : bool + If True, will extract player IDs. Returns ------- @@ -1162,6 +1165,15 @@ def _parse_table(html_table: html.HtmlElement) -> pd.DataFrame: elem.getparent().remove(elem) # parse HTML to dataframe (df_table,) = pd.read_html(html.tostring(html_table), flavor="lxml") + + if with_player_id: + player_ids = [ + elem.get("data-append-csv") for elem in html_table.xpath("//td[@data-append-csv]") + ] + df_table["player_id"] = player_ids + + return df_table.convert_dtypes() + return df_table.convert_dtypes() diff --git a/tests/test_FBref.py b/tests/test_FBref.py index a4bab4a..31f4828 100644 --- a/tests/test_FBref.py +++ b/tests/test_FBref.py @@ -110,7 +110,8 @@ def test_read_schedule(fbref_ligue1: FBref) -> None: ) def test_read_player_match_stats(fbref_ligue1: FBref, stat_type: str) -> None: assert isinstance( - fbref_ligue1.read_player_match_stats(stat_type, match_id="796787da"), pd.DataFrame + fbref_ligue1.read_player_match_stats(stat_type, match_id="796787da"), + pd.DataFrame, ) @@ -143,17 +144,29 @@ def test_read_lineup(fbref_ligue1: FBref) -> None: def test_concat() -> None: df1 = pd.DataFrame( columns=pd.MultiIndex.from_tuples( - [("Unnamed: a", "player"), ("Performance", "Goals"), ("Performance", "Assists")] + [ + ("Unnamed: a", "player"), + ("Performance", "Goals"), + ("Performance", "Assists"), + ] ) ) df2 = pd.DataFrame( columns=pd.MultiIndex.from_tuples( - [("Unnamed: a", "player"), ("Unnamed: b", "Goals"), ("Performance", "Assists")] + [ + ("Unnamed: a", "player"), + ("Unnamed: b", "Goals"), + ("Performance", "Assists"), + ] ) ) df3 = pd.DataFrame( columns=pd.MultiIndex.from_tuples( - [("Unnamed: a", "player"), ("Goals", "Unnamed: b"), ("Performance", "Assists")] + [ + ("Unnamed: a", "player"), + ("Goals", "Unnamed: b"), + ("Performance", "Assists"), + ] ) ) res = _concat([df1, df2, df3], key=["player"])