From bf6a565800c8646bfa9b1a0d84dc8aa15eeb2ce6 Mon Sep 17 00:00:00 2001 From: Gael Varoquaux Date: Wed, 26 Feb 2025 13:55:25 +0100 Subject: [PATCH 1/7] ENH: optimize StringEncoder For memory and speed --- CHANGES.rst | 3 +++ skrub/_string_encoder.py | 12 ++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 62d0d1707..e07583963 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -24,6 +24,9 @@ Changes - Progress messages when generating a ``TableReport`` are now written to stderr instead of stdout. :pr:`1236` by :user:`Priscilla Baah` +- Optimize the :class:`StringEncoder`: significant memory reduction and 1.5x speed-up. + :pr:`1243` by :user:`Gaël Varoquaux ` + Release 0.5.1 ============= diff --git a/skrub/_string_encoder.py b/skrub/_string_encoder.py index 3d82024d8..b93975792 100644 --- a/skrub/_string_encoder.py +++ b/skrub/_string_encoder.py @@ -27,7 +27,7 @@ class StringEncoder(SingleColumnTransformer): n_components : int, default=30 Number of components to be used for the singular value decomposition (SVD). Must be a positive integer. - vectorizer : str, "tfidf" or "hashing" + vectorizer : str, "tfidf" or "hashing", default="tfidf" Vectorizer to apply to the strings, either `tfidf` or `hashing` for scikit-learn TfidfVectorizer or HashingVectorizer respectively. 
@@ -133,11 +133,13 @@ def fit_transform(self, X, y=None): f" 'hashing', got {self.vectorizer!r}" ) - X = sbd.fill_nulls(X, "") - X_out = self.vectorizer_.fit_transform(X) + X_filled = sbd.fill_nulls(X, "") + X_out = self.vectorizer_.fit_transform(X_filled).astype('float32') + del X_filled # optimizes memory: we no longer need X if (min_shape := min(X_out.shape)) >= self.n_components: - self.tsvd_ = TruncatedSVD(n_components=self.n_components) + self.tsvd_ = TruncatedSVD(n_components=self.n_components, + algorithm='arpack') result = self.tsvd_.fit_transform(X_out) else: warnings.warn( @@ -152,6 +154,8 @@ def fit_transform(self, X, y=None): # Therefore, self.n_components_ below stores the resulting # number of dimensions of result. result = X_out[:, : self.n_components].toarray() + result = result.copy() + del X_out # optimize memory: we no longer need X_out self._is_fitted = True self.n_components_ = result.shape[1] From ef02b735ed0a55219dd76992fea63385ef5983f5 Mon Sep 17 00:00:00 2001 From: Gael Varoquaux Date: Wed, 26 Feb 2025 14:03:59 +0100 Subject: [PATCH 2/7] style --- CHANGES.rst | 2 +- skrub/_string_encoder.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index e07583963..3933466e1 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -25,7 +25,7 @@ Changes :pr:`1236` by :user:`Priscilla Baah` - Optimize the :class:`StringEncoder`: significant memory reduction and 1.5x speed-up. 
- :pr:`1243` by :user:`Gaël Varoquaux ` + :pr:`1248` by :user:`Gaël Varoquaux <GaelVaroquaux>` Release 0.5.1 ============= diff --git a/skrub/_string_encoder.py b/skrub/_string_encoder.py index b93975792..6b8d61d38 100644 --- a/skrub/_string_encoder.py +++ b/skrub/_string_encoder.py @@ -134,12 +134,13 @@ def fit_transform(self, X, y=None): ) X_filled = sbd.fill_nulls(X, "") - X_out = self.vectorizer_.fit_transform(X_filled).astype('float32') - del X_filled # optimizes memory: we no longer need X + X_out = self.vectorizer_.fit_transform(X_filled).astype("float32") + del X_filled # optimizes memory: we no longer need X if (min_shape := min(X_out.shape)) >= self.n_components: - self.tsvd_ = TruncatedSVD(n_components=self.n_components, - algorithm='arpack') + self.tsvd_ = TruncatedSVD( + n_components=self.n_components, algorithm="arpack" + ) result = self.tsvd_.fit_transform(X_out) else: warnings.warn( @@ -155,7 +156,7 @@ def fit_transform(self, X, y=None): # number of dimensions of result. result = X_out[:, : self.n_components].toarray() result = result.copy() - del X_out # optimize memory: we no longer need X_out + del X_out # optimize memory: we no longer need X_out self._is_fitted = True self.n_components_ = result.shape[1] From 8464087ce54e170becf13fe206ad4537bfd10f04 Mon Sep 17 00:00:00 2001 From: Gael Varoquaux Date: Wed, 26 Feb 2025 14:22:09 +0100 Subject: [PATCH 3/7] [doc build] Force a doc build, and also fix some failing examples (still more to do) --- skrub/_string_encoder.py | 7 +++++-- skrub/tests/test_string_encoder.py | 18 ++++++++++-------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/skrub/_string_encoder.py b/skrub/_string_encoder.py index 6b8d61d38..c32e4f074 100644 --- a/skrub/_string_encoder.py +++ b/skrub/_string_encoder.py @@ -182,12 +182,15 @@ def transform(self, X): The embedding representation of the input. 
""" - X = sbd.fill_nulls(X, "") - X_out = self.vectorizer_.transform(X) + X_filled = sbd.fill_nulls(X, "") + X_out = self.vectorizer_.transform(X_filled).astype("float32") + del X_filled # optimizes memory: we no longer need X if hasattr(self, "tsvd_"): result = self.tsvd_.transform(X_out) else: result = X_out[:, : self.n_components].toarray() + result = result.copy() + del X_out # optimize memory: we no longer need X_out return self._post_process(X, result) diff --git a/skrub/tests/test_string_encoder.py b/skrub/tests/test_string_encoder.py index 34b9cb8fb..5d3632bf1 100644 --- a/skrub/tests/test_string_encoder.py +++ b/skrub/tests/test_string_encoder.py @@ -1,4 +1,5 @@ import pytest +from numpy.testing import assert_almost_equal from sklearn.base import clone from sklearn.decomposition import TruncatedSVD from sklearn.feature_extraction.text import ( @@ -37,6 +38,7 @@ def test_tfidf_vectorizer(encode_column, df_module): ] ) check = pipe.fit_transform(sbd.to_numpy(encode_column)) + check = check.astype("float32") # StringEncoder is float32 names = [f"col1_{idx}" for idx in range(2)] @@ -197,21 +199,21 @@ def test_missing_values(df_module, vectorizer): encoder = StringEncoder(n_components=2, vectorizer=vectorizer) out = encoder.fit_transform(col) for c in sbd.to_column_list(out): - assert c[1] == 0.0 - assert c[2] == 0.0 + assert_almost_equal(c[1], 0.0) + assert_almost_equal(c[2], 0.0) out = encoder.transform(col) for c in sbd.to_column_list(out): - assert c[1] == 0.0 - assert c[2] == 0.0 + assert_almost_equal(c[1], 0.0) + assert_almost_equal(c[2], 0.0) tv = TableVectorizer( low_cardinality=StringEncoder(n_components=2, vectorizer=vectorizer) ) df = df_module.make_dataframe({"col": col}) out = tv.fit_transform(df) for c in sbd.to_column_list(out): - assert c[1] == 0.0 - assert c[2] == 0.0 + assert_almost_equal(c[1], 0.0) + assert_almost_equal(c[2], 0.0) out = tv.transform(df) for c in sbd.to_column_list(out): - assert c[1] == 0.0 - assert c[2] == 0.0 + 
assert_almost_equal(c[1], 0.0) + assert_almost_equal(c[2], 0.0) From f5a1d84c5529ac0f53182208a0762a9710862812 Mon Sep 17 00:00:00 2001 From: Gael Varoquaux Date: Wed, 26 Feb 2025 14:40:38 +0100 Subject: [PATCH 4/7] fix tests --- skrub/_string_encoder.py | 4 +++- skrub/tests/test_string_encoder.py | 16 ++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/skrub/_string_encoder.py b/skrub/_string_encoder.py index c32e4f074..18e1bea7e 100644 --- a/skrub/_string_encoder.py +++ b/skrub/_string_encoder.py @@ -137,11 +137,13 @@ def fit_transform(self, X, y=None): X_out = self.vectorizer_.fit_transform(X_filled).astype("float32") del X_filled # optimizes memory: we no longer need X - if (min_shape := min(X_out.shape)) >= self.n_components: + if (min_shape := min(X_out.shape)) > self.n_components: self.tsvd_ = TruncatedSVD( n_components=self.n_components, algorithm="arpack" ) result = self.tsvd_.fit_transform(X_out) + elif X_out.shape[1] == self.n_components: + result = X_out.toarray() else: warnings.warn( f"The matrix shape is {(X_out.shape)}, and its minimum is " diff --git a/skrub/tests/test_string_encoder.py b/skrub/tests/test_string_encoder.py index 5d3632bf1..6c8a75ec1 100644 --- a/skrub/tests/test_string_encoder.py +++ b/skrub/tests/test_string_encoder.py @@ -199,21 +199,21 @@ def test_missing_values(df_module, vectorizer): encoder = StringEncoder(n_components=2, vectorizer=vectorizer) out = encoder.fit_transform(col) for c in sbd.to_column_list(out): - assert_almost_equal(c[1], 0.0) - assert_almost_equal(c[2], 0.0) + assert_almost_equal(c[1], 0.0, decimal=6) + assert_almost_equal(c[2], 0.0, decimal=6) out = encoder.transform(col) for c in sbd.to_column_list(out): - assert_almost_equal(c[1], 0.0) - assert_almost_equal(c[2], 0.0) + assert_almost_equal(c[1], 0.0, decimal=6) + assert_almost_equal(c[2], 0.0, decimal=6) tv = TableVectorizer( low_cardinality=StringEncoder(n_components=2, vectorizer=vectorizer) ) df = 
df_module.make_dataframe({"col": col}) out = tv.fit_transform(df) for c in sbd.to_column_list(out): - assert_almost_equal(c[1], 0.0) - assert_almost_equal(c[2], 0.0) + assert_almost_equal(c[1], 0.0, decimal=6) + assert_almost_equal(c[2], 0.0, decimal=6) out = tv.transform(df) for c in sbd.to_column_list(out): - assert_almost_equal(c[1], 0.0) - assert_almost_equal(c[2], 0.0) + assert_almost_equal(c[1], 0.0, decimal=6) + assert_almost_equal(c[2], 0.0, decimal=6) From eafa5ff0c4ad80edaf4467d6ca2580c497e36ce4 Mon Sep 17 00:00:00 2001 From: Gael Varoquaux Date: Wed, 26 Feb 2025 18:41:56 +0100 Subject: [PATCH 5/7] comment --- skrub/_string_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/_string_encoder.py b/skrub/_string_encoder.py index 18e1bea7e..911e2386d 100644 --- a/skrub/_string_encoder.py +++ b/skrub/_string_encoder.py @@ -157,7 +157,7 @@ def fit_transform(self, X, y=None): # Therefore, self.n_components_ below stores the resulting # number of dimensions of result. 
result = X_out[:, : self.n_components].toarray() - result = result.copy() + result = result.copy() # To avoid a reference to X_out del X_out # optimize memory: we no longer need X_out self._is_fitted = True From 4b219dccf9c67af21a7057c4e1d134a83064f347 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Wed, 26 Feb 2025 18:59:06 +0100 Subject: [PATCH 6/7] add test --- skrub/tests/test_string_encoder.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/skrub/tests/test_string_encoder.py b/skrub/tests/test_string_encoder.py index 6c8a75ec1..2c624eb11 100644 --- a/skrub/tests/test_string_encoder.py +++ b/skrub/tests/test_string_encoder.py @@ -193,6 +193,14 @@ def test_n_components(df_module): assert encoder_30.n_components_ == 30 +def test_n_components_equal_voc_size(df_module): + x = df_module.make_column("x", ["aab", "bba"]) + encoder = StringEncoder(n_components=2, ngram_range=(1, 1)) + out = encoder.fit_transform(x) + assert sbd.column_names(out) == ["x_0", "x_1"] + assert not hasattr(encoder, "tsvd_") + + @pytest.mark.parametrize("vectorizer", ["tfidf", "hashing"]) def test_missing_values(df_module, vectorizer): col = df_module.make_column("col", ["one two", None, "", "two three"]) From 4514d79111e777739a55ec5830f2aeed0d876db9 Mon Sep 17 00:00:00 2001 From: Jerome Dockes Date: Thu, 27 Feb 2025 09:37:14 +0100 Subject: [PATCH 7/7] fix test --- skrub/tests/test_string_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/tests/test_string_encoder.py b/skrub/tests/test_string_encoder.py index 2c624eb11..a0160f417 100644 --- a/skrub/tests/test_string_encoder.py +++ b/skrub/tests/test_string_encoder.py @@ -195,7 +195,7 @@ def test_n_components(df_module): def test_n_components_equal_voc_size(df_module): x = df_module.make_column("x", ["aab", "bba"]) - encoder = StringEncoder(n_components=2, ngram_range=(1, 1)) + encoder = StringEncoder(n_components=2, ngram_range=(1, 1), analyzer="char") out = encoder.fit_transform(x) assert 
sbd.column_names(out) == ["x_0", "x_1"] assert not hasattr(encoder, "tsvd_")