ENH: optimize StringEncoder (#1248)

GaelVaroquaux · jeromedockes · web-flow · commit 97011bda76e5 · 2025-02-27T14:08:06.000+01:00
Co-authored-by: Jerome Dockes <jerome@dockes.org>
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -24,6 +24,9 @@ Changes
 - Progress messages when generating a ``TableReport`` are now written to stderr instead of stdout.
   :pr:`1236` by :user:`Priscilla Baah<priscilla-b>`
 
+- Optimize the :class:`StringEncoder`: significant memory reduction and 1.5x speed-up.
+  :pr:`1248` by :user:`Gaël Varoquaux <gaelvaroquaux>`
+
 Release 0.5.1
 =============
 
diff --git a/skrub/_string_encoder.py b/skrub/_string_encoder.py
@@ -27,7 +27,7 @@ class StringEncoder(SingleColumnTransformer):
     n_components : int, default=30
         Number of components to be used for the singular value decomposition (SVD).
         Must be a positive integer.
-    vectorizer : str, "tfidf" or "hashing"
+    vectorizer : str, "tfidf" or "hashing", default="tfidf"
         Vectorizer to apply to the strings, either `tfidf` or `hashing` for
         scikit-learn TfidfVectorizer or HashingVectorizer respectively.
 
@@ -133,12 +133,17 @@ def fit_transform(self, X, y=None):
                 f" 'hashing', got {self.vectorizer!r}"
             )
 
-        X = sbd.fill_nulls(X, "")
-        X_out = self.vectorizer_.fit_transform(X)
+        X_filled = sbd.fill_nulls(X, "")
+        X_out = self.vectorizer_.fit_transform(X_filled).astype("float32")
+        del X_filled  # optimizes memory: we no longer need X_filled
 
-        if (min_shape := min(X_out.shape)) >= self.n_components:
-            self.tsvd_ = TruncatedSVD(n_components=self.n_components)
+        if (min_shape := min(X_out.shape)) > self.n_components:
+            self.tsvd_ = TruncatedSVD(
+                n_components=self.n_components, algorithm="arpack"
+            )
             result = self.tsvd_.fit_transform(X_out)
+        elif X_out.shape[1] == self.n_components:
+            result = X_out.toarray()
         else:
             warnings.warn(
                 f"The matrix shape is {(X_out.shape)}, and its minimum is "
@@ -152,6 +157,8 @@ def fit_transform(self, X, y=None):
             # Therefore, self.n_components_ below stores the resulting
             # number of dimensions of result.
             result = X_out[:, : self.n_components].toarray()
+            result = result.copy()  # To avoid a reference to X_out
+        del X_out  # optimize memory: we no longer need X_out
 
         self._is_fitted = True
         self.n_components_ = result.shape[1]
@@ -177,12 +184,15 @@ def transform(self, X):
             The embedding representation of the input.
         """
 
-        X = sbd.fill_nulls(X, "")
-        X_out = self.vectorizer_.transform(X)
+        X_filled = sbd.fill_nulls(X, "")
+        X_out = self.vectorizer_.transform(X_filled).astype("float32")
+        del X_filled  # optimizes memory: we no longer need X_filled
         if hasattr(self, "tsvd_"):
             result = self.tsvd_.transform(X_out)
         else:
             result = X_out[:, : self.n_components].toarray()
+            result = result.copy()
+        del X_out  # optimize memory: we no longer need X_out
 
         return self._post_process(X, result)
 
diff --git a/skrub/tests/test_string_encoder.py b/skrub/tests/test_string_encoder.py
@@ -1,4 +1,5 @@
 import pytest
+from numpy.testing import assert_almost_equal
 from sklearn.base import clone
 from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_extraction.text import (
@@ -37,6 +38,7 @@ def test_tfidf_vectorizer(encode_column, df_module):
         ]
     )
     check = pipe.fit_transform(sbd.to_numpy(encode_column))
+    check = check.astype("float32")  # StringEncoder is float32
 
     names = [f"col1_{idx}" for idx in range(2)]
 
@@ -191,27 +193,35 @@ def test_n_components(df_module):
     assert encoder_30.n_components_ == 30
 
 
+def test_n_components_equal_voc_size(df_module):
+    x = df_module.make_column("x", ["aab", "bba"])
+    encoder = StringEncoder(n_components=2, ngram_range=(1, 1), analyzer="char")
+    out = encoder.fit_transform(x)
+    assert sbd.column_names(out) == ["x_0", "x_1"]
+    assert not hasattr(encoder, "tsvd_")
+
+
 @pytest.mark.parametrize("vectorizer", ["tfidf", "hashing"])
 def test_missing_values(df_module, vectorizer):
     col = df_module.make_column("col", ["one two", None, "", "two three"])
     encoder = StringEncoder(n_components=2, vectorizer=vectorizer)
     out = encoder.fit_transform(col)
     for c in sbd.to_column_list(out):
-        assert c[1] == 0.0
-        assert c[2] == 0.0
+        assert_almost_equal(c[1], 0.0, decimal=6)
+        assert_almost_equal(c[2], 0.0, decimal=6)
     out = encoder.transform(col)
     for c in sbd.to_column_list(out):
-        assert c[1] == 0.0
-        assert c[2] == 0.0
+        assert_almost_equal(c[1], 0.0, decimal=6)
+        assert_almost_equal(c[2], 0.0, decimal=6)
     tv = TableVectorizer(
         low_cardinality=StringEncoder(n_components=2, vectorizer=vectorizer)
     )
     df = df_module.make_dataframe({"col": col})
     out = tv.fit_transform(df)
     for c in sbd.to_column_list(out):
-        assert c[1] == 0.0
-        assert c[2] == 0.0
+        assert_almost_equal(c[1], 0.0, decimal=6)
+        assert_almost_equal(c[2], 0.0, decimal=6)
     out = tv.transform(df)
     for c in sbd.to_column_list(out):
-        assert c[1] == 0.0
-        assert c[2] == 0.0
+        assert_almost_equal(c[1], 0.0, decimal=6)
+        assert_almost_equal(c[2], 0.0, decimal=6)