ENH: optimize StringEncoder #1248

Merged — 7 commits, Feb 27, 2025. Changes shown from 4 commits.
3 changes: 3 additions & 0 deletions CHANGES.rst
@@ -24,6 +24,9 @@ Changes
 - Progress messages when generating a ``TableReport`` are now written to stderr instead of stdout.
   :pr:`1236` by :user:`Priscilla Baah<priscilla-b>`

+- Optimize the :class:`StringEncoder`: significant memory reduction and 1.5x speed-up.
+  :pr:`1248` by :user:`Gaël Varoquaux <gaelvaroquaux>`
+
 Release 0.5.1
 =============
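For context, a `StringEncoder` is essentially a text vectorizer followed by a truncated SVD, which is the pipeline the tests in this PR compare against. A scikit-learn-only sketch; the `char_wb` analyzer and `(3, 4)` n-gram range are assumed defaults, not taken from this PR:

```python
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Rough equivalent of StringEncoder with vectorizer="tfidf": tf-idf on
# character n-grams, reduced to n_components dimensions by a truncated SVD.
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 4))),
    ("svd", TruncatedSVD(n_components=2)),
])
docs = ["open a bank account", "open an account", "close the account", "hello world"]
embeddings = pipe.fit_transform(docs)
print(embeddings.shape)  # (4, 2)
```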
24 changes: 17 additions & 7 deletions skrub/_string_encoder.py
@@ -27,7 +27,7 @@
 n_components : int, default=30
     Number of components to be used for the singular value decomposition (SVD).
     Must be a positive integer.
-vectorizer : str, "tfidf" or "hashing"
+vectorizer : str, "tfidf" or "hashing", default="tfidf"
     Vectorizer to apply to the strings, either `tfidf` or `hashing` for
     scikit-learn TfidfVectorizer or HashingVectorizer respectively.
@@ -133,12 +133,17 @@
                 f" 'hashing', got {self.vectorizer!r}"
             )

-        X = sbd.fill_nulls(X, "")
-        X_out = self.vectorizer_.fit_transform(X)
+        X_filled = sbd.fill_nulls(X, "")
+        X_out = self.vectorizer_.fit_transform(X_filled).astype("float32")
+        del X_filled  # optimize memory: we no longer need X_filled

-        if (min_shape := min(X_out.shape)) >= self.n_components:
-            self.tsvd_ = TruncatedSVD(n_components=self.n_components)
+        if (min_shape := min(X_out.shape)) > self.n_components:
+            self.tsvd_ = TruncatedSVD(
+                n_components=self.n_components, algorithm="arpack"
+            )
             result = self.tsvd_.fit_transform(X_out)
+        elif X_out.shape[1] == self.n_components:
+            result = X_out.toarray()

Check warning (Codecov / codecov/patch) on skrub/_string_encoder.py#L146: Added line #L146 was not covered by tests.
Comment on lines +145 to +146
Member: does the 'arpack' algorithm not like it when p == n_components? Or is skipping the TSVD in that case an optimization? If the latter, I guess the case where the number of discovered n-grams is exactly equal to n_components might be too rare to warrant it, and we might want a test for that branch.

Member: I guess when n_components matches the number of dimensions of the vector, running the SVD doesn't make sense? We could coalesce this unlikely condition with the else statement below, though.

Member Author: It's both, actually. I think I needed this for the tests to pass; at least my first implementation hit a corner case in the tests (good tests!).
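The constraint behind this thread can be checked directly: scipy's `svds`, which backs `algorithm="arpack"`, requires the number of components to be strictly below `min(X.shape)`, which is why the comparison became a strict `>` and the equality case skips the SVD. A small illustrative sketch, not from the PR:

```python
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

rng = np.random.default_rng(0)
X = csr_matrix(rng.random((10, 5)).astype("float32"))

# Strictly fewer components than min(X.shape): arpack is happy.
ok = TruncatedSVD(n_components=4, algorithm="arpack").fit_transform(X)
print(ok.shape)  # (10, 4)

# n_components == min(X.shape): scipy's svds refuses, hence the separate
# branch that returns the dense matrix without running an SVD at all.
try:
    TruncatedSVD(n_components=5, algorithm="arpack").fit(X)
    rejected = False
except ValueError:
    rejected = True
print("rejected:", rejected)
```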

         else:
             warnings.warn(
                 f"The matrix shape is {(X_out.shape)}, and its minimum is "
@@ -152,6 +157,8 @@
             # Therefore, self.n_components_ below stores the resulting
             # number of dimensions of result.
             result = X_out[:, : self.n_components].toarray()
+            result = result.copy()
Member: I suppose the copy() is there because otherwise the reference to the slice would prevent X_out from being garbage collected; might be worth a short comment.
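The reviewer's concern is general numpy behavior: a basic slice is a view whose `base` attribute keeps the whole parent array alive. A minimal illustration (note that `.toarray()` on a sparse matrix already returns an owning array, so in this PR the `copy()` is belt-and-braces):

```python
import numpy as np

big = np.ones((1000, 1000), dtype="float32")
view = big[:, :10]         # basic slicing returns a view...
print(view.base is big)    # True: the view keeps all of `big` reachable
owned = view.copy()        # ...while copy() owns its own small buffer
print(owned.base is None)  # True
del big, view              # now the 4 MB buffer can actually be freed
```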

+        del X_out  # optimize memory: we no longer need X_out

         self._is_fitted = True
         self.n_components_ = result.shape[1]
@@ -177,12 +184,15 @@
             The embedding representation of the input.
         """

-        X = sbd.fill_nulls(X, "")
-        X_out = self.vectorizer_.transform(X)
+        X_filled = sbd.fill_nulls(X, "")
+        X_out = self.vectorizer_.transform(X_filled).astype("float32")
+        del X_filled  # optimize memory: we no longer need X_filled
         if hasattr(self, "tsvd_"):
             result = self.tsvd_.transform(X_out)
         else:
             result = X_out[:, : self.n_components].toarray()
+            result = result.copy()
+        del X_out  # optimize memory: we no longer need X_out

         return self._post_process(X, result)
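The other half of the memory saving is the `astype("float32")` cast on the vectorizer output, which halves the value buffer of the sparse matrix (index arrays are unchanged). A quick sketch, illustrative rather than taken from the PR:

```python
import numpy as np
from scipy.sparse import random as sparse_random

# A sparse matrix shaped like a small tf-idf output.
X64 = sparse_random(1000, 500, density=0.01, format="csr",
                    dtype=np.float64, random_state=0)
X32 = X64.astype(np.float32)

# 8 bytes per stored value shrink to 4.
print(X64.data.nbytes, X32.data.nbytes)  # 40000 20000
```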
18 changes: 10 additions & 8 deletions skrub/tests/test_string_encoder.py
@@ -1,4 +1,5 @@
 import pytest
+from numpy.testing import assert_almost_equal
 from sklearn.base import clone
 from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_extraction.text import (
@@ -37,6 +38,7 @@ def test_tfidf_vectorizer(encode_column, df_module):
         ]
     )
     check = pipe.fit_transform(sbd.to_numpy(encode_column))
+    check = check.astype("float32")  # StringEncoder is float32

     names = [f"col1_{idx}" for idx in range(2)]
@@ -197,21 +199,21 @@ def test_missing_values(df_module, vectorizer):
     encoder = StringEncoder(n_components=2, vectorizer=vectorizer)
     out = encoder.fit_transform(col)
     for c in sbd.to_column_list(out):
-        assert c[1] == 0.0
-        assert c[2] == 0.0
+        assert_almost_equal(c[1], 0.0, decimal=6)
+        assert_almost_equal(c[2], 0.0, decimal=6)
     out = encoder.transform(col)
     for c in sbd.to_column_list(out):
-        assert c[1] == 0.0
-        assert c[2] == 0.0
+        assert_almost_equal(c[1], 0.0, decimal=6)
+        assert_almost_equal(c[2], 0.0, decimal=6)
     tv = TableVectorizer(
         low_cardinality=StringEncoder(n_components=2, vectorizer=vectorizer)
     )
     df = df_module.make_dataframe({"col": col})
     out = tv.fit_transform(df)
     for c in sbd.to_column_list(out):
-        assert c[1] == 0.0
-        assert c[2] == 0.0
+        assert_almost_equal(c[1], 0.0, decimal=6)
+        assert_almost_equal(c[2], 0.0, decimal=6)
     out = tv.transform(df)
     for c in sbd.to_column_list(out):
-        assert c[1] == 0.0
-        assert c[2] == 0.0
+        assert_almost_equal(c[1], 0.0, decimal=6)
+        assert_almost_equal(c[2], 0.0, decimal=6)
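The switch to `assert_almost_equal` follows from the float32 cast: values that came out exactly 0.0 in float64 can now carry tiny rounding residue, so exact equality is too strict. A toy illustration of the tolerance involved, not taken from the test suite:

```python
import numpy as np
from numpy.testing import assert_almost_equal

a = np.float32(0.1) * np.float32(3)   # float32 arithmetic
b = 0.1 * 3                           # float64 arithmetic
assert a != b                         # exact comparison across precisions fails
assert_almost_equal(a, b, decimal=6)  # passes: |a - b| < 1.5e-6
```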