@@ -27,7 +27,7 @@ class StringEncoder(SingleColumnTransformer):
27
27
n_components : int, default=30
28
28
Number of components to be used for the singular value decomposition (SVD).
29
29
Must be a positive integer.
30
- vectorizer : str, "tfidf" or "hashing"
30
+ vectorizer : str, "tfidf" or "hashing", default="tfidf"
31
31
Vectorizer to apply to the strings, either `tfidf` or `hashing` for
32
32
scikit-learn TfidfVectorizer or HashingVectorizer respectively.
33
33
@@ -133,12 +133,17 @@ def fit_transform(self, X, y=None):
133
133
f" 'hashing', got { self .vectorizer !r} "
134
134
)
135
135
136
- X = sbd .fill_nulls (X , "" )
137
- X_out = self .vectorizer_ .fit_transform (X )
136
+ X_filled = sbd .fill_nulls (X , "" )
137
+ X_out = self .vectorizer_ .fit_transform (X_filled ).astype ("float32" )
138
+ del X_filled # optimizes memory: we no longer need X_filled
138
139
139
- if (min_shape := min (X_out .shape )) >= self .n_components :
140
- self .tsvd_ = TruncatedSVD (n_components = self .n_components )
140
+ if (min_shape := min (X_out .shape )) > self .n_components :
141
+ self .tsvd_ = TruncatedSVD (
142
+ n_components = self .n_components , algorithm = "arpack"
143
+ )
141
144
result = self .tsvd_ .fit_transform (X_out )
145
+ elif X_out .shape [1 ] == self .n_components :
146
+ result = X_out .toarray ()
142
147
else :
143
148
warnings .warn (
144
149
f"The matrix shape is { (X_out .shape )} , and its minimum is "
@@ -152,6 +157,8 @@ def fit_transform(self, X, y=None):
152
157
# Therefore, self.n_components_ below stores the resulting
153
158
# number of dimensions of result.
154
159
result = X_out [:, : self .n_components ].toarray ()
160
+ result = result .copy () # To avoid a reference to X_out
161
+ del X_out # optimize memory: we no longer need X_out
155
162
156
163
self ._is_fitted = True
157
164
self .n_components_ = result .shape [1 ]
@@ -177,12 +184,15 @@ def transform(self, X):
177
184
The embedding representation of the input.
178
185
"""
179
186
180
- X = sbd .fill_nulls (X , "" )
181
- X_out = self .vectorizer_ .transform (X )
187
+ X_filled = sbd .fill_nulls (X , "" )
188
+ X_out = self .vectorizer_ .transform (X_filled ).astype ("float32" )
189
+ del X_filled # optimizes memory: we no longer need X_filled (X is still used below)
182
190
if hasattr (self , "tsvd_" ):
183
191
result = self .tsvd_ .transform (X_out )
184
192
else :
185
193
result = X_out [:, : self .n_components ].toarray ()
194
+ result = result .copy ()
195
+ del X_out # optimize memory: we no longer need X_out
186
196
187
197
return self ._post_process (X , result )
188
198
0 commit comments