Skip to content

Commit 1b0decf

Browse files
whalebot-helmsman authored and rasbt committed
Multiprocessing over features rather than CV folds in Sequential Feature Selection (addressing #191) (#193)
1 parent 89a2a0e commit 1b0decf

File tree

4 files changed

+66
-42
lines changed

4 files changed

+66
-42
lines changed

docs/sources/CHANGELOG.md

+3
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ The CHANGELOG for the current development version is available at
99
### Version 0.6.1 (TBD)
1010

1111

12+
1213
##### Downloads
1314

1415
- [Source code (zip)](https://github.com/rasbt/mlxtend/archive/v0.6.1.zip)
@@ -24,6 +25,8 @@ The CHANGELOG for the current development version is available at
2425
- `plot_decision_regions` now supports plotting decision regions for more than 2 training features. (via [James Bourbeau](https://github.com/jrbourbeau)).
2526

2627

28+
- Parallel execution in `mlxtend.feature_selection.SequentialFeatureSelector` and `mlxtend.feature_selection.ExhaustiveFeatureSelector` is now performed over different feature subsets instead of the different cross-validation folds to better utilize machines with multiple processors if the number of features is large ([#193](https://github.com/rasbt/mlxtend/pull/193), via [@whalebot-helmsman](https://github.com/whalebot-helmsman)).
29+
2730
##### Bug Fixes
2831

2932
- `SequentialFeatureSelector` now correctly accepts a `None` argument for the `scoring` parameter to infer the default scoring metric from scikit-learn classifiers and regressors.

mlxtend/feature_selection/exhaustive_feature_selector.py

+24-17
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,21 @@
2020
from sklearn.base import MetaEstimatorMixin
2121
from ..externals.name_estimators import _name_estimators
2222
from sklearn.model_selection import cross_val_score
23+
from sklearn.externals.joblib import Parallel, delayed
24+
25+
26+
def _calc_score(selector, X, y, indices):
27+
if selector.cv:
28+
scores = cross_val_score(selector.est_,
29+
X[:, indices], y,
30+
cv=selector.cv,
31+
scoring=selector.scorer,
32+
n_jobs=1,
33+
pre_dispatch=selector.pre_dispatch)
34+
else:
35+
selector.est_.fit(X[:, indices], y)
36+
scores = np.array([selector.scorer(selector.est_, X[:, indices], y)])
37+
return indices, scores
2338

2439

2540
class ExhaustiveFeatureSelector(BaseEstimator, MetaEstimatorMixin):
@@ -51,10 +66,11 @@ class ExhaustiveFeatureSelector(BaseEstimator, MetaEstimatorMixin):
5166
otherwise.
5267
No cross-validation if cv is None, False, or 0.
5368
n_jobs : int (default: 1)
54-
The number of CPUs to use for cross validation. -1 means 'all CPUs'.
69+
The number of CPUs to use for evaluating different feature subsets
70+
in parallel. -1 means 'all CPUs'.
5571
pre_dispatch : int, or string (default: '2*n_jobs')
5672
Controls the number of jobs that get dispatched
57-
during parallel execution in cross_val_score.
73+
during parallel execution if `n_jobs > 1` or `n_jobs=-1`.
5874
Reducing this number can be useful to avoid an explosion of
5975
memory consumption when more jobs get dispatched than CPUs can process.
6076
This parameter can be:
@@ -147,8 +163,12 @@ def fit(self, X, y):
147163

148164
self.subsets_ = {}
149165
all_comb = len(candidates)
150-
for iteration, c in enumerate(candidates):
151-
cv_scores = self._calc_score(X=X, y=y, indices=c)
166+
n_jobs = min(self.n_jobs, all_comb)
167+
parallel = Parallel(n_jobs=n_jobs, pre_dispatch=self.pre_dispatch)
168+
work = enumerate(parallel(delayed(_calc_score)(self, X, y, c)
169+
for c in candidates))
170+
171+
for iteration, (c, cv_scores) in work:
152172

153173
self.subsets_[iteration] = {'feature_idx': c,
154174
'cv_scores': cv_scores,
@@ -173,19 +193,6 @@ def fit(self, X, y):
173193
self.fitted = True
174194
return self
175195

176-
def _calc_score(self, X, y, indices):
177-
if self.cv:
178-
scores = cross_val_score(self.est_,
179-
X[:, indices], y,
180-
cv=self.cv,
181-
scoring=self.scorer,
182-
n_jobs=self.n_jobs,
183-
pre_dispatch=self.pre_dispatch)
184-
else:
185-
self.est_.fit(X[:, indices], y)
186-
scores = np.array([self.scorer(self.est_, X[:, indices], y)])
187-
return scores
188-
189196
def transform(self, X):
190197
"""Return the best selected features from X.
191198

mlxtend/feature_selection/sequential_feature_selector.py

+39-23
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,21 @@
2121
from sklearn.base import MetaEstimatorMixin
2222
from ..externals.name_estimators import _name_estimators
2323
from sklearn.model_selection import cross_val_score
24+
from sklearn.externals.joblib import Parallel, delayed
25+
26+
27+
def _calc_score(selector, X, y, indices):
28+
if selector.cv:
29+
scores = cross_val_score(selector.est_,
30+
X[:, indices], y,
31+
cv=selector.cv,
32+
scoring=selector.scorer,
33+
n_jobs=1,
34+
pre_dispatch=selector.pre_dispatch)
35+
else:
36+
selector.est_.fit(X[:, indices], y)
37+
scores = np.array([selector.scorer(selector.est_, X[:, indices], y)])
38+
return indices, scores
2439

2540

2641
class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
@@ -69,10 +84,11 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
6984
exclusion/inclusion if floating=True and
7085
algorithm gets stuck in cycles.
7186
n_jobs : int (default: 1)
72-
The number of CPUs to use for cross validation. -1 means 'all CPUs'.
87+
The number of CPUs to use for evaluating different feature subsets
88+
in parallel. -1 means 'all CPUs'.
7389
pre_dispatch : int, or string (default: '2*n_jobs')
7490
Controls the number of jobs that get dispatched
75-
during parallel execution in cross_val_score.
91+
during parallel execution if `n_jobs > 1` or `n_jobs=-1`.
7692
Reducing this number can be useful to avoid an explosion of
7793
memory consumption when more jobs get dispatched than CPUs can process.
7894
This parameter can be:
@@ -222,7 +238,7 @@ def fit(self, X, y):
222238
k_to_select = self.k_features[0]
223239
k_idx = tuple(range(X.shape[1]))
224240
k = len(k_idx)
225-
k_score = self._calc_score(X, y, k_idx)
241+
k_idx, k_score = _calc_score(self, X, y, k_idx)
226242
self.subsets_[k] = {
227243
'feature_idx': k_idx,
228244
'cv_scores': k_score,
@@ -325,32 +341,26 @@ def _is_stuck(self, sdq):
325341
stuck = True
326342
return stuck
327343

328-
def _calc_score(self, X, y, indices):
329-
if self.cv:
330-
scores = cross_val_score(self.est_,
331-
X[:, indices], y,
332-
cv=self.cv,
333-
scoring=self.scorer,
334-
n_jobs=self.n_jobs,
335-
pre_dispatch=self.pre_dispatch)
336-
else:
337-
self.est_.fit(X[:, indices], y)
338-
scores = np.array([self.scorer(self.est_, X[:, indices], y)])
339-
return scores
340-
341344
def _inclusion(self, orig_set, subset, X, y):
342345
all_avg_scores = []
343346
all_cv_scores = []
344347
all_subsets = []
345348
res = (None, None, None)
346349
remaining = orig_set - subset
347350
if remaining:
348-
for feature in remaining:
349-
new_subset = tuple(subset | {feature})
350-
cv_scores = self._calc_score(X, y, new_subset)
351+
features = len(remaining)
352+
n_jobs = min(self.n_jobs, features)
353+
parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose,
354+
pre_dispatch=self.pre_dispatch)
355+
work = parallel(delayed(_calc_score)
356+
(self, X, y, tuple(subset | {feature}))
357+
for feature in remaining)
358+
359+
for new_subset, cv_scores in work:
351360
all_avg_scores.append(cv_scores.mean())
352361
all_cv_scores.append(cv_scores)
353362
all_subsets.append(new_subset)
363+
354364
best = np.argmax(all_avg_scores)
355365
res = (all_subsets[best],
356366
all_avg_scores[best],
@@ -364,13 +374,19 @@ def _exclusion(self, feature_set, X, y, fixed_feature=None):
364374
all_avg_scores = []
365375
all_cv_scores = []
366376
all_subsets = []
367-
for p in combinations(feature_set, r=n - 1):
368-
if fixed_feature and fixed_feature not in set(p):
369-
continue
370-
cv_scores = self._calc_score(X, y, p)
377+
features = n
378+
n_jobs = min(self.n_jobs, features)
379+
parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose,
380+
pre_dispatch=self.pre_dispatch)
381+
work = parallel(delayed(_calc_score)(self, X, y, p)
382+
for p in combinations(feature_set, r=n - 1)
383+
if not fixed_feature or fixed_feature in set(p))
384+
385+
for p, cv_scores in work:
371386
all_avg_scores.append(cv_scores.mean())
372387
all_cv_scores.append(cv_scores)
373388
all_subsets.append(p)
389+
374390
best = np.argmax(all_avg_scores)
375391
res = (all_subsets[best],
376392
all_avg_scores[best],

requirements.txt

-2
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,3 @@ numpy>=1.10.4
33
pandas>=0.17.1
44
scikit-learn>=0.18
55
matplotlib>=1.5.1
6-
7-

0 commit comments

Comments (0)