Skip to content

Commit 1b0decf

Browse files
whalebot-helmsman authored and rasbt committed
Multiprocessing over features rather than CV folds in Sequential Feature Selection (addressing #191) (#193)
1 parent 89a2a0e commit 1b0decf

File tree

4 files changed

+66
-42
lines changed

4 files changed

+66
-42
lines changed

docs/sources/CHANGELOG.md

+3
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ The CHANGELOG for the current development version is available at
99
### Version 0.6.1 (TBD)
1010

1111

12+
1213
##### Downloads
1314

1415
- [Source code (zip)](https://github.com/rasbt/mlxtend/archive/v0.6.1.zip)
@@ -24,6 +25,8 @@ The CHANGELOG for the current development version is available at
2425
- `plot_decision_regions` now supports plotting decision regions for more than 2 training features. (via [James Bourbeau](https://github.com/jrbourbeau)).
2526

2627

28+
- Parallel execution in `mlxtend.feature_selection.SequentialFeatureSelector` and `mlxtend.feature_selection.ExhaustiveFeatureSelector` is now performed over different feature subsets instead of the different cross-validation folds to better utilize machines with multiple processors if the number of features is large ([#193](https://github.com/rasbt/mlxtend/pull/193), via [@whalebot-helmsman](https://github.com/whalebot-helmsman)).
29+
2730
##### Bug Fixes
2831

2932
- `SequentialFeatureSelector` now correctly accepts a `None` argument for the `scoring` parameter to infer the default scoring metric from scikit-learn classifiers and regressors.

mlxtend/feature_selection/exhaustive_feature_selector.py

+24-17
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,21 @@
2020
from sklearn.base import MetaEstimatorMixin
2121
from ..externals.name_estimators import _name_estimators
2222
from sklearn.model_selection import cross_val_score
23+
from sklearn.externals.joblib import Parallel, delayed
24+
25+
26+
def _calc_score(selector, X, y, indices):
27+
if selector.cv:
28+
scores = cross_val_score(selector.est_,
29+
X[:, indices], y,
30+
cv=selector.cv,
31+
scoring=selector.scorer,
32+
n_jobs=1,
33+
pre_dispatch=selector.pre_dispatch)
34+
else:
35+
selector.est_.fit(X[:, indices], y)
36+
scores = np.array([selector.scorer(selector.est_, X[:, indices], y)])
37+
return indices, scores
2338

2439

2540
class ExhaustiveFeatureSelector(BaseEstimator, MetaEstimatorMixin):
@@ -51,10 +66,11 @@ class ExhaustiveFeatureSelector(BaseEstimator, MetaEstimatorMixin):
5166
otherwise.
5267
No cross-validation if cv is None, False, or 0.
5368
n_jobs : int (default: 1)
54-
The number of CPUs to use for cross validation. -1 means 'all CPUs'.
69+
The number of CPUs to use for evaluating different feature subsets
70+
in parallel. -1 means 'all CPUs'.
5571
pre_dispatch : int, or string (default: '2*n_jobs')
5672
Controls the number of jobs that get dispatched
57-
during parallel execution in cross_val_score.
73+
during parallel execution if `n_jobs > 1` or `n_jobs=-1`.
5874
Reducing this number can be useful to avoid an explosion of
5975
memory consumption when more jobs get dispatched than CPUs can process.
6076
This parameter can be:
@@ -147,8 +163,12 @@ def fit(self, X, y):
147163

148164
self.subsets_ = {}
149165
all_comb = len(candidates)
150-
for iteration, c in enumerate(candidates):
151-
cv_scores = self._calc_score(X=X, y=y, indices=c)
166+
n_jobs = min(self.n_jobs, all_comb)
167+
parallel = Parallel(n_jobs=n_jobs, pre_dispatch=self.pre_dispatch)
168+
work = enumerate(parallel(delayed(_calc_score)(self, X, y, c)
169+
for c in candidates))
170+
171+
for iteration, (c, cv_scores) in work:
152172

153173
self.subsets_[iteration] = {'feature_idx': c,
154174
'cv_scores': cv_scores,
@@ -173,19 +193,6 @@ def fit(self, X, y):
173193
self.fitted = True
174194
return self
175195

176-
def _calc_score(self, X, y, indices):
177-
if self.cv:
178-
scores = cross_val_score(self.est_,
179-
X[:, indices], y,
180-
cv=self.cv,
181-
scoring=self.scorer,
182-
n_jobs=self.n_jobs,
183-
pre_dispatch=self.pre_dispatch)
184-
else:
185-
self.est_.fit(X[:, indices], y)
186-
scores = np.array([self.scorer(self.est_, X[:, indices], y)])
187-
return scores
188-
189196
def transform(self, X):
190197
"""Return the best selected features from X.
191198

mlxtend/feature_selection/sequential_feature_selector.py

+39-23
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,21 @@
2121
from sklearn.base import MetaEstimatorMixin
2222
from ..externals.name_estimators import _name_estimators
2323
from sklearn.model_selection import cross_val_score
24+
from sklearn.externals.joblib import Parallel, delayed
25+
26+
27+
def _calc_score(selector, X, y, indices):
28+
if selector.cv:
29+
scores = cross_val_score(selector.est_,
30+
X[:, indices], y,
31+
cv=selector.cv,
32+
scoring=selector.scorer,
33+
n_jobs=1,
34+
pre_dispatch=selector.pre_dispatch)
35+
else:
36+
selector.est_.fit(X[:, indices], y)
37+
scores = np.array([selector.scorer(selector.est_, X[:, indices], y)])
38+
return indices, scores
2439

2540

2641
class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
@@ -69,10 +84,11 @@ class SequentialFeatureSelector(BaseEstimator, MetaEstimatorMixin):
6984
exclusion/inclusion if floating=True and
7085
algorithm gets stuck in cycles.
7186
n_jobs : int (default: 1)
72-
The number of CPUs to use for cross validation. -1 means 'all CPUs'.
87+
The number of CPUs to use for evaluating different feature subsets
88+
in parallel. -1 means 'all CPUs'.
7389
pre_dispatch : int, or string (default: '2*n_jobs')
7490
Controls the number of jobs that get dispatched
75-
during parallel execution in cross_val_score.
91+
during parallel execution if `n_jobs > 1` or `n_jobs=-1`.
7692
Reducing this number can be useful to avoid an explosion of
7793
memory consumption when more jobs get dispatched than CPUs can process.
7894
This parameter can be:
@@ -222,7 +238,7 @@ def fit(self, X, y):
222238
k_to_select = self.k_features[0]
223239
k_idx = tuple(range(X.shape[1]))
224240
k = len(k_idx)
225-
k_score = self._calc_score(X, y, k_idx)
241+
k_idx, k_score = _calc_score(self, X, y, k_idx)
226242
self.subsets_[k] = {
227243
'feature_idx': k_idx,
228244
'cv_scores': k_score,
@@ -325,32 +341,26 @@ def _is_stuck(self, sdq):
325341
stuck = True
326342
return stuck
327343

328-
def _calc_score(self, X, y, indices):
329-
if self.cv:
330-
scores = cross_val_score(self.est_,
331-
X[:, indices], y,
332-
cv=self.cv,
333-
scoring=self.scorer,
334-
n_jobs=self.n_jobs,
335-
pre_dispatch=self.pre_dispatch)
336-
else:
337-
self.est_.fit(X[:, indices], y)
338-
scores = np.array([self.scorer(self.est_, X[:, indices], y)])
339-
return scores
340-
341344
def _inclusion(self, orig_set, subset, X, y):
342345
all_avg_scores = []
343346
all_cv_scores = []
344347
all_subsets = []
345348
res = (None, None, None)
346349
remaining = orig_set - subset
347350
if remaining:
348-
for feature in remaining:
349-
new_subset = tuple(subset | {feature})
350-
cv_scores = self._calc_score(X, y, new_subset)
351+
features = len(remaining)
352+
n_jobs = min(self.n_jobs, features)
353+
parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose,
354+
pre_dispatch=self.pre_dispatch)
355+
work = parallel(delayed(_calc_score)
356+
(self, X, y, tuple(subset | {feature}))
357+
for feature in remaining)
358+
359+
for new_subset, cv_scores in work:
351360
all_avg_scores.append(cv_scores.mean())
352361
all_cv_scores.append(cv_scores)
353362
all_subsets.append(new_subset)
363+
354364
best = np.argmax(all_avg_scores)
355365
res = (all_subsets[best],
356366
all_avg_scores[best],
@@ -364,13 +374,19 @@ def _exclusion(self, feature_set, X, y, fixed_feature=None):
364374
all_avg_scores = []
365375
all_cv_scores = []
366376
all_subsets = []
367-
for p in combinations(feature_set, r=n - 1):
368-
if fixed_feature and fixed_feature not in set(p):
369-
continue
370-
cv_scores = self._calc_score(X, y, p)
377+
features = n
378+
n_jobs = min(self.n_jobs, features)
379+
parallel = Parallel(n_jobs=n_jobs, verbose=self.verbose,
380+
pre_dispatch=self.pre_dispatch)
381+
work = parallel(delayed(_calc_score)(self, X, y, p)
382+
for p in combinations(feature_set, r=n - 1)
383+
if not fixed_feature or fixed_feature in set(p))
384+
385+
for p, cv_scores in work:
371386
all_avg_scores.append(cv_scores.mean())
372387
all_cv_scores.append(cv_scores)
373388
all_subsets.append(p)
389+
374390
best = np.argmax(all_avg_scores)
375391
res = (all_subsets[best],
376392
all_avg_scores[best],

requirements.txt

-2
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,3 @@ numpy>=1.10.4
33
pandas>=0.17.1
44
scikit-learn>=0.18
55
matplotlib>=1.5.1
6-
7-

0 commit comments

Comments (0)