-
Notifications
You must be signed in to change notification settings - Fork 876
/
Copy pathprincipal_component_analysis.py
146 lines (122 loc) · 4.85 KB
/
principal_component_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# Sebastian Raschka 2014-2018
# mlxtend Machine Learning Library Extensions
#
# Principal Component Analysis for dimensionality reduction.
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause
import numpy as np
from .._base import _BaseModel
class PrincipalComponentAnalysis(_BaseModel):
    """
    Principal Component Analysis Class

    Parameters
    ----------
    n_components : int (default: None)
        The number of principal components for transformation.
        Keeps the original dimensions of the dataset if `None`.
    solver : str (default: 'svd')
        Method for performing the matrix decomposition.
        {'eigen', 'svd'}

    Attributes
    ----------
    w_ : array-like, shape=[n_features, n_components]
        Projection matrix
    e_vals_ : array-like, shape=[n_features]
        Eigenvalues in sorted order.
    e_vecs_ : array-like, shape=[n_features]
        Eigenvectors in sorted order.
    loadings_ : array_like, shape=[n_features, n_features]
        The factor loadings of the original variables onto
        the principal components. The columns are the principal
        components, and the rows are the features loadings.
        For instance, the first column contains the loadings onto
        the first principal component. Note that the signs may
        be flipped depending on whether you use the 'eigen' or
        'svd' solver; this does not affect the interpretation
        of the loadings though.

    Examples
    -----------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/feature_extraction/PrincipalComponentAnalysis/

    """
    def __init__(self, n_components=None, solver='svd'):
        valid_solver = {'eigen', 'svd'}
        if solver not in valid_solver:
            # NOTE: AttributeError (rather than ValueError) is kept for
            # backward compatibility with existing callers.
            raise AttributeError('Must be in %s. Found %s'
                                 % (valid_solver, solver))
        self.solver = solver
        if n_components is not None and n_components < 1:
            # BUG FIX: the message previously said "> 1", but the guard
            # rejects only values below 1 — n_components=1 is valid.
            raise AttributeError('n_components must be >= 1 or None')
        self.n_components = n_components
        self._is_fitted = False

    def fit(self, X, y=None):
        """Learn model from training data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.
        y : ignored
            Present for API consistency; PCA is unsupervised.

        Returns
        -------
        self : object

        """
        self._is_fitted = False
        self._check_arrays(X=X)
        self._fit(X=X)
        self._is_fitted = True
        return self

    def _fit(self, X):
        """Run the decomposition and set w_, e_vals_, e_vecs_, loadings_."""
        n_samples = X.shape[0]
        n_features = X.shape[1]
        # Silently cap n_components at the number of features; None keeps
        # all dimensions.
        if self.n_components is None or self.n_components > n_features:
            n_components = n_features
        else:
            n_components = self.n_components
        if self.solver == 'eigen':
            # Eigendecomposition of the (mean-centered) sample covariance.
            cov_mat = self._covariance_matrix(X)
            self.e_vals_, self.e_vecs_ =\
                self._decomposition(cov_mat, n_samples)
        elif self.solver == 'svd':
            # NOTE(review): the svd path decomposes X directly without
            # mean-centering, so results differ from the 'eigen' path on
            # uncentered data — confirm callers center X beforehand.
            self.e_vals_, self.e_vecs_ = self._decomposition(X, n_samples)
        self.w_ = self._projection_matrix(eig_vals=self.e_vals_,
                                          eig_vecs=self.e_vecs_,
                                          n_components=n_components)
        self.loadings_ = self._loadings()
        return self

    def transform(self, X):
        """ Apply the linear transformation on X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        X_projected : np.ndarray, shape = [n_samples, n_components]
            Projected training vectors.

        Raises
        ------
        AttributeError
            If `fit` has not been called yet.

        """
        self._check_arrays(X=X)
        if not hasattr(self, 'w_'):
            # BUG FIX: typo in the error message ("as" -> "has").
            raise AttributeError('Object has not been fitted, yet.')
        return X.dot(self.w_)

    def _covariance_matrix(self, X):
        """Return the sample covariance matrix of the mean-centered X
        (normalized by n_samples - 1)."""
        mean_vec = np.mean(X, axis=0)
        cov_mat = (X - mean_vec).T.dot((X - mean_vec)) / (X.shape[0] - 1)
        return cov_mat

    def _decomposition(self, mat, n_samples):
        """Eigendecompose `mat` ('eigen') or SVD it ('svd'); return
        eigenvalues and eigenvectors sorted by decreasing eigenvalue."""
        if self.solver == 'eigen':
            e_vals, e_vecs = np.linalg.eig(mat)
        elif self.solver == 'svd':
            # Left-singular vectors of X.T are the principal axes; squared
            # singular values are proportional to the eigenvalues.
            u, s, v = np.linalg.svd(mat.T)
            e_vecs, e_vals = u, s
            # NOTE(review): divides by n_samples, while the 'eigen' path's
            # covariance uses (n_samples - 1); the two solvers therefore
            # scale eigenvalues slightly differently. Left unchanged to
            # preserve existing numeric behavior — confirm intended.
            e_vals = e_vals ** 2 / n_samples
        # Sort eigenpairs by descending eigenvalue.
        sort_idx = np.argsort(e_vals)[::-1]
        e_vals, e_vecs = e_vals[sort_idx], e_vecs[:, sort_idx]
        return e_vals, e_vecs

    def _loadings(self):
        """Compute factor loadings (eigenvectors scaled by the square
        root of their eigenvalues)."""
        return self.e_vecs_ * np.sqrt(self.e_vals_)

    def _projection_matrix(self, eig_vals, eig_vecs, n_components):
        """Stack the top `n_components` eigenvectors column-wise into the
        [n_features, n_components] projection matrix."""
        matrix_w = np.vstack([eig_vecs[:, i] for i in range(n_components)]).T
        return matrix_w