Skip to content

Commit

Permalink
Initial commit to pass an estimator check. WIP
Browse files (browse the repository at this point in the history)
  • Loading branch information
rosecers committed May 17, 2023
1 parent dbab1f2 commit af60553
Show file tree
Hide file tree
Showing 19 changed files with 355 additions and 249 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ __pycache__
*.egg-info
*.swp
*.swo
*DS_Store

.tox/
build/
Expand Down
168 changes: 84 additions & 84 deletions src/skmatter/_selection.py

Large diffs are not rendered by default.

83 changes: 46 additions & 37 deletions src/skmatter/decomposition/_kernel_pcovr.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,6 @@ def __init__(
self.kernel_params = kernel_params

self.n_jobs = n_jobs
self.n_samples_ = None

self.fit_inverse_transform = fit_inverse_transform

Expand Down Expand Up @@ -308,17 +307,19 @@ def fit(self, X, Y, W=None):

if self.n_components is None:
if self.svd_solver != "arpack":
self.n_components = X.shape[0]
self.n_components_ = X.shape[0]
else:
self.n_components = X.shape[0] - 1
self.n_components_ = X.shape[0] - 1
else:
self.n_components_ = self.n_components

K = self._get_kernel(X)

if self.center:
self.centerer_ = KernelNormalizer()
K = self.centerer_.fit_transform(K)

self.n_samples_ = X.shape[0]
self.n_samples_in_, self.n_features_in_ = X.shape

if self.regressor != "precomputed":
if self.regressor is None:
Expand Down Expand Up @@ -362,7 +363,7 @@ def fit(self, X, Y, W=None):
# to avoid needing to compute the kernel a second time
self.regressor_ = check_krr_fit(regressor, K, X, Y)

W = self.regressor_.dual_coef_.reshape(X.shape[0], -1)
W = self.regressor_.dual_coef_.reshape(self.n_samples_in_, -1)

# Use this instead of `self.regressor_.predict(K)`
# so that we can handle the case of the pre-fitted regressor
Expand All @@ -387,12 +388,17 @@ def fit(self, X, Y, W=None):
# Handle svd_solver
self._fit_svd_solver = self.svd_solver
if self._fit_svd_solver == "auto":
# Small problem or self.n_components == 'mle', just call full PCA
if max(X.shape) <= 500 or self.n_components == "mle":
# Small problem or self.n_components_ == 'mle', just call full PCA
if (
max(self.n_samples_in_, self.n_features_in_) <= 500
or self.n_components_ == "mle"
):
self._fit_svd_solver = "full"
elif self.n_components >= 1 and self.n_components < 0.8 * min(X.shape):
elif self.n_components_ >= 1 and self.n_components_ < 0.8 * max(
self.n_samples_in_, self.n_features_in_
):
self._fit_svd_solver = "randomized"
# This is also the case of self.n_components in (0,1)
# This is also the case of self.n_components_ in (0,1)
else:
self._fit_svd_solver = "full"

Expand Down Expand Up @@ -536,31 +542,31 @@ def score(self, X, Y):
return -sum([Lkpca, Lkrr])

def _decompose_truncated(self, mat):
if not 1 <= self.n_components <= self.n_samples_:
if not 1 <= self.n_components_ <= self.n_samples_in_:
raise ValueError(
"n_components=%r must be between 1 and "
"n_samples=%r with "
"svd_solver='%s'"
% (
self.n_components,
self.n_samples_,
self.n_components_,
self.n_samples_in_,
self.svd_solver,
)
)
elif not isinstance(self.n_components, numbers.Integral):
elif not isinstance(self.n_components_, numbers.Integral):
raise ValueError(
"n_components=%r must be of type int "
"when greater than or equal to 1, was of type=%r"
% (self.n_components, type(self.n_components))
% (self.n_components_, type(self.n_components_))
)
elif self.svd_solver == "arpack" and self.n_components == self.n_samples_:
elif self.svd_solver == "arpack" and self.n_components_ == self.n_samples_in_:
raise ValueError(
"n_components=%r must be strictly less than "
"n_samples=%r with "
"svd_solver='%s'"
% (
self.n_components,
self.n_samples_,
self.n_components_,
self.n_samples_in_,
self.svd_solver,
)
)
Expand All @@ -569,7 +575,7 @@ def _decompose_truncated(self, mat):

if self._fit_svd_solver == "arpack":
v0 = _init_arpack_v0(min(mat.shape), random_state)
U, S, Vt = svds(mat, k=self.n_components, tol=self.tol, v0=v0)
U, S, Vt = svds(mat, k=self.n_components_, tol=self.tol, v0=v0)
# svds doesn't abide by scipy.linalg.svd/randomized_svd
# conventions, so reverse its outputs.
S = S[::-1]
Expand All @@ -581,7 +587,7 @@ def _decompose_truncated(self, mat):
# sign flipping is done inside
U, S, Vt = randomized_svd(
mat,
n_components=self.n_components,
n_components=self.n_components_,
n_iter=self.iterated_power,
flip_sign=True,
random_state=random_state,
Expand All @@ -594,24 +600,25 @@ def _decompose_truncated(self, mat):
return U, S, Vt

def _decompose_full(self, mat):
if self.n_components != "mle":
if not (0 <= self.n_components <= self.n_samples_):
if self.n_components_ != "mle":
if not (0 <= self.n_components_ <= self.n_samples_in_):
raise ValueError(
"n_components=%r must be between 1 and "
"n_samples=%r with "
"svd_solver='%s'"
% (
self.n_components,
self.n_samples_,
self.n_components_,
self.n_samples_in_,
self.svd_solver,
)
)
elif self.n_components >= 1:
if not isinstance(self.n_components, numbers.Integral):
elif self.n_components_ >= 1:
if not isinstance(self.n_components_, numbers.Integral):
raise ValueError(
"n_components=%r must be of type int "
"when greater than or equal to 1, "
"was of type=%r" % (self.n_components, type(self.n_components))
"was of type=%r"
% (self.n_components_, type(self.n_components_))
)

U, S, Vt = linalg.svd(mat, full_matrices=False)
Expand All @@ -623,26 +630,28 @@ def _decompose_full(self, mat):
U, Vt = svd_flip(U, Vt)

# Get variance explained by singular values
explained_variance_ = (S**2) / (self.n_samples_ - 1)
explained_variance_ = (S**2) / (self.n_samples_in_ - 1)
total_var = explained_variance_.sum()
explained_variance_ratio_ = explained_variance_ / total_var

# Postprocess the number of components required
if self.n_components == "mle":
self.n_components = _infer_dimension(explained_variance_, self.n_samples_)
elif 0 < self.n_components < 1.0:
if self.n_components_ == "mle":
self.n_components_ = _infer_dimension(
explained_variance_, self.n_samples_in_
)
elif 0 < self.n_components_ < 1.0:
# number of components for which the cumulated explained
# variance percentage is superior to the desired threshold
# side='right' ensures that, for the number of features selected,
# their variance is always greater than self.n_components float
# their variance is always greater than self.n_components_ float
# passed. More discussion in issue: #15669
ratio_cumsum = stable_cumsum(explained_variance_ratio_)
self.n_components = (
np.searchsorted(ratio_cumsum, self.n_components, side="right") + 1
self.n_components_ = (
np.searchsorted(ratio_cumsum, self.n_components_, side="right") + 1
)
self.n_components = self.n_components

return (
U[:, : self.n_components],
S[: self.n_components],
Vt[: self.n_components],
U[:, : self.n_components_],
S[: self.n_components_],
Vt[: self.n_components_],
)
Loading

0 comments on commit af60553

Please sign in to comment.