Fix sklearn tests for kmeans (#263)
Pahandrovich authored Aug 12, 2020
1 parent b9299b4 commit 40d5b9a
Showing 2 changed files with 29 additions and 10 deletions.
38 changes: 28 additions & 10 deletions daal4py/sklearn/cluster/_k_means_0_23.py
@@ -27,6 +27,7 @@

string_types = str

from sklearn.exceptions import ConvergenceWarning
from sklearn.utils.extmath import row_norms
import warnings

@@ -72,7 +73,7 @@ def _tolerance(X, rtol):
mean_var = _daal_mean_var(X)
return mean_var * rtol

def _daal4py_compute_starting_centroids(X, X_fptype, nClusters, cluster_centers_0, random_state):
def _daal4py_compute_starting_centroids(X, X_fptype, nClusters, cluster_centers_0, verbose, random_state):

def is_string(s, target_str):
return isinstance(s, string_types) and s == target_str
@@ -108,7 +109,11 @@ def is_string(s, target_str):
kmeans_init_res = kmeans_init.compute(X)
centroids_ = kmeans_init_res.centroids
else:
raise ValueError("Cluster centers should either be 'k-means++', 'random', 'deterministic' or an array")
raise ValueError(
f"init should be either 'k-means++', 'random', a ndarray or a "
f"callable, got '{cluster_centers_0}' instead.")
if verbose:
print("Initialization complete")
return deterministic, centroids_

def _daal4py_kmeans_compatibility(nClusters, maxIterations, fptype = "double",
@@ -144,7 +149,7 @@ def _daal4py_k_means_predict(X, nClusters, centroids, resultsToEvaluate = 'compu
return res.assignments[:,0], res.objectiveFunction[0,0]


def _daal4py_k_means_fit(X, nClusters, numIterations, tol, cluster_centers_0, n_init, random_state):
def _daal4py_k_means_fit(X, nClusters, numIterations, tol, cluster_centers_0, n_init, verbose, random_state):
if numIterations < 0:
raise ValueError("Wrong iterations number")

@@ -164,11 +169,14 @@ def _daal4py_k_means_fit(X, nClusters, numIterations, tol, cluster_centers_0, n_

for k in range(n_init):
deterministic, starting_centroids_ = _daal4py_compute_starting_centroids(
X, X_fptype, nClusters, cluster_centers_0, random_state)
X, X_fptype, nClusters, cluster_centers_0, verbose, random_state)

res = kmeans_algo.compute(X, starting_centroids_)

inertia = res.objectiveFunction[0,0]
if verbose:
print(f"Iteration {k}, inertia {inertia}.")

if best_inertia is None or inertia < best_inertia:
best_cluster_centers = res.centroids
if n_init > 1:
@@ -184,6 +192,15 @@ def _daal4py_k_means_fit(X, nClusters, numIterations, tol, cluster_centers_0, n_

flag_compute = 'computeAssignments|computeExactObjectiveFunction'
best_labels, best_inertia = _daal4py_k_means_predict(X, nClusters, best_cluster_centers, flag_compute)

distinct_clusters = np.unique(best_labels).size
if distinct_clusters < nClusters:
warnings.warn(
"Number of distinct clusters ({}) found smaller than "
"n_clusters ({}). Possibly due to duplicate points "
"in X.".format(distinct_clusters, nClusters),
ConvergenceWarning, stacklevel=2) # for passing test case "test_kmeans_warns_less_centers_than_unique_points"

return best_cluster_centers, best_labels, best_inertia, best_n_iter


@@ -219,14 +236,14 @@ def fit(self, X, y=None, sample_weight=None):
self._n_threads = _openmp_effective_n_threads(self._n_threads)

if self.n_init <= 0:
raise ValueError("Invalid number of initializations."
" n_init=%d must be bigger than zero." % self.n_init)
raise ValueError(
f"n_init should be > 0, got {self.n_init} instead.")

random_state = check_random_state(self.random_state)

if self.max_iter <= 0:
raise ValueError('Number of iterations should be a positive number,'
' got %d instead' % self.max_iter)
raise ValueError(
f"max_iter should be > 0, got {self.max_iter} instead.")

# avoid forcing order when copy_x=False
order = "C" if self.copy_x else None
@@ -264,7 +281,7 @@ def fit(self, X, y=None, sample_weight=None):
self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
_daal4py_k_means_fit(
X, self.n_clusters, self.max_iter, self.tol, self.init, self.n_init,
random_state)
self.verbose, random_state)
else:
logging.info("sklearn.cluster.KMeans.fit: " + method_uses_sklearn)
super(KMeans, self).fit(X, y=y, sample_weight=sample_weight)
@@ -296,7 +313,8 @@ def predict(self, X, sample_weight=None):

X = self._check_test_data(X)

daal_ready = sample_weight is None and hasattr(X, '__array__') # or sp.isspmatrix_csr(X)
daal_ready = sample_weight is None and hasattr(X, '__array__')
daal_ready = daal_ready or sp.isspmatrix_csr(X)

if daal_ready:
logging.info("sklearn.cluster.KMeans.predict: " + method_uses_daal)
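A minimal usage sketch (not part of the commit) of the behaviour these changes enable: verbose progress output from the daal4py code path, and a ConvergenceWarning when fewer distinct clusters are found than n_clusters, e.g. because X contains duplicate points. It assumes daal4py is installed and that the patched estimator is importable as daal4py.sklearn.cluster.KMeans.

# Hypothetical example, not part of this commit.
import warnings
import numpy as np
from daal4py.sklearn.cluster import KMeans  # assumed import path for the patched estimator

# Only two distinct points, but three requested clusters.
X = np.array([[0.0, 0.0]] * 5 + [[1.0, 1.0]] * 5)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # verbose=True should print "Initialization complete" and
    # "Iteration 0, inertia ..." via the code added above.
    km = KMeans(n_clusters=3, n_init=1, verbose=True, random_state=0).fit(X)

# Fewer distinct labels than n_clusters should trigger the new ConvergenceWarning.
print([w.category.__name__ for w in caught])
print(np.unique(km.labels_))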
1 change: 1 addition & 0 deletions deselected_tests.yaml
@@ -28,6 +28,7 @@ deselected_tests:

# DAAL does not check convergence for tol == 0.0 for ease of benchmarking
- cluster/tests/test_k_means.py::test_kmeans_convergence
- cluster/tests/test_k_means.py::test_kmeans_verbose

# For Newton-CG solver, solution computed in float32 disagrees with that of float64 a little more than
# the test expects, see https://github.com/scikit-learn/scikit-learn/pull/13645