Source code for coclust.clustering.spherical_kmeans

# -*- coding: utf-8 -*-

"""
:mod:`coclust.clustering.spherical_kmeans` provides an implementation of the
spherical k-means algorithm.
"""

# Author: Melissa Ailem <melissa.ailem@parisdescartes.fr>
#         Francois Role <francois.role@parisdescartes.fr>

# License: BSD 3 clause


import numpy as np
import scipy.sparse as sp
from sklearn.utils import check_random_state
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfTransformer

from ..io.input_checking import (check_numbers_clustering,
                                 check_array)
from ..initialization import (random_init_clustering)


[docs]class SphericalKmeans: """Spherical k-means clustering. Parameters ---------- n_clusters : int, optional, default: 2 Number of clusters to form init : numpy array or scipy sparse matrix, \ shape (n_features, n_clusters), optional, default: None Initial column labels max_iter : int, optional, default: 20 Maximum number of iterations n_init : int, optional, default: 1 Number of time the algorithm will be run with different initializations. The final results will be the best output of `n_init` consecutive runs. random_state : integer or numpy.RandomState, optional The generator used to initialize the centers. If an integer is given, it fixes the seed. Defaults to the global numpy random number generator. tol : float, default: 1e-9 Relative tolerance with regards to criterion to declare convergence weighting : boolean, default: True Flag to activate or deactivate TF-IDF weighting Attributes ---------- labels_ : array-like, shape (n_rows,) cluster label of each row criterion : float criterion obtained from the best run criterions : list of floats sequence of criterion values during the best run """ def __init__(self, n_clusters=2, init=None, max_iter=20, n_init=1, tol=1e-9, random_state=None, weighting=True): self.n_clusters = n_clusters self.init = init self.max_iter = max_iter self.n_init = n_init self.tol = tol self.random_state = check_random_state(random_state) self.labels_ = None self.criterions = [] self.criterion = -np.inf self.weighting = weighting self.Z = None self.Z_fuzzy = None
[docs] def fit(self, X, y=None): """Perform clustering. Parameters ---------- X : numpy array or scipy sparse matrix, shape=(n_samples, n_features) Matrix to be analyzed """ check_array(X, pos=False) check_numbers_clustering(X, self.n_clusters) criterion = self.criterion if self.weighting: transformer = TfidfTransformer(norm='l2', smooth_idf=True) X = transformer.fit_transform(X) X = X.todense() X = np.array(X) X = sp.lil_matrix(X) #X = sp.csr_matrix(X) X = normalize(X) #X = X.astype(float) random_state = check_random_state(self.random_state) seeds = random_state.randint(np.iinfo(np.int32).max, size=self.n_init) for seed in seeds: print(" == New init == ") self.random_state = seed self._fit_single(X) # remember attributes corresponding to the best criterion if (self.criterion > criterion): criterion = self.criterion criterions = self.criterions labels_ = self.labels_ z = self.Z z_fuzzy = self.Z_fuzzy self.random_state = random_state # update attributes self.criterion = criterion self.criterions = criterions self.row_labels_ = labels_ self.Z = z self.Z_fuzzy = z_fuzzy
def _fit_single(self, X, y=None): """Perform one run of clustering. Parameters ---------- X : numpy array or scipy sparse matrix, shape=(n_samples, n_features) Matrix to be analyzed """ K = self.n_clusters if self.init is None: Z = random_init_clustering(K, X.shape[0], self.random_state) else: Z = np.matrix(self.init, dtype=float) X = sp.lil_matrix(X) Z = sp.lil_matrix(Z) # random_init function returns a nd_array change = True c_init = -np.inf c_list = [] n_iter = 0 while change and n_iter < self.max_iter: print("iteration:", n_iter) change = False # compute centroids (in fact only summation along cols) centers = Z.T*X # centers = sparse matrix # normalize centroids centers = normalize(centers) # hard assignment #Z=centers*X.T Z1 = X * centers.T Z1 = Z1.todense() Z1 = np.array(Z1) Z = np.zeros_like(Z1) Z[np.arange(len(Z1)), Z1.argmax(1)] = 1 Z = sp.csc_matrix(Z) # compute and check if cosine criterion still evolves k_times_k = Z.T * X * centers.T c = np.trace(k_times_k.todense()) # no trace for sp ... if np.abs(c - c_init) > 1e-9: c_init = c change = True c_list.append(c) print(c) n_iter += 1 self.criterion = c self.criterions = c_list part = Z.todense().argmax(axis=1).tolist() self.labels_ = [item for sublist in part for item in sublist] self.Z = Z self.Z_fuzzy = Z1