Source code for coclust.clustering.spherical_kmeans

# -*- coding: utf-8 -*-

"""
:mod:`coclust.clustering.spherical_kmeans` provides an implementation of the
spherical k-means algorithm.
"""

# Author: Melissa Ailem <melissa.ailem@parisdescartes.fr>
#         Francois Role <francois.role@parisdescartes.fr>

# License: BSD 3 clause


import numpy as np
import scipy.sparse as sp
from sklearn.utils import check_random_state
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfTransformer

from ..io.input_checking import (check_numbers_clustering,
                                 check_array)
from ..initialization import (random_init_clustering)


[docs]class SphericalKmeans:
    """Spherical k-means clustering.

    Parameters
    ----------
    n_clusters : int, optional, default: 2
        Number of clusters to form
    init : numpy array or scipy sparse matrix, \
        shape (n_features, n_clusters), optional, default: None
        Initial column labels
    max_iter : int, optional, default: 20
        Maximum number of iterations
    n_init : int, optional, default: 1
        Number of time the algorithm will be run with different
        initializations. The final results will be the best output of `n_init`
        consecutive runs.
    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.
    tol : float, default: 1e-9
        Relative tolerance with regards to criterion to declare convergence
    weighting : boolean, default: True
        Flag to activate or deactivate TF-IDF weighting

    Attributes
    ----------
    labels_ : array-like, shape (n_rows,)
        cluster label of each row
    criterion : float
        criterion obtained from the best run
    criterions : list of floats
        sequence of criterion values during the best run
    """

    def __init__(self, n_clusters=2, init=None, max_iter=20, n_init=1,
                 tol=1e-9, random_state=None, weighting=True):
        self.n_clusters = n_clusters
        self.init = init
        self.max_iter = max_iter
        self.n_init = n_init
        self.tol = tol
        self.random_state = check_random_state(random_state)
        self.labels_ = None
        self.criterions = []
        self.criterion = -np.inf
        self.weighting = weighting
        self.Z = None
        self.Z_fuzzy = None

[docs]    def fit(self, X, y=None):
        """Perform clustering.

        Parameters
        ----------
        X : numpy array or scipy sparse matrix, shape=(n_samples, n_features)
            Matrix to be analyzed
        """

        check_array(X, pos=False)

        check_numbers_clustering(X, self.n_clusters)

        criterion = self.criterion

        if self.weighting:
            transformer = TfidfTransformer(norm='l2', smooth_idf=True)
            X = transformer.fit_transform(X)

        X = X.todense()
        X = np.array(X)
        X = sp.lil_matrix(X)
        #X = sp.csr_matrix(X)
        X = normalize(X)

        #X = X.astype(float)

        random_state = check_random_state(self.random_state)
        seeds = random_state.randint(np.iinfo(np.int32).max, size=self.n_init)
        for seed in seeds:
            print(" == New init == ")
            self.random_state = seed
            self._fit_single(X)
            # remember attributes corresponding to the best criterion
            if (self.criterion > criterion):
                criterion = self.criterion
                criterions = self.criterions
                labels_ = self.labels_
                z = self.Z
                z_fuzzy = self.Z_fuzzy

        self.random_state = random_state

        # update attributes
        self.criterion = criterion
        self.criterions = criterions
        self.row_labels_ = labels_
        self.Z = z
        self.Z_fuzzy = z_fuzzy

    def _fit_single(self, X, y=None):
        """Perform one run of clustering.

        Parameters
        ----------
        X : numpy array or scipy sparse matrix, shape=(n_samples, n_features)
            Matrix to be analyzed
        """
        K = self.n_clusters

        if self.init is None:
            Z = random_init_clustering(K, X.shape[0], self.random_state)
        else:
            Z = np.matrix(self.init, dtype=float)

        X = sp.lil_matrix(X)

        Z = sp.lil_matrix(Z)  # random_init function returns a nd_array

        change = True

        c_init = -np.inf
        c_list = []
        n_iter = 0

        while change and n_iter < self.max_iter:
            print("iteration:", n_iter)
            change = False

            # compute centroids (in fact only summation along cols)
            centers = Z.T*X  # centers = sparse matrix

            # normalize centroids
            centers = normalize(centers)

            # hard assignment
            #Z=centers*X.T
            Z1 = X * centers.T
            Z1 = Z1.todense()
            Z1 = np.array(Z1)
            Z = np.zeros_like(Z1)
            Z[np.arange(len(Z1)), Z1.argmax(1)] = 1
            Z = sp.csc_matrix(Z)

            # compute and check if cosine criterion still evolves
            k_times_k = Z.T * X * centers.T
            c = np.trace(k_times_k.todense())  # no trace for sp ...

            if np.abs(c - c_init) > 1e-9:
                c_init = c
                change = True
                c_list.append(c)
                print(c)
            n_iter += 1

        self.criterion = c
        self.criterions = c_list
        part = Z.todense().argmax(axis=1).tolist()
        self.labels_ = [item for sublist in part for item in sublist]
        self.Z = Z
        self.Z_fuzzy = Z1