Source code for coclust.coclustering.coclust_mod

# -*- coding: utf-8 -*-

"""
The :mod:`coclust.coclustering.coclust_mod` module provides an implementation
of a co-clustering algorithm by direct maximization of graph modularity.
"""

# Author: Francois Role <francois.role@gmail.com>
#         Stanislas Morbieu <stanislas.morbieu@gmail.com>

# License: BSD 3 clause

import numpy as np
from sklearn.utils import check_random_state, check_array

from ..initialization import random_init
from .base_diagonal_coclust import BaseDiagonalCoclust


[docs]class CoclustMod(BaseDiagonalCoclust): """Co-clustering by direct maximization of graph modularity. Parameters ---------- n_clusters : int, optional, default: 2 Number of co-clusters to form init : numpy array or scipy sparse matrix, \ shape (n_features, n_clusters), optional, default: None Initial column labels max_iter : int, optional, default: 20 Maximum number of iterations n_init : int, optional, default: 1 Number of time the algorithm will be run with different initializations. The final results will be the best output of `n_init` consecutive runs in terms of modularity. random_state : integer or numpy.RandomState, optional The generator used to initialize the centers. If an integer is given, it fixes the seed. Defaults to the global numpy random number generator. tol : float, default: 1e-9 Relative tolerance with regards to modularity to declare convergence Attributes ---------- row_labels_ : array-like, shape (n_rows,) Bicluster label of each row column_labels_ : array-like, shape (n_cols,) Bicluster label of each column modularity : float Final value of the modularity modularities : list Record of all computed modularity values for all iterations References ---------- * Ailem M., Role F., Nadif M., Co-clustering Document-term Matrices by \ Direct Maximization of Graph Modularity. CIKM 2015: 1807-1810 """ def __init__(self, n_clusters=2, init=None, max_iter=20, n_init=1, tol=1e-9, random_state=None): self.n_clusters = n_clusters self.init = init self.max_iter = max_iter self.n_init = n_init self.tol = tol self.random_state = random_state self.row_labels_ = None self.column_labels_ = None self.modularity = -np.inf self.modularities = []
[docs] def fit(self, X, y=None): """Perform co-clustering by direct maximization of graph modularity. Parameters ---------- X : numpy array or scipy sparse matrix, shape=(n_samples, n_features) Matrix to be analyzed """ random_state = check_random_state(self.random_state) check_array(X, accept_sparse=True, dtype="numeric", order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_samples=self.n_clusters, ensure_min_features=self.n_clusters, warn_on_dtype=False, estimator=None) if type(X) == np.ndarray: X = np.matrix(X) X = X.astype(float) modularity = self.modularity modularities = [] row_labels_ = None column_labels_ = None seeds = random_state.randint(np.iinfo(np.int32).max, size=self.n_init) for seed in seeds: self._fit_single(X, seed, y) if np.isnan(self.modularity): raise ValueError("matrix may contain unexpected NaN values") # remember attributes corresponding to the best modularity if (self.modularity > modularity): modularity = self.modularity modularities = self.modularities row_labels_ = self.row_labels_ column_labels_ = self.column_labels_ # update attributes self.modularity = modularity self.modularities = modularities self.row_labels_ = row_labels_ self.column_labels_ = column_labels_ return self
def _fit_single(self, X, random_state, y=None): """Perform one run of co-clustering by direct maximization of graph modularity. Parameters ---------- X : numpy array or scipy sparse matrix, shape=(n_samples, n_features) Matrix to be analyzed """ if self.init is None: W = random_init(self.n_clusters, X.shape[1], random_state) else: W = np.matrix(self.init, dtype=float) Z = np.zeros((X.shape[0], self.n_clusters)) # Compute the modularity matrix row_sums = np.matrix(X.sum(axis=1)) col_sums = np.matrix(X.sum(axis=0)) N = float(X.sum()) indep = (row_sums.dot(col_sums)) / N # B is a numpy matrix B = X - indep self.modularities = [] # Loop m_begin = float("-inf") change = True iteration = 0 while change: change = False # Reassign rows BW = B.dot(W) for idx, k in enumerate(np.argmax(BW, axis=1)): Z[idx, :] = 0 Z[idx, k] = 1 # Reassign columns BtZ = (B.T).dot(Z) for idx, k in enumerate(np.argmax(BtZ, axis=1)): W[idx, :] = 0 W[idx, k] = 1 k_times_k = (Z.T).dot(BW) m_end = np.trace(k_times_k) iteration += 1 if (np.abs(m_end - m_begin) > self.tol and iteration < self.max_iter): self.modularities.append(m_end/N) m_begin = m_end change = True self.row_labels_ = np.argmax(Z, axis=1).tolist() self.column_labels_ = np.argmax(W, axis=1).tolist() self.btz = BtZ self.bw = BW self.modularity = m_end / N self.nb_iterations = iteration
[docs] def get_assignment_matrix(self, kind, i): """Returns the indices of 'best' i cols of an assignment matrix (row or column). Parameters ---------- kind : string Assignment matrix to be used: rows or cols Returns ------- numpy array or scipy sparse matrix Matrix containing the i 'best' columns of a row or column assignment matrix """ if kind == "rows": s_bw = np.argsort(self.bw) return s_bw[:, -1:-(i+1):-1] if kind == "cols": s_btz = np.argsort(self.btz) return s_btz[:, -1:-(i+1):-1]