# Source code for coclust.io.data_loading

# -*- coding: utf-8 -*-

"""
The :mod:`coclust.io.data_loading` module provides functions to load data
from files of different types.
"""

# Author: Severine Affeldt  <severine.affeldt@parisdescartes.fr>

import logging
import os.path

import numpy as np
from scipy.sparse import coo_matrix
from scipy.io import whosmat, loadmat


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Keys of the dictionary returned by the loading functions of this module.
# The same names are used as the field names looked up in .mat files.
key_name_data = 'doc_term_matrix'
key_name_term_labels = 'term_labels'
key_name_doc_labels = 'doc_labels'


def load_doc_term_data(data_filepath,
                       term_labels_filepath=None,
                       doc_labels_filepath=None):

    """Load co-occurrence data from a .[x]sv or a .mat file.

    The expected formats are:

    - ``(data_filepath).[x]sv``: three delimiter-separated columns (comma
      for ``.csv``, tab for any other extension):

        1st line:
            - 1st column: number of documents
            - 2nd column: number of words
        Other lines:
            - 1st column: document index
            - 2nd column: word index
            - 3rd column: word counts

    - ``(data_filepath).mat``: matlab file with fields:

        - ``'doc_term_matrix'``: :class:`scipy.sparse.csr_matrix` of
          shape (#docs, #terms)
        - ``'doc_labels'``: list of int (len = #docs)
        - ``'term_labels'``: list of string (len = #terms)

        If the key ``'doc_term_matrix'`` is not found, data loading fails.
        If the key ``'doc_labels'`` or ``'term_labels'`` is missing, a
        warning message is logged.

    Term and doc labels can be separately loaded from a one column
    .[x]sv|.txt file. These files are only read when the corresponding
    labels were not found in the data file:

    - (term_labels_filepath).[x]sv|.txt:
        one column, one term label per row. The row index is assumed to
        correspond to the term index in the (columns of the) co-occurrence
        data matrix.

    - (doc_labels_filepath).[x]sv|.txt:
        one column, one integer document label per row. The row index is
        assumed to correspond to the document index in the (rows of the)
        co-occurrence data matrix.

    Parameters
    ----------
    data_filepath: string
        Path to the file that contains the co-occurrence data
    term_labels_filepath: string, optional
        Path to a one column file of term labels (one per matrix column)
    doc_labels_filepath: string, optional
        Path to a one column file of integer document labels (one per
        matrix row)

    Returns
    -------
    a dictionary:

        - ``'doc_term_matrix'``: :class:`scipy.sparse.csr_matrix` of shape
          (#docs, #terms)
        - ``'doc_labels'``: list of int (#docs), or None if unavailable
        - ``'term_labels'``: list of string (#terms), or None if unavailable

    Raises
    ------
    ValueError
        If the input file is not found, if no co-occurrence matrix can be
        loaded from it, or if the number of labels read from a label file
        does not match the matrix shape.

    Example
    -------
    >>> data = load_doc_term_data('../datasets/classic3.csv')  # doctest: +SKIP
    >>> data['doc_term_matrix'].shape  # doctest: +SKIP
    (3891, 4303)

    """

    # Fail early with an explicit message when the data file is missing
    if not os.path.isfile(data_filepath):
        raise ValueError("[data_filepath] argument (%s) is not a file path or "
                         "file does not exist."
                         % os.path.abspath(data_filepath))

    # The file extension selects the loader
    _, file_extension = os.path.splitext(data_filepath)

    if file_extension == '.mat':
        # Load co-occurrence table from .mat file
        doc_term_dict = _load_doc_term_data_from_mat_(data_filepath)
    else:
        # Load and format co-occurrence table from .xsv file (.csv or .tsv)
        doc_term_dict = _load_doc_term_data_from_xsv_(data_filepath)

    # Without a matrix nothing else can be checked or returned
    if doc_term_dict[key_name_data] is None:
        raise ValueError("doc_term matrix is None, check your input file "
                         "content or field names.")

    # Rows are documents, columns are terms
    n_doc, n_term = doc_term_dict[key_name_data].shape

    # If term|document labels are missing, load them from
    # term|doc_labels_filepath
    # --> terms
    if (doc_term_dict[key_name_term_labels] is None
            and term_labels_filepath is not None):
        # ndmin=1 keeps a list even when the file holds a single label
        term_labels = np.loadtxt(term_labels_filepath, dtype='str',
                                 ndmin=1).tolist()
        if len(term_labels) != n_term:
            raise ValueError("Number of term labels (%d) not compatible "
                             "with co-occurence matrix shape (%d, %d)"
                             % (len(term_labels), n_doc, n_term))
        doc_term_dict[key_name_term_labels] = term_labels
    # --> docs
    if (doc_term_dict[key_name_doc_labels] is None
            and doc_labels_filepath is not None):
        doc_labels = np.loadtxt(doc_labels_filepath, dtype='int',
                                ndmin=1).tolist()
        # One label per document, i.e. per row of the matrix
        if len(doc_labels) != n_doc:
            raise ValueError("Number of doc labels (%d) not compatible "
                             "with the number of documents (%d)"
                             % (len(doc_labels), n_doc))
        doc_term_dict[key_name_doc_labels] = doc_labels

    # Labels are optional: only warn when they are still unavailable
    if doc_term_dict[key_name_term_labels] is None:
        logger.warning("Term labels cannot be loaded from the data file. Use "
                       "input argument 'term_labels_filepath' if term labels "
                       "are available.")
    if doc_term_dict[key_name_doc_labels] is None:
        logger.warning("Document labels cannot be loaded from the data file. "
                       "Use input argument 'doc_labels_filepath' if doc "
                       "labels are available.")

    return doc_term_dict


def _get_file_delimiter_(extension):
    switcher = {
        '.csv': ',',
        '.tsv': '\t',
    }
    return switcher.get(extension, "\t")


def _read_declared_shape_(path, delimiter):
    """Return the (#docs, #terms) pair declared on the first line of a file.

    Returns None when the first line cannot be read or does not start with
    two numeric fields, so that the caller can fall back to inferring the
    shape from the indices.
    """
    try:
        with open(path) as header_file:
            fields = header_file.readline().strip().split(delimiter)
        return int(float(fields[0])), int(float(fields[1]))
    except (OSError, IndexError, ValueError, OverflowError):
        # Best effort only: an unreadable header is not an error here
        return None


def _load_doc_term_data_from_xsv_(path, extension=None):
    """Load a doc-term matrix from a delimiter-separated text file.

    The first line of the file is skipped when reading the data; the other
    lines hold (document index, word index, count) triplets. The indices
    are used directly as row and column positions of the matrix.

    Parameters
    ----------
    path: string
        Path to the .[x]sv file
    extension: string, optional
        File extension used to choose the delimiter. When None, it is taken
        from ``path``.

    Returns
    -------
    a dictionary whose ``'doc_term_matrix'`` entry is a
    :class:`scipy.sparse.csr_matrix`; the ``'term_labels'`` and
    ``'doc_labels'`` entries are always None for this format.
    """

    tmp_dict = {key: None for key in [key_name_data,
                                      key_name_term_labels,
                                      key_name_doc_labels]}

    # Get the file extension if needed
    if extension is None:
        _, extension = os.path.splitext(path)

    # --> get the delimiter from the extension
    file_delimiter = _get_file_delimiter_(extension)

    # --> read the triplets (it may take a few seconds); ndmin=2 keeps a
    # (n_rows, 3) layout even when the file holds a single data line
    a = np.loadtxt(path, delimiter=file_delimiter, skiprows=1, ndmin=2)

    doc_indices = a[:, 0].astype(int)
    term_indices = a[:, 1].astype(int)
    counts = a[:, 2]

    # --> honour the shape declared on the first line so that trailing
    # documents/terms without any count are kept; never go below what the
    # indices require, and let coo_matrix infer the shape when the header
    # is not usable
    shape = None
    declared_shape = _read_declared_shape_(path, file_delimiter)
    if declared_shape is not None:
        shape = (max(declared_shape[0], int(doc_indices.max()) + 1),
                 max(declared_shape[1], int(term_indices.max()) + 1))

    # --> Set the co-occurrence data
    tmp_dict[key_name_data] = coo_matrix((counts,
                                          (doc_indices, term_indices)),
                                         shape=shape).tocsr()

    return tmp_dict


def _load_doc_term_data_from_mat_(path):
    """Load the doc-term matrix and its labels from a matlab ``.mat`` file.

    Only the fields named ``'doc_term_matrix'``, ``'term_labels'`` and
    ``'doc_labels'`` are looked up; any other field is ignored.

    Parameters
    ----------
    path: string
        Path to the .mat file

    Returns
    -------
    a dictionary with the three keys above; the value is None for every
    field that is absent from the file.
    """

    tmp_dict = {key: None for key in [key_name_data,
                                      key_name_term_labels,
                                      key_name_doc_labels]}

    # Names of the variables stored in the matlab file
    stored_names = {element[0] for element in whosmat(path)}

    # Nothing to extract: skip reading the file content altogether
    if not any(key in stored_names for key in tmp_dict):
        return tmp_dict

    # Read the file content once, whatever the number of fields to extract
    matlab_content = loadmat(path)

    # co-occurrence data
    if key_name_data in stored_names:
        tmp_dict[key_name_data] = matlab_content[key_name_data]
    # term label data, converted to list
    if key_name_term_labels in stored_names:
        tmp_dict[key_name_term_labels] = \
            matlab_content[key_name_term_labels].tolist()
    # doc label data, converted to list; only the first inner list is kept
    # (presumably the labels are stored as a 1 x #docs array -- to confirm)
    if key_name_doc_labels in stored_names:
        tmp_dict[key_name_doc_labels] = \
            matlab_content[key_name_doc_labels].tolist()[0]

    return tmp_dict