QuaPy/quapy/data/preprocessing.py

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from data.base import Dataset
from scipy.sparse import spmatrix
from util import parallelize
from .base import LabelledCollection
from tqdm import tqdm


def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
    """
    Converts the textual instances of a Dataset into tfidf-weighted sparse vectors. The vectorizer is fitted on
    the training instances only and is then applied, as is, to the test instances.
    :param dataset: a Dataset whose training and test instances are np.ndarray of str
    :param min_df: forwarded to TfidfVectorizer as min_df (terms below this frequency are left out of the vocabulary)
    :param sublinear_tf: forwarded to TfidfVectorizer as sublinear_tf (log scaling of the tf counters)
    :param inplace: if True, the given Dataset is modified and returned; otherwise a new Dataset is built
    :param kwargs: any other parameter accepted by sklearn.feature_extraction.text.TfidfVectorizer
    :return: the same Dataset (inplace=True) or a new one (inplace=False; labels are copied), holding the matrices
    produced by the vectorizer as instances and the vectorizer's vocabulary_ as vocabulary
    """
    for split in (dataset.training, dataset.test):
        __check_type(split.instances, np.ndarray, str)

    tfidf = TfidfVectorizer(min_df=min_df, sublinear_tf=sublinear_tf, **kwargs)
    Xtr = tfidf.fit_transform(dataset.training.instances)
    Xte = tfidf.transform(dataset.test.instances)

    if not inplace:
        # fresh collections with copied labels; the input Dataset is left untouched
        return Dataset(
            LabelledCollection(Xtr, dataset.training.labels.copy(), dataset.n_classes),
            LabelledCollection(Xte, dataset.test.labels.copy(), dataset.n_classes),
            tfidf.vocabulary_
        )

    # in-place: swap both collections (labels are shared, not copied) and record the vocabulary
    dataset.training = LabelledCollection(Xtr, dataset.training.labels, dataset.n_classes)
    dataset.test = LabelledCollection(Xte, dataset.test.labels, dataset.n_classes)
    dataset.vocabulary = tfidf.vocabulary_
    return dataset


def reduce_columns(dataset:Dataset, min_df=5, inplace=False):
    """
    Drops the columns (terms) that have a positive value in fewer than min_df training instances. The counts are
    computed on the training matrix only; the very same columns are then removed from the test matrix.
    :param dataset: a Dataset whose training and test instances are scipy.sparse.spmatrix with the same number
    of columns
    :param min_df: minimum number of training instances in which a column must be positive in order to be kept
    :param inplace: if True, the given Dataset is modified and returned; otherwise a new Dataset is built
    :return: the same Dataset (inplace=True) or a new one (inplace=False; labels are copied) restricted to the
    retained columns
    """
    __check_type(dataset.training.instances, spmatrix)
    __check_type(dataset.test.instances, spmatrix)
    assert dataset.training.instances.shape[1] == dataset.test.instances.shape[1], 'unaligned vector spaces'

    train_matrix = dataset.training.instances
    test_matrix = dataset.test.instances

    # number of training instances with a positive entry, per column
    doc_freq = np.asarray((train_matrix > 0).sum(axis=0)).flatten()
    keep = doc_freq >= min_df
    Xtr = train_matrix[:, keep]
    Xte = test_matrix[:, keep]

    if not inplace:
        # NOTE(review): no vocabulary is passed to the new Dataset here -- confirm this is intended
        return Dataset(
            LabelledCollection(Xtr, dataset.training.labels.copy(), dataset.n_classes),
            LabelledCollection(Xte, dataset.test.labels.copy(), dataset.n_classes)
        )

    dataset.training.instances = Xtr
    dataset.test.instances = Xte
    return dataset


def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
    """
    Indexes a Dataset of strings, i.e., replaces every token of every document with a numerical index. The
    IndexTransformer is fitted on the training instances only; tokens that are not in its vocabulary are mapped
    to the index of the special UNK token.
    :param dataset: a Dataset whose training and test instances are np.ndarray of str
    :param min_df: forwarded to IndexTransformer (and from there to CountVectorizer) as min_df
    :param inplace: if True, the given Dataset is modified and returned; otherwise a new Dataset is built
    :param kwargs: any other parameter accepted by sklearn.feature_extraction.text.CountVectorizer
    :return: the same Dataset (inplace=True) or a new one (inplace=False; labels are copied), whose instances
    are the indexed documents and whose vocabulary is the one of the IndexTransformer
    """
    for split in (dataset.training, dataset.test):
        __check_type(split.instances, np.ndarray, str)

    transformer = IndexTransformer(min_df=min_df, **kwargs)
    indexed_train = transformer.fit_transform(dataset.training.instances)
    indexed_test = transformer.transform(dataset.test.instances)

    if not inplace:
        # fresh collections with copied labels; the input Dataset is left untouched
        return Dataset(
            LabelledCollection(indexed_train, dataset.training.labels.copy(), dataset.n_classes),
            LabelledCollection(indexed_test, dataset.test.labels.copy(), dataset.n_classes),
            transformer.vocabulary_
        )

    # in-place: swap both collections (labels are shared, not copied) and record the vocabulary
    dataset.training = LabelledCollection(indexed_train, dataset.training.labels, dataset.n_classes)
    dataset.test = LabelledCollection(indexed_test, dataset.test.labels, dataset.n_classes)
    dataset.vocabulary = transformer.vocabulary_
    return dataset


def __check_type(container, container_type=None, element_type=None):
    """
    Asserts that a container and, optionally, its first element are of the expected types.
    :param container: the object to be checked
    :param container_type: type (or tuple of types) the container must be an instance of; not checked if omitted
    :param element_type: type (or tuple of types) container[0] must be an instance of; not checked if omitted.
    Only the first element is inspected, so the container must be non-empty and indexable when this is given
    :raises AssertionError: if any of the requested checks fails
    """
    if container_type:
        assert isinstance(container, container_type), \
            f'unexpected type of container (expected {container_type}, found {type(container)})'
    if element_type:
        first = container[0]
        # report the element's types (the message previously repeated the container's types)
        assert isinstance(first, element_type), \
            f'unexpected type of element (expected {element_type}, found {type(first)})'


class IndexTransformer:
    """
    Turns documents into lists of integer term indices. The vocabulary and the analyzer are taken from a
    sklearn CountVectorizer; one extra entry, 'UNK', is appended to the vocabulary and its index is used for
    every token which is not part of the vocabulary.
    """

    def __init__(self, **kwargs):
        """
        :param kwargs: keyworded arguments from _sklearn.feature_extraction.text.CountVectorizer_
        """
        self.vect = CountVectorizer(**kwargs)
        self.unk = -1  # a valid index is assigned after fit

    def fit(self, X):
        """
        Fits the underlying CountVectorizer and registers the UNK token.
        :param X: a list of strings
        :return: self
        """
        self.vect.fit(X)
        self.analyzer = self.vect.build_analyzer()
        # work on a copy: registering UNK directly in the vectorizer's own dict would leave the fitted
        # CountVectorizer with a term whose index is beyond its number of features
        self.vocabulary_ = dict(self.vect.vocabulary_)
        self.unk = self.add_word('UNK')
        return self

    def transform(self, X, n_jobs=-1):
        """
        Indexes the documents in X, delegating the work to `parallelize`.
        :param X: the documents to index (presumably a list or array of strings -- confirm against `parallelize`)
        :param n_jobs: forwarded to `parallelize`
        :return: a np.ndarray with one entry per document; 2-dimensional if all the documents have the same
        number of tokens, otherwise a 1-dimensional object array of lists
        :raises AssertionError: if called before fit
        """
        assert self.unk > 0, 'transform called before fit'
        # assumes parallelize returns the per-document index lists as one flat sequence -- TODO confirm
        indexed = parallelize(func=self.index, args=X, n_jobs=n_jobs)
        return self._as_array(indexed)

    @staticmethod
    def _as_array(indexed):
        """
        Packs a sequence of index lists into a np.ndarray.
        :param indexed: a sequence of lists of int
        :return: np.asarray(indexed) if all the lists share the same length; otherwise a 1-dimensional object
        array whose i-th element is the i-th list
        """
        lengths = {len(doc) for doc in indexed}
        if len(lengths) <= 1:
            return np.asarray(indexed)
        # recent NumPy versions refuse to build an array from ragged nested lists implicitly,
        # so the object array is filled explicitly, one document per slot
        ragged = np.empty(len(indexed), dtype=object)
        for i, doc in enumerate(indexed):
            ragged[i] = doc
        return ragged

    def index(self, documents):
        """
        Replaces every token (as produced by the analyzer) of every document with its index in the vocabulary;
        tokens missing from the vocabulary get the UNK index.
        :param documents: an iterable of strings
        :return: a list with one list of int per document
        """
        vocab = self.vocabulary_.copy()
        return [[vocab.get(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')]

    def fit_transform(self, X, n_jobs=-1):
        """
        Fits on X and then indexes X.
        :param X: a list of strings
        :param n_jobs: forwarded to transform
        :return: the result of transform(X)
        """
        return self.fit(X).transform(X, n_jobs=n_jobs)

    def vocabulary_size(self):
        """
        :return: the number of entries in the vocabulary (UNK included once fit has been called)
        """
        return len(self.vocabulary_)

    def add_word(self, word):
        """
        Appends a new word to the vocabulary, assigning it the next free index.
        :param word: the word to add
        :return: the index assigned to the word
        :raises ValueError: if the word is already in the vocabulary
        """
        if word in self.vocabulary_:
            raise ValueError(f'word {word} already in dictionary')
        self.vocabulary_[word] = len(self.vocabulary_)
        return self.vocabulary_[word]
fixing dataset loading 2020-12-03 16:36:54 +01:00			`import numpy as np`
data loading 2020-12-03 16:24:21 +01:00			`from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer`
evaluation by artificial prevalence sampling added. New methods added. New util functions added to quapy.functional and quapy.utils 2020-12-10 19:04:33 +01:00			`from data.base import Dataset`
data loading 2020-12-03 16:24:21 +01:00			`from scipy.sparse import spmatrix`
uniform sampling added if *prevs is empty 2020-12-17 18:17:17 +01:00			`from util import parallelize`
data loading 2020-12-03 16:24:21 +01:00			`from .base import LabelledCollection`
evaluation by artificial prevalence sampling added. New methods added. New util functions added to quapy.functional and quapy.utils 2020-12-10 19:04:33 +01:00			`from tqdm import tqdm`
data loading 2020-12-03 16:24:21 +01:00

			`def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):`
			`"""`
			`Transforms a Dataset of textual instances into a Dataset of tfidf weighted sparse vectors`
			`:param dataset: a Dataset where the instances are lists of str`
			`:param min_df: minimum number of occurrences for a word to be considered as part of the vocabulary`
			`:param sublinear_tf: whether or not to apply the log scalling to the tf counters`
			`:param inplace: whether or not to apply the transformation inplace, or to a new copy`
			`:param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.TfidfVectorizer)`
			`:return: a new Dataset in csr_matrix format (if inplace=False) or a reference to the current Dataset (inplace=True)`
			`where the instances are stored in a csr_matrix of real-valued tfidf scores`
			`"""`
fixing dataset loading 2020-12-03 16:36:54 +01:00			`__check_type(dataset.training.instances, np.ndarray, str)`
			`__check_type(dataset.test.instances, np.ndarray, str)`
data loading 2020-12-03 16:24:21 +01:00
			`vectorizer = TfidfVectorizer(min_df=min_df, sublinear_tf=sublinear_tf, **kwargs)`
			`training_documents = vectorizer.fit_transform(dataset.training.instances)`
			`test_documents = vectorizer.transform(dataset.test.instances)`

			`if inplace:`
			`dataset.training = LabelledCollection(training_documents, dataset.training.labels, dataset.n_classes)`
			`dataset.test = LabelledCollection(test_documents, dataset.test.labels, dataset.n_classes)`
			`dataset.vocabulary = vectorizer.vocabulary_`
			`return dataset`
			`else:`
			`training = LabelledCollection(training_documents, dataset.training.labels.copy(), dataset.n_classes)`
			`test = LabelledCollection(test_documents, dataset.test.labels.copy(), dataset.n_classes)`
			`return Dataset(training, test, vectorizer.vocabulary_)`


			`def reduce_columns(dataset:Dataset, min_df=5, inplace=False):`
			`"""`
			`Reduces the dimensionality of the csr_matrix by removing the columns of words which are not present in at least`
			`_min_occurrences_ instances`
			`:param dataset: a Dataset in sparse format (any subtype of scipy.sparse.spmatrix)`
			`:param min_df: minimum number of instances below which the columns are removed`
			`:param inplace: whether or not to apply the transformation inplace, or to a new copy`
			`:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)`
			`where the dimensions corresponding to infrequent instances have been removed`
			`"""`
fixing dataset loading 2020-12-03 16:36:54 +01:00			`__check_type(dataset.training.instances, spmatrix)`
			`__check_type(dataset.test.instances, spmatrix)`
data loading 2020-12-03 16:24:21 +01:00			`assert dataset.training.instances.shape[1] == dataset.test.instances.shape[1], 'unaligned vector spaces'`

			`def filter_by_occurrences(X, W):`
			`column_prevalence = np.asarray((X > 0).sum(axis=0)).flatten()`
			`take_columns = column_prevalence >= min_df`
			`X = X[:, take_columns]`
			`W = W[:, take_columns]`
			`return X, W`

			`Xtr, Xte = filter_by_occurrences(dataset.training.instances, dataset.test.instances)`
			`if inplace:`
			`dataset.training.instances = Xtr`
			`dataset.test.instances = Xte`
			`return dataset`
			`else:`
			`training = LabelledCollection(Xtr, dataset.training.labels.copy(), dataset.n_classes)`
			`test = LabelledCollection(Xte, dataset.test.labels.copy(), dataset.n_classes)`
			`return Dataset(training, test)`


			`def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):`
			`"""`
			`Indexes a dataset of strings. To index a document means to replace each different token by a unique numerical index.`
			`Rare words (i.e., words occurring less than _min_df_ times) are replaced by a special token UNK`
			`:param dataset: a Dataset where the instances are lists of str`
			`:param min_df: minimum number of instances below which the term is replaced by a UNK index`
			`:param inplace: whether or not to apply the transformation inplace, or to a new copy`
			`:param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.CountVectorizer)`
			`:return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)`
			`consisting of lists of integer values representing indices.`
			`"""`
evaluation by artificial prevalence sampling added. New methods added. New util functions added to quapy.functional and quapy.utils 2020-12-10 19:04:33 +01:00			`__check_type(dataset.training.instances, np.ndarray, str)`
			`__check_type(dataset.test.instances, np.ndarray, str)`
data loading 2020-12-03 16:24:21 +01:00
			`indexer = IndexTransformer(min_df=min_df, **kwargs)`
			`training_index = indexer.fit_transform(dataset.training.instances)`
			`test_index = indexer.transform(dataset.test.instances)`

			`if inplace:`
			`dataset.training = LabelledCollection(training_index, dataset.training.labels, dataset.n_classes)`
			`dataset.test = LabelledCollection(test_index, dataset.test.labels, dataset.n_classes)`
			`dataset.vocabulary = indexer.vocabulary_`
			`return dataset`
			`else:`
			`training = LabelledCollection(training_index, dataset.training.labels.copy(), dataset.n_classes)`
			`test = LabelledCollection(test_index, dataset.test.labels.copy(), dataset.n_classes)`
			`return Dataset(training, test, indexer.vocabulary_)`


			`def __check_type(container, container_type=None, element_type=None):`
			`if container_type:`
			`assert isinstance(container, container_type), \`
			`f'unexpected type of container (expected {container_type}, found {type(container)})'`
			`if element_type:`
fixing dataset loading 2020-12-03 16:36:54 +01:00			`assert isinstance(container[0], element_type), \`
data loading 2020-12-03 16:24:21 +01:00			`f'unexpected type of element (expected {container_type}, found {type(container)})'`


			`class IndexTransformer:`

			`def __init__(self, **kwargs):`
			`"""`
			`:param kwargs: keyworded arguments from _sklearn.feature_extraction.text.CountVectorizer_`
			`"""`
			`self.vect = CountVectorizer(**kwargs)`
			`self.unk = -1 # a valid index is assigned after fit`

			`def fit(self, X):`
			`"""`
			`:param X: a list of strings`
			`:return: self`
			`"""`
			`self.vect.fit(X)`
			`self.analyzer = self.vect.build_analyzer()`
			`self.vocabulary_ = self.vect.vocabulary_`
			`self.unk = self.add_word('UNK')`
			`return self`

			`def transform(self, X, n_jobs=-1):`
			`# given the number of tasks and the number of jobs, generates the slices for the parallel threads`
			`assert self.unk > 0, 'transform called before fit'`
			`indexed = parallelize(func=self.index, args=X, n_jobs=n_jobs)`
			`return np.asarray(indexed)`

			`def index(self, documents):`
			`vocab = self.vocabulary_.copy()`
			`return [[vocab.get(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')]`

			`def fit_transform(self, X, n_jobs=-1):`
			`return self.fit(X).transform(X, n_jobs=n_jobs)`

			`def vocabulary_size(self):`
evaluation by artificial prevalence sampling added. New methods added. New util functions added to quapy.functional and quapy.utils 2020-12-10 19:04:33 +01:00			`return len(self.vocabulary_)`
data loading 2020-12-03 16:24:21 +01:00
			`def add_word(self, word):`
			`if word in self.vocabulary_:`
			`raise ValueError(f'word {word} already in dictionary')`
			`self.vocabulary_[word] = len(self.vocabulary_)`
			`return self.vocabulary_[word]`