data loading

2020-12-03 16:24:21 +01:00 · 2020-12-03 16:24:21 +01:00 · b6820e8dba
parent c0db4a2867
commit b6820e8dba
5 changed files with 348 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -129,3 +129,4 @@ dmypy.json
 # Pyre type checker
 .pyre/

+*__pycache__*
--- a/quapy/dataset/init.py
+++ b/quapy/dataset/init.py
@ -0,0 +1,4 @@
+from .base import *
+from . import base
+from . import reader
+from . import preprocessing
--- a/quapy/dataset/base.py
+++ b/quapy/dataset/base.py
@ -0,0 +1,137 @@
+import numpy as np
+from scipy.sparse import issparse, dok_matrix
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import train_test_split
+from quapy.functional import artificial_prevalence_sampling
+from scipy.sparse import vstack
+
+
+class LabelledCollection:
+    """
+    A collection of instances paired with integer class labels, with helpers to count classes,
+    draw samples at given class prevalences, split, and concatenate collections.
+    """
+
+    def __init__(self, instances, labels, n_classes=None):
+        """
+        :param instances: the instances; a scipy sparse matrix is stored as is, anything else is
+            converted with np.asarray
+        :param labels: one label per instance; converted to an integer ndarray
+        :param n_classes: if None, the classes are the sorted distinct values found in labels;
+            otherwise the classes are taken to be 0, 1, ..., n_classes-1
+        """
+        self.instances = instances if issparse(instances) else np.asarray(instances)
+        self.labels = np.asarray(labels, dtype=int)
+        n_docs = len(self)
+        if n_classes is None:
+            # np.unique returns the distinct values already sorted
+            self.classes_ = np.unique(self.labels)
+        else:
+            self.classes_ = np.arange(n_classes)
+        # for each class, the positions of the instances labelled with that class
+        self.index = {class_i: np.arange(n_docs)[self.labels == class_i] for class_i in self.classes_}
+
+    @classmethod
+    def load(cls, path:str, loader_func:callable):
+        """
+        :param path: location passed to loader_func
+        :param loader_func: callable returning the positional arguments of the constructor
+            (instances, labels[, n_classes])
+        :return: a LabelledCollection built from the output of loader_func(path)
+        """
+        return LabelledCollection(*loader_func(path))
+
+    @classmethod
+    def load_dataset(cls, train_path, test_path, loader_func:callable=None):
+        """
+        Loads a training and a test collection and wraps them in a Dataset.
+        :param train_path: location of the training collection
+        :param test_path: location of the test collection
+        :param loader_func: callable used to read both collections (see load); it is mandatory, and
+            only defaults to None so that the position of the first two parameters is unchanged
+        :return: a Dataset
+        :raises TypeError: if loader_func is not given
+        """
+        if loader_func is None:
+            raise TypeError('load_dataset requires a loader_func to read the collections')
+        training = cls.load(train_path, loader_func)
+        test = cls.load(test_path, loader_func)
+        return Dataset(training, test)
+
+    def __len__(self):
+        return self.instances.shape[0]
+
+    def prevalence(self):
+        """
+        :return: ndarray with the proportion of instances of each class, in the order of classes_
+        """
+        return self.counts()/len(self)
+
+    def counts(self):
+        """
+        :return: ndarray with the number of instances of each class, in the order of classes_
+        """
+        return np.asarray([len(self.index[ci]) for ci in self.classes_])
+
+    @property
+    def n_classes(self):
+        return len(self.classes_)
+
+    @property
+    def binary(self):
+        return self.n_classes==2
+
+    def sampling_index(self, size, *prevs, shuffle=True):
+        """
+        Draws the positions of a sample of the requested size and class prevalences.
+        :param size: number of positions to draw
+        :param prevs: one prevalence per class; the last one can be omitted, in which case it is
+            set to 1 minus the sum of the others
+        :param shuffle: whether or not to shuffle the positions (otherwise they are grouped by class)
+        :return: an integer ndarray of length size with positions into this collection
+        """
+        if len(prevs) == self.n_classes-1:
+            prevs = prevs + (1-sum(prevs),)
+        assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
+        # prevalences are floats: an exact comparison against 1 rejects valid vectors due to rounding
+        assert np.isclose(sum(prevs), 1), f'prevalences ({prevs}) out of range (sum={sum(prevs)})'
+
+        taken = 0
+        indexes_sample = []
+        for i, class_i in enumerate(self.classes_):
+            if i == self.n_classes-1:
+                # the last class takes whatever is left, so that the sample has exactly `size` elements
+                n_requested = size - taken
+            else:
+                n_requested = int(size * prevs[i])
+
+            n_candidates = len(self.index[class_i])
+            # sample with replacement only when the class has fewer instances than requested
+            index_sample = self.index[class_i][
+                np.random.choice(n_candidates, size=n_requested, replace=(n_requested > n_candidates))
+            ] if n_requested > 0 else []
+
+            indexes_sample.append(index_sample)
+            taken += n_requested
+
+        indexes_sample = np.concatenate(indexes_sample).astype(int)
+
+        if shuffle:
+            indexes_sample = np.random.permutation(indexes_sample)
+
+        return indexes_sample
+
+    def sampling(self, size, *prevs, shuffle=True):
+        """
+        Same as sampling_index, but returns the sampled instances as a new LabelledCollection.
+        """
+        index = self.sampling_index(size, *prevs, shuffle=shuffle)
+        return self.sampling_from_index(index)
+
+    def sampling_from_index(self, index):
+        """
+        :param index: positions of the instances to take
+        :return: a LabelledCollection with the selected instances and as many classes as this one
+        """
+        documents = self.instances[index]
+        labels = self.labels[index]
+        return LabelledCollection(documents, labels, n_classes=self.n_classes)
+
+    def split_stratified(self, train_prop=0.6):
+        """
+        Splits the collection in two with sklearn's train_test_split, stratifying by label.
+        :param train_prop: passed as train_size to train_test_split
+        :return: a tuple (training, test) of LabelledCollection
+        """
+        tr_docs, te_docs, tr_labels, te_labels = \
+            train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels)
+        return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
+
+    def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1):
+        """
+        Yields one sample of sample_size instances for each prevalence vector produced by
+        artificial_prevalence_sampling(n_classes, n_prevalences, repeats).
+        """
+        dimensions=self.n_classes
+        for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
+            yield self.sampling(sample_size, *prevs)
+
+    def __add__(self, other):
+        """
+        Concatenates two collections; both have to be sparse, or both dense.
+        :param other: a LabelledCollection
+        :return: a new LabelledCollection whose classes are recomputed from the joint labels
+        :raises NotImplementedError: if one collection is sparse and the other is dense
+        """
+        if issparse(self.instances) and issparse(other.instances):
+            docs = vstack([self.instances, other.instances])
+        elif not issparse(self.instances) and not issparse(other.instances):
+            # dense instances are always ndarrays at this point (see __init__)
+            docs = np.concatenate([self.instances, other.instances])
+        else:
+            raise NotImplementedError('unsupported operation for collection types')
+        labels = np.concatenate([self.labels, other.labels])
+        return LabelledCollection(docs, labels)
+
+
+
+class Dataset:
+    """
+    A training collection and a test collection with the same number of classes, plus an
+    optional vocabulary.
+    """
+
+    def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None):
+        """
+        :param training: the training collection
+        :param test: the test collection; must have as many classes as training
+        :param vocabulary: optional dict, stored as is
+        """
+        assert training.n_classes == test.n_classes, 'incompatible labels in training and test collections'
+        self.training, self.test = training, test
+        self.vocabulary = vocabulary
+
+    @property
+    def n_classes(self):
+        return self.training.n_classes
+
+    @property
+    def binary(self):
+        return self.training.binary
+
+    @classmethod
+    def load(cls, train_path, test_path, loader_func:callable):
+        """
+        Reads the training and then the test collection with LabelledCollection.load.
+        :return: a Dataset (without vocabulary)
+        """
+        collections = [LabelledCollection.load(path, loader_func) for path in (train_path, test_path)]
+        return Dataset(*collections)
+
+    @classmethod
+    def SplitStratified(cls, collection: LabelledCollection, train_size=0.6):
+        """
+        Builds a Dataset from the parts returned by collection.split_stratified.
+        :param train_size: passed as train_prop to split_stratified
+        """
+        parts = collection.split_stratified(train_prop=train_size)
+        return Dataset(*parts)
+
+
+
+
--- a/quapy/dataset/preprocessing.py
+++ b/quapy/dataset/preprocessing.py
@ -0,0 +1,150 @@
+from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+from dataset.base import Dataset
+from scipy.sparse import spmatrix
+import numpy as np
+from utils.util import parallelize
+from .base import LabelledCollection
+
+
+def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
+    """
+    Transforms a Dataset of textual instances into a Dataset of tfidf weighted sparse vectors
+    :param dataset: a Dataset where the instances are sequences (list or ndarray) of str
+    :param min_df: minimum number of occurrences for a word to be considered as part of the vocabulary
+    :param sublinear_tf: whether or not to apply the log scaling to the tf counters
+    :param inplace: whether or not to apply the transformation inplace, or to a new copy
+    :param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.TfidfVectorizer)
+    :return: a new Dataset in csr_matrix format (if inplace=False) or a reference to the current Dataset (inplace=True)
+    where the instances are stored in a csr_matrix of real-valued tfidf scores
+    """
+    # LabelledCollection stores non-sparse instances as ndarray, so ndarray has to be accepted too
+    __check_type(dataset.training.instances, (list, np.ndarray), str)
+    __check_type(dataset.test.instances, (list, np.ndarray), str)
+
+    vectorizer = TfidfVectorizer(min_df=min_df, sublinear_tf=sublinear_tf, **kwargs)
+    training_documents = vectorizer.fit_transform(dataset.training.instances)
+    test_documents = vectorizer.transform(dataset.test.instances)
+
+    if inplace:
+        dataset.training = LabelledCollection(training_documents, dataset.training.labels, dataset.n_classes)
+        dataset.test = LabelledCollection(test_documents, dataset.test.labels, dataset.n_classes)
+        dataset.vocabulary = vectorizer.vocabulary_
+        return dataset
+    else:
+        training = LabelledCollection(training_documents, dataset.training.labels.copy(), dataset.n_classes)
+        test = LabelledCollection(test_documents, dataset.test.labels.copy(), dataset.n_classes)
+        return Dataset(training, test, vectorizer.vocabulary_)
+
+
+def reduce_columns(dataset:Dataset, min_df=5, inplace=False):
+    """
+    Reduces the dimensionality of the csr_matrix by removing the columns of words which are not present in at least
+    _min_df_ training instances
+    :param dataset: a Dataset in sparse format (any subtype of scipy.sparse.spmatrix)
+    :param min_df: minimum number of instances below which the columns are removed
+    :param inplace: whether or not to apply the transformation inplace, or to a new copy
+    :return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
+    where the dimensions corresponding to infrequent instances have been removed
+    """
+    # the matrices are the instances of the collections, not the collections themselves
+    __check_type(dataset.training.instances, spmatrix)
+    __check_type(dataset.test.instances, spmatrix)
+    assert dataset.training.instances.shape[1] == dataset.test.instances.shape[1], 'unaligned vector spaces'
+
+    def filter_by_occurrences(X, W):
+        # the columns to keep are decided on X only, and the same selection is applied to W
+        column_prevalence = np.asarray((X > 0).sum(axis=0)).flatten()
+        take_columns = column_prevalence >= min_df
+        X = X[:, take_columns]
+        W = W[:, take_columns]
+        return X, W
+
+    Xtr, Xte = filter_by_occurrences(dataset.training.instances, dataset.test.instances)
+    if inplace:
+        dataset.training.instances = Xtr
+        dataset.test.instances = Xte
+        return dataset
+    else:
+        training = LabelledCollection(Xtr, dataset.training.labels.copy(), dataset.n_classes)
+        test = LabelledCollection(Xte, dataset.test.labels.copy(), dataset.n_classes)
+        return Dataset(training, test)
+
+
+def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
+    """
+    Indexes a dataset of strings. To index a document means to replace each different token by a unique numerical index.
+    Rare words (i.e., words occurring less than _min_df_ times) are replaced by a special token UNK
+    :param dataset: a Dataset where the instances are sequences (list or ndarray) of str
+    :param min_df: minimum number of instances below which the term is replaced by a UNK index
+    :param inplace: whether or not to apply the transformation inplace, or to a new copy
+    :param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.CountVectorizer)
+    :return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
+    consisting of lists of integer values representing indices.
+    """
+    # LabelledCollection stores non-sparse instances as ndarray, so ndarray has to be accepted too
+    __check_type(dataset.training.instances, (list, np.ndarray), str)
+    __check_type(dataset.test.instances, (list, np.ndarray), str)
+
+    indexer = IndexTransformer(min_df=min_df, **kwargs)
+    training_index = indexer.fit_transform(dataset.training.instances)
+    test_index = indexer.transform(dataset.test.instances)
+
+    if inplace:
+        dataset.training = LabelledCollection(training_index, dataset.training.labels, dataset.n_classes)
+        dataset.test = LabelledCollection(test_index, dataset.test.labels, dataset.n_classes)
+        dataset.vocabulary = indexer.vocabulary_
+        return dataset
+    else:
+        training = LabelledCollection(training_index, dataset.training.labels.copy(), dataset.n_classes)
+        test = LabelledCollection(test_index, dataset.test.labels.copy(), dataset.n_classes)
+        return Dataset(training, test, indexer.vocabulary_)
+
+
+def __check_type(container, container_type=None, element_type=None):
+    """
+    Asserts that a container, and optionally its first element, are of the expected types.
+    :param container: the object to check
+    :param container_type: a type or tuple of types the container has to be an instance of (skipped if None)
+    :param element_type: a type or tuple of types the first element has to be an instance of (skipped if
+    None, or if the container is empty)
+    """
+    if container_type:
+        assert isinstance(container, container_type), \
+            f'unexpected type of container (expected {container_type}, found {type(container)})'
+    if element_type:
+        # lists and arrays are iterables, not iterators: next(container) would raise a TypeError
+        for first in container:
+            assert isinstance(first, element_type), \
+                f'unexpected type of element (expected {element_type}, found {type(first)})'
+            break
+
+
+
+class IndexTransformer:
+    """
+    Replaces the tokens of textual documents by numerical indices, using the vocabulary and the
+    analyzer of a sklearn CountVectorizer. Tokens out of the vocabulary are mapped to the index of UNK.
+    """
+
+    def __init__(self, **kwargs):
+        """
+        :param kwargs: keyworded arguments from _sklearn.feature_extraction.text.CountVectorizer_
+        """
+        self.vect = CountVectorizer(**kwargs)
+        self.unk = -1  # a valid index is assigned after fit
+
+    def fit(self, X):
+        """
+        :param X: a list of strings
+        :return: self
+        """
+        self.vect.fit(X)
+        self.analyzer = self.vect.build_analyzer()
+        # this is the very same dict of the vectorizer (not a copy), so UNK is added to it as well
+        self.vocabulary_ = self.vect.vocabulary_
+        self.unk = self.add_word('UNK')
+        return self
+
+    def transform(self, X, n_jobs=-1):
+        """
+        :param X: a list of strings
+        :param n_jobs: passed to parallelize (presumably the number of parallel workers -- to be confirmed
+            against utils.util.parallelize)
+        :return: an ndarray of dtype object with the indices of the tokens of each document
+        """
+        assert self.unk > 0, 'transform called before fit'
+        indexed = parallelize(func=self.index, args=X, n_jobs=n_jobs)
+        # documents differ in length: recent versions of numpy refuse to build such an array
+        # unless dtype=object is requested explicitly
+        return np.asarray(indexed, dtype=object)
+
+    def index(self, documents):
+        """
+        :param documents: an iterable of strings
+        :return: a list with one list of indices per document
+        """
+        # tqdm is not imported at the top of this module, so it is imported here
+        from tqdm import tqdm
+        vocab = self.vocabulary_.copy()
+        return [[vocab.get(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')]
+
+    def fit_transform(self, X, n_jobs=-1):
+        return self.fit(X).transform(X, n_jobs=n_jobs)
+
+    def vocabulary_size(self):
+        # NOTE(review): after fit, UNK is already a key of vocabulary_ (see add_word), so this looks
+        # like it counts UNK twice -- confirm with the callers before changing the value
+        return len(self.vocabulary_) + 1  # the reserved unk token
+
+    def add_word(self, word):
+        """
+        Adds a word to the vocabulary, with the next free index.
+        :param word: the word to add
+        :return: the index assigned to the word
+        :raises ValueError: if the word is already in the vocabulary
+        """
+        if word in self.vocabulary_:
+            raise ValueError(f'word {word} already in dictionary')
+        self.vocabulary_[word] = len(self.vocabulary_)
+        return self.vocabulary_[word]
+
--- a/quapy/dataset/reader.py
+++ b/quapy/dataset/reader.py
@ -0,0 +1,56 @@
+import numpy as np
+from scipy.sparse import dok_matrix
+from tqdm import tqdm
+
+
+def from_text(path):
+    r"""
+    Reads a labelled collection of documents.
+    File format <0 or 1>\t<document>\n
+    Blank lines and lines with an empty document are skipped.
+    :param path: path to the labelled collection
+    :return: a list of sentences, and a list of labels
+    :raises ValueError: if the text before the first tab of a line is not an integer
+    """
+    all_sentences, all_labels = [], []
+    # read everything first, so that the file is closed and tqdm knows the total number of lines
+    with open(path, 'rt') as fin:
+        lines = fin.readlines()
+    for line in tqdm(lines, f'loading {path}'):
+        line = line.strip()
+        if line:
+            # only the first tab separates label and document: tabs inside the document are kept,
+            # and a line with a label but no document yields an empty sentence (skipped below)
+            label, _, sentence = line.partition('\t')
+            sentence = sentence.strip()
+            label = int(label)
+            if sentence:
+                all_sentences.append(sentence)
+                all_labels.append(label)
+    return all_sentences, all_labels
+
+
+def from_sparse(path):
+    r"""
+    Reads a labelled collection of real-valued instances expressed in sparse format
+    File format <-1 or 0 or 1>[\s col(int):val(float)]\n
+    Columns are 1-based in the file (they are shifted to 0-based), and labels are shifted by +1
+    (so that -1, 0, 1 become 0, 1, 2).
+    :param path: path to the labelled collection
+    :return: a csr_matrix containing the instances (rows), and a ndarray containing the labels
+    """
+
+    def split_col_val(col_val):
+        col, val = col_val.split(':')
+        col, val = int(col) - 1, float(val)
+        return col, val
+
+    all_documents, all_labels = [], []
+    max_col = 0
+    # read everything first, so that the file is closed and tqdm knows the total number of lines
+    with open(path, 'rt') as fin:
+        lines = fin.readlines()
+    for line in tqdm(lines, f'loading {path}'):
+        parts = line.strip().split()
+        if parts:
+            all_labels.append(int(parts[0]))
+            col_vals = [split_col_val(col_val) for col_val in parts[1:]]
+            if col_vals:
+                cols, vals = zip(*col_vals)
+                cols, vals = np.asarray(cols), np.asarray(vals)
+                max_col = max(max_col, cols.max())
+            else:
+                # a label with no features is a valid (all-zeros) row
+                cols, vals = np.asarray([], dtype=int), np.asarray([], dtype=float)
+            all_documents.append((cols, vals))
+    n_docs = len(all_labels)
+    X = dok_matrix((n_docs, max_col + 1), dtype=float)
+    for i, (cols, vals) in tqdm(enumerate(all_documents), total=len(all_documents),
+                                desc=f'\\-- filling matrix of shape {X.shape}'):
+        if len(cols) > 0:
+            X[i, cols] = vals
+    X = X.tocsr()
+    y = np.asarray(all_labels) + 1
+    return X, y