diff --git a/.gitignore b/.gitignore
index 13d1490..b9703a3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -129,3 +129,4 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+*__pycache__*
diff --git a/quapy/dataset/__init__.py b/quapy/dataset/__init__.py
new file mode 100644
index 0000000..0853ddb
--- /dev/null
+++ b/quapy/dataset/__init__.py
@@ -0,0 +1,4 @@
+from .base import *
+from . import base
+from . import reader
+from . import preprocessing
diff --git a/quapy/dataset/base.py b/quapy/dataset/base.py
new file mode 100644
index 0000000..7086596
--- /dev/null
+++ b/quapy/dataset/base.py
@@ -0,0 +1,137 @@
+import numpy as np
+from scipy.sparse import issparse, dok_matrix
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import train_test_split
+from quapy.functional import artificial_prevalence_sampling
+from scipy.sparse import vstack
+
+
+class LabelledCollection:
+
+    def __init__(self, instances, labels, n_classes=None):
+        self.instances = instances if issparse(instances) else np.asarray(instances)
+        self.labels = np.asarray(labels, dtype=int)
+        n_docs = len(self)
+        if n_classes is None:
+            self.classes_ = np.unique(self.labels)
+            self.classes_.sort()
+        else:
+            self.classes_ = np.arange(n_classes)
+        self.index = {class_i: np.arange(n_docs)[self.labels == class_i] for class_i in self.classes_}
+
+    @classmethod
+    def load(cls, path: str, loader_func: callable):
+        return LabelledCollection(*loader_func(path))
+
+    @classmethod
+    def load_dataset(cls, train_path, test_path, loader_func: callable):
+        training = cls.load(train_path, loader_func)
+        test = cls.load(test_path, loader_func)
+        return Dataset(training, test)
+
+    def __len__(self):
+        return self.instances.shape[0]
+
+    def prevalence(self):
+        return self.counts() / len(self)
+
+    def counts(self):
+        return np.asarray([len(self.index[ci]) for ci in self.classes_])
+
+    @property
+    def n_classes(self):
+        return len(self.classes_)
+
+    @property
+    def binary(self):
+        return self.n_classes == 2
+
+    def sampling_index(self, size, *prevs, shuffle=True):
+        if len(prevs) == self.n_classes - 1:
+            prevs = prevs + (1 - sum(prevs),)
+        assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
+        assert np.isclose(sum(prevs), 1), f'prevalences ({prevs}) out of range (sum={sum(prevs)})'
+
+        taken = 0
+        indexes_sample = []
+        for i, class_i in enumerate(self.classes_):
+            # the last class takes whatever remains so that the sample has exactly <size> elements
+            if i == self.n_classes - 1:
+                n_requested = size - taken
+            else:
+                n_requested = int(size * prevs[i])
+
+            n_candidates = len(self.index[class_i])
+            index_sample = self.index[class_i][
+                np.random.choice(n_candidates, size=n_requested, replace=(n_requested > n_candidates))
+            ] if n_requested > 0 else []
+
+            indexes_sample.append(index_sample)
+            taken += n_requested
+
+        indexes_sample = np.concatenate(indexes_sample).astype(int)
+
+        if shuffle:
+            indexes_sample = np.random.permutation(indexes_sample)
+
+        return indexes_sample
+
+    def sampling(self, size, *prevs, shuffle=True):
+        index = self.sampling_index(size, *prevs, shuffle=shuffle)
+        return self.sampling_from_index(index)
+
+    def sampling_from_index(self, index):
+        documents = self.instances[index]
+        labels = self.labels[index]
+        return LabelledCollection(documents, labels, n_classes=self.n_classes)
+
+    def split_stratified(self, train_prop=0.6):
+        tr_docs, te_docs, tr_labels, te_labels = \
+            train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels)
+        return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
+
+    def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1):
+        dimensions = self.n_classes
+        for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
+            yield self.sampling(sample_size, *prevs)
+
+    def __add__(self, other):
+        if issparse(self.instances) and issparse(other.instances):
+            docs = vstack([self.instances, other.instances])
+        elif isinstance(self.instances, list) and isinstance(other.instances, list):
+            docs = self.instances + other.instances
+        else:
+            raise NotImplementedError('unsupported operation for collection types')
+        labels = np.concatenate([self.labels, other.labels])
+        return LabelledCollection(docs, labels)
+
+
+class Dataset:
+
+    def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None):
+        assert training.n_classes == test.n_classes, 'incompatible labels in training and test collections'
+        self.training = training
+        self.test = test
+        self.vocabulary = vocabulary
+
+    @classmethod
+    def SplitStratified(cls, collection: LabelledCollection, train_size=0.6):
+        return Dataset(*collection.split_stratified(train_prop=train_size))
+
+    @property
+    def n_classes(self):
+        return self.training.n_classes
+
+    @property
+    def binary(self):
+        return self.training.binary
+
+    @classmethod
+    def load(cls, train_path, test_path, loader_func: callable):
+        training = LabelledCollection.load(train_path, loader_func)
+        test = LabelledCollection.load(test_path, loader_func)
+        return Dataset(training, test)
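
For reviewers, a minimal usage sketch of the sampling API introduced in base.py (not part of the patch): the file names are invented, the loader comes from reader.py further down in this diff, and quapy.functional.artificial_prevalence_sampling is assumed to be available (it is imported above but not included here).

from quapy.dataset.base import Dataset
from quapy.dataset.reader import from_text

# hypothetical paths; files follow the <0 or 1>\t<text> format parsed by from_text
data = Dataset.load('reviews_train.txt', 'reviews_test.txt', from_text)

print(data.training.prevalence())   # observed class prevalences of the training set

# a sample of 200 documents drawn at a fixed prevalence of 30%/70%
sample = data.training.sampling(200, 0.3, 0.7)
print(sample.counts())              # -> [ 60 140]

# sweep over an artificial grid of prevalences (e.g., 0.0, 0.1, ..., 1.0)
for sample in data.training.artificial_sampling_generator(sample_size=200, n_prevalences=11):
    print(sample.prevalence())
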
diff --git a/quapy/dataset/preprocessing.py b/quapy/dataset/preprocessing.py
new file mode 100644
index 0000000..1db2c27
--- /dev/null
+++ b/quapy/dataset/preprocessing.py
@@ -0,0 +1,150 @@
+from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+from scipy.sparse import spmatrix
+import numpy as np
+from tqdm import tqdm
+from quapy.utils.util import parallelize
+from .base import Dataset, LabelledCollection
+
+
+def text2tfidf(dataset: Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
+    """
+    Transforms a Dataset of textual instances into a Dataset of tfidf weighted sparse vectors
+    :param dataset: a Dataset where the instances are lists of str
+    :param min_df: minimum number of occurrences for a word to be considered as part of the vocabulary
+    :param sublinear_tf: whether or not to apply the log scaling to the tf counters
+    :param inplace: whether or not to apply the transformation inplace, or to a new copy
+    :param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.TfidfVectorizer)
+    :return: a new Dataset in csr_matrix format (if inplace=False) or a reference to the current Dataset (if inplace=True)
+    where the instances are stored in a csr_matrix of real-valued tfidf scores
+    """
+    __check_type(dataset.training.instances, (list, np.ndarray), str)
+    __check_type(dataset.test.instances, (list, np.ndarray), str)
+
+    vectorizer = TfidfVectorizer(min_df=min_df, sublinear_tf=sublinear_tf, **kwargs)
+    training_documents = vectorizer.fit_transform(dataset.training.instances)
+    test_documents = vectorizer.transform(dataset.test.instances)
+
+    if inplace:
+        dataset.training = LabelledCollection(training_documents, dataset.training.labels, dataset.n_classes)
+        dataset.test = LabelledCollection(test_documents, dataset.test.labels, dataset.n_classes)
+        dataset.vocabulary = vectorizer.vocabulary_
+        return dataset
+    else:
+        training = LabelledCollection(training_documents, dataset.training.labels.copy(), dataset.n_classes)
+        test = LabelledCollection(test_documents, dataset.test.labels.copy(), dataset.n_classes)
+        return Dataset(training, test, vectorizer.vocabulary_)
+
+
+def reduce_columns(dataset: Dataset, min_df=5, inplace=False):
+    """
+    Reduces the dimensionality of the csr_matrix by removing the columns of words which are not present in at least
+    _min_df_ instances
+    :param dataset: a Dataset in sparse format (any subtype of scipy.sparse.spmatrix)
+    :param min_df: minimum number of instances in which a term must occur for its column to be kept
+    :param inplace: whether or not to apply the transformation inplace, or to a new copy
+    :return: a new Dataset (if inplace=False) or a reference to the current Dataset (if inplace=True)
+    where the dimensions corresponding to infrequent terms have been removed
+    """
+    __check_type(dataset.training.instances, spmatrix)
+    __check_type(dataset.test.instances, spmatrix)
+    assert dataset.training.instances.shape[1] == dataset.test.instances.shape[1], 'unaligned vector spaces'
+
+    def filter_by_occurrences(X, W):
+        column_prevalence = np.asarray((X > 0).sum(axis=0)).flatten()
+        take_columns = column_prevalence >= min_df
+        X = X[:, take_columns]
+        W = W[:, take_columns]
+        return X, W
+
+    Xtr, Xte = filter_by_occurrences(dataset.training.instances, dataset.test.instances)
+    if inplace:
+        dataset.training.instances = Xtr
+        dataset.test.instances = Xte
+        return dataset
+    else:
+        training = LabelledCollection(Xtr, dataset.training.labels.copy(), dataset.n_classes)
+        test = LabelledCollection(Xte, dataset.test.labels.copy(), dataset.n_classes)
+        return Dataset(training, test)
+
+
+def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
+    """
+    Indexes a dataset of strings. To index a document means to replace each different token by a unique numerical index.
+    Rare words (i.e., words occurring less than _min_df_ times) are replaced by a special token UNK
+    :param dataset: a Dataset where the instances are lists of str
+    :param min_df: minimum number of documents in which a term must occur to be indexed; rarer terms are mapped to the UNK index
+    :param inplace: whether or not to apply the transformation inplace, or to a new copy
+    :param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.CountVectorizer)
+    :return: a new Dataset (if inplace=False) or a reference to the current Dataset (if inplace=True)
+    consisting of lists of integer values representing indices.
+ """ + __check_type(dataset.training.instances, list, str) + __check_type(dataset.test.instances, list, str) + + indexer = IndexTransformer(min_df=min_df, **kwargs) + training_index = indexer.fit_transform(dataset.training.instances) + test_index = indexer.transform(dataset.test.instances) + + if inplace: + dataset.training = LabelledCollection(training_index, dataset.training.labels, dataset.n_classes) + dataset.test = LabelledCollection(test_index, dataset.test.labels, dataset.n_classes) + dataset.vocabulary = indexer.vocabulary_ + return dataset + else: + training = LabelledCollection(training_index, dataset.training.labels.copy(), dataset.n_classes) + test = LabelledCollection(test_index, dataset.test.labels.copy(), dataset.n_classes) + return Dataset(training, test, indexer.vocabulary_) + + +def __check_type(container, container_type=None, element_type=None): + if container_type: + assert isinstance(container, container_type), \ + f'unexpected type of container (expected {container_type}, found {type(container)})' + if element_type: + assert isinstance(next(container), element_type), \ + f'unexpected type of element (expected {container_type}, found {type(container)})' + + + +class IndexTransformer: + + def __init__(self, **kwargs): + """ + :param kwargs: keyworded arguments from _sklearn.feature_extraction.text.CountVectorizer_ + """ + self.vect = CountVectorizer(**kwargs) + self.unk = -1 # a valid index is assigned after fit + + def fit(self, X): + """ + :param X: a list of strings + :return: self + """ + self.vect.fit(X) + self.analyzer = self.vect.build_analyzer() + self.vocabulary_ = self.vect.vocabulary_ + self.unk = self.add_word('UNK') + return self + + def transform(self, X, n_jobs=-1): + # given the number of tasks and the number of jobs, generates the slices for the parallel threads + assert self.unk > 0, 'transform called before fit' + indexed = parallelize(func=self.index, args=X, n_jobs=n_jobs) + return np.asarray(indexed) + + def index(self, documents): + vocab = self.vocabulary_.copy() + return [[vocab.get(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')] + + def fit_transform(self, X, n_jobs=-1): + return self.fit(X).transform(X, n_jobs=n_jobs) + + def vocabulary_size(self): + return len(self.vocabulary_) + 1 # the reserved unk token + + def add_word(self, word): + if word in self.vocabulary_: + raise ValueError(f'word {word} already in dictionary') + self.vocabulary_[word] = len(self.vocabulary_) + return self.vocabulary_[word] + diff --git a/quapy/dataset/reader.py b/quapy/dataset/reader.py new file mode 100644 index 0000000..e160d15 --- /dev/null +++ b/quapy/dataset/reader.py @@ -0,0 +1,56 @@ +import numpy as np +from scipy.sparse import dok_matrix +from tqdm import tqdm + + +def from_text(path): + """ + Reas a labelled colletion of documents. 
diff --git a/quapy/dataset/reader.py b/quapy/dataset/reader.py
new file mode 100644
index 0000000..e160d15
--- /dev/null
+++ b/quapy/dataset/reader.py
@@ -0,0 +1,56 @@
+import numpy as np
+from scipy.sparse import dok_matrix
+from tqdm import tqdm
+
+
+def from_text(path):
+    """
+    Reads a labelled collection of documents.
+    File format: <0 or 1>\t<document text>\n
+    :param path: path to the labelled collection
+    :return: a list of sentences, and a list of labels
+    """
+    all_sentences, all_labels = [], []
+    for line in tqdm(open(path, 'rt').readlines(), f'loading {path}'):
+        line = line.strip()
+        if line:
+            label, sentence = line.split('\t')
+            sentence = sentence.strip()
+            label = int(label)
+            if sentence:
+                all_sentences.append(sentence)
+                all_labels.append(label)
+    return all_sentences, all_labels
+
+
+def from_sparse(path):
+    """
+    Reads a labelled collection of real-valued instances expressed in sparse format
+    File format: <-1 or 0 or 1>[\s col(int):val(float)]\n
+    :param path: path to the labelled collection
+    :return: a csr_matrix containing the instances (rows), and a ndarray containing the labels
+    """
+
+    def split_col_val(col_val):
+        col, val = col_val.split(':')
+        col, val = int(col) - 1, float(val)
+        return col, val
+
+    all_documents, all_labels = [], []
+    max_col = 0
+    for line in tqdm(open(path, 'rt').readlines(), f'loading {path}'):
+        parts = line.strip().split()
+        if parts:
+            all_labels.append(int(parts[0]))
+            cols, vals = zip(*[split_col_val(col_val) for col_val in parts[1:]])
+            cols, vals = np.asarray(cols), np.asarray(vals)
+            max_col = max(max_col, cols.max())
+            all_documents.append((cols, vals))
+    n_docs = len(all_labels)
+    X = dok_matrix((n_docs, max_col + 1), dtype=float)
+    for i, (cols, vals) in tqdm(enumerate(all_documents), total=len(all_documents),
+                                desc=f'-- filling matrix of shape {X.shape}'):
+        X[i, cols] = vals
+    X = X.tocsr()
+    y = np.asarray(all_labels) + 1  # shift labels to be non-negative, e.g., {-1, 0, 1} -> {0, 1, 2}
+    return X, y
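
To close, an illustrative call to from_sparse (the file path and its contents below are made up):

# hypothetical file 'train.sparse', one document per line:
# -1 3:0.5 7:1.2
#  1 1:2.0 3:0.1
from quapy.dataset.reader import from_sparse

X, y = from_sparse('train.sparse')
print(X.shape)   # csr_matrix of shape (n_docs, n_features); columns in the file are 1-based
print(y)         # labels shifted to be non-negative, e.g., {-1, 1} -> [0 2]
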