diff --git a/TODO.txt b/TODO.txt
index 6ff9e9c..2e153a2 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -17,14 +17,13 @@ Current issues:
 In binary quantification (hp, kindle, imdb) we used F1 in the minority class (which in kindle and hp happens to be the
 negative class). This is not covered in this new implementation, in which the binary case is not treated as such, but as
 an instance of single-label with 2 labels. Check
-Add classnames to LabelledCollection? This should improve visualization of reports
 Add automatic reindex of class labels in LabelledCollection (currently, class indexes should be ordered and with no gaps)
 OVR I believe is currently tied to aggregative methods. We should provide a general interface also for general quantifiers
 Currently, being "binary" only adds one checker; we should figure out how to impose the check to be automatically performed
+Add random seed management to support replicability (see temp_seed in util.py).
 
 Improvements:
 ==========================================
-Clarify whether QuaNet is an aggregative method or not.
 Explore the hyperparameter "number of bins" in HDy
 Rename EMQ to SLD ?
 Parallelize the kFCV in ACC and PACC?
diff --git a/quapy/classification/neural.py b/quapy/classification/neural.py
index 68a924e..afeb649 100644
--- a/quapy/classification/neural.py
+++ b/quapy/classification/neural.py
@@ -11,8 +11,8 @@ from torch.nn.utils.rnn import pad_sequence
 from tqdm import tqdm
 
 import quapy as qp
-from data import LabelledCollection
-from util import EarlyStop
+from quapy.data import LabelledCollection
+from quapy.util import EarlyStop
 
 
 class NeuralClassifierTrainer:
diff --git a/quapy/data/base.py b/quapy/data/base.py
index 6b2ddec..ffd7e31 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -2,40 +2,52 @@ import numpy as np
 from scipy.sparse import issparse
 from scipy.sparse import vstack
 from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
+
 from quapy.functional import artificial_prevalence_sampling, strprev
 
 
 class LabelledCollection:
+    '''
+    A LabelledCollection is a set of objects, each associated with a label.
+    '''
 
-    def __init__(self, instances, labels, n_classes=None):
+    def __init__(self, instances, labels, classes_=None):
+        """
+        :param instances: list of objects
+        :param labels: list of labels, of the same length as instances
+        :param classes_: optional, list of classes from which the labels are taken; when used, it must include every value that appears in labels
+ """ if issparse(instances): self.instances = instances - elif isinstance(instances, list) and len(instances)>0 and isinstance(instances[0], str): + elif isinstance(instances, list) and len(instances) > 0 and isinstance(instances[0], str): # lists of strings occupy too much as ndarrays (although python-objects add a heavy overload) self.instances = np.asarray(instances, dtype=object) else: self.instances = np.asarray(instances) - self.labels = np.asarray(labels, dtype=int) + self.labels = np.asarray(labels) n_docs = len(self) - if n_classes is None: + if classes_ is None: self.classes_ = np.unique(self.labels) self.classes_.sort() else: - self.classes_ = np.arange(n_classes) - self.index = {class_i: np.arange(n_docs)[self.labels == class_i] for class_i in self.classes_} + self.classes_ = np.unique(np.asarray(classes_)) + self.classes_.sort() + if len(set(self.labels).difference(set(classes_))) > 0: + raise ValueError('labels contains values not included in classes_') + self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_} @classmethod - def load(cls, path:str, loader_func:callable): + def load(cls, path: str, loader_func: callable): return LabelledCollection(*loader_func(path)) def __len__(self): return self.instances.shape[0] def prevalence(self): - return self.counts()/len(self) + return self.counts() / len(self) def counts(self): - return np.asarray([len(self.index[ci]) for ci in self.classes_]) + return np.asarray([len(self.index[class_]) for class_ in self.classes_]) @property def n_classes(self): @@ -48,21 +60,21 @@ class LabelledCollection: def sampling_index(self, size, *prevs, shuffle=True): if len(prevs) == 0: # no prevalence was indicated; returns an index for uniform sampling return np.random.choice(len(self), size, replace=False) - if len(prevs) == self.n_classes-1: - prevs = prevs + (1-sum(prevs),) + if len(prevs) == self.n_classes - 1: + prevs = prevs + (1 - sum(prevs),) assert len(prevs) == self.n_classes, 'unexpected number of prevalences' assert sum(prevs) == 1, f'prevalences ({prevs}) wrong range (sum={sum(prevs)})' taken = 0 indexes_sample = [] - for i, class_i in enumerate(self.classes_): - if i == self.n_classes-1: + for i, class_ in enumerate(self.classes_): + if i == self.n_classes - 1: n_requested = size - taken else: n_requested = int(size * prevs[i]) - n_candidates = len(self.index[class_i]) - index_sample = self.index[class_i][ + n_candidates = len(self.index[class_]) + index_sample = self.index[class_][ np.random.choice(n_candidates, size=n_requested, replace=(n_requested > n_candidates)) ] if n_requested > 0 else [] @@ -90,21 +102,22 @@ class LabelledCollection: def sampling_from_index(self, index): documents = self.instances[index] labels = self.labels[index] - return LabelledCollection(documents, labels, n_classes=self.n_classes) + return LabelledCollection(documents, labels, classes_=self.classes_) def split_stratified(self, train_prop=0.6, random_state=None): # with temp_seed(42): tr_docs, te_docs, tr_labels, te_labels = \ - train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels, random_state=random_state) + train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels, + random_state=random_state) return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels) def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1): - dimensions=self.n_classes + dimensions = self.n_classes for prevs in 
            yield self.sampling(sample_size, *prevs)
 
     def artificial_sampling_index_generator(self, sample_size, n_prevalences=101, repeats=1):
-        dimensions=self.n_classes
+        dimensions = self.n_classes
        for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
            yield self.sampling_index(sample_size, *prevs)
@@ -142,10 +155,10 @@ class LabelledCollection:
        else:
            nfeats = '?'
        stats_ = {'instances': ninstances,
-                 'type': instance_type,
-                 'features': nfeats,
-                 'classes': self.n_classes,
-                 'prevs': strprev(self.prevalence())}
+                  'type': instance_type,
+                  'features': nfeats,
+                  'classes': self.classes_,
+                  'prevs': strprev(self.prevalence())}
        if show:
            print(f'#instances={stats_["instances"]}, type={stats_["type"]}, #features={stats_["features"]}, '
                  f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}')
@@ -155,13 +168,14 @@ class LabelledCollection:
        kf = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state)
        for train_index, test_index in kf.split(*self.Xy):
            train = self.sampling_from_index(train_index)
-            test = self.sampling_from_index(test_index)
+            test = self.sampling_from_index(test_index)
            yield train, test
+
 
 class Dataset:
 
     def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):
-        assert training.n_classes == test.n_classes, 'incompatible labels in training and test collections'
+        assert set(training.classes_) == set(test.classes_), 'incompatible labels in training and test collections'
        self.training = training
        self.test = test
        self.vocabulary = vocabulary
@@ -171,6 +185,10 @@ class Dataset:
     def SplitStratified(cls, collection: LabelledCollection, train_size=0.6):
        return Dataset(*collection.split_stratified(train_prop=train_size))
 
+    @property
+    def classes_(self):
+        return self.training.classes_
+
     @property
     def n_classes(self):
        return self.training.n_classes
@@ -195,19 +213,15 @@ class Dataset:
        print(f'Dataset={self.name} #tr-instances={tr_stats["instances"]}, #te-instances={te_stats["instances"]}, '
              f'type={tr_stats["type"]}, #features={tr_stats["features"]}, #classes={tr_stats["classes"]}, '
              f'tr-prevs={tr_stats["prevs"]}, te-prevs={te_stats["prevs"]}')
-        return {'train': tr_stats ,'test':te_stats}
+        return {'train': tr_stats, 'test': te_stats}
 
     @classmethod
     def kFCV(cls, data: LabelledCollection, nfolds=5, nrepeats=1, random_state=0):
        for i, (train, test) in enumerate(data.kFCV(nfolds=nfolds, nrepeats=nrepeats, random_state=random_state)):
-            yield Dataset(train, test, name=f'fold {(i%nfolds)+1}/{nfolds} (round={(i//nfolds)+1})')
+            yield Dataset(train, test, name=f'fold {(i % nfolds) + 1}/{nfolds} (round={(i // nfolds) + 1})')
 
 
 def isbinary(data):
    if isinstance(data, Dataset) or isinstance(data, LabelledCollection):
        return data.binary
    return False
-
-
-
-
diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
index 79d0bbf..575ffca 100644
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -47,7 +47,7 @@ UCI_DATASETS = ['acute.a', 'acute.b',
                 'yeast']
 
 
-def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False):
+def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False) -> Dataset:
     """
     Load a Reviews dataset as a Dataset instance, as used in:
     Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
@@ -91,7 +91,7 @@ def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle
     return data
 
 
-def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False):
+def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False) -> Dataset:
     """
     Load a Twitter dataset as a Dataset instance, as used in:
     Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
@@ -162,12 +162,12 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
     return data
 
 
-def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False):
+def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
     data = fetch_UCILabelledCollection(dataset_name, data_home, verbose)
     return Dataset(*data.split_stratified(1 - test_split, random_state=0))
 
 
-def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False):
+def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
 
     assert dataset_name in UCI_DATASETS, \
         f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
diff --git a/quapy/data/preprocessing.py b/quapy/data/preprocessing.py
index 77752f0..ee1627e 100644
--- a/quapy/data/preprocessing.py
+++ b/quapy/data/preprocessing.py
@@ -29,13 +29,13 @@ def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kw
     test_documents = vectorizer.transform(dataset.test.instances)
 
     if inplace:
-        dataset.training = LabelledCollection(training_documents, dataset.training.labels, dataset.n_classes)
-        dataset.test = LabelledCollection(test_documents, dataset.test.labels, dataset.n_classes)
+        dataset.training = LabelledCollection(training_documents, dataset.training.labels, dataset.classes_)
+        dataset.test = LabelledCollection(test_documents, dataset.test.labels, dataset.classes_)
         dataset.vocabulary = vectorizer.vocabulary_
         return dataset
     else:
-        training = LabelledCollection(training_documents, dataset.training.labels.copy(), dataset.n_classes)
-        test = LabelledCollection(test_documents, dataset.test.labels.copy(), dataset.n_classes)
+        training = LabelledCollection(training_documents, dataset.training.labels.copy(), dataset.classes_)
+        test = LabelledCollection(test_documents, dataset.test.labels.copy(), dataset.classes_)
         return Dataset(training, test, vectorizer.vocabulary_)
 
 
@@ -66,8 +66,8 @@ def reduce_columns(dataset: Dataset, min_df=5, inplace=False):
         dataset.test.instances = Xte
         return dataset
     else:
-        training = LabelledCollection(Xtr, dataset.training.labels.copy(), dataset.n_classes)
-        test = LabelledCollection(Xte, dataset.test.labels.copy(), dataset.n_classes)
+        training = LabelledCollection(Xtr, dataset.training.labels.copy(), dataset.classes_)
+        test = LabelledCollection(Xte, dataset.test.labels.copy(), dataset.classes_)
         return Dataset(training, test)
 
 
@@ -100,13 +100,13 @@ def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
     test_index = indexer.transform(dataset.test.instances)
 
     if inplace:
-        dataset.training = LabelledCollection(training_index, dataset.training.labels, dataset.n_classes)
-        dataset.test = LabelledCollection(test_index, dataset.test.labels, dataset.n_classes)
+        dataset.training = LabelledCollection(training_index, dataset.training.labels, dataset.classes_)
+        dataset.test = LabelledCollection(test_index, dataset.test.labels, dataset.classes_)
         dataset.vocabulary = indexer.vocabulary_
         return dataset
     else:
-        training = LabelledCollection(training_index, dataset.training.labels.copy(), dataset.n_classes)
-        test = LabelledCollection(test_index, dataset.test.labels.copy(), dataset.n_classes)
+        training = LabelledCollection(training_index, dataset.training.labels.copy(), dataset.classes_)
+        test = LabelledCollection(test_index, dataset.test.labels.copy(), dataset.classes_)
         return Dataset(training, test, indexer.vocabulary_)
diff --git a/quapy/data/reader.py b/quapy/data/reader.py
index 743b99e..5b4d115 100644
--- a/quapy/data/reader.py
+++ b/quapy/data/reader.py
@@ -3,7 +3,7 @@ from scipy.sparse import dok_matrix
 from tqdm import tqdm
 
 
-def from_text(path):
+def from_text(path, encoding='utf-8'):
     """
     Reads a labelled collection of documents.
     File format <0 or 1>\t<document>\n
@@ -11,7 +11,7 @@
     :return: a list of sentences, and a list of labels
     """
     all_sentences, all_labels = [], []
-    for line in tqdm(open(path, 'rt').readlines(), f'loading {path}'):
+    for line in tqdm(open(path, 'rt', encoding=encoding).readlines(), f'loading {path}'):
         line = line.strip()
         if line:
             label, sentence = line.split('\t')
@@ -25,8 +25,8 @@ def from_text(path):
 
 def from_sparse(path):
     """
-    Reas a labelled colletion of real-valued instances expressed in sparse format
-    File fomart <-1 or 0 or 1>[\s col(int):val(float)]\n
+    Reads a labelled collection of real-valued instances expressed in sparse format
+    File format <-1 or 0 or 1>[\s col(int):val(float)]\n
     :param path: path to the labelled collection
     :return: a csr_matrix containing the instances (rows), and a ndarray containing the labels
     """
@@ -56,16 +56,16 @@ def from_sparse(path):
     return X, y
 
 
-def from_csv(path):
+def from_csv(path, encoding='utf-8'):
     """
-    Reas a csv file in which columns are separated by ','.
-    File fomart
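
A minimal usage sketch of the new classes_ parameter of LabelledCollection introduced in quapy/data/base.py above (the example data below is invented for illustration):

    from quapy.data import LabelledCollection

    # three instances, but only two of the three declared classes occur in the labels
    instances = ['good movie', 'boring movie', 'terrible movie']
    labels = [1, 0, 0]

    # without classes_, the classes would be inferred as [0, 1]; declaring
    # classes_=[0, 1, 2] preserves the empty class 2, so prevalence() is 3-dimensional
    data = LabelledCollection(instances, labels, classes_=[0, 1, 2])
    print(data.classes_)      # [0 1 2]
    print(data.prevalence())  # [0.667 0.333 0.   ] (approx.)

    # labels containing values not listed in classes_ now raise a ValueError:
    # LabelledCollection(instances, [1, 0, 3], classes_=[0, 1, 2])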