diff --git a/TODO.txt b/TODO.txt
index 3c00f94..fd46d02 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -35,6 +35,8 @@ GridSearchQ is not trully parallelized. It only parallelizes on the predictions.
 In the context of a quantifier (e.g., QuaNet or CC), the parameters of the learner should be prefixed with "estimator__",
     in QuaNet this is resolved with a __check_params_colision, but this should be improved. It might be cumbersome to
     impose the "estimator__" prefix for, e.g., quantifiers like CC though... This should be changed everywhere...
+QuaNet needs refactoring. The base quantifiers ACC and PACC receive val_data with instances already transformed. This
+    issue is due to a bad design.
 
 Improvements:
 ==========================================
@@ -49,6 +51,10 @@ We might want to think of (improving and) adding the class Tabular (it is define
     experiment looks like. (Do we want to abstract experimental results? this could be useful not only for tables but
     also for plots).
 Add proper logging system. Currently we use print
+It might be good to reduce the number of methods that have to be implemented for any new Quantifier. At the moment,
+    there are many functions like get_params, set_params, and, especially, @property classes_, which are cumbersome to
+    implement for quick experiments. A possible solution is to require get_params and set_params only when the model
+    extends some "ModelSelectable" interface. The classes_ property should have a default implementation.
 
 Checks:
 ==========================================
diff --git a/quapy/data/base.py b/quapy/data/base.py
index 69d3ae6..b482548 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -1,3 +1,5 @@
+from typing import List, Union
+
 import numpy as np
 from scipy.sparse import issparse
 from scipy.sparse import vstack
@@ -174,6 +176,104 @@ class LabelledCollection:
             yield train, test
 
 
+class MultilingualLabelledCollection:
+    def __init__(self, langs: List[str], labelledCollections: List[LabelledCollection]):
+        assert len(langs) == len(labelledCollections), 'length mismatch for langs and labelledCollection lists'
+        assert all(isinstance(lc, LabelledCollection) for lc in labelledCollections), 'unexpected type for labelledCollections'
+        assert all(np.array_equal(labelledCollections[0].classes_, lc_i.classes_) for lc_i in labelledCollections[1:]), \
+            'inconsistent classes found for some labelled collections'
+        self.llc = {l: lc for l, lc in zip(langs, labelledCollections)}
+        self.classes_ = labelledCollections[0].classes_
+
+    @classmethod
+    def fromLangDict(cls, lang_labelledCollection: dict):
+        return MultilingualLabelledCollection(*list(zip(*list(lang_labelledCollection.items()))))
+
+    def langs(self):
+        return list(sorted(self.llc.keys()))
+
+    def __getitem__(self, lang) -> LabelledCollection:
+        return self.llc[lang]
+
+    @classmethod
+    def load(cls, path: str, loader_func: callable):
+        return MultilingualLabelledCollection(*loader_func(path))
+
+    def __len__(self):
+        return sum(map(len, self.llc.values()))
+
+    def prevalence(self):
+        prev = np.asarray([lc.prevalence() * len(lc) for lc in self.llc.values()]).sum(axis=0)
+        return prev / prev.sum()
+
+    def language_prevalence(self):
+        lang_count = np.asarray([len(self.llc[l]) for l in self.langs()])
+        return lang_count / lang_count.sum()
+
+    def counts(self):
+        return np.asarray([lc.counts() for lc in self.llc.values()]).sum(axis=0)
+
+    @property
+    def n_classes(self):
+        return len(self.classes_)
+
+    @property
+    def binary(self):
+        return self.n_classes == 2
+
+    def __check_langs(self, l_dict: dict):
+        assert len(l_dict) == len(self.langs()), 'wrong number of languages'
+        assert all(l in l_dict for l in self.langs()), 'missing languages in l_sizes'
+
+    def __check_sizes(self, l_sizes: Union[int, dict]):
+        assert isinstance(l_sizes, int) or isinstance(l_sizes, dict), 'unexpected type for l_sizes'
+        if isinstance(l_sizes, int):
+            return {l: l_sizes for l in self.langs()}
+        self.__check_langs(l_sizes)
+        return l_sizes
+
+    def sampling_index(self, l_sizes: Union[int, dict], *prevs, shuffle=True):
+        l_sizes = self.__check_sizes(l_sizes)
+        return {l: lc.sampling_index(l_sizes[l], *prevs, shuffle=shuffle) for l, lc in self.llc.items()}
+
+    def uniform_sampling_index(self, l_sizes: Union[int, dict]):
+        l_sizes = self.__check_sizes(l_sizes)
+        return {l: lc.uniform_sampling_index(l_sizes[l]) for l, lc in self.llc.items()}
+
+    def uniform_sampling(self, l_sizes: Union[int, dict]):
+        l_sizes = self.__check_sizes(l_sizes)
+        return MultilingualLabelledCollection.fromLangDict(
+            {l: lc.uniform_sampling(l_sizes[l]) for l, lc in self.llc.items()}
+        )
+
+    def sampling(self, l_sizes: Union[int, dict], *prevs, shuffle=True):
+        l_sizes = self.__check_sizes(l_sizes)
+        return MultilingualLabelledCollection.fromLangDict(
+            {l: lc.sampling(l_sizes[l], *prevs, shuffle=shuffle) for l, lc in self.llc.items()}
+        )
+
+    def sampling_from_index(self, l_index: dict):
+        self.__check_langs(l_index)
+        return MultilingualLabelledCollection.fromLangDict(
+            {l: lc.sampling_from_index(l_index[l]) for l, lc in self.llc.items()}
+        )
+
+    def split_stratified(self, train_prop=0.6, random_state=None):
+        train, test = list(zip(*[self[l].split_stratified(train_prop, random_state) for l in self.langs()]))
+        return MultilingualLabelledCollection(self.langs(), train), MultilingualLabelledCollection(self.langs(), test)
+
+    def asLabelledCollection(self, return_langs=False):
+        lXy_list = [([l] * len(lc), *lc.Xy) for l, lc in self.llc.items()]  # a list with (lang_i, Xi, yi)
+        ls, Xs, ys = list(zip(*lXy_list))
+        ls = np.concatenate(ls)
+        vertstack = vstack if issparse(Xs[0]) else np.vstack
+        Xs = vertstack(Xs)
+        ys = np.concatenate(ys)
+        lc = LabelledCollection(Xs, ys, classes_=self.classes_)
+        return (lc, ls) if return_langs else lc
+#
+#
+#
 class Dataset:
     def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):
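
For context, a minimal usage sketch of the MultilingualLabelledCollection API introduced by this patch; it assumes the patch (including the return statement in asLabelledCollection) is applied, and the language codes, sizes, and random features below are made up for illustration:

    # usage sketch (illustrative only): two toy per-language collections sharing the classes {0, 1}
    import numpy as np
    from quapy.data.base import LabelledCollection, MultilingualLabelledCollection

    X_en, y_en = np.random.rand(100, 5), np.array([0, 1] * 50)
    X_it, y_it = np.random.rand(80, 5), np.array([0, 1] * 40)

    mlc = MultilingualLabelledCollection(
        langs=['en', 'it'],
        labelledCollections=[LabelledCollection(X_en, y_en), LabelledCollection(X_it, y_it)]
    )

    print(mlc.langs())                # ['en', 'it']
    print(len(mlc))                   # 180
    print(mlc.prevalence())           # overall class prevalence, weighted by per-language size
    print(mlc.language_prevalence())  # fraction of instances contributed by each language

    # draw 50 instances per language at class prevalence (0.3, 0.7), then flatten into a single
    # LabelledCollection, optionally keeping the language of each instance
    sample = mlc.sampling(50, 0.3, 0.7)
    lc, langs = sample.asLabelledCollection(return_langs=True)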