Merge branch 'crosslingual' of gitea-s2i2s.isti.cnr.it:moreo/QuaPy

This commit is contained in:
Alejandro Moreo Fernandez 2021-07-02 13:51:52 +02:00
commit 537a95fa18
2 changed files with 106 additions and 0 deletions

View File

@ -35,6 +35,8 @@ GridSearchQ is not truly parallelized. It only parallelizes on the predictions.
In the context of a quantifier (e.g., QuaNet or CC), the parameters of the learner should be prefixed with "estimator__",
in QuaNet this is resolved with a __check_params_colision, but this should be improved. It might be cumbersome to
impose the "estimator__" prefix for, e.g., quantifiers like CC though... This should be changed everywhere...
QuaNet needs refactoring. The base quantifiers ACC and PACC receive val_data with instances already transformed. This
issue is due to a bad design.
Improvements:
==========================================
@ -49,6 +51,10 @@ We might want to think of (improving and) adding the class Tabular (it is define
experiment looks like. (Do we want to abstract experimental results? this could be useful not only for tables but
also for plots).
Add a proper logging system. Currently we use print.
It might be good to reduce the number of methods that have to be implemented for any new Quantifier. At the moment,
there are many functions like get_params, set_params, and, especially, @property classes_, which are cumbersome to
implement for quick experiments. A possible solution is to impose get_params and set_params only in cases in which
the model extends some "ModelSelectable" interface. The classes_ property should have a default implementation.
Checks:
==========================================

View File

@ -1,3 +1,5 @@
from typing import List, Union
import numpy as np
from scipy.sparse import issparse
from scipy.sparse import vstack
@ -174,6 +176,104 @@ class LabelledCollection:
yield train, test
class MultilingualLabelledCollection:
    """A group of LabelledCollection objects, one per language, sharing the same classes.

    The per-language collections are stored in the dict ``self.llc`` (language -> collection)
    and the common classes in ``self.classes_``. Most methods simply apply the homonymous
    LabelledCollection method to every language and gather the results either in a dict
    (the ``*_index`` methods) or in a new MultilingualLabelledCollection.
    """

    def __init__(self, langs: List[str], labelledCollections: List["LabelledCollection"]):
        """
        :param langs: language identifiers, aligned with `labelledCollections`
        :param labelledCollections: one LabelledCollection per language; all of them must
            expose the same `classes_`
        :raises AssertionError: if the two lists differ in length, if some element is not a
            LabelledCollection, or if the collections do not share the same classes
        """
        assert len(langs) == len(labelledCollections), 'length mismatch for langs and labelledCollection lists'
        assert all(isinstance(lc, LabelledCollection) for lc in labelledCollections), \
            'unexpected type for labelledCollections'
        # np.array_equal yields one boolean per comparison; a plain `==` is element-wise (and
        # thus ambiguous inside all()) whenever classes_ is a numpy array -- presumably the
        # case for LabelledCollection; array_equal also works for plain sequences
        reference_classes = labelledCollections[0].classes_
        assert all(np.array_equal(reference_classes, lc_i.classes_) for lc_i in labelledCollections[1:]), \
            'inconsistent classes found for some labelled collections'
        self.llc = dict(zip(langs, labelledCollections))
        self.classes_ = reference_classes

    @classmethod
    def fromLangDict(cls, lang_labelledCollection: dict):
        """Alternate constructor taking a dict language -> LabelledCollection."""
        langs = list(lang_labelledCollection)
        collections = [lang_labelledCollection[l] for l in langs]
        return cls(langs, collections)

    def langs(self):
        """Return the languages in sorted order."""
        return sorted(self.llc)

    def __getitem__(self, lang) -> "LabelledCollection":
        """Return the collection stored for `lang` (KeyError if the language is unknown)."""
        return self.llc[lang]

    @classmethod
    def load(cls, path: str, loader_func: callable):
        """Build an instance from `loader_func(path)`, whose result is unpacked as the
        constructor arguments, i.e., it is expected to be (langs, labelledCollections)."""
        return cls(*loader_func(path))

    def __len__(self):
        """Total number of instances, summed across languages."""
        return sum(map(len, self.llc.values()))

    def prevalence(self):
        """Class prevalence of the whole collection: the per-language prevalences weighted by
        the size of each language and normalized to sum up to 1."""
        weighted = np.asarray([lc.prevalence() * len(lc) for lc in self.llc.values()]).sum(axis=0)
        return weighted / weighted.sum()

    def language_prevalence(self):
        """Proportion of instances of each language, in the order given by `langs()`."""
        lang_count = np.asarray([len(self.llc[l]) for l in self.langs()])
        return lang_count / lang_count.sum()

    def counts(self):
        """Sum, across languages, of the values returned by each collection's `counts()`."""
        return np.asarray([lc.counts() for lc in self.llc.values()]).sum(axis=0)

    @property
    def n_classes(self):
        """Number of classes."""
        return len(self.classes_)

    @property
    def binary(self):
        """True if there are exactly two classes."""
        return self.n_classes == 2

    def __check_langs(self, l_dict: dict):
        """Assert that `l_dict` has exactly one entry for each language of this collection."""
        known_langs = self.langs()
        assert len(l_dict) == len(known_langs), 'wrong number of languages'
        assert all(l in l_dict for l in known_langs), 'missing languages in l_sizes'

    def __check_sizes(self, l_sizes: Union[int, dict]):
        """Normalize `l_sizes` into a dict language -> size.

        :param l_sizes: an int (the same size is then used for every language) or a dict with
            one size per language
        :return: a dict language -> size
        :raises AssertionError: if `l_sizes` is neither an int nor a dict, or if the dict does
            not cover exactly the languages of this collection
        """
        assert isinstance(l_sizes, (int, dict)), 'unexpected type for l_sizes'
        if isinstance(l_sizes, int):
            return {l: l_sizes for l in self.langs()}
        self.__check_langs(l_sizes)
        return l_sizes

    def sampling_index(self, l_sizes: Union[int, dict], *prevs, shuffle=True):
        """Return a dict language -> result of that collection's `sampling_index`, called
        with the size requested for the language and the same `prevs` and `shuffle`."""
        l_sizes = self.__check_sizes(l_sizes)
        return {l: lc.sampling_index(l_sizes[l], *prevs, shuffle=shuffle) for l, lc in self.llc.items()}

    def uniform_sampling_index(self, l_sizes: Union[int, dict]):
        """Return a dict language -> result of that collection's `uniform_sampling_index`,
        called with the size requested for the language."""
        l_sizes = self.__check_sizes(l_sizes)
        return {l: lc.uniform_sampling_index(l_sizes[l]) for l, lc in self.llc.items()}

    def uniform_sampling(self, l_sizes: Union[int, dict]):
        """Return a new multilingual collection made of the `uniform_sampling` of every
        language, each with the size requested for it."""
        l_sizes = self.__check_sizes(l_sizes)
        return MultilingualLabelledCollection.fromLangDict(
            {l: lc.uniform_sampling(l_sizes[l]) for l, lc in self.llc.items()}
        )

    def sampling(self, l_sizes: Union[int, dict], *prevs, shuffle=True):
        """Return a new multilingual collection made of the `sampling` of every language,
        each with the size requested for it and the same `prevs` and `shuffle`."""
        l_sizes = self.__check_sizes(l_sizes)
        return MultilingualLabelledCollection.fromLangDict(
            {l: lc.sampling(l_sizes[l], *prevs, shuffle=shuffle) for l, lc in self.llc.items()}
        )

    def sampling_from_index(self, l_index: dict):
        """Return a new multilingual collection made of the `sampling_from_index` of every
        language; `l_index` must contain one index per language."""
        self.__check_langs(l_index)
        return MultilingualLabelledCollection.fromLangDict(
            {l: lc.sampling_from_index(l_index[l]) for l, lc in self.llc.items()}
        )

    def split_stratified(self, train_prop=0.6, random_state=None):
        """Apply `split_stratified(train_prop, random_state)` to every language and return
        the pair (training, test) of multilingual collections."""
        langs = self.langs()
        # each per-language split is unpacked as a (train, test) pair
        train, test = zip(*[self[l].split_stratified(train_prop, random_state) for l in langs])
        return MultilingualLabelledCollection(langs, train), MultilingualLabelledCollection(langs, test)

    def asLabelledCollection(self, return_langs=False):
        """Merge all languages into one single LabelledCollection.

        :param return_langs: if True, the language of every instance is returned as well
        :return: the merged LabelledCollection or, if `return_langs` is True, a tuple with the
            merged collection and an array with one language label per instance, aligned
            with the rows of the merged collection
        """
        # a list with (lang_i, Xi, yi); lc.Xy is assumed to unpack as the pair (X, y)
        lXy_list = [([l] * len(lc), *lc.Xy) for l, lc in self.llc.items()]
        ls, Xs, ys = zip(*lXy_list)
        ls = np.concatenate(ls)
        # the stacking function is chosen by looking at the first block only
        vertstack = vstack if issparse(Xs[0]) else np.vstack
        Xs = vertstack(Xs)
        ys = np.concatenate(ys)
        lc = LabelledCollection(Xs, ys, classes_=self.classes_)
        # the parentheses matter: `lc, ls if return_langs else lc` would always build a tuple
        return (lc, ls) if return_langs else lc
#
#
#
class Dataset:
def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):