adding features for cross-lingual

parent 986e61620c
commit ce908573e7

TODO.txt | 6 ++++++
@@ -35,6 +35,8 @@ GridSearchQ is not truly parallelized. It only parallelizes on the predictions.
In the context of a quantifier (e.g., QuaNet or CC), the parameters of the learner should be prefixed with
"estimator__"; in QuaNet this is resolved with a __check_params_colision method, but this should be improved. It might
be cumbersome to impose the "estimator__" prefix for, e.g., quantifiers like CC, though. This should be changed
everywhere (see the routing sketch after this hunk).
QuaNet needs refactoring. The base quantifiers ACC and PACC receive val_data with instances already transformed. This
issue is due to a bad design.

Improvements:
==========================================

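For reference, the "estimator__" convention mentioned above is the double-underscore routing that scikit-learn uses
for nested estimators. A minimal sketch of that convention (MyCC and its routing logic are illustrative stand-ins, not
QuaPy's actual classes):

    from sklearn.linear_model import LogisticRegression

    class MyCC:
        # toy quantifier wrapper used only to illustrate 'estimator__' routing
        def __init__(self, estimator):
            self.estimator = estimator

        def get_params(self, deep=True):
            params = {'estimator': self.estimator}
            if deep:
                # expose the learner's parameters under the 'estimator__' prefix
                for k, v in self.estimator.get_params().items():
                    params[f'estimator__{k}'] = v
            return params

        def set_params(self, **params):
            for key, value in params.items():
                if key.startswith('estimator__'):
                    # strip the prefix and forward to the wrapped learner
                    self.estimator.set_params(**{key[len('estimator__'):]: value})
                else:
                    setattr(self, key, value)
            return self

    quantifier = MyCC(LogisticRegression())
    quantifier.set_params(estimator__C=10)
    print(quantifier.get_params()['estimator__C'])  # -> 10
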
@@ -49,6 +51,10 @@ We might want to think of (improving and) adding the class Tabular (it is define
experiment looks like. (Do we want to abstract experimental results? This could be useful not only for tables but
also for plots.)
Add a proper logging system; currently we use print statements (see the logging sketch after this hunk).
It might be good to simplify the number of methods that have to be implemented for any new Quantifier. At the moment,
there are many functions like get_params, set_params, and, especially, @property classes_, which are cumbersome to
implement for quick experiments. A possible solution is to impose get_params and set_params only in cases in which
the model extends some "ModelSelectable" interface (see the base-class sketch after this hunk). The classes_ property
should have a default implementation.

Checks:
==========================================

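Two quick illustrations of the notes above. First, the logging point: Python's standard logging module is the obvious
replacement for raw print calls (the logger name and the message are just examples):

    import logging

    logger = logging.getLogger('quapy')

    def setup_logging(level=logging.INFO):
        # a single console handler; file handlers could be added later
        logging.basicConfig(level=level, format='%(asctime)s [%(levelname)s] %(name)s: %(message)s')

    setup_logging()
    logger.info('GridSearchQ: evaluating combination 3/27')

Second, a hedged sketch of the interface split suggested by the note: "ModelSelectable" is the name the note itself
uses, while the learner attribute in the default classes_ is an assumption, not necessarily QuaPy's actual attribute:

    from abc import ABC, abstractmethod

    class ModelSelectable(ABC):
        # only quantifiers meant for use with GridSearchQ need these two methods
        @abstractmethod
        def get_params(self, deep=True): ...

        @abstractmethod
        def set_params(self, **params): ...

    class BaseQuantifier:
        @property
        def classes_(self):
            # default implementation: delegate to the wrapped learner
            # ('learner' is an assumed attribute name)
            return self.learner.classes_
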
@@ -1,3 +1,5 @@
from typing import List, Union

import numpy as np
from scipy.sparse import issparse
from scipy.sparse import vstack

@@ -174,6 +176,104 @@ class LabelledCollection:
        yield train, test


class MultilingualLabelledCollection:
    def __init__(self, langs: List[str], labelledCollections: List[LabelledCollection]):
        assert len(langs) == len(labelledCollections), 'length mismatch for langs and labelledCollections lists'
        assert all(isinstance(lc, LabelledCollection) for lc in labelledCollections), 'unexpected type for labelledCollections'
        # all collections must share the same class set (np.array_equal avoids the
        # ambiguous truth value of an elementwise ndarray comparison)
        assert all(np.array_equal(labelledCollections[0].classes_, lc_i.classes_) for lc_i in labelledCollections[1:]), \
            'inconsistent classes found for some labelled collections'
        self.llc = {l: lc for l, lc in zip(langs, labelledCollections)}
        self.classes_ = labelledCollections[0].classes_

    @classmethod
    def fromLangDict(cls, lang_labelledCollection: dict):
        # unzip the {lang: collection} dict into parallel lists of langs and collections
        return MultilingualLabelledCollection(*list(zip(*list(lang_labelledCollection.items()))))

    def langs(self):
        return list(sorted(self.llc.keys()))

    def __getitem__(self, lang) -> LabelledCollection:
        return self.llc[lang]

    @classmethod
    def load(cls, path: str, loader_func: callable):
        return MultilingualLabelledCollection(*loader_func(path))

    def __len__(self):
        return sum(map(len, self.llc.values()))

    def prevalence(self):
        # overall class prevalence: the size-weighted average of per-language prevalences
        prev = np.asarray([lc.prevalence() * len(lc) for lc in self.llc.values()]).sum(axis=0)
        return prev / prev.sum()

    def language_prevalence(self):
        # fraction of instances contributed by each language (sorted by language code)
        lang_count = np.asarray([len(self.llc[l]) for l in self.langs()])
        return lang_count / lang_count.sum()

    def counts(self):
        return np.asarray([lc.counts() for lc in self.llc.values()]).sum(axis=0)

    @property
    def n_classes(self):
        return len(self.classes_)

    @property
    def binary(self):
        return self.n_classes == 2

    def __check_langs(self, l_dict: dict):
        assert len(l_dict) == len(self.langs()), 'wrong number of languages'
        assert all(l in l_dict for l in self.langs()), 'missing languages in the given dict'

    def __check_sizes(self, l_sizes: Union[int, dict]):
        # an int is interpreted as "this many instances per language"
        assert isinstance(l_sizes, (int, dict)), 'unexpected type for l_sizes'
        if isinstance(l_sizes, int):
            return {l: l_sizes for l in self.langs()}
        self.__check_langs(l_sizes)
        return l_sizes

    def sampling_index(self, l_sizes: Union[int, dict], *prevs, shuffle=True):
        l_sizes = self.__check_sizes(l_sizes)
        return {l: lc.sampling_index(l_sizes[l], *prevs, shuffle=shuffle) for l, lc in self.llc.items()}

    def uniform_sampling_index(self, l_sizes: Union[int, dict]):
        l_sizes = self.__check_sizes(l_sizes)
        return {l: lc.uniform_sampling_index(l_sizes[l]) for l, lc in self.llc.items()}

    def uniform_sampling(self, l_sizes: Union[int, dict]):
        l_sizes = self.__check_sizes(l_sizes)
        return MultilingualLabelledCollection.fromLangDict(
            {l: lc.uniform_sampling(l_sizes[l]) for l, lc in self.llc.items()}
        )

    def sampling(self, l_sizes: Union[int, dict], *prevs, shuffle=True):
        l_sizes = self.__check_sizes(l_sizes)
        return MultilingualLabelledCollection.fromLangDict(
            {l: lc.sampling(l_sizes[l], *prevs, shuffle=shuffle) for l, lc in self.llc.items()}
        )

    def sampling_from_index(self, l_index: dict):
        self.__check_langs(l_index)
        return MultilingualLabelledCollection.fromLangDict(
            {l: lc.sampling_from_index(l_index[l]) for l, lc in self.llc.items()}
        )

    def split_stratified(self, train_prop=0.6, random_state=None):
        train, test = list(zip(*[self[l].split_stratified(train_prop, random_state) for l in self.langs()]))
        return MultilingualLabelledCollection(self.langs(), train), MultilingualLabelledCollection(self.langs(), test)

    def asLabelledCollection(self, return_langs=False):
        # flatten into a single LabelledCollection; one (langs_i, Xi, yi) tuple per language
        lXy_list = [([l] * len(lc), *lc.Xy) for l, lc in self.llc.items()]
        ls, Xs, ys = list(zip(*lXy_list))
        ls = np.concatenate(ls)
        # use scipy's vstack for sparse matrices, numpy's otherwise
        vertstack = vstack if issparse(Xs[0]) else np.vstack
        Xs = vertstack(Xs)
        ys = np.concatenate(ys)
        lc = LabelledCollection(Xs, ys, classes_=self.classes_)
        # note: "return lc, ls if return_langs else lc" would always build a 2-tuple
        # (the conditional binds tighter than the comma), so branch explicitly
        if return_langs:
            return lc, ls
        return lc


class Dataset:

    def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):
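
A hedged usage sketch of the new class (the import path, the toy data, and the exact semantics of the prevalence
argument are assumptions for illustration, not confirmed by this commit):

    import numpy as np
    from quapy.data import LabelledCollection

    # two toy single-language collections sharing the class set {0, 1}
    en = LabelledCollection(np.random.rand(100, 5), np.random.randint(0, 2, 100))
    it = LabelledCollection(np.random.rand(80, 5), np.random.randint(0, 2, 80))

    mlc = MultilingualLabelledCollection(['en', 'it'], [en, it])
    print(len(mlc))                   # 180
    print(mlc.language_prevalence())  # [100/180, 80/180]

    # 50 documents per language, drawn at the requested class prevalence
    sample = mlc.sampling(50, 0.3)
    print(sample['en'].prevalence())  # close to the requested prevalence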