refactor: methods requiring a val_split can now declare a default value in their __init__ method, which is used when fit is called without specifying val_split. The val_split argument of fit now defaults to None, meaning that the value given in __init__ (generally 0.4) is used. Also: some UCI datasets added; ensembles can now be optimized for quantification, and can be trained on samples of smaller size.

This commit is contained in:
Alejandro Moreo Fernandez 2021-01-22 18:01:51 +01:00
parent 54dc2980e6
commit 03cf73aff6
11 changed files with 277 additions and 143 deletions

View File

@ -1,3 +1,34 @@
# QuaPy # QuaPy
A Quantification framework written in Python. QuaPy is an open source framework for Quantification (a.k.a. Supervised Prevalence Estimation)
written in Python.
QuaPy is rooted in the concept of a data sample, and provides implementations of
the most important concepts in the quantification literature, such as the main
quantification baselines, many advanced quantification methods,
quantification-oriented model selection, and many evaluation measures and protocols
used for evaluating quantification methods.
QuaPy also integrates commonly used datasets and offers visualization tools
for facilitating the analysis and interpretation of results.
```python
import quapy as qp
from sklearn.linear_model import LogisticRegression
dataset = qp.datasets.fetch_twitter('semeval16')
# create an "Adjusted Classify & Count" quantifier
model = qp.method.aggregative.ACC(LogisticRegression())
model.fit(dataset.training)
prevalences_estim = model.quantify(dataset.test.instances)
prevalences_true = dataset.test.prevalence()
error = qp.error.mae(prevalences_true, prevalences_estim)
print(f'MAE={error:.3f}')
```
binary, and single-label

View File

@ -25,3 +25,4 @@ Rename EMQ to SLD ?
How many times is the system of equations for ACC and PACC not solved? How many times is it clipped? Do they sum up How many times is the system of equations for ACC and PACC not solved? How many times is it clipped? Do they sum up
to one always? to one always?
Parallelize the kFCV in ACC and PACC Parallelize the kFCV in ACC and PACC
Requirements: xlrd for reading Excel files

View File

@ -20,49 +20,64 @@ import shutil
DEBUG = False DEBUG = False
def quantification_models(): def newLR():
def newLR():
return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1) return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)
__C_range = np.logspace(-4, 5, 10)
lr_params = {'C': __C_range, 'class_weight': [None, 'balanced']}
svmperf_params = {'C': __C_range}
__C_range = np.logspace(-4, 5, 10)
lr_params = {'C': __C_range, 'class_weight': [None, 'balanced']}
svmperf_params = {'C': __C_range}
def quantification_models():
# methods tested in Gao & Sebastiani 2016 # methods tested in Gao & Sebastiani 2016
# yield 'cc', CC(newLR()), lr_params yield 'cc', CC(newLR()), lr_params
# yield 'acc', ACC(newLR()), lr_params yield 'acc', ACC(newLR()), lr_params
# yield 'pcc', PCC(newLR()), lr_params yield 'pcc', PCC(newLR()), lr_params
# yield 'pacc', PACC(newLR()), lr_params yield 'pacc', PACC(newLR()), lr_params
# yield 'sld', EMQ(newLR()), lr_params yield 'sld', EMQ(newLR()), lr_params
# yield 'svmq', OneVsAll(SVMQ(args.svmperfpath)), svmperf_params yield 'svmq', OneVsAll(SVMQ(args.svmperfpath)), svmperf_params
# yield 'svmkld', OneVsAll(SVMKLD(args.svmperfpath)), svmperf_params yield 'svmkld', OneVsAll(SVMKLD(args.svmperfpath)), svmperf_params
# yield 'svmnkld', OneVsAll(SVMNKLD(args.svmperfpath)), svmperf_params yield 'svmnkld', OneVsAll(SVMNKLD(args.svmperfpath)), svmperf_params
#
# # methods added
# yield 'svmmae', OneVsAll(SVMAE(args.svmperfpath)), svmperf_params
# yield 'svmmrae', OneVsAll(SVMRAE(args.svmperfpath)), svmperf_params
# yield 'hdy', OneVsAll(HDy(newLR())), lr_params
# methods added
yield 'svmmae', OneVsAll(SVMAE(args.svmperfpath)), svmperf_params
yield 'svmmrae', OneVsAll(SVMRAE(args.svmperfpath)), svmperf_params
yield 'hdy', OneVsAll(HDy(newLR())), lr_params
def quantification_cuda_models():
device = 'cuda' if torch.cuda.is_available() else 'cpu' device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Running QuaNet in {device}') print(f'Running QuaNet in {device}')
if DEBUG: learner = PCALR(**newLR().get_params())
lr_params={'C':[1,10]} yield 'quanet', QuaNet(learner, settings.SAMPLE_SIZE, checkpointdir=args.checkpointdir, device=device), lr_params
yield 'quanet', QuaNet(PCALR(**newLR().get_params()), settings.SAMPLE_SIZE,
lstm_hidden_size=32, lstm_nlayers=1,
tr_iter_per_poch=50, va_iter_per_poch=10,
patience=3,
checkpointdir=args.checkpointdir, device=device), lr_params
else:
yield 'quanet', QuaNet(PCALR(**newLR().get_params()), settings.SAMPLE_SIZE,
checkpointdir=args.checkpointdir, device=device), lr_params
#param_mod_sel={'sample_size':settings.SAMPLE_SIZE, 'n_prevpoints':21, 'n_repetitions':5} def quantification_ensembles():
#yield 'epaccmaeptr', EPACC(newLR(), param_grid=lr_params, optim='mae', policy='ptr', param_mod_sel=param_mod_sel, n_jobs=settings.ENSEMBLE_N_JOBS), None param_mod_sel = {
# yield 'epaccmraeptr', EPACC(newLR(), param_grid=lr_params, optim='mrae', policy='ptr', param_mod_sel=param_mod_sel, n_jobs=settings.ENSEMBLE_N_JOBS), None 'sample_size': settings.SAMPLE_SIZE,
# yield 'epaccmae', EPACC(newLR(), param_grid=lr_params, optim='mae', policy='mae', param_mod_sel=param_mod_sel, n_jobs=settings.ENSEMBLE_N_JOBS), None 'n_prevpoints': 21,
# yield 'epaccmrae', EPACC(newLR(), param_grid=lr_params, optim='mrae', policy='mrae', param_mod_sel=param_mod_sel, n_jobs=settings.ENSEMBLE_N_JOBS), None 'n_repetitions': 5,
'verbose': False
}
common={
'max_sample_size': 500,
'n_jobs': settings.ENSEMBLE_N_JOBS,
'param_grid': lr_params,
'param_mod_sel': param_mod_sel,
'val_split': 0.4
}
#yield 'mlpe', MaximumLikelihoodPrevalenceEstimation(), {} # hyperparameters will be evaluated within each quantifier of the ensemble, and so the typical model selection
# will be skipped (by setting hyperparameters to None)
hyper_none = None
yield 'epaccmaeptr', EPACC(newLR(), optim='mae', policy='ptr', **common), hyper_none
yield 'epaccmaemae', EPACC(newLR(), optim='mae', policy='mae', **common), hyper_none
yield 'esldmaeptr', EEMQ(newLR(), optim='mae', policy='ptr', **common), hyper_none
yield 'esldmaemae', EEMQ(newLR(), optim='mae', policy='mae', **common), hyper_none
yield 'epaccmraeptr', EPACC(newLR(), optim='mrae', policy='ptr', **common), hyper_none
yield 'epaccmraemrae', EPACC(newLR(), optim='mrae', policy='mrae', **common), hyper_none
yield 'esldmraeptr', EEMQ(newLR(), optim='mrae', policy='ptr', **common), hyper_none
yield 'esldmraemrae', EEMQ(newLR(), optim='mrae', policy='mrae', **common), hyper_none
def evaluate_experiment(true_prevalences, estim_prevalences): def evaluate_experiment(true_prevalences, estim_prevalences):
@ -119,10 +134,7 @@ def run(experiment):
benchmark_devel.stats() benchmark_devel.stats()
# model selection (hyperparameter optimization for a quantification-oriented loss) # model selection (hyperparameter optimization for a quantification-oriented loss)
if hyperparams is None: if hyperparams is not None:
model.fit(benchmark_devel.training, benchmark_devel.test)
best_params = {}
else:
model_selection = qp.model_selection.GridSearchQ( model_selection = qp.model_selection.GridSearchQ(
model, model,
param_grid=hyperparams, param_grid=hyperparams,
@ -137,6 +149,8 @@ def run(experiment):
model_selection.fit(benchmark_devel.training, benchmark_devel.test) model_selection.fit(benchmark_devel.training, benchmark_devel.test)
model = model_selection.best_model() model = model_selection.best_model()
best_params = model_selection.best_params_ best_params = model_selection.best_params_
else:
best_params = {}
# model evaluation # model evaluation
test_names = [dataset_name] if dataset_name != 'semeval' else ['semeval13', 'semeval14', 'semeval15'] test_names = [dataset_name] if dataset_name != 'semeval' else ['semeval13', 'semeval14', 'semeval15']
@ -183,9 +197,19 @@ if __name__ == '__main__':
optim_losses = ['mae'] # ['mae', 'mrae'] optim_losses = ['mae'] # ['mae', 'mrae']
datasets = qp.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN datasets = qp.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN
models = quantification_models()
results = Parallel(n_jobs=settings.N_JOBS)( #models = quantification_models()
#Parallel(n_jobs=settings.N_JOBS)(
# delayed(run)(experiment) for experiment in itertools.product(optim_losses, datasets, models)
#)
#models = quantification_cuda_models()
#Parallel(n_jobs=settings.CUDA_N_JOBS)(
# delayed(run)(experiment) for experiment in itertools.product(optim_losses, datasets, models)
#)
models = quantification_ensembles()
Parallel(n_jobs=1)(
delayed(run)(experiment) for experiment in itertools.product(optim_losses, datasets, models) delayed(run)(experiment) for experiment in itertools.product(optim_losses, datasets, models)
) )

View File

@ -1,7 +1,8 @@
import multiprocessing import multiprocessing
N_JOBS = 1 #multiprocessing.cpu_count() N_JOBS = -2 #multiprocessing.cpu_count()
CUDA_N_JOBS = 1
ENSEMBLE_N_JOBS = -2 ENSEMBLE_N_JOBS = -2
SAMPLE_SIZE = 100 SAMPLE_SIZE = 100
assert N_JOBS==1 or ENSEMBLE_N_JOBS==1, 'general N_JOBS and ENSEMBLE_N_JOBS should not be both greater than 1'

View File

@ -92,10 +92,10 @@ class LabelledCollection:
labels = self.labels[index] labels = self.labels[index]
return LabelledCollection(documents, labels, n_classes=self.n_classes) return LabelledCollection(documents, labels, n_classes=self.n_classes)
def split_stratified(self, train_prop=0.6): def split_stratified(self, train_prop=0.6, random_state=None):
# with temp_seed(42): # with temp_seed(42):
tr_docs, te_docs, tr_labels, te_labels = \ tr_docs, te_docs, tr_labels, te_labels = \
train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels) train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels, random_state=random_state)
return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels) return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1): def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1):

View File

@ -1,6 +1,7 @@
import os import os
import zipfile import zipfile
from os.path import join from os.path import join
from urllib.error import HTTPError
import pandas as pd import pandas as pd
@ -137,9 +138,11 @@ UCI_DATASETS = ['acute.a', 'acute.b',
'balance.1', 'balance.2', 'balance.3', 'balance.1', 'balance.2', 'balance.3',
'breast-cancer', 'breast-cancer',
'cmc.1', 'cmc.2', 'cmc.3', 'cmc.1', 'cmc.2', 'cmc.3',
'ctg.1', 'ctg.2', 'ctg.3'] # ongoing... 'ctg.1', 'ctg.2', 'ctg.3',
#'diabetes', # <-- I haven't found this one...
'german'] # ongoing...
def fetch_UCIDataset(dataset_name, data_home=None, verbose=False): def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3):
assert dataset_name in UCI_DATASETS, \ assert dataset_name in UCI_DATASETS, \
f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \ f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
@ -147,22 +150,6 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
if data_home is None: if data_home is None:
data_home = get_quapy_home() data_home = get_quapy_home()
identifier_map = {
'acute.a': 'acute',
'acute.b': 'acute',
'balance.1': 'balance-scale',
'balance.2': 'balance-scale',
'balance.3': 'balance-scale',
'breast-cancer': 'breast-cancer-wisconsin',
'cmc.1': 'cmc',
'cmc.2': 'cmc',
'cmc.3': 'cmc',
'ctg.1': 'ctg',
'ctg.2': 'ctg',
'ctg.3': 'ctg',
}
dataset_fullname = { dataset_fullname = {
'acute.a': 'Acute Inflammations (urinary bladder)', 'acute.a': 'Acute Inflammations (urinary bladder)',
'acute.b': 'Acute Inflammations (renal pelvis)', 'acute.b': 'Acute Inflammations (renal pelvis)',
@ -176,27 +163,64 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
'ctg.1': 'Cardiotocography Data Set (normal)', 'ctg.1': 'Cardiotocography Data Set (normal)',
'ctg.2': 'Cardiotocography Data Set (suspect)', 'ctg.2': 'Cardiotocography Data Set (suspect)',
'ctg.3': 'Cardiotocography Data Set (pathologic)', 'ctg.3': 'Cardiotocography Data Set (pathologic)',
'german': 'Statlog German Credit Data',
} }
data_folder = { # the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use
'acute': 'diagnosis', # to download the raw dataset
'balance-scale': 'balance-scale', identifier_map = {
'breast-cancer-wisconsin': 'breast-cancer-wisconsin', 'acute.a': 'acute',
'cmc': 'cmc' 'acute.b': 'acute',
'balance.1': 'balance-scale',
'balance.2': 'balance-scale',
'balance.3': 'balance-scale',
'breast-cancer': 'breast-cancer-wisconsin',
'cmc.1': 'cmc',
'cmc.2': 'cmc',
'cmc.3': 'cmc',
'ctg.1': '00193',
'ctg.2': '00193',
'ctg.3': '00193',
'german': 'statlog/german'
}
# the filename is the name of the file within the data_folder indexed by the identifier
file_name = {
'acute': 'diagnosis.data',
'balance-scale': 'balance-scale.data',
'breast-cancer-wisconsin': 'breast-cancer-wisconsin.data',
'cmc': 'cmc.data',
'00193': 'CTG.xls',
'statlog/german': 'german.data-numeric'
}
# the filename containing the dataset description (if any)
desc_name = {
'acute': 'diagnosis.names',
'balance-scale': 'balance-scale.names',
'breast-cancer-wisconsin': 'breast-cancer-wisconsin.names',
'cmc': 'cmc.names',
'00193': None,
'statlog/german': 'german.doc'
} }
identifier = identifier_map[dataset_name] identifier = identifier_map[dataset_name]
URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}' URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}'
data_path = join(data_home, 'uci_datasets', identifier) data_dir = join(data_home, 'uci_datasets', identifier)
download_file_if_not_exists(f'{URL}/{data_folder[identifier]}.data', f'{data_path}/{identifier}.data') data_path = join(data_dir, file_name[identifier])
download_file_if_not_exists(f'{URL}/{data_folder[identifier]}.names', f'{data_path}/{identifier}.names') download_file_if_not_exists(f'{URL}/{file_name[identifier]}', data_path)
descfile = desc_name[identifier]
if descfile:
download_file_if_not_exists(f'{URL}/{descfile}', f'{data_dir}/{descfile}')
if verbose: if verbose:
print(open(f'{data_path}/{identifier}.names', 'rt').read()) print(open(f'{data_dir}/{descfile}', 'rt').read())
elif verbose:
print('no file description available')
print(f'Loading {dataset_name} ({dataset_fullname[dataset_name]})') print(f'Loading {dataset_name} ({dataset_fullname[dataset_name]})')
if identifier == 'acute': if identifier == 'acute':
df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, encoding='utf-16', sep='\t') df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')
if dataset_name == 'acute.a': if dataset_name == 'acute.a':
y = binarize(df[6], pos_class='yes') y = binarize(df[6], pos_class='yes')
elif dataset_name == 'acute.b': elif dataset_name == 'acute.b':
@ -208,7 +232,7 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
X = df.loc[:, 0:5].values X = df.loc[:, 0:5].values
if identifier == 'balance-scale': if identifier == 'balance-scale':
df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',') df = pd.read_csv(data_path, header=None, sep=',')
if dataset_name == 'balance.1': if dataset_name == 'balance.1':
y = binarize(df[0], pos_class='L') y = binarize(df[0], pos_class='L')
elif dataset_name == 'balance.2': elif dataset_name == 'balance.2':
@ -218,7 +242,7 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
X = df.loc[:, 1:].astype(float).values X = df.loc[:, 1:].astype(float).values
if identifier == 'breast-cancer-wisconsin': if identifier == 'breast-cancer-wisconsin':
df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',') df = pd.read_csv(data_path, header=None, sep=',')
Xy = df.loc[:, 1:10] Xy = df.loc[:, 1:10]
Xy[Xy=='?']=np.nan Xy[Xy=='?']=np.nan
Xy = Xy.dropna(axis=0) Xy = Xy.dropna(axis=0)
@ -227,7 +251,7 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
y = binarize(Xy[10], pos_class=4) y = binarize(Xy[10], pos_class=4)
if identifier == 'cmc': if identifier == 'cmc':
df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',') df = pd.read_csv(data_path, header=None, sep=',')
X = df.loc[:, 0:8].astype(float).values X = df.loc[:, 0:8].astype(float).values
y = df[9].astype(int).values y = df[9].astype(int).values
if dataset_name == 'cmc.1': if dataset_name == 'cmc.1':
@ -237,25 +261,32 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
elif dataset_name == 'cmc.3': elif dataset_name == 'cmc.3':
y = binarize(y, pos_class=3) y = binarize(y, pos_class=3)
if identifier == '00193':
df = pd.read_excel(data_path, sheet_name='Data', skipfooter=3)
df = df[list(range(1,24))] # select columns numbered (number 23 is the target label)
# replaces the header with the first row
new_header = df.iloc[0] # grab the first row for the header
df = df[1:] # take the data less the header row
df.columns = new_header # set the header row as the df header
X = df.iloc[:, 0:22].astype(float).values
y = df['NSP'].astype(int).values
if dataset_name == 'ctg.1': # 1==Normal
y = binarize(y, pos_class=1)
elif dataset_name == 'ctg.2':
y = binarize(y, pos_class=2) # 1==Suspect
elif dataset_name == 'ctg.3':
y = binarize(y, pos_class=3) # 1==Pathologic
if identifier == 'statlog/german':
df = pd.read_csv(data_path, header=None, delim_whitespace=True)
X = df.iloc[:, 0:24].astype(float).values
y = df[24].astype(int).values
y = binarize(y, pos_class=1)
data = LabelledCollection(X, y) data = LabelledCollection(X, y)
data.stats() data.stats()
raise NotImplementedError() return Dataset(*data.split_stratified(1-test_split, random_state=0))
#print(df)
#print(df.loc[:, 0:5].values)
#print(y)
# X = __read_csv(f'{data_path}/{identifier}.data', separator='\t')
# print(X)
#X, y = from_csv(f'{data_path}/{dataset_name}.data')
#y, classnames = reindex_labels(y)
#def __read_csv(path, separator=','):
# x = []
# for instance in tqdm(open(path, 'rt', encoding='utf-16').readlines(), desc=f'reading {path}'):
# x.append(instance.strip().split(separator))
# return x
def df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float): def df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False) df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)

View File

@ -60,7 +60,6 @@ def artificial_sampling_prediction(
estim_prevalence = quantification_func(sample.instances) estim_prevalence = quantification_func(sample.instances)
return true_prevalence, estim_prevalence return true_prevalence, estim_prevalence
print('predicting')
pbar = tqdm(indexes, desc='[artificial sampling protocol] predicting') if verbose else indexes pbar = tqdm(indexes, desc='[artificial sampling protocol] predicting') if verbose else indexes
results = Parallel(n_jobs=n_jobs)( results = Parallel(n_jobs=n_jobs)(
delayed(_predict_prevalences)(index) for index in pbar delayed(_predict_prevalences)(index) for index in pbar

View File

@ -84,7 +84,7 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
def set_params(self, **parameters): def set_params(self, **parameters):
if isinstance(self.learner, CalibratedClassifierCV): if isinstance(self.learner, CalibratedClassifierCV):
parameters={'base_estimator__'+k:v for k,v in parameters.items()} parameters = {'base_estimator__'+k:v for k,v in parameters.items()}
self.learner.set_params(**parameters) self.learner.set_params(**parameters)
@property @property
@ -172,10 +172,11 @@ class CC(AggregativeQuantifier):
class ACC(AggregativeQuantifier): class ACC(AggregativeQuantifier):
def __init__(self, learner:BaseEstimator): def __init__(self, learner:BaseEstimator, val_split=0.4):
self.learner = learner self.learner = learner
self.val_split = val_split
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.4): def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection]=None):
""" """
Trains a ACC quantifier Trains a ACC quantifier
:param data: the training set :param data: the training set
@ -186,7 +187,8 @@ class ACC(AggregativeQuantifier):
to estimate the parameters to estimate the parameters
:return: self :return: self
""" """
assert val_split is not None, 'val_split cannot be set to None' if val_split is None:
val_split = self.val_split
if isinstance(val_split, int): if isinstance(val_split, int):
# kFCV estimation of parameters # kFCV estimation of parameters
y, y_ = [], [] y, y_ = [], []
@ -256,10 +258,11 @@ class PCC(AggregativeProbabilisticQuantifier):
class PACC(AggregativeProbabilisticQuantifier): class PACC(AggregativeProbabilisticQuantifier):
def __init__(self, learner:BaseEstimator): def __init__(self, learner: BaseEstimator, val_split=0.4):
self.learner = learner self.learner = learner
self.val_split = val_split
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=0.4): def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, int, LabelledCollection]=None):
""" """
Trains a PACC quantifier Trains a PACC quantifier
:param data: the training set :param data: the training set
@ -270,7 +273,9 @@ class PACC(AggregativeProbabilisticQuantifier):
to estimate the parameters to estimate the parameters
:return: self :return: self
""" """
assert val_split is not None, 'val_split cannot be set to None' if val_split is None:
val_split = self.val_split
if isinstance(val_split, int): if isinstance(val_split, int):
# kFCV estimation of parameters # kFCV estimation of parameters
y, y_ = [], [] y, y_ = [], []
@ -374,10 +379,11 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
estimation based on the Hellinger distance. Information Sciences, 218:146164. estimation based on the Hellinger distance. Information Sciences, 218:146164.
""" """
def __init__(self, learner: BaseEstimator): def __init__(self, learner: BaseEstimator, val_split=0.4):
self.learner = learner self.learner = learner
self.val_split = val_split
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=0.4): def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=None):
""" """
Trains a HDy quantifier Trains a HDy quantifier
:param data: the training set :param data: the training set
@ -387,7 +393,9 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):
indicating the validation set itself indicating the validation set itself
:return: self :return: self
""" """
assert val_split is not None, 'val_split cannot be set to None' if val_split is None:
val_split = self.val_split
self._check_binary(data, self.__class__.__name__) self._check_binary(data, self.__class__.__name__)
self.learner, validation = training_helper( self.learner, validation = training_helper(
self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split) self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
@ -498,7 +506,7 @@ class OneVsAll(AggregativeQuantifier):
self.binary_quantifier = binary_quantifier self.binary_quantifier = binary_quantifier
self.n_jobs = n_jobs self.n_jobs = n_jobs
def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, LabelledCollection]=None): def fit(self, data: LabelledCollection, fit_learner=True):
assert not data.binary, \ assert not data.binary, \
f'{self.__class__.__name__} expect non-binary data' f'{self.__class__.__name__} expect non-binary data'
assert isinstance(self.binary_quantifier, BaseQuantifier), \ assert isinstance(self.binary_quantifier, BaseQuantifier), \

View File

@ -34,16 +34,26 @@ class Ensemble(BaseQuantifier):
Information Fusion, 45, 1-15. Information Fusion, 45, 1-15.
""" """
def __init__(self, quantifier: BaseQuantifier, size=50, min_pos=1, red_size=25, policy='ave', n_jobs=1, verbose=True, max_sample_size=None): def __init__(self,
quantifier: BaseQuantifier,
size=50,
red_size=25,
min_pos=1,
policy='ave',
max_sample_size=None,
val_split=None,
n_jobs=1,
verbose=False):
assert policy in Ensemble.VALID_POLICIES, \ assert policy in Ensemble.VALID_POLICIES, \
f'unknown policy={policy}; valid are {Ensemble.VALID_POLICIES}' f'unknown policy={policy}; valid are {Ensemble.VALID_POLICIES}'
assert max_sample_size is None or max_sample_size > 0, \ assert max_sample_size is None or max_sample_size > 0, \
'wrong value for max_sample_size; set to a positive number or None' 'wrong value for max_sample_size; set it to a positive number or None'
self.base_quantifier = quantifier self.base_quantifier = quantifier
self.size = size self.size = size
self.min_pos = min_pos self.min_pos = min_pos
self.red_size = red_size self.red_size = red_size
self.policy = policy self.policy = policy
self.val_split = val_split
self.n_jobs = n_jobs self.n_jobs = n_jobs
self.post_proba_fn = None self.post_proba_fn = None
self.verbose = verbose self.verbose = verbose
@ -53,10 +63,12 @@ class Ensemble(BaseQuantifier):
if self.verbose: if self.verbose:
print('[Ensemble]' + msg) print('[Ensemble]' + msg)
def fit(self, data: qp.data.LabelledCollection, val_split:Union[qp.data.LabelledCollection, float]=None): def fit(self, data: qp.data.LabelledCollection, val_split: Union[qp.data.LabelledCollection, float]=None):
self.sout('Fit') self.sout('Fit')
if self.policy=='ds' and not data.binary: if self.policy=='ds' and not data.binary:
raise ValueError(f'ds policy is only defined for binary quantification, but this dataset is not binary') raise ValueError(f'ds policy is only defined for binary quantification, but this dataset is not binary')
if val_split is None:
val_split = self.val_split
# randomly chooses the prevalences for each member of the ensemble (preventing classes with less than # randomly chooses the prevalences for each member of the ensemble (preventing classes with less than
# min_pos positive examples) # min_pos positive examples)
@ -71,7 +83,8 @@ class Ensemble(BaseQuantifier):
sample_size = len(data) if self.max_sample_size is None else min(self.max_sample_size, len(data)) sample_size = len(data) if self.max_sample_size is None else min(self.max_sample_size, len(data))
self.ensemble = Parallel(n_jobs=self.n_jobs)( self.ensemble = Parallel(n_jobs=self.n_jobs)(
delayed(_delayed_new_instance)( delayed(_delayed_new_instance)(
self.base_quantifier, data, val_split, prev, posteriors, keep_samples=is_static_policy, verbose=self.verbose, sample_size=sample_size self.base_quantifier, data, val_split, prev, posteriors, keep_samples=is_static_policy,
verbose=self.verbose, sample_size=sample_size
) for prev in tqdm(prevs, desc='fitting ensamble') ) for prev in tqdm(prevs, desc='fitting ensamble')
) )
@ -206,15 +219,20 @@ def _delayed_new_instance(base_quantifier,
if verbose: if verbose:
print(f'\tfit-start for prev {F.strprev(prev)}, sample_size={sample_size}') print(f'\tfit-start for prev {F.strprev(prev)}, sample_size={sample_size}')
model = deepcopy(base_quantifier) model = deepcopy(base_quantifier)
if val_split is not None:
if isinstance(val_split, float):
assert 0 < val_split < 1, 'val_split should be in (0,1)'
data, val_split = data.split_stratified(train_prop=1-val_split)
sample_index = data.sampling_index(sample_size, *prev) sample_index = data.sampling_index(sample_size, *prev)
sample = data.sampling_from_index(sample_index) sample = data.sampling_from_index(sample_index)
if val_split is None:
model.fit(sample) if val_split is not None:
else:
if isinstance(val_split, float):
assert 0<val_split<1, 'val_split should be in (0,1)'
sample, val_split = sample.split_stratified(train_prop=1-val_split)
model.fit(sample, val_split=val_split) model.fit(sample, val_split=val_split)
else:
model.fit(sample)
tr_prevalence = sample.prevalence() tr_prevalence = sample.prevalence()
tr_distribution = get_probability_distribution(posteriors[sample_index]) if (posteriors is not None) else None tr_distribution = get_probability_distribution(posteriors[sample_index]) if (posteriors is not None) else None
if verbose: if verbose:
@ -281,35 +299,31 @@ def _check_error(error):
f'the name of an error function in {qp.error.ERROR_NAMES}') f'the name of an error function in {qp.error.ERROR_NAMES}')
def ensembleFactory(learner, base_quantifier_class, param_grid=None, optim=None, def ensembleFactory(learner, base_quantifier_class, param_grid=None, optim=None, param_model_sel:dict=None, **kwargs):
param_model_sel:dict=None,
size=50, min_pos=1, red_size=25, policy='ave', n_jobs=1, max_sample_size=None):
if optim is not None: if optim is not None:
if param_grid is None: if param_grid is None:
raise ValueError(f'param_grid is None but optim was requested.') raise ValueError(f'param_grid is None but optim was requested.')
if param_model_sel is None: if param_model_sel is None:
raise ValueError(f'param_model_sel is None but optim was requested.') raise ValueError(f'param_model_sel is None but optim was requested.')
error = _check_error(optim) error = _check_error(optim)
return _instantiate_ensemble(learner, base_quantifier_class, param_grid, error, param_model_sel, return _instantiate_ensemble(learner, base_quantifier_class, param_grid, error, param_model_sel, **kwargs)
size=size, min_pos=min_pos, red_size=red_size,
policy=policy, n_jobs=n_jobs, max_sample_size=max_sample_size)
def ECC(learner, param_grid=None, optim=None, param_mod_sel=None, size=50, min_pos=1, red_size=25, policy='ave', n_jobs=1, max_sample_size=None): def ECC(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
return ensembleFactory(learner, CC, param_grid, optim, param_mod_sel, size, min_pos, red_size, policy, n_jobs, max_sample_size=max_sample_size) return ensembleFactory(learner, CC, param_grid, optim, param_mod_sel, **kwargs)
def EACC(learner, param_grid=None, optim=None, param_mod_sel=None, size=50, min_pos=1, red_size=25, policy='ave', n_jobs=1, max_sample_size=None): def EACC(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
return ensembleFactory(learner, ACC, param_grid, optim, param_mod_sel, size, min_pos, red_size, policy, n_jobs, max_sample_size=max_sample_size) return ensembleFactory(learner, ACC, param_grid, optim, param_mod_sel, **kwargs)
def EPACC(learner, param_grid=None, optim=None, param_mod_sel=None, size=50, min_pos=1, red_size=25, policy='ave', n_jobs=1, max_sample_size=None): def EPACC(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
return ensembleFactory(learner, PACC, param_grid, optim, param_mod_sel, size, min_pos, red_size, policy, n_jobs, max_sample_size=max_sample_size) return ensembleFactory(learner, PACC, param_grid, optim, param_mod_sel, **kwargs)
def EHDy(learner, param_grid=None, optim=None, param_mod_sel=None, size=50, min_pos=1, red_size=25, policy='ave', n_jobs=1, max_sample_size=None): def EHDy(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
return ensembleFactory(learner, HDy, param_grid, optim, param_mod_sel, size, min_pos, red_size, policy, n_jobs, max_sample_size=max_sample_size) return ensembleFactory(learner, HDy, param_grid, optim, param_mod_sel, **kwargs)
def EEMQ(learner, param_grid=None, optim=None, param_mod_sel=None, size=50, min_pos=1, red_size=25, policy='ave', n_jobs=1, max_sample_size=None): def EEMQ(learner, param_grid=None, optim=None, param_mod_sel=None, **kwargs):
return ensembleFactory(learner, EMQ, param_grid, optim, param_mod_sel, size, min_pos, red_size, policy, n_jobs, max_sample_size=max_sample_size) return ensembleFactory(learner, EMQ, param_grid, optim, param_mod_sel, **kwargs)

View File

@ -21,6 +21,7 @@ class GridSearchQ(BaseQuantifier):
eval_budget : int = None, eval_budget : int = None,
error: Union[Callable, str] = qp.error.mae, error: Union[Callable, str] = qp.error.mae,
refit=False, refit=False,
val_split=0.4,
n_jobs=1, n_jobs=1,
random_seed=42, random_seed=42,
timeout=-1, timeout=-1,
@ -63,6 +64,7 @@ class GridSearchQ(BaseQuantifier):
self.n_repetitions = n_repetitions self.n_repetitions = n_repetitions
self.eval_budget = eval_budget self.eval_budget = eval_budget
self.refit = refit self.refit = refit
self.val_split = val_split
self.n_jobs = n_jobs self.n_jobs = n_jobs
self.random_seed = random_seed self.random_seed = random_seed
self.timeout = timeout self.timeout = timeout
@ -118,12 +120,14 @@ class GridSearchQ(BaseQuantifier):
raise ValueError(f'unexpected error type; must either be a callable function or a str representing\n' raise ValueError(f'unexpected error type; must either be a callable function or a str representing\n'
f'the name of an error function in {qp.error.QUANTIFICATION_ERROR_NAMES}') f'the name of an error function in {qp.error.QUANTIFICATION_ERROR_NAMES}')
def fit(self, training: LabelledCollection, val_split: Union[LabelledCollection, float]=0.4): def fit(self, training: LabelledCollection, val_split: Union[LabelledCollection, float]=None):
""" """
:param training: the training set on which to optimize the hyperparameters :param training: the training set on which to optimize the hyperparameters
:param val_split: either a LabelledCollection on which to test the performance of the different settings, or :param val_split: either a LabelledCollection on which to test the performance of the different settings, or
a float in [0,1] indicating the proportion of labelled data to extract from the training set a float in [0,1] indicating the proportion of labelled data to extract from the training set
""" """
if val_split is None:
val_split = self.val_split
training, val_split = self.__check_training_validation(training, val_split) training, val_split = self.__check_training_validation(training, val_split)
assert isinstance(self.sample_size, int) and self.sample_size > 0, 'sample_size must be a positive integer' assert isinstance(self.sample_size, int) and self.sample_size > 0, 'sample_size must be a positive integer'
self.__check_num_evals(self.n_prevpoints, self.eval_budget, self.n_repetitions, training.n_classes) self.__check_num_evals(self.n_prevpoints, self.eval_budget, self.n_repetitions, training.n_classes)
@ -158,7 +162,7 @@ class GridSearchQ(BaseQuantifier):
model.fit(training) model.fit(training)
true_prevalences, estim_prevalences = artificial_sampling_prediction( true_prevalences, estim_prevalences = artificial_sampling_prediction(
model, val_split, self.sample_size, self.n_prevpoints, self.n_repetitions, n_jobs, self.random_seed, model, val_split, self.sample_size, self.n_prevpoints, self.n_repetitions, n_jobs, self.random_seed,
verbose=True verbose=False
) )
score = self.error(true_prevalences, estim_prevalences) score = self.error(true_prevalences, estim_prevalences)

37
test.py
View File

@ -13,9 +13,7 @@ from quapy.model_selection import GridSearchQ
#qp.datasets.fetch_UCIDataset('acute.b', verbose=True)
#sys.exit(0)
qp.environ['SAMPLE_SIZE'] = 500 qp.environ['SAMPLE_SIZE'] = 500
#param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]} #param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]}
param_grid = {'C': np.logspace(0,3,4), 'class_weight': ['balanced']} param_grid = {'C': np.logspace(0,3,4), 'class_weight': ['balanced']}
@ -26,11 +24,12 @@ binary = False
svmperf_home = './svm_perf_quantification' svmperf_home = './svm_perf_quantification'
if binary: if binary:
dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5) #dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
dataset = qp.datasets.fetch_UCIDataset('german', verbose=True)
#qp.data.preprocessing.index(dataset, inplace=True) #qp.data.preprocessing.index(dataset, inplace=True)
else: else:
dataset = qp.datasets.fetch_twitter('gasp', for_model_selection=False, min_df=5, pickle=True) dataset = qp.datasets.fetch_twitter('gasp', for_model_selection=True, min_df=5, pickle=True)
#dataset.training = dataset.training.sampling(sample_size, 0.2, 0.5, 0.3) #dataset.training = dataset.training.sampling(sample_size, 0.2, 0.5, 0.3)
print(f'dataset loaded: #training={len(dataset.training)} #test={len(dataset.test)}') print(f'dataset loaded: #training={len(dataset.training)} #test={len(dataset.test)}')
@ -57,10 +56,32 @@ print(f'dataset loaded: #training={len(dataset.training)} #test={len(dataset.tes
# model = qp.method.aggregative.ClassifyAndCount(learner) # model = qp.method.aggregative.ClassifyAndCount(learner)
learner = LogisticRegression(max_iter=1000) learner = LogisticRegression(max_iter=1000)
model = qp.method.meta.EPACC(learner, size=10, red_size=5, max_sample_size=200) #model = qp.method.aggregative.PACC(learner)
# param_grid={'C':[1,10,100]}, #model = qp.method.aggregative.ACC(learner)
# optim='mae', param_mod_sel={'sample_size':100, 'n_prevpoints':21, 'n_repetitions':5}, model = qp.method.meta.EPACC(learner, size=10, red_size=5, max_sample_size=500, n_jobs=-1,
# policy='ptr', n_jobs=1) param_grid={'C':[1,10,100]},
optim='mae', param_mod_sel={'sample_size':100, 'n_prevpoints':21, 'n_repetitions':5, 'verbose':True},
policy='ptr',
val_split=0.4)
"""
Problemas:
- La interfaz es muy fea, hay que conocer prácticamente todos los detalles así que no ahorra nada con respecto a crear
un objeto con otros anidados dentro
- El fit genera las prevalences random, y esto hace que después de la model selection, un nuevo fit tire todo el trabajo
hecho.
- El fit de un GridSearchQ tiene dentro un best_estimator, pero después de la model selection, hacer fit otra vez sobre
este objeto no se limita a re-entrenar el modelo con los mejores parámetros, sino que inicia una nueva búsqueda
en modo grid search.
- Posible solución (no vale): sería hacer directamente model selection con el benchmark final, aunque esto haría que los hyper-
parámetros se buscasen en un conjunto diferente del resto de modelos...
- Posible solución:
- Elegir las prevalences en init
-
- Problema: el parámetro val_split es muy ambiguo en todo el framework. Por ejemplo, en EPACC podría ser un float que,
en el caso de un GridSearchQ podría referir al split de validación para los hyperparámetros o al split que usa PACC
para encontrar los parámetros...
"""
# regressor = LinearSVR(max_iter=10000) # regressor = LinearSVR(max_iter=10000)
# param_grid = {'C': np.logspace(-1,3,5)} # param_grid = {'C': np.logspace(-1,3,5)}
# model = AveragePoolQuantification(regressor, sample_size, trials=5000, n_components=500, zscore=False) # model = AveragePoolQuantification(regressor, sample_size, trials=5000, n_components=500, zscore=False)