cleaning examples

Alejandro Moreo Fernandez 2025-09-26 15:22:41 +02:00
parent bf71aecf91
commit 3c16536b3d
7 changed files with 36 additions and 28 deletions

View File

@@ -1,4 +1,5 @@
Adapt examples; remaining: example 4-onwards
not working: 4, 4b, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
Add 'platt' to calib options in EMQ?
@@ -8,7 +9,7 @@ Update READMEs, wiki, & examples for new fit-predict interface
Add the fix suggested by Alexander:
For a more general application, I would maybe first stablish a per-class threshold value of plausible prevalence
For a more general application, I would maybe first establish a per-class threshold value of plausible prevalence
based on the number of actual positives and the required sample size; e.g., for sample_size=100 and actual
positives [10, 100, 500] -> [0.1, 1.0, 1.0], meaning that class 0 can be sampled at most at 0.1 prevalence, while
the others can be sampled up to 1. prevalence. Then, when a prevalence value is requested, e.g., [0.33, 0.33, 0.33],
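A minimal sketch of the thresholding idea (hypothetical helper, not QuaPy API; the clipping/renormalization step at the end is an assumption, since the note is cut off here):

import numpy as np

def plausible_prevalence_caps(actual_positives, sample_size):
    # per-class maximum prevalence allowed by the available positives,
    # e.g. positives [10, 100, 500] with sample_size=100 -> caps [0.1, 1.0, 1.0]
    return np.minimum(np.asarray(actual_positives) / sample_size, 1.0)

caps = plausible_prevalence_caps([10, 100, 500], sample_size=100)   # [0.1, 1.0, 1.0]

# one plausible way to honour the caps for a requested prevalence (assumption):
requested = np.array([0.33, 0.33, 0.33])
clipped = np.minimum(requested, caps)
adjusted = clipped / clipped.sum()   # renormalize so the vector sums to 1 again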

View File

@@ -37,7 +37,7 @@ quantifier = EMQ(classifier=LogisticRegression())
param_grid = {
'classifier__C': np.logspace(-3, 3, 7), # classifier-dependent: inverse of regularization strength
'classifier__class_weight': ['balanced', None], # classifier-dependent: weights of each class
'calib': ['bcts', None] # quantifier-dependent: recalibration method (new in v0.1.7)
# 'calib': ['bcts', None] # quantifier-dependent: recalibration method (new in v0.1.7)
}
model_selection = GridSearchQ(quantifier, param_grid, protocol=val_generator, error='mrae', refit=False, verbose=True)
quantifier = model_selection.fit(Xtr, ytr)
@@ -51,4 +51,4 @@ report['estim-prev'] = report['estim-prev'].map(F.strprev)
print(report)
print('Averaged values:')
print(report.mean())
print(report.mean(numeric_only=True))
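For context on the numeric_only change: after the prevalence columns are mapped through F.strprev (as done above for 'estim-prev'), they hold strings, so the average must be restricted to the numeric error columns. A toy pandas illustration with made-up values:

import pandas as pd

# toy report: string-formatted prevalences plus numeric error columns
report = pd.DataFrame({
    'estim-prev': ['[0.300, 0.700]', '[0.550, 0.450]'],
    'mae': [0.02, 0.05],
    'mrae': [0.10, 0.21],
})
print(report.mean(numeric_only=True))   # averages 'mae' and 'mrae' only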

View File

@@ -50,7 +50,7 @@ train_modsel, val = qp.datasets.fetch_twitter('hcr', for_model_selection=True, p
model selection:
We explore the classifier's loss and the classifier's C hyperparameters.
Since our model is actually an instance of OneVsAllAggregative, we need to add the prefix "binary_quantifier", and
since our binary quantifier is an instance of CC, we need to add the prefix "classifier".
since our binary quantifier is an instance of CC (an aggregative quantifier), we need to add the prefix "classifier".
"""
param_grid = {
'binary_quantifier__classifier__loss': ['q', 'kld', 'mae'], # classifier-dependent hyperparameter
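To make the prefix routing concrete, a sketch of how the names compose (the C grid below is illustrative, not taken from the original script): each '__' descends one level, from the one-vs-all wrapper to its binary quantifier and from there to that quantifier's classifier.

param_grid = {
    # OneVsAllAggregative -> binary_quantifier (a CC) -> classifier (the underlying learner)
    'binary_quantifier__classifier__loss': ['q', 'kld', 'mae'],   # loss of the classifier
    'binary_quantifier__classifier__C': [0.01, 0.1, 1, 10, 100],  # illustrative values for C
}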

View File

@@ -20,11 +20,10 @@ train, test = dataset.train_test
# train the text classifier:
cnn_module = CNNnet(dataset.vocabulary_size, dataset.training.n_classes)
cnn_classifier = NeuralClassifierTrainer(cnn_module, device='cuda')
cnn_classifier.fit(*dataset.training.Xy)
# train QuaNet (alternatively, we can set fit_classifier=True and let QuaNet train the classifier)
quantifier = QuaNet(cnn_classifier, device='cuda')
quantifier.fit(train, fit_classifier=False)
quantifier.fit(*train.Xy)
# prediction and evaluation
estim_prevalence = quantifier.predict(test.instances)
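The change above follows the commit-wide move to the new fit-predict interface: quantifiers are now fitted on (X, y) pairs, which LabelledCollection exposes through its Xy property. A small sketch of the equivalent calls:

# the Xy property unpacks a LabelledCollection into (instances, labels)
X, y = train.Xy
quantifier.fit(X, y)                                    # same as quantifier.fit(*train.Xy)
estim_prevalence = quantifier.predict(test.instances)   # prediction is unchanged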

View File

@@ -50,7 +50,7 @@ def quantification_models():
yield 'MAX', MAX(newLR()), lr_params
yield 'MS', MS(newLR()), lr_params
yield 'MS2', MS2(newLR()), lr_params
yield 'sldc', EMQ(newLR(), calib='platt'), lr_params
yield 'sldc', EMQ(newLR()), lr_params
yield 'svmmae', newSVMAE(), svmperf_params
yield 'hdy', HDy(newLR()), lr_params
@@ -98,8 +98,8 @@ def run(experiment):
print(f'running dataset={dataset_name} model={model_name} loss={optim_loss} run={run+1}/5')
# model selection (hyperparameter optimization for a quantification-oriented loss)
train, test = data.train_test
train, val = train.split_stratified()
if hyperparams is not None:
train, val = train.split_stratified()
model_selection = qp.model_selection.GridSearchQ(
deepcopy(model),
param_grid=hyperparams,
@@ -109,11 +109,11 @@ def run(experiment):
timeout=60*60,
verbose=True
)
model_selection.fit(train)
model_selection.fit(*train.Xy)
model = model_selection.best_model()
best_params = model_selection.best_params_
else:
model.fit(data.training)
model.fit(*train.Xy)
best_params = {}
# model evaluation
@@ -121,19 +121,19 @@ def run(experiment):
model,
protocol=APP(test, n_prevalences=21, repeats=100)
)
test_true_prevalence = data.test.prevalence()
test_true_prevalence = test.prevalence()
evaluate_experiment(true_prevalences, estim_prevalences)
save_results(dataset_name, model_name, run, optim_loss,
true_prevalences, estim_prevalences,
data.training.prevalence(), test_true_prevalence,
train.prevalence(), test_true_prevalence,
best_params)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Run experiments for Tweeter Sentiment Quantification')
parser.add_argument('results', metavar='RESULT_PATH', type=str,
help='path to the directory where to store the results')
parser.add_argument('--results', metavar='RESULT_PATH', type=str,
help='path to the directory where to store the results', default='./uci_results')
parser.add_argument('--svmperfpath', metavar='SVMPERF_PATH', type=str, default='../svm_perf_quantification',
help='path to the directory with svmperf')
parser.add_argument('--checkpointdir', metavar='PATH', type=str, default='./checkpoint',

View File

@@ -1401,7 +1401,7 @@ class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
"""
If the base quantifier is not probabilistic, returns a matrix of shape `(n,m,)` with `n` the number of
instances and `m` the number of classes. The entry `(i,j)` is a binary value indicating whether instance
`i `belongs to class `j`. The binary classifications are independent of each other, meaning that an instance
`i` belongs to class `j`. The binary classifications are independent of each other, meaning that an instance
can end up being attributed to 0, 1, or more classes.
If the base quantifier is probabilistic, returns a matrix of shape `(n,m,2)` with `n` the number of instances
and `m` the number of classes. The entry `(i,j,1)` (resp. `(i,j,0)`) is a value in [0,1] indicating the
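A toy numpy illustration of the two output layouts the docstring describes (hand-written values, not produced by the class):

import numpy as np

# crisp base quantifier: shape (n, m) of independent binary decisions;
# an instance can be assigned to 0, 1, or several classes
crisp = np.array([[1, 0, 1],
                  [0, 0, 0]])

# probabilistic base quantifier: shape (n, m, 2); [..., 1] holds the value for
# "instance i belongs to class j", [..., 0] its complement
probs = np.array([[[0.2, 0.8], [0.9, 0.1], [0.4, 0.6]],
                  [[0.7, 0.3], [0.6, 0.4], [0.5, 0.5]]])
positive_posteriors = probs[:, :, 1]   # shape (n, m)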
@@ -1422,6 +1422,10 @@ class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
prevalences = self._parallel(self._delayed_binary_aggregate, classif_predictions)
return F.normalize_prevalence(prevalences)
def aggregation_fit(self, classif_predictions, labels):
self._parallel(self._delayed_binary_aggregate_fit, classif_predictions, labels)
return self
def _delayed_binary_classification(self, c, X):
return self.dict_binary_quantifiers[c].classify(X)
@@ -1429,6 +1433,10 @@ class OneVsAllAggregative(OneVsAllGeneric, AggregativeQuantifier):
# the estimation for the positive class prevalence
return self.dict_binary_quantifiers[c].aggregate(classif_predictions[:, c])[1]
def _delayed_binary_aggregate_fit(self, c, classif_predictions, labels):
# trains the aggregation function of the cth quantifier
return self.dict_binary_quantifiers[c].aggregate_fit(classif_predictions[:, c], labels)
class AggregativeMedianEstimator(BinaryQuantifier):
"""

View File

@@ -89,18 +89,18 @@ class OneVsAllGeneric(OneVsAll, BaseQuantifier):
self.binary_quantifier = binary_quantifier
self.n_jobs = qp._get_njobs(n_jobs)
def fit(self, data: LabelledCollection, fit_classifier=True):
assert not data.binary, f'{self.__class__.__name__} expect non-binary data'
assert fit_classifier == True, 'fit_classifier must be True'
def fit(self, X, y):
self.classes = sorted(np.unique(y))
assert len(self.classes)!=2, f'{self.__class__.__name__} expect non-binary data'
self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
self._parallel(self._delayed_binary_fit, data)
self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in self.classes}
self._parallel(self._delayed_binary_fit, X, y)
return self
def _parallel(self, func, *args, **kwargs):
return np.asarray(
Parallel(n_jobs=self.n_jobs, backend='threading')(
delayed(func)(c, *args, **kwargs) for c in self.classes_
delayed(func)(c, *args, **kwargs) for c in self.classes
)
)
@@ -108,13 +108,13 @@ class OneVsAllGeneric(OneVsAll, BaseQuantifier):
prevalences = self._parallel(self._delayed_binary_predict, X)
return qp.functional.normalize_prevalence(prevalences)
@property
def classes_(self):
return sorted(self.dict_binary_quantifiers.keys())
# @property
# def classes_(self):
# return sorted(self.dict_binary_quantifiers.keys())
def _delayed_binary_predict(self, c, X):
return self.dict_binary_quantifiers[c].predict(X)[1]
def _delayed_binary_fit(self, c, data):
bindata = LabelledCollection(data.instances, data.labels == c, classes=[False, True])
self.dict_binary_quantifiers[c].fit(bindata)
def _delayed_binary_fit(self, c, X, y):
bindata = LabelledCollection(X, y == c, classes=[False, True])
self.dict_binary_quantifiers[c].fit(*bindata.Xy)
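A toy illustration of the per-class binarization performed above (hand-written labels; the c-th binary quantifier is then fitted on this boolean problem):

import numpy as np

y = np.array([0, 1, 2, 1, 0])   # toy multiclass labels
c = 1
y_bin = (y == c)                # [False, True, False, True, False]
# LabelledCollection(X, y == c, classes=[False, True]).Xy unpacks to (X, y_bin),
# which is exactly what the binary quantifier for class c receives in fit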