From a1a716dc4aa5ce7b6008f626d6f319a11b37a0c6 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Wed, 24 Apr 2024 15:27:35 +0200 Subject: [PATCH] trying to select training documents based on test score distribution --- Retrieval/experiments.py | 44 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/Retrieval/experiments.py b/Retrieval/experiments.py index 2630b13..cd7088f 100644 --- a/Retrieval/experiments.py +++ b/Retrieval/experiments.py @@ -62,6 +62,7 @@ def methods(classifier, class_name): # yield ('PCC', PCC(classifier)) # yield ('ACC', ACC(classifier, val_split=5, n_jobs=-1)) yield ('PACC', PACC(classifier, val_split=5, n_jobs=-1)) + yield ('PACC-s', PACC(classifier, val_split=5, n_jobs=-1)) # yield ('EMQ', EMQ(classifier, exact_train_prev=True)) # yield ('EMQ-Platt', EMQ(classifier, exact_train_prev=True, recalib='platt')) # yield ('EMQh', EMQ(classifier, exact_train_prev=False)) @@ -79,6 +80,7 @@ def methods(classifier, class_name): yield ('KDEy-ML', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=kde_param[class_name])) # yield ('KDE005', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.005)) yield ('KDE01', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01)) + yield ('KDE01-s', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01)) # yield ('KDE02', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.02)) # yield ('KDE03', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.03)) # yield ('KDE04', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.04)) @@ -146,13 +148,15 @@ def run_experiment(): Xtr, ytr, score_tr = train Xte, yte, score_te = test - if HALF: + if HALF and not method_name.endswith('-s'): n = len(ytr) // 2 train_col = LabelledCollection(Xtr[:n], ytr[:n], classes=classifier_trained.classes_) else: train_col = LabelledCollection(Xtr, ytr, classes=classifier_trained.classes_) - if method_name not in ['Naive', 'NaiveQuery']: + idx, max_score_round_robin = get_idx_score_matrix_per_class(train_col, score_tr) + + if method_name not in ['Naive', 'NaiveQuery'] and not method_name.endswith('-s'): quantifier.fit(train_col, val_split=train_col, fit_classifier=False) elif method_name == 'Naive': quantifier.fit(train_col) @@ -163,6 +167,11 @@ def run_experiment(): if method_name == 'NaiveQuery': train_k = reduceAtK(train_col, k) quantifier.fit(train_k) + elif method_name.endswith('-s'): + test_min_score = score_te[k] if k < len(score_te) else score_te[-1] + train_k = reduce_train_at_score(train_col, idx, max_score_round_robin, test_min_score) + print(f'{k=}, {test_min_score=} {len(train_k)=}') + quantifier.fit(train_k, val_split=train_k, fit_classifier=False) estim_prev = quantifier.quantify(test_k.instances) @@ -177,6 +186,33 @@ def run_experiment(): return results +def get_idx_score_matrix_per_class(train, score_tr): + classes = train.classes_ + num_classes = len(classes) + num_docs = len(train) + scores = np.zeros(shape=(num_docs, num_classes), dtype=float) + idx = np.full(shape=(num_docs, num_classes), fill_value=-1, dtype=int) + X, y = train.Xy + for i, class_i in enumerate(classes): + class_i_scores = score_tr[y == class_i] + rank_i = np.argwhere(y == class_i).flatten() + scores[:len(class_i_scores), i] = class_i_scores + idx[:len(class_i_scores), i] = rank_i + max_score_round_robin = scores.max(axis=1) + return idx, max_score_round_robin + + +def reduce_train_at_score(train, idx, max_score_round_robin, score_te_at_k, min_docs_per_class=5): + min_index = np.min(np.argwhere(max_score_round_robin