kde working

This commit is contained in:
Alejandro Moreo Fernandez 2024-04-19 18:16:14 +02:00
parent 985f430d52
commit bc656fe207
9 changed files with 620 additions and 74 deletions

View File

@ -0,0 +1,82 @@
import itertools
import os.path
import pickle
from collections import defaultdict
from pathlib import Path
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import quapy as qp
from Retrieval.commons import RetrievedSamples, load_sample
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.data.base import LabelledCollection
from os.path import join
from tqdm import tqdm
from result_table.src.table import Table
"""
"""
data_home = 'data'
datasets = ['continent', 'gender', 'years_category', 'relative_pageviews_category', 'num_sitelinks_category']
param_grid = {'C': np.logspace(-4, 4, 9), 'class_weight': ['balanced', None]}
# param_grid = {'C': np.logspace(-1, 1, 2)}
classifiers = [
('LR', LogisticRegression(max_iter=5000), param_grid),
('SVM', LinearSVC(), param_grid)
]
def benchmark_name(class_name):
return class_name.replace('_', '\_')
table = Table(name=f'accuracy', benchmarks=[benchmark_name(d) for d in datasets])
table.format.show_std = False
table.format.stat_test = None
table.format.lower_is_better = False
for class_name, (cls_name, cls, grid) in itertools.product(datasets, classifiers):
train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json') # <-------- fixed classifier
texts, labels = load_sample(train_data_path, class_name=class_name)
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3)
Xtr = tfidf.fit_transform(texts)
print(f'Xtr shape={Xtr.shape}')
print('training classifier...', end='')
classifier = GridSearchCV(
cls,
param_grid=grid,
n_jobs=-1,
cv=5,
verbose=10
)
classifier.fit(Xtr, labels)
classifier_acc = classifier.best_score_
classifier_acc_per_fold = classifier.cv_results_['mean_test_score'][classifier.best_index_]
print(f'[done] best-params={classifier.best_params_} got {classifier_acc:.4f} score, per fold {classifier_acc_per_fold}')
table.add(benchmark=benchmark_name(class_name), method=cls_name, v=classifier_acc_per_fold)
Table.LatexPDF(f'./latex/classifier_Acc.pdf', tables=[table])

View File

@ -8,118 +8,92 @@ from quapy.protocol import AbstractProtocol
import json
def load_sample(path, class_name, max_lines=-1):
def load_sample(path, class_name):
"""
Loads a sample json as a dataframe and returns text and labels for
the given class_name
:param path: path to a json file
:param class_name: string representing the target class
:param max_lines: if provided and > 0 then returns only the
first requested number of instances
:return: texts and labels for class_name
:return: texts, labels for class_name
"""
df = pd.read_json(path)
text = df.text.values
try:
labels = df[class_name].values
except KeyError as e:
print(f'error in {path}; key {class_name} not found')
raise e
if max_lines is not None and max_lines>0:
text = text[:max_lines]
labels = labels[:max_lines]
labels = df[class_name].values
return text, labels
class TextRankings:
def get_text_label_score(df, class_name, vectorizer=None, filter_classes=None):
text = df.text.values
labels = df[class_name].values
rel_score = df.score.values
def __init__(self, path, class_name):
self.obj = json.load(open(path, 'rt'))
self.class_name = class_name
if filter_classes is not None:
idx = np.isin(labels, filter_classes)
text = text[idx]
labels = labels[idx]
rel_score = rel_score[idx]
def get_sample_Xy(self, sample_id, max_lines=-1):
sample_id = str(sample_id)
O = self.obj
docs_ids = [doc_id for doc_id, query_id in O['qid'].items() if query_id == sample_id]
texts = [O['text'][doc_id] for doc_id in docs_ids]
labels = [O[self.class_name][doc_id] for doc_id in docs_ids]
if max_lines > 0 and len(texts) > max_lines:
ranks = [int(O['rank'][doc_id]) for doc_id in docs_ids]
sel = np.argsort(ranks)[:max_lines]
texts = np.asarray(texts)[sel]
labels = np.asarray(labels)[sel]
if vectorizer is not None:
text = vectorizer.transform(text)
return texts, labels
order = np.argsort(-rel_score)
return text[order], labels[order], rel_score[order]
def filter_by_classes(X, y, classes):
idx = np.isin(y, classes)
return X[idx], y[idx]
class RetrievedSamples(AbstractProtocol):
class RetrievedSamples:
def __init__(self,
class_home: str,
test_rankings_path: str,
load_fn,
vectorizer,
class_name,
max_train_lines=None,
max_test_lines=None,
classes=None
):
self.class_home = class_home
self.test_rankings_df = pd.read_json(test_rankings_path)
self.load_fn = load_fn
self.vectorizer = vectorizer
self.class_name = class_name
self.max_train_lines = max_train_lines
self.max_test_lines = max_test_lines
self.classes=classes
def __call__(self):
tests_df = self.test_rankings_df
class_name = self.class_name
vectorizer = self.vectorizer
for file in self._list_queries():
texts, y = self.load_fn(file, class_name=self.class_name, max_lines=self.max_train_lines)
texts, y = filter_by_classes(texts, y, self.classes)
X = self.vectorizer.transform(texts)
train_sample = LabelledCollection(X, y, classes=self.classes)
# loads the training sample
train_df = pd.read_json(file)
Xtr, ytr, score_tr = get_text_label_score(train_df, class_name, vectorizer, filter_classes=self.classes)
# loads the test sample
query_id = self._get_query_id_from_path(file)
texts, y = self._get_test_sample(query_id, max_lines=self.max_test_lines)
texts, y = filter_by_classes(texts, y, self.classes)
X = self.vectorizer.transform(texts)
try:
test_sample = LabelledCollection(X, y, classes=train_sample.classes_)
yield train_sample, test_sample
except ValueError as e:
print(f'file {file} caused an exception: {e}')
yield None, None
sel_df = tests_df[tests_df.qid == int(query_id)]
Xte, yte, score_te = get_text_label_score(sel_df, class_name, vectorizer, filter_classes=self.classes)
yield (Xtr, ytr, score_tr), (Xte, yte, score_te)
def _list_queries(self):
return sorted(glob(join(self.class_home, 'training_Query*200SPLIT.json')))
def _get_test_sample(self, query_id, max_lines=-1):
df = self.test_rankings_df
sel_df = df[df.qid==int(query_id)]
texts = sel_df.text.values
try:
labels = sel_df[self.class_name].values
except KeyError as e:
print(f'error: key {self.class_name} not found in test rankings')
raise e
if max_lines > 0 and len(texts) > max_lines:
ranks = sel_df.rank.values
idx = np.argsort(ranks)[:max_lines]
texts = np.asarray(texts)[idx]
labels = np.asarray(labels)[idx]
return texts, labels
# def _get_test_sample(self, query_id, max_lines=-1):
# df = self.test_rankings_df
# sel_df = df[df.qid==int(query_id)]
# return get_text_label_score(sel_df)
# texts = sel_df.text.values
# try:
# labels = sel_df[self.class_name].values
# except KeyError as e:
# print(f'error: key {self.class_name} not found in test rankings')
# raise e
# if max_lines > 0 and len(texts) > max_lines:
# ranks = sel_df.rank.values
# idx = np.argsort(ranks)[:max_lines]
# texts = np.asarray(texts)[idx]
# labels = np.asarray(labels)[idx]
# return texts, labels
def total(self):
return len(self._list_queries())
@ -130,4 +104,6 @@ class RetrievedSamples(AbstractProtocol):
qid = path
qid = qid[:qid.index(posfix)]
qid = qid[qid.index(prefix) + len(prefix):]
return qid
return qid

245
Retrieval/experiments.py Normal file
View File

@ -0,0 +1,245 @@
import os.path
import pickle
from collections import defaultdict
from pathlib import Path
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import quapy as qp
from Retrieval.commons import RetrievedSamples, load_sample
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.data.base import LabelledCollection
from os.path import join
from tqdm import tqdm
from result_table.src.table import Table
"""
In this sixth experiment, we have a collection C of >6M documents.
We split C in two equally-sized pools TrPool, TePool
I have randomly split the collection in 50% train and 50% split. In each split we have approx. 3.25 million documents.
We have 5 categories we can evaluate over: Continent, Years_Category, Num_Site_Links, Relative Pageviews and Gender.
From the training set I have created smaller subsets for each category:
100K, 500K, 1M and FULL (3.25M)
For each category and subset, I have created a training set called: "classifier_training.json". This is the "base" training set for the classifier. In this set we have 500 documents per group in a category. (For example: Male 500, Female 500, Unknown 500). Let me know if you think we need more.
To "bias" the quantifier towards a query, I have executed the queries (97) on the different training sets and retrieved the 200 most relevant documents per group.
For example: (Male 200, Female 200, Unknown 200)
Sometimes this is infeasible, we should probably discuss this at some point.
You can find the results for every query in a file named:
"training_Query-[QID]Sample-200SPLIT.json"
Test:
To evaluate our approach, I have executed the queries on the test split. You can find the results for all 97 queries up till k=1000 in this file.
testRanking_Results.json
"""
def methods(classifier, class_name):
kde_param = {
'continent': 0.18,
'gender': 0.12,
'years_category':0.09
}
yield ('Naive', Naive())
yield ('NaiveQuery', Naive())
yield ('CC', ClassifyAndCount(classifier))
# yield ('PCC', PCC(classifier))
# yield ('ACC', ACC(classifier, val_split=5, n_jobs=-1))
yield ('PACC', PACC(classifier, val_split=5, n_jobs=-1))
# yield ('EMQ', EMQ(classifier, exact_train_prev=True))
# yield ('EMQ-Platt', EMQ(classifier, exact_train_prev=True, recalib='platt'))
# yield ('EMQh', EMQ(classifier, exact_train_prev=False))
# yield ('EMQ-BCTS', EMQ(classifier, exact_train_prev=True, recalib='bcts'))
# yield ('EMQ-TS', EMQ(classifier, exact_train_prev=False, recalib='ts'))
# yield ('EMQ-NBVS', EMQ(classifier, exact_train_prev=False, recalib='nbvs'))
# yield ('EMQ-VS', EMQ(classifier, exact_train_prev=False, recalib='vs'))
# yield ('KDE001', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.001))
# yield ('KDE005', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.005)) # <-- wow!
# yield ('KDE01', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01))
# yield ('KDE02', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.02))
# yield ('KDE03', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.03))
# yield ('KDE-silver', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth='silverman'))
# yield ('KDE-scott', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth='scott'))
yield ('KDE-opt', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=kde_param[class_name]))
yield ('KDE01', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01))
yield ('KDE02', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.02))
yield ('KDE03', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.03))
yield ('KDE04', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.04))
yield ('KDE05', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.05))
yield ('KDE07', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.07))
# yield ('KDE10', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.10))
def train_classifier(train_path):
"""
Trains a classifier. To do so, it loads the training set, transforms it into a tfidf representation.
The classifier is Logistic Regression, with hyperparameters C (range [0.001, 0.01, ..., 1000]) and
class_weight (range {'balanced', None}) optimized via 5FCV.
:return: the tfidf-vectorizer and the classifier trained
"""
texts, labels = load_sample(train_path, class_name=class_name)
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3)
Xtr = tfidf.fit_transform(texts)
print(f'Xtr shape={Xtr.shape}')
print('training classifier...', end='')
classifier = LogisticRegression(max_iter=5000)
classifier = GridSearchCV(
classifier,
param_grid={'C': np.logspace(-4, 4, 9), 'class_weight': ['balanced', None]},
n_jobs=-1,
cv=5
)
classifier.fit(Xtr, labels)
classifier = classifier.best_estimator_
classifier_acc = classifier.best_score_
print(f'[done] best-params={classifier.best_params_} got {classifier_acc:.4f} score')
training = LabelledCollection(Xtr, labels)
print('training classes:', training.classes_)
print('training prevalence:', training.prevalence())
return tfidf, classifier
def reduceAtK(data: LabelledCollection, k):
# if k > len(data):
# print(f'[warning] {k=}>{len(data)=}')
X, y = data.Xy
X = X[:k]
y = y[:k]
return LabelledCollection(X, y, classes=data.classes_)
def benchmark_name(class_name, k):
scape_class_name = class_name.replace('_', '\_')
return f'{scape_class_name}@{k}'
def run_experiment():
results = {
'mae': {k: [] for k in Ks},
'mrae': {k: [] for k in Ks}
}
pbar = tqdm(experiment_prot(), total=experiment_prot.total())
for train, test in pbar:
Xtr, ytr, score_tr = train
Xte, yte, score_te = test
if HALF:
n = len(ytr) // 2
train_col = LabelledCollection(Xtr[:n], ytr[:n], classes=classifier_trained.classes_)
else:
train_col = LabelledCollection(Xtr, ytr, classes=classifier_trained.classes_)
if method_name not in ['Naive', 'NaiveQuery']:
quantifier.fit(train_col, val_split=train_col, fit_classifier=False)
elif method_name == 'Naive':
quantifier.fit(train_col)
test_col = LabelledCollection(Xte, yte, classes=classifier_trained.classes_)
for k in Ks:
test_k = reduceAtK(test_col, k)
if method_name == 'NaiveQuery':
train_k = reduceAtK(train_col, k)
quantifier.fit(train_k)
estim_prev = quantifier.quantify(test_k.instances)
mae = qp.error.mae(test_k.prevalence(), estim_prev)
mrae = qp.error.mrae(test_k.prevalence(), estim_prev, eps=(1. / (2 * k)))
results['mae'][k].append(mae)
results['mrae'][k].append(mrae)
pbar.set_description(f'{method_name}')
return results
data_home = 'data'
HALF=True
exp_posfix = '_half'
method_names = [name for name, *other in methods(None, 'continent')]
Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
for class_name in ['gender', 'continent', 'years_category']: # 'relative_pageviews_category', 'num_sitelinks_category']:
tables_mae, tables_mrae = [], []
benchmarks = [benchmark_name(class_name, k) for k in Ks]
for data_size in ['10K', '50K', '100K', '500K', '1M', 'FULL']:
table_mae = Table(name=f'{class_name}-{data_size}-mae', benchmarks=benchmarks, methods=method_names)
table_mrae = Table(name=f'{class_name}-{data_size}-mrae', benchmarks=benchmarks, methods=method_names)
table_mae.format.mean_prec = 5
table_mae.format.remove_zero = True
table_mae.format.color_mode = 'global'
tables_mae.append(table_mae)
tables_mrae.append(table_mrae)
class_home = join(data_home, class_name, data_size)
# train_data_path = join(class_home, 'classifier_training.json')
# classifier_path = join('classifiers', data_size, f'classifier_{class_name}.pkl')
train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json') # <-------- fixed classifier
classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl') # <------------ fixed classifier
test_rankings_path = join(data_home, 'testRanking_Results.json')
results_home = join('results'+exp_posfix, class_name, data_size)
tfidf, classifier_trained = qp.util.pickled_resource(classifier_path, train_classifier, train_data_path)
experiment_prot = RetrievedSamples(
class_home,
test_rankings_path,
vectorizer=tfidf,
class_name=class_name,
classes=classifier_trained.classes_
)
for method_name, quantifier in methods(classifier_trained, class_name):
results_path = join(results_home, method_name + '.pkl')
if os.path.exists(results_path):
print(f'Method {method_name=} already computed')
results = pickle.load(open(results_path, 'rb'))
else:
results = run_experiment()
os.makedirs(Path(results_path).parent, exist_ok=True)
pickle.dump(results, open(results_path, 'wb'), pickle.HIGHEST_PROTOCOL)
for k in Ks:
table_mae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mae'][k])
table_mrae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mrae'][k])
# Table.LatexPDF(f'./latex{exp_posfix}/{class_name}{exp_posfix}.pdf', tables=tables_mae+tables_mrae)
Table.LatexPDF(f'./latex{exp_posfix}/{class_name}{exp_posfix}.pdf', tables=tables_mrae)

View File

@ -0,0 +1,77 @@
import itertools
import os.path
import pickle
from collections import defaultdict
from pathlib import Path
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import quapy as qp
from Retrieval.commons import RetrievedSamples, load_sample
from quapy.protocol import UPP
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
from quapy.model_selection import GridSearchQ
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.data.base import LabelledCollection
from os.path import join
from tqdm import tqdm
from result_table.src.table import Table
"""
"""
data_home = 'data'
datasets = ['continent', 'gender', 'years_category'] #, 'relative_pageviews_category', 'num_sitelinks_category']
for class_name in datasets:
train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json') # <-------- fixed classifier
texts, labels = load_sample(train_data_path, class_name=class_name)
classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl')
tfidf, classifier_trained = pickle.load(open(classifier_path, 'rb'))
classifier_hyper = classifier_trained.get_params()
print(f'{classifier_hyper=}')
X = tfidf.transform(texts)
print(f'Xtr shape={X.shape}')
pool = LabelledCollection(X, labels)
train, val = pool.split_stratified(train_prop=0.5, random_state=0)
q = KDEyML(LogisticRegression())
classifier_hyper = {'classifier__C':[classifier_hyper['C'], 0.00000001], 'classifier__class_weight':[classifier_hyper['class_weight']]}
quantifier_hyper = {'bandwidth': np.linspace(0.01, 0.2, 20)}
hyper = {**classifier_hyper, **quantifier_hyper}
qp.environ['SAMPLE_SIZE'] = 100
modsel = GridSearchQ(
model=q,
param_grid=hyper,
protocol=UPP(val, sample_size=100),
n_jobs=-1,
error='mrae',
verbose=True
)
modsel.fit(train)
print(class_name)
print(f'{modsel.best_params_}')
print(f'{modsel.best_score_}')

View File

@ -0,0 +1,105 @@
import os.path
import pickle
from collections import defaultdict
from itertools import zip_longest
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import quapy as qp
import quapy.functional as F
from Retrieval.commons import RetrievedSamples, load_sample
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
from tqdm import tqdm
from result_table.src.table import Table
import numpy as np
import matplotlib.pyplot as plt
"""
Plots the distribution of (predicted) relevance score for the test samples and for the training samples wrt:
- training pool size (100K, 500K, 1M, FULL)
- rank
"""
data_home = 'data'
Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
for class_name in ['num_sitelinks_category', 'relative_pageviews_category', 'years_category', 'continent', 'gender']:
test_added = False
Mtrs, Mtes, source = [], [], []
for data_size in ['10K', '50K', '100K', '500K', '1M', 'FULL']:
class_home = join(data_home, class_name, data_size)
classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl')
test_rankings_path = join(data_home, 'testRanking_Results.json')
_, classifier = pickle.load(open(classifier_path, 'rb'))
experiment_prot = RetrievedSamples(
class_home,
test_rankings_path,
vectorizer=None,
class_name=class_name,
classes=classifier.classes_
)
Mtr = []
Mte = []
pbar = tqdm(experiment_prot(), total=experiment_prot.total())
for train, test in pbar:
Xtr, ytr, score_tr = train
Xte, yte, score_te = test
Mtr.append(score_tr)
Mte.append(score_te)
Mtrs.append(Mtr)
if not test_added:
Mtes.append(Mte)
test_added = True
source.append(data_size)
fig, ax = plt.subplots()
train_source = ['train-'+s for s in source]
Ms = list(zip(Mtrs, train_source))+list(zip(Mtes, ['test']))
for M, source in Ms:
M = np.asarray(list(zip_longest(*M, fillvalue=np.nan))).T
num_rep, num_docs = M.shape
mean_values = np.nanmean(M, axis=0)
n_filled = np.count_nonzero(~np.isnan(M), axis=0)
std_errors = np.nanstd(M, axis=0) / np.sqrt(n_filled)
line = ax.plot(range(num_docs), mean_values, '-', label=source, color=None)
color = line[-1].get_color()
ax.fill_between(range(num_docs), mean_values - std_errors, mean_values + std_errors, alpha=0.3, color=color)
ax.set_xlabel('Doc. Rank')
ax.set_ylabel('Rel. Score')
ax.set_title(class_name)
ax.legend()
# plt.show()
os.makedirs('plots', exist_ok=True)
plotpath = f'plots/{class_name}.pdf'
print(f'saving plot in {plotpath}')
plt.savefig(plotpath)

27
Retrieval/tmp.py Normal file
View File

@ -0,0 +1,27 @@
import pandas as pd
from os.path import join
from Retrieval.commons import load_json_sample
from quapy.data import LabelledCollection
data_home = 'data'
CLASS_NAME = 'continent'
datasize = '100K'
file_path = join(data_home, CLASS_NAME, datasize, 'training_Query-84Sample-200SPLIT.json')
text, classes = load_json_sample(file_path, CLASS_NAME)
data = LabelledCollection(text, classes)
print(data.classes_)
print(data.prevalence())
print('done')
test_ranking_path = join(data_home, 'testRanking_Results.json')
# obj = json.load(open(test_ranking_path))
df = pd.read_json(test_ranking_path)
print('done')

View File

@ -141,6 +141,19 @@ def uniform_prevalence_sampling(n_classes, size=1):
return u
def uniform_prevalence(n_classes):
"""
Returns a vector representing the uniform distribution for `n_classes`
:param n_classes: number of classes
:return: np.ndarray with all values 1/n_classes
"""
assert isinstance(n_classes, int) and n_classes>0, \
(f'param {n_classes} not understood; must be a positive integer representing the '
f'number of classes ')
return np.full(shape=n_classes, fill_value=1./n_classes)
uniform_simplex_sampling = uniform_prevalence_sampling

View File

@ -62,7 +62,13 @@ class KDEBase:
:param bandwidth: float, the bandwidth of the kernel
:return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates
"""
return [self.get_kde_function(X[y == cat], bandwidth) for cat in classes]
class_cond_X = []
for cat in classes:
selX = X[y==cat]
if selX.size==0:
selX = [F.uniform_prevalence(len(classes))]
class_cond_X.append(selX)
return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X]

View File

@ -2,7 +2,7 @@ from abc import ABC, abstractmethod
from copy import deepcopy
from typing import Callable, Union
import numpy as np
from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling
from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling, PlattScaling
from scipy import optimize
from sklearn.base import BaseEstimator
from sklearn.calibration import CalibratedClassifierCV
@ -636,20 +636,35 @@ class EMQ(AggregativeSoftQuantifier):
calibrator = TempScaling()
elif self.recalib == 'vs':
calibrator = VectorScaling()
elif self.recalib == 'platt':
calibrator = CalibratedClassifierCV(estimator=self.classifier, cv='prefit')
else:
raise ValueError('invalid param argument for recalibration method; available ones are '
'"nbvs", "bcts", "ts", and "vs".')
if not np.issubdtype(y.dtype, np.number):
y = np.searchsorted(data.classes_, y)
self.calibration_function = calibrator(P, np.eye(data.n_classes)[y], posterior_supplied=True)
if self.recalib == 'platt':
self.classifier = calibrator.fit(*data.Xy)
else:
print(classif_predictions.prevalence())
try:
self.calibration_function = calibrator(P, np.eye(data.n_classes)[y], posterior_supplied=True)
except RuntimeError as e:
print(e)
print('defaults to I')
self.calibration_function = lambda P:P
if self.exact_train_prev:
self.train_prevalence = data.prevalence()
else:
train_posteriors = classif_predictions.X
if self.recalib is not None:
train_posteriors = self.calibration_function(train_posteriors)
if self.recalib == 'platt':
train_posteriors = self.classifier.predict_proba(train_posteriors)
else:
train_posteriors = self.calibration_function(train_posteriors)
self.train_prevalence = F.prevalence_from_probabilities(train_posteriors)
def aggregate(self, classif_posteriors, epsilon=EPSILON):