diff --git a/Retrieval/classifier_kfcv_accuracy.py b/Retrieval/classifier_kfcv_accuracy.py new file mode 100644 index 0000000..6db3d06 --- /dev/null +++ b/Retrieval/classifier_kfcv_accuracy.py @@ -0,0 +1,82 @@ +import itertools +import os.path +import pickle +from collections import defaultdict +from pathlib import Path + +import numpy as np +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import GridSearchCV +from sklearn.svm import LinearSVC + +import quapy as qp +from Retrieval.commons import RetrievedSamples, load_sample +from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive +from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML +from quapy.data.base import LabelledCollection + +from os.path import join +from tqdm import tqdm + +from result_table.src.table import Table + +""" + +""" + +data_home = 'data' + +datasets = ['continent', 'gender', 'years_category', 'relative_pageviews_category', 'num_sitelinks_category'] + +param_grid = {'C': np.logspace(-4, 4, 9), 'class_weight': ['balanced', None]} +# param_grid = {'C': np.logspace(-1, 1, 2)} + +classifiers = [ + ('LR', LogisticRegression(max_iter=5000), param_grid), + ('SVM', LinearSVC(), param_grid) +] + +def benchmark_name(class_name): + return class_name.replace('_', '\_') + +table = Table(name=f'accuracy', benchmarks=[benchmark_name(d) for d in datasets]) +table.format.show_std = False +table.format.stat_test = None +table.format.lower_is_better = False + +for class_name, (cls_name, cls, grid) in itertools.product(datasets, classifiers): + + train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json') # <-------- fixed classifier + + texts, labels = load_sample(train_data_path, class_name=class_name) + + tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3) + Xtr = tfidf.fit_transform(texts) + print(f'Xtr shape={Xtr.shape}') + + print('training classifier...', end='') + classifier = GridSearchCV( + cls, + param_grid=grid, + n_jobs=-1, + cv=5, + verbose=10 + ) + classifier.fit(Xtr, labels) + classifier_acc = classifier.best_score_ + classifier_acc_per_fold = classifier.cv_results_['mean_test_score'][classifier.best_index_] + + print(f'[done] best-params={classifier.best_params_} got {classifier_acc:.4f} score, per fold {classifier_acc_per_fold}') + + table.add(benchmark=benchmark_name(class_name), method=cls_name, v=classifier_acc_per_fold) + + Table.LatexPDF(f'./latex/classifier_Acc.pdf', tables=[table]) + + + + + + + + diff --git a/Retrieval/commons.py b/Retrieval/commons.py index 323d980..ae66ed7 100644 --- a/Retrieval/commons.py +++ b/Retrieval/commons.py @@ -8,118 +8,92 @@ from quapy.protocol import AbstractProtocol import json -def load_sample(path, class_name, max_lines=-1): +def load_sample(path, class_name): """ Loads a sample json as a dataframe and returns text and labels for the given class_name :param path: path to a json file :param class_name: string representing the target class - :param max_lines: if provided and > 0 then returns only the - first requested number of instances - :return: texts and labels for class_name + :return: texts, labels for class_name """ df = pd.read_json(path) text = df.text.values - try: - labels = df[class_name].values - except KeyError as e: - print(f'error in {path}; key {class_name} not found') - raise e - if max_lines is not None and max_lines>0: - text = text[:max_lines] - labels = labels[:max_lines] + labels = 
df[class_name].values return text, labels -class TextRankings: +def get_text_label_score(df, class_name, vectorizer=None, filter_classes=None): + text = df.text.values + labels = df[class_name].values + rel_score = df.score.values - def __init__(self, path, class_name): - self.obj = json.load(open(path, 'rt')) - self.class_name = class_name + if filter_classes is not None: + idx = np.isin(labels, filter_classes) + text = text[idx] + labels = labels[idx] + rel_score = rel_score[idx] - def get_sample_Xy(self, sample_id, max_lines=-1): - sample_id = str(sample_id) - O = self.obj - docs_ids = [doc_id for doc_id, query_id in O['qid'].items() if query_id == sample_id] - texts = [O['text'][doc_id] for doc_id in docs_ids] - labels = [O[self.class_name][doc_id] for doc_id in docs_ids] - if max_lines > 0 and len(texts) > max_lines: - ranks = [int(O['rank'][doc_id]) for doc_id in docs_ids] - sel = np.argsort(ranks)[:max_lines] - texts = np.asarray(texts)[sel] - labels = np.asarray(labels)[sel] + if vectorizer is not None: + text = vectorizer.transform(text) - return texts, labels + order = np.argsort(-rel_score) + return text[order], labels[order], rel_score[order] -def filter_by_classes(X, y, classes): - idx = np.isin(y, classes) - return X[idx], y[idx] - - -class RetrievedSamples(AbstractProtocol): +class RetrievedSamples: def __init__(self, class_home: str, test_rankings_path: str, - load_fn, vectorizer, class_name, - max_train_lines=None, - max_test_lines=None, classes=None ): self.class_home = class_home self.test_rankings_df = pd.read_json(test_rankings_path) - self.load_fn = load_fn self.vectorizer = vectorizer self.class_name = class_name - self.max_train_lines = max_train_lines - self.max_test_lines = max_test_lines self.classes=classes def __call__(self): + tests_df = self.test_rankings_df + class_name = self.class_name + vectorizer = self.vectorizer for file in self._list_queries(): - texts, y = self.load_fn(file, class_name=self.class_name, max_lines=self.max_train_lines) - texts, y = filter_by_classes(texts, y, self.classes) - X = self.vectorizer.transform(texts) - train_sample = LabelledCollection(X, y, classes=self.classes) + # loads the training sample + train_df = pd.read_json(file) + Xtr, ytr, score_tr = get_text_label_score(train_df, class_name, vectorizer, filter_classes=self.classes) + # loads the test sample query_id = self._get_query_id_from_path(file) - texts, y = self._get_test_sample(query_id, max_lines=self.max_test_lines) - texts, y = filter_by_classes(texts, y, self.classes) - X = self.vectorizer.transform(texts) - - try: - test_sample = LabelledCollection(X, y, classes=train_sample.classes_) - yield train_sample, test_sample - except ValueError as e: - print(f'file {file} caused an exception: {e}') - yield None, None + sel_df = tests_df[tests_df.qid == int(query_id)] + Xte, yte, score_te = get_text_label_score(sel_df, class_name, vectorizer, filter_classes=self.classes) + yield (Xtr, ytr, score_tr), (Xte, yte, score_te) def _list_queries(self): return sorted(glob(join(self.class_home, 'training_Query*200SPLIT.json'))) - def _get_test_sample(self, query_id, max_lines=-1): - df = self.test_rankings_df - sel_df = df[df.qid==int(query_id)] - texts = sel_df.text.values - try: - labels = sel_df[self.class_name].values - except KeyError as e: - print(f'error: key {self.class_name} not found in test rankings') - raise e - if max_lines > 0 and len(texts) > max_lines: - ranks = sel_df.rank.values - idx = np.argsort(ranks)[:max_lines] - texts = np.asarray(texts)[idx] - labels = 
np.asarray(labels)[idx] - return texts, labels + # def _get_test_sample(self, query_id, max_lines=-1): + # df = self.test_rankings_df + # sel_df = df[df.qid==int(query_id)] + # return get_text_label_score(sel_df) + # texts = sel_df.text.values + # try: + # labels = sel_df[self.class_name].values + # except KeyError as e: + # print(f'error: key {self.class_name} not found in test rankings') + # raise e + # if max_lines > 0 and len(texts) > max_lines: + # ranks = sel_df.rank.values + # idx = np.argsort(ranks)[:max_lines] + # texts = np.asarray(texts)[idx] + # labels = np.asarray(labels)[idx] + # return texts, labels def total(self): return len(self._list_queries()) @@ -130,4 +104,6 @@ class RetrievedSamples(AbstractProtocol): qid = path qid = qid[:qid.index(posfix)] qid = qid[qid.index(prefix) + len(prefix):] - return qid \ No newline at end of file + return qid + + diff --git a/Retrieval/experiments.py b/Retrieval/experiments.py new file mode 100644 index 0000000..c1450be --- /dev/null +++ b/Retrieval/experiments.py @@ -0,0 +1,245 @@ +import os.path +import pickle +from collections import defaultdict +from pathlib import Path + +import numpy as np +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import GridSearchCV +from sklearn.svm import LinearSVC + +import quapy as qp +from Retrieval.commons import RetrievedSamples, load_sample +from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive +from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML +from quapy.data.base import LabelledCollection + +from os.path import join +from tqdm import tqdm + +from result_table.src.table import Table + +""" +In this sixth experiment, we have a collection C of >6M documents. +We split C in two equally-sized pools TrPool, TePool + +I have randomly split the collection in 50% train and 50% split. In each split we have approx. 3.25 million documents. + +We have 5 categories we can evaluate over: Continent, Years_Category, Num_Site_Links, Relative Pageviews and Gender. + +From the training set I have created smaller subsets for each category: +100K, 500K, 1M and FULL (3.25M) + +For each category and subset, I have created a training set called: "classifier_training.json". This is the "base" training set for the classifier. In this set we have 500 documents per group in a category. (For example: Male 500, Female 500, Unknown 500). Let me know if you think we need more. + +To "bias" the quantifier towards a query, I have executed the queries (97) on the different training sets and retrieved the 200 most relevant documents per group. +For example: (Male 200, Female 200, Unknown 200) +Sometimes this is infeasible, we should probably discuss this at some point. + + You can find the results for every query in a file named: + +"training_Query-[QID]Sample-200SPLIT.json" + +Test: +To evaluate our approach, I have executed the queries on the test split. You can find the results for all 97 queries up till k=1000 in this file. 
+ testRanking_Results.json + +""" + + +def methods(classifier, class_name): + + kde_param = { + 'continent': 0.18, + 'gender': 0.12, + 'years_category':0.09 + } + + yield ('Naive', Naive()) + yield ('NaiveQuery', Naive()) + yield ('CC', ClassifyAndCount(classifier)) + # yield ('PCC', PCC(classifier)) + # yield ('ACC', ACC(classifier, val_split=5, n_jobs=-1)) + yield ('PACC', PACC(classifier, val_split=5, n_jobs=-1)) + # yield ('EMQ', EMQ(classifier, exact_train_prev=True)) + # yield ('EMQ-Platt', EMQ(classifier, exact_train_prev=True, recalib='platt')) + # yield ('EMQh', EMQ(classifier, exact_train_prev=False)) + # yield ('EMQ-BCTS', EMQ(classifier, exact_train_prev=True, recalib='bcts')) + # yield ('EMQ-TS', EMQ(classifier, exact_train_prev=False, recalib='ts')) + # yield ('EMQ-NBVS', EMQ(classifier, exact_train_prev=False, recalib='nbvs')) + # yield ('EMQ-VS', EMQ(classifier, exact_train_prev=False, recalib='vs')) + # yield ('KDE001', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.001)) + # yield ('KDE005', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.005)) # <-- wow! + # yield ('KDE01', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01)) + # yield ('KDE02', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.02)) + # yield ('KDE03', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.03)) + # yield ('KDE-silver', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth='silverman')) + # yield ('KDE-scott', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth='scott')) + yield ('KDE-opt', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=kde_param[class_name])) + yield ('KDE01', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01)) + yield ('KDE02', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.02)) + yield ('KDE03', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.03)) + yield ('KDE04', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.04)) + yield ('KDE05', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.05)) + yield ('KDE07', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.07)) + # yield ('KDE10', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.10)) + + +def train_classifier(train_path): + """ + Trains a classifier. To do so, it loads the training set, transforms it into a tfidf representation. + The classifier is Logistic Regression, with hyperparameters C (range [0.001, 0.01, ..., 1000]) and + class_weight (range {'balanced', None}) optimized via 5FCV. 
+ + :return: the tfidf-vectorizer and the classifier trained + """ + texts, labels = load_sample(train_path, class_name=class_name) + + tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3) + Xtr = tfidf.fit_transform(texts) + print(f'Xtr shape={Xtr.shape}') + + print('training classifier...', end='') + classifier = LogisticRegression(max_iter=5000) + classifier = GridSearchCV( + classifier, + param_grid={'C': np.logspace(-4, 4, 9), 'class_weight': ['balanced', None]}, + n_jobs=-1, + cv=5 + ) + classifier.fit(Xtr, labels) + classifier = classifier.best_estimator_ + classifier_acc = classifier.best_score_ + print(f'[done] best-params={classifier.best_params_} got {classifier_acc:.4f} score') + + training = LabelledCollection(Xtr, labels) + print('training classes:', training.classes_) + print('training prevalence:', training.prevalence()) + + return tfidf, classifier + + +def reduceAtK(data: LabelledCollection, k): + # if k > len(data): + # print(f'[warning] {k=}>{len(data)=}') + X, y = data.Xy + X = X[:k] + y = y[:k] + return LabelledCollection(X, y, classes=data.classes_) + + +def benchmark_name(class_name, k): + scape_class_name = class_name.replace('_', '\_') + return f'{scape_class_name}@{k}' + + +def run_experiment(): + results = { + 'mae': {k: [] for k in Ks}, + 'mrae': {k: [] for k in Ks} + } + + pbar = tqdm(experiment_prot(), total=experiment_prot.total()) + for train, test in pbar: + Xtr, ytr, score_tr = train + Xte, yte, score_te = test + + if HALF: + n = len(ytr) // 2 + train_col = LabelledCollection(Xtr[:n], ytr[:n], classes=classifier_trained.classes_) + else: + train_col = LabelledCollection(Xtr, ytr, classes=classifier_trained.classes_) + + if method_name not in ['Naive', 'NaiveQuery']: + quantifier.fit(train_col, val_split=train_col, fit_classifier=False) + elif method_name == 'Naive': + quantifier.fit(train_col) + + test_col = LabelledCollection(Xte, yte, classes=classifier_trained.classes_) + for k in Ks: + test_k = reduceAtK(test_col, k) + if method_name == 'NaiveQuery': + train_k = reduceAtK(train_col, k) + quantifier.fit(train_k) + + estim_prev = quantifier.quantify(test_k.instances) + + mae = qp.error.mae(test_k.prevalence(), estim_prev) + mrae = qp.error.mrae(test_k.prevalence(), estim_prev, eps=(1. 
/ (2 * k))) + + results['mae'][k].append(mae) + results['mrae'][k].append(mrae) + + pbar.set_description(f'{method_name}') + + return results + + +data_home = 'data' + +HALF=True +exp_posfix = '_half' + +method_names = [name for name, *other in methods(None, 'continent')] + +Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000] + +for class_name in ['gender', 'continent', 'years_category']: # 'relative_pageviews_category', 'num_sitelinks_category']: + tables_mae, tables_mrae = [], [] + + benchmarks = [benchmark_name(class_name, k) for k in Ks] + + for data_size in ['10K', '50K', '100K', '500K', '1M', 'FULL']: + + table_mae = Table(name=f'{class_name}-{data_size}-mae', benchmarks=benchmarks, methods=method_names) + table_mrae = Table(name=f'{class_name}-{data_size}-mrae', benchmarks=benchmarks, methods=method_names) + table_mae.format.mean_prec = 5 + table_mae.format.remove_zero = True + table_mae.format.color_mode = 'global' + + tables_mae.append(table_mae) + tables_mrae.append(table_mrae) + + class_home = join(data_home, class_name, data_size) + # train_data_path = join(class_home, 'classifier_training.json') + # classifier_path = join('classifiers', data_size, f'classifier_{class_name}.pkl') + train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json') # <-------- fixed classifier + classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl') # <------------ fixed classifier + test_rankings_path = join(data_home, 'testRanking_Results.json') + results_home = join('results'+exp_posfix, class_name, data_size) + + tfidf, classifier_trained = qp.util.pickled_resource(classifier_path, train_classifier, train_data_path) + + experiment_prot = RetrievedSamples( + class_home, + test_rankings_path, + vectorizer=tfidf, + class_name=class_name, + classes=classifier_trained.classes_ + ) + for method_name, quantifier in methods(classifier_trained, class_name): + + results_path = join(results_home, method_name + '.pkl') + if os.path.exists(results_path): + print(f'Method {method_name=} already computed') + results = pickle.load(open(results_path, 'rb')) + else: + results = run_experiment() + + os.makedirs(Path(results_path).parent, exist_ok=True) + pickle.dump(results, open(results_path, 'wb'), pickle.HIGHEST_PROTOCOL) + + for k in Ks: + table_mae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mae'][k]) + table_mrae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mrae'][k]) + + # Table.LatexPDF(f'./latex{exp_posfix}/{class_name}{exp_posfix}.pdf', tables=tables_mae+tables_mrae) + Table.LatexPDF(f'./latex{exp_posfix}/{class_name}{exp_posfix}.pdf', tables=tables_mrae) + + + + + + + diff --git a/Retrieval/kdey_bandwith_selection.py b/Retrieval/kdey_bandwith_selection.py new file mode 100644 index 0000000..658a826 --- /dev/null +++ b/Retrieval/kdey_bandwith_selection.py @@ -0,0 +1,77 @@ +import itertools +import os.path +import pickle +from collections import defaultdict +from pathlib import Path + +import numpy as np +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import GridSearchCV +from sklearn.svm import LinearSVC + +import quapy as qp +from Retrieval.commons import RetrievedSamples, load_sample +from quapy.protocol import UPP +from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive +from quapy.model_selection import GridSearchQ +from quapy.method.aggregative import ClassifyAndCount, 
EMQ, ACC, PCC, PACC, KDEyML +from quapy.data.base import LabelledCollection + +from os.path import join +from tqdm import tqdm + +from result_table.src.table import Table + +""" + +""" + +data_home = 'data' + +datasets = ['continent', 'gender', 'years_category'] #, 'relative_pageviews_category', 'num_sitelinks_category'] + +for class_name in datasets: + + train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json') # <-------- fixed classifier + texts, labels = load_sample(train_data_path, class_name=class_name) + + classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl') + tfidf, classifier_trained = pickle.load(open(classifier_path, 'rb')) + classifier_hyper = classifier_trained.get_params() + print(f'{classifier_hyper=}') + + X = tfidf.transform(texts) + print(f'Xtr shape={X.shape}') + + pool = LabelledCollection(X, labels) + train, val = pool.split_stratified(train_prop=0.5, random_state=0) + q = KDEyML(LogisticRegression()) + classifier_hyper = {'classifier__C':[classifier_hyper['C'], 0.00000001], 'classifier__class_weight':[classifier_hyper['class_weight']]} + quantifier_hyper = {'bandwidth': np.linspace(0.01, 0.2, 20)} + hyper = {**classifier_hyper, **quantifier_hyper} + qp.environ['SAMPLE_SIZE'] = 100 + modsel = GridSearchQ( + model=q, + param_grid=hyper, + protocol=UPP(val, sample_size=100), + n_jobs=-1, + error='mrae', + verbose=True + ) + modsel.fit(train) + + print(class_name) + print(f'{modsel.best_params_}') + print(f'{modsel.best_score_}') + + + + + + + + + + + diff --git a/Retrieval/relscore_distribution.py b/Retrieval/relscore_distribution.py new file mode 100644 index 0000000..1db4b38 --- /dev/null +++ b/Retrieval/relscore_distribution.py @@ -0,0 +1,105 @@ +import os.path +import pickle +from collections import defaultdict +from itertools import zip_longest +from pathlib import Path + +import numpy as np +import pandas as pd +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import GridSearchCV +from sklearn.svm import LinearSVC + +import quapy as qp +import quapy.functional as F +from Retrieval.commons import RetrievedSamples, load_sample +from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive +from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML +from quapy.protocol import AbstractProtocol +from quapy.data.base import LabelledCollection + +from glob import glob +from os.path import join +from tqdm import tqdm + +from result_table.src.table import Table +import numpy as np +import matplotlib.pyplot as plt + +""" +Plots the distribution of (predicted) relevance score for the test samples and for the training samples wrt: +- training pool size (100K, 500K, 1M, FULL) +- rank +""" + + +data_home = 'data' +Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000] + +for class_name in ['num_sitelinks_category', 'relative_pageviews_category', 'years_category', 'continent', 'gender']: + test_added = False + Mtrs, Mtes, source = [], [], [] + for data_size in ['10K', '50K', '100K', '500K', '1M', 'FULL']: + + class_home = join(data_home, class_name, data_size) + classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl') + test_rankings_path = join(data_home, 'testRanking_Results.json') + + _, classifier = pickle.load(open(classifier_path, 'rb')) + + experiment_prot = RetrievedSamples( + class_home, + test_rankings_path, + vectorizer=None, + class_name=class_name, + 
classes=classifier.classes_ + ) + + Mtr = [] + Mte = [] + pbar = tqdm(experiment_prot(), total=experiment_prot.total()) + for train, test in pbar: + Xtr, ytr, score_tr = train + Xte, yte, score_te = test + Mtr.append(score_tr) + Mte.append(score_te) + + Mtrs.append(Mtr) + if not test_added: + Mtes.append(Mte) + test_added = True + source.append(data_size) + + fig, ax = plt.subplots() + train_source = ['train-'+s for s in source] + Ms = list(zip(Mtrs, train_source))+list(zip(Mtes, ['test'])) + + for M, source in Ms: + M = np.asarray(list(zip_longest(*M, fillvalue=np.nan))).T + + num_rep, num_docs = M.shape + + mean_values = np.nanmean(M, axis=0) + n_filled = np.count_nonzero(~np.isnan(M), axis=0) + std_errors = np.nanstd(M, axis=0) / np.sqrt(n_filled) + + line = ax.plot(range(num_docs), mean_values, '-', label=source, color=None) + color = line[-1].get_color() + ax.fill_between(range(num_docs), mean_values - std_errors, mean_values + std_errors, alpha=0.3, color=color) + + + ax.set_xlabel('Doc. Rank') + ax.set_ylabel('Rel. Score') + ax.set_title(class_name) + + ax.legend() + + # plt.show() + os.makedirs('plots', exist_ok=True) + plotpath = f'plots/{class_name}.pdf' + print(f'saving plot in {plotpath}') + plt.savefig(plotpath) + + + diff --git a/Retrieval/tmp.py b/Retrieval/tmp.py new file mode 100644 index 0000000..6e10f87 --- /dev/null +++ b/Retrieval/tmp.py @@ -0,0 +1,27 @@ +import pandas as pd + +from os.path import join + +from Retrieval.commons import load_json_sample +from quapy.data import LabelledCollection + +data_home = 'data' +CLASS_NAME = 'continent' +datasize = '100K' + +file_path = join(data_home, CLASS_NAME, datasize, 'training_Query-84Sample-200SPLIT.json') + +text, classes = load_json_sample(file_path, CLASS_NAME) + + +data = LabelledCollection(text, classes) +print(data.classes_) +print(data.prevalence()) +print('done') + +test_ranking_path = join(data_home, 'testRanking_Results.json') +# obj = json.load(open(test_ranking_path)) + + +df = pd.read_json(test_ranking_path) +print('done') \ No newline at end of file diff --git a/quapy/functional.py b/quapy/functional.py index c6dc351..3d1e1d5 100644 --- a/quapy/functional.py +++ b/quapy/functional.py @@ -141,6 +141,19 @@ def uniform_prevalence_sampling(n_classes, size=1): return u +def uniform_prevalence(n_classes): + """ + Returns a vector representing the uniform distribution for `n_classes` + + :param n_classes: number of classes + :return: np.ndarray with all values 1/n_classes + """ + assert isinstance(n_classes, int) and n_classes>0, \ + (f'param {n_classes} not understood; must be a positive integer representing the ' + f'number of classes ') + return np.full(shape=n_classes, fill_value=1./n_classes) + + uniform_simplex_sampling = uniform_prevalence_sampling diff --git a/quapy/method/_kdey.py b/quapy/method/_kdey.py index a6ecbea..3504b22 100644 --- a/quapy/method/_kdey.py +++ b/quapy/method/_kdey.py @@ -62,7 +62,13 @@ class KDEBase: :param bandwidth: float, the bandwidth of the kernel :return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates """ - return [self.get_kde_function(X[y == cat], bandwidth) for cat in classes] + class_cond_X = [] + for cat in classes: + selX = X[y==cat] + if selX.size==0: + selX = [F.uniform_prevalence(len(classes))] + class_cond_X.append(selX) + return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X] diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index 27d692d..cf2f366 100644 --- 
a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod from copy import deepcopy from typing import Callable, Union import numpy as np -from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling +from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling, PlattScaling from scipy import optimize from sklearn.base import BaseEstimator from sklearn.calibration import CalibratedClassifierCV @@ -636,20 +636,35 @@ class EMQ(AggregativeSoftQuantifier): calibrator = TempScaling() elif self.recalib == 'vs': calibrator = VectorScaling() + elif self.recalib == 'platt': + calibrator = CalibratedClassifierCV(estimator=self.classifier, cv='prefit') else: raise ValueError('invalid param argument for recalibration method; available ones are ' '"nbvs", "bcts", "ts", and "vs".') if not np.issubdtype(y.dtype, np.number): y = np.searchsorted(data.classes_, y) - self.calibration_function = calibrator(P, np.eye(data.n_classes)[y], posterior_supplied=True) + + if self.recalib == 'platt': + self.classifier = calibrator.fit(*data.Xy) + else: + print(classif_predictions.prevalence()) + try: + self.calibration_function = calibrator(P, np.eye(data.n_classes)[y], posterior_supplied=True) + except RuntimeError as e: + print(e) + print('defaults to I') + self.calibration_function = lambda P:P if self.exact_train_prev: self.train_prevalence = data.prevalence() else: train_posteriors = classif_predictions.X if self.recalib is not None: - train_posteriors = self.calibration_function(train_posteriors) + if self.recalib == 'platt': + train_posteriors = self.classifier.predict_proba(train_posteriors) + else: + train_posteriors = self.calibration_function(train_posteriors) self.train_prevalence = F.prevalence_from_probabilities(train_posteriors) def aggregate(self, classif_posteriors, epsilon=EPSILON):
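
A minimal usage sketch (not part of the patch) of the `uniform_prevalence` helper introduced in quapy/functional.py, and of how the patched KDEBase routine relies on it: when a class has no class-conditional covariates, the patch falls back to a single uniform point on the probability simplex so that a KernelDensity can still be fitted for that class.

import quapy.functional as F

prev = F.uniform_prevalence(4)
print(prev)   # [0.25 0.25 0.25 0.25]; always a valid prevalence vector summing to 1
# this is the same vector that the patched KDEBase method uses (wrapped in a one-element list)
# as a substitute sample for classes with no covariates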
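
Likewise, a hedged sketch of how the new 'platt' recalibration mode added to EMQ in quapy/method/aggregative.py is meant to be invoked. The constructor call mirrors the commented-out 'EMQ-Platt' entry and the fit call in run_experiment from Retrieval/experiments.py; the toy feature matrix and the LogisticRegression settings are assumptions made only to keep the example self-contained.

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.data.base import LabelledCollection
from quapy.method.aggregative import EMQ

# toy stand-in for the tfidf-transformed retrieval samples (assumption; any feature matrix works)
rng = np.random.RandomState(0)
X = rng.rand(1000, 5)
y = (X[:, 0] + X[:, 1] > 1).astype(int)
train, test = LabelledCollection(X, y).split_stratified(train_prop=0.7, random_state=0)

# the classifier is trained beforehand, as in run_experiment (which passes fit_classifier=False)
classifier = LogisticRegression(max_iter=5000).fit(train.instances, train.labels)

# recalib='platt' makes EMQ wrap the pre-trained classifier in CalibratedClassifierCV(cv='prefit')
# instead of one of the abstention-based calibrators
quantifier = EMQ(classifier, exact_train_prev=True, recalib='platt')
quantifier.fit(train, val_split=train, fit_classifier=False)
estim_prev = quantifier.quantify(test.instances)
print(estim_prev)   # estimated class-prevalence vector for the test collection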
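
Finally, a small numeric check of the error smoothing used in run_experiment, where MRAE is computed with eps = 1/(2k) and k is the number of top-ranked documents retained: the smoothing prevents the relative error from diverging when some true prevalence is exactly zero. The prevalence values below are made up for illustration.

import numpy as np
import quapy as qp

true_prev  = np.asarray([0.0, 0.2, 0.8])   # illustrative values; note the zero component
estim_prev = np.asarray([0.1, 0.2, 0.7])
k = 100
print(qp.error.mrae(true_prev, estim_prev, eps=1. / (2 * k)))   # finite thanks to the eps smoothing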