Compare commits

...

24 Commits

Author SHA1 Message Date
Alejandro Moreo Fernandez 517686eea1 improving the quality of the plots 2024-05-17 13:52:56 +02:00
Alejandro Moreo Fernandez 0df44c13a9 switching 2024-05-15 12:00:00 +02:00
Alejandro Moreo Fernandez 2ac48a9798 setting a rank threshold to 1000, and finalizing plots 2024-05-10 15:46:13 +02:00
Alejandro Moreo Fernandez 67ed6e4c6c adding methods of prior work to git 2024-05-09 16:24:20 +02:00
Alejandro Moreo Fernandez 5284e04c90 final plots 2024-05-09 16:22:59 +02:00
Alejandro Moreo Fernandez 366020d45c finalizing experiments and bugfix in kld error 2024-05-08 11:31:28 +02:00
Alejandro Moreo Fernandez 1007257280 adding Dkl 2024-05-02 16:36:23 +02:00
Alejandro Moreo Fernandez e1f6149f71 adding the prevalence of the judged relevant per each query 2024-05-02 10:59:16 +02:00
Alejandro Moreo Fernandez a1a716dc4a trying to select training documents based on test score distribution 2024-04-24 15:27:35 +02:00
Alejandro Moreo Fernandez 36c53639d7 model selection for kde in a past TREC dataset 2024-04-23 09:53:31 +02:00
Alejandro Moreo Fernandez bc656fe207 kde working 2024-04-19 18:16:14 +02:00
Alejandro Moreo Fernandez 985f430d52 refactoring everything 2024-04-18 09:32:30 +02:00
Alejandro Moreo Fernandez 8399552c8d testing gender and continent again 2024-04-12 12:03:38 +02:00
Alejandro Moreo Fernandez 8ad41b1d33 new experimental protocol applied to continent 2024-04-09 09:48:56 +02:00
Alejandro Moreo Fernandez 1b420afd6c fixing code to handle different categories 2024-04-05 18:09:52 +02:00
Alejandro Moreo Fernandez 8f9d19dd5f fixing code to handle different categories 2024-04-05 18:09:20 +02:00
Alejandro Moreo Fernandez 2a685cec1e seems to be working :D 2024-03-23 20:12:10 +01:00
Alejandro Moreo Fernandez 4150f4351f starting 5th approach 2024-03-15 16:57:45 +01:00
Alejandro Moreo Fernandez 1aa9891ff9 cleaning gitignore 2024-02-23 16:48:53 +01:00
Alejandro Moreo Fernandez 1c03dd651b first commit, some ideas already explored 2024-02-23 16:42:31 +01:00
Alejandro Moreo Fernandez b3ccf71edb Merge branch 'devel' of github.com:HLT-ISTI/QuaPy into devel 2024-02-23 16:30:11 +01:00
Alejandro Moreo Fernandez 320b3eac38 small fixes in kdey (now should work with string labels) and EMQ (in case some training prior prob was 0, it broke) 2024-02-23 16:29:53 +01:00
Alejandro Moreo Fernandez 9542eaee61 doing some benchmarking 2024-02-22 15:10:45 +01:00
Alejandro Moreo Fernandez d50a86daf4 sketching readme system by Lu and King, Hopkins and King 2024-02-16 17:34:10 +01:00
23 changed files with 1899 additions and 17 deletions

8
.gitignore vendored
View File

@ -143,8 +143,7 @@ LeQua2022
MultiLabel
NewMethods
Ordinal
Retrieval
eDiscovery
Archived/eDiscovery
poster-cikm
slides-cikm
slides-short-cikm
@ -153,9 +152,4 @@ svm_perf_quantification/svm_struct
svm_perf_quantification/svm_light
TweetSentQuant
*.png

View File

@ -1,3 +1,9 @@
Change Log 0.1.9
----------------
<...>
Change Log 0.1.8
----------------

View File

@ -0,0 +1,84 @@
import itertools
import os.path
import pickle
from collections import defaultdict
from pathlib import Path
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import quapy as qp
from Retrieval.commons import RetrievedSamples, load_sample
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.data.base import LabelledCollection
from os.path import join
from tqdm import tqdm
from result_table.src.table import Table
"""
"""
data_home = 'data'
datasets = ['continent', 'gender', 'years_category', 'relative_pageviews_category', 'num_sitelinks_category']
param_grid = {'C': np.logspace(-4, 4, 9), 'class_weight': ['balanced', None]}
classifiers = [
('LR', LogisticRegression(max_iter=5000), param_grid),
('SVM', LinearSVC(), param_grid)
]
def benchmark_name(class_name):
return class_name.replace('_', '\_')
table = Table(name=f'accuracy', benchmarks=[benchmark_name(d) for d in datasets])
table.format.show_std = False
table.format.stat_test = None
table.format.lower_is_better = False
table.format.color = False
table.format.remove_zero = True
table.format.style = 'rules'
for class_name, (cls_name, cls, grid) in itertools.product(datasets, classifiers):
train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json') # <-------- fixed classifier
texts, labels = load_sample(train_data_path, class_name=class_name)
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3)
Xtr = tfidf.fit_transform(texts)
print(f'Xtr shape={Xtr.shape}')
print('training classifier...', end='')
classifier = GridSearchCV(
cls,
param_grid=grid,
n_jobs=-1,
cv=5,
verbose=10
)
classifier.fit(Xtr, labels)
classifier_acc = classifier.best_score_
classifier_acc_per_fold = classifier.cv_results_['mean_test_score'][classifier.best_index_]
print(f'[done] best-params={classifier.best_params_} got {classifier_acc:.4f} score, per fold {classifier_acc_per_fold}')
table.add(benchmark=benchmark_name(class_name), method=cls_name, v=classifier_acc_per_fold)
Table.LatexPDF(f'./latex/classifier_Acc.pdf', tables=[table])

153
Retrieval/commons.py Normal file
View File

@ -0,0 +1,153 @@
import pandas as pd
import numpy as np
from glob import glob
from os.path import join
import quapy.functional as F
Ks = [50, 100, 500, 1000]
CLASS_NAMES = ['continent', 'gender', 'years_category'] # ['relative_pageviews_category', 'num_sitelinks_category']:
DATA_SIZES = ['10K', '50K', '100K', '500K', '1M', 'FULL']
protected_group = {
'gender': 'Female',
'continent': 'Africa',
'years_category': 'Pre-1900s',
}
def load_sample(path, class_name):
"""
Loads a sample json as a dataframe and returns text and labels for
the given class_name
:param path: path to a json file
:param class_name: string representing the target class
:return: texts, labels for class_name
"""
df = pd.read_json(path)
text = df.text.values
labels = df[class_name].values
return text, labels
def binarize_labels(labels, positive_class=None):
if positive_class is not None:
protected_labels = labels==positive_class
labels[protected_labels] = 1
labels[~protected_labels] = 0
labels = labels.astype(int)
return labels
class RetrievedSamples:
def __init__(self,
class_home: str,
test_rankings_path: str,
test_query_prevs_path: str,
vectorizer,
class_name,
positive_class=None,
classes=None,
):
self.class_home = class_home
self.test_rankings_df = pd.read_json(test_rankings_path)
self.test_query_prevs_df = pd.read_json(test_query_prevs_path)
self.vectorizer = vectorizer
self.class_name = class_name
self.positive_class = positive_class
self.classes = classes
def get_text_label_score(self, df, filter_rank=1000):
df = df[df['rank']<filter_rank]
class_name = self.class_name
vectorizer = self.vectorizer
filter_classes = self.classes
text = df.text.values
labels = df[class_name].values
rel_score = df.score.values
labels = binarize_labels(labels, self.positive_class)
if filter_classes is not None:
idx = np.isin(labels, filter_classes)
text = text[idx]
labels = labels[idx]
rel_score = rel_score[idx]
if vectorizer is not None:
text = vectorizer.transform(text)
order = np.argsort(-rel_score)
return text[order], labels[order], rel_score[order]
def __call__(self):
tests_df = self.test_rankings_df
class_name = self.class_name
for file in self._list_queries():
# loads the training sample
train_df = pd.read_json(file)
if len(train_df) == 0:
print('empty dataframe: ', file)
else:
Xtr, ytr, score_tr = self.get_text_label_score(train_df)
# loads the test sample
query_id = self._get_query_id_from_path(file)
sel_df = tests_df[tests_df.qid == query_id]
Xte, yte, score_te = self.get_text_label_score(sel_df)
# gets the prevalence of all judged relevant documents for the query
df = self.test_query_prevs_df
q_rel_prevs = df.loc[df.id == query_id][class_name+'_proportions'].values[0]
if self.positive_class is not None:
if self.positive_class not in q_rel_prevs:
print(f'positive class {self.positive_class} not found in the query; skipping')
continue
q_rel_prevs = F.as_binary_prevalence(q_rel_prevs[self.positive_class])
else:
q_rel_prevs = np.asarray([q_rel_prevs.get(class_i, 0.) for class_i in self.classes])
yield (Xtr, ytr, score_tr), (Xte, yte, score_te), q_rel_prevs
def _list_queries(self):
return sorted(glob(join(self.class_home, 'training_Query*200SPLIT.json')))
# def _get_test_sample(self, query_id, max_lines=-1):
# df = self.test_rankings_df
# sel_df = df[df.qid==int(query_id)]
# return get_text_label_score(sel_df)
# texts = sel_df.text.values
# try:
# labels = sel_df[self.class_name].values
# except KeyError as e:
# print(f'error: key {self.class_name} not found in test rankings')
# raise e
# if max_lines > 0 and len(texts) > max_lines:
# ranks = sel_df.rank.values
# idx = np.argsort(ranks)[:max_lines]
# texts = np.asarray(texts)[idx]
# labels = np.asarray(labels)[idx]
# return texts, labels
def total(self):
return len(self._list_queries())
def _get_query_id_from_path(self, path):
prefix = 'training_Query-'
posfix = 'Sample-200SPLIT'
qid = path
qid = qid[:qid.index(posfix)]
qid = qid[qid.index(prefix) + len(prefix):]
qid = int(qid)
return qid
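
A minimal usage sketch of load_sample and binarize_labels above (the path is hypothetical, mirroring the layout used by experiments.py further down):

texts, labels = load_sample('data/gender/FULL/classifier_training.json', class_name='gender')
y = binarize_labels(labels, positive_class=protected_group['gender'])  # 'Female' -> 1, everything else -> 0
print(len(texts), y.mean())  # number of documents and prevalence of the protected group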

View File

@ -0,0 +1,182 @@
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import quapy as qp
import quapy.functional as F
from Retrieval.commons import RetrievedSamples, load_txt_sample, load_json_sample
from Retrieval.tabular import Table
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
from tqdm import tqdm
"""
In this fifth experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set as
in the fourth experiment, and the fairness groups are defined upon geographic info as in the fourth case.
As in the fourth, the data Li and Ui have been drawn by retrieving query-related documents from
a pool of the same size. Unlike the fourth experiment, here the training queries are
For now, 1000 documents in training and 100 in test.
It seems there is now very little shift.
"""
def cls(classifier_trained=None):
if classifier_trained is None:
# return LinearSVC()
return LogisticRegression()
else:
return classifier_trained
def methods(classifier_trained=None):
yield ('CC', ClassifyAndCount(cls(classifier_trained)))
yield ('PCC', PCC(cls(classifier_trained)))
yield ('ACC', ACC(cls(classifier_trained), val_split=5, n_jobs=-1))
yield ('PACC', PACC(cls(classifier_trained), val_split=5, n_jobs=-1))
yield ('EMQ', EMQ(cls(classifier_trained), exact_train_prev=True))
yield ('EMQh', EMQ(cls(classifier_trained), exact_train_prev=False))
# yield ('EMQ-BCTS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='bcts'))
# yield ('EMQ-TS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='ts'))
# yield ('EMQ-NBVS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='nbvs'))
# yield ('EMQ-VS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='vs'))
# yield ('KDE001', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.001))
# yield ('KDE005', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.005)) # <-- wow!
# yield ('KDE01', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.01))
# yield ('KDE02', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.02))
# yield ('KDE03', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.03))
# yield ('KDE05', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.05))
yield ('KDE07', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.07))
# yield ('KDE10', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.10))
yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
def train_classifier():
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
training = LabelledCollection.load(train_path, loader_func=load_json_sample, class_name=CLASS_NAME)
if REDUCE_TR > 0 and len(training) > REDUCE_TR:
print('Reducing the number of documents in the training to', REDUCE_TR)
training = training.sampling(REDUCE_TR, *training.prevalence())
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('L orig shape = ', Xtr.shape)
training = LabelledCollection(Xtr, ytr)
print('training classifier')
classifier_trained = LogisticRegression()
classifier_trained = GridSearchCV(classifier_trained,
param_grid={'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]},
n_jobs=-1, cv=5)
classifier_trained.fit(Xtr, ytr)
classifier_trained = classifier_trained.best_estimator_
trained = True
print('[Done!]')
classes = training.classes_
print('training classes:', classes)
print('training prevalence:', training.prevalence())
return tfidf, classifier_trained
def reduceAtK(data: LabelledCollection, k):
X, y = data.Xy
X = X[:k]
y = y[:k]
return LabelledCollection(X, y, classes=data.classes_)
RANK_AT_K = -1
REDUCE_TR = 50000
qp.environ['SAMPLE_SIZE'] = RANK_AT_K
def scape_latex(string):
return string.replace('_', '\_')
Ks = [10, 50, 100, 250, 500, 1000, 2000]
# Ks = [500]
for CLASS_NAME in ['gender_category'] : #'years_category']: #['continent', 'first_letter_category']: #, 'gender', 'gender_category', 'occupations', 'source_countries', 'source_subcont_regions', 'years_category', 'relative_pageviews_category']:
data_path = './' + CLASS_NAME
if CLASS_NAME in ['years_category', 'continent', 'gender_category']:
train_path = join(data_path, 'train500PerGroup.json')
else:
train_path = join(data_path, 'train3000samples.json')
tfidf, classifier_trained = qp.util.pickled_resource(f'classifier_{CLASS_NAME}.pkl', train_classifier)
trained=True
experiment_prot = RetrievedSamples(data_path,
load_fn=load_json_sample,
vectorizer=tfidf,
max_train_lines=None,
max_test_lines=RANK_AT_K, classes=classifier_trained.classes_, class_name=CLASS_NAME)
method_names = [name for name, *other in methods()]
benchmarks = [f'{scape_latex(CLASS_NAME)}@{k}' for k in Ks]
table_mae = Table(benchmarks, method_names, color_mode='global')
table_mrae = Table(benchmarks, method_names, color_mode='global')
for method_name, quantifier in methods(classifier_trained):
# print('Starting with method=', method_name)
mae_errors = {k:[] for k in Ks}
mrae_errors = {k:[] for k in Ks}
pbar = tqdm(experiment_prot(), total=49)
for train, test in pbar:
if train is not None:
try:
if trained and method_name!='MLPE':
quantifier.fit(train, val_split=train, fit_classifier=False)
else:
quantifier.fit(train)
for k in Ks:
test_k = reduceAtK(test, k)
estim_prev = quantifier.quantify(test_k.instances)
mae_errors[k].append(qp.error.mae(test_k.prevalence(), estim_prev))
mrae_errors[k].append(qp.error.mrae(test_k.prevalence(), estim_prev, eps=(1./(2*k))))
except Exception as e:
print(f'wow, something happened here! skipping; {e}')
else:
print('skipping one!')
# pbar.set_description(f'{method_name}\tmae={np.mean(mae_errors):.4f}\tmrae={np.mean(mrae_errors):.4f}')
pbar.set_description(f'{method_name}')
for k in Ks:
table_mae.add(benchmark=f'{scape_latex(CLASS_NAME)}@{k}', method=method_name, values=mae_errors[k])
table_mrae.add(benchmark=f'{scape_latex(CLASS_NAME)}@{k}', method=method_name, values=mrae_errors[k])
table_mae.latexPDF('./latex', f'table_{CLASS_NAME}_mae.tex')
table_mrae.latexPDF('./latex', f'table_{CLASS_NAME}_mrae.tex')

View File

@ -0,0 +1,161 @@
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import quapy as qp
import quapy.functional as F
from Retrieval.commons import RetrievedSamples, load_txt_sample
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
from tqdm import tqdm
"""
In this fourth experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set as
in the third experiment, and the fairness groups are defined upon geographic info as in the third case.
The difference here is that the data Li and Ui have been drawn by retrieving query-related documents from
a pool of the same size.
For now, 1000 documents in training and 100 in test.
It seems there is now very little shift.
"""
def cls(classifier_trained=None):
if classifier_trained is None:
# return LinearSVC()
return LogisticRegression()
else:
return classifier_trained
def methods(classifier_trained=None):
yield ('CC', ClassifyAndCount(cls(classifier_trained)))
yield ('PACC', PACC(cls(classifier_trained), val_split=5, n_jobs=-1))
yield ('EMQ', EMQ(cls(classifier_trained), exact_train_prev=True))
yield ('EMQh', EMQ(cls(classifier_trained), exact_train_prev=False))
yield ('EMQ-BCTS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='bcts'))
yield ('EMQ-TS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='ts'))
yield ('EMQ-NBVS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='nbvs'))
# yield ('EMQ-VS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='vs'))
yield ('PCC', PCC(cls(classifier_trained)))
yield ('ACC', ACC(cls(classifier_trained), val_split=5, n_jobs=-1))
yield ('KDE001', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.001))
yield ('KDE005', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.005)) # <-- wow!
yield ('KDE01', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.01))
yield ('KDE02', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.02))
yield ('KDE03', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.03))
yield ('KDE05', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.05))
yield ('KDE07', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.07))
yield ('KDE10', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.10))
yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
def train_classifier():
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
if REDUCE_TR > 0:
print('Reducing the number of documents in the training to', REDUCE_TR)
training = training.sampling(REDUCE_TR, *training.prevalence())
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('L orig shape = ', Xtr.shape)
training = LabelledCollection(Xtr, ytr)
print('training classifier')
classifier_trained = LogisticRegression()
classifier_trained = GridSearchCV(classifier_trained,
param_grid={'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]},
n_jobs=-1, cv=5)
classifier_trained.fit(Xtr, ytr)
classifier_trained = classifier_trained.best_estimator_
trained = True
print('[Done!]')
classes = training.classes_
print('training classes:', classes)
print('training prevalence:', training.prevalence())
return tfidf, classifier_trained
RANK_AT_K = 1000
REDUCE_TR = 50000
qp.environ['SAMPLE_SIZE'] = RANK_AT_K
data_path = './50_50_split_trec'
train_path = join(data_path, 'train_50_50_continent.txt')
tfidf, classifier_trained = qp.util.pickled_resource('classifier.pkl', train_classifier)
trained=True
experiment_prot = RetrievedSamples(data_path,
load_fn=load_txt_sample,
vectorizer=tfidf,
max_train_lines=None,
max_test_lines=RANK_AT_K, classes=classifier_trained.classes_)
result_mae_dict = {}
result_mrae_dict = {}
for method_name, quantifier in methods(classifier_trained):
# print('Starting with method=', method_name)
mae_errors = []
mrae_errors = []
pbar = tqdm(experiment_prot(), total=49)
for train, test in pbar:
if train is not None:
try:
# print(train.prevalence())
# print(test.prevalence())
if trained and method_name!='MLPE':
quantifier.fit(train, val_split=train, fit_classifier=False)
else:
quantifier.fit(train)
estim_prev = quantifier.quantify(test.instances)
mae = qp.error.mae(test.prevalence(), estim_prev)
mae_errors.append(mae)
mrae = qp.error.mrae(test.prevalence(), estim_prev)
mrae_errors.append(mrae)
# print()
# print('Training prevalence:', F.strprev(train.prevalence()), 'shape', train.X.shape)
# print('Test prevalence:', F.strprev(test.prevalence()), 'shape', test.X.shape)
# print('Estim prevalence:', F.strprev(estim_prev))
except Exception as e:
print(f'wow, something happened here! skipping; {e}')
else:
print('skipping one!')
pbar.set_description(f'{method_name}\tmae={np.mean(mae_errors):.4f}\tmrae={np.mean(mrae_errors):.4f}')
print()
result_mae_dict[method_name] = np.mean(mae_errors)
result_mrae_dict[method_name] = np.mean(mrae_errors)
print('Results\n'+('-'*100))
for method_name in result_mae_dict.keys():
MAE = result_mae_dict[method_name]
MRAE = result_mrae_dict[method_name]
print(f'{method_name}\t{MAE=:.5f}\t{MRAE=:.5f}')

View File

@ -0,0 +1,98 @@
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
import quapy.functional as F
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
"""
This was the very first experiment: one big training set and many test rankings produced according to some queries.
The quantification methods did not seem to work; the more sophisticated the method, the worse it performed.
This is a clear indication that the PPS assumptions do not hold.
Actually, while the training set could be some iid sample from a distribution L and every test set
is an iid sample from a distribution U, it is pretty clear that P(X|Y) is different, since the test sets
are biased towards a query term whereas the training set is not.
"""
def methods():
yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
yield ('CC', ClassifyAndCount(LogisticRegression(n_jobs=-1)))
yield ('ACC', ACC(LogisticRegression(n_jobs=-1)))
yield ('PCC', PCC(LogisticRegression(n_jobs=-1)))
yield ('PACC', PACC(LogisticRegression(n_jobs=-1)))
yield ('EMQ', EMQ(LogisticRegression(n_jobs=-1)))
def load_txt_sample(path, verbose=False):
if verbose:
print(f'loading {path}...', end='')
df = pd.read_csv(path, sep='\t')
if verbose:
print('[done]')
X = df['text']
y = df['first_letter_category']
return X, y
class RetrievedSamples(AbstractProtocol):
def __init__(self, path_dir: str, load_fn, vectorizer, classes):
self.path_dir = path_dir
self.load_fn = load_fn
self.vectorizer = vectorizer
self.classes = classes
def __call__(self):
for file in glob(join(self.path_dir, 'test_data_*.txt')):
X, y = self.load_fn(file)
if len(X)!=qp.environ['SAMPLE_SIZE']:
print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
# assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
X = self.vectorizer.transform(X)
sample = LabelledCollection(X, y, classes=self.classes)
yield sample.Xp
qp.environ['SAMPLE_SIZE']=100
data_path = './data'
train_path = join(data_path, 'train_data.txt')
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5)
training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True)
# training = training.sampling(1000)
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('Xtr shape = ', Xtr.shape)
training = LabelledCollection(Xtr, ytr)
classes = training.classes_
test_prot = RetrievedSamples(data_path, load_fn=load_txt_sample, vectorizer=tfidf, classes=classes)
print('Training prevalence:', F.strprev(training.prevalence()))
for X, p in test_prot():
print('Test prevalence:', F.strprev(p))
for method_name, quantifier in methods():
print('training ', method_name)
quantifier.fit(training)
print('[done]')
report = qp.evaluation.evaluation_report(quantifier, test_prot, error_metrics=['mae', 'mrae'], verbose=True)
print(report.mean())
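
A side note on the reasoning in the docstring above: the prior probability shift (PPS) assumption that these quantifiers rely on can be written, in its standard formulation (not taken from the repository), as

P_L(X \mid Y) = P_U(X \mid Y) \quad\text{while}\quad P_L(Y) \neq P_U(Y),

i.e., the class-conditional distributions are preserved between the training distribution L and the test distribution U and only the class priors change. Retrieving each test sample with a query conditions its documents on relevance to that query, so P_U(X|Y) no longer matches P_L(X|Y), which is consistent with the degradation observed for the more sophisticated methods.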

View File

@ -0,0 +1,131 @@
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
import quapy.functional as F
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
from tqdm import tqdm
"""
In this second experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set.
Both elements in the pair are *retrieved according to the same query*. This is a way to impose
on the training set the same type of bias that was present in the test set. Let's see...
"""
def methods():
yield ('PACC', PACC(LogisticRegression(), val_split=5, n_jobs=-1))
yield ('CC', ClassifyAndCount(LogisticRegression()))
yield ('EMQ', EMQ(LogisticRegression()))
yield ('PCC', PCC(LogisticRegression()))
yield ('ACC', ACC(LogisticRegression(), val_split=5, n_jobs=-1))
yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
if verbose:
print(f'loading {path}...', end='')
df = pd.read_csv(path, sep='\t')
if verbose:
print('[done]')
X = df['text'].values
y = df['first_letter_category'].values
if parse_columns:
rank = df['rank'].values
scores = df['score'].values
order = np.argsort(rank)
X = X[order]
y = y[order]
rank = rank[order]
scores = scores[order]
if max_lines is not None:
X = X[:max_lines]
y = y[:max_lines]
return X, y
class RetrievedSamples(AbstractProtocol):
def __init__(self, path_dir: str, load_fn, vectorizer, classes, max_train_lines=None, max_test_lines=None):
self.path_dir = path_dir
self.load_fn = load_fn
self.vectorizer = vectorizer
self.classes = classes
self.max_train_lines = max_train_lines
self.max_test_lines = max_test_lines
def __call__(self):
for file in glob(join(self.path_dir, 'test_rankings_*.txt')):
X, y = self.load_fn(file.replace('test_', 'training_'), parse_columns=True, max_lines=self.max_train_lines)
X = self.vectorizer.transform(X)
train_sample = LabelledCollection(X, y, classes=self.classes)
X, y = self.load_fn(file, parse_columns=True, max_lines=self.max_test_lines)
if len(X)!=qp.environ['SAMPLE_SIZE']:
print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
# assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
X = self.vectorizer.transform(X)
test_sample = LabelledCollection(X, y, classes=self.classes)
yield train_sample, test_sample
RANK_AT_K = 500
REDUCE_TR = 50000
qp.environ['SAMPLE_SIZE'] = RANK_AT_K
data_path = './newCollection'
train_path = join(data_path, 'train_data.txt')
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
if REDUCE_TR>0:
print('Reducing the number of documents in the training to', REDUCE_TR)
training = training.sampling(REDUCE_TR)
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('L orig shape = ', Xtr.shape)
training = LabelledCollection(Xtr, ytr)
classes = training.classes_
experiment_prot = RetrievedSamples(data_path,
load_fn=load_txt_sample,
vectorizer=tfidf,
classes=classes,
max_train_lines=RANK_AT_K,
max_test_lines=RANK_AT_K)
for method_name, quantifier in methods():
print('Starting with method=', method_name)
errors = []
pbar = tqdm(experiment_prot(), total=49)
for train, test in pbar:
# print('Training prevalence:', F.strprev(training.prevalence()), 'shape', train.X.shape)
# print('Test prevalence:', F.strprev(test.prevalence()), 'shape', test.X.shape)
quantifier.fit(train)
estim_prev = quantifier.quantify(test.instances)
mae = qp.error.mae(test.prevalence(), estim_prev)
errors.append(mae)
pbar.set_description(f'mae={np.mean(errors):.4f}')
print()

View File

@ -0,0 +1,155 @@
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
import quapy.functional as F
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
from tqdm import tqdm
"""
In this third experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set as
in the second experiment, but in this case the fairness groups are defined upon geographic info.
"""
def methods():
yield ('CC', ClassifyAndCount(LogisticRegression()))
yield ('PACC', PACC(LogisticRegression(), val_split=5, n_jobs=-1))
yield ('EMQ', EMQ(LogisticRegression()))
yield ('PCC', PCC(LogisticRegression()))
yield ('ACC', ACC(LogisticRegression(), val_split=5, n_jobs=-1))
yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
# print('reading', path)
if verbose:
print(f'loading {path}...', end='')
df = pd.read_csv(path, sep='\t')
if verbose:
print('[done]')
X = df['text'].values
y = df['continent'].values
if parse_columns:
rank = df['rank'].values
scores = df['score'].values
rank = rank[y != 'Antarctica']
scores = scores[y != 'Antarctica']
X = X[y!='Antarctica']
y = y[y!='Antarctica']
if parse_columns:
order = np.argsort(rank)
X = X[order]
y = y[order]
rank = rank[order]
scores = scores[order]
if max_lines is not None:
X = X[:max_lines]
y = y[:max_lines]
return X, y
class RetrievedSamples(AbstractProtocol):
def __init__(self, path_dir: str, load_fn, vectorizer, max_train_lines=None, max_test_lines=None):
self.path_dir = path_dir
self.load_fn = load_fn
self.vectorizer = vectorizer
self.max_train_lines = max_train_lines
self.max_test_lines = max_test_lines
def __call__(self):
for file in glob(join(self.path_dir, 'test_rankings_*.txt')):
X, y = self.load_fn(file.replace('test_', 'training_'), parse_columns=True, max_lines=self.max_train_lines)
X = self.vectorizer.transform(X)
train_sample = LabelledCollection(X, y)
X, y = self.load_fn(file, parse_columns=True, max_lines=self.max_test_lines)
if len(X)!=qp.environ['SAMPLE_SIZE']:
print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
# assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
X = self.vectorizer.transform(X)
try:
test_sample = LabelledCollection(X, y, classes=train_sample.classes_)
except ValueError as e:
print(f'file {file} caused error {e}')
yield None, None
continue  # skip this query; otherwise a stale test_sample would be yielded below
# print('train #classes:', train_sample.n_classes, train_sample.prevalence())
# print('test #classes:', test_sample.n_classes, test_sample.prevalence())
yield train_sample, test_sample
RANK_AT_K = 100
REDUCE_TR = 50000
qp.environ['SAMPLE_SIZE'] = RANK_AT_K
data_path = './newCollectionGeo'
train_path = join(data_path, 'train_data_continent.txt')
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
if REDUCE_TR>0:
print('Reducing the number of documents in the training to', REDUCE_TR)
training = training.sampling(REDUCE_TR)
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('L orig shape = ', Xtr.shape)
training = LabelledCollection(Xtr, ytr)
classes = training.classes_
print('training classes:', classes)
print('training prevalence:', training.prevalence())
experiment_prot = RetrievedSamples(data_path,
load_fn=load_txt_sample,
vectorizer=tfidf,
max_train_lines=None,
max_test_lines=RANK_AT_K)
for method_name, quantifier in methods():
print('Starting with method=', method_name)
errors = []
pbar = tqdm(experiment_prot(), total=49)
for train, test in pbar:
if train is not None:
try:
# print('Training prevalence:', F.strprev(training.prevalence()), 'shape', train.X.shape)
# print('Test prevalence:', F.strprev(test.prevalence()), 'shape', test.X.shape)
# print(train.prevalence())
# print(test.prevalence())
quantifier.fit(train)
estim_prev = quantifier.quantify(test.instances)
mae = qp.error.mae(test.prevalence(), estim_prev)
errors.append(mae)
except Exception as e:
print(f'wow, something happened here! skipping; {e}')
else:
print('skipping one!')
pbar.set_description(f'mae={np.mean(errors):.4f}')
print()

299
Retrieval/experiments.py Normal file
View File

@ -0,0 +1,299 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.base import clone
import quapy as qp
from Retrieval.commons import *
from Retrieval.methods import *
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.data.base import LabelledCollection
from os.path import join
from tqdm import tqdm
from result_table.src.table import Table
"""
In this sixth experiment, we have a collection C of >6M documents.
We split C in two equally-sized pools TrPool, TePool.
I have randomly split the collection into 50% train and 50% test. In each split we have approx. 3.25 million documents.
We have 5 categories we can evaluate over: Continent, Years_Category, Num_Site_Links, Relative Pageviews and Gender.
From the training set I have created smaller subsets for each category:
100K, 500K, 1M and FULL (3.25M)
For each category and subset, I have created a training set called: "classifier_training.json". This is the "base" training set for the classifier. In this set we have 500 documents per group in a category. (For example: Male 500, Female 500, Unknown 500). Let me know if you think we need more.
To "bias" the quantifier towards a query, I have executed the queries (97) on the different training sets and retrieved the 200 most relevant documents per group.
For example: (Male 200, Female 200, Unknown 200)
Sometimes this is infeasible, we should probably discuss this at some point.
You can find the results for every query in a file named:
"training_Query-[QID]Sample-200SPLIT.json"
Test:
To evaluate our approach, I have executed the queries on the test split. You can find the results for all 97 queries up till k=1000 in this file.
testRanking_Results.json
"""
def methods(classifier, class_name=None, binarize=False):
kde_param = {
'continent': 0.01,
'gender': 0.03,
'years_category':0.03
}
yield ('NaiveQuery', Naive())
yield ('CC', ClassifyAndCount(classifier))
yield ('PACC', PACC(classifier, val_split=5, n_jobs=-1))
yield ('KDEy-ML', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=kde_param.get(class_name, 0.01)))
if binarize:
yield ('M3b', M3rND_ModelB(classifier))
yield ('M3b+', M3rND_ModelB(classifier))
yield ('M3d', M3rND_ModelD(classifier))
yield ('M3d+', M3rND_ModelD(classifier))
def train_classifier_fn(train_path):
"""
Trains a classifier. To do so, it loads the training set and transforms it into a tfidf representation.
The classifier is Logistic Regression, with hyperparameters C (range [0.001, 0.01, ..., 1000]) and
class_weight (range {'balanced', None}) optimized via 5FCV.
:return: the tfidf-vectorizer and the classifier trained
"""
texts, labels = load_sample(train_path, class_name=class_name)
if BINARIZE:
labels = binarize_labels(labels, positive_class=protected_group[class_name])
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3)
Xtr = tfidf.fit_transform(texts)
print(f'Xtr shape={Xtr.shape}')
print('training classifier...', end='')
classifier = LogisticRegression(max_iter=5000)
modsel = GridSearchCV(
classifier,
param_grid={'C': np.logspace(-4, 4, 9), 'class_weight': ['balanced', None]},
n_jobs=-1,
cv=5
)
modsel.fit(Xtr, labels)
classifier = modsel.best_estimator_
classifier_acc = modsel.best_score_
best_params = modsel.best_params_
print(f'[done] best-params={best_params} got {classifier_acc:.4f} score')
print('generating cross-val predictions for M3')
predictions = cross_val_predict(clone(classifier), Xtr, labels, cv=10, n_jobs=-1, verbose=10)
conf_matrix = confusion_matrix(labels, predictions, labels=classifier.classes_)
training = LabelledCollection(Xtr, labels)
print('training classes:', training.classes_)
print('training prevalence:', training.prevalence())
return tfidf, classifier, conf_matrix
def reduceAtK(data: LabelledCollection, k):
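# keeps the k top-ranked documents: the collections produced by RetrievedSamples are
# sorted by descending relevance score (see get_text_label_score in commons.py)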
# if k > len(data):
# print(f'[warning] {k=}>{len(data)=}')
X, y = data.Xy
X = X[:k]
y = y[:k]
return LabelledCollection(X, y, classes=data.classes_)
def benchmark_name(class_name, k=None):
scape_class_name = class_name.replace('_', '\_')
if k is None:
return scape_class_name
else:
return f'{scape_class_name}@{k}'
def run_experiment():
results = {
'mae': {k: [] for k in Ks},
'mrae': {k: [] for k in Ks},
'rKL_error': [],
'rND_error': []
}
pbar = tqdm(experiment_prot(), total=experiment_prot.total())
for train, test, q_rel_prevs in pbar:
Xtr, ytr, score_tr = train
Xte, yte, score_te = test
train_col = LabelledCollection(Xtr, ytr, classes=classifier.classes_)
if not method_name.startswith('Naive') and not method_name.startswith('M3'):
method.fit(train_col, val_split=train_col, fit_classifier=False)
elif method_name == 'Naive':
method.fit(train_col)
test_col = LabelledCollection(Xte, yte, classes=classifier.classes_)
rKL_estim, rKL_true = [], []
rND_estim, rND_true = [], []
for k in Ks:
test_k = reduceAtK(test_col, k)
if method_name == 'NaiveQuery':
train_k = reduceAtK(train_col, k)
method.fit(train_k)
estim_prev = method.quantify(test_k.instances)
# epsilon value for prevalence smoothing
eps=(1. / (2. * k))
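# (1/(2k) is the customary additive-smoothing constant for a sample of size k; here the sample is the top-k ranking)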
# error metrics
test_k_prev = test_k.prevalence()
mae = qp.error.mae(test_k_prev, estim_prev)
mrae = qp.error.mrae(test_k_prev, estim_prev, eps=eps)
rKL_at_k_estim = qp.error.kld(estim_prev, q_rel_prevs, eps=eps)
rKL_at_k_true = qp.error.kld(test_k_prev, q_rel_prevs, eps=eps)
if BINARIZE:
# [1] is the index of the minority or historically disadvantaged group
rND_at_k_estim = np.abs(estim_prev[1] - q_rel_prevs[1])
rND_at_k_true = np.abs(test_k_prev[1] - q_rel_prevs[1])
# collect results
results['mae'][k].append(mae)
results['mrae'][k].append(mrae)
rKL_estim.append(rKL_at_k_estim)
rKL_true.append(rKL_at_k_true)
if BINARIZE:
rND_estim.append(rND_at_k_estim)
rND_true.append(rND_at_k_true)
# aggregate fairness metrics
def aggregate(rMs, Ks, Z=1):
return (1 / Z) * sum((1. / np.log2(k)) * v for v, k in zip(rMs, Ks))
Z = sum((1. / np.log2(k)) for k in Ks)
rKL_estim = aggregate(rKL_estim, Ks, Z)
rKL_true = aggregate(rKL_true, Ks, Z)
rKL_error = np.abs(rKL_true-rKL_estim)
results['rKL_error'].append(rKL_error)
if BINARIZE:
rND_estim = aggregate(rND_estim, Ks, Z)
rND_true = aggregate(rND_true, Ks, Z)
if isinstance(method, AbstractM3rND):
if method_name.endswith('+'):
# learns the correction parameters from the query-specific training data
conf_matrix_ = method.get_confusion_matrix(*train_col.Xy)
else:
# learns the correction parameters from the training data used to train the classifier
conf_matrix_ = conf_matrix.copy()
rND_estim = method.fair_measure_correction(rND_estim, conf_matrix_)
rND_error = np.abs(rND_true - rND_estim)
results['rND_error'].append(rND_error)
pbar.set_description(f'{method_name}')
return results
data_home = 'data'
if __name__ == '__main__':
# final tables only contain the information for the data size 10K, each row is a class name and each column
# the corresponding rND (for binary) or rKL (for multiclass) score
tables_RND, tables_DKL = [], []
tables_final = []
for class_mode in ['multiclass', 'binary']:
BINARIZE = (class_mode=='binary')
method_names = [name for name, *other in methods(None, binarize=BINARIZE)]
table_final = Table(name=f'rND' if BINARIZE else f'rKL', benchmarks=[benchmark_name(c) for c in CLASS_NAMES], methods=method_names)
table_final.format.mean_macro = False
tables_final.append(table_final)
for class_name in CLASS_NAMES:
tables_mae, tables_mrae = [], []
benchmarks_size =[benchmark_name(class_name, s) for s in DATA_SIZES]
table_DKL = Table(name=f'rKL-{class_name}', benchmarks=benchmarks_size, methods=method_names)
table_RND = Table(name=f'rND-{class_name}', benchmarks=benchmarks_size, methods=method_names)
for data_size in DATA_SIZES:
print(class_name, class_mode, data_size)
benchmarks_k = [benchmark_name(class_name, k) for k in Ks]
# table_mae = Table(name=f'{class_name}-{data_size}-mae', benchmarks=benchmarks_k, methods=method_names)
table_mrae = Table(name=f'{class_name}-{data_size}-mrae', benchmarks=benchmarks_k, methods=method_names)
# tables_mae.append(table_mae)
tables_mrae.append(table_mrae)
# sets all paths
class_home = join(data_home, class_name, data_size)
train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json') # <----- fixed classifier
classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}_{class_mode}.pkl')
test_rankings_path = join(data_home, 'testRanking_Results.json')
test_query_prevs_path = join(data_home, 'prevelance_vectors_judged_docs.json')
results_home = join('results', class_name, class_mode, data_size)
positive_class = protected_group[class_name] if BINARIZE else None
# instantiates the classifier (trains it the first time, loads it in the subsequent executions)
tfidf, classifier, conf_matrix \
= qp.util.pickled_resource(classifier_path, train_classifier_fn, train_data_path)
experiment_prot = RetrievedSamples(
class_home,
test_rankings_path,
test_query_prevs_path,
vectorizer=tfidf,
class_name=class_name,
positive_class=positive_class,
classes=classifier.classes_
)
for method_name, method in methods(classifier, class_name, BINARIZE):
results_path = join(results_home, method_name + '.pkl')
results = qp.util.pickled_resource(results_path, run_experiment)
# compose the tables
for k in Ks:
# table_mae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mae'][k])
table_mrae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mrae'][k])
table_DKL.add(benchmark=benchmark_name(class_name, data_size), method=method_name, v=results['rKL_error'])
if BINARIZE:
table_RND.add(benchmark=benchmark_name(class_name, data_size), method=method_name, v=results['rND_error'])
if data_size=='10K':
value = results['rND_error'] if BINARIZE else results['rKL_error']
table_final.add(benchmark=benchmark_name(class_name), method=method_name, v=value)
tables = ([table_RND] + tables_mrae) if BINARIZE else ([table_DKL] + tables_mrae)
Table.LatexPDF(f'./latex/{class_mode}/{class_name}.pdf', tables=tables)
if BINARIZE:
tables_RND.append(table_RND)
else:
tables_DKL.append(table_DKL)
Table.LatexPDF(f'./latex/global/main.pdf', tables=tables_RND+tables_DKL, dedicated_pages=False)
Table.LatexPDF(f'./latex/final/main.pdf', tables=tables_final, dedicated_pages=False)
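
For readability, the rank-discounted aggregation computed inside run_experiment above can be restated in equation form, with p^{(k)} the true or estimated prevalence among the top-k retrieved documents and q the prevalence among the judged relevant documents of the query:

Z = \sum_{k \in K} \frac{1}{\log_2 k}, \qquad
rKL = \frac{1}{Z} \sum_{k \in K} \frac{1}{\log_2 k}\, \mathrm{KLD}_{\epsilon}\big(p^{(k)} \,\big\|\, q\big), \qquad
rND = \frac{1}{Z} \sum_{k \in K} \frac{1}{\log_2 k}\, \big|\, p^{(k)}_{1} - q_{1} \big|

with \epsilon = 1/(2k) used for smoothing, index 1 denoting the protected group in the binarized setting, and the reported errors being |rKL_true - rKL_estim| and |rND_true - rND_estim|.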

View File

@ -0,0 +1,88 @@
import os.path
import pickle
from collections import defaultdict
from pathlib import Path
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import quapy as qp
from Retrieval.commons import RetrievedSamples, load_sample
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.data.base import LabelledCollection
from experiments import benchmark_name, reduceAtK, run_experiment
from os.path import join
from tqdm import tqdm
from result_table.src.table import Table
def methods(classifier):
for i, bandwidth in enumerate(np.linspace(0.01, 0.1, 10)):
yield (f'KDE{str(i).zfill(2)}', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=bandwidth))
if __name__ == '__main__':
data_home = 'data-modsel'
Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
method_names = [m for m, *_ in methods(None)]
class_mode = 'multiclass'
dir_names={
'gender': '100K_GENDER_TREC21_QUERIES/100K-NEW-QUERIES',
'continent': '100K_CONT_TREC21_QUERIES/100K-NEW-QUERIES',
'years_category': '100K_YEARS_TREC21_QUERIES/100K-NEW-QUERIES'
}
for class_name in ['gender', 'continent', 'years_category']:
tables_mrae = []
benchmarks = [benchmark_name(class_name, k) for k in Ks]
for data_size in ['100K']:
table_mrae = Table(name=f'{class_name}-{data_size}-mrae', benchmarks=benchmarks, methods=method_names)
tables_mrae.append(table_mrae)
class_home = join(data_home, dir_names[class_name])
classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}_{class_mode}.pkl')
test_rankings_path = join(data_home, 'testRanking-TREC21-Queries_Results.json')
test_query_prevs_path = join('data', 'prevelance_vectors_judged_docs.json')
results_home = join('results', 'modsel', class_name, data_size)
tfidf, classifier, conf_matrix = pickle.load(open(classifier_path, 'rb'))
experiment_prot = RetrievedSamples(
class_home,
test_rankings_path,
test_query_prevs_path,
vectorizer=tfidf,
class_name=class_name,
classes=classifier.classes_
)
for method_name, quantifier in methods(classifier):
results_path = join(results_home, method_name + '.pkl')
results = qp.util.pickled_resource(results_path, run_experiment)
for k in Ks:
table_mrae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mrae'][k])
Table.LatexPDF(f'./latex/modsel/{class_name}.pdf', tables=tables_mrae)

88
Retrieval/methods.py Normal file
View File

@ -0,0 +1,88 @@
"""
This file implements some of the methods presented in the FAccT'22 paper by
Ghazimatin, Kleindessner, Russell, Abedjan, and Golebiowski,
Measuring Fairness of Rankings under Noisy Sensitive Information.
In particular, it implements two variants of a method relying on M3=rND:
one in which the assumed graphical model is P(Â,A,S) = P(Â|A)*P(S|A) (called "b")
and another in which the assumed graphical model is P(Â,A,S) = P(Â|A)*P(S|Â) (called "d")
"""
import numpy as np
from abc import ABC, abstractmethod
from sklearn.metrics import confusion_matrix
from quapy.method.aggregative import CC
class AbstractM3rND(ABC):
def __init__(self, classifier):
self.quantifier = CC(classifier)
def proxy_labels(self, instances):
return self.quantifier.classify(instances)
def quantify(self, instances):
return self.quantifier.quantify(instances)
@abstractmethod
def fair_measure_correction(self, rND_estim: float, conf_matrix: np.ndarray):
...
def get_confusion_matrix(self, X, y, additive_smoothing=0.5):
"""
Some confusion matrices may contain 0 values for certain classes, and this causes
instabilities in the correction. If requested, applies additive smoothing. Default
is adding half a count.
:param X: array-like with the covariates
:param y: array-like with the true labels
:param additive_smoothing: float, default 0.5
:return: the confusion matrix C with entries Cij=P(Y=i,Ŷ=j)
"""
proxy_labels = self.proxy_labels(X)
true_labels = y
labels = self.quantifier.classes_
conf_matrix = confusion_matrix(true_labels, proxy_labels, labels=labels)
if additive_smoothing > 0:
conf_matrix = conf_matrix.astype(float) + additive_smoothing
return conf_matrix
class M3rND_ModelB(AbstractM3rND):
def __init__(self, classifier):
super().__init__(classifier)
def fair_measure_correction(self, rND_estim: float, conf_matrix: np.ndarray):
# conf_matrix contains values Cij=P(Y=i,Ŷ=j)
# truecond_matrix contains values Cij=P(Ŷ=j|Y=i) (truecond stands for "conditioned on true labels")
truecond_matrix = conf_matrix / conf_matrix.sum(axis=1, keepdims=True)
p = truecond_matrix[0, 1] # P(hat{A}=1|A=0)
q = truecond_matrix[1, 0] # P(hat{A}=0|A=1)
den = (1 - p - q)
if den != 0:
corr = 1./den
rND_estim = rND_estim * corr
return rND_estim
class M3rND_ModelD(AbstractM3rND):
def __init__(self, classifier):
super().__init__(classifier)
def fair_measure_correction(self, rND_estim: float, conf_matrix: np.ndarray):
# conf_matrix contains values Cij=P(Y=i,Ŷ=j)
# truecond_matrix contains values Cij=P(Ŷ=j|Y=i) (truecond stands for "conditioned on true labels")
truecond_matrix = conf_matrix / conf_matrix.sum(axis=1, keepdims=True)
prev_A = conf_matrix.sum(axis=1)
prev_A = prev_A / prev_A.sum()  # normalize the (smoothed) counts to class-prior probabilities
beta = prev_A[1]  # P(A=1)
p = truecond_matrix[0, 1] # P(hat{A}=1|A=0)
q = truecond_matrix[1, 0] # P(hat{A}=0|A=1)
x = (1 - q) * beta + p * (1 - beta)
y = q * beta + (1 - p) * (1 - beta)
if x != 0 and y != 0:
corr = ((((1 - q) * beta) / x) - (q * beta / y))
rND_estim = rND_estim * corr
return rND_estim
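
Restating the two corrections above in equation form, with p = P(\hat{A}=1 \mid A=0), q = P(\hat{A}=0 \mid A=1) and \beta = P(A=1) as in the code comments:

\text{model b:}\quad \widehat{rND}_{corr} = \frac{\widehat{rND}}{1 - p - q} \qquad (\text{applied only when } 1 - p - q \neq 0)

\text{model d:}\quad \widehat{rND}_{corr} = \widehat{rND} \cdot \left( \frac{(1-q)\beta}{(1-q)\beta + p(1-\beta)} - \frac{q\beta}{q\beta + (1-p)(1-\beta)} \right)

the bracketed factor in model d being P(A=1 \mid \hat{A}=1) - P(A=1 \mid \hat{A}=0).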

View File

@ -0,0 +1,123 @@
import itertools
import os.path
import pickle
import numpy as np
from Retrieval.experiments import methods
from Retrieval.commons import CLASS_NAMES, Ks, DATA_SIZES
from os.path import join
import matplotlib.pyplot as plt
data_home = 'data'
class_mode = 'multiclass'
method_names = [name for name, *other in methods(None, 'continent')]
all_results = {}
class_name_label = {
'continent': 'Geographic Location',
'gender': 'Gender',
'years_category': 'Age of Topic'
}
# loads all MRAE results, and returns a dictionary containing the values, which is indexed by:
# class_name -> data_size -> method_name -> k -> stat -> float
# where stat is "mean", "std", "max"
def load_all_results():
for class_name in CLASS_NAMES:
all_results[class_name] = {}
for data_size in DATA_SIZES:
all_results[class_name][data_size] = {}
results_home = join('results', class_name, class_mode, data_size)
all_results[class_name][data_size] = {}
for method_name in method_names:
results_path = join(results_home, method_name + '.pkl')
try:
results = pickle.load(open(results_path, 'rb'))
except Exception as e:
print(f'missing result {results_path}', e)
all_results[class_name][data_size][method_name] = {}
for k in Ks:
all_results[class_name][data_size][method_name][k] = {}
values = results['mrae']
all_results[class_name][data_size][method_name][k]['mean'] = np.mean(values[k])
all_results[class_name][data_size][method_name][k]['std'] = np.std(values[k])
all_results[class_name][data_size][method_name][k]['max'] = np.max(values[k])
return all_results
results = load_all_results()
# generates, for each class name (and for the smallest data size only), the MRAE plots in which:
# - the x-axis displays the Ks
for class_name in CLASS_NAMES:
for data_size in DATA_SIZES[:1]:
log = class_name=='gender'
fig, ax = plt.subplots()
max_means = []
markers = itertools.cycle(['o', 's', '^', 'D', 'v', '*', '+'])
for method_name in method_names:
# class_name -> data_size -> method_name -> k -> stat -> float
means = [
results[class_name][data_size][method_name][k]['mean'] for k in Ks
]
stds = [
results[class_name][data_size][method_name][k]['std'] for k in Ks
]
# max_mean = np.max([
# results[class_name][data_size][method_name][k]['max'] for k in Ks
# ])
max_means.append(max(means))
means = np.asarray(means)
stds = np.asarray(stds)
method_name = method_name.replace('NaiveQuery', 'Naive@$k$')
marker = next(markers)
line = ax.plot(Ks, means, 'o-', label=method_name, color=None, linewidth=3, markersize=10, marker=marker)
color = line[-1].get_color()
if log:
ax.set_yscale('log')
# ax.fill_between(Ks, means - stds, means + stds, alpha=0.3, color=color)
ax.grid(True, which='both', axis='y', color='gray', linestyle='--', linewidth=0.3)
ax.set_xlabel('k')
ax.set_ylabel('RAE' + (' (log scale)' if log else ''))
data_size_label = '$\mathcal{L}_{10\mathrm{K}}$'
ax.set_title(f'{class_name_label[class_name]} from {data_size_label}')
ax.set_ylim([0, max(max_means)*1.05])
if class_name == 'years_category':
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
os.makedirs(f'plots/var_k/{class_name}', exist_ok=True)
plotpath = f'plots/var_k/{class_name}/{data_size}_mrae.pdf'
print(f'saving plot in {plotpath}')
plt.savefig(plotpath, bbox_inches='tight')

View File

@ -0,0 +1,87 @@
import itertools
import os.path
from Retrieval.experiments import methods
from Retrieval.commons import CLASS_NAMES, Ks, DATA_SIZES
import matplotlib.pyplot as plt
from Retrieval.plot_mrae_xaxis_k import load_all_results
data_home = 'data'
class_mode = 'multiclass'
method_names = [name for name, *other in methods(None)]
all_results = {}
class_name_label = {
'continent': 'Geographic Location',
'gender': 'Gender',
'years_category': 'Age of Topic'
}
# loads all MRAE results, and returns a dictionary containing the values, which is indexed by:
# class_name -> data_size -> method_name -> k -> stat -> float
results = load_all_results()
# generates, for each class name (at a fixed k), the MRAE plots in which:
# - the x-axis displays the training pool sizes
# X_DATA_SIZES = [int(x.replace('K', '000').replace('M', '000000').replace('FULL', '3250000')) for x in DATA_SIZES]
X_DATA_SIZES = [x.replace('FULL', '3.25M') for x in DATA_SIZES]
for class_name in CLASS_NAMES:
for k in [100]: #Ks:
log = class_name=='gender'
fig, ax = plt.subplots()
max_means = []
markers = itertools.cycle(['o', 's', '^', 'D', 'v', '*', '+'])
for method_name in method_names:
# class_name -> data_size -> method_name -> k -> stat -> float
means = [
results[class_name][data_size][method_name][k]['mean'] for data_size in DATA_SIZES
]
stds = [
results[class_name][data_size][method_name][k]['std'] for data_size in DATA_SIZES
]
# max_mean = np.max([
# results[class_name][data_size][method_name][k]['max'] for data_size in DATA_SIZE
# ])
max_means.append(max(means))
style = 'o-' if method_name != 'CC' else '--'
method_name = method_name.replace('NaiveQuery', 'Naive@$k$')
marker=next(markers)
line = ax.plot(X_DATA_SIZES, means, style, label=method_name, color=None, linewidth=3, markersize=10, marker=marker)
color = line[-1].get_color()
if log:
ax.set_yscale('log')
# ax.fill_between(Ks, means - stds, means + stds, alpha=0.3, color=color)
ax.grid(True, which='both', axis='y', color='gray', linestyle='--', linewidth=0.3)
ax.set_xlabel('training pool size')
ax.set_ylabel('RAE' + (' (log scale)' if log else ''))
ax.set_title(f'{class_name_label[class_name]} at exposure {k=}')
ax.set_ylim([0, max(max_means)*1.05])
if class_name == 'years_category':
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
os.makedirs(f'plots/var_size/{class_name}', exist_ok=True)
plotpath = f'plots/var_size/{class_name}/{k}_mrae.pdf'
print(f'saving plot in {plotpath}')
plt.savefig(plotpath, bbox_inches='tight')

View File

@ -0,0 +1,93 @@
import os.path
import pickle
from itertools import zip_longest
from Retrieval.commons import RetrievedSamples, load_sample, DATA_SIZES
from os.path import join
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
"""
Plots the distribution of (predicted) relevance score for the test samples and for the training samples wrt:
- training pool size (10K, 50K, 100K, 500K, 1M, FULL)
- rank
"""
data_home = 'data'
up_to = 250
for class_name in ['continent']: # 'num_sitelinks_category', 'relative_pageviews_category', 'years_category', 'continent', 'gender']:
test_added = False
Mtrs, Mtes, source = [], [], []
for data_size in DATA_SIZES:
class_home = join(data_home, class_name, data_size)
classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl')
test_rankings_path = join(data_home, 'testRanking_Results.json')
test_query_prevs_path = join(data_home, 'prevelance_vectors_judged_docs.json')
_, classifier = pickle.load(open(classifier_path, 'rb'))
experiment_prot = RetrievedSamples(
class_home,
test_rankings_path,
test_query_prevs_path,
vectorizer=None,
class_name=class_name,
classes=classifier.classes_
)
Mtr = []
Mte = []
pbar = tqdm(experiment_prot(), total=experiment_prot.total())
for train, test, *_ in pbar:
Xtr, ytr, score_tr = train
Xte, yte, score_te = test
if len(score_tr) >= up_to:
Mtr.append(score_tr)
Mte.append(score_te)
Mtrs.append(Mtr)
if not test_added:
Mtes.append(Mte)
test_added = True
source.append(data_size)
fig, ax = plt.subplots()
# train_source = ['train-'+s for s in source]
train_source = ['$\mathcal{L}_{'+s.replace('FULL', '3.25M').replace('K','\mathrm{K}').replace('M','\mathrm{M}')+'}$' for s in source]
# Ms = list(zip(Mtrs, train_source))+list(zip(Mtes, ['test']))
Ms = list(zip(Mtrs, train_source)) + list(zip(Mtes, ['$\mathcal{U}_{(3.25\mathrm{M})}$']))
for M, source in Ms:
M = np.asarray(list(zip_longest(*M, fillvalue=np.nan))).T
num_rep, num_docs = M.shape
mean_values = np.nanmean(M, axis=0)
n_filled = np.count_nonzero(~np.isnan(M), axis=0)
std_errors = np.nanstd(M, axis=0) / np.sqrt(n_filled)
line = ax.plot(range(num_docs), mean_values, '-', label=source, color=None)
color = line[-1].get_color()
ax.fill_between(range(num_docs), mean_values - std_errors, mean_values + std_errors, alpha=0.3, color=color)
ax.set_xlabel('rank ($k$)')
ax.set_ylabel('predicted relevance score')
ax.set_title(class_name.replace('continent', 'Geographic Location'))
ax.set_xlim((0,up_to))
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
# plt.show()
os.makedirs('plots', exist_ok=True)
plotpath = f'plots/{class_name}_rel_distrbution.pdf'
print(f'saving plot in {plotpath}')
plt.savefig(plotpath, bbox_inches='tight')
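The zip_longest step above pads rankings of unequal length with NaNs so that per-rank means and standard errors can be computed; a self-contained sketch of the same idea on toy data:
from itertools import zip_longest
import numpy as np

# toy score lists of unequal length (three retrieved rankings)
rankings = [[0.9, 0.7, 0.5], [0.8, 0.6], [0.95, 0.75, 0.55, 0.4]]
M = np.asarray(list(zip_longest(*rankings, fillvalue=np.nan))).T  # shape (num_rankings, max_rank)
mean_values = np.nanmean(M, axis=0)                    # per-rank mean, ignoring the NaN padding
n_filled = np.count_nonzero(~np.isnan(M), axis=0)      # how many rankings reach each rank
std_errors = np.nanstd(M, axis=0) / np.sqrt(n_filled)  # per-rank standard error
print(mean_values, std_errors)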

16 Retrieval/tmp.py Normal file
View File

@@ -0,0 +1,16 @@
import pandas as pd
from os.path import join
from quapy.data import LabelledCollection
data_home = 'data'
CLASS_NAME = 'continent'
datasize = '100K'
file_path = join(data_home, 'prevelance_vectors_judged_docs.json')
df = pd.read_json(file_path)
pd.set_option('display.max_columns', None)
print(df)

View File

@@ -11,7 +11,7 @@ from . import util
from . import model_selection
from . import classification
__version__ = '0.1.8'
__version__ = '0.1.9'
environ = {
'SAMPLE_SIZE': None,

View File

@@ -158,8 +158,8 @@ def kld(prevs, prevs_hat, eps=None):
:return: Kullback-Leibler divergence between the two distributions
"""
eps = __check_eps(eps)
smooth_prevs = prevs + eps
smooth_prevs_hat = prevs_hat + eps
smooth_prevs = smooth(prevs, eps)
smooth_prevs_hat = smooth(prevs_hat, eps)
return (smooth_prevs*np.log(smooth_prevs/smooth_prevs_hat)).sum(axis=-1)
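The fix replaces plain additive smoothing (which leaves the vectors summing to more than 1) with the smooth helper. A hedged sketch of a smoothing function consistent with this call (the exact definition in quapy.error may differ):
import numpy as np

def smooth(prevs, eps):
    # additive smoothing followed by renormalization, so the result still sums to 1
    n_classes = prevs.shape[-1]
    return (prevs + eps) / (eps * n_classes + 1)

prevs = np.asarray([0.0, 0.5, 0.5])      # true prevalence with a zero entry
prevs_hat = np.asarray([0.1, 0.4, 0.5])  # estimated prevalence
p, q = smooth(prevs, 1e-8), smooth(prevs_hat, 1e-8)
print((p * np.log(p / q)).sum())         # finite KLD despite the zero entry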

View File

@@ -141,6 +141,19 @@ def uniform_prevalence_sampling(n_classes, size=1):
return u
def uniform_prevalence(n_classes):
"""
Returns a vector representing the uniform distribution over `n_classes` classes
:param n_classes: number of classes
:return: np.ndarray with all values 1/n_classes
"""
assert isinstance(n_classes, int) and n_classes>0, \
(f'param {n_classes} not understood; must be a positive integer representing the '
f'number of classes ')
return np.full(shape=n_classes, fill_value=1./n_classes)
uniform_simplex_sampling = uniform_prevalence_sampling
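Usage of the new helper is straightforward (assuming the development version that includes this addition):
import quapy.functional as F

print(F.uniform_prevalence(4))              # [0.25 0.25 0.25 0.25]
print(F.uniform_prevalence_sampling(3, 2))  # two prevalence vectors drawn uniformly at random from the simplex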

View File

@@ -52,7 +52,7 @@ class KDEBase:
"""
return np.exp(kde.score_samples(X))
def get_mixture_components(self, X, y, n_classes, bandwidth):
def get_mixture_components(self, X, y, classes, bandwidth):
"""
Returns an array containing the mixture components, i.e., the KDE functions for each class.
@@ -62,7 +62,13 @@
:param bandwidth: float, the bandwidth of the kernel
:return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates
"""
return [self.get_kde_function(X[y == cat], bandwidth) for cat in range(n_classes)]
class_cond_X = []
for cat in classes:
selX = X[y==cat]
if selX.size==0:
selX = [F.uniform_prevalence(len(classes))]
class_cond_X.append(np.asarray(selX))
return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X]
@@ -114,7 +120,7 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
self.random_state=random_state
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.n_classes, self.bandwidth)
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
return self
def aggregate(self, posteriors: np.ndarray):
@@ -196,7 +202,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
self.montecarlo_trials = montecarlo_trials
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.n_classes, self.bandwidth)
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
N = self.montecarlo_trials
rs = self.random_state
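The change indexes the mixture components by the actual class labels rather than by position, and guards against classes with no classifier predictions. A minimal, self-contained sketch of class-conditional KDEs in the same spirit, using scikit-learn's KernelDensity (as get_kde_function presumably does):
import numpy as np
from sklearn.neighbors import KernelDensity

def class_conditional_kdes(X, y, classes, bandwidth=0.1):
    # one KDE per class, fitted on the posteriors of that class only
    kdes = {}
    for cat in classes:
        selX = X[y == cat]
        # the patch above falls back to a single uniform dummy point for empty classes;
        # this sketch simply requires every class to be represented
        assert selX.shape[0] > 0, f'class {cat} has no training posteriors'
        kdes[cat] = KernelDensity(bandwidth=bandwidth).fit(selX)
    return kdes

rng = np.random.default_rng(0)
X = rng.random((100, 3))
X /= X.sum(axis=1, keepdims=True)  # toy posterior probabilities
y = rng.choice(['A', 'B', 'C'], size=100)
kdes = class_conditional_kdes(X, y, classes=['A', 'B', 'C'])
print(np.exp(kdes['A'].score_samples(X[:5])))  # densities of 5 points under class A's KDE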

View File

@@ -2,7 +2,7 @@ from abc import ABC, abstractmethod
from copy import deepcopy
from typing import Callable, Union
import numpy as np
from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling
from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling, PlattScaling
from scipy import optimize
from sklearn.base import BaseEstimator
from sklearn.calibration import CalibratedClassifierCV
@@ -636,18 +636,35 @@ class EMQ(AggregativeSoftQuantifier):
calibrator = TempScaling()
elif self.recalib == 'vs':
calibrator = VectorScaling()
elif self.recalib == 'platt':
calibrator = CalibratedClassifierCV(estimator=self.classifier, cv='prefit')
else:
raise ValueError('invalid param argument for recalibration method; available ones are '
'"nbvs", "bcts", "ts", "vs", and "platt".')
self.calibration_function = calibrator(P, np.eye(data.n_classes)[y], posterior_supplied=True)
if not np.issubdtype(y.dtype, np.number):
y = np.searchsorted(data.classes_, y)
if self.recalib == 'platt':
self.classifier = calibrator.fit(*data.Xy)
else:
try:
self.calibration_function = calibrator(P, np.eye(data.n_classes)[y], posterior_supplied=True)
except RuntimeError as e:
print(e)
print('calibration failed; defaulting to the identity calibration function')
self.calibration_function = lambda P: P
if self.exact_train_prev:
self.train_prevalence = data.prevalence()
else:
train_posteriors = classif_predictions.X
if self.recalib is not None:
train_posteriors = self.calibration_function(train_posteriors)
if self.recalib == 'platt':
train_posteriors = self.classifier.predict_proba(train_posteriors)
else:
train_posteriors = self.calibration_function(train_posteriors)
self.train_prevalence = F.prevalence_from_probabilities(train_posteriors)
def aggregate(self, classif_posteriors, epsilon=EPSILON):
@@ -681,6 +698,11 @@
"""
Px = posterior_probabilities
Ptr = np.copy(tr_prev)
if np.prod(Ptr) == 0:  # some entry is 0; smooth the values to avoid division by zero
Ptr += epsilon
Ptr /= Ptr.sum()
qs = np.copy(Ptr) # qs (the running estimate) is initialized as the training prevalence
s, converged = 0, False
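The smoothing of Ptr matters because, in the EM recurrence, posteriors are reweighted by the ratio of the running estimate to the training prevalence; a class that starts at exactly zero can never recover any mass. A simplified sketch of the iteration (an illustration of the idea, not the library code):
import numpy as np

def em_iteration(posteriors, tr_prev, epsilon=1e-4, max_iter=1000, tol=1e-6):
    Ptr = np.copy(tr_prev)
    if np.prod(Ptr) == 0:   # some class has zero training prevalence: smooth and renormalize
        Ptr += epsilon
        Ptr /= Ptr.sum()
    qs = np.copy(Ptr)       # running estimate, initialized at the training prevalence
    for _ in range(max_iter):
        ps = posteriors * (qs / Ptr)       # E-step: reweight the posteriors
        ps /= ps.sum(axis=1, keepdims=True)
        qs_new = ps.mean(axis=0)           # M-step: new prevalence estimate
        converged = np.abs(qs_new - qs).max() < tol
        qs = qs_new
        if converged:
            break
    return qs

posteriors = np.random.dirichlet([2, 5, 3], size=500)  # toy classifier posteriors
print(em_iteration(posteriors, tr_prev=np.asarray([0.0, 0.6, 0.4])))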

View File

@@ -1,5 +1,6 @@
from typing import Union, Callable
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from quapy.functional import get_divergence
from quapy.data import LabelledCollection
@@ -146,6 +147,53 @@ class DMx(BaseQuantifier):
return F.argmin_prevalence(loss, n_classes, method=self.search)
class ReadMe(BaseQuantifier):
def __init__(self, bootstrap_trials=100, bootstrap_range=100, bagging_trials=100, bagging_range=25, **vectorizer_kwargs):
self.bootstrap_trials = bootstrap_trials
self.bootstrap_range = bootstrap_range
self.bagging_trials = bagging_trials
self.bagging_range = bagging_range
self.vectorizer_kwargs = vectorizer_kwargs
def fit(self, data: LabelledCollection):
X, y = data.Xy
self.vectorizer = CountVectorizer(binary=True, **self.vectorizer_kwargs)
X = self.vectorizer.fit_transform(X)
self.class_conditional_X = {c: X[y == c] for c in data.classes_}
return self
def quantify(self, instances):
X = self.vectorizer.transform(instances)
# number of documents and features
num_docs, num_feats = X.shape
# bootstrap
p_boots = []
for _ in range(self.bootstrap_trials):
docs_idx = np.random.choice(num_docs, size=self.bootstrap_range, replace=False)
class_conditional_X = {i: X[docs_idx] for i, X in self.class_conditional_X.items()}
Xboot = X[docs_idx]
# bagging
p_bags = []
for _ in range(self.bagging_trials):
feat_idx = np.random.choice(num_feats, size=self.bagging_range, replace=False)
class_conditional_Xbag = {i: X[:, feat_idx] for i, X in class_conditional_X.items()}
Xbag = Xboot[:,feat_idx]
p = self.std_constrained_linear_ls(Xbag, class_conditional_Xbag)
p_bags.append(p)
p_boots.append(np.mean(p_bags, axis=0))
p_mean = np.mean(p_boots, axis=0)
p_std = np.std(p_boots, axis=0)
return p_mean
def std_constrained_linear_ls(self, X, class_cond_X: dict):
pass
def _get_features_range(X):
feat_ranges = []
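The std_constrained_linear_ls step is left unimplemented in this commit. For reference, a hedged sketch of what a simplex-constrained least-squares step in the spirit of Hopkins & King's ReadMe could look like (an illustration only, not the method this class will eventually implement):
import numpy as np
from scipy.optimize import minimize

def simplex_constrained_ls(Xbag, class_cond_Xbag: dict):
    # mean feature profile of the unlabelled bag and of each class-conditional training bag
    x_mean = np.asarray(Xbag.mean(axis=0)).ravel()
    A = np.vstack([np.asarray(Xc.mean(axis=0)).ravel() for _, Xc in sorted(class_cond_Xbag.items())])
    n_classes = A.shape[0]

    def loss(p):
        return np.linalg.norm(x_mean - p @ A) ** 2

    p0 = np.full(n_classes, 1. / n_classes)
    sol = minimize(loss, p0, method='SLSQP',
                   bounds=[(0., 1.)] * n_classes,
                   constraints={'type': 'eq', 'fun': lambda p: p.sum() - 1})
    return sol.x

rng = np.random.default_rng(0)
# toy binary features: class 0 activates features with prob 0.2, class 1 with prob 0.8
class_cond = {0: rng.binomial(1, 0.2, (50, 10)), 1: rng.binomial(1, 0.8, (50, 10))}
Xbag = np.vstack([rng.binomial(1, 0.2, (10, 10)), rng.binomial(1, 0.8, (40, 10))])
print(simplex_constrained_ls(Xbag, class_cond))  # expected to be close to [0.2, 0.8]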

View File

@@ -56,6 +56,7 @@ def parallel(func, args, n_jobs, seed=None, asarray=True, backend='loky'):
:param seed: the numeric seed
:param asarray: set to True to return a np.ndarray instead of a list
:param backend: indicates the backend used for handling parallel work
:param open_args: if True, then the delayed function is called on *args_i, instead of on args_i
"""
def func_dec(environ, seed, *args):
qp.environ = environ.copy()
@@ -74,6 +75,40 @@ def parallel(func, args, n_jobs, seed=None, asarray=True, backend='loky'):
return out
def parallel_unpack(func, args, n_jobs, seed=None, asarray=True, backend='loky'):
"""
A wrapper of multiprocessing:
>>> Parallel(n_jobs=n_jobs)(
>>> delayed(func)(*args_i) for args_i in args
>>> )
that takes the `quapy.environ` variable as input silently.
Seeds the child processes to ensure reproducibility when n_jobs>1.
:param func: callable
:param args: a list of argument tuples; each tuple is unpacked as the positional arguments of func
:param n_jobs: the number of parallel workers
:param seed: the numeric seed
:param asarray: set to True to return a np.ndarray instead of a list
:param backend: indicates the backend used for handling parallel work
"""
def func_dec(environ, seed, *args):
qp.environ = environ.copy()
qp.environ['N_JOBS'] = 1
# set a context with a temporary seed to ensure results are reproducible in parallel
with ExitStack() as stack:
if seed is not None:
stack.enter_context(qp.util.temp_seed(seed))
return func(*args)
out = Parallel(n_jobs=n_jobs, backend=backend)(
delayed(func_dec)(qp.environ, None if seed is None else seed + i, *args_i) for i, args_i in enumerate(args)
)
if asarray:
out = np.asarray(out)
return out
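A small usage sketch of the new helper, assuming the development version that includes this addition (contrast with parallel, which passes each args_i as a single positional argument):
import quapy as qp

def power(base, exponent):
    return base ** exponent

# each tuple in args is unpacked, i.e., the calls are power(2, 3), power(3, 2), power(5, 1)
out = qp.util.parallel_unpack(power, args=[(2, 3), (3, 2), (5, 1)], n_jobs=2, seed=0)
print(out)  # -> [8 9 5]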
@contextlib.contextmanager
def temp_seed(random_state):
"""