1
0
Fork 0

first commit, some ideas already explored

This commit is contained in:
Alejandro Moreo Fernandez 2024-02-23 16:42:31 +01:00
parent b3ccf71edb
commit 1c03dd651b
6 changed files with 685 additions and 0 deletions

74
Retrieval/commons.py Normal file
View File

@ -0,0 +1,74 @@
import pandas as pd
import numpy as np
from glob import glob
from os.path import join
from quapy.data import LabelledCollection
from quapy.protocol import AbstractProtocol
def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
    """Load texts and continent labels from a tab-separated file.

    Rows labelled ``'Antarctica'`` are always discarded. When ``parse_columns``
    is true, the remaining rows are sorted by ascending ``rank`` before the
    optional truncation to ``max_lines``.

    :param path: path of the tab-separated file; it must contain the columns
        ``text`` and ``continent``, plus ``rank`` when ``parse_columns`` is true
    :param parse_columns: whether to order the rows by the ``rank`` column
    :param verbose: whether to print a progress message while loading
    :param max_lines: maximum number of rows to return (None returns all rows)
    :return: a tuple ``(X, y)`` with the texts and their labels
    """
    if verbose:
        print(f'loading {path}...', end='')
    df = pd.read_csv(path, sep='\t')
    if verbose:
        print('[done]')

    X = df['text'].values
    y = df['continent'].values

    # build the filter once and apply it to every column that is actually used
    keep = y != 'Antarctica'
    X, y = X[keep], y[keep]

    if parse_columns:
        # only the rank decides the order; the 'score' column is never needed
        order = np.argsort(df['rank'].values[keep])
        X, y = X[order], y[order]

    if max_lines is not None:
        X, y = X[:max_lines], y[:max_lines]

    return X, y
class RetrievedSamples(AbstractProtocol):
    """Protocol yielding one (training sample, test sample) pair per ranking file.

    The test files are those matching
    ``<path_dir>/test_rankings/test_rankingstraining_rankings_*.txt``. The path
    of the paired training file is obtained by replacing every occurrence of
    ``'test_'`` with ``'training_'`` in the test file path.

    :param path_dir: directory containing the ranking files
    :param load_fn: callable invoked as ``load_fn(path, parse_columns=True,
        max_lines=...)`` that returns the pair ``(X, y)``
    :param vectorizer: object whose ``transform`` method is applied to the
        loaded documents
    :param max_train_lines: ``max_lines`` value passed when loading training files
    :param max_test_lines: ``max_lines`` value passed when loading test files
    :param classes: classes passed to the training ``LabelledCollection``
    """

    def __init__(self, path_dir: str, load_fn, vectorizer, max_train_lines=None, max_test_lines=None, classes=None):
        self.path_dir = path_dir
        self.load_fn = load_fn
        self.vectorizer = vectorizer
        self.max_train_lines = max_train_lines
        self.max_test_lines = max_test_lines
        self.classes = classes

    def __call__(self):
        """Yield ``(train_sample, test_sample)`` for every test ranking file.

        ``(None, None)`` is yielded in place of a pair whose test collection
        cannot be built (``ValueError``).
        """
        for file in glob(join(self.path_dir, 'test_rankings', 'test_rankingstraining_rankings_*.txt')):
            # NOTE(review): str.replace acts on the whole path, so a path_dir
            # containing 'test_' would be rewritten as well.
            X, y = self.load_fn(file.replace('test_', 'training_'), parse_columns=True, max_lines=self.max_train_lines)
            X = self.vectorizer.transform(X)
            train_sample = LabelledCollection(X, y, classes=self.classes)

            X, y = self.load_fn(file, parse_columns=True, max_lines=self.max_test_lines)
            X = self.vectorizer.transform(X)
            try:
                test_sample = LabelledCollection(X, y, classes=train_sample.classes_)
            except ValueError as e:
                print(f'file {file} caused error {e}')
                yield None, None
                # skip this file: without this, the code below would yield an
                # unbound (or stale, from the previous file) test_sample
                continue

            yield train_sample, test_sample

161
Retrieval/fourth.py Normal file
View File

@ -0,0 +1,161 @@
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import quapy as qp
import quapy.functional as F
from Retrieval.commons import RetrievedSamples, load_txt_sample
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
from tqdm import tqdm
"""
In this fourth experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set as
in the third experiment, and the fairness groups are defined upon geographic info as in the third case.
The difference here is that the data Li and Ui have been drawn by retrieving query-related documents from
a pool of the same size.
For now: 1000 in training and 100 in test.
It seems that there is now very little shift.
"""
def cls(classifier_trained=None):
    """Return the classifier to be plugged into a quantifier.

    :param classifier_trained: an existing classifier to reuse, or None
    :return: ``classifier_trained`` itself when one is given, otherwise a new
        ``LogisticRegression`` with default hyperparameters
    """
    if classifier_trained is not None:
        return classifier_trained
    # alternative considered by the author: LinearSVC()
    return LogisticRegression()
def methods(classifier_trained=None):
    """Lazily yield the ``(name, quantifier)`` pairs evaluated in this experiment.

    Every aggregative quantifier receives the result of
    ``cls(classifier_trained)`` as its classifier.

    :param classifier_trained: forwarded to ``cls`` for each quantifier
    :return: generator of ``(str, quantifier)`` tuples
    """
    def learner():
        return cls(classifier_trained)

    yield ('CC', ClassifyAndCount(learner()))
    yield ('PACC', PACC(learner(), val_split=5, n_jobs=-1))
    yield ('EMQ', EMQ(learner(), exact_train_prev=True))
    yield ('EMQh', EMQ(learner(), exact_train_prev=False))

    # EMQ variants with recalibration ('vs' was tried by the author and left disabled)
    for name, recalib in (('EMQ-BCTS', 'bcts'), ('EMQ-TS', 'ts'), ('EMQ-NBVS', 'nbvs')):
        yield (name, EMQ(learner(), exact_train_prev=False, recalib=recalib))

    yield ('PCC', PCC(learner()))
    yield ('ACC', ACC(learner(), val_split=5, n_jobs=-1))

    # KDEy-ML at several bandwidths (the author flagged 0.005 with "wow!")
    kde_settings = (
        ('KDE001', 0.001),
        ('KDE005', 0.005),
        ('KDE01', 0.01),
        ('KDE02', 0.02),
        ('KDE03', 0.03),
        ('KDE05', 0.05),
        ('KDE07', 0.07),
        ('KDE10', 0.10),
    )
    for name, bandwidth in kde_settings:
        yield (name, KDEyML(learner(), val_split=5, n_jobs=-1, bandwidth=bandwidth))

    yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
def train_classifier():
    """Fit the tf-idf vectorizer and a tuned logistic regression on the training file.

    Reads the module-level ``train_path`` and ``REDUCE_TR``. When ``REDUCE_TR``
    is positive, the training collection is first resampled to ``REDUCE_TR``
    documents, passing its own prevalence values to ``sampling``.

    :return: a tuple ``(tfidf, classifier_trained)`` with the fitted vectorizer
        and the best estimator found by the grid search
    """
    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)

    training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)

    if REDUCE_TR > 0:
        print('Reducing the number of documents in the training to', REDUCE_TR)
        training = training.sampling(REDUCE_TR, *training.prevalence())

    Xtr, ytr = training.Xy
    Xtr = tfidf.fit_transform(Xtr)
    print('L orig shape = ', Xtr.shape)

    training = LabelledCollection(Xtr, ytr)

    print('training classifier')
    # 5-fold grid search over the regularization strength and the class weighting
    grid_search = GridSearchCV(LogisticRegression(),
                               param_grid={'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]},
                               n_jobs=-1, cv=5)
    grid_search.fit(Xtr, ytr)
    classifier_trained = grid_search.best_estimator_
    print('[Done!]')

    print('training classes:', training.classes_)
    print('training prevalence:', training.prevalence())

    return tfidf, classifier_trained
# ---- experiment configuration ----
RANK_AT_K = 1000   # used as quapy's SAMPLE_SIZE and as max_test_lines below
REDUCE_TR = 50000  # read by train_classifier(): training is resampled to this size when > 0
qp.environ['SAMPLE_SIZE'] = RANK_AT_K
data_path = './50_50_split_trec'
train_path = join(data_path, 'train_50_50_continent.txt')
# NOTE(review): pickled_resource presumably caches the (tfidf, classifier) pair in
# 'classifier.pkl' and calls train_classifier only when that file is missing -- confirm.
tfidf, classifier_trained = qp.util.pickled_resource('classifier.pkl', train_classifier)
# flag checked in the evaluation loop: when set, quantifiers are fitted with fit_classifier=False
trained=True
experiment_prot = RetrievedSamples(data_path,
                                   load_fn=load_txt_sample,
                                   vectorizer=tfidf,
                                   max_train_lines=None,
                                   max_test_lines=RANK_AT_K, classes=classifier_trained.classes_)
# method name -> mean error over all evaluated (train, test) pairs
result_mae_dict = {}
result_mrae_dict = {}
for method_name, quantifier in methods(classifier_trained):
    # print('Starting with method=', method_name)
    mae_errors = []
    mrae_errors = []
    # total=49 is a hard-coded number of pairs, passed only to tqdm
    pbar = tqdm(experiment_prot(), total=49)
    for train, test in pbar:
        # the protocol yields (None, None) for files it could not load
        if train is not None:
            try:
                # print(train.prevalence())
                # print(test.prevalence())
                if trained and method_name!='MLPE':
                    # presumably reuses the already trained classifier and uses the
                    # query-specific training sample as validation data -- confirm
                    quantifier.fit(train, val_split=train, fit_classifier=False)
                else:
                    quantifier.fit(train)
                estim_prev = quantifier.quantify(test.instances)
                mae = qp.error.mae(test.prevalence(), estim_prev)
                mae_errors.append(mae)
                mrae = qp.error.mrae(test.prevalence(), estim_prev)
                mrae_errors.append(mrae)
                # print()
                # print('Training prevalence:', F.strprev(train.prevalence()), 'shape', train.X.shape)
                # print('Test prevalence:', F.strprev(test.prevalence()), 'shape', test.X.shape)
                # print('Estim prevalence:', F.strprev(estim_prev))
            except Exception as e:
                # best effort: a failing pair is reported and left out of the averages
                print(f'wow, something happened here! skipping; {e}')
        else:
            print('skipping one!')
        # running averages; the error lists are still empty if every pair so far was skipped
        pbar.set_description(f'{method_name}\tmae={np.mean(mae_errors):.4f}\tmrae={np.mean(mrae_errors):.4f}')
    print()
    result_mae_dict[method_name] = np.mean(mae_errors)
    result_mrae_dict[method_name] = np.mean(mrae_errors)
# ---- final report: one line per method ----
print('Results\n'+('-'*100))
for method_name in result_mae_dict.keys():
    MAE = result_mae_dict[method_name]
    MRAE = result_mrae_dict[method_name]
    print(f'{method_name}\t{MAE=:.5f}\t{MRAE=:.5f}')

View File

@ -0,0 +1,98 @@
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
import quapy.functional as F
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
"""
This was the very first experiment. 1 big training set and many test rankings produced according to some queries.
The quantification methods did not seem to work. The more sophisticated the method is, the worse it performed.
This is a clear indication that the PPS assumptions do not hold.
Actually, while the training set could be some iid sample from a distribution L and every test set
is an iid sample from a distribution U, it is pretty clear that P(X|Y) is different, since the test sets
are biased towards a query term whereas the training set is not.
"""
def methods():
    """Lazily yield the ``(name, quantifier)`` pairs compared in this experiment.

    MLPE comes first; every other quantifier wraps its own
    ``LogisticRegression(n_jobs=-1)``.
    """
    yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
    aggregative = (
        ('CC', ClassifyAndCount),
        ('ACC', ACC),
        ('PCC', PCC),
        ('PACC', PACC),
        ('EMQ', EMQ),
    )
    for name, quantifier_class in aggregative:
        yield (name, quantifier_class(LogisticRegression(n_jobs=-1)))
def load_txt_sample(path, verbose=False):
    """Read a tab-separated file and return its text and label columns.

    :param path: path of the tab-separated file, which must contain the columns
        ``text`` and ``first_letter_category``
    :param verbose: whether to print a progress message while loading
    :return: a tuple ``(X, y)`` with the two columns as selected from the
        loaded DataFrame
    """
    if verbose:
        print(f'loading {path}...', end='')
    frame = pd.read_csv(path, sep='\t')
    if verbose:
        print('[done]')
    return frame['text'], frame['first_letter_category']
class RetrievedSamples(AbstractProtocol):
    """Protocol yielding one test sample per file matching ``<path_dir>/test_data_*.txt``.

    :param path_dir: directory containing the test files
    :param load_fn: callable invoked as ``load_fn(path)`` returning ``(X, y)``
    :param vectorizer: object whose ``transform`` method is applied to the documents
    :param classes: classes passed to every ``LabelledCollection``
    """

    def __init__(self, path_dir: str, load_fn, vectorizer, classes):
        self.path_dir, self.load_fn = path_dir, load_fn
        self.vectorizer, self.classes = vectorizer, classes

    def __call__(self):
        """Yield ``sample.Xp`` for each test file, warning when its size is unexpected."""
        pattern = join(self.path_dir, 'test_data_*.txt')
        for file in glob(pattern):
            X, y = self.load_fn(file)
            # a size mismatch with the configured SAMPLE_SIZE is only reported, not enforced
            if len(X) != qp.environ['SAMPLE_SIZE']:
                print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
            sample = LabelledCollection(self.vectorizer.transform(X), y, classes=self.classes)
            yield sample.Xp
# ---- experiment configuration ----
# expected size of each test file; RetrievedSamples only warns on a mismatch
qp.environ['SAMPLE_SIZE']=100
data_path = './data'
train_path = join(data_path, 'train_data.txt')
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5)
training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True)
# training = training.sampling(1000)
# the vectorizer is fitted on the training documents and reused for every test sample
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('Xtr shape = ', Xtr.shape)
training = LabelledCollection(Xtr, ytr)
classes = training.classes_
test_prot = RetrievedSamples(data_path, load_fn=load_txt_sample, vectorizer=tfidf, classes=classes)
print('Training prevalence:', F.strprev(training.prevalence()))
# first pass over the protocol: only prints the prevalence of each test sample
for X, p in test_prot():
    print('Test prevalence:', F.strprev(p))
# every method is trained once on the single training set and evaluated on all test samples
for method_name, quantifier in methods():
    print('training ', method_name)
    quantifier.fit(training)
    print('[done]')
    report = qp.evaluation.evaluation_report(quantifier, test_prot, error_metrics=['mae', 'mrae'], verbose=True)
    print(report.mean())

View File

@ -0,0 +1,131 @@
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
import quapy.functional as F
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
from tqdm import tqdm
"""
In this second experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set.
Both elements in the pair are *retrieved according to the same query*. This is a way to impose
the same type of bias that was present in the test, to the training set. Let's see...
"""
def methods():
    """Lazily yield the ``(name, quantifier)`` pairs compared in this experiment.

    Each aggregative quantifier wraps its own default ``LogisticRegression``;
    PACC and ACC additionally receive ``val_split=5`` and ``n_jobs=-1``.
    """
    cross_val = dict(val_split=5, n_jobs=-1)
    yield ('PACC', PACC(LogisticRegression(), **cross_val))
    for name, quantifier_class in (('CC', ClassifyAndCount), ('EMQ', EMQ), ('PCC', PCC)):
        yield (name, quantifier_class(LogisticRegression()))
    yield ('ACC', ACC(LogisticRegression(), **cross_val))
    yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
    """Load texts and category labels from a tab-separated file.

    When ``parse_columns`` is true, the rows are sorted by ascending ``rank``
    before the optional truncation to ``max_lines``.

    :param path: path of the tab-separated file; it must contain the columns
        ``text`` and ``first_letter_category``, plus ``rank`` when
        ``parse_columns`` is true
    :param parse_columns: whether to order the rows by the ``rank`` column
    :param verbose: whether to print a progress message while loading
    :param max_lines: maximum number of rows to return (None returns all rows)
    :return: a tuple ``(X, y)`` with the texts and their labels
    """
    if verbose:
        print(f'loading {path}...', end='')
    df = pd.read_csv(path, sep='\t')
    if verbose:
        print('[done]')

    X = df['text'].values
    y = df['first_letter_category'].values

    if parse_columns:
        # only the rank decides the order; the 'score' column is never needed
        order = np.argsort(df['rank'].values)
        X, y = X[order], y[order]

    if max_lines is not None:
        X, y = X[:max_lines], y[:max_lines]

    return X, y
class RetrievedSamples(AbstractProtocol):
    """Protocol yielding one (training sample, test sample) pair per ranking file.

    The test files are those matching ``<path_dir>/test_rankings_*.txt``; the
    path of the paired training file is obtained by replacing every occurrence
    of ``'test_'`` with ``'training_'`` in the test file path.

    :param path_dir: directory containing the ranking files
    :param load_fn: callable invoked as ``load_fn(path, parse_columns=True,
        max_lines=...)`` returning ``(X, y)``
    :param vectorizer: object whose ``transform`` method is applied to the documents
    :param classes: classes passed to both ``LabelledCollection`` objects
    :param max_train_lines: ``max_lines`` value used for training files
    :param max_test_lines: ``max_lines`` value used for test files
    """

    def __init__(self, path_dir: str, load_fn, vectorizer, classes, max_train_lines=None, max_test_lines=None):
        self.path_dir, self.load_fn = path_dir, load_fn
        self.vectorizer, self.classes = vectorizer, classes
        self.max_train_lines, self.max_test_lines = max_train_lines, max_test_lines

    def __call__(self):
        """Yield ``(train_sample, test_sample)`` for every test ranking file."""
        for file in glob(join(self.path_dir, 'test_rankings_*.txt')):
            training_file = file.replace('test_', 'training_')
            Xtr, ytr = self.load_fn(training_file, parse_columns=True, max_lines=self.max_train_lines)
            train_sample = LabelledCollection(self.vectorizer.transform(Xtr), ytr, classes=self.classes)

            X, y = self.load_fn(file, parse_columns=True, max_lines=self.max_test_lines)
            # a size mismatch with the configured SAMPLE_SIZE is only reported, not enforced
            if len(X) != qp.environ['SAMPLE_SIZE']:
                print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
            test_sample = LabelledCollection(self.vectorizer.transform(X), y, classes=self.classes)

            yield train_sample, test_sample
# ---- experiment configuration ----
RANK_AT_K = 500    # used as quapy's SAMPLE_SIZE and as the line limit for both rankings
REDUCE_TR = 50000  # when > 0, the big training set is resampled to this many documents
qp.environ['SAMPLE_SIZE'] = RANK_AT_K
data_path = './newCollection'
train_path = join(data_path, 'train_data.txt')
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
if REDUCE_TR>0:
    print('Reducing the number of documents in the training to', REDUCE_TR)
    training = training.sampling(REDUCE_TR)
# in the visible code, the big training set only fits the vectorizer and fixes the classes;
# the quantifiers below are fitted on the per-query training samples
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('L orig shape = ', Xtr.shape)
training = LabelledCollection(Xtr, ytr)
classes = training.classes_
experiment_prot = RetrievedSamples(data_path,
                                   load_fn=load_txt_sample,
                                   vectorizer=tfidf,
                                   classes=classes,
                                   max_train_lines=RANK_AT_K,
                                   max_test_lines=RANK_AT_K)
for method_name, quantifier in methods():
    print('Starting with method=', method_name)
    errors = []
    # total=49 is a hard-coded number of pairs, passed only to tqdm
    pbar = tqdm(experiment_prot(), total=49)
    for train, test in pbar:
        # print('Training prevalence:', F.strprev(training.prevalence()), 'shape', train.X.shape)
        # print('Test prevalence:', F.strprev(test.prevalence()), 'shape', test.X.shape)
        # the quantifier is re-fitted on the training sample retrieved for the same query
        quantifier.fit(train)
        estim_prev = quantifier.quantify(test.instances)
        mae = qp.error.mae(test.prevalence(), estim_prev)
        errors.append(mae)
        # running mean of the absolute error over the pairs seen so far
        pbar.set_description(f'mae={np.mean(errors):.4f}')
    print()

155
Retrieval/previous/third.py Normal file
View File

@ -0,0 +1,155 @@
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
import quapy.functional as F
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
from tqdm import tqdm
"""
In this third experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set as
in the second experiment, but in this case the fairness groups are defined upon geographic info.
"""
def methods():
    """Lazily yield the ``(name, quantifier)`` pairs compared in this experiment.

    Each aggregative quantifier wraps its own default ``LogisticRegression``;
    PACC and ACC additionally receive ``val_split=5`` and ``n_jobs=-1``.
    """
    cross_val = dict(val_split=5, n_jobs=-1)
    yield ('CC', ClassifyAndCount(LogisticRegression()))
    yield ('PACC', PACC(LogisticRegression(), **cross_val))
    for name, quantifier_class in (('EMQ', EMQ), ('PCC', PCC)):
        yield (name, quantifier_class(LogisticRegression()))
    yield ('ACC', ACC(LogisticRegression(), **cross_val))
    yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
    """Load texts and continent labels from a tab-separated file.

    Rows labelled ``'Antarctica'`` are always discarded. When ``parse_columns``
    is true, the remaining rows are sorted by ascending ``rank`` before the
    optional truncation to ``max_lines``.

    :param path: path of the tab-separated file; it must contain the columns
        ``text`` and ``continent``, plus ``rank`` when ``parse_columns`` is true
    :param parse_columns: whether to order the rows by the ``rank`` column
    :param verbose: whether to print a progress message while loading
    :param max_lines: maximum number of rows to return (None returns all rows)
    :return: a tuple ``(X, y)`` with the texts and their labels
    """
    if verbose:
        print(f'loading {path}...', end='')
    df = pd.read_csv(path, sep='\t')
    if verbose:
        print('[done]')

    X = df['text'].values
    y = df['continent'].values

    # build the filter once and apply it to every column that is actually used
    keep = y != 'Antarctica'
    X, y = X[keep], y[keep]

    if parse_columns:
        # only the rank decides the order; the 'score' column is never needed
        order = np.argsort(df['rank'].values[keep])
        X, y = X[order], y[order]

    if max_lines is not None:
        X, y = X[:max_lines], y[:max_lines]

    return X, y
class RetrievedSamples(AbstractProtocol):
    """Protocol yielding one (training sample, test sample) pair per ranking file.

    The test files are those matching ``<path_dir>/test_rankings_*.txt``; the
    path of the paired training file is obtained by replacing every occurrence
    of ``'test_'`` with ``'training_'`` in the test file path.

    :param path_dir: directory containing the ranking files
    :param load_fn: callable invoked as ``load_fn(path, parse_columns=True,
        max_lines=...)`` returning ``(X, y)``
    :param vectorizer: object whose ``transform`` method is applied to the documents
    :param max_train_lines: ``max_lines`` value used for training files
    :param max_test_lines: ``max_lines`` value used for test files
    """

    def __init__(self, path_dir: str, load_fn, vectorizer, max_train_lines=None, max_test_lines=None):
        self.path_dir = path_dir
        self.load_fn = load_fn
        self.vectorizer = vectorizer
        self.max_train_lines = max_train_lines
        self.max_test_lines = max_test_lines

    def __call__(self):
        """Yield ``(train_sample, test_sample)`` for every test ranking file.

        ``(None, None)`` is yielded in place of a pair whose test collection
        cannot be built (``ValueError``).
        """
        for file in glob(join(self.path_dir, 'test_rankings_*.txt')):
            X, y = self.load_fn(file.replace('test_', 'training_'), parse_columns=True, max_lines=self.max_train_lines)
            X = self.vectorizer.transform(X)
            train_sample = LabelledCollection(X, y)

            X, y = self.load_fn(file, parse_columns=True, max_lines=self.max_test_lines)
            # a size mismatch with the configured SAMPLE_SIZE is only reported, not enforced
            if len(X) != qp.environ['SAMPLE_SIZE']:
                print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
            X = self.vectorizer.transform(X)
            try:
                # the test collection is built over the classes found in the training sample
                test_sample = LabelledCollection(X, y, classes=train_sample.classes_)
            except ValueError as e:
                print(f'file {file} caused error {e}')
                yield None, None
                # skip this file: without this, the code below would yield an
                # unbound (or stale, from the previous file) test_sample
                continue

            yield train_sample, test_sample
# ---- experiment configuration ----
RANK_AT_K = 100    # used as quapy's SAMPLE_SIZE and as max_test_lines below
REDUCE_TR = 50000  # when > 0, the big training set is resampled to this many documents
qp.environ['SAMPLE_SIZE'] = RANK_AT_K
data_path = './newCollectionGeo'
train_path = join(data_path, 'train_data_continent.txt')
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
if REDUCE_TR>0:
    print('Reducing the number of documents in the training to', REDUCE_TR)
    training = training.sampling(REDUCE_TR)
# in the visible code, the big training set only fits the vectorizer and feeds the two
# prints below; the quantifiers are fitted on the per-query training samples
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('L orig shape = ', Xtr.shape)
training = LabelledCollection(Xtr, ytr)
classes = training.classes_
print('training classes:', classes)
print('training prevalence:', training.prevalence())
experiment_prot = RetrievedSamples(data_path,
                                   load_fn=load_txt_sample,
                                   vectorizer=tfidf,
                                   max_train_lines=None,
                                   max_test_lines=RANK_AT_K)
for method_name, quantifier in methods():
    print('Starting with method=', method_name)
    errors = []
    # total=49 is a hard-coded number of pairs, passed only to tqdm
    pbar = tqdm(experiment_prot(), total=49)
    for train, test in pbar:
        # the protocol yields (None, None) for files it could not load
        if train is not None:
            try:
                # print('Training prevalence:', F.strprev(training.prevalence()), 'shape', train.X.shape)
                # print('Test prevalence:', F.strprev(test.prevalence()), 'shape', test.X.shape)
                # print(train.prevalence())
                # print(test.prevalence())
                quantifier.fit(train)
                estim_prev = quantifier.quantify(test.instances)
                mae = qp.error.mae(test.prevalence(), estim_prev)
                errors.append(mae)
            except Exception as e:
                # best effort: a failing pair is reported and left out of the average
                print(f'wow, something happened here! skipping; {e}')
        else:
            print('skipping one!')
        # running mean; the error list is still empty if every pair so far was skipped
        pbar.set_description(f'mae={np.mean(errors):.4f}')
    print()

View File

@ -0,0 +1,66 @@
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.svm import LinearSVC
from quapy.data.base import LabelledCollection
from sklearn.model_selection import cross_val_score, GridSearchCV
from os.path import join
"""
In this experiment, I simply try to understand whether the learning task can be learned or not.
The problem is that we are quantifying the categories based on the alphabetical order (of what?).
"""
def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
    """Load texts and continent labels from a tab-separated file.

    When ``parse_columns`` is true, the rows are sorted by ascending ``rank``
    before the optional truncation to ``max_lines``.

    :param path: path of the tab-separated file; it must contain the columns
        ``text`` and ``continent``, plus ``rank`` when ``parse_columns`` is true
    :param parse_columns: whether to order the rows by the ``rank`` column
    :param verbose: whether to print a progress message while loading
    :param max_lines: maximum number of rows to return (None returns all rows)
    :return: a tuple ``(X, y)`` with the texts and their labels
    """
    if verbose:
        print(f'loading {path}...', end='')
    df = pd.read_csv(path, sep='\t')
    if verbose:
        print('[done]')

    X = df['text'].values
    y = df['continent'].values

    if parse_columns:
        # only the rank decides the order; the 'score' column is never needed
        order = np.argsort(df['rank'].values)
        X, y = X[order], y[order]

    if max_lines is not None:
        X, y = X[:max_lines], y[:max_lines]

    return X, y
# ---- sanity check: can a plain classifier learn the continent labels? ----
data_path = './50_50_split_trec'
train_path = join(data_path, 'train_50_50_continent.txt')
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
data = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
# work on a sample of 20000 documents
data = data.sampling(20000)
train, test = data.split_stratified()
# the vectorizer is fitted on the training part only and then applied to the test part
train.instances = tfidf.fit_transform(train.instances)
test.instances = tfidf.transform(test.instances)
# svm = LinearSVC()
# cls = GridSearchCV(svm, param_grid={'C':np.logspace(-3,3,7), 'class_weight':['balanced', None]})
cls = LogisticRegression()
cls.fit(*train.Xy)
# score = cross_val_score(LogisticRegressionCV(), *data.Xy, scoring=make_scorer(f1_score, average='macro'), n_jobs=-1, cv=5)
# print(score)
# print(np.mean(score))
# macro- and micro-averaged F1 on the held-out part
y_pred = cls.predict(test.instances)
macrof1 = f1_score(y_true=test.labels, y_pred=y_pred, average='macro')
microf1 = f1_score(y_true=test.labels, y_pred=y_pred, average='micro')
print('macro', macrof1)
print('micro', microf1)