
Compare commits


11 Commits

15 changed files with 1443 additions and 12 deletions

8
.gitignore vendored

@@ -143,8 +143,7 @@ LeQua2022
MultiLabel
NewMethods
Ordinal
Retrieval
eDiscovery
Archived/eDiscovery
poster-cikm
slides-cikm
slides-short-cikm
@@ -153,9 +152,4 @@ svm_perf_quantification/svm_struct
svm_perf_quantification/svm_light
TweetSentQuant
*.png


@@ -1,3 +1,9 @@
Change Log 0.1.9
----------------
<...>
Change Log 0.1.8
----------------

121
Retrieval/commons.py Normal file

@@ -0,0 +1,121 @@
import pandas as pd
import numpy as np
from glob import glob
from os.path import join
from quapy.data import LabelledCollection
from quapy.protocol import AbstractProtocol
import json
def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
# print('reading', path)
if verbose:
print(f'loading {path}...', end='')
df = pd.read_csv(path, sep='\t')
if verbose:
print('[done]')
X = df['text'].values
y = df['continent'].values
if parse_columns:
rank = df['rank'].values
scores = df['score'].values
rank = rank[y != 'Antarctica']
scores = scores[y != 'Antarctica']
X = X[y!='Antarctica']
y = y[y!='Antarctica']
if parse_columns:
order = np.argsort(rank)
X = X[order]
y = y[order]
rank = rank[order]
scores = scores[order]
if max_lines is not None:
X = X[:max_lines]
y = y[:max_lines]
return X, y
def load_json_sample(path, class_name, max_lines=-1):
obj = json.load(open(path, 'rt'))
keys = [f'{id}' for id in range(len(obj['text'].keys()))]
text = [obj['text'][id] for id in keys]
classes = [obj[class_name][id] for id in keys]
if max_lines is not None and max_lines>0:
text = text[:max_lines]
classes = classes[:max_lines]
return text, classes
class TextRankings:
def __init__(self, path, class_name):
self.obj = json.load(open(path, 'rt'))
self.class_name = class_name
def get_sample_Xy(self, sample_id, max_lines=-1):
sample_id = str(sample_id)
O = self.obj
docs_ids = [doc_id for doc_id, query_id in O['qid'].items() if query_id == sample_id]
texts = [O['text'][doc_id] for doc_id in docs_ids]
labels = [O[self.class_name][doc_id] for doc_id in docs_ids]
if max_lines > 0 and len(texts) > max_lines:
ranks = [int(O['rank'][doc_id]) for doc_id in docs_ids]
sel = np.argsort(ranks)[:max_lines]
texts = np.asarray(texts)[sel]
labels = np.asarray(labels)[sel]
return texts, labels
def get_query_id_from_path(path, prefix='training', posfix='200SPLIT'):
qid = path
qid = qid[:qid.index(posfix)]
qid = qid[qid.index(prefix)+len(prefix):]
return qid
class RetrievedSamples(AbstractProtocol):
def __init__(self, path_dir: str, load_fn, vectorizer, max_train_lines=None, max_test_lines=None, classes=None, class_name=None):
self.path_dir = path_dir
self.load_fn = load_fn
self.vectorizer = vectorizer
self.max_train_lines = max_train_lines
self.max_test_lines = max_test_lines
self.classes=classes
assert class_name is not None, 'class name should be specified'
self.class_name = class_name
self.text_samples = TextRankings(join(self.path_dir, 'testRankingsRetrieval.json'), class_name=class_name)
def __call__(self):
for file in glob(join(self.path_dir, 'training*SPLIT.json')):
X, y = self.load_fn(file, class_name=self.class_name, max_lines=self.max_train_lines)
X = self.vectorizer.transform(X)
train_sample = LabelledCollection(X, y, classes=self.classes)
query_id = get_query_id_from_path(file)
X, y = self.text_samples.get_sample_Xy(query_id, max_lines=self.max_test_lines)
# if len(X)!=qp.environ['SAMPLE_SIZE']:
# print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
# assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
X = self.vectorizer.transform(X)
            try:
                test_sample = LabelledCollection(X, y, classes=train_sample.classes_)
            except ValueError as e:
                print(f'file {file} caused error {e}')
                yield None, None
                continue

            # print('train #classes:', train_sample.n_classes, train_sample.prevalence())
            # print('test #classes:', test_sample.n_classes, test_sample.prevalence())
            yield train_sample, test_sample
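
A minimal sketch of how this protocol is meant to be consumed; the directory layout, file names and the 'continent' class are assumptions borrowed from fifth.py below, not a documented API:

from os.path import join
from sklearn.feature_extraction.text import TfidfVectorizer
from Retrieval.commons import RetrievedSamples, load_json_sample

# hypothetical data directory, mirroring the layout fifth.py expects
data_dir = './continent'
Xtr, ytr = load_json_sample(join(data_dir, 'train500PerGroup.json'), class_name='continent')

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
tfidf.fit(Xtr)

# one (training sample, test sample) pair per query; test rankings come from testRankingsRetrieval.json
prot = RetrievedSamples(data_dir, load_fn=load_json_sample, vectorizer=tfidf,
                        max_train_lines=None, max_test_lines=1000, class_name='continent')
for train, test in prot():
    if train is None:   # a pair whose test sample could not be built was skipped
        continue
    print(train.prevalence(), test.prevalence())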

182
Retrieval/fifth.py Normal file

@@ -0,0 +1,182 @@
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import quapy as qp
import quapy.functional as F
from Retrieval.commons import RetrievedSamples, load_txt_sample, load_json_sample
from Retrieval.tabular import Table
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
from tqdm import tqdm
"""
In this fifth experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set as
in the fourth experiment, and the fairness groups are defined upon geographic info as in the fourth case.
As in the fourth, the data Li and Ui have been drawn by retrieving query-related documents from
a pool of the same size. Unlike the fourth experiment, here the training queries are
For now, 1000 in training and 100 in test.
It seems that there is now very little shift.
"""
def cls(classifier_trained=None):
if classifier_trained is None:
# return LinearSVC()
return LogisticRegression()
else:
return classifier_trained
def methods(classifier_trained=None):
yield ('CC', ClassifyAndCount(cls(classifier_trained)))
yield ('PCC', PCC(cls(classifier_trained)))
yield ('ACC', ACC(cls(classifier_trained), val_split=5, n_jobs=-1))
yield ('PACC', PACC(cls(classifier_trained), val_split=5, n_jobs=-1))
yield ('EMQ', EMQ(cls(classifier_trained), exact_train_prev=True))
yield ('EMQh', EMQ(cls(classifier_trained), exact_train_prev=False))
# yield ('EMQ-BCTS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='bcts'))
# yield ('EMQ-TS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='ts'))
# yield ('EMQ-NBVS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='nbvs'))
# yield ('EMQ-VS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='vs'))
# yield ('KDE001', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.001))
# yield ('KDE005', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.005)) # <-- wow!
# yield ('KDE01', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.01))
# yield ('KDE02', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.02))
# yield ('KDE03', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.03))
# yield ('KDE05', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.05))
yield ('KDE07', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.07))
# yield ('KDE10', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.10))
yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
def train_classifier():
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
training = LabelledCollection.load(train_path, loader_func=load_json_sample, class_name=CLASS_NAME)
if REDUCE_TR > 0 and len(training) > REDUCE_TR:
print('Reducing the number of documents in the training to', REDUCE_TR)
training = training.sampling(REDUCE_TR, *training.prevalence())
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('L orig shape = ', Xtr.shape)
training = LabelledCollection(Xtr, ytr)
print('training classifier')
classifier_trained = LogisticRegression()
classifier_trained = GridSearchCV(classifier_trained,
param_grid={'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]},
n_jobs=-1, cv=5)
classifier_trained.fit(Xtr, ytr)
classifier_trained = classifier_trained.best_estimator_
trained = True
print('[Done!]')
classes = training.classes_
print('training classes:', classes)
print('training prevalence:', training.prevalence())
return tfidf, classifier_trained
def reduceAtK(data: LabelledCollection, k):
X, y = data.Xy
X = X[:k]
y = y[:k]
return LabelledCollection(X, y, classes=data.classes_)
RANK_AT_K = -1
REDUCE_TR = 50000
qp.environ['SAMPLE_SIZE'] = RANK_AT_K
def scape_latex(string):
return string.replace('_', '\_')
Ks = [10, 50, 100, 250, 500, 1000, 2000]
# Ks = [500]
for CLASS_NAME in ['continent'] : #'years_category']: #['continent', 'first_letter_category']: #, 'gender', 'gender_category', 'occupations', 'source_countries', 'source_subcont_regions', 'years_category', 'relative_pageviews_category']:
data_path = './' + CLASS_NAME
if CLASS_NAME in ['years_category', 'continent']:
train_path = join(data_path, 'train500PerGroup.json')
else:
train_path = join(data_path, 'train3000samples.json')
tfidf, classifier_trained = qp.util.pickled_resource(f'classifier_{CLASS_NAME}.pkl', train_classifier)
trained=True
experiment_prot = RetrievedSamples(data_path,
load_fn=load_json_sample,
vectorizer=tfidf,
max_train_lines=None,
max_test_lines=RANK_AT_K, classes=classifier_trained.classes_, class_name=CLASS_NAME)
method_names = [name for name, *other in methods()]
benchmarks = [f'{scape_latex(CLASS_NAME)}@{k}' for k in Ks]
table_mae = Table(benchmarks, method_names, color_mode='global')
table_mrae = Table(benchmarks, method_names, color_mode='global')
for method_name, quantifier in methods(classifier_trained):
# print('Starting with method=', method_name)
mae_errors = {k:[] for k in Ks}
mrae_errors = {k:[] for k in Ks}
pbar = tqdm(experiment_prot(), total=49)
for train, test in pbar:
if train is not None:
try:
if trained and method_name!='MLPE':
quantifier.fit(train, val_split=train, fit_classifier=False)
else:
quantifier.fit(train)
for k in Ks:
test_k = reduceAtK(test, k)
estim_prev = quantifier.quantify(test_k.instances)
mae_errors[k].append(qp.error.mae(test_k.prevalence(), estim_prev))
mrae_errors[k].append(qp.error.mrae(test_k.prevalence(), estim_prev, eps=(1./(2*k))))
except Exception as e:
print(f'wow, something happened here! skipping; {e}')
else:
print('skipping one!')
# pbar.set_description(f'{method_name}\tmae={np.mean(mae_errors):.4f}\tmrae={np.mean(mrae_errors):.4f}')
pbar.set_description(f'{method_name}')
for k in Ks:
table_mae.add(benchmark=f'{scape_latex(CLASS_NAME)}@{k}', method=method_name, values=mae_errors[k])
table_mrae.add(benchmark=f'{scape_latex(CLASS_NAME)}@{k}', method=method_name, values=mrae_errors[k])
table_mae.latexPDF('./latex', f'table_{CLASS_NAME}_mae.tex')
table_mrae.latexPDF('./latex', f'table_{CLASS_NAME}_mrae.tex')
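
Because each retrieved sample comes back in ranking order, the @k evaluation above simply truncates the collection. A self-contained sketch of what reduceAtK computes, with made-up labels:

import numpy as np
from quapy.data.base import LabelledCollection

def reduce_at_k(data, k):   # mirrors reduceAtK above
    X, y = data.Xy
    return LabelledCollection(X[:k], y[:k], classes=data.classes_)

y = np.array(['A', 'A', 'B', 'A', 'B', 'B'])   # labels already in ranking order
X = np.arange(len(y)).reshape(-1, 1)           # dummy covariates
ranked = LabelledCollection(X, y)

print(ranked.prevalence())                  # prevalence over the full ranking: [0.5 0.5]
print(reduce_at_k(ranked, 3).prevalence())  # prevalence among the top-3 documents only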

161
Retrieval/fourth.py Normal file

@@ -0,0 +1,161 @@
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import quapy as qp
import quapy.functional as F
from Retrieval.commons import RetrievedSamples, load_txt_sample
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
from tqdm import tqdm
"""
In this fourth experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set as
in the third experiment, and the fairness groups are defined upon geographic info as in the third case.
The difference here is that the data Li and Ui have been drawn by retrieving query-related documents from
a pool of the same size.
For now, 1000 in training and 100 in test.
It seems that there is now very little shift.
"""
def cls(classifier_trained=None):
if classifier_trained is None:
# return LinearSVC()
return LogisticRegression()
else:
return classifier_trained
def methods(classifier_trained=None):
yield ('CC', ClassifyAndCount(cls(classifier_trained)))
yield ('PACC', PACC(cls(classifier_trained), val_split=5, n_jobs=-1))
yield ('EMQ', EMQ(cls(classifier_trained), exact_train_prev=True))
yield ('EMQh', EMQ(cls(classifier_trained), exact_train_prev=False))
yield ('EMQ-BCTS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='bcts'))
yield ('EMQ-TS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='ts'))
yield ('EMQ-NBVS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='nbvs'))
# yield ('EMQ-VS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='vs'))
yield ('PCC', PCC(cls(classifier_trained)))
yield ('ACC', ACC(cls(classifier_trained), val_split=5, n_jobs=-1))
yield ('KDE001', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.001))
yield ('KDE005', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.005)) # <-- wow!
yield ('KDE01', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.01))
yield ('KDE02', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.02))
yield ('KDE03', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.03))
yield ('KDE05', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.05))
yield ('KDE07', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.07))
yield ('KDE10', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.10))
yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
def train_classifier():
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
if REDUCE_TR > 0:
print('Reducing the number of documents in the training to', REDUCE_TR)
training = training.sampling(REDUCE_TR, *training.prevalence())
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('L orig shape = ', Xtr.shape)
training = LabelledCollection(Xtr, ytr)
print('training classifier')
classifier_trained = LogisticRegression()
classifier_trained = GridSearchCV(classifier_trained,
param_grid={'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]},
n_jobs=-1, cv=5)
classifier_trained.fit(Xtr, ytr)
classifier_trained = classifier_trained.best_estimator_
trained = True
print('[Done!]')
classes = training.classes_
print('training classes:', classes)
print('training prevalence:', training.prevalence())
return tfidf, classifier_trained
RANK_AT_K = 1000
REDUCE_TR = 50000
qp.environ['SAMPLE_SIZE'] = RANK_AT_K
data_path = './50_50_split_trec'
train_path = join(data_path, 'train_50_50_continent.txt')
tfidf, classifier_trained = qp.util.pickled_resource('classifier.pkl', train_classifier)
trained=True
experiment_prot = RetrievedSamples(data_path,
load_fn=load_txt_sample,
vectorizer=tfidf,
max_train_lines=None,
max_test_lines=RANK_AT_K, classes=classifier_trained.classes_)
result_mae_dict = {}
result_mrae_dict = {}
for method_name, quantifier in methods(classifier_trained):
# print('Starting with method=', method_name)
mae_errors = []
mrae_errors = []
pbar = tqdm(experiment_prot(), total=49)
for train, test in pbar:
if train is not None:
try:
# print(train.prevalence())
# print(test.prevalence())
if trained and method_name!='MLPE':
quantifier.fit(train, val_split=train, fit_classifier=False)
else:
quantifier.fit(train)
estim_prev = quantifier.quantify(test.instances)
mae = qp.error.mae(test.prevalence(), estim_prev)
mae_errors.append(mae)
mrae = qp.error.mrae(test.prevalence(), estim_prev)
mrae_errors.append(mrae)
# print()
# print('Training prevalence:', F.strprev(train.prevalence()), 'shape', train.X.shape)
# print('Test prevalence:', F.strprev(test.prevalence()), 'shape', test.X.shape)
# print('Estim prevalence:', F.strprev(estim_prev))
except Exception as e:
print(f'wow, something happened here! skipping; {e}')
else:
print('skipping one!')
pbar.set_description(f'{method_name}\tmae={np.mean(mae_errors):.4f}\tmrae={np.mean(mrae_errors):.4f}')
print()
result_mae_dict[method_name] = np.mean(mae_errors)
result_mrae_dict[method_name] = np.mean(mrae_errors)
print('Results\n'+('-'*100))
for method_name in result_mae_dict.keys():
MAE = result_mae_dict[method_name]
MRAE = result_mrae_dict[method_name]
print(f'{method_name}\t{MAE=:.5f}\t{MRAE=:.5f}')
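
The loop above reuses a single classifier, trained once on the pool, for every (Li, Ui) pair by passing fit_classifier=False and learning only the aggregation on Li. A minimal sketch of that pattern (the data and classifier below are placeholders, not the experiment's actual collections):

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.data.base import LabelledCollection
from quapy.method.aggregative import PACC

# placeholder pool: 2 classes, random features
X = np.random.rand(1000, 5)
y = np.random.randint(0, 2, size=1000)
pool = LabelledCollection(X, y)

classifier = LogisticRegression().fit(*pool.Xy)   # trained once, on the whole pool

quantifier = PACC(classifier)
Li = pool.sampling(200, 0.3)                      # stands in for one retrieved training sample
# fit_classifier=False: only PACC's correction is estimated, on Li, with the frozen classifier
quantifier.fit(Li, val_split=Li, fit_classifier=False)
print(quantifier.quantify(np.random.rand(100, 5)))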


@@ -0,0 +1,98 @@
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
import quapy.functional as F
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
"""
This was the very first experiment: one big training set and many test rankings produced according to some queries.
The quantification methods did not seem to work; the more sophisticated the method, the worse it performed.
This is a clear indication that the PPS assumptions do not hold.
Actually, while the training set could be some i.i.d. sample from a distribution L and every test set
is an i.i.d. sample from a distribution U, it is pretty clear that P(X|Y) is different, since the test sets
are biased towards a query term whereas the training set is not.
"""
def methods():
yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
yield ('CC', ClassifyAndCount(LogisticRegression(n_jobs=-1)))
yield ('ACC', ACC(LogisticRegression(n_jobs=-1)))
yield ('PCC', PCC(LogisticRegression(n_jobs=-1)))
yield ('PACC', PACC(LogisticRegression(n_jobs=-1)))
yield ('EMQ', EMQ(LogisticRegression(n_jobs=-1)))
def load_txt_sample(path, verbose=False):
if verbose:
print(f'loading {path}...', end='')
df = pd.read_csv(path, sep='\t')
if verbose:
print('[done]')
X = df['text']
y = df['first_letter_category']
return X, y
class RetrievedSamples(AbstractProtocol):
def __init__(self, path_dir: str, load_fn, vectorizer, classes):
self.path_dir = path_dir
self.load_fn = load_fn
self.vectorizer = vectorizer
self.classes = classes
def __call__(self):
for file in glob(join(self.path_dir, 'test_data_*.txt')):
X, y = self.load_fn(file)
if len(X)!=qp.environ['SAMPLE_SIZE']:
print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
# assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
X = self.vectorizer.transform(X)
sample = LabelledCollection(X, y, classes=self.classes)
yield sample.Xp
qp.environ['SAMPLE_SIZE']=100
data_path = './data'
train_path = join(data_path, 'train_data.txt')
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5)
training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True)
# training = training.sampling(1000)
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('Xtr shape = ', Xtr.shape)
training = LabelledCollection(Xtr, ytr)
classes = training.classes_
test_prot = RetrievedSamples(data_path, load_fn=load_txt_sample, vectorizer=tfidf, classes=classes)
print('Training prevalence:', F.strprev(training.prevalence()))
for X, p in test_prot():
print('Test prevalence:', F.strprev(p))
for method_name, quantifier in methods():
print('training ', method_name)
quantifier.fit(training)
print('[done]')
report = qp.evaluation.evaluation_report(quantifier, test_prot, error_metrics=['mae', 'mrae'], verbose=True)
print(report.mean())


@@ -0,0 +1,131 @@
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
import quapy.functional as F
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
from tqdm import tqdm
"""
In this second experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set.
Both elements in the pair are *retrieved according to the same query*. This is a way to impose on the
training set the same type of bias that was present in the test set. Let's see...
"""
def methods():
yield ('PACC', PACC(LogisticRegression(), val_split=5, n_jobs=-1))
yield ('CC', ClassifyAndCount(LogisticRegression()))
yield ('EMQ', EMQ(LogisticRegression()))
yield ('PCC', PCC(LogisticRegression()))
yield ('ACC', ACC(LogisticRegression(), val_split=5, n_jobs=-1))
yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
if verbose:
print(f'loading {path}...', end='')
df = pd.read_csv(path, sep='\t')
if verbose:
print('[done]')
X = df['text'].values
y = df['first_letter_category'].values
if parse_columns:
rank = df['rank'].values
scores = df['score'].values
order = np.argsort(rank)
X = X[order]
y = y[order]
rank = rank[order]
scores = scores[order]
if max_lines is not None:
X = X[:max_lines]
y = y[:max_lines]
return X, y
class RetrievedSamples(AbstractProtocol):
def __init__(self, path_dir: str, load_fn, vectorizer, classes, max_train_lines=None, max_test_lines=None):
self.path_dir = path_dir
self.load_fn = load_fn
self.vectorizer = vectorizer
self.classes = classes
self.max_train_lines = max_train_lines
self.max_test_lines = max_test_lines
def __call__(self):
for file in glob(join(self.path_dir, 'test_rankings_*.txt')):
X, y = self.load_fn(file.replace('test_', 'training_'), parse_columns=True, max_lines=self.max_train_lines)
X = self.vectorizer.transform(X)
train_sample = LabelledCollection(X, y, classes=self.classes)
X, y = self.load_fn(file, parse_columns=True, max_lines=self.max_test_lines)
if len(X)!=qp.environ['SAMPLE_SIZE']:
print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
# assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
X = self.vectorizer.transform(X)
test_sample = LabelledCollection(X, y, classes=self.classes)
yield train_sample, test_sample
RANK_AT_K = 500
REDUCE_TR = 50000
qp.environ['SAMPLE_SIZE'] = RANK_AT_K
data_path = './newCollection'
train_path = join(data_path, 'train_data.txt')
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
if REDUCE_TR>0:
print('Reducing the number of documents in the training to', REDUCE_TR)
training = training.sampling(REDUCE_TR)
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('L orig shape = ', Xtr.shape)
training = LabelledCollection(Xtr, ytr)
classes = training.classes_
experiment_prot = RetrievedSamples(data_path,
load_fn=load_txt_sample,
vectorizer=tfidf,
classes=classes,
max_train_lines=RANK_AT_K,
max_test_lines=RANK_AT_K)
for method_name, quantifier in methods():
print('Starting with method=', method_name)
errors = []
pbar = tqdm(experiment_prot(), total=49)
for train, test in pbar:
# print('Training prevalence:', F.strprev(training.prevalence()), 'shape', train.X.shape)
# print('Test prevalence:', F.strprev(test.prevalence()), 'shape', test.X.shape)
quantifier.fit(train)
estim_prev = quantifier.quantify(test.instances)
mae = qp.error.mae(test.prevalence(), estim_prev)
errors.append(mae)
pbar.set_description(f'mae={np.mean(errors):.4f}')
print()

155
Retrieval/previous/third.py Normal file

@@ -0,0 +1,155 @@
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
import quapy.functional as F
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
from tqdm import tqdm
"""
In this third experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set as
in the second experiment, but in this case the fairness groups are defined upon geographic info.
"""
def methods():
yield ('CC', ClassifyAndCount(LogisticRegression()))
yield ('PACC', PACC(LogisticRegression(), val_split=5, n_jobs=-1))
yield ('EMQ', EMQ(LogisticRegression()))
yield ('PCC', PCC(LogisticRegression()))
yield ('ACC', ACC(LogisticRegression(), val_split=5, n_jobs=-1))
yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
# print('reading', path)
if verbose:
print(f'loading {path}...', end='')
df = pd.read_csv(path, sep='\t')
if verbose:
print('[done]')
X = df['text'].values
y = df['continent'].values
if parse_columns:
rank = df['rank'].values
scores = df['score'].values
rank = rank[y != 'Antarctica']
scores = scores[y != 'Antarctica']
X = X[y!='Antarctica']
y = y[y!='Antarctica']
if parse_columns:
order = np.argsort(rank)
X = X[order]
y = y[order]
rank = rank[order]
scores = scores[order]
if max_lines is not None:
X = X[:max_lines]
y = y[:max_lines]
return X, y
class RetrievedSamples(AbstractProtocol):
def __init__(self, path_dir: str, load_fn, vectorizer, max_train_lines=None, max_test_lines=None):
self.path_dir = path_dir
self.load_fn = load_fn
self.vectorizer = vectorizer
self.max_train_lines = max_train_lines
self.max_test_lines = max_test_lines
def __call__(self):
for file in glob(join(self.path_dir, 'test_rankings_*.txt')):
X, y = self.load_fn(file.replace('test_', 'training_'), parse_columns=True, max_lines=self.max_train_lines)
X = self.vectorizer.transform(X)
train_sample = LabelledCollection(X, y)
X, y = self.load_fn(file, parse_columns=True, max_lines=self.max_test_lines)
if len(X)!=qp.environ['SAMPLE_SIZE']:
print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
# assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
X = self.vectorizer.transform(X)
            try:
                test_sample = LabelledCollection(X, y, classes=train_sample.classes_)
            except ValueError as e:
                print(f'file {file} caused error {e}')
                yield None, None
                continue

            # print('train #classes:', train_sample.n_classes, train_sample.prevalence())
            # print('test #classes:', test_sample.n_classes, test_sample.prevalence())
            yield train_sample, test_sample
RANK_AT_K = 100
REDUCE_TR = 50000
qp.environ['SAMPLE_SIZE'] = RANK_AT_K
data_path = './newCollectionGeo'
train_path = join(data_path, 'train_data_continent.txt')
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
if REDUCE_TR>0:
print('Reducing the number of documents in the training to', REDUCE_TR)
training = training.sampling(REDUCE_TR)
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('L orig shape = ', Xtr.shape)
training = LabelledCollection(Xtr, ytr)
classes = training.classes_
print('training classes:', classes)
print('training prevalence:', training.prevalence())
experiment_prot = RetrievedSamples(data_path,
load_fn=load_txt_sample,
vectorizer=tfidf,
max_train_lines=None,
max_test_lines=RANK_AT_K)
for method_name, quantifier in methods():
print('Starting with method=', method_name)
errors = []
pbar = tqdm(experiment_prot(), total=49)
for train, test in pbar:
if train is not None:
try:
# print('Training prevalence:', F.strprev(training.prevalence()), 'shape', train.X.shape)
# print('Test prevalence:', F.strprev(test.prevalence()), 'shape', test.X.shape)
# print(train.prevalence())
# print(test.prevalence())
quantifier.fit(train)
estim_prev = quantifier.quantify(test.instances)
mae = qp.error.mae(test.prevalence(), estim_prev)
errors.append(mae)
except Exception as e:
print(f'wow, something happened here! skipping; {e}')
else:
print('skipping one!')
pbar.set_description(f'mae={np.mean(errors):.4f}')
print()

427
Retrieval/tabular.py Normal file

@@ -0,0 +1,427 @@
import os.path
import numpy as np
import itertools
from scipy.stats import ttest_ind_from_stats, wilcoxon
from pathlib import Path
from os.path import join
class Table:
VALID_TESTS = [None, "wilcoxon", "ttest"]
def __init__(self, benchmarks, methods, lower_is_better=True, ttest='ttest', prec_mean=3,
clean_zero=False, show_std=False, prec_std=3, average=True, missing=None, missing_str='--',
color=True, color_mode='local', maxtone=50):
assert ttest in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'
self.benchmarks = np.asarray(benchmarks)
self.benchmark_index = {row:i for i, row in enumerate(benchmarks)}
self.methods = np.asarray(methods)
self.method_index = {col:j for j, col in enumerate(methods)}
self.map = {}
# keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
self._addmap('values', dtype=object)
self.lower_is_better = lower_is_better
self.ttest = ttest
self.prec_mean = prec_mean
self.clean_zero = clean_zero
self.show_std = show_std
self.prec_std = prec_std
self.add_average = average
self.missing = missing
self.missing_str = missing_str
self.color = color
self.color_mode = color_mode
self.maxtone = maxtone
self.touch()
@property
def nbenchmarks(self):
return len(self.benchmarks)
@property
def nmethods(self):
return len(self.methods)
def touch(self):
self._modif = True
def update(self):
if self._modif:
self.compute()
def _getfilled(self):
return np.argwhere(self.map['fill'])
@property
def values(self):
return self.map['values']
def _indexes(self):
return itertools.product(range(self.nbenchmarks), range(self.nmethods))
def _addmap(self, map, dtype, func=None):
self.map[map] = np.empty((self.nbenchmarks, self.nmethods), dtype=dtype)
if func is None:
return
m = self.map[map]
f = func
indexes = self._indexes() if map == 'fill' else self._getfilled()
for i, j in indexes:
m[i, j] = f(self.values[i, j])
def _addrank(self):
for i in range(self.nbenchmarks):
filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
col_means = [self.map['mean'][i,j] for j in filled_cols_idx]
ranked_cols_idx = filled_cols_idx[np.argsort(col_means)]
if not self.lower_is_better:
ranked_cols_idx = ranked_cols_idx[::-1]
self.map['rank'][i, ranked_cols_idx] = np.arange(1, len(filled_cols_idx)+1)
def _addcolor(self):
minval = {}
maxval = {}
if self.color_mode == 'global':
filled_cols_idx = np.argwhere(self.map['fill'])
col_means = [self.map['mean'][i, j] for i, j in filled_cols_idx]
if len(filled_cols_idx) > 0:
global_minval = min(col_means)
global_maxval = max(col_means)
for i in range(self.nbenchmarks):
minval[i] = global_minval
maxval[i] = global_maxval
elif self.color_mode == 'local':
for i in range(self.nbenchmarks):
                filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
if len(filled_cols_idx)>0:
col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
minval[i] = min(col_means)
maxval[i] = max(col_means)
else:
print(f'color mode {self.color_mode} not understood, valid ones are "local" and "global"; skip')
return
for i in range(self.nbenchmarks):
filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
for col_idx in filled_cols_idx:
val = self.map['mean'][i,col_idx]
if i not in maxval or i not in minval:
continue
norm = (maxval[i] - minval[i])
if norm > 0:
normval = (val - minval[i]) / norm
else:
normval = 0.5
if self.lower_is_better:
normval = 1 - normval
normval = np.clip(normval, 0,1)
self.map['color'][i, col_idx] = color_red2green_01(normval, self.maxtone)
def _run_ttest(self, row, col1, col2):
mean1 = self.map['mean'][row, col1]
std1 = self.map['std'][row, col1]
nobs1 = self.map['nobs'][row, col1]
mean2 = self.map['mean'][row, col2]
std2 = self.map['std'][row, col2]
nobs2 = self.map['nobs'][row, col2]
_, p_val = ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2)
return p_val
def _run_wilcoxon(self, row, col1, col2):
values1 = self.map['values'][row, col1]
values2 = self.map['values'][row, col2]
try:
_, p_val = wilcoxon(values1, values2)
except ValueError:
p_val = 0
return p_val
def _add_statistical_test(self):
if self.ttest is None:
return
self.some_similar = [False]*self.nmethods
for i in range(self.nbenchmarks):
filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
if len(filled_cols_idx) <= 1:
continue
col_means = [self.map['mean'][i,j] for j in filled_cols_idx]
best_pos = filled_cols_idx[np.argmin(col_means)]
for j in filled_cols_idx:
if j==best_pos:
continue
if self.ttest == 'ttest':
p_val = self._run_ttest(i, best_pos, j)
else:
p_val = self._run_wilcoxon(i, best_pos, j)
pval_outcome = pval_interpretation(p_val)
self.map['ttest'][i, j] = pval_outcome
if pval_outcome != 'Diff':
self.some_similar[j] = True
def compute(self):
self._addmap('fill', dtype=bool, func=lambda x: x is not None)
self._addmap('mean', dtype=float, func=np.mean)
self._addmap('std', dtype=float, func=np.std)
self._addmap('nobs', dtype=float, func=len)
self._addmap('rank', dtype=int, func=None)
self._addmap('color', dtype=object, func=None)
self._addmap('ttest', dtype=object, func=None)
self._addmap('latex', dtype=object, func=None)
self._addrank()
self._addcolor()
self._add_statistical_test()
if self.add_average:
self._addave()
self._modif = False
def _is_column_full(self, col):
return all(self.map['fill'][:, self.method_index[col]])
def _addave(self):
ave = Table(['ave'], self.methods,
lower_is_better=self.lower_is_better,
ttest=self.ttest,
average=False,
missing=self.missing,
missing_str=self.missing_str,
prec_mean=self.prec_mean,
prec_std=self.prec_std,
clean_zero=self.clean_zero,
show_std=self.show_std,
color=self.color,
maxtone=self.maxtone)
for col in self.methods:
values = None
if self._is_column_full(col):
if self.ttest == 'ttest':
# values = np.asarray(self.map['mean'][:, self.method_index[col]])
values = np.concatenate(self.values[:, self.method_index[col]])
else: # wilcoxon
# values = np.asarray(self.map['mean'][:, self.method_index[col]])
values = np.concatenate(self.values[:, self.method_index[col]])
ave.add('ave', col, values)
self.average = ave
def add(self, benchmark, method, values):
if values is not None:
values = np.asarray(values)
if values.ndim==0:
values = values.flatten()
rid, cid = self._coordinates(benchmark, method)
self.map['values'][rid, cid] = values
self.touch()
def get(self, benchmark, method, attr='mean'):
self.update()
        assert attr in self.map, f'unknown attribute {attr}'
rid, cid = self._coordinates(benchmark, method)
if self.map['fill'][rid, cid]:
v = self.map[attr][rid, cid]
if v is None or (isinstance(v,float) and np.isnan(v)):
return self.missing
return v
else:
return self.missing
def _coordinates(self, benchmark, method):
assert benchmark in self.benchmark_index, f'benchmark {benchmark} out of range'
assert method in self.method_index, f'method {method} out of range'
rid = self.benchmark_index[benchmark]
cid = self.method_index[method]
return rid, cid
def get_average(self, method, attr='mean'):
self.update()
if self.add_average:
return self.average.get('ave', method, attr=attr)
return None
def get_color(self, benchmark, method):
color = self.get(benchmark, method, attr='color')
if color is None:
return ''
return color
def latex(self, benchmark, method):
self.update()
i,j = self._coordinates(benchmark, method)
if self.map['fill'][i,j] == False:
return self.missing_str
mean = self.map['mean'][i,j]
l = f" {mean:.{self.prec_mean}f}"
if self.clean_zero:
l = l.replace(' 0.', '.')
isbest = self.map['rank'][i,j] == 1
if isbest:
l = "\\textbf{"+l.strip()+"}"
stat = '' if self.ttest is None else '^{\phantom{\ddag}}'
if self.ttest is not None and self.some_similar[j]:
test_label = self.map['ttest'][i,j]
if test_label == 'Sim':
stat = '^{\dag}'
elif test_label == 'Same':
stat = '^{\ddag}'
elif isbest or test_label == 'Diff':
stat = '^{\phantom{\ddag}}'
std = ''
if self.show_std:
std = self.map['std'][i,j]
std = f" {std:.{self.prec_std}f}"
if self.clean_zero:
std = std.replace(' 0.', '.')
std = f"\pm {std:{self.prec_std}}"
if stat!='' or std!='':
l = f'{l}${stat}{std}$'
if self.color:
l += ' ' + self.map['color'][i,j]
return l
def latexPDF(self, path, name:str, *args, **kwargs):
if not name.endswith('.tex'):
name += '.tex'
self.latexSaveDocument(join(path, name), *args, **kwargs)
print("[Tables Done] runing latex")
os.chdir(path)
os.system('pdflatex '+name)
basename = name.replace('.tex', '')
os.system(f'rm {basename}.aux {basename}.bbl {basename}.blg {basename}.log {basename}.out {basename}.dvi')
os.chdir('..')
def latexSaveDocument(self, path, *args, **kwargs):
document = self.latexDocument(*args, **kwargs)
parent = Path(path).parent
os.makedirs(parent, exist_ok=True)
with open(path, 'wt') as foo:
foo.write(document)
        print('tex file saved at', path)
def latexDocument(self, *args, **kwargs):
document = """
\\documentclass[10pt,a4paper]{article}
\\usepackage[utf8]{inputenc}
\\usepackage{amsmath}
\\usepackage{amsfonts}
\\usepackage{amssymb}
\\usepackage{graphicx}
\\usepackage{xcolor}
\\usepackage{colortbl}
\\begin{document}
"""
document += self.latexTable(*args, **kwargs)
document += "\n\end{document}\n"
return document
def latexTable(self, benchmark_replace={}, method_replace={}, aslines=False, endl='\\\\\hline', resizebox=True):
table = """
\\begin{table}
\center
%%%\\resizebox{\\textwidth}{!}{% \n
"""
table += "\n\\begin{tabular}{|c"+"|c" * self.nmethods + "|}\n"
table += self.latexTabular(benchmark_replace, method_replace, aslines, endl)
table += "\n\\end{tabular}\n"
table += """
%%%}%
\end{table}
"""
if resizebox:
table = table.replace("%%%", "")
return table
def latexTabular(self, benchmark_replace={}, method_replace={}, aslines=False, endl='\\\\\hline'):
lines = []
l = '\multicolumn{1}{c|}{} & '
l += ' & '.join([method_replace.get(col, col) for col in self.methods])
l += ' \\\\\hline'
lines.append(l)
for row in self.benchmarks:
rowname = benchmark_replace.get(row, row)
l = rowname + ' & '
l += self.latexRow(row, endl=endl)
lines.append(l)
if self.add_average:
# l += '\hline\n'
l = '\hline \n \\textit{Average} & '
l += self.latexAverage(endl=endl)
lines.append(l)
if not aslines:
lines='\n'.join(lines)
return lines
def latexRow(self, benchmark, endl='\\\\\hline\n'):
s = [self.latex(benchmark, col) for col in self.methods]
s = ' & '.join(s)
s += ' ' + endl
return s
def latexAverage(self, endl='\\\\\hline\n'):
if self.add_average:
return self.average.latexRow('ave', endl=endl)
def getRankTable(self, prec_mean=0):
t = Table(benchmarks=self.benchmarks, methods=self.methods, prec_mean=prec_mean, average=True, maxtone=self.maxtone, ttest=None)
for rid, cid in self._getfilled():
row = self.benchmarks[rid]
col = self.methods[cid]
t.add(row, col, self.get(row, col, 'rank'))
t.compute()
return t
def dropMethods(self, methods):
drop_index = [self.method_index[m] for m in methods]
new_methods = np.delete(self.methods, drop_index)
new_index = {col:j for j, col in enumerate(new_methods)}
self.map['values'] = self.values[:,np.asarray([self.method_index[m] for m in new_methods], dtype=int)]
self.methods = new_methods
self.method_index = new_index
self.touch()
def pval_interpretation(p_val):
if 0.005 >= p_val:
return 'Diff'
elif 0.05 >= p_val > 0.005:
return 'Sim'
elif p_val > 0.05:
return 'Same'
def color_red2green_01(val, maxtone=50):
if np.isnan(val): return None
assert 0 <= val <= 1, f'val {val} out of range [0,1]'
# rescale to [-1,1]
val = val * 2 - 1
if val < 0:
color = 'red'
tone = maxtone * (-val)
else:
color = 'green'
tone = maxtone * val
return '\cellcolor{' + color + f'!{int(tone)}' + '}'
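
A minimal usage sketch of the Table class above; the benchmark names, method names and error values are made up for illustration:

import numpy as np
from Retrieval.tabular import Table

np.random.seed(0)
benchmarks = ['continent@10', 'continent@100']   # made-up benchmark names
methods = ['CC', 'PACC', 'EMQ']                  # made-up method names

table = Table(benchmarks, methods, color_mode='global')
for b in benchmarks:
    for m in methods:
        table.add(benchmark=b, method=m, values=np.random.rand(50))  # one list of per-sample errors per cell

print(table.get('continent@10', 'PACC'))   # mean of that cell
print(table.latexTabular())                # LaTeX tabular body, ready to embed
# table.latexPDF('./latex', 'example.tex') # would additionally compile the table with pdflatex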


@@ -0,0 +1,66 @@
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.svm import LinearSVC
from quapy.data.base import LabelledCollection
from sklearn.model_selection import cross_val_score, GridSearchCV
from os.path import join
"""
In this experiment, I simply try to understand whether the learning task can be learned or not.
The problem is that we are quantifying the categories based on the alphabetical order (of what?).
"""
def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
if verbose:
print(f'loading {path}...', end='')
df = pd.read_csv(path, sep='\t')
if verbose:
print('[done]')
X = df['text'].values
y = df['continent'].values
if parse_columns:
rank = df['rank'].values
scores = df['score'].values
order = np.argsort(rank)
X = X[order]
y = y[order]
rank = rank[order]
scores = scores[order]
if max_lines is not None:
X = X[:max_lines]
y = y[:max_lines]
return X, y
data_path = './50_50_split_trec'
train_path = join(data_path, 'train_50_50_continent.txt')
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
data = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
data = data.sampling(20000)
train, test = data.split_stratified()
train.instances = tfidf.fit_transform(train.instances)
test.instances = tfidf.transform(test.instances)
# svm = LinearSVC()
# cls = GridSearchCV(svm, param_grid={'C':np.logspace(-3,3,7), 'class_weight':['balanced', None]})
cls = LogisticRegression()
cls.fit(*train.Xy)
# score = cross_val_score(LogisticRegressionCV(), *data.Xy, scoring=make_scorer(f1_score, average='macro'), n_jobs=-1, cv=5)
# print(score)
# print(np.mean(score))
y_pred = cls.predict(test.instances)
macrof1 = f1_score(y_true=test.labels, y_pred=y_pred, average='macro')
microf1 = f1_score(y_true=test.labels, y_pred=y_pred, average='micro')
print('macro', macrof1)
print('micro', microf1)


@@ -11,7 +11,7 @@ from . import util
from . import model_selection
from . import classification
__version__ = '0.1.8'
__version__ = '0.1.9'
environ = {
'SAMPLE_SIZE': None,


@@ -52,7 +52,7 @@ class KDEBase:
"""
return np.exp(kde.score_samples(X))
def get_mixture_components(self, X, y, n_classes, bandwidth):
def get_mixture_components(self, X, y, classes, bandwidth):
"""
Returns an array containing the mixture components, i.e., the KDE functions for each class.
@@ -62,7 +62,7 @@
:param bandwidth: float, the bandwidth of the kernel
:return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates
"""
return [self.get_kde_function(X[y == cat], bandwidth) for cat in range(n_classes)]
return [self.get_kde_function(X[y == cat], bandwidth) for cat in classes]
@@ -114,7 +114,7 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
self.random_state=random_state
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.n_classes, self.bandwidth)
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
return self
def aggregate(self, posteriors: np.ndarray):
@@ -196,7 +196,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
self.montecarlo_trials = montecarlo_trials
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.n_classes, self.bandwidth)
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
N = self.montecarlo_trials
rs = self.random_state
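
The change from n_classes to classes matters whenever the labels are not the integers 0..n-1 (for instance, the string-labelled continents used in the Retrieval experiments). A small sketch of the difference, with made-up labels:

import numpy as np

y = np.array(['Africa', 'Asia', 'Africa', 'Europe'])
X = np.random.rand(len(y), 3)     # stands in for the classifier's posterior probabilities
classes = np.unique(y)            # ['Africa' 'Asia' 'Europe']

# old behaviour: range(n_classes) compares string labels against 0, 1, 2 and selects nothing
old_groups = [X[y == cat] for cat in range(len(classes))]
print([len(g) for g in old_groups])   # -> [0, 0, 0]

# new behaviour: iterate over the actual class labels
new_groups = [X[y == cat] for cat in classes]
print([len(g) for g in new_groups])   # -> [2, 1, 1]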


@@ -640,6 +640,8 @@ class EMQ(AggregativeSoftQuantifier):
raise ValueError('invalid param argument for recalibration method; available ones are '
'"nbvs", "bcts", "ts", and "vs".')
if not np.issubdtype(y.dtype, np.number):
y = np.searchsorted(data.classes_, y)
self.calibration_function = calibrator(P, np.eye(data.n_classes)[y], posterior_supplied=True)
if self.exact_train_prev:
@@ -681,6 +683,11 @@
"""
Px = posterior_probabilities
Ptr = np.copy(tr_prev)
if np.product(Ptr) == 0: # some entry is 0; we should smooth the values to avoid 0 division
Ptr += epsilon
Ptr /= Ptr.sum()
qs = np.copy(Ptr) # qs (the running estimate) is initialized as the training prevalence
s, converged = 0, False
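
The added lines smooth the training prevalence before the EM loop so that a zero entry cannot cause a division by zero. A standalone sketch of the same operation (the epsilon value is illustrative):

import numpy as np

def smooth_prevalence(prev, epsilon=1e-4):
    """Add epsilon to every entry if any of them is exactly zero, then renormalize."""
    prev = np.asarray(prev, dtype=float).copy()
    if np.prod(prev) == 0:
        prev += epsilon
        prev /= prev.sum()
    return prev

print(smooth_prevalence([0.0, 0.3, 0.7]))   # no entry is exactly zero anymore; still sums to 1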


@@ -1,5 +1,6 @@
from typing import Union, Callable
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from quapy.functional import get_divergence
from quapy.data import LabelledCollection
@@ -146,6 +147,53 @@ class DMx(BaseQuantifier):
return F.argmin_prevalence(loss, n_classes, method=self.search)
class ReadMe(BaseQuantifier):
def __init__(self, bootstrap_trials=100, bootstrap_range=100, bagging_trials=100, bagging_range=25, **vectorizer_kwargs):
self.bootstrap_trials = bootstrap_trials
self.bootstrap_range = bootstrap_range
self.bagging_trials = bagging_trials
self.bagging_range = bagging_range
self.vectorizer_kwargs = vectorizer_kwargs
    def fit(self, data: LabelledCollection):
        X, y = data.Xy
        self.vectorizer = CountVectorizer(binary=True, **self.vectorizer_kwargs)
        X = self.vectorizer.fit_transform(X)
        # iterate over the actual class labels, not range(data.classes_)
        self.class_conditional_X = {i: X[y == i] for i in data.classes_}
        return self
def quantify(self, instances):
X = self.vectorizer.transform(instances)
# number of features
num_docs, num_feats = X.shape
# bootstrap
p_boots = []
for _ in range(self.bootstrap_trials):
            docs_idx = np.random.choice(num_docs, size=self.bootstrap_range, replace=False)
class_conditional_X = {i: X[docs_idx] for i, X in self.class_conditional_X.items()}
Xboot = X[docs_idx]
# bagging
p_bags = []
for _ in range(self.bagging_trials):
feat_idx = np.random.choice(num_feats, size=self.bagging_range, replace=False)
class_conditional_Xbag = {i: X[:, feat_idx] for i, X in class_conditional_X.items()}
Xbag = Xboot[:,feat_idx]
p = self.std_constrained_linear_ls(Xbag, class_conditional_Xbag)
p_bags.append(p)
p_boots.append(np.mean(p_bags, axis=0))
p_mean = np.mean(p_boots, axis=0)
        p_std = np.std(p_boots, axis=0)   # std across bootstrap estimates (currently unused)
return p_mean
def std_constrained_linear_ls(self, X, class_cond_X: dict):
pass
def _get_features_range(X):
feat_ranges = []
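
std_constrained_linear_ls is left unimplemented above. For reference, ReadMe (Hopkins & King) estimates the prevalence vector by regressing the test feature profile on the class-conditional feature profiles under simplex constraints; the following is only a sketch of that step under this interpretation, not the author's implementation:

import numpy as np
from scipy.optimize import minimize

def constrained_linear_ls(Xbag, class_cond_Xbag):
    # argmin_p || mean(Xbag) - A p ||^2   s.t.  p >= 0, sum(p) = 1,
    # where column j of A is the mean (binary) feature profile of class j
    classes = sorted(class_cond_Xbag.keys())
    b = np.asarray(Xbag.mean(axis=0)).flatten()
    A = np.stack([np.asarray(class_cond_Xbag[c].mean(axis=0)).flatten() for c in classes], axis=1)
    n = len(classes)
    loss = lambda p: float(np.linalg.norm(A @ p - b) ** 2)
    constraints = [{'type': 'eq', 'fun': lambda p: p.sum() - 1}]
    sol = minimize(loss, x0=np.full(n, 1 / n), bounds=[(0, 1)] * n,
                   constraints=constraints, method='SLSQP')
    return sol.x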


@@ -56,6 +56,7 @@ def parallel(func, args, n_jobs, seed=None, asarray=True, backend='loky'):
:param seed: the numeric seed
:param asarray: set to True to return a np.ndarray instead of a list
:param backend: indicates the backend used for handling parallel works
:param open_args: if True, then the delayed function is called on *args_i, instead of on args_i
"""
def func_dec(environ, seed, *args):
qp.environ = environ.copy()
@@ -74,6 +75,40 @@ def parallel(func, args, n_jobs, seed=None, asarray=True, backend='loky'):
return out
def parallel_unpack(func, args, n_jobs, seed=None, asarray=True, backend='loky'):
"""
A wrapper of multiprocessing:
>>> Parallel(n_jobs=n_jobs)(
>>> delayed(func)(*args_i) for args_i in args
>>> )
that takes the `quapy.environ` variable as input silently.
Seeds the child processes to ensure reproducibility when n_jobs>1.
:param func: callable
:param args: args of func
:param seed: the numeric seed
:param asarray: set to True to return a np.ndarray instead of a list
:param backend: indicates the backend used for handling parallel works
"""
def func_dec(environ, seed, *args):
qp.environ = environ.copy()
qp.environ['N_JOBS'] = 1
        # set a context with a temporary seed to ensure results are reproducible in parallel
with ExitStack() as stack:
if seed is not None:
stack.enter_context(qp.util.temp_seed(seed))
return func(*args)
out = Parallel(n_jobs=n_jobs, backend=backend)(
delayed(func_dec)(qp.environ, None if seed is None else seed + i, *args_i) for i, args_i in enumerate(args)
)
if asarray:
out = np.asarray(out)
return out
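
A minimal usage sketch of the new helper, assuming it is exposed alongside parallel as qp.util.parallel_unpack; unlike parallel, each args_i tuple is unpacked into the worker:

import quapy as qp

def work(a, b):
    return a + b

# each (a, b) tuple is unpacked into work(a, b)
results = qp.util.parallel_unpack(work, args=[(1, 2), (3, 4), (5, 6)], n_jobs=2)
print(results)   # -> [ 3  7 11] (an ndarray, since asarray=True by default)
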
@contextlib.contextmanager
def temp_seed(random_state):
"""