import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import quapy as qp
import quapy.functional as F
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection

from glob import glob
from os.path import join
from tqdm import tqdm

"""
|
|
In this second experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set.
|
|
Both elements in the pair are *retrieved according to the same query*. This is a way to impose
|
|
the same type of bias that was present in the test, to the training set. Let's see...
|
|
"""
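# Quantification methods under comparison. All aggregative quantifiers wrap a plain
# LogisticRegression classifier; MLPE is a trivial baseline that simply returns the
# training prevalence, irrespective of the test sample.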
def methods():
    yield ('PACC', PACC(LogisticRegression(), val_split=5, n_jobs=-1))
    yield ('CC', ClassifyAndCount(LogisticRegression()))
    yield ('EMQ', EMQ(LogisticRegression()))
    yield ('PCC', PCC(LogisticRegression()))
    yield ('ACC', ACC(LogisticRegression(), val_split=5, n_jobs=-1))
    yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())


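# Loads a retrieval sample from a tab-separated file. The file is expected to contain (at least)
# the columns 'text' and 'first_letter_category' and, when parse_columns=True, also numeric
# 'rank' and 'score' columns produced by the retrieval system; rows are then sorted by rank.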
def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
    if verbose:
        print(f'loading {path}...', end='')
    df = pd.read_csv(path, sep='\t')
    if verbose:
        print('[done]')
    X = df['text'].values
    y = df['first_letter_category'].values

    if parse_columns:
        rank = df['rank'].values
        scores = df['score'].values
        order = np.argsort(rank)
        X = X[order]
        y = y[order]
        rank = rank[order]
        scores = scores[order]

    if max_lines is not None:
        X = X[:max_lines]
        y = y[:max_lines]

    return X, y


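# A protocol that yields (training, test) pairs of samples retrieved for the same query:
# each 'test_rankings_*.txt' file is paired with its 'training_rankings_*.txt' counterpart.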
class RetrievedSamples(AbstractProtocol):

    def __init__(self, path_dir: str, load_fn, vectorizer, classes, max_train_lines=None, max_test_lines=None):
        self.path_dir = path_dir
        self.load_fn = load_fn
        self.vectorizer = vectorizer
        self.classes = classes
        self.max_train_lines = max_train_lines
        self.max_test_lines = max_test_lines

    def __call__(self):
        for file in glob(join(self.path_dir, 'test_rankings_*.txt')):

            X, y = self.load_fn(file.replace('test_', 'training_'), parse_columns=True, max_lines=self.max_train_lines)
            X = self.vectorizer.transform(X)
            train_sample = LabelledCollection(X, y, classes=self.classes)

            X, y = self.load_fn(file, parse_columns=True, max_lines=self.max_test_lines)
            if len(X) != qp.environ['SAMPLE_SIZE']:
                print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
            # assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
            X = self.vectorizer.transform(X)
            test_sample = LabelledCollection(X, y, classes=self.classes)

            yield train_sample, test_sample


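# Experimental setup: each sample is made of the (up to) 500 top-ranked documents; quapy's
# SAMPLE_SIZE is set accordingly, so the protocol can sanity-check the size of each test sample.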
RANK_AT_K = 500
REDUCE_TR = 50000
qp.environ['SAMPLE_SIZE'] = RANK_AT_K

data_path = './newCollection'
train_path = join(data_path, 'train_data.txt')

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)

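# The vectorizer is fitted once on the (possibly reduced) global training set; RetrievedSamples
# then only applies transform(), so every retrieved sample lives in the same feature space.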
training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
if REDUCE_TR > 0:
    print('Reducing the number of documents in the training to', REDUCE_TR)
    training = training.sampling(REDUCE_TR)

Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('L orig shape = ', Xtr.shape)

training = LabelledCollection(Xtr, ytr)
classes = training.classes_

experiment_prot = RetrievedSamples(data_path,
                                   load_fn=load_txt_sample,
                                   vectorizer=tfidf,
                                   classes=classes,
                                   max_train_lines=RANK_AT_K,
                                   max_test_lines=RANK_AT_K)

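# Evaluation loop: for every (Li, Ui) pair generated by the protocol, the quantifier is re-fitted
# on the query-specific training sample, and its prevalence estimate for the corresponding test
# sample is scored with mean absolute error (MAE).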
for method_name, quantifier in methods():
    print('Starting with method=', method_name)

    errors = []
    pbar = tqdm(experiment_prot(), total=49)  # 49 is (presumably) the number of query files in the collection
    for train, test in pbar:
        # print('Training prevalence:', F.strprev(training.prevalence()), 'shape', train.X.shape)
        # print('Test prevalence:', F.strprev(test.prevalence()), 'shape', test.X.shape)

        quantifier.fit(train)
        estim_prev = quantifier.quantify(test.instances)
        mae = qp.error.mae(test.prevalence(), estim_prev)
        errors.append(mae)

        pbar.set_description(f'mae={np.mean(errors):.4f}')
    print()