setting baseline experiments with data format

Alejandro Moreo Fernandez 2021-10-21 17:14:40 +02:00
parent 4284f1daa3
commit 65b2c2ce74
7 changed files with 141 additions and 20 deletions

LeQua2022/TODO.txt (new file)

@@ -0,0 +1,7 @@
1. the tests have to be run as if the labels did not exist, i.e., by checking the results against the "prevalences" files (to be renamed)
2. tables?
3. fetch dataset (download, unzip, etc.)
4. model selection
5. plots
6. the samples are currently read in order, which is not necessary; better would be a generic function that reads all the samples
and in any case produces an output named after the sample file (see the sketch right after this list)
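
A possible shape for the generic reader mentioned in point 6 (a sketch only; all names here are illustrative and not part of the commit):

import os
import glob

def gen_load_samples(dir_path, load_fn, **load_kwargs):
    # illustrative sketch: load every sample file found in a directory, in
    # whatever order the filesystem yields them, pairing each loaded sample
    # with its original file name so outputs can be written under that name
    for path in glob.glob(os.path.join(dir_path, '*.txt')):
        yield os.path.basename(path), load_fn(path, **load_kwargs)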

LeQua2022/data.py

@@ -1,16 +1,27 @@
import quapy as qp
import numpy as np
+import sklearn.datasets

-def load_binary_raw_document(path):
-    documents, labels = qp.data.from_text(path, verbose=0, class2int=True)
-    labels = np.asarray(labels)
-    labels[np.logical_or(labels == 1, labels == 2)] = 0
-    labels[np.logical_or(labels == 4, labels == 5)] = 1
-    return documents, labels
+# def load_binary_raw_document(path):
+#     documents, labels = qp.data.from_text(path, verbose=0, class2int=True)
+#     labels = np.asarray(labels)
+#     labels[np.logical_or(labels == 1, labels == 2)] = 0
+#     labels[np.logical_or(labels == 4, labels == 5)] = 1
+#     return documents, labels

def load_multiclass_raw_document(path):
    return qp.data.from_text(path, verbose=0, class2int=False)

+def load_binary_vectors(path, nF=None):
+    return sklearn.datasets.load_svmlight_file(path, n_features=nF)

+if __name__ == '__main__':
+    X, y = load_binary_vectors('./data/T1A/public/training_vectors.txt')
+    print(X.shape)
+    print(y)
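
The nF parameter matters when samples are loaded one at a time: load_svmlight_file infers the column count from the largest feature index present in a file, so a small sample could come back narrower than the training matrix. A minimal sketch of the intended use (the dev sample path below is illustrative, not taken from the commit):

X, y = load_binary_vectors('./data/T1A/public/training_vectors.txt')
nF = X.shape[1]
# pinning n_features keeps every sample aligned with the training matrix
Xs, _ = load_binary_vectors('./data/T1A/public/dev_vectors/0.txt', nF)  # illustrative path
assert Xs.shape[1] == nF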


@@ -8,16 +8,16 @@ from tqdm import tqdm
import quapy as qp
from quapy.data import LabelledCollection
from quapy.method.aggregative import *
-from data import load_binary_raw_document
+from data import load_binary_vectors
import os

-path_binary_raw = 'binary_raw'
-result_path = os.path.join('results', 'binary_raw')
+path_binary_vector = './data/T1A'
+result_path = os.path.join('results', 'T1A')  # binary - vector
os.makedirs(result_path, exist_ok=True)

-train_file = os.path.join(path_binary_raw, 'documents', 'training.txt')
+train_file = os.path.join(path_binary_vector, 'public', 'training_vectors.txt')
-train = LabelledCollection.load(train_file, load_binary_raw_document)
+train = LabelledCollection.load(train_file, load_binary_vectors)

print(train.classes_)
print(len(train))

@@ -36,7 +36,7 @@ for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]:
    for sample_set, sample_size in [('validation', 1000)]:  #, ('test', 5000)]:
        ae_errors, rae_errors = [], []
        for i in tqdm(range(sample_size), total=sample_size, desc=f'testing {quantifier_name} in {sample_set}'):
-            test_file = os.path.join(path_binary_raw, 'documents', f'{sample_set}_{i}.txt')
+            test_file = os.path.join(path_binary_vector, 'documents', f'{sample_set}_{i}.txt')
            test = LabelledCollection.load(test_file, load_binary_raw_document, classes=train.classes_)
            test.instances = tfidf.transform(test.instances)
            qp.environ['SAMPLE_SIZE'] = len(test)


@@ -0,0 +1,92 @@
import pickle
import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
import pandas as pd

import quapy as qp
from quapy.data import LabelledCollection
from quapy.method.aggregative import *
import quapy.functional as F
from data import load_binary_vectors
import os

path_binary_vector = './data/T1A'
result_path = os.path.join('results', 'T1A')  # binary - vector
os.makedirs(result_path, exist_ok=True)

train_file = os.path.join(path_binary_vector, 'public', 'training_vectors.txt')
train = LabelledCollection.load(train_file, load_binary_vectors)
nF = train.instances.shape[1]

print(f'number of classes: {len(train.classes_)}')
print(f'number of training documents: {len(train)}')
print(f'training prevalence: {F.strprev(train.prevalence())}')
print(f'training matrix shape: {train.instances.shape}')

dev_prev = pd.read_csv(os.path.join(path_binary_vector, 'public', 'dev_prevalences.csv'), index_col=0)
print(dev_prev)

scores = {}
for quantifier in [CC]:  #, ACC, PCC, PACC, EMQ, HDy]:
    classifier = CalibratedClassifierCV(LogisticRegression())
    model = quantifier(classifier).fit(train)
    quantifier_name = model.__class__.__name__
    scores[quantifier_name] = {}

    for sample_set, sample_size in [('dev', 1000)]:
        ae_errors, rae_errors = [], []
        for i, row in tqdm(dev_prev.iterrows(), total=len(dev_prev), desc=f'testing {quantifier_name} in {sample_set}'):
            filename = row['filename']
            prev_true = row[1:].values
            sample_path = os.path.join(path_binary_vector, 'public', f'{sample_set}_vectors', filename)
            sample, _ = load_binary_vectors(sample_path, nF)
            qp.environ['SAMPLE_SIZE'] = sample.shape[0]
            prev_estim = model.quantify(sample)
            # prev_true = sample.prevalence()
            ae_errors.append(qp.error.mae(prev_true, prev_estim))
            rae_errors.append(qp.error.mrae(prev_true, prev_estim))

        ae_errors = np.asarray(ae_errors)
        rae_errors = np.asarray(rae_errors)
        mae = ae_errors.mean()
        mrae = rae_errors.mean()
        scores[quantifier_name][sample_set] = {'mae': mae, 'mrae': mrae}
        pickle.dump(ae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.ae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL)
        pickle.dump(rae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.rae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL)
        print(f'{quantifier_name} {sample_set} MAE={mae:.4f}')
        print(f'{quantifier_name} {sample_set} MRAE={mrae:.4f}')

for model in scores:
    for sample_set in ['dev']:  #, 'test']:
        print(f'{model}\t{scores[model][sample_set]["mae"]:.4f}\t{scores[model][sample_set]["mrae"]:.4f}')

"""
test:
CC 0.1859 1.5406
ACC 0.0453 0.2840
PCC 0.1793 1.7187
PACC 0.0287 0.1494
EMQ 0.0225 0.1020
HDy 0.0631 0.2307

validation:
CC 0.1862 1.9587
ACC 0.0394 0.2669
PCC 0.1789 2.1383
PACC 0.0354 0.1587
EMQ 0.0224 0.0960
HDy 0.0467 0.2121
"""


@@ -8,6 +8,7 @@ from tqdm import tqdm
import quapy as qp
from quapy.data import LabelledCollection
from quapy.method.aggregative import *
+from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
from data import load_multiclass_raw_document
import os

@@ -30,10 +31,10 @@ train.instances = tfidf.fit_transform(train.instances)
print(train.instances.shape[1])

scores = {}
-for quantifier in [CC, ACC, PCC, PACC, EMQ]:  #, HDy]:
-    classifier = CalibratedClassifierCV(LogisticRegression())
-    # classifier = LogisticRegression()
-    model = quantifier(classifier).fit(train)
+for quantifier in [MLPE()]:  #[CC, ACC, PCC, PACC, EMQ]:#, HDy]:
+    # classifier = CalibratedClassifierCV(LogisticRegression())
+    # model = quantifier(classifier).fit(train)
+    model = quantifier.fit(train)
    print('model trained')
    quantifier_name = model.__class__.__name__

@@ -67,11 +68,20 @@ for model in scores:

"""
validation:
MLPE validation 0.0423 4.8582
CC validation 0.0308 2.9731
PCC validation 0.0296 3.3926
ACC validation 0.0328 3.1461
PACC validation 0.0176 1.6449
EMQ validation 0.0207 1.6960

test:
MLPE test 0.0423 4.6083
CC test 0.0308 2.9037
PCC test 0.0296 3.2764
ACC test 0.0328 3.0674
PACC test 0.0174 1.5892
EMQ test 0.0207 1.6059
"""

quapy/data/base.py

@@ -35,7 +35,7 @@ class LabelledCollection:
        self.classes_ = np.unique(np.asarray(classes_))
        self.classes_.sort()
        if len(set(self.labels).difference(set(classes_))) > 0:
-            raise ValueError('labels contains values not included in classes_')
+            raise ValueError(f'labels ({set(self.labels)}) contain values not included in classes_ ({set(classes_)})')
        self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_}

    @classmethod
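
A minimal illustration of what the richer message reports (values made up; this assumes the constructor is called directly with instances, labels and classes_):

from quapy.data import LabelledCollection

# label 3 is not among the declared classes, so the constructor now raises:
# ValueError: labels ({1, 3}) contain values not included in classes_ ({1, 2})
LabelledCollection(['doc a', 'doc b'], labels=[1, 3], classes_=[1, 2])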

quapy/method/non_aggregative.py

@@ -10,6 +10,7 @@ class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
    def fit(self, data: LabelledCollection, *args):
        self._classes_ = data.classes_
        self.estimated_prevalence = data.prevalence()
+        return self

    def quantify(self, documents, *args):
        return self.estimated_prevalence
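
The added return self makes fit chainable, in line with the aggregative quantifiers; given some LabelledCollection train, one can now write:

from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE

# before this commit, fit returned None and the chained call below failed;
# MLPE ignores the test instances, so anything can be passed to quantify
prev = MLPE().fit(train).quantify(None)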