setting baseline experiments with data format
This commit is contained in:
parent
4284f1daa3
commit
65b2c2ce74
|
@ -0,0 +1,7 @@
|
||||||
|
1. los tests hay que hacerlos suponiendo que las etiquetas no existen, es decir, viendo los resultados en los ficheros "prevalences" (renombrar)
|
||||||
|
2. tablas?
|
||||||
|
3. fetch dataset (download, unzip, etc.)
|
||||||
|
4. model selection
|
||||||
|
5. plots
|
||||||
|
6. estoy leyendo los samples en orden, y no hace falta. Sería mejor una función genérica que lea todos los ejemplos y
|
||||||
|
que de todos modos genere un output con el mismo nombre que el fichero
|
|
@ -1,16 +1,27 @@
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import sklearn
|
||||||
|
|
||||||
|
|
||||||
def load_binary_raw_document(path):
|
# def load_binary_raw_document(path):
|
||||||
documents, labels = qp.data.from_text(path, verbose=0, class2int=True)
|
# documents, labels = qp.data.from_text(path, verbose=0, class2int=True)
|
||||||
labels = np.asarray(labels)
|
# labels = np.asarray(labels)
|
||||||
labels[np.logical_or(labels == 1, labels == 2)] = 0
|
# labels[np.logical_or(labels == 1, labels == 2)] = 0
|
||||||
labels[np.logical_or(labels == 4, labels == 5)] = 1
|
# labels[np.logical_or(labels == 4, labels == 5)] = 1
|
||||||
return documents, labels
|
# return documents, labels
|
||||||
|
|
||||||
|
|
||||||
def load_multiclass_raw_document(path):
    """Read a raw-text multiclass corpus from *path* via ``qp.data.from_text``.

    The reader is invoked silently (``verbose=0``) and with
    ``class2int=False``; its result is returned unchanged.
    """
    loaded = qp.data.from_text(path, verbose=0, class2int=False)
    return loaded
|
||||||
|
|
||||||
|
|
||||||
|
def load_binary_vectors(path, nF=None):
    """Load a vectorized dataset stored in svmlight / libsvm format.

    :param path: path of the svmlight-format file to read
    :param nF: number of features, forwarded as ``n_features`` to
        ``load_svmlight_file``; with ``None`` the loader decides it from the
        file contents
    :return: the value returned by ``load_svmlight_file`` (callers in this
        project unpack it as ``X, y``)
    """
    # A bare `import sklearn` does not guarantee that the `sklearn.datasets`
    # submodule is loaded, so import the loader explicitly instead of relying
    # on some other module having imported it first.
    from sklearn.datasets import load_svmlight_file

    return load_svmlight_file(path, n_features=nF)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Quick manual check: load the T1A training vectors and show what came back.
    vectors, targets = load_binary_vectors('./data/T1A/public/training_vectors.txt')
    print(vectors.shape)
    print(targets)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -8,16 +8,16 @@ from tqdm import tqdm
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
from quapy.data import LabelledCollection
|
from quapy.data import LabelledCollection
|
||||||
from quapy.method.aggregative import *
|
from quapy.method.aggregative import *
|
||||||
from data import load_binary_raw_document
|
from data import load_binary_vectors
|
||||||
import os
|
import os
|
||||||
|
|
||||||
path_binary_raw = 'binary_raw'
|
path_binary_vector = './data/T1A'
|
||||||
result_path = os.path.join('results', 'binary_raw')
|
result_path = os.path.join('results', 'T1A') # binary - vector
|
||||||
os.makedirs(result_path, exist_ok=True)
|
os.makedirs(result_path, exist_ok=True)
|
||||||
|
|
||||||
train_file = os.path.join(path_binary_raw, 'documents', 'training.txt')
|
train_file = os.path.join(path_binary_vector, 'public', 'training_vectors.txt')
|
||||||
|
|
||||||
train = LabelledCollection.load(train_file, load_binary_raw_document)
|
train = LabelledCollection.load(train_file, load_binary_vectors)
|
||||||
|
|
||||||
print(train.classes_)
|
print(train.classes_)
|
||||||
print(len(train))
|
print(len(train))
|
||||||
|
@ -36,7 +36,7 @@ for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]:
|
||||||
for sample_set, sample_size in [('validation', 1000)]:#, ('test', 5000)]:
|
for sample_set, sample_size in [('validation', 1000)]:#, ('test', 5000)]:
|
||||||
ae_errors, rae_errors = [], []
|
ae_errors, rae_errors = [], []
|
||||||
for i in tqdm(range(sample_size), total=sample_size, desc=f'testing {quantifier_name} in {sample_set}'):
|
for i in tqdm(range(sample_size), total=sample_size, desc=f'testing {quantifier_name} in {sample_set}'):
|
||||||
test_file = os.path.join(path_binary_raw, 'documents', f'{sample_set}_{i}.txt')
|
test_file = os.path.join(path_binary_vector, 'documents', f'{sample_set}_{i}.txt')
|
||||||
test = LabelledCollection.load(test_file, load_binary_raw_document, classes=train.classes_)
|
test = LabelledCollection.load(test_file, load_binary_raw_document, classes=train.classes_)
|
||||||
test.instances = tfidf.transform(test.instances)
|
test.instances = tfidf.transform(test.instances)
|
||||||
qp.environ['SAMPLE_SIZE'] = len(test)
|
qp.environ['SAMPLE_SIZE'] = len(test)
|
|
@ -0,0 +1,92 @@
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from tqdm import tqdm
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.data import LabelledCollection
|
||||||
|
from quapy.method.aggregative import *
|
||||||
|
import quapy.functional as F
|
||||||
|
from data import load_binary_vectors
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Where the T1A data lives and where the error files will be written.
path_binary_vector = './data/T1A'
result_path = os.path.join('results', 'T1A')  # binary - vector
os.makedirs(result_path, exist_ok=True)

# Training set, read from the svmlight-format vectors file.
train_file = os.path.join(path_binary_vector, 'public', 'training_vectors.txt')
train = LabelledCollection.load(train_file, load_binary_vectors)

# Number of columns of the training matrix.
train_shape = train.instances.shape
nF = train_shape[1]

print(f'number of classes: {len(train.classes_)}')
print(f'number of training documents: {len(train)}')
print(f'training prevalence: {F.strprev(train.prevalence())}')
print(f'training matrix shape: {train_shape}')

# Table describing the dev samples (first CSV column is used as the index).
dev_prev_file = os.path.join(path_binary_vector, 'public', 'dev_prevalences.csv')
dev_prev = pd.read_csv(dev_prev_file, index_col=0)
print(dev_prev)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# scores[quantifier_name][sample_set] -> {'mae': ..., 'mrae': ...}
scores = {}
for quantifier in [CC]:  # , ACC, PCC, PACC, EMQ, HDy]:

    classifier = CalibratedClassifierCV(LogisticRegression())
    model = quantifier(classifier).fit(train)
    quantifier_name = model.__class__.__name__

    scores[quantifier_name] = {}
    for sample_set, sample_size in [('dev', 1000)]:
        ae_errors, rae_errors = [], []
        for i, row in tqdm(dev_prev.iterrows(), total=len(dev_prev), desc=f'testing {quantifier_name} in {sample_set}'):
            filename = row['filename']
            # The row also carries the filename string, so the slice has object
            # dtype; cast to float so the error functions get a numeric array.
            prev_true = row[1:].values.astype(float)
            sample_path = os.path.join(path_binary_vector, 'public', f'{sample_set}_vectors', filename)
            # Second value returned by the loader is discarded here; the true
            # prevalence comes from the prevalences table instead.
            sample, _ = load_binary_vectors(sample_path, nF)
            qp.environ['SAMPLE_SIZE'] = sample.shape[0]
            prev_estim = model.quantify(sample)
            # prev_true = sample.prevalence()
            ae_errors.append(qp.error.mae(prev_true, prev_estim))
            rae_errors.append(qp.error.mrae(prev_true, prev_estim))

        ae_errors = np.asarray(ae_errors)
        rae_errors = np.asarray(rae_errors)

        mae = ae_errors.mean()
        mrae = rae_errors.mean()
        scores[quantifier_name][sample_set] = {'mae': mae, 'mrae': mrae}

        # Persist the per-sample errors; `with` guarantees the files are
        # flushed and closed even if pickling fails.
        for error_name, errors in (('ae', ae_errors), ('rae', rae_errors)):
            error_file = os.path.join(result_path, f'{quantifier_name}.{sample_set}.{error_name}.pickle')
            with open(error_file, 'wb') as fout:
                pickle.dump(errors, fout, pickle.HIGHEST_PROTOCOL)

        print(f'{quantifier_name} {sample_set} MAE={mae:.4f}')
        print(f'{quantifier_name} {sample_set} MRAE={mrae:.4f}')
|
||||||
|
|
||||||
|
# Tab-separated summary: one line per (quantifier, sample set) with MAE and MRAE.
# Iterate over the sample sets actually stored in `scores` rather than a
# hard-coded name: results are recorded under 'dev', so looking up
# 'validation' raised a KeyError.
for model in scores:
    for sample_set, result in scores[model].items():
        print(f'{model}\t{result["mae"]:.4f}\t{result["mrae"]:.4f}')
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
test:
|
||||||
|
CC 0.1859 1.5406
|
||||||
|
ACC 0.0453 0.2840
|
||||||
|
PCC 0.1793 1.7187
|
||||||
|
PACC 0.0287 0.1494
|
||||||
|
EMQ 0.0225 0.1020
|
||||||
|
HDy 0.0631 0.2307
|
||||||
|
|
||||||
|
validation
|
||||||
|
CC 0.1862 1.9587
|
||||||
|
ACC 0.0394 0.2669
|
||||||
|
PCC 0.1789 2.1383
|
||||||
|
PACC 0.0354 0.1587
|
||||||
|
EMQ 0.0224 0.0960
|
||||||
|
HDy 0.0467 0.2121
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
|
@ -8,6 +8,7 @@ from tqdm import tqdm
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
from quapy.data import LabelledCollection
|
from quapy.data import LabelledCollection
|
||||||
from quapy.method.aggregative import *
|
from quapy.method.aggregative import *
|
||||||
|
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
|
||||||
from data import load_multiclass_raw_document
|
from data import load_multiclass_raw_document
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
@ -30,10 +31,10 @@ train.instances = tfidf.fit_transform(train.instances)
|
||||||
print(train.instances.shape[1])
|
print(train.instances.shape[1])
|
||||||
|
|
||||||
scores = {}
|
scores = {}
|
||||||
for quantifier in [CC, ACC, PCC, PACC, EMQ]:#, HDy]:
|
for quantifier in [MLPE()]:#[CC, ACC, PCC, PACC, EMQ]:#, HDy]:
|
||||||
classifier = CalibratedClassifierCV(LogisticRegression())
|
# classifier = CalibratedClassifierCV(LogisticRegression())
|
||||||
# classifier = LogisticRegression()
|
# model = quantifier(classifier).fit(train)
|
||||||
model = quantifier(classifier).fit(train)
|
model = quantifier.fit(train)
|
||||||
print('model trained')
|
print('model trained')
|
||||||
|
|
||||||
quantifier_name = model.__class__.__name__
|
quantifier_name = model.__class__.__name__
|
||||||
|
@ -67,11 +68,20 @@ for model in scores:
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
test:
|
|
||||||
|
|
||||||
|
MLPE validation 0.0423 4.8582
|
||||||
|
CC validation 0.0308 2.9731
|
||||||
|
PCC validation 0.0296 3.3926
|
||||||
|
ACC validation 0.0328 3.1461
|
||||||
|
PACC validation 0.0176 1.6449
|
||||||
|
EMQ validation 0.0207 1.6960
|
||||||
|
|
||||||
validation
|
MLPE test 0.0423 4.6083
|
||||||
|
CC test 0.0308 2.9037
|
||||||
|
PCC test 0.0296 3.2764
|
||||||
|
ACC test 0.0328 3.0674
|
||||||
|
PACC test 0.0174 1.5892
|
||||||
|
EMQ test 0.0207 1.6059
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -35,7 +35,7 @@ class LabelledCollection:
|
||||||
self.classes_ = np.unique(np.asarray(classes_))
|
self.classes_ = np.unique(np.asarray(classes_))
|
||||||
self.classes_.sort()
|
self.classes_.sort()
|
||||||
if len(set(self.labels).difference(set(classes_))) > 0:
|
if len(set(self.labels).difference(set(classes_))) > 0:
|
||||||
raise ValueError('labels contains values not included in classes_')
|
raise ValueError(f'labels ({set(self.labels)}) contain values not included in classes_ ({set(classes_)})')
|
||||||
self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_}
|
self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|
|
@ -10,6 +10,7 @@ class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
|
||||||
def fit(self, data: LabelledCollection, *args):
    """Memorize the classes and the prevalence of the training collection.

    :param data: training collection; its ``classes_`` attribute and the
        result of its ``prevalence()`` method are stored on ``self``
    :param args: accepted but not used
    :return: ``self``
    """
    self._classes_ = data.classes_
    training_prevalence = data.prevalence()
    self.estimated_prevalence = training_prevalence
    return self
|
||||||
|
|
||||||
def quantify(self, documents, *args):
    """Return the prevalence stored in ``self.estimated_prevalence``.

    ``documents`` and ``args`` are accepted but not used: the same stored
    value is returned for every input.
    """
    prevalence = self.estimated_prevalence
    return prevalence
|
||||||
|
|
Loading…
Reference in New Issue