diff --git a/LeQua2022/TODO.txt b/LeQua2022/TODO.txt new file mode 100644 index 0000000..61c56cd --- /dev/null +++ b/LeQua2022/TODO.txt @@ -0,0 +1,7 @@ +1. los test hay que hacerlos suponiendo que las etiquetas no existen, es decir, viendo los resultados en los ficheros "prevalences" (renominar) +2. tablas? +3. fetch dataset (download, unzip, etc.) +4. model selection +5. plots +6. estoy leyendo los samples en orden, y no hace falta. Sería mejor una función genérica que lee todos los ejemplos y + que de todos modos genera un output con el mismo nombre del file \ No newline at end of file diff --git a/LeQua2022/data.py b/LeQua2022/data.py index 97d1a7d..f4be5a6 100644 --- a/LeQua2022/data.py +++ b/LeQua2022/data.py @@ -1,16 +1,27 @@ import quapy as qp import numpy as np +import sklearn -def load_binary_raw_document(path): - documents, labels = qp.data.from_text(path, verbose=0, class2int=True) - labels = np.asarray(labels) - labels[np.logical_or(labels == 1, labels == 2)] = 0 - labels[np.logical_or(labels == 4, labels == 5)] = 1 - return documents, labels +# def load_binary_raw_document(path): +# documents, labels = qp.data.from_text(path, verbose=0, class2int=True) +# labels = np.asarray(labels) +# labels[np.logical_or(labels == 1, labels == 2)] = 0 +# labels[np.logical_or(labels == 4, labels == 5)] = 1 +# return documents, labels def load_multiclass_raw_document(path): return qp.data.from_text(path, verbose=0, class2int=False) +def load_binary_vectors(path, nF=None): + return sklearn.datasets.load_svmlight_file(path, n_features=nF) + + +if __name__ == '__main__': + X, y = load_binary_vectors('./data/T1A/public/training_vectors.txt') + print(X.shape) + print(y) + + diff --git a/LeQua2022/main_binary.py b/LeQua2022/main_binary_raw.py similarity index 86% rename from LeQua2022/main_binary.py rename to LeQua2022/main_binary_raw.py index 653de00..5ec89a2 100644 --- a/LeQua2022/main_binary.py +++ b/LeQua2022/main_binary_raw.py @@ -8,16 +8,16 @@ from tqdm import tqdm import quapy as qp from quapy.data import LabelledCollection from quapy.method.aggregative import * -from data import load_binary_raw_document +from data import load_binary_vectors import os -path_binary_raw = 'binary_raw' -result_path = os.path.join('results', 'binary_raw') +path_binary_vector = './data/T1A' +result_path = os.path.join('results', 'T1A') # binary - vector os.makedirs(result_path, exist_ok=True) -train_file = os.path.join(path_binary_raw, 'documents', 'training.txt') +train_file = os.path.join(path_binary_vector, 'public', 'training_vectors.txt') -train = LabelledCollection.load(train_file, load_binary_raw_document) +train = LabelledCollection.load(train_file, load_binary_vectors) print(train.classes_) print(len(train)) @@ -36,7 +36,7 @@ for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]: for sample_set, sample_size in [('validation', 1000)]:#, ('test', 5000)]: ae_errors, rae_errors = [], [] for i in tqdm(range(sample_size), total=sample_size, desc=f'testing {quantifier_name} in {sample_set}'): - test_file = os.path.join(path_binary_raw, 'documents', f'{sample_set}_{i}.txt') + test_file = os.path.join(path_binary_vector, 'documents', f'{sample_set}_{i}.txt') test = LabelledCollection.load(test_file, load_binary_raw_document, classes=train.classes_) test.instances = tfidf.transform(test.instances) qp.environ['SAMPLE_SIZE'] = len(test) diff --git a/LeQua2022/main_binary_vector.py b/LeQua2022/main_binary_vector.py new file mode 100644 index 0000000..5a60520 --- /dev/null +++ b/LeQua2022/main_binary_vector.py @@ -0,0 +1,92 @@ +import pickle + +import numpy as np +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.linear_model import LogisticRegression +from tqdm import tqdm +import pandas as pd + +import quapy as qp +from quapy.data import LabelledCollection +from quapy.method.aggregative import * +import quapy.functional as F +from data import load_binary_vectors +import os + +path_binary_vector = './data/T1A' +result_path = os.path.join('results', 'T1A') # binary - vector +os.makedirs(result_path, exist_ok=True) + +train_file = os.path.join(path_binary_vector, 'public', 'training_vectors.txt') + +train = LabelledCollection.load(train_file, load_binary_vectors) + +nF = train.instances.shape[1] + +print(f'number of classes: {len(train.classes_)}') +print(f'number of training documents: {len(train)}') +print(f'training prevalence: {F.strprev(train.prevalence())}') +print(f'training matrix shape: {train.instances.shape}') + +dev_prev = pd.read_csv(os.path.join(path_binary_vector, 'public', 'dev_prevalences.csv'), index_col=0) +print(dev_prev) + + + + +scores = {} +for quantifier in [CC]: #, ACC, PCC, PACC, EMQ, HDy]: + + classifier = CalibratedClassifierCV(LogisticRegression()) + model = quantifier(classifier).fit(train) + quantifier_name = model.__class__.__name__ + + scores[quantifier_name]={} + for sample_set, sample_size in [('dev', 1000)]: + ae_errors, rae_errors = [], [] + for i, row in tqdm(dev_prev.iterrows(), total=len(dev_prev), desc=f'testing {quantifier_name} in {sample_set}'): + filename = row['filename'] + prev_true = row[1:].values + sample_path = os.path.join(path_binary_vector, 'public', f'{sample_set}_vectors', filename) + sample, _ = load_binary_vectors(sample_path, nF) + qp.environ['SAMPLE_SIZE'] = sample.shape[0] + prev_estim = model.quantify(sample) + # prev_true = sample.prevalence() + ae_errors.append(qp.error.mae(prev_true, prev_estim)) + rae_errors.append(qp.error.mrae(prev_true, prev_estim)) + + ae_errors = np.asarray(ae_errors) + rae_errors = np.asarray(rae_errors) + + mae = ae_errors.mean() + mrae = rae_errors.mean() + scores[quantifier_name][sample_set] = {'mae': mae, 'mrae': mrae} + pickle.dump(ae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.ae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL) + pickle.dump(rae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.rae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL) + print(f'{quantifier_name} {sample_set} MAE={mae:.4f}') + print(f'{quantifier_name} {sample_set} MRAE={mrae:.4f}') + +for model in scores: + for sample_set in ['validation']:#, 'test']: + print(f'{model}\t{scores[model][sample_set]["mae"]:.4f}\t{scores[model][sample_set]["mrae"]:.4f}') + + +""" +test: +CC 0.1859 1.5406 +ACC 0.0453 0.2840 +PCC 0.1793 1.7187 +PACC 0.0287 0.1494 +EMQ 0.0225 0.1020 +HDy 0.0631 0.2307 + +validation +CC 0.1862 1.9587 +ACC 0.0394 0.2669 +PCC 0.1789 2.1383 +PACC 0.0354 0.1587 +EMQ 0.0224 0.0960 +HDy 0.0467 0.2121 +""" + + diff --git a/LeQua2022/main_multiclass.py b/LeQua2022/main_multiclass.py index a999a4e..1a6c63c 100644 --- a/LeQua2022/main_multiclass.py +++ b/LeQua2022/main_multiclass.py @@ -8,6 +8,7 @@ from tqdm import tqdm import quapy as qp from quapy.data import LabelledCollection from quapy.method.aggregative import * +from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE from data import load_multiclass_raw_document import os @@ -30,10 +31,10 @@ train.instances = tfidf.fit_transform(train.instances) print(train.instances.shape[1]) scores = {} -for quantifier in [CC, ACC, PCC, PACC, EMQ]:#, HDy]: - classifier = CalibratedClassifierCV(LogisticRegression()) - # classifier = LogisticRegression() - model = quantifier(classifier).fit(train) +for quantifier in [MLPE()]:#[CC, ACC, PCC, PACC, EMQ]:#, HDy]: + # classifier = CalibratedClassifierCV(LogisticRegression()) + # model = quantifier(classifier).fit(train) + model = quantifier.fit(train) print('model trained') quantifier_name = model.__class__.__name__ @@ -67,11 +68,20 @@ for model in scores: """ -test: +MLPE validation 0.0423 4.8582 +CC validation 0.0308 2.9731 +PCC validation 0.0296 3.3926 +ACC validation 0.0328 3.1461 +PACC validation 0.0176 1.6449 +EMQ validation 0.0207 1.6960 -validation - +MLPE test 0.0423 4.6083 +CC test 0.0308 2.9037 +PCC test 0.0296 3.2764 +ACC test 0.0328 3.0674 +PACC test 0.0174 1.5892 +EMQ test 0.0207 1.6059 """ diff --git a/quapy/data/base.py b/quapy/data/base.py index 7799c18..1a631d7 100644 --- a/quapy/data/base.py +++ b/quapy/data/base.py @@ -35,7 +35,7 @@ class LabelledCollection: self.classes_ = np.unique(np.asarray(classes_)) self.classes_.sort() if len(set(self.labels).difference(set(classes_))) > 0: - raise ValueError('labels contains values not included in classes_') + raise ValueError(f'labels ({set(self.labels)}) contain values not included in classes_ ({set(classes_)})') self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_} @classmethod diff --git a/quapy/method/non_aggregative.py b/quapy/method/non_aggregative.py index 94b7c50..bc0a99a 100644 --- a/quapy/method/non_aggregative.py +++ b/quapy/method/non_aggregative.py @@ -10,6 +10,7 @@ class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier): def fit(self, data: LabelledCollection, *args): self._classes_ = data.classes_ self.estimated_prevalence = data.prevalence() + return self def quantify(self, documents, *args): return self.estimated_prevalence