diff --git a/LeQua2022/baselinesSVD_T1A.py b/LeQua2022/baselinesSVD_T1A.py index c0fdc15..576a7ee 100644 --- a/LeQua2022/baselinesSVD_T1A.py +++ b/LeQua2022/baselinesSVD_T1A.py @@ -64,14 +64,6 @@ for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]: print(f'{quantifier_name} mae={mae:.3f} mrae={mrae:.3f}') """ -test: -CC 0.1859 1.5406 -ACC 0.0453 0.2840 -PCC 0.1793 1.7187 -PACC 0.0287 0.1494 -EMQ 0.0225 0.1020 -HDy 0.0631 0.2307 - validation CC 0.1862 1.9587 ACC 0.0394 0.2669 diff --git a/LeQua2022/baselines_T1A.py b/LeQua2022/baselines_T1A.py index 179995c..48d42d4 100644 --- a/LeQua2022/baselines_T1A.py +++ b/LeQua2022/baselines_T1A.py @@ -59,14 +59,6 @@ for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]: print(f'{quantifier_name} mae={mae:.3f} mrae={mrae:.3f}') """ -test: -CC 0.1859 1.5406 -ACC 0.0453 0.2840 -PCC 0.1793 1.7187 -PACC 0.0287 0.1494 -EMQ 0.0225 0.1020 -HDy 0.0631 0.2307 - validation CC 0.1862 1.9587 ACC 0.0394 0.2669 diff --git a/LeQua2022/baselines_T1Amodsel.py b/LeQua2022/baselines_T1Amodsel.py index d6158dc..12d658a 100644 --- a/LeQua2022/baselines_T1Amodsel.py +++ b/LeQua2022/baselines_T1Amodsel.py @@ -69,14 +69,6 @@ for quantifier in [EMQ]: # [CC, ACC, PCC, PACC, EMQ, HDy]: """ -test: -CC 0.1859 1.5406 -ACC 0.0453 0.2840 -PCC 0.1793 1.7187 -PACC 0.0287 0.1494 -EMQ 0.0225 0.1020 -HDy 0.0631 0.2307 - validation CC 0.1862 1.9587 ACC 0.0394 0.2669 diff --git a/LeQua2022/main_binary_raw.py b/LeQua2022/main_binary_raw.py deleted file mode 100644 index bae5bc3..0000000 --- a/LeQua2022/main_binary_raw.py +++ /dev/null @@ -1,66 +0,0 @@ -import pickle - -import numpy as np -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.linear_model import LogisticRegression -from tqdm import tqdm - -import quapy as qp -from quapy.data import LabelledCollection -from quapy.method.aggregative import * -from data import load_binary_vectors -import os - -path_binary_vector = './data/T1A' -result_path = os.path.join('results', 'T1A') # binary - vector -os.makedirs(result_path, exist_ok=True) - -train_file = os.path.join(path_binary_vector, 'public', 'training_vectors.txt') - -train = LabelledCollection.load(train_file, load_binary_vectors) - -print(train.classes_) -print(len(train)) -print(train.prevalence()) - -tfidf = TfidfVectorizer(min_df=5) -train.instances = tfidf.fit_transform(train.instances) - -scores = {} -for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]: - classifier = CalibratedClassifierCV(LogisticRegression()) - model = quantifier(classifier).fit(train) - - quantifier_name = model.__class__.__name__ - scores[quantifier_name]={} - for sample_set, sample_size in [('validation', 1000)]:#, ('test', 5000)]: - ae_errors, rae_errors = [], [] - for i in tqdm(range(sample_size), total=sample_size, desc=f'testing {quantifier_name} in {sample_set}'): - test_file = os.path.join(path_binary_vector, 'documents', f'{sample_set}_{i}.txt') - test = LabelledCollection.load(test_file, load_binary_raw_document, classes=train.classes_) - test.instances = tfidf.transform(test.instances) - qp.environ['SAMPLE_SIZE'] = len(test) - prev_estim = model.quantify(test.instances) - prev_true = test.prevalence() - ae_errors.append(qp.error.mae(prev_true, prev_estim)) - rae_errors.append(qp.error.mrae(prev_true, prev_estim)) - - ae_errors = np.asarray(ae_errors) - rae_errors = np.asarray(rae_errors) - - mae = ae_errors.mean() - mrae = rae_errors.mean() - scores[quantifier_name][sample_set] = {'mae': mae, 'mrae': mrae} - pickle.dump(ae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.ae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL) - pickle.dump(rae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.rae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL) - print(f'{quantifier_name} {sample_set} MAE={mae:.4f}') - print(f'{quantifier_name} {sample_set} MRAE={mrae:.4f}') - -for model in scores: - for sample_set in ['validation']:#, 'test']: - print(f'{model}\t{scores[model][sample_set]["mae"]:.4f}\t{scores[model][sample_set]["mrae"]:.4f}') - - - - - diff --git a/LeQua2022/main_multiclass.py b/LeQua2022/main_multiclass.py deleted file mode 100644 index 1a6c63c..0000000 --- a/LeQua2022/main_multiclass.py +++ /dev/null @@ -1,87 +0,0 @@ -import pickle - -import numpy as np -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.linear_model import LogisticRegression -from tqdm import tqdm - -import quapy as qp -from quapy.data import LabelledCollection -from quapy.method.aggregative import * -from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE -from data import load_multiclass_raw_document -import os - -path_multiclass_raw = 'multiclass_raw' -result_path = os.path.join('results', 'multiclass_raw') -os.makedirs(result_path, exist_ok=True) - -train_file = os.path.join(path_multiclass_raw, 'documents', 'training.txt') - -train = LabelledCollection.load(train_file, load_multiclass_raw_document) - -print('classes', train.classes_) -print('#classes', len(train.classes_)) -print('#docs', len(train)) -print('prevalence', train.prevalence()) -print('counts', train.counts()) - -tfidf = TfidfVectorizer(min_df=5) -train.instances = tfidf.fit_transform(train.instances) -print(train.instances.shape[1]) - -scores = {} -for quantifier in [MLPE()]:#[CC, ACC, PCC, PACC, EMQ]:#, HDy]: - # classifier = CalibratedClassifierCV(LogisticRegression()) - # model = quantifier(classifier).fit(train) - model = quantifier.fit(train) - print('model trained') - - quantifier_name = model.__class__.__name__ - scores[quantifier_name]={} - for sample_set, sample_size in [('validation', 1000), ('test', 5000)]: - ae_errors, rae_errors = [], [] - for i in tqdm(range(sample_size), total=sample_size, desc=f'testing {quantifier_name} in {sample_set}'): - test_file = os.path.join(path_multiclass_raw, 'documents', f'{sample_set}_{i}.txt') - test = LabelledCollection.load(test_file, load_multiclass_raw_document, classes=train.classes_) - test.instances = tfidf.transform(test.instances) - qp.environ['SAMPLE_SIZE'] = len(test) - prev_estim = model.quantify(test.instances) - prev_true = test.prevalence() - ae_errors.append(qp.error.mae(prev_true, prev_estim)) - rae_errors.append(qp.error.mrae(prev_true, prev_estim)) - - ae_errors = np.asarray(ae_errors) - rae_errors = np.asarray(rae_errors) - - mae = ae_errors.mean() - mrae = rae_errors.mean() - scores[quantifier_name][sample_set] = {'mae': mae, 'mrae': mrae} - pickle.dump(ae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.ae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL) - pickle.dump(rae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.rae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL) - print(f'{quantifier_name} {sample_set} MAE={mae:.4f}') - print(f'{quantifier_name} {sample_set} MRAE={mrae:.4f}') - -for model in scores: - for sample_set in ['validation', 'test']: - print(f'{model}\t{sample_set}\t{scores[model][sample_set]["mae"]:.4f}\t{scores[model][sample_set]["mrae"]:.4f}') - - -""" - -MLPE validation 0.0423 4.8582 -CC validation 0.0308 2.9731 -PCC validation 0.0296 3.3926 -ACC validation 0.0328 3.1461 -PACC validation 0.0176 1.6449 -EMQ validation 0.0207 1.6960 - -MLPE test 0.0423 4.6083 -CC test 0.0308 2.9037 -PCC test 0.0296 3.2764 -ACC test 0.0328 3.0674 -PACC test 0.0174 1.5892 -EMQ test 0.0207 1.6059 -""" - -