2021-10-13 20:36:53 +02:00
|
|
|
import pickle
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
from sklearn.linear_model import LogisticRegression
|
|
|
|
from tqdm import tqdm
|
2021-10-21 17:14:40 +02:00
|
|
|
import pandas as pd
|
2021-10-13 20:36:53 +02:00
|
|
|
|
|
|
|
import quapy as qp
|
|
|
|
from quapy.data import LabelledCollection
|
|
|
|
from quapy.method.aggregative import *
|
2021-10-21 17:14:40 +02:00
|
|
|
import quapy.functional as F
|
|
|
|
from data import load_binary_vectors
|
2021-10-13 20:36:53 +02:00
|
|
|
import os
|
|
|
|
|
2021-10-21 17:14:40 +02:00
|
|
|
# Locations of the T1A (binary task, vector representation) data and results.
path_binary_vector = './data/T1A'
result_path = os.path.join('results', 'T1A')  # binary - vector
os.makedirs(result_path, exist_ok=True)

# The public portion of the dataset lives under <task>/public.
public_dir = os.path.join(path_binary_vector, 'public')

# Load the training documents as a LabelledCollection of feature vectors.
train_file = os.path.join(public_dir, 'training_vectors.txt')
train = LabelledCollection.load(train_file, load_binary_vectors)

# Number of features; needed later to parse the sample files consistently.
nF = train.instances.shape[1]

# Basic diagnostics about the training collection.
print(f'number of classes: {len(train.classes_)}')
print(f'number of training documents: {len(train)}')
print(f'training prevalence: {F.strprev(train.prevalence())}')
print(f'training matrix shape: {train.instances.shape}')

# Ground-truth prevalence values for every development sample.
dev_prev = pd.read_csv(os.path.join(public_dir, 'dev_prevalences.csv'), index_col=0)
print(dev_prev)
|
2021-10-13 20:36:53 +02:00
|
|
|
|
|
|
|
|
|
|
|
# Per-quantifier, per-sample-set aggregated errors: scores[name][set] -> {'mae', 'mrae'}
scores = {}

for quantifier in [CC]: #, ACC, PCC, PACC, EMQ, HDy]:

    # A fresh calibrated classifier per quantifier so models share no fitted state.
    classifier = CalibratedClassifierCV(LogisticRegression())
    model = quantifier(classifier).fit(train)
    quantifier_name = model.__class__.__name__

    scores[quantifier_name] = {}
    for sample_set, sample_size in [('dev', 1000)]:
        ae_errors, rae_errors = [], []
        for i, row in tqdm(dev_prev.iterrows(), total=len(dev_prev), desc=f'testing {quantifier_name} in {sample_set}'):
            filename = row['filename']
            # True prevalence: every column after 'filename' (assumes 'filename'
            # is the first column of the prevalence csv — holds for dev_prevalences.csv).
            prev_true = row[1:].values
            sample_path = os.path.join(path_binary_vector, 'public', f'{sample_set}_vectors', filename)
            sample, _ = load_binary_vectors(sample_path, nF)
            # quapy error metrics read the sample size from the environment.
            qp.environ['SAMPLE_SIZE'] = sample.shape[0]
            prev_estim = model.quantify(sample)
            # prev_true = sample.prevalence()
            ae_errors.append(qp.error.mae(prev_true, prev_estim))
            rae_errors.append(qp.error.mrae(prev_true, prev_estim))

        ae_errors = np.asarray(ae_errors)
        rae_errors = np.asarray(rae_errors)

        mae = ae_errors.mean()
        mrae = rae_errors.mean()
        scores[quantifier_name][sample_set] = {'mae': mae, 'mrae': mrae}

        # fix: the original passed an unclosed open(...) handle straight to
        # pickle.dump, leaking file descriptors; use `with` so files are closed.
        with open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.ae.pickle'), 'wb') as fout:
            pickle.dump(ae_errors, fout, pickle.HIGHEST_PROTOCOL)
        with open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.rae.pickle'), 'wb') as fout:
            pickle.dump(rae_errors, fout, pickle.HIGHEST_PROTOCOL)

        print(f'{quantifier_name} {sample_set} MAE={mae:.4f}')
        print(f'{quantifier_name} {sample_set} MRAE={mrae:.4f}')
|
|
|
|
|
|
|
|
# Final summary: one tab-separated row (model, MAE, MRAE) per quantifier.
# fix: the original hard-coded sample_set='validation', but the evaluation
# loop above stores results only under the sets it actually ran (e.g. 'dev'),
# so that lookup raised KeyError; iterate the recorded sample sets instead.
for model in scores:
    for sample_set in scores[model]:
        model_scores = scores[model][sample_set]
        print(f'{model}\t{model_scores["mae"]:.4f}\t{model_scores["mrae"]:.4f}')
|
|
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
test:
|
|
|
|
CC 0.1859 1.5406
|
|
|
|
ACC 0.0453 0.2840
|
|
|
|
PCC 0.1793 1.7187
|
|
|
|
PACC 0.0287 0.1494
|
|
|
|
EMQ 0.0225 0.1020
|
|
|
|
HDy 0.0631 0.2307
|
|
|
|
|
|
|
|
validation
|
|
|
|
CC 0.1862 1.9587
|
|
|
|
ACC 0.0394 0.2669
|
|
|
|
PCC 0.1789 2.1383
|
|
|
|
PACC 0.0354 0.1587
|
|
|
|
EMQ 0.0224 0.0960
|
|
|
|
HDy 0.0467 0.2121
|
|
|
|
"""
|
|
|
|
|
|
|
|
|