2021-10-13 20:36:53 +02:00
|
|
|
import pickle
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
from sklearn.linear_model import LogisticRegression
|
|
|
|
from tqdm import tqdm
|
|
|
|
|
|
|
|
import quapy as qp
|
|
|
|
from quapy.data import LabelledCollection
|
|
|
|
from quapy.method.aggregative import *
|
2021-10-21 17:14:40 +02:00
|
|
|
from data import load_binary_vectors
|
2021-10-13 20:36:53 +02:00
|
|
|
import os
|
|
|
|
|
2021-10-21 17:14:40 +02:00
|
|
|
path_binary_vector = './data/T1A'
|
|
|
|
result_path = os.path.join('results', 'T1A') # binary - vector
|
2021-10-13 20:36:53 +02:00
|
|
|
os.makedirs(result_path, exist_ok=True)
|
|
|
|
|
2021-10-21 17:14:40 +02:00
|
|
|
train_file = os.path.join(path_binary_vector, 'public', 'training_vectors.txt')
|
2021-10-13 20:36:53 +02:00
|
|
|
|
2021-10-21 17:14:40 +02:00
|
|
|
train = LabelledCollection.load(train_file, load_binary_vectors)
|
2021-10-13 20:36:53 +02:00
|
|
|
|
|
|
|
print(train.classes_)
|
|
|
|
print(len(train))
|
|
|
|
print(train.prevalence())
|
|
|
|
|
|
|
|
tfidf = TfidfVectorizer(min_df=5)
|
|
|
|
train.instances = tfidf.fit_transform(train.instances)
|
|
|
|
|
|
|
|
scores = {}
|
|
|
|
for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]:
|
|
|
|
classifier = CalibratedClassifierCV(LogisticRegression())
|
|
|
|
model = quantifier(classifier).fit(train)
|
|
|
|
|
|
|
|
quantifier_name = model.__class__.__name__
|
|
|
|
scores[quantifier_name]={}
|
|
|
|
for sample_set, sample_size in [('validation', 1000)]:#, ('test', 5000)]:
|
|
|
|
ae_errors, rae_errors = [], []
|
|
|
|
for i in tqdm(range(sample_size), total=sample_size, desc=f'testing {quantifier_name} in {sample_set}'):
|
2021-10-21 17:14:40 +02:00
|
|
|
test_file = os.path.join(path_binary_vector, 'documents', f'{sample_set}_{i}.txt')
|
2021-10-13 20:36:53 +02:00
|
|
|
test = LabelledCollection.load(test_file, load_binary_raw_document, classes=train.classes_)
|
|
|
|
test.instances = tfidf.transform(test.instances)
|
|
|
|
qp.environ['SAMPLE_SIZE'] = len(test)
|
|
|
|
prev_estim = model.quantify(test.instances)
|
|
|
|
prev_true = test.prevalence()
|
|
|
|
ae_errors.append(qp.error.mae(prev_true, prev_estim))
|
|
|
|
rae_errors.append(qp.error.mrae(prev_true, prev_estim))
|
|
|
|
|
|
|
|
ae_errors = np.asarray(ae_errors)
|
|
|
|
rae_errors = np.asarray(rae_errors)
|
|
|
|
|
|
|
|
mae = ae_errors.mean()
|
|
|
|
mrae = rae_errors.mean()
|
|
|
|
scores[quantifier_name][sample_set] = {'mae': mae, 'mrae': mrae}
|
|
|
|
pickle.dump(ae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.ae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL)
|
|
|
|
pickle.dump(rae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.rae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL)
|
|
|
|
print(f'{quantifier_name} {sample_set} MAE={mae:.4f}')
|
|
|
|
print(f'{quantifier_name} {sample_set} MRAE={mrae:.4f}')
|
|
|
|
|
|
|
|
for model in scores:
|
|
|
|
for sample_set in ['validation']:#, 'test']:
|
|
|
|
print(f'{model}\t{scores[model][sample_set]["mae"]:.4f}\t{scores[model][sample_set]["mrae"]:.4f}')
|
|
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
test:
|
|
|
|
CC 0.1859 1.5406
|
|
|
|
ACC 0.0453 0.2840
|
|
|
|
PCC 0.1793 1.7187
|
|
|
|
PACC 0.0287 0.1494
|
|
|
|
EMQ 0.0225 0.1020
|
|
|
|
HDy 0.0631 0.2307
|
|
|
|
|
|
|
|
validation
|
|
|
|
CC 0.1862 1.9587
|
|
|
|
ACC 0.0394 0.2669
|
|
|
|
PCC 0.1789 2.1383
|
|
|
|
PACC 0.0354 0.1587
|
|
|
|
EMQ 0.0224 0.0960
|
|
|
|
HDy 0.0467 0.2121
|
|
|
|
"""
|
|
|
|
|
|
|
|
|