QuaPy/examples/11.comparing_HDy_HDx.py

77 lines
2.6 KiB
Python

from sklearn.linear_model import LogisticRegression
from time import time
import pandas as pd
from tqdm import tqdm
import quapy as qp
from quapy.protocol import APP
from quapy.method.aggregative import HDy
from quapy.method.non_aggregative import DMx
"""
This example is meant to experimentally compare HDy and HDx.
The implementations of these methods adhere to the original design of the methods; in particular, this means that
the number of bins is not a hyperparameter, but is something that the method explores internally (returning the
median of the estimates as the final prevalence prediction), and the prevalence is not searched through any
numerical optimization procedure, but simply by a linear search between 0 and 1 in steps of 0.01.
See <https://www.sciencedirect.com/science/article/pii/S0020025512004069> for further details.
"""
# Global QuaPy setting. NOTE(review): presumably the size of the samples that
# APP(test) draws during evaluation -- confirm against the QuaPy documentation.
qp.environ['SAMPLE_SIZE'] = 100

# One row per (method, dataset) pair: two error scores plus the wall-clock
# training and evaluation times, in seconds.
df = pd.DataFrame(columns=['method', 'dataset', 'MAE', 'MRAE', 'tr-time', 'te-time'])

# These datasets tend to produce either too good or too bad results, so they
# are excluded from the comparison.
_SKIPPED_DATASETS = frozenset({'acute.a', 'acute.b', 'balance.2', 'iris.1'})


def _fit_and_evaluate(method_name, make_quantifier, dataset_name, train, test):
    """Train one quantifier, evaluate it, and return its result row for ``df``.

    Args:
        method_name: Label stored in the 'method' column.
        make_quantifier: Zero-argument callable returning a new, unfitted
            quantifier. It is invoked inside the timed region, so the
            reported training time covers construction plus fitting.
        dataset_name: Label stored in the 'dataset' column.
        train: Training data passed to the quantifier's ``fit``.
        test: Held-out data, wrapped in ``APP`` to drive the evaluation.

    Returns:
        A list ``[method, dataset, MAE, MRAE, tr-time, te-time]`` whose order
        matches the columns of ``df``.
    """
    tinit = time()
    quantifier = make_quantifier().fit(train)
    train_time = time() - tinit

    tinit = time()
    # The report is averaged column-wise; only 'mae' and 'mrae' are read back.
    report = qp.evaluation.evaluation_report(
        quantifier, APP(test), error_metrics=['mae', 'mrae']
    ).mean()
    test_time = time() - tinit

    return [method_name, dataset_name, report['mae'], report['mrae'], train_time, test_time]


for dataset_name in tqdm(qp.datasets.UCI_BINARY_DATASETS, total=len(qp.datasets.UCI_BINARY_DATASETS)):
    if dataset_name in _SKIPPED_DATASETS:
        continue

    collection = qp.datasets.fetch_UCIBinaryLabelledCollection(dataset_name, verbose=False)
    train, test = collection.split_stratified()

    # df.loc[len(df)] appends the row at the next free integer position.

    # HDy (imported from quapy.method.aggregative; wraps a LogisticRegression)
    df.loc[len(df)] = _fit_and_evaluate(
        'HDy', lambda: HDy(LogisticRegression()), dataset_name, train, test
    )

    # HDx (built through DMx from quapy.method.non_aggregative; no classifier)
    df.loc[len(df)] = _fit_and_evaluate(
        'HDx', lambda: DMx.HDx(n_jobs=-1), dataset_name, train, test
    )
# evaluation reports: each section prints a banner, a dataset-by-method pivot
# of the selected result columns, and the column-wise averages of that pivot.
_REPORT_SECTIONS = (
    ('Comparison in terms of performance', ['MAE', 'MRAE']),
    ('Comparison in terms of efficiency', ['tr-time', 'te-time']),
)

for section_title, value_columns in _REPORT_SECTIONS:
    banner = '=' * 80
    print('\n' * 3)
    print(banner)
    print(section_title)
    print(banner)

    # Rows are datasets; columns are grouped by value and then by method.
    pv = df.pivot_table(index='dataset', columns='method', values=value_columns)
    print(pv)
    print('\nAveraged values:')
    print(pv.mean())