# QuaPy/distribution_matching/commons.py

import numpy as np
import pandas as pd
from distribution_matching.method_kdey import KDEy
from distribution_matching.method_kdey_closed import KDEyclosed
from distribution_matching.method_kdey_closed_efficient_correct import KDEyclosed_efficient_corr
from quapy.method.aggregative import EMQ, CC, PCC, DistributionMatching, PACC, HDy, OneVsAllAggregative, ACC
from distribution_matching.method_dirichlety import DIRy
from sklearn.linear_model import LogisticRegression
from distribution_matching.method_kdey_closed_efficient import KDEyclosed_efficient

# Complete set of methods evaluated in the paper (all of them are reported in the appendix).
METHODS = [
    'ACC',
    'PACC',
    'HDy-OvA',
    'DM-T',
    'DM-HD',
    'KDEy-HD',
    'DM-CS',
    'KDEy-CS',
    'DIR',
    'EMQ',
    'EMQ-BCTS',
    'KDEy-ML',
]

# To run only the methods shown in the body of the paper, uncomment the list below
# (the remaining methods are not comparable in performance).
#METHODS  = ['PACC',  'DM-T', 'DM-HD', 'KDEy-HD', 'DM-CS', 'KDEy-CS',  'EMQ', 'KDEy-ML']

# Same names as METHODS with every '-OvA' occurrence removed
# (presumably the names used in binary experiments -- confirm against the callers).
BIN_METHODS = [name.replace('-OvA', '') for name in METHODS]


# Grid for the logistic-regression hyperparameters: seven log-spaced values of C
# between 1e-3 and 1e3, with and without balanced class weights.
# NOTE(review): the 'classifier__' prefix is presumably how the quantifier exposes
# the parameters of its underlying classifier -- confirm against the quantifier API.
hyper_LR = dict(
    classifier__C=np.logspace(start=-3, stop=3, num=7),
    classifier__class_weight=['balanced', None],
)

# Grid for the KDE bandwidth: 20 evenly spaced values from 0.01 to 0.2.
hyper_kde = dict(
    bandwidth=np.linspace(start=0.01, stop=0.2, num=20),
)

# Candidate numbers of bins: every integer in 2..10, every even integer in 12..32, then 64.
nbins_range = [*range(2, 11), *range(12, 33, 2), 64]

def new_method(method, **lr_kwargs):
    """
    Instantiate the quantifier identified by ``method`` together with its hyperparameter grid.

    A fresh ``LogisticRegression(**lr_kwargs)`` is created and handed to the quantifier as
    its classifier. The grid is always a newly built dict: the method-specific entries (if
    any) come first, followed by the entries of ``hyper_LR``. Because a new dict is returned
    on every call, a caller may add or remove keys without altering the module-level grids
    (the copy is shallow: the value lists/arrays themselves are still shared).

    :param method: name of the method; one of 'CC', 'PCC', 'ACC', 'PACC', 'KDEy-HD',
        'KDEy-CS', 'KDEy-ML', 'DIR', 'EMQ', 'EMQ-BCTS', 'HDy', 'HDy-OvA', 'DM-T', 'DM-HD',
        'DM-CS'
    :param lr_kwargs: keyword arguments forwarded verbatim to ``LogisticRegression``
    :return: a tuple ``(param_grid, quantifier)``
    :raises NotImplementedError: if ``method`` is none of the names listed above
    """
    lr = LogisticRegression(**lr_kwargs)

    # Grid entries that belong to the quantifier itself (most methods have none) and the
    # prefix to prepend to the keys of the classifier grid.
    method_params = {}
    lr_prefix = ''

    if method == 'CC':
        quantifier = CC(lr)
    elif method == 'PCC':
        quantifier = PCC(lr)
    elif method == 'ACC':
        quantifier = ACC(lr)
    elif method == 'PACC':
        quantifier = PACC(lr)
    elif method == 'KDEy-HD':
        method_params = hyper_kde
        quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=10000, val_split=10)
    elif method == 'KDEy-CS':
        method_params = hyper_kde
        quantifier = KDEyclosed_efficient_corr(lr, val_split=10)
    elif method == 'KDEy-ML':
        method_params = hyper_kde
        quantifier = KDEy(lr, target='max_likelihood', val_split=10)
    elif method == 'DIR':
        quantifier = DIRy(lr)
    elif method == 'EMQ':
        quantifier = EMQ(lr)
    elif method == 'EMQ-BCTS':
        # same quantifier as 'EMQ'; the variant is selected entirely through the grid
        method_params = {'exact_train_prev': [False], 'recalib': ['bcts']}
        quantifier = EMQ(lr)
    elif method == 'HDy':
        quantifier = HDy(lr)
    elif method == 'HDy-OvA':
        # the classifier sits one level deeper (inside the wrapped binary quantifier),
        # hence the extra prefix on the classifier hyperparameters
        lr_prefix = 'binary_quantifier__'
        quantifier = OneVsAllAggregative(HDy(lr))
    elif method in ('DM-T', 'DM-HD', 'DM-CS'):
        # the three DM variants differ only in the divergence they explore
        divergence = {'DM-T': 'topsoe', 'DM-HD': 'HD', 'DM-CS': 'CS'}[method]
        method_params = {
            'nbins': nbins_range,
            'val_split': [10],
            'divergence': [divergence]
        }
        quantifier = DistributionMatching(lr)
    else:
        raise NotImplementedError('unknown method', method)

    # Always build a brand-new dict (method-specific keys first, classifier keys last) so
    # that the shared module-level grids are never handed out by reference.
    param_grid = dict(method_params)
    param_grid.update({lr_prefix + key: val for key, val in hyper_LR.items()})

    return param_grid, quantifier


def show_results(result_path):
    """
    Print a dataset-by-method summary of the errors stored in a results file.

    The file ``result_path + '.csv'`` is read as tab-separated values and pivoted so that
    rows are the values of the "Dataset" column and columns are the values of the "Method"
    column, for the "MAE" and "MRAE" metrics (cells are combined with ``pivot_table``'s
    default aggregation). The table is printed in full, without row or column truncation.
    The truncation limits are lifted only while printing, so the pandas display options
    of the calling process are left as they were.

    :param result_path: path of the results file without the '.csv' extension
    :return: None
    """
    df = pd.read_csv(result_path + '.csv', sep='\t')
    pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])

    # option_context restores the previous option values on exit (even if printing fails),
    # unlike set_option, which would change them for the rest of the process.
    with pd.option_context('display.max_columns', None, 'display.max_rows', None):
        print(pv)
cleaning and refactoring, also trying to repair the montecarlo approximation 2023-09-28 17:47:39 +02:00			`import numpy as np`
			`import pandas as pd`
			`from distribution_matching.method_kdey import KDEy`
KDE with closed form working 2023-10-13 17:34:26 +02:00			`from distribution_matching.method_kdey_closed import KDEyclosed`
			`from distribution_matching.method_kdey_closed_efficient_correct import KDEyclosed_efficient_corr`
cleaning and refactoring, also trying to repair the montecarlo approximation 2023-09-28 17:47:39 +02:00			`from quapy.method.aggregative import EMQ, CC, PCC, DistributionMatching, PACC, HDy, OneVsAllAggregative, ACC`
			`from distribution_matching.method_dirichlety import DIRy`
			`from sklearn.linear_model import LogisticRegression`
cleaning project 2023-12-06 16:55:06 +01:00			`from distribution_matching.method_kdey_closed_efficient import KDEyclosed_efficient`

			`# the full list of methods tested in the paper (reported in the appendix)`
			`METHODS = ['ACC', 'PACC', 'HDy-OvA', 'DM-T', 'DM-HD', 'KDEy-HD', 'DM-CS', 'KDEy-CS', 'DIR', 'EMQ', 'EMQ-BCTS', 'KDEy-ML']`

			`# uncomment this other list for the methods shown in the body of the paper (the other methods are not comparable in performance)`
			`#METHODS = ['PACC', 'DM-T', 'DM-HD', 'KDEy-HD', 'DM-CS', 'KDEy-CS', 'EMQ', 'KDEy-ML']`
cleaning and refactoring, also trying to repair the montecarlo approximation 2023-09-28 17:47:39 +02:00
understanding montecarlo sampling 2023-10-02 17:50:12 +02:00			`BIN_METHODS = [x.replace('-OvA', '') for x in METHODS]`
cleaning and refactoring, also trying to repair the montecarlo approximation 2023-09-28 17:47:39 +02:00

			`hyper_LR = {`
			`'classifier__C': np.logspace(-3,3,7),`
			`'classifier__class_weight': ['balanced', None]`
KDE with closed form working 2023-10-13 17:34:26 +02:00			`}`
cleaning and refactoring, also trying to repair the montecarlo approximation 2023-09-28 17:47:39 +02:00
cleaning project 2023-12-06 16:55:06 +01:00			`hyper_kde = {`
			`'bandwidth': np.linspace(0.01, 0.2, 20)`
			`}`

			`nbins_range = [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 64]`

cleaning and refactoring, also trying to repair the montecarlo approximation 2023-09-28 17:47:39 +02:00			`def new_method(method, **lr_kwargs):`

			`lr = LogisticRegression(**lr_kwargs)`

			`if method == 'CC':`
			`param_grid = hyper_LR`
			`quantifier = CC(lr)`
			`elif method == 'PCC':`
			`param_grid = hyper_LR`
			`quantifier = PCC(lr)`
			`elif method == 'ACC':`
			`param_grid = hyper_LR`
			`quantifier = ACC(lr)`
			`elif method == 'PACC':`
			`param_grid = hyper_LR`
			`quantifier = PACC(lr)`
cleaning project 2023-12-06 16:55:06 +01:00			`elif method in ['KDEy-HD']:`
			`param_grid = {hyper_kde, hyper_LR}`
			`quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=10000, val_split=10)`
			`elif method == 'KDEy-CS':`
			`param_grid = {hyper_kde, hyper_LR}`
			`quantifier = KDEyclosed_efficient_corr(lr, val_split=10)`
cleaning and refactoring, also trying to repair the montecarlo approximation 2023-09-28 17:47:39 +02:00			`elif method == 'KDEy-ML':`
cleaning project 2023-12-06 16:55:06 +01:00			`param_grid = {hyper_kde, hyper_LR}`
cleaning and refactoring, also trying to repair the montecarlo approximation 2023-09-28 17:47:39 +02:00			`quantifier = KDEy(lr, target='max_likelihood', val_split=10)`
			`elif method == 'DIR':`
			`param_grid = hyper_LR`
			`quantifier = DIRy(lr)`
			`elif method == 'EMQ':`
			`param_grid = hyper_LR`
			`quantifier = EMQ(lr)`
cleaning project 2023-12-06 16:55:06 +01:00			`elif method == 'EMQ-BCTS':`
running final experiments, one dedicated DM for each divergence with fine-grained exploration of nbins 2023-10-23 11:32:35 +02:00			`method_params = {'exact_train_prev': [False], 'recalib': ['bcts']}`
			`param_grid = {method_params, hyper_LR}`
			`quantifier = EMQ(lr)`
switching to devel 2023-10-30 09:41:52 +01:00			`elif method == 'HDy':`
			`param_grid = hyper_LR`
			`quantifier = HDy(lr)`
cleaning and refactoring, also trying to repair the montecarlo approximation 2023-09-28 17:47:39 +02:00			`elif method == 'HDy-OvA':`
			`param_grid = {'binary_quantifier__' + key: val for key, val in hyper_LR.items()}`
			`quantifier = OneVsAllAggregative(HDy(lr))`
running final experiments, one dedicated DM for each divergence with fine-grained exploration of nbins 2023-10-23 11:32:35 +02:00			`elif method == 'DM-T':`
			`method_params = {`
cleaning project 2023-12-06 16:55:06 +01:00			`'nbins': nbins_range,`
running final experiments, one dedicated DM for each divergence with fine-grained exploration of nbins 2023-10-23 11:32:35 +02:00			`'val_split': [10],`
			`'divergence': ['topsoe']`
			`}`
			`param_grid = {method_params, hyper_LR}`
			`quantifier = DistributionMatching(lr)`
			`elif method == 'DM-HD':`
			`method_params = {`
cleaning project 2023-12-06 16:55:06 +01:00			`'nbins': nbins_range,`
running final experiments, one dedicated DM for each divergence with fine-grained exploration of nbins 2023-10-23 11:32:35 +02:00			`'val_split': [10],`
			`'divergence': ['HD']`
			`}`
			`param_grid = {method_params, hyper_LR}`
			`quantifier = DistributionMatching(lr)`
			`elif method == 'DM-CS':`
			`method_params = {`
cleaning project 2023-12-06 16:55:06 +01:00			`'nbins': nbins_range,`
running final experiments, one dedicated DM for each divergence with fine-grained exploration of nbins 2023-10-23 11:32:35 +02:00			`'val_split': [10],`
			`'divergence': ['CS']`
			`}`
			`param_grid = {method_params, hyper_LR}`
			`quantifier = DistributionMatching(lr)`
cleaning and refactoring, also trying to repair the montecarlo approximation 2023-09-28 17:47:39 +02:00			`else:`
			`raise NotImplementedError('unknown method', method)`

			`return param_grid, quantifier`


			`def show_results(result_path):`
			`df = pd.read_csv(result_path+'.csv', sep='\t')`

			`pd.set_option('display.max_columns', None)`
			`pd.set_option('display.max_rows', None)`
			`pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])`
			`print(pv)`