update trailing char

Lorenzo Volpi 2023-11-08 17:26:44 +01:00
parent dd581f7937
commit f346005515
47 changed files with 31354 additions and 31354 deletions

.gitignore

@@ -1,20 +1,20 @@
*.code-workspace
quavenv/*
*.pdf
__pycache__/*
baselines/__pycache__/*
baselines/densratio/__pycache__/*
quacc/__pycache__/*
quacc/evaluation/__pycache__/*
quacc/method/__pycache__/*
tests/__pycache__/*
*.coverage
.coverage
scp_sync.py
out/*
output/*
!output/main/

.vscode/launch.json

@@ -1,25 +1,25 @@
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "main",
            "type": "python",
            "request": "launch",
            "program": "C:\\Users\\Lorenzo Volpi\\source\\tesi\\quacc\\main.py",
            "console": "integratedTerminal",
            "justMyCode": true
        },
        {
            "name": "main_test",
            "type": "python",
            "request": "launch",
            "program": "C:\\Users\\Lorenzo Volpi\\source\\tesi\\quacc\\main_test.py",
            "console": "integratedTerminal",
            "justMyCode": false
        },
    ]
}


@@ -1,54 +1,54 @@
{
  "todo": [
    {
      "assignedTo": {
        "name": "Lorenzo Volpi"
      },
      "creation_time": "2023-10-28T14:33:36.069Z",
      "id": "2",
      "references": [],
      "title": "Create avg plot with training prevalence on the x axis, averaged over test prevalence"
    },
    {
      "assignedTo": {
        "name": "Lorenzo Volpi"
      },
      "creation_time": "2023-10-28T14:32:37.610Z",
      "id": "1",
      "references": [],
      "title": "Test on imdb"
    }
  ],
  "in-progress": [
    {
      "assignedTo": {
        "name": "Lorenzo Volpi"
      },
      "creation_time": "2023-10-28T14:34:23.217Z",
      "id": "3",
      "references": [],
      "title": "Implement a task-specific grid search starting from GridSearchQ"
    },
    {
      "assignedTo": {
        "name": "Lorenzo Volpi"
      },
      "creation_time": "2023-10-28T14:34:46.226Z",
      "id": "4",
      "references": [],
      "title": "Add estimators based on PACC (quantifier)"
    }
  ],
  "testing": [],
  "done": [
    {
      "assignedTo": {
        "name": "Lorenzo Volpi"
      },
      "creation_time": "2023-10-28T14:35:12.683Z",
      "id": "5",
      "references": [],
      "title": "Rework report data representation"
    }
  ]
}

TODO.html

@@ -1,143 +1,143 @@
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title></title>
    <style>
        /* From extension vscode.github */
        /*---------------------------------------------------------------------------------------------
         *  Copyright (c) Microsoft Corporation. All rights reserved.
         *  Licensed under the MIT License. See License.txt in the project root for license information.
         *--------------------------------------------------------------------------------------------*/
        .vscode-dark img[src$=\#gh-light-mode-only],
        .vscode-light img[src$=\#gh-dark-mode-only] {
            display: none;
        }
    </style>
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/Microsoft/vscode/extensions/markdown-language-features/media/markdown.css">
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/Microsoft/vscode/extensions/markdown-language-features/media/highlight.css">
    <style>
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe WPC', 'Segoe UI', system-ui, 'Ubuntu', 'Droid Sans', sans-serif;
            font-size: 14px;
            line-height: 1.6;
        }
    </style>
    <style>
        .task-list-item {
            list-style-type: none;
        }
        .task-list-item-checkbox {
            margin-left: -20px;
            vertical-align: middle;
            pointer-events: none;
        }
    </style>
</head>
<body class="vscode-body vscode-light">
    <ul class="contains-task-list">
        <li class="task-list-item enabled">
            <p><input class="task-list-item-checkbox" checked="" type="checkbox"> add table averages</p>
        </li>
        <li class="task-list-item enabled">
            <p><input class="task-list-item-checkbox" checked="" type="checkbox"> plots; 3 types (notes + email + garg)</p>
        </li>
        <li class="task-list-item enabled">
            <p><input class="task-list-item-checkbox" checked="" type="checkbox"> fix the kfcv baseline</p>
        </li>
        <li class="task-list-item enabled">
            <p><input class="task-list-item-checkbox" checked="" type="checkbox"> add a method using CC in addition to SLD</p>
        </li>
        <li class="task-list-item enabled">
            <p><input class="task-list-item-checkbox" checked="" type="checkbox"> take the most populous class of rcv1, remove negatives until reaching 50/50; then subsample with 9 training prevalences (from 0.1-0.9 to 0.9-0.1)</p>
        </li>
        <li class="task-list-item enabled">
            <p><input class="task-list-item-checkbox" checked="" type="checkbox"> vary the recalibration parameter in SLD</p>
        </li>
        <li class="task-list-item enabled">
            <p><input class="task-list-item-checkbox" checked="" type="checkbox"> fix the diagonal plot</p>
            <ul>
                <li>seaborn example gallery</li>
            </ul>
        </li>
        <li class="task-list-item enabled">
            <p><input class="task-list-item-checkbox" checked="" type="checkbox"> recalib variants: bcts, SLD (try exact_train_prev=False)</p>
        </li>
        <li class="task-list-item enabled">
            <p><input class="task-list-item-checkbox" checked="" type="checkbox"> check what validation size garg uses</p>
        </li>
        <li class="task-list-item enabled">
            <p><input class="task-list-item-checkbox" checked="" type="checkbox"> for model selection, test the classifier's C parameter, explored over np.logspace(-3, 3, 7) or np.logspace(-4, 4, 9), and the class_weight parameter, explored over None or &quot;balanced&quot;; use qp.model_selection.GridSearchQ with mae as the error and UPP as the protocol</p>
            <ul>
                <li>qp.train_test_split to obtain v_train and v_val</li>
                <li>GridSearchQ(
                    model: BaseQuantifier,
                    param_grid: {
                        'classifier__C': np.logspace(-3,3,7),
                        'classifier__class_weight': [None, 'balanced'],
                        'recalib': [None, 'bcts']
                    },
                    protocol: UPP(V_val, repeats=1000),
                    error = qp.error.mae,
                    refit=True,
                    timeout=-1,
                    n_jobs=-2,
                    verbose=True).fit(V_tr)</li>
            </ul>
        </li>
        <li class="task-list-item enabled">
            <p><input class="task-list-item-checkbox" checked="" type="checkbox"> collective plot, with shift on the x axis, taking all training sets into account and averaging over the 9 cases (each line is a method), both non-optimized and optimized results</p>
        </li>
        <li class="task-list-item enabled">
            <p><input class="task-list-item-checkbox" checked="" type="checkbox"> save the best score obtained from each GridSearchQ run</p>
            <ul>
                <li>in the binary case, average the two best scores</li>
            </ul>
        </li>
        <li class="task-list-item enabled">
            <p><input class="task-list-item-checkbox" checked="" type="checkbox"> import baselines</p>
        </li>
        <li class="task-list-item enabled">
            <p><input class="task-list-item-checkbox" type="checkbox"> import mandoline</p>
            <ul>
                <li>mandoline can be imported, but it requires an a priori slicing of the features that must be implemented ad hoc</li>
            </ul>
        </li>
        <li class="task-list-item enabled">
            <p><input class="task-list-item-checkbox" type="checkbox"> fix the old iw baselines</p>
            <ul>
                <li>they cannot be fixed because they depend on numpy</li>
            </ul>
        </li>
        <li class="task-list-item enabled">
            <p><input class="task-list-item-checkbox" checked="" type="checkbox"> avg plot with train prevalence on the x axis, averaged over test prevalence</p>
        </li>
        <li class="task-list-item enabled">
            <p><input class="task-list-item-checkbox" checked="" type="checkbox"> implement a task-specific grid search starting from GridSearchQ</p>
        </li>
        <li class="task-list-item enabled">
            <p><input class="task-list-item-checkbox" checked="" type="checkbox"> try PACC as quantifier</p>
        </li>
        <li class="task-list-item enabled">
            <p><input class="task-list-item-checkbox" checked="" type="checkbox"> add labels to the shift plot</p>
        </li>
        <li class="task-list-item enabled">
            <p><input class="task-list-item-checkbox" checked="" type="checkbox"> fix exact_train in quapy</p>
        </li>
        <li class="task-list-item enabled">
            <p><input class="task-list-item-checkbox" checked="" type="checkbox"> also test on imdb</p>
        </li>
        <li class="task-list-item enabled">
            <p><input class="task-list-item-checkbox" type="checkbox"> review the new baselines</p>
        </li>
    </ul>
</body>
</html>

TODO.md

@@ -1,44 +1,44 @@
- [x] add table averages
- [x] plots; 3 types (notes + email + garg)
- [x] fix the kfcv baseline
- [x] add a method using CC in addition to SLD
- [x] take the most populous class of rcv1, remove negatives until reaching 50/50; then subsample with 9 training prevalences (from 0.1-0.9 to 0.9-0.1)
- [x] vary the recalibration parameter in SLD
- [x] fix the diagonal plot
  - seaborn example gallery
- [x] recalib variants: bcts, SLD (try exact_train_prev=False)
- [x] check what validation size garg uses
- [x] for model selection, test the classifier's C parameter, explored over np.logspace(-3, 3, 7) or np.logspace(-4, 4, 9), and the class_weight parameter, explored over None or "balanced"; use qp.model_selection.GridSearchQ with mae as the error and UPP as the protocol (a hedged sketch follows this list)
  - qp.train_test_split to obtain v_train and v_val
  - GridSearchQ(
        model: BaseQuantifier,
        param_grid: {
            'classifier__C': np.logspace(-3,3,7),
            'classifier__class_weight': [None, 'balanced'],
            'recalib': [None, 'bcts']
        },
        protocol: UPP(V_val, repeats=1000),
        error = qp.error.mae,
        refit=True,
        timeout=-1,
        n_jobs=-2,
        verbose=True).fit(V_tr)
- [x] collective plot, with shift on the x axis, taking all training sets into account and averaging over the 9 cases (each line is a method), both non-optimized and optimized results
- [x] save the best score obtained from each GridSearchQ run
  - in the binary case, average the two best scores
- [x] import baselines
- [ ] import mandoline
  - mandoline can be imported, but it requires an a priori slicing of the features that must be implemented ad hoc
- [ ] fix the old iw baselines
  - they cannot be fixed because they depend on numpy
- [x] avg plot with train prevalence on the x axis, averaged over test prevalence
- [x] implement a task-specific grid search starting from GridSearchQ
- [x] try PACC as quantifier
- [x] add labels to the shift plot
- [x] fix exact_train in quapy
- [x] also test on imdb
- [ ] review the new baselines
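
A hedged sketch of the model-selection item above (assumptions: quapy >= 0.1.7, where EMQ is the SLD implementation and exposes the `recalib` hyperparameter; the imdb dataset and the 60/40 validation split are illustrative choices, not prescribed by the list):

# Sketch only: maps the model-selection item onto quapy's API.
import numpy as np
import quapy as qp
from quapy.method.aggregative import EMQ
from quapy.protocol import UPP
from sklearn.linear_model import LogisticRegression

qp.environ["SAMPLE_SIZE"] = 100  # assumed sample size for the UPP protocol

# Illustrative data/split; any LabelledCollection pair (V_tr, V_val) works.
train = qp.datasets.fetch_reviews("imdb", tfidf=True).training
V_tr, V_val = train.split_stratified(train_prop=0.6)

search = qp.model_selection.GridSearchQ(
    model=EMQ(LogisticRegression()),
    param_grid={
        "classifier__C": np.logspace(-3, 3, 7),
        "classifier__class_weight": [None, "balanced"],
        "recalib": [None, "bcts"],
    },
    protocol=UPP(V_val, repeats=1000),
    error=qp.error.mae,  # mae as the model-selection error
    refit=True,
    timeout=-1,
    n_jobs=-2,
    verbose=True,
).fit(V_tr)
print(search.best_params_, search.best_score_)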


@@ -1,44 +1,44 @@
import numpy as np
from sklearn.metrics import f1_score


def get_entropy(probs):
    # Negative entropy (sum of p*log p): larger values mean more confident predictions.
    return np.sum(np.multiply(probs, np.log(probs + 1e-20)), axis=1)


def get_max_conf(probs):
    return np.max(probs, axis=-1)


def find_ATC_threshold(scores, labels):
    # Find the score threshold that balances false positives and false negatives.
    sorted_idx = np.argsort(scores)
    sorted_scores = scores[sorted_idx]
    sorted_labels = labels[sorted_idx]
    fp = np.sum(labels == 0)
    fn = 0.0
    min_fp_fn = np.abs(fp - fn)
    thres = 0.0
    for i in range(len(labels)):
        if sorted_labels[i] == 0:
            fp -= 1
        else:
            fn += 1
        if np.abs(fp - fn) < min_fp_fn:
            min_fp_fn = np.abs(fp - fn)
            thres = sorted_scores[i]
    return min_fp_fn, thres


def get_ATC_acc(thres, scores):
    # Estimated accuracy: fraction of scores above the threshold.
    return np.mean(scores >= thres)


def get_ATC_f1(thres, scores, probs):
    preds = np.argmax(probs, axis=-1)
    estim_y = np.abs(1 - (scores >= thres) ^ preds)
    return f1_score(estim_y, preds)
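
A hedged usage sketch for the ATC helpers above (all inputs are illustrative stand-ins; in practice val_probs/y_val come from a labelled validation set and test_probs from the unlabelled test sample):

# Sketch only: estimate test accuracy with ATC using max-confidence scores.
import numpy as np

rng = np.random.default_rng(0)
val_probs = rng.dirichlet([1, 1], size=500)    # stand-in validation posteriors
y_val = rng.integers(0, 2, size=500)           # stand-in validation labels
test_probs = rng.dirichlet([2, 1], size=500)   # stand-in (shifted) test posteriors

val_preds = np.argmax(val_probs, axis=-1)
correct = (val_preds == y_val).astype(int)     # 1 where the classifier is right
_, thres = find_ATC_threshold(get_max_conf(val_probs), correct)
estim_test_acc = get_ATC_acc(thres, get_max_conf(test_probs))
print(f"ATC-estimated test accuracy: {estim_test_acc:.3f}")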


@@ -1,277 +1,277 @@
""" """
Relative Unconstrained Least-Squares Fitting (RuLSIF): A Python Implementation Relative Unconstrained Least-Squares Fitting (RuLSIF): A Python Implementation
References: References:
'Change-point detection in time-series data by relative density-ratio estimation' 'Change-point detection in time-series data by relative density-ratio estimation'
Song Liu, Makoto Yamada, Nigel Collier and Masashi Sugiyama, Song Liu, Makoto Yamada, Nigel Collier and Masashi Sugiyama,
Neural Networks 43 (2013) 72-83. Neural Networks 43 (2013) 72-83.
'A Least-squares Approach to Direct Importance Estimation' 'A Least-squares Approach to Direct Importance Estimation'
Takafumi Kanamori, Shohei Hido, and Masashi Sugiyama, Takafumi Kanamori, Shohei Hido, and Masashi Sugiyama,
Journal of Machine Learning Research 10 (2009) 1391-1445. Journal of Machine Learning Research 10 (2009) 1391-1445.
""" """
from warnings import warn from warnings import warn
from numpy import ( from numpy import (
array, array,
asarray, asarray,
asmatrix, asmatrix,
diag, diag,
diagflat, diagflat,
empty, empty,
exp, exp,
inf, inf,
log, log,
matrix, matrix,
multiply, multiply,
ones, ones,
power, power,
sum, sum,
) )
from numpy.linalg import solve from numpy.linalg import solve
from numpy.random import randint from numpy.random import randint
from .density_ratio import DensityRatio, KernelInfo from .density_ratio import DensityRatio, KernelInfo
from .helpers import guvectorize_compute, np_float, to_ndarray from .helpers import guvectorize_compute, np_float, to_ndarray
def RuLSIF(x, y, alpha, sigma_range, lambda_range, kernel_num=100, verbose=True): def RuLSIF(x, y, alpha, sigma_range, lambda_range, kernel_num=100, verbose=True):
""" """
Estimation of the alpha-Relative Density Ratio p(x)/p_alpha(x) by RuLSIF Estimation of the alpha-Relative Density Ratio p(x)/p_alpha(x) by RuLSIF
(Relative Unconstrained Least-Square Importance Fitting) (Relative Unconstrained Least-Square Importance Fitting)
p_alpha(x) = alpha * p(x) + (1 - alpha) * q(x) p_alpha(x) = alpha * p(x) + (1 - alpha) * q(x)
Arguments: Arguments:
x (numpy.matrix): Sample from p(x). x (numpy.matrix): Sample from p(x).
y (numpy.matrix): Sample from q(x). y (numpy.matrix): Sample from q(x).
alpha (float): Mixture parameter. alpha (float): Mixture parameter.
sigma_range (list<float>): Search range of Gaussian kernel bandwidth. sigma_range (list<float>): Search range of Gaussian kernel bandwidth.
lambda_range (list<float>): Search range of regularization parameter. lambda_range (list<float>): Search range of regularization parameter.
kernel_num (int): Number of kernels. (Default 100) kernel_num (int): Number of kernels. (Default 100)
verbose (bool): Indicator to print messages (Default True) verbose (bool): Indicator to print messages (Default True)
Returns: Returns:
densratio.DensityRatio object which has `compute_density_ratio()`. densratio.DensityRatio object which has `compute_density_ratio()`.
""" """
# Number of samples. # Number of samples.
nx = x.shape[0] nx = x.shape[0]
ny = y.shape[0] ny = y.shape[0]
# Number of kernel functions. # Number of kernel functions.
kernel_num = min(kernel_num, nx) kernel_num = min(kernel_num, nx)
# Randomly take a subset of x, to identify centers for the kernels. # Randomly take a subset of x, to identify centers for the kernels.
centers = x[randint(nx, size=kernel_num)] centers = x[randint(nx, size=kernel_num)]
if verbose: if verbose:
print("RuLSIF starting...") print("RuLSIF starting...")
if len(sigma_range) == 1 and len(lambda_range) == 1: if len(sigma_range) == 1 and len(lambda_range) == 1:
sigma = sigma_range[0] sigma = sigma_range[0]
lambda_ = lambda_range[0] lambda_ = lambda_range[0]
else: else:
if verbose: if verbose:
print("Searching for the optimal sigma and lambda...") print("Searching for the optimal sigma and lambda...")
# Grid-search cross-validation for optimal kernel and regularization parameters. # Grid-search cross-validation for optimal kernel and regularization parameters.
opt_params = search_sigma_and_lambda( opt_params = search_sigma_and_lambda(
x, y, alpha, centers, sigma_range, lambda_range, verbose x, y, alpha, centers, sigma_range, lambda_range, verbose
) )
sigma = opt_params["sigma"] sigma = opt_params["sigma"]
lambda_ = opt_params["lambda"] lambda_ = opt_params["lambda"]
if verbose: if verbose:
print( print(
"Found optimal sigma = {:.3f}, lambda = {:.3f}.".format(sigma, lambda_) "Found optimal sigma = {:.3f}, lambda = {:.3f}.".format(sigma, lambda_)
) )
if verbose: if verbose:
print("Optimizing theta...") print("Optimizing theta...")
phi_x = compute_kernel_Gaussian(x, centers, sigma) phi_x = compute_kernel_Gaussian(x, centers, sigma)
phi_y = compute_kernel_Gaussian(y, centers, sigma) phi_y = compute_kernel_Gaussian(y, centers, sigma)
H = alpha * (phi_x.T.dot(phi_x) / nx) + (1 - alpha) * (phi_y.T.dot(phi_y) / ny) H = alpha * (phi_x.T.dot(phi_x) / nx) + (1 - alpha) * (phi_y.T.dot(phi_y) / ny)
h = phi_x.mean(axis=0).T h = phi_x.mean(axis=0).T
theta = asarray(solve(H + diag(array(lambda_).repeat(kernel_num)), h)).ravel() theta = asarray(solve(H + diag(array(lambda_).repeat(kernel_num)), h)).ravel()
# No negative coefficients. # No negative coefficients.
theta[theta < 0] = 0 theta[theta < 0] = 0
# Compute the alpha-relative density ratio, at the given coordinates. # Compute the alpha-relative density ratio, at the given coordinates.
def alpha_density_ratio(coordinates): def alpha_density_ratio(coordinates):
# Evaluate the kernel at these coordinates, and take the dot-product with the weights. # Evaluate the kernel at these coordinates, and take the dot-product with the weights.
coordinates = to_ndarray(coordinates) coordinates = to_ndarray(coordinates)
phi_x = compute_kernel_Gaussian(coordinates, centers, sigma) phi_x = compute_kernel_Gaussian(coordinates, centers, sigma)
alpha_density_ratio = phi_x @ theta alpha_density_ratio = phi_x @ theta
return alpha_density_ratio return alpha_density_ratio
# Compute the approximate alpha-relative PE-divergence, given samples x and y from the respective distributions. # Compute the approximate alpha-relative PE-divergence, given samples x and y from the respective distributions.
def alpha_PE_divergence(x, y): def alpha_PE_divergence(x, y):
# This is Y, in Reference 1. # This is Y, in Reference 1.
x = to_ndarray(x) x = to_ndarray(x)
# Obtain alpha-relative density ratio at these points. # Obtain alpha-relative density ratio at these points.
g_x = alpha_density_ratio(x) g_x = alpha_density_ratio(x)
# This is Y', in Reference 1. # This is Y', in Reference 1.
y = to_ndarray(y) y = to_ndarray(y)
# Obtain alpha-relative density ratio at these points. # Obtain alpha-relative density ratio at these points.
g_y = alpha_density_ratio(y) g_y = alpha_density_ratio(y)
# Compute the alpha-relative PE-divergence as given in Reference 1. # Compute the alpha-relative PE-divergence as given in Reference 1.
n = x.shape[0] n = x.shape[0]
divergence = ( divergence = (
-alpha * (g_x @ g_x) / 2 - (1 - alpha) * (g_y @ g_y) / 2 + g_x.sum(axis=0) -alpha * (g_x @ g_x) / 2 - (1 - alpha) * (g_y @ g_y) / 2 + g_x.sum(axis=0)
) / n - 1.0 / 2 ) / n - 1.0 / 2
return divergence return divergence
# Compute the approximate alpha-relative KL-divergence, given samples x and y from the respective distributions. # Compute the approximate alpha-relative KL-divergence, given samples x and y from the respective distributions.
def alpha_KL_divergence(x, y): def alpha_KL_divergence(x, y):
# This is Y, in Reference 1. # This is Y, in Reference 1.
x = to_ndarray(x) x = to_ndarray(x)
# Obtain alpha-relative density ratio at these points. # Obtain alpha-relative density ratio at these points.
g_x = alpha_density_ratio(x) g_x = alpha_density_ratio(x)
# Compute the alpha-relative KL-divergence. # Compute the alpha-relative KL-divergence.
n = x.shape[0] n = x.shape[0]
divergence = log(g_x).sum(axis=0) / n divergence = log(g_x).sum(axis=0) / n
return divergence return divergence
alpha_PE = alpha_PE_divergence(x, y) alpha_PE = alpha_PE_divergence(x, y)
alpha_KL = alpha_KL_divergence(x, y) alpha_KL = alpha_KL_divergence(x, y)
if verbose: if verbose:
print("Approximate alpha-relative PE-divergence = {:03.2f}".format(alpha_PE)) print("Approximate alpha-relative PE-divergence = {:03.2f}".format(alpha_PE))
print("Approximate alpha-relative KL-divergence = {:03.2f}".format(alpha_KL)) print("Approximate alpha-relative KL-divergence = {:03.2f}".format(alpha_KL))
kernel_info = KernelInfo( kernel_info = KernelInfo(
kernel_type="Gaussian", kernel_num=kernel_num, sigma=sigma, centers=centers kernel_type="Gaussian", kernel_num=kernel_num, sigma=sigma, centers=centers
) )
result = DensityRatio( result = DensityRatio(
method="RuLSIF", method="RuLSIF",
alpha=alpha, alpha=alpha,
theta=theta, theta=theta,
lambda_=lambda_, lambda_=lambda_,
alpha_PE=alpha_PE, alpha_PE=alpha_PE,
alpha_KL=alpha_KL, alpha_KL=alpha_KL,
kernel_info=kernel_info, kernel_info=kernel_info,
compute_density_ratio=alpha_density_ratio, compute_density_ratio=alpha_density_ratio,
) )
if verbose: if verbose:
print("RuLSIF completed.") print("RuLSIF completed.")
return result return result
# Grid-search cross-validation for the optimal parameters sigma and lambda by leave-one-out cross-validation. See Reference 2. # Grid-search cross-validation for the optimal parameters sigma and lambda by leave-one-out cross-validation. See Reference 2.
def search_sigma_and_lambda(x, y, alpha, centers, sigma_range, lambda_range, verbose): def search_sigma_and_lambda(x, y, alpha, centers, sigma_range, lambda_range, verbose):
nx = x.shape[0] nx = x.shape[0]
ny = y.shape[0] ny = y.shape[0]
n_min = min(nx, ny) n_min = min(nx, ny)
kernel_num = centers.shape[0] kernel_num = centers.shape[0]
score_new = inf score_new = inf
sigma_new = 0 sigma_new = 0
lambda_new = 0 lambda_new = 0
for sigma in sigma_range: for sigma in sigma_range:
phi_x = compute_kernel_Gaussian(x, centers, sigma) # (nx, kernel_num) phi_x = compute_kernel_Gaussian(x, centers, sigma) # (nx, kernel_num)
phi_y = compute_kernel_Gaussian(y, centers, sigma) # (ny, kernel_num) phi_y = compute_kernel_Gaussian(y, centers, sigma) # (ny, kernel_num)
H = alpha * (phi_x.T @ phi_x / nx) + (1 - alpha) * ( H = alpha * (phi_x.T @ phi_x / nx) + (1 - alpha) * (
phi_y.T @ phi_y / ny phi_y.T @ phi_y / ny
) # (kernel_num, kernel_num) ) # (kernel_num, kernel_num)
h = phi_x.mean(axis=0).reshape(-1, 1) # (kernel_num, 1) h = phi_x.mean(axis=0).reshape(-1, 1) # (kernel_num, 1)
phi_x = phi_x[:n_min].T # (kernel_num, n_min) phi_x = phi_x[:n_min].T # (kernel_num, n_min)
phi_y = phi_y[:n_min].T # (kernel_num, n_min) phi_y = phi_y[:n_min].T # (kernel_num, n_min)
for lambda_ in lambda_range: for lambda_ in lambda_range:
B = H + diag( B = H + diag(
array(lambda_ * (ny - 1) / ny).repeat(kernel_num) array(lambda_ * (ny - 1) / ny).repeat(kernel_num)
) # (kernel_num, kernel_num) ) # (kernel_num, kernel_num)
B_inv_X = solve(B, phi_y) # (kernel_num, n_min) B_inv_X = solve(B, phi_y) # (kernel_num, n_min)
X_B_inv_X = multiply(phi_y, B_inv_X) # (kernel_num, n_min) X_B_inv_X = multiply(phi_y, B_inv_X) # (kernel_num, n_min)
denom = ny * ones(n_min) - ones(kernel_num) @ X_B_inv_X # (n_min, ) denom = ny * ones(n_min) - ones(kernel_num) @ X_B_inv_X # (n_min, )
B0 = solve(B, h @ ones((1, n_min))) + B_inv_X @ diagflat( B0 = solve(B, h @ ones((1, n_min))) + B_inv_X @ diagflat(
h.T @ B_inv_X / denom h.T @ B_inv_X / denom
) # (kernel_num, n_min) ) # (kernel_num, n_min)
B1 = solve(B, phi_x) + B_inv_X @ diagflat( B1 = solve(B, phi_x) + B_inv_X @ diagflat(
ones(kernel_num) @ multiply(phi_x, B_inv_X) ones(kernel_num) @ multiply(phi_x, B_inv_X)
) # (kernel_num, n_min) ) # (kernel_num, n_min)
B2 = (ny - 1) * (nx * B0 - B1) / (ny * (nx - 1)) # (kernel_num, n_min) B2 = (ny - 1) * (nx * B0 - B1) / (ny * (nx - 1)) # (kernel_num, n_min)
B2[B2 < 0] = 0 B2[B2 < 0] = 0
r_y = multiply(phi_y, B2).sum(axis=0).T # (n_min, ) r_y = multiply(phi_y, B2).sum(axis=0).T # (n_min, )
r_x = multiply(phi_x, B2).sum(axis=0).T # (n_min, ) r_x = multiply(phi_x, B2).sum(axis=0).T # (n_min, )
# Squared loss of RuLSIF, without regularization term. # Squared loss of RuLSIF, without regularization term.
# Directly related to the negative of the PE-divergence. # Directly related to the negative of the PE-divergence.
score = (r_y @ r_y / 2 - r_x.sum(axis=0)) / n_min score = (r_y @ r_y / 2 - r_x.sum(axis=0)) / n_min
if verbose: if verbose:
print( print(
"sigma = %.5f, lambda = %.5f, score = %.5f" "sigma = %.5f, lambda = %.5f, score = %.5f"
% (sigma, lambda_, score) % (sigma, lambda_, score)
) )
if score < score_new: if score < score_new:
score_new = score score_new = score
sigma_new = sigma sigma_new = sigma
lambda_new = lambda_ lambda_new = lambda_
return {"sigma": sigma_new, "lambda": lambda_new} return {"sigma": sigma_new, "lambda": lambda_new}
def _compute_kernel_Gaussian(x_list, y_row, neg_gamma, res) -> None: def _compute_kernel_Gaussian(x_list, y_row, neg_gamma, res) -> None:
sq_norm = sum(power(x_list - y_row, 2), 1) sq_norm = sum(power(x_list - y_row, 2), 1)
multiply(neg_gamma, sq_norm, res) multiply(neg_gamma, sq_norm, res)
exp(res, res) exp(res, res)
def _target_numpy_wrapper(x_list, y_list, neg_gamma): def _target_numpy_wrapper(x_list, y_list, neg_gamma):
res = empty((y_list.shape[0], x_list.shape[0]), np_float) res = empty((y_list.shape[0], x_list.shape[0]), np_float)
if isinstance(x_list, matrix) or isinstance(y_list, matrix): if isinstance(x_list, matrix) or isinstance(y_list, matrix):
res = asmatrix(res) res = asmatrix(res)
for j, y_row in enumerate(y_list): for j, y_row in enumerate(y_list):
# `.T` aligns shapes for matrices, does nothing for 1D ndarray. # `.T` aligns shapes for matrices, does nothing for 1D ndarray.
_compute_kernel_Gaussian(x_list, y_row, neg_gamma, res[j].T) _compute_kernel_Gaussian(x_list, y_row, neg_gamma, res[j].T)
return res return res
_compute_functions = {"numpy": _target_numpy_wrapper} _compute_functions = {"numpy": _target_numpy_wrapper}
if guvectorize_compute: if guvectorize_compute:
_compute_functions.update( _compute_functions.update(
{ {
key: guvectorize_compute(key)(_compute_kernel_Gaussian) key: guvectorize_compute(key)(_compute_kernel_Gaussian)
for key in ("cpu", "parallel") for key in ("cpu", "parallel")
} }
) )
_compute_function = _compute_functions[ _compute_function = _compute_functions[
"cpu" if "cpu" in _compute_functions else "numpy" "cpu" if "cpu" in _compute_functions else "numpy"
] ]
# Returns a 2D numpy matrix of kernel evaluated at the gridpoints with coordinates from x_list and y_list. # Returns a 2D numpy matrix of kernel evaluated at the gridpoints with coordinates from x_list and y_list.
def compute_kernel_Gaussian(x_list, y_list, sigma): def compute_kernel_Gaussian(x_list, y_list, sigma):
return _compute_function(x_list, y_list, -0.5 * sigma**-2).T return _compute_function(x_list, y_list, -0.5 * sigma**-2).T
def set_compute_kernel_target(target: str) -> None: def set_compute_kernel_target(target: str) -> None:
global _compute_function global _compute_function
if target not in ("numpy", "cpu", "parallel"): if target not in ("numpy", "cpu", "parallel"):
raise ValueError( raise ValueError(
"'target' must be one of the following: 'numpy', 'cpu', or 'parallel'." "'target' must be one of the following: 'numpy', 'cpu', or 'parallel'."
) )
if target not in _compute_functions: if target not in _compute_functions:
warn("'numba' not available; defaulting to 'numpy'.", ImportWarning) warn("'numba' not available; defaulting to 'numpy'.", ImportWarning)
target = "numpy" target = "numpy"
_compute_function = _compute_functions[target] _compute_function = _compute_functions[target]
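
A small hedged sketch of the backend switch defined above ("cpu" and "parallel" exist only when numba is installed; otherwise the call falls back to "numpy" with an ImportWarning):

# Sketch only: pick a backend, then evaluate the Gaussian kernel matrix.
import numpy as np

X = np.random.rand(200, 5)
centers = X[:10]
set_compute_kernel_target("numpy")
K = compute_kernel_Gaussian(X, centers, 1.0)  # shape (200, 10): one column per center
print(K.shape)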


@@ -1,7 +1,7 @@
from warnings import filterwarnings

from .core import densratio
from .RuLSIF import set_compute_kernel_target

filterwarnings("default", message="'numba'", category=ImportWarning, module="densratio")

__all__ = ["densratio", "set_compute_kernel_target"]


@@ -1,70 +1,70 @@
""" """
densratio.core densratio.core
~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~
Estimate Density Ratio p(x)/q(y) Estimate Density Ratio p(x)/q(y)
""" """
from numpy import linspace from numpy import linspace
from .helpers import to_ndarray from .helpers import to_ndarray
from .RuLSIF import RuLSIF from .RuLSIF import RuLSIF
def densratio( def densratio(
x, y, alpha=0, sigma_range="auto", lambda_range="auto", kernel_num=100, verbose=True x, y, alpha=0, sigma_range="auto", lambda_range="auto", kernel_num=100, verbose=True
): ):
"""Estimate alpha-mixture Density Ratio p(x)/(alpha*p(x) + (1 - alpha)*q(x)) """Estimate alpha-mixture Density Ratio p(x)/(alpha*p(x) + (1 - alpha)*q(x))
Arguments: Arguments:
x: sample from p(x). x: sample from p(x).
y: sample from q(x). y: sample from q(x).
alpha: Default 0 - corresponds to ordinary density ratio. alpha: Default 0 - corresponds to ordinary density ratio.
sigma_range: search range of Gaussian kernel bandwidth. sigma_range: search range of Gaussian kernel bandwidth.
Default "auto" means 10^-3, 10^-2, ..., 10^9. Default "auto" means 10^-3, 10^-2, ..., 10^9.
lambda_range: search range of regularization parameter for uLSIF. lambda_range: search range of regularization parameter for uLSIF.
Default "auto" means 10^-3, 10^-2, ..., 10^9. Default "auto" means 10^-3, 10^-2, ..., 10^9.
kernel_num: number of kernels. Default 100. kernel_num: number of kernels. Default 100.
verbose: indicator to print messages. Default True. verbose: indicator to print messages. Default True.
Returns: Returns:
densratio.DensityRatio object which has `compute_density_ratio()`. densratio.DensityRatio object which has `compute_density_ratio()`.
Raises: Raises:
ValueError: if dimension of x != dimension of y ValueError: if dimension of x != dimension of y
Usage:: Usage::
>>> from scipy.stats import norm >>> from scipy.stats import norm
>>> from densratio import densratio >>> from densratio import densratio
>>> x = norm.rvs(size=200, loc=1, scale=1./8) >>> x = norm.rvs(size=200, loc=1, scale=1./8)
>>> y = norm.rvs(size=200, loc=1, scale=1./2) >>> y = norm.rvs(size=200, loc=1, scale=1./2)
>>> result = densratio(x, y, alpha=0.7) >>> result = densratio(x, y, alpha=0.7)
>>> print(result) >>> print(result)
>>> density_ratio = result.compute_density_ratio(y) >>> density_ratio = result.compute_density_ratio(y)
>>> print(density_ratio) >>> print(density_ratio)
""" """
x = to_ndarray(x) x = to_ndarray(x)
y = to_ndarray(y) y = to_ndarray(y)
if x.shape[1] != y.shape[1]: if x.shape[1] != y.shape[1]:
raise ValueError("x and y must be same dimensions.") raise ValueError("x and y must be same dimensions.")
if isinstance(sigma_range, str) and sigma_range != "auto": if isinstance(sigma_range, str) and sigma_range != "auto":
raise TypeError("Invalid value for sigma_range.") raise TypeError("Invalid value for sigma_range.")
if isinstance(lambda_range, str) and lambda_range != "auto": if isinstance(lambda_range, str) and lambda_range != "auto":
raise TypeError("Invalid value for lambda_range.") raise TypeError("Invalid value for lambda_range.")
if sigma_range is None or (isinstance(sigma_range, str) and sigma_range == "auto"): if sigma_range is None or (isinstance(sigma_range, str) and sigma_range == "auto"):
sigma_range = 10 ** linspace(-3, 9, 13) sigma_range = 10 ** linspace(-3, 9, 13)
if lambda_range is None or ( if lambda_range is None or (
isinstance(lambda_range, str) and lambda_range == "auto" isinstance(lambda_range, str) and lambda_range == "auto"
): ):
lambda_range = 10 ** linspace(-3, 9, 13) lambda_range = 10 ** linspace(-3, 9, 13)
result = RuLSIF(x, y, alpha, sigma_range, lambda_range, kernel_num, verbose) result = RuLSIF(x, y, alpha, sigma_range, lambda_range, kernel_num, verbose)
return result return result


@@ -1,88 +1,88 @@
from pprint import pformat
from re import sub


class DensityRatio:
    """Density Ratio."""

    def __init__(
        self,
        method,
        alpha,
        theta,
        lambda_,
        alpha_PE,
        alpha_KL,
        kernel_info,
        compute_density_ratio,
    ):
        self.method = method
        self.alpha = alpha
        self.theta = theta
        self.lambda_ = lambda_
        self.alpha_PE = alpha_PE
        self.alpha_KL = alpha_KL
        self.kernel_info = kernel_info
        self.compute_density_ratio = compute_density_ratio

    def __str__(self):
        return """
Method: %(method)s
Alpha: %(alpha)s
Kernel Information:
%(kernel_info)s
Kernel Weights (theta):
  %(theta)s
Regularization Parameter (lambda): %(lambda_)s
Alpha-Relative PE-Divergence: %(alpha_PE)s
Alpha-Relative KL-Divergence: %(alpha_KL)s
Function to Estimate Density Ratio:
  compute_density_ratio(x)
"""[
            1:-1
        ] % dict(
            method=self.method,
            kernel_info=self.kernel_info,
            alpha=self.alpha,
            theta=my_format(self.theta),
            lambda_=self.lambda_,
            alpha_PE=self.alpha_PE,
            alpha_KL=self.alpha_KL,
        )


class KernelInfo:
    """Kernel Information."""

    def __init__(self, kernel_type, kernel_num, sigma, centers):
        self.kernel_type = kernel_type
        self.kernel_num = kernel_num
        self.sigma = sigma
        self.centers = centers

    def __str__(self):
        return """
Kernel type: %(kernel_type)s
Number of kernels: %(kernel_num)s
Bandwidth(sigma): %(sigma)s
Centers: %(centers)s
"""[
            1:-1
        ] % dict(
            kernel_type=self.kernel_type,
            kernel_num=self.kernel_num,
            sigma=self.sigma,
            centers=my_format(self.centers),
        )


def my_format(str):
    return sub(r"\s+", " ", (pformat(str).split("\n")[0] + ".."))


@@ -1,36 +1,36 @@
from numpy import array, ndarray, result_type

np_float = result_type(float)

try:
    import numba as nb
except ModuleNotFoundError:
    guvectorize_compute = None
else:
    _nb_float = nb.from_dtype(np_float)

    def guvectorize_compute(target: str, *, cache: bool = True):
        return nb.guvectorize(
            [nb.void(_nb_float[:, :], _nb_float[:], _nb_float, _nb_float[:])],
            "(m, p),(p),()->(m)",
            nopython=True,
            target=target,
            cache=cache,
        )


def is_numeric(x):
    return isinstance(x, int) or isinstance(x, float)


def to_ndarray(x):
    if isinstance(x, ndarray):
        if len(x.shape) == 1:
            return x.reshape(-1, 1)
        else:
            return x
    elif str(type(x)) == "<class 'pandas.core.frame.DataFrame'>":
        return x.values
    elif not x:
        raise ValueError("Cannot transform to numpy.matrix.")
    else:
        return to_ndarray(array(x))


@@ -1,4 +1,4 @@
import numpy as np


def get_doc(probs1, probs2):
    # Difference of Confidences (DoC): gap between the mean confidence on the two samples.
    return np.mean(probs2) - np.mean(probs1)
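
A hedged usage sketch for get_doc (DoC is typically fed the classifier's max-confidence scores on a source sample and on a shifted target sample; all values below are illustrative):

# Sketch only: DoC-based accuracy estimate from a confidence drop.
import numpy as np

rng = np.random.default_rng(0)
source_conf = rng.uniform(0.6, 1.0, size=1000)  # stand-in source confidences
target_conf = rng.uniform(0.4, 1.0, size=1000)  # stand-in target confidences

doc = get_doc(source_conf, target_conf)  # negative when confidence drops
source_acc = 0.90                        # assumed accuracy on the source sample
estim_target_acc = source_acc + doc      # shift source accuracy by the DoC
print(f"DoC = {doc:.3f}, estimated target accuracy = {estim_target_acc:.3f}")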


@@ -1,66 +1,66 @@
import numpy as np
from scipy.sparse import issparse, vstack
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

from baselines import densratio
from baselines.pykliep import DensityRatioEstimator


def kliep(Xtr, ytr, Xte):
    kliep = DensityRatioEstimator()
    kliep.fit(Xtr, Xte)
    return kliep.predict(Xtr)


def usilf(Xtr, ytr, Xte, alpha=0.0):
    dense_ratio_obj = densratio(Xtr, Xte, alpha=alpha, verbose=False)
    return dense_ratio_obj.compute_density_ratio(Xtr)


def logreg(Xtr, ytr, Xte):
    # check "Direct Density Ratio Estimation for
    # Large-scale Covariate Shift Adaptation", Eq.28
    if issparse(Xtr):
        X = vstack([Xtr, Xte])
    else:
        X = np.concatenate([Xtr, Xte])
    y = [0] * Xtr.shape[0] + [1] * Xte.shape[0]
    logreg = GridSearchCV(
        LogisticRegression(),
        param_grid={"C": np.logspace(-3, 3, 7), "class_weight": ["balanced", None]},
        n_jobs=-1,
    )
    logreg.fit(X, y)
    probs = logreg.predict_proba(Xtr)
    prob_train, prob_test = probs[:, 0], probs[:, 1]
    prior_train = Xtr.shape[0]
    prior_test = Xte.shape[0]
    w = (prior_train / prior_test) * (prob_test / prob_train)
    return w


kdex2_params = {"bandwidth": np.logspace(-1, 1, 20)}


def kdex2_lltr(Xtr):
    if issparse(Xtr):
        Xtr = Xtr.toarray()
    return GridSearchCV(KernelDensity(), kdex2_params).fit(Xtr).score_samples(Xtr)


def kdex2_weights(Xtr, Xte, log_likelihood_tr):
    log_likelihood_te = (
        GridSearchCV(KernelDensity(), kdex2_params).fit(Xte).score_samples(Xtr)
    )
    likelihood_tr = np.exp(log_likelihood_tr)
    likelihood_te = np.exp(log_likelihood_te)
    return likelihood_te / likelihood_tr


def get_acc(tr_preds, ytr, w):
    return np.sum((1.0 * (tr_preds == ytr)) * w) / np.sum(w)
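
All of these weighting functions plug into the same accuracy-estimation pattern; a hedged sketch (synthetic data, logreg weights shown, with kliep/usilf as drop-in replacements):

# Sketch only: importance-weighted training accuracy as a test-accuracy estimate.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=2000, random_state=0)
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.5, random_state=0)

clf = LogisticRegression().fit(Xtr, ytr)
tr_preds = clf.predict(Xtr)

w = logreg(Xtr, ytr, Xte)              # weights approximating q(x)/p(x) on the training set
estim_acc = get_acc(tr_preds, ytr, w)  # weighted train accuracy ~ test accuracy
print(f"estimated = {estim_acc:.3f}, true = {clf.score(Xte, yte):.3f}")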


@@ -1,140 +1,140 @@
# import itertools
# from typing import Iterable

# import quapy as qp
# import quapy.functional as F
# from densratio import densratio
# from quapy.method.aggregative import *
# from quapy.protocol import (
#     AbstractStochasticSeededProtocol,
#     OnLabelledCollectionProtocol,
# )
# from scipy.sparse import issparse, vstack
# from scipy.spatial.distance import cdist
# from scipy.stats import multivariate_normal
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import GridSearchCV
# from sklearn.neighbors import KernelDensity

import time

import numpy as np
import sklearn.metrics as metrics
from pykliep import DensityRatioEstimator
from quapy.protocol import APP
from scipy.sparse import issparse, vstack
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

import baselines.impweight as iw
from baselines.densratio import densratio
from quacc.dataset import Dataset


# ---------------------------------------------------------------------------------------
# Methods of "importance weight", e.g., by ratio density estimation (KLIEP, SILF, LogReg)
# ---------------------------------------------------------------------------------------
class ImportanceWeight:
    def weights(self, Xtr, ytr, Xte):
        ...


class KLIEP(ImportanceWeight):
    def __init__(self):
        pass

    def weights(self, Xtr, ytr, Xte):
        kliep = DensityRatioEstimator()
        kliep.fit(Xtr, Xte)
        return kliep.predict(Xtr)


class USILF(ImportanceWeight):
    def __init__(self, alpha=0.0):
        self.alpha = alpha

    def weights(self, Xtr, ytr, Xte):
        dense_ratio_obj = densratio(Xtr, Xte, alpha=self.alpha, verbose=False)
        return dense_ratio_obj.compute_density_ratio(Xtr)


class LogReg(ImportanceWeight):
    def __init__(self):
        pass

    def weights(self, Xtr, ytr, Xte):
        # check "Direct Density Ratio Estimation for
        # Large-scale Covariate Shift Adaptation", Eq.28
        if issparse(Xtr):
            X = vstack([Xtr, Xte])
        else:
            X = np.concatenate([Xtr, Xte])
        y = [0] * Xtr.shape[0] + [1] * Xte.shape[0]
        logreg = GridSearchCV(
            LogisticRegression(),
            param_grid={"C": np.logspace(-3, 3, 7), "class_weight": ["balanced", None]},
            n_jobs=-1,
        )
        logreg.fit(X, y)
        probs = logreg.predict_proba(Xtr)
        prob_train, prob_test = probs[:, 0], probs[:, 1]
        prior_train = Xtr.shape[0]
        prior_test = Xte.shape[0]
        w = (prior_train / prior_test) * (prob_test / prob_train)
        return w


class KDEx2(ImportanceWeight):
    def __init__(self):
        pass

    def weights(self, Xtr, ytr, Xte):
        params = {"bandwidth": np.logspace(-1, 1, 20)}
        log_likelihood_tr = (
            GridSearchCV(KernelDensity(), params).fit(Xtr).score_samples(Xtr)
        )
        log_likelihood_te = (
            GridSearchCV(KernelDensity(), params).fit(Xte).score_samples(Xtr)
        )
        likelihood_tr = np.exp(log_likelihood_tr)
        likelihood_te = np.exp(log_likelihood_te)
        return likelihood_te / likelihood_tr


if __name__ == "__main__":
    # d = Dataset("rcv1", target="CCAT").get_raw()
    d = Dataset("imdb", n_prevalences=1).get()[0]

    tstart = time.time()
    lr = LogisticRegression()
    lr.fit(*d.train.Xy)
    val_preds = lr.predict(d.validation.X)

    protocol = APP(
        d.test,
        n_prevalences=21,
        repeats=1,
        sample_size=100,
        return_type="labelled_collection",
    )
    results = []
    for sample in protocol():
        wx = iw.kliep(d.validation.X, d.validation.y, sample.X)
        test_preds = lr.predict(sample.X)
        estim_acc = np.sum((1.0 * (val_preds == d.validation.y)) * wx) / np.sum(wx)
        true_acc = metrics.accuracy_score(sample.y, test_preds)
        results.append((sample.prevalence(), estim_acc, true_acc))
    tend = time.time()

    for r in results:
        print(*r)

    print(f"logreg finished [took {tend-tstart:.3f}s]")

    import win11toast

    win11toast.notify("models.py", "Completed")


@@ -1,221 +1,221 @@
import warnings

import numpy as np
from scipy.sparse import csr_matrix


class DensityRatioEstimator:
    """
    Class to accomplish direct density estimation implementing the original KLIEP
    algorithm from Direct Importance Estimation with Model Selection
    and Its Application to Covariate Shift Adaptation by Sugiyama et al.

    The training set is distributed via
    train ~ p(x)
    and the test set is distributed via
    test ~ q(x).

    The KLIEP algorithm and its variants approximate w(x) = q(x) / p(x) directly. The predict function returns the
    estimate of w(x). The function w(x) can serve as sample weights for the training set during
    training to modify the expectation function that the model's loss function is optimized via,
    i.e.
    E_{x ~ w(x)p(x)} loss(x) = E_{x ~ q(x)} loss(x).

    Usage :
    The fit method is used to run the KLIEP algorithm using LCV and returns value of J
    trained on the entire training/test set with the best sigma found.
    Use the predict method on the training set to determine the sample weights from the KLIEP algorithm.
    """

    def __init__(
        self,
        max_iter=5000,
        num_params=[0.1, 0.2],
        epsilon=1e-4,
        cv=3,
        sigmas=[0.01, 0.1, 0.25, 0.5, 0.75, 1],
        random_state=None,
        verbose=0,
    ):
        """
        Direct density estimation using an inner LCV loop to estimate the proper model. Can be used with sklearn
        cross-validation methods, with or without storing the inner CV, or with a standard grid search.

        max_iter : Number of iterations to perform
        num_params : List of fractions of test-set vectors used to construct the approximation for the inner LCV.
                     Each entry must be a float; the original paper used 10%, i.e. 0.1.
        sigmas : List of sigmas to be used in inner LCV loop.
        epsilon : Additive factor in the iterative algorithm for numerical stability.
        cv : Number of cross-validation folds (R in the original paper).
        """
        self.max_iter = max_iter
        self.num_params = num_params
        self.epsilon = epsilon
        self.verbose = verbose
        self.sigmas = sigmas
        self.cv = cv
        self.random_state = random_state

    def fit(self, X_train, X_test, alpha_0=None):
        """Uses cross validation to select sigma as in the original paper (LCV).
        In a break from sklearn convention, y=X_test.
        The parameter cv corresponds to R in the original paper.
        Once found, the best sigma is used to train on the full set."""
        # LCV loop, shuffle a copy in place for performance.
        cv = self.cv
        chunk = int(X_test.shape[0] / float(cv))
        if self.random_state is not None:
            np.random.seed(self.random_state)
        # if isinstance(X_test, csr_matrix):
        #     X_test_shuffled = X_test.toarray()
        # else:
        #     X_test_shuffled = X_test.copy()
        X_test_shuffled = X_test.copy()
        X_test_index = np.arange(X_test_shuffled.shape[0])
        np.random.shuffle(X_test_index)
        X_test_shuffled = X_test_shuffled[X_test_index, :]

        j_scores = {}

        if not isinstance(self.sigmas, list):
            self.sigmas = [self.sigmas]
        if not isinstance(self.num_params, list):
            self.num_params = [self.num_params]

        if len(self.sigmas) * len(self.num_params) > 1:
            # Inner LCV loop
            for num_param in self.num_params:
                for sigma in self.sigmas:
                    j_scores[(num_param, sigma)] = np.zeros(cv)
                    for k in range(1, cv + 1):
                        if self.verbose > 0:
                            print("Training: sigma: %s R: %s" % (sigma, k))
                        X_test_fold = X_test_shuffled[(k - 1) * chunk : k * chunk, :]
                        j_scores[(num_param, sigma)][k - 1] = self._fit(
                            X_train=X_train,
                            X_test=X_test_fold,
                            num_parameters=num_param,
                            sigma=sigma,
                        )
                    j_scores[(num_param, sigma)] = np.mean(j_scores[(num_param, sigma)])

            sorted_scores = sorted(
                [x for x in j_scores.items() if np.isfinite(x[1])],
                key=lambda x: x[1],
                reverse=True,
            )
            if len(sorted_scores) == 0:
                warnings.warn("LCV failed to converge for all values of sigma.")
                return self
            self._sigma = sorted_scores[0][0][1]
            self._num_parameters = sorted_scores[0][0][0]
            self._j_scores = sorted_scores
        else:
            self._sigma = self.sigmas[0]
            self._num_parameters = self.num_params[0]

        # best sigma
        self._j = self._fit(
            X_train=X_train,
            X_test=X_test_shuffled,
            num_parameters=self._num_parameters,
            sigma=self._sigma,
        )

        return self  # Compatibility with sklearn

    def _fit(self, X_train, X_test, num_parameters, sigma, alpha_0=None):
        """Fits the estimator with the given parameters w-hat and returns J"""
        if isinstance(num_parameters, float):
            num_parameters = int(X_test.shape[0] * num_parameters)
        self._select_param_vectors(
            X_test=X_test, sigma=sigma, num_parameters=num_parameters
        )
        # if isinstance(X_train, csr_matrix):
        #     X_train = X_train.toarray()
        X_train = self._reshape_X(X_train)
        X_test = self._reshape_X(X_test)
        if alpha_0 is None:
            alpha_0 = np.ones(shape=(num_parameters, 1)) / float(num_parameters)
        self._find_alpha(
            X_train=X_train,
            X_test=X_test,
            num_parameters=num_parameters,
            epsilon=self.epsilon,
            alpha_0=alpha_0,
            sigma=sigma,
        )
        return self._calculate_j(X_test, sigma=sigma)

    def _calculate_j(self, X_test, sigma):
        pred = self.predict(X_test, sigma=sigma) + 0.0000001
        log = np.log(pred).sum()
        return log / (X_test.shape[0])

    def score(self, X_test):
        """Return the J score, similar to sklearn's API"""
        return self._calculate_j(X_test=X_test, sigma=self._sigma)

    @staticmethod
    def _reshape_X(X):
        """Reshape input from mxn to mx1xn to take advantage of numpy broadcasting."""
        if len(X.shape) != 3:
            return X.reshape((X.shape[0], 1, X.shape[1]))
        return X

    def _select_param_vectors(self, X_test, sigma, num_parameters):
        """X_test is the test set. b is the number of parameters."""
        indices = np.random.choice(X_test.shape[0], size=num_parameters, replace=False)
        self._test_vectors = X_test[indices, :].copy()
        self._phi_fitted = True

    def _phi(self, X, sigma=None):
        if sigma is None:
            sigma = self._sigma
        if self._phi_fitted:
            return np.exp(
                -np.sum((X - self._test_vectors) ** 2, axis=-1) / (2 * sigma**2)
            )
        raise Exception("Phi not fitted.")

    def _find_alpha(self, alpha_0, X_train, X_test, num_parameters, sigma, epsilon):
        A = self._phi(X_test, sigma)
        b = self._phi(X_train, sigma).sum(axis=0) / X_train.shape[0]
        b = b.reshape((num_parameters, 1))

        out = alpha_0.copy()
        for k in range(self.max_iter):
            mat = np.dot(A, out)
            mat += 0.000000001
            out += epsilon * np.dot(np.transpose(A), 1.0 / mat)
            out += b * (
                ((1 - np.dot(np.transpose(b), out)) / np.dot(np.transpose(b), b))
            )
            out = np.maximum(0, out)
            out /= np.dot(np.transpose(b), out)

        self._alpha = out
        self._fitted = True

    def predict(self, X, sigma=None):
        """Equivalent of w(X) from the original paper."""
        X = self._reshape_X(X)
        if not self._fitted:
            raise Exception("Not fitted!")
        return np.dot(self._phi(X, sigma=sigma), self._alpha).reshape((X.shape[0],))
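
# Minimal usage sketch (added; synthetic data, not part of the original
# module): estimate w(x) = q(x) / p(x) when the test distribution is shifted
# with respect to the training one.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X_train = rng.normal(loc=0.0, scale=1.0, size=(500, 2))  # ~ p(x)
    X_test = rng.normal(loc=0.5, scale=1.0, size=(500, 2))  # ~ q(x)
    kliep = DensityRatioEstimator(max_iter=1000, sigmas=[0.25, 0.5, 1.0])
    kliep.fit(X_train, X_test)  # LCV selects (num_parameters, sigma)
    w = kliep.predict(X_train)  # estimated weights for the training points
    # By the KLIEP normalization constraint, the weights average to ~1
    # over the training set.
    print(f"mean weight: {w.mean():.3f}")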


@@ -1,14 +1,14 @@
import numpy as np
from sklearn import clone
from sklearn.base import BaseEstimator


def clone_fit(c_model: BaseEstimator, data, labels):
    c_model2 = clone(c_model)
    c_model2.fit(data, labels)
    return c_model2


def get_score(pred1, pred2, labels):
    return np.mean((pred1 == labels).astype(int) - (pred2 == labels).astype(int))
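
# Note (added): get_score averages the per-sample correctness difference, so it
# equals accuracy(pred1, labels) - accuracy(pred2, labels).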

conf.yaml

@@ -1,233 +1,233 @@
debug_conf: &debug_conf
  global:
    METRICS:
      - acc
    DATASET_N_PREVS: 5
    DATASET_PREVS:
      # - 0.2
      - 0.5
      # - 0.8

  confs:
    - DATASET_NAME: rcv1
      DATASET_TARGET: CCAT

  plot_confs:
    debug:
      PLOT_ESTIMATORS:
        - mulmc_sld
        - atc_mc
      PLOT_STDEV: true

mc_conf: &mc_conf
  global:
    METRICS:
      - acc
    DATASET_N_PREVS: 9
    DATASET_DIR_UPDATE: true

  confs:
    - DATASET_NAME: rcv1
      DATASET_TARGET: CCAT
    # - DATASET_NAME: imdb

  plot_confs:
    debug3:
      PLOT_ESTIMATORS:
        - binmc_sld
        - mulmc_sld
        - binne_sld
        - mulne_sld
        - bin_sld_gs
        - mul_sld_gs
        - atc_mc
      PLOT_STDEV: true

test_conf: &test_conf
  global:
    METRICS:
      - acc
      - f1
    DATASET_N_PREVS: 9

  confs:
    - DATASET_NAME: rcv1
      DATASET_TARGET: CCAT
    # - DATASET_NAME: imdb

  plot_confs:
    gs_vs_gsq:
      PLOT_ESTIMATORS:
        - bin_sld
        - bin_sld_gs
        - bin_sld_gsq
        - mul_sld
        - mul_sld_gs
        - mul_sld_gsq
    gs_vs_atc:
      PLOT_ESTIMATORS:
        - bin_sld
        - bin_sld_gs
        - mul_sld
        - mul_sld_gs
        - atc_mc
        - atc_ne
    sld_vs_pacc:
      PLOT_ESTIMATORS:
        - bin_sld
        - bin_sld_gs
        - mul_sld
        - mul_sld_gs
        - bin_pacc
        - bin_pacc_gs
        - mul_pacc
        - mul_pacc_gs
        - atc_mc
        - atc_ne
    pacc_vs_atc:
      PLOT_ESTIMATORS:
        - bin_pacc
        - bin_pacc_gs
        - mul_pacc
        - mul_pacc_gs
        - atc_mc
        - atc_ne

main_conf: &main_conf
  global:
    METRICS:
      - acc
      - f1
    DATASET_N_PREVS: 9
    DATASET_DIR_UPDATE: true

  confs:
    - DATASET_NAME: rcv1
      DATASET_TARGET: CCAT
    - DATASET_NAME: imdb

  confs_next:
    - DATASET_NAME: rcv1
      DATASET_TARGET: GCAT
    - DATASET_NAME: rcv1
      DATASET_TARGET: MCAT

  plot_confs:
    gs_vs_qgs:
      PLOT_ESTIMATORS:
        - mul_sld_gs
        - bin_sld_gs
        - mul_sld_gsq
        - bin_sld_gsq
        - atc_mc
        - atc_ne
      PLOT_STDEV: true

  plot_confs_completed:
    max_conf_vs_atc_pacc:
      PLOT_ESTIMATORS:
        - bin_pacc
        - binmc_pacc
        - mul_pacc
        - mulmc_pacc
        - atc_mc
      PLOT_STDEV: true
    max_conf_vs_entropy_pacc:
      PLOT_ESTIMATORS:
        - binmc_pacc
        - binne_pacc
        - mulmc_pacc
        - mulne_pacc
        - atc_mc
      PLOT_STDEV: true
    gs_vs_atc:
      PLOT_ESTIMATORS:
        - mul_sld_gs
        - bin_sld_gs
        - mul_pacc_gs
        - bin_pacc_gs
        - atc_mc
        - atc_ne
      PLOT_STDEV: true
    gs_vs_all:
      PLOT_ESTIMATORS:
        - mul_sld_gs
        - bin_sld_gs
        - mul_pacc_gs
        - bin_pacc_gs
        - atc_mc
        - doc_feat
        - kfcv
      PLOT_STDEV: true
    gs_vs_qgs:
      PLOT_ESTIMATORS:
        - mul_sld_gs
        - bin_sld_gs
        - mul_sld_gsq
        - bin_sld_gsq
        - atc_mc
        - atc_ne
      PLOT_STDEV: true
    cc_vs_other:
      PLOT_ESTIMATORS:
        - mul_cc
        - bin_cc
        - mul_sld
        - bin_sld
        - mul_pacc
        - bin_pacc
      PLOT_STDEV: true
    max_conf_vs_atc:
      PLOT_ESTIMATORS:
        - bin_sld
        - binmc_sld
        - mul_sld
        - mulmc_sld
        - atc_mc
      PLOT_STDEV: true
    max_conf_vs_entropy:
      PLOT_ESTIMATORS:
        - binmc_sld
        - binne_sld
        - mulmc_sld
        - mulne_sld
        - atc_mc
      PLOT_STDEV: true
    sld_vs_pacc:
      PLOT_ESTIMATORS:
        - bin_sld
        - mul_sld
        - bin_pacc
        - mul_pacc
        - atc_mc
      PLOT_STDEV: true

  plot_confs_other:
    best_vs_atc:
      PLOT_ESTIMATORS:
        - mul_sld_bcts
        - mul_sld_gs
        - bin_sld_bcts
        - bin_sld_gs
        - atc_mc
        - atc_ne
    all_vs_atc:
      PLOT_ESTIMATORS:
        - bin_sld
        - bin_sld_bcts
        - bin_sld_gs
        - mul_sld
        - mul_sld_bcts
        - mul_sld_gs
        - atc_mc
        - atc_ne
    best_vs_all:
      PLOT_ESTIMATORS:
        - bin_sld_bcts
        - bin_sld_gs
        - mul_sld_bcts
        - mul_sld_gs
        - kfcv
        - atc_mc
        - atc_ne
        - doc_feat

exec: *main_conf
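
# Note (added): each top-level section defines a YAML anchor (e.g. &main_conf)
# and `exec` selects the configuration to run by dereferencing one of them via
# an alias (*main_conf); switching experiment suites only requires pointing
# `exec` at a different anchor.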


@@ -1,445 +1,445 @@
<div>target: default</div>
<div>train: [0.5 0.5]</div>
<div>validation: [0.5 0.5]</div>
<div>evaluate_binary: 277.300s</div>
<div>evaluate_multiclass: 139.986s</div>
<div>kfcv: 98.625s</div>
<div>atc_mc: 93.304s</div>
<div>atc_ne: 91.201s</div>
<div>doc_feat: 29.930s</div>
<div>rca_score: 1018.341s</div>
<div>rca_star_score: 1013.733s</div>
<div>tot: 1054.413s</div>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>bin</th>
      <th>mul</th>
      <th>kfcv</th>
      <th>atc_mc</th>
      <th>atc_ne</th>
      <th>doc_feat</th>
      <th>rca</th>
      <th>rca_star</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>(0.0, 1.0)</th>
      <td>0.0154</td>
      <td>0.0177</td>
      <td>0.0249</td>
      <td>0.0291</td>
      <td>0.0291</td>
      <td>0.0248</td>
      <td>0.2705</td>
      <td>0.2413</td>
    </tr>
    <tr>
      <th>(0.05, 0.95)</th>
      <td>0.0309</td>
      <td>0.0284</td>
      <td>0.0252</td>
      <td>0.0300</td>
      <td>0.0300</td>
      <td>0.0247</td>
      <td>0.2796</td>
      <td>0.2504</td>
    </tr>
    <tr>
      <th>(0.1, 0.9)</th>
      <td>0.0309</td>
      <td>0.0302</td>
      <td>0.0251</td>
      <td>0.0279</td>
      <td>0.0279</td>
      <td>0.0250</td>
      <td>0.2722</td>
      <td>0.2430</td>
    </tr>
    <tr>
      <th>(0.15, 0.85)</th>
      <td>0.0310</td>
      <td>0.0339</td>
      <td>0.0245</td>
      <td>0.0269</td>
      <td>0.0269</td>
      <td>0.0244</td>
      <td>0.2684</td>
      <td>0.2392</td>
    </tr>
    <tr>
      <th>(0.2, 0.8)</th>
      <td>0.0411</td>
      <td>0.0407</td>
      <td>0.0259</td>
      <td>0.0292</td>
      <td>0.0292</td>
      <td>0.0257</td>
      <td>0.2724</td>
      <td>0.2432</td>
    </tr>
    <tr>
      <th>(0.25, 0.75)</th>
      <td>0.0381</td>
      <td>0.0376</td>
      <td>0.0262</td>
      <td>0.0319</td>
      <td>0.0319</td>
      <td>0.0259</td>
      <td>0.2701</td>
      <td>0.2409</td>
    </tr>
    <tr>
      <th>(0.3, 0.7)</th>
      <td>0.0442</td>
      <td>0.0452</td>
      <td>0.0254</td>
      <td>0.0273</td>
      <td>0.0273</td>
      <td>0.0256</td>
      <td>0.2650</td>
      <td>0.2358</td>
    </tr>
    <tr>
      <th>(0.35, 0.65)</th>
      <td>0.0480</td>
      <td>0.0498</td>
      <td>0.0236</td>
      <td>0.0257</td>
      <td>0.0257</td>
      <td>0.0235</td>
      <td>0.2640</td>
      <td>0.2347</td>
    </tr>
    <tr>
      <th>(0.4, 0.6)</th>
      <td>0.0401</td>
      <td>0.0431</td>
      <td>0.0222</td>
      <td>0.0296</td>
      <td>0.0296</td>
      <td>0.0220</td>
      <td>0.2654</td>
      <td>0.2361</td>
    </tr>
    <tr>
      <th>(0.45, 0.55)</th>
      <td>0.0551</td>
      <td>0.0558</td>
      <td>0.0243</td>
      <td>0.0295</td>
      <td>0.0295</td>
      <td>0.0246</td>
      <td>0.1838</td>
      <td>0.1551</td>
    </tr>
    <tr>
      <th>(0.5, 0.5)</th>
      <td>0.0499</td>
      <td>0.0513</td>
      <td>0.0308</td>
      <td>0.0319</td>
      <td>0.0319</td>
      <td>0.0309</td>
      <td>0.1472</td>
      <td>0.1202</td>
    </tr>
    <tr>
      <th>(0.55, 0.45)</th>
      <td>0.0538</td>
      <td>0.0542</td>
      <td>0.0278</td>
      <td>0.0329</td>
      <td>0.0329</td>
      <td>0.0280</td>
      <td>0.1717</td>
      <td>0.1459</td>
    </tr>
    <tr>
      <th>(0.6, 0.4)</th>
      <td>0.0476</td>
      <td>0.0484</td>
      <td>0.0258</td>
      <td>0.0298</td>
      <td>0.0298</td>
      <td>0.0259</td>
      <td>0.2434</td>
      <td>0.2147</td>
    </tr>
    <tr>
      <th>(0.65, 0.35)</th>
      <td>0.0447</td>
      <td>0.0474</td>
      <td>0.0287</td>
      <td>0.0332</td>
      <td>0.0332</td>
      <td>0.0288</td>
      <td>0.2632</td>
      <td>0.2340</td>
    </tr>
    <tr>
      <th>(0.7, 0.3)</th>
      <td>0.0388</td>
      <td>0.0397</td>
      <td>0.0295</td>
      <td>0.0328</td>
      <td>0.0328</td>
      <td>0.0296</td>
      <td>0.2659</td>
      <td>0.2367</td>
    </tr>
    <tr>
      <th>(0.75, 0.25)</th>
      <td>0.0336</td>
      <td>0.0399</td>
      <td>0.0241</td>
      <td>0.0293</td>
      <td>0.0293</td>
      <td>0.0244</td>
      <td>0.2612</td>
      <td>0.2320</td>
    </tr>
    <tr>
      <th>(0.8, 0.2)</th>
      <td>0.0407</td>
      <td>0.0447</td>
      <td>0.0266</td>
      <td>0.0303</td>
      <td>0.0303</td>
      <td>0.0271</td>
      <td>0.2601</td>
      <td>0.2309</td>
    </tr>
    <tr>
      <th>(0.85, 0.15)</th>
      <td>0.0383</td>
      <td>0.0423</td>
      <td>0.0219</td>
      <td>0.0278</td>
      <td>0.0278</td>
      <td>0.0220</td>
      <td>0.2670</td>
      <td>0.2378</td>
    </tr>
    <tr>
      <th>(0.9, 0.1)</th>
      <td>0.0351</td>
      <td>0.0387</td>
      <td>0.0244</td>
      <td>0.0275</td>
      <td>0.0275</td>
      <td>0.0245</td>
      <td>0.2618</td>
      <td>0.2326</td>
    </tr>
    <tr>
      <th>(0.95, 0.05)</th>
      <td>0.0238</td>
      <td>0.0263</td>
      <td>0.0269</td>
      <td>0.0296</td>
      <td>0.0296</td>
      <td>0.0272</td>
      <td>0.2602</td>
      <td>0.2310</td>
    </tr>
    <tr>
      <th>(1.0, 0.0)</th>
      <td>0.0118</td>
      <td>0.0202</td>
      <td>0.0241</td>
      <td>0.0279</td>
      <td>0.0279</td>
      <td>0.0244</td>
      <td>0.2571</td>
      <td>0.2279</td>
    </tr>
  </tbody>
</table>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>bin</th>
      <th>mul</th>
      <th>kfcv</th>
      <th>atc_mc</th>
      <th>atc_ne</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>(0.0, 1.0)</th>
      <td>0.0088</td>
      <td>0.0100</td>
      <td>0.0580</td>
      <td>0.0183</td>
      <td>0.0183</td>
    </tr>
    <tr>
      <th>(0.05, 0.95)</th>
      <td>0.0175</td>
      <td>0.0159</td>
      <td>0.0605</td>
      <td>0.0193</td>
      <td>0.0193</td>
    </tr>
    <tr>
      <th>(0.1, 0.9)</th>
      <td>0.0184</td>
      <td>0.0176</td>
      <td>0.0532</td>
      <td>0.0189</td>
      <td>0.0189</td>
    </tr>
    <tr>
      <th>(0.15, 0.85)</th>
      <td>0.0188</td>
      <td>0.0204</td>
      <td>0.0475</td>
      <td>0.0180</td>
      <td>0.0180</td>
    </tr>
    <tr>
      <th>(0.2, 0.8)</th>
      <td>0.0269</td>
      <td>0.0266</td>
      <td>0.0455</td>
      <td>0.0206</td>
      <td>0.0206</td>
    </tr>
    <tr>
      <th>(0.25, 0.75)</th>
      <td>0.0265</td>
      <td>0.0261</td>
      <td>0.0401</td>
      <td>0.0242</td>
      <td>0.0242</td>
    </tr>
    <tr>
      <th>(0.3, 0.7)</th>
      <td>0.0328</td>
      <td>0.0336</td>
      <td>0.0331</td>
      <td>0.0208</td>
      <td>0.0208</td>
    </tr>
    <tr>
      <th>(0.35, 0.65)</th>
      <td>0.0386</td>
      <td>0.0394</td>
      <td>0.0307</td>
      <td>0.0211</td>
      <td>0.0211</td>
    </tr>
    <tr>
      <th>(0.4, 0.6)</th>
      <td>0.0343</td>
      <td>0.0371</td>
      <td>0.0273</td>
      <td>0.0265</td>
      <td>0.0265</td>
    </tr>
    <tr>
      <th>(0.45, 0.55)</th>
      <td>0.0511</td>
      <td>0.0512</td>
      <td>0.0231</td>
      <td>0.0275</td>
      <td>0.0275</td>
    </tr>
    <tr>
      <th>(0.5, 0.5)</th>
      <td>0.0517</td>
      <td>0.0529</td>
      <td>0.0306</td>
      <td>0.0319</td>
      <td>0.0319</td>
    </tr>
    <tr>
      <th>(0.55, 0.45)</th>
      <td>0.0584</td>
      <td>0.0583</td>
      <td>0.0308</td>
      <td>0.0354</td>
      <td>0.0354</td>
    </tr>
    <tr>
      <th>(0.6, 0.4)</th>
      <td>0.0590</td>
      <td>0.0599</td>
      <td>0.0363</td>
      <td>0.0357</td>
      <td>0.0357</td>
    </tr>
    <tr>
      <th>(0.65, 0.35)</th>
      <td>0.0635</td>
      <td>0.0662</td>
      <td>0.0506</td>
      <td>0.0440</td>
      <td>0.0440</td>
    </tr>
    <tr>
      <th>(0.7, 0.3)</th>
      <td>0.0596</td>
      <td>0.0638</td>
      <td>0.0654</td>
      <td>0.0457</td>
      <td>0.0457</td>
    </tr>
    <tr>
      <th>(0.75, 0.25)</th>
      <td>0.0627</td>
      <td>0.0744</td>
      <td>0.0964</td>
      <td>0.0461</td>
      <td>0.0461</td>
    </tr>
    <tr>
      <th>(0.8, 0.2)</th>
      <td>0.0909</td>
      <td>0.0999</td>
      <td>0.1400</td>
      <td>0.0629</td>
      <td>0.0629</td>
    </tr>
    <tr>
      <th>(0.85, 0.15)</th>
      <td>0.1052</td>
      <td>0.1126</td>
      <td>0.1829</td>
      <td>0.0727</td>
      <td>0.0727</td>
    </tr>
    <tr>
      <th>(0.9, 0.1)</th>
      <td>0.1377</td>
      <td>0.1481</td>
      <td>0.2839</td>
      <td>0.1215</td>
      <td>0.1215</td>
    </tr>
    <tr>
      <th>(0.95, 0.05)</th>
      <td>0.1305</td>
      <td>0.1450</td>
      <td>0.4592</td>
      <td>0.2037</td>
      <td>0.2037</td>
    </tr>
    <tr>
      <th>(1.0, 0.0)</th>
      <td>0.1092</td>
      <td>0.1387</td>
      <td>0.8818</td>
      <td>0.5267</td>
      <td>0.5267</td>
    </tr>
  </tbody>
</table>
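
<!-- Note (added): each row is a (class 0, class 1) test prevalence and each
     column an accuracy-prediction method, with cells reporting estimation
     error; given METRICS [acc, f1] in conf.yaml, the first table presumably
     refers to accuracy and the second to F1. -->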

out_rcv1.md: file diff suppressed because it is too large.


@@ -1,445 +1,445 @@
<div>target: default</div>
<div>train: [0.60621118 0.39378882]</div>
<div>validation: [0.60559006 0.39440994]</div>
<div>evaluate_binary: 31.883s</div>
<div>evaluate_multiclass: 24.748s</div>
<div>kfcv: 23.957s</div>
<div>atc_mc: 36.062s</div>
<div>atc_ne: 37.123s</div>
<div>doc_feat: 7.063s</div>
<div>rca_score: 148.420s</div>
<div>rca_star_score: 145.690s</div>
<div>tot: 149.118s</div>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>bin</th>
      <th>mul</th>
      <th>kfcv</th>
      <th>atc_mc</th>
      <th>atc_ne</th>
      <th>doc_feat</th>
      <th>rca</th>
      <th>rca_star</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>(0.0, 1.0)</th>
      <td>0.0411</td>
      <td>0.0907</td>
      <td>0.0208</td>
      <td>0.0267</td>
      <td>0.0267</td>
      <td>0.0204</td>
      <td>0.1106</td>
      <td>0.1059</td>
    </tr>
    <tr>
      <th>(0.05, 0.95)</th>
      <td>0.0392</td>
      <td>0.0897</td>
      <td>0.0216</td>
      <td>0.0266</td>
      <td>0.0266</td>
      <td>0.0211</td>
      <td>0.0523</td>
      <td>0.0510</td>
    </tr>
    <tr>
      <th>(0.1, 0.9)</th>
      <td>0.0371</td>
      <td>0.0891</td>
      <td>0.0232</td>
      <td>0.0267</td>
      <td>0.0267</td>
      <td>0.0227</td>
      <td>0.0347</td>
      <td>0.0354</td>
    </tr>
    <tr>
      <th>(0.15, 0.85)</th>
      <td>0.0464</td>
      <td>0.0853</td>
      <td>0.0226</td>
      <td>0.0257</td>
      <td>0.0257</td>
      <td>0.0222</td>
      <td>0.0315</td>
      <td>0.0341</td>
    </tr>
    <tr>
      <th>(0.2, 0.8)</th>
      <td>0.0414</td>
      <td>0.0757</td>
      <td>0.0202</td>
      <td>0.0249</td>
      <td>0.0249</td>
      <td>0.0200</td>
      <td>0.0280</td>
      <td>0.0302</td>
    </tr>
    <tr>
      <th>(0.25, 0.75)</th>
      <td>0.0468</td>
      <td>0.0768</td>
      <td>0.0204</td>
      <td>0.0250</td>
      <td>0.0250</td>
      <td>0.0201</td>
      <td>0.0335</td>
      <td>0.0376</td>
    </tr>
    <tr>
      <th>(0.3, 0.7)</th>
      <td>0.0384</td>
      <td>0.0739</td>
      <td>0.0201</td>
      <td>0.0252</td>
      <td>0.0252</td>
      <td>0.0200</td>
      <td>0.0349</td>
      <td>0.0410</td>
    </tr>
    <tr>
      <th>(0.35, 0.65)</th>
      <td>0.0386</td>
      <td>0.0715</td>
      <td>0.0198</td>
      <td>0.0239</td>
      <td>0.0239</td>
      <td>0.0196</td>
      <td>0.0376</td>
      <td>0.0448</td>
    </tr>
    <tr>
      <th>(0.4, 0.6)</th>
      <td>0.0392</td>
      <td>0.0657</td>
      <td>0.0199</td>
      <td>0.0249</td>
      <td>0.0249</td>
      <td>0.0197</td>
      <td>0.0315</td>
      <td>0.0391</td>
    </tr>
    <tr>
      <th>(0.45, 0.55)</th>
      <td>0.0380</td>
      <td>0.0679</td>
      <td>0.0213</td>
      <td>0.0258</td>
      <td>0.0258</td>
      <td>0.0212</td>
      <td>0.0358</td>
      <td>0.0450</td>
    </tr>
    <tr>
      <th>(0.5, 0.5)</th>
      <td>0.0400</td>
      <td>0.0670</td>
      <td>0.0218</td>
      <td>0.0228</td>
      <td>0.0228</td>
      <td>0.0217</td>
      <td>0.0441</td>
      <td>0.0550</td>
    </tr>
    <tr>
      <th>(0.55, 0.45)</th>
      <td>0.0403</td>
      <td>0.0686</td>
      <td>0.0203</td>
      <td>0.0237</td>
      <td>0.0237</td>
      <td>0.0200</td>
      <td>0.0398</td>
      <td>0.0507</td>
    </tr>
    <tr>
      <th>(0.6, 0.4)</th>
      <td>0.0432</td>
      <td>0.0625</td>
      <td>0.0201</td>
      <td>0.0245</td>
      <td>0.0245</td>
      <td>0.0200</td>
      <td>0.0370</td>
      <td>0.0487</td>
    </tr>
    <tr>
      <th>(0.65, 0.35)</th>
      <td>0.0384</td>
      <td>0.0620</td>
      <td>0.0195</td>
      <td>0.0236</td>
      <td>0.0236</td>
      <td>0.0195</td>
      <td>0.0356</td>
      <td>0.0460</td>
    </tr>
    <tr>
      <th>(0.7, 0.3)</th>
      <td>0.0304</td>
      <td>0.0570</td>
      <td>0.0236</td>
      <td>0.0227</td>
      <td>0.0227</td>
      <td>0.0236</td>
      <td>0.0302</td>
      <td>0.0396</td>
    </tr>
    <tr>
      <th>(0.75, 0.25)</th>
      <td>0.0321</td>
      <td>0.0614</td>
      <td>0.0187</td>
      <td>0.0273</td>
      <td>0.0273</td>
      <td>0.0187</td>
      <td>0.0332</td>
      <td>0.0439</td>
    </tr>
    <tr>
      <th>(0.8, 0.2)</th>
      <td>0.0300</td>
      <td>0.0555</td>
      <td>0.0221</td>
      <td>0.0230</td>
      <td>0.0230</td>
      <td>0.0222</td>
      <td>0.0287</td>
      <td>0.0340</td>
    </tr>
    <tr>
      <th>(0.85, 0.15)</th>
      <td>0.0325</td>
      <td>0.0540</td>
      <td>0.0224</td>
      <td>0.0229</td>
      <td>0.0229</td>
      <td>0.0225</td>
      <td>0.0342</td>
      <td>0.0360</td>
    </tr>
    <tr>
      <th>(0.9, 0.1)</th>
      <td>0.0262</td>
      <td>0.0518</td>
      <td>0.0211</td>
      <td>0.0238</td>
      <td>0.0238</td>
      <td>0.0211</td>
      <td>0.0483</td>
      <td>0.0469</td>
    </tr>
    <tr>
      <th>(0.95, 0.05)</th>
      <td>0.0243</td>
      <td>0.0576</td>
      <td>0.0197</td>
      <td>0.0240</td>
      <td>0.0240</td>
      <td>0.0196</td>
      <td>0.0806</td>
      <td>0.0746</td>
    </tr>
    <tr>
      <th>(1.0, 0.0)</th>
      <td>0.0146</td>
      <td>0.0597</td>
      <td>0.0231</td>
      <td>0.0244</td>
      <td>0.0244</td>
      <td>0.0232</td>
      <td>0.1600</td>
      <td>0.1515</td>
    </tr>
  </tbody>
</table>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>bin</th>
      <th>mul</th>
      <th>kfcv</th>
      <th>atc_mc</th>
      <th>atc_ne</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>(0.0, 1.0)</th>
      <td>0.0239</td>
      <td>0.0477</td>
      <td>0.0345</td>
      <td>0.0162</td>
      <td>0.0162</td>
    </tr>
    <tr>
      <th>(0.05, 0.95)</th>
      <td>0.0235</td>
      <td>0.0496</td>
      <td>0.0320</td>
      <td>0.0169</td>
      <td>0.0169</td>
    </tr>
    <tr>
      <th>(0.1, 0.9)</th>
      <td>0.0230</td>
      <td>0.0520</td>
      <td>0.0289</td>
      <td>0.0171</td>
      <td>0.0171</td>
    </tr>
    <tr>
      <th>(0.15, 0.85)</th>
      <td>0.0308</td>
      <td>0.0528</td>
      <td>0.0274</td>
      <td>0.0171</td>
      <td>0.0171</td>
    </tr>
    <tr>
      <th>(0.2, 0.8)</th>
      <td>0.0286</td>
      <td>0.0490</td>
      <td>0.0291</td>
      <td>0.0186</td>
      <td>0.0186</td>
    </tr>
    <tr>
      <th>(0.25, 0.75)</th>
      <td>0.0346</td>
      <td>0.0534</td>
      <td>0.0255</td>
      <td>0.0186</td>
      <td>0.0186</td>
    </tr>
    <tr>
      <th>(0.3, 0.7)</th>
      <td>0.0299</td>
      <td>0.0545</td>
      <td>0.0232</td>
      <td>0.0205</td>
      <td>0.0205</td>
    </tr>
    <tr>
      <th>(0.35, 0.65)</th>
      <td>0.0335</td>
      <td>0.0566</td>
      <td>0.0217</td>
      <td>0.0211</td>
      <td>0.0211</td>
    </tr>
    <tr>
      <th>(0.4, 0.6)</th>
      <td>0.0360</td>
      <td>0.0562</td>
      <td>0.0217</td>
      <td>0.0226</td>
      <td>0.0226</td>
    </tr>
    <tr>
      <th>(0.45, 0.55)</th>
      <td>0.0372</td>
      <td>0.0626</td>
      <td>0.0213</td>
      <td>0.0246</td>
      <td>0.0246</td>
    </tr>
    <tr>
      <th>(0.5, 0.5)</th>
      <td>0.0437</td>
      <td>0.0677</td>
      <td>0.0223</td>
      <td>0.0241</td>
      <td>0.0241</td>
    </tr>
    <tr>
      <th>(0.55, 0.45)</th>
      <td>0.0486</td>
      <td>0.0762</td>
      <td>0.0241</td>
      <td>0.0269</td>
      <td>0.0269</td>
    </tr>
    <tr>
      <th>(0.6, 0.4)</th>
      <td>0.0572</td>
      <td>0.0779</td>
      <td>0.0290</td>
      <td>0.0312</td>
      <td>0.0312</td>
    </tr>
    <tr>
      <th>(0.65, 0.35)</th>
      <td>0.0580</td>
      <td>0.0866</td>
      <td>0.0340</td>
      <td>0.0341</td>
      <td>0.0341</td>
    </tr>
    <tr>
      <th>(0.7, 0.3)</th>
      <td>0.0546</td>
      <td>0.0919</td>
      <td>0.0420</td>
      <td>0.0374</td>
      <td>0.0374</td>
    </tr>
    <tr>
      <th>(0.75, 0.25)</th>
      <td>0.0636</td>
      <td>0.1161</td>
      <td>0.0689</td>
      <td>0.0533</td>
      <td>0.0533</td>
    </tr>
    <tr>
      <th>(0.8, 0.2)</th>
      <td>0.0750</td>
      <td>0.1192</td>
      <td>0.0768</td>
      <td>0.0560</td>
      <td>0.0560</td>
    </tr>
    <tr>
      <th>(0.85, 0.15)</th>
      <td>0.1031</td>
      <td>0.1580</td>
      <td>0.1244</td>
      <td>0.0728</td>
      <td>0.0728</td>
    </tr>
    <tr>
      <th>(0.9, 0.1)</th>
      <td>0.1175</td>
      <td>0.2412</td>
      <td>0.1885</td>
      <td>0.1100</td>
      <td>0.1100</td>
    </tr>
    <tr>
      <th>(0.95, 0.05)</th>
      <td>0.1877</td>
      <td>0.3434</td>
      <td>0.3579</td>
      <td>0.2053</td>
      <td>0.2053</td>
    </tr>
    <tr>
      <th>(1.0, 0.0)</th>
      <td>0.2717</td>
      <td>0.3136</td>
      <td>0.9178</td>
      <td>0.6264</td>
      <td>0.6264</td>
    </tr>
  </tbody>
</table>

poetry.lock (generated): file diff suppressed because it is too large.


@@ -1,40 +1,40 @@
[tool.poetry]
name = "quacc"
version = "0.1.0"
description = ""
authors = ["Lorenzo Volpi <lorenzo.volpi@outlook.com>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.11"
quapy = "^0.1.7"
pandas = "^2.0.3"
jinja2 = "^3.1.2"
pyyaml = "^6.0.1"
logging = "^0.4.9.6"

[tool.poetry.scripts]
main = "quacc.main:main"
comp = "quacc.main:estimate_comparison"
tohost = "scp_sync:scp_sync_to_host"

[tool.poetry.group.dev.dependencies]
pytest = "^7.4.0"
pylance = "^0.5.9"
pytest-mock = "^3.11.1"
pytest-cov = "^4.1.0"
win11toast = "^0.32"
tabulate = "^0.9.0"
paramiko = "^3.3.1"

[tool.pytest.ini_options]
addopts = "--cov=quacc --capture=tee-sys"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[virtualenvs]
in-project = true
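
# Note (added): the [tool.poetry.scripts] table installs console entry points,
# so after `poetry install` the experiments can be run as `poetry run main` or
# `poetry run comp`, which invoke quacc.main:main and
# quacc.main:estimate_comparison respectively.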

quacc.log: file diff suppressed because it is too large.


@@ -1,150 +1,150 @@
import math
from typing import List, Optional

import numpy as np
import scipy.sparse as sp
from quapy.data import LabelledCollection

# Extended classes
#
# 0 ~ True 0
# 1 ~ False 1
# 2 ~ False 0
# 3 ~ True 1
#      _____________________
#     |          |          |
#     |  True 0  |  False 1 |
#     |__________|__________|
#     |          |          |
#     |  False 0 |  True 1  |
#     |__________|__________|
#


class ExClassManager:
    @staticmethod
    def get_ex(n_classes: int, true_class: int, pred_class: int) -> int:
        return true_class * n_classes + pred_class

    @staticmethod
    def get_pred(n_classes: int, ex_class: int) -> int:
        return ex_class % n_classes

    @staticmethod
    def get_true(n_classes: int, ex_class: int) -> int:
        return ex_class // n_classes
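
# Worked example (added): with n_classes=2, a sample with true class 1 and
# predicted class 0 gets the extended label get_ex(2, 1, 0) == 1 * 2 + 0 == 2
# ("False 0" in the diagram above); the pair is recovered with
# get_true(2, 2) == 1 and get_pred(2, 2) == 0.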
class ExtendedCollection(LabelledCollection):
    def __init__(
        self,
        instances: np.ndarray | sp.csr_matrix,
        labels: np.ndarray,
        classes: Optional[List] = None,
    ):
        super().__init__(instances, labels, classes=classes)

    def split_by_pred(self):
        _ncl = int(math.sqrt(self.n_classes))
        _indexes = ExtendedCollection._split_index_by_pred(_ncl, self.instances)
        if isinstance(self.instances, np.ndarray):
            _instances = [
                self.instances[ind] if ind.shape[0] > 0 else np.asarray([], dtype=int)
                for ind in _indexes
            ]
        elif isinstance(self.instances, sp.csr_matrix):
            _instances = [
                self.instances[ind]
                if ind.shape[0] > 0
                else sp.csr_matrix(np.empty((0, 0), dtype=int))
                for ind in _indexes
            ]
        _labels = [
            np.asarray(
                [
                    ExClassManager.get_true(_ncl, lbl)
                    for lbl in (self.labels[ind] if len(ind) > 0 else [])
                ],
                dtype=int,
            )
            for ind in _indexes
        ]
        return [
            ExtendedCollection(inst, lbl, classes=range(0, _ncl))
            for (inst, lbl) in zip(_instances, _labels)
        ]

    @classmethod
    def split_inst_by_pred(
        cls, n_classes: int, instances: np.ndarray | sp.csr_matrix
    ) -> tuple[List[np.ndarray | sp.csr_matrix], List[float]]:
        _indexes = cls._split_index_by_pred(n_classes, instances)
        if isinstance(instances, np.ndarray):
            _instances = [
                instances[ind] if ind.shape[0] > 0 else np.asarray([], dtype=int)
                for ind in _indexes
            ]
        elif isinstance(instances, sp.csr_matrix):
            _instances = [
                instances[ind]
                if ind.shape[0] > 0
                else sp.csr_matrix(np.empty((0, 0), dtype=int))
                for ind in _indexes
            ]
        norms = [inst.shape[0] / instances.shape[0] for inst in _instances]
        return _instances, norms

    @classmethod
    def _split_index_by_pred(
        cls, n_classes: int, instances: np.ndarray | sp.csr_matrix
    ) -> List[np.ndarray]:
        if isinstance(instances, np.ndarray):
            _pred_label = [np.argmax(inst[-n_classes:], axis=0) for inst in instances]
        elif isinstance(instances, sp.csr_matrix):
            _pred_label = [
                np.argmax(inst[:, -n_classes:].toarray().flatten(), axis=0)
                for inst in instances
            ]
        else:
            raise ValueError("Unsupported matrix format")

        return [
            np.asarray([j for (j, x) in enumerate(_pred_label) if x == i], dtype=int)
            for i in range(0, n_classes)
        ]

    @classmethod
    def extend_instances(
        cls, instances: np.ndarray | sp.csr_matrix, pred_proba: np.ndarray
    ) -> np.ndarray | sp.csr_matrix:
        if isinstance(instances, sp.csr_matrix):
            _pred_proba = sp.csr_matrix(pred_proba)
            n_x = sp.hstack([instances, _pred_proba])
        elif isinstance(instances, np.ndarray):
            n_x = np.concatenate((instances, pred_proba), axis=1)
        else:
            raise ValueError("Unsupported matrix format")

        return n_x

    @classmethod
    def extend_collection(
        cls,
        base: LabelledCollection,
        pred_proba: np.ndarray,
    ):
        n_classes = base.n_classes

        # n_X = [ X | predicted probs. ]
        n_x = cls.extend_instances(base.X, pred_proba)

        # n_y = (expected y, predicted y)
        pred_proba = pred_proba[:, -n_classes:]
        preds = np.argmax(pred_proba, axis=-1)
        n_y = np.asarray(
            [
                ExClassManager.get_ex(n_classes, true_class, pred_class)
                for (true_class, pred_class) in zip(base.y, preds)
            ]
        )

        return ExtendedCollection(n_x, n_y, classes=[*range(0, n_classes * n_classes)])
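
# Usage sketch (added; `clf` and `val` are illustrative names): pair each
# instance with its posteriors and re-label it with the (true, predicted) index.
#
#     pred_proba = clf.predict_proba(val.X)   # val: LabelledCollection
#     ext = ExtendedCollection.extend_collection(val, pred_proba)
#     per_pred = ext.split_by_pred()           # one collection per predicted class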


@@ -1,171 +1,171 @@
import math import math
from typing import List from typing import List
import numpy as np import numpy as np
import quapy as qp import quapy as qp
from quapy.data.base import LabelledCollection from quapy.data.base import LabelledCollection
from sklearn.conftest import fetch_rcv1 from sklearn.conftest import fetch_rcv1
TRAIN_VAL_PROP = 0.5 TRAIN_VAL_PROP = 0.5
class DatasetSample: class DatasetSample:
def __init__( def __init__(
self, self,
train: LabelledCollection, train: LabelledCollection,
validation: LabelledCollection, validation: LabelledCollection,
test: LabelledCollection, test: LabelledCollection,
): ):
self.train = train self.train = train
self.validation = validation self.validation = validation
self.test = test self.test = test
@property @property
def train_prev(self): def train_prev(self):
return self.train.prevalence() return self.train.prevalence()
@property @property
def validation_prev(self): def validation_prev(self):
return self.validation.prevalence() return self.validation.prevalence()
@property @property
def prevs(self): def prevs(self):
return {"train": self.train_prev, "validation": self.validation_prev} return {"train": self.train_prev, "validation": self.validation_prev}
class Dataset: class Dataset:
def __init__(self, name, n_prevalences=9, prevs=None, target=None): def __init__(self, name, n_prevalences=9, prevs=None, target=None):
self._name = name self._name = name
self._target = target self._target = target
self.prevs = None self.prevs = None
self.n_prevs = n_prevalences self.n_prevs = n_prevalences
if prevs is not None: if prevs is not None:
prevs = np.unique([p for p in prevs if p > 0.0 and p < 1.0]) prevs = np.unique([p for p in prevs if p > 0.0 and p < 1.0])
if prevs.shape[0] > 0: if prevs.shape[0] > 0:
self.prevs = np.sort(prevs) self.prevs = np.sort(prevs)
self.n_prevs = self.prevs.shape[0] self.n_prevs = self.prevs.shape[0]
def __spambase(self): def __spambase(self):
return qp.datasets.fetch_UCIDataset("spambase", verbose=False).train_test return qp.datasets.fetch_UCIDataset("spambase", verbose=False).train_test
    # try min_df=5
    def __imdb(self):
        return qp.datasets.fetch_reviews("imdb", tfidf=True, min_df=3).train_test

    def __rcv1(self):
        n_train = 23149
        available_targets = ["CCAT", "GCAT", "MCAT"]

        if self._target is None or self._target not in available_targets:
            raise ValueError(f"Invalid target {self._target}")

        dataset = fetch_rcv1()
        target_index = np.where(dataset.target_names == self._target)[0]
        all_train_d = dataset.data[:n_train, :]
        test_d = dataset.data[n_train:, :]
        labels = dataset.target[:, target_index].toarray().flatten()
        all_train_l, test_l = labels[:n_train], labels[n_train:]
        all_train = LabelledCollection(all_train_d, all_train_l, classes=[0, 1])
        test = LabelledCollection(test_d, test_l, classes=[0, 1])

        return all_train, test

    def get_raw(self) -> DatasetSample:
        all_train, test = {
            "spambase": self.__spambase,
            "imdb": self.__imdb,
            "rcv1": self.__rcv1,
        }[self._name]()

        train, val = all_train.split_stratified(
            train_prop=TRAIN_VAL_PROP, random_state=0
        )

        return DatasetSample(train, val, test)

    def get(self) -> List[DatasetSample]:
        (all_train, test) = {
            "spambase": self.__spambase,
            "imdb": self.__imdb,
            "rcv1": self.__rcv1,
        }[self._name]()

        # resample all_train set to have (0.5, 0.5) prevalence
        at_positives = np.sum(all_train.y)
        all_train = all_train.sampling(
            min(at_positives, len(all_train) - at_positives) * 2, 0.5, random_state=0
        )

        # sample prevalences
        if self.prevs is not None:
            prevs = self.prevs
        else:
            prevs = np.linspace(0.0, 1.0, num=self.n_prevs + 1, endpoint=False)[1:]

        at_size = min(math.floor(len(all_train) * 0.5 / p) for p in prevs)

        datasets = []
        for p in 1.0 - prevs:
            all_train_sampled = all_train.sampling(at_size, p, random_state=0)
            train, validation = all_train_sampled.split_stratified(
                train_prop=TRAIN_VAL_PROP, random_state=0
            )
            datasets.append(DatasetSample(train, validation, test))

        return datasets

    def __call__(self):
        return self.get()

    @property
    def name(self):
        return (
            f"{self._name}_{self._target}_{self.n_prevs}prevs"
            if self._name == "rcv1"
            else f"{self._name}_{self.n_prevs}prevs"
        )


# >>> fetch_rcv1().target_names
# array(['C11', 'C12', 'C13', 'C14', 'C15', 'C151', 'C1511', 'C152', 'C16',
#        'C17', 'C171', 'C172', 'C173', 'C174', 'C18', 'C181', 'C182',
#        'C183', 'C21', 'C22', 'C23', 'C24', 'C31', 'C311', 'C312', 'C313',
#        'C32', 'C33', 'C331', 'C34', 'C41', 'C411', 'C42', 'CCAT', 'E11',
#        'E12', 'E121', 'E13', 'E131', 'E132', 'E14', 'E141', 'E142',
#        'E143', 'E21', 'E211', 'E212', 'E31', 'E311', 'E312', 'E313',
#        'E41', 'E411', 'E51', 'E511', 'E512', 'E513', 'E61', 'E71', 'ECAT',
#        'G15', 'G151', 'G152', 'G153', 'G154', 'G155', 'G156', 'G157',
#        'G158', 'G159', 'GCAT', 'GCRIM', 'GDEF', 'GDIP', 'GDIS', 'GENT',
#        'GENV', 'GFAS', 'GHEA', 'GJOB', 'GMIL', 'GOBIT', 'GODD', 'GPOL',
#        'GPRO', 'GREL', 'GSCI', 'GSPO', 'GTOUR', 'GVIO', 'GVOTE', 'GWEA',
#        'GWELF', 'M11', 'M12', 'M13', 'M131', 'M132', 'M14', 'M141',
#        'M142', 'M143', 'MCAT'], dtype=object)


def rcv1_info():
    dataset = fetch_rcv1()
    n_train = 23149

    targets = []
    for target in range(103):
        train_t_prev = np.average(dataset.target[:n_train, target].toarray().flatten())
        test_t_prev = np.average(dataset.target[n_train:, target].toarray().flatten())
        targets.append(
            (
                dataset.target_names[target],
                {
                    "train": (1.0 - train_t_prev, train_t_prev),
                    "test": (1.0 - test_t_prev, test_t_prev),
                },
            )
        )

    targets.sort(key=lambda t: t[1]["train"][1])
    for n, d in targets:
        print(f"{n}:")
        for k, (fp, tp) in d.items():
            print(f"\t{k}: {fp:.4f}, {tp:.4f}")


if __name__ == "__main__":
    rcv1_info()
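A usage sketch for the class above (the parameter values are illustrative): get() yields one DatasetSample per training prevalence, each carrying a train/validation/test triple.

    # Illustrative only
    for d in Dataset("spambase", n_prevalences=9).get():
        print(d.prevs)  # {"train": ..., "validation": ...}

    # rcv1 requires a target; explicit prevalences override n_prevalences
    rcv1_ccat = Dataset("rcv1", target="CCAT", prevs=[0.2, 0.5, 0.8])()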

View File

@ -1,118 +1,118 @@
import collections as C
import copy
from typing import Any

import yaml


class environ:
    _instance = None

    _default_env = {
        "DATASET_NAME": None,
        "DATASET_TARGET": None,
        "METRICS": [],
        "COMP_ESTIMATORS": [],
        "DATASET_N_PREVS": 9,
        "DATASET_PREVS": None,
        "OUT_DIR_NAME": "output",
        "OUT_DIR": None,
        "PLOT_DIR_NAME": "plot",
        "PLOT_OUT_DIR": None,
        "DATASET_DIR_UPDATE": False,
        "PROTOCOL_N_PREVS": 21,
        "PROTOCOL_REPEATS": 100,
        "SAMPLE_SIZE": 1000,
        "PLOT_ESTIMATORS": [],
        "PLOT_STDEV": False,
    }

    _keys = list(_default_env.keys())

    def __init__(self):
        self.exec = []
        self.confs = []
        self.load_conf()
        self._stack = C.deque([self.__getdict()])

    def __setdict(self, d):
        for k, v in d.items():
            super().__setattr__(k, v)

    def __getdict(self):
        return {k: self.__getattribute__(k) for k in environ._keys}

    def __setattr__(self, __name: str, __value: Any) -> None:
        if __name in environ._keys:
            self._stack[-1][__name] = __value
        super().__setattr__(__name, __value)

    def load_conf(self):
        self.__setdict(environ._default_env)

        with open("conf.yaml", "r") as f:
            confs = yaml.safe_load(f)["exec"]

        _global = confs["global"]
        _estimators = set()
        for pc in confs["plot_confs"].values():
            _estimators = _estimators.union(set(pc["PLOT_ESTIMATORS"]))
        _global["COMP_ESTIMATORS"] = list(_estimators)

        self.__setdict(_global)
        self.confs = confs["confs"]
        self.plot_confs = confs["plot_confs"]

    def get_confs(self):
        self._stack.append(None)
        for _conf in self.confs:
            self._stack.pop()
            self.__setdict(self._stack[-1])
            self.__setdict(_conf)
            self._stack.append(self.__getdict())

            yield copy.deepcopy(self._stack[-1])

        self._stack.pop()

    def get_plot_confs(self):
        self._stack.append(None)
        for k, pc in self.plot_confs.items():
            self._stack.pop()
            self.__setdict(self._stack[-1])
            self.__setdict(pc)
            self._stack.append(self.__getdict())

            name = self.DATASET_NAME
            if self.DATASET_TARGET is not None:
                name += f"_{self.DATASET_TARGET}"
            name += f"_{k}"
            yield name

        self._stack.pop()

    @property
    def current(self):
        return copy.deepcopy(self.__getdict())


env = environ()

if __name__ == "__main__":
    stack = C.deque()
    stack.append(-1)

    def __gen(stack: C.deque):
        stack.append(None)
        for i in range(5):
            stack.pop()
            stack.append(i)
            yield stack[-1]
        stack.pop()

    print(stack)
    for i in __gen(stack):
        print(stack, i)
    print(stack)
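For reference, load_conf expects yaml.safe_load(f)["exec"] to yield a mapping shaped roughly like the sketch below. The concrete values and estimator names are invented for illustration; only the global/confs/plot_confs nesting is implied by the code.

    # Hypothetical result of yaml.safe_load(f)["exec"]
    {
        "global": {"DATASET_NAME": "spambase", "SAMPLE_SIZE": 1000},
        "confs": [{"DATASET_N_PREVS": 9}, {"DATASET_NAME": "imdb"}],
        "plot_confs": {
            "all_sld": {"PLOT_ESTIMATORS": ["bin_sld", "mul_sld", "ref"]},
        },
    }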

View File

@ -1,55 +1,55 @@
import numpy as np


def from_name(err_name):
    assert err_name in ERROR_NAMES, f"unknown error {err_name}"
    callable_error = globals()[err_name]
    return callable_error


# def f1(prev):
#     # https://github.com/dice-group/gerbil/wiki/Precision,-Recall-and-F1-measure
#     if prev[0] == 0 and prev[1] == 0 and prev[2] == 0:
#         return 1.0
#     elif prev[0] == 0 and prev[1] > 0 and prev[2] == 0:
#         return 0.0
#     elif prev[0] == 0 and prev[1] == 0 and prev[2] > 0:
#         return float('NaN')
#     else:
#         recall = prev[0] / (prev[0] + prev[1])
#         precision = prev[0] / (prev[0] + prev[2])
#         return 2 * (precision * recall) / (precision + recall)


def f1(prev):
    den = (2 * prev[3]) + prev[1] + prev[2]
    if den == 0:
        return 0.0
    else:
        return (2 * prev[3]) / den


def f1e(prev):
    return 1 - f1(prev)


def acc(prev: np.ndarray) -> float:
    return (prev[0] + prev[3]) / np.sum(prev)


def accd(true_prevs: np.ndarray, estim_prevs: np.ndarray) -> np.ndarray:
    vacc = np.vectorize(acc, signature="(m)->()")
    a_tp = vacc(true_prevs)
    a_ep = vacc(estim_prevs)
    return np.abs(a_tp - a_ep)


def maccd(true_prevs: np.ndarray, estim_prevs: np.ndarray) -> float:
    return accd(true_prevs, estim_prevs).mean()


ACCURACY_ERROR = {maccd}
ACCURACY_ERROR_SINGLE = {accd}
ACCURACY_ERROR_NAMES = {func.__name__ for func in ACCURACY_ERROR}
ACCURACY_ERROR_SINGLE_NAMES = {func.__name__ for func in ACCURACY_ERROR_SINGLE}
ERROR_NAMES = ACCURACY_ERROR_NAMES | ACCURACY_ERROR_SINGLE_NAMES
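A quick worked example, assuming the four-slot prevalence vector follows the extended-class order (true, pred) flattened row-wise, i.e. [TN, FP, FN, TP] in the binary case:

    prev = np.array([0.50, 0.10, 0.15, 0.25])  # assumed order [TN, FP, FN, TP]
    acc(prev)  # (0.50 + 0.25) / 1.0 = 0.75
    f1(prev)   # 2*0.25 / (2*0.25 + 0.10 + 0.15) = 0.5 / 0.75 = 0.667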

View File

@ -1,34 +1,34 @@
from typing import Callable, Union

import numpy as np
from quapy.protocol import AbstractProtocol, OnLabelledCollectionProtocol

import quacc as qc

from ..method.base import BaseAccuracyEstimator


def evaluate(
    estimator: BaseAccuracyEstimator,
    protocol: AbstractProtocol,
    error_metric: Union[Callable, str],
) -> float:
    if isinstance(error_metric, str):
        error_metric = qc.error.from_name(error_metric)

    collator_bck_ = protocol.collator
    protocol.collator = OnLabelledCollectionProtocol.get_collator("labelled_collection")

    estim_prevs, true_prevs = [], []
    for sample in protocol():
        e_sample = estimator.extend(sample)
        estim_prev = estimator.estimate(e_sample.X, ext=True)
        estim_prevs.append(estim_prev)
        true_prevs.append(e_sample.prevalence())

    protocol.collator = collator_bck_

    true_prevs = np.array(true_prevs)
    estim_prevs = np.array(estim_prevs)

    return error_metric(true_prevs, estim_prevs)
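Putting it together, a hedged sketch (the estimator and protocol are assumed to exist already; "maccd" is one of the names registered in the error module):

    # `est` is a fitted BaseAccuracyEstimator, `prot` a QuaPy protocol
    score = evaluate(estimator=est, protocol=prot, error_metric="maccd")
    print(f"mean absolute accuracy-estimation error: {score:.4f}")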

View File

@ -1,299 +1,299 @@
from functools import wraps
from statistics import mean

import numpy as np
import sklearn.metrics as metrics
from quapy.data import LabelledCollection
from quapy.protocol import AbstractStochasticSeededProtocol
from scipy.sparse import issparse
from sklearn.base import BaseEstimator
from sklearn.model_selection import cross_validate

import baselines.atc as atc
import baselines.doc as doc
import baselines.impweight as iw
import baselines.rca as rcalib

from .report import EvaluationReport

_baselines = {}


def baseline(func):
    @wraps(func)
    def wrapper(c_model, validation, protocol):
        return func(c_model, validation, protocol)

    _baselines[func.__name__] = wrapper
    return wrapper


@baseline
def kfcv(
    c_model: BaseEstimator,
    validation: LabelledCollection,
    protocol: AbstractStochasticSeededProtocol,
    predict_method="predict",
):
    c_model_predict = getattr(c_model, predict_method)

    scoring = ["accuracy", "f1_macro"]
    scores = cross_validate(c_model, validation.X, validation.y, scoring=scoring)
    acc_score = mean(scores["test_accuracy"])
    f1_score = mean(scores["test_f1_macro"])

    report = EvaluationReport(name="kfcv")
    for test in protocol():
        test_preds = c_model_predict(test.X)
        meta_acc = abs(acc_score - metrics.accuracy_score(test.y, test_preds))
        meta_f1 = abs(f1_score - metrics.f1_score(test.y, test_preds))
        report.append_row(
            test.prevalence(),
            acc_score=acc_score,
            f1_score=f1_score,
            acc=meta_acc,
            f1=meta_f1,
        )

    return report


@baseline
def ref(
    c_model: BaseEstimator,
    validation: LabelledCollection,
    protocol: AbstractStochasticSeededProtocol,
):
    c_model_predict = getattr(c_model, "predict")

    report = EvaluationReport(name="ref")
    for test in protocol():
        test_preds = c_model_predict(test.X)
        report.append_row(
            test.prevalence(),
            acc_score=metrics.accuracy_score(test.y, test_preds),
            f1_score=metrics.f1_score(test.y, test_preds),
        )

    return report


@baseline
def atc_mc(
    c_model: BaseEstimator,
    validation: LabelledCollection,
    protocol: AbstractStochasticSeededProtocol,
    predict_method="predict_proba",
):
    """garg"""
    c_model_predict = getattr(c_model, predict_method)

    ## Load ID validation data probs and labels
    val_probs, val_labels = c_model_predict(validation.X), validation.y

    ## score function, e.g., negative entropy or argmax confidence
    val_scores = atc.get_max_conf(val_probs)
    val_preds = np.argmax(val_probs, axis=-1)
    _, atc_thres = atc.find_ATC_threshold(val_scores, val_labels == val_preds)

    report = EvaluationReport(name="atc_mc")
    for test in protocol():
        ## Load OOD test data probs
        test_probs = c_model_predict(test.X)
        test_preds = np.argmax(test_probs, axis=-1)
        test_scores = atc.get_max_conf(test_probs)
        atc_accuracy = atc.get_ATC_acc(atc_thres, test_scores)
        meta_acc = abs(atc_accuracy - metrics.accuracy_score(test.y, test_preds))
        f1_score = atc.get_ATC_f1(atc_thres, test_scores, test_probs)
        meta_f1 = abs(f1_score - metrics.f1_score(test.y, test_preds))
        report.append_row(
            test.prevalence(),
            acc=meta_acc,
            acc_score=atc_accuracy,
            f1_score=f1_score,
            f1=meta_f1,
        )

    return report


@baseline
def atc_ne(
    c_model: BaseEstimator,
    validation: LabelledCollection,
    protocol: AbstractStochasticSeededProtocol,
    predict_method="predict_proba",
):
    """garg"""
    c_model_predict = getattr(c_model, predict_method)

    ## Load ID validation data probs and labels
    val_probs, val_labels = c_model_predict(validation.X), validation.y

    ## score function, e.g., negative entropy or argmax confidence
    val_scores = atc.get_entropy(val_probs)
    val_preds = np.argmax(val_probs, axis=-1)
    _, atc_thres = atc.find_ATC_threshold(val_scores, val_labels == val_preds)

    report = EvaluationReport(name="atc_ne")
    for test in protocol():
        ## Load OOD test data probs
        test_probs = c_model_predict(test.X)
        test_preds = np.argmax(test_probs, axis=-1)
        test_scores = atc.get_entropy(test_probs)
        atc_accuracy = atc.get_ATC_acc(atc_thres, test_scores)
        meta_acc = abs(atc_accuracy - metrics.accuracy_score(test.y, test_preds))
        f1_score = atc.get_ATC_f1(atc_thres, test_scores, test_probs)
        meta_f1 = abs(f1_score - metrics.f1_score(test.y, test_preds))
        report.append_row(
            test.prevalence(),
            acc=meta_acc,
            acc_score=atc_accuracy,
            f1_score=f1_score,
            f1=meta_f1,
        )

    return report


@baseline
def doc_feat(
    c_model: BaseEstimator,
    validation: LabelledCollection,
    protocol: AbstractStochasticSeededProtocol,
    predict_method="predict_proba",
):
    c_model_predict = getattr(c_model, predict_method)

    val_probs, val_labels = c_model_predict(validation.X), validation.y
    val_scores = np.max(val_probs, axis=-1)
    val_preds = np.argmax(val_probs, axis=-1)
    v1acc = np.mean(val_preds == val_labels) * 100

    report = EvaluationReport(name="doc_feat")
    for test in protocol():
        test_probs = c_model_predict(test.X)
        test_preds = np.argmax(test_probs, axis=-1)
        test_scores = np.max(test_probs, axis=-1)
        score = (v1acc + doc.get_doc(val_scores, test_scores)) / 100.0
        meta_acc = abs(score - metrics.accuracy_score(test.y, test_preds))
        report.append_row(test.prevalence(), acc=meta_acc, acc_score=score)

    return report


@baseline
def rca(
    c_model: BaseEstimator,
    validation: LabelledCollection,
    protocol: AbstractStochasticSeededProtocol,
    predict_method="predict",
):
    """elsahar19"""
    c_model_predict = getattr(c_model, predict_method)
    val_pred1 = c_model_predict(validation.X)

    report = EvaluationReport(name="rca")
    for test in protocol():
        try:
            test_pred = c_model_predict(test.X)
            c_model2 = rcalib.clone_fit(c_model, test.X, test_pred)
            c_model2_predict = getattr(c_model2, predict_method)
            val_pred2 = c_model2_predict(validation.X)
            rca_score = 1.0 - rcalib.get_score(val_pred1, val_pred2, validation.y)
            meta_score = abs(rca_score - metrics.accuracy_score(test.y, test_pred))
            report.append_row(test.prevalence(), acc=meta_score, acc_score=rca_score)
        except ValueError:
            report.append_row(
                test.prevalence(), acc=float("nan"), acc_score=float("nan")
            )

    return report


@baseline
def rca_star(
    c_model: BaseEstimator,
    validation: LabelledCollection,
    protocol: AbstractStochasticSeededProtocol,
    predict_method="predict",
):
    """elsahar19"""
    c_model_predict = getattr(c_model, predict_method)
    validation1, validation2 = validation.split_stratified(
        train_prop=0.5, random_state=0
    )
    val1_pred = c_model_predict(validation1.X)
    c_model1 = rcalib.clone_fit(c_model, validation1.X, val1_pred)
    c_model1_predict = getattr(c_model1, predict_method)
    val2_pred1 = c_model1_predict(validation2.X)

    report = EvaluationReport(name="rca_star")
    for test in protocol():
        try:
            test_pred = c_model_predict(test.X)
            c_model2 = rcalib.clone_fit(c_model, test.X, test_pred)
            c_model2_predict = getattr(c_model2, predict_method)
            val2_pred2 = c_model2_predict(validation2.X)
            rca_star_score = 1.0 - rcalib.get_score(
                val2_pred1, val2_pred2, validation2.y
            )
            meta_score = abs(rca_star_score - metrics.accuracy_score(test.y, test_pred))
            report.append_row(
                test.prevalence(), acc=meta_score, acc_score=rca_star_score
            )
        except ValueError:
            report.append_row(
                test.prevalence(), acc=float("nan"), acc_score=float("nan")
            )

    return report


@baseline
def logreg(
    c_model: BaseEstimator,
    validation: LabelledCollection,
    protocol: AbstractStochasticSeededProtocol,
    predict_method="predict",
):
    c_model_predict = getattr(c_model, predict_method)
    val_preds = c_model_predict(validation.X)

    report = EvaluationReport(name="logreg")
    for test in protocol():
        wx = iw.logreg(validation.X, validation.y, test.X)
        test_preds = c_model_predict(test.X)
        estim_acc = iw.get_acc(val_preds, validation.y, wx)
        true_acc = metrics.accuracy_score(test.y, test_preds)
        meta_score = abs(estim_acc - true_acc)
        report.append_row(test.prevalence(), acc=meta_score, acc_score=estim_acc)

    return report


@baseline
def kdex2(
    c_model: BaseEstimator,
    validation: LabelledCollection,
    protocol: AbstractStochasticSeededProtocol,
    predict_method="predict",
):
    c_model_predict = getattr(c_model, predict_method)
    val_preds = c_model_predict(validation.X)
    log_likelihood_val = iw.kdex2_lltr(validation.X)
    Xval = validation.X.toarray() if issparse(validation.X) else validation.X

    report = EvaluationReport(name="kdex2")
    for test in protocol():
        Xte = test.X.toarray() if issparse(test.X) else test.X
        wx = iw.kdex2_weights(Xval, Xte, log_likelihood_val)
        test_preds = c_model_predict(Xte)
        estim_acc = iw.get_acc(val_preds, validation.y, wx)
        true_acc = metrics.accuracy_score(test.y, test_preds)
        meta_score = abs(estim_acc - true_acc)
        report.append_row(test.prevalence(), acc=meta_score, acc_score=estim_acc)

    return report
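Because each @baseline function is registered in _baselines under its own name, callers can dispatch by string; a minimal sketch, assuming a fitted classifier c_model, a validation LabelledCollection, and a QuaPy protocol:

    report = _baselines["atc_mc"](c_model, validation, protocol)
    # each row pairs the estimated score with its distance from the true one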

View File

@ -1,128 +1,128 @@
import multiprocessing
import time
from traceback import print_exception as traceback
from typing import List

import numpy as np
import pandas as pd
import quapy as qp

from quacc.dataset import Dataset
from quacc.environment import env
from quacc.evaluation import baseline, method
from quacc.evaluation.report import CompReport, DatasetReport, EvaluationReport
from quacc.evaluation.worker import estimate_worker
from quacc.logger import Logger

pd.set_option("display.float_format", "{:.4f}".format)
qp.environ["SAMPLE_SIZE"] = env.SAMPLE_SIZE


class CompEstimatorName_:
    def __init__(self, ce):
        self.ce = ce

    def __getitem__(self, e: str | List[str]):
        if isinstance(e, str):
            return self.ce._CompEstimator__get(e)[0]
        elif isinstance(e, list):
            return list(self.ce._CompEstimator__get(e).keys())


class CompEstimatorFunc_:
    def __init__(self, ce):
        self.ce = ce

    def __getitem__(self, e: str | List[str]):
        if isinstance(e, str):
            return self.ce._CompEstimator__get(e)[1]
        elif isinstance(e, list):
            return list(self.ce._CompEstimator__get(e).values())


class CompEstimator:
    __dict = method._methods | baseline._baselines

    def __get(cls, e: str | List[str]):
        if isinstance(e, str):
            try:
                return (e, cls.__dict[e])
            except KeyError:
                raise KeyError(f"Invalid estimator: estimator {e} does not exist")
        elif isinstance(e, list):
            _subtr = np.setdiff1d(e, list(cls.__dict.keys()))
            if len(_subtr) > 0:
                raise KeyError(
                    f"Invalid estimator: estimator {_subtr[0]} does not exist"
                )
            e_fun = {k: fun for k, fun in cls.__dict.items() if k in e}
            if "ref" not in e:
                e_fun["ref"] = cls.__dict["ref"]
            return e_fun

    @property
    def name(self):
        return CompEstimatorName_(self)

    @property
    def func(self):
        return CompEstimatorFunc_(self)


CE = CompEstimator()
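The two accessor properties let CE resolve display names and callables from strings; for a list, __get silently appends the "ref" baseline so every comparison carries a reference. A small sketch (the names must exist among the registered methods/baselines; the returned order follows registration order):

    CE.name["bin_sld"]              # -> "bin_sld"
    CE.func[["bin_sld", "atc_mc"]]  # -> list of callables, "ref" included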
def evaluate_comparison(dataset: Dataset, estimators=None) -> EvaluationReport:
    log = Logger.logger()
    # with multiprocessing.Pool(1) as pool:
    with multiprocessing.Pool(len(estimators)) as pool:
        dr = DatasetReport(dataset.name)
        log.info(f"dataset {dataset.name}")
        for d in dataset():
            log.info(
                f"Dataset sample {d.train_prev[1]:.2f} of dataset {dataset.name} started"
            )
            tstart = time.time()
            tasks = [
                (estim, d.train, d.validation, d.test) for estim in CE.func[estimators]
            ]
            results = [
                pool.apply_async(estimate_worker, t, {"_env": env, "q": Logger.queue()})
                for t in tasks
            ]

            results_got = []
            for _r in results:
                try:
                    r = _r.get()
                    if r["result"] is not None:
                        results_got.append(r)
                except Exception as e:
                    log.warning(
                        f"Dataset sample {d.train_prev[1]:.2f} of dataset {dataset.name} failed. Exception: {e}"
                    )

            tend = time.time()
            times = {r["name"]: r["time"] for r in results_got}
            times["tot"] = tend - tstart
            log.info(
                f"Dataset sample {d.train_prev[1]:.2f} of dataset {dataset.name} finished [took {times['tot']:.4f}s]"
            )
            try:
                cr = CompReport(
                    [r["result"] for r in results_got],
                    name=dataset.name,
                    train_prev=d.train_prev,
                    valid_prev=d.validation_prev,
                    times=times,
                )
            except Exception as e:
                log.warning(
                    f"Dataset sample {d.train_prev[1]:.2f} of dataset {dataset.name} failed. Exception: {e}"
                )
                traceback(e)
                cr = None
            dr += cr

        return dr
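An end-to-end sketch of how this module is meant to be driven (a plausible wiring, not taken from the repository's entry point; the env fields come from conf.yaml):

    dataset = Dataset(
        env.DATASET_NAME, target=env.DATASET_TARGET, n_prevalences=env.DATASET_N_PREVS
    )
    dr = evaluate_comparison(dataset, estimators=env.COMP_ESTIMATORS)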

View File

@ -1,305 +1,305 @@
import inspect
from functools import wraps

import numpy as np
from quapy.method.aggregative import PACC, SLD, CC
from quapy.protocol import UPP, AbstractProtocol
from sklearn.linear_model import LogisticRegression

import quacc as qc
from quacc.evaluation.report import EvaluationReport
from quacc.method.model_selection import BQAEgsq, GridSearchAE, MCAEgsq

from ..method.base import BQAE, MCAE, BaseAccuracyEstimator

_methods = {}

_sld_param_grid = {
    "q__classifier__C": np.logspace(-3, 3, 7),
    "q__classifier__class_weight": [None, "balanced"],
    "q__recalib": [None, "bcts"],
    "q__exact_train_prev": [True],
    "confidence": [None, "max_conf", "entropy"],
}

_pacc_param_grid = {
    "q__classifier__C": np.logspace(-3, 3, 7),
    "q__classifier__class_weight": [None, "balanced"],
    "confidence": [None, "max_conf", "entropy"],
}


def method(func):
    @wraps(func)
    def wrapper(c_model, validation, protocol):
        return func(c_model, validation, protocol)

    _methods[func.__name__] = wrapper
    return wrapper


def evaluation_report(
    estimator: BaseAccuracyEstimator,
    protocol: AbstractProtocol,
) -> EvaluationReport:
    method_name = inspect.stack()[1].function
    report = EvaluationReport(name=method_name)
    for sample in protocol():
        e_sample = estimator.extend(sample)
        estim_prev = estimator.estimate(e_sample.X, ext=True)
        acc_score = qc.error.acc(estim_prev)
        f1_score = qc.error.f1(estim_prev)
        report.append_row(
            sample.prevalence(),
            acc_score=acc_score,
            acc=abs(qc.error.acc(e_sample.prevalence()) - acc_score),
            f1_score=f1_score,
            f1=abs(qc.error.f1(e_sample.prevalence()) - f1_score),
        )

    return report


@method
def bin_sld(c_model, validation, protocol) -> EvaluationReport:
    est = BQAE(c_model, SLD(LogisticRegression())).fit(validation)
    return evaluation_report(
        estimator=est,
        protocol=protocol,
    )


@method
def mul_sld(c_model, validation, protocol) -> EvaluationReport:
    est = MCAE(c_model, SLD(LogisticRegression())).fit(validation)
    return evaluation_report(
        estimator=est,
        protocol=protocol,
    )


@method
def binmc_sld(c_model, validation, protocol) -> EvaluationReport:
    est = BQAE(
        c_model,
        SLD(LogisticRegression()),
        confidence="max_conf",
    ).fit(validation)
    return evaluation_report(
        estimator=est,
        protocol=protocol,
    )


@method
def mulmc_sld(c_model, validation, protocol) -> EvaluationReport:
    est = MCAE(
        c_model,
        SLD(LogisticRegression()),
        confidence="max_conf",
    ).fit(validation)
    return evaluation_report(
        estimator=est,
        protocol=protocol,
    )


@method
def binne_sld(c_model, validation, protocol) -> EvaluationReport:
    est = BQAE(
        c_model,
        SLD(LogisticRegression()),
        confidence="entropy",
    ).fit(validation)
    return evaluation_report(
        estimator=est,
        protocol=protocol,
    )


@method
def mulne_sld(c_model, validation, protocol) -> EvaluationReport:
    est = MCAE(
        c_model,
        SLD(LogisticRegression()),
        confidence="entropy",
    ).fit(validation)
    return evaluation_report(
        estimator=est,
        protocol=protocol,
    )


@method
def bin_sld_gs(c_model, validation, protocol) -> EvaluationReport:
    v_train, v_val = validation.split_stratified(0.6, random_state=0)
    model = BQAE(c_model, SLD(LogisticRegression()))
    est = GridSearchAE(
        model=model,
        param_grid=_sld_param_grid,
        refit=False,
        protocol=UPP(v_val, repeats=100),
        verbose=True,
    ).fit(v_train)
    return evaluation_report(
        estimator=est,
        protocol=protocol,
    )


@method
def mul_sld_gs(c_model, validation, protocol) -> EvaluationReport:
    v_train, v_val = validation.split_stratified(0.6, random_state=0)
    model = MCAE(c_model, SLD(LogisticRegression()))
    est = GridSearchAE(
        model=model,
        param_grid=_sld_param_grid,
        refit=False,
        protocol=UPP(v_val, repeats=100),
        verbose=True,
    ).fit(v_train)
    return evaluation_report(
        estimator=est,
        protocol=protocol,
    )


@method
def bin_sld_gsq(c_model, validation, protocol) -> EvaluationReport:
    est = BQAEgsq(
        c_model,
        SLD(LogisticRegression()),
        param_grid={
            "classifier__C": np.logspace(-3, 3, 7),
            "classifier__class_weight": [None, "balanced"],
            "recalib": [None, "bcts", "vs"],
        },
        refit=False,
        verbose=False,
    ).fit(validation)
    return evaluation_report(
        estimator=est,
        protocol=protocol,
    )


@method
def mul_sld_gsq(c_model, validation, protocol) -> EvaluationReport:
    est = MCAEgsq(
        c_model,
        SLD(LogisticRegression()),
        param_grid={
            "classifier__C": np.logspace(-3, 3, 7),
            "classifier__class_weight": [None, "balanced"],
            "recalib": [None, "bcts", "vs"],
        },
        refit=False,
        verbose=False,
    ).fit(validation)
    return evaluation_report(
        estimator=est,
        protocol=protocol,
    )


@method
def bin_pacc(c_model, validation, protocol) -> EvaluationReport:
    est = BQAE(c_model, PACC(LogisticRegression())).fit(validation)
    return evaluation_report(
        estimator=est,
        protocol=protocol,
    )


@method
def mul_pacc(c_model, validation, protocol) -> EvaluationReport:
    est = MCAE(c_model, PACC(LogisticRegression())).fit(validation)
    return evaluation_report(
        estimator=est,
        protocol=protocol,
    )


@method
def binmc_pacc(c_model, validation, protocol) -> EvaluationReport:
    est = BQAE(c_model, PACC(LogisticRegression()), confidence="max_conf").fit(validation)
    return evaluation_report(
        estimator=est,
        protocol=protocol,
    )


@method
def mulmc_pacc(c_model, validation, protocol) -> EvaluationReport:
    est = MCAE(c_model, PACC(LogisticRegression()), confidence="max_conf").fit(validation)
    return evaluation_report(
        estimator=est,
        protocol=protocol,
    )


@method
def binne_pacc(c_model, validation, protocol) -> EvaluationReport:
    est = BQAE(c_model, PACC(LogisticRegression()), confidence="entropy").fit(validation)
    return evaluation_report(
        estimator=est,
        protocol=protocol,
    )


@method
def mulne_pacc(c_model, validation, protocol) -> EvaluationReport:
    est = MCAE(c_model, PACC(LogisticRegression()), confidence="entropy").fit(validation)
    return evaluation_report(
        estimator=est,
        protocol=protocol,
    )


@method
def bin_pacc_gs(c_model, validation, protocol) -> EvaluationReport:
    v_train, v_val = validation.split_stratified(0.6, random_state=0)
    model = BQAE(c_model, PACC(LogisticRegression()))
    est = GridSearchAE(
        model=model,
        param_grid=_pacc_param_grid,
        refit=False,
        protocol=UPP(v_val, repeats=100),
        verbose=False,
    ).fit(v_train)
    return evaluation_report(
        estimator=est,
        protocol=protocol,
    )


@method
def mul_pacc_gs(c_model, validation, protocol) -> EvaluationReport:
    v_train, v_val = validation.split_stratified(0.6, random_state=0)
    model = MCAE(c_model, PACC(LogisticRegression()))
    est = GridSearchAE(
        model=model,
        param_grid=_pacc_param_grid,
        refit=False,
        protocol=UPP(v_val, repeats=100),
        verbose=False,
    ).fit(v_train)
    return evaluation_report(
        estimator=est,
        protocol=protocol,
    )


@method
def bin_cc(c_model, validation, protocol) -> EvaluationReport:
    est = BQAE(c_model, CC(LogisticRegression())).fit(validation)
    return evaluation_report(
        estimator=est,
        protocol=protocol,
    )


@method
def mul_cc(c_model, validation, protocol) -> EvaluationReport:
    est = MCAE(c_model, CC(LogisticRegression())).fit(validation)
    return evaluation_report(
        estimator=est,
        protocol=protocol,
    )
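Registering a further variant only takes the decorator; a sketch (this exact combination is hypothetical, but the building blocks are the ones used above):

    @method
    def binmc_cc(c_model, validation, protocol) -> EvaluationReport:
        # hypothetical variant: binary CC with the max-confidence feature
        est = BQAE(c_model, CC(LogisticRegression()), confidence="max_conf").fit(validation)
        return evaluation_report(estimator=est, protocol=protocol)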

File diff suppressed because it is too large

View File

@ -1,44 +1,44 @@
import time
from traceback import print_exception as traceback

import quapy as qp
from quapy.protocol import APP
from sklearn.linear_model import LogisticRegression

from quacc.logger import SubLogger


def estimate_worker(_estimate, train, validation, test, _env=None, q=None):
    qp.environ["SAMPLE_SIZE"] = _env.SAMPLE_SIZE
    SubLogger.setup(q)
    log = SubLogger.logger()

    model = LogisticRegression()
    model.fit(*train.Xy)
    protocol = APP(
        test,
        n_prevalences=_env.PROTOCOL_N_PREVS,
        repeats=_env.PROTOCOL_REPEATS,
        return_type="labelled_collection",
    )
    start = time.time()
    try:
        result = _estimate(model, validation, protocol)
    except Exception as e:
        log.warning(f"Method {_estimate.__name__} failed. Exception: {e}")
        traceback(e)
        return {
            "name": _estimate.__name__,
            "result": None,
            "time": 0,
        }

    end = time.time()
    log.info(f"{_estimate.__name__} finished [took {end-start:.4f}s]")

    return {
        "name": _estimate.__name__,
        "result": result,
        "time": end - start,
    }
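The worker is built to be safe under pool.apply_async: a failing method is logged and reported as a None result instead of raising. A synchronous sketch of the same contract (arguments as wired up in comp.py):

    out = estimate_worker(
        CE.func["bin_sld"], d.train, d.validation, d.test, _env=env, q=Logger.queue()
    )
    # out == {"name": ..., "result": EvaluationReport or None, "time": seconds}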

View File

@@ -1,136 +1,136 @@

import logging
import logging.handlers
import multiprocessing
import threading
from pathlib import Path


class Logger:
    __logger_file = "quacc.log"
    __logger_name = "queue_logger"
    __manager = None
    __queue = None
    __thread = None
    __setup = False
    __handlers = []

    @classmethod
    def __logger_listener(cls, q):
        while True:
            record = q.get()
            if record is None:
                break
            root = logging.getLogger("listener")
            root.handle(record)

    @classmethod
    def setup(cls):
        if cls.__setup:
            return

        # setup root
        root = logging.getLogger("listener")
        root.setLevel(logging.DEBUG)
        rh = logging.FileHandler(cls.__logger_file, mode="a")
        rh.setLevel(logging.DEBUG)
        root.addHandler(rh)

        # setup logger
        if cls.__manager is None:
            cls.__manager = multiprocessing.Manager()
        if cls.__queue is None:
            cls.__queue = cls.__manager.Queue()

        logger = logging.getLogger(cls.__logger_name)
        logger.setLevel(logging.DEBUG)
        qh = logging.handlers.QueueHandler(cls.__queue)
        qh.setLevel(logging.DEBUG)
        qh.setFormatter(
            logging.Formatter(
                fmt="%(asctime)s| %(levelname)-8s %(message)s",
                datefmt="%d/%m/%y %H:%M:%S",
            )
        )
        logger.addHandler(qh)

        # start listener
        cls.__thread = threading.Thread(
            target=cls.__logger_listener,
            args=(cls.__queue,),
        )
        cls.__thread.start()

        cls.__setup = True

    @classmethod
    def add_handler(cls, path: Path):
        root = logging.getLogger("listener")
        rh = logging.FileHandler(path, mode="a")
        rh.setLevel(logging.DEBUG)
        cls.__handlers.append(rh)
        root.addHandler(rh)

    @classmethod
    def clear_handlers(cls):
        root = logging.getLogger("listener")
        for h in cls.__handlers:
            root.removeHandler(h)
        cls.__handlers.clear()

    @classmethod
    def queue(cls):
        if not cls.__setup:
            cls.setup()
        return cls.__queue

    @classmethod
    def logger(cls):
        if not cls.__setup:
            cls.setup()
        return logging.getLogger(cls.__logger_name)

    @classmethod
    def close(cls):
        if cls.__setup and cls.__thread is not None:
            root = logging.getLogger("listener")
            root.info("-" * 100)
            cls.__queue.put(None)
            cls.__thread.join()
            # cls.__manager.close()


class SubLogger:
    __queue = None
    __setup = False

    @classmethod
    def setup(cls, q):
        if cls.__setup:
            return

        cls.__queue = q

        # setup root
        root = logging.getLogger()
        root.setLevel(logging.DEBUG)
        rh = logging.handlers.QueueHandler(q)
        rh.setLevel(logging.DEBUG)
        rh.setFormatter(
            logging.Formatter(
                fmt="%(asctime)s| %(levelname)-12s%(message)s",
                datefmt="%d/%m/%y %H:%M:%S",
            )
        )
        root.addHandler(rh)

        cls.__setup = True

    @classmethod
    def logger(cls):
        if not cls.__setup:
            return None
        return logging.getLogger()
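To make the queue hand-off above concrete, a standalone sketch of the intended flow between the two classes (the child body is a stand-in, not project code):

from multiprocessing import Process

def child(q):
    # re-route this process's root logger onto the parent's queue
    SubLogger.setup(q)
    SubLogger.logger().info("hello from the child process")

if __name__ == "__main__":
    q = Logger.queue()  # implicitly calls Logger.setup()
    p = Process(target=child, args=(q,))
    p.start()
    p.join()
    Logger.close()  # stop the listener thread and flush "quacc.log"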

View File

@@ -1,75 +1,75 @@

from sys import platform
from traceback import print_exception as traceback

import quacc.evaluation.comp as comp
from quacc.dataset import Dataset
from quacc.environment import env
from quacc.logger import Logger
from quacc.utils import create_dataser_dir

CE = comp.CompEstimator()


def toast():
    if platform == "win32":
        import win11toast

        win11toast.notify("Comp", "Completed Execution")


def estimate_comparison():
    log = Logger.logger()
    for conf in env.get_confs():
        dataset = Dataset(
            env.DATASET_NAME,
            target=env.DATASET_TARGET,
            n_prevalences=env.DATASET_N_PREVS,
            prevs=env.DATASET_PREVS,
        )
        create_dataser_dir(dataset.name, update=env.DATASET_DIR_UPDATE)
        Logger.add_handler(env.OUT_DIR / f"{dataset.name}.log")
        try:
            dr = comp.evaluate_comparison(
                dataset,
                estimators=CE.name[env.COMP_ESTIMATORS],
            )
        except Exception as e:
            log.error(f"Evaluation over {dataset.name} failed. Exception: {e}")
            traceback(e)
            # without this, `dr` would be undefined (or stale from a previous
            # dataset) in the plotting loop below
            continue
        for plot_conf in env.get_plot_confs():
            for m in env.METRICS:
                output_path = env.OUT_DIR / f"{plot_conf}_{m}.md"
                try:
                    _repr = dr.to_md(
                        conf=plot_conf,
                        metric=m,
                        estimators=CE.name[env.PLOT_ESTIMATORS],
                        stdev=env.PLOT_STDEV,
                    )
                    with open(output_path, "w") as f:
                        f.write(_repr)
                except Exception as e:
                    log.error(
                        f"Failed while saving configuration {plot_conf} of {dataset.name}. Exception: {e}"
                    )
                    traceback(e)
        Logger.clear_handlers()

    # print(df.to_latex(float_format="{:.4f}".format))
    # print(utils.avg_group_report(df).to_latex(float_format="{:.4f}".format))


def main():
    log = Logger.logger()
    try:
        estimate_comparison()
    except Exception as e:
log.error(f"estimate comparison failed. Exceprion: {e}") log.error(f"estimate comparison failed. Exceprion: {e}")
        traceback(e)

    toast()
    Logger.close()


if __name__ == "__main__":
    main()

View File

@@ -1,120 +1,120 @@

from copy import deepcopy
from time import time

import numpy as np
import win11toast
from quapy.method.aggregative import SLD
from quapy.protocol import APP, UPP
from sklearn.linear_model import LogisticRegression

import quacc as qc
from quacc.dataset import Dataset
from quacc.error import acc
from quacc.evaluation.baseline import ref
from quacc.evaluation.method import mulmc_sld
from quacc.evaluation.report import CompReport, EvaluationReport
from quacc.method.base import MCAE, BinaryQuantifierAccuracyEstimator
from quacc.method.model_selection import GridSearchAE


def test_gs():
    d = Dataset(name="rcv1", target="CCAT", n_prevalences=1).get_raw()

    classifier = LogisticRegression()
    classifier.fit(*d.train.Xy)

    quantifier = SLD(LogisticRegression())
    # estimator = MultiClassAccuracyEstimator(classifier, quantifier)
    estimator = BinaryQuantifierAccuracyEstimator(classifier, quantifier)

    v_train, v_val = d.validation.split_stratified(0.6, random_state=0)
    gs_protocol = UPP(v_val, sample_size=1000, repeats=100)
    gs_estimator = GridSearchAE(
        model=deepcopy(estimator),
        param_grid={
            "q__classifier__C": np.logspace(-3, 3, 7),
            "q__classifier__class_weight": [None, "balanced"],
            "q__recalib": [None, "bcts", "ts"],
        },
        refit=False,
        protocol=gs_protocol,
        verbose=True,
    ).fit(v_train)
    estimator.fit(d.validation)

    tstart = time()
    erb, ergs = EvaluationReport("base"), EvaluationReport("gs")
    protocol = APP(
        d.test,
        sample_size=1000,
        n_prevalences=21,
        repeats=100,
        return_type="labelled_collection",
    )
    for sample in protocol():
        e_sample = gs_estimator.extend(sample)
        estim_prev_b = estimator.estimate(e_sample.X, ext=True)
        estim_prev_gs = gs_estimator.estimate(e_sample.X, ext=True)
        erb.append_row(
            sample.prevalence(),
            acc=abs(acc(e_sample.prevalence()) - acc(estim_prev_b)),
        )
        ergs.append_row(
            sample.prevalence(),
            acc=abs(acc(e_sample.prevalence()) - acc(estim_prev_gs)),
        )
    cr = CompReport(
        [erb, ergs],
        "test",
        train_prev=d.train_prev,
        valid_prev=d.validation_prev,
    )

    print(cr.table())
    print(f"[took {time() - tstart:.3f}s]")
    win11toast.notify("Test", "completed")


def test_mc():
    d = Dataset(name="rcv1", target="CCAT", prevs=[0.9]).get()[0]
    classifier = LogisticRegression().fit(*d.train.Xy)
    protocol = APP(
        d.test,
        sample_size=1000,
        repeats=100,
        n_prevalences=21,
        return_type="labelled_collection",
    )
    ref_er = ref(classifier, d.validation, protocol)
    mulmc_er = mulmc_sld(classifier, d.validation, protocol)
    cr = CompReport(
        [mulmc_er, ref_er],
        name="test_mc",
        train_prev=d.train_prev,
        valid_prev=d.validation_prev,
    )
    with open("test_mc.md", "w") as f:
        f.write(cr.data().to_markdown())


def test_et():
    d = Dataset(name="imdb", prevs=[0.5]).get()[0]
    classifier = LogisticRegression().fit(*d.train.Xy)
    estimator = MCAE(
        classifier,
        SLD(LogisticRegression(), exact_train_prev=False),
        confidence="max_conf",
    ).fit(d.validation)
    e_test = estimator.extend(d.test)
    ep = estimator.estimate(e_test.X, ext=True)
    print(f"{qc.error.acc(ep) = }")
    print(f"{qc.error.acc(e_test.prevalence()) = }")


if __name__ == "__main__":
    test_et()
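After a fit like the one in test_gs, the grid search results can be inspected directly from the attributes GridSearchAE exposes; a small sketch, assuming gs_estimator has been fitted as above:

for params, score in gs_estimator.param_scores_.items():
    # configurations that failed are recorded with the string "timeout"
    print(params, score)
print(f"best: {gs_estimator.best_params_} (score={gs_estimator.best_score_:.5f})")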

View File

@@ -1,177 +1,177 @@

import math
from abc import abstractmethod
from copy import deepcopy
from typing import List

import numpy as np
from quapy.data import LabelledCollection
from quapy.method.aggregative import BaseQuantifier
from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator

from quacc.data import ExtendedCollection


class BaseAccuracyEstimator(BaseQuantifier):
    def __init__(
        self,
        classifier: BaseEstimator,
        quantifier: BaseQuantifier,
        confidence=None,
    ):
        self.__check_classifier(classifier)
        self.quantifier = quantifier
        self.confidence = confidence

    def __check_classifier(self, classifier):
        if not hasattr(classifier, "predict_proba"):
            raise ValueError(
                f"Passed classifier {classifier.__class__.__name__} cannot predict probabilities."
            )
        self.classifier = classifier

    def __get_confidence(self):
        def max_conf(probas):
            _mc = np.max(probas, axis=-1)
            _min = 1.0 / probas.shape[1]
            _norm_mc = (_mc - _min) / (1.0 - _min)
            return _norm_mc
        def entropy(probas):
            # without the leading minus this would be the *negative* Shannon
            # entropy; the 1e-20 epsilon guards against log(0)
            _ent = -np.sum(np.multiply(probas, np.log(probas + 1e-20)), axis=1)
            return _ent
        if self.confidence is None:
            return None

        __confs = {
            "max_conf": max_conf,
            "entropy": entropy,
        }
        return __confs.get(self.confidence, None)

    def __get_ext(self, pred_proba):
        _ext = pred_proba
        _f_conf = self.__get_confidence()
        if _f_conf is not None:
            _confs = _f_conf(pred_proba).reshape((len(pred_proba), 1))
            _ext = np.concatenate((_confs, pred_proba), axis=1)
        return _ext

    def extend(self, coll: LabelledCollection, pred_proba=None) -> ExtendedCollection:
        if pred_proba is None:
            pred_proba = self.classifier.predict_proba(coll.X)

        _ext = self.__get_ext(pred_proba)
        return ExtendedCollection.extend_collection(coll, pred_proba=_ext)

    def _extend_instances(self, instances: np.ndarray | csr_matrix, pred_proba=None):
        if pred_proba is None:
            pred_proba = self.classifier.predict_proba(instances)

        _ext = self.__get_ext(pred_proba)
        return ExtendedCollection.extend_instances(instances, _ext)

    @abstractmethod
    def fit(self, train: LabelledCollection | ExtendedCollection):
        ...

    @abstractmethod
    def estimate(self, instances, ext=False) -> np.ndarray:
        ...


class MultiClassAccuracyEstimator(BaseAccuracyEstimator):
    def __init__(
        self,
        classifier: BaseEstimator,
        quantifier: BaseQuantifier,
        confidence: str = None,
    ):
        super().__init__(
            classifier=classifier,
            quantifier=quantifier,
            confidence=confidence,
        )
        self.e_train = None

    def fit(self, train: LabelledCollection):
        self.e_train = self.extend(train)
        self.quantifier.fit(self.e_train)
        return self

    def estimate(self, instances, ext=False) -> np.ndarray:
        e_inst = instances if ext else self._extend_instances(instances)
        estim_prev = self.quantifier.quantify(e_inst)
        return self._check_prevalence_classes(estim_prev, self.quantifier.classes_)

    def _check_prevalence_classes(self, estim_prev, estim_classes) -> np.ndarray:
        true_classes = self.e_train.classes_
        for _cls in true_classes:
            if _cls not in estim_classes:
                estim_prev = np.insert(estim_prev, _cls, [0.0], axis=0)
        return estim_prev


class BinaryQuantifierAccuracyEstimator(BaseAccuracyEstimator):
    def __init__(
        self,
        classifier: BaseEstimator,
        quantifier: BaseQuantifier,
        confidence: str = None,
    ):
        super().__init__(
            classifier=classifier,
            quantifier=quantifier,
            confidence=confidence,
        )
        self.quantifiers = []
        self.e_trains = []

    def fit(self, train: LabelledCollection | ExtendedCollection):
        self.e_train = self.extend(train)
        self.n_classes = self.e_train.n_classes
        self.e_trains = self.e_train.split_by_pred()

        self.quantifiers = []
        for train in self.e_trains:
            quant = deepcopy(self.quantifier)
            quant.fit(train)
            self.quantifiers.append(quant)

        return self

    def estimate(self, instances, ext=False):
        # TODO: test
        e_inst = instances if ext else self._extend_instances(instances)
        _ncl = int(math.sqrt(self.n_classes))
        s_inst, norms = ExtendedCollection.split_inst_by_pred(_ncl, e_inst)
        estim_prevs = self._quantify_helper(s_inst, norms)
        estim_prev = np.array([prev_row for prev_row in zip(*estim_prevs)]).flatten()
        return estim_prev

    def _quantify_helper(
        self,
        s_inst: List[np.ndarray | csr_matrix],
        norms: List[float],
    ):
        estim_prevs = []
        for quant, inst, norm in zip(self.quantifiers, s_inst, norms):
            if inst.shape[0] > 0:
                estim_prevs.append(quant.quantify(inst) * norm)
            else:
                estim_prevs.append(np.asarray([0.0, 0.0]))
        return estim_prevs


BAE = BaseAccuracyEstimator
MCAE = MultiClassAccuracyEstimator
BQAE = BinaryQuantifierAccuracyEstimator
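As a quick numeric check of the two confidence features defined in BaseAccuracyEstimator, a self-contained sketch on toy binary posteriors:

import numpy as np

probas = np.array([[0.9, 0.1], [0.5, 0.5]])

# normalized max-confidence: 0 at the uniform posterior, 1 at certainty
_min = 1.0 / probas.shape[1]
print((np.max(probas, axis=-1) - _min) / (1.0 - _min))  # [0.8 0. ]

# Shannon entropy (nats): largest at the uniform posterior
print(-np.sum(probas * np.log(probas + 1e-20), axis=1))  # [0.325 0.693]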

View File

@@ -1,307 +1,307 @@

import itertools
from copy import deepcopy
from time import time
from typing import Callable, Union

import numpy as np
import quapy as qp
from quapy.data import LabelledCollection
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP, AbstractProtocol, OnLabelledCollectionProtocol
from sklearn.base import BaseEstimator

import quacc as qc
import quacc.error
from quacc.data import ExtendedCollection
from quacc.evaluation import evaluate
from quacc.logger import SubLogger
from quacc.method.base import (
    BaseAccuracyEstimator,
    BinaryQuantifierAccuracyEstimator,
    MultiClassAccuracyEstimator,
)


class GridSearchAE(BaseAccuracyEstimator):
    def __init__(
        self,
        model: BaseAccuracyEstimator,
        param_grid: dict,
        protocol: AbstractProtocol,
        error: Union[Callable, str] = qc.error.maccd,
        refit=True,
        # timeout=-1,
        # n_jobs=None,
        verbose=False,
    ):
        self.model = model
        self.param_grid = self.__normalize_params(param_grid)
        self.protocol = protocol
        self.refit = refit
        # self.timeout = timeout
        # self.n_jobs = qp._get_njobs(n_jobs)
        self.verbose = verbose
        self.__check_error(error)
        assert isinstance(protocol, AbstractProtocol), "unknown protocol"

    def _sout(self, msg):
        if self.verbose:
            print(f"[{self.__class__.__name__}]: {msg}")

    def __normalize_params(self, params):
        __remap = {}
        for key in params.keys():
            k, delim, sub_key = key.partition("__")
            if delim and k == "q":
                __remap[key] = f"quantifier__{sub_key}"

        return {(__remap[k] if k in __remap else k): v for k, v in params.items()}

    def __check_error(self, error):
        if error in qc.error.ACCURACY_ERROR:
            self.error = error
        elif isinstance(error, str):
            self.error = qc.error.from_name(error)
        elif hasattr(error, "__call__"):
            self.error = error
        else:
            raise ValueError(
                f"unexpected error type; must either be a callable function or a str representing\n"
                f"the name of an error function in {qc.error.ACCURACY_ERROR_NAMES}"
            )

    def fit(self, training: LabelledCollection):
        """Learning routine. Fits methods with all combinations of hyperparameters and selects the one minimizing
        the error metric.

        :param training: the training set on which to optimize the hyperparameters
        :return: self
        """
        params_keys = list(self.param_grid.keys())
        params_values = list(self.param_grid.values())

        protocol = self.protocol

        self.param_scores_ = {}
        self.best_score_ = None

        tinit = time()

        hyper = [
            dict(zip(params_keys, val)) for val in itertools.product(*params_values)
        ]

        # self._sout(f"starting model selection with {self.n_jobs =}")
        self._sout("starting model selection")

        scores = [self.__params_eval(params, training) for params in hyper]

        for params, score, model in scores:
            if score is not None:
                if self.best_score_ is None or score < self.best_score_:
                    self.best_score_ = score
                    self.best_params_ = params
                    self.best_model_ = model
                self.param_scores_[str(params)] = score
            else:
                self.param_scores_[str(params)] = "timeout"

        tend = time() - tinit

        if self.best_score_ is None:
            raise TimeoutError("no combination of hyperparameters seems to work")
        self._sout(
            f"optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) "
            f"[took {tend:.4f}s]"
        )

        log = SubLogger.logger()
        log.debug(
            f"[{self.model.__class__.__name__}] "
            f"optimization finished: best params {self.best_params_} (score={self.best_score_:.5f}) "
            f"[took {tend:.4f}s]"
        )

        if self.refit:
            if isinstance(protocol, OnLabelledCollectionProtocol):
                self._sout("refitting on the whole development set")
                self.best_model_.fit(training + protocol.get_labelled_collection())
            else:
                raise RuntimeWarning(
                    f'"refit" was requested, but the protocol does not '
                    f"implement the {OnLabelledCollectionProtocol.__name__} interface"
                )

        return self

    def __params_eval(self, params, training):
        protocol = self.protocol
        error = self.error

        # if self.timeout > 0:
        #     def handler(signum, frame):
        #         raise TimeoutError()
        #     signal.signal(signal.SIGALRM, handler)

        tinit = time()

        # if self.timeout > 0:
        #     signal.alarm(self.timeout)

        try:
            model = deepcopy(self.model)
            # overrides default parameters with the parameters being explored at this iteration
            model.set_params(**params)
            # print({k: v for k, v in model.get_params().items() if k in params})
            model.fit(training)
            score = evaluate(model, protocol=protocol, error_metric=error)

            ttime = time() - tinit
            self._sout(
                f"hyperparams={params}\t got score {score:.5f} [took {ttime:.4f}s]"
            )

            # if self.timeout > 0:
            #     signal.alarm(0)
        # except TimeoutError:
        #     self._sout(f"timeout ({self.timeout}s) reached for config {params}")
        #     score = None
        except ValueError as e:
            self._sout(f"the combination of hyperparameters {params} is invalid")
            raise e
        except Exception as e:
            self._sout(f"something went wrong for config {params}; skipping:")
            self._sout(f"\tException: {e}")
            score = None

        return params, score, model

    def extend(self, coll: LabelledCollection, pred_proba=None) -> ExtendedCollection:
        assert hasattr(self, "best_model_"), "quantify called before fit"
        return self.best_model().extend(coll, pred_proba=pred_proba)

    def estimate(self, instances, ext=False):
        """Estimate class prevalence values using the best model found after calling the :meth:`fit` method.

        :param instances: sample containing the instances
        :return: a ndarray of shape `(n_classes)` with class prevalence estimates as according to the best model found
            by the model selection process.
        """
        assert hasattr(self, "best_model_"), "estimate called before fit"
        return self.best_model().estimate(instances, ext=ext)

    def set_params(self, **parameters):
        """Sets the hyper-parameters to explore.

        :param parameters: a dictionary with keys the parameter names and values the list of values to explore
        """
        self.param_grid = parameters

    def get_params(self, deep=True):
        """Returns the dictionary of hyper-parameters to explore (`param_grid`)

        :param deep: Unused
        :return: the dictionary `param_grid`
        """
        return self.param_grid

    def best_model(self):
        """
        Returns the best model found after calling the :meth:`fit` method, i.e., the one trained on the combination
        of hyper-parameters that minimized the error function.

        :return: a trained quantifier
        """
        if hasattr(self, "best_model_"):
            return self.best_model_
        raise ValueError("best_model called before fit")


class MCAEgsq(MultiClassAccuracyEstimator):
    def __init__(
        self,
        classifier: BaseEstimator,
        quantifier: BaseAccuracyEstimator,
        param_grid: dict,
        error: Union[Callable, str] = qp.error.mae,
        refit=True,
        timeout=-1,
        n_jobs=None,
        verbose=False,
    ):
        self.param_grid = param_grid
        self.refit = refit
        self.timeout = timeout
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.error = error
        super().__init__(classifier, quantifier)

    def fit(self, train: LabelledCollection):
        self.e_train = self.extend(train)
        t_train, t_val = self.e_train.split_stratified(0.6, random_state=0)
        self.quantifier = GridSearchQ(
            deepcopy(self.quantifier),
            param_grid=self.param_grid,
            protocol=UPP(t_val, repeats=100),
            error=self.error,
            refit=self.refit,
            timeout=self.timeout,
            n_jobs=self.n_jobs,
            verbose=self.verbose,
        ).fit(self.e_train)

        return self

    def estimate(self, instances, ext=False) -> np.ndarray:
        e_inst = instances if ext else self._extend_instances(instances)
        estim_prev = self.quantifier.quantify(e_inst)
        return self._check_prevalence_classes(
            estim_prev, self.quantifier.best_model().classes_
        )


class BQAEgsq(BinaryQuantifierAccuracyEstimator):
    def __init__(
        self,
        classifier: BaseEstimator,
        quantifier: BaseAccuracyEstimator,
        param_grid: dict,
        error: Union[Callable, str] = qp.error.mae,
        refit=True,
        timeout=-1,
        n_jobs=None,
        verbose=False,
    ):
        self.param_grid = param_grid
        self.refit = refit
        self.timeout = timeout
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.error = error
        super().__init__(classifier=classifier, quantifier=quantifier)

    def fit(self, train: LabelledCollection):
        self.e_train = self.extend(train)
        self.n_classes = self.e_train.n_classes
        self.e_trains = self.e_train.split_by_pred()

        self.quantifiers = []
        for e_train in self.e_trains:
            t_train, t_val = e_train.split_stratified(0.6, random_state=0)
            quantifier = GridSearchQ(
                model=deepcopy(self.quantifier),
                param_grid=self.param_grid,
                protocol=UPP(t_val, repeats=100),
                error=self.error,
                refit=self.refit,
                timeout=self.timeout,
                n_jobs=self.n_jobs,
                verbose=self.verbose,
            ).fit(t_train)
            self.quantifiers.append(quantifier)

        return self
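The "q__" prefix accepted by GridSearchAE is shorthand that __normalize_params rewrites onto the wrapped quantifier's parameter path, leaving every other key untouched; a small illustration (the grid values are made up):

grid = {
    "q__classifier__C": [0.1, 1.0, 10.0],
    "confidence": [None, "max_conf"],
}
# after normalization, GridSearchAE explores:
# {"quantifier__classifier__C": [0.1, 1.0, 10.0], "confidence": [None, "max_conf"]}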

View File

@@ -1,239 +1,239 @@

from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from cycler import cycler

from quacc.environment import env

matplotlib.use("agg")


def _get_markers(n: int):
    ls = "ovx+sDph*^1234X><.Pd"
    if n > len(ls):
        # integer division: multiplying a str by a float raises TypeError
        ls = ls * (n // len(ls) + 1)
    return list(ls)[:n]


def plot_delta(
    base_prevs,
    columns,
    data,
    *,
    stdevs=None,
    pos_class=1,
    metric="acc",
    name="default",
    train_prev=None,
    legend=True,
    avg=None,
) -> Path:
    _base_title = "delta_stdev" if stdevs is not None else "delta"
    if train_prev is not None:
        t_prev_pos = int(round(train_prev[pos_class] * 100))
        title = f"{_base_title}_{name}_{t_prev_pos}_{metric}"
    else:
        title = f"{_base_title}_{name}_avg_{avg}_{metric}"

    fig, ax = plt.subplots()
    ax.set_aspect("auto")
    ax.grid()

    NUM_COLORS = len(data)
    cm = plt.get_cmap("tab10")
    if NUM_COLORS > 10:
        cm = plt.get_cmap("tab20")
    cy = cycler(color=[cm(i) for i in range(NUM_COLORS)])

    base_prevs = base_prevs[:, pos_class]
    for method, deltas, _cy in zip(columns, data, cy):
        ax.plot(
            base_prevs,
            deltas,
            label=method,
            color=_cy["color"],
            linestyle="-",
            marker="o",
            markersize=3,
            zorder=2,
        )
        if stdevs is not None:
            _col_idx = np.where(columns == method)[0]
            stdev = stdevs[_col_idx].flatten()
            # NaN never compares equal, so `x != np.nan` is True everywhere;
            # use np.isnan to actually drop the NaN entries
            nn_idx = np.intersect1d(
                np.where(~np.isnan(deltas))[0],
                np.where(~np.isnan(stdev))[0],
            )
            _bps, _ds, _st = base_prevs[nn_idx], deltas[nn_idx], stdev[nn_idx]
            ax.fill_between(
                _bps,
                _ds - _st,
                _ds + _st,
                color=_cy["color"],
                alpha=0.25,
            )

    x_label = "test" if avg is None or avg == "train" else "train"
    ax.set(
        xlabel=f"{x_label} prevalence",
        ylabel=metric,
        title=title,
    )

    if legend:
        ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))

    output_path = env.PLOT_OUT_DIR / f"{title}.png"
    fig.savefig(output_path, bbox_inches="tight")
    return output_path


def plot_diagonal(
    reference,
    columns,
    data,
    *,
    pos_class=1,
    metric="acc",
    name="default",
    train_prev=None,
    legend=True,
):
    if train_prev is not None:
        t_prev_pos = int(round(train_prev[pos_class] * 100))
        title = f"diagonal_{name}_{t_prev_pos}_{metric}"
    else:
        title = f"diagonal_{name}_{metric}"

    fig, ax = plt.subplots()
    ax.set_aspect("auto")
    ax.grid()
    ax.set_aspect("equal")

    NUM_COLORS = len(data)
    cm = plt.get_cmap("tab10")
    if NUM_COLORS > 10:
        cm = plt.get_cmap("tab20")
    cy = cycler(
        color=[cm(i) for i in range(NUM_COLORS)],
        marker=_get_markers(NUM_COLORS),
    )

    reference = np.array(reference)
    x_ticks = np.unique(reference)
    x_ticks.sort()

    for deltas, _cy in zip(data, cy):
        ax.plot(
            reference,
            deltas,
            color=_cy["color"],
            linestyle="None",
            marker=_cy["marker"],
            markersize=3,
            zorder=2,
            alpha=0.25,
        )

    # ensure limits are equal for both axes
    _alims = np.stack((ax.get_xlim(), ax.get_ylim()), axis=-1)
    _lims = np.array([f(ls) for f, ls in zip([np.min, np.max], _alims)])
    ax.set(xlim=tuple(_lims), ylim=tuple(_lims))

    for method, deltas, _cy in zip(columns, data, cy):
        slope, interc = np.polyfit(reference, deltas, 1)
        y_lr = np.array([slope * x + interc for x in _lims])
        ax.plot(
            _lims,
            y_lr,
            label=method,
            color=_cy["color"],
            linestyle="-",
            markersize="0",
            zorder=1,
        )

    # plot reference line
    ax.plot(
        _lims,
        _lims,
        color="black",
        linestyle="--",
        markersize=0,
        zorder=1,
    )

    ax.set(xlabel=f"true {metric}", ylabel=f"estim. {metric}", title=title)

    if legend:
        ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))

    output_path = env.PLOT_OUT_DIR / f"{title}.png"
    fig.savefig(output_path, bbox_inches="tight")
    return output_path


def plot_shift(
    shift_prevs,
    columns,
    data,
    *,
    counts=None,
    pos_class=1,
    metric="acc",
    name="default",
    train_prev=None,
    legend=True,
) -> Path:
    if train_prev is not None:
        t_prev_pos = int(round(train_prev[pos_class] * 100))
        title = f"shift_{name}_{t_prev_pos}_{metric}"
    else:
        title = f"shift_{name}_avg_{metric}"

    fig, ax = plt.subplots()
    ax.set_aspect("auto")
    ax.grid()

    NUM_COLORS = len(data)
    cm = plt.get_cmap("tab10")
    if NUM_COLORS > 10:
        cm = plt.get_cmap("tab20")
    cy = cycler(color=[cm(i) for i in range(NUM_COLORS)])

    shift_prevs = shift_prevs[:, pos_class]
    for method, shifts, _cy in zip(columns, data, cy):
        ax.plot(
            shift_prevs,
            shifts,
            label=method,
            color=_cy["color"],
            linestyle="-",
            marker="o",
            markersize=3,
            zorder=2,
        )
        if counts is not None:
            _col_idx = np.where(columns == method)[0]
            count = counts[_col_idx].flatten()
            for prev, shift, cnt in zip(shift_prevs, shifts, count):
                label = f"{cnt}"
                plt.annotate(
                    label,
                    (prev, shift),
                    textcoords="offset points",
                    xytext=(0, 10),
                    ha="center",
                    color=_cy["color"],
                    fontsize=12.0,
                )

    ax.set(xlabel="dataset shift", ylabel=metric, title=title)

    if legend:
        ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))

    output_path = env.PLOT_OUT_DIR / f"{title}.png"
    fig.savefig(output_path, bbox_inches="tight")
    return output_path
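A minimal way to exercise plot_delta on synthetic numbers; this assumes env.PLOT_OUT_DIR already points to an existing directory, and the method names and error values are invented:

import numpy as np

base_prevs = np.array([[0.9, 0.1], [0.5, 0.5], [0.1, 0.9]])
columns = np.array(["method_a", "method_b"])
data = np.array([
    [0.02, 0.05, 0.04],  # method_a's error at each base prevalence
    [0.03, 0.01, 0.06],  # method_b's error at each base prevalence
])

out = plot_delta(
    base_prevs, columns, data,
    metric="acc", name="demo", train_prev=np.array([0.5, 0.5]),
)
print(out)  # <PLOT_OUT_DIR>/delta_demo_50_acc.png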

View File

@@ -1,59 +1,59 @@

import functools
import os
import shutil
from pathlib import Path

import pandas as pd

from quacc.environment import env


def combine_dataframes(dfs, df_index=[]) -> pd.DataFrame:
    if len(dfs) < 1:
        raise ValueError("combine_dataframes requires at least one dataframe")
    if len(dfs) == 1:
        return dfs[0]

    df = dfs[0]
    for ndf in dfs[1:]:
        df = df.join(ndf.set_index(df_index), on=df_index)

    return df


def avg_group_report(df: pd.DataFrame) -> pd.DataFrame:
    def _reduce_func(s1, s2):
        return {(n1, n2): v + s2[(n1, n2)] for ((n1, n2), v) in s1.items()}

    lst = df.to_dict(orient="records")[1:-1]
    summed_series = functools.reduce(_reduce_func, lst)
    idx = df.columns.drop([("base", "T"), ("base", "F")])
    avg_report = {
        (n1, n2): (v / len(lst))
        for ((n1, n2), v) in summed_series.items()
        if n1 != "base"
    }
    return pd.DataFrame([avg_report], columns=idx)


def fmt_line_md(s):
    return f"> {s} \n"


def create_dataser_dir(dir_name, update=False):
    base_out_dir = Path(env.OUT_DIR_NAME)
    if not base_out_dir.exists():
        os.mkdir(base_out_dir)

    dataset_dir = base_out_dir / dir_name
    env.OUT_DIR = dataset_dir
    if update:
        if not dataset_dir.exists():
            os.mkdir(dataset_dir)
    else:
        shutil.rmtree(dataset_dir, ignore_errors=True)
        os.mkdir(dataset_dir)

    plot_dir_path = dataset_dir / "plot"
    env.PLOT_OUT_DIR = plot_dir_path
    if not plot_dir_path.exists():
        os.mkdir(plot_dir_path)
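A short usage sketch for combine_dataframes with two made-up frames sharing an "id" column:

import pandas as pd

df_a = pd.DataFrame({"id": [1, 2], "acc_a": [0.91, 0.87]})
df_b = pd.DataFrame({"id": [1, 2], "acc_b": [0.88, 0.90]})

merged = combine_dataframes([df_a, df_b], df_index=["id"])
print(merged)
#    id  acc_a  acc_b
# 0   1   0.91   0.88
# 1   2   0.87   0.90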

View File

@@ -1,40 +1,40 @@

## Roadmap

#### quantifier domain

- single multilabel quantifier
- vector of binary quantifiers

| quantifier       |                |                |
|:----------------:|:--------------:|:--------------:|
| true quantifier  | true positive  | false positive |
| false quantifier | false negative | true negative  |

#### dataset split

- train | test
  - classifier C is fit on train
  - quantifier Q is fit on cross validation of C over train
- train | validation | test
  - classifier C is fit on train
  - quantifier Q is fit on validation

#### classifier origin

- black box
- crystal box

#### test metrics

- f1_score
- K

#### models

- classifier
- quantifier

test_mc.md (4202 changed lines)

File diff suppressed because it is too large

View File

@ -1,225 +1,225 @@
import pytest import pytest
from quacc.data import ExClassManager as ECM, ExtendedCollection from quacc.data import ExClassManager as ECM, ExtendedCollection
import numpy as np import numpy as np
import scipy.sparse as sp import scipy.sparse as sp
class TestExClassManager: class TestExClassManager:
@pytest.mark.parametrize( @pytest.mark.parametrize(
"true_class,pred_class,result", "true_class,pred_class,result",
[ [
(0, 0, 0), (0, 0, 0),
(0, 1, 1), (0, 1, 1),
(1, 0, 2), (1, 0, 2),
(1, 1, 3), (1, 1, 3),
], ],
) )
def test_get_ex(self, true_class, pred_class, result): def test_get_ex(self, true_class, pred_class, result):
ncl = 2 ncl = 2
assert ECM.get_ex(ncl, true_class, pred_class) == result assert ECM.get_ex(ncl, true_class, pred_class) == result
@pytest.mark.parametrize( @pytest.mark.parametrize(
"ex_class,result", "ex_class,result",
[ [
(0, 0), (0, 0),
(1, 1), (1, 1),
(2, 0), (2, 0),
(3, 1), (3, 1),
], ],
) )
def test_get_pred(self, ex_class, result): def test_get_pred(self, ex_class, result):
ncl = 2 ncl = 2
assert ECM.get_pred(ncl, ex_class) == result assert ECM.get_pred(ncl, ex_class) == result
@pytest.mark.parametrize( @pytest.mark.parametrize(
"ex_class,result", "ex_class,result",
[ [
(0, 0), (0, 0),
(1, 0), (1, 0),
(2, 1), (2, 1),
(3, 1), (3, 1),
], ],
) )
def test_get_true(self, ex_class, result): def test_get_true(self, ex_class, result):
ncl = 2 ncl = 2
assert ECM.get_true(ncl, ex_class) == result assert ECM.get_true(ncl, ex_class) == result
class TestExtendedCollection: class TestExtendedCollection:
@pytest.mark.parametrize( @pytest.mark.parametrize(
"instances,result", "instances,result",
[ [
( (
np.asarray( np.asarray(
[[0, 0.3, 0.7], [1, 0.54, 0.46], [2, 0.28, 0.72], [3, 0.6, 0.4]] [[0, 0.3, 0.7], [1, 0.54, 0.46], [2, 0.28, 0.72], [3, 0.6, 0.4]]
), ),
[np.asarray([1, 3]), np.asarray([0, 2])], [np.asarray([1, 3]), np.asarray([0, 2])],
), ),
( (
sp.csr_matrix( sp.csr_matrix(
[[0, 0.3, 0.7], [1, 0.54, 0.46], [2, 0.28, 0.72], [3, 0.6, 0.4]] [[0, 0.3, 0.7], [1, 0.54, 0.46], [2, 0.28, 0.72], [3, 0.6, 0.4]]
), ),
[np.asarray([1, 3]), np.asarray([0, 2])], [np.asarray([1, 3]), np.asarray([0, 2])],
), ),
( (
np.asarray([[0, 0.3, 0.7], [2, 0.28, 0.72]]), np.asarray([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
[np.asarray([], dtype=int), np.asarray([0, 1])], [np.asarray([], dtype=int), np.asarray([0, 1])],
), ),
( (
sp.csr_matrix([[0, 0.3, 0.7], [2, 0.28, 0.72]]), sp.csr_matrix([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
[np.asarray([], dtype=int), np.asarray([0, 1])], [np.asarray([], dtype=int), np.asarray([0, 1])],
), ),
( (
np.asarray([[1, 0.54, 0.46], [3, 0.6, 0.4]]), np.asarray([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
[np.asarray([0, 1]), np.asarray([], dtype=int)], [np.asarray([0, 1]), np.asarray([], dtype=int)],
), ),
( (
sp.csr_matrix([[1, 0.54, 0.46], [3, 0.6, 0.4]]), sp.csr_matrix([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
[np.asarray([0, 1]), np.asarray([], dtype=int)], [np.asarray([0, 1]), np.asarray([], dtype=int)],
), ),
], ],
) )
def test__split_index_by_pred(self, instances, result): def test__split_index_by_pred(self, instances, result):
ncl = 2 ncl = 2
assert all( assert all(
np.array_equal(a, b) np.array_equal(a, b)
for (a, b) in zip( for (a, b) in zip(
ExtendedCollection._split_index_by_pred(ncl, instances), ExtendedCollection._split_index_by_pred(ncl, instances),
result, result,
) )
) )
@pytest.mark.parametrize( @pytest.mark.parametrize(
"instances,s_inst,norms", "instances,s_inst,norms",
[ [
( (
np.asarray( np.asarray(
[[0, 0.3, 0.7], [1, 0.54, 0.46], [2, 0.28, 0.72], [3, 0.6, 0.4]] [[0, 0.3, 0.7], [1, 0.54, 0.46], [2, 0.28, 0.72], [3, 0.6, 0.4]]
), ),
[ [
np.asarray([[1, 0.54, 0.46], [3, 0.6, 0.4]]), np.asarray([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
np.asarray([[0, 0.3, 0.7], [2, 0.28, 0.72]]), np.asarray([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
], ],
[0.5, 0.5], [0.5, 0.5],
), ),
( (
sp.csr_matrix( sp.csr_matrix(
[[0, 0.3, 0.7], [1, 0.54, 0.46], [2, 0.28, 0.72], [3, 0.6, 0.4]] [[0, 0.3, 0.7], [1, 0.54, 0.46], [2, 0.28, 0.72], [3, 0.6, 0.4]]
), ),
[ [
sp.csr_matrix([[1, 0.54, 0.46], [3, 0.6, 0.4]]), sp.csr_matrix([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
sp.csr_matrix([[0, 0.3, 0.7], [2, 0.28, 0.72]]), sp.csr_matrix([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
], ],
[0.5, 0.5], [0.5, 0.5],
), ),
( (
np.asarray([[1, 0.54, 0.46], [3, 0.6, 0.4]]), np.asarray([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
[ [
np.asarray([[1, 0.54, 0.46], [3, 0.6, 0.4]]), np.asarray([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
np.asarray([], dtype=int), np.asarray([], dtype=int),
], ],
[1.0, 0.0], [1.0, 0.0],
), ),
( (
sp.csr_matrix([[1, 0.54, 0.46], [3, 0.6, 0.4]]), sp.csr_matrix([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
[ [
sp.csr_matrix([[1, 0.54, 0.46], [3, 0.6, 0.4]]), sp.csr_matrix([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
sp.csr_matrix([], dtype=int), sp.csr_matrix([], dtype=int),
], ],
[1.0, 0.0], [1.0, 0.0],
), ),
( (
np.asarray([[0, 0.3, 0.7], [2, 0.28, 0.72]]), np.asarray([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
[ [
np.asarray([], dtype=int), np.asarray([], dtype=int),
np.asarray([[0, 0.3, 0.7], [2, 0.28, 0.72]]), np.asarray([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
], ],
[0.0, 1.0], [0.0, 1.0],
), ),
( (
sp.csr_matrix([[0, 0.3, 0.7], [2, 0.28, 0.72]]), sp.csr_matrix([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
[ [
sp.csr_matrix([], dtype=int), sp.csr_matrix([], dtype=int),
sp.csr_matrix([[0, 0.3, 0.7], [2, 0.28, 0.72]]), sp.csr_matrix([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
], ],
[0.0, 1.0], [0.0, 1.0],
), ),
], ],
) )
    def test_split_inst_by_pred(self, instances, s_inst, norms):
        ncl = 2
        _s_inst, _norms = ExtendedCollection.split_inst_by_pred(ncl, instances)
        # branch on the type of the input collection: s_inst itself is a
        # list, so isinstance(s_inst, np.ndarray) would never fire and the
        # element-wise assertions would silently be skipped
        if isinstance(instances, np.ndarray):
            assert all(np.array_equal(a, b) for (a, b) in zip(_s_inst, s_inst))
        if isinstance(instances, sp.csr_matrix):
            assert all((a != b).nnz == 0 for (a, b) in zip(_s_inst, s_inst))
        assert all(a == b for (a, b) in zip(_norms, norms))
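
The expected norms are simply the share of rows routed to each predicted class. A standalone sketch of that behavior, under the same assumed column layout as above (names are hypothetical):

    import numpy as np
    import scipy.sparse as sp

    def split_inst_by_pred_sketch(ncl, instances):
        posteriors = instances[:, -ncl:]
        if sp.issparse(posteriors):
            posteriors = posteriors.toarray()
        preds = np.argmax(posteriors, axis=1)
        # group the rows by predicted class and record each group's share
        s_inst = [instances[preds == c] for c in range(ncl)]
        norms = [float(np.mean(preds == c)) for c in range(ncl)]
        return s_inst, norms

For the first two cases this yields two rows per group, hence norms [0.5, 0.5]; the later cases send all rows to one group, hence [1.0, 0.0] and [0.0, 1.0] (modulo the exact shape and dtype of the empty placeholders).
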
    @pytest.mark.parametrize(
        "instances,labels,inst0,lbl0,inst1,lbl1",
        [
            (
                np.asarray(
                    [[0, 0.3, 0.7], [1, 0.54, 0.46], [2, 0.28, 0.72], [3, 0.6, 0.4]]
                ),
                np.asarray([3, 0, 1, 2]),
                np.asarray([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
                np.asarray([0, 1]),
                np.asarray([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
                np.asarray([1, 0]),
            ),
            (
                sp.csr_matrix(
                    [[0, 0.3, 0.7], [1, 0.54, 0.46], [2, 0.28, 0.72], [3, 0.6, 0.4]]
                ),
                np.asarray([3, 0, 1, 2]),
                sp.csr_matrix([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
                np.asarray([0, 1]),
                sp.csr_matrix([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
                np.asarray([1, 0]),
            ),
            (
                np.asarray([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
                np.asarray([3, 1]),
                np.asarray([], dtype=int),
                np.asarray([], dtype=int),
                np.asarray([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
                np.asarray([1, 0]),
            ),
            (
                sp.csr_matrix([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
                np.asarray([3, 1]),
                sp.csr_matrix(np.empty((0, 0), dtype=int)),
                np.asarray([], dtype=int),
                sp.csr_matrix([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
                np.asarray([1, 0]),
            ),
            (
                np.asarray([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
                np.asarray([0, 2]),
                np.asarray([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
                np.asarray([0, 1]),
                np.asarray([], dtype=int),
                np.asarray([], dtype=int),
            ),
            (
                sp.csr_matrix([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
                np.asarray([0, 2]),
                sp.csr_matrix([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
                np.asarray([0, 1]),
                sp.csr_matrix(np.empty((0, 0), dtype=int)),
                np.asarray([], dtype=int),
            ),
        ],
    )
    def test_split_by_pred(self, instances, labels, inst0, lbl0, inst1, lbl1):
        ec = ExtendedCollection(instances, labels, classes=range(0, 4))
        [ec0, ec1] = ec.split_by_pred()
        if isinstance(instances, np.ndarray):
            assert np.array_equal(ec0.X, inst0)
            assert np.array_equal(ec1.X, inst1)
        if isinstance(instances, sp.csr_matrix):
            assert (ec0.X != inst0).nnz == 0
            assert (ec1.X != inst1).nnz == 0
        assert np.array_equal(ec0.y, lbl0)
        assert np.array_equal(ec1.y, lbl1)
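
The expected label pairs are consistent with an extended label encoding of the form label = true_class * ncl + predicted_class: splitting by the predicted class (the argmax of the posterior columns) and keeping label // ncl reproduces every lbl0/lbl1 above. A quick check of that reading (a hypothetical decoding inferred from the data, not taken from the library):

    import numpy as np

    ncl = 2
    labels = np.asarray([3, 0, 1, 2])  # extended labels of the first case
    true_cls, pred_cls = labels // ncl, labels % ncl
    # rows predicted as class 0 keep true labels [0, 1],
    # rows predicted as class 1 keep true labels [1, 0]
    assert np.array_equal(true_cls[pred_cls == 0], np.asarray([0, 1]))
    assert np.array_equal(true_cls[pred_cls == 1], np.asarray([1, 0]))
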

View File

@@ -1,3 +1,3 @@
class TestDataset:
    pass

View File

@@ -1,12 +1,12 @@
from sklearn.linear_model import LogisticRegression

from quacc.dataset import Dataset
from quacc.evaluation.baseline import kfcv


class TestBaseline:
    def test_kfcv(self):
        spambase = Dataset("spambase", n_prevalences=1).get_raw()
        c_model = LogisticRegression()
        c_model.fit(spambase.train.X, spambase.train.y)
        assert "f1_score" in kfcv(c_model, spambase.validation)

View File

@@ -1,66 +1,66 @@
import numpy as np
import pytest
import scipy.sparse as sp
from sklearn.linear_model import LogisticRegression

from quacc.method.base import BinaryQuantifierAccuracyEstimator


class TestBQAE:
    @pytest.mark.parametrize(
        "instances,preds0,preds1,result",
        [
            (
                np.asarray(
                    [[0, 0.3, 0.7], [1, 0.54, 0.46], [2, 0.28, 0.72], [3, 0.6, 0.4]]
                ),
                np.asarray([0.3, 0.7]),
                np.asarray([0.4, 0.6]),
                np.asarray([0.15, 0.2, 0.35, 0.3]),
            ),
            (
                sp.csr_matrix(
                    [[0, 0.3, 0.7], [1, 0.54, 0.46], [2, 0.28, 0.72], [3, 0.6, 0.4]]
                ),
                np.asarray([0.3, 0.7]),
                np.asarray([0.4, 0.6]),
                np.asarray([0.15, 0.2, 0.35, 0.3]),
            ),
            (
                np.asarray([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
                np.asarray([0.3, 0.7]),
                np.asarray([0.4, 0.6]),
                np.asarray([0.0, 0.4, 0.0, 0.6]),
            ),
            (
                sp.csr_matrix([[0, 0.3, 0.7], [2, 0.28, 0.72]]),
                np.asarray([0.3, 0.7]),
                np.asarray([0.4, 0.6]),
                np.asarray([0.0, 0.4, 0.0, 0.6]),
            ),
            (
                np.asarray([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
                np.asarray([0.3, 0.7]),
                np.asarray([0.4, 0.6]),
                np.asarray([0.3, 0.0, 0.7, 0.0]),
            ),
            (
                sp.csr_matrix([[1, 0.54, 0.46], [3, 0.6, 0.4]]),
                np.asarray([0.3, 0.7]),
                np.asarray([0.4, 0.6]),
                np.asarray([0.3, 0.0, 0.7, 0.0]),
            ),
        ],
    )
    def test_estimate_ndarray(self, mocker, instances, preds0, preds1, result):
        estimator = BinaryQuantifierAccuracyEstimator(LogisticRegression())
        estimator.n_classes = 4
        # patch the two binary quantifiers directly; pytest-mock undoes its
        # patches at test teardown, so no "with" block is needed (and
        # mocker.patch.object does not support use as a context manager)
        mocker.patch.object(estimator.q_model_0, "quantify", return_value=preds0)
        mocker.patch.object(estimator.q_model_1, "quantify", return_value=preds1)
        assert np.array_equal(
            estimator.estimate(instances, ext=True),
            result,
        )
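
The expected vectors follow a simple arithmetic pattern: each binary quantifier's 2-value prevalence estimate is scaled by the share of instances routed to its branch, and the two scaled pairs are interleaved into the 4 extended classes. A check of that pattern against the first case (the helper name is hypothetical; the real combination presumably lives inside estimate):

    import numpy as np

    def combine_estimates_sketch(preds0, preds1, norms):
        # scale each branch's estimate by its share of the instances,
        # then interleave the two pairs into the 4 extended classes
        est = np.empty(4)
        est[0::2] = np.asarray(preds0) * norms[0]
        est[1::2] = np.asarray(preds1) * norms[1]
        return est

    assert np.allclose(
        combine_estimates_sketch([0.3, 0.7], [0.4, 0.6], [0.5, 0.5]),
        [0.15, 0.2, 0.35, 0.3],
    )
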

View File

@@ -1,2 +1,2 @@
class TestMCAE:
    pass