forked from moreo/QuaPy
GridSearchQ adapted to work with generator functions and integrated for the baselines of LeQua2022; some tests with SVD
This commit is contained in:
parent
9a08125e7e
commit
a7e87e41f8
|
@ -1,9 +1,13 @@
|
||||||
1. los test hay que hacerlos suponiendo que las etiquetas no existen, es decir, viendo los resultados en los ficheros "prevalences" (renominar)
|
|
||||||
2. tablas?
|
2. tablas?
|
||||||
3. fetch dataset (download, unzip, etc.)
|
3. fetch dataset (download, unzip, etc.)
|
||||||
4. model selection
|
4. model selection
|
||||||
5. plots
|
5. plots
|
||||||
6. estoy leyendo los samples en orden, y no hace falta. Sería mejor una función genérica que lee todos los ejemplos y
|
|
||||||
que de todos modos genera un output con el mismo nombre del file
|
|
||||||
7. Make ResultSubmission class abstract, and create 4 instances thus forcing the field task_name to be set correctly
|
|
||||||
8. No me convence que la lectura de los samples (caso en que no hay ground truth) viene en orden aleatorio
|
8. No me convence que la lectura de los samples (caso en que no hay ground truth) viene en orden aleatorio
|
||||||
|
9. Experimentar con vectores densos (PCA sobre tfidf por ejemplo)
|
||||||
|
10. Si cambiamos el formato de los samples (por ejemplo, en lugar de svmlight con .txt a PCA con .dat) hay que cambiar
|
||||||
|
cosas en el código. Está escrito varias veces un glob(*.txt)
|
||||||
|
11. Quitar las categorías como columnas de los ficheros de prevalences
|
||||||
|
12. sample_size cannot be set to a non-integer in GridSearchQ with protocol="gen" (it could, but is not indicated in doc)
|
||||||
|
13. repair doc of GridSearchQ
|
||||||
|
14. reparar la calibración en LR (lo tuve que quitar para que funcionara GridSearchQ, y lo quité en todos los ficheros)
|
||||||
|
15. podría poner que el eval_budget se usase en GridSearchQ con generator function para el progress bar de tqdm
|
|
@ -0,0 +1,84 @@
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from tqdm import tqdm
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.data import LabelledCollection
|
||||||
|
from quapy.method.aggregative import *
|
||||||
|
import quapy.functional as F
|
||||||
|
from data import *
|
||||||
|
import os
|
||||||
|
import constants
|
||||||
|
|
||||||
|
from sklearn.decomposition import TruncatedSVD
|
||||||
|
|
||||||
|
|
||||||
|
# LeQua official baselines for task T1A (Binary/Vector)
# =====================================================
# Variant in which the training matrix and every dev sample are projected onto
# 300 components with TruncatedSVD before being handed to the quantifiers.

# output folders (created if missing): predicted prevalences and pickled models
predictions_path = os.path.join('predictions', 'T1A')
os.makedirs(predictions_path, exist_ok=True)

models_path = os.path.join('models', 'T1A')
os.makedirs(models_path, exist_ok=True)

pathT1A = './data/T1A/public'
T1A_devvectors_path = os.path.join(pathT1A, 'dev_vectors')
T1A_devprevalence_path = os.path.join(pathT1A, 'dev_prevalences.csv')
T1A_trainpath = os.path.join(pathT1A, 'training_vectors.txt')

train = LabelledCollection.load(T1A_trainpath, load_binary_vectors)
# nF is read BEFORE the projection on purpose: the dev samples are loaded with the
# original number of features and only afterwards reduced with the same fitted SVD
nF = train.instances.shape[1]
svd = TruncatedSVD(n_components=300)
train.instances = svd.fit_transform(train.instances)

qp.environ['SAMPLE_SIZE'] = constants.T1A_SAMPLE_SIZE

print(f'number of classes: {len(train.classes_)}')
print(f'number of training documents: {len(train)}')
print(f'training prevalence: {F.strprev(train.prevalence())}')
print(f'training matrix shape: {train.instances.shape}')

# ground-truth prevalences of the dev samples; used for the progress-bar total and for the final evaluation
true_prevalence = ResultSubmission.load(T1A_devprevalence_path)

for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]:

    # TODO: restore CalibratedClassifierCV(LogisticRegression()); calibration was
    # removed because it prevented GridSearchQ from working
    classifier = LogisticRegression()
    model = quantifier(classifier).fit(train)
    quantifier_name = model.__class__.__name__

    predictions = ResultSubmission(categories=['negative', 'positive'])
    for samplename, sample in tqdm(gen_load_samples_T1(T1A_devvectors_path, nF),
                                   desc=quantifier_name, total=len(true_prevalence)):
        # each sample must go through the SVD fitted on the training set
        sample = svd.transform(sample)
        predictions.add(samplename, model.quantify(sample))

    predictions.dump(os.path.join(predictions_path, quantifier_name + '.svd.csv'))

    # NOTE(review): only the quantifier is pickled; the fitted `svd` is not saved, so the
    # stored model expects inputs that were already projected with this very SVD
    model_file = os.path.join(models_path, quantifier_name + '.svd.pkl')
    # the context manager guarantees the file is flushed and closed (the handle was previously leaked)
    with open(model_file, 'wb') as fout:
        pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)

    mae, mrae = evaluate_submission(true_prevalence, predictions)
    print(f'{quantifier_name} mae={mae:.3f} mrae={mrae:.3f}')
|
||||||
|
|
||||||
|
"""
|
||||||
|
test:
|
||||||
|
CC 0.1859 1.5406
|
||||||
|
ACC 0.0453 0.2840
|
||||||
|
PCC 0.1793 1.7187
|
||||||
|
PACC 0.0287 0.1494
|
||||||
|
EMQ 0.0225 0.1020
|
||||||
|
HDy 0.0631 0.2307
|
||||||
|
|
||||||
|
validation
|
||||||
|
CC 0.1862 1.9587
|
||||||
|
ACC 0.0394 0.2669
|
||||||
|
PCC 0.1789 2.1383
|
||||||
|
PACC 0.0354 0.1587
|
||||||
|
EMQ 0.0224 0.0960
|
||||||
|
HDy 0.0467 0.2121
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
|
@ -13,9 +13,16 @@ from data import *
|
||||||
import os
|
import os
|
||||||
import constants
|
import constants
|
||||||
|
|
||||||
predictions_path = os.path.join('predictions', 'T1A') # binary - vector
|
|
||||||
|
# LeQua official baselines for task T1A (Binary/Vector)
|
||||||
|
# =====================================================
|
||||||
|
|
||||||
|
predictions_path = os.path.join('predictions', 'T1A')
|
||||||
os.makedirs(predictions_path, exist_ok=True)
|
os.makedirs(predictions_path, exist_ok=True)
|
||||||
|
|
||||||
|
models_path = os.path.join('models', 'T1A')
|
||||||
|
os.makedirs(models_path, exist_ok=True)
|
||||||
|
|
||||||
pathT1A = './data/T1A/public'
|
pathT1A = './data/T1A/public'
|
||||||
T1A_devvectors_path = os.path.join(pathT1A, 'dev_vectors')
|
T1A_devvectors_path = os.path.join(pathT1A, 'dev_vectors')
|
||||||
T1A_devprevalence_path = os.path.join(pathT1A, 'dev_prevalences.csv')
|
T1A_devprevalence_path = os.path.join(pathT1A, 'dev_prevalences.csv')
|
||||||
|
@ -35,16 +42,19 @@ true_prevalence = ResultSubmission.load(T1A_devprevalence_path)
|
||||||
|
|
||||||
for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]:
|
for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]:
|
||||||
|
|
||||||
classifier = CalibratedClassifierCV(LogisticRegression())
|
# classifier = CalibratedClassifierCV(LogisticRegression(C=1))
|
||||||
|
classifier = LogisticRegression(C=1)
|
||||||
model = quantifier(classifier).fit(train)
|
model = quantifier(classifier).fit(train)
|
||||||
quantifier_name = model.__class__.__name__
|
quantifier_name = model.__class__.__name__
|
||||||
|
|
||||||
predictions = ResultSubmission(categories=['negative', 'positive'])
|
predictions = ResultSubmission(categories=['negative', 'positive'])
|
||||||
for samplename, sample in tqdm(gen_load_samples_T1A(T1A_devvectors_path, nF),
|
for samplename, sample in tqdm(gen_load_samples_T1(T1A_devvectors_path, nF),
|
||||||
desc=quantifier_name, total=len(true_prevalence)):
|
desc=quantifier_name, total=len(true_prevalence)):
|
||||||
predictions.add(samplename, model.quantify(sample))
|
predictions.add(samplename, model.quantify(sample))
|
||||||
|
|
||||||
predictions.dump(os.path.join(predictions_path, quantifier_name + '.csv'))
|
predictions.dump(os.path.join(predictions_path, quantifier_name + '.csv'))
|
||||||
|
pickle.dump(model, open(os.path.join(models_path, quantifier_name+'.pkl'), 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
|
||||||
|
|
||||||
mae, mrae = evaluate_submission(true_prevalence, predictions)
|
mae, mrae = evaluate_submission(true_prevalence, predictions)
|
||||||
print(f'{quantifier_name} mae={mae:.3f} mrae={mrae:.3f}')
|
print(f'{quantifier_name} mae={mae:.3f} mrae={mrae:.3f}')
|
||||||
|
|
|
@ -0,0 +1,91 @@
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from tqdm import tqdm
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.data import LabelledCollection
|
||||||
|
from quapy.method.aggregative import *
|
||||||
|
import quapy.functional as F
|
||||||
|
from data import *
|
||||||
|
import os
|
||||||
|
import constants
|
||||||
|
|
||||||
|
|
||||||
|
# LeQua official baselines for task T1A (Binary/Vector)
# =====================================================
# Model selection: the hyperparameters of the classifier are optimized with
# GridSearchQ, using the dev samples (served by a generator function) as validation.

# output folders (created if missing)
predictions_path = os.path.join('predictions', 'T1A')
os.makedirs(predictions_path, exist_ok=True)

models_path = os.path.join('models', 'T1A')
os.makedirs(models_path, exist_ok=True)

pathT1A = './data/T1A/public'
T1A_devvectors_path = os.path.join(pathT1A, 'dev_vectors')
T1A_devprevalence_path = os.path.join(pathT1A, 'dev_prevalences.csv')
T1A_trainpath = os.path.join(pathT1A, 'training_vectors.txt')

train = LabelledCollection.load(T1A_trainpath, load_binary_vectors)
nF = train.instances.shape[1]

qp.environ['SAMPLE_SIZE'] = constants.T1A_SAMPLE_SIZE

print(f'number of classes: {len(train.classes_)}')
print(f'number of training documents: {len(train)}')
print(f'training prevalence: {F.strprev(train.prevalence())}')
print(f'training matrix shape: {train.instances.shape}')

# NOTE(review): loaded but not used in the rest of this script
true_prevalence = ResultSubmission.load(T1A_devprevalence_path)

# hyperparameters explored for the underlying LogisticRegression
param_grid = {'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]}


def gen_samples():
    """Return a fresh generator over the dev samples.

    Each item is a pair (sample, prevalence); filenames are omitted
    (return_filename=False), which is the shape GridSearchQ consumes
    when protocol='gen'.
    """
    return gen_load_samples_T1(T1A_devvectors_path, nF, ground_truth_path=T1A_devprevalence_path, return_filename=False)


for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]:
    # TODO: restore CalibratedClassifierCV(LogisticRegression(), n_jobs=-1); calibration
    # was removed because it prevented GridSearchQ from working
    classifier = LogisticRegression()
    model = quantifier(classifier)
    print(f'{model.__class__.__name__}: Model selection')
    model = qp.model_selection.GridSearchQ(
        model,
        param_grid,
        sample_size=None,  # not needed with protocol='gen': samples come from the generator
        protocol='gen',
        error=qp.error.mae,
        refit=False,
        verbose=True
    ).fit(train, gen_samples)

    quantifier_name = model.best_model().__class__.__name__
    print(f'{quantifier_name} mae={model.best_score_:.3f} (params: {model.best_params_})')

    model_file = os.path.join(models_path, quantifier_name + '.modsel.pkl')
    # the context manager guarantees the file is flushed and closed (the handle was previously leaked)
    with open(model_file, 'wb') as fout:
        pickle.dump(model.best_model(), fout, protocol=pickle.HIGHEST_PROTOCOL)
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
test:
|
||||||
|
CC 0.1859 1.5406
|
||||||
|
ACC 0.0453 0.2840
|
||||||
|
PCC 0.1793 1.7187
|
||||||
|
PACC 0.0287 0.1494
|
||||||
|
EMQ 0.0225 0.1020
|
||||||
|
HDy 0.0631 0.2307
|
||||||
|
|
||||||
|
validation
|
||||||
|
CC 0.1862 1.9587
|
||||||
|
ACC 0.0394 0.2669
|
||||||
|
PCC 0.1789 2.1383
|
||||||
|
PACC 0.0354 0.1587
|
||||||
|
EMQ 0.0224 0.0960
|
||||||
|
HDy 0.0467 0.2121
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,55 @@
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from tqdm import tqdm
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
from quapy.data import LabelledCollection
|
||||||
|
from quapy.method.aggregative import *
|
||||||
|
import quapy.functional as F
|
||||||
|
from data import *
|
||||||
|
import os
|
||||||
|
import constants
|
||||||
|
|
||||||
|
# LeQua baselines for task T1B (multiclass - vector)

predictions_path = os.path.join('predictions', 'T1B')
os.makedirs(predictions_path, exist_ok=True)

# locations of the public T1B data
pathT1B = './data/T1B/public'
T1B_devvectors_path, T1B_devprevalence_path, T1B_trainpath, T1B_catmap = [
    os.path.join(pathT1B, resource)
    for resource in (
        'dev_vectors',
        'dev_prevalences.csv',
        'training_vectors.txt',
        'training_vectors_label_map.txt',
    )
]

train = LabelledCollection.load(T1B_trainpath, load_binary_vectors)
nF = train.instances.shape[1]  # number of features; the dev samples are loaded with the same width

qp.environ['SAMPLE_SIZE'] = constants.T1B_SAMPLE_SIZE

print(f'number of classes: {len(train.classes_)}')
print(f'number of training documents: {len(train)}')
print(f'training prevalence: {F.strprev(train.prevalence())}')
print(f'training matrix shape: {train.instances.shape}')

true_prevalence = ResultSubmission.load(T1B_devprevalence_path)

# `categories` gives the column names of the submission
cat2code, categories = load_category_map(T1B_catmap)

for quantifier_class in [PACC]:  # [CC, ACC, PCC, PACC, EMQ]:

    base_learner = CalibratedClassifierCV(LogisticRegression())
    model = quantifier_class(base_learner).fit(train)
    quantifier_name = type(model).__name__

    predictions = ResultSubmission(categories=categories)
    dev_samples = gen_load_samples_T1(T1B_devvectors_path, nF)
    progress = tqdm(dev_samples, desc=quantifier_name, total=len(true_prevalence))
    for samplename, sample in progress:
        estimated_prevalence = model.quantify(sample)
        predictions.add(samplename, estimated_prevalence)

    predictions_file = os.path.join(predictions_path, f'{quantifier_name}.csv')
    predictions.dump(predictions_file)

    mae, mrae = evaluate_submission(true_prevalence, predictions)
    print(f'{quantifier_name} mae={mae:.3f} mrae={mrae:.3f}')
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -2,5 +2,6 @@ DEV_SAMPLES = 1000
|
||||||
TEST_SAMPLES = 5000
|
TEST_SAMPLES = 5000
|
||||||
|
|
||||||
T1A_SAMPLE_SIZE = 250
|
T1A_SAMPLE_SIZE = 250
|
||||||
|
T1B_SAMPLE_SIZE = 1000
|
||||||
|
|
||||||
ERROR_TOL=1E-3
|
ERROR_TOL = 1E-3
|
||||||
|
|
|
@ -26,40 +26,45 @@ import constants
|
||||||
def load_category_map(path):
|
def load_category_map(path):
|
||||||
cat2code = {}
|
cat2code = {}
|
||||||
with open(path, 'rt') as fin:
|
with open(path, 'rt') as fin:
|
||||||
category, code = fin.readline().split()
|
for line in fin:
|
||||||
cat2code[category] = int(code)
|
category, code = line.split()
|
||||||
return cat2code
|
cat2code[category] = int(code)
|
||||||
|
code2cat = [cat for cat, code in sorted(cat2code.items(), key=lambda x:x[1])]
|
||||||
|
return cat2code, code2cat
|
||||||
|
|
||||||
|
|
||||||
def load_binary_vectors(path, nF=None):
|
def load_binary_vectors(path, nF=None):
|
||||||
return sklearn.datasets.load_svmlight_file(path, n_features=nF)
|
return sklearn.datasets.load_svmlight_file(path, n_features=nF)
|
||||||
|
|
||||||
|
|
||||||
def __gen_load_samples_with_groudtruth(path_dir:str, ground_truth_path:str, load_fn, **load_kwargs):
|
def __gen_load_samples_with_groudtruth(path_dir:str, return_filename:bool, ground_truth_path:str, load_fn, **load_kwargs):
|
||||||
true_prevs = ResultSubmission.load(ground_truth_path)
|
true_prevs = ResultSubmission.load(ground_truth_path)
|
||||||
for filename, prevalence in true_prevs.iterrows():
|
for filename, prevalence in true_prevs.iterrows():
|
||||||
sample, _ = load_fn(os.path.join(path_dir, filename), **load_kwargs)
|
sample, _ = load_fn(os.path.join(path_dir, filename), **load_kwargs)
|
||||||
yield filename, sample, prevalence
|
if return_filename:
|
||||||
|
yield filename, sample, prevalence
|
||||||
|
else:
|
||||||
|
yield sample, prevalence
|
||||||
|
|
||||||
|
|
||||||
def __gen_load_samples_without_groudtruth(path_dir:str, load_fn, **load_kwargs):
|
def __gen_load_samples_without_groudtruth(path_dir:str, return_filename:bool, load_fn, **load_kwargs):
|
||||||
for filepath in glob(os.path.join(path_dir, '*_sample_*.txt')):
|
for filepath in glob(os.path.join(path_dir, '*_sample_*.txt')):
|
||||||
sample, _ = load_fn(filepath, **load_kwargs)
|
sample, _ = load_fn(filepath, **load_kwargs)
|
||||||
yield os.path.basename(filepath), sample
|
if return_filename:
|
||||||
|
yield os.path.basename(filepath), sample
|
||||||
|
else:
|
||||||
|
yield sample
|
||||||
|
|
||||||
|
|
||||||
def gen_load_samples_T1A(path_dir:str, nF:int, ground_truth_path:str = None):
|
def gen_load_samples_T1(path_dir:str, nF:int, ground_truth_path:str = None, return_filename=True):
|
||||||
if ground_truth_path is None:
|
if ground_truth_path is None:
|
||||||
for filename, sample in __gen_load_samples_without_groudtruth(path_dir, load_binary_vectors, nF=nF):
|
# the generator function returns tuples (filename:str, sample:csr_matrix)
|
||||||
yield filename, sample
|
gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_filename, load_binary_vectors, nF=nF)
|
||||||
else:
|
else:
|
||||||
for filename, sample, prevalence in __gen_load_samples_with_groudtruth(path_dir, ground_truth_path, load_binary_vectors, nF=nF):
|
# the generator function returns tuples (filename:str, sample:csr_matrix, prevalence:ndarray)
|
||||||
yield filename, sample, prevalence
|
gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_filename, ground_truth_path, load_binary_vectors, nF=nF)
|
||||||
|
for r in gen_fn:
|
||||||
|
yield r
|
||||||
def gen_load_samples_T1B(path_dir:str, ground_truth_path:str = None):
|
|
||||||
# for ... : yield
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def gen_load_samples_T2A(path_dir:str, ground_truth_path:str = None):
|
def gen_load_samples_T2A(path_dir:str, ground_truth_path:str = None):
|
||||||
|
|
|
@ -9,6 +9,7 @@ from quapy.method.base import BaseQuantifier
|
||||||
from quapy.util import temp_seed
|
from quapy.util import temp_seed
|
||||||
import quapy.functional as F
|
import quapy.functional as F
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import inspect
|
||||||
|
|
||||||
|
|
||||||
def artificial_prevalence_prediction(
|
def artificial_prevalence_prediction(
|
||||||
|
@ -78,6 +79,27 @@ def natural_prevalence_prediction(
|
||||||
return _predict_from_indexes(indexes, model, test, n_jobs, verbose)
|
return _predict_from_indexes(indexes, model, test, n_jobs, verbose)
|
||||||
|
|
||||||
|
|
||||||
|
def gen_prevalence_prediction(model: BaseQuantifier, gen_fn: Callable, eval_budget=None):
    """
    Collects the true and the estimated prevalence values for the samples produced by a generator function.

    :param model: a quantifier exposing `quantify(instances)`; it is used as-is (it is not fitted here)
    :param gen_fn: a callable taking no arguments that returns a generator yielding pairs
        `(sample_instances, true_prevalence)`
    :param eval_budget: maximum number of samples to evaluate. Any value that is not a positive integer
        (including the default None) means "no limit", i.e., the generator is consumed until exhaustion
    :return: a tuple `(true_prevalences, estim_prevalences)` of np.ndarray built from the values gathered
        for each evaluated sample (first axis indexes the samples)
    :raises ValueError: if `gen_fn()` does not return a generator
    """
    # the generator is created once and reused below (no second call to gen_fn)
    samples = gen_fn()
    if not inspect.isgenerator(samples):
        raise ValueError('param "gen_fn" must be a callable returning a generator')

    # only a positive integer limits the evaluation; everything else disables the budget
    max_samples = eval_budget if isinstance(eval_budget, int) and eval_budget > 0 else None

    true_prevalences, estim_prevalences = [], []
    for n_evaluated, (sample_instances, true_prev) in enumerate(samples, start=1):
        true_prevalences.append(true_prev)
        estim_prevalences.append(model.quantify(sample_instances))
        if n_evaluated == max_samples:
            break

    true_prevalences = np.asarray(true_prevalences)
    estim_prevalences = np.asarray(estim_prevalences)

    return true_prevalences, estim_prevalences
|
||||||
|
|
||||||
|
|
||||||
def _predict_from_indexes(
|
def _predict_from_indexes(
|
||||||
indexes,
|
indexes,
|
||||||
model: BaseQuantifier,
|
model: BaseQuantifier,
|
||||||
|
|
|
@ -5,8 +5,9 @@ from typing import Union, Callable
|
||||||
|
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
from quapy.data.base import LabelledCollection
|
from quapy.data.base import LabelledCollection
|
||||||
from quapy.evaluation import artificial_prevalence_prediction, natural_prevalence_prediction
|
from quapy.evaluation import artificial_prevalence_prediction, natural_prevalence_prediction, gen_prevalence_prediction
|
||||||
from quapy.method.aggregative import BaseQuantifier
|
from quapy.method.aggregative import BaseQuantifier
|
||||||
|
import inspect
|
||||||
|
|
||||||
|
|
||||||
class GridSearchQ(BaseQuantifier):
|
class GridSearchQ(BaseQuantifier):
|
||||||
|
@ -74,8 +75,10 @@ class GridSearchQ(BaseQuantifier):
|
||||||
self.timeout = timeout
|
self.timeout = timeout
|
||||||
self.verbose = verbose
|
self.verbose = verbose
|
||||||
self.__check_error(error)
|
self.__check_error(error)
|
||||||
assert self.protocol in {'app', 'npp'}, \
|
assert self.protocol in {'app', 'npp', 'gen'}, \
|
||||||
'unknown protocol; valid ones are "app" or "npp" for the "artificial" or the "natural" prevalence protocols'
|
'unknown protocol: valid ones are "app" or "npp" for the "artificial" or the "natural" prevalence ' \
|
||||||
|
'protocols. Use protocol="gen" when passing a generator function thorough val_split that yields a ' \
|
||||||
|
'sample (instances) and their prevalence (ndarray) at each iteration.'
|
||||||
if self.protocol == 'npp':
|
if self.protocol == 'npp':
|
||||||
if self.n_repetitions is None or self.n_repetitions == 1:
|
if self.n_repetitions is None or self.n_repetitions == 1:
|
||||||
if self.eval_budget is not None:
|
if self.eval_budget is not None:
|
||||||
|
@ -99,9 +102,14 @@ class GridSearchQ(BaseQuantifier):
|
||||||
assert 0. < validation < 1., 'validation proportion should be in (0,1)'
|
assert 0. < validation < 1., 'validation proportion should be in (0,1)'
|
||||||
training, validation = training.split_stratified(train_prop=1 - validation)
|
training, validation = training.split_stratified(train_prop=1 - validation)
|
||||||
return training, validation
|
return training, validation
|
||||||
|
elif self.protocol=='gen' and inspect.isgenerator(validation()):
|
||||||
|
return training, validation
|
||||||
else:
|
else:
|
||||||
raise ValueError(f'"validation" must either be a LabelledCollection or a float in (0,1) indicating the'
|
raise ValueError(f'"validation" must either be a LabelledCollection or a float in (0,1) indicating the'
|
||||||
f'proportion of training documents to extract (type found: {type(validation)})')
|
f'proportion of training documents to extract (type found: {type(validation)}). '
|
||||||
|
f'Optionally, "validation" can be a callable function returning a generator that yields '
|
||||||
|
f'the sample instances along with their true prevalence at each iteration by '
|
||||||
|
f'setting protocol="gen".')
|
||||||
|
|
||||||
def __check_error(self, error):
|
def __check_error(self, error):
|
||||||
if error in qp.error.QUANTIFICATION_ERROR:
|
if error in qp.error.QUANTIFICATION_ERROR:
|
||||||
|
@ -132,6 +140,8 @@ class GridSearchQ(BaseQuantifier):
|
||||||
return natural_prevalence_prediction(
|
return natural_prevalence_prediction(
|
||||||
model, val_split, self.sample_size,
|
model, val_split, self.sample_size,
|
||||||
**commons)
|
**commons)
|
||||||
|
elif self.protocol == 'gen':
|
||||||
|
return gen_prevalence_prediction(model, gen_fn=val_split, eval_budget=self.eval_budget)
|
||||||
else:
|
else:
|
||||||
raise ValueError('unknown protocol')
|
raise ValueError('unknown protocol')
|
||||||
|
|
||||||
|
@ -144,7 +154,8 @@ class GridSearchQ(BaseQuantifier):
|
||||||
if val_split is None:
|
if val_split is None:
|
||||||
val_split = self.val_split
|
val_split = self.val_split
|
||||||
training, val_split = self.__check_training_validation(training, val_split)
|
training, val_split = self.__check_training_validation(training, val_split)
|
||||||
assert isinstance(self.sample_size, int) and self.sample_size > 0, 'sample_size must be a positive integer'
|
if self.protocol != 'gen':
|
||||||
|
assert isinstance(self.sample_size, int) and self.sample_size > 0, 'sample_size must be a positive integer'
|
||||||
|
|
||||||
params_keys = list(self.param_grid.keys())
|
params_keys = list(self.param_grid.keys())
|
||||||
params_values = list(self.param_grid.values())
|
params_values = list(self.param_grid.values())
|
||||||
|
@ -192,8 +203,6 @@ class GridSearchQ(BaseQuantifier):
|
||||||
raise TimeoutError('all jobs took more than the timeout time to end')
|
raise TimeoutError('all jobs took more than the timeout time to end')
|
||||||
|
|
||||||
self.sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f})')
|
self.sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f})')
|
||||||
# model.set_params(**self.best_params_)
|
|
||||||
# self.best_model_ = deepcopy(model)
|
|
||||||
|
|
||||||
if self.refit:
|
if self.refit:
|
||||||
self.sout(f'refitting on the whole development set')
|
self.sout(f'refitting on the whole development set')
|
||||||
|
@ -203,11 +212,11 @@ class GridSearchQ(BaseQuantifier):
|
||||||
|
|
||||||
def quantify(self, instances):
|
def quantify(self, instances):
|
||||||
assert hasattr(self, 'best_model_'), 'quantify called before fit'
|
assert hasattr(self, 'best_model_'), 'quantify called before fit'
|
||||||
return self.best_model_.quantify(instances)
|
return self.best_model().quantify(instances)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def classes_(self):
|
def classes_(self):
|
||||||
return self.best_model_.classes_
|
return self.best_model().classes_
|
||||||
|
|
||||||
def set_params(self, **parameters):
|
def set_params(self, **parameters):
|
||||||
self.param_grid = parameters
|
self.param_grid = parameters
|
||||||
|
|
Loading…
Reference in New Issue