merged
This commit is contained in:
commit
4cd47cdf9f
|
@ -1,8 +1,13 @@
|
|||
1. los test hay que hacerlos suponiendo que las etiquetas no existen, es decir, viendo los resultados en los ficheros "prevalences" (renominar)
|
||||
2. tablas?
|
||||
3. fetch dataset (download, unzip, etc.)
|
||||
4. model selection
|
||||
5. plots
|
||||
6. estoy leyendo los samples en orden, y no hace falta. Sería mejor una función genérica que lee todos los ejemplos y
|
||||
que de todos modos genera un output con el mismo nombre del file
|
||||
7. Make ResultSubmission class abstract, and create 4 instances thus forcing the field task_name to be set correctly
|
||||
8. No me convence que la lectura de los samples (caso en que no hay ground truth) viene en orden aleatorio
|
||||
9. Experimentar con vectores densos (PCA sobre tfidf por ejemplo)
|
||||
10. Si cambiamos el formato de los samples (por ejemplo, en lugar de svmlight con .txt a PCA con .dat) hay que cambiar
|
||||
cosas en el código. Está escrito varias veces un glob(*.txt)
|
||||
11. Quitar las categorias como columnas de los ficheros de prevalences
|
||||
12. sample_size cannot be set to a non-integer in GridSearchQ with protocol="gen" (it could, but this is not indicated in the doc)
|
||||
13. repair doc of GridSearchQ
|
||||
14. reparar la calibracion en LR (lo tuve que quitar para que funcionara GridSearchQ, y lo quité en todos los ficheros)
|
||||
15. podria poner que el eval_budget se usase en GridSearchQ con generator function para el progress bar de tqdm
|
|
@ -0,0 +1,84 @@
|
|||
import pickle
|
||||
|
||||
import numpy as np
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
|
||||
import quapy as qp
|
||||
from quapy.data import LabelledCollection
|
||||
from quapy.method.aggregative import *
|
||||
import quapy.functional as F
|
||||
from data import *
|
||||
import os
|
||||
import constants
|
||||
|
||||
from sklearn.decomposition import TruncatedSVD
|
||||
|
||||
|
||||
# LeQua official baselines for task T1A (Binary/Vector)
|
||||
# =====================================================
|
||||
|
||||
predictions_path = os.path.join('predictions', 'T1A')
|
||||
os.makedirs(predictions_path, exist_ok=True)
|
||||
|
||||
models_path = os.path.join('models', 'T1A')
|
||||
os.makedirs(models_path, exist_ok=True)
|
||||
|
||||
pathT1A = './data/T1A/public'
|
||||
T1A_devvectors_path = os.path.join(pathT1A, 'dev_vectors')
|
||||
T1A_devprevalence_path = os.path.join(pathT1A, 'dev_prevalences.csv')
|
||||
T1A_trainpath = os.path.join(pathT1A, 'training_vectors.txt')
|
||||
|
||||
train = LabelledCollection.load(T1A_trainpath, load_binary_vectors)
|
||||
nF = train.instances.shape[1]
|
||||
svd = TruncatedSVD(n_components=300)
|
||||
train.instances = svd.fit_transform(train.instances)
|
||||
|
||||
qp.environ['SAMPLE_SIZE'] = constants.T1A_SAMPLE_SIZE
|
||||
|
||||
print(f'number of classes: {len(train.classes_)}')
|
||||
print(f'number of training documents: {len(train)}')
|
||||
print(f'training prevalence: {F.strprev(train.prevalence())}')
|
||||
print(f'training matrix shape: {train.instances.shape}')
|
||||
|
||||
true_prevalence = ResultSubmission.load(T1A_devprevalence_path)
|
||||
|
||||
# Train every baseline quantifier on the SVD-reduced training set, predict the
# prevalence of each dev sample, store predictions and model, and report errors.
for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]:

    # calibrated variant, currently disabled
    # classifier = CalibratedClassifierCV(LogisticRegression())
    classifier = LogisticRegression()
    model = quantifier(classifier).fit(train)
    quantifier_name = model.__class__.__name__

    predictions = ResultSubmission(categories=['negative', 'positive'])
    for samplename, sample in tqdm(gen_load_samples_T1(T1A_devvectors_path, nF),
                                   desc=quantifier_name, total=len(true_prevalence)):
        # samples are loaded in the original feature space and must be projected
        # with the same SVD that was fitted on the training set
        sample = svd.transform(sample)
        predictions.add(samplename, model.quantify(sample))

    predictions.dump(os.path.join(predictions_path, quantifier_name + '.svd.csv'))

    # NOTE(review): only the quantifier is pickled; the fitted `svd` is not stored
    # alongside it -- confirm that consumers of this file can rebuild the projection
    # use a context manager so that the file handle is flushed and closed
    with open(os.path.join(models_path, quantifier_name+'.svd.pkl'), 'wb') as fout:
        pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)

    mae, mrae = evaluate_submission(true_prevalence, predictions)
    print(f'{quantifier_name} mae={mae:.3f} mrae={mrae:.3f}')
|
||||
|
||||
"""
|
||||
test:
|
||||
CC 0.1859 1.5406
|
||||
ACC 0.0453 0.2840
|
||||
PCC 0.1793 1.7187
|
||||
PACC 0.0287 0.1494
|
||||
EMQ 0.0225 0.1020
|
||||
HDy 0.0631 0.2307
|
||||
|
||||
validation
|
||||
CC 0.1862 1.9587
|
||||
ACC 0.0394 0.2669
|
||||
PCC 0.1789 2.1383
|
||||
PACC 0.0354 0.1587
|
||||
EMQ 0.0224 0.0960
|
||||
HDy 0.0467 0.2121
|
||||
"""
|
||||
|
||||
|
|
@ -0,0 +1,79 @@
|
|||
import pickle
|
||||
|
||||
import numpy as np
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
|
||||
import quapy as qp
|
||||
from quapy.data import LabelledCollection
|
||||
from quapy.method.aggregative import *
|
||||
import quapy.functional as F
|
||||
from data import *
|
||||
import os
|
||||
import constants
|
||||
|
||||
|
||||
# LeQua official baselines for task T1A (Binary/Vector)
|
||||
# =====================================================
|
||||
|
||||
predictions_path = os.path.join('predictions', 'T1A')
|
||||
os.makedirs(predictions_path, exist_ok=True)
|
||||
|
||||
models_path = os.path.join('models', 'T1A')
|
||||
os.makedirs(models_path, exist_ok=True)
|
||||
|
||||
pathT1A = './data/T1A/public'
|
||||
T1A_devvectors_path = os.path.join(pathT1A, 'dev_vectors')
|
||||
T1A_devprevalence_path = os.path.join(pathT1A, 'dev_prevalences.csv')
|
||||
T1A_trainpath = os.path.join(pathT1A, 'training_vectors.txt')
|
||||
|
||||
train = LabelledCollection.load(T1A_trainpath, load_binary_vectors)
|
||||
nF = train.instances.shape[1]
|
||||
|
||||
qp.environ['SAMPLE_SIZE'] = constants.T1A_SAMPLE_SIZE
|
||||
|
||||
print(f'number of classes: {len(train.classes_)}')
|
||||
print(f'number of training documents: {len(train)}')
|
||||
print(f'training prevalence: {F.strprev(train.prevalence())}')
|
||||
print(f'training matrix shape: {train.instances.shape}')
|
||||
|
||||
true_prevalence = ResultSubmission.load(T1A_devprevalence_path)
|
||||
|
||||
# Train every baseline quantifier on the training set, predict the prevalence of
# each dev sample, store predictions and model, and report the errors.
for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]:

    # calibrated variant, currently disabled
    # classifier = CalibratedClassifierCV(LogisticRegression(C=1))
    classifier = LogisticRegression(C=1)
    model = quantifier(classifier).fit(train)
    quantifier_name = model.__class__.__name__

    predictions = ResultSubmission(categories=['negative', 'positive'])
    for samplename, sample in tqdm(gen_load_samples_T1(T1A_devvectors_path, nF),
                                   desc=quantifier_name, total=len(true_prevalence)):
        predictions.add(samplename, model.quantify(sample))

    predictions.dump(os.path.join(predictions_path, quantifier_name + '.csv'))

    # use a context manager so that the file handle is flushed and closed
    with open(os.path.join(models_path, quantifier_name+'.pkl'), 'wb') as fout:
        pickle.dump(model, fout, protocol=pickle.HIGHEST_PROTOCOL)

    mae, mrae = evaluate_submission(true_prevalence, predictions)
    print(f'{quantifier_name} mae={mae:.3f} mrae={mrae:.3f}')
|
||||
|
||||
"""
|
||||
test:
|
||||
CC 0.1859 1.5406
|
||||
ACC 0.0453 0.2840
|
||||
PCC 0.1793 1.7187
|
||||
PACC 0.0287 0.1494
|
||||
EMQ 0.0225 0.1020
|
||||
HDy 0.0631 0.2307
|
||||
|
||||
validation
|
||||
CC 0.1862 1.9587
|
||||
ACC 0.0394 0.2669
|
||||
PCC 0.1789 2.1383
|
||||
PACC 0.0354 0.1587
|
||||
EMQ 0.0224 0.0960
|
||||
HDy 0.0467 0.2121
|
||||
"""
|
||||
|
||||
|
|
@ -0,0 +1,91 @@
|
|||
import pickle
|
||||
|
||||
import numpy as np
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
|
||||
import quapy as qp
|
||||
from quapy.data import LabelledCollection
|
||||
from quapy.method.aggregative import *
|
||||
import quapy.functional as F
|
||||
from data import *
|
||||
import os
|
||||
import constants
|
||||
|
||||
|
||||
# LeQua official baselines for task T1A (Binary/Vector)
|
||||
# =====================================================
|
||||
|
||||
predictions_path = os.path.join('predictions', 'T1A')
|
||||
os.makedirs(predictions_path, exist_ok=True)
|
||||
|
||||
models_path = os.path.join('models', 'T1A')
|
||||
os.makedirs(models_path, exist_ok=True)
|
||||
|
||||
pathT1A = './data/T1A/public'
|
||||
T1A_devvectors_path = os.path.join(pathT1A, 'dev_vectors')
|
||||
T1A_devprevalence_path = os.path.join(pathT1A, 'dev_prevalences.csv')
|
||||
T1A_trainpath = os.path.join(pathT1A, 'training_vectors.txt')
|
||||
|
||||
train = LabelledCollection.load(T1A_trainpath, load_binary_vectors)
|
||||
nF = train.instances.shape[1]
|
||||
|
||||
qp.environ['SAMPLE_SIZE'] = constants.T1A_SAMPLE_SIZE
|
||||
|
||||
print(f'number of classes: {len(train.classes_)}')
|
||||
print(f'number of training documents: {len(train)}')
|
||||
print(f'training prevalence: {F.strprev(train.prevalence())}')
|
||||
print(f'training matrix shape: {train.instances.shape}')
|
||||
|
||||
true_prevalence = ResultSubmission.load(T1A_devprevalence_path)
|
||||
|
||||
param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]}
|
||||
|
||||
|
||||
def gen_samples():
    """Create a fresh generator over the T1A development samples.

    The generator is built with the ground-truth prevalence file and with
    ``return_filename=False``; it is handed (uncalled) to ``GridSearchQ.fit``.
    """
    dev_samples = gen_load_samples_T1(
        T1A_devvectors_path,
        nF,
        ground_truth_path=T1A_devprevalence_path,
        return_filename=False,
    )
    return dev_samples
|
||||
|
||||
|
||||
# Run model selection (grid search over `param_grid`, scored by MAE on the dev
# samples produced by `gen_samples`) for every baseline quantifier and store
# the best configuration found.
for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]:
    # calibrated variant, currently disabled
    #classifier = CalibratedClassifierCV(LogisticRegression(), n_jobs=-1)
    classifier = LogisticRegression()
    model = quantifier(classifier)
    print(f'{model.__class__.__name__}: Model selection')
    model = qp.model_selection.GridSearchQ(
        model,
        param_grid,
        sample_size=None,
        protocol='gen',
        error=qp.error.mae,
        refit=False,
        verbose=True
    ).fit(train, gen_samples)

    quantifier_name = model.best_model().__class__.__name__
    print(f'{quantifier_name} mae={model.best_score_:.3f} (params: {model.best_params_})')

    # use a context manager so that the file handle is flushed and closed
    with open(os.path.join(models_path, quantifier_name+'.modsel.pkl'), 'wb') as fout:
        pickle.dump(model.best_model(), fout, protocol=pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
|
||||
"""
|
||||
test:
|
||||
CC 0.1859 1.5406
|
||||
ACC 0.0453 0.2840
|
||||
PCC 0.1793 1.7187
|
||||
PACC 0.0287 0.1494
|
||||
EMQ 0.0225 0.1020
|
||||
HDy 0.0631 0.2307
|
||||
|
||||
validation
|
||||
CC 0.1862 1.9587
|
||||
ACC 0.0394 0.2669
|
||||
PCC 0.1789 2.1383
|
||||
PACC 0.0354 0.1587
|
||||
EMQ 0.0224 0.0960
|
||||
HDy 0.0467 0.2121
|
||||
"""
|
||||
|
||||
|
|
@ -0,0 +1,55 @@
|
|||
import pickle
|
||||
|
||||
import numpy as np
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
|
||||
import quapy as qp
|
||||
from quapy.data import LabelledCollection
|
||||
from quapy.method.aggregative import *
|
||||
import quapy.functional as F
|
||||
from data import *
|
||||
import os
|
||||
import constants
|
||||
|
||||
predictions_path = os.path.join('predictions', 'T1B') # multiclass - vector
|
||||
os.makedirs(predictions_path, exist_ok=True)
|
||||
|
||||
pathT1B = './data/T1B/public'
|
||||
T1B_devvectors_path = os.path.join(pathT1B, 'dev_vectors')
|
||||
T1B_devprevalence_path = os.path.join(pathT1B, 'dev_prevalences.csv')
|
||||
T1B_trainpath = os.path.join(pathT1B, 'training_vectors.txt')
|
||||
T1B_catmap = os.path.join(pathT1B, 'training_vectors_label_map.txt')
|
||||
|
||||
train = LabelledCollection.load(T1B_trainpath, load_binary_vectors)
|
||||
nF = train.instances.shape[1]
|
||||
|
||||
qp.environ['SAMPLE_SIZE'] = constants.T1B_SAMPLE_SIZE
|
||||
|
||||
print(f'number of classes: {len(train.classes_)}')
|
||||
print(f'number of training documents: {len(train)}')
|
||||
print(f'training prevalence: {F.strprev(train.prevalence())}')
|
||||
print(f'training matrix shape: {train.instances.shape}')
|
||||
|
||||
true_prevalence = ResultSubmission.load(T1B_devprevalence_path)
|
||||
|
||||
cat2code, categories = load_category_map(T1B_catmap)
|
||||
|
||||
for quantifier in [PACC]: # [CC, ACC, PCC, PACC, EMQ]:
|
||||
|
||||
classifier = CalibratedClassifierCV(LogisticRegression())
|
||||
model = quantifier(classifier).fit(train)
|
||||
quantifier_name = model.__class__.__name__
|
||||
|
||||
predictions = ResultSubmission(categories=categories)
|
||||
for samplename, sample in tqdm(gen_load_samples_T1(T1B_devvectors_path, nF),
|
||||
desc=quantifier_name, total=len(true_prevalence)):
|
||||
predictions.add(samplename, model.quantify(sample))
|
||||
|
||||
predictions.dump(os.path.join(predictions_path, quantifier_name + '.csv'))
|
||||
mae, mrae = evaluate_submission(true_prevalence, predictions)
|
||||
print(f'{quantifier_name} mae={mae:.3f} mrae={mrae:.3f}')
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
DEV_SAMPLES = 1000
|
||||
TEST_SAMPLES = 5000
|
||||
|
||||
T1A_SAMPLE_SIZE = 250
|
||||
T1B_SAMPLE_SIZE = 1000
|
||||
|
||||
ERROR_TOL = 1E-3
|
|
@ -7,6 +7,9 @@ import quapy as qp
|
|||
import numpy as np
|
||||
import sklearn
|
||||
import re
|
||||
from glob import glob
|
||||
|
||||
import constants
|
||||
|
||||
|
||||
# def load_binary_raw_document(path):
|
||||
|
@ -20,19 +23,48 @@ import re
|
|||
# def load_multiclass_raw_document(path):
|
||||
# return qp.data.from_text(path, verbose=0, class2int=False)
|
||||
|
||||
def load_category_map(path):
    """Read a category map file made of whitespace-separated ``<category> <code>`` lines.

    Blank lines (e.g. a trailing empty line at the end of the file) are skipped.

    :param path: path to the text file containing the category map
    :return: a tuple ``(cat2code, code2cat)`` where ``cat2code`` is a dict mapping each
        category name to its integer code, and ``code2cat`` is the list of category
        names sorted by increasing code
    :raises ValueError: if a non-blank line does not contain exactly two fields or the
        code is not an integer
    """
    cat2code = {}
    with open(path, 'rt') as fin:
        for line in fin:
            if not line.strip():
                # tolerate empty lines instead of failing on the tuple unpacking below
                continue
            category, code = line.split()
            cat2code[category] = int(code)
    code2cat = sorted(cat2code, key=cat2code.get)
    return cat2code, code2cat
|
||||
|
||||
|
||||
def load_binary_vectors(path, nF=None):
    """Load a file in svmlight format.

    :param path: path to the svmlight file
    :param nF: number of features, forwarded as ``n_features``; None leaves it to
        ``load_svmlight_file``
    :return: whatever ``sklearn.datasets.load_svmlight_file`` returns for these
        arguments (callers in this module unpack it as a pair)
    """
    loaded = sklearn.datasets.load_svmlight_file(path, n_features=nF)
    return loaded
|
||||
|
||||
|
||||
def gen_load_samples_T1A(path_dir:str, ground_truth_path:str = None):
|
||||
# for ... : yield
|
||||
pass
|
||||
def __gen_load_samples_with_groudtruth(path_dir:str, return_filename:bool, ground_truth_path:str, load_fn, **load_kwargs):
    """Generate the samples listed in a ground-truth file, together with their true prevalence.

    Samples are visited in the order given by ``ResultSubmission.iterrows`` on the
    ground-truth file.

    :param path_dir: directory containing the sample files
    :param return_filename: if True, yields ``(filename, sample, prevalence)``;
        otherwise yields ``(sample, prevalence)``
    :param ground_truth_path: path of the file loaded with ``ResultSubmission.load``
    :param load_fn: function that loads one sample file; it must return a pair whose
        first element is the sample (the second element is discarded)
    :param load_kwargs: keyword arguments forwarded to ``load_fn``
    """
    ground_truth = ResultSubmission.load(ground_truth_path)
    for sample_name, true_prev in ground_truth.iterrows():
        sample_path = os.path.join(path_dir, sample_name)
        instances, _ = load_fn(sample_path, **load_kwargs)
        yield (sample_name, instances, true_prev) if return_filename else (instances, true_prev)
|
||||
|
||||
|
||||
def gen_load_samples_T1B(path_dir:str, ground_truth_path:str = None):
|
||||
# for ... : yield
|
||||
pass
|
||||
def __gen_load_samples_without_groudtruth(path_dir:str, return_filename:bool, load_fn, **load_kwargs):
    """Generate all the samples found in a directory, in a deterministic order.

    Files matching ``*_sample_*.txt`` are visited sorted by prefix and then by their
    numeric sample id (so that ``dev_sample_2.txt`` precedes ``dev_sample_10.txt``).
    The order returned by ``glob`` is arbitrary, which is unsafe for callers that
    request ``return_filename=False`` and thus cannot tell the samples apart.

    :param path_dir: directory containing the sample files
    :param return_filename: if True, yields ``(filename, sample)``; otherwise yields
        only the sample
    :param load_fn: function that loads one sample file; it must return a pair whose
        first element is the sample (the second element is discarded)
    :param load_kwargs: keyword arguments forwarded to ``load_fn``
    """
    def sample_order(filepath):
        # sort key: (prefix, numeric id); names without a numeric id sort by full name
        name = os.path.basename(filepath)
        match = re.search(r'_sample_(\d+)\.txt$', name)
        if match is None:
            return name, -1
        return name[:match.start()], int(match.group(1))

    for filepath in sorted(glob(os.path.join(path_dir, '*_sample_*.txt')), key=sample_order):
        sample, _ = load_fn(filepath, **load_kwargs)
        if return_filename:
            yield os.path.basename(filepath), sample
        else:
            yield sample
|
||||
|
||||
|
||||
def gen_load_samples_T1(path_dir:str, nF:int, ground_truth_path:str = None, return_filename=True):
    """Generate the samples of a T1 task stored as svmlight files in a directory.

    :param path_dir: directory containing the sample files
    :param nF: number of features, forwarded to ``load_binary_vectors``
    :param ground_truth_path: optional path to the file of true prevalence values;
        when given, every yielded item additionally carries the true prevalence
    :param return_filename: whether the name of the sample file is prepended to
        every yielded item
    :return: a generator; with ground truth it yields ``(filename, sample, prevalence)``
        or ``(sample, prevalence)``, without it ``(filename, sample)`` or ``sample``
    """
    if ground_truth_path is not None:
        # items carry the true prevalence of each sample
        yield from __gen_load_samples_with_groudtruth(
            path_dir, return_filename, ground_truth_path, load_binary_vectors, nF=nF
        )
    else:
        # no ground truth available: only the samples (and optionally their names)
        yield from __gen_load_samples_without_groudtruth(
            path_dir, return_filename, load_binary_vectors, nF=nF
        )
|
||||
|
||||
|
||||
def gen_load_samples_T2A(path_dir:str, ground_truth_path:str = None):
|
||||
|
@ -46,9 +78,6 @@ def gen_load_samples_T2B(path_dir:str, ground_truth_path:str = None):
|
|||
|
||||
|
||||
class ResultSubmission:
|
||||
DEV_LEN = 1000
|
||||
TEST_LEN = 5000
|
||||
ERROR_TOL = 1E-3
|
||||
|
||||
def __init__(self, categories: List[str]):
|
||||
if not isinstance(categories, list) or len(categories) < 2:
|
||||
|
@ -80,9 +109,9 @@ class ResultSubmission:
|
|||
raise ValueError(f'error: wrong shape found for prevalence vector {prevalence_values}')
|
||||
if (prevalence_values<0).any() or (prevalence_values>1).any():
|
||||
raise ValueError(f'error: prevalence values out of range [0,1] for "{sample_name}"')
|
||||
if np.abs(prevalence_values.sum()-1) > ResultSubmission.ERROR_TOL:
|
||||
if np.abs(prevalence_values.sum()-1) > constants.ERROR_TOL:
|
||||
raise ValueError(f'error: prevalence values do not sum up to one for "{sample_name}"'
|
||||
f'(error tolerance {ResultSubmission.ERROR_TOL})')
|
||||
f'(error tolerance {constants.ERROR_TOL})')
|
||||
|
||||
new_entry = dict([('filename',sample_name)]+[(col_i,prev_i) for col_i, prev_i in zip(self.categories, prevalence_values)])
|
||||
self.df = self.df.append(new_entry, ignore_index=True)
|
||||
|
@ -93,7 +122,7 @@ class ResultSubmission:
|
|||
@classmethod
|
||||
def load(cls, path: str) -> 'ResultSubmission':
|
||||
df, inferred_type = ResultSubmission.check_file_format(path, return_inferred_type=True)
|
||||
r = ResultSubmission(categories=df.columns.values.tolist())
|
||||
r = ResultSubmission(categories=df.columns.values[1:].tolist())
|
||||
r.inferred_type = inferred_type
|
||||
r.df = df
|
||||
return r
|
||||
|
@ -102,13 +131,19 @@ class ResultSubmission:
|
|||
ResultSubmission.check_dataframe_format(self.df)
|
||||
self.df.to_csv(path)
|
||||
|
||||
def get(self, sample_name:str):
|
||||
def prevalence(self, sample_name:str):
    """Return the prevalence values stored for a sample.

    :param sample_name: value to look up in the 'filename' column
    :return: a flat array with the values of all columns from the second one
        onwards for the matching row(s), or None if no row matches
    """
    matches = self.df.loc[self.df['filename'] == sample_name]
    if not matches.empty:
        first_category = self.df.columns[1]
        return matches.loc[:, first_category:].values.flatten()
    return None
|
||||
|
||||
def iterrows(self):
    """Iterate over the stored entries.

    :return: a generator of pairs ``(filename, prevalence)``, where ``prevalence``
        is a flat array with the values of all columns from the second one onwards
    """
    first_category = self.df.columns[1]
    for _, record in self.df.iterrows():
        yield record.filename, record[first_category:].values.flatten()
|
||||
|
||||
@classmethod
|
||||
def check_file_format(cls, path, return_inferred_type=False) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:
|
||||
df = pd.read_csv(path, index_col=0)
|
||||
|
@ -116,7 +151,7 @@ class ResultSubmission:
|
|||
|
||||
@classmethod
|
||||
def check_dataframe_format(cls, df, path=None, return_inferred_type=False) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:
|
||||
hint_path = '' # if given, show the data path in the error messages
|
||||
hint_path = '' # if given, show the data path in the error message
|
||||
if path is not None:
|
||||
hint_path = f' in {path}'
|
||||
|
||||
|
@ -125,33 +160,33 @@ class ResultSubmission:
|
|||
|
||||
if df.empty:
|
||||
raise ValueError(f'error{hint_path}: results file is empty')
|
||||
elif len(df) == ResultSubmission.DEV_LEN:
|
||||
elif len(df) == constants.DEV_SAMPLES:
|
||||
inferred_type = 'dev'
|
||||
expected_len = ResultSubmission.DEV_LEN
|
||||
elif len(df) == ResultSubmission.TEST_LEN:
|
||||
expected_len = constants.DEV_SAMPLES
|
||||
elif len(df) == constants.TEST_SAMPLES:
|
||||
inferred_type = 'test'
|
||||
expected_len = ResultSubmission.TEST_LEN
|
||||
expected_len = constants.TEST_SAMPLES
|
||||
else:
|
||||
raise ValueError(f'wrong number of prevalence values found{hint_path}; '
|
||||
f'expected {ResultSubmission.DEV_LEN} for development sets and '
|
||||
f'{ResultSubmission.TEST_LEN} for test sets; found {len(df)}')
|
||||
f'expected {constants.DEV_SAMPLES} for development sets and '
|
||||
f'{constants.TEST_SAMPLES} for test sets; found {len(df)}')
|
||||
|
||||
set_names = frozenset(df.filename)
|
||||
for i in range(expected_len):
|
||||
if f'{inferred_type}_sample_{i}.txt' not in set_names:
|
||||
raise ValueError(f'{hint_path} a file with {len(df)} entries is assumed to be of type '
|
||||
raise ValueError(f'error{hint_path} a file with {len(df)} entries is assumed to be of type '
|
||||
f'"{inferred_type}" but entry {inferred_type}_sample_{i}.txt is missing '
|
||||
f'(among perhaps many others)')
|
||||
|
||||
for category_name in df.columns[1:]:
|
||||
if (df[category_name] < 0).any() or (df[category_name] > 1).any():
|
||||
raise ValueError(f'{hint_path} column "{category_name}" contains values out of range [0,1]')
|
||||
raise ValueError(f'error{hint_path} column "{category_name}" contains values out of range [0,1]')
|
||||
|
||||
prevs = df.loc[:, df.columns[1]:].values
|
||||
round_errors = np.abs(prevs.sum(axis=-1) - 1.) > ResultSubmission.ERROR_TOL
|
||||
round_errors = np.abs(prevs.sum(axis=-1) - 1.) > constants.ERROR_TOL
|
||||
if round_errors.any():
|
||||
raise ValueError(f'warning: prevalence values in rows with id {np.where(round_errors)[0].tolist()} '
|
||||
f'do not sum up to 1 (error tolerance {ResultSubmission.ERROR_TOL}), '
|
||||
f'do not sum up to 1 (error tolerance {constants.ERROR_TOL}), '
|
||||
f'probably due to some rounding errors.')
|
||||
|
||||
if return_inferred_type:
|
||||
|
@ -163,20 +198,31 @@ class ResultSubmission:
|
|||
self.df = self.df.reindex([self.df.columns[0]] + sorted(self.df.columns[1:]), axis=1)
|
||||
self.categories = sorted(self.categories)
|
||||
|
||||
def filenames(self):
    """Return the values of the 'filename' column of the underlying dataframe."""
    names_column = self.df.filename
    return names_column.values
|
||||
|
||||
|
||||
def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSubmission, sample_size=1000, average=True):
|
||||
def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSubmission, sample_size=None, average=True):
|
||||
if sample_size is None:
|
||||
if qp.environ['SAMPLE_SIZE'] is None:
|
||||
raise ValueError('Relative Absolute Error cannot be computed: '
|
||||
'neither sample_size nor qp.environ["SAMPLE_SIZE"] have been specified')
|
||||
else:
|
||||
sample_size = qp.environ['SAMPLE_SIZE']
|
||||
|
||||
if len(true_prevs) != len(predicted_prevs):
|
||||
raise ValueError(f'size mismatch, groun truth has {len(true_prevs)} entries '
|
||||
f'while predictions contain {len(predicted_prevs)} entries')
|
||||
raise ValueError(f'size mismatch, ground truth file has {len(true_prevs)} entries '
|
||||
f'while the file of predictions contain {len(predicted_prevs)} entries')
|
||||
true_prevs.sort_categories()
|
||||
predicted_prevs.sort_categories()
|
||||
if true_prevs.categories != predicted_prevs.categories:
|
||||
raise ValueError(f'these result files are not comparable since the categories are different')
|
||||
raise ValueError(f'these result files are not comparable since the categories are different: '
|
||||
f'true={true_prevs.categories} vs. predictions={predicted_prevs.categories}')
|
||||
ae, rae = [], []
|
||||
for sample_name in true_prevs.df.filename.values:
|
||||
ae.append(qp.error.mae(true_prevs.get(sample_name), predicted_prevs.get(sample_name)))
|
||||
rae.append(qp.error.mrae(true_prevs.get(sample_name), predicted_prevs.get(sample_name), eps=sample_size))
|
||||
for sample_name, true_prevalence in true_prevs.iterrows():
|
||||
pred_prevalence = predicted_prevs.prevalence(sample_name)
|
||||
ae.append(qp.error.ae(true_prevalence, pred_prevalence))
|
||||
rae.append(qp.error.rae(true_prevalence, pred_prevalence, eps=1./(2*sample_size)))
|
||||
ae = np.asarray(ae)
|
||||
rae = np.asarray(rae)
|
||||
if average:
|
||||
|
@ -187,21 +233,6 @@ def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSub
|
|||
|
||||
|
||||
|
||||
# r = ResultSubmission(['negative', 'positive'])
|
||||
# from tqdm import tqdm
|
||||
# for i in tqdm(range(1000), total=1000):
|
||||
# r.add(f'dev_sample_{i}.txt', np.asarray([0.5, 0.5]))
|
||||
# r.dump('./path.csv')
|
||||
|
||||
# r = ResultSubmission.load('./data/T1A/public/dummy_submission.csv')
|
||||
# t = ResultSubmission.load('./data/T1A/public/dummy_submission (copy).csv')
|
||||
# print(r.df)
|
||||
# print(r.get('dev_sample_10.txt'))
|
||||
# print(evaluate_submission(r, t))
|
||||
|
||||
# s = ResultSubmission.load('./data/T1A/public/dummy_submission.csv')
|
||||
#
|
||||
# print(s)
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
import argparse
|
||||
import quapy as qp
|
||||
from data import ResultSubmission, evaluate_submission
|
||||
import constants
|
||||
import os
|
||||
|
||||
"""
|
||||
LeQua2022 Official evaluation script
|
||||
"""
|
||||
|
||||
def main(args):
    """Evaluate a file of predicted prevalence values against the ground truth.

    Prints MAE and MRAE and, if ``args.output`` is given, also writes them to that file
    (creating its parent directory when needed).

    :param args: parsed command-line arguments with attributes ``task``,
        ``true_prevalences``, ``pred_prevalences`` and ``output``
    """
    # the sample size of the task is read by evaluate_submission (through
    # qp.environ) to smooth the relative absolute error; only the tasks that have
    # a sample size defined in `constants` are configured here
    sample_sizes = {
        'T1A': constants.T1A_SAMPLE_SIZE,
        'T1B': constants.T1B_SAMPLE_SIZE,
    }
    if args.task in sample_sizes:
        qp.environ['SAMPLE_SIZE'] = sample_sizes[args.task]

    true_prev = ResultSubmission.load(args.true_prevalences)
    pred_prev = ResultSubmission.load(args.pred_prevalences)
    mae, mrae = evaluate_submission(true_prev, pred_prev)
    report = f'MAE: {mae:.4f}\nMRAE: {mrae:.4f}\n'
    print(report, end='')

    if args.output is not None:
        outdir = os.path.dirname(args.output)
        if outdir:
            os.makedirs(outdir, exist_ok=True)
        with open(args.output, 'wt') as fout:
            fout.write(report)
|
||||
|
||||
|
||||
if __name__=='__main__':
|
||||
parser = argparse.ArgumentParser(description='LeQua2022 official evaluation script')
|
||||
parser.add_argument('task', metavar='TASK', type=str, choices=['T1A', 'T1B', 'T2A', 'T2B'],
|
||||
help='Task name (T1A, T1B, T2A, T2B)')
|
||||
parser.add_argument('true_prevalences', metavar='TRUE-PREV-PATH', type=str,
|
||||
help='Path of ground truth prevalence values file (.csv)')
|
||||
parser.add_argument('pred_prevalences', metavar='PRED-PREV-PATH', type=str,
|
||||
help='Path of predicted prevalence values file (.csv)')
|
||||
parser.add_argument('--output', metavar='SCORES-PATH', type=str, default=None,
|
||||
help='Path where to store the evaluation scores')
|
||||
args = parser.parse_args()
|
||||
|
||||
main(args)
|
|
@ -0,0 +1,27 @@
|
|||
import argparse
|
||||
import quapy as qp
|
||||
from data import ResultSubmission, evaluate_submission
|
||||
import constants
|
||||
import os
|
||||
|
||||
"""
|
||||
LeQua2022 Official format-checker script
|
||||
"""
|
||||
|
||||
def main(args):
    """Check the format of a prevalence file and print the outcome.

    Any exception raised by the check is printed and reported as a failed check
    instead of being propagated.

    :param args: parsed command-line arguments with attribute ``prevalence_file``
    """
    try:
        ResultSubmission.check_file_format(args.prevalence_file)
    except Exception as err:
        print(err)
        verdict = 'not passed'
    else:
        verdict = 'passed'
    print(f'Format check: {verdict}')
|
||||
|
||||
|
||||
if __name__=='__main__':
|
||||
parser = argparse.ArgumentParser(description='LeQua2022 official format-checker script')
|
||||
parser.add_argument('prevalence_file', metavar='PREV-PATH', type=str,
|
||||
help='Path of the file containing prevalence values to check')
|
||||
args = parser.parse_args()
|
||||
|
||||
main(args)
|
|
@ -1,89 +0,0 @@
|
|||
import pickle
|
||||
|
||||
import numpy as np
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
|
||||
import quapy as qp
|
||||
from quapy.data import LabelledCollection
|
||||
from quapy.method.aggregative import *
|
||||
import quapy.functional as F
|
||||
from data import load_binary_vectors
|
||||
import os
|
||||
|
||||
path_binary_vector = './data/T1A'
|
||||
result_path = os.path.join('results', 'T1A') # binary - vector
|
||||
os.makedirs(result_path, exist_ok=True)
|
||||
|
||||
train_file = os.path.join(path_binary_vector, 'public', 'training_vectors.txt')
|
||||
|
||||
train = LabelledCollection.load(train_file, load_binary_vectors)
|
||||
|
||||
nF = train.instances.shape[1]
|
||||
|
||||
print(f'number of classes: {len(train.classes_)}')
|
||||
print(f'number of training documents: {len(train)}')
|
||||
print(f'training prevalence: {F.strprev(train.prevalence())}')
|
||||
print(f'training matrix shape: {train.instances.shape}')
|
||||
|
||||
dev_prev = pd.read_csv(os.path.join(path_binary_vector, 'public', 'dev_prevalences.csv'), index_col=0)
|
||||
print(dev_prev)
|
||||
|
||||
|
||||
scores = {}
|
||||
for quantifier in [CC]: #, ACC, PCC, PACC, EMQ, HDy]:
|
||||
|
||||
classifier = CalibratedClassifierCV(LogisticRegression())
|
||||
model = quantifier(classifier).fit(train)
|
||||
quantifier_name = model.__class__.__name__
|
||||
|
||||
scores[quantifier_name]={}
|
||||
for sample_set, sample_size in [('dev', 1000)]:
|
||||
ae_errors, rae_errors = [], []
|
||||
for i, row in tqdm(dev_prev.iterrows(), total=len(dev_prev), desc=f'testing {quantifier_name} in {sample_set}'):
|
||||
filename = row['filename']
|
||||
prev_true = row[1:].values
|
||||
sample_path = os.path.join(path_binary_vector, 'public', f'{sample_set}_vectors', filename)
|
||||
sample, _ = load_binary_vectors(sample_path, nF)
|
||||
qp.environ['SAMPLE_SIZE'] = sample.shape[0]
|
||||
prev_estim = model.quantify(sample)
|
||||
# prev_true = sample.prevalence()
|
||||
ae_errors.append(qp.error.mae(prev_true, prev_estim))
|
||||
rae_errors.append(qp.error.mrae(prev_true, prev_estim))
|
||||
|
||||
ae_errors = np.asarray(ae_errors)
|
||||
rae_errors = np.asarray(rae_errors)
|
||||
|
||||
mae = ae_errors.mean()
|
||||
mrae = rae_errors.mean()
|
||||
scores[quantifier_name][sample_set] = {'mae': mae, 'mrae': mrae}
|
||||
pickle.dump(ae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.ae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
pickle.dump(rae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.rae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
print(f'{quantifier_name} {sample_set} MAE={mae:.4f}')
|
||||
print(f'{quantifier_name} {sample_set} MRAE={mrae:.4f}')
|
||||
|
||||
for model in scores:
|
||||
for sample_set in ['validation']:#, 'test']:
|
||||
print(f'{model}\t{scores[model][sample_set]["mae"]:.4f}\t{scores[model][sample_set]["mrae"]:.4f}')
|
||||
|
||||
|
||||
"""
|
||||
test:
|
||||
CC 0.1859 1.5406
|
||||
ACC 0.0453 0.2840
|
||||
PCC 0.1793 1.7187
|
||||
PACC 0.0287 0.1494
|
||||
EMQ 0.0225 0.1020
|
||||
HDy 0.0631 0.2307
|
||||
|
||||
validation
|
||||
CC 0.1862 1.9587
|
||||
ACC 0.0394 0.2669
|
||||
PCC 0.1789 2.1383
|
||||
PACC 0.0354 0.1587
|
||||
EMQ 0.0224 0.0960
|
||||
HDy 0.0467 0.2121
|
||||
"""
|
||||
|
||||
|
|
@ -0,0 +1,62 @@
|
|||
import argparse
|
||||
import quapy as qp
|
||||
from data import ResultSubmission, evaluate_submission
|
||||
import constants
|
||||
import os
|
||||
import pickle
|
||||
from tqdm import tqdm
|
||||
from data import gen_load_samples_T1, load_category_map
|
||||
from glob import glob
|
||||
import constants
|
||||
|
||||
"""
|
||||
LeQua2022 prediction script
|
||||
"""
|
||||
|
||||
def main(args):
    """Quantify every sample of a directory with a pickled model and store the predictions.

    :param args: parsed command-line arguments with attributes ``model`` (path of the
        pickled model), ``samples`` (directory of samples), ``output`` (path of the
        predictions file), ``catmap`` (path of the category map) and ``nf`` (number of
        features)
    """

    # check the number of samples
    nsamples = len(glob(os.path.join(args.samples, '*.txt')))
    if nsamples not in {constants.DEV_SAMPLES, constants.TEST_SAMPLES}:
        print(f'Warning: The number of samples does neither coincide with the expected number of '
              f'dev samples ({constants.DEV_SAMPLES}) nor with the expected number of '
              f'test samples ({constants.TEST_SAMPLES}).')

    _, categories = load_category_map(args.catmap)

    # load pickled model
    # NOTE(review): unpickling can execute arbitrary code; only load model files
    # coming from a trusted source
    with open(args.model, 'rb') as fin:
        model = pickle.load(fin)

    # predictions
    predictions = ResultSubmission(categories=categories)
    for samplename, sample in tqdm(gen_load_samples_T1(args.samples, args.nf),
                                   desc='predicting', total=nsamples):
        predictions.add(samplename, model.quantify(sample))

    # saving: create the *parent directory* of the output file (dirname, not basename,
    # which would create a directory named after the output file itself)
    outdir = os.path.dirname(args.output)
    if outdir:
        os.makedirs(outdir, exist_ok=True)
    predictions.dump(args.output)
|
||||
|
||||
|
||||
if __name__=='__main__':
|
||||
parser = argparse.ArgumentParser(description='LeQua2022 prediction script')
|
||||
parser.add_argument('model', metavar='MODEL-PATH', type=str,
|
||||
help='Path of saved model')
|
||||
parser.add_argument('samples', metavar='SAMPLES-PATH', type=str,
|
||||
help='Path to the directory containing the samples')
|
||||
parser.add_argument('output', metavar='PREDICTIONS-PATH', type=str,
|
||||
help='Path where to store the predictions file')
|
||||
parser.add_argument('catmap', metavar='CATEGORY-MAP-PATH', type=str,
|
||||
help='Path to the category map file')
|
||||
parser.add_argument('nf', metavar='NUM-FEATURES', type=int,
|
||||
help='Number of features seen during training')
|
||||
args = parser.parse_args()
|
||||
|
||||
if not os.path.exists(args.samples):
|
||||
raise FileNotFoundError(f'path {args.samples} does not exist')
|
||||
if not os.path.isdir(args.samples):
|
||||
raise ValueError(f'path {args.samples} is not a valid directory')
|
||||
|
||||
main(args)
|
|
@ -149,7 +149,7 @@ class IndexTransformer:
|
|||
|
||||
def index(self, documents):
    """
    Convert every document into the list of vocabulary indexes of its tokens.

    :param documents: iterable of documents; each one is tokenized with `self.analyzer`
    :return: a list containing, for each document, the list of indexes of its tokens;
        tokens not found in the vocabulary are mapped to `self.unk`
    """
    # work on a copy of the fitted vocabulary (presumably a plain dict -- confirm
    # against `fit`); the lookup must go through `get` so that unknown tokens
    # fall back to `self.unk`
    vocab = self.vocabulary_.copy()
    return [[vocab.get(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')]
|
||||
|
||||
def fit_transform(self, X, n_jobs=-1):
    """
    Fit on `X` and then transform the very same `X`.

    :param X: the documents passed to both `fit` and `transform`
    :param n_jobs: forwarded as-is to `transform`
    :return: whatever `transform` returns for `X`
    """
    fitted = self.fit(X)
    transformed = fitted.transform(X, n_jobs=n_jobs)
    return transformed
|
||||
|
|
|
@ -9,6 +9,7 @@ from quapy.method.base import BaseQuantifier
|
|||
from quapy.util import temp_seed
|
||||
import quapy.functional as F
|
||||
import pandas as pd
|
||||
import inspect
|
||||
|
||||
|
||||
def artificial_prevalence_prediction(
|
||||
|
@ -78,6 +79,27 @@ def natural_prevalence_prediction(
|
|||
return _predict_from_indexes(indexes, model, test, n_jobs, verbose)
|
||||
|
||||
|
||||
def gen_prevalence_prediction(model: BaseQuantifier, gen_fn: Callable, eval_budget=None):
    """
    Collect the true and the estimated prevalences for the samples produced by a generator function.

    :param model: the quantifier; only its `quantify(instances)` method is used
    :param gen_fn: a callable taking no arguments and returning a generator that yields, at each
        iteration, a pair `(sample_instances, true_prevalence)`
    :param eval_budget: maximum number of samples to evaluate. Any value that is not a positive
        integer (including the default None) means "no limit", i.e., the generator is exhausted
    :return: a tuple `(true_prevalences, estim_prevalences)` of numpy arrays built from the
        collected values, with one entry per evaluated sample along the first axis
    :raises ValueError: if `gen_fn()` does not return a generator
    """
    # create the generator only once and reuse it for both the type check and the iteration
    samples = gen_fn()
    if not inspect.isgenerator(samples):
        raise ValueError('param "gen_fn" must be a function returning a generator')

    if not isinstance(eval_budget, int):
        # a negative counter never reaches 0 when decremented: no budget is enforced
        eval_budget = -1

    true_prevalences, estim_prevalences = [], []
    for sample_instances, true_prev in samples:
        true_prevalences.append(true_prev)
        estim_prevalences.append(model.quantify(sample_instances))
        eval_budget -= 1
        if eval_budget == 0:
            break

    true_prevalences = np.asarray(true_prevalences)
    estim_prevalences = np.asarray(estim_prevalences)

    return true_prevalences, estim_prevalences
|
||||
|
||||
|
||||
def _predict_from_indexes(
|
||||
indexes,
|
||||
model: BaseQuantifier,
|
||||
|
|
|
@ -5,8 +5,9 @@ from typing import Union, Callable
|
|||
|
||||
import quapy as qp
|
||||
from quapy.data.base import LabelledCollection
|
||||
from quapy.evaluation import artificial_prevalence_prediction, natural_prevalence_prediction
|
||||
from quapy.evaluation import artificial_prevalence_prediction, natural_prevalence_prediction, gen_prevalence_prediction
|
||||
from quapy.method.aggregative import BaseQuantifier
|
||||
import inspect
|
||||
|
||||
|
||||
class GridSearchQ(BaseQuantifier):
|
||||
|
@ -74,8 +75,10 @@ class GridSearchQ(BaseQuantifier):
|
|||
self.timeout = timeout
|
||||
self.verbose = verbose
|
||||
self.__check_error(error)
|
||||
assert self.protocol in {'app', 'npp'}, \
|
||||
'unknown protocol; valid ones are "app" or "npp" for the "artificial" or the "natural" prevalence protocols'
|
||||
assert self.protocol in {'app', 'npp', 'gen'}, \
|
||||
'unknown protocol: valid ones are "app" or "npp" for the "artificial" or the "natural" prevalence ' \
|
||||
'protocols. Use protocol="gen" when passing a generator function thorough val_split that yields a ' \
|
||||
'sample (instances) and their prevalence (ndarray) at each iteration.'
|
||||
if self.protocol == 'npp':
|
||||
if self.n_repetitions is None or self.n_repetitions == 1:
|
||||
if self.eval_budget is not None:
|
||||
|
@ -99,9 +102,14 @@ class GridSearchQ(BaseQuantifier):
|
|||
assert 0. < validation < 1., 'validation proportion should be in (0,1)'
|
||||
training, validation = training.split_stratified(train_prop=1 - validation)
|
||||
return training, validation
|
||||
elif self.protocol=='gen' and inspect.isgenerator(validation()):
|
||||
return training, validation
|
||||
else:
|
||||
raise ValueError(f'"validation" must either be a LabelledCollection or a float in (0,1) indicating the'
|
||||
f'proportion of training documents to extract (type found: {type(validation)})')
|
||||
f'proportion of training documents to extract (type found: {type(validation)}). '
|
||||
f'Optionally, "validation" can be a callable function returning a generator that yields '
|
||||
f'the sample instances along with their true prevalence at each iteration by '
|
||||
f'setting protocol="gen".')
|
||||
|
||||
def __check_error(self, error):
|
||||
if error in qp.error.QUANTIFICATION_ERROR:
|
||||
|
@ -132,6 +140,8 @@ class GridSearchQ(BaseQuantifier):
|
|||
return natural_prevalence_prediction(
|
||||
model, val_split, self.sample_size,
|
||||
**commons)
|
||||
elif self.protocol == 'gen':
|
||||
return gen_prevalence_prediction(model, gen_fn=val_split, eval_budget=self.eval_budget)
|
||||
else:
|
||||
raise ValueError('unknown protocol')
|
||||
|
||||
|
@ -144,7 +154,8 @@ class GridSearchQ(BaseQuantifier):
|
|||
if val_split is None:
|
||||
val_split = self.val_split
|
||||
training, val_split = self.__check_training_validation(training, val_split)
|
||||
assert isinstance(self.sample_size, int) and self.sample_size > 0, 'sample_size must be a positive integer'
|
||||
if self.protocol != 'gen':
|
||||
assert isinstance(self.sample_size, int) and self.sample_size > 0, 'sample_size must be a positive integer'
|
||||
|
||||
params_keys = list(self.param_grid.keys())
|
||||
params_values = list(self.param_grid.values())
|
||||
|
@ -192,8 +203,6 @@ class GridSearchQ(BaseQuantifier):
|
|||
raise TimeoutError('all jobs took more than the timeout time to end')
|
||||
|
||||
self.sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f})')
|
||||
# model.set_params(**self.best_params_)
|
||||
# self.best_model_ = deepcopy(model)
|
||||
|
||||
if self.refit:
|
||||
self.sout(f'refitting on the whole development set')
|
||||
|
@ -203,11 +212,11 @@ class GridSearchQ(BaseQuantifier):
|
|||
|
||||
def quantify(self, instances):
    """
    Estimate class prevalences for `instances` using the model selected during `fit`.

    :param instances: the sample to quantify; forwarded untouched to the selected model
    :return: whatever the selected model's `quantify` returns for `instances`
    :raises AssertionError: if no `best_model_` attribute is set, i.e., `fit` was not run
    """
    assert hasattr(self, 'best_model_'), 'quantify called before fit'
    return self.best_model_.quantify(instances)
|
||||
|
||||
@property
def classes_(self):
    """
    Classes known to the model selected during `fit`.

    :return: the `classes_` attribute of `self.best_model_`
    """
    return self.best_model_.classes_
|
||||
|
||||
def set_params(self, **parameters):
|
||||
self.param_grid = parameters
|
||||
|
|
Loading…
Reference in New Issue