2021-11-12 14:30:02 +01:00
|
|
|
import argparse
|
|
|
|
import pickle
|
|
|
|
from sklearn.linear_model import LogisticRegression as LR
|
|
|
|
from quapy.method.aggregative import *
|
2021-11-24 11:20:42 +01:00
|
|
|
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
|
2021-11-12 14:30:02 +01:00
|
|
|
import quapy.functional as F
|
|
|
|
from data import *
|
|
|
|
import os
|
|
|
|
import constants
|
|
|
|
|
|
|
|
|
2021-11-24 11:20:42 +01:00
|
|
|
# LeQua official baselines for task T1A (Binary/Vector) and T1B (Multiclass/Vector)
|
2021-11-12 14:30:02 +01:00
|
|
|
# =========================================================
|
|
|
|
|
|
|
|
def baselines():
    """Generate the baseline quantifiers paired with their short names.

    Yields:
        ``(quantifier, name)`` tuples, in this order: CC, ACC, PCC and PACC
        (each wrapping ``LR(n_jobs=-1)``), followed by EMQ wrapping a
        ``CalibratedClassifierCV`` over a default ``LR()``, which is
        reported under the name "SLD".
    """
    # These four methods are all built the same way, so they share one loop.
    # Each quantifier is still instantiated lazily, right before it is yielded.
    lr_backed_methods = (
        (CC, "CC"),
        (ACC, "ACC"),
        (PCC, "PCC"),
        (PACC, "PACC"),
    )
    for method_cls, label in lr_backed_methods:
        yield method_cls(LR(n_jobs=-1)), label

    # EMQ is the only baseline that receives a calibrated classifier.
    calibrated_lr = CalibratedClassifierCV(LR(), n_jobs=-1)
    yield EMQ(calibrated_lr), "SLD"

    # Baselines that are currently switched off:
    # yield HDy(LR(n_jobs=-1)) if args.task == 'T1A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy"
    # yield MLPE(), "MLPE"
|
2021-11-12 14:30:02 +01:00
|
|
|
|
|
|
|
|
|
|
|
def main(args):
    """Run model selection for every baseline and pickle the selected models.

    For each ``(quantifier, name)`` pair produced by ``baselines()``, a
    ``GridSearchQ`` is fitted on the training collection and evaluated (MAE)
    on the development samples; the best model is then written to
    ``<modeldir>/<task>/<name>.pkl``.

    Args:
        args: parsed command-line namespace. The attributes read here are
            ``task``, ``datadir`` and ``modeldir``.
    """
    # NOTE(review): create_if_not_exist presumably creates the directory and
    # returns its path; its return value is used as the output directory.
    models_path = qp.util.create_if_not_exist(os.path.join(args.modeldir, args.task))

    path_dev_vectors = os.path.join(args.datadir, 'dev_vectors')
    path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.csv')
    path_train = os.path.join(args.datadir, 'training_vectors.txt')

    # The sample size is task-dependent and is looked up in the constants module.
    qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task]

    train = LabelledCollection.load(path_train, load_vector_documents)
    # Number of columns of the training matrix; forwarded to the sample loader.
    nF = train.instances.shape[1]

    print(f'number of classes: {len(train.classes_)}')
    print(f'number of training documents: {len(train)}')
    print(f'training prevalence: {F.strprev(train.prevalence())}')
    print(f'training matrix shape: {train.instances.shape}')

    # A wider grid is kept here for reference but is currently commented out;
    # only the single configuration below is explored.
    # param_grid = {
    #     'C': np.logspace(-3, 3, 7),
    #     'class_weight': ['balanced', None]
    # }
    param_grid = {
        'C': [1],
        'class_weight': ['balanced']
    }

    def gen_samples():
        """Return a fresh generator over the development samples."""
        return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, return_id=False,
                                load_fn=load_vector_documents, nF=nF)

    for quantifier, q_name in baselines():
        print(f'{q_name}: Model selection')
        # With protocol='gen' the validation samples come from the callable
        # passed as second argument to fit().
        quantifier = qp.model_selection.GridSearchQ(
            quantifier,
            param_grid,
            sample_size=None,
            protocol='gen',
            error=qp.error.mae,
            refit=False,
            verbose=True
        ).fit(train, gen_samples)

        print(f'{q_name} got MAE={quantifier.best_score_:.3f} (hyper-params: {quantifier.best_params_})')

        model_path = os.path.join(models_path, q_name+'.pkl')
        print(f'saving model in {model_path}')
        # Use a context manager so the file is flushed and closed as soon as
        # the model has been written, instead of leaking one handle per model.
        with open(model_path, 'wb') as model_file:
            pickle.dump(quantifier.best_model(), model_file, protocol=pickle.HIGHEST_PROTOCOL)
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Command-line interface: three positional arguments (task, data dir, model dir).
    parser = argparse.ArgumentParser(description='LeQua2022 Task T1A/T1B baselines')
    parser.add_argument(
        'task',
        metavar='TASK',
        type=str,
        choices=['T1A', 'T1B'],
        help='Task name (T1A, T1B)',
    )
    parser.add_argument(
        'datadir',
        metavar='DATA-PATH',
        type=str,
        help='Path of the directory containing "dev_prevalences.csv", "training_vectors.txt", and '
             'the directory "dev_vectors"',
    )
    parser.add_argument(
        'modeldir',
        metavar='MODEL-PATH',
        type=str,
        help='Path where to save the models. '
             'A subdirectory named <task> will be automatically created.',
    )
    args = parser.parse_args()

    # Fail early, with a specific message, if the data directory is unusable.
    if not os.path.exists(args.datadir):
        raise FileNotFoundError(f'path {args.datadir} does not exist')
    if not os.path.isdir(args.datadir):
        raise ValueError(f'path {args.datadir} is not a valid directory')

    # Entries that must exist inside the data directory, checked in this order,
    # each paired with the error message raised when it is missing.
    _required_entries = (
        ("dev_prevalences.csv",
         f'path {args.datadir} does not contain "dev_prevalences.csv" file'),
        ("training_vectors.txt",
         f'path {args.datadir} does not contain "training_vectors.txt" file'),
        ("dev_vectors",
         f'path {args.datadir} does not contain "dev_vectors" folder'),
    )
    for _entry, _error_message in _required_entries:
        if not os.path.exists(os.path.join(args.datadir, _entry)):
            raise FileNotFoundError(_error_message)

    main(args)
|