adapting new format

This commit is contained in:
Alejandro Moreo Fernandez 2021-11-26 10:57:49 +01:00
parent 8e15678c36
commit 8368c467dc
5 changed files with 125 additions and 29 deletions

View File

@ -14,10 +14,10 @@ import constants
def baselines(): def baselines():
yield CC(LR(n_jobs=-1)), "CC" yield CC(LR(n_jobs=-1)), "CC"
yield ACC(LR(n_jobs=-1)), "ACC" # yield ACC(LR(n_jobs=-1)), "ACC"
yield PCC(LR(n_jobs=-1)), "PCC" # yield PCC(LR(n_jobs=-1)), "PCC"
yield PACC(LR(n_jobs=-1)), "PACC" # yield PACC(LR(n_jobs=-1)), "PACC"
yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD" # yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD"
# yield HDy(LR(n_jobs=-1)) if args.task == 'T1A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy" # yield HDy(LR(n_jobs=-1)) if args.task == 'T1A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy"
# yield MLPE(), "MLPE" # yield MLPE(), "MLPE"
@ -28,7 +28,7 @@ def main(args):
path_dev_vectors = os.path.join(args.datadir, 'dev_vectors') path_dev_vectors = os.path.join(args.datadir, 'dev_vectors')
path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.csv') path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.csv')
path_train = os.path.join(args.datadir, 'training_vectors.txt') path_train = os.path.join(args.datadir, 'training_vectors.csv')
qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task] qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task]
@ -46,13 +46,15 @@ def main(args):
# } # }
param_grid = { param_grid = {
'C': [1], 'C': [0.01],
'class_weight': ['balanced'] 'class_weight': ['balanced']
} }
target_metric = qp.error.mrae
def gen_samples(): def gen_samples():
return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, return_id=False, return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, return_id=False,
load_fn=load_vector_documents, nF=nF) load_fn=load_vector_documents, ext='csv')
for quantifier, q_name in baselines(): for quantifier, q_name in baselines():
print(f'{q_name}: Model selection') print(f'{q_name}: Model selection')
@ -61,12 +63,12 @@ def main(args):
param_grid, param_grid,
sample_size=None, sample_size=None,
protocol='gen', protocol='gen',
error=qp.error.mae, error=target_metric, #qp.error.mae,
refit=False, refit=False,
verbose=True verbose=True
).fit(train, gen_samples) ).fit(train, gen_samples)
print(f'{q_name} got MAE={quantifier.best_score_:.3f} (hyper-params: {quantifier.best_params_})') print(f'{q_name} got MRAE={quantifier.best_score_:.5f} (hyper-params: {quantifier.best_params_})')
model_path = os.path.join(models_path, q_name+'.pkl') model_path = os.path.join(models_path, q_name+'.pkl')
print(f'saving model in {model_path}') print(f'saving model in {model_path}')
@ -91,8 +93,8 @@ if __name__ == '__main__':
raise ValueError(f'path {args.datadir} is not a valid directory') raise ValueError(f'path {args.datadir} is not a valid directory')
if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.csv")): if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.csv")):
raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.csv" file') raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.csv" file')
if not os.path.exists(os.path.join(args.datadir, "training_vectors.txt")): if not os.path.exists(os.path.join(args.datadir, "training_vectors.csv")):
raise FileNotFoundError(f'path {args.datadir} does not contain "training_vectors.txt" file') raise FileNotFoundError(f'path {args.datadir} does not contain "training_vectors.csv" file')
if not os.path.exists(os.path.join(args.datadir, "dev_vectors")): if not os.path.exists(os.path.join(args.datadir, "dev_vectors")):
raise FileNotFoundError(f'path {args.datadir} does not contain "dev_vectors" folder') raise FileNotFoundError(f'path {args.datadir} does not contain "dev_vectors" folder')

View File

@ -1,8 +1,14 @@
import argparse import argparse
import pickle import pickle
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression as LR from sklearn.linear_model import LogisticRegression as LR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from LeQua2022.pretrained_embeddings import TfidfWordEmbeddingTransformer, WordEmbeddingAverageTransformer
from LeQua2022.word_class_embeddings import WordClassEmbeddingsTransformer, ConcatenateEmbeddingsTransformer
from quapy.method.aggregative import * from quapy.method.aggregative import *
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
import quapy.functional as F import quapy.functional as F
@ -20,7 +26,7 @@ def baselines():
yield PCC(LR(n_jobs=-1)), "PCC" yield PCC(LR(n_jobs=-1)), "PCC"
yield PACC(LR(n_jobs=-1)), "PACC" yield PACC(LR(n_jobs=-1)), "PACC"
yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD" yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD"
yield HDy(LR(n_jobs=-1)) if args.task == 'T2A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy" # yield HDy(LR(n_jobs=-1)) if args.task == 'T2A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy"
# yield MLPE(), "MLPE" # yield MLPE(), "MLPE"
@ -35,9 +41,69 @@ def main(args):
qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task] qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task]
train = LabelledCollection.load(path_train, load_raw_documents) train = LabelledCollection.load(path_train, load_raw_documents)
tfidf = TfidfVectorizer(lowercase=True, stop_words='english', min_df=4) # TfidfVectorizer(min_df=5)
train.instances = tfidf.fit_transform(train.instances) if args.mode == 'tfidf1':
nF = train.instances.shape[1] tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True)
if args.mode == 'tfidf2':
tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1,2))
if args.mode == 'tfidf3':
tfidf = Pipeline([
('tfidf', TfidfVectorizer(min_df=5, sublinear_tf=True)),
('svd', TruncatedSVD(n_components=300))
])
if args.mode == 'tfidf4':
tfidf = Pipeline([
('tfidf', TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1,2))),
('svd', TruncatedSVD(n_components=300))
])
if args.mode == 'glove1':
tfidf = Pipeline([
('glove-ave', WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')),
('zscore', StandardScaler())
])
if args.mode == 'glove2':
tfidf = WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')
if args.mode == 'glove3':
vect = TfidfVectorizer(min_df=5, sublinear_tf=True)
tfidf = Pipeline([
('tfidf', vect),
('embedding', TfidfWordEmbeddingTransformer(
wordset_name='glove',
features_call=vect.get_feature_names_out,
path='/mnt/1T/Datasets/GloVe')),
('zscore', StandardScaler())
])
if args.mode == 'glove4':
vect = TfidfVectorizer(min_df=5, sublinear_tf=True)
tfidf = Pipeline([
('tfidf', vect),
('embedding', TfidfWordEmbeddingTransformer(
wordset_name='glove',
features_call=vect.get_feature_names_out,
path='/mnt/1T/Datasets/GloVe'))
])
if args.mode == 'wce1':
tfidf = WordClassEmbeddingsTransformer()
if args.mode == 'wce2':
glove = Pipeline([
('glove-ave', WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')),
('zscore', StandardScaler())
])
wce = WordClassEmbeddingsTransformer()
tfidf = ConcatenateEmbeddingsTransformer([glove, wce])
if args.mode == 'wce3':
glove = Pipeline([
('glove-ave', WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')),
('zscore', StandardScaler())
])
wce = WordClassEmbeddingsTransformer()
tfidf = Pipeline([
('glove-wce', ConcatenateEmbeddingsTransformer([glove, wce])),
('svd', TruncatedSVD(n_components=300))
])
target_metric = qp.error.mrae
train.instances = tfidf.fit_transform(*train.Xy)
print(f'number of classes: {len(train.classes_)}') print(f'number of classes: {len(train.classes_)}')
print(f'number of training documents: {len(train)}') print(f'number of training documents: {len(train)}')
@ -58,6 +124,7 @@ def main(args):
return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, return_id=False, return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, return_id=False,
load_fn=load_raw_unlabelled_documents, vectorizer=tfidf) load_fn=load_raw_unlabelled_documents, vectorizer=tfidf)
outs = []
for quantifier, q_name in baselines(): for quantifier, q_name in baselines():
print(f'{q_name}: Model selection') print(f'{q_name}: Model selection')
quantifier = qp.model_selection.GridSearchQ( quantifier = qp.model_selection.GridSearchQ(
@ -65,17 +132,25 @@ def main(args):
param_grid, param_grid,
sample_size=None, sample_size=None,
protocol='gen', protocol='gen',
error=qp.error.mae, error=target_metric, #qp.error.mae,
refit=False, refit=False,
verbose=True verbose=True
).fit(train, gen_samples) ).fit(train, gen_samples)
print(f'{q_name} got MAE={quantifier.best_score_:.3f} (hyper-params: {quantifier.best_params_})') print(f'{q_name} got MAE={quantifier.best_score_:.5f} (hyper-params: {quantifier.best_params_})')
outs.append(f'{q_name} got MAE={quantifier.best_score_:.5f} (hyper-params: {quantifier.best_params_})')
model_path = os.path.join(models_path, q_name+'.'+args.task+'.pkl') model_path = os.path.join(models_path, q_name+'.'+args.task+'.pkl')
print(f'saving model in {model_path}') print(f'saving model in {model_path}')
pickle.dump(quantifier.best_model(), open(model_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL) pickle.dump(quantifier.best_model(), open(model_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
print(tfidf)
print(args.mode)
print(outs)
with open(f'{args.mode}.{args.task}.txt', 'wt') as foo:
for line in outs:
foo.write(f'{line}\n')
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser(description='LeQua2022 Task T2A/T2B baselines') parser = argparse.ArgumentParser(description='LeQua2022 Task T2A/T2B baselines')
@ -87,6 +162,8 @@ if __name__ == '__main__':
parser.add_argument('modeldir', metavar='MODEL-PATH', type=str, parser.add_argument('modeldir', metavar='MODEL-PATH', type=str,
help='Path where to save the models. ' help='Path where to save the models. '
'A subdirectory named <task> will be automatically created.') 'A subdirectory named <task> will be automatically created.')
parser.add_argument('mode', metavar='PREPROCESSMODE', type=str,
help='modality of preprocessing')
args = parser.parse_args() args = parser.parse_args()
if not os.path.exists(args.datadir): if not os.path.exists(args.datadir):

View File

@ -34,33 +34,42 @@ def load_raw_unlabelled_documents(path, vectorizer=None):
return documents, None return documents, None
def load_vector_documents(path, nF=None): # def load_vector_documents(path, nF=None):
X, y = sklearn.datasets.load_svmlight_file(path, n_features=nF) # X, y = sklearn.datasets.load_svmlight_file(path, n_features=nF, zero_based=True)
y = y.astype(int) # y = y.astype(int)
# return X, y
def load_vector_documents(path):
    """
    Loads a csv file of dense document vectors, optionally carrying a label column.

    The file is parsed with `pd.read_csv` using its defaults (so the first row is consumed as the header)
    and converted to a float matrix. A file with exactly 301 columns is treated as labelled: the first 300
    columns are returned as the instances and the last column, cast to int, as the labels. A file with any
    other number of columns is treated as unlabelled and is returned whole.

    :param path: path to the csv file to load
    :return: a tuple `(X, y)`; `y` is None when the file is unlabelled
    """
    # NOTE(review): the label is taken from the LAST column -- confirm this matches the column order
    # of the released csv files.
    # `np.float` and `np.int` were deprecated aliases of the builtins (removed in NumPy 1.24); the
    # builtins `float` and `int` select exactly the same dtypes.
    D = pd.read_csv(path).to_numpy(dtype=float)
    labelled = D.shape[1] == 301
    if labelled:
        X, y = D[:, :300], D[:, -1].astype(int).flatten()
    else:
        X, y = D, None
    return X, y
def __gen_load_samples_with_groudtruth(path_dir:str, return_id:bool, ground_truth_path:str, load_fn, **load_kwargs): def __gen_load_samples_with_groudtruth(path_dir:str, return_id:bool, ground_truth_path:str, ext:str, load_fn, **load_kwargs):
true_prevs = ResultSubmission.load(ground_truth_path) true_prevs = ResultSubmission.load(ground_truth_path)
for id, prevalence in true_prevs.iterrows(): for id, prevalence in true_prevs.iterrows():
sample, _ = load_fn(os.path.join(path_dir, f'{id}.txt'), **load_kwargs) sample, _ = load_fn(os.path.join(path_dir, f'{id}.{ext}'), **load_kwargs)
yield (id, sample, prevalence) if return_id else (sample, prevalence) yield (id, sample, prevalence) if return_id else (sample, prevalence)
def __gen_load_samples_without_groudtruth(path_dir:str, return_id:bool, load_fn, **load_kwargs): def __gen_load_samples_without_groudtruth(path_dir:str, return_id:bool, ext:str, load_fn, **load_kwargs):
nsamples = len(glob(os.path.join(path_dir, '*.txt'))) nsamples = len(glob(os.path.join(path_dir, f'*.{ext}')))
for id in range(nsamples): for id in range(nsamples):
sample, _ = load_fn(os.path.join(path_dir, f'{id}.txt'), **load_kwargs) sample, _ = load_fn(os.path.join(path_dir, f'{id}.{ext}'), **load_kwargs)
yield (id, sample) if return_id else sample yield (id, sample) if return_id else sample
def gen_load_samples(path_dir:str, ground_truth_path:str = None, return_id=True, load_fn=load_vector_documents, **load_kwargs): def gen_load_samples(path_dir:str, ground_truth_path:str = None, return_id=True, ext='txt', load_fn=load_vector_documents, **load_kwargs):
if ground_truth_path is None: if ground_truth_path is None:
# the generator function returns tuples (docid:str, sample:csr_matrix or str) # the generator function returns tuples (docid:str, sample:csr_matrix or str)
gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, load_fn, **load_kwargs) gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, ext, load_fn, **load_kwargs)
else: else:
# the generator function returns tuples (docid:str, sample:csr_matrix or str, prevalence:ndarray) # the generator function returns tuples (docid:str, sample:csr_matrix or str, prevalence:ndarray)
gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, load_fn, **load_kwargs) gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, ext, load_fn, **load_kwargs)
for r in gen_fn: for r in gen_fn:
yield r yield r
@ -139,7 +148,11 @@ class ResultSubmission:
@classmethod @classmethod
def check_file_format(cls, path) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]: def check_file_format(cls, path) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:
try:
df = pd.read_csv(path, index_col=0) df = pd.read_csv(path, index_col=0)
except Exception as e:
print(f'the file {path} does not seem to be a valid csv file. ')
print(e)
return ResultSubmission.check_dataframe_format(df, path=path) return ResultSubmission.check_dataframe_format(df, path=path)
@classmethod @classmethod

View File

@ -24,6 +24,7 @@ def artificial_prevalence_prediction(
verbose=False): verbose=False):
""" """
Performs the predictions for all samples generated according to the artificial sampling protocol. Performs the predictions for all samples generated according to the artificial sampling protocol.
:param model: the model in charge of generating the class prevalence estimations :param model: the model in charge of generating the class prevalence estimations
:param test: the test set on which to perform artificial sampling :param test: the test set on which to perform artificial sampling
:param sample_size: the size of the samples :param sample_size: the size of the samples

View File

@ -3,6 +3,8 @@ import signal
from copy import deepcopy from copy import deepcopy
from typing import Union, Callable from typing import Union, Callable
import numpy as np
import quapy as qp import quapy as qp
from quapy.data.base import LabelledCollection from quapy.data.base import LabelledCollection
from quapy.evaluation import artificial_prevalence_prediction, natural_prevalence_prediction, gen_prevalence_prediction from quapy.evaluation import artificial_prevalence_prediction, natural_prevalence_prediction, gen_prevalence_prediction
@ -190,6 +192,7 @@ class GridSearchQ(BaseQuantifier):
model.fit(training) model.fit(training)
true_prevalences, estim_prevalences = self.__generate_predictions(model, val_split) true_prevalences, estim_prevalences = self.__generate_predictions(model, val_split)
score = self.error(true_prevalences, estim_prevalences) score = self.error(true_prevalences, estim_prevalences)
self._sout(f'checking hyperparams={params} got {self.error.__name__} score {score:.5f}') self._sout(f'checking hyperparams={params} got {self.error.__name__} score {score:.5f}')
if self.best_score_ is None or score < self.best_score_: if self.best_score_ is None or score < self.best_score_:
self.best_score_ = score self.best_score_ = score