forked from moreo/QuaPy

Commit 8368c467dc: adapting new format
Parent commit: 8e15678c36
@@ -14,10 +14,10 @@ import constants

 def baselines():
     yield CC(LR(n_jobs=-1)), "CC"
-    yield ACC(LR(n_jobs=-1)), "ACC"
-    yield PCC(LR(n_jobs=-1)), "PCC"
-    yield PACC(LR(n_jobs=-1)), "PACC"
-    yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD"
+    # yield ACC(LR(n_jobs=-1)), "ACC"
+    # yield PCC(LR(n_jobs=-1)), "PCC"
+    # yield PACC(LR(n_jobs=-1)), "PACC"
+    # yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD"
     # yield HDy(LR(n_jobs=-1)) if args.task == 'T1A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy"
     # yield MLPE(), "MLPE"

@@ -28,7 +28,7 @@ def main(args):

     path_dev_vectors = os.path.join(args.datadir, 'dev_vectors')
     path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.csv')
-    path_train = os.path.join(args.datadir, 'training_vectors.txt')
+    path_train = os.path.join(args.datadir, 'training_vectors.csv')

     qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task]

@@ -46,13 +46,15 @@ def main(args):
     # }

     param_grid = {
-        'C': [1],
+        'C': [0.01],
         'class_weight': ['balanced']
     }
+    target_metric = qp.error.mrae

     def gen_samples():
         return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, return_id=False,
-                                load_fn=load_vector_documents, nF=nF)
+                                load_fn=load_vector_documents, ext='csv')


     for quantifier, q_name in baselines():
         print(f'{q_name}: Model selection')
@@ -61,12 +63,12 @@ def main(args):
             param_grid,
             sample_size=None,
             protocol='gen',
-            error=qp.error.mae,
+            error=target_metric, #qp.error.mae,
             refit=False,
             verbose=True
         ).fit(train, gen_samples)

-        print(f'{q_name} got MAE={quantifier.best_score_:.3f} (hyper-params: {quantifier.best_params_})')
+        print(f'{q_name} got MRAE={quantifier.best_score_:.5f} (hyper-params: {quantifier.best_params_})')

         model_path = os.path.join(models_path, q_name+'.pkl')
         print(f'saving model in {model_path}')
@@ -91,8 +93,8 @@ if __name__ == '__main__':
         raise ValueError(f'path {args.datadir} is not a valid directory')
     if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.csv")):
         raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.csv" file')
-    if not os.path.exists(os.path.join(args.datadir, "training_vectors.txt")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "training_vectors.txt" file')
+    if not os.path.exists(os.path.join(args.datadir, "training_vectors.csv")):
+        raise FileNotFoundError(f'path {args.datadir} does not contain "training_vectors.csv" file')
     if not os.path.exists(os.path.join(args.datadir, "dev_vectors")):
         raise FileNotFoundError(f'path {args.datadir} does not contain "dev_vectors" folder')

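For context, the hunks above switch the vector-format baselines from svmlight .txt samples to CSV samples and tune hyperparameters by MRAE. A minimal sketch of how one of the pickled baselines can later be restored and applied to a development sample; the module name, model path and sample path are assumptions for illustration, while quantify() is the standard QuaPy prediction call:

import pickle

from data import load_vector_documents  # loader shown in a later hunk; module name assumed

# restore a quantifier saved by the script above (paths are made up for this example)
with open('models/T1A/CC.pkl', 'rb') as fin:
    quantifier = pickle.load(fin)

# load one development sample in the new CSV format and estimate its class prevalences
X, _ = load_vector_documents('data/dev_vectors/0.csv')
estimated_prevalence = quantifier.quantify(X)
print(estimated_prevalence)  # a vector summing to 1, one entry per class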
@@ -1,8 +1,14 @@
 import argparse
 import pickle

+from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression as LR
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler

+from LeQua2022.pretrained_embeddings import TfidfWordEmbeddingTransformer, WordEmbeddingAverageTransformer
+from LeQua2022.word_class_embeddings import WordClassEmbeddingsTransformer, ConcatenateEmbeddingsTransformer
 from quapy.method.aggregative import *
 from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
 import quapy.functional as F
@@ -20,7 +26,7 @@ def baselines():
     yield PCC(LR(n_jobs=-1)), "PCC"
     yield PACC(LR(n_jobs=-1)), "PACC"
     yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD"
-    yield HDy(LR(n_jobs=-1)) if args.task == 'T2A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy"
+    # yield HDy(LR(n_jobs=-1)) if args.task == 'T2A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy"
     # yield MLPE(), "MLPE"


@@ -35,9 +41,69 @@ def main(args):
     qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task]

     train = LabelledCollection.load(path_train, load_raw_documents)
-    tfidf = TfidfVectorizer(lowercase=True, stop_words='english', min_df=4) # TfidfVectorizer(min_df=5)
-    train.instances = tfidf.fit_transform(train.instances)
-    nF = train.instances.shape[1]
+    if args.mode == 'tfidf1':
+        tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True)
+    if args.mode == 'tfidf2':
+        tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1,2))
+    if args.mode == 'tfidf3':
+        tfidf = Pipeline([
+            ('tfidf', TfidfVectorizer(min_df=5, sublinear_tf=True)),
+            ('svd', TruncatedSVD(n_components=300))
+        ])
+    if args.mode == 'tfidf4':
+        tfidf = Pipeline([
+            ('tfidf', TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1,2))),
+            ('svd', TruncatedSVD(n_components=300))
+        ])
+    if args.mode == 'glove1':
+        tfidf = Pipeline([
+            ('glove-ave', WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')),
+            ('zscore', StandardScaler())
+        ])
+    if args.mode == 'glove2':
+        tfidf = WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')
+    if args.mode == 'glove3':
+        vect = TfidfVectorizer(min_df=5, sublinear_tf=True)
+        tfidf = Pipeline([
+            ('tfidf', vect),
+            ('embedding', TfidfWordEmbeddingTransformer(
+                wordset_name='glove',
+                features_call=vect.get_feature_names_out,
+                path='/mnt/1T/Datasets/GloVe')),
+            ('zscore', StandardScaler())
+        ])
+    if args.mode == 'glove4':
+        vect = TfidfVectorizer(min_df=5, sublinear_tf=True)
+        tfidf = Pipeline([
+            ('tfidf', vect),
+            ('embedding', TfidfWordEmbeddingTransformer(
+                wordset_name='glove',
+                features_call=vect.get_feature_names_out,
+                path='/mnt/1T/Datasets/GloVe'))
+        ])
+    if args.mode == 'wce1':
+        tfidf = WordClassEmbeddingsTransformer()
+    if args.mode == 'wce2':
+        glove = Pipeline([
+            ('glove-ave', WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')),
+            ('zscore', StandardScaler())
+        ])
+        wce = WordClassEmbeddingsTransformer()
+        tfidf = ConcatenateEmbeddingsTransformer([glove, wce])
+    if args.mode == 'wce3':
+        glove = Pipeline([
+            ('glove-ave', WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')),
+            ('zscore', StandardScaler())
+        ])
+        wce = WordClassEmbeddingsTransformer()
+        tfidf = Pipeline([
+            ('glove-wce', ConcatenateEmbeddingsTransformer([glove, wce])),
+            ('svd', TruncatedSVD(n_components=300))
+        ])
+    target_metric = qp.error.mrae

+    train.instances = tfidf.fit_transform(*train.Xy)

     print(f'number of classes: {len(train.classes_)}')
     print(f'number of training documents: {len(train)}')
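The hunk above replaces the single tf-idf preprocessing with a preprocessing mode selected by args.mode. A minimal, self-contained sketch of the two purely-sklearn variants ('tfidf1' and 'tfidf3') on a toy corpus; min_df and n_components are lowered here only so the toy data fits, and the GloVe / word-class-embedding transformers are omitted because their implementations are not part of this diff:

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# toy documents standing in for the LeQua raw-text training set
docs = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs are animals",
    "logs and mats are objects",
    "the cat chased the dog",
    "the dog chased the cat",
]

# 'tfidf1': plain sublinear tf-idf (sparse output)
tfidf1 = TfidfVectorizer(min_df=1, sublinear_tf=True)
X1 = tfidf1.fit_transform(docs)
print(X1.shape)   # (6, vocabulary size)

# 'tfidf3': tf-idf followed by truncated SVD (dense, low-dimensional output)
tfidf3 = Pipeline([
    ('tfidf', TfidfVectorizer(min_df=1, sublinear_tf=True)),
    ('svd', TruncatedSVD(n_components=2)),
])
X3 = tfidf3.fit_transform(docs)
print(X3.shape)   # (6, 2)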
@@ -58,6 +124,7 @@ def main(args):
         return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, return_id=False,
                                 load_fn=load_raw_unlabelled_documents, vectorizer=tfidf)

+    outs = []
     for quantifier, q_name in baselines():
         print(f'{q_name}: Model selection')
         quantifier = qp.model_selection.GridSearchQ(
@@ -65,17 +132,25 @@ def main(args):
             param_grid,
             sample_size=None,
             protocol='gen',
-            error=qp.error.mae,
+            error=target_metric, #qp.error.mae,
             refit=False,
             verbose=True
         ).fit(train, gen_samples)

-        print(f'{q_name} got MAE={quantifier.best_score_:.3f} (hyper-params: {quantifier.best_params_})')
+        print(f'{q_name} got MAE={quantifier.best_score_:.5f} (hyper-params: {quantifier.best_params_})')
+        outs.append(f'{q_name} got MAE={quantifier.best_score_:.5f} (hyper-params: {quantifier.best_params_})')

         model_path = os.path.join(models_path, q_name+'.'+args.task+'.pkl')
         print(f'saving model in {model_path}')
         pickle.dump(quantifier.best_model(), open(model_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)

+    print(tfidf)
+    print(args.mode)
+    print(outs)
+    with open(f'{args.mode}.{args.task}.txt', 'wt') as foo:
+        for line in outs:
+            foo.write(f'{line}\n')


 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='LeQua2022 Task T2A/T2B baselines')
@@ -87,6 +162,8 @@ if __name__ == '__main__':
     parser.add_argument('modeldir', metavar='MODEL-PATH', type=str,
                         help='Path where to save the models. '
                              'A subdirectory named <task> will be automatically created.')
+    parser.add_argument('mode', metavar='PREPROCESSMODE', type=str,
+                        help='modality of preprocessing')
     args = parser.parse_args()

     if not os.path.exists(args.datadir):
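The hunk above adds a fourth positional argument selecting the preprocessing mode. A small sketch of the resulting interface; only 'modeldir' and 'mode' appear verbatim in the diff, the other argument names are assumptions based on how args.task and args.datadir are used in main():

import argparse

parser = argparse.ArgumentParser(description='LeQua2022 Task T2A/T2B baselines')
parser.add_argument('task', metavar='TASK', type=str)            # name assumed
parser.add_argument('datadir', metavar='DATA-PATH', type=str)    # name assumed
parser.add_argument('modeldir', metavar='MODEL-PATH', type=str)
parser.add_argument('mode', metavar='PREPROCESSMODE', type=str,
                    help='modality of preprocessing')

# equivalent to invoking the script as: <script.py> T2A ./data ./models tfidf3
args = parser.parse_args(['T2A', './data', './models', 'tfidf3'])
print(args.task, args.mode)  # -> T2A tfidf3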
@@ -34,33 +34,42 @@ def load_raw_unlabelled_documents(path, vectorizer=None):
     return documents, None


-def load_vector_documents(path, nF=None):
-    X, y = sklearn.datasets.load_svmlight_file(path, n_features=nF)
-    y = y.astype(int)
+# def load_vector_documents(path, nF=None):
+#     X, y = sklearn.datasets.load_svmlight_file(path, n_features=nF, zero_based=True)
+#     y = y.astype(int)
+#     return X, y
+
+
+def load_vector_documents(path):
+    D = pd.read_csv(path).to_numpy(dtype=np.float)
+    labelled = D.shape[1] == 301
+    if labelled:
+        X, y = D[:,:300], D[:,-1].astype(np.int).flatten()
+    else:
+        X, y = D, None
     return X, y


-def __gen_load_samples_with_groudtruth(path_dir:str, return_id:bool, ground_truth_path:str, load_fn, **load_kwargs):
+def __gen_load_samples_with_groudtruth(path_dir:str, return_id:bool, ground_truth_path:str, ext:str, load_fn, **load_kwargs):
     true_prevs = ResultSubmission.load(ground_truth_path)
     for id, prevalence in true_prevs.iterrows():
-        sample, _ = load_fn(os.path.join(path_dir, f'{id}.txt'), **load_kwargs)
+        sample, _ = load_fn(os.path.join(path_dir, f'{id}.{ext}'), **load_kwargs)
         yield (id, sample, prevalence) if return_id else (sample, prevalence)


-def __gen_load_samples_without_groudtruth(path_dir:str, return_id:bool, load_fn, **load_kwargs):
-    nsamples = len(glob(os.path.join(path_dir, '*.txt')))
+def __gen_load_samples_without_groudtruth(path_dir:str, return_id:bool, ext:str, load_fn, **load_kwargs):
+    nsamples = len(glob(os.path.join(path_dir, f'*.{ext}')))
     for id in range(nsamples):
-        sample, _ = load_fn(os.path.join(path_dir, f'{id}.txt'), **load_kwargs)
+        sample, _ = load_fn(os.path.join(path_dir, f'{id}.{ext}'), **load_kwargs)
         yield (id, sample) if return_id else sample


-def gen_load_samples(path_dir:str, ground_truth_path:str = None, return_id=True, load_fn=load_vector_documents, **load_kwargs):
+def gen_load_samples(path_dir:str, ground_truth_path:str = None, return_id=True, ext='txt', load_fn=load_vector_documents, **load_kwargs):
     if ground_truth_path is None:
         # the generator function returns tuples (docid:str, sample:csr_matrix or str)
-        gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, load_fn, **load_kwargs)
+        gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, ext, load_fn, **load_kwargs)
     else:
         # the generator function returns tuples (docid:str, sample:csr_matrix or str, prevalence:ndarray)
-        gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, load_fn, **load_kwargs)
+        gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, ext, load_fn, **load_kwargs)
     for r in gen_fn:
         yield r

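The new load_vector_documents expects each sample to be a CSV with 300 feature columns plus, for labelled samples, one trailing label column (hence the shape[1] == 301 check). A minimal round-trip sketch of that format; the file name is made up, and plain float/int are used here where the diff writes the now-deprecated np.float/np.int:

import numpy as np
import pandas as pd

# write a tiny labelled sample in the new CSV format: 300 features + 1 label column
rng = np.random.default_rng(0)
X = rng.normal(size=(5, 300))
y = rng.integers(0, 2, size=5)
pd.DataFrame(np.hstack([X, y.reshape(-1, 1)])).to_csv('0.csv', index=False)

# read it back the way the new loader does
D = pd.read_csv('0.csv').to_numpy(dtype=float)
labelled = D.shape[1] == 301
X_back, y_back = (D[:, :300], D[:, -1].astype(int)) if labelled else (D, None)
print(X_back.shape, y_back)  # (5, 300) and the five labels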
@@ -139,7 +148,11 @@ class ResultSubmission:

     @classmethod
     def check_file_format(cls, path) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:
-        df = pd.read_csv(path, index_col=0)
+        try:
+            df = pd.read_csv(path, index_col=0)
+        except Exception as e:
+            print(f'the file {path} does not seem to be a valid csv file. ')
+            print(e)
         return ResultSubmission.check_dataframe_format(df, path=path)

     @classmethod
@@ -24,6 +24,7 @@ def artificial_prevalence_prediction(
         verbose=False):
     """
     Performs the predictions for all samples generated according to the artificial sampling protocol.

     :param model: the model in charge of generating the class prevalence estimations
     :param test: the test set on which to perform arificial sampling
     :param sample_size: the size of the samples
@@ -3,6 +3,8 @@ import signal
 from copy import deepcopy
 from typing import Union, Callable

+import numpy as np
+
 import quapy as qp
 from quapy.data.base import LabelledCollection
 from quapy.evaluation import artificial_prevalence_prediction, natural_prevalence_prediction, gen_prevalence_prediction
@@ -190,6 +192,7 @@ class GridSearchQ(BaseQuantifier):
             model.fit(training)
             true_prevalences, estim_prevalences = self.__generate_predictions(model, val_split)
             score = self.error(true_prevalences, estim_prevalences)

             self._sout(f'checking hyperparams={params} got {self.error.__name__} score {score:.5f}')
             if self.best_score_ is None or score < self.best_score_:
                 self.best_score_ = score
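Throughout the diff, model selection now scores configurations with qp.error.mrae instead of MAE. For reference, a minimal sketch of mean relative absolute error with the additive smoothing conventionally used in LeQua (eps = 1 / (2 * sample_size)); this mirrors the intent of qp.error.mrae but is not the library's own implementation:

import numpy as np

def smooth(p, eps):
    # additive smoothing so that zero true prevalences do not make the ratio explode
    p = np.asarray(p, dtype=float)
    return (p + eps) / (eps * len(p) + 1)

def mrae(true_prev, estim_prev, eps):
    true_prev, estim_prev = smooth(true_prev, eps), smooth(estim_prev, eps)
    return float(np.mean(np.abs(estim_prev - true_prev) / true_prev))

# example for a binary sample of 250 documents, hence eps = 1/(2*250)
print(mrae([0.8, 0.2], [0.7, 0.3], eps=1 / (2 * 250)))  # ~0.31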