adapting everything to the new file format

This commit is contained in:
Alejandro Moreo Fernandez 2021-11-30 11:36:23 +01:00
parent 8368c467dc
commit 4da1233b46
8 changed files with 100 additions and 283 deletions

View File

@ -22,10 +22,10 @@ import constants
def baselines(): def baselines():
yield CC(LR(n_jobs=-1)), "CC" yield CC(LR(n_jobs=-1)), "CC"
yield ACC(LR(n_jobs=-1)), "ACC" # yield ACC(LR(n_jobs=-1)), "ACC"
yield PCC(LR(n_jobs=-1)), "PCC" # yield PCC(LR(n_jobs=-1)), "PCC"
yield PACC(LR(n_jobs=-1)), "PACC" # yield PACC(LR(n_jobs=-1)), "PACC"
yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD" # yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD"
# yield HDy(LR(n_jobs=-1)) if args.task == 'T2A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy" # yield HDy(LR(n_jobs=-1)) if args.task == 'T2A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy"
# yield MLPE(), "MLPE" # yield MLPE(), "MLPE"
@ -34,75 +34,15 @@ def main(args):
models_path = qp.util.create_if_not_exist(os.path.join(args.modeldir, args.task)) models_path = qp.util.create_if_not_exist(os.path.join(args.modeldir, args.task))
path_dev_vectors = os.path.join(args.datadir, 'dev_documents') path_dev_vectors = os.path.join(args.datadir, 'dev_samples')
path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.csv') path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.txt')
path_train = os.path.join(args.datadir, 'training_documents.txt') path_train = os.path.join(args.datadir, 'training_data.txt')
qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task] qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task]
train = LabelledCollection.load(path_train, load_raw_documents) train = LabelledCollection.load(path_train, load_raw_documents)
if args.mode == 'tfidf1':
tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True)
if args.mode == 'tfidf2':
tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1,2)) tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1,2))
if args.mode == 'tfidf3':
tfidf = Pipeline([
('tfidf', TfidfVectorizer(min_df=5, sublinear_tf=True)),
('svd', TruncatedSVD(n_components=300))
])
if args.mode == 'tfidf4':
tfidf = Pipeline([
('tfidf', TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1,2))),
('svd', TruncatedSVD(n_components=300))
])
if args.mode == 'glove1':
tfidf = Pipeline([
('glove-ave', WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')),
('zscore', StandardScaler())
])
if args.mode == 'glove2':
tfidf = WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')
if args.mode == 'glove3':
vect = TfidfVectorizer(min_df=5, sublinear_tf=True)
tfidf = Pipeline([
('tfidf', vect),
('embedding', TfidfWordEmbeddingTransformer(
wordset_name='glove',
features_call=vect.get_feature_names_out,
path='/mnt/1T/Datasets/GloVe')),
('zscore', StandardScaler())
])
if args.mode == 'glove4':
vect = TfidfVectorizer(min_df=5, sublinear_tf=True)
tfidf = Pipeline([
('tfidf', vect),
('embedding', TfidfWordEmbeddingTransformer(
wordset_name='glove',
features_call=vect.get_feature_names_out,
path='/mnt/1T/Datasets/GloVe'))
])
if args.mode == 'wce1':
tfidf = WordClassEmbeddingsTransformer()
if args.mode == 'wce2':
glove = Pipeline([
('glove-ave', WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')),
('zscore', StandardScaler())
])
wce = WordClassEmbeddingsTransformer()
tfidf = ConcatenateEmbeddingsTransformer([glove, wce])
if args.mode == 'wce3':
glove = Pipeline([
('glove-ave', WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')),
('zscore', StandardScaler())
])
wce = WordClassEmbeddingsTransformer()
tfidf = Pipeline([
('glove-wce', ConcatenateEmbeddingsTransformer([glove, wce])),
('svd', TruncatedSVD(n_components=300))
])
target_metric = qp.error.mrae
train.instances = tfidf.fit_transform(*train.Xy) train.instances = tfidf.fit_transform(*train.Xy)
print(f'number of classes: {len(train.classes_)}') print(f'number of classes: {len(train.classes_)}')
@ -110,18 +50,18 @@ def main(args):
print(f'training prevalence: {F.strprev(train.prevalence())}') print(f'training prevalence: {F.strprev(train.prevalence())}')
print(f'training matrix shape: {train.instances.shape}') print(f'training matrix shape: {train.instances.shape}')
param_grid = {
'C': np.logspace(-3, 3, 7),
'class_weight': ['balanced', None]
}
# param_grid = { # param_grid = {
# 'C': [1], # 'C': np.logspace(-3, 3, 7),
# 'class_weight': ['balanced'] # 'class_weight': ['balanced', None]
# } # }
param_grid = {
'C': [1],
'class_weight': ['balanced']
}
def gen_samples(): def gen_samples():
return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, return_id=False, return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs,
load_fn=load_raw_unlabelled_documents, vectorizer=tfidf) load_fn=load_raw_unlabelled_documents, vectorizer=tfidf)
outs = [] outs = []
@ -132,7 +72,7 @@ def main(args):
param_grid, param_grid,
sample_size=None, sample_size=None,
protocol='gen', protocol='gen',
error=target_metric, #qp.error.mae, error=qp.error.mrae,
refit=False, refit=False,
verbose=True verbose=True
).fit(train, gen_samples) ).fit(train, gen_samples)
@ -144,8 +84,6 @@ def main(args):
print(f'saving model in {model_path}') print(f'saving model in {model_path}')
pickle.dump(quantifier.best_model(), open(model_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL) pickle.dump(quantifier.best_model(), open(model_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
print(tfidf)
print(args.mode)
print(outs) print(outs)
with open(f'{args.mode}.{args.task}.txt', 'wt') as foo: with open(f'{args.mode}.{args.task}.txt', 'wt') as foo:
for line in outs: for line in outs:
@ -157,26 +95,23 @@ if __name__ == '__main__':
parser.add_argument('task', metavar='TASK', type=str, choices=['T2A', 'T2B'], parser.add_argument('task', metavar='TASK', type=str, choices=['T2A', 'T2B'],
help='Task name (T2A, T2B)') help='Task name (T2A, T2B)')
parser.add_argument('datadir', metavar='DATA-PATH', type=str, parser.add_argument('datadir', metavar='DATA-PATH', type=str,
help='Path of the directory containing "dev_prevalences.csv", "training_documents.txt", and ' help='Path of the directory containing "dev_prevalences.txt", "training_data.txt", and '
'the directory "dev_documents"') 'the directory "dev_documents"')
parser.add_argument('modeldir', metavar='MODEL-PATH', type=str, parser.add_argument('modeldir', metavar='MODEL-PATH', type=str,
help='Path where to save the models. ' help='Path where to save the models. '
'A subdirectory named <task> will be automatically created.') 'A subdirectory named <task> will be automatically created.')
parser.add_argument('mode', metavar='PREPROCESSMODE', type=str,
help='modality of preprocessing')
args = parser.parse_args() args = parser.parse_args()
if not os.path.exists(args.datadir): if not os.path.exists(args.datadir):
raise FileNotFoundError(f'path {args.datadir} does not exist') raise FileNotFoundError(f'path {args.datadir} does not exist')
if not os.path.isdir(args.datadir): if not os.path.isdir(args.datadir):
raise ValueError(f'path {args.datadir} is not a valid directory') raise ValueError(f'path {args.datadir} is not a valid directory')
if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.csv")): if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.txt")):
raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.csv" file') raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.txt" file')
if not os.path.exists(os.path.join(args.datadir, "training_documents.txt")): if not os.path.exists(os.path.join(args.datadir, "training_data.txt")):
raise FileNotFoundError(f'path {args.datadir} does not contain "training_documents.txt" file') raise FileNotFoundError(f'path {args.datadir} does not contain "training_data.txt" file')
if not os.path.exists(os.path.join(args.datadir, "dev_documents")): if not os.path.exists(os.path.join(args.datadir, "dev_samples")):
raise FileNotFoundError(f'path {args.datadir} does not contain "dev_vectors" folder') raise FileNotFoundError(f'path {args.datadir} does not contain "dev_samples" folder')
main(args) main(args)
# print('WITHOUT MODEL SELECTION')

View File

@ -1,5 +1,7 @@
import argparse import argparse
import pickle import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression as LR from sklearn.linear_model import LogisticRegression as LR
from quapy.method.aggregative import * from quapy.method.aggregative import *
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
@ -16,8 +18,8 @@ def baselines():
yield CC(LR(n_jobs=-1)), "CC" yield CC(LR(n_jobs=-1)), "CC"
# yield ACC(LR(n_jobs=-1)), "ACC" # yield ACC(LR(n_jobs=-1)), "ACC"
# yield PCC(LR(n_jobs=-1)), "PCC" # yield PCC(LR(n_jobs=-1)), "PCC"
# yield PACC(LR(n_jobs=-1)), "PACC" yield PACC(LR(n_jobs=-1)), "PACC"
# yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD" yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD"
# yield HDy(LR(n_jobs=-1)) if args.task == 'T1A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy" # yield HDy(LR(n_jobs=-1)) if args.task == 'T1A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy"
# yield MLPE(), "MLPE" # yield MLPE(), "MLPE"
@ -26,35 +28,40 @@ def main(args):
models_path = qp.util.create_if_not_exist(os.path.join(args.modeldir, args.task)) models_path = qp.util.create_if_not_exist(os.path.join(args.modeldir, args.task))
path_dev_vectors = os.path.join(args.datadir, 'dev_vectors') path_dev_vectors = os.path.join(args.datadir, 'dev_samples')
path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.csv') path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.txt')
path_train = os.path.join(args.datadir, 'training_vectors.csv') path_train = os.path.join(args.datadir, 'training_data.txt')
qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task] qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task]
if args.task in {'T1A', 'T1B'}:
train = LabelledCollection.load(path_train, load_vector_documents) train = LabelledCollection.load(path_train, load_vector_documents)
nF = train.instances.shape[1]
def gen_samples():
return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, load_fn=load_vector_documents)
else:
train = LabelledCollection.load(path_train, load_raw_documents)
tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1, 2))
train.instances = tfidf.fit_transform(*train.Xy)
def gen_samples():
return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs,
load_fn=load_raw_unlabelled_documents, vectorizer=tfidf)
print(f'number of classes: {len(train.classes_)}') print(f'number of classes: {len(train.classes_)}')
print(f'number of training documents: {len(train)}') print(f'number of training documents: {len(train)}')
print(f'training prevalence: {F.strprev(train.prevalence())}') print(f'training prevalence: {F.strprev(train.prevalence())}')
print(f'training matrix shape: {train.instances.shape}') print(f'training matrix shape: {train.instances.shape}')
# param_grid = {
# 'C': np.logspace(-3, 3, 7),
# 'class_weight': ['balanced', None]
# }
param_grid = { param_grid = {
'C': [0.01], 'C': np.logspace(-3, 3, 7),
'class_weight': ['balanced'] 'class_weight': ['balanced', None]
} }
target_metric = qp.error.mrae
def gen_samples():
return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, return_id=False,
load_fn=load_vector_documents, ext='csv')
# param_grid = {
# 'C': [0.01, 0.1, 1],
# 'class_weight': ['balanced']
# }
for quantifier, q_name in baselines(): for quantifier, q_name in baselines():
print(f'{q_name}: Model selection') print(f'{q_name}: Model selection')
@ -63,7 +70,7 @@ def main(args):
param_grid, param_grid,
sample_size=None, sample_size=None,
protocol='gen', protocol='gen',
error=target_metric, #qp.error.mae, error=qp.error.mrae,
refit=False, refit=False,
verbose=True verbose=True
).fit(train, gen_samples) ).fit(train, gen_samples)
@ -76,12 +83,12 @@ def main(args):
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser(description='LeQua2022 Task T1A/T1B baselines') parser = argparse.ArgumentParser(description='LeQua2022 baselines')
parser.add_argument('task', metavar='TASK', type=str, choices=['T1A', 'T1B'], parser.add_argument('task', metavar='TASK', type=str, choices=['T1A', 'T1B', 'T2A', 'T2B'],
help='Task name (T1A, T1B)') help='Task name (T1A, T1B, T2A, T2B)')
parser.add_argument('datadir', metavar='DATA-PATH', type=str, parser.add_argument('datadir', metavar='DATA-PATH', type=str,
help='Path of the directory containing "dev_prevalences.csv", "training_vectors.txt", and ' help='Path of the directory containing "dev_prevalences.txt", "training_data.txt", and '
'the directory "dev_vectors"') 'the directory "dev_samples"')
parser.add_argument('modeldir', metavar='MODEL-PATH', type=str, parser.add_argument('modeldir', metavar='MODEL-PATH', type=str,
help='Path where to save the models. ' help='Path where to save the models. '
'A subdirectory named <task> will be automatically created.') 'A subdirectory named <task> will be automatically created.')
@ -91,11 +98,11 @@ if __name__ == '__main__':
raise FileNotFoundError(f'path {args.datadir} does not exist') raise FileNotFoundError(f'path {args.datadir} does not exist')
if not os.path.isdir(args.datadir): if not os.path.isdir(args.datadir):
raise ValueError(f'path {args.datadir} is not a valid directory') raise ValueError(f'path {args.datadir} is not a valid directory')
if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.csv")): if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.txt")):
raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.csv" file') raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.txt" file')
if not os.path.exists(os.path.join(args.datadir, "training_vectors.csv")): if not os.path.exists(os.path.join(args.datadir, "training_data.txt")):
raise FileNotFoundError(f'path {args.datadir} does not contain "training_vectors.csv" file') raise FileNotFoundError(f'path {args.datadir} does not contain "training_data.txt" file')
if not os.path.exists(os.path.join(args.datadir, "dev_vectors")): if not os.path.exists(os.path.join(args.datadir, "dev_samples")):
raise FileNotFoundError(f'path {args.datadir} does not contain "dev_vectors" folder') raise FileNotFoundError(f'path {args.datadir} does not contain "dev_samples" folder')
main(args) main(args)

View File

@ -2,18 +2,15 @@ DEV_SAMPLES = 1000
TEST_SAMPLES = 5000 TEST_SAMPLES = 5000
TXA_SAMPLE_SIZE = 250 TXA_SAMPLE_SIZE = 250
TXB_SAMPLE_SIZE = 250 TXB_SAMPLE_SIZE = 1000
T1A_SAMPLE_SIZE = 250
T1B_SAMPLE_SIZE = 1000
T2A_SAMPLE_SIZE = 250
T2B_SAMPLE_SIZE = 1000
SAMPLE_SIZE={ SAMPLE_SIZE={
'T1A': T1A_SAMPLE_SIZE, 'TXA': TXA_SAMPLE_SIZE,
'T1B': T1B_SAMPLE_SIZE, 'TXB': TXB_SAMPLE_SIZE,
'T2A': T2A_SAMPLE_SIZE, 'T1A': TXA_SAMPLE_SIZE,
'T2B': T2B_SAMPLE_SIZE 'T1B': TXB_SAMPLE_SIZE,
'T2A': TXA_SAMPLE_SIZE,
'T2B': TXB_SAMPLE_SIZE
} }
ERROR_TOL = 1E-3 ERROR_TOL = 1E-3

View File

@ -34,53 +34,37 @@ def load_raw_unlabelled_documents(path, vectorizer=None):
return documents, None return documents, None
# def load_vector_documents(path, nF=None):
# X, y = sklearn.datasets.load_svmlight_file(path, n_features=nF, zero_based=True)
# y = y.astype(int)
# return X, y
def load_vector_documents(path): def load_vector_documents(path):
D = pd.read_csv(path).to_numpy(dtype=np.float) D = pd.read_csv(path).to_numpy(dtype=np.float)
labelled = D.shape[1] == 301 labelled = D.shape[1] == 301
if labelled: if labelled:
X, y = D[:,:300], D[:,-1].astype(np.int).flatten() X, y = D[:,1:], D[:,0].astype(np.int).flatten()
else: else:
X, y = D, None X, y = D, None
return X, y return X, y
def __gen_load_samples_with_groudtruth(path_dir:str, return_id:bool, ground_truth_path:str, ext:str, load_fn, **load_kwargs): def __gen_load_samples_with_groudtruth(path_dir:str, return_id:bool, ground_truth_path:str, load_fn, **load_kwargs):
true_prevs = ResultSubmission.load(ground_truth_path) true_prevs = ResultSubmission.load(ground_truth_path)
for id, prevalence in true_prevs.iterrows(): for id, prevalence in true_prevs.iterrows():
sample, _ = load_fn(os.path.join(path_dir, f'{id}.{ext}'), **load_kwargs) sample, _ = load_fn(os.path.join(path_dir, f'{id}.txt'), **load_kwargs)
yield (id, sample, prevalence) if return_id else (sample, prevalence) yield (id, sample, prevalence) if return_id else (sample, prevalence)
def __gen_load_samples_without_groudtruth(path_dir:str, return_id:bool, ext:str, load_fn, **load_kwargs): def __gen_load_samples_without_groudtruth(path_dir:str, return_id:bool, load_fn, **load_kwargs):
nsamples = len(glob(os.path.join(path_dir, f'*.{ext}'))) nsamples = len(glob(os.path.join(path_dir, f'*.txt')))
for id in range(nsamples): for id in range(nsamples):
sample, _ = load_fn(os.path.join(path_dir, f'{id}.{ext}'), **load_kwargs) sample, _ = load_fn(os.path.join(path_dir, f'{id}.txt'), **load_kwargs)
yield (id, sample) if return_id else sample yield (id, sample) if return_id else sample
def gen_load_samples(path_dir:str, ground_truth_path:str = None, return_id=True, ext='txt', load_fn=load_vector_documents, **load_kwargs): def gen_load_samples(path_dir:str, ground_truth_path:str = None, return_id=False, load_fn=load_vector_documents, **load_kwargs):
if ground_truth_path is None: if ground_truth_path is None:
# the generator function returns tuples (docid:str, sample:csr_matrix or str) # the generator function returns tuples (docid:str, sample:csr_matrix or str)
gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, ext, load_fn, **load_kwargs) gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, load_fn, **load_kwargs)
else: else:
# the generator function returns tuples (docid:str, sample:csr_matrix or str, prevalence:ndarray) # the generator function returns tuples (docid:str, sample:csr_matrix or str, prevalence:ndarray)
gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, ext, load_fn, **load_kwargs) gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, load_fn, **load_kwargs)
for r in gen_fn:
yield r
def genSVD_load_samples_T1(load_fn, path_dir:str, nF:int, ground_truth_path:str = None, return_id=True):
if ground_truth_path is None:
# the generator function returns tuples (filename:str, sample:csr_matrix)
gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, load_fn, nF=nF)
else:
# the generator function returns tuples (filename:str, sample:csr_matrix, prevalence:ndarray)
gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, load_fn, nF=nF)
for r in gen_fn: for r in gen_fn:
yield r yield r
@ -214,19 +198,19 @@ def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSub
raise ValueError(f'these result files are not comparable since the categories are different: ' raise ValueError(f'these result files are not comparable since the categories are different: '
f'true={true_prevs.n_categories} categories vs. ' f'true={true_prevs.n_categories} categories vs. '
f'predictions={predicted_prevs.n_categories} categories') f'predictions={predicted_prevs.n_categories} categories')
ae, rae = [], [] rae, ae = [], []
for sample_id, true_prevalence in true_prevs.iterrows(): for sample_id, true_prevalence in true_prevs.iterrows():
pred_prevalence = predicted_prevs.prevalence(sample_id) pred_prevalence = predicted_prevs.prevalence(sample_id)
ae.append(qp.error.ae(true_prevalence, pred_prevalence))
rae.append(qp.error.rae(true_prevalence, pred_prevalence, eps=1./(2*sample_size))) rae.append(qp.error.rae(true_prevalence, pred_prevalence, eps=1./(2*sample_size)))
ae.append(qp.error.ae(true_prevalence, pred_prevalence))
ae = np.asarray(ae)
rae = np.asarray(rae) rae = np.asarray(rae)
ae = np.asarray(ae)
if average: if average:
return ae.mean(), rae.mean() return rae.mean(), ae.mean()
else: else:
return ae, rae return rae, ae

View File

@ -8,21 +8,20 @@ LeQua2022 Official evaluation script
""" """
def main(args): def main(args):
if args.task in {'T1A', 'T2A'}:
qp.environ['SAMPLE_SIZE'] = constants.TXA_SAMPLE_SIZE sample_size = constants.SAMPLE_SIZE[args.task]
if args.task in {'T1B', 'T2B'}:
qp.environ['SAMPLE_SIZE'] = constants.TXB_SAMPLE_SIZE
true_prev = ResultSubmission.load(args.true_prevalences) true_prev = ResultSubmission.load(args.true_prevalences)
pred_prev = ResultSubmission.load(args.pred_prevalences) pred_prev = ResultSubmission.load(args.pred_prevalences)
mae, mrae = evaluate_submission(true_prev, pred_prev)
print(f'MAE: {mae:.4f}') mrae, mae = evaluate_submission(true_prev, pred_prev, sample_size)
print(f'MRAE: {mrae:.4f}') print(f'MRAE: {mrae:.4f}')
print(f'MAE: {mae:.4f}')
if args.output is not None: if args.output is not None:
qp.util.create_parent_dir(args.output)
with open(args.output, 'wt') as foo: with open(args.output, 'wt') as foo:
foo.write(f'MAE: {mae:.4f}\n')
foo.write(f'MRAE: {mrae:.4f}\n') foo.write(f'MRAE: {mrae:.4f}\n')
foo.write(f'MAE: {mae:.4f}\n')
if __name__=='__main__': if __name__=='__main__':
@ -37,4 +36,7 @@ if __name__=='__main__':
help='Path where to store the evaluation scores') help='Path where to store the evaluation scores')
args = parser.parse_args() args = parser.parse_args()
if args.output is not None:
qp.util.create_parent_dir(args.output)
main(args) main(args)

View File

@ -1,8 +1,6 @@
import argparse import argparse
import quapy as qp from data import ResultSubmission
from data import ResultSubmission, evaluate_submission
import constants
import os
""" """
LeQua2022 Official format-checker script LeQua2022 Official format-checker script
@ -13,9 +11,9 @@ def main(args):
ResultSubmission.check_file_format(args.prevalence_file) ResultSubmission.check_file_format(args.prevalence_file)
except Exception as e: except Exception as e:
print(e) print(e)
print('Format check: not passed') print('Format check: [not passed]')
else: else:
print('Format check: passed') print('Format check: [passed]')
if __name__=='__main__': if __name__=='__main__':

View File

@ -1,7 +1,6 @@
import argparse import argparse
import quapy as qp import quapy as qp
from data import ResultSubmission from data import ResultSubmission
import constants
import os import os
import pickle import pickle
from tqdm import tqdm from tqdm import tqdm
@ -27,7 +26,7 @@ def main(args):
# predictions # predictions
predictions = ResultSubmission() predictions = ResultSubmission()
for sampleid, sample in tqdm(gen_load_samples(args.samples, args.nf), desc='predicting', total=nsamples): for sampleid, sample in tqdm(gen_load_samples(args.samples, return_id=True, load_fn=), desc='predicting', total=nsamples):
predictions.add(sampleid, model.quantify(sample)) predictions.add(sampleid, model.quantify(sample))
# saving # saving

View File

@ -9,111 +9,6 @@ from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from quapy.functional import artificial_prevalence_sampling, strprev from quapy.functional import artificial_prevalence_sampling, strprev
# class Sampling:
#
# @abstractmethod
# def load(cls, path: str, loader_func: callable, classes=None): ...
#
# @abstractmethod
# @property
# def __len__(self): ...
#
# @abstractmethod
# @property
# def prevalence(self): ...
#
# @abstractmethod
# @property
# def n_classes(self):
#
# @property
# def binary(self):
# return self.n_classes == 2
#
# def uniform_sampling_index(self, size):
# return np.random.choice(len(self), size, replace=False)
#
# def uniform_sampling(self, size):
# unif_index = self.uniform_sampling_index(size)
# return self.sampling_from_index(unif_index)
#
# def sampling(self, size, *prevs, shuffle=True):
# prev_index = self.sampling_index(size, *prevs, shuffle=shuffle)
# return self.sampling_from_index(prev_index)
#
# def sampling_from_index(self, index):
# documents = self.instances[index]
# labels = self.labels[index]
# return LabelledCollection(documents, labels, classes_=self.classes_)
#
# def split_stratified(self, train_prop=0.6, random_state=None):
# # with temp_seed(42):
# tr_docs, te_docs, tr_labels, te_labels = \
# train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels,
# random_state=random_state)
# return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
#
# def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1):
# dimensions = self.n_classes
# for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
# yield self.sampling(sample_size, *prevs)
#
# def artificial_sampling_index_generator(self, sample_size, n_prevalences=101, repeats=1):
# dimensions = self.n_classes
# for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
# yield self.sampling_index(sample_size, *prevs)
#
# def natural_sampling_generator(self, sample_size, repeats=100):
# for _ in range(repeats):
# yield self.uniform_sampling(sample_size)
#
# def natural_sampling_index_generator(self, sample_size, repeats=100):
# for _ in range(repeats):
# yield self.uniform_sampling_index(sample_size)
#
# def __add__(self, other):
# if other is None:
# return self
# elif issparse(self.instances) and issparse(other.instances):
# join_instances = vstack([self.instances, other.instances])
# elif isinstance(self.instances, list) and isinstance(other.instances, list):
# join_instances = self.instances + other.instances
# elif isinstance(self.instances, np.ndarray) and isinstance(other.instances, np.ndarray):
# join_instances = np.concatenate([self.instances, other.instances])
# else:
# raise NotImplementedError('unsupported operation for collection types')
# labels = np.concatenate([self.labels, other.labels])
# return LabelledCollection(join_instances, labels)
#
# @property
# def Xy(self):
# return self.instances, self.labels
#
# def stats(self, show=True):
# ninstances = len(self)
# instance_type = type(self.instances[0])
# if instance_type == list:
# nfeats = len(self.instances[0])
# elif instance_type == np.ndarray or issparse(self.instances):
# nfeats = self.instances.shape[1]
# else:
# nfeats = '?'
# stats_ = {'instances': ninstances,
# 'type': instance_type,
# 'features': nfeats,
# 'classes': self.classes_,
# 'prevs': strprev(self.prevalence())}
# if show:
# print(f'#instances={stats_["instances"]}, type={stats_["type"]}, #features={stats_["features"]}, '
# f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}')
# return stats_
#
# def kFCV(self, nfolds=5, nrepeats=1, random_state=0):
# kf = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state)
# for train_index, test_index in kf.split(*self.Xy):
# train = self.sampling_from_index(train_index)
# test = self.sampling_from_index(test_index)
# yield train, test
class LabelledCollection: class LabelledCollection:
''' '''
@ -146,8 +41,8 @@ class LabelledCollection:
self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_} self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_}
@classmethod @classmethod
def load(cls, path: str, loader_func: callable, classes=None): def load(cls, path: str, loader_func: callable, classes=None, **loader_kwargs):
return LabelledCollection(*loader_func(path), classes) return LabelledCollection(*loader_func(path, **loader_kwargs), classes)
def __len__(self): def __len__(self):
return self.instances.shape[0] return self.instances.shape[0]