adapting everything to the new file format
parent 8368c467dc
commit 4da1233b46
@@ -22,10 +22,10 @@ import constants

 def baselines():
     yield CC(LR(n_jobs=-1)), "CC"
-    yield ACC(LR(n_jobs=-1)), "ACC"
-    yield PCC(LR(n_jobs=-1)), "PCC"
-    yield PACC(LR(n_jobs=-1)), "PACC"
-    yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD"
+    # yield ACC(LR(n_jobs=-1)), "ACC"
+    # yield PCC(LR(n_jobs=-1)), "PCC"
+    # yield PACC(LR(n_jobs=-1)), "PACC"
+    # yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD"
     # yield HDy(LR(n_jobs=-1)) if args.task == 'T2A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy"
     # yield MLPE(), "MLPE"

@@ -34,75 +34,15 @@ def main(args):

     models_path = qp.util.create_if_not_exist(os.path.join(args.modeldir, args.task))

-    path_dev_vectors = os.path.join(args.datadir, 'dev_documents')
-    path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.csv')
-    path_train = os.path.join(args.datadir, 'training_documents.txt')
+    path_dev_vectors = os.path.join(args.datadir, 'dev_samples')
+    path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.txt')
+    path_train = os.path.join(args.datadir, 'training_data.txt')

     qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task]

     train = LabelledCollection.load(path_train, load_raw_documents)

-    if args.mode == 'tfidf1':
-        tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True)
-    if args.mode == 'tfidf2':
-        tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1,2))
+    tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1,2))
-    if args.mode == 'tfidf3':
-        tfidf = Pipeline([
-            ('tfidf', TfidfVectorizer(min_df=5, sublinear_tf=True)),
-            ('svd', TruncatedSVD(n_components=300))
-        ])
-    if args.mode == 'tfidf4':
-        tfidf = Pipeline([
-            ('tfidf', TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1,2))),
-            ('svd', TruncatedSVD(n_components=300))
-        ])
-    if args.mode == 'glove1':
-        tfidf = Pipeline([
-            ('glove-ave', WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')),
-            ('zscore', StandardScaler())
-        ])
-    if args.mode == 'glove2':
-        tfidf = WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')
-    if args.mode == 'glove3':
-        vect = TfidfVectorizer(min_df=5, sublinear_tf=True)
-        tfidf = Pipeline([
-            ('tfidf', vect),
-            ('embedding', TfidfWordEmbeddingTransformer(
-                wordset_name='glove',
-                features_call=vect.get_feature_names_out,
-                path='/mnt/1T/Datasets/GloVe')),
-            ('zscore', StandardScaler())
-        ])
-    if args.mode == 'glove4':
-        vect = TfidfVectorizer(min_df=5, sublinear_tf=True)
-        tfidf = Pipeline([
-            ('tfidf', vect),
-            ('embedding', TfidfWordEmbeddingTransformer(
-                wordset_name='glove',
-                features_call=vect.get_feature_names_out,
-                path='/mnt/1T/Datasets/GloVe'))
-        ])
-    if args.mode == 'wce1':
-        tfidf = WordClassEmbeddingsTransformer()
-    if args.mode == 'wce2':
-        glove = Pipeline([
-            ('glove-ave', WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')),
-            ('zscore', StandardScaler())
-        ])
-        wce = WordClassEmbeddingsTransformer()
-        tfidf = ConcatenateEmbeddingsTransformer([glove, wce])
-    if args.mode == 'wce3':
-        glove = Pipeline([
-            ('glove-ave', WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')),
-            ('zscore', StandardScaler())
-        ])
-        wce = WordClassEmbeddingsTransformer()
-        tfidf = Pipeline([
-            ('glove-wce', ConcatenateEmbeddingsTransformer([glove, wce])),
-            ('svd', TruncatedSVD(n_components=300))
-        ])
-    target_metric = qp.error.mrae

     train.instances = tfidf.fit_transform(*train.Xy)

     print(f'number of classes: {len(train.classes_)}')
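
Note: train.Xy unpacks into the pair (instances, labels), so the single surviving preprocessing line feeds both to the vectorizer, which simply ignores the labels. A minimal sketch of the equivalent explicit form (assuming scikit-learn's TfidfVectorizer, whose fit_transform accepts an optional label argument):

# sketch only: same effect as "train.instances = tfidf.fit_transform(*train.Xy)"
X_raw, y = train.Xy                                   # raw documents and their labels
tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1, 2))
train.instances = tfidf.fit_transform(X_raw, y)       # y is accepted but unused by the vectorizer
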
@@ -110,18 +50,18 @@ def main(args):
     print(f'training prevalence: {F.strprev(train.prevalence())}')
     print(f'training matrix shape: {train.instances.shape}')

-    param_grid = {
-        'C': np.logspace(-3, 3, 7),
-        'class_weight': ['balanced', None]
-    }

     # param_grid = {
-    #     'C': [1],
-    #     'class_weight': ['balanced']
+    #     'C': np.logspace(-3, 3, 7),
+    #     'class_weight': ['balanced', None]
     # }

+    param_grid = {
+        'C': [1],
+        'class_weight': ['balanced']
+    }

     def gen_samples():
-        return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, return_id=False,
+        return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs,
                                 load_fn=load_raw_unlabelled_documents, vectorizer=tfidf)

     outs = []
@@ -132,7 +72,7 @@ def main(args):
             param_grid,
             sample_size=None,
             protocol='gen',
-            error=target_metric, #qp.error.mae,
+            error=qp.error.mrae,
             refit=False,
             verbose=True
         ).fit(train, gen_samples)
@@ -144,8 +84,6 @@ def main(args):
         print(f'saving model in {model_path}')
         pickle.dump(quantifier.best_model(), open(model_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)

-    print(tfidf)
-    print(args.mode)
     print(outs)
     with open(f'{args.mode}.{args.task}.txt', 'wt') as foo:
         for line in outs:
@@ -157,26 +95,23 @@ if __name__ == '__main__':
     parser.add_argument('task', metavar='TASK', type=str, choices=['T2A', 'T2B'],
                         help='Task name (T2A, T2B)')
     parser.add_argument('datadir', metavar='DATA-PATH', type=str,
-                        help='Path of the directory containing "dev_prevalences.csv", "training_documents.txt", and '
+                        help='Path of the directory containing "dev_prevalences.txt", "training_data.txt", and '
                              'the directory "dev_documents"')
     parser.add_argument('modeldir', metavar='MODEL-PATH', type=str,
                         help='Path where to save the models. '
                              'A subdirectory named <task> will be automatically created.')
-    parser.add_argument('mode', metavar='PREPROCESSMODE', type=str,
-                        help='modality of preprocessing')
     args = parser.parse_args()

     if not os.path.exists(args.datadir):
         raise FileNotFoundError(f'path {args.datadir} does not exist')
     if not os.path.isdir(args.datadir):
         raise ValueError(f'path {args.datadir} is not a valid directory')
-    if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.csv")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.csv" file')
-    if not os.path.exists(os.path.join(args.datadir, "training_documents.txt")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "training_documents.txt" file')
-    if not os.path.exists(os.path.join(args.datadir, "dev_documents")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_vectors" folder')
+    if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.txt")):
+        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.txt" file')
+    if not os.path.exists(os.path.join(args.datadir, "training_data.txt")):
+        raise FileNotFoundError(f'path {args.datadir} does not contain "training_data.txt" file')
+    if not os.path.exists(os.path.join(args.datadir, "dev_samples")):
+        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_samples" folder')

     main(args)

-    # print('WITHOUT MODEL SELECTION')
@@ -1,5 +1,7 @@
 import argparse
 import pickle
+
+from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression as LR
 from quapy.method.aggregative import *
 from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
@@ -16,8 +18,8 @@ def baselines():
     yield CC(LR(n_jobs=-1)), "CC"
     # yield ACC(LR(n_jobs=-1)), "ACC"
     # yield PCC(LR(n_jobs=-1)), "PCC"
-    # yield PACC(LR(n_jobs=-1)), "PACC"
-    # yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD"
+    yield PACC(LR(n_jobs=-1)), "PACC"
+    yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD"
     # yield HDy(LR(n_jobs=-1)) if args.task == 'T1A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy"
     # yield MLPE(), "MLPE"

@@ -26,35 +28,40 @@ def main(args):

     models_path = qp.util.create_if_not_exist(os.path.join(args.modeldir, args.task))

-    path_dev_vectors = os.path.join(args.datadir, 'dev_vectors')
-    path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.csv')
-    path_train = os.path.join(args.datadir, 'training_vectors.csv')
+    path_dev_vectors = os.path.join(args.datadir, 'dev_samples')
+    path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.txt')
+    path_train = os.path.join(args.datadir, 'training_data.txt')

     qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task]

-    train = LabelledCollection.load(path_train, load_vector_documents)
-    nF = train.instances.shape[1]
+    if args.task in {'T1A', 'T1B'}:
+        train = LabelledCollection.load(path_train, load_vector_documents)
+
+        def gen_samples():
+            return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, load_fn=load_vector_documents)
+    else:
+        train = LabelledCollection.load(path_train, load_raw_documents)
+        tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1, 2))
+        train.instances = tfidf.fit_transform(*train.Xy)
+
+        def gen_samples():
+            return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs,
+                                    load_fn=load_raw_unlabelled_documents, vectorizer=tfidf)

     print(f'number of classes: {len(train.classes_)}')
     print(f'number of training documents: {len(train)}')
     print(f'training prevalence: {F.strprev(train.prevalence())}')
     print(f'training matrix shape: {train.instances.shape}')

-    # param_grid = {
-    #     'C': np.logspace(-3, 3, 7),
-    #     'class_weight': ['balanced', None]
-    # }

     param_grid = {
-        'C': [0.01],
-        'class_weight': ['balanced']
+        'C': np.logspace(-3, 3, 7),
+        'class_weight': ['balanced', None]
     }
-    target_metric = qp.error.mrae

-    def gen_samples():
-        return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, return_id=False,
-                                load_fn=load_vector_documents, ext='csv')

+    # param_grid = {
+    #     'C': [0.01, 0.1, 1],
+    #     'class_weight': ['balanced']
+    # }

     for quantifier, q_name in baselines():
         print(f'{q_name}: Model selection')
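
Note: with return_id now defaulting to False in gen_load_samples, the gen_samples() closure defined in either branch above yields (sample, prevalence) pairs, so downstream code can stay task-agnostic. A minimal consumption sketch (illustrative only, assuming a QuaPy quantifier that has already been fitted):

# sketch only: iterate the development samples produced by either branch
for sample, true_prevalence in gen_samples():
    estim_prevalence = quantifier.quantify(sample)   # quantifier fitted beforehand
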
@@ -63,7 +70,7 @@ def main(args):
             param_grid,
             sample_size=None,
             protocol='gen',
-            error=target_metric, #qp.error.mae,
+            error=qp.error.mrae,
             refit=False,
             verbose=True
         ).fit(train, gen_samples)
@@ -76,12 +83,12 @@ def main(args):


 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='LeQua2022 Task T1A/T1B baselines')
-    parser.add_argument('task', metavar='TASK', type=str, choices=['T1A', 'T1B'],
-                        help='Task name (T1A, T1B)')
+    parser = argparse.ArgumentParser(description='LeQua2022 baselines')
+    parser.add_argument('task', metavar='TASK', type=str, choices=['T1A', 'T1B', 'T2A', 'T2B'],
+                        help='Task name (T1A, T1B, T2A, T2B)')
     parser.add_argument('datadir', metavar='DATA-PATH', type=str,
-                        help='Path of the directory containing "dev_prevalences.csv", "training_vectors.txt", and '
-                             'the directory "dev_vectors"')
+                        help='Path of the directory containing "dev_prevalences.txt", "training_data.txt", and '
+                             'the directory "dev_samples"')
     parser.add_argument('modeldir', metavar='MODEL-PATH', type=str,
                         help='Path where to save the models. '
                              'A subdirectory named <task> will be automatically created.')
@@ -91,11 +98,11 @@ if __name__ == '__main__':
         raise FileNotFoundError(f'path {args.datadir} does not exist')
     if not os.path.isdir(args.datadir):
         raise ValueError(f'path {args.datadir} is not a valid directory')
-    if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.csv")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.csv" file')
-    if not os.path.exists(os.path.join(args.datadir, "training_vectors.csv")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "training_vectors.csv" file')
-    if not os.path.exists(os.path.join(args.datadir, "dev_vectors")):
-        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_vectors" folder')
+    if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.txt")):
+        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.txt" file')
+    if not os.path.exists(os.path.join(args.datadir, "training_data.txt")):
+        raise FileNotFoundError(f'path {args.datadir} does not contain "training_data.txt" file')
+    if not os.path.exists(os.path.join(args.datadir, "dev_samples")):
+        raise FileNotFoundError(f'path {args.datadir} does not contain "dev_samples" folder')

     main(args)
@@ -2,18 +2,15 @@ DEV_SAMPLES = 1000
 TEST_SAMPLES = 5000

 TXA_SAMPLE_SIZE = 250
-TXB_SAMPLE_SIZE = 250
-
-T1A_SAMPLE_SIZE = 250
-T1B_SAMPLE_SIZE = 1000
-T2A_SAMPLE_SIZE = 250
-T2B_SAMPLE_SIZE = 1000
+TXB_SAMPLE_SIZE = 1000

 SAMPLE_SIZE={
-    'T1A': T1A_SAMPLE_SIZE,
-    'T1B': T1B_SAMPLE_SIZE,
-    'T2A': T2A_SAMPLE_SIZE,
-    'T2B': T2B_SAMPLE_SIZE
+    'TXA': TXA_SAMPLE_SIZE,
+    'TXB': TXB_SAMPLE_SIZE,
+    'T1A': TXA_SAMPLE_SIZE,
+    'T1B': TXB_SAMPLE_SIZE,
+    'T2A': TXA_SAMPLE_SIZE,
+    'T2B': TXB_SAMPLE_SIZE
 }

 ERROR_TOL = 1E-3
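
Note: after this change every task key resolves to one of the two shared constants; a quick lookup sketch using the values from the new file:

import constants

constants.SAMPLE_SIZE['T1A']   # 250  (TXA_SAMPLE_SIZE)
constants.SAMPLE_SIZE['T2B']   # 1000 (TXB_SAMPLE_SIZE)
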
@@ -34,53 +34,37 @@ def load_raw_unlabelled_documents(path, vectorizer=None):
     return documents, None


-# def load_vector_documents(path, nF=None):
-#     X, y = sklearn.datasets.load_svmlight_file(path, n_features=nF, zero_based=True)
-#     y = y.astype(int)
-#     return X, y
-
 def load_vector_documents(path):
     D = pd.read_csv(path).to_numpy(dtype=np.float)
     labelled = D.shape[1] == 301
     if labelled:
-        X, y = D[:,:300], D[:,-1].astype(np.int).flatten()
+        X, y = D[:,1:], D[:,0].astype(np.int).flatten()
     else:
         X, y = D, None
     return X, y


-def __gen_load_samples_with_groudtruth(path_dir:str, return_id:bool, ground_truth_path:str, ext:str, load_fn, **load_kwargs):
+def __gen_load_samples_with_groudtruth(path_dir:str, return_id:bool, ground_truth_path:str, load_fn, **load_kwargs):
     true_prevs = ResultSubmission.load(ground_truth_path)
     for id, prevalence in true_prevs.iterrows():
-        sample, _ = load_fn(os.path.join(path_dir, f'{id}.{ext}'), **load_kwargs)
+        sample, _ = load_fn(os.path.join(path_dir, f'{id}.txt'), **load_kwargs)
         yield (id, sample, prevalence) if return_id else (sample, prevalence)


-def __gen_load_samples_without_groudtruth(path_dir:str, return_id:bool, ext:str, load_fn, **load_kwargs):
-    nsamples = len(glob(os.path.join(path_dir, f'*.{ext}')))
+def __gen_load_samples_without_groudtruth(path_dir:str, return_id:bool, load_fn, **load_kwargs):
+    nsamples = len(glob(os.path.join(path_dir, f'*.txt')))
     for id in range(nsamples):
-        sample, _ = load_fn(os.path.join(path_dir, f'{id}.{ext}'), **load_kwargs)
+        sample, _ = load_fn(os.path.join(path_dir, f'{id}.txt'), **load_kwargs)
         yield (id, sample) if return_id else sample


-def gen_load_samples(path_dir:str, ground_truth_path:str = None, return_id=True, ext='txt', load_fn=load_vector_documents, **load_kwargs):
+def gen_load_samples(path_dir:str, ground_truth_path:str = None, return_id=False, load_fn=load_vector_documents, **load_kwargs):
     if ground_truth_path is None:
         # the generator function returns tuples (docid:str, sample:csr_matrix or str)
-        gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, ext, load_fn, **load_kwargs)
+        gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, load_fn, **load_kwargs)
     else:
         # the generator function returns tuples (docid:str, sample:csr_matrix or str, prevalence:ndarray)
-        gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, ext, load_fn, **load_kwargs)
+        gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, load_fn, **load_kwargs)
-    for r in gen_fn:
-        yield r
-
-
-def genSVD_load_samples_T1(load_fn, path_dir:str, nF:int, ground_truth_path:str = None, return_id=True):
-    if ground_truth_path is None:
-        # the generator function returns tuples (filename:str, sample:csr_matrix)
-        gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, load_fn, nF=nF)
-    else:
-        # the generator function returns tuples (filename:str, sample:csr_matrix, prevalence:ndarray)
-        gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, load_fn, nF=nF)
     for r in gen_fn:
         yield r

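
Note: load_vector_documents now assumes the label sits in the first CSV column with the 300 feature values after it (301 columns in a labelled sample), and the sample loaders look for plain .txt files. A small sketch of the assumed layout (file name and values are illustrative only):

# 0.txt -> one row per document: label, f1, f2, ..., f300, e.g.
# 1, 0.13, -0.25, ..., 0.02
X, y = load_vector_documents('dev_samples/0.txt')
# X has shape (n_documents, 300); y holds the integer labels (None for unlabelled samples)
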
@@ -214,19 +198,19 @@ def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSub
         raise ValueError(f'these result files are not comparable since the categories are different: '
                          f'true={true_prevs.n_categories} categories vs. '
                          f'predictions={predicted_prevs.n_categories} categories')
-    ae, rae = [], []
+    rae, ae = [], []
     for sample_id, true_prevalence in true_prevs.iterrows():
         pred_prevalence = predicted_prevs.prevalence(sample_id)
-        ae.append(qp.error.ae(true_prevalence, pred_prevalence))
         rae.append(qp.error.rae(true_prevalence, pred_prevalence, eps=1./(2*sample_size)))
+        ae.append(qp.error.ae(true_prevalence, pred_prevalence))

-    ae = np.asarray(ae)
     rae = np.asarray(rae)
+    ae = np.asarray(ae)

     if average:
-        return ae.mean(), rae.mean()
+        return rae.mean(), ae.mean()
     else:
-        return ae, rae
+        return rae, ae

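
Note: the error arrays are now returned with RAE first and AE second, so callers must unpack in that order; a one-line usage sketch mirroring the call made in the evaluation script below:

mrae, mae = evaluate_submission(true_prev, pred_prev, sample_size)   # averaged values by default
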
@@ -8,21 +8,20 @@ LeQua2022 Official evaluation script
 """

 def main(args):
-    if args.task in {'T1A', 'T2A'}:
-        qp.environ['SAMPLE_SIZE'] = constants.TXA_SAMPLE_SIZE
-    if args.task in {'T1B', 'T2B'}:
-        qp.environ['SAMPLE_SIZE'] = constants.TXB_SAMPLE_SIZE
+    sample_size = constants.SAMPLE_SIZE[args.task]
     true_prev = ResultSubmission.load(args.true_prevalences)
     pred_prev = ResultSubmission.load(args.pred_prevalences)
-    mae, mrae = evaluate_submission(true_prev, pred_prev)
-    print(f'MAE: {mae:.4f}')
+    mrae, mae = evaluate_submission(true_prev, pred_prev, sample_size)
     print(f'MRAE: {mrae:.4f}')
+    print(f'MAE: {mae:.4f}')

     if args.output is not None:
-        qp.util.create_parent_dir(args.output)
         with open(args.output, 'wt') as foo:
-            foo.write(f'MAE: {mae:.4f}\n')
             foo.write(f'MRAE: {mrae:.4f}\n')
+            foo.write(f'MAE: {mae:.4f}\n')


 if __name__=='__main__':
@@ -37,4 +36,7 @@ if __name__=='__main__':
                         help='Path where to store the evaluation scores')
     args = parser.parse_args()

+    if args.output is not None:
+        qp.util.create_parent_dir(args.output)
+
     main(args)
|
@ -1,8 +1,6 @@
|
||||||
import argparse
|
import argparse
|
||||||
import quapy as qp
|
from data import ResultSubmission
|
||||||
from data import ResultSubmission, evaluate_submission
|
|
||||||
import constants
|
|
||||||
import os
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
LeQua2022 Official format-checker script
|
LeQua2022 Official format-checker script
|
||||||
|
@@ -13,9 +11,9 @@ def main(args):
         ResultSubmission.check_file_format(args.prevalence_file)
     except Exception as e:
         print(e)
-        print('Format check: not passed')
+        print('Format check: [not passed]')
     else:
-        print('Format check: passed')
+        print('Format check: [passed]')


 if __name__=='__main__':
@@ -1,7 +1,6 @@
 import argparse
 import quapy as qp
 from data import ResultSubmission
-import constants
 import os
 import pickle
 from tqdm import tqdm
@@ -27,7 +26,7 @@ def main(args):

     # predictions
     predictions = ResultSubmission()
-    for sampleid, sample in tqdm(gen_load_samples(args.samples, args.nf), desc='predicting', total=nsamples):
+    for sampleid, sample in tqdm(gen_load_samples(args.samples, return_id=True, load_fn=), desc='predicting', total=nsamples):
         predictions.add(sampleid, model.quantify(sample))

     # saving
@@ -9,111 +9,6 @@ from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
 from quapy.functional import artificial_prevalence_sampling, strprev


-# class Sampling:
-#
-#     @abstractmethod
-#     def load(cls, path: str, loader_func: callable, classes=None): ...
-#
-#     @abstractmethod
-#     @property
-#     def __len__(self): ...
-#
-#     @abstractmethod
-#     @property
-#     def prevalence(self): ...
-#
-#     @abstractmethod
-#     @property
-#     def n_classes(self):
-#
-#     @property
-#     def binary(self):
-#         return self.n_classes == 2
-#
-#     def uniform_sampling_index(self, size):
-#         return np.random.choice(len(self), size, replace=False)
-#
-#     def uniform_sampling(self, size):
-#         unif_index = self.uniform_sampling_index(size)
-#         return self.sampling_from_index(unif_index)
-#
-#     def sampling(self, size, *prevs, shuffle=True):
-#         prev_index = self.sampling_index(size, *prevs, shuffle=shuffle)
-#         return self.sampling_from_index(prev_index)
-#
-#     def sampling_from_index(self, index):
-#         documents = self.instances[index]
-#         labels = self.labels[index]
-#         return LabelledCollection(documents, labels, classes_=self.classes_)
-#
-#     def split_stratified(self, train_prop=0.6, random_state=None):
-#         # with temp_seed(42):
-#         tr_docs, te_docs, tr_labels, te_labels = \
-#             train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels,
-#                              random_state=random_state)
-#         return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
-#
-#     def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1):
-#         dimensions = self.n_classes
-#         for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
-#             yield self.sampling(sample_size, *prevs)
-#
-#     def artificial_sampling_index_generator(self, sample_size, n_prevalences=101, repeats=1):
-#         dimensions = self.n_classes
-#         for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
-#             yield self.sampling_index(sample_size, *prevs)
-#
-#     def natural_sampling_generator(self, sample_size, repeats=100):
-#         for _ in range(repeats):
-#             yield self.uniform_sampling(sample_size)
-#
-#     def natural_sampling_index_generator(self, sample_size, repeats=100):
-#         for _ in range(repeats):
-#             yield self.uniform_sampling_index(sample_size)
-#
-#     def __add__(self, other):
-#         if other is None:
-#             return self
-#         elif issparse(self.instances) and issparse(other.instances):
-#             join_instances = vstack([self.instances, other.instances])
-#         elif isinstance(self.instances, list) and isinstance(other.instances, list):
-#             join_instances = self.instances + other.instances
-#         elif isinstance(self.instances, np.ndarray) and isinstance(other.instances, np.ndarray):
-#             join_instances = np.concatenate([self.instances, other.instances])
-#         else:
-#             raise NotImplementedError('unsupported operation for collection types')
-#         labels = np.concatenate([self.labels, other.labels])
-#         return LabelledCollection(join_instances, labels)
-#
-#     @property
-#     def Xy(self):
-#         return self.instances, self.labels
-#
-#     def stats(self, show=True):
-#         ninstances = len(self)
-#         instance_type = type(self.instances[0])
-#         if instance_type == list:
-#             nfeats = len(self.instances[0])
-#         elif instance_type == np.ndarray or issparse(self.instances):
-#             nfeats = self.instances.shape[1]
-#         else:
-#             nfeats = '?'
-#         stats_ = {'instances': ninstances,
-#                   'type': instance_type,
-#                   'features': nfeats,
-#                   'classes': self.classes_,
-#                   'prevs': strprev(self.prevalence())}
-#         if show:
-#             print(f'#instances={stats_["instances"]}, type={stats_["type"]}, #features={stats_["features"]}, '
-#                   f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}')
-#         return stats_
-#
-#     def kFCV(self, nfolds=5, nrepeats=1, random_state=0):
-#         kf = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state)
-#         for train_index, test_index in kf.split(*self.Xy):
-#             train = self.sampling_from_index(train_index)
-#             test = self.sampling_from_index(test_index)
-#             yield train, test

 class LabelledCollection:
     '''
@@ -146,8 +41,8 @@ class LabelledCollection:
         self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_}

     @classmethod
-    def load(cls, path: str, loader_func: callable, classes=None):
-        return LabelledCollection(*loader_func(path), classes)
+    def load(cls, path: str, loader_func: callable, classes=None, **loader_kwargs):
+        return LabelledCollection(*loader_func(path, **loader_kwargs), classes)

     def __len__(self):
         return self.instances.shape[0]
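
Note: LabelledCollection.load now forwards any extra keyword arguments straight to loader_func, which is what lets callers pass loader-specific options through a single entry point. A hedged sketch (the extra keyword is purely illustrative, not part of the loaders above):

# sketch only: extra keywords reach the loader unchanged
train = LabelledCollection.load(path_train, load_vector_documents)
# ...is equivalent to LabelledCollection(*load_vector_documents(path_train), None);
# a call such as LabelledCollection.load(path, loader_func, some_option=1)
# would be forwarded as loader_func(path, some_option=1)
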