From 4284f1daa3eae8c55eec78858f6cb87506b165c5 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Wed, 13 Oct 2021 20:36:53 +0200 Subject: [PATCH] branch for LeQua2022 - first commit --- LeQua2022/data.py | 16 +++++++ LeQua2022/main_binary.py | 82 ++++++++++++++++++++++++++++++++++++ LeQua2022/main_multiclass.py | 77 +++++++++++++++++++++++++++++++++ TODO.txt | 2 + quapy/data/base.py | 4 +- quapy/data/reader.py | 13 ++++-- 6 files changed, 188 insertions(+), 6 deletions(-) create mode 100644 LeQua2022/data.py create mode 100644 LeQua2022/main_binary.py create mode 100644 LeQua2022/main_multiclass.py diff --git a/LeQua2022/data.py b/LeQua2022/data.py new file mode 100644 index 0000000..97d1a7d --- /dev/null +++ b/LeQua2022/data.py @@ -0,0 +1,16 @@ +import quapy as qp +import numpy as np + + +def load_binary_raw_document(path): + documents, labels = qp.data.from_text(path, verbose=0, class2int=True) + labels = np.asarray(labels) + labels[np.logical_or(labels == 1, labels == 2)] = 0 + labels[np.logical_or(labels == 4, labels == 5)] = 1 + return documents, labels + + +def load_multiclass_raw_document(path): + return qp.data.from_text(path, verbose=0, class2int=False) + + diff --git a/LeQua2022/main_binary.py b/LeQua2022/main_binary.py new file mode 100644 index 0000000..653de00 --- /dev/null +++ b/LeQua2022/main_binary.py @@ -0,0 +1,82 @@ +import pickle + +import numpy as np +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.linear_model import LogisticRegression +from tqdm import tqdm + +import quapy as qp +from quapy.data import LabelledCollection +from quapy.method.aggregative import * +from data import load_binary_raw_document +import os + +path_binary_raw = 'binary_raw' +result_path = os.path.join('results', 'binary_raw') +os.makedirs(result_path, exist_ok=True) + +train_file = os.path.join(path_binary_raw, 'documents', 'training.txt') + +train = LabelledCollection.load(train_file, load_binary_raw_document) + +print(train.classes_) 
+print(len(train)) +print(train.prevalence()) + +tfidf = TfidfVectorizer(min_df=5) +train.instances = tfidf.fit_transform(train.instances) + +scores = {} +for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]: + classifier = CalibratedClassifierCV(LogisticRegression()) + model = quantifier(classifier).fit(train) + + quantifier_name = model.__class__.__name__ + scores[quantifier_name]={} + for sample_set, sample_size in [('validation', 1000)]:#, ('test', 5000)]: + ae_errors, rae_errors = [], [] + for i in tqdm(range(sample_size), total=sample_size, desc=f'testing {quantifier_name} in {sample_set}'): + test_file = os.path.join(path_binary_raw, 'documents', f'{sample_set}_{i}.txt') + test = LabelledCollection.load(test_file, load_binary_raw_document, classes=train.classes_) + test.instances = tfidf.transform(test.instances) + qp.environ['SAMPLE_SIZE'] = len(test) + prev_estim = model.quantify(test.instances) + prev_true = test.prevalence() + ae_errors.append(qp.error.mae(prev_true, prev_estim)) + rae_errors.append(qp.error.mrae(prev_true, prev_estim)) + + ae_errors = np.asarray(ae_errors) + rae_errors = np.asarray(rae_errors) + + mae = ae_errors.mean() + mrae = rae_errors.mean() + scores[quantifier_name][sample_set] = {'mae': mae, 'mrae': mrae} + pickle.dump(ae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.ae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL) + pickle.dump(rae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.rae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL) + print(f'{quantifier_name} {sample_set} MAE={mae:.4f}') + print(f'{quantifier_name} {sample_set} MRAE={mrae:.4f}') + +for model in scores: + for sample_set in ['validation']:#, 'test']: + print(f'{model}\t{scores[model][sample_set]["mae"]:.4f}\t{scores[model][sample_set]["mrae"]:.4f}') + + +""" +test: +CC 0.1859 1.5406 +ACC 0.0453 0.2840 +PCC 0.1793 1.7187 +PACC 0.0287 0.1494 +EMQ 0.0225 0.1020 +HDy 0.0631 0.2307 + +validation +CC 0.1862 1.9587 +ACC 0.0394 0.2669 
+PCC 0.1789 2.1383 +PACC 0.0354 0.1587 +EMQ 0.0224 0.0960 +HDy 0.0467 0.2121 +""" + + diff --git a/LeQua2022/main_multiclass.py b/LeQua2022/main_multiclass.py new file mode 100644 index 0000000..a999a4e --- /dev/null +++ b/LeQua2022/main_multiclass.py @@ -0,0 +1,77 @@ +import pickle + +import numpy as np +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.linear_model import LogisticRegression +from tqdm import tqdm + +import quapy as qp +from quapy.data import LabelledCollection +from quapy.method.aggregative import * +from data import load_multiclass_raw_document +import os + +path_multiclass_raw = 'multiclass_raw' +result_path = os.path.join('results', 'multiclass_raw') +os.makedirs(result_path, exist_ok=True) + +train_file = os.path.join(path_multiclass_raw, 'documents', 'training.txt') + +train = LabelledCollection.load(train_file, load_multiclass_raw_document) + +print('classes', train.classes_) +print('#classes', len(train.classes_)) +print('#docs', len(train)) +print('prevalence', train.prevalence()) +print('counts', train.counts()) + +tfidf = TfidfVectorizer(min_df=5) +train.instances = tfidf.fit_transform(train.instances) +print(train.instances.shape[1]) + +scores = {} +for quantifier in [CC, ACC, PCC, PACC, EMQ]:#, HDy]: + classifier = CalibratedClassifierCV(LogisticRegression()) + # classifier = LogisticRegression() + model = quantifier(classifier).fit(train) + print('model trained') + + quantifier_name = model.__class__.__name__ + scores[quantifier_name]={} + for sample_set, sample_size in [('validation', 1000), ('test', 5000)]: + ae_errors, rae_errors = [], [] + for i in tqdm(range(sample_size), total=sample_size, desc=f'testing {quantifier_name} in {sample_set}'): + test_file = os.path.join(path_multiclass_raw, 'documents', f'{sample_set}_{i}.txt') + test = LabelledCollection.load(test_file, load_multiclass_raw_document, classes=train.classes_) + test.instances = tfidf.transform(test.instances) + qp.environ['SAMPLE_SIZE'] = 
len(test) + prev_estim = model.quantify(test.instances) + prev_true = test.prevalence() + ae_errors.append(qp.error.mae(prev_true, prev_estim)) + rae_errors.append(qp.error.mrae(prev_true, prev_estim)) + + ae_errors = np.asarray(ae_errors) + rae_errors = np.asarray(rae_errors) + + mae = ae_errors.mean() + mrae = rae_errors.mean() + scores[quantifier_name][sample_set] = {'mae': mae, 'mrae': mrae} + pickle.dump(ae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.ae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL) + pickle.dump(rae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.rae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL) + print(f'{quantifier_name} {sample_set} MAE={mae:.4f}') + print(f'{quantifier_name} {sample_set} MRAE={mrae:.4f}') + +for model in scores: + for sample_set in ['validation', 'test']: + print(f'{model}\t{sample_set}\t{scores[model][sample_set]["mae"]:.4f}\t{scores[model][sample_set]["mrae"]:.4f}') + + +""" +test: + + +validation + +""" + + diff --git a/TODO.txt b/TODO.txt index fd46d02..0540821 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,3 +1,5 @@ +Looks like there are some "multilingual" stuff in the master branch? 
See, e.g., MultilingualLabelledCollection in data/base.py + Packaging: ========================================== Documentation with sphinx diff --git a/quapy/data/base.py b/quapy/data/base.py index b482548..7799c18 100644 --- a/quapy/data/base.py +++ b/quapy/data/base.py @@ -39,8 +39,8 @@ class LabelledCollection: self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_} @classmethod - def load(cls, path: str, loader_func: callable): - return LabelledCollection(*loader_func(path)) + def load(cls, path: str, loader_func: callable, classes=None): + return LabelledCollection(*loader_func(path), classes) def __len__(self): return self.instances.shape[0] diff --git a/quapy/data/reader.py b/quapy/data/reader.py index 5b4d115..59370bc 100644 --- a/quapy/data/reader.py +++ b/quapy/data/reader.py @@ -3,20 +3,25 @@ from scipy.sparse import dok_matrix from tqdm import tqdm -def from_text(path, encoding='utf-8'): +def from_text(path, encoding='utf-8', verbose=1, class2int=True): """ - Reas a labelled colletion of documents. + Reads a labelled collection of documents. File fomart <0 or 1>\t\n :param path: path to the labelled collection :return: a list of sentences, and a list of labels """ all_sentences, all_labels = [], [] - for line in tqdm(open(path, 'rt', encoding=encoding).readlines(), f'loading {path}'): + if verbose>0: + file = tqdm(open(path, 'rt', encoding=encoding).readlines(), f'loading {path}') + else: + file = open(path, 'rt', encoding=encoding).readlines() + for line in file: line = line.strip() if line: label, sentence = line.split('\t') sentence = sentence.strip() - label = int(label) + if class2int: + label = int(label) if sentence: all_sentences.append(sentence) all_labels.append(label)