From 4284f1daa3eae8c55eec78858f6cb87506b165c5 Mon Sep 17 00:00:00 2001
From: Alex Moreo <alejandro.moreo@isti.cnr.it>
Date: Wed, 13 Oct 2021 20:36:53 +0200
Subject: [PATCH] branch for LeQua2022 - first commit

---
 LeQua2022/data.py            | 16 +++++++
 LeQua2022/main_binary.py     | 82 ++++++++++++++++++++++++++++++++++++
 LeQua2022/main_multiclass.py | 77 +++++++++++++++++++++++++++++++++
 TODO.txt                     |  2 +
 quapy/data/base.py           |  4 +-
 quapy/data/reader.py         | 13 ++++--
 6 files changed, 188 insertions(+), 6 deletions(-)
 create mode 100644 LeQua2022/data.py
 create mode 100644 LeQua2022/main_binary.py
 create mode 100644 LeQua2022/main_multiclass.py

diff --git a/LeQua2022/data.py b/LeQua2022/data.py
new file mode 100644
index 0000000..97d1a7d
--- /dev/null
+++ b/LeQua2022/data.py
@@ -0,0 +1,16 @@
+import quapy as qp
+import numpy as np
+
+
+def load_binary_raw_document(path):
+    documents, labels = qp.data.from_text(path, verbose=0, class2int=True)
+    labels = np.asarray(labels)
+    labels[np.logical_or(labels == 1, labels == 2)] = 0
+    labels[np.logical_or(labels == 4, labels == 5)] = 1
+    return documents, labels
+
+
+def load_multiclass_raw_document(path):
+    return qp.data.from_text(path, verbose=0, class2int=False)
+
+
diff --git a/LeQua2022/main_binary.py b/LeQua2022/main_binary.py
new file mode 100644
index 0000000..653de00
--- /dev/null
+++ b/LeQua2022/main_binary.py
@@ -0,0 +1,82 @@
+import pickle
+
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from tqdm import tqdm
+
+import quapy as qp
+from quapy.data import LabelledCollection
+from quapy.method.aggregative import *
+from data import load_binary_raw_document
+import os
+
+path_binary_raw = 'binary_raw'
+result_path = os.path.join('results', 'binary_raw')
+os.makedirs(result_path, exist_ok=True)
+
+train_file = os.path.join(path_binary_raw, 'documents', 'training.txt')
+
+train = LabelledCollection.load(train_file, load_binary_raw_document)
+
+print(train.classes_)
+print(len(train))
+print(train.prevalence())
+
+tfidf = TfidfVectorizer(min_df=5)
+train.instances = tfidf.fit_transform(train.instances)
+
+scores = {}
+for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]:
+    classifier = CalibratedClassifierCV(LogisticRegression())
+    model = quantifier(classifier).fit(train)
+
+    quantifier_name = model.__class__.__name__
+    scores[quantifier_name]={}
+    for sample_set, sample_size in [('validation', 1000)]:#, ('test', 5000)]:
+        ae_errors, rae_errors = [], []
+        for i in tqdm(range(sample_size), total=sample_size, desc=f'testing {quantifier_name} in {sample_set}'):
+            test_file = os.path.join(path_binary_raw, 'documents', f'{sample_set}_{i}.txt')
+            test = LabelledCollection.load(test_file, load_binary_raw_document, classes=train.classes_)
+            test.instances = tfidf.transform(test.instances)
+            qp.environ['SAMPLE_SIZE'] = len(test)
+            prev_estim = model.quantify(test.instances)
+            prev_true  = test.prevalence()
+            ae_errors.append(qp.error.mae(prev_true, prev_estim))
+            rae_errors.append(qp.error.mrae(prev_true, prev_estim))
+
+        ae_errors = np.asarray(ae_errors)
+        rae_errors = np.asarray(rae_errors)
+
+        mae = ae_errors.mean()
+        mrae = rae_errors.mean()
+        scores[quantifier_name][sample_set] = {'mae': mae, 'mrae': mrae}
+        pickle.dump(ae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.ae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL)
+        pickle.dump(rae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.rae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL)
+        print(f'{quantifier_name} {sample_set} MAE={mae:.4f}')
+        print(f'{quantifier_name} {sample_set} MRAE={mrae:.4f}')
+
+for model in scores:
+    for sample_set in ['validation']:#, 'test']:
+        print(f'{model}\t{scores[model][sample_set]["mae"]:.4f}\t{scores[model][sample_set]["mrae"]:.4f}')
+
+
+"""
+test:
+CC	0.1859	1.5406
+ACC	0.0453	0.2840
+PCC	0.1793	1.7187
+PACC	0.0287	0.1494
+EMQ	0.0225	0.1020
+HDy	0.0631	0.2307
+
+validation
+CC	0.1862	1.9587
+ACC	0.0394	0.2669
+PCC	0.1789	2.1383
+PACC	0.0354	0.1587
+EMQ	0.0224	0.0960
+HDy	0.0467	0.2121
+"""
+
+
diff --git a/LeQua2022/main_multiclass.py b/LeQua2022/main_multiclass.py
new file mode 100644
index 0000000..a999a4e
--- /dev/null
+++ b/LeQua2022/main_multiclass.py
@@ -0,0 +1,77 @@
+import pickle
+
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from tqdm import tqdm
+
+import quapy as qp
+from quapy.data import LabelledCollection
+from quapy.method.aggregative import *
+from data import load_multiclass_raw_document
+import os
+
+path_multiclass_raw = 'multiclass_raw'
+result_path = os.path.join('results', 'multiclass_raw')
+os.makedirs(result_path, exist_ok=True)
+
+train_file = os.path.join(path_multiclass_raw, 'documents', 'training.txt')
+
+train = LabelledCollection.load(train_file, load_multiclass_raw_document)
+
+print('classes', train.classes_)
+print('#classes', len(train.classes_))
+print('#docs', len(train))
+print('prevalence', train.prevalence())
+print('counts', train.counts())
+
+tfidf = TfidfVectorizer(min_df=5)
+train.instances = tfidf.fit_transform(train.instances)
+print(train.instances.shape[1])
+
+scores = {}
+for quantifier in [CC, ACC, PCC, PACC, EMQ]:#, HDy]:
+    classifier = CalibratedClassifierCV(LogisticRegression())
+    # classifier = LogisticRegression()
+    model = quantifier(classifier).fit(train)
+    print('model trained')
+
+    quantifier_name = model.__class__.__name__
+    scores[quantifier_name]={}
+    for sample_set, sample_size in [('validation', 1000), ('test', 5000)]:
+        ae_errors, rae_errors = [], []
+        for i in tqdm(range(sample_size), total=sample_size, desc=f'testing {quantifier_name} in {sample_set}'):
+            test_file = os.path.join(path_multiclass_raw, 'documents', f'{sample_set}_{i}.txt')
+            test = LabelledCollection.load(test_file, load_multiclass_raw_document, classes=train.classes_)
+            test.instances = tfidf.transform(test.instances)
+            qp.environ['SAMPLE_SIZE'] = len(test)
+            prev_estim = model.quantify(test.instances)
+            prev_true  = test.prevalence()
+            ae_errors.append(qp.error.mae(prev_true, prev_estim))
+            rae_errors.append(qp.error.mrae(prev_true, prev_estim))
+
+        ae_errors = np.asarray(ae_errors)
+        rae_errors = np.asarray(rae_errors)
+
+        mae = ae_errors.mean()
+        mrae = rae_errors.mean()
+        scores[quantifier_name][sample_set] = {'mae': mae, 'mrae': mrae}
+        pickle.dump(ae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.ae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL)
+        pickle.dump(rae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.rae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL)
+        print(f'{quantifier_name} {sample_set} MAE={mae:.4f}')
+        print(f'{quantifier_name} {sample_set} MRAE={mrae:.4f}')
+
+for model in scores:
+    for sample_set in ['validation', 'test']:
+        print(f'{model}\t{sample_set}\t{scores[model][sample_set]["mae"]:.4f}\t{scores[model][sample_set]["mrae"]:.4f}')
+
+
+"""
+test:
+
+
+validation
+
+"""
+
+
diff --git a/TODO.txt b/TODO.txt
index fd46d02..0540821 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -1,3 +1,5 @@
+Looks like there is some "multilingual" stuff in the master branch? See, e.g., MultilingualLabelledCollection in data/base.py
+
 Packaging:
 ==========================================
 Documentation with sphinx
diff --git a/quapy/data/base.py b/quapy/data/base.py
index b482548..7799c18 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -39,8 +39,8 @@ class LabelledCollection:
         self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_}
 
     @classmethod
-    def load(cls, path: str, loader_func: callable):
-        return LabelledCollection(*loader_func(path))
+    def load(cls, path: str, loader_func: callable, classes=None):
+        return LabelledCollection(*loader_func(path), classes)
 
     def __len__(self):
         return self.instances.shape[0]
diff --git a/quapy/data/reader.py b/quapy/data/reader.py
index 5b4d115..59370bc 100644
--- a/quapy/data/reader.py
+++ b/quapy/data/reader.py
@@ -3,20 +3,25 @@ from scipy.sparse import dok_matrix
 from tqdm import tqdm
 
 
-def from_text(path, encoding='utf-8'):
+def from_text(path, encoding='utf-8', verbose=1, class2int=True):
     """
-    Reas a labelled colletion of documents.
+    Reads a labelled collection of documents.
     File fomart <0 or 1>\t<document>\n
     :param path: path to the labelled collection
     :return: a list of sentences, and a list of labels
     """
     all_sentences, all_labels = [], []
-    for line in tqdm(open(path, 'rt', encoding=encoding).readlines(), f'loading {path}'):
+    if verbose>0:
+        file = tqdm(open(path, 'rt', encoding=encoding).readlines(), f'loading {path}')
+    else:
+        file = open(path, 'rt', encoding=encoding).readlines()
+    for line in file:
         line = line.strip()
         if line:
             label, sentence = line.split('\t')
             sentence = sentence.strip()
-            label = int(label)
+            if class2int:
+                label = int(label)
             if sentence:
                 all_sentences.append(sentence)
                 all_labels.append(label)