From 1f591ec105d72669fc7ec77c7cbce080edac403d Mon Sep 17 00:00:00 2001
From: Alex Moreo <alejandro.moreo@isti.cnr.it>
Date: Wed, 1 Dec 2021 12:32:38 +0100
Subject: [PATCH] unifying load document functions (labelled/unlabelled)

---
 LeQua2022/baselines.py | 10 +++++-----
 LeQua2022/data.py      | 15 +++++++--------
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/LeQua2022/baselines.py b/LeQua2022/baselines.py
index 42ead5e..28fd2d7 100644
--- a/LeQua2022/baselines.py
+++ b/LeQua2022/baselines.py
@@ -46,7 +46,7 @@ def main(args):
 
         def gen_samples():
             return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs,
-                                    load_fn=load_raw_unlabelled_documents, vectorizer=tfidf)
+                                    load_fn=load_raw_documents, vectorizer=tfidf)
 
     print(f'number of classes: {len(train.classes_)}')
     print(f'number of training documents: {len(train)}')
@@ -58,10 +58,10 @@ def main(args):
         'class_weight': ['balanced', None]
     }
 
-    # param_grid = {
-    #     'C': [0.01, 0.1, 1],
-    #     'class_weight': ['balanced']
-    # }
+    param_grid = {
+        'C': [0.01],
+        'class_weight': ['balanced']
+    }
 
     for quantifier, q_name in baselines():
         print(f'{q_name}: Model selection')
diff --git a/LeQua2022/data.py b/LeQua2022/data.py
index 6d09db9..cf3587f 100644
--- a/LeQua2022/data.py
+++ b/LeQua2022/data.py
@@ -22,16 +22,15 @@ def load_category_map(path):
     return cat2code, code2cat
 
 
-def load_raw_documents(path):
-    return qp.data.from_text(path, verbose=0, class2int=True)
-
-
-def load_raw_unlabelled_documents(path, vectorizer=None):
-    with open(path, 'rt', encoding='utf-8') as file:
-        documents = [d.strip() for d in file.readlines()]
+def load_raw_documents(path, vectorizer=None):
+    df = pd.read_csv(path)
+    documents = list(df["text"].values)
     if vectorizer:
         documents = vectorizer.transform(documents)
-    return documents, None
+    labels = None
+    if "label" in df.columns:
+        labels = df["label"].values.astype(int)
+    return documents, labels
 
 
 def load_vector_documents(path):