diff --git a/LeQua2022/baselines.py b/LeQua2022/baselines.py
index 42ead5e..28fd2d7 100644
--- a/LeQua2022/baselines.py
+++ b/LeQua2022/baselines.py
@@ -46,7 +46,7 @@ def main(args):
 
     def gen_samples():
         return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs,
-                                load_fn=load_raw_unlabelled_documents, vectorizer=tfidf)
+                                load_fn=load_raw_documents, vectorizer=tfidf)
 
     print(f'number of classes: {len(train.classes_)}')
     print(f'number of training documents: {len(train)}')
@@ -58,10 +58,10 @@ def main(args):
         'class_weight': ['balanced', None]
     }
 
-    # param_grid = {
-    #     'C': [0.01, 0.1, 1],
-    #     'class_weight': ['balanced']
-    # }
+    param_grid = {
+        'C': [0.01],
+        'class_weight': ['balanced']
+    }
 
     for quantifier, q_name in baselines():
         print(f'{q_name}: Model selection')
diff --git a/LeQua2022/data.py b/LeQua2022/data.py
index 6d09db9..cf3587f 100644
--- a/LeQua2022/data.py
+++ b/LeQua2022/data.py
@@ -22,16 +22,15 @@ def load_category_map(path):
     return cat2code, code2cat
 
 
-def load_raw_documents(path):
-    return qp.data.from_text(path, verbose=0, class2int=True)
-
-
-def load_raw_unlabelled_documents(path, vectorizer=None):
-    with open(path, 'rt', encoding='utf-8') as file:
-        documents = [d.strip() for d in file.readlines()]
+def load_raw_documents(path, vectorizer=None):
+    df = pd.read_csv(path)
+    documents = list(df["text"].values)
     if vectorizer:
         documents = vectorizer.transform(documents)
-    return documents, None
+    labels = None
+    if "label" in df.columns:
+        labels = df["label"].values.astype(np.int)
+    return documents, labels
 
 
 def load_vector_documents(path):
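
For context, a minimal usage sketch of the unified load_raw_documents introduced above (illustrative only: it assumes data.py already imports pandas as pd and numpy as np, which the hunk does not show, and the sample path and TF-IDF settings below are placeholders, not part of the change):

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from data import load_raw_documents

# Raw pass: documents come back as plain strings; labels is None when the
# CSV has no "label" column (e.g. unlabelled dev/test samples).
# 'path/to/sample.csv' is a hypothetical path used only for illustration.
docs, labels = load_raw_documents('path/to/sample.csv')

# Vectorized pass: the same documents transformed with a previously fitted
# vectorizer, mirroring how gen_load_samples receives it in baselines.py.
tfidf = TfidfVectorizer(min_df=5).fit(docs)
vecs, labels = load_raw_documents('path/to/sample.csv', vectorizer=tfidf)
```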