1
0
Fork 0

unifying load document functions (labelled/unlabelled)

This commit is contained in:
Alejandro Moreo Fernandez 2021-12-01 12:32:38 +01:00
parent 4da1233b46
commit 1f591ec105
2 changed files with 12 additions and 13 deletions

View File

@ -46,7 +46,7 @@ def main(args):
def gen_samples(): def gen_samples():
return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs,
load_fn=load_raw_unlabelled_documents, vectorizer=tfidf) load_fn=load_raw_documents, vectorizer=tfidf)
print(f'number of classes: {len(train.classes_)}') print(f'number of classes: {len(train.classes_)}')
print(f'number of training documents: {len(train)}') print(f'number of training documents: {len(train)}')
@ -58,10 +58,10 @@ def main(args):
'class_weight': ['balanced', None] 'class_weight': ['balanced', None]
} }
# param_grid = { param_grid = {
# 'C': [0.01, 0.1, 1], 'C': [0.01],
# 'class_weight': ['balanced'] 'class_weight': ['balanced']
# } }
for quantifier, q_name in baselines(): for quantifier, q_name in baselines():
print(f'{q_name}: Model selection') print(f'{q_name}: Model selection')

View File

@ -22,16 +22,15 @@ def load_category_map(path):
return cat2code, code2cat return cat2code, code2cat
def load_raw_documents(path): def load_raw_documents(path, vectorizer=None):
return qp.data.from_text(path, verbose=0, class2int=True) df = pd.read_csv(path)
documents = list(df["text"].values)
def load_raw_unlabelled_documents(path, vectorizer=None):
with open(path, 'rt', encoding='utf-8') as file:
documents = [d.strip() for d in file.readlines()]
if vectorizer: if vectorizer:
documents = vectorizer.transform(documents) documents = vectorizer.transform(documents)
return documents, None labels = None
if "label" in df.columns:
labels = df["label"].values.astype(np.int)
return documents, labels
def load_vector_documents(path): def load_vector_documents(path):