forked from moreo/QuaPy
unifying load document functions (labelled/unlabelled)
This commit is contained in:
parent
4da1233b46
commit
1f591ec105
|
@ -46,7 +46,7 @@ def main(args):
|
||||||
|
|
||||||
def gen_samples():
|
def gen_samples():
|
||||||
return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs,
|
return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs,
|
||||||
load_fn=load_raw_unlabelled_documents, vectorizer=tfidf)
|
load_fn=load_raw_documents, vectorizer=tfidf)
|
||||||
|
|
||||||
print(f'number of classes: {len(train.classes_)}')
|
print(f'number of classes: {len(train.classes_)}')
|
||||||
print(f'number of training documents: {len(train)}')
|
print(f'number of training documents: {len(train)}')
|
||||||
|
@ -58,10 +58,10 @@ def main(args):
|
||||||
'class_weight': ['balanced', None]
|
'class_weight': ['balanced', None]
|
||||||
}
|
}
|
||||||
|
|
||||||
# param_grid = {
|
param_grid = {
|
||||||
# 'C': [0.01, 0.1, 1],
|
'C': [0.01],
|
||||||
# 'class_weight': ['balanced']
|
'class_weight': ['balanced']
|
||||||
# }
|
}
|
||||||
|
|
||||||
for quantifier, q_name in baselines():
|
for quantifier, q_name in baselines():
|
||||||
print(f'{q_name}: Model selection')
|
print(f'{q_name}: Model selection')
|
||||||
|
|
|
@ -22,16 +22,15 @@ def load_category_map(path):
|
||||||
return cat2code, code2cat
|
return cat2code, code2cat
|
||||||
|
|
||||||
|
|
||||||
def load_raw_documents(path):
|
def load_raw_documents(path, vectorizer=None):
|
||||||
return qp.data.from_text(path, verbose=0, class2int=True)
|
df = pd.read_csv(path)
|
||||||
|
documents = list(df["text"].values)
|
||||||
|
|
||||||
def load_raw_unlabelled_documents(path, vectorizer=None):
|
|
||||||
with open(path, 'rt', encoding='utf-8') as file:
|
|
||||||
documents = [d.strip() for d in file.readlines()]
|
|
||||||
if vectorizer:
|
if vectorizer:
|
||||||
documents = vectorizer.transform(documents)
|
documents = vectorizer.transform(documents)
|
||||||
return documents, None
|
labels = None
|
||||||
|
if "label" in df.columns:
|
||||||
|
labels = df["label"].values.astype(np.int)
|
||||||
|
return documents, labels
|
||||||
|
|
||||||
|
|
||||||
def load_vector_documents(path):
|
def load_vector_documents(path):
|
||||||
|
|
Loading…
Reference in New Issue