Added an encoding option that defaults to utf-8.
This commit is contained in:
parent
d86c402916
commit
44cec7a046
|
@ -3,7 +3,7 @@ from scipy.sparse import dok_matrix
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
def from_text(path):
|
def from_text(path, encoding='utf-8'):
|
||||||
"""
|
"""
|
||||||
Reads a labelled collection of documents.
|
Reads a labelled collection of documents.
|
||||||
File format <0 or 1>\t<document>\n
|
File format <0 or 1>\t<document>\n
|
||||||
|
@ -11,7 +11,7 @@ def from_text(path):
|
||||||
:return: a list of sentences, and a list of labels
|
:return: a list of sentences, and a list of labels
|
||||||
"""
|
"""
|
||||||
all_sentences, all_labels = [], []
|
all_sentences, all_labels = [], []
|
||||||
for line in tqdm(open(path, 'rt').readlines(), f'loading {path}'):
|
for line in tqdm(open(path, 'rt', encoding=encoding).readlines(), f'loading {path}'):
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if line:
|
if line:
|
||||||
label, sentence = line.split('\t')
|
label, sentence = line.split('\t')
|
||||||
|
@ -25,8 +25,8 @@ def from_text(path):
|
||||||
|
|
||||||
def from_sparse(path):
|
def from_sparse(path):
|
||||||
"""
|
"""
|
||||||
Reas a labelled colletion of real-valued instances expressed in sparse format
|
Reads a labelled collection of real-valued instances expressed in sparse format
|
||||||
File fomart <-1 or 0 or 1>[\s col(int):val(float)]\n
|
File format <-1 or 0 or 1>[\s col(int):val(float)]\n
|
||||||
:param path: path to the labelled collection
|
:param path: path to the labelled collection
|
||||||
:return: a csr_matrix containing the instances (rows), and a ndarray containing the labels
|
:return: a csr_matrix containing the instances (rows), and a ndarray containing the labels
|
||||||
"""
|
"""
|
||||||
|
@ -56,16 +56,16 @@ def from_sparse(path):
|
||||||
return X, y
|
return X, y
|
||||||
|
|
||||||
|
|
||||||
def from_csv(path):
|
def from_csv(path, encoding='utf-8'):
|
||||||
"""
|
"""
|
||||||
Reas a csv file in which columns are separated by ','.
|
Reads a csv file in which columns are separated by ','.
|
||||||
File fomart <label>,<feat1>,<feat2>,...,<featn>\n
|
File format <label>,<feat1>,<feat2>,...,<featn>\n
|
||||||
:param path: path to the csv file
|
:param path: path to the csv file
|
||||||
:return: a ndarray for the labels and a ndarray (float) for the covariates
|
:return: a ndarray for the labels and a ndarray (float) for the covariates
|
||||||
"""
|
"""
|
||||||
|
|
||||||
X, y = [], []
|
X, y = [], []
|
||||||
for instance in tqdm(open(path, 'rt').readlines(), desc=f'reading {path}'):
|
for instance in tqdm(open(path, 'rt', encoding=encoding).readlines(), desc=f'reading {path}'):
|
||||||
yi, *xi = instance.strip().split(',')
|
yi, *xi = instance.strip().split(',')
|
||||||
X.append(list(map(float,xi)))
|
X.append(list(map(float,xi)))
|
||||||
y.append(yi)
|
y.append(yi)
|
||||||
|
|
Loading…
Reference in New Issue