Added encoding option with default to utf-8.

This commit is contained in:
Andrea Esuli 2021-04-30 17:00:46 +02:00
parent d86c402916
commit 44cec7a046
1 changed file with 8 additions and 8 deletions

View File

@ -3,7 +3,7 @@ from scipy.sparse import dok_matrix
from tqdm import tqdm
def from_text(path):
def from_text(path, encoding='utf-8'):
"""
Reads a labelled collection of documents.
File format <0 or 1>\t<document>\n
@ -11,7 +11,7 @@ def from_text(path):
:return: a list of sentences, and a list of labels
"""
all_sentences, all_labels = [], []
for line in tqdm(open(path, 'rt').readlines(), f'loading {path}'):
for line in tqdm(open(path, 'rt', encoding=encoding).readlines(), f'loading {path}'):
line = line.strip()
if line:
label, sentence = line.split('\t')
@ -25,8 +25,8 @@ def from_text(path):
def from_sparse(path):
"""
Reas a labelled colletion of real-valued instances expressed in sparse format
File fomart <-1 or 0 or 1>[\s col(int):val(float)]\n
Reads a labelled collection of real-valued instances expressed in sparse format
File format <-1 or 0 or 1>[\s col(int):val(float)]\n
:param path: path to the labelled collection
:return: a csr_matrix containing the instances (rows), and a ndarray containing the labels
"""
@ -56,16 +56,16 @@ def from_sparse(path):
return X, y
def from_csv(path):
def from_csv(path, encoding='utf-8'):
"""
Reas a csv file in which columns are separated by ','.
File fomart <label>,<feat1>,<feat2>,...,<featn>\n
Reads a csv file in which columns are separated by ','.
File format <label>,<feat1>,<feat2>,...,<featn>\n
:param path: path to the csv file
:return: a ndarray for the labels and a ndarray (float) for the covariates
"""
X, y = [], []
for instance in tqdm(open(path, 'rt').readlines(), desc=f'reading {path}'):
for instance in tqdm(open(path, 'rt', encoding=encoding).readlines(), desc=f'reading {path}'):
yi, *xi = instance.strip().split(',')
X.append(list(map(float,xi)))
y.append(yi)