2020-12-03 16:24:21 +01:00
|
|
|
import numpy as np
|
|
|
|
from scipy.sparse import dok_matrix
|
|
|
|
from tqdm import tqdm
|
|
|
|
|
|
|
|
|
2021-10-13 20:36:53 +02:00
|
|
|
def from_text(path, encoding='utf-8', verbose=1, class2int=True):
    r"""
    Reads a labelled collection of documents.
    File format <label>\t<document>\n (e.g., <0 or 1>\t<document>\n)

    Empty lines and lines with an empty document are skipped. Lines that do not consist of exactly two
    tab-separated fields, or whose label cannot be converted to int (when `class2int` is True), are reported
    in standard output and skipped.

    :param path: path to the labelled collection
    :param encoding: the text encoding used to open the file
    :param verbose: if >0 (default) shows some progress information in standard output
    :param class2int: if True (default) the labels are converted to int; otherwise they are returned as
        the strings found in the file
    :return: a list of sentences, and a list of labels
    """
    # read all lines inside a context manager so that the file handle is always released
    with open(path, 'rt', encoding=encoding) as fin:
        lines = fin.readlines()

    if verbose > 0:
        lines = tqdm(lines, f'loading {path}')

    all_sentences, all_labels = [], []
    for line in lines:
        line = line.strip()
        if not line:
            continue
        try:
            # raises ValueError if the line does not have exactly two tab-separated fields
            label, sentence = line.split('\t')
            sentence = sentence.strip()
            if class2int:
                # raises ValueError if the label is not an integer
                label = int(label)
        except ValueError:
            print(f'format error in {line}')
            continue
        if sentence:
            all_sentences.append(sentence)
            all_labels.append(label)
    return all_sentences, all_labels
|
|
|
|
|
|
|
|
|
|
|
|
def from_sparse(path):
    r"""
    Reads a labelled collection of real-valued instances expressed in sparse format
    File format <-1 or 0 or 1>[\s col(int):val(float)]\n

    Column indexes found in the file are shifted by -1 when stored in the matrix, and the labels
    are shifted by +1 in the returned array. Empty lines are skipped; a line containing only a label
    yields an all-zero row.

    :param path: path to the labelled collection
    :return: a `csr_matrix` containing the instances (rows), and a ndarray containing the labels
    """

    def split_col_val(col_val):
        # parses a "col:val" token into (column index shifted by -1, float value)
        col, val = col_val.split(':')
        return int(col) - 1, float(val)

    # read all lines inside a context manager so that the file handle is always released
    with open(path, 'rt') as fin:
        lines = fin.readlines()

    all_documents, all_labels = [], []
    max_col = 0
    for line in tqdm(lines, f'loading {path}'):
        parts = line.strip().split()
        if not parts:
            continue
        all_labels.append(int(parts[0]))
        pairs = [split_col_val(col_val) for col_val in parts[1:]]
        if pairs:
            cols, vals = zip(*pairs)
            cols, vals = np.asarray(cols), np.asarray(vals)
            max_col = max(max_col, cols.max())
        else:
            # a label-only line has no features: zip(*[]) cannot be unpacked, so use empty arrays
            cols, vals = np.asarray([], dtype=int), np.asarray([], dtype=float)
        all_documents.append((cols, vals))

    n_docs = len(all_labels)
    X = dok_matrix((n_docs, max_col + 1), dtype=float)
    # the backslash is escaped so that the description keeps its literal '\--' prefix without
    # relying on an invalid escape sequence
    for i, (cols, vals) in tqdm(enumerate(all_documents), total=len(all_documents),
                                desc=f'\\-- filling matrix of shape {X.shape}'):
        if cols.size:
            X[i, cols] = vals
    X = X.tocsr()
    y = np.asarray(all_labels) + 1
    return X, y
|
2020-12-14 18:36:19 +01:00
|
|
|
|
2021-01-06 14:58:29 +01:00
|
|
|
|
2021-04-30 17:00:46 +02:00
|
|
|
def from_csv(path, encoding='utf-8'):
    r"""
    Reads a csv file in which columns are separated by ','.
    File format <label>,<feat1>,<feat2>,...,<featn>\n

    Empty lines are skipped. Labels are kept as the strings found in the file.

    :param path: path to the csv file
    :param encoding: the text encoding used to open the file
    :return: a ndarray (float) for the covariates and a np.ndarray for the labels, in this order
    """
    # read all lines inside a context manager so that the file handle is always released
    with open(path, 'rt', encoding=encoding) as fin:
        rows = fin.readlines()

    X, y = [], []
    for instance in tqdm(rows, desc=f'reading {path}'):
        instance = instance.strip()
        if not instance:
            # a blank (e.g., trailing) line would otherwise add an empty feature row and an empty label
            continue
        yi, *xi = instance.split(',')
        X.append(list(map(float, xi)))
        y.append(yi)
    return np.asarray(X), np.asarray(y)
|
|
|
|
|
|
|
|
|
|
|
|
def reindex_labels(y):
    """
    Re-indexes a list of labels as a list of indexes, and returns the classnames corresponding to the indexes.
    E.g.:

    >>> reindex_labels(['B', 'B', 'A', 'C'])
    (array([1, 1, 0, 2]), array(['A', 'B', 'C'], dtype='<U1'))

    :param y: the list or array of original labels
    :return: a ndarray (int) of class indexes, and a ndarray of classnames corresponding to the indexes.
    """
    y = np.asarray(y)
    # np.unique returns the sorted distinct labels together with, for every element of y, the position
    # of its label in that sorted array; this assigns an index to every element in a single call
    classnames, indexed = np.unique(y, return_inverse=True)
    # the inverse is reshaped explicitly so that the result always has the shape of y
    indexed = indexed.reshape(y.shape).astype(int, copy=False)
    return indexed, classnames
|
|
|
|
|
|
|
|
|
|
|
|
def binarize(y, pos_class):
    """
    Binarizes a categorical array-like collection of labels towards the positive class `pos_class`. E.g.,:

    >>> binarize([1, 2, 3, 1, 1, 0], pos_class=2)
    array([0, 1, 0, 0, 0, 0])

    :param y: array-like of labels
    :param pos_class: integer, the positive class
    :return: a binary np.ndarray, in which values 1 correspond to positions in which `y` had `pos_class`
        labels, and 0 otherwise
    """
    y = np.asarray(y)
    # start from all zeros (negative) and flag the positions matching the positive class
    ybin = np.zeros(y.shape, dtype=int)
    ybin[y == pos_class] = 1
    return ybin
|
|
|
|
|