forked from moreo/QuaPy
data loading
This commit is contained in:
parent
c0db4a2867
commit
b6820e8dba
|
@ -129,3 +129,4 @@ dmypy.json
|
|||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
*__pycache__*
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
from .base import *
|
||||
from . import base
|
||||
from . import reader
|
||||
from . import preprocessing
|
|
@ -0,0 +1,137 @@
|
|||
import numpy as np
|
||||
from scipy.sparse import issparse, dok_matrix
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.model_selection import train_test_split
|
||||
from quapy.functional import artificial_prevalence_sampling
|
||||
from scipy.sparse import vstack
|
||||
|
||||
|
||||
class LabelledCollection:
    """
    A collection of instances paired with integer class labels, with helpers to draw samples at
    controlled class prevalences.

    :param instances: the instances; a scipy sparse matrix is kept as is, anything else is converted
        with np.asarray
    :param labels: the labels, one per instance (converted to an ndarray of int)
    :param n_classes: if given, the classes are taken to be 0..n_classes-1; otherwise the classes are
        the distinct values found in labels
    """

    def __init__(self, instances, labels, n_classes=None):
        self.instances = instances if issparse(instances) else np.asarray(instances)
        self.labels = np.asarray(labels, dtype=int)
        n_docs = len(self)
        if n_classes is None:
            # np.unique already returns the distinct labels in ascending order
            self.classes_ = np.unique(self.labels)
        else:
            self.classes_ = np.arange(n_classes)
        # maps each class to the positions of the instances labelled with it
        self.index = {class_i: np.arange(n_docs)[self.labels == class_i] for class_i in self.classes_}

    @classmethod
    def load(cls, path:str, loader_func:callable):
        """
        Builds a collection from a file.

        :param path: path to the file
        :param loader_func: a function that receives the path and returns the arguments of the
            constructor (e.g., a tuple of instances and labels)
        :return: a new collection
        """
        return cls(*loader_func(path))

    @classmethod
    def load_dataset(cls, train_path, test_path, loader_func:callable=None):
        """
        Loads a training and a test collection and wraps them in a Dataset.

        :param train_path: path to the training collection
        :param test_path: path to the test collection
        :param loader_func: the function used to read both files (see load); it is mandatory, and is
            only declared with a default so that the original two-argument signature remains valid
        :return: a Dataset
        :raises ValueError: if loader_func is not specified
        """
        if loader_func is None:
            raise ValueError('a loader_func is required in order to read the collections')
        training = cls.load(train_path, loader_func)
        test = cls.load(test_path, loader_func)
        return Dataset(training, test)

    def __len__(self):
        return self.instances.shape[0]

    def prevalence(self):
        """
        :return: an ndarray with the fraction of instances of each class, in the order of classes_
        """
        return self.counts()/len(self)

    def counts(self):
        """
        :return: an ndarray with the number of instances of each class, in the order of classes_
        """
        return np.asarray([len(self.index[ci]) for ci in self.classes_])

    @property
    def n_classes(self):
        return len(self.classes_)

    @property
    def binary(self):
        return self.n_classes==2

    def sampling_index(self, size, *prevs, shuffle=True):
        """
        Draws the positions of a sample of the requested size and class prevalences.

        :param size: number of instances in the sample
        :param prevs: the prevalence of each class; the last one can be omitted, in which case it is
            set to the complement of the others
        :param shuffle: whether or not to shuffle the positions (otherwise they are grouped by class)
        :return: an ndarray of int with the positions of the sampled instances
        """
        if len(prevs) == self.n_classes-1:
            prevs = prevs + (1-sum(prevs),)
        assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
        # tolerant comparison: legitimate prevalences such as ten times 0.1 do not add up to exactly 1
        assert np.isclose(sum(prevs), 1), f'prevalences ({prevs}) out of range (sum={sum(prevs)})'

        taken = 0
        indexes_sample = []
        for i, class_i in enumerate(self.classes_):
            if i == self.n_classes-1:
                # the last class takes whatever is left, so that the sample has exactly the requested size
                n_requested = size - taken
            else:
                n_requested = int(size * prevs[i])

            n_candidates = len(self.index[class_i])
            if n_requested > 0:
                # sample with replacement only if there are not enough instances of the class
                chosen = np.random.choice(n_candidates, size=n_requested, replace=(n_requested > n_candidates))
                index_sample = self.index[class_i][chosen]
            else:
                index_sample = []

            indexes_sample.append(index_sample)
            taken += n_requested

        indexes_sample = np.concatenate(indexes_sample).astype(int)

        if shuffle:
            indexes_sample = np.random.permutation(indexes_sample)

        return indexes_sample

    def sampling(self, size, *prevs, shuffle=True):
        """
        Draws a sample of the requested size and class prevalences (see sampling_index).

        :return: a new LabelledCollection with the sampled instances
        """
        index = self.sampling_index(size, *prevs, shuffle=shuffle)
        return self.sampling_from_index(index)

    def sampling_from_index(self, index):
        """
        :param index: the positions of the instances to take
        :return: a new LabelledCollection with the selected instances, declaring as many classes as this one
        """
        documents = self.instances[index]
        labels = self.labels[index]
        return LabelledCollection(documents, labels, n_classes=self.n_classes)

    def split_stratified(self, train_prop=0.6):
        """
        Splits the collection in two parts, stratifying by label.

        :param train_prop: passed as train_size to sklearn's train_test_split
        :return: a tuple with two LabelledCollection (training, test)
        """
        tr_docs, te_docs, tr_labels, te_labels = \
            train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels)
        return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)

    def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1):
        """
        Yields one sample of sample_size instances for each prevalence vector produced by
        artificial_prevalence_sampling for this number of classes.
        """
        dimensions=self.n_classes
        for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
            yield self.sampling(sample_size, *prevs)

    def __add__(self, other):
        """
        Concatenates two collections whose instances are both sparse or both dense.

        :param other: another LabelledCollection
        :return: a new LabelledCollection with the instances of self followed by those of other
        :raises NotImplementedError: if one collection is sparse and the other is not
        """
        if issparse(self.instances) and issparse(other.instances):
            docs = vstack([self.instances, other.instances])
        elif not issparse(self.instances) and not issparse(other.instances):
            # non-sparse instances are always held as ndarrays (see __init__)
            docs = np.concatenate([self.instances, other.instances])
        else:
            raise NotImplementedError('unsupported operation for collection types')
        labels = np.concatenate([self.labels, other.labels])
        return LabelledCollection(docs, labels)
|
||||
|
||||
|
||||
|
||||
class Dataset:
    """
    A pair of training and test collections, optionally accompanied by a vocabulary.

    :param training: the training LabelledCollection
    :param test: the test LabelledCollection; must declare the same number of classes as training
    :param vocabulary: an optional dict; the constructor only stores it
    """

    def __init__(self, training: 'LabelledCollection', test: 'LabelledCollection', vocabulary: dict = None):
        assert training.n_classes == test.n_classes, 'incompatible labels in training and test collections'
        self.training = training
        self.test = test
        self.vocabulary = vocabulary

    @classmethod
    def SplitStratified(cls, collection: 'LabelledCollection', train_size=0.6):
        """
        Builds a dataset by splitting a single collection with its split_stratified method.

        :param collection: the LabelledCollection to split
        :param train_size: passed as train_prop to collection.split_stratified
        :return: a new dataset (an instance of the class this method is invoked on)
        """
        return cls(*collection.split_stratified(train_prop=train_size))

    @property
    def n_classes(self):
        return self.training.n_classes

    @property
    def binary(self):
        return self.training.binary

    @classmethod
    def load(cls, train_path, test_path, loader_func:callable):
        """
        Builds a dataset by loading the training and the test collections from files.

        :param train_path: path to the training collection
        :param test_path: path to the test collection
        :param loader_func: the function passed to LabelledCollection.load to read each file
        :return: a new dataset (an instance of the class this method is invoked on)
        """
        training = LabelledCollection.load(train_path, loader_func)
        test = LabelledCollection.load(test_path, loader_func)
        return cls(training, test)
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,150 @@
|
|||
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
|
||||
from dataset.base import Dataset
|
||||
from scipy.sparse import spmatrix
|
||||
import numpy as np
|
||||
from utils.util import parallelize
|
||||
from .base import LabelledCollection
|
||||
|
||||
|
||||
def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
    """
    Transforms a Dataset of textual instances into a Dataset of tfidf weighted sparse vectors.
    The vectorizer is fit on the training instances only, and then applied to the test instances.

    :param dataset: a Dataset where the instances are sequences (list or ndarray) of str
    :param min_df: minimum number of occurrences for a word to be considered as part of the vocabulary
    :param sublinear_tf: whether or not to apply the log scaling to the tf counters
    :param inplace: whether or not to apply the transformation inplace, or to a new copy
    :param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.TfidfVectorizer)
    :return: a new Dataset in csr_matrix format (if inplace=False) or a reference to the current Dataset (inplace=True)
        where the instances are stored in a csr_matrix of real-valued tfidf scores
    """
    # LabelledCollection converts non-sparse instances to ndarray, so an ndarray of str has to be
    # accepted together with a plain list
    __check_type(dataset.training.instances, (list, np.ndarray), str)
    __check_type(dataset.test.instances, (list, np.ndarray), str)

    vectorizer = TfidfVectorizer(min_df=min_df, sublinear_tf=sublinear_tf, **kwargs)
    training_documents = vectorizer.fit_transform(dataset.training.instances)
    test_documents = vectorizer.transform(dataset.test.instances)

    if inplace:
        dataset.training = LabelledCollection(training_documents, dataset.training.labels, dataset.n_classes)
        dataset.test = LabelledCollection(test_documents, dataset.test.labels, dataset.n_classes)
        dataset.vocabulary = vectorizer.vocabulary_
        return dataset

    # the labels are copied so that the new Dataset does not share them with the original one
    training = LabelledCollection(training_documents, dataset.training.labels.copy(), dataset.n_classes)
    test = LabelledCollection(test_documents, dataset.test.labels.copy(), dataset.n_classes)
    return Dataset(training, test, vectorizer.vocabulary_)
|
||||
|
||||
|
||||
def reduce_columns(dataset:Dataset, min_df=5, inplace=False):
    """
    Reduces the dimensionality of the instances by removing the columns (words) which are not present in
    at least min_df training instances. The selection is computed on the training matrix only, and the
    very same columns are then removed from the test matrix.

    :param dataset: a Dataset in sparse format (any subtype of scipy.sparse.spmatrix)
    :param min_df: minimum number of training instances below which the columns are removed
    :param inplace: whether or not to apply the transformation inplace, or to a new copy
    :return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
        where the dimensions corresponding to infrequent words have been removed
    """
    # the matrices are the instances of the collections, not the collections themselves
    __check_type(dataset.training.instances, spmatrix)
    __check_type(dataset.test.instances, spmatrix)
    assert dataset.training.instances.shape[1] == dataset.test.instances.shape[1], 'unaligned vector spaces'

    Xtr, Xte = dataset.training.instances, dataset.test.instances
    # number of training instances in which each column takes a positive value
    column_prevalence = np.asarray((Xtr > 0).sum(axis=0)).flatten()
    take_columns = column_prevalence >= min_df
    Xtr, Xte = Xtr[:, take_columns], Xte[:, take_columns]

    if inplace:
        # NOTE(review): dataset.vocabulary is left untouched here, so any column index it holds
        # would no longer match the reduced matrices -- confirm whether callers rely on it
        dataset.training.instances = Xtr
        dataset.test.instances = Xte
        return dataset

    training = LabelledCollection(Xtr, dataset.training.labels.copy(), dataset.n_classes)
    test = LabelledCollection(Xte, dataset.test.labels.copy(), dataset.n_classes)
    return Dataset(training, test)
|
||||
|
||||
|
||||
def index(dataset: Dataset, min_df=5, inplace=False, **kwargs):
    """
    Indexes a dataset of strings. To index a document means to replace each different token by a unique
    numerical index. Words that are not in the vocabulary learnt from the training instances (e.g., rare
    words occurring less than min_df times) are replaced by the index of a special token UNK.

    :param dataset: a Dataset where the instances are sequences (list or ndarray) of str
    :param min_df: minimum number of instances below which the term is replaced by a UNK index
    :param inplace: whether or not to apply the transformation inplace, or to a new copy
    :param kwargs: the rest of parameters of the transformation (as for sklearn.feature_extraction.text.CountVectorizer)
    :return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
        consisting of lists of integer values representing indices.
    """
    # LabelledCollection converts non-sparse instances to ndarray, so an ndarray of str has to be
    # accepted together with a plain list
    __check_type(dataset.training.instances, (list, np.ndarray), str)
    __check_type(dataset.test.instances, (list, np.ndarray), str)

    indexer = IndexTransformer(min_df=min_df, **kwargs)
    training_index = indexer.fit_transform(dataset.training.instances)
    test_index = indexer.transform(dataset.test.instances)

    if inplace:
        dataset.training = LabelledCollection(training_index, dataset.training.labels, dataset.n_classes)
        dataset.test = LabelledCollection(test_index, dataset.test.labels, dataset.n_classes)
        dataset.vocabulary = indexer.vocabulary_
        return dataset

    # the labels are copied so that the new Dataset does not share them with the original one
    training = LabelledCollection(training_index, dataset.training.labels.copy(), dataset.n_classes)
    test = LabelledCollection(test_index, dataset.test.labels.copy(), dataset.n_classes)
    return Dataset(training, test, indexer.vocabulary_)
|
||||
|
||||
|
||||
def __check_type(container, container_type=None, element_type=None):
    """
    Asserts that a container, and optionally its first element, are of the expected types.

    :param container: the object to check
    :param container_type: a type (or tuple of types) the container must be an instance of; not checked if None
    :param element_type: a type (or tuple of types) the first element of the container must be an instance of;
        not checked if None or if the container is empty
    :raises AssertionError: if any of the requested checks fails
    """
    if container_type:
        assert isinstance(container, container_type), \
            f'unexpected type of container (expected {container_type}, found {type(container)})'
    if element_type:
        # containers such as lists are iterable but are not iterators, hence iter(); the sentinel tells
        # an empty container (nothing to check) apart from a first element that happens to be None
        nothing = object()
        first = next(iter(container), nothing)
        if first is not nothing:
            assert isinstance(first, element_type), \
                f'unexpected type of element (expected {element_type}, found {type(first)})'
|
||||
|
||||
|
||||
|
||||
class IndexTransformer:
    """
    Replaces the tokens of textual documents by numerical indices, using the vocabulary and the analyzer
    of a sklearn CountVectorizer. Tokens that are not in the vocabulary are mapped to the index of a
    reserved UNK token.
    """

    def __init__(self, **kwargs):
        """
        :param kwargs: keyworded arguments from _sklearn.feature_extraction.text.CountVectorizer_
        """
        self.vect = CountVectorizer(**kwargs)
        self.unk = -1  # a valid index is assigned after fit

    def fit(self, X):
        """
        :param X: a list of strings
        :return: self
        """
        self.vect.fit(X)
        self.analyzer = self.vect.build_analyzer()
        # work on a copy, so that registering UNK does not alter the vocabulary of the vectorizer itself
        self.vocabulary_ = dict(self.vect.vocabulary_)
        self.unk = self.add_word('UNK')
        return self

    def transform(self, X, n_jobs=-1):
        """
        :param X: a list of strings
        :param n_jobs: passed to parallelize
        :return: an ndarray with one entry per document, holding the indices of its tokens
        """
        # -1 is the value set by the constructor, and is only replaced by fit
        assert self.unk != -1, 'transform called before fit'
        indexed = parallelize(func=self.index, args=X, n_jobs=n_jobs)
        return self._as_array(indexed)

    @staticmethod
    def _as_array(indexed):
        """
        Converts the indexed documents to an ndarray; documents of different lengths are stored in a
        one-dimensional array of lists, which recent versions of numpy refuse to build implicitly.
        """
        # presumably parallelize returns the concatenation of the lists produced by index -- TODO confirm
        indexed = list(indexed)
        if len({len(doc) for doc in indexed}) <= 1:
            return np.asarray(indexed)
        ragged = np.empty(len(indexed), dtype=object)
        for i, doc in enumerate(indexed):
            ragged[i] = doc
        return ragged

    def index(self, documents):
        """
        :param documents: a list of strings
        :return: a list with, for each document, the list of indices of its tokens
        """
        # imported locally because the name is not imported at the top of this module
        from tqdm import tqdm
        vocab = self.vocabulary_.copy()
        return [[vocab.get(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')]

    def fit_transform(self, X, n_jobs=-1):
        """
        :param X: a list of strings
        :param n_jobs: passed to transform
        :return: the result of transform, after fitting on X
        """
        return self.fit(X).transform(X, n_jobs=n_jobs)

    def vocabulary_size(self):
        # NOTE(review): after fit, vocabulary_ already contains the UNK entry added by add_word, so this
        # looks like it counts the reserved token twice; left unchanged in case callers rely on the value
        return len(self.vocabulary_) + 1  # the reserved unk token

    def add_word(self, word):
        """
        Registers a new word in the vocabulary, assigning it the next free index.

        :param word: the word to add
        :return: the index assigned to the word
        :raises ValueError: if the word is already in the vocabulary
        """
        if word in self.vocabulary_:
            raise ValueError(f'word {word} already in dictionary')
        self.vocabulary_[word] = len(self.vocabulary_)
        return self.vocabulary_[word]
|
||||
|
|
@ -0,0 +1,56 @@
|
|||
import numpy as np
|
||||
from scipy.sparse import dok_matrix
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def from_text(path):
    r"""
    Reads a labelled collection of documents.
    File format <0 or 1>\t<document>\n
    Blank lines and lines with an empty document are skipped. Only the first tab separates the label
    from the document, so documents may contain tabs themselves.

    :param path: path to the labelled collection
    :return: a list of sentences, and a list of labels
    :raises ValueError: if the label of a non-blank line is not an integer
    """
    # all lines are read upfront (so that the progress bar knows the total) and the file is closed
    with open(path, 'rt') as fin:
        lines = fin.readlines()

    all_sentences, all_labels = [], []
    for line in tqdm(lines, f'loading {path}'):
        line = line.strip()
        if not line:
            continue
        label, _, sentence = line.partition('\t')
        label = int(label)
        sentence = sentence.strip()
        if sentence:
            all_sentences.append(sentence)
            all_labels.append(label)
    return all_sentences, all_labels
|
||||
|
||||
|
||||
def from_sparse(path):
    r"""
    Reads a labelled collection of real-valued instances expressed in sparse format.
    File format <-1 or 0 or 1>[\s col(int):val(float)]\n
    Column numbers are shifted by -1 (i.e., they are 1-based in the file) and labels are shifted by +1
    (so that -1, 0, 1 become 0, 1, 2). Blank lines are skipped; a line containing only a label yields
    an all-zeros row.

    :param path: path to the labelled collection
    :return: a csr_matrix containing the instances (rows), and a ndarray containing the labels
    """

    def split_col_val(col_val):
        col, val = col_val.split(':')
        return int(col) - 1, float(val)

    # all lines are read upfront (so that the progress bar knows the total) and the file is closed
    with open(path, 'rt') as fin:
        lines = fin.readlines()

    all_documents, all_labels = [], []
    max_col = 0
    for line in tqdm(lines, f'loading {path}'):
        parts = line.strip().split()
        if not parts:
            continue
        all_labels.append(int(parts[0]))
        pairs = [split_col_val(col_val) for col_val in parts[1:]]
        cols = np.asarray([col for col, _ in pairs], dtype=int)
        vals = np.asarray([val for _, val in pairs], dtype=float)
        if cols.size:
            max_col = max(max_col, cols.max())
        all_documents.append((cols, vals))

    n_docs = len(all_labels)
    X = dok_matrix((n_docs, max_col + 1), dtype=float)
    for i, (cols, vals) in tqdm(enumerate(all_documents), total=len(all_documents),
                                desc=f'\\-- filling matrix of shape {X.shape}'):
        if cols.size:
            X[i, cols] = vals
    X = X.tocsr()
    y = np.asarray(all_labels) + 1
    return X, y
|
Loading…
Reference in New Issue