forked from moreo/QuaPy
some bugfixes, unittest and minor changes
This commit is contained in: parent bb7a77c7c0, commit 8b0b9f522a
@@ -22,6 +22,18 @@
 - examples directory created!

+- cross_val_predict (for quantification) added to model_selection: would be nice to allow the user to specify a
+  test protocol maybe, or None for bypassing it?
+
+- I think Pablo added DyS, Topsoe distance and binary search.
+
+- I think Pablo added multi-thread reproducibility.
+
+- Bugfix: adding two labelled collections (with +) now checks for consistency in the classes.
+
+- newer versions of numpy raise a warning when accessing types (e.g., np.float). I have replaced all such instances
+  with the plain python type (e.g., float).
+
 Things to fix:
 - clean functions like binary, aggregative, probabilistic, etc.; those should be resolved via isinstance():
   this is not working; I don't know how to make the isinstance work. Looks like there is some problem with the
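The cross_val_predict entry above is the one substantive API addition in this changelog. A usage sketch follows; only the function's existence in model_selection is stated above, so the signature and the dataset/quantifier choices here are assumptions for illustration.

# Hypothetical usage sketch: the signature (quantifier, data, nfolds,
# random_state) is assumed, mirroring sklearn's cross_val_predict naming.
import quapy as qp
from quapy.model_selection import cross_val_predict
from quapy.method.aggregative import PACC
from sklearn.linear_model import LogisticRegression

data = qp.datasets.fetch_reviews('kindle', tfidf=True).training
estim_prevalence = cross_val_predict(
    PACC(LogisticRegression()), data, nfolds=3, random_state=0)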
@@ -26,15 +26,15 @@ def load_raw_documents(path):
     documents = list(df["text"].values)
     labels = None
     if "label" in df.columns:
-        labels = df["label"].values.astype(np.int)
+        labels = df["label"].values.astype(int)
     return documents, labels


 def load_vector_documents(path):
-    D = pd.read_csv(path).to_numpy(dtype=np.float)
+    D = pd.read_csv(path).to_numpy(dtype=float)
     labelled = D.shape[1] == 301
     if labelled:
-        X, y = D[:, 1:], D[:, 0].astype(np.int).flatten()
+        X, y = D[:, 1:], D[:, 0].astype(int).flatten()
     else:
         X, y = D, None
     return X, y
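All three replacements in this hunk are instances of the same numpy deprecation described in the changelog; a minimal self-contained illustration with made-up data:

# np.int and np.float are deprecated aliases of the python builtins since
# numpy 1.20 and were removed in numpy 1.24; the builtins are valid dtypes.
import numpy as np

labels = np.asarray(['1', '0']).astype(int)  # was: .astype(np.int)
D = np.zeros((2, 301), dtype=float)          # was: dtype=np.float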
@@ -1,3 +1,5 @@
+from functools import cached_property
+
 import numpy as np
 from scipy.sparse import issparse
 from scipy.sparse import vstack
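For context on the new import: functools.cached_property (standard library, Python 3.8+) evaluates a property once per instance and caches the result. A generic sketch, not code from this commit:

from functools import cached_property

class Demo:
    @cached_property
    def n_classes(self):
        print('computed only on first access')
        return 2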
@@ -223,13 +225,44 @@ class LabelledCollection:
         test = LabelledCollection(te_docs, te_labels, classes_=self.classes_)
         return training, test

+    def split_random(self, train_prop=0.6, random_state=None):
+        """
+        Returns two instances of :class:`LabelledCollection` split randomly from this collection, at the desired
+        proportion.
+
+        :param train_prop: the proportion of elements to include in the left-most returned collection (typically used
+            as the training collection). The rest of the elements are included in the right-most returned collection
+            (typically used as a test collection).
+        :param random_state: if specified, guarantees reproducibility of the split.
+        :return: two instances of :class:`LabelledCollection`, the first one with `train_prop` elements, and the
+            second one with `1-train_prop` elements
+        """
+        indexes = np.random.RandomState(seed=random_state).permutation(len(self))
+        if isinstance(train_prop, int):
+            assert train_prop < len(self), \
+                'argument train_prop cannot be greater than the number of elements in the collection'
+            splitpoint = train_prop
+        elif isinstance(train_prop, float):
+            assert 0 < train_prop < 1, \
+                'argument train_prop out of range (0,1)'
+            splitpoint = int(np.round(len(self) * train_prop))
+        left, right = indexes[:splitpoint], indexes[splitpoint:]
+        training = self.sampling_from_index(left)
+        test = self.sampling_from_index(right)
+        return training, test
+
     def __add__(self, other):
         """
-        Returns a new :class:`LabelledCollection` as the union of this collection with another collection
+        Returns a new :class:`LabelledCollection` as the union of this collection with another collection.
+        Both labelled collections must have the same classes.
+
         :param other: another :class:`LabelledCollection`
         :return: a :class:`LabelledCollection` representing the union of both collections
         """
+        if not all(np.sort(self.classes_) == np.sort(other.classes_)):
+            raise NotImplementedError('unsupported operation for collections on different classes')
+
         if other is None:
             return self
         elif issparse(self.instances) and issparse(other.instances):
@@ -241,7 +274,7 @@ class LabelledCollection:
         else:
             raise NotImplementedError('unsupported operation for collection types')
         labels = np.concatenate([self.labels, other.labels])
-        return LabelledCollection(join_instances, labels)
+        return LabelledCollection(join_instances, labels, classes_=self.classes_)

     @property
     def Xy(self):
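A usage sketch combining the two changes above, i.e. split_random and the class-consistency check now performed by +; the data here is made up for illustration:

import numpy as np
from quapy.data import LabelledCollection

data = LabelledCollection(np.random.rand(100, 5), np.tile([0, 1], 50))
train, test = data.split_random(train_prop=0.6, random_state=42)
merged = train + test   # raises NotImplementedError on mismatched classes
assert len(merged) == len(data)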
@@ -102,7 +102,7 @@ def reindex_labels(y):
     y = np.asarray(y)
     classnames = np.asarray(sorted(np.unique(y)))
     label2index = {label: index for index, label in enumerate(classnames)}
-    indexed = np.empty(y.shape, dtype=np.int)
+    indexed = np.empty(y.shape, dtype=int)
     for label in classnames:
         indexed[y == label] = label2index[label]
     return indexed, classnames
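A worked example of reindex_labels, with illustrative values:

indexed, classnames = reindex_labels(['b', 'a', 'b'])
# indexed    -> array([1, 0, 1])   (labels mapped to contiguous indices)
# classnames -> array(['a', 'b'])  (the sorted original labels)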
@@ -121,7 +121,7 @@ def binarize(y, pos_class):
         0 otherwise
     """
     y = np.asarray(y)
-    ybin = np.zeros(y.shape, dtype=np.int)
+    ybin = np.zeros(y.shape, dtype=int)
     ybin[y == pos_class] = 1
     return ybin
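And a worked example of binarize:

ybin = binarize([2, 0, 1, 2], pos_class=2)
# ybin -> array([1, 0, 0, 1]): 1 where the label equals pos_class, 0 otherwise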
@@ -39,7 +39,7 @@ def prevalence_from_labels(labels, classes):
         raise ValueError(f'param labels does not seem to be a ndarray of label predictions')
     unique, counts = np.unique(labels, return_counts=True)
     by_class = defaultdict(lambda: 0, dict(zip(unique, counts)))
-    prevalences = np.asarray([by_class[class_] for class_ in classes], dtype=np.float)
+    prevalences = np.asarray([by_class[class_] for class_ in classes], dtype=float)
     prevalences /= prevalences.sum()
     return prevalences
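Worked example for this hunk; a class absent from labels simply gets prevalence 0:

import numpy as np

prev = prevalence_from_labels(np.array([0, 0, 1]), classes=[0, 1, 2])
# prev -> approximately array([0.667, 0.333, 0.]): counts normalized to sum to 1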
@@ -132,7 +132,11 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):

     def set_params(self, **parameters):
         if isinstance(self.learner, CalibratedClassifierCV):
-            parameters = {'base_estimator__' + k: v for k, v in parameters.items()}
+            if self.learner.get_params().get('base_estimator') == 'deprecated':
+                key_prefix = 'estimator__'  # this has changed in the newer versions of sklearn
+            else:
+                key_prefix = 'base_estimator__'
+            parameters = {key_prefix + k: v for k, v in parameters.items()}
         self.learner.set_params(**parameters)
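The new guard handles sklearn's rename of CalibratedClassifierCV's base_estimator parameter to estimator (deprecated in sklearn 1.2 and removed in later releases). A standalone sketch of an equivalent, slightly more defensive version check:

from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression

cc = CalibratedClassifierCV(LogisticRegression())
# nested params are addressed through a version-dependent prefix
if 'estimator' in cc.get_params():        # sklearn >= 1.2
    cc.set_params(estimator__C=0.1)
else:                                     # older sklearn
    cc.set_params(base_estimator__C=0.1)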
@@ -369,7 +373,7 @@ class ACC(AggregativeQuantifier):
         # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
         # document that belongs to yj ends up being classified as belonging to yi
         conf = confusion_matrix(y, y_, labels=classes).T
-        conf = conf.astype(np.float)
+        conf = conf.astype(float)
         class_counts = conf.sum(axis=0)
         for i, _ in enumerate(classes):
             if class_counts[i] == 0:
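The comment in this hunk describes column-normalizing the transposed confusion matrix; a standalone illustration with made-up predictions (the real code additionally guards against classes with zero counts, as the class_counts check shows):

import numpy as np
from sklearn.metrics import confusion_matrix

y_true = [0, 0, 1, 1, 1]
y_pred = [0, 1, 1, 1, 0]
conf = confusion_matrix(y_true, y_pred, labels=[0, 1]).T.astype(float)
conf /= conf.sum(axis=0, keepdims=True)  # column j now estimates P(yi|yj)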
@@ -370,7 +370,7 @@ def brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs
     bins[-1] += 0.001

     # we use this to keep track of how many datapoints contribute to each bin
-    inds_histogram_global = np.zeros(n_bins, dtype=np.float)
+    inds_histogram_global = np.zeros(n_bins, dtype=float)
     n_methods = len(method_order)
     buckets = np.zeros(shape=(n_methods, n_bins, 3))
     for i, method in enumerate(method_order):
@@ -23,7 +23,8 @@ def _get_parallel_slices(n_tasks, n_jobs):
 def map_parallel(func, args, n_jobs):
     """
     Applies func to n_jobs slices of args. E.g., if args is an array of 99 items and n_jobs=2, then
-    func is applied in two parallel processes to args[0:50] and to args[50:99]
+    func is applied in two parallel processes to args[0:50] and to args[50:99]. func is a function
+    that already works with a list of arguments.

     :param func: function to be parallelized
     :param args: array-like of arguments to be passed to the function in different parallel calls
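A minimal sketch of the behaviour the amended docstring describes. The implementation below is an assumption for illustration (this hunk only shows the docstring), with joblib standing in for whatever backend map_parallel actually uses:

import numpy as np
from joblib import Parallel, delayed

def map_parallel_sketch(func, args, n_jobs):
    # split args into n_jobs contiguous slices and hand each whole slice to func
    slices = np.array_split(np.arange(len(args)), n_jobs)
    results = Parallel(n_jobs=n_jobs)(
        delayed(func)([args[i] for i in sl]) for sl in slices
    )
    return [r for chunk in results for r in chunk]  # flatten per-slice results

# e.g.: map_parallel_sketch(lambda xs: [2 * x for x in xs], list(range(99)), n_jobs=2)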