diff --git a/quapy/CHANGE_LOG.txt b/quapy/CHANGE_LOG.txt
index 6bef8b0..06d7dc4 100644
--- a/quapy/CHANGE_LOG.txt
+++ b/quapy/CHANGE_LOG.txt
@@ -22,6 +22,18 @@
 
 - examples directory created!
 
+- cross_val_predict (for quantification) added to model_selection: would be nice to allow the user to specify a
+  test protocol, or None to bypass it?
+
+- I think Pablo added DyS, Topsoe distance and binary search.
+
+- I think Pablo added multi-thread reproducibility.
+
+- Bugfix: adding two labelled collections (with +) now checks for consistency in the classes
+
+- newer versions of numpy raise a deprecation warning when accessing type aliases (e.g., np.float). I have replaced
+  all such instances with the plain python type (e.g., float).
+
 Things to fix:
 - clean functions like binary, aggregative, probabilistic, etc; those should be resolved via isinstance(): this
     is not working; I don't know how to make the isinstance work. Looks like there is some problem with the
diff --git a/quapy/data/_lequa2022.py b/quapy/data/_lequa2022.py
index 79ccccc..449eab6 100644
--- a/quapy/data/_lequa2022.py
+++ b/quapy/data/_lequa2022.py
@@ -26,15 +26,15 @@ def load_raw_documents(path):
     documents = list(df["text"].values)
     labels = None
     if "label" in df.columns:
-        labels = df["label"].values.astype(np.int)
+        labels = df["label"].values.astype(int)
     return documents, labels
 
 
 def load_vector_documents(path):
-    D = pd.read_csv(path).to_numpy(dtype=np.float)
+    D = pd.read_csv(path).to_numpy(dtype=float)
     labelled = D.shape[1] == 301
     if labelled:
-        X, y = D[:, 1:], D[:, 0].astype(np.int).flatten()
+        X, y = D[:, 1:], D[:, 0].astype(int).flatten()
     else:
         X, y = D, None
     return X, y
diff --git a/quapy/data/base.py b/quapy/data/base.py
index 3c9bb67..62f871d 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -1,3 +1,5 @@
+from functools import cached_property
+
 import numpy as np
 from scipy.sparse import issparse
 from scipy.sparse import vstack
@@ -223,13 +225,44 @@ class LabelledCollection:
         test = LabelledCollection(te_docs, te_labels, classes_=self.classes_)
         return training, test
 
+
+    def split_random(self, train_prop=0.6, random_state=None):
+        """
+        Returns two instances of :class:`LabelledCollection` split randomly from this collection, at the desired
+        proportion.
+
+        :param train_prop: the proportion of elements to include in the left-most returned collection (typically used
+            as the training collection). The rest of the elements are included in the right-most returned collection
+            (typically used as a test collection).
+        :param random_state: if specified, guarantees reproducibility of the split.
+        :return: two instances of :class:`LabelledCollection`, the first one with `train_prop` elements, and the
+            second one with `1-train_prop` elements
+        """
+        indexes = np.random.RandomState(seed=random_state).permutation(len(self))
+        if isinstance(train_prop, int):
+            assert train_prop < len(self), \
+                'argument train_prop cannot be greater than the number of elements in the collection'
+            splitpoint = train_prop
+        elif isinstance(train_prop, float):
+            assert 0 < train_prop < 1, \
+                'argument train_prop out of range (0,1)'
+            splitpoint = int(np.round(len(self)*train_prop))
+        left, right = indexes[:splitpoint], indexes[splitpoint:]
+        training = self.sampling_from_index(left)
+        test = self.sampling_from_index(right)
+        return training, test
+
     def __add__(self, other):
         """
-        Returns a new :class:`LabelledCollection` as the union of this collection with another collection
+        Returns a new :class:`LabelledCollection` as the union of this collection with another collection.
+        Both labelled collections must have the same classes.
 
         :param other: another :class:`LabelledCollection`
         :return: a :class:`LabelledCollection` representing the union of both collections
         """
+        if other is not None and not np.array_equal(np.sort(self.classes_), np.sort(other.classes_)):
+            raise NotImplementedError('unsupported operation for collections on different classes')
+
         if other is None:
             return self
         elif issparse(self.instances) and issparse(other.instances):
@@ -241,7 +274,7 @@ class LabelledCollection:
         else:
             raise NotImplementedError('unsupported operation for collection types')
         labels = np.concatenate([self.labels, other.labels])
-        return LabelledCollection(join_instances, labels)
+        return LabelledCollection(join_instances, labels, classes_=self.classes_)
 
     @property
     def Xy(self):
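A minimal usage sketch of the new split_random method and the stricter '+' operator added above (the toy instances and labels are made up for illustration and are not part of the patch):

    import numpy as np
    from quapy.data.base import LabelledCollection

    # toy collection: ten 1-dimensional instances with binary labels
    X = np.arange(10).reshape(-1, 1)
    y = np.array([0, 1] * 5)
    data = LabelledCollection(X, y)

    # random 70/30 split; random_state makes the permutation reproducible
    train, test = data.split_random(train_prop=0.7, random_state=0)

    # train_prop can also be an int, taken as the absolute size of the first split
    train7, test3 = data.split_random(train_prop=7, random_state=0)

    # '+' now refuses collections with different classes and propagates
    # the classes of the operands to the result
    merged = train + test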
diff --git a/quapy/data/reader.py b/quapy/data/reader.py
index 8f8bc79..88791e3 100644
--- a/quapy/data/reader.py
+++ b/quapy/data/reader.py
@@ -102,7 +102,7 @@ def reindex_labels(y):
     y = np.asarray(y)
     classnames = np.asarray(sorted(np.unique(y)))
     label2index = {label: index for index, label in enumerate(classnames)}
-    indexed = np.empty(y.shape, dtype=np.int)
+    indexed = np.empty(y.shape, dtype=int)
     for label in classnames:
         indexed[y==label] = label2index[label]
     return indexed, classnames
@@ -121,7 +121,7 @@ def binarize(y, pos_class):
         0 otherwise
     """
     y = np.asarray(y)
-    ybin = np.zeros(y.shape, dtype=np.int)
+    ybin = np.zeros(y.shape, dtype=int)
     ybin[y == pos_class] = 1
     return ybin
 
diff --git a/quapy/functional.py b/quapy/functional.py
index 8cf0312..3ee46ff 100644
--- a/quapy/functional.py
+++ b/quapy/functional.py
@@ -39,7 +39,7 @@ def prevalence_from_labels(labels, classes):
         raise ValueError(f'param labels does not seem to be a ndarray of label predictions')
     unique, counts = np.unique(labels, return_counts=True)
     by_class = defaultdict(lambda:0, dict(zip(unique, counts)))
-    prevalences = np.asarray([by_class[class_] for class_ in classes], dtype=np.float)
+    prevalences = np.asarray([by_class[class_] for class_ in classes], dtype=float)
     prevalences /= prevalences.sum()
     return prevalences
 
diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py
index 4cec2cd..57c821d 100644
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -132,7 +132,11 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
 
     def set_params(self, **parameters):
         if isinstance(self.learner, CalibratedClassifierCV):
-            parameters = {'base_estimator__' + k: v for k, v in parameters.items()}
+            if self.learner.get_params().get('base_estimator') == 'deprecated':
+                key_prefix = 'estimator__'  # this has changed in the newer versions of sklearn
+            else:
+                key_prefix = 'base_estimator__'
+            parameters = {key_prefix + k: v for k, v in parameters.items()}
         self.learner.set_params(**parameters)
 
 
@@ -369,7 +373,7 @@ class ACC(AggregativeQuantifier):
         # estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
         # document that belongs to yj ends up being classified as belonging to yi
         conf = confusion_matrix(y, y_, labels=classes).T
-        conf = conf.astype(np.float)
+        conf = conf.astype(float)
         class_counts = conf.sum(axis=0)
         for i, _ in enumerate(classes):
             if class_counts[i] == 0:
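For context on the set_params change above: in recent versions of scikit-learn the classifier wrapped by CalibratedClassifierCV is exposed as 'estimator' rather than 'base_estimator', so nested hyperparameters take the 'estimator__' prefix. A small illustrative sketch (not part of the patch):

    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.linear_model import LogisticRegression

    calibrated = CalibratedClassifierCV(LogisticRegression())

    # newer sklearn releases address the wrapped classifier as 'estimator'
    calibrated.set_params(estimator__C=10)

    # older releases instead expect the 'base_estimator__' prefix:
    # calibrated.set_params(base_estimator__C=10)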
diff --git a/quapy/plot.py b/quapy/plot.py
index cdb9b1e..67ccd52 100644
--- a/quapy/plot.py
+++ b/quapy/plot.py
@@ -370,7 +370,7 @@ def brokenbar_supremacy_by_drift(method_names, true_prevs, estim_prevs, tr_prevs
     bins[-1] += 0.001
 
     # we use this to keep track of how many datapoits contribute to each bin
-    inds_histogram_global = np.zeros(n_bins, dtype=np.float)
+    inds_histogram_global = np.zeros(n_bins, dtype=float)
     n_methods = len(method_order)
     buckets = np.zeros(shape=(n_methods, n_bins, 3))
     for i, method in enumerate(method_order):
diff --git a/quapy/util.py b/quapy/util.py
index 94187e6..50a640d 100644
--- a/quapy/util.py
+++ b/quapy/util.py
@@ -23,7 +23,8 @@ def _get_parallel_slices(n_tasks, n_jobs):
 def map_parallel(func, args, n_jobs):
     """
     Applies func to n_jobs slices of args. E.g., if args is an array of 99 items and n_jobs=2, then
-    func is applied in two parallel processes to args[0:50] and to args[50:99]
+    func is applied in two parallel processes to args[0:50] and to args[50:99]. func is a function
+    that already works with a list of arguments.
 
     :param func: function to be parallelized
     :param args: array-like of arguments to be passed to the function in different parallel calls
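A small usage sketch of map_parallel as documented above; the helper double_all is made up, and how the per-slice results are combined back is assumed here (see quapy/util.py for the actual reduction), the rest follows the documented behaviour:

    from quapy.util import map_parallel

    def double_all(batch):
        # func receives a whole slice (a list) of the arguments at once
        return [2 * x for x in batch]

    # the six items are cut into two slices and processed by two parallel
    # workers; the per-slice results are then combined into a single result
    doubled = map_parallel(double_all, list(range(6)), n_jobs=2)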