From 41baeb78ca1918797a0115b3f43945ff07253258 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Thu, 23 Oct 2025 12:35:01 +0200 Subject: [PATCH] lazy index construction in labelled collection --- CHANGE_LOG.txt | 1 + experimental_non_aggregative/method_dxs.py | 24 ++++++++++++---------- quapy/data/base.py | 9 ++++++-- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/CHANGE_LOG.txt b/CHANGE_LOG.txt index cc6be1b..b6066b9 100644 --- a/CHANGE_LOG.txt +++ b/CHANGE_LOG.txt @@ -3,6 +3,7 @@ Change Log 0.2.1 - Improved documentation of confidence regions. - Added ReadMe method by Daniel Hopkins and Gary King +- Internal index in LabelledCollection is now "lazy", and is only constructed if required. Change Log 0.2.0 ----------------- diff --git a/experimental_non_aggregative/method_dxs.py b/experimental_non_aggregative/method_dxs.py index 54986a2..c531118 100644 --- a/experimental_non_aggregative/method_dxs.py +++ b/experimental_non_aggregative/method_dxs.py @@ -7,7 +7,7 @@ import numpy as np from experimental_non_aggregative.custom_vectorizers import * from protocol import APP -from quapy.method.aggregative import _get_divergence, HDy, DistributionMatching +from quapy.method.aggregative import HDy, DistributionMatchingY from quapy.method.base import BaseQuantifier from scipy import optimize import pandas as pd @@ -30,28 +30,30 @@ class DxS(BaseQuantifier): # return np.asarray(instances.sum(axis=0) / instances.sum()).flatten() def __as_distribution(self, instances): - dist = instances.sum(axis=0) / instances.sum() + dist = instances.mean(axis=0) return np.asarray(dist).flatten() - def fit(self, data: LabelledCollection): + def fit(self, text_instances, labels): - text_instances, labels = data.Xy + classes = np.unique(labels) if self.vectorizer is not None: text_instances = self.vectorizer.fit_transform(text_instances, y=labels) distributions = [] - for class_i in data.classes_: + for class_i in classes: distributions.append(self.__as_distribution(text_instances[labels == class_i])) + self.validation_distribution = np.asarray(distributions) + return self - def quantify(self, text_instances): + def predict(self, text_instances): if self.vectorizer is not None: text_instances = self.vectorizer.transform(text_instances) test_distribution = self.__as_distribution(text_instances) - divergence = _get_divergence(self.divergence) + divergence = qp.functional.get_divergence(self.divergence) n_classes, n_feats = self.validation_distribution.shape def match(prev): @@ -121,10 +123,10 @@ if __name__ == '__main__': hdy = HDy(LogisticRegression()) yield data, hdy, 'HDy' - dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=5) + dm = DistributionMatchingY(LogisticRegression(), divergence=div, nbins=5) yield data, dm, 'DM-5b' - dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=10) + dm = DistributionMatchingY(LogisticRegression(), divergence=div, nbins=10) yield data, dm, 'DM-10b' @@ -132,9 +134,9 @@ if __name__ == '__main__': with open(result_path, 'wt') as csv: csv.write(f'Method\tDataset\tMAE\tMRAE\n') for data, quantifier, quant_name in gen_methods(): - quantifier.fit(data.training) + quantifier.fit(*data.training.Xy) report = qp.evaluation.evaluation_report(quantifier, APP(data.test, repeats=repeats), error_metrics=['mae','mrae'], verbose=True) - means = report.mean() + means = report.mean(numeric_only=True) csv.write(f'{quant_name}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n') df = pd.read_csv(result_path, sep='\t') diff --git a/quapy/data/base.py b/quapy/data/base.py index 82c57db..6db182a 100644 --- a/quapy/data/base.py +++ b/quapy/data/base.py @@ -33,7 +33,6 @@ class LabelledCollection: else: self.instances = np.asarray(instances) self.labels = np.asarray(labels) - n_docs = len(self) if classes is None: self.classes_ = F.classes_from_labels(self.labels) else: @@ -41,7 +40,13 @@ class LabelledCollection: self.classes_.sort() if len(set(self.labels).difference(set(classes))) > 0: raise ValueError(f'labels ({set(self.labels)}) contain values not included in classes_ ({set(classes)})') - self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_} + self._index = None + + @property + def index(self): + if self._index is None: + self._index = {class_: np.arange(len(self))[self.labels == class_] for class_ in self.classes_} + return self._index @classmethod def load(cls, path: str, loader_func: callable, classes=None, **loader_kwargs):