diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index 3470726..55e8ce3 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -75,11 +75,12 @@ class AggregativeQuantifier(BaseQuantifier, ABC): empty_class_names = data.classes_[empty_classes] raise ValueError(f'classes {empty_class_names} have no training examples') - def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None): + def fit(self, X, y, fit_classifier=True, val_split=None): """ Trains the aggregative quantifier. This comes down to training a classifier and an aggregation function. - :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + :param X: `array-like` of shape `(n_samples, n_features)` consisting of the training covariates + :param y: `array-like` of shape `(n_samples,)` consisting of the instance labels :param fit_classifier: whether to train the learner (default is True). Set to False if the learner has been trained outside the quantifier. :param val_split: specifies the data used for generating classifier predictions. This specification @@ -92,16 +93,17 @@ class AggregativeQuantifier(BaseQuantifier, ABC): :return: self """ self._check_init_parameters() - classif_predictions = self.classifier_fit_predict(data, fit_classifier, predict_on=val_split) - self.aggregation_fit(classif_predictions, data) + P, y = self.classifier_fit_predict(X, y, fit_classifier, predict_on=val_split) + self.aggregation_fit(P, y) return self - def classifier_fit_predict(self, data: LabelledCollection, fit_classifier=True, predict_on=None): + def classifier_fit_predict(self, X, y, fit_classifier=True, predict_on=None): """ Trains the classifier if requested (`fit_classifier=True`) and generates the necessary predictions to train the aggregation function. - :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + :param X: `array-like` of shape `(n_samples, n_features)` consisting of the training covariates + :param y: `array-like` of shape `(n_samples,)` consisting of the instance labels :param fit_classifier: whether to train the learner (default is True). Set to False if the learner has been trained outside the quantifier. :param predict_on: specifies the set on which predictions need to be issued. This parameter can @@ -113,10 +115,11 @@ class AggregativeQuantifier(BaseQuantifier, ABC): """ assert isinstance(fit_classifier, bool), 'unexpected type for "fit_classifier", must be boolean' + data = LabelledCollection(X, y) self._check_classifier(adapt_if_necessary=(self._classifier_method() == 'predict_proba')) if fit_classifier: self._check_non_empty_classes(data) if predict_on is None: if not fit_classifier: @@ -170,16 +173,16 @@ class AggregativeQuantifier(BaseQuantifier, ABC): f'use either a float indicating the split proportion, or a ' f'tuple (X,y) indicating the validation partition') - return predictions + return predictions.Xy @abstractmethod - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + def aggregation_fit(self, classif_predictions: np.ndarray, y: np.ndarray): """ Trains the aggregation function. 
- :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing, - as instances, the predictions issued by the classifier and, as labels, the true labels - :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + :param classif_predictions: `array-like` consisting of the predictions issued by the classifier: posterior probabilities + of shape `(n_samples, n_classes)` for probabilistic quantifiers, or crisp label predictions of shape `(n_samples,)` for non-probabilistic ones + :param y: `array-like` of shape `(n_samples,)` consisting of the instance labels """ ... @@ -201,16 +204,16 @@ class AggregativeQuantifier(BaseQuantifier, ABC): """ self.classifier_ = classifier - def classify(self, instances): + def classify(self, X): """ Provides the label predictions for the given instances. The predictions should respect the format expected by :meth:`aggregate`, e.g., posterior probabilities for probabilistic quantifiers, or crisp predictions for non-probabilistic quantifiers. The default one is "decision_function". - :param instances: array-like of shape `(n_instances, n_features,)` + :param X: array-like of shape `(n_instances, n_features)` :return: np.ndarray of shape `(n_instances,)` with label predictions """ - return getattr(self.classifier, self._classifier_method())(instances) + return getattr(self.classifier, self._classifier_method())(X) def _classifier_method(self): """ @@ -230,15 +233,15 @@ class AggregativeQuantifier(BaseQuantifier, ABC): assert hasattr(self.classifier, self._classifier_method()), \ f"the method does not implement the required {self._classifier_method()} method" - def quantify(self, instances): + def quantify(self, X): """ Generate class prevalence estimates for the sample's instances by aggregating the label predictions generated by the classifier. - :param instances: array-like + :param X: array-like of shape `(n_instances, n_features)` :return: `np.ndarray` of shape `(n_classes)` with class prevalence estimates. """ - classif_predictions = self.classify(instances) + classif_predictions = self.classify(X) return self.aggregate(classif_predictions) @abstractmethod @@ -328,9 +331,9 @@ class BinaryAggregativeQuantifier(AggregativeQuantifier, BinaryQuantifier): def neg_label(self): return self.classifier.classes_[0] - def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None): - self._check_binary(data, self.__class__.__name__) - return super().fit(data, fit_classifier, val_split) + def fit(self, X, y, fit_classifier=True, val_split=None): + self._check_binary(y, self.__class__.__name__) + return super().fit(X, y, fit_classifier, val_split) # Methods @@ -346,12 +349,12 @@ class CC(AggregativeCrispQuantifier): def __init__(self, classifier: BaseEstimator): self.classifier = classifier - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + def aggregation_fit(self, classif_predictions: np.ndarray, y: np.ndarray): """ Nothing to do here! :param classif_predictions: not used - :param data: not used + :param y: not used """ pass @@ -376,12 +379,12 @@ class PCC(AggregativeSoftQuantifier): def __init__(self, classifier: BaseEstimator): self.classifier = classifier - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + def aggregation_fit(self, classif_posteriors: np.ndarray, y: np.ndarray): """ Nothing to do here! 
- :param classif_predictions: not used - :param data: not used + :param classif_posteriors: not used + :param y: not used """ pass @@ -482,17 +485,16 @@ class ACC(AggregativeCrispQuantifier): if self.norm not in ACC.NORMALIZATIONS: raise ValueError(f"unknown normalization; valid ones are {ACC.NORMALIZATIONS}") - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + def aggregation_fit(self, classif_predictions: np.ndarray, y: np.ndarray): """ Estimates the misclassification rates. - :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing, - as instances, the label predictions issued by the classifier and, as labels, the true labels - :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + :param classif_predictions: `array-like` of shape `(n_samples,)` + consisting of the label predictions issued by the classifier for the training examples + :param y: `array-like` of shape `(n_samples,)` consisting of the true instance labels """ - pred_labels, true_labels = classif_predictions.Xy self.cc = CC(self.classifier) - self.Pte_cond_estim_ = ACC.getPteCondEstim(self.classifier.classes_, true_labels, pred_labels) + self.Pte_cond_estim_ = ACC.getPteCondEstim(self.classifier.classes_, y, classif_predictions) @classmethod def getPteCondEstim(cls, classes, y, y_): @@ -593,17 +595,15 @@ class PACC(AggregativeSoftQuantifier): if self.norm not in ACC.NORMALIZATIONS: raise ValueError(f"unknown normalization; valid ones are {ACC.NORMALIZATIONS}") - def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): + def aggregation_fit(self, classif_predictions: np.ndarray, y: np.ndarray): """ Estimates the misclassification rates - :param classif_predictions: a :class:`quapy.data.base.LabelledCollection` containing, - as instances, the posterior probabilities issued by the classifier and, as labels, the true labels - :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + :param classif_predictions: `array-like` of shape `(n_samples, n_classes)` + consisting of the posterior probabilities of the training examples + :param y: `array-like` of shape `(n_samples,)` consisting of the instance labels """ - posteriors, true_labels = classif_predictions.Xy self.pcc = PCC(self.classifier) - self.Pte_cond_estim_ = PACC.getPteCondEstim(self.classifier.classes_, true_labels, posteriors) + self.Pte_cond_estim_ = PACC.getPteCondEstim(self.classifier.classes_, y, classif_predictions) def aggregate(self, classif_posteriors): prevs_estim = self.pcc.aggregate(classif_posteriors) diff --git a/quapy/method/base.py b/quapy/method/base.py index 58cd6f1..bd6cd80 100644 --- a/quapy/method/base.py +++ b/quapy/method/base.py @@ -19,21 +19,22 @@ class BaseQuantifier(BaseEstimator): """ @abstractmethod - def fit(self, data: LabelledCollection): + def fit(self, X, y): """ Trains a quantifier. - :param data: a :class:`quapy.data.base.LabelledCollection` consisting of the training data + :param X: `array-like` of shape `(n_samples, n_features)` consisting of the training covariates + :param y: `array-like` of shape `(n_samples,)` consisting of the instance labels :return: self """ ... 
@abstractmethod - def quantify(self, instances): + def quantify(self, X): """ Generate class prevalence estimates for the sample's instances - :param instances: array-like + :param X: `array-like` of shape `(n_samples, n_features)` consisting of the test covariates :return: `np.ndarray` of shape `(n_classes,)` with class prevalence estimates. """ ... @@ -45,8 +46,9 @@ (typically, to be interpreted as one class and its complement). """ - def _check_binary(self, data: LabelledCollection, quantifier_name): - assert data.binary, f'{quantifier_name} works only on problems of binary classification. ' \ + def _check_binary(self, y, quantifier_name): + n_classes = len(np.unique(y)) + assert n_classes == 2, f'{quantifier_name} works only on problems of binary classification. ' \ f'Use the class OneVsAll to enable {quantifier_name} to work on single-label data.' @@ -78,7 +80,8 @@ class OneVsAllGeneric(OneVsAll, BaseQuantifier): self.binary_quantifier = binary_quantifier self.n_jobs = qp._get_njobs(n_jobs) - def fit(self, data: LabelledCollection, fit_classifier=True): + def fit(self, X, y, fit_classifier=True): + data = LabelledCollection(X, y) assert not data.binary, f'{self.__class__.__name__} expects non-binary data' assert fit_classifier == True, 'fit_classifier must be True' @@ -93,8 +96,8 @@ class OneVsAllGeneric(OneVsAll, BaseQuantifier): ) ) - def quantify(self, instances): - prevalences = self._parallel(self._delayed_binary_predict, instances) + def quantify(self, X): + prevalences = self._parallel(self._delayed_binary_predict, X) return qp.functional.normalize_prevalence(prevalences) @property diff --git a/quapy/method/non_aggregative.py b/quapy/method/non_aggregative.py index 4104a3f..dd59f3d 100644 --- a/quapy/method/non_aggregative.py +++ b/quapy/method/non_aggregative.py @@ -8,7 +8,7 @@ from quapy.method.base import BaseQuantifier, BinaryQuantifier import quapy.functional as F -class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier): +class MLPE(BaseQuantifier): """ The `Maximum Likelihood Prevalence Estimation` (MLPE) method is a lazy method that assumes there is no prior probability shift between training and test instances (put another way, that the i.i.d. assumption holds). @@ -20,13 +20,15 @@ def __init__(self): self._classes_ = None - def fit(self, data: LabelledCollection): + def fit(self, X, y): """ Computes the training prevalence and stores it. - :param data: the training sample + :param X: `array-like` of shape `(n_samples, n_features)` consisting of the training covariates + :param y: `array-like` of shape `(n_samples,)` consisting of the instance labels :return: self """ + data = LabelledCollection(X, y) self.estimated_prevalence = data.prevalence() return self @@ -100,7 +102,7 @@ class DMx(BaseQuantifier): return distributions - def fit(self, data: LabelledCollection): + def fit(self, X, y): """ Generates the validation distributions out of the training data (covariates). The validation distributions have shape `(n, nfeats, nbins)`, with `n` the number of classes, `nfeats` @@ -109,15 +111,16 @@ training data labelled with class `i`; while `dij = di[j]` is the discrete distribution for feature j in training data labelled with class `i`, and `dij[k]` is the fraction of instances with a value in the `k`-th bin. 
- :param data: the training set + :param X: `array-like` of shape `(n_samples, n_features)` consisting of the training covariates + :param y: `array-like` of shape `(n_samples,)` consisting of the instance labels + :return: self """ - X, y = data.Xy - + data = LabelledCollection(X, y) self.nfeats = X.shape[1] self.feat_ranges = _get_features_range(X) self.validation_distribution = np.asarray( - [self.__get_distributions(X[y==cat]) for cat in range(data.n_classes)] + [self.__get_distributions(X[y==cat]) for cat in data.classes_] ) return self @@ -147,53 +150,53 @@ return F.argmin_prevalence(loss, n_classes, method=self.search) -class ReadMe(BaseQuantifier): - - def __init__(self, bootstrap_trials=100, bootstrap_range=100, bagging_trials=100, bagging_range=25, **vectorizer_kwargs): - raise NotImplementedError('under development ...') - self.bootstrap_trials = bootstrap_trials - self.bootstrap_range = bootstrap_range - self.bagging_trials = bagging_trials - self.bagging_range = bagging_range - self.vectorizer_kwargs = vectorizer_kwargs - - def fit(self, data: LabelledCollection): - X, y = data.Xy - self.vectorizer = CountVectorizer(binary=True, **self.vectorizer_kwargs) - X = self.vectorizer.fit_transform(X) - self.class_conditional_X = {i: X[y==i] for i in range(data.classes_)} - - def quantify(self, instances): - X = self.vectorizer.transform(instances) - - # number of features - num_docs, num_feats = X.shape - - # bootstrap - p_boots = [] - for _ in range(self.bootstrap_trials): - docs_idx = np.random.choice(num_docs, size=self.bootstra_range, replace=False) - class_conditional_X = {i: X[docs_idx] for i, X in self.class_conditional_X.items()} - Xboot = X[docs_idx] - - # bagging - p_bags = [] - for _ in range(self.bagging_trials): - feat_idx = np.random.choice(num_feats, size=self.bagging_range, replace=False) - class_conditional_Xbag = {i: X[:, feat_idx] for i, X in class_conditional_X.items()} - Xbag = Xboot[:,feat_idx] - p = self.std_constrained_linear_ls(Xbag, class_conditional_Xbag) - p_bags.append(p) - p_boots.append(np.mean(p_bags, axis=0)) - - p_mean = np.mean(p_boots, axis=0) - p_std = np.std(p_bags, axis=0) - - return p_mean - - - def std_constrained_linear_ls(self, X, class_cond_X: dict): - pass +# class ReadMe(BaseQuantifier): +# +# def __init__(self, bootstrap_trials=100, bootstrap_range=100, bagging_trials=100, bagging_range=25, **vectorizer_kwargs): +# raise NotImplementedError('under development ...') +# self.bootstrap_trials = bootstrap_trials +# self.bootstrap_range = bootstrap_range +# self.bagging_trials = bagging_trials +# self.bagging_range = bagging_range +# self.vectorizer_kwargs = vectorizer_kwargs +# +# def fit(self, data: LabelledCollection): +# X, y = data.Xy +# self.vectorizer = CountVectorizer(binary=True, **self.vectorizer_kwargs) +# X = self.vectorizer.fit_transform(X) +# self.class_conditional_X = {i: X[y==i] for i in range(data.classes_)} +# +# def quantify(self, instances): +# X = self.vectorizer.transform(instances) +# +# # number of features +# num_docs, num_feats = X.shape +# +# # bootstrap +# p_boots = [] +# for _ in range(self.bootstrap_trials): +# docs_idx = np.random.choice(num_docs, size=self.bootstra_range, replace=False) +# class_conditional_X = {i: X[docs_idx] for i, X in self.class_conditional_X.items()} +# Xboot = X[docs_idx] +# +# # bagging +# p_bags = [] +# for _ in range(self.bagging_trials): +# feat_idx = np.random.choice(num_feats, size=self.bagging_range, replace=False) +# class_conditional_Xbag = {i: 
X[:, feat_idx] for i, X in class_conditional_X.items()} +# Xbag = Xboot[:,feat_idx] +# p = self.std_constrained_linear_ls(Xbag, class_conditional_Xbag) +# p_bags.append(p) +# p_boots.append(np.mean(p_bags, axis=0)) +# +# p_mean = np.mean(p_boots, axis=0) +# p_std = np.std(p_bags, axis=0) +# +# return p_mean +# +# +# def std_constrained_linear_ls(self, X, class_cond_X: dict): +# pass def _get_features_range(X): @@ -209,4 +212,5 @@ def _get_features_range(X): # aliases #--------------------------------------------------------------- +MaximumLikelihoodPrevalenceEstimation = MLPE DistributionMatchingX = DMx \ No newline at end of file
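The patch above migrates the public API from `fit(data: LabelledCollection)` to the sklearn-style `fit(X, y)`. As a quick sanity check of the new entry points, here is a minimal usage sketch; it assumes quapy with this patch applied plus scikit-learn, and the synthetic data and the choice of PACC over LogisticRegression are illustrative only, not part of the patch:

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import PACC

# illustrative synthetic data: 300 training points, 3 classes, 4 features
rng = np.random.default_rng(0)
X_train = rng.normal(size=(300, 4))     # (n_samples, n_features) covariates
y_train = rng.integers(0, 3, size=300)  # (n_samples,) instance labels
X_test = rng.normal(size=(100, 4))

quantifier = PACC(LogisticRegression())
# before this patch: quantifier.fit(LabelledCollection(X_train, y_train))
quantifier.fit(X_train, y_train)          # trains the classifier and the aggregation function
prevalence = quantifier.quantify(X_test)  # np.ndarray of shape (n_classes,)
print(prevalence)

The same `fit(X, y)` / `quantify(X)` call pattern applies to CC, ACC, DMx and MLPE after this change, since they all implement the signature declared by `BaseQuantifier`.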