From d1d4e08314dae13db12039269b8ba76f61bd6125 Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Tue, 16 Jun 2020 10:53:28 +0200
Subject: [PATCH] make optimization optional, download and unzip the corpus
 automatically, add Esuli's calibration

---
 src/author_identification.py |  13 +-
 src/experiments.sh           |  19 ++
 src/model.py                 |  25 +-
 src/util/calibration.py      | 589 +++++++++++++++++++++++++++++++++++
 4 files changed, 637 insertions(+), 9 deletions(-)
 create mode 100644 src/experiments.sh
 create mode 100644 src/util/calibration.py

diff --git a/src/author_identification.py b/src/author_identification.py
index 5133808..8130931 100755
--- a/src/author_identification.py
+++ b/src/author_identification.py
@@ -56,7 +56,14 @@ def main():
     Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
 
     print('Fitting the Verificator')
-    av = AuthorshipVerificator(nfolds=10)
+    if args.C is None:
+        params = {'C': np.logspace(-4, +3, 8)}
+        C = 1.
+    else:
+        params = None
+        C = args.C
+
+    av = AuthorshipVerificator(C=C, params=params)
     av.fit(Xtr, ytr)
 
     if args.unknown:
@@ -73,6 +80,7 @@
         f1_scores.append(f1_from_counters(tp, fp, fn, tn))
         counters.append((tp, fp, fn, tn))
         tee(f'F1 for {author} = {f1_scores[-1]:.3f}', log)
+        print(f'TP={tp} FP={fp} FN={fn} TN={tn}')
 
     if args.loo:
         print(f'Computing macro- and micro-averages (discarded {discarded}/{len(args.authors)})')
@@ -88,6 +96,7 @@
     log.close()
 
 
+
 def tee(msg, log):
     print(msg)
     log.write(f'{msg}\n')
@@ -111,6 +120,8 @@ if __name__ == '__main__':
                         help='path to the file of unknown paternity (default None)')
     parser.add_argument('--log', type=str, metavar='PATH', default='./results.txt',
                         help='path to the log file where to write the results (default ./results.txt)')
+    parser.add_argument('--C', type=float, metavar='C', default=None,
+                        help='set the parameter C (trade-off between error and margin), or leave unset to optimize it via cross-validation')
 
     args = parser.parse_args()
 
diff --git a/src/experiments.sh b/src/experiments.sh
new file mode 100644
index 0000000..f5019c1
--- /dev/null
+++ b/src/experiments.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+set -x
+
+corpus='../MedLatin'
+
+if [ ! 
-d "$corpus" ]; then
+  curl http://hlt.isti.cnr.it/medlatin/MedLatin.zip -o ../MedLatin.zip
+  unzip ../MedLatin.zip -d ../
+  rm ../MedLatin.zip
+fi
+
+PY="python3 author_identification.py"
+MedLatin1="../MedLatin/Corpora/MedLatin1"
+MedLatin2="../MedLatin/Corpora/MedLatin2"
+EP1="../MedLatin/Epistle/EpistolaXIII_1.txt"
+EP2="../MedLatin/Epistle/EpistolaXIII_2.txt"
+
+$PY $MedLatin1 ALL --unknown $EP1 --loo --log ./results_EP1.txt
+$PY $MedLatin2 ALL --unknown $EP2 --loo --log ./results_EP2.txt
diff --git a/src/model.py b/src/model.py
index 137fbbe..11bcc77 100755
--- a/src/model.py
+++ b/src/model.py
@@ -1,27 +1,28 @@
 from sklearn.metrics import make_scorer
 from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeaveOneGroupOut, cross_val_score, StratifiedKFold
-from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
-from sklearn.svm import *
+from sklearn.linear_model import LogisticRegression
 
 from data.features import *
+from util.calibration import CalibratedClassifierCV
 from util.evaluation import f1, get_counters
 
 
 class AuthorshipVerificator:
 
     def __init__(self, nfolds=10,
-                 params={'C': np.logspace(-4, +4, 9), 'class_weight': ['balanced', None]},
+                 params={'C': np.logspace(-4, +3, 8)},
+                 C=1.,
                  author_name=None):
         self.nfolds = nfolds
         self.params = params
         self.author_name = author_name if author_name else 'this author'
-        self.classifier = LogisticRegression()
+        self.classifier = LogisticRegression(C=C, class_weight='balanced')
 
     def fit(self, X, y):
         y = np.asarray(y)
         positive_examples = y.sum()
-        if positive_examples >= self.nfolds:
+        if positive_examples >= self.nfolds and self.params is not None:
             print('optimizing {}'.format(self.classifier.__class__.__name__))
-            folds = list(StratifiedKFold(n_splits=self.nfolds).split(X, y))
+            folds = list(StratifiedKFold(n_splits=self.nfolds, shuffle=True, random_state=42).split(X, y))
             self.estimator = GridSearchCV(
                 self.classifier, param_grid=self.params, cv=folds, scoring=make_scorer(f1), n_jobs=-1
             )
@@ -35,6 +36,9 @@ class AuthorshipVerificator:
             print(f'Best params: {self.estimator.best_params_} (cross-validation F1={f1_mean:.3f})')
             self.estimator = self.estimator.best_estimator_
 
+        self.estimator = CalibratedClassifierCV(base_estimator=self.estimator, cv=self.nfolds, ensemble=False)
+        self.estimator.fit(X, y)
+
         return self
 
     def leave_one_out(self, X, y, files, groups=None, test_lowest_index_only=True, counters=False):
@@ -50,9 +54,14 @@ class AuthorshipVerificator:
             folds = [(train, np.min(test, keepdims=True)) for train, test in folds]
 
         scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1)
-        missclassified = '\n'.join(files[scores == 0].tolist())
+        missclassified = files[scores == 0].tolist()
+        #if hasattr(self.estimator, 'predict_proba') and len(missclassified) > 0:
+        #    missclassified_prob = self.estimator.predict_proba(csr_matrix(X)[scores == 0])[:, 1]
+        #    missclassified_prob = missclassified_prob.flatten().tolist()
+        #    missclassified = [f'{file} Pr={prob:.3f}' for file, prob in zip(missclassified,missclassified_prob)]
         print('missclassified texts:')
-        print(missclassified)
+        print('\n'.join(missclassified))
+
 
         if counters and test_lowest_index_only:
             yfull_true = y[:len(folds)]
diff --git a/src/util/calibration.py b/src/util/calibration.py
new file mode 100644
index 0000000..eb329a1
--- /dev/null
+++ b/src/util/calibration.py
@@ -0,0 +1,589 @@
+"""Calibration of predicted probabilities."""
+
+# Author: Alexandre Gramfort
+#         Balazs Kegl
+#         Jan Hendrik Metzen
+#         Mathieu Blondel
+#
+# 
License: BSD 3 clause
+
+import warnings
+from inspect import signature
+
+from math import log
+import numpy as np
+
+from scipy.special import expit
+from scipy.special import xlogy
+from scipy.optimize import fmin_bfgs
+from sklearn.preprocessing import LabelEncoder
+
+from sklearn.base import (BaseEstimator, ClassifierMixin, RegressorMixin, clone,
+                          MetaEstimatorMixin)
+from sklearn.preprocessing import label_binarize, LabelBinarizer
+from sklearn.utils import check_array, indexable, column_or_1d
+from sklearn.utils.validation import check_is_fitted, check_consistent_length
+from sklearn.utils.validation import _check_sample_weight
+from sklearn.isotonic import IsotonicRegression
+from sklearn.svm import LinearSVC
+from sklearn.model_selection import check_cv, cross_val_predict
+from sklearn.utils.validation import _deprecate_positional_args
+
+
+class CalibratedClassifierCV(BaseEstimator, ClassifierMixin,
+                             MetaEstimatorMixin):
+    """Probability calibration with isotonic regression or logistic regression.
+    The calibration is based on the :term:`decision_function` method of the
+    `base_estimator` if it exists, else on :term:`predict_proba`.
+    Read more in the :ref:`User Guide <calibration>`.
+    Parameters
+    ----------
+    base_estimator : instance BaseEstimator
+        The classifier whose output needs to be calibrated to provide more
+        accurate `predict_proba` outputs.
+    method : 'sigmoid' or 'isotonic'
+        The method to use for calibration. Can be 'sigmoid' which
+        corresponds to Platt's method (i.e. a logistic regression model) or
+        'isotonic' which is a non-parametric approach. It is not advised to
+        use isotonic calibration with too few calibration samples
+        ``(<<1000)`` since it tends to overfit.
+    cv : integer, cross-validation generator, iterable or "prefit", optional
+        Determines the cross-validation splitting strategy.
+        Possible inputs for cv are:
+        - None, to use the default 5-fold cross-validation,
+        - integer, to specify the number of folds.
+        - :term:`CV splitter`,
+        - An iterable yielding (train, test) splits as arrays of indices.
+        For integer/None inputs, if ``y`` is binary or multiclass,
+        :class:`sklearn.model_selection.StratifiedKFold` is used. If ``y`` is
+        neither binary nor multiclass, :class:`sklearn.model_selection.KFold`
+        is used.
+        Refer :ref:`User Guide <cross_validation>` for the various
+        cross-validation strategies that can be used here.
+        If "prefit" is passed, it is assumed that `base_estimator` has been
+        fitted already and all data is used for calibration.
+        .. versionchanged:: 0.22
+            ``cv`` default value if None changed from 3-fold to 5-fold.
+    ensemble : bool, optional
+        When ``cv`` is not "prefit", it determines how the final estimator
+        is fit.
+        If ``ensemble`` is True (default), an estimator (clone of
+        base_estimator) is fit and calibrated on each fold. The final
+        estimator is an ensemble that averages the predicted probabilities
+        of all such estimators.
+        When ``ensemble`` is False, the cross-validation generator is used to
+        compute predictions (using :func:`cross_val_predict`), and the union
+        of these predictions is used for training the sigmoid or isotonic
+        model. The ``base_estimator`` is then fit on the whole data.
+    Attributes
+    ----------
+    classes_ : array, shape (n_classes)
+        The class labels. 
+ calibrated_classifiers_ : list (len() equal to cv or 1 if cv == "prefit" \ + or ensemble == False) + When ``ensemble`` is True, a list of calibrated classifiers, one for + each crossvalidation fold, which has been fitted on all but the + validation fold and calibrated on the validation fold. + When ``ensemble`` is False, or when using "prefit", the list contains + the single calibrated classifier produced by the fit method. + References + ---------- + .. [1] Obtaining calibrated probability estimates from decision trees + and naive Bayesian classifiers, B. Zadrozny & C. Elkan, ICML 2001 + .. [2] Transforming Classifier Scores into Accurate Multiclass + Probability Estimates, B. Zadrozny & C. Elkan, (KDD 2002) + .. [3] Probabilistic Outputs for Support Vector Machines and Comparisons to + Regularized Likelihood Methods, J. Platt, (1999) + .. [4] Predicting Good Probabilities with Supervised Learning, + A. Niculescu-Mizil & R. Caruana, ICML 2005 + """ + @_deprecate_positional_args + def __init__(self, base_estimator=None, *, method='sigmoid', cv=None, + ensemble=True): + self.base_estimator = base_estimator + self.method = method + self.cv = cv + self.ensemble = ensemble + + def fit(self, X, y, sample_weight=None): + """Fit the calibrated model + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training data. + y : array-like, shape (n_samples,) + Target values. + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. + Returns + ------- + self : object + Returns an instance of self. + """ + X, y = indexable(X, y) + le = LabelBinarizer().fit(y) + self.classes_ = le.classes_ + + # Check that each cross-validation fold can have at least one + # example per class + n_folds = self.cv if isinstance(self.cv, int) \ + else self.cv.n_folds if hasattr(self.cv, "n_folds") else None + if n_folds and \ + np.any([np.sum(y == class_) < n_folds for class_ in + self.classes_]): + raise ValueError("Requesting %d-fold cross-validation but provided" + " less than %d examples for at least one class." + % (n_folds, n_folds)) + + self.calibrated_classifiers_ = [] + if self.base_estimator is None: + # we want all classifiers that don't expose a random_state + # to be deterministic (and we don't want to expose this one). + base_estimator = LinearSVC(random_state=0) + else: + base_estimator = self.base_estimator + + if self.cv == "prefit": + calibrated_classifier = _CalibratedClassifier( + base_estimator, method=self.method) + calibrated_classifier.fit(X, y, sample_weight) + self.calibrated_classifiers_.append(calibrated_classifier) + else: + cv = check_cv(self.cv, y, classifier=True) + fit_parameters = signature(base_estimator.fit).parameters + base_estimator_supports_sw = "sample_weight" in fit_parameters + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + + if not base_estimator_supports_sw: + estimator_name = type(base_estimator).__name__ + warnings.warn("Since %s does not support sample_weights, " + "sample weights will only be used for the " + "calibration itself." 
% estimator_name) + if self.ensemble: + for train, test in cv.split(X, y): + this_estimator = clone(base_estimator) + + if sample_weight is not None and \ + base_estimator_supports_sw: + this_estimator.fit(X[train], y[train], + sample_weight=sample_weight[train]) + else: + this_estimator.fit(X[train], y[train]) + + calibrated_classifier = _CalibratedClassifier( + this_estimator, method=self.method, + classes=self.classes_) + sw = None if sample_weight is None else sample_weight[test] + calibrated_classifier.fit(X[test], y[test], + sample_weight=sw) + self.calibrated_classifiers_.append(calibrated_classifier) + else: + if hasattr(base_estimator, "decision_function"): + base_estimator_method = "decision_function" + elif hasattr(base_estimator, "predict_proba"): + base_estimator_method = "predict_proba" + else: + raise RuntimeError('classifier has no decision_function ' + 'or predict_proba method.') + predictions = cross_val_predict(base_estimator, X, y, cv=cv, + method=base_estimator_method) + this_estimator = clone(base_estimator) + if sample_weight is not None and base_estimator_supports_sw: + this_estimator.\ + fit(X, y, sample_weight=sample_weight) + else: + this_estimator.fit(X, y) + calibrated_classifier = \ + _CalibratedClassifier(this_estimator, method=self.method, + classes=self.classes_, + predictions_in_X=True) + if hasattr(this_estimator, "decision_function"): + if predictions.ndim == 1: + predictions = predictions[:, np.newaxis] + elif hasattr(this_estimator, "predict_proba"): + if len(self.classes_) == 2: + predictions = predictions[:, 1:] + calibrated_classifier.fit(predictions, y, sample_weight) + self.calibrated_classifiers_.append(calibrated_classifier) + return self + + def predict_proba(self, X): + """Posterior probabilities of classification + This function returns posterior probabilities of classification + according to each class on an array of test vectors X. + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The samples. + Returns + ------- + C : array, shape (n_samples, n_classes) + The predicted probas. + """ + check_is_fitted(self) + X = check_array(X, accept_sparse=['csc', 'csr', 'coo'], + force_all_finite=False) + # Compute the arithmetic mean of the predictions of the calibrated + # classifiers + mean_proba = np.zeros((X.shape[0], len(self.classes_))) + for calibrated_classifier in self.calibrated_classifiers_: + proba = calibrated_classifier.predict_proba(X) + mean_proba += proba + + mean_proba /= len(self.calibrated_classifiers_) + + return mean_proba + + def predict(self, X): + """Predict the target of new samples. The predicted class is the + class that has the highest probability, and can thus be different + from the prediction of the uncalibrated classifier. + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The samples. + Returns + ------- + C : array, shape (n_samples,) + The predicted class. + """ + check_is_fitted(self) + return self.classes_[np.argmax(self.predict_proba(X), axis=1)] + + +class _CalibratedClassifier: + """Probability calibration with isotonic regression or sigmoid. + It assumes that base_estimator has already been fit, and trains the + calibration on the input set of the fit function. Note that this class + should not be used as an estimator directly. Use CalibratedClassifierCV + with cv="prefit" instead. 
+    Parameters
+    ----------
+    base_estimator : instance BaseEstimator
+        The classifier whose output decision function needs to be calibrated
+        to offer more accurate predict_proba outputs. No default value since
+        it has to be an already fitted estimator.
+    method : 'sigmoid' | 'isotonic'
+        The method to use for calibration. Can be 'sigmoid' which
+        corresponds to Platt's method or 'isotonic' which is a
+        non-parametric approach based on isotonic regression.
+    classes : array-like, shape (n_classes,), optional
+            Contains unique classes used to fit the base estimator.
+            If None, then classes is extracted from the given target values
+            in fit().
+    predictions_in_X : bool, optional
+        When False (default), ``X`` holds the samples to be classified, and
+        predictions are obtained by applying the ``base_estimator`` to
+        ``X``.
+        When True, ``X`` already contains predictions.
+    See also
+    --------
+    CalibratedClassifierCV
+    References
+    ----------
+    .. [1] Obtaining calibrated probability estimates from decision trees
+           and naive Bayesian classifiers, B. Zadrozny & C. Elkan, ICML 2001
+    .. [2] Transforming Classifier Scores into Accurate Multiclass
+           Probability Estimates, B. Zadrozny & C. Elkan, (KDD 2002)
+    .. [3] Probabilistic Outputs for Support Vector Machines and Comparisons to
+           Regularized Likelihood Methods, J. Platt, (1999)
+    .. [4] Predicting Good Probabilities with Supervised Learning,
+           A. Niculescu-Mizil & R. Caruana, ICML 2005
+    """
+    @_deprecate_positional_args
+    def __init__(self, base_estimator, *, method='sigmoid', classes=None,
+                 predictions_in_X=False):
+        self.base_estimator = base_estimator
+        self.method = method
+        self.classes = classes
+        self.predictions_in_X_ = predictions_in_X
+
+    def _preproc(self, X):
+        n_classes = len(self.classes_)
+        if hasattr(self.base_estimator, "decision_function"):
+            df = self.base_estimator.decision_function(X)
+            if df.ndim == 1:
+                df = df[:, np.newaxis]
+        elif hasattr(self.base_estimator, "predict_proba"):
+            df = self.base_estimator.predict_proba(X)
+            if n_classes == 2:
+                df = df[:, 1:]
+        else:
+            raise RuntimeError('classifier has no decision_function or '
+                               'predict_proba method.')
+
+        idx_pos_class = self.label_encoder_.\
+            transform(self.base_estimator.classes_)
+
+        return df, idx_pos_class
+
+    def fit(self, X, y, sample_weight=None):
+        """Calibrate the fitted model
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            Training data.
+        y : array-like, shape (n_samples,)
+            Target values.
+        sample_weight : array-like of shape (n_samples,), default=None
+            Sample weights. If None, then samples are equally weighted.
+        Returns
+        -------
+        self : object
+            Returns an instance of self.
+        """
+
+        self.label_encoder_ = LabelEncoder()
+        if self.classes is None:
+            self.label_encoder_.fit(y)
+        else:
+            self.label_encoder_.fit(self.classes)
+
+        self.classes_ = self.label_encoder_.classes_
+        Y = label_binarize(y, classes=self.classes_)
+
+        if not self.predictions_in_X_:
+            df, idx_pos_class = self._preproc(X)
+        else:
+            df = X
+            idx_pos_class = \
+                self.label_encoder_.transform(self.base_estimator.classes_)
+        self.calibrators_ = []
+
+        for k, this_df in zip(idx_pos_class, df.T):
+            if self.method == 'isotonic':
+                calibrator = IsotonicRegression(out_of_bounds='clip')
+            elif self.method == 'sigmoid':
+                calibrator = _SigmoidCalibration()
+            else:
+                raise ValueError('method should be "sigmoid" or '
+                                 '"isotonic". Got %s.' 
% self.method) + calibrator.fit(this_df, Y[:, k], sample_weight) + self.calibrators_.append(calibrator) + + return self + + def predict_proba(self, X): + """Posterior probabilities of classification + This function returns posterior probabilities of classification + according to each class on an array of test vectors X. + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The samples. + Returns + ------- + C : array, shape (n_samples, n_classes) + The predicted probas. Can be exact zeros. + """ + n_classes = len(self.classes_) + proba = np.zeros((X.shape[0], n_classes)) + + df, idx_pos_class = self._preproc(X) + + for k, this_df, calibrator in \ + zip(idx_pos_class, df.T, self.calibrators_): + if n_classes == 2: + k += 1 + proba[:, k] = calibrator.predict(this_df) + + # Normalize the probabilities + if n_classes == 2: + proba[:, 0] = 1. - proba[:, 1] + else: + proba /= np.sum(proba, axis=1)[:, np.newaxis] + + # XXX : for some reason all probas can be 0 + proba[np.isnan(proba)] = 1. / n_classes + + # Deal with cases where the predicted probability minimally exceeds 1.0 + proba[(1.0 < proba) & (proba <= 1.0 + 1e-5)] = 1.0 + + return proba + + +def _sigmoid_calibration(df, y, sample_weight=None): + """Probability Calibration with sigmoid method (Platt 2000) + Parameters + ---------- + df : ndarray, shape (n_samples,) + The decision function or predict proba for the samples. + y : ndarray, shape (n_samples,) + The targets. + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. + Returns + ------- + a : float + The slope. + b : float + The intercept. + References + ---------- + Platt, "Probabilistic Outputs for Support Vector Machines" + """ + df = column_or_1d(df) + y = column_or_1d(y) + + F = df # F follows Platt's notations + + # Bayesian priors (see Platt end of section 2.2) + prior0 = float(np.sum(y <= 0)) + prior1 = y.shape[0] - prior0 + T = np.zeros(y.shape) + T[y > 0] = (prior1 + 1.) / (prior1 + 2.) + T[y <= 0] = 1. / (prior0 + 2.) + T1 = 1. - T + + def objective(AB): + # From Platt (beginning of Section 2.2) + P = expit(-(AB[0] * F + AB[1])) + loss = -(xlogy(T, P) + xlogy(T1, 1. - P)) + if sample_weight is not None: + return (sample_weight * loss).sum() + else: + return loss.sum() + + def grad(AB): + # gradient of the objective function + P = expit(-(AB[0] * F + AB[1])) + TEP_minus_T1P = T - P + if sample_weight is not None: + TEP_minus_T1P *= sample_weight + dA = np.dot(TEP_minus_T1P, F) + dB = np.sum(TEP_minus_T1P) + return np.array([dA, dB]) + + AB0 = np.array([0., log((prior0 + 1.) / (prior1 + 1.))]) + AB_ = fmin_bfgs(objective, AB0, fprime=grad, disp=False) + return AB_[0], AB_[1] + + +class _SigmoidCalibration(RegressorMixin, BaseEstimator): + """Sigmoid regression model. + Attributes + ---------- + a_ : float + The slope. + b_ : float + The intercept. + """ + def fit(self, X, y, sample_weight=None): + """Fit the model using X, y as training data. + Parameters + ---------- + X : array-like, shape (n_samples,) + Training data. + y : array-like, shape (n_samples,) + Training target. + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. + Returns + ------- + self : object + Returns an instance of self. 
+ """ + X = column_or_1d(X) + y = column_or_1d(y) + X, y = indexable(X, y) + + self.a_, self.b_ = _sigmoid_calibration(X, y, sample_weight) + return self + + def predict(self, T): + """Predict new data by linear interpolation. + Parameters + ---------- + T : array-like, shape (n_samples,) + Data to predict from. + Returns + ------- + T_ : array, shape (n_samples,) + The predicted data. + """ + T = column_or_1d(T) + return expit(-(self.a_ * T + self.b_)) + + +@_deprecate_positional_args +def calibration_curve(y_true, y_prob, *, normalize=False, n_bins=5, + strategy='uniform'): + """Compute true and predicted probabilities for a calibration curve. + The method assumes the inputs come from a binary classifier, and + discretize the [0, 1] interval into bins. + Calibration curves may also be referred to as reliability diagrams. + Read more in the :ref:`User Guide `. + Parameters + ---------- + y_true : array-like of shape (n_samples,) + True targets. + y_prob : array-like of shape (n_samples,) + Probabilities of the positive class. + normalize : bool, default=False + Whether y_prob needs to be normalized into the [0, 1] interval, i.e. + is not a proper probability. If True, the smallest value in y_prob + is linearly mapped onto 0 and the largest one onto 1. + n_bins : int, default=5 + Number of bins to discretize the [0, 1] interval. A bigger number + requires more data. Bins with no samples (i.e. without + corresponding values in `y_prob`) will not be returned, thus the + returned arrays may have less than `n_bins` values. + strategy : {'uniform', 'quantile'}, default='uniform' + Strategy used to define the widths of the bins. + uniform + The bins have identical widths. + quantile + The bins have the same number of samples and depend on `y_prob`. + Returns + ------- + prob_true : ndarray of shape (n_bins,) or smaller + The proportion of samples whose class is the positive class, in each + bin (fraction of positives). + prob_pred : ndarray of shape (n_bins,) or smaller + The mean predicted probability in each bin. + References + ---------- + Alexandru Niculescu-Mizil and Rich Caruana (2005) Predicting Good + Probabilities With Supervised Learning, in Proceedings of the 22nd + International Conference on Machine Learning (ICML). + See section 4 (Qualitative Analysis of Predictions). + """ + y_true = column_or_1d(y_true) + y_prob = column_or_1d(y_prob) + check_consistent_length(y_true, y_prob) + + if normalize: # Normalize predicted values into interval [0, 1] + y_prob = (y_prob - y_prob.min()) / (y_prob.max() - y_prob.min()) + elif y_prob.min() < 0 or y_prob.max() > 1: + raise ValueError("y_prob has values outside [0, 1] and normalize is " + "set to False.") + + labels = np.unique(y_true) + if len(labels) > 2: + raise ValueError("Only binary classification is supported. " + "Provided labels %s." % labels) + y_true = label_binarize(y_true, classes=labels)[:, 0] + + if strategy == 'quantile': # Determine bin edges by distribution of data + quantiles = np.linspace(0, 1, n_bins + 1) + bins = np.percentile(y_prob, quantiles * 100) + bins[-1] = bins[-1] + 1e-8 + elif strategy == 'uniform': + bins = np.linspace(0., 1. + 1e-8, n_bins + 1) + else: + raise ValueError("Invalid entry to 'strategy' input. 
Strategy " + "must be either 'quantile' or 'uniform'.") + + binids = np.digitize(y_prob, bins) - 1 + + bin_sums = np.bincount(binids, weights=y_prob, minlength=len(bins)) + bin_true = np.bincount(binids, weights=y_true, minlength=len(bins)) + bin_total = np.bincount(binids, minlength=len(bins)) + + nonzero = bin_total != 0 + prob_true = bin_true[nonzero] / bin_total[nonzero] + prob_pred = bin_sums[nonzero] / bin_total[nonzero] + + return prob_true, prob_pred \ No newline at end of file