from typing import Union

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.neighbors import KernelDensity
from sklearn.metrics.pairwise import rbf_kernel

import quapy as qp
from quapy.data import LabelledCollection
from quapy.method.aggregative import AggregativeSoftQuantifier
import quapy.functional as F


class KDEBase:
    """
    Common ancestor for KDE-based methods. Implements some common routines.
    """

    BANDWIDTH_METHOD = ['scott', 'silverman']

    @classmethod
    def _check_bandwidth(cls, bandwidth):
        """
        Checks that the bandwidth parameter is correct

        :param bandwidth: either a string (see BANDWIDTH_METHOD) or a float
        :return: nothing, but raises an exception for invalid values
        """
        assert bandwidth in KDEBase.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
            f'invalid bandwidth, valid ones are {KDEBase.BANDWIDTH_METHOD} or float values'
        if isinstance(bandwidth, float):
            assert 0 < bandwidth < 1, \
                "the bandwidth for KDEy should be in (0,1), since this method models the unit simplex"

    def get_kde_function(self, X, bandwidth):
        """
        Wraps the KDE function from scikit-learn.

        :param X: data for which the density function is to be estimated
        :param bandwidth: the bandwidth of the kernel
        :return: a scikit-learn's KernelDensity object
        """
        return KernelDensity(bandwidth=bandwidth).fit(X)

    def pdf(self, kde, X):
        """
        Wraps the density evaluation of scikit-learn's KDE. Scikit-learn returns log-scores (s),
        so this function returns :math:`e^{s}`

        :param kde: a previously fit KDE function
        :param X: the data for which the density is to be estimated
        :return: np.ndarray with the densities
        """
        return np.exp(kde.score_samples(X))

    def get_mixture_components(self, X, y, n_classes, bandwidth):
        """
        Returns an array containing the mixture components, i.e., the KDE functions for each class.

        :param X: the data containing the covariates
        :param y: the class labels
        :param n_classes: integer, the number of classes
        :param bandwidth: float, the bandwidth of the kernel
        :return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates
        """
        return [self.get_kde_function(X[y == cat], bandwidth) for cat in range(n_classes)]
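
# Illustrative sketch (not part of the library API): shows how the KDEBase routines combine into a
# class-conditional mixture density, which is what the KDEy variants below optimize over. The toy
# posteriors, the toy labels, and the helper name `_demo_mixture_density` are invented for this
# example; only KDEBase, numpy, and the bandwidth value come from the module itself.
def _demo_mixture_density():
    """
    Fits one KDE per class on toy 3-dimensional "posterior" vectors and evaluates the mixture
    density p_alpha(x) = sum_i alpha_i * p_i(x) at a query point, mirroring what the KDEy
    quantifiers do internally with classifier posteriors.
    """
    rng = np.random.default_rng(0)
    n_classes = 3
    X = rng.dirichlet(alpha=np.ones(n_classes), size=300)   # toy posterior probabilities (points on the simplex)
    y = rng.integers(0, n_classes, size=300)                 # toy class labels
    base = KDEBase()
    kdes = base.get_mixture_components(X, y, n_classes, bandwidth=0.1)
    alpha = np.full(n_classes, 1/n_classes)                  # a candidate prevalence vector
    query = np.atleast_2d([0.2, 0.3, 0.5])
    densities = np.asarray([base.pdf(kde_i, query)[0] for kde_i in kdes])
    return float(alpha @ densities)                          # mixture density at the query point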

class KDEyML(AggregativeSoftQuantifier, KDEBase):
    """
    Kernel Density Estimation model for quantification (KDEy) relying on the Kullback-Leibler divergence (KLD) as
    the divergence measure to be minimized. This method was first proposed in the paper
    `Kernel Density Estimation for Multiclass Quantification `_, in which the authors show that minimizing
    the distribution matching criterion for KLD is akin to performing maximum likelihood (ML).

    The distribution matching optimization problem comes down to solving:

    :math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} \\mathcal{D}(\\boldsymbol{p}_{\\alpha}||q_{\\widetilde{U}})`

    where :math:`p_{\\alpha}` is the mixture of class-specific KDEs with mixture parameter (hence class prevalence)
    :math:`\\alpha` defined by

    :math:`\\boldsymbol{p}_{\\alpha}(\\widetilde{x}) = \\sum_{i=1}^n \\alpha_i p_{\\widetilde{L}_i}(\\widetilde{x})`

    where :math:`p_X(\\boldsymbol{x}) = \\frac{1}{|X|} \\sum_{x_i\\in X} K\\left(\\frac{x-x_i}{h}\\right)` is the KDE
    function that uses the datapoints in X as the kernel centers.

    In KDEy-ML, the divergence is taken to be the Kullback-Leibler Divergence. This is equivalent to solving:

    :math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} -
    \\mathbb{E}_{q_{\\widetilde{U}}} \\left[ \\log \\boldsymbol{p}_{\\alpha}(\\widetilde{x}) \\right]`

    which corresponds to the maximum likelihood estimate.

    :param classifier: a scikit-learn's Estimator that generates a probabilistic classifier.
    :param val_split: specifies the data used for generating classifier predictions. This specification
        can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
        be extracted from the training set; or as an integer (default 10), indicating that the predictions
        are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
        for `k`); or as a collection defining the specific set of data to use for validation.
        Alternatively, this set can be specified at fit time by indicating the exact set of data
        on which the predictions are to be generated.
    :param bandwidth: float, the bandwidth of the Kernel
    :param n_jobs: number of parallel workers
    :param random_state: a seed to be set before fitting any base quantifier (default None)
    """

    def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None, random_state=None):
        self._check_bandwidth(bandwidth)
        self.classifier = classifier
        self.val_split = val_split
        self.bandwidth = bandwidth
        self.n_jobs = n_jobs
        self.random_state = random_state

    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.n_classes, self.bandwidth)
        return self

    def aggregate(self, posteriors: np.ndarray):
        """
        Searches for the mixture model parameter (the sought prevalence values) that maximizes the likelihood
        of the data (i.e., that minimizes the negative log-likelihood)

        :param posteriors: instances in the sample converted into posterior probabilities
        :return: a vector of class prevalence estimates
        """
        # seed the global RNG for reproducibility of the optimization routine
        if self.random_state is not None:
            np.random.seed(self.random_state)

        epsilon = 1e-10
        n_classes = len(self.mix_densities)
        test_densities = [self.pdf(kde_i, posteriors) for kde_i in self.mix_densities]

        def neg_loglikelihood(prev):
            test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
            test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
            return -np.sum(test_loglikelihood)

        return F.optim_minimize(neg_loglikelihood, n_classes)
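
# Illustrative usage sketch: the synthetic Gaussian blobs, the class centers, and the helper name
# `_demo_kdey_ml` are invented for this example; LabelledCollection, fit(), and quantify() are the
# standard quapy API used elsewhere in the library.
def _demo_kdey_ml():
    """
    Trains KDEy-ML on a toy 3-class problem and estimates the class prevalence values
    of an unlabelled test sample.
    """
    from sklearn.linear_model import LogisticRegression

    rng = np.random.default_rng(0)
    centers = np.asarray([[0., 0.], [2., 2.], [0., 3.]])
    X = np.vstack([rng.normal(loc=c, scale=1., size=(200, 2)) for c in centers])
    y = np.repeat(np.arange(3), 200)
    training = LabelledCollection(X, y)

    quantifier = KDEyML(classifier=LogisticRegression(), bandwidth=0.1)
    quantifier.fit(training)   # classifier predictions are generated via 10-fold CV (val_split=10)

    # in a real experiment, the test instances would come from a sample with unknown prevalence
    X_test = np.vstack([rng.normal(loc=c, scale=1., size=(size, 2))
                        for c, size in zip(centers, [70, 20, 10])])
    return quantifier.quantify(X_test)   # a vector of 3 estimated class prevalence values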

class KDEyHD(AggregativeSoftQuantifier, KDEBase):
    """
    Kernel Density Estimation model for quantification (KDEy) relying on the squared Hellinger Distance (HD) as
    the divergence measure to be minimized. This method was first proposed in the paper
    `Kernel Density Estimation for Multiclass Quantification `_, in which the authors proposed a
    Monte Carlo approach for minimizing the divergence.

    The distribution matching optimization problem comes down to solving:

    :math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} \\mathcal{D}(\\boldsymbol{p}_{\\alpha}||q_{\\widetilde{U}})`

    where :math:`p_{\\alpha}` is the mixture of class-specific KDEs with mixture parameter (hence class prevalence)
    :math:`\\alpha` defined by

    :math:`\\boldsymbol{p}_{\\alpha}(\\widetilde{x}) = \\sum_{i=1}^n \\alpha_i p_{\\widetilde{L}_i}(\\widetilde{x})`

    where :math:`p_X(\\boldsymbol{x}) = \\frac{1}{|X|} \\sum_{x_i\\in X} K\\left(\\frac{x-x_i}{h}\\right)` is the KDE
    function that uses the datapoints in X as the kernel centers.

    In KDEy-HD, the divergence is taken to be the squared Hellinger Distance, an f-divergence with corresponding
    f-generator function given by:

    :math:`f(u)=(\\sqrt{u}-1)^2`

    The authors proposed a Monte Carlo solution that relies on importance sampling:

    :math:`\\hat{D}_f(p||q)= \\frac{1}{t} \\sum_{i=1}^t f\\left(\\frac{p(x_i)}{q(x_i)}\\right) \\frac{q(x_i)}{r(x_i)}`

    where the datapoints (trials) :math:`x_1,\\ldots,x_t\\sim_{\\mathrm{iid}} r`, with :math:`r` the mixture of the
    class-specific KDEs with uniform mixture parameter (i.e., with uniform class prevalence).

    :param classifier: a scikit-learn's Estimator that generates a probabilistic classifier.
    :param val_split: specifies the data used for generating classifier predictions. This specification
        can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
        be extracted from the training set; or as an integer (default 10), indicating that the predictions
        are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
        for `k`); or as a collection defining the specific set of data to use for validation.
        Alternatively, this set can be specified at fit time by indicating the exact set of data
        on which the predictions are to be generated.
    :param divergence: a string indicating the divergence to minimize (currently, only 'HD' is supported)
    :param bandwidth: float, the bandwidth of the Kernel
    :param n_jobs: number of parallel workers
    :param random_state: a seed to be set before fitting any base quantifier (default None)
    :param montecarlo_trials: number of Monte Carlo trials (default 10000)
    """

    def __init__(self, classifier: BaseEstimator, val_split=10, divergence: str='HD',
                 bandwidth=0.1, n_jobs=None, random_state=None, montecarlo_trials=10000):
        self._check_bandwidth(bandwidth)
        self.classifier = classifier
        self.val_split = val_split
        self.divergence = divergence
        self.bandwidth = bandwidth
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.montecarlo_trials = montecarlo_trials

    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.n_classes, self.bandwidth)

        N = self.montecarlo_trials
        rs = self.random_state
        n = data.n_classes
        self.reference_samples = np.vstack([kde_i.sample(N//n, random_state=rs) for kde_i in self.mix_densities])
        self.reference_classwise_densities = np.asarray(
            [self.pdf(kde_j, self.reference_samples) for kde_j in self.mix_densities])
        self.reference_density = np.mean(self.reference_classwise_densities, axis=0)  # equiv. to (uniform @ self.reference_classwise_densities)

        return self

    def aggregate(self, posteriors: np.ndarray):
        # we retain all n*N examples (sampled from a mixture with uniform parameter), and then
        # apply importance sampling (IS). In this version we compute D(p_alpha||q) with IS.
        n_classes = len(self.mix_densities)

        test_kde = self.get_kde_function(posteriors, self.bandwidth)
        test_densities = self.pdf(test_kde, self.reference_samples)

        def f_squared_hellinger(u):
            return (np.sqrt(u)-1)**2

        # todo: this will fail when self.divergence is a callable, and is not the right place to do it anyway
        if self.divergence.lower() == 'hd':
            f = f_squared_hellinger
        else:
            raise ValueError('only squared HD is currently implemented')

        epsilon = 1e-10
        qs = test_densities + epsilon
        rs = self.reference_density + epsilon
        iw = qs/rs  # importance weights
        p_class = self.reference_classwise_densities + epsilon
        fracs = p_class/qs

        def divergence(prev):
            # ps / qs = (prev @ p_class) / qs = prev @ (p_class / qs) = prev @ fracs
            ps_div_qs = prev @ fracs
            return np.mean(f(ps_div_qs) * iw)

        return F.optim_minimize(divergence, n_classes)
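
# Illustrative sketch of the importance-sampling Monte Carlo estimator that KDEy-HD relies on,
# using toy 1-dimensional Gaussians in place of the KDE mixtures. The densities p, q, the reference
# (proposal) r, and the helper name `_demo_importance_sampling_hd` are invented for this example.
def _demo_importance_sampling_hd(trials=10_000):
    """
    Approximates the squared Hellinger distance, i.e., the f-divergence with f(u) = (sqrt(u)-1)^2,
    via D_f(p||q) ~= (1/t) sum_i f(p(x_i)/q(x_i)) * q(x_i)/r(x_i), with the trials x_1, ..., x_t
    drawn i.i.d. from a reference distribution r.
    """
    rng = np.random.default_rng(0)

    def gauss_pdf(x, mu, sigma):
        return np.exp(-0.5 * ((x - mu) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi))

    def p(x): return gauss_pdf(x, 0.0, 1.0)
    def q(x): return gauss_pdf(x, 1.0, 1.0)
    def r(x): return gauss_pdf(x, 0.5, 2.0)   # reference (proposal) density

    def f(u): return (np.sqrt(u) - 1) ** 2    # f-generator of the squared Hellinger distance

    x = rng.normal(loc=0.5, scale=2.0, size=trials)   # trials drawn from r
    return np.mean(f(p(x) / q(x)) * (q(x) / r(x)))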

class KDEyCS(AggregativeSoftQuantifier):
    """
    Kernel Density Estimation model for quantification (KDEy) relying on the Cauchy-Schwarz divergence (CS) as
    the divergence measure to be minimized. This method was first proposed in the paper
    `Kernel Density Estimation for Multiclass Quantification `_.

    The distribution matching optimization problem comes down to solving:

    :math:`\\hat{\\alpha} = \\arg\\min_{\\alpha\\in\\Delta^{n-1}} \\mathcal{D}(\\boldsymbol{p}_{\\alpha}||q_{\\widetilde{U}})`

    where :math:`p_{\\alpha}` is the mixture of class-specific KDEs with mixture parameter (hence class prevalence)
    :math:`\\alpha` defined by

    :math:`\\boldsymbol{p}_{\\alpha}(\\widetilde{x}) = \\sum_{i=1}^n \\alpha_i p_{\\widetilde{L}_i}(\\widetilde{x})`

    where :math:`p_X(\\boldsymbol{x}) = \\frac{1}{|X|} \\sum_{x_i\\in X} K\\left(\\frac{x-x_i}{h}\\right)` is the KDE
    function that uses the datapoints in X as the kernel centers.

    In KDEy-CS, the divergence is taken to be the Cauchy-Schwarz divergence given by:

    :math:`\\mathcal{D}_{\\mathrm{CS}}(p||q)=-\\log\\left(\\frac{\\int p(x)q(x)dx}{\\sqrt{\\int p(x)^2dx \\int q(x)^2dx}}\\right)`

    The authors showed that this distribution matching problem admits a closed-form solution.

    :param classifier: a scikit-learn's Estimator that generates a probabilistic classifier.
    :param val_split: specifies the data used for generating classifier predictions. This specification
        can be made as float in (0, 1) indicating the proportion of stratified held-out validation set to
        be extracted from the training set; or as an integer (default 10), indicating that the predictions
        are to be generated in a `k`-fold cross-validation manner (with this integer indicating the value
        for `k`); or as a collection defining the specific set of data to use for validation.
        Alternatively, this set can be specified at fit time by indicating the exact set of data
        on which the predictions are to be generated.
    :param bandwidth: float, the bandwidth of the Kernel
    :param n_jobs: number of parallel workers
    """

    def __init__(self, classifier: BaseEstimator, val_split=10, bandwidth=0.1, n_jobs=None):
        KDEBase._check_bandwidth(bandwidth)
        self.classifier = classifier
        self.val_split = val_split
        self.bandwidth = bandwidth
        self.n_jobs = n_jobs

    def gram_matrix_mix_sum(self, X, Y=None):
        # this adapts the output of the rbf_kernel function (pairwise evaluations of Gaussian kernels k(x,y))
        # to contain pairwise evaluations of N(x|mu,Sigma1+Sigma2) with mu=y and Sigma1 and Sigma2 being
        # two "scalar matrices" (h^2)*I each, so Sigma1+Sigma2 has scalar 2(h^2) (h is the bandwidth)
        h = self.bandwidth
        variance = 2 * (h**2)
        nD = X.shape[1]
        gamma = 1/(2*variance)
        norm_factor = 1/np.sqrt(((2*np.pi)**nD) * (variance**(nD)))
        gram = norm_factor * rbf_kernel(X, Y, gamma=gamma)
        return gram.sum()

    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        P, y = classif_predictions.Xy
        n = data.n_classes

        assert all(sorted(np.unique(y)) == np.arange(n)), \
            'label name gaps not allowed in current implementation'

        # counts_inv keeps track of the relative weight of each datapoint within its class
        # (i.e., the weight in its KDE model)
        counts_inv = 1 / (data.counts())

        # tr_tr_sums corresponds to symbol \overline{B} in the paper
        tr_tr_sums = np.zeros(shape=(n, n), dtype=float)
        for i in range(n):
            for j in range(n):
                if i > j:
                    tr_tr_sums[i, j] = tr_tr_sums[j, i]
                else:
                    block = self.gram_matrix_mix_sum(P[y == i], P[y == j] if i != j else None)
                    tr_tr_sums[i, j] = block

        # keep track of these data structures for the test phase
        self.Ptr = P
        self.ytr = y
        self.tr_tr_sums = tr_tr_sums
        self.counts_inv = counts_inv

        return self

    def aggregate(self, posteriors: np.ndarray):
        Ptr = self.Ptr
        Pte = posteriors
        y = self.ytr
        tr_tr_sums = self.tr_tr_sums

        M, nD = Pte.shape
        Minv = (1/M)  # t in the paper
        n = Ptr.shape[1]

        # becomes a constant that does not affect the optimization, no need to compute it
        # partC = 0.5*np.log(self.gram_matrix_mix_sum(Pte) * Kinv * Kinv)

        # tr_te_sums corresponds to \overline{a}*(1/Li)*(1/M) in the paper (note the constants
        # are already aggregated to tr_te_sums, so these multiplications are not carried out
        # at each iteration of the optimization phase)
        tr_te_sums = np.zeros(shape=n, dtype=float)
        for i in range(n):
            tr_te_sums[i] = self.gram_matrix_mix_sum(Ptr[y == i], Pte)

        def divergence(alpha):
            # called \overline{r} in the paper
            alpha_ratio = alpha * self.counts_inv

            # recall that tr_te_sums already accounts for the constant terms (1/Li)*(1/M)
            partA = -np.log((alpha_ratio @ tr_te_sums) * Minv)
            partB = 0.5 * np.log(alpha_ratio @ tr_tr_sums @ alpha_ratio)
            return partA + partB  # + partC

        return F.optim_minimize(divergence, n)
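
# Illustrative sketch of the closed-form identity exploited by gram_matrix_mix_sum: the integral of
# the product of two Gaussian kernels with common bandwidth h is itself a Gaussian evaluation with
# doubled variance, i.e., int N(x|a, h^2) N(x|b, h^2) dx = N(a|b, 2h^2). The 1-dimensional numerical
# check and the helper name `_demo_gaussian_overlap_identity` are invented for this example.
def _demo_gaussian_overlap_identity(a=0.3, b=0.8, h=0.1):
    """
    Compares a numerical approximation of the overlap integral of two 1-dimensional Gaussian
    kernels against the closed-form value N(a|b, 2h^2); the two should coincide up to numerical
    precision.
    """
    def gauss_pdf(x, mu, sigma):
        return np.exp(-0.5 * ((x - mu) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi))

    # rectangle-rule approximation of the overlap integral over a range that covers both kernels
    xs = np.linspace(-1.0, 2.0, 200_001)
    dx = xs[1] - xs[0]
    numerical = np.sum(gauss_pdf(xs, a, h) * gauss_pdf(xs, b, h)) * dx

    # closed form: a single Gaussian evaluation with variance 2*h^2
    closed_form = gauss_pdf(a, b, np.sqrt(2) * h)

    return numerical, closed_form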