diff --git a/experimental_non_aggregative/method_dxs.py b/experimental_non_aggregative/method_dxs.py
index c531118..93fb67e 100644
--- a/experimental_non_aggregative/method_dxs.py
+++ b/experimental_non_aggregative/method_dxs.py
@@ -1,16 +1,21 @@
+from scipy.sparse import issparse
+from sklearn.decomposition import TruncatedSVD
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from sklearn.linear_model import LogisticRegression
+from sklearn.preprocessing import StandardScaler
 import quapy as qp
 from data import LabelledCollection
 import numpy as np
 from experimental_non_aggregative.custom_vectorizers import *
+from method._kdey import KDEBase
 from protocol import APP
 from quapy.method.aggregative import HDy, DistributionMatchingY
 from quapy.method.base import BaseQuantifier
 from scipy import optimize
 import pandas as pd
+import quapy.functional as F
 
 
 # TODO: explore the bernoulli (term presence/absence) variant
@@ -72,6 +77,51 @@ class DxS(BaseQuantifier):
 
 
 
+class KDExML(BaseQuantifier, KDEBase):
+
+    def __init__(self, bandwidth=0.1, standardize=False):
+        self._check_bandwidth(bandwidth)
+        self.bandwidth = bandwidth
+        self.standardize = standardize
+
+    def fit(self, X, y):
+        classes = sorted(np.unique(y))
+
+        if issparse(X):
+            X = X.toarray()
+
+        if self.standardize:
+            self.scaler = StandardScaler()
+            X = self.scaler.fit_transform(X)
+
+        self.mix_densities = self.get_mixture_components(X, y, classes, self.bandwidth)
+        return self
+
+    def predict(self, X):
+        """
+        Searches for the mixture model parameters (the sought prevalence values) that maximize the likelihood
+        of the data (i.e., that minimize the negative log-likelihood).
+
+        :param X: instances in the sample
+        :return: a vector of class prevalence estimates
+        """
+        epsilon = 1e-10
+        if issparse(X):
+            X = X.toarray()
+        n_classes = len(self.mix_densities)
+        if self.standardize:
+            X = self.scaler.transform(X)
+        test_densities = [self.pdf(kde_i, X) for kde_i in self.mix_densities]
+
+        def neg_loglikelihood(prev):
+            test_mixture_likelihood = sum(prev_i * dens_i for prev_i, dens_i in zip(prev, test_densities))
+            test_loglikelihood = np.log(test_mixture_likelihood + epsilon)
+            return -np.sum(test_loglikelihood)
+
+        return F.optim_minimize(neg_loglikelihood, n_classes)
+
+
+
 if __name__ == '__main__':
 
     qp.environ['SAMPLE_SIZE'] = 250
@@ -91,43 +141,51 @@ if __name__ == '__main__':
         data = qp.datasets.fetch_reviews(dataset, tfidf=False)
 
-        bernoulli_vectorizer = CountVectorizer(min_df=min_df, binary=True)
-        dxs = DxS(divergence=div, vectorizer=bernoulli_vectorizer)
-        yield data, dxs, 'DxS-Bernoulli'
-
-        multinomial_vectorizer = CountVectorizer(min_df=min_df, binary=False)
-        dxs = DxS(divergence=div, vectorizer=multinomial_vectorizer)
-        yield data, dxs, 'DxS-multinomial'
-
-        tf_vectorizer = TfidfVectorizer(sublinear_tf=False, use_idf=False, min_df=min_df, norm=None)
-        dxs = DxS(divergence=div, vectorizer=tf_vectorizer)
-        yield data, dxs, 'DxS-TF'
-
-        logtf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=False, min_df=min_df, norm=None)
-        dxs = DxS(divergence=div, vectorizer=logtf_vectorizer)
-        yield data, dxs, 'DxS-logTF'
-
-        tfidf_vectorizer = TfidfVectorizer(use_idf=True, min_df=min_df, norm=None)
-        dxs = DxS(divergence=div, vectorizer=tfidf_vectorizer)
-        yield data, dxs, 'DxS-TFIDF'
-
-        tfidf_vectorizer = TfidfVectorizer(use_idf=True, min_df=min_df, norm='l2')
-        dxs = DxS(divergence=div, vectorizer=tfidf_vectorizer)
-        yield data, dxs, 'DxS-TFIDF-l2'
+        # bernoulli_vectorizer = CountVectorizer(min_df=min_df, binary=True)
+        # dxs = DxS(divergence=div, vectorizer=bernoulli_vectorizer)
+        # yield data, dxs, 'DxS-Bernoulli'
+        #
+        # multinomial_vectorizer = CountVectorizer(min_df=min_df, binary=False)
+        # dxs = DxS(divergence=div, vectorizer=multinomial_vectorizer)
+        # yield data, dxs, 'DxS-multinomial'
+        #
+        # tf_vectorizer = TfidfVectorizer(sublinear_tf=False, use_idf=False, min_df=min_df, norm=None)
+        # dxs = DxS(divergence=div, vectorizer=tf_vectorizer)
+        # yield data, dxs, 'DxS-TF'
+        #
+        # logtf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=False, min_df=min_df, norm=None)
+        # dxs = DxS(divergence=div, vectorizer=logtf_vectorizer)
+        # yield data, dxs, 'DxS-logTF'
+        #
+        # tfidf_vectorizer = TfidfVectorizer(use_idf=True, min_df=min_df, norm=None)
+        # dxs = DxS(divergence=div, vectorizer=tfidf_vectorizer)
+        # yield data, dxs, 'DxS-TFIDF'
+        #
+        # tfidf_vectorizer = TfidfVectorizer(use_idf=True, min_df=min_df, norm='l2')
+        # dxs = DxS(divergence=div, vectorizer=tfidf_vectorizer)
+        # yield data, dxs, 'DxS-TFIDF-l2'
 
         tsr_vectorizer = TSRweighting(tsr_function=information_gain, min_df=min_df, norm='l2')
         dxs = DxS(divergence=div, vectorizer=tsr_vectorizer)
         yield data, dxs, 'DxS-TFTSR-l2'
 
         data = qp.datasets.fetch_reviews(dataset, tfidf=True, min_df=min_df)
+
+        kdex = KDExML()
+        reduction = TruncatedSVD(n_components=100, random_state=0)
+        red_data = qp.data.preprocessing.instance_transformation(data, transformer=reduction, inplace=False)
+        yield red_data, kdex, 'KDEx'
+
         hdy = HDy(LogisticRegression())
         yield data, hdy, 'HDy'
 
-        dm = DistributionMatchingY(LogisticRegression(), divergence=div, nbins=5)
-        yield data, dm, 'DM-5b'
+        # dm = DistributionMatchingY(LogisticRegression(), divergence=div, nbins=5)
+        # yield data, dm, 'DM-5b'
+        #
+        # dm = DistributionMatchingY(LogisticRegression(), divergence=div, nbins=10)
+        # yield data, dm, 'DM-10b'
+
 
-        dm = DistributionMatchingY(LogisticRegression(), divergence=div, nbins=10)
-        yield data, dm, 'DM-10b'
 
     result_path = 'results.csv'
diff --git a/quapy/data/preprocessing.py b/quapy/data/preprocessing.py
index b196c11..5f7e0a9 100644
--- a/quapy/data/preprocessing.py
+++ b/quapy/data/preprocessing.py
@@ -24,6 +24,7 @@ def instance_transformation(dataset:Dataset, transformer, inplace=False):
     """
     training_transformed = transformer.fit_transform(*dataset.training.Xy)
     test_transformed = transformer.transform(dataset.test.X)
+    orig_name = dataset.name
     if inplace:
         dataset.training = LabelledCollection(training_transformed, dataset.training.labels, dataset.classes_)
@@ -34,10 +35,10 @@ def instance_transformation(dataset:Dataset, transformer, inplace=False):
     else:
         training = LabelledCollection(training_transformed, dataset.training.labels.copy(), dataset.classes_)
         test = LabelledCollection(test_transformed, dataset.test.labels.copy(), dataset.classes_)
+        vocab = None
         if hasattr(transformer, 'vocabulary_'):
-            return Dataset(training, test, transformer.vocabulary_)
-        else:
-            return Dataset(training, test)
+            vocab = transformer.vocabulary_
+        return Dataset(training, test, vocabulary=vocab, name=orig_name)
 
 
 def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
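
For reviewers, a minimal usage sketch of the new KDExML quantifier together with the updated instance_transformation helper, mirroring the experiment added in method_dxs.py; the 'hp' dataset name and the hyperparameters (min_df=5, n_components=100, bandwidth=0.1, standardize=True) are illustrative choices, not part of the patch:

# minimal sketch, assuming the quapy API used in the patch above;
# dataset name and hyperparameters are illustrative
import quapy as qp
from sklearn.decomposition import TruncatedSVD
from experimental_non_aggregative.method_dxs import KDExML

data = qp.datasets.fetch_reviews('hp', tfidf=True, min_df=5)

# KDEs are fit on dense vectors, so first project the tfidf matrix onto a
# low-dimensional dense space, as the experiment script does
reduction = TruncatedSVD(n_components=100, random_state=0)
red_data = qp.data.preprocessing.instance_transformation(data, transformer=reduction, inplace=False)

kdex = KDExML(bandwidth=0.1, standardize=True)
kdex.fit(*red_data.training.Xy)
print(kdex.predict(red_data.test.X))  # estimated class prevalences of the test set

Note that, thanks to the preprocessing.py change, red_data keeps the original dataset name, so downstream result tables remain labelled per dataset.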
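As a side note on what predict() is optimizing, here is a self-contained toy illustration of the same maximum-likelihood idea using only numpy, scikit-learn and scipy (independent of quapy and of F.optim_minimize): one KDE is fit per class, and the estimated prevalences are the mixture weights that minimize the negative log-likelihood of the unlabelled sample over the probability simplex. All data, bandwidth and solver settings below are illustrative:

# toy illustration of KDE-based maximum-likelihood prevalence estimation
import numpy as np
from scipy.optimize import minimize
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
# class 0 ~ N(0,1), class 1 ~ N(3,1)
Xtr = [rng.normal(0, 1, (500, 1)), rng.normal(3, 1, (500, 1))]
# unlabelled sample whose true prevalence is (0.3, 0.7)
Xte = np.vstack([rng.normal(0, 1, (300, 1)), rng.normal(3, 1, (700, 1))])

# one kernel density estimate per class
kdes = [KernelDensity(bandwidth=0.1).fit(X) for X in Xtr]
# class-conditional densities evaluated on the unlabelled sample
test_densities = [np.exp(kde.score_samples(Xte)) for kde in kdes]

def neg_loglikelihood(prev):
    mixture = sum(p * dens for p, dens in zip(prev, test_densities))
    return -np.sum(np.log(mixture + 1e-10))

# constrained search over the probability simplex (non-negative, sums to 1)
n_classes = len(test_densities)
uniform = np.full(n_classes, 1 / n_classes)
sol = minimize(neg_loglikelihood, uniform,
               bounds=[(0, 1)] * n_classes,
               constraints={'type': 'eq', 'fun': lambda p: p.sum() - 1})
print(sol.x)  # expected to be close to (0.3, 0.7)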