Testing first GP (Gaussian process) quantifier on binary datasets, with PDF results table

2024-04-17 11:50:37 +02:00 · 2024-04-17 11:50:37 +02:00 · 6dfa1d3536
parent 820bdc8f18
commit 6dfa1d3536
3 changed files with 165 additions and 27 deletions
--- a/ClassifierAccuracy/gaussian_process.py
+++ b/ClassifierAccuracy/gaussian_process.py
@ -1,26 +1,78 @@
+import os.path
+import pickle
+from pathlib import Path
+
+from sklearn.linear_model import LogisticRegression
+
+from method.aggregative import PACC, EMQ, KDEyML
+
+"""
+Ideas:
+Try kernel based on feature covariance matrix, with dot product and with another kernel
+Try Cauchy-Schwarz kernel
+
+"""
+
 import sklearn.metrics
 from sklearn.gaussian_process import GaussianProcessRegressor
 import numpy as np
 from sklearn.gaussian_process.kernels import RBF, GenericKernelMixin, Kernel
 from sklearn.metrics.pairwise import pairwise_distances, pairwise_kernels

+from data import LabelledCollection
+from protocol import UPP
+from quapy.method.base import BaseQuantifier, BinaryQuantifier
+import quapy.functional as F
+from result_table.src.table import Table
+
 np.random.seed(0)


-class MinL2Kernel(GenericKernelMixin, Kernel):
+class FeatCovKernel(GenericKernelMixin, Kernel):
+    """Kernel comparing two flattened samples via their feature-correlation distance matrices."""
+    def __init__(self, dimensions):
+        self.dimensions = dimensions  # number of columns used to un-flatten each sample in _f
+
+    def _f(self, sample1, sample2):
+        """Return exp(-||D1 - D2||), D1/D2 being the feature-wise 'correlation' distance matrices."""
+        per_sample = []
+        for flat in (sample1, sample2):
+            instances = flat.reshape(-1, self.dimensions)
+            per_sample.append(pairwise_distances(instances.T, metric='correlation'))
+        first, second = per_sample
+        return np.exp(-np.linalg.norm(first - second))
+
+    def __call__(self, X, Y=None, eval_gradient=False):
+        """Return the matrix of _f values between X and Y (Y defaults to X); gradients unsupported."""
+        if eval_gradient:
+            raise NotImplementedError()
+        others = X if Y is None else Y
+        rows = []
+        for x in X:
+            rows.append([self._f(x, y) for y in others])
+        return np.array(rows)
+
+    def diag(self, X):
+        """Return the value of _f for every sample in X paired with itself."""
+        return np.array([self._f(x, x) for x in X])
+
+    def is_stationary(self):
+        return True
+
+class AveL2Kernel(GenericKernelMixin, Kernel):
    """
    A minimal (but valid) convolutional kernel for sequences of variable
    lengths."""

-    def __init__(self):
-        pass
+    def __init__(self, dimensions):
+        self.dimensions=dimensions

    def _f(self, sample1, sample2):
        """
        kernel value between a pair of sequences
        """
-        sample1 = sample1.reshape(-1, 3)
-        sample2 = sample2.reshape(-1, 3)
+        sample1 = sample1.reshape(-1, self.dimensions)
+        sample2 = sample2.reshape(-1, self.dimensions)
        dist = pairwise_distances(sample1, sample2)
        mean_dist = dist.mean()
        closenest = np.exp(-mean_dist)
@ -83,14 +135,15 @@ def RJSDk(sample_1, sample_2):
    pi1 = n1 / (n1 + n2)
    pi2 = n2 / (n1 + n2)
    Z = np.concatenate([sample_1, sample_2])
-    # Kz = pairwise_kernels(Z, metric='rbf', n_jobs=-1)
-    Kz = pairwise_kernels(Z, metric='cosine', n_jobs=-1)
+    Kz = pairwise_kernels(Z, metric='rbf', n_jobs=-1)
+    # Kz = pairwise_kernels(Z, metric='cosine', n_jobs=-1)
    Kx = Kz[:n1, :n1]
    Ky = Kz[n1:, n1:]

    SKz = S(Kz)
    SKx = S(Kx)
    SKy = S(Ky)
+
    return SKz - (pi1 * SKx + pi2 * SKy)

 def S(K):
@ -110,7 +163,7 @@ def target_function(X):
    return X[:,0]**3 + 2.1*X[:,1]**2 + X[:,0] + 0.1


-# X = np.random.rand(10,3)
+# X = np.random.rand(14,3)
 # X /= X.sum(axis=1, keepdims=True)
 # Y = np.random.rand(10,3)
 # Y /= Y.sum(axis=1, keepdims=True)
@ -122,27 +175,99 @@ def target_function(X):
 #
 # print(d)
 #
+# d = RJSDk(X, X)
+#
+# print(d)
+#
 # import sys ; sys.exit(0)

-X_train = [np.random.rand(10*3) for _ in range(15)]
-y_train = [target_function(X).mean() for X in X_train]
-
-X_test = [np.random.rand(10*3) for _ in range(11)]
-y_test = [target_function(X).mean() for X in X_test]
-
-
-print('fit')
-#kernel = 1 * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2))
+# X_train = [np.random.rand(10*3) for _ in range(50)]
+# y_train = [target_function(X).mean() for X in X_train]
+#
+# X_test = [np.random.rand(10*3) for _ in range(20)]
+# y_test = [target_function(X).mean() for X in X_test]
+#
+#
+# print('fit')
+# # kernel = 1 * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2))
 # kernel = MinL2Kernel()
-kernel = RJSDkernel()
-gaussian_process = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9)
-gaussian_process.fit(X_train, y_train)
-print('[done]')
+# # kernel = RJSDkernel()
+# gaussian_process = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9)
+# gaussian_process.fit(X_train, y_train)
+# print('[done]')
+#
+# print(gaussian_process.kernel_)
+#
+# y_pred = gaussian_process.predict(X_test)
+#
+# mse = np.mean((y_test - y_pred)**2)
+#
+# print(mse)

-print(gaussian_process.kernel_)
+class GPQuantifier(BaseQuantifier):

-y_pred = gaussian_process.predict(X_test)
+    def __init__(self, dimensions, kernel, num_tr_samples=20, size_tr_samples=50):
+        self.gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9)  # regressor trained in fit()
+        self.size_tr_samples = size_tr_samples  # passed to UPP as sample_size in fit()
+        self.num_tr_samples = num_tr_samples  # passed to UPP as repeats in fit()
+        self.dimensions = dimensions  # stored only; fit() and quantify() do not read it

-mse = np.mean((y_test - y_pred)**2)
+    def fit(self, data: LabelledCollection):
+        """Fit the GP on samples drawn by UPP: flattened sample -> entry 1 of its prevalence vector."""
+        protocol = UPP(data, sample_size=self.size_tr_samples, repeats=self.num_tr_samples)
+        samples, prevalences = zip(*protocol())
+        flat_samples = [sample.flatten() for sample in samples]
+        self.gp.fit(flat_samples, [prev[1] for prev in prevalences])
+        return self

-print(mse)
+    def quantify(self, instances):
+        """Predict with the GP on the flattened `instances` and pass the value to F.as_binary_prevalence."""
+        predicted = self.gp.predict([instances.flatten()])
+        return F.as_binary_prevalence(predicted[0], clip_if_necessary=True)
+
+import quapy as qp
+
+from quapy.data.datasets import fetch_UCIBinaryDataset, UCI_BINARY_DATASETS
+
+table = Table('avel2')
+methodnames = ['AveL2','PACC', 'SLD', 'KDEyML']
+# For every method, evaluate on each UCI binary dataset; per-dataset absolute errors are cached as pickles.
+for methodname in methodnames:
+    errors = []
+    for dataset_name in UCI_BINARY_DATASETS:
+        if dataset_name in ['balance.2']:
+            continue
+
+        result_path = f'./results_gp/{dataset_name}_{methodname}.pkl'
+        os.makedirs(Path(result_path).parent, exist_ok=True)
+        if os.path.exists(result_path):  # reuse the errors cached by a previous run
+            aes = pickle.loads(Path(result_path).read_bytes())  # read_bytes closes the file handle
+        else:
+            dataset = fetch_UCIBinaryDataset(dataset_name)
+            qp.data.preprocessing.standardize(dataset, inplace=True)
+            train, test = dataset.train_test
+            d = train.X.shape[1]
+            if methodname=='AveL2':
+                q = GPQuantifier(dimensions=d, kernel=AveL2Kernel(dimensions=d), num_tr_samples=150, size_tr_samples=100)
+            elif methodname=='PACC':
+                q = PACC(LogisticRegression())
+            elif methodname=='SLD':
+                q = EMQ(LogisticRegression())
+            elif methodname=='KDEyML':
+                q = KDEyML(LogisticRegression(), bandwidth=0.05)
+            else:
+                raise ValueError(f'unknown method {methodname}')
+            q.fit(train)
+            aes = qp.evaluation.evaluate(q, UPP(test, sample_size=100), error_metric='ae', verbose=False)
+            # serialise in memory first so a failed dump cannot leave a truncated cache file behind
+            Path(result_path).write_bytes(pickle.dumps(aes, pickle.HIGHEST_PROTOCOL))
+        mae = np.mean(aes)
+        print(f'{dataset_name}\t{mae:.4f}')
+        errors.append(mae)
+        table.add(dataset_name, methodname, aes)
+
+    print(f'\nmean={np.mean(errors):.5f}')  # per method: `errors` is reset at the top of each method iteration
+
+table.format.show_std=False
+table.format.mean_prec=4
+table.LatexPDF('./table_gp/gp.pdf', tables=[table], resizebox=False)
--- a/ClassifierAccuracy/models_multiclass.py
+++ b/ClassifierAccuracy/models_multiclass.py
@ -549,6 +549,16 @@ class EmptySafeQuantifier(BaseQuantifier):
    def num_non_empty_classes(self):
        return len(self.old_class_idx)

+    def get_params(self, deep=True):
+        return self.surrogate.get_params(deep=deep)  # delegates to self.surrogate, forwarding `deep`
+
+    def set_params(self, **params):
+        return self.surrogate.set_params(**params)  # NOTE(review): returns whatever self.surrogate.set_params returns (presumably the surrogate, not this wrapper) -- confirm no caller chains on it
+
+class EmptySafeAggregativeQuantifier(AggregativeQuantifier, EmptySafeQuantifier):
+    """Combination of AggregativeQuantifier and EmptySafeQuantifier; defines no members of its own yet."""
+    pass  # explicit body: the class statement previously had no suite, which is a syntax error
+

 # Baselines:
 class ATC(ClassifierAccuracyPrediction):
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@ -20,8 +20,11 @@ TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders',
 TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders',
                                 'semeval', 'semeval16',
                                 'sst', 'wa', 'wb']
-UCI_BINARY_DATASETS = ['acute.a', 'acute.b',
-                'balance.1', 'balance.2', 'balance.3',
+UCI_BINARY_DATASETS = [
+                #'acute.a', 'acute.b',
+                'balance.1',
+                #'balance.2',
+                'balance.3',
                'breast-cancer',
                'cmc.1', 'cmc.2', 'cmc.3',
                'ctg.1', 'ctg.2', 'ctg.3',