adding sample_weight to ordinal-aware classifiers

2022-03-10 18:28:49 +01:00 · 2022-03-10 18:28:49 +01:00 · ad64dfe2a0
parent b2e161480e
commit ad64dfe2a0
5 changed files with 66 additions and 36 deletions
--- a/Ordinal/gen_tables.py
+++ b/Ordinal/gen_tables.py
@@ -14,13 +14,9 @@ outpath = f'./tables/{domain}/{prot}/results.tex'
 resultpath = join('./results', domain, prot)

 methods = [qname for qname, *_ in quantifiers()]
-methods += [m+'-r' for m in methods]
+# methods += [m+'-r' for m in methods]

-table = Table(benchmarks=['low', 'mid', 'high'],
-      methods=methods,
-      prec_mean=4,
-      show_std=True,
-      prec_std=4)
+table = Table(benchmarks=['low', 'mid', 'high', 'all'], methods=methods, prec_mean=4, show_std=True, prec_std=4)


 for resultfile in glob(f'{resultpath}/*.csv'):
@@ -29,6 +25,7 @@ for resultfile in glob(f'{resultpath}/*.csv'):
    resultname = Path(resultfile).name
    method, drift, *other = resultname.replace('.csv', '').split('.')
    if other:
+        continue
        method += '-r'

    table.add(drift, method, nmd)
@@ -37,9 +34,9 @@ os.makedirs(Path(outpath).parent, exist_ok=True)

 tabular = """
    \\resizebox{\\textwidth}{!}{%
-            \\begin{tabular}{|c||""" + ('c|' * (table.nbenchmarks+1)) + """} \hline
+            \\begin{tabular}{|c||""" + ('c|' * (table.nbenchmarks)) + """} \hline
            """
-tabular += table.latexTabularT()
+tabular += table.latexTabularT(average=False)
 tabular += """
    \end{tabular}%
    }"""
--- a/Ordinal/main.py
+++ b/Ordinal/main.py
@@ -1,11 +1,10 @@
-import itertools
-
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 import quapy as qp
 import numpy as np

-from Ordinal.model import OrderedLogisticRegression, StackedClassifier, RegressionQuantification, RegressorClassifier
+from Ordinal.model import OrderedLogisticRegression, StackedClassifier, RegressionQuantification, RegressorClassifier, \
+    LogisticAT
 from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC, SLD, HDy
 from quapy.data import LabelledCollection
 from os.path import join
@@ -18,6 +17,14 @@ from tqdm import tqdm
 import mord


+#TODO:
+# Ordinal LR, LAD -> balance sample_weight
+# use BERT to extract features
+# other domains? Kitchen, Electronics...
+# try with the inverse of the distance
+# add drift='all'
+
+
 def load_test_samples():
    ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy'))
    ids = set(ids)
@@ -34,22 +41,11 @@ def load_dev_samples():
        yield sample.instances, sample.prevalence()


-class LAD(mord.LAD):
-    def fit(self, X, y):
-        self.classes_ = sorted(np.unique(y))
-        return super().fit(X, y)
-
-
-class OrdinalRidge(mord.OrdinalRidge):
-    def fit(self, X, y):
-        self.classes_ = sorted(np.unique(y))
-        return super().fit(X, y)
-
-
 def quantifiers():
    params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
-    params_OLR = {'alpha':np.logspace(-3, 3, 7)}
-    params_SVR = {'C': np.logspace(-3,3,7)}
+    # params_OLR = {'alpha':np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
+    params_OLR = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
+    params_SVR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
    # params_SVR = {'C': np.logspace(0, 1, 2)}

    # baselines
@@ -62,12 +58,12 @@ def quantifiers():

    # with order-aware classifiers
    # threshold-based ordinal regression (see https://pythonhosted.org/mord/)
-    yield 'CC(OLR-AT)', CC(mord.LogisticAT()), params_OLR
-    yield 'PCC(OLR-AT)', PCC(mord.LogisticAT()), params_OLR
-    yield 'ACC(OLR-AT)', ACC(mord.LogisticAT()), params_OLR
-    yield 'PACC(OLR-AT)', PACC(mord.LogisticAT()), params_OLR
+    yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR
+    yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR
+    yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR
+    yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR
    #yield 'HDy(OLR-AT)', HDy(mord.LogisticAT()), params_OLR
-    yield 'SLD(OLR-AT)', EMQ(mord.LogisticAT()), params_OLR
+    yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR
    # other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.)

    # regression-based ordinal regression (see https://pythonhosted.org/mord/) 
@@ -75,6 +71,7 @@ def quantifiers():
    # the other implementation has OrdinalRidge(alpha=1.0) and LAD(C=1.0) with my wrapper classes for having the nclasses_; those do
    # not implement predict_proba nor decision_score
    yield 'CC(SVR)', CC(RegressorClassifier()), params_SVR
+    yield 'CC-bal(SVR)', CC(RegressorClassifier()), params_SVR
    # yield 'PCC(SVR)', PCC(RegressorClassifier()), params_SVR
    # yield 'PCC-cal(SVR)', PCC(RegressorClassifier()), params_SVR
    # yield 'ACC(SVR)', ACC(RegressorClassifier()), params_SVR
@@ -137,7 +134,7 @@ if __name__ == '__main__':
    train = pickle.load(open(join(datapath, domain, 'training_data.pkl'), 'rb'))

    with open(join(resultpath, 'hyper.txt'), 'at') as foo:
-        for drift in ['low', 'mid', 'high']:
+        for drift in ['low', 'mid', 'high', 'all']:
            params = [(*qs, drift) for qs in quantifiers()]
            hypers = qp.util.parallel(run_experiment, params, n_jobs=-2)
            for h in hypers:
--- a/Ordinal/model.py
+++ b/Ordinal/model.py
@@ -10,6 +10,8 @@ from sklearn.multioutput import MultiOutputRegressor
 from sklearn.preprocessing import StandardScaler
 from sklearn.svm import LinearSVR, SVR
 from statsmodels.miscmodels.ordinal_model import OrderedModel
+import mord
+from sklearn.utils.class_weight import compute_class_weight


 class OrderedLogisticRegression:
@@ -134,15 +136,20 @@ class RegressionQuantification:


 class RegressorClassifier(BaseEstimator, ClassifierMixin):
-    def __init__(self, C=1.0):
+    def __init__(self, C=1.0, class_weight=None):
        self.C = C
+        self.class_weight = class_weight

-    def fit(self, X, y):
+    def fit(self, X, y, sample_weight=None):
        self.regressor = LinearSVR(C=self.C)
        # self.regressor = SVR()
        # self.regressor = Ridge(normalize=True)
-        self.nclasses = len(np.unique(y))
-        self.regressor.fit(X, y)
+        classes = sorted(np.unique(y))
+        self.nclasses = len(classes)
+        if self.class_weight == 'balanced':
+            class_weight = compute_class_weight('balanced', classes=classes, y=y)
+            sample_weight = class_weight[y]
+        self.regressor.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
@@ -179,3 +186,28 @@ class RegressorClassifier(BaseEstimator, ClassifierMixin):
        self.C = params['C']


+class LogisticAT(mord.LogisticAT):
+    def __init__(self, alpha=1.0, class_weight=None):
+        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
+        self.class_weight = class_weight
+        super(LogisticAT, self).__init__(alpha=alpha)
+
+    def fit(self, X, y, sample_weight=None):
+        if self.class_weight == 'balanced':
+            classes = sorted(np.unique(y))
+            class_weight = compute_class_weight('balanced', classes=classes, y=y)
+            sample_weight = class_weight[y]
+        return super(LogisticAT, self).fit(X, y, sample_weight=sample_weight)
+
+
+class LAD(mord.LAD):
+    def fit(self, X, y):
+        self.classes_ = sorted(np.unique(y))
+        return super().fit(X, y)
+
+
+class OrdinalRidge(mord.OrdinalRidge):
+    def fit(self, X, y):
+        self.classes_ = sorted(np.unique(y))
+        return super().fit(X, y)
+
--- a/Ordinal/partition_dataset_by_shift.py
+++ b/Ordinal/partition_dataset_by_shift.py
@@ -19,15 +19,19 @@ def partition_by_drift(split, training_prevalence):
    order = np.argsort(drifts)
    nD = len(order)
    low_drift, mid_drift, high_drift = order[:nD // 3], order[nD // 3:2 * nD // 3], order[2 * nD // 3:]
+    all_drift = np.arange(nD)
    np.save(join(datapath, domain, 'app', f'lowdrift.{split}.id.npy'), low_drift)
    np.save(join(datapath, domain, 'app', f'middrift.{split}.id.npy'), mid_drift)
    np.save(join(datapath, domain, 'app', f'highdrift.{split}.id.npy'), high_drift)
+    np.save(join(datapath, domain, 'app', f'alldrift.{split}.id.npy'), all_drift)
    lows = drifts[low_drift]
    mids = drifts[mid_drift]
    highs = drifts[high_drift]
+    all = drifts[all_drift]
    print(f'low drift: interval [{lows.min():.4f}, {lows.max():.4f}] mean: {lows.mean():.4f}')
    print(f'mid drift: interval [{mids.min():.4f}, {mids.max():.4f}] mean: {mids.mean():.4f}')
    print(f'high drift: interval [{highs.min():.4f}, {highs.max():.4f}] mean: {highs.mean():.4f}')
+    print(f'all drift: interval [{all.min():.4f}, {all.max():.4f}] mean: {all.mean():.4f}')


 domain = 'Books-tfidf'
--- a/Ordinal/tabular.py
+++ b/Ordinal/tabular.py
@@ -284,7 +284,7 @@ class Table:
            if average:
                tab += ' & '
                tab += self.average.latexCell('ave', row)
-                tab += '\\\\\hline\n'
+            tab += '\\\\\hline\n'
        return tab

    def latexRow(self, benchmark, endl='\\\\\hline\n'):