1
0
Fork 0

adding sample_weight to ordinal-aware classifiers

This commit is contained in:
Alejandro Moreo Fernandez 2022-03-10 18:28:49 +01:00
parent b2e161480e
commit ad64dfe2a0
5 changed files with 66 additions and 36 deletions

View File

@ -14,13 +14,9 @@ outpath = f'./tables/{domain}/{prot}/results.tex'
resultpath = join('./results', domain, prot)
methods = [qname for qname, *_ in quantifiers()]
methods += [m+'-r' for m in methods]
# methods += [m+'-r' for m in methods]
table = Table(benchmarks=['low', 'mid', 'high'],
methods=methods,
prec_mean=4,
show_std=True,
prec_std=4)
table = Table(benchmarks=['low', 'mid', 'high', 'all'], methods=methods, prec_mean=4, show_std=True, prec_std=4)
for resultfile in glob(f'{resultpath}/*.csv'):
@ -29,6 +25,7 @@ for resultfile in glob(f'{resultpath}/*.csv'):
resultname = Path(resultfile).name
method, drift, *other = resultname.replace('.csv', '').split('.')
if other:
continue
method += '-r'
table.add(drift, method, nmd)
@ -37,9 +34,9 @@ os.makedirs(Path(outpath).parent, exist_ok=True)
tabular = """
\\resizebox{\\textwidth}{!}{%
\\begin{tabular}{|c||""" + ('c|' * (table.nbenchmarks+1)) + """} \hline
\\begin{tabular}{|c||""" + ('c|' * (table.nbenchmarks)) + """} \hline
"""
tabular += table.latexTabularT()
tabular += table.latexTabularT(average=False)
tabular += """
\end{tabular}%
}"""

View File

@ -1,11 +1,10 @@
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
import numpy as np
from Ordinal.model import OrderedLogisticRegression, StackedClassifier, RegressionQuantification, RegressorClassifier
from Ordinal.model import OrderedLogisticRegression, StackedClassifier, RegressionQuantification, RegressorClassifier, \
LogisticAT
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC, SLD, HDy
from quapy.data import LabelledCollection
from os.path import join
@ -18,6 +17,14 @@ from tqdm import tqdm
import mord
#TODO:
# Ordinal LR, LAD -> balance sample_weight
# use BERT to extract features
# other domains? Kitchen, Electronics...
# try with the inverse of the distance
# add drift='all'
def load_test_samples():
ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy'))
ids = set(ids)
@ -34,22 +41,11 @@ def load_dev_samples():
yield sample.instances, sample.prevalence()
class LAD(mord.LAD):
def fit(self, X, y):
self.classes_ = sorted(np.unique(y))
return super().fit(X, y)
class OrdinalRidge(mord.OrdinalRidge):
def fit(self, X, y):
self.classes_ = sorted(np.unique(y))
return super().fit(X, y)
def quantifiers():
params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
params_OLR = {'alpha':np.logspace(-3, 3, 7)}
params_SVR = {'C': np.logspace(-3,3,7)}
# params_OLR = {'alpha':np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
params_OLR = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
params_SVR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
# params_SVR = {'C': np.logspace(0, 1, 2)}
# baselines
@ -62,12 +58,12 @@ def quantifiers():
# with order-aware classifiers
# threshold-based ordinal regression (see https://pythonhosted.org/mord/)
yield 'CC(OLR-AT)', CC(mord.LogisticAT()), params_OLR
yield 'PCC(OLR-AT)', PCC(mord.LogisticAT()), params_OLR
yield 'ACC(OLR-AT)', ACC(mord.LogisticAT()), params_OLR
yield 'PACC(OLR-AT)', PACC(mord.LogisticAT()), params_OLR
yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR
yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR
yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR
yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR
#yield 'HDy(OLR-AT)', HDy(mord.LogisticAT()), params_OLR
yield 'SLD(OLR-AT)', EMQ(mord.LogisticAT()), params_OLR
yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR
# other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.)
# regression-based ordinal regression (see https://pythonhosted.org/mord/)
@ -75,6 +71,7 @@ def quantifiers():
# the other implementation has OrdinalRidge(alpha=1.0) and LAD(C=1.0), used through my wrapper classes that add the classes_ attribute;
# those implement neither predict_proba nor decision_function
yield 'CC(SVR)', CC(RegressorClassifier()), params_SVR
yield 'CC-bal(SVR)', CC(RegressorClassifier()), params_SVR
# yield 'PCC(SVR)', PCC(RegressorClassifier()), params_SVR
# yield 'PCC-cal(SVR)', PCC(RegressorClassifier()), params_SVR
# yield 'ACC(SVR)', ACC(RegressorClassifier()), params_SVR
@ -137,7 +134,7 @@ if __name__ == '__main__':
train = pickle.load(open(join(datapath, domain, 'training_data.pkl'), 'rb'))
with open(join(resultpath, 'hyper.txt'), 'at') as foo:
for drift in ['low', 'mid', 'high']:
for drift in ['low', 'mid', 'high', 'all']:
params = [(*qs, drift) for qs in quantifiers()]
hypers = qp.util.parallel(run_experiment, params, n_jobs=-2)
for h in hypers:

View File

@ -10,6 +10,8 @@ from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR, SVR
from statsmodels.miscmodels.ordinal_model import OrderedModel
import mord
from sklearn.utils.class_weight import compute_class_weight
class OrderedLogisticRegression:
@ -134,15 +136,20 @@ class RegressionQuantification:
class RegressorClassifier(BaseEstimator, ClassifierMixin):
def __init__(self, C=1.0):
def __init__(self, C=1.0, class_weight=None):
self.C = C
self.class_weight = class_weight
def fit(self, X, y):
def fit(self, X, y, sample_weight=None):
self.regressor = LinearSVR(C=self.C)
# self.regressor = SVR()
# self.regressor = Ridge(normalize=True)
self.nclasses = len(np.unique(y))
self.regressor.fit(X, y)
classes = sorted(np.unique(y))
self.nclasses = len(classes)
if self.class_weight == 'balanced':
class_weight = compute_class_weight('balanced', classes=classes, y=y)
sample_weight = class_weight[y]
self.regressor.fit(X, y, sample_weight=sample_weight)
return self
def predict(self, X):
@ -179,3 +186,28 @@ class RegressorClassifier(BaseEstimator, ClassifierMixin):
self.C = params['C']
class LogisticAT(mord.LogisticAT):
    """``mord.LogisticAT`` with an optional ``class_weight='balanced'`` mode.

    Parameters
    ----------
    alpha : float, default=1.0
        Forwarded unchanged to ``mord.LogisticAT.__init__``.
    class_weight : {None, 'balanced'}, default=None
        With ``'balanced'``, :meth:`fit` derives one weight per training
        sample from ``compute_class_weight('balanced', ...)``. With ``None``,
        the ``sample_weight`` given to :meth:`fit` is forwarded untouched.

    Raises
    ------
    ValueError
        If ``class_weight`` is neither ``None`` nor ``'balanced'``.
    """

    def __init__(self, alpha=1.0, class_weight=None):
        self._check_class_weight(class_weight)
        self.class_weight = class_weight
        super(LogisticAT, self).__init__(alpha=alpha)

    @staticmethod
    def _check_class_weight(class_weight):
        """Reject any ``class_weight`` other than ``None`` or ``'balanced'``."""
        # An explicit raise (instead of ``assert``) survives ``python -O``.
        if class_weight not in (None, 'balanced'):
            raise ValueError('unexpected value for class_weight')

    def fit(self, X, y, sample_weight=None):
        """Fit the parent model, optionally re-weighting samples by class.

        Parameters
        ----------
        X : passed through to ``mord.LogisticAT.fit``.
        y : array-like of class labels, one per row of ``X``.
        sample_weight : array-like or None, default=None
            Optional per-sample weights. When ``class_weight == 'balanced'``
            they are multiplied by the balanced class weights; otherwise they
            are forwarded as given.

        Returns
        -------
        Whatever ``mord.LogisticAT.fit`` returns.
        """
        # Re-validate here: ``set_params`` assigns attributes directly and
        # never goes through ``__init__``.
        self._check_class_weight(self.class_weight)

        if self.class_weight == 'balanced':
            labels = np.asarray(y)
            # ``label_pos[i]`` is the position of ``labels[i]`` inside
            # ``classes``. Indexing the weight vector with positions (rather
            # than with the raw labels) stays correct when labels are not
            # exactly 0..K-1, e.g. ratings in 1..5.
            classes, label_pos = np.unique(labels, return_inverse=True)
            per_class = compute_class_weight('balanced', classes=classes, y=labels)
            balanced = per_class[label_pos.reshape(-1)]
            if sample_weight is not None:
                # Combine with the caller's weights instead of discarding them.
                balanced = balanced * np.asarray(sample_weight)
            sample_weight = balanced

        return super(LogisticAT, self).fit(X, y, sample_weight=sample_weight)
class LAD(mord.LAD):
    """``mord.LAD`` variant that records the label set on the estimator.

    NOTE(review): presumably ``classes_`` is needed by downstream code that
    expects classifier-like estimators -- confirm against the callers.
    """

    def fit(self, X, y):
        """Store the sorted distinct labels in ``classes_``, then fit the parent.

        Returns whatever ``mord.LAD.fit`` returns.
        """
        distinct_labels = list(np.unique(y))
        distinct_labels.sort()
        self.classes_ = distinct_labels
        return super(LAD, self).fit(X, y)
class OrdinalRidge(mord.OrdinalRidge):
    """``mord.OrdinalRidge`` variant that records the label set on the estimator.

    NOTE(review): presumably ``classes_`` is needed by downstream code that
    expects classifier-like estimators -- confirm against the callers.
    """

    def fit(self, X, y):
        """Store the sorted distinct labels in ``classes_``, then fit the parent.

        Returns whatever ``mord.OrdinalRidge.fit`` returns.
        """
        distinct_labels = list(np.unique(y))
        distinct_labels.sort()
        self.classes_ = distinct_labels
        return super(OrdinalRidge, self).fit(X, y)

View File

@ -19,15 +19,19 @@ def partition_by_drift(split, training_prevalence):
order = np.argsort(drifts)
nD = len(order)
low_drift, mid_drift, high_drift = order[:nD // 3], order[nD // 3:2 * nD // 3], order[2 * nD // 3:]
all_drift = np.arange(nD)
np.save(join(datapath, domain, 'app', f'lowdrift.{split}.id.npy'), low_drift)
np.save(join(datapath, domain, 'app', f'middrift.{split}.id.npy'), mid_drift)
np.save(join(datapath, domain, 'app', f'highdrift.{split}.id.npy'), high_drift)
np.save(join(datapath, domain, 'app', f'alldrift.{split}.id.npy'), all_drift)
lows = drifts[low_drift]
mids = drifts[mid_drift]
highs = drifts[high_drift]
all = drifts[all_drift]
print(f'low drift: interval [{lows.min():.4f}, {lows.max():.4f}] mean: {lows.mean():.4f}')
print(f'mid drift: interval [{mids.min():.4f}, {mids.max():.4f}] mean: {mids.mean():.4f}')
print(f'high drift: interval [{highs.min():.4f}, {highs.max():.4f}] mean: {highs.mean():.4f}')
print(f'all drift: interval [{all.min():.4f}, {all.max():.4f}] mean: {all.mean():.4f}')
domain = 'Books-tfidf'

View File

@ -284,7 +284,7 @@ class Table:
if average:
tab += ' & '
tab += self.average.latexCell('ave', row)
tab += '\\\\\hline\n'
tab += '\\\\\hline\n'
return tab
def latexRow(self, benchmark, endl='\\\\\hline\n'):