forked from moreo/QuaPy

more stuff that does not work

This commit is contained in:
parent adfa235cce
commit e6e8ed87fd
@@ -9,15 +9,22 @@ from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import GridSearchCV
 import quapy as qp
+from Transduction_office.grid_naive_quantif import GridQuantifier, binned_indexer, Indexer, GridQuantifier2, \
+    classifier_indexer
 from Transduction_office.pykliep import DensityRatioEstimator
-from quapy.protocol import AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol
+from method.non_aggregative import MLPE
+from quapy.protocol import AbstractStochasticSeededProtocol, OnLabelledCollectionProtocol, UPP
 from quapy.data import LabelledCollection
 from quapy.method.aggregative import *
 import quapy.functional as F
 from time import time
+from scipy.spatial.distance import cdist
 
 
-def gaussian(mean, cov=1., label=0, size=100, random_state=0):
+plottting = False
+
+
+def gaussian(mean, cov=0.1, label=0, size=100, random_state=0):
     """
     Creates a label collection in which the instances are distributed according to a Gaussian with specified
     parameters and labels all data points with a specific label.
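The hunk above tightens `gaussian`'s default covariance from 1.0 to 0.1 but elides the function body (the next hunk starts at its `return`). For reference, a minimal sketch of what the signature and docstring imply; the name `gaussian_sketch` and the isotropic covariance `cov * I` are assumptions, not taken from the diff:

import numpy as np
from quapy.data import LabelledCollection

def gaussian_sketch(mean, cov=0.1, label=0, size=100, random_state=0):
    # draw `size` points from N(mean, cov*I) and tag them all with `label`
    mean = np.asarray(mean, dtype=float)
    rng = np.random.default_rng(random_state)
    instances = rng.multivariate_normal(mean, cov * np.eye(len(mean)), size=size)
    return LabelledCollection(instances, labels=[label] * size)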
@@ -38,6 +45,36 @@ def gaussian(mean, cov=1., label=0, size=100, random_state=0):
     return LabelledCollection(instances, labels=[label]*size)
 
 
+def _internal_plot(train, val, test):
+    if plottting:
+        xmin = min(train.X[:, 0].min(), val.X[:, 0].min(), test[:, 0].min())
+        xmax = max(train.X[:, 0].max(), val.X[:, 0].max(), test[:, 0].max())
+        ymin = min(train.X[:, 1].min(), val.X[:, 1].min(), test[:, 1].min())
+        ymax = max(train.X[:, 1].max(), val.X[:, 1].max(), test[:, 1].max())
+        plot(train, 'sel_train.png', xlim=(xmin, xmax), ylim=(ymin, ymax))
+        plot(val, 'sel_val.png', xlim=(xmin, xmax), ylim=(ymin, ymax))
+        plot(test, 'test.png', xlim=(xmin, xmax), ylim=(ymin, ymax))
+
+
+def plot(data: LabelledCollection, path, xlim=None, ylim=None):
+    import matplotlib.pyplot as plt
+    plt.clf()
+    if isinstance(data, LabelledCollection):
+        if data.instances.shape[1] != 2:
+            return
+        negative, positive = data.separate()
+        plt.scatter(negative.X[:,0], negative.X[:,1], label='neg', alpha=0.5)
+        plt.scatter(positive.X[:, 0], positive.X[:, 1], label='pos', alpha=0.5)
+    else:
+        if data.shape[1] != 2:
+            return
+        plt.scatter(data[:, 0], data[:, 1], label='test', alpha=0.5)
+    if xlim is not None:
+        plt.xlim(*xlim)
+        plt.ylim(*ylim)
+    plt.legend()
+    plt.savefig(path)
+
+
 # ------------------------------------------------------------------------------------
 # Protocol for generating prior probability shift + covariate shift by mixing "domains"
 # ------------------------------------------------------------------------------------
@@ -62,7 +99,6 @@ class CovPriorShift(AbstractStochasticSeededProtocol):
         tentatives = 0
         while len(indexes) < self.repeats:
             alpha = F.uniform_simplex_sampling(n_classes=len(self.domains))
-            # sizes = np.asarray([round(len(lc_i) * alpha_i) for lc_i, alpha_i in zip(self.domains, alpha)])
             sizes = (alpha * self.sample_size).astype(int)
            if all(sizes > self.min_support):
                 indexes_i = [lc.sampling_index(size) for lc, size in zip(self.domains, sizes)]
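`CovPriorShift` mixes the domains with proportions drawn uniformly from the probability simplex: since the domains differ in P(X) (their Gaussian means) and each draw changes how much of each domain, and hence of each class, enters the sample, every generated sample exhibits covariate shift and prior probability shift at once. A standalone sketch of the size computation with a hypothetical helper, assuming only numpy (Dirichlet(1, ..., 1) is one way to sample uniformly from the simplex):

import numpy as np

def mixture_sizes(n_domains, sample_size, rng=None):
    # alpha ~ uniform over the simplex; equivalent in spirit to F.uniform_simplex_sampling
    rng = rng or np.random.default_rng(0)
    alpha = rng.dirichlet(np.ones(n_domains))
    # take a proportional share of the sample from each domain
    return (alpha * sample_size).astype(int)

sizes = mixture_sizes(n_domains=2, sample_size=500)
assert sizes.sum() <= 500  # flooring can lose a few instances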
@@ -185,6 +221,37 @@ class Random(ImportanceWeight):
     def weights(self, Xtr, ytr, Xte):
         return np.random.rand(len(Xtr))
 
 
+class MostSimilarK(ImportanceWeight):
+    # retains the training documents that are most similar on average to the k closest test points
+
+    def __init__(self, k):
+        self.k = k
+
+    def weights(self, Xtr, ytr, Xte):
+        distances = cdist(Xtr, Xte)
+        min_dist = np.min(distances)
+        max_dist = np.max(distances)
+        distances = (distances-min_dist)/(max_dist-min_dist)
+        similarities = 1 / (1+distances)
+        top_k_sim = np.sort(similarities, axis=1)[:,-self.k:]
+        ave_sim = np.mean(top_k_sim, axis=1)
+        return ave_sim
+
+
+class MostSimilarTest(ImportanceWeight):
+    # retains the training documents that are the most similar to at least one test document,
+    # i.e., for each test point, selects the k most similar training instances
+
+    def __init__(self, k=1):
+        self.k = k
+
+    def weights(self, Xtr, ytr, Xte):
+        distances = cdist(Xtr, Xte)
+        most_similar_idx = np.argsort(distances, axis=0)[:self.k, :].flatten()
+        weights = np.zeros(shape=Xtr.shape[0])
+        weights[most_similar_idx] = 1
+        return weights
+
+
 # --------------------------------------------------------------------------------------------
 # Quantification Methods that rely on Importance Weight for reweighting the training instances
 # --------------------------------------------------------------------------------------------
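A toy check of the two weighters added above (illustrative values, k=1 for brevity): `MostSimilarK` produces graded weights from the average similarity to the nearest test points, while `MostSimilarTest` produces a binary mask over the training set.

import numpy as np
from scipy.spatial.distance import cdist

Xtr = np.array([[0.4, 0.], [1., 0.], [10., 10.]])  # last training point is far from every test point
Xte = np.array([[0., 0.], [1.5, 0.]])

# MostSimilarK-style weights (k=1): graded, lowest for the outlier
d = cdist(Xtr, Xte)
d = (d - d.min()) / (d.max() - d.min())
sim = 1 / (1 + d)
print(np.mean(np.sort(sim, axis=1)[:, -1:], axis=1))

# MostSimilarTest-style weights (k=1): 1 only for each test point's nearest training instance
nearest = np.argsort(cdist(Xtr, Xte), axis=0)[:1, :].flatten()
w = np.zeros(Xtr.shape[0])
w[nearest] = 1
print(w)  # -> [1. 1. 0.]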
@@ -218,37 +285,71 @@ class ReweightingAggregative(TransductiveQuantifier):
 # Quantification Methods that rely on Importance Weight for selecting a validation partition
 # --------------------------------------------------------------------------------------------
 
 
-def select_from_weights(w, data: LabelledCollection, val_prop=0.4):
-    # w[w<1]=0
-    order = np.argsort(w)
-    split_point = int(len(w)*val_prop)
-    train_idx, val_idx = order[:-split_point], order[-split_point:]
-    return data.sampling_from_index(train_idx), data.sampling_from_index(val_idx)
-
-
-class SelectorQuantifiers(TransductiveQuantifier):
-
-    def __init__(self, classifier, weighter: ImportanceWeight, quantif_method=ACC, val_split=0.4):
+class SelectorQuantifiersTrainVal(TransductiveQuantifier):
+
+    def __init__(self, classifier, weighter: ImportanceWeight, quantif_method=ACC, val_split=0.4, only_positives=False):
         self.classifier = classifier
         self.weighter = weighter
         self.quantif_method = quantif_method
         self.val_split = val_split
+        self.only_positives = only_positives
 
     def quantify(self, instances):
         w = self.weighter.weights(*self.training.Xy, instances)
-        train, val = select_from_weights(w, self.training, self.val_split)
+        train, val = self.select_from_weights(w, self.training, self.val_split, self.only_positives)
+        _internal_plot(train, val, instances)
+        # print('\ttraining size', len(train), '\tval size', len(val))
         quantifier = self.quantif_method(self.classifier).fit(train, val_split=val)
         return quantifier.quantify(instances)
+
+    def select_from_weights(self, w, data: LabelledCollection, val_prop=0.4, only_positives=False):
+        order = np.argsort(w)
+        if only_positives:
+            val_prop = np.mean(w > 0)
+        split_point = int(len(w) * val_prop)
+        different_idx, similar_idx = order[:-split_point], order[-split_point:]
+        different, similar = data.sampling_from_index(different_idx), data.sampling_from_index(similar_idx)
+        # return different, similar
+        train, val = similar.split_stratified(0.6)
+        return train, val
+
+
+class SelectorQuantifiersTrain(TransductiveQuantifier):
+
+    def __init__(self, classifier, weighter: ImportanceWeight, quantif_method=ACC, only_positives=False):
+        self.classifier = classifier
+        self.weighter = weighter
+        self.quantif_method = quantif_method
+        self.only_positives = only_positives
+
+    def quantify(self, instances):
+        w = self.weighter.weights(*self.training.Xy, instances)
+        train = self.select_from_weights(w, self.training, select_prop=None, only_positives=self.only_positives)
+        # _internal_plot(train, None, instances)
+        # print('\ttraining size', len(train))
+        quantifier = self.quantif_method(self.classifier).fit(train)
+        return quantifier.quantify(instances)
+
+    def select_from_weights(self, w, data: LabelledCollection, select_prop=0.5, only_positives=False):
+        order = np.argsort(w)
+        if only_positives:
+            select_prop = np.mean(w > 0)
+        split_point = int(len(w) * select_prop)
+        different_idx, similar_idx = order[:-split_point], order[-split_point:]
+        different, similar = data.sampling_from_index(different_idx), data.sampling_from_index(similar_idx)
+        return similar
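Both selector classes share the same argsort-and-split step: sort the training points by importance weight and keep the top `val_prop` (or `select_prop`) fraction as the "similar" partition. A plain-array sketch of the arithmetic, with hypothetical weight values:

import numpy as np

w = np.array([0.9, 0.1, 0.8, 0.2, 0.7])   # importance weights for 5 training points
val_prop = 0.4
order = np.argsort(w)                      # ascending: least test-like first
split_point = int(len(w) * val_prop)       # keep the top 40% as "similar"
different_idx, similar_idx = order[:-split_point], order[-split_point:]
print(different_idx, similar_idx)          # -> [1 3 4] [2 0]

With `only_positives=True` the kept fraction becomes `np.mean(w > 0)`, i.e., exactly the share of training points with non-zero weight, which matches the binary weights produced by `MostSimilarTest`.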
 
 
 if __name__ == '__main__':
     qp.environ['SAMPLE_SIZE'] = 500
 
-    dA_l0 = gaussian(mean=[0,0], label=0, size=1000)
-    dA_l1 = gaussian(mean=[1,0], label=1, size=1000)
-    dB_l0 = gaussian(mean=[0,1], label=0, size=1000)
-    dB_l1 = gaussian(mean=[1,1], label=1, size=1000)
+    dA_l0 = gaussian(mean=[0,0], label=0, size=5000)
+    dA_l1 = gaussian(mean=[1,0], label=1, size=5000)
+    dB_l0 = gaussian(mean=[0,1], label=0, size=5000)
+    dB_l1 = gaussian(mean=[1,1], label=1, size=5000)
 
     dA = LabelledCollection.join(dA_l0, dA_l1)
     dB = LabelledCollection.join(dB_l0, dB_l1)
@@ -258,42 +359,62 @@ if __name__ == '__main__':
 
     train = LabelledCollection.join(dA_train, dB_train)
 
+    plot(train, 'train.png')
+
     def lr():
         return LogisticRegression()
 
-    # def lr():
-    #     return GridSearchCV(
-    #         LogisticRegression(),
-    #         param_grid={'C':np.logspace(-3,3,7), 'class_weight': ['balanced', None]},
-    #         n_jobs=-1
-    #     )
 
 
+    # EMQ.MAX_ITER*=10
+    # val_split = 0.5
+    k_sim = 10
+    Q=ACC
     methods = [
+        ('MLPE', MLPE()),
         ('CC', CC(lr())),
         ('PCC', PCC(lr())),
         ('ACC', ACC(lr())),
         ('PACC', PACC(lr())),
-        ('HDy', EMQ(lr())),
+        ('HDy', HDy(lr())),
         ('EMQ', EMQ(lr())),
-        ('Sel-ACC', SelectorQuantifiers(lr(), MostTest(), ACC)),
-        ('Sel-PACC', SelectorQuantifiers(lr(), MostTest(), PACC)),
-        ('Sel-HDy', SelectorQuantifiers(lr(), MostTest(), HDy)),
-        ('LogReg-CC', ReweightingAggregative(lr(), LogReg(), CC)),
-        ('LogReg-PCC', ReweightingAggregative(lr(), LogReg(), PCC)),
-        ('LogReg-EMQ', ReweightingAggregative(lr(), LogReg(), EMQ)),
-        # ('KLIEP-CC', TransductiveAggregative(lr(), KLIEP(), CC)),
-        # ('KLIEP-PCC', TransductiveAggregative(lr(), KLIEP(), PCC)),
-        # ('KLIEP-EMQ', TransductiveAggregative(lr(), KLIEP(), EMQ)),
-        # ('SILF-CC', TransductiveAggregative(lr(), USILF(), CC)),
-        # ('SILF-PCC', TransductiveAggregative(lr(), USILF(), PCC)),
-        # ('SILF-EMQ', TransductiveAggregative(lr(), USILF(), EMQ))
+        ('GridQ', GridQuantifier2(classifier=lr())),
+        # ('GridQ', GridQuantifier(Indexer(binned_indexer(train.X, nbins_by_dim=2)), cell_quantifier=Q(lr()))),
+        # ('GridQ', GridQuantifier(Indexer(binned_indexer(train.X, nbins_by_dim=4)), cell_quantifier=Q(lr()))),
+        # ('GridQ', GridQuantifier(Indexer(binned_indexer(train.X, nbins_by_dim=6)), cell_quantifier=Q(lr()))),
+        # ('GridQ', GridQuantifier(Indexer(binned_indexer(train.X, nbins_by_dim=8)), cell_quantifier=Q(lr()))),
+        # ('GridQ', GridQuantifier(Indexer(binned_indexer(train.X, nbins_by_dim=10)), cell_quantifier=Q(lr()))),
+        # ('GridQ', GridQuantifier(Indexer(binned_indexer(train.X, nbins_by_dim=20)), cell_quantifier=Q(lr()))),
+        # ('kSim-ACC', SelectorQuantifiers(lr(), MostSimilar(k_sim), ACC, val_split=val_split)),
+        # ('kSim-PACC', SelectorQuantifiers(lr(), MostSimilar(k_sim), PACC, val_split=val_split)),
+        # ('kSim-HDy', SelectorQuantifiers(lr(), MostSimilar(k_sim), HDy, val_split=val_split)),
+        # ('Sel-CC', SelectorQuantifiersTrain(lr(), MostSimilarTest(k=k_sim), CC, only_positives=True)),
+        # ('Sel-PCC', SelectorQuantifiersTrain(lr(), MostSimilarTest(k=k_sim), PCC, only_positives=True)),
+        # ('Sel-ACC', SelectorQuantifiersTrainVal(lr(), MostSimilarTest(k=k_sim), ACC, only_positives=True)),
+        # ('Sel-PACC', SelectorQuantifiersTrainVal(lr(), MostSimilarTest(k=k_sim), PACC, only_positives=True)),
+        # ('Sel-HDy', SelectorQuantifiersTrainVal(lr(), MostSimilarTest(k=k_sim), HDy, only_positives=True)),
+        # ('Sel-EMQ', SelectorQuantifiersTrain(lr(), MostSimilarTest(k=k_sim), EMQ, only_positives=True)),
+        # ('Sel-EMQ', SelectorQuantifiersTrainVal(lr(), USILF(), PACC, only_positives=False)),
+        # ('Sel-PACC', SelectorQuantifiers(lr(), MostTest(), PACC)),
+        # ('Sel-HDy', SelectorQuantifiers(lr(), MostTest(), HDy)),
+        # ('LogReg-CC', ReweightingAggregative(lr(), LogReg(), CC)),
+        # ('LogReg-PCC', ReweightingAggregative(lr(), LogReg(), PCC)),
+        # ('LogReg-EMQ', ReweightingAggregative(lr(), LogReg(), EMQ)),
+        # ('KLIEP-CC', ReweightingAggregative(lr(), KLIEP(), CC)),
+        # ('KLIEP-PCC', ReweightingAggregative(lr(), KLIEP(), PCC)),
+        # ('KLIEP-EMQ', ReweightingAggregative(lr(), KLIEP(), EMQ)),
+        # ('SILF-CC', ReweightingAggregative(lr(), USILF(), CC)),
+        # ('SILF-PCC', ReweightingAggregative(lr(), USILF(), PCC)),
+        # ('SILF-EMQ', ReweightingAggregative(lr(), USILF(), EMQ))
     ]
 
     for name, model in methods:
-        with qp.util.temp_seed(1):
+        with qp.util.temp_seed(5):
+            # print('original training size', len(train))
             model.fit(train)
 
-        prot = CovPriorShift([dA_test, dB_test], repeats=10)
+        prot = CovPriorShift([dA_test, dB_test], repeats=1 if plottting else 150)
+        # prot = UPP(dA_test+dB_test, repeats=1 if plottting else 150)
         mae = qp.evaluation.evaluate(model, protocol=prot, error_metric='mae')
         print(f'{name}: {mae = :.4f}')
         # mrae = qp.evaluation.evaluate(model, protocol=prot, error_metric='mrae')
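For orientation, `qp.evaluation.evaluate` consumes the protocol by iterating its generated samples and averaging the per-sample error. A hand-rolled sketch of what that call computes, assuming `model` and `prot` as defined in the loop above and QuaPy protocols yielding (instances, true_prevalence) pairs:

import numpy as np
import quapy as qp

errors = []
for sample_instances, true_prev in prot():
    estim_prev = model.quantify(sample_instances)
    errors.append(qp.error.mae(true_prev, estim_prev))
print(np.mean(errors))  # should match the reported mae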
@@ -33,3 +33,5 @@ class MaximumLikelihoodPrevalenceEstimation(BaseQuantifier):
         """
         return self.estimated_prevalence
 
+
+MLPE = MaximumLikelihoodPrevalenceEstimation
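This last hunk aliases the baseline as `MLPE`, matching the new `from method.non_aggregative import MLPE` in the first hunk. The baseline simply memorizes the training prevalence and returns it for every test sample, as the `return self.estimated_prevalence` above shows. A quick check, assuming the class lives at QuaPy's usual `quapy.method.non_aggregative` path:

import numpy as np
from quapy.data import LabelledCollection
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation

train = LabelledCollection(np.random.rand(100, 2), labels=[0]*70 + [1]*30)
mlpe = MaximumLikelihoodPrevalenceEstimation()
mlpe.fit(train)
print(mlpe.quantify(np.random.rand(50, 2)))  # ~[0.7, 0.3], whatever the test sample contains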