kernel loss based on BCE

Alejandro Moreo Fernandez 2020-07-23 14:29:00 +02:00
parent be83411e25
commit acb38d4aae
3 changed files with 57 additions and 29 deletions

View File

@@ -81,7 +81,6 @@ def main(opt):
activation=nn.functional.relu,
dropout=0.5,
activate_last=True),
#norm=L2Norm()
).to(device)
cls = AuthorshipAttributionClassifier(

View File

@@ -27,6 +27,7 @@ class AuthorshipAttributionClassifier(nn.Module):
#batcher = TwoClassBatch(batch_size=batch_size, n_epochs=epochs, steps_per_epoch=X.shape[0]//batch_size)
batcher_val = Batch(batch_size=batch_size, n_epochs=epochs, shuffle=False)
criterion = torch.nn.CrossEntropyLoss().to(self.device)
savcriterion = torch.nn.BCEWithLogitsLoss().to(self.device)
optim = torch.optim.Adam(self.parameters(), lr=lr)
X, Xval, y, yval = train_test_split(X, y, test_size=val_prop, stratify=y)
@@ -53,15 +54,25 @@ class AuthorshipAttributionClassifier(nn.Module):
loss_attr_value = loss_attr.item()
if alpha < 1:
phi = F.normalize(phi)
# todo: optimize (only upper diagonal)
kernel = torch.matmul(phi, phi.T)
ideal_kernel = torch.as_tensor(1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)).to(self.device)
# choose a balanced number of positive (same author) and negative (different authors) pairs
idx1, idx2, sav_labels = choose_sav_pairs(yi, npairs=batch_size)
phi1 = phi[idx1]
phi2 = phi[idx2]
cross = torch.bmm(phi1.unsqueeze(1), phi2.unsqueeze(2)).squeeze()
loss_sav = savcriterion(cross.unsqueeze(0), torch.as_tensor(sav_labels).float().unsqueeze(0).to(self.device))
loss_sav_value = loss_sav.item()
# add a cross-entropy based criterion (instead of KTA -- let's see how it works)
## todo: optimize (only upper diagonal)
#kernel = torch.matmul(phi, phi.T)
#ideal_kernel = torch.as_tensor(1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)).to(self.device)
# todo: maybe the KALoss should take the class balance into consideration (negative pairs are
# far more likely than positive ones)
loss_sav = KernelAlignmentLoss(kernel, ideal_kernel)
loss_sav_value = loss_sav.item()
#loss_sav = KernelAlignmentLoss(kernel, ideal_kernel)
#loss_sav_value = loss_sav.item()
loss = loss_attr*alpha + loss_sav*(1.-alpha)
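A minimal standalone sketch of the new SAV criterion, assuming toy tensors and made-up pair indices in place of phi and the output of choose_sav_pairs; the pairwise scores fed to BCEWithLogitsLoss are just dot products between the two selected embedding rows:

# illustrative sketch only, not part of the commit
import torch
import torch.nn.functional as F

phi = F.normalize(torch.randn(8, 32), p=2, dim=-1)   # toy batch of 8 L2-normalized embeddings
idx1 = torch.tensor([0, 1, 2, 3])                     # hypothetical pair indices
idx2 = torch.tensor([4, 5, 6, 7])
sav_labels = torch.tensor([1., 0., 1., 0.])           # 1 = same author, 0 = different authors

scores = (phi[idx1] * phi[idx2]).sum(dim=-1)          # row-wise dot products, equivalent to the bmm above
loss_sav = torch.nn.BCEWithLogitsLoss()(scores, sav_labels)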
@@ -77,8 +88,7 @@ class AuthorshipAttributionClassifier(nn.Module):
f'attr-loss={np.mean(attr_losses):.5f} '
f'sav-loss={np.mean(sav_losses):.5f} '
f'val_loss={val_loss:.5f} '
f'patience={early_stop.patience}/{early_stop.patience_limit}'
)
f'patience={early_stop.patience}/{early_stop.patience_limit}')
# validation
self.eval()
@@ -126,6 +136,38 @@ class AuthorshipAttributionClassifier(nn.Module):
return self.ff(phi)
def choose_sav_pairs(y, npairs):
n = len(y)
y = y+1 # reindex from [0..n_classes-1] to [1..n_classes] for convenience
same_author = (np.outer(y, 1/y) == 1)
triu = np.triu_indices(n, k=1)
same_author_nodup = same_author[triu]
idxi, idxj = triu
posi, negi = idxi[same_author_nodup], idxi[~same_author_nodup]
posj, negj = idxj[same_author_nodup], idxj[~same_author_nodup]
num_pos = same_author_nodup.sum()
num_neg = len(same_author_nodup)-num_pos # == len(negj)
# balanced:
pos_take = np.random.choice(np.arange(num_pos), npairs//2, replace=num_pos < npairs//2)
posi, posj = posi[pos_take], posj[pos_take]
neg_take = np.random.choice(np.arange(num_neg), npairs//2, replace=num_neg < npairs//2)
negi, negj = negi[neg_take], negj[neg_take]
idx1 = np.concatenate([posi, negi])
idx2 = np.concatenate([posj, negj])
savlabels = np.array([1]*len(posi) + [0]*len(negi))
print(f'generated {len(posi)} pos and {len(negi)} neg pairs')
return idx1, idx2, savlabels
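A quick usage sketch of choose_sav_pairs with a hypothetical label vector: idx1[k] and idx2[k] index a document pair, and savlabels[k] is 1 exactly when the two documents share an author:

# illustrative usage only
import numpy as np

y_toy = np.array([0, 0, 0, 1, 1, 2])                  # hypothetical author labels
idx1, idx2, labels = choose_sav_pairs(y_toy, npairs=4)
assert all(labels == (y_toy[idx1] == y_toy[idx2]))     # positives are same-author pairs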
class SameAuthorClassifier(nn.Module):
def __init__(self, projector, num_authors, pad_index, pad_length=500, device='cpu'):
super(SameAuthorClassifier, self).__init__()
@@ -255,11 +297,11 @@ class FullAuthorClassifier(nn.Module):
return np.concatenate(predictions)
def KernelAlignmentLoss(K, Y):
n_el = K.shape[0]*K.shape[1]
loss = torch.norm(K - Y, p='fro') # in Nello's paper this is different
loss = loss / n_el # this is in order to factor out the accumulation which is only due to the size
return loss
#def KernelAlignmentLoss(K, Y):
# n_el = K.shape[0]*K.shape[1]
# loss = torch.norm(K - Y, p='fro') # in Nello's paper this is different
# loss = loss / n_el # this is in order to factor out the accumulation which is only due to the size
# return loss
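For reference, a toy evaluation of the retired alignment criterion (the commented-out code above applied to a 3x3 Gram matrix and its ideal counterpart); the BCE criterion introduced in this commit targets the same signal, i.e. agreement between embedding similarity and same-authorship:

# illustrative only: the retired criterion on toy inputs
import torch

K = torch.tensor([[1.0, 0.9, 0.1],
                  [0.9, 1.0, 0.2],
                  [0.1, 0.2, 1.0]])
Y = torch.tensor([[1.0, 1.0, 0.0],
                  [1.0, 1.0, 0.0],
                  [0.0, 0.0, 1.0]])
old_loss = torch.norm(K - Y, p='fro') / (K.shape[0] * K.shape[1])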

View File

@@ -6,17 +6,16 @@ import torch.nn.functional as F
class Phi(nn.Module):
def __init__(self, cnn, ff, norm=None):
def __init__(self, cnn, ff):
super(Phi, self).__init__()
self.cnn = cnn
self.ff = ff
#self.norm = norm
self.output_size = self.ff.output_size
def forward(self, x):
x = self.cnn(x)
x = self.ff(x)
#x = self.norm(x)
x = F.normalize(x, p=2, dim=-1)
return x
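Since F.normalize makes every embedding unit-length, the pairwise dot products used as SAV logits are cosine similarities bounded in [-1, 1]; a quick sanity check on a toy tensor:

# illustrative check only
import torch
import torch.nn.functional as F

x = F.normalize(torch.randn(4, 16), p=2, dim=-1)
assert torch.allclose(x.norm(p=2, dim=-1), torch.ones(4))   # unit-norm rows
sims = x @ x.T                                               # every entry lies in [-1, 1]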
@@ -48,18 +47,6 @@ class CNNProjection(nn.Module):
return x
class L2Norm(nn.Module):
def __init__(self, p=2, dim=-1):
super(L2Norm, self).__init__()
self.p=p
self.dim=dim
def forward(self, x):
norm = x.norm(p=self.p, dim=self.dim, keepdim=True)
x = x.div(norm.expand_as(x))
return x
class FFProjection(nn.Module):
def __init__(self, input_size, hidden_sizes, output_size, activation=nn.functional.relu, dropout=0.5,
activate_last=False):