refactor and move the "additional layer" to the phi projection
parent a35f4e76df
commit 58f63586cd
@@ -0,0 +1,6 @@
+For now I have two sets of experiments:
+a) some better than Ruder's, where there is one more classification layer (i.e., there is phi(x) and then two layers)
+b) some "simplified" ones that are worse than Ruder's because I removed that additional layer
+I also saw that things improved with l2(phi(x)), so I have left it that way
+Now I am going to try adding that additional layer as the last step in phi(x) <-- running
+Then I want to try imposing the regularization on all the layers before classification...
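A minimal sketch of what this amounts to, assuming a generic encoder module and illustrative dimensions (none of the names below are taken from the repository): the former additional layer becomes the last step of phi(x), followed by L2 normalization.

import torch.nn as nn
import torch.nn.functional as F

class PhiWithExtraLayer(nn.Module):
    """Hypothetical sketch: the 'additional layer' is moved into phi(x)
    and the output is L2-normalized."""
    def __init__(self, encoder, enc_dim, hidden_dim, out_dim):
        super().__init__()
        self.encoder = encoder                                   # e.g. a CNN/RNN projection
        self.extra = nn.Sequential(nn.Linear(enc_dim, hidden_dim),
                                   nn.ReLU(),
                                   nn.Linear(hidden_dim, out_dim))

    def forward(self, x):
        h = self.encoder(x)        # document encoding
        h = self.extra(h)          # the layer moved into phi
        return F.normalize(h, p=2, dim=1)                        # l2(phi(x))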
@@ -1,2 +1,3 @@
-# kernel_authorship
+# A Kernel-Target Alignment regularization for Authorship Analysis
@@ -0,0 +1,30 @@
+Things to clarify:
+maybe I have to review the validation of the sav-loss; since it is batched, it might always be checking the same
+submatrices for alignment, and those may be mostly positive or mostly near an identity?
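One possible fix, sketched as a hypothetical sampler (not code from the repository): draw validation batches that mix a few authors with several documents each, so the checked submatrix of the ideal kernel contains both same-author and different-author pairs instead of being close to an identity.

import numpy as np

def sample_val_batch(y, batch_size, rng):
    # hypothetical sampler: a few authors, several documents each, so the
    # ideal-kernel submatrix mixes positive (same-author) and negative pairs
    y = np.asarray(y)
    labels = np.unique(y)
    authors = rng.choice(labels, size=min(4, len(labels)), replace=False)
    per_author = max(1, batch_size // len(authors))
    idx = np.concatenate([
        rng.choice(np.where(y == a)[0], size=min(per_author, (y == a).sum()), replace=False)
        for a in authors
    ])
    rng.shuffle(idx)
    return idx

# usage sketch: idx = sample_val_batch(y_val, 64, np.random.default_rng(0))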
+maybe the sav-loss is something that may make sense to impose, as a regularization, across many of the last layers, and
+not only the last one?
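A minimal sketch of that idea, assuming the intermediate representations are available as a list of (n, d) tensors (the function is illustrative, not part of the repository):

import torch

def multi_layer_sav_loss(layer_reprs, ideal_kernel):
    # average the Frobenius-distance alignment penalty over several of the
    # last layers' representations instead of only the final phi(x)
    total = 0.0
    for h in layer_reprs:                      # each h has shape (n, d_l)
        K = h @ h.T
        total = total + torch.norm(K - ideal_kernel, p='fro') / K.numel()
    return total / len(layer_reprs)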
+process datasets and leave the dataset as a generic parameter
+padding could start at any random point in [0, length_i - pad_length]
+- in training, pad to the shortest
+- in test, pad to the largest
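A small illustration of that scheme, assuming token sequences are plain Python lists (the helper is hypothetical; only pad_index appears in the repository code):

import random

def crop_or_pad(tokens, pad_length, pad_index):
    # longer documents: take a window starting at a random offset in
    # [0, len(tokens) - pad_length]; shorter documents: pad up to pad_length
    if len(tokens) > pad_length:
        start = random.randint(0, len(tokens) - pad_length)
        return tokens[start:start + pad_length]
    return tokens + [pad_index] * (pad_length - len(tokens))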
+save and restore checkpoints
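For reference, a standard PyTorch checkpointing sketch (paths and dictionary keys are illustrative, not taken from the repository):

import torch

def save_checkpoint(path, model, optimizer, epoch):
    torch.save({'model': model.state_dict(),
                'optim': optimizer.state_dict(),
                'epoch': epoch}, path)

def load_checkpoint(path, model, optimizer):
    ckpt = torch.load(path, map_location='cpu')
    model.load_state_dict(ckpt['model'])
    optimizer.load_state_dict(ckpt['optim'])
    return ckpt['epoch']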
+should the phi(x) be normalized? if so:
+- better as the last step of phi?
+- better outside phi, just prior to the Gram matrix computation?
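Both placements produce the same Gram matrix; the only difference is whether the attribution classifier also receives the normalized vectors. A toy check (names and shapes are illustrative):

import torch
import torch.nn.functional as F

phi_raw = torch.randn(8, 32)                 # un-normalized phi(x) for a toy batch

# option 1: L2 normalization as the last step of phi (classifier also sees z)
z = F.normalize(phi_raw, p=2, dim=1)
K1 = z @ z.T

# option 2: L2 normalization only before the Gram matrix (classifier sees phi_raw)
zn = F.normalize(phi_raw, p=2, dim=1)
K2 = zn @ zn.T

print(torch.allclose(K1, K2))                # True: the kernel is the same either way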
+should the single-label classifier have some sort of non-linearity from the phi(x) to the labels?
+SAV: how should the range of k(xi,xj) be interpreted? how to decide the value threshold for returning -1 or +1?
+I guess the best thing to do is to learn a simple threshold, one feed-forward 1-to-1
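A minimal sketch of that learned threshold (a hypothetical module, not part of the repository): a single 1-to-1 linear layer maps each kernel value to a same-author logit, and the sign of the logit gives the -1/+1 SAV decision.

import torch
import torch.nn as nn

class KernelThreshold(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(1, 1)          # one learnable scale and one bias

    def forward(self, k_values):               # k_values: kernel entries, shape (m,)
        return self.linear(k_values.unsqueeze(-1)).squeeze(-1)

# usage sketch: train the logits with nn.BCEWithLogitsLoss against 1 for
# same-author pairs and 0 otherwise; predict torch.sign(logits) at test time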
+is the TwoClassBatch the best way?
+are the contributions of the two losses comparable? or does one contribute far more than the other?
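One way to check, under the assumption that the two terms are combined as a convex combination weighted by --alpha (suggested by the `if alpha < 1:` guard and the --alpha sweep in the script, but not confirmed here): log both raw values periodically and compare their magnitudes.

def combine_and_log(loss_attr, loss_sav, alpha, step, log_every=50):
    # assumption: convex combination controlled by alpha; the printout shows
    # whether one term dominates the other
    if step % log_every == 0:
        print(f'step={step} attr={loss_attr.item():.4f} sav={loss_sav.item():.4f}')
    return alpha * loss_attr + (1 - alpha) * loss_sav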
+what is the best representation for inputs? char-based? ngram-based? word-based? or a multichannel one?
+I think this is irrelevant for the paper
+not clear whether the single-label classifier should work out a ff on top of the intermediate representation, or whether it
+should instead work directly on the representations with one simple linear projection; not clear either whether the kernel
+should be computed on any further elaboration of the intermediate representation... the thing is that <phi(xi),phi(xj)>
+imposes unimodality (documents from the same author should point in a single direction), while working out another
+representation for the single-label classifier could instead relax this and attribute to the same author vectors that
+come from a multimodal distribution. No... this "unimodality" should exist anyway in the last layer. Indeed, I am starting
+to think that the optimum for any classifier should already impose something similar to the KTA criterion in the
+last layer... Is this redundant?
+not clear whether we should define the loss as in "On kernel target alignment", i.e., with <K,Y>_F in the numerator (and
+change the sign to minimize), or as the ||K-Y||_F norm. What about the denominator (right now, the normalization factor is n**2)?
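A sketch of the two candidate definitions raised in the last point; the first matches the KernelAlignmentLoss that appears further down in this commit, while the second follows the alignment of Cristianini et al., whose own normalization makes the n**2 factor unnecessary:

import torch

def frobenius_distance_loss(K, Y):
    # current choice: ||K - Y||_F divided by the number of entries (n**2),
    # so the value does not grow with the batch size
    return torch.norm(K - Y, p='fro') / (K.shape[0] * K.shape[1])

def negative_alignment_loss(K, Y):
    # alternative: A(K, Y) = <K, Y>_F / (||K||_F ||Y||_F); the sign is flipped
    # so that minimizing the loss maximizes the alignment
    Y = Y.to(K.dtype)
    inner = (K * Y).sum()
    return -inner / (torch.norm(K, p='fro') * torch.norm(Y, p='fro'))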
@@ -0,0 +1,14 @@
+#!/bin/bash
+conda activate torch
+
+dataset=enron
+for authors in 10 50 ; do
+    for alpha in 1 0.999 0.99 0.9 0.5 ; do
+        python main.py --dataset $dataset -A $authors -s 0 -o ../results_$dataset.csv --alpha $alpha
+    done
+done
+
+dataset=imdb62
+for alpha in 1 0.999 0.99 0.9 0.5 ; do
+    python main.py --dataset $dataset -A -1 -s 0 -o ../results_$dataset.csv --alpha $alpha
+done
@@ -76,11 +76,11 @@ def main(opt):
        kernel_sizes=opt.kernelsizes,
        dropout=0.5
    ).to(device)
+    print(phi)

    cls = AuthorshipAttributionClassifier(
        phi, num_authors=A.size, pad_index=pad_index, pad_length=opt.pad, device=device
    )

    print(cls)

    if opt.name == 'auto':
@@ -7,6 +7,7 @@ import math
from sklearn.model_selection import train_test_split

from model.early_stop import EarlyStop
+from model.transformations import FFProjection


class AuthorshipAttributionClassifier(nn.Module):
@@ -55,8 +56,11 @@ class AuthorshipAttributionClassifier(nn.Module):
            loss_attr_value = loss_attr.item()

            if alpha < 1:
+                # todo: optimize (only upper diagonal)
                kernel = torch.matmul(phi, phi.T)
                ideal_kernel = torch.as_tensor(1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)).to(self.device)
+                # todo: maybe the KALoss should take into consideration the balance (it is more likely to have
+                # a pair of negative examples than positives)
                loss_sav = KernelAlignmentLoss(kernel, ideal_kernel)
                loss_sav_value = loss_sav.item()
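The ideal_kernel expression above is compact but cryptic: entry (i, j) of np.outer(1 + yi, 1 / (yi + 1)) is (1 + yi[i]) / (1 + yi[j]), which equals 1 exactly when yi[i] == yi[j], so the comparison yields the same-author indicator matrix. A tiny worked example:

import numpy as np

yi = np.array([0, 0, 1, 1])                        # author labels of a toy batch
ideal = 1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)
print(ideal)
# [[1 1 0 0]
#  [1 1 0 0]
#  [0 0 1 1]
#  [0 0 1 1]]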
@@ -254,26 +258,10 @@ class FullAuthorClassifier(nn.Module):
def KernelAlignmentLoss(K, Y):
    n_el = K.shape[0]*K.shape[1]
    loss = torch.norm(K - Y, p='fro')  # in Nello's paper this is different
    loss = loss / n_el  # this is in order to factor out the accumulation which is only due to the size
    return loss


-class FFProjection(nn.Module):
-    def __init__(self, input_size, hidden_sizes, output_size, activation=nn.functional.relu, dropout=0.5):
-        super(FFProjection, self).__init__()
-        sizes = [input_size] + hidden_sizes + [output_size]
-        self.ff = nn.ModuleList([
-            nn.Linear(sizes[i], sizes[i+1]) for i in range(len(sizes)-1)
-        ])
-        self.activation = activation
-        self.dropout = nn.Dropout(p=dropout)
-
-    def forward(self, x):
-        for linear in self.ff[:-1]:
-            x = self.dropout(self.activation(linear(x)))
-        x = self.ff[-1](x)
-        return x


class Batch:
    def __init__(self, batch_size, n_epochs=1, shuffle=True):
@@ -13,47 +13,61 @@ class CNNProjection(nn.Module):
        self.convs1 = nn.ModuleList(
            [nn.Conv2d(channels_in, channels_out, (K, embedding_dim)) for K in kernel_sizes]
        )
-        '''
-        self.conv13 = nn.Conv2d(Ci, Co, (3, D))
-        self.conv14 = nn.Conv2d(Ci, Co, (4, D))
-        self.conv15 = nn.Conv2d(Ci, Co, (5, D))
-        '''
        self.dropout = nn.Dropout(dropout)
-        self.fc1 = nn.Linear(len(kernel_sizes) * channels_out, out_size)
+        #self.fc1 = nn.Linear(len(kernel_sizes) * channels_out, out_size)
+        self.fc = FFProjection(input_size=len(kernel_sizes) * channels_out,
+                               hidden_sizes=[1024],
+                               output_size=out_size,
+                               activation=nn.functional.relu,
+                               dropout=dropout)
        self.output_size = out_size

+    def convolve(self, x):
+        x = x.unsqueeze(1)  # (N, Ci, W, D)
+        x = [self.conv_and_pool(x, conv) for conv in self.convs1]  # [(N, Co), ...]*len(Ks)
+        x = torch.cat(x, 1)
+        return x
+
    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)  # (N, Co, W)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

-    def forward(self, x):
-        x = self.embed(x)  # (N, W, D)
-        x = x.unsqueeze(1)  # (N, Ci, W, D)
-        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(N, Co, W), ...]*len(Ks)
-        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...]*len(Ks)
-        x = torch.cat(x, 1)
-
-        '''
-        x1 = self.conv_and_pool(x,self.conv13) #(N,Co)
-        x2 = self.conv_and_pool(x,self.conv14) #(N,Co)
-        x3 = self.conv_and_pool(x,self.conv15) #(N,Co)
-        x = torch.cat((x1, x2, x3), 1)  # (N,len(Ks)*Co)
-        '''
-
-        x = F.relu(self.fc1(x))  # (N, C)
-
+    def l2norm(self, x):
        norm = x.norm(p=2, dim=1, keepdim=True)
        x = x.div(norm.expand_as(x))
+        return x

-        x = self.dropout(x)  # (N, len(Ks)*Co)
+    def forward(self, x):
+        x = self.embed(x)  # (N, W, D)
+        x = self.convolve(x)  # (N, len(Ks)*Co)
+        x = self.fc(x)
+        #x = F.relu(self.fc1(x))  # (N, C)
+        # x = self.dropout(x)
+        x = self.l2norm(x)
        return x

    def space_dimensions(self):
        return self.output_size


+class FFProjection(nn.Module):
+    def __init__(self, input_size, hidden_sizes, output_size, activation=nn.functional.relu, dropout=0.5):
+        super(FFProjection, self).__init__()
+        sizes = [input_size] + hidden_sizes + [output_size]
+        self.ff = nn.ModuleList([
+            nn.Linear(sizes[i], sizes[i+1]) for i in range(len(sizes)-1)
+        ])
+        self.activation = activation
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(self, x):
+        for linear in self.ff[:-1]:
+            x = self.dropout(self.activation(linear(x)))
+        x = self.ff[-1](x)
+        return x
+
+
class RNNProjection(nn.Module):
    def __init__(self, vocab_size, hidden_size, output_size, device='cpu'):
        super(RNNProjection, self).__init__()
@@ -0,0 +1,14 @@
+import pandas as pd
+from glob import glob
+
+filedir = '../../results_*.csv'
+
+df = [pd.read_csv(file, sep='\t') for file in glob(filedir)]
+df = pd.concat(df)
+
+df[['dataset','authors','docs','seed']] = df.Dataset.str.split('_', expand=True)
+df = df.drop(columns='Dataset')
+
+pv = df.pivot_table(index=['dataset','authors','docs','Method'], values=['microF1','val_microF1'])
+
+print(pv)