refactor and move the "additional layer" into the phi projection
This commit is contained in:
parent a35f4e76df
commit 58f63586cd
@ -0,0 +1,6 @@
So far I have two sets of experiments:
a) some that are better than Ruder's, in which there is one extra classification layer (i.e., phi(x) followed by two layers)
b) some "simplified" ones that are worse than Ruder's because I have removed that additional layer
I also saw that results improved with l2(phi(x)), so I have kept it that way
Now I am going to try adding that additional layer as the last step of phi(x) <-- running
Next I want to try imposing the regularization on all the layers before the classification...
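For context, the three configurations the note describes, as a minimal sketch (module names and sizes below are placeholders, not repository code; phi stands in for the CNNProjection used in this commit):

import torch
import torch.nn as nn

# Placeholder dimensions; the real ones come from the CNN projection and the dataset.
phi_dim, hidden, n_authors = 100, 1024, 50
phi = nn.Identity()  # stand-in for the repository's CNNProjection phi(x)

# (a) phi(x) followed by two layers (the "additional layer" plus the classification layer)
classifier_a = nn.Sequential(phi, nn.Linear(phi_dim, hidden), nn.ReLU(), nn.Linear(hidden, n_authors))

# (b) "simplified": phi(x) followed by a single classification layer
classifier_b = nn.Sequential(phi, nn.Linear(phi_dim, n_authors))

# What this commit tries instead: the extra layer becomes the last step of phi itself,
# so the kernel/KTA regularizer also sees its output.
phi_with_layer = nn.Sequential(phi, nn.Linear(phi_dim, hidden), nn.ReLU(), nn.Linear(hidden, phi_dim))
classifier_new = nn.Sequential(phi_with_layer, nn.Linear(phi_dim, n_authors))

The l2(phi(x)) normalization mentioned above corresponds to the l2norm step that, later in this diff, becomes the final operation of CNNProjection.forward.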
@ -1,2 +1,3 @@
# kernel_authorship
# A Kernel-Target Alignment regularization for Authorship Analysis
@ -0,0 +1,30 @@
Things to clarify:

maybe I have to review the validation of the sav-loss; since it is batched, it might always be checking the same
submatrices for alignment, and those may be mostly positive or mostly near an identity?

maybe the sav-loss is something that makes sense to impose, as a regularization, across many of the last layers, and not
only the very last one?

process the datasets and leave the choice as a generic parameter

padding could start at any random point in [0, length_i - pad_length]
- in training, pad to the shortest
- in test, pad to the largest

save and restore checkpoints

should phi(x) be normalized? if so:
- better as the last step of phi?
- better outside phi, prior to the Gram matrix computation?

should the single-label classifier have some sort of non-linearity from phi(x) to the labels?

SAV: how should the range of k(xi,xj) be interpreted? how do we decide the value threshold for returning -1 or +1?
I guess the best thing to do is to learn a simple threshold, i.e., one 1-to-1 feed-forward layer

is the TwoClassBatch the best way?

are the contributions of the two losses comparable, or does one contribute far more than the other?

what is the best representation for inputs? char-based? n-gram-based? word-based? or a multichannel one?
I think this is irrelevant for the paper

not clear whether the single-label classifier should apply a feed-forward block on top of the intermediate representation, or whether it
should instead work directly on the representation with one simple linear projection; not clear either whether the kernel
should be computed on any further elaboration of the intermediate representation... the thing is that <phi(xi),phi(xj)>
imposes unimodality (documents from the same author should point in a single direction), while working out another
representation for the single-label classifier could instead relax this and attribute to the same author vectors that
come from a multimodal distribution. No... this "unimodality" should exist anyway in the last layer. Indeed, I am starting
to think that the optimum for any classifier should already impose something similar to the KTA criterion in the
last layer... Is this redundant?

not clear whether we should define the loss as in "On Kernel-Target Alignment", i.e., with <K,Y>_F in the numerator (and
change the sign to minimize) or as the norm ||K-Y||_F. What about the denominator (currently, the normalization factor is n**2)?
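The last point, the choice between the two loss definitions, can be made concrete. A minimal sketch of both candidates (function names are mine, not the repository's; K and Y are assumed to be float tensors of the same shape):

import torch

def alignment_loss(K, Y):
    # Negative kernel-target alignment as in Cristianini et al.,
    # "On Kernel-Target Alignment": A(K, Y) = <K, Y>_F / (||K||_F * ||Y||_F).
    # The sign is flipped so that minimizing the loss maximizes the alignment.
    num = (K * Y).sum()
    den = torch.norm(K, p='fro') * torch.norm(Y, p='fro')
    return -num / den

def frobenius_loss(K, Y):
    # Alternative: the Frobenius distance ||K - Y||_F between the Gram matrix
    # and the ideal kernel, normalized by the number of entries (n**2 for a square K).
    n_el = K.shape[0] * K.shape[1]
    return torch.norm(K - Y, p='fro') / n_el

The alignment form is scale-invariant thanks to its denominator, so the question about the normalization factor only arises for the Frobenius-distance form, which is the one currently implemented as KernelAlignmentLoss further down in this commit.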
@ -0,0 +1,14 @@
#!/bin/bash
conda activate torch

dataset=enron
for authors in 10 50 ; do
    for alpha in 1 0.999 0.99 0.9 0.5 ; do
        python main.py --dataset $dataset -A $authors -s 0 -o ../results_$dataset.csv --alpha $alpha
    done
done

dataset=imdb62
for alpha in 1 0.999 0.99 0.9 0.5 ; do
    python main.py --dataset $dataset -A -1 -s 0 -o ../results_$dataset.csv --alpha $alpha
done
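The alpha values swept here presumably trade off the attribution loss against the SAV/KTA regularizer (the training code later in this diff only computes loss_sav when alpha < 1). A hedged guess at the combination, not taken from the repository:

def combined_loss(loss_attr, loss_sav, alpha):
    # Assumed convex combination of the attribution loss and the SAV/KTA
    # regularizer; the actual weighting in the repository may differ.
    if alpha == 1:
        return loss_attr
    return alpha * loss_attr + (1 - alpha) * loss_sav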
@ -76,11 +76,11 @@ def main(opt):
        kernel_sizes=opt.kernelsizes,
        dropout=0.5
    ).to(device)
    print(phi)

    cls = AuthorshipAttributionClassifier(
        phi, num_authors=A.size, pad_index=pad_index, pad_length=opt.pad, device=device
    )
    print(cls)

    if opt.name == 'auto':
@ -7,6 +7,7 @@ import math
from sklearn.model_selection import train_test_split

from model.early_stop import EarlyStop
from model.transformations import FFProjection


class AuthorshipAttributionClassifier(nn.Module):
@ -55,8 +56,11 @@ class AuthorshipAttributionClassifier(nn.Module):
            loss_attr_value = loss_attr.item()

            if alpha < 1:
                # todo: optimize (compute only the upper triangle)
                kernel = torch.matmul(phi, phi.T)
                ideal_kernel = torch.as_tensor(1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)).to(self.device)
                # todo: maybe the KALoss should take the class balance into consideration (it is more likely to
                # draw a pair of negative examples than a positive pair)
                loss_sav = KernelAlignmentLoss(kernel, ideal_kernel)
                loss_sav_value = loss_sav.item()
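The np.outer(1 + yi, 1 / (yi + 1)) == 1 trick above marks the pairs that share an author: the ratio (1 + yi[i]) / (1 + yi[j]) equals 1 exactly when the labels coincide. An equivalent and arguably more readable construction directly in torch (a sketch; the function name is not from the repository):

import torch

def ideal_kernel_from_labels(y):
    # y: 1-D tensor of integer author labels, shape (n,).
    # Returns an (n, n) matrix whose entry (i, j) is 1.0 when y[i] == y[j]
    # and 0.0 otherwise, i.e., the same matrix the outer-product ratio trick builds.
    return (y.unsqueeze(0) == y.unsqueeze(1)).float()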
@ -254,26 +258,10 @@ class FullAuthorClassifier(nn.Module):

def KernelAlignmentLoss(K, Y):
    n_el = K.shape[0] * K.shape[1]
    loss = torch.norm(K - Y, p='fro')  # in Nello Cristianini's KTA paper this is defined differently (as an alignment, not a distance)
    loss = loss / n_el  # factor out the accumulation that is only due to the matrix size
    return loss


class FFProjection(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size, activation=nn.functional.relu, dropout=0.5):
        super(FFProjection, self).__init__()
        sizes = [input_size] + hidden_sizes + [output_size]
        self.ff = nn.ModuleList([
            nn.Linear(sizes[i], sizes[i + 1]) for i in range(len(sizes) - 1)
        ])
        self.activation = activation
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        for linear in self.ff[:-1]:
            x = self.dropout(self.activation(linear(x)))
        x = self.ff[-1](x)
        return x


class Batch:
    def __init__(self, batch_size, n_epochs=1, shuffle=True):
@ -13,47 +13,61 @@ class CNNProjection(nn.Module):
        self.convs1 = nn.ModuleList(
            [nn.Conv2d(channels_in, channels_out, (K, embedding_dim)) for K in kernel_sizes]
        )
        '''
        self.conv13 = nn.Conv2d(Ci, Co, (3, D))
        self.conv14 = nn.Conv2d(Ci, Co, (4, D))
        self.conv15 = nn.Conv2d(Ci, Co, (5, D))
        '''
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(len(kernel_sizes) * channels_out, out_size)
        #self.fc1 = nn.Linear(len(kernel_sizes) * channels_out, out_size)
        self.fc = FFProjection(input_size=len(kernel_sizes) * channels_out,
                               hidden_sizes=[1024],
                               output_size=out_size,
                               activation=nn.functional.relu,
                               dropout=dropout)
        self.output_size = out_size

    def convolve(self, x):
        x = x.unsqueeze(1)  # (N, Ci, W, D)
        x = [self.conv_and_pool(x, conv) for conv in self.convs1]  # [(N, Co), ...]*len(Ks)
        x = torch.cat(x, 1)
        return x

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)  # (N, Co, W)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        x = self.embed(x)  # (N, W, D)
        x = x.unsqueeze(1)  # (N, Ci, W, D)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(N, Co, W), ...]*len(Ks)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...]*len(Ks)
        x = torch.cat(x, 1)

        '''
        x1 = self.conv_and_pool(x, self.conv13)  # (N, Co)
        x2 = self.conv_and_pool(x, self.conv14)  # (N, Co)
        x3 = self.conv_and_pool(x, self.conv15)  # (N, Co)
        x = torch.cat((x1, x2, x3), 1)  # (N, len(Ks)*Co)
        '''

        x = F.relu(self.fc1(x))  # (N, C)

    def l2norm(self, x):
        norm = x.norm(p=2, dim=1, keepdim=True)
        x = x.div(norm.expand_as(x))
        return x

        x = self.dropout(x)  # (N, len(Ks)*Co)

    def forward(self, x):
        x = self.embed(x)  # (N, W, D)
        x = self.convolve(x)  # (N, len(Ks)*Co)
        x = self.fc(x)
        #x = F.relu(self.fc1(x))  # (N, C)
        # x = self.dropout(x)
        x = self.l2norm(x)
        return x

    def space_dimensions(self):
        return self.output_size


class FFProjection(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size, activation=nn.functional.relu, dropout=0.5):
        super(FFProjection, self).__init__()
        sizes = [input_size] + hidden_sizes + [output_size]
        self.ff = nn.ModuleList([
            nn.Linear(sizes[i], sizes[i + 1]) for i in range(len(sizes) - 1)
        ])
        self.activation = activation
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        for linear in self.ff[:-1]:
            x = self.dropout(self.activation(linear(x)))
        x = self.ff[-1](x)
        return x


class RNNProjection(nn.Module):
    def __init__(self, vocab_size, hidden_size, output_size, device='cpu'):
        super(RNNProjection, self).__init__()
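To see the effect of this refactor end to end: the extra layer (the FFProjection head) and the l2 normalization now sit inside phi, so the same representation feeds both the classifier and the Gram matrix used by the KTA/SAV regularizer, whose entries become cosine similarities. A toy stand-in (the TinyPhi name and all sizes are placeholders, not repository code):

import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyPhi(nn.Module):
    # Mimics the refactored CNNProjection: features -> "additional layer" -> l2 norm.
    def __init__(self, in_dim=300, hidden=1024, out_dim=100):
        super().__init__()
        self.features = nn.Linear(in_dim, hidden)   # stands in for convolve()
        self.head = nn.Linear(hidden, out_dim)      # the layer moved into phi

    def forward(self, x):
        x = F.relu(self.features(x))
        x = self.head(x)
        return F.normalize(x, p=2, dim=1)           # same effect as l2norm()

phi = TinyPhi()
out = phi(torch.randn(8, 300))
print(out.norm(p=2, dim=1))        # all ones: rows are unit-length
print((out @ out.T).abs().max())   # Gram entries are cosine similarities in [-1, 1]

Bounding the kernel entries this way arguably also keeps them on a scale comparable to the {0, 1} entries of the ideal kernel, which may be part of why l2(phi(x)) helped in the experiments noted above.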
@ -0,0 +1,14 @@
import pandas as pd
from glob import glob

filedir = '../../results_*.csv'

df = [pd.read_csv(file, sep='\t') for file in glob(filedir)]
df = pd.concat(df)

df[['dataset', 'authors', 'docs', 'seed']] = df.Dataset.str.split('_', expand=True)
df = df.drop(columns='Dataset')

pv = df.pivot_table(index=['dataset', 'authors', 'docs', 'Method'], values=['microF1', 'val_microF1'])

print(pv)