refactor and moving the "additional layer" to the phi projection

Alejandro Moreo Fernandez 2020-07-07 16:41:05 +02:00
parent a35f4e76df
commit 58f63586cd
8 changed files with 110 additions and 43 deletions

Notes.txt Normal file

@@ -0,0 +1,6 @@
For now I have two sets of experiments:
a) one better than Ruder's, in which there is one more classification layer (that is, there is phi(x) and then two layers)
b) a "simplified" one, worse than Ruder's, because I removed that additional layer
I also saw that results improved with l2(phi(x)), so I have left it that way.
Now I am going to try adding that additional layer as the last step of phi(x) <-- running
Then I want to try imposing the regularization on all the layers before the classification...
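
A minimal sketch of the variant now being executed (the additional layer moved inside phi, followed by L2 normalization); the base encoder and the layer sizes are placeholders, not the actual configuration:

import torch.nn as nn
import torch.nn.functional as F

class PhiWithExtraLayer(nn.Module):
    # hypothetical wrapper: base_encoder stands for the existing phi(x) projection
    def __init__(self, base_encoder, in_size=256, out_size=128):
        super().__init__()
        self.base = base_encoder
        self.extra = nn.Linear(in_size, out_size)  # the "additional layer", now the last step of phi

    def forward(self, x):
        x = F.relu(self.extra(self.base(x)))
        return F.normalize(x, p=2, dim=1)  # l2(phi(x))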


@@ -1,2 +1,3 @@
# kernel_authorship
# A Kernel-Target Alignment regularization for Authorship Analysis

TODO.txt Normal file

@@ -0,0 +1,30 @@
Things to clarify:

maybe I have to review the validation of the sav-loss; since it is batched, it might always be checking the same
    submatrices for alignment, and those may be mostly positive or mostly near an identity?

maybe the sav-loss is something that would make sense to impose, as a regularization, across many of the last layers,
    and not only on the last one?

process the datasets and leave the dataset as a generic parameter

padding could start at any random point in [0, length_i - pad_length] (see the first sketch after this list)
    - in training, pad to the shortest
    - in test, pad to the largest

save and restore checkpoints

should phi(x) be normalized? if so:
    - better as the last step of phi?
    - better outside phi, just before the Gram matrix computation?

should the single-label classifier have some sort of non-linearity from phi(x) to the labels?

SAV: how should the range of k(xi,xj) be interpreted? how to decide the value threshold for returning -1 or +1?
    I guess the best thing to do is to learn a simple threshold, one feed-forward 1-to-1 (see the second sketch after this list)

is the TwoClassBatch the best way?

are the contributions of the two losses comparable? or does one contribute far more than the other?

what is the best representation for inputs? char-based? ngram-based? word-based? or a multichannel one?
    I think this is irrelevant for the paper

not clear whether the single-label classifier should work out a feed-forward network on top of the intermediate
    representation, or should instead work directly on the representations with one simple linear projection; not clear
    either whether the kernel should be computed on any further elaboration of the intermediate representation... the
    thing is that <phi(xi),phi(xj)> imposes unimodality (documents from the same author should point in a single
    direction), while working out another representation for the single-label classifier could instead relax this and
    attribute to the same author vectors that come from a multimodal distribution. No... this "unimodality" should
    exist anyway in the last layer. Indeed, I am starting to think that the optimum of any classifier should already
    impose something similar to the KTA criterion in the last layer... Is this redundant?

not clear whether we should define the loss as in "On kernel target alignment", i.e., with <K,Y>_F in the numerator
    (and the sign changed in order to minimize), or as the norm ||K-Y||_F. What about the denominator (right now, the
    normalization factor is n**2)?
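
A sketch for the random-start padding item above; doc is assumed to be a list of token ids and pad_index the padding id (names are illustrative, not the repository's API):

import random

def crop_or_pad(doc, pad_length, pad_index):
    # hypothetical helper: sample a random window of pad_length tokens when the document
    # is longer than pad_length, otherwise pad on the right up to pad_length
    if len(doc) > pad_length:
        start = random.randint(0, len(doc) - pad_length)  # any start in [0, length_i - pad_length]
        return doc[start:start + pad_length]
    return doc + [pad_index] * (pad_length - len(doc))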
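
And a sketch of the "learn a simple threshold, one feed-forward 1-to-1" option for the SAV item (again illustrative names only):

import torch
import torch.nn as nn

class SAVThreshold(nn.Module):
    # a single 1-to-1 feed-forward layer: it learns a scale and a bias (the threshold)
    def __init__(self):
        super().__init__()
        self.ff = nn.Linear(1, 1)

    def forward(self, k_values):
        # k_values: kernel values k(xi, xj) of shape (n,); returns one logit per pair
        return self.ff(k_values.unsqueeze(1)).squeeze(1)

    def predict(self, k_values):
        # +1 when the learned threshold is exceeded, -1 otherwise
        logits = self.forward(k_values)
        return torch.where(logits > 0, torch.ones_like(logits), -torch.ones_like(logits))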

experiments.sh Normal file

@@ -0,0 +1,14 @@
#!/bin/bash
# note: in a non-interactive script, 'conda activate' requires conda to be initialized first,
# e.g. via: source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate torch

dataset=enron
for authors in 10 50 ; do
  for alpha in 1 0.999 0.99 0.9 0.5 ; do
    python main.py --dataset $dataset -A $authors -s 0 -o ../results_$dataset.csv --alpha $alpha
  done
done

dataset=imdb62
for alpha in 1 0.999 0.99 0.9 0.5 ; do
  python main.py --dataset $dataset -A -1 -s 0 -o ../results_$dataset.csv --alpha $alpha
done


@@ -76,11 +76,11 @@ def main(opt):
        kernel_sizes=opt.kernelsizes,
        dropout=0.5
    ).to(device)
    print(phi)

    cls = AuthorshipAttributionClassifier(
        phi, num_authors=A.size, pad_index=pad_index, pad_length=opt.pad, device=device
    )
    print(cls)

    if opt.name == 'auto':


@@ -7,6 +7,7 @@ import math
from sklearn.model_selection import train_test_split
from model.early_stop import EarlyStop
from model.transformations import FFProjection


class AuthorshipAttributionClassifier(nn.Module):
@@ -55,8 +56,11 @@ class AuthorshipAttributionClassifier(nn.Module):
                loss_attr_value = loss_attr.item()

                if alpha < 1:
                    # todo: optimize (only upper diagonal)
                    kernel = torch.matmul(phi, phi.T)
                    ideal_kernel = torch.as_tensor(1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)).to(self.device)
                    # todo: maybe the KALoss should take into consideration the balance (it is more likely to have
                    #  a pair of negative examples than positives)
                    loss_sav = KernelAlignmentLoss(kernel, ideal_kernel)
                    loss_sav_value = loss_sav.item()
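
The np.outer expression above builds the ideal kernel without an explicit double loop: entry (i, j) is (1+yi)/(1+yj), which equals 1 exactly when yi == yj. A standalone illustration (note that exact floating-point equality can fail for some label values, e.g. 49 * (1 / 49) != 1 in double precision, so the explicit integer comparison in the assert is the safer equivalent):

import numpy as np

yi = np.array([0, 0, 1, 3])
ideal = 1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)
assert (ideal == np.equal.outer(yi, yi).astype(int)).all()
print(ideal)
# [[1 1 0 0]
#  [1 1 0 0]
#  [0 0 1 0]
#  [0 0 0 1]]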
@@ -254,26 +258,10 @@ class FullAuthorClassifier(nn.Module):


def KernelAlignmentLoss(K, Y):
    n_el = K.shape[0] * K.shape[1]
    loss = torch.norm(K - Y, p='fro')  # in Nello's paper this is different
    loss = loss / n_el  # factor out the accumulation that is only due to the matrix size
    return loss
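
Regarding the "in Nello's paper this is different" comment and the corresponding TODO item: a sketch of the two candidate formulations, the Frobenius distance used above and the kernel-target alignment A(K, Y) = <K, Y>_F / (||K||_F ||Y||_F) from "On kernel target alignment", with the sign flipped so that minimizing the loss maximizes the alignment. This only illustrates the alternative, it is not the loss used by the repository:

import torch

def frobenius_loss(K, Y):
    # current choice: distance between the kernel and the ideal kernel, scaled by n**2
    return torch.norm(K - Y, p='fro') / (K.shape[0] * K.shape[1])

def alignment_loss(K, Y):
    # A(K, Y) = <K, Y>_F / (||K||_F * ||Y||_F); it lies in [-1, 1] in general and in
    # [0, 1] when both matrices are positive semi-definite (Cauchy-Schwarz), so no
    # extra normalization by n**2 is needed
    Y = Y.to(dtype=K.dtype)
    alignment = (K * Y).sum() / (torch.norm(K, p='fro') * torch.norm(Y, p='fro'))
    return -alignment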
class FFProjection(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size, activation=nn.functional.relu, dropout=0.5):
        super(FFProjection, self).__init__()
        sizes = [input_size] + hidden_sizes + [output_size]
        self.ff = nn.ModuleList([
            nn.Linear(sizes[i], sizes[i+1]) for i in range(len(sizes)-1)
        ])
        self.activation = activation
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        for linear in self.ff[:-1]:
            x = self.dropout(self.activation(linear(x)))
        x = self.ff[-1](x)
        return x


class Batch:
    def __init__(self, batch_size, n_epochs=1, shuffle=True):


@@ -13,47 +13,61 @@ class CNNProjection(nn.Module):
        self.convs1 = nn.ModuleList(
            [nn.Conv2d(channels_in, channels_out, (K, embedding_dim)) for K in kernel_sizes]
        )
        '''
        self.conv13 = nn.Conv2d(Ci, Co, (3, D))
        self.conv14 = nn.Conv2d(Ci, Co, (4, D))
        self.conv15 = nn.Conv2d(Ci, Co, (5, D))
        '''
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(len(kernel_sizes) * channels_out, out_size)
        #self.fc1 = nn.Linear(len(kernel_sizes) * channels_out, out_size)
        self.fc = FFProjection(input_size=len(kernel_sizes) * channels_out,
                               hidden_sizes=[1024],
                               output_size=out_size,
                               activation=nn.functional.relu,
                               dropout=dropout)
        self.output_size = out_size

    def convolve(self, x):
        x = x.unsqueeze(1)  # (N, Ci, W, D)
        x = [self.conv_and_pool(x, conv) for conv in self.convs1]  # [(N, Co), ...]*len(Ks)
        x = torch.cat(x, 1)
        return x

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)  # (N, Co, W)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        x = self.embed(x)  # (N, W, D)
        x = x.unsqueeze(1)  # (N, Ci, W, D)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(N, Co, W), ...]*len(Ks)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...]*len(Ks)
        x = torch.cat(x, 1)
        '''
        x1 = self.conv_and_pool(x,self.conv13)  # (N, Co)
        x2 = self.conv_and_pool(x,self.conv14)  # (N, Co)
        x3 = self.conv_and_pool(x,self.conv15)  # (N, Co)
        x = torch.cat((x1, x2, x3), 1)  # (N, len(Ks)*Co)
        '''
        x = F.relu(self.fc1(x))  # (N, C)

    def l2norm(self, x):
        norm = x.norm(p=2, dim=1, keepdim=True)
        x = x.div(norm.expand_as(x))
        return x

        x = self.dropout(x)  # (N, len(Ks)*Co)

    def forward(self, x):
        x = self.embed(x)  # (N, W, D)
        x = self.convolve(x)  # (N, len(Ks)*Co)
        x = self.fc(x)
        #x = F.relu(self.fc1(x))  # (N, C)
        # x = self.dropout(x)
        x = self.l2norm(x)
        return x

    def space_dimensions(self):
        return self.output_size
class FFProjection(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size, activation=nn.functional.relu, dropout=0.5):
        super(FFProjection, self).__init__()
        sizes = [input_size] + hidden_sizes + [output_size]
        self.ff = nn.ModuleList([
            nn.Linear(sizes[i], sizes[i+1]) for i in range(len(sizes)-1)
        ])
        self.activation = activation
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        for linear in self.ff[:-1]:
            x = self.dropout(self.activation(linear(x)))
        x = self.ff[-1](x)
        return x
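
A quick usage note for the relocated FFProjection: with hidden_sizes=[1024] it adds exactly one hidden layer between its input and output, which is where the "additional layer" of this commit now lives. The sizes below are illustrative only:

import torch

# FFProjection as defined above; e.g. 3 kernel sizes x 100 output channels flattened
# into 300 features, projected through one 1024-unit hidden layer into a 128-dim phi space
proj = FFProjection(input_size=300, hidden_sizes=[1024], output_size=128, dropout=0.5)
x = torch.randn(8, 300)   # a batch of 8 feature vectors
print(proj(x).shape)      # torch.Size([8, 128])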
class RNNProjection(nn.Module):
    def __init__(self, vocab_size, hidden_size, output_size, device='cpu'):
        super(RNNProjection, self).__init__()

src/tools/gen_tables.py Normal file

@@ -0,0 +1,14 @@
import pandas as pd
from glob import glob

# collect all per-dataset result files and concatenate them
filedir = '../../results_*.csv'
df = [pd.read_csv(file, sep='\t') for file in glob(filedir)]
df = pd.concat(df)

# the Dataset column encodes dataset_authors_docs_seed
df[['dataset', 'authors', 'docs', 'seed']] = df.Dataset.str.split('_', expand=True)
df = df.drop(columns='Dataset')

# average (test and validation) micro-F1 per dataset, #authors, #docs, and method
pv = df.pivot_table(index=['dataset', 'authors', 'docs', 'Method'], values=['microF1', 'val_microF1'])
print(pv)
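
The script assumes each results_*.csv is tab-separated and that its Dataset column encodes dataset_authors_docs_seed. A toy illustration of the split and the pivot (rows, method names, and scores are placeholders, not real results):

import pandas as pd

toy = pd.DataFrame({
    'Dataset': ['enron_10_500_0', 'enron_10_500_0'],
    'Method': ['attr_alpha1.0', 'attr_alpha0.9'],
    'microF1': [0.5, 0.6],
    'val_microF1': [0.5, 0.6],
})
toy[['dataset', 'authors', 'docs', 'seed']] = toy.Dataset.str.split('_', expand=True)
toy = toy.drop(columns='Dataset')
print(toy.pivot_table(index=['dataset', 'authors', 'docs', 'Method'],
                      values=['microF1', 'val_microF1']))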