refactor and move the "additional layer" to the phi projection
parent a35f4e76df
commit 58f63586cd
@@ -0,0 +1,6 @@
+For now I have two sets of experiments:
+a) some better than Ruder's, where there is one more classification layer (i.e., there is phi(x) and then two layers)
+b) some "simplified" ones that are worse than Ruder's because I removed that additional layer
+I also saw that things improved with l2(phi(x)), so I have left it that way
+Now I am going to try adding that additional layer as the last step in phi(x) <-- running
+Then I want to try imposing the regularization on all the layers before classification...
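A minimal sketch of what this amounts to, assuming a generic encoder module and illustrative dimensions (none of the names below are taken from the repository): the former additional layer becomes the last step of phi(x), followed by L2 normalization.

import torch.nn as nn
import torch.nn.functional as F

class PhiWithExtraLayer(nn.Module):
    """Hypothetical sketch: the 'additional layer' is moved into phi(x)
    and the output is L2-normalized."""
    def __init__(self, encoder, enc_dim, hidden_dim, out_dim):
        super().__init__()
        self.encoder = encoder                                   # e.g. a CNN/RNN projection
        self.extra = nn.Sequential(nn.Linear(enc_dim, hidden_dim),
                                   nn.ReLU(),
                                   nn.Linear(hidden_dim, out_dim))

    def forward(self, x):
        h = self.encoder(x)        # document encoding
        h = self.extra(h)          # the layer moved into phi
        return F.normalize(h, p=2, dim=1)                        # l2(phi(x))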
@@ -1,2 +1,3 @@
-# kernel_authorship
+# A Kernel-Target Alignment regularization for Authorship Analysis
@@ -0,0 +1,30 @@
+Things to clarify:
+maybe I have to review the validation of the sav-loss; since it is batched, it might always be checking the same
+submatrices for alignment, and those may be mostly positive or mostly near an identity?
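One possible fix, sketched as a hypothetical sampler (not code from the repository): draw validation batches that mix a few authors with several documents each, so the checked submatrix of the ideal kernel contains both same-author and different-author pairs instead of being close to an identity.

import numpy as np

def sample_val_batch(y, batch_size, rng):
    # hypothetical sampler: a few authors, several documents each, so the
    # ideal-kernel submatrix mixes positive (same-author) and negative pairs
    y = np.asarray(y)
    labels = np.unique(y)
    authors = rng.choice(labels, size=min(4, len(labels)), replace=False)
    per_author = max(1, batch_size // len(authors))
    idx = np.concatenate([
        rng.choice(np.where(y == a)[0], size=min(per_author, (y == a).sum()), replace=False)
        for a in authors
    ])
    rng.shuffle(idx)
    return idx

# usage sketch: idx = sample_val_batch(y_val, 64, np.random.default_rng(0))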
+maybe the sav-loss is something that may make sense to impose, as a regularization, across many of the last layers, and
+not only the last one?
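A minimal sketch of that idea, assuming the intermediate representations are available as a list of (n, d) tensors (the function is illustrative, not part of the repository):

import torch

def multi_layer_sav_loss(layer_reprs, ideal_kernel):
    # average the Frobenius-distance alignment penalty over several of the
    # last layers' representations instead of only the final phi(x)
    total = 0.0
    for h in layer_reprs:                      # each h has shape (n, d_l)
        K = h @ h.T
        total = total + torch.norm(K - ideal_kernel, p='fro') / K.numel()
    return total / len(layer_reprs)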
+process datasets and leave the dataset as a generic parameter
+padding could start at any random point in [0, length_i - pad_length]
+- in training, pad to the shortest
+- in test, pad to the largest
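A small illustration of that scheme, assuming token sequences are plain Python lists (the helper is hypothetical; only pad_index appears in the repository code):

import random

def crop_or_pad(tokens, pad_length, pad_index):
    # longer documents: take a window starting at a random offset in
    # [0, len(tokens) - pad_length]; shorter documents: pad up to pad_length
    if len(tokens) > pad_length:
        start = random.randint(0, len(tokens) - pad_length)
        return tokens[start:start + pad_length]
    return tokens + [pad_index] * (pad_length - len(tokens))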
+save and restore checkpoints
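For reference, a standard PyTorch checkpointing sketch (paths and dictionary keys are illustrative, not taken from the repository):

import torch

def save_checkpoint(path, model, optimizer, epoch):
    torch.save({'model': model.state_dict(),
                'optim': optimizer.state_dict(),
                'epoch': epoch}, path)

def load_checkpoint(path, model, optimizer):
    ckpt = torch.load(path, map_location='cpu')
    model.load_state_dict(ckpt['model'])
    optimizer.load_state_dict(ckpt['optim'])
    return ckpt['epoch']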
+should the phi(x) be normalized? if so:
+- better as the last step of phi?
+- better outside phi, just prior to the Gram matrix computation?
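Both placements produce the same Gram matrix; the only difference is whether the attribution classifier also receives the normalized vectors. A toy check (names and shapes are illustrative):

import torch
import torch.nn.functional as F

phi_raw = torch.randn(8, 32)                 # un-normalized phi(x) for a toy batch

# option 1: L2 normalization as the last step of phi (classifier also sees z)
z = F.normalize(phi_raw, p=2, dim=1)
K1 = z @ z.T

# option 2: L2 normalization only before the Gram matrix (classifier sees phi_raw)
zn = F.normalize(phi_raw, p=2, dim=1)
K2 = zn @ zn.T

print(torch.allclose(K1, K2))                # True: the kernel is the same either way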
+should the single-label classifier have some sort of non-linearity from the phi(x) to the labels?
+SAV: how should the range of k(xi,xj) be interpreted? how to decide the value threshold for returning -1 or +1?
+I guess the best thing to do is to learn a simple threshold, one feed-forward 1-to-1
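A minimal sketch of that learned threshold (a hypothetical module, not part of the repository): a single 1-to-1 linear layer maps each kernel value to a same-author logit, and the sign of the logit gives the -1/+1 SAV decision.

import torch
import torch.nn as nn

class KernelThreshold(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(1, 1)          # one learnable scale and one bias

    def forward(self, k_values):               # k_values: kernel entries, shape (m,)
        return self.linear(k_values.unsqueeze(-1)).squeeze(-1)

# usage sketch: train the logits with nn.BCEWithLogitsLoss against 1 for
# same-author pairs and 0 otherwise; predict torch.sign(logits) at test time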
+is the TwoClassBatch the best way?
+are the contributions of the two losses comparable? or does one contribute far more than the other?
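One way to check, under the assumption that the two terms are combined as a convex combination weighted by --alpha (suggested by the `if alpha < 1:` guard and the --alpha sweep in the script, but not confirmed here): log both raw values periodically and compare their magnitudes.

def combine_and_log(loss_attr, loss_sav, alpha, step, log_every=50):
    # assumption: convex combination controlled by alpha; the printout shows
    # whether one term dominates the other
    if step % log_every == 0:
        print(f'step={step} attr={loss_attr.item():.4f} sav={loss_sav.item():.4f}')
    return alpha * loss_attr + (1 - alpha) * loss_sav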
+what is the best representation for inputs? char-based? ngram-based? word-based? or a multichannel one?
+I think this is irrelevant for the paper
+not clear whether the single-label classifier should work out a ff on top of the intermediate representation, or whether it
+should instead work directly on the representations with one simple linear projection; not clear either whether the kernel
+should be computed on any further elaboration of the intermediate representation... the thing is that <phi(xi),phi(xj)>
+imposes unimodality (documents from the same author should point in a single direction), while working out another
+representation for the single-label classifier could instead relax this and attribute to the same author vectors that
+come from a multimodal distribution. No... this "unimodality" should exist anyway in the last layer. Indeed, I am starting
+to think that the optimum for any classifier should already impose something similar to the KTA criterion in the
+last layer... Is this redundant?
+not clear whether we should define the loss as in "On kernel target alignment", i.e., with <K,Y>_F in the numerator (and
+change the sign to minimize), or as the ||K-Y||_F norm. What about the denominator (right now, the normalization factor is n**2)?
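A sketch of the two candidate definitions raised in the last point; the first matches the KernelAlignmentLoss that appears further down in this commit, while the second follows the alignment of Cristianini et al., whose own normalization makes the n**2 factor unnecessary:

import torch

def frobenius_distance_loss(K, Y):
    # current choice: ||K - Y||_F divided by the number of entries (n**2),
    # so the value does not grow with the batch size
    return torch.norm(K - Y, p='fro') / (K.shape[0] * K.shape[1])

def negative_alignment_loss(K, Y):
    # alternative: A(K, Y) = <K, Y>_F / (||K||_F ||Y||_F); the sign is flipped
    # so that minimizing the loss maximizes the alignment
    Y = Y.to(K.dtype)
    inner = (K * Y).sum()
    return -inner / (torch.norm(K, p='fro') * torch.norm(Y, p='fro'))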
@@ -0,0 +1,14 @@
+#!/bin/bash
+conda activate torch
+
+dataset=enron
+for authors in 10 50 ; do
+    for alpha in 1 0.999 0.99 0.9 0.5 ; do
+        python main.py --dataset $dataset -A $authors -s 0 -o ../results_$dataset.csv --alpha $alpha
+    done
+done
+
+dataset=imdb62
+for alpha in 1 0.999 0.99 0.9 0.5 ; do
+    python main.py --dataset $dataset -A -1 -s 0 -o ../results_$dataset.csv --alpha $alpha
+done
@@ -76,11 +76,11 @@ def main(opt):
        kernel_sizes=opt.kernelsizes,
        dropout=0.5
    ).to(device)
+    print(phi)

    cls = AuthorshipAttributionClassifier(
        phi, num_authors=A.size, pad_index=pad_index, pad_length=opt.pad, device=device
    )

    print(cls)

    if opt.name == 'auto':
@@ -7,6 +7,7 @@ import math
from sklearn.model_selection import train_test_split

from model.early_stop import EarlyStop
+from model.transformations import FFProjection


class AuthorshipAttributionClassifier(nn.Module):
@@ -55,8 +56,11 @@ class AuthorshipAttributionClassifier(nn.Module):
            loss_attr_value = loss_attr.item()

            if alpha < 1:
+                # todo: optimize (only upper diagonal)
                kernel = torch.matmul(phi, phi.T)
                ideal_kernel = torch.as_tensor(1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)).to(self.device)
+                # todo: maybe the KALoss should take into consideration the balance (it is more likely to have
+                # a pair of negative examples than positives)
                loss_sav = KernelAlignmentLoss(kernel, ideal_kernel)
                loss_sav_value = loss_sav.item()
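The ideal_kernel expression above is compact but cryptic: entry (i, j) of np.outer(1 + yi, 1 / (yi + 1)) is (1 + yi[i]) / (1 + yi[j]), which equals 1 exactly when yi[i] == yi[j], so the comparison yields the same-author indicator matrix. A tiny worked example:

import numpy as np

yi = np.array([0, 0, 1, 1])                        # author labels of a toy batch
ideal = 1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)
print(ideal)
# [[1 1 0 0]
#  [1 1 0 0]
#  [0 0 1 1]
#  [0 0 1 1]]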
@@ -254,26 +258,10 @@ class FullAuthorClassifier(nn.Module):
def KernelAlignmentLoss(K, Y):
    n_el = K.shape[0]*K.shape[1]
    loss = torch.norm(K - Y, p='fro')  # in Nello's paper this is different
    loss = loss / n_el  # this is in order to factor out the accumulation which is only due to the size
    return loss


-class FFProjection(nn.Module):
-    def __init__(self, input_size, hidden_sizes, output_size, activation=nn.functional.relu, dropout=0.5):
-        super(FFProjection, self).__init__()
-        sizes = [input_size] + hidden_sizes + [output_size]
-        self.ff = nn.ModuleList([
-            nn.Linear(sizes[i], sizes[i+1]) for i in range(len(sizes)-1)
-        ])
-        self.activation = activation
-        self.dropout = nn.Dropout(p=dropout)
-
-    def forward(self, x):
-        for linear in self.ff[:-1]:
-            x = self.dropout(self.activation(linear(x)))
-        x = self.ff[-1](x)
-        return x


class Batch:
    def __init__(self, batch_size, n_epochs=1, shuffle=True):
@@ -13,47 +13,61 @@ class CNNProjection(nn.Module):
        self.convs1 = nn.ModuleList(
            [nn.Conv2d(channels_in, channels_out, (K, embedding_dim)) for K in kernel_sizes]
        )
-        '''
-        self.conv13 = nn.Conv2d(Ci, Co, (3, D))
-        self.conv14 = nn.Conv2d(Ci, Co, (4, D))
-        self.conv15 = nn.Conv2d(Ci, Co, (5, D))
-        '''
        self.dropout = nn.Dropout(dropout)
-        self.fc1 = nn.Linear(len(kernel_sizes) * channels_out, out_size)
+        #self.fc1 = nn.Linear(len(kernel_sizes) * channels_out, out_size)
+        self.fc = FFProjection(input_size=len(kernel_sizes) * channels_out,
+                               hidden_sizes=[1024],
+                               output_size=out_size,
+                               activation=nn.functional.relu,
+                               dropout=dropout)
        self.output_size = out_size

+    def convolve(self, x):
+        x = x.unsqueeze(1)  # (N, Ci, W, D)
+        x = [self.conv_and_pool(x, conv) for conv in self.convs1]  # [(N, Co), ...]*len(Ks)
+        x = torch.cat(x, 1)
+        return x
+
    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)  # (N, Co, W)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

-    def forward(self, x):
-        x = self.embed(x)  # (N, W, D)
-        x = x.unsqueeze(1)  # (N, Ci, W, D)
-        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(N, Co, W), ...]*len(Ks)
-        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...]*len(Ks)
-        x = torch.cat(x, 1)
-
-        '''
-        x1 = self.conv_and_pool(x,self.conv13) #(N,Co)
-        x2 = self.conv_and_pool(x,self.conv14) #(N,Co)
-        x3 = self.conv_and_pool(x,self.conv15) #(N,Co)
-        x = torch.cat((x1, x2, x3), 1)  # (N,len(Ks)*Co)
-        '''
-
-        x = F.relu(self.fc1(x))  # (N, C)
-
+    def l2norm(self, x):
        norm = x.norm(p=2, dim=1, keepdim=True)
        x = x.div(norm.expand_as(x))
+        return x

-        x = self.dropout(x)  # (N, len(Ks)*Co)
+    def forward(self, x):
+        x = self.embed(x)  # (N, W, D)
+        x = self.convolve(x)  # (N, len(Ks)*Co)
+        x = self.fc(x)
+        #x = F.relu(self.fc1(x))  # (N, C)
+        # x = self.dropout(x)
+        x = self.l2norm(x)
        return x

    def space_dimensions(self):
        return self.output_size


+class FFProjection(nn.Module):
+    def __init__(self, input_size, hidden_sizes, output_size, activation=nn.functional.relu, dropout=0.5):
+        super(FFProjection, self).__init__()
+        sizes = [input_size] + hidden_sizes + [output_size]
+        self.ff = nn.ModuleList([
+            nn.Linear(sizes[i], sizes[i+1]) for i in range(len(sizes)-1)
+        ])
+        self.activation = activation
+        self.dropout = nn.Dropout(p=dropout)
+
+    def forward(self, x):
+        for linear in self.ff[:-1]:
+            x = self.dropout(self.activation(linear(x)))
+        x = self.ff[-1](x)
+        return x
+
+
class RNNProjection(nn.Module):
    def __init__(self, vocab_size, hidden_size, output_size, device='cpu'):
        super(RNNProjection, self).__init__()
@@ -0,0 +1,14 @@
+import pandas as pd
+from glob import glob
+
+filedir = '../../results_*.csv'
+
+df = [pd.read_csv(file, sep='\t') for file in glob(filedir)]
+df = pd.concat(df)
+
+df[['dataset','authors','docs','seed']] = df.Dataset.str.split('_', expand=True)
+df = df.drop(columns='Dataset')
+
+pv = df.pivot_table(index=['dataset','authors','docs','Method'], values=['microF1','val_microF1'])
+
+print(pv)