refactor and moving the "additional layer" to the phi projection

Alejandro Moreo Fernandez 2020-07-07 16:41:05 +02:00
parent a35f4e76df
commit 58f63586cd
8 changed files with 110 additions and 43 deletions

Notes.txt Normal file

@@ -0,0 +1,6 @@
For now I have two sets of experiments:
a) one better than Ruder's, in which there is one more classification layer (that is, there is phi(x) and then two layers)
b) a "simplified" one, worse than Ruder's, because I removed that additional layer
I also saw that results improved with l2(phi(x)), so I have left it that way.
Now I am going to try adding that additional layer as the last step of phi(x) <-- running
Then I want to try imposing the regularization on all the layers before the classification...
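
A minimal sketch of the variant now being executed (the additional layer moved inside phi, followed by L2 normalization); the base encoder and the layer sizes are placeholders, not the actual configuration:

import torch.nn as nn
import torch.nn.functional as F

class PhiWithExtraLayer(nn.Module):
    # hypothetical wrapper: base_encoder stands for the existing phi(x) projection
    def __init__(self, base_encoder, in_size=256, out_size=128):
        super().__init__()
        self.base = base_encoder
        self.extra = nn.Linear(in_size, out_size)  # the "additional layer", now the last step of phi

    def forward(self, x):
        x = F.relu(self.extra(self.base(x)))
        return F.normalize(x, p=2, dim=1)  # l2(phi(x))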


@@ -1,2 +1,3 @@
# kernel_authorship
# A Kernel-Target Alignment regularization for Authorship Analysis

TODO.txt Normal file

@@ -0,0 +1,30 @@
Things to clarify:

maybe I have to review the validation of the sav-loss; since it is batched, it might always be checking the same
    submatrices for alignment, and those may be mostly positive or mostly near an identity?

maybe the sav-loss is something that would make sense to impose, as a regularization, across many of the last layers,
    and not only on the last one?

process the datasets and leave the dataset as a generic parameter

padding could start at any random point in [0, length_i - pad_length] (see the first sketch after this list)
    - in training, pad to the shortest
    - in test, pad to the largest

save and restore checkpoints

should phi(x) be normalized? if so:
    - better as the last step of phi?
    - better outside phi, just before the Gram matrix computation?

should the single-label classifier have some sort of non-linearity from phi(x) to the labels?

SAV: how should the range of k(xi,xj) be interpreted? how to decide the value threshold for returning -1 or +1?
    I guess the best thing to do is to learn a simple threshold, one feed-forward 1-to-1 (see the second sketch after this list)

is the TwoClassBatch the best way?

are the contributions of the two losses comparable? or does one contribute far more than the other?

what is the best representation for inputs? char-based? ngram-based? word-based? or a multichannel one?
    I think this is irrelevant for the paper

not clear whether the single-label classifier should work out a feed-forward network on top of the intermediate
    representation, or should instead work directly on the representations with one simple linear projection; not clear
    either whether the kernel should be computed on any further elaboration of the intermediate representation... the
    thing is that <phi(xi),phi(xj)> imposes unimodality (documents from the same author should point in a single
    direction), while working out another representation for the single-label classifier could instead relax this and
    attribute to the same author vectors that come from a multimodal distribution. No... this "unimodality" should
    exist anyway in the last layer. Indeed, I am starting to think that the optimum of any classifier should already
    impose something similar to the KTA criterion in the last layer... Is this redundant?

not clear whether we should define the loss as in "On kernel target alignment", i.e., with <K,Y>_F in the numerator
    (and the sign changed in order to minimize), or as the norm ||K-Y||_F. What about the denominator (right now, the
    normalization factor is n**2)?
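
A sketch for the random-start padding item above; doc is assumed to be a list of token ids and pad_index the padding id (names are illustrative, not the repository's API):

import random

def crop_or_pad(doc, pad_length, pad_index):
    # hypothetical helper: sample a random window of pad_length tokens when the document
    # is longer than pad_length, otherwise pad on the right up to pad_length
    if len(doc) > pad_length:
        start = random.randint(0, len(doc) - pad_length)  # any start in [0, length_i - pad_length]
        return doc[start:start + pad_length]
    return doc + [pad_index] * (pad_length - len(doc))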
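
And a sketch of the "learn a simple threshold, one feed-forward 1-to-1" option for the SAV item (again illustrative names only):

import torch
import torch.nn as nn

class SAVThreshold(nn.Module):
    # a single 1-to-1 feed-forward layer: it learns a scale and a bias (the threshold)
    def __init__(self):
        super().__init__()
        self.ff = nn.Linear(1, 1)

    def forward(self, k_values):
        # k_values: kernel values k(xi, xj) of shape (n,); returns one logit per pair
        return self.ff(k_values.unsqueeze(1)).squeeze(1)

    def predict(self, k_values):
        # +1 when the learned threshold is exceeded, -1 otherwise
        logits = self.forward(k_values)
        return torch.where(logits > 0, torch.ones_like(logits), -torch.ones_like(logits))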

experiments.sh Normal file

@@ -0,0 +1,14 @@
#!/bin/bash
# note: in a non-interactive script, 'conda activate' requires conda to be initialized first,
# e.g. via: source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate torch

dataset=enron
for authors in 10 50 ; do
  for alpha in 1 0.999 0.99 0.9 0.5 ; do
    python main.py --dataset $dataset -A $authors -s 0 -o ../results_$dataset.csv --alpha $alpha
  done
done

dataset=imdb62
for alpha in 1 0.999 0.99 0.9 0.5 ; do
  python main.py --dataset $dataset -A -1 -s 0 -o ../results_$dataset.csv --alpha $alpha
done


@@ -76,11 +76,11 @@ def main(opt):
        kernel_sizes=opt.kernelsizes,
        dropout=0.5
    ).to(device)
    print(phi)

    cls = AuthorshipAttributionClassifier(
        phi, num_authors=A.size, pad_index=pad_index, pad_length=opt.pad, device=device
    )
    print(cls)

    if opt.name == 'auto':


@@ -7,6 +7,7 @@ import math
from sklearn.model_selection import train_test_split
from model.early_stop import EarlyStop
from model.transformations import FFProjection


class AuthorshipAttributionClassifier(nn.Module):
@@ -55,8 +56,11 @@ class AuthorshipAttributionClassifier(nn.Module):
                loss_attr_value = loss_attr.item()

                if alpha < 1:
                    # todo: optimize (only upper diagonal)
                    kernel = torch.matmul(phi, phi.T)
                    ideal_kernel = torch.as_tensor(1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)).to(self.device)
                    # todo: maybe the KALoss should take into consideration the balance (it is more likely to have
                    #  a pair of negative examples than positives)
                    loss_sav = KernelAlignmentLoss(kernel, ideal_kernel)
                    loss_sav_value = loss_sav.item()
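
The np.outer expression above builds the ideal kernel without an explicit double loop: entry (i, j) is (1+yi)/(1+yj), which equals 1 exactly when yi == yj. A standalone illustration (note that exact floating-point equality can fail for some label values, e.g. 49 * (1 / 49) != 1 in double precision, so the explicit integer comparison in the assert is the safer equivalent):

import numpy as np

yi = np.array([0, 0, 1, 3])
ideal = 1 * (np.outer(1 + yi, 1 / (yi + 1)) == 1)
assert (ideal == np.equal.outer(yi, yi).astype(int)).all()
print(ideal)
# [[1 1 0 0]
#  [1 1 0 0]
#  [0 0 1 0]
#  [0 0 0 1]]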
@@ -254,26 +258,10 @@ class FullAuthorClassifier(nn.Module):


def KernelAlignmentLoss(K, Y):
    n_el = K.shape[0] * K.shape[1]
    loss = torch.norm(K - Y, p='fro')  # in Nello's paper this is different
    loss = loss / n_el  # factor out the accumulation that is only due to the matrix size
    return loss
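
Regarding the "in Nello's paper this is different" comment and the corresponding TODO item: a sketch of the two candidate formulations, the Frobenius distance used above and the kernel-target alignment A(K, Y) = <K, Y>_F / (||K||_F ||Y||_F) from "On kernel target alignment", with the sign flipped so that minimizing the loss maximizes the alignment. This only illustrates the alternative, it is not the loss used by the repository:

import torch

def frobenius_loss(K, Y):
    # current choice: distance between the kernel and the ideal kernel, scaled by n**2
    return torch.norm(K - Y, p='fro') / (K.shape[0] * K.shape[1])

def alignment_loss(K, Y):
    # A(K, Y) = <K, Y>_F / (||K||_F * ||Y||_F); it lies in [-1, 1] in general and in
    # [0, 1] when both matrices are positive semi-definite (Cauchy-Schwarz), so no
    # extra normalization by n**2 is needed
    Y = Y.to(dtype=K.dtype)
    alignment = (K * Y).sum() / (torch.norm(K, p='fro') * torch.norm(Y, p='fro'))
    return -alignment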
class FFProjection(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size, activation=nn.functional.relu, dropout=0.5):
        super(FFProjection, self).__init__()
        sizes = [input_size] + hidden_sizes + [output_size]
        self.ff = nn.ModuleList([
            nn.Linear(sizes[i], sizes[i+1]) for i in range(len(sizes)-1)
        ])
        self.activation = activation
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        for linear in self.ff[:-1]:
            x = self.dropout(self.activation(linear(x)))
        x = self.ff[-1](x)
        return x


class Batch:
    def __init__(self, batch_size, n_epochs=1, shuffle=True):


@@ -13,47 +13,61 @@ class CNNProjection(nn.Module):
        self.convs1 = nn.ModuleList(
            [nn.Conv2d(channels_in, channels_out, (K, embedding_dim)) for K in kernel_sizes]
        )
        '''
        self.conv13 = nn.Conv2d(Ci, Co, (3, D))
        self.conv14 = nn.Conv2d(Ci, Co, (4, D))
        self.conv15 = nn.Conv2d(Ci, Co, (5, D))
        '''
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(len(kernel_sizes) * channels_out, out_size)
        #self.fc1 = nn.Linear(len(kernel_sizes) * channels_out, out_size)
        self.fc = FFProjection(input_size=len(kernel_sizes) * channels_out,
                               hidden_sizes=[1024],
                               output_size=out_size,
                               activation=nn.functional.relu,
                               dropout=dropout)
        self.output_size = out_size

    def convolve(self, x):
        x = x.unsqueeze(1)  # (N, Ci, W, D)
        x = [self.conv_and_pool(x, conv) for conv in self.convs1]  # [(N, Co), ...]*len(Ks)
        x = torch.cat(x, 1)
        return x

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)  # (N, Co, W)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x

    def forward(self, x):
        x = self.embed(x)  # (N, W, D)
        x = x.unsqueeze(1)  # (N, Ci, W, D)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(N, Co, W), ...]*len(Ks)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...]*len(Ks)
        x = torch.cat(x, 1)
        '''
        x1 = self.conv_and_pool(x,self.conv13)  # (N, Co)
        x2 = self.conv_and_pool(x,self.conv14)  # (N, Co)
        x3 = self.conv_and_pool(x,self.conv15)  # (N, Co)
        x = torch.cat((x1, x2, x3), 1)  # (N, len(Ks)*Co)
        '''
        x = F.relu(self.fc1(x))  # (N, C)

    def l2norm(self, x):
        norm = x.norm(p=2, dim=1, keepdim=True)
        x = x.div(norm.expand_as(x))
        return x

        x = self.dropout(x)  # (N, len(Ks)*Co)

    def forward(self, x):
        x = self.embed(x)  # (N, W, D)
        x = self.convolve(x)  # (N, len(Ks)*Co)
        x = self.fc(x)
        #x = F.relu(self.fc1(x))  # (N, C)
        # x = self.dropout(x)
        x = self.l2norm(x)
        return x

    def space_dimensions(self):
        return self.output_size
class FFProjection(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size, activation=nn.functional.relu, dropout=0.5):
        super(FFProjection, self).__init__()
        sizes = [input_size] + hidden_sizes + [output_size]
        self.ff = nn.ModuleList([
            nn.Linear(sizes[i], sizes[i+1]) for i in range(len(sizes)-1)
        ])
        self.activation = activation
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        for linear in self.ff[:-1]:
            x = self.dropout(self.activation(linear(x)))
        x = self.ff[-1](x)
        return x
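
A quick usage note for the relocated FFProjection: with hidden_sizes=[1024] it adds exactly one hidden layer between its input and output, which is where the "additional layer" of this commit now lives. The sizes below are illustrative only:

import torch

# FFProjection as defined above; e.g. 3 kernel sizes x 100 output channels flattened
# into 300 features, projected through one 1024-unit hidden layer into a 128-dim phi space
proj = FFProjection(input_size=300, hidden_sizes=[1024], output_size=128, dropout=0.5)
x = torch.randn(8, 300)   # a batch of 8 feature vectors
print(proj(x).shape)      # torch.Size([8, 128])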
class RNNProjection(nn.Module):
    def __init__(self, vocab_size, hidden_size, output_size, device='cpu'):
        super(RNNProjection, self).__init__()

src/tools/gen_tables.py Normal file

@@ -0,0 +1,14 @@
import pandas as pd
from glob import glob

# collect all per-dataset result files and concatenate them
filedir = '../../results_*.csv'
df = [pd.read_csv(file, sep='\t') for file in glob(filedir)]
df = pd.concat(df)

# the Dataset column encodes dataset_authors_docs_seed
df[['dataset', 'authors', 'docs', 'seed']] = df.Dataset.str.split('_', expand=True)
df = df.drop(columns='Dataset')

# average (test and validation) micro-F1 per dataset, #authors, #docs, and method
pv = df.pivot_table(index=['dataset', 'authors', 'docs', 'Method'], values=['microF1', 'val_microF1'])
print(pv)
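
The script assumes each results_*.csv is tab-separated and that its Dataset column encodes dataset_authors_docs_seed. A toy illustration of the split and the pivot (rows, method names, and scores are placeholders, not real results):

import pandas as pd

toy = pd.DataFrame({
    'Dataset': ['enron_10_500_0', 'enron_10_500_0'],
    'Method': ['attr_alpha1.0', 'attr_alpha0.9'],
    'microF1': [0.5, 0.6],
    'val_microF1': [0.5, 0.6],
})
toy[['dataset', 'authors', 'docs', 'seed']] = toy.Dataset.str.split('_', expand=True)
toy = toy.drop(columns='Dataset')
print(toy.pivot_table(index=['dataset', 'authors', 'docs', 'Method'],
                      values=['microF1', 'val_microF1']))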