fixed imports
This commit is contained in:
parent
ce4e32aad2
commit
20c76a2103
|
@ -1,12 +1,11 @@
|
|||
from argparse import ArgumentParser
|
||||
|
||||
from data.dataset_builder import MultilingualDataset
|
||||
from funnelling import *
|
||||
from util.common import MultilingualIndex, get_params, get_method_name
|
||||
from util.evaluation import evaluate
|
||||
from util.results_csv import CSVlog
|
||||
from view_generators import *
|
||||
from time import time
|
||||
from src.data.dataset_builder import MultilingualDataset
|
||||
from src.funnelling import *
|
||||
from src.util.common import MultilingualIndex, get_params, get_method_name
|
||||
from src.util.evaluation import evaluate
|
||||
from src.util.results_csv import CSVlog
|
||||
from src.view_generators import *
|
||||
|
||||
|
||||
def main(args):
|
||||
|
@ -60,18 +59,17 @@ def main(args):
|
|||
|
||||
# Training ---------------------------------------
|
||||
print('\n[Training Generalized Funnelling]')
|
||||
time_init = time()
|
||||
time_tr = time()
|
||||
time_init = time.time()
|
||||
gfun.fit(lX, ly)
|
||||
time_tr = round(time() - time_tr, 3)
|
||||
time_tr = round(time.time() - time_init, 3)
|
||||
print(f'Training completed in {time_tr} seconds!')
|
||||
|
||||
# Testing ----------------------------------------
|
||||
print('\n[Testing Generalized Funnelling]')
|
||||
time_te = time()
|
||||
time_te = time.time()
|
||||
ly_ = gfun.predict(lXte)
|
||||
l_eval = evaluate(ly_true=lyte, ly_pred=ly_)
|
||||
time_te = round(time() - time_te, 3)
|
||||
time_te = round(time.time() - time_te, 3)
|
||||
print(f'Testing completed in {time_te} seconds!')
|
||||
|
||||
# Logging ---------------------------------------
|
||||
|
@ -101,7 +99,7 @@ def main(args):
|
|||
notes='')
|
||||
print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3))
|
||||
|
||||
overall_time = round(time() - time_init, 3)
|
||||
overall_time = round(time.time() - time_init, 3)
|
||||
exit(f'\nExecuted in: {overall_time} seconds!')
|
||||
|
||||
|
||||
|
@ -112,7 +110,7 @@ if __name__ == '__main__':
|
|||
|
||||
parser.add_argument('-o', '--output', dest='csv_dir',
|
||||
help='Result file (default ../csv_log/gfun_results.csv)', type=str,
|
||||
default='csv_logs/gfun/gfun_results.csv')
|
||||
default='../csv_logs/gfun/gfun_results.csv')
|
||||
|
||||
parser.add_argument('-x', '--post_embedder', dest='post_embedder', action='store_true',
|
||||
help='deploy posterior probabilities embedder to compute document embeddings',
|
||||
|
@ -138,7 +136,7 @@ if __name__ == '__main__':
|
|||
help='Optimize SVMs C hyperparameter',
|
||||
default=False)
|
||||
|
||||
parser.add_argument('-n', '--nepochs', dest='nepochs', type=str,
|
||||
parser.add_argument('-n', '--nepochs', dest='nepochs', type=int,
|
||||
help='Number of max epochs to train Recurrent embedder (i.e., -g)')
|
||||
|
||||
parser.add_argument('-j', '--n_jobs', dest='n_jobs', type=int,
|
|
@ -135,15 +135,15 @@ class RecurrentDataModule(pl.LightningDataModule):
|
|||
lPad_index=self.multilingualIndex.l_pad())
|
||||
|
||||
def train_dataloader(self):
|
||||
return DataLoader(self.training_dataset, batch_size=self.batchsize, num_workers=self.n_jobs,
|
||||
return DataLoader(self.training_dataset, batch_size=self.batchsize, num_workers=N_WORKERS,
|
||||
collate_fn=self.training_dataset.collate_fn)
|
||||
|
||||
def val_dataloader(self):
|
||||
return DataLoader(self.val_dataset, batch_size=self.batchsize, num_workers=self.n_jobs,
|
||||
return DataLoader(self.val_dataset, batch_size=self.batchsize, num_workers=N_WORKERS,
|
||||
collate_fn=self.val_dataset.collate_fn)
|
||||
|
||||
def test_dataloader(self):
|
||||
return DataLoader(self.test_dataset, batch_size=self.batchsize, num_workers=self.n_jobs,
|
||||
return DataLoader(self.test_dataset, batch_size=self.batchsize, num_workers=N_WORKERS,
|
||||
collate_fn=self.test_dataset.collate_fn)
|
||||
|
||||
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
import itertools
|
||||
import pickle
|
||||
import re
|
||||
from os.path import exists
|
||||
|
||||
|
@ -12,10 +11,10 @@ from sklearn.model_selection import train_test_split
|
|||
from sklearn.preprocessing import MultiLabelBinarizer
|
||||
from tqdm import tqdm
|
||||
|
||||
from data.languages import NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING
|
||||
from data.reader.jrcacquis_reader import *
|
||||
from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2
|
||||
from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents
|
||||
from src.data.languages import NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING
|
||||
from src.data.reader.jrcacquis_reader import *
|
||||
from src.data.reader.rcv_reader import fetch_RCV1, fetch_RCV2
|
||||
from src.data.text_preprocessor import NLTKStemTokenizer, preprocess_documents
|
||||
|
||||
|
||||
class MultilingualDataset:
|
||||
|
|
|
@ -14,9 +14,9 @@ import rdflib
|
|||
from rdflib.namespace import RDF, SKOS
|
||||
from sklearn.datasets import get_data_home
|
||||
|
||||
from data.languages import JRC_LANGS
|
||||
from data.languages import lang_set
|
||||
from util.file import download_file, list_dirs, list_files
|
||||
from src.data.languages import JRC_LANGS
|
||||
from src.data.languages import lang_set
|
||||
from src.util.file import download_file, list_dirs, list_files
|
||||
|
||||
"""
|
||||
JRC Acquis' Nomenclature:
|
||||
|
|
|
@ -5,8 +5,8 @@ from zipfile import ZipFile
|
|||
|
||||
import numpy as np
|
||||
|
||||
from util.file import download_file_if_not_exists
|
||||
from util.file import list_files
|
||||
from src.util.file import download_file_if_not_exists
|
||||
from src.util.file import list_files
|
||||
|
||||
"""
|
||||
RCV2's Nomenclature:
|
||||
|
|
|
@ -11,7 +11,6 @@ from os.path import join
|
|||
from xml.sax.saxutils import escape
|
||||
|
||||
import numpy as np
|
||||
|
||||
from util.file import list_dirs, list_files
|
||||
|
||||
policies = ["IN_ALL_LANGS", "IN_ANY_LANG"]
|
||||
|
|
|
@ -2,7 +2,7 @@ from nltk import word_tokenize
|
|||
from nltk.corpus import stopwords
|
||||
from nltk.stem import SnowballStemmer
|
||||
|
||||
from data.languages import NLTK_LANGMAP
|
||||
from src.data.languages import NLTK_LANGMAP
|
||||
|
||||
|
||||
def preprocess_documents(documents, lang):
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from models.learners import *
|
||||
from util.common import _normalize
|
||||
from view_generators import VanillaFunGen
|
||||
from src.models.learners import *
|
||||
from src.util.common import _normalize
|
||||
from src.view_generators import VanillaFunGen
|
||||
|
||||
|
||||
class DocEmbedderList:
|
||||
|
|
|
@ -7,7 +7,7 @@ from sklearn.model_selection import GridSearchCV
|
|||
from sklearn.multiclass import OneVsRestClassifier
|
||||
from sklearn.svm import SVC
|
||||
|
||||
from util.standardizer import StandardizeTransformer
|
||||
from src.util.standardizer import StandardizeTransformer
|
||||
|
||||
|
||||
def get_learner(calibrate=False, kernel='linear', C=1):
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
#taken from https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM.py
|
||||
from torch.autograd import Variable
|
||||
|
||||
from models.helpers import *
|
||||
from torch.autograd import Variable
|
||||
|
||||
|
||||
class RNNMultilingualClassifier(nn.Module):
|
||||
|
|
|
@ -3,8 +3,8 @@ import torch
|
|||
from torch.optim.lr_scheduler import StepLR
|
||||
from transformers import BertForSequenceClassification, AdamW
|
||||
|
||||
from util.common import define_pad_length, pad
|
||||
from util.pl_metrics import CustomF1, CustomK
|
||||
from src.util.common import define_pad_length, pad
|
||||
from src.util.pl_metrics import CustomF1, CustomK
|
||||
|
||||
|
||||
class BertModel(pl.LightningModule):
|
||||
|
|
|
@ -7,9 +7,9 @@ from torch.autograd import Variable
|
|||
from torch.optim.lr_scheduler import StepLR
|
||||
from transformers import AdamW
|
||||
|
||||
from models.helpers import init_embeddings
|
||||
from util.common import define_pad_length, pad
|
||||
from util.pl_metrics import CustomF1, CustomK
|
||||
from src.models.helpers import init_embeddings
|
||||
from src.util.common import define_pad_length, pad
|
||||
from src.util.pl_metrics import CustomF1, CustomK
|
||||
|
||||
|
||||
class RecurrentModel(pl.LightningModule):
|
||||
|
|
|
@ -4,7 +4,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
|
|||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import normalize
|
||||
|
||||
from util.embeddings_manager import supervised_embeddings_tfidf
|
||||
from src.util.embeddings_manager import supervised_embeddings_tfidf
|
||||
|
||||
|
||||
class TfidfVectorizerMultilingual:
|
||||
|
|
|
@ -4,7 +4,7 @@ import numpy as np
|
|||
import torch
|
||||
from torchtext.vocab import Vectors
|
||||
|
||||
from util.SIF_embed import remove_pc
|
||||
from src.util.SIF_embed import remove_pc
|
||||
|
||||
|
||||
class PretrainedEmbeddings(ABC):
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
import numpy as np
|
||||
from joblib import Parallel, delayed
|
||||
|
||||
from util.metrics import *
|
||||
from src.util.metrics import *
|
||||
|
||||
|
||||
def evaluation_metrics(y, y_):
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import torch
|
||||
from pytorch_lightning.metrics import Metric
|
||||
|
||||
from util.common import is_false, is_true
|
||||
from src.util.common import is_false, is_true
|
||||
|
||||
|
||||
def _update(pred, target, device):
|
||||
|
|
|
@ -21,12 +21,12 @@ from time import time
|
|||
from pytorch_lightning import Trainer
|
||||
from pytorch_lightning.loggers import TensorBoardLogger
|
||||
|
||||
from data.datamodule import RecurrentDataModule, BertDataModule, tokenize
|
||||
from models.learners import *
|
||||
from models.pl_bert import BertModel
|
||||
from models.pl_gru import RecurrentModel
|
||||
from util.common import TfidfVectorizerMultilingual, _normalize
|
||||
from util.embeddings_manager import MuseLoader, XdotM, wce_matrix
|
||||
from src.data.datamodule import RecurrentDataModule, BertDataModule, tokenize
|
||||
from src.models.learners import *
|
||||
from src.models.pl_bert import BertModel
|
||||
from src.models.pl_gru import RecurrentModel
|
||||
from src.util.common import TfidfVectorizerMultilingual, _normalize
|
||||
from src.util.embeddings_manager import MuseLoader, XdotM, wce_matrix
|
||||
|
||||
|
||||
class ViewGen(ABC):
|
||||
|
@ -232,7 +232,7 @@ class RecurrentGen(ViewGen):
|
|||
self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1)
|
||||
self.multilingualIndex.embedding_matrices(self.pretrained, supervised=self.wce)
|
||||
self.model = self._init_model()
|
||||
self.logger = TensorBoardLogger(save_dir='tb_logs', name='rnn', default_hp_metric=False)
|
||||
self.logger = TensorBoardLogger(save_dir='../tb_logs', name='rnn', default_hp_metric=False)
|
||||
# self.logger = CSVLogger(save_dir='csv_logs', name='rnn_dev')
|
||||
|
||||
def _init_model(self):
|
||||
|
@ -293,9 +293,9 @@ class RecurrentGen(ViewGen):
|
|||
data = self.multilingualIndex.l_devel_index()
|
||||
self.model.to('cuda' if self.gpus else 'cpu')
|
||||
self.model.eval()
|
||||
time_init = time()
|
||||
time_init = time.time()
|
||||
l_embeds = self.model.encode(data, l_pad, batch_size=256)
|
||||
transform_time = round(time() - time_init, 3)
|
||||
transform_time = round(time.time() - time_init, 3)
|
||||
print(f'Executed! Transform took: {transform_time}')
|
||||
return l_embeds
|
||||
|
||||
|
@ -328,7 +328,7 @@ class BertGen(ViewGen):
|
|||
self.n_jobs = n_jobs
|
||||
self.stored_path = stored_path
|
||||
self.model = self._init_model()
|
||||
self.logger = TensorBoardLogger(save_dir='tb_logs', name='bert', default_hp_metric=False)
|
||||
self.logger = TensorBoardLogger(save_dir='../tb_logs', name='bert', default_hp_metric=False)
|
||||
|
||||
def _init_model(self):
|
||||
output_size = self.multilingualIndex.get_target_dim()
|
||||
|
@ -362,14 +362,12 @@ class BertGen(ViewGen):
|
|||
data = tokenize(data, max_len=512)
|
||||
self.model.to('cuda' if self.gpus else 'cpu')
|
||||
self.model.eval()
|
||||
time_init = time()
|
||||
time_init = time.time()
|
||||
l_emebds = self.model.encode(data, batch_size=64)
|
||||
transform_time = round(time() - time_init, 3)
|
||||
transform_time = round(time.time() - time_init, 3)
|
||||
print(f'Executed! Transform took: {transform_time}')
|
||||
return l_emebds
|
||||
|
||||
def fit_transform(self, lX, ly):
|
||||
# we can assume that we have already indexed data for transform() since we are first calling fit()
|
||||
return self.fit(lX, ly).transform(lX)
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue