Parser + fixed bert pad token id

parent 108f423d41
commit 90e974f0a3
@@ -147,7 +147,6 @@ def tokenize(l_raw, max_len):
     :param max_len:
     :return:
     """
-    # TODO: check BertTokenizerFast https://huggingface.co/transformers/model_doc/bert.html#berttokenizerfast
     tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
     l_tokenized = {}
     for lang in l_raw.keys():
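Note: the dropped TODO pointed at BertTokenizerFast. Should that switch ever be made, a minimal sketch would look as follows (an assumption, not part of this commit; it presumes downstream code consumes plain input_ids lists):

    from transformers import BertTokenizerFast

    # Rust-backed drop-in for the slow tokenizer: same checkpoint, same call API,
    # typically much faster on large corpora.
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')
    encoded = tokenizer(['un esempio', 'un exemple'], truncation=True, max_length=512)
    print(encoded['input_ids'])  # per-document id lists: [CLS] ... [SEP]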
refactor/main.py (119 changed lines)
@@ -2,60 +2,57 @@ from argparse import ArgumentParser
 from funnelling import *
 from view_generators import *
 from data.dataset_builder import MultilingualDataset
-from util.common import MultilingualIndex, get_params
+from util.common import MultilingualIndex, get_params, get_method_name
 from util.evaluation import evaluate
 from util.results_csv import CSVlog
 from time import time


 def main(args):
-    OPTIMC = False  # TODO
-    N_JOBS = 8
-    print('Running refactored...')
+    assert args.post_embedder or args.muse_embedder or args.wce_embedder or args.gru_embedder or args.bert_embedder, \
+        'empty set of document embeddings is not allowed!'

-    # _DATASET = '/homenfs/a.pedrotti1/datasets/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle'
-    # EMBEDDINGS_PATH = '/homenfs/a.pedrotti1/embeddings/MUSE'
+    print('Running generalized funnelling...')

-    _DATASET = '/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle'
-    EMBEDDINGS_PATH = '/home/andreapdr/gfun/embeddings'
-    data = MultilingualDataset.load(_DATASET)
+    data = MultilingualDataset.load(args.dataset)
     data.set_view(languages=['it', 'fr'])
     data.show_dimensions()
     lX, ly = data.training()
     lXte, lyte = data.test()

     # Init multilingualIndex - mandatory when deploying Neural View Generators...
-    multilingualIndex = MultilingualIndex()
-    lMuse = MuseLoader(langs=sorted(lX.keys()), cache=EMBEDDINGS_PATH)
-    multilingualIndex.index(lX, ly, lXte, lyte, l_pretrained_vocabulary=lMuse.vocabulary())
+    if args.gru_embedder or args.bert_embedder:
+        multilingualIndex = MultilingualIndex()
+        lMuse = MuseLoader(langs=sorted(lX.keys()), cache=args.muse_dir)
+        multilingualIndex.index(lX, ly, lXte, lyte, l_pretrained_vocabulary=lMuse.vocabulary())

     embedder_list = []
-    if args.X:
-        posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=N_JOBS)
+    if args.post_embedder:
+        posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=args.n_jobs)
         embedder_list.append(posteriorEmbedder)

-    if args.M:
-        museEmbedder = MuseGen(muse_dir=EMBEDDINGS_PATH, n_jobs=N_JOBS)
+    if args.muse_embedder:
+        museEmbedder = MuseGen(muse_dir=args.muse_dir, n_jobs=args.n_jobs)
         embedder_list.append(museEmbedder)

-    if args.W:
-        wceEmbedder = WordClassGen(n_jobs=N_JOBS)
+    if args.wce_embedder:
+        wceEmbedder = WordClassGen(n_jobs=args.n_jobs)
         embedder_list.append(wceEmbedder)

-    if args.G:
-        rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, batch_size=256,
-                                   nepochs=250, gpus=args.gpus, n_jobs=N_JOBS)
+    if args.gru_embedder:
+        rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=args.gru_wce, batch_size=256,
+                                   nepochs=args.nepochs, gpus=args.gpus, n_jobs=args.n_jobs)
         embedder_list.append(rnnEmbedder)

-    if args.B:
-        bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=1, gpus=args.gpus, n_jobs=N_JOBS)
+    if args.bert_embedder:
+        bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=10, gpus=args.gpus, n_jobs=args.n_jobs)
+        bertEmbedder.transform(lX)
         embedder_list.append(bertEmbedder)

-    # Init DocEmbedderList
+    # Init DocEmbedderList (i.e., first-tier learners or view generators) and metaclassifier
     docEmbedders = DocEmbedderList(embedder_list=embedder_list, probabilistic=True)
-    meta_parameters = None if not OPTIMC else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
     meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'),
-                          meta_parameters=get_params(optimc=OPTIMC))
+                          meta_parameters=get_params(optimc=args.optimc))

     # Init Funnelling Architecture
     gfun = Funnelling(first_tier=docEmbedders, meta_classifier=meta)
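The diff jumps from the architecture setup to the logging block below; the elided lines evidently train the funnelling architecture and evaluate it, producing the time_tr, time_te and l_eval used next. A hypothetical reconstruction of that gap (every name beyond those three is a guess):

    time_init = time()
    gfun.fit(lX, ly)                                  # train view generators + metaclassifier
    time_tr = round(time() - time_init, 5)
    ly_pred = gfun.predict(lXte)
    l_eval = evaluate(ly_true=lyte, ly_pred=ly_pred)  # per-language (MF1, mF1, MK, mK)
    time_te = round(time() - time_init - time_tr, 5)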
@@ -78,18 +75,21 @@ def main(args):

     # Logging ---------------------------------------
     print('\n[Results]')
-    results = CSVlog('test_log.csv')
+    results = CSVlog(args.csv_dir)
     metrics = []
     for lang in lXte.keys():
         macrof1, microf1, macrok, microk = l_eval[lang]
         metrics.append([macrof1, microf1, macrok, microk])
         print(f'Lang {lang}: macro-F1 = {macrof1:.3f} micro-F1 = {microf1:.3f}')
+        if results is not None:
+            _id, _dataset = get_method_name(args)
         results.add_row(method='gfun',
-                        setting='TODO',
+                        setting=_id,
+                        optimc=args.optimc,
                         sif='True',
                         zscore='True',
                         l2='True',
-                        dataset='TODO',
+                        dataset=_dataset,
                         time_tr=time_tr,
                         time_te=time_te,
                         lang=lang,
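The hunk is truncated mid add_row call; the metrics list accumulated above is presumably averaged after the loop. A sketch of that common wrap-up (not shown in this commit, purely illustrative):

    import numpy as np

    # column-wise mean over languages of [macroF1, microF1, macroK, microK]
    averages = np.mean(np.array(metrics), axis=0)
    print('Averages: MF1, mF1, MK, mK', averages)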
@@ -105,12 +105,63 @@ def main(args):


 if __name__ == '__main__':
-    parser = ArgumentParser()
-    parser.add_argument('--X')
-    parser.add_argument('--M')
-    parser.add_argument('--W')
-    parser.add_argument('--G')
-    parser.add_argument('--B')
-    parser.add_argument('--gpus', default=None)
+    parser = ArgumentParser(description='Run generalized funnelling, A. Moreo, A. Pedrotti and F. Sebastiani')
+
+    parser.add_argument('dataset', help='Path to the dataset')
+
+    parser.add_argument('-o', '--output', dest='csv_dir',
+                        help='result file (default csv_logs/gfun/gfun_results.csv)', type=str,
+                        default='csv_logs/gfun/gfun_results.csv')
+
+    parser.add_argument('-x', '--post_embedder', dest='post_embedder', action='store_true',
+                        help='deploy posterior probabilities embedder to compute document embeddings',
+                        default=False)
+
+    parser.add_argument('-w', '--wce_embedder', dest='wce_embedder', action='store_true',
+                        help='deploy (supervised) Word-Class embedder to compute document embeddings',
+                        default=False)
+
+    parser.add_argument('-m', '--muse_embedder', dest='muse_embedder', action='store_true',
+                        help='deploy (pretrained) MUSE embedder to compute document embeddings',
+                        default=False)
+
+    parser.add_argument('-b', '--bert_embedder', dest='bert_embedder', action='store_true',
+                        help='deploy multilingual Bert to compute document embeddings',
+                        default=False)
+
+    parser.add_argument('-g', '--gru_embedder', dest='gru_embedder', action='store_true',
+                        help='deploy a GRU in order to compute document embeddings',
+                        default=False)
+
+    parser.add_argument('-c', '--c_optimize', dest='optimc', action='store_true',
+                        help='optimize the SVM C hyperparameter',
+                        default=False)
+
+    parser.add_argument('-n', '--nepochs', dest='nepochs', type=int,
+                        help='number of max epochs to train the Recurrent embedder (i.e., -g)')
+
+    parser.add_argument('-j', '--n_jobs', dest='n_jobs', type=int,
+                        help='number of parallel jobs (default -1, i.e., all)',
+                        default=-1)
+
+    parser.add_argument('--muse_dir', dest='muse_dir', type=str,
+                        help='path to the MUSE polylingual word embeddings (default ../embeddings)',
+                        default='../embeddings')
+
+    parser.add_argument('--gru_wce', dest='gru_wce', action='store_true',
+                        help='deploy WCE embedding as embedding layer of the GRU View Generator',
+                        default=False)
+
+    parser.add_argument('--gru_dir', dest='gru_dir', type=str,
+                        help='path to a pretrained GRU model (i.e., -g view generator)',
+                        default=None)
+
+    parser.add_argument('--bert_dir', dest='bert_dir', type=str,
+                        help='path to a pretrained mBERT model (i.e., -b view generator)',
+                        default=None)
+
+    parser.add_argument('--gpus', help='how many GPUs to use per node',
+                        default=None)
+
     args = parser.parse_args()
     main(args)
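With the new parser in place, a typical invocation (paths and flag combination illustrative) becomes:

    python refactor/main.py /path/to/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle \
        -x -m -g --gru_wce -j 8 -o csv_logs/gfun/gfun_results.csv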
@@ -161,7 +161,7 @@ class BertModel(pl.LightningModule):
             else:
                 batch = lX[lang][i:i + batch_size]
                 max_pad_len = define_pad_length(batch)
-                batch = pad(batch, pad_index='101', max_pad_length=max_pad_len)  # TODO: check pad index!
+                batch = pad(batch, pad_index=self.bert.config.pad_token_id, max_pad_length=max_pad_len)
                 batch = torch.LongTensor(batch).to('cuda' if self.gpus else 'cpu')
                 _, output = self.forward(batch)
                 doc_embeds = output[-1][:, 0, :]
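Why the fix matters: in BERT vocabularies, including bert-base-multilingual-cased, id 101 is [CLS] and the padding id is 0, so the old code padded batches with spurious [CLS] ids (and as a string, at that). A quick check:

    from transformers import BertTokenizer

    tok = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    print(tok.cls_token_id)  # 101 -- what the old code was padding with
    print(tok.pad_token_id)  # 0   -- matches self.bert.config.pad_token_id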
util/common.py

@@ -369,3 +369,16 @@ def get_params(optimc=False):
     c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
     kernel = 'rbf'
     return [{'kernel': [kernel], 'C': c_range, 'gamma': ['auto']}]
+
+
+def get_method_name(args):
+    _id = ''
+    _id_conf = [args.post_embedder, args.wce_embedder, args.muse_embedder, args.bert_embedder, args.gru_embedder]
+    _id_name = ['X', 'W', 'M', 'B', 'G']
+    for i, conf in enumerate(_id_conf):
+        if conf:
+            _id += _id_name[i]
+    _id = _id if not args.gru_wce else _id + '_wce'
+    _dataset_path = args.dataset.split('/')[-1].split('_')
+    dataset_id = _dataset_path[0] + _dataset_path[-1]
+    return _id, dataset_id
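A worked example of the new helper (argument values illustrative):

    from argparse import Namespace

    args = Namespace(post_embedder=True, wce_embedder=False, muse_embedder=True,
                     bert_embedder=False, gru_embedder=False, gru_wce=True,
                     dataset='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle')
    # first and last '_'-separated chunks of the file name form the dataset id
    print(get_method_name(args))  # ('XM_wce', 'rcv1-2run0.pickle')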
@@ -1,6 +1,5 @@
 from os import listdir, makedirs
 from os.path import isdir, isfile, join, exists, dirname
-#from sklearn.externals.six.moves import urllib
 import urllib
 from pathlib import Path

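One caveat with the surviving import: on Python 3, a bare import urllib does not load the request submodule, so the urllib.request.urlretrieve call in the next hunk can raise AttributeError unless some other module has already imported it. The explicit form avoids this:

    import urllib.request  # guarantees urllib.request.urlretrieve is bound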
@@ -14,6 +13,7 @@ def download_file(url, archive_filename):
     urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress)
     print("")

+
 def download_file_if_not_exists(url, archive_path):
     if exists(archive_path): return
     makedirs_if_not_exist(dirname(archive_path))
@@ -25,20 +25,26 @@ def ls(dir, typecheck):
     el.sort()
     return el

+
 def list_dirs(dir):
     return ls(dir, typecheck=isdir)

+
 def list_files(dir):
     return ls(dir, typecheck=isfile)

+
 def makedirs_if_not_exist(path):
     if not exists(path): makedirs(path)

+
 def create_if_not_exist(path):
     if not exists(path): makedirs(path)

+
 def get_parent_name(path):
     return Path(path).parent

+
 def get_file_name(path):
     return Path(path).name
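Aside: makedirs_if_not_exist and create_if_not_exist are identical, and both have a check-then-act race between exists() and makedirs(). The stdlib covers this in one call (a suggested simplification, not part of this commit; the path is illustrative):

    from os import makedirs

    makedirs('some/dir', exist_ok=True)  # no error if 'some/dir' already exists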
util/results_csv.py

@@ -8,6 +8,7 @@ class CSVlog:
         self.file = file
         self.columns = ['method',
                         'setting',
+                        'optimc',
                         'sif',
                         'zscore',
                         'l2',
@@ -34,9 +35,9 @@ class CSVlog:
     def already_calculated(self, id):
         return (self.df['id'] == id).any()

-    def add_row(self, method, setting, sif, zscore, l2, dataset, time_tr, time_te, lang,
+    def add_row(self, method, setting, optimc, sif, zscore, l2, dataset, time_tr, time_te, lang,
                 macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
-        s = pd.Series([method, setting,sif, zscore, l2, dataset, time_tr, time_te, lang,
+        s = pd.Series([method, setting, optimc, sif, zscore, l2, dataset, time_tr, time_te, lang,
                       macrof1, microf1, macrok, microk, notes],
                      index=self.columns)
         self.df = self.df.append(s, ignore_index=True)
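Forward-compatibility note on the last context line: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. An equivalent for that single line (a sketch with the same semantics; s carries index=self.columns, so its transposed one-row frame aligns with df):

    # replaces: self.df = self.df.append(s, ignore_index=True)
    self.df = pd.concat([self.df, s.to_frame().T], ignore_index=True)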