diff --git a/src/data/__init__.py b/src/data/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/data/languages.py b/src/data/languages.py
deleted file mode 100644
index 2d03d8e..0000000
--- a/src/data/languages.py
+++ /dev/null
@@ -1,42 +0,0 @@
-"""
-bg = Bulgarian
-cs = Czech
-da = Danish
-de = German
-el = Greek
-en = English
-es = Spanish
-et = Estonian
-fi = Finnish
-fr = French
-hu = Hungarian
-it = Italian
-lt = Lithuanian
-lv = Latvian
-nl = Dutch
-mt = Maltese
-pl = Polish
-pt = Portuguese
-ro = Romanian
-sk = Slovak
-sl = Slovene
-sv = Swedish
-"""
-
-NLTK_LANGMAP = {'da': 'danish', 'nl': 'dutch', 'en': 'english', 'fi': 'finnish', 'fr': 'french', 'de': 'german',
- 'hu': 'hungarian', 'it': 'italian', 'pt': 'portuguese', 'ro': 'romanian', 'es': 'spanish', 'sv': 'swedish'}
-
-
-#top 10 languages in Wikipedia, ordered by number of articles
-#LANGS_10_MOST_WIKI = ['en','fr','sv','de','es','it','pt','nl','pl','ro']
-
-#all languages in JRC-Acquis v3
-JRC_LANGS = ['bg','cs','da','de','el','en','es','et','fi','fr','hu','it','lt','lv','mt','nl','pl','pt','ro','sk','sl','sv']
-JRC_LANGS_WITH_NLTK_STEMMING = ['da', 'nl', 'en', 'fi', 'fr', 'de', 'hu', 'it', 'pt', 'es', 'sv'] # Romanian ('ro') removed due to incompatibility issues
-
-RCV2_LANGS = ['ru', 'de', 'fr', 'sv', 'no', 'da', 'pt', 'it', 'es', 'jp', 'htw', 'nl']
-RCV2_LANGS_WITH_NLTK_STEMMING = ['de', 'fr', 'sv', 'da', 'pt', 'it', 'es', 'nl']
-
-lang_set = {'JRC_NLTK':JRC_LANGS_WITH_NLTK_STEMMING, 'JRC':JRC_LANGS,
- 'RCV2_NLTK':RCV2_LANGS_WITH_NLTK_STEMMING, 'RCV2':RCV2_LANGS}
-
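# A minimal usage sketch (illustrative, not part of the original module) of how these
# constants are consumed elsewhere in the repository; it assumes 'src' is on the PYTHONPATH.
from data.languages import lang_set, NLTK_LANGMAP

langs = lang_set['JRC_NLTK']                      # JRC-Acquis languages with NLTK stemming support
print(len(langs), 'languages:', langs)
print('NLTK name for "de":', NLTK_LANGMAP['de'])  # -> 'german'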
diff --git a/src/data/reader/__init__.py b/src/data/reader/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/data/reader/jrcacquis_reader.py b/src/data/reader/jrcacquis_reader.py
deleted file mode 100644
index c0441ed..0000000
--- a/src/data/reader/jrcacquis_reader.py
+++ /dev/null
@@ -1,321 +0,0 @@
-from __future__ import print_function
-import os, sys
-from os.path import join
-import tarfile
-import xml.etree.ElementTree as ET
-from sklearn.datasets import get_data_home
-import pickle
-from util.file import download_file, list_dirs, list_files
-import rdflib
-from rdflib.namespace import RDF, SKOS
-from rdflib import URIRef
-import zipfile
-from data.languages import JRC_LANGS
-from collections import Counter
-from random import shuffle
-from data.languages import lang_set
-
-"""
-JRC Acquis' Nomenclature:
-bg = Bulgarian
-cs = Czech
-da = Danish
-de = German
-el = Greek
-en = English
-es = Spanish
-et = Estonian
-fi = Finnish
-fr = French
-hu = Hungarian
-it = Italian
-lt = Lithuanian
-lv = Latvian
-nl = Dutch
-mt = Maltese
-pl = Polish
-pt = Portuguese
-ro = Romanian
-sk = Slovak
-sl = Slovene
-sv = Swedish
-"""
-
-class JRCAcquis_Document:
- def __init__(self, id, name, lang, year, head, body, categories):
- self.id = id
- self.parallel_id = name
- self.lang = lang
- self.year = year
- self.text = body if not head else head + "\n" + body
- self.categories = categories
-
-# this is a workaround... for some reason, acute accents are encoded in a non-standard manner in titles;
-# however, the title often appears as the first paragraph of the text/body (with standard encoding),
-# so it is preferable not to read the header at all (which is the default behaviour here)
-def _proc_acute(text):
- for ch in ['a','e','i','o','u']:
- text = text.replace('%'+ch+'acute%',ch)
- return text
-
-def parse_document(file, year, head=False):
- root = ET.parse(file).getroot()
-
- doc_name = root.attrib['n'] # e.g., '22006A0211(01)'
- doc_lang = root.attrib['lang'] # e.g., 'es'
- doc_id = root.attrib['id'] # e.g., 'jrc22006A0211_01-es'
- doc_categories = [cat.text for cat in root.findall('.//teiHeader/profileDesc/textClass/classCode[@scheme="eurovoc"]')]
- doc_head = _proc_acute(root.find('.//text/body/head').text) if head else ''
- doc_body = '\n'.join([p.text for p in root.findall('.//text/body/div[@type="body"]/p')])
-
- def raise_if_empty(field, from_file):
- if isinstance(field, str):
- if not field.strip():
- raise ValueError("Empty field in file %s" % from_file)
-
- raise_if_empty(doc_name, file)
- raise_if_empty(doc_lang, file)
- raise_if_empty(doc_id, file)
- if head: raise_if_empty(doc_head, file)
- raise_if_empty(doc_body, file)
-
- return JRCAcquis_Document(id=doc_id, name=doc_name, lang=doc_lang, year=year, head=doc_head, body=doc_body, categories=doc_categories)
-
-# removes documents without a counterpart in all other languages
-def _force_parallel(doclist, langs):
- n_langs = len(langs)
- par_id_count = Counter([d.parallel_id for d in doclist])
- parallel_doc_ids = set([id for id,count in par_id_count.items() if count==n_langs])
- return [doc for doc in doclist if doc.parallel_id in parallel_doc_ids]
-
-def random_sampling_avoiding_parallel(doclist):
- random_order = list(range(len(doclist)))
- shuffle(random_order)
- sampled_request = []
- parallel_ids = set()
- for ind in random_order:
- pid = doclist[ind].parallel_id
- if pid not in parallel_ids:
- sampled_request.append(doclist[ind])
- parallel_ids.add(pid)
- print('random_sampling_no_parallel:: from {} documents to {} documents'.format(len(doclist), len(sampled_request)))
- return sampled_request
-
-
-#filters out documents that do not contain any category in cat_filter, and removes from the remaining documents all labels not in cat_filter
-def _filter_by_category(doclist, cat_filter):
- if not isinstance(cat_filter, frozenset):
- cat_filter = frozenset(cat_filter)
- filtered = []
- for doc in doclist:
- doc.categories = list(cat_filter & set(doc.categories))
- if doc.categories:
- doc.categories.sort()
- filtered.append(doc)
-    print("filtered out %d documents without categories in the filter list" % (len(doclist) - len(filtered)))
- return filtered
-
-#filters out categories with cat_threshold documents or fewer (and documents left without any remaining category)
-def _filter_by_frequency(doclist, cat_threshold):
- cat_count = Counter()
- for d in doclist:
- cat_count.update(d.categories)
-
- freq_categories = [cat for cat,count in cat_count.items() if count>cat_threshold]
- freq_categories.sort()
- return _filter_by_category(doclist, freq_categories), freq_categories
-
-#selects the most_frequent most common categories (and filters out documents left without any remaining category)
-def _most_common(doclist, most_frequent):
- cat_count = Counter()
- for d in doclist:
- cat_count.update(d.categories)
-
- freq_categories = [cat for cat,count in cat_count.most_common(most_frequent)]
- freq_categories.sort()
- return _filter_by_category(doclist, freq_categories), freq_categories
-
-def _get_categories(request):
- final_cats = set()
- for d in request:
- final_cats.update(d.categories)
- return list(final_cats)
-
-def fetch_jrcacquis(langs=None, data_path=None, years=None, ignore_unclassified=True, cat_filter=None, cat_threshold=0,
- parallel=None, most_frequent=-1, DOWNLOAD_URL_BASE ='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'):
-
- assert parallel in [None, 'force', 'avoid'], 'parallel mode not supported'
- if not langs:
- langs = JRC_LANGS
- else:
- if isinstance(langs, str): langs = [langs]
- for l in langs:
- if l not in JRC_LANGS:
- raise ValueError('Language %s is not among the valid languages in JRC-Acquis v3' % l)
-
- if not data_path:
- data_path = get_data_home()
-
- if not os.path.exists(data_path):
- os.mkdir(data_path)
-
- request = []
- total_read = 0
- for l in langs:
- file_name = 'jrc-'+l+'.tgz'
- archive_path = join(data_path, file_name)
-
- if not os.path.exists(archive_path):
- print("downloading language-specific dataset (once and for all) into %s" % data_path)
- DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name)
- download_file(DOWNLOAD_URL, archive_path)
- print("untarring dataset...")
- tarfile.open(archive_path, 'r:gz').extractall(data_path)
-
- documents_dir = join(data_path, l)
-
- print("Reading documents...")
- read = 0
- for dir in list_dirs(documents_dir):
- year = int(dir)
- if years==None or year in years:
- year_dir = join(documents_dir,dir)
- pickle_name = join(data_path, 'jrc_' + l + '_' + dir + '.pickle')
- if os.path.exists(pickle_name):
- print("loading from file %s" % pickle_name)
- l_y_documents = pickle.load(open(pickle_name, "rb"))
- read += len(l_y_documents)
- else:
- l_y_documents = []
- all_documents = list_files(year_dir)
- empty = 0
- for i,doc_file in enumerate(all_documents):
- try:
- jrc_doc = parse_document(join(year_dir, doc_file), year)
- except ValueError:
- jrc_doc = None
-
- if jrc_doc and (not ignore_unclassified or jrc_doc.categories):
- l_y_documents.append(jrc_doc)
- else: empty += 1
- if len(all_documents)>50 and ((i+1) % (len(all_documents)/50) == 0):
- print('\r\tfrom %s: completed %d%%' % (year_dir, int((i+1)*100.0/len(all_documents))), end='')
- read+=1
- print('\r\tfrom %s: completed 100%% read %d documents (discarded %d without categories or empty fields)\n' % (year_dir, i+1, empty), end='')
- print("\t\t(Pickling object for future runs in %s)" % pickle_name)
- pickle.dump(l_y_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL)
- request += l_y_documents
- print("Read %d documents for language %s\n" % (read, l))
- total_read += read
- print("Read %d documents in total" % (total_read))
-
- if parallel=='force':
- request = _force_parallel(request, langs)
- elif parallel == 'avoid':
- request = random_sampling_avoiding_parallel(request)
-
- final_cats = _get_categories(request)
-
- if cat_filter:
- request = _filter_by_category(request, cat_filter)
- final_cats = _get_categories(request)
- if cat_threshold > 0:
- request, final_cats = _filter_by_frequency(request, cat_threshold)
- if most_frequent != -1 and len(final_cats) > most_frequent:
- request, final_cats = _most_common(request, most_frequent)
-
- return request, final_cats
-
-def print_cat_analysis(request):
- cat_count = Counter()
- for d in request:
- cat_count.update(d.categories)
- print("Number of active categories: {}".format(len(cat_count)))
- print(cat_count.most_common())
-
-# inspects the Eurovoc thesaurus in order to select a subset of categories
-# currently, the 'broadest' (categories with no parent), 'leaves' (categories with no child), and 'all' policies are implemented
-def inspect_eurovoc(data_path, eurovoc_skos_core_concepts_filename='eurovoc_in_skos_core_concepts.rdf',
- eurovoc_url="http://publications.europa.eu/mdr/resource/thesaurus/eurovoc-20160630-0/skos/eurovoc_in_skos_core_concepts.zip",
- select="broadest"):
-
- fullpath_pickle = join(data_path, select+'_concepts.pickle')
- if os.path.exists(fullpath_pickle):
- print("Pickled object found in %s. Loading it." % fullpath_pickle)
- return pickle.load(open(fullpath_pickle,'rb'))
-
- fullpath = join(data_path, eurovoc_skos_core_concepts_filename)
- if not os.path.exists(fullpath):
-        print("File %s does not exist. Trying to download the SKOS EuroVoc file from %s" % (fullpath, eurovoc_url))
-        download_file(eurovoc_url, fullpath + '.zip')
-        print("Unzipping file...")
-        zipped = zipfile.ZipFile(fullpath + '.zip', 'r')
- zipped.extract("eurovoc_in_skos_core_concepts.rdf", data_path)
- zipped.close()
-
- print("Parsing %s" %fullpath)
- g = rdflib.Graph()
- g.parse(location=fullpath, format="application/rdf+xml")
-
- if select == "all":
- print("Selecting all concepts")
- all_concepts = list(g.subjects(RDF.type, SKOS.Concept))
- all_concepts = [c.toPython().split('/')[-1] for c in all_concepts]
- all_concepts.sort()
- selected_concepts = all_concepts
- elif select=="broadest":
- print("Selecting broadest concepts (those without any other broader concept linked to it)")
- all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
- narrower_concepts = set(g.subjects(SKOS.broader, None))
- broadest_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - narrower_concepts)]
- broadest_concepts.sort()
- selected_concepts = broadest_concepts
- elif select=="leaves":
-        print("Selecting leaf concepts (those not linked as the broader concept of any other concept)")
- all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
- broad_concepts = set(g.objects(None, SKOS.broader))
- leave_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - broad_concepts)]
- leave_concepts.sort()
- selected_concepts = leave_concepts
- else:
- raise ValueError("Selection policy %s is not currently supported" % select)
-
-    print("%d %s concepts found" % (len(selected_concepts), select))
- print("Pickling concept list for faster further requests in %s" % fullpath_pickle)
- pickle.dump(selected_concepts, open(fullpath_pickle, 'wb'), pickle.HIGHEST_PROTOCOL)
-
- return selected_concepts
-
-if __name__ == '__main__':
-
- def single_label_fragment(doclist):
- single = [d for d in doclist if len(d.categories) < 2]
-        final_categories = set([d.categories[0] for d in single if d.categories])
- print('{} single-label documents ({} categories) from the original {} documents'.format(len(single),
- len(final_categories),
- len(doclist)))
- return single, list(final_categories)
-
- train_years = list(range(1986, 2006))
- test_years = [2006]
- cat_policy = 'leaves'
- most_common_cat = 300
- # JRC_DATAPATH = "/media/moreo/1TB Volume/Datasets/JRC_Acquis_v3"
- JRC_DATAPATH = "/storage/andrea/FUNNELING/data/JRC_Acquis_v3"
- langs = lang_set['JRC_NLTK']
- cat_list = inspect_eurovoc(JRC_DATAPATH, select=cat_policy)
- sys.exit()
-
- training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=train_years,cat_filter=cat_list, cat_threshold=1, parallel=None,most_frequent=most_common_cat)
- test_docs, label_namestest = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=test_years, cat_filter=label_names,parallel='force')
-
- print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names)))
- print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest)))
-
- training_docs, label_names = single_label_fragment(training_docs)
- test_docs, label_namestest = single_label_fragment(test_docs)
-
- print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names)))
- print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest)))
-
-
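# A minimal sketch (illustrative, not a real JRC-Acquis file) of the TEI-like layout that
# parse_document() above assumes: the root carries the n/lang/id attributes, EuroVoc codes
# sit under teiHeader/profileDesc/textClass, and paragraphs under text/body/div[@type="body"].
import io
from data.reader.jrcacquis_reader import parse_document

sample = io.StringIO("""
<TEI n="22006A0211(01)" lang="en" id="jrc22006A0211_01-en">
  <teiHeader><profileDesc><textClass>
    <classCode scheme="eurovoc">1309</classCode>
  </textClass></profileDesc></teiHeader>
  <text><body>
    <head>Example title</head>
    <div type="body"><p>Example paragraph of the agreement text.</p></div>
  </body></text>
</TEI>
""")
doc = parse_document(sample, year=2006)
print(doc.id, doc.parallel_id, doc.categories)   # jrc22006A0211_01-en 22006A0211(01) ['1309']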
diff --git a/src/data/reader/rcv_reader.py b/src/data/reader/rcv_reader.py
deleted file mode 100644
index cd4b416..0000000
--- a/src/data/reader/rcv_reader.py
+++ /dev/null
@@ -1,225 +0,0 @@
-from zipfile import ZipFile
-import xml.etree.ElementTree as ET
-from data.languages import RCV2_LANGS_WITH_NLTK_STEMMING, RCV2_LANGS
-from util.file import list_files
-from sklearn.datasets import get_data_home
-import gzip
-from os.path import join, exists
-from util.file import download_file_if_not_exists
-import re
-from collections import Counter
-import numpy as np
-import sys
-
-"""
-RCV2's Nomenclature:
-ru = Russian
-da = Danish
-de = German
-es = Spanish
-lat = Spanish Latin-American (it actually also appears as 'es' in the collection)
-fr = French
-it = Italian
-nl = Dutch
-pt = Portuguese
-sv = Swedish
-ja = Japanese (appears both as 'ja' and 'jp' in the collection; this module uses 'jp')
-htw = Chinese
-no = Norwegian
-"""
-
-RCV1_TOPICHIER_URL = "http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a02-orig-topics-hierarchy/rcv1.topics.hier.orig"
-RCV1PROC_BASE_URL= 'http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files'
-RCV1_BASE_URL = "http://www.daviddlewis.com/resources/testcollections/rcv1/"
-RCV2_BASE_URL = "http://trec.nist.gov/data/reuters/reuters.html"
-
-rcv1_test_data_gz = ['lyrl2004_tokens_test_pt0.dat.gz',
- 'lyrl2004_tokens_test_pt1.dat.gz',
- 'lyrl2004_tokens_test_pt2.dat.gz',
- 'lyrl2004_tokens_test_pt3.dat.gz']
-
-rcv1_train_data_gz = ['lyrl2004_tokens_train.dat.gz']
-
-rcv1_doc_cats_data_gz = 'rcv1-v2.topics.qrels.gz'
-
-RCV2_LANG_DIR = {'ru':'REUTE000',
- 'de':'REUTE00A',
- 'fr':'REUTE00B',
- 'sv':'REUTE001',
- 'no':'REUTE002',
- 'da':'REUTE003',
- 'pt':'REUTE004',
- 'it':'REUTE005',
- 'es':'REUTE006',
- 'lat':'REUTE007',
- 'jp':'REUTE008',
- 'htw':'REUTE009',
- 'nl':'REUTERS_'}
-
-
-class RCV_Document:
-
- def __init__(self, id, text, categories, date='', lang=None):
- self.id = id
- self.date = date
- self.lang = lang
- self.text = text
- self.categories = categories
-
-
-class ExpectedLanguageException(Exception): pass
-class IDRangeException(Exception): pass
-
-
-nwords = []
-
-def parse_document(xml_content, assert_lang=None, valid_id_range=None):
- root = ET.fromstring(xml_content)
- if assert_lang:
- if assert_lang not in root.attrib.values():
- if assert_lang != 'jp' or 'ja' not in root.attrib.values(): # some documents are attributed to 'ja', others to 'jp'
- raise ExpectedLanguageException('error: document of a different language')
-
- doc_id = root.attrib['itemid']
- if valid_id_range is not None:
- if not valid_id_range[0] <= int(doc_id) <= valid_id_range[1]:
- raise IDRangeException
-
- doc_categories = [cat.attrib['code'] for cat in
- root.findall('.//metadata/codes[@class="bip:topics:1.0"]/code')]
-
- doc_date = root.attrib['date']
- doc_title = root.find('.//title').text
- doc_headline = root.find('.//headline').text
- doc_body = '\n'.join([p.text for p in root.findall('.//text/p')])
-
- if not doc_body:
- raise ValueError('Empty document')
-
- if doc_title is None: doc_title = ''
- if doc_headline is None or doc_headline in doc_title: doc_headline = ''
- text = '\n'.join([doc_title, doc_headline, doc_body]).strip()
-
- text_length = len(text.split())
- global nwords
- nwords.append(text_length)
-
- return RCV_Document(id=doc_id, text=text, categories=doc_categories, date=doc_date, lang=assert_lang)
-
-
-def fetch_RCV1(data_path, split='all'):
-
- assert split in ['train', 'test', 'all'], 'split should be "train", "test", or "all"'
-
- request = []
- labels = set()
- read_documents = 0
- lang = 'en'
-
- training_documents = 23149
- test_documents = 781265
-
- if split == 'all':
- split_range = (2286, 810596)
- expected = training_documents+test_documents
- elif split == 'train':
- split_range = (2286, 26150)
- expected = training_documents
- else:
- split_range = (26151, 810596)
- expected = test_documents
-
- global nwords
- nwords=[]
- for part in list_files(data_path):
-        if not re.match(r'\d+\.zip', part): continue
- target_file = join(data_path, part)
-        assert exists(target_file), \
-            "You don't seem to have the file "+part+" in " + data_path + ", and the RCV1 corpus cannot be downloaded"+\
-            " without formal permission. Please refer to " + RCV1_BASE_URL + " for more information."
- zipfile = ZipFile(target_file)
- for xmlfile in zipfile.namelist():
- xmlcontent = zipfile.open(xmlfile).read()
- try:
- doc = parse_document(xmlcontent, assert_lang=lang, valid_id_range=split_range)
- labels.update(doc.categories)
- request.append(doc)
- read_documents += 1
- except ValueError:
-                    print('\n\tskipping empty document {} (language {})'.format(part+'/'+xmlfile, lang))
- except (IDRangeException, ExpectedLanguageException) as e:
- pass
- print('\r[{}] read {} documents'.format(part, len(request)), end='')
- if read_documents == expected: break
- if read_documents == expected: break
- print()
- print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords)))
- return request, list(labels)
-
-
-def fetch_RCV2(data_path, languages=None):
-
- if not languages:
- languages = list(RCV2_LANG_DIR.keys())
- else:
- assert set(languages).issubset(set(RCV2_LANG_DIR.keys())), 'languages not in scope'
-
- request = []
- labels = set()
- global nwords
- nwords=[]
- for lang in languages:
- path = join(data_path, RCV2_LANG_DIR[lang])
- lang_docs_read = 0
- for part in list_files(path):
- target_file = join(path, part)
-            assert exists(target_file), \
-                "You don't seem to have the file "+part+" in " + path + ", and the RCV2 corpus cannot be downloaded"+\
-                " without formal permission. Please refer to " + RCV2_BASE_URL + " for more information."
- zipfile = ZipFile(target_file)
- for xmlfile in zipfile.namelist():
- xmlcontent = zipfile.open(xmlfile).read()
- try:
- doc = parse_document(xmlcontent, assert_lang=lang)
- labels.update(doc.categories)
- request.append(doc)
- lang_docs_read += 1
- except ValueError:
-                    print('\n\tskipping empty document {} (language {})'.format(RCV2_LANG_DIR[lang]+'/'+part+'/'+xmlfile, lang))
- except (IDRangeException, ExpectedLanguageException) as e:
- pass
- print('\r[{}] read {} documents, {} for language {}'.format(RCV2_LANG_DIR[lang]+'/'+part, len(request), lang_docs_read, lang), end='')
- print()
- print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords)))
- return request, list(labels)
-
-
-def fetch_topic_hierarchy(path, topics='all'):
- assert topics in ['all', 'leaves']
-
- download_file_if_not_exists(RCV1_TOPICHIER_URL, path)
- hierarchy = {}
- for line in open(path, 'rt'):
- parts = line.strip().split()
- parent,child = parts[1],parts[3]
- if parent not in hierarchy:
- hierarchy[parent]=[]
- hierarchy[parent].append(child)
-
- del hierarchy['None']
- del hierarchy['Root']
- print(hierarchy)
-
- if topics=='all':
- topics = set(hierarchy.keys())
- for parent in hierarchy.keys():
- topics.update(hierarchy[parent])
- return list(topics)
- elif topics=='leaves':
- parents = set(hierarchy.keys())
- childs = set()
- for parent in hierarchy.keys():
- childs.update(hierarchy[parent])
- return list(childs.difference(parents))
-
-
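# A minimal usage sketch of the readers above, assuming the RCV1/RCV2 archives have already
# been obtained (they require formal permission and are not downloaded by these functions);
# the paths below are placeholders.
from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2

RCV1_HOME = '/path/to/RCV1-v2'   # folder containing the English RCV1 *.zip parts
RCV2_HOME = '/path/to/RCV2'      # folder containing the REUTE* language directories

rcv1_train, labels_rcv1 = fetch_RCV1(RCV1_HOME, split='train')
rcv2_docs, labels_rcv2 = fetch_RCV2(RCV2_HOME, languages=['de', 'fr', 'it'])
print(len(rcv1_train), len(rcv2_docs), len(set(labels_rcv1) & set(labels_rcv2)))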
diff --git a/src/data/reader/wikipedia_tools.py b/src/data/reader/wikipedia_tools.py
deleted file mode 100644
index 83e11e3..0000000
--- a/src/data/reader/wikipedia_tools.py
+++ /dev/null
@@ -1,304 +0,0 @@
-from __future__ import print_function
-import ijson                             # required by simplify_json_file
-from ijson.common import ObjectBuilder   # required by simplify_json_file
-import os, sys
-from os.path import join
-from bz2 import BZ2File
-import pickle
-from util.file import list_dirs, list_files, makedirs_if_not_exist
-from itertools import islice
-import re
-from xml.sax.saxutils import escape
-import numpy as np
-
-policies = ["IN_ALL_LANGS", "IN_ANY_LANG"]
-
-"""
-This file contains a set of tools for processing the Wikipedia multilingual documents.
-In what follows, it is assumed that you have already downloaded a Wikipedia dump (https://dumps.wikimedia.org/)
-and have cleaned the text of each document with one of the following tools:
-    - https://github.com/aesuli/wikipediatools (Python 2)
-    - https://github.com/aesuli/wikipedia-extractor (Python 3)
-It is also assumed you have downloaded the all-entities json file (e.g., https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2)
-
-These tools help you to:
-    - Process the huge json file as a stream and create a multilingual map of corresponding titles for each language.
-    Setting policy="IN_ALL_LANGS" extracts only titles which appear in all (AND) languages, whereas "IN_ANY_LANG"
-    extracts all titles appearing in at least one (OR) language (warning: this will create a huge dictionary).
-    Note: this version is quite slow. Although it only needs to be run once, you might prefer to take a look at "Wikidata in BigQuery".
-    - Process the huge json file as a stream and create a simplified file which occupies much less space and is far faster to process.
-    - Use the multilingual map to extract, from the clean text versions, individual xml documents containing all
-    language-specific versions of the document.
-    - Fetch the multilingual documents to create, for each of the specified languages, a list containing all documents,
-    so that the i-th element of any list refers to the same document in the respective language.
-"""
-
-def _doc_generator(text_path, langs):
- dotspace = re.compile(r'\.(?!\s)')
- for l,lang in enumerate(langs):
- print("Processing language <%s> (%d/%d)" % (lang, l, len(langs)))
- lang_dir = join(text_path, lang)
- split_dirs = list_dirs(lang_dir)
- for sd,split_dir in enumerate(split_dirs):
- print("\tprocessing split_dir <%s> (%d/%d)" % (split_dir, sd, len(split_dirs)))
- split_files = list_files(join(lang_dir, split_dir))
- for sf,split_file in enumerate(split_files):
- print("\t\tprocessing split_file <%s> (%d/%d)" % (split_file, sf, len(split_files)))
- with BZ2File(join(lang_dir, split_dir, split_file), 'r', buffering=1024*1024) as fi:
- while True:
- doc_lines = list(islice(fi, 3))
- if doc_lines:
- # some sentences are not followed by a space after the dot
- doc_lines[1] = dotspace.sub('. ', doc_lines[1])
-                            # [workaround] I found the html &nbsp; symbol was not treated, and unescaping it now might not help...
- doc_lines[1] = escape(doc_lines[1].replace(" ", " "))
- yield doc_lines, lang
- else: break
-
-def _extract_title(doc_lines):
- m = re.search('title="(.+?)"', doc_lines[0])
- if m: return m.group(1).decode('utf-8')
- else: raise ValueError("Error in xml format: document head is %s" % doc_lines[0])
-
-def _create_doc(target_file, id, doc, lang):
- doc[0] = doc[0][:-2] + (' lang="%s">\n'%lang)
- with open(target_file, 'w') as fo:
-        fo.write('<multidoc id="%s">\n'%id)   # opening wrapper tag; the loader only looks up .//doc, so the tag name itself is not critical
-        [fo.write(line) for line in doc]
-        fo.write('</multidoc>')
-
-def _append_doc(target_file, doc, lang):
- doc[0] = doc[0][:-2] + (' lang="%s">\n' % lang)
- with open(target_file, 'r', buffering=1024*1024) as fi:
- lines = fi.readlines()
- if doc[0] in lines[1::3]:
- return
- lines[-1:-1]=doc
- with open(target_file, 'w', buffering=1024*1024) as fo:
- [fo.write(line) for line in lines]
-
-def extract_multilingual_documents(inv_dict, langs, text_path, out_path):
- if not os.path.exists(out_path):
- os.makedirs(out_path)
- for lang in langs:
- if lang not in inv_dict:
- raise ValueError("Lang %s is not in the dictionary" % lang)
-
- docs_created = len(list_files(out_path))
- print("%d multilingual documents found." % docs_created)
- for doc,lang in _doc_generator(text_path, langs):
- title = _extract_title(doc)
-
- if title in inv_dict[lang]:
- #pass
- ids = inv_dict[lang][title]
- for id in ids:
- target_file = join(out_path, id) + ".xml"
- if os.path.exists(target_file):
- _append_doc(target_file, doc, lang)
- else:
- _create_doc(target_file, id, doc, lang)
- docs_created+=1
- else:
- if not re.match('[A-Za-z]+', title):
- print("Title <%s> for lang <%s> not in dictionary" % (title, lang))
-
-
-
-def extract_multilingual_titles_from_simplefile(data_dir, filename, langs, policy="IN_ALL_LANGS", return_both=True):
- simplified_file = join(data_dir,filename)
-
- if policy not in policies:
- raise ValueError("Policy %s not supported." % policy)
- print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs)))
-
- lang_prefix = list(langs)
- lang_prefix.sort()
- pickle_prefix = "extraction_" + "_".join(lang_prefix) + "." + policy
- pickle_dict = join(data_dir, pickle_prefix+".multi_dict.pickle")
- pickle_invdict = join(data_dir, pickle_prefix+".multi_invdict.pickle")
- if os.path.exists(pickle_invdict):
- if return_both and os.path.exists(pickle_dict):
- print("Pickled files found in %s. Loading both (direct and inverse dictionaries)." % data_dir)
- return pickle.load(open(pickle_dict, 'rb')), pickle.load(open(pickle_invdict, 'rb'))
- elif return_both==False:
- print("Pickled file found in %s. Loading inverse dictionary only." % pickle_invdict)
- return pickle.load(open(pickle_invdict, 'rb'))
-
- multiling_titles = {}
- inv_dict = {lang:{} for lang in langs}
-
- def process_entry(line):
- parts = line.strip().split('\t')
- id = parts[0]
- if id in multiling_titles:
- raise ValueError("id <%s> already indexed" % id)
-
- titles = dict(((lang_title[:lang_title.find(':')],lang_title[lang_title.find(':')+1:].decode('utf-8')) for lang_title in parts[1:]))
-        for lang in list(titles.keys()):
-            if lang not in langs:
-                del titles[lang]
-
- if (policy == "IN_ALL_LANGS" and len(titles) == len(langs))\
- or (policy == "IN_ANY_LANG" and len(titles) > 0):
- multiling_titles[id] = titles
- for lang, title in titles.items():
-                if title in inv_dict[lang]:
-                    inv_dict[lang][title].append(id)
-                else: inv_dict[lang][title] = [id]
-
- with BZ2File(simplified_file, 'r', buffering=1024*1024*16) as fi:
- completed = 0
- try:
- for line in fi:
- process_entry(line)
- completed += 1
- if completed % 10 == 0:
- print("\rCompleted %d\ttitles %d" % (completed,len(multiling_titles)), end="")
- print("\rCompleted %d\t\ttitles %d" % (completed, len(multiling_titles)), end="\n")
- except EOFError:
- print("\nUnexpected file ending... saving anyway")
-
- print("Pickling dictionaries in %s" % data_dir)
- pickle.dump(multiling_titles, open(pickle_dict,'wb'), pickle.HIGHEST_PROTOCOL)
- pickle.dump(inv_dict, open(pickle_invdict, 'wb'), pickle.HIGHEST_PROTOCOL)
- print("Done")
-
- return (multiling_titles, inv_dict) if return_both else inv_dict
-
-
-# in https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2
-def simplify_json_file(data_dir, langs, policy="IN_ALL_LANGS", json_file = "latest-all.json.bz2"):
- latest_all_json_file = join(data_dir,json_file)
-
- if policy not in policies:
- raise ValueError("Policy %s not supported." % policy)
-
- print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs)))
-
- lang_prefix = list(langs)
- lang_prefix.sort()
- simple_titles_path = join(data_dir, "extraction_" + "_".join(lang_prefix) + "." + policy)
-
- def process_entry(last, fo):
- global written
- id = last["id"]
- titles = None
- if policy == "IN_ALL_LANGS" and langs.issubset(last["labels"].keys()):
- titles = {lang: last["labels"][lang]["value"] for lang in langs}
- elif policy == "IN_ANY_LANG":
- titles = {lang: last["labels"][lang]["value"] for lang in langs if lang in last["labels"]}
-
- if titles:
- fo.write((id+'\t'+'\t'.join([lang+':'+titles[lang] for lang in titles.keys()])+'\n').encode('utf-8'))
- return True
- else:
- return False
-
- written = 0
- with BZ2File(latest_all_json_file, 'r', buffering=1024*1024*16) as fi, \
- BZ2File(join(data_dir,simple_titles_path+".simple.bz2"),'w') as fo:
- builder = ObjectBuilder()
- completed = 0
- for event, value in ijson.basic_parse(fi, buf_size=1024*1024*16):
- builder.event(event, value)
- if len(builder.value)>1:
- if process_entry(builder.value.pop(0), fo): written += 1
- completed += 1
- print("\rCompleted %d\ttitles %d" % (completed,written), end="")
- print("")
-
-        # process the last entry
-        process_entry(builder.value.pop(0), fo)
-
- return simple_titles_path
-
-"""
-Reads all multilingual documents in a folder (see wikipedia_tools.py to generate them) and generates, for each of the
-specified languages, a list containing all its documents, so that the i-th element of any list refers to the language-
-specific version of the same document. Documents are required to have a version in all the specified languages and to
-contain a minimum number of words; otherwise they are discarded.
-"""
-class MinWordsNotReached(Exception): pass
-class WrongDocumentFormat(Exception): pass
-
-def _load_multilang_doc(path, langs, min_words=100):
- import xml.etree.ElementTree as ET
- from xml.etree.ElementTree import Element, ParseError
- try:
- root = ET.parse(path).getroot()
- doc = {}
- for lang in langs:
- doc_body = root.find('.//doc[@lang="' + lang + '"]')
- if isinstance(doc_body, Element):
- n_words = len(doc_body.text.split(' '))
- if n_words >= min_words:
- doc[lang] = doc_body.text
- else:
- raise MinWordsNotReached
- else:
- raise WrongDocumentFormat
- except ParseError:
- raise WrongDocumentFormat
- return doc
-
-#returns the multilingual documents mapped by language; the i-th element of each per-language list refers to the same article
-def fetch_wikipedia_multilingual(wiki_multi_path, langs, min_words=100, deletions=False, max_documents=-1, pickle_name=None):
- if pickle_name and os.path.exists(pickle_name):
- print("unpickling %s" % pickle_name)
- return pickle.load(open(pickle_name, 'rb'))
-
- multi_docs = list_files(wiki_multi_path)
- mling_documents = {l:[] for l in langs}
- valid_documents = 0
- minwords_exception = 0
- wrongdoc_exception = 0
- for d,multi_doc in enumerate(multi_docs):
- print("\rProcessed %d/%d documents, valid %d/%d, few_words=%d, few_langs=%d" %
- (d, len(multi_docs), valid_documents, len(multi_docs), minwords_exception, wrongdoc_exception),end="")
- doc_path = join(wiki_multi_path, multi_doc)
- try:
- m_doc = _load_multilang_doc(doc_path, langs, min_words)
- valid_documents += 1
- for l in langs:
- mling_documents[l].append(m_doc[l])
- except MinWordsNotReached:
- minwords_exception += 1
- if deletions: os.remove(doc_path)
- except WrongDocumentFormat:
- wrongdoc_exception += 1
- if deletions: os.remove(doc_path)
- if max_documents>0 and valid_documents>=max_documents:
- break
-
- if pickle_name:
- print("Pickling wikipedia documents object in %s" % pickle_name)
- pickle.dump(mling_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL)
-
- return mling_documents
-
-def random_wiki_sample(l_wiki, max_documents):
- if max_documents == 0: return None
- langs = list(l_wiki.keys())
- assert len(np.unique([len(l_wiki[l]) for l in langs])) == 1, 'documents across languages do not seem to be aligned'
- ndocs_per_lang = len(l_wiki[langs[0]])
- if ndocs_per_lang > max_documents:
- sel = set(np.random.choice(list(range(ndocs_per_lang)), max_documents, replace=False))
- for lang in langs:
- l_wiki[lang] = [d for i, d in enumerate(l_wiki[lang]) if i in sel]
- return l_wiki
-
-
-if __name__ == "__main__":
-
- wikipedia_home = "../Datasets/Wikipedia"
-
- from data.languages import JRC_LANGS_WITH_NLTK_STEMMING as langs
- langs = frozenset(langs)
-
- simple_titles_path = simplify_json_file(wikipedia_home, langs, policy="IN_ALL_LANGS", json_file="latest-all.json.bz2")
- _, inv_dict = extract_multilingual_titles_from_simplefile(wikipedia_home, simple_titles_path, langs, policy='IN_ALL_LANGS')
- extract_multilingual_documents(inv_dict, langs, join(wikipedia_home,'text'),
- out_path=join(wikipedia_home, 'multilingual_docs_JRC_NLTK'))
-
-
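# A follow-up sketch of the last step described in the module docstring, which the __main__
# above does not cover: loading the aligned multilingual documents produced by
# extract_multilingual_documents() and sub-sampling them. Paths and sizes are placeholders.
from os.path import join
from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample

wikipedia_home = "../Datasets/Wikipedia"
langs = ['de', 'en', 'es', 'fr', 'it']
l_wiki = fetch_wikipedia_multilingual(join(wikipedia_home, 'multilingual_docs_JRC_NLTK'), langs,
                                      min_words=100, pickle_name=join(wikipedia_home, 'wiki_docs.pickle'))
l_wiki = random_wiki_sample(l_wiki, max_documents=5000)
print({lang: len(docs) for lang, docs in l_wiki.items()})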
diff --git a/src/data/text_preprocessor.py b/src/data/text_preprocessor.py
deleted file mode 100644
index 1a6e3ae..0000000
--- a/src/data/text_preprocessor.py
+++ /dev/null
@@ -1,33 +0,0 @@
-from nltk.corpus import stopwords
-from data.languages import NLTK_LANGMAP
-from nltk import word_tokenize
-from nltk.stem import SnowballStemmer
-
-
-def preprocess_documents(documents, lang):
- tokens = NLTKStemTokenizer(lang, verbose=True)
- sw = stopwords.words(NLTK_LANGMAP[lang])
- return [' '.join([w for w in tokens(doc) if w not in sw]) for doc in documents]
-
-
-class NLTKStemTokenizer(object):
-
- def __init__(self, lang, verbose=False):
- if lang not in NLTK_LANGMAP:
- raise ValueError('Language %s is not supported in NLTK' % lang)
- self.verbose=verbose
- self.called = 0
- self.wnl = SnowballStemmer(NLTK_LANGMAP[lang])
- self.cache = {}
-
- def __call__(self, doc):
- self.called += 1
- if self.verbose:
- print("\r\t\t[documents processed %d]" % (self.called), end="")
- tokens = word_tokenize(doc)
- stems = []
- for t in tokens:
- if t not in self.cache:
- self.cache[t] = self.wnl.stem(t)
- stems.append(self.cache[t])
- return stems
\ No newline at end of file
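# A minimal usage sketch of the preprocessor above; it assumes the NLTK 'punkt' and
# 'stopwords' resources have been installed beforehand (e.g., via nltk.download()).
from data.text_preprocessor import preprocess_documents, NLTKStemTokenizer

docs_es = ['El acuerdo fue firmado por los ministros.',
           'Los documentos se publican cada año.']
print(preprocess_documents(docs_es, 'es'))   # stemmed documents with Spanish stopwords removed

# the tokenizer can also be plugged directly into a sklearn vectorizer (see dataset_builder.py)
tokens = NLTKStemTokenizer('es')
print(tokens(docs_es[0]))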
diff --git a/src/data/tsr_function__.py b/src/data/tsr_function__.py
deleted file mode 100755
index 0af8690..0000000
--- a/src/data/tsr_function__.py
+++ /dev/null
@@ -1,270 +0,0 @@
-import math
-import numpy as np
-from scipy.stats import t
-from joblib import Parallel, delayed
-from scipy.sparse import csr_matrix, csc_matrix
-
-
-def get_probs(tpr, fpr, pc):
- # tpr = p(t|c) = p(tp)/p(c) = p(tp)/(p(tp)+p(fn))
- # fpr = p(t|_c) = p(fp)/p(_c) = p(fp)/(p(fp)+p(tn))
- pnc = 1.0 - pc
- tp = tpr * pc
- fn = pc - tp
- fp = fpr * pnc
- tn = pnc - fp
- return ContTable(tp=tp, fn=fn, fp=fp, tn=tn)
-
-
-def apply_tsr(tpr, fpr, pc, tsr):
- cell = get_probs(tpr, fpr, pc)
- return tsr(cell)
-
-
-def positive_information_gain(cell):
- if cell.tpr() < cell.fpr():
- return 0.0
- else:
- return information_gain(cell)
-
-
-def posneg_information_gain(cell):
- ig = information_gain(cell)
- if cell.tpr() < cell.fpr():
- return -ig
- else:
- return ig
-
-
-def __ig_factor(p_tc, p_t, p_c):
- den = p_t * p_c
- if den != 0.0 and p_tc != 0:
- return p_tc * math.log(p_tc / den, 2)
- else:
- return 0.0
-
-
-def information_gain(cell):
- return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \
- __ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) +\
- __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + \
- __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())
-
-
-def information_gain_mod(cell):
- return (__ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())) \
- - (__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) + __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()))
-
-
-def pointwise_mutual_information(cell):
- return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c())
-
-
-def gain_ratio(cell):
- pc = cell.p_c()
- pnc = 1.0 - pc
- norm = pc * math.log(pc, 2) + pnc * math.log(pnc, 2)
- return information_gain(cell) / (-norm)
-
-
-def chi_square(cell):
- den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c()
- if den==0.0: return 0.0
- num = gss(cell)**2
- return num / den
-
-
-def relevance_frequency(cell):
- a = cell.tp
- c = cell.fp
- if c == 0: c = 1
- return math.log(2.0 + (a * 1.0 / c), 2)
-
-
-def idf(cell):
- if cell.p_f()>0:
- return math.log(1.0 / cell.p_f())
- return 0.0
-
-
-def gss(cell):
- return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn()
-
-
-def conf_interval(xt, n):
- if n>30:
- z2 = 3.84145882069 # norm.ppf(0.5+0.95/2.0)**2
- else:
- z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2
- p = (xt + 0.5 * z2) / (n + z2)
- amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2))
- return p, amplitude
-
-def strength(minPosRelFreq, minPos, maxNeg):
- if minPos > maxNeg:
- return math.log(2.0 * minPosRelFreq, 2.0)
- else:
- return 0.0
-
-
-#set cancel_features=True to allow some features to be weighted as 0 (as in the original article)
-#however, on some extremely imbalanced datasets this caused all documents to be weighted as 0
-def conf_weight(cell, cancel_features=False):
- c = cell.get_c()
- not_c = cell.get_not_c()
- tp = cell.tp
- fp = cell.fp
-
- pos_p, pos_amp = conf_interval(tp, c)
- neg_p, neg_amp = conf_interval(fp, not_c)
-
- min_pos = pos_p-pos_amp
- max_neg = neg_p+neg_amp
- den = (min_pos + max_neg)
- minpos_relfreq = min_pos / (den if den != 0 else 1)
-
-    str_tplus = strength(minpos_relfreq, min_pos, max_neg)
-
-    if str_tplus == 0 and not cancel_features:
-        return 1e-20
-
-    return str_tplus
-
-
-class ContTable:
-
- def __init__(self, tp=0, tn=0, fp=0, fn=0):
- self.tp=tp
- self.tn=tn
- self.fp=fp
- self.fn=fn
-
- def get_d(self): return self.tp + self.tn + self.fp + self.fn
-
- def get_c(self): return self.tp + self.fn
-
- def get_not_c(self): return self.tn + self.fp
-
- def get_f(self): return self.tp + self.fp
-
- def get_not_f(self): return self.tn + self.fn
-
- def p_c(self): return (1.0*self.get_c())/self.get_d()
-
- def p_not_c(self): return 1.0-self.p_c()
-
- def p_f(self): return (1.0*self.get_f())/self.get_d()
-
- def p_not_f(self): return 1.0-self.p_f()
-
- def p_tp(self): return (1.0*self.tp) / self.get_d()
-
- def p_tn(self): return (1.0*self.tn) / self.get_d()
-
- def p_fp(self): return (1.0*self.fp) / self.get_d()
-
- def p_fn(self): return (1.0*self.fn) / self.get_d()
-
- def tpr(self):
- c = 1.0*self.get_c()
- return self.tp / c if c > 0.0 else 0.0
-
- def fpr(self):
- _c = 1.0*self.get_not_c()
- return self.fp / _c if _c > 0.0 else 0.0
-
-
-def round_robin_selection(X, Y, k, tsr_function=positive_information_gain):
-    print(f'[selecting {k} terms]')
- nC = Y.shape[1]
- FC = get_tsr_matrix(get_supervised_matrix(X, Y), tsr_function).T
- best_features_idx = np.argsort(-FC, axis=0).flatten()
-    tsr_values = (-np.sort(-FC, axis=0)).flatten()  # scores sorted consistently with best_features_idx
- selected_indexes_set = set()
- selected_indexes = list()
- selected_value = list()
- from_category = list()
- round_robin = iter(best_features_idx)
- values_iter = iter(tsr_values)
- round=0
- while len(selected_indexes) < k:
- term_idx = next(round_robin)
- term_val = next(values_iter)
- if term_idx not in selected_indexes_set:
- selected_indexes_set.add(term_idx)
- selected_indexes.append(term_idx)
- selected_value.append(term_val)
- from_category.append(round)
- round = (round + 1) % nC
- return np.asarray(selected_indexes, dtype=int), np.asarray(selected_value, dtype=float), np.asarray(from_category)
-
-
-def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD):
- tp_ = len(positive_document_indexes & feature_document_indexes)
- fp_ = len(feature_document_indexes - positive_document_indexes)
- fn_ = len(positive_document_indexes - feature_document_indexes)
- tn_ = nD - (tp_ + fp_ + fn_)
- return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_)
-
-
-def category_tables(feature_sets, category_sets, c, nD, nF):
- return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)]
-
-
-"""
-Computes the nC x nF supervised matrix M where Mcf is the 4-cell contingency table for feature f and class c.
-Efficiency O(nF x nC x log(S)) where S is the sparse factor
-"""
-def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1):
- nD, nF = coocurrence_matrix.shape
- nD2, nC = label_matrix.shape
-
- if nD != nD2:
- raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' %
- (coocurrence_matrix.shape,label_matrix.shape))
-
- def nonzero_set(matrix, col):
- return set(matrix[:, col].nonzero()[0])
-
- if isinstance(coocurrence_matrix, csr_matrix):
- coocurrence_matrix = csc_matrix(coocurrence_matrix)
- feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)]
- category_sets = [nonzero_set(label_matrix, c) for c in range(nC)]
- cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC))
- return np.array(cell_matrix)
-
-# obtains the matrix T where Tcf=tsr(f,c) is the tsr score for category c and feature f
-def get_tsr_matrix(cell_matrix, tsr_score_function):
-    nC,nF = cell_matrix.shape
-    tsr_matrix = [[tsr_score_function(cell_matrix[c,f]) for f in range(nF)] for c in range(nC)]
-    return np.array(tsr_matrix)
-
-
-""" The Fisher-score [1] is not computed on the 4-cell contingency table, but can
-take as input any real-valued feature column (e.g., tf-idf weights).
-feat is the feature vector, and c is a binary classification vector.
-This implementation covers only the binary case, while the formula is defined for multiclass
-single-label scenarios, for which the version [2] might be preferred.
-[1] R.O. Duda, P.E. Hart, and D.G. Stork. Pattern classification. Wiley-interscience, 2012.
-[2] Gu, Q., Li, Z., & Han, J. (2012). Generalized fisher score for feature selection. arXiv preprint arXiv:1202.3725.
-"""
-def fisher_score_binary(feat, c):
- neg = np.ones_like(c) - c
-
- npos = np.sum(c)
- nneg = np.sum(neg)
-
- mupos = np.mean(feat[c == 1])
- muneg = np.mean(feat[neg == 1])
- mu = np.mean(feat)
-
- stdpos = np.std(feat[c == 1])
- stdneg = np.std(feat[neg == 1])
-
- num = npos * ((mupos - mu) ** 2) + nneg * ((muneg - mu) ** 2)
- den = npos * (stdpos ** 2) + nneg * (stdneg ** 2)
-
- if den>0:
- return num / den
- else:
- return num
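# A small, self-contained sketch of the feature-selection entry point above, run on a toy
# random document-term matrix and a toy binary label matrix (shapes are arbitrary).
import numpy as np
from scipy.sparse import csr_matrix
from data.tsr_function__ import round_robin_selection, positive_information_gain

X = csr_matrix((np.random.rand(100, 50) > 0.8).astype(int))    # 100 documents, 50 terms
Y = (np.random.rand(100, 3) > 0.7).astype(int)                 # 3 binary labels
sel_idx, sel_score, sel_cat = round_robin_selection(X, Y, k=10, tsr_function=positive_information_gain)
print(sel_idx)   # indices of the 10 selected terms, picked round-robin across the 3 labels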
diff --git a/src/dataset_builder.py b/src/dataset_builder.py
deleted file mode 100644
index b9650c7..0000000
--- a/src/dataset_builder.py
+++ /dev/null
@@ -1,710 +0,0 @@
-from os.path import join, exists
-from nltk.corpus import stopwords
-from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
-from sklearn.preprocessing import MultiLabelBinarizer
-from data.reader.jrcacquis_reader import *
-from data.languages import lang_set, NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING
-from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2, fetch_topic_hierarchy
-from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents
-import pickle
-import numpy as np
-from sklearn.model_selection import train_test_split
-from scipy.sparse import issparse
-import itertools
-from tqdm import tqdm
-import re
-from scipy.sparse import csr_matrix
-
-
-class MultilingualDataset:
- """
- A multilingual dataset is a dictionary of training and test documents indexed by language code.
- Train and test sets are represented as tuples of the type (X,Y,ids), where X is a matrix representation of the
- documents (e.g., a document-by-term sparse csr_matrix), Y is a document-by-label binary np.array indicating the
- labels of each document, and ids is a list of document-identifiers from the original collection.
- """
-
- def __init__(self):
- self.dataset_name = ""
- self.multiling_dataset = {}
-
- def add(self, lang, Xtr, Ytr, Xte, Yte, tr_ids=None, te_ids=None):
- self.multiling_dataset[lang] = ((Xtr, Ytr, tr_ids), (Xte, Yte, te_ids))
-
- def save(self, file):
- self.sort_indexes()
- pickle.dump(self, open(file, 'wb'), pickle.HIGHEST_PROTOCOL)
- return self
-
- def __getitem__(self, item):
- if item in self.langs():
- return self.multiling_dataset[item]
- return None
-
- @classmethod
- def load(cls, file):
- data = pickle.load(open(file, 'rb'))
- data.sort_indexes()
- return data
-
- @classmethod
- def load_ids(cls, file):
- data = pickle.load(open(file, 'rb'))
- tr_ids = {lang:tr_ids for (lang,((_,_,tr_ids), (_,_,_))) in data.multiling_dataset.items()}
- te_ids = {lang: te_ids for (lang, ((_, _, _), (_, _, te_ids))) in data.multiling_dataset.items()}
- return tr_ids, te_ids
-
- def sort_indexes(self):
- for (lang, ((Xtr,_,_),(Xte,_,_))) in self.multiling_dataset.items():
- if issparse(Xtr): Xtr.sort_indices()
- if issparse(Xte): Xte.sort_indices()
-
- def set_view(self, categories=None, languages=None):
- if categories is not None:
- if isinstance(categories, int):
- categories = np.array([categories])
- elif isinstance(categories, list):
- categories = np.array(categories)
- self.categories_view = categories
- if languages is not None:
- self.languages_view = languages
-
- def training(self, mask_numbers=False, target_as_csr=False):
- return self.lXtr(mask_numbers), self.lYtr(as_csr=target_as_csr)
-
- def test(self, mask_numbers=False, target_as_csr=False):
- return self.lXte(mask_numbers), self.lYte(as_csr=target_as_csr)
-
- def lXtr(self, mask_numbers=False):
- proc = lambda x:_mask_numbers(x) if mask_numbers else x
- # return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if lang in self.langs()}
- return {lang:proc(Xtr) for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
-
- def lXte(self, mask_numbers=False):
- proc = lambda x: _mask_numbers(x) if mask_numbers else x
- # return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if lang in self.langs()}
- return {lang:proc(Xte) for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()}
-
- def lYtr(self, as_csr=False):
- lY = {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
- if as_csr:
- lY = {l:csr_matrix(Y) for l,Y in lY.items()}
- return lY
-
- def lYte(self, as_csr=False):
- lY = {lang:self.cat_view(Yte) for (lang, (_,(_,Yte,_))) in self.multiling_dataset.items() if lang in self.langs()}
- if as_csr:
- lY = {l:csr_matrix(Y) for l,Y in lY.items()}
- return lY
-
- def cat_view(self, Y):
- if hasattr(self, 'categories_view'):
- return Y[:,self.categories_view]
- else:
- return Y
-
- def langs(self):
- if hasattr(self, 'languages_view'):
- langs = self.languages_view
- else:
- langs = sorted(self.multiling_dataset.keys())
- return langs
-
- def num_categories(self):
- return self.lYtr()[self.langs()[0]].shape[1]
-
- def show_dimensions(self):
- def shape(X):
- return X.shape if hasattr(X, 'shape') else len(X)
- for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items():
- if lang not in self.langs(): continue
- print("Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(lang, shape(Xtr), self.cat_view(Ytr).shape, shape(Xte), self.cat_view(Yte).shape))
-
- def show_category_prevalences(self):
- nC = self.num_categories()
-        accum_tr = np.zeros(nC, dtype=int)
-        accum_te = np.zeros(nC, dtype=int)
-        in_langs = np.zeros(nC, dtype=int)  # count languages with at least one positive example (per category)
- for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items():
- if lang not in self.langs(): continue
- prev_train = np.sum(self.cat_view(Ytr), axis=0)
- prev_test = np.sum(self.cat_view(Yte), axis=0)
- accum_tr += prev_train
- accum_te += prev_test
- in_langs += (prev_train>0)*1
- print(lang+'-train', prev_train)
- print(lang+'-test', prev_test)
- print('all-train', accum_tr)
- print('all-test', accum_te)
-
- return accum_tr, accum_te, in_langs
-
- def set_labels(self, labels):
- self.labels = labels
-
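# A minimal usage sketch (paths are placeholders) of the MultilingualDataset container defined
# above, as it is typically consumed after having been built and saved by the functions below.
data = MultilingualDataset.load('/path/to/some_dataset.pickle')
data.set_view(languages=['en', 'it'])   # optionally restrict the language (and/or category) view
lXtr, lYtr = data.training()            # {lang: X} and {lang: Y} dictionaries
lXte, lYte = data.test()
data.show_dimensions()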
-def _mask_numbers(data):
-    mask_moredigit = re.compile(r'\s[\+-]?\d{5,}([\.,]\d*)*\b')
-    mask_4digit = re.compile(r'\s[\+-]?\d{4}([\.,]\d*)*\b')
-    mask_3digit = re.compile(r'\s[\+-]?\d{3}([\.,]\d*)*\b')
-    mask_2digit = re.compile(r'\s[\+-]?\d{2}([\.,]\d*)*\b')
-    mask_1digit = re.compile(r'\s[\+-]?\d{1}([\.,]\d*)*\b')
- masked = []
- for text in tqdm(data, desc='masking numbers'):
- text = ' ' + text
- text = mask_moredigit.sub(' MoreDigitMask', text)
- text = mask_4digit.sub(' FourDigitMask', text)
- text = mask_3digit.sub(' ThreeDigitMask', text)
- text = mask_2digit.sub(' TwoDigitMask', text)
- text = mask_1digit.sub(' OneDigitMask', text)
- masked.append(text.replace('.','').replace(',','').strip())
- return masked
-
-
-
-
-# ----------------------------------------------------------------------------------------------------------------------
-# Helpers
-# ----------------------------------------------------------------------------------------------------------------------
-def get_active_labels(doclist):
- cat_list = set()
- for d in doclist:
- cat_list.update(d.categories)
- return list(cat_list)
-
-def filter_by_categories(doclist, keep_categories):
- catset = frozenset(keep_categories)
- for d in doclist:
- d.categories = list(set(d.categories).intersection(catset))
-
-def __years_to_str(years):
- if isinstance(years, list):
- if len(years) > 1:
- return str(years[0])+'-'+str(years[-1])
- return str(years[0])
- return str(years)
-
-
-# ----------------------------------------------------------------------------------------------------------------------
-# Matrix builders
-# ----------------------------------------------------------------------------------------------------------------------
-def build_independent_matrices(dataset_name, langs, training_docs, test_docs, label_names, wiki_docs=[], preprocess=True):
- """
- Builds the document-by-term weighted matrices for each language. Representations are independent of each other,
-    i.e., each language-specific matrix lies in a dedicated feature space.
- :param dataset_name: the name of the dataset (str)
- :param langs: list of languages (str)
- :param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
- :param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
- :param label_names: list of names of labels (str)
-    :param wiki_docs: map {lang:doc-list} (optional); if specified, all wiki docs are projected into the feature spaces built for the languages
- :param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming)
- :return: a MultilingualDataset. If wiki_docs has been specified, a dictionary lW is also returned, which indexes
- by language the processed wikipedia documents in their respective language-specific feature spaces
- """
-
- mlb = MultiLabelBinarizer()
- mlb.fit([label_names])
-
- lW = {}
-
- multilingual_dataset = MultilingualDataset()
- multilingual_dataset.dataset_name = dataset_name
- multilingual_dataset.set_labels(mlb.classes_)
- for lang in langs:
- print("\nprocessing %d training, %d test, %d wiki for language <%s>" %
- (len(training_docs[lang]), len(test_docs[lang]), len(wiki_docs[lang]) if wiki_docs else 0, lang))
-
- tr_data, tr_labels, IDtr = zip(*training_docs[lang])
- te_data, te_labels, IDte = zip(*test_docs[lang])
-
- if preprocess:
- tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True,
- tokenizer=NLTKStemTokenizer(lang, verbose=True),
- stop_words=stopwords.words(NLTK_LANGMAP[lang]))
- else:
- tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True)
-
- Xtr = tfidf.fit_transform(tr_data)
- Xte = tfidf.transform(te_data)
- if wiki_docs:
- lW[lang] = tfidf.transform(wiki_docs[lang])
-
- Ytr = mlb.transform(tr_labels)
- Yte = mlb.transform(te_labels)
-
- multilingual_dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)
-
- multilingual_dataset.show_dimensions()
- multilingual_dataset.show_category_prevalences()
-
- if wiki_docs:
- return multilingual_dataset, lW
- else:
- return multilingual_dataset
-
-
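# A toy sketch of the input format expected by build_independent_matrices (and by
# build_juxtaposed_matrices below): per-language lists of (text, categories, id) tuples plus
# the list of label names; the data and names here are made up for illustration only.
toy_train = {'en': [('the trade agreement was signed', ['trade'], 'en_tr_%d' % i) for i in range(10)],
             'it': [("l'accordo commerciale è stato firmato", ['trade'], 'it_tr_%d' % i) for i in range(10)]}
toy_test = {'en': [('another agreement on trade', ['trade'], 'en_te_%d' % i) for i in range(5)],
            'it': [('un altro accordo commerciale', ['trade'], 'it_te_%d' % i) for i in range(5)]}
toy_data = build_independent_matrices('toy', ['en', 'it'], toy_train, toy_test,
                                      label_names=['trade'], preprocess=False)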
-# creates a MultilingualDataset where all matrices share a single juxtaposed feature space
-def build_juxtaposed_matrices(dataset_name, langs, training_docs, test_docs, label_names, preprocess=True):
- """
- Builds the document-by-term weighted matrices for each language. Representations are not independent of each other,
-    since all of them lie on the same juxtaposed feature space.
- :param dataset_name: the name of the dataset (str)
- :param langs: list of languages (str)
- :param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
- :param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
- :param label_names: list of names of labels (str)
- :param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming)
-    :return: a MultilingualDataset in which all languages share the same juxtaposed feature space (unlike
-    build_independent_matrices, this function does not handle wikipedia documents)
- """
-
- multiling_dataset = MultilingualDataset()
- multiling_dataset.dataset_name = dataset_name
-
- mlb = MultiLabelBinarizer()
- mlb.fit([label_names])
-
- multiling_dataset.set_labels(mlb.classes_)
-
- tr_data_stack = []
- for lang in langs:
- print("\nprocessing %d training and %d test for language <%s>" % (len(training_docs[lang]), len(test_docs[lang]), lang))
- tr_data, tr_labels, tr_ID = zip(*training_docs[lang])
- te_data, te_labels, te_ID = zip(*test_docs[lang])
- if preprocess:
- tr_data = preprocess_documents(tr_data, lang)
- te_data = preprocess_documents(te_data, lang)
- tr_data_stack.extend(tr_data)
- multiling_dataset.add(lang, tr_data, tr_labels, te_data, te_labels, tr_ID, te_ID)
-
- tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True)
- tfidf.fit(tr_data_stack)
-
- for lang in langs:
- print("\nweighting documents for language <%s>" % (lang))
- (tr_data, tr_labels, tr_ID), (te_data, te_labels, te_ID) = multiling_dataset[lang]
- Xtr = tfidf.transform(tr_data)
- Xte = tfidf.transform(te_data)
- Ytr = mlb.transform(tr_labels)
- Yte = mlb.transform(te_labels)
- multiling_dataset.add(lang,Xtr,Ytr,Xte,Yte,tr_ID,te_ID)
-
- multiling_dataset.show_dimensions()
- return multiling_dataset
-
-
-# ----------------------------------------------------------------------------------------------------------------------
-# Methods to recover the original documents from the MultilingualDataset's ids
-# ----------------------------------------------------------------------------------------------------------------------
-"""
-This method has been added a posteriori, to create document embeddings using the polylingual embeddings of the recent
-article 'Word Translation without Parallel Data'; basically, it takes one of the splits, retrieves the raw RCV documents
-from the stored doc ids, and saves to outpath a MultilingualDataset containing the (analyzed) texts and the label names
-"""
-def retrieve_rcv_documents_from_dataset(datasetpath, rcv1_data_home, rcv2_data_home, outpath):
-
- tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath)
- assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te'
- langs = list(tr_ids.keys())
-
- print('fetching the datasets')
- rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train')
- rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en'])
-
- filter_by_categories(rcv1_documents, labels_rcv2)
- filter_by_categories(rcv2_documents, labels_rcv1)
-
- label_names = get_active_labels(rcv1_documents + rcv2_documents)
- print('Active labels in RCV1/2 {}'.format(len(label_names)))
-
- print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names)))
- print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents]))
-
- all_docs = rcv1_documents + rcv2_documents
- mlb = MultiLabelBinarizer()
- mlb.fit([label_names])
-
- dataset = MultilingualDataset()
- for lang in langs:
- analyzer = CountVectorizer(strip_accents='unicode', min_df=3,
- stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer()
-
- Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in tr_ids[lang]])
- Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in te_ids[lang]])
- Xtr = [' '.join(analyzer(d)) for d in Xtr]
- Xte = [' '.join(analyzer(d)) for d in Xte]
- Ytr = mlb.transform(Ytr)
- Yte = mlb.transform(Yte)
- dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)
-
- dataset.save(outpath)
-
-"""
-Same as retrieve_rcv_documents_from_dataset, but for JRC-Acquis.
-"""
-def retrieve_jrc_documents_from_dataset(datasetpath, jrc_data_home, train_years, test_years, cat_policy, most_common_cat, outpath):
-
- tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath)
- assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te'
- langs = list(tr_ids.keys())
-
- print('fetching the datasets')
-
- cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
- training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years,
- cat_filter=cat_list, cat_threshold=1, parallel=None,
- most_frequent=most_common_cat)
- test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names,
- parallel='force')
-
- def filter_by_id(doclist, ids):
- ids_set = frozenset(itertools.chain.from_iterable(ids.values()))
- return [x for x in doclist if (x.parallel_id+'__'+x.id) in ids_set]
-
- training_docs = filter_by_id(training_docs, tr_ids)
- test_docs = filter_by_id(test_docs, te_ids)
-
- print('jrc: {} train, {} test, {} categories'.format(len(training_docs), len(test_docs), len(label_names)))
-
- mlb = MultiLabelBinarizer()
- mlb.fit([label_names])
-
- dataset = MultilingualDataset()
- for lang in langs:
- analyzer = CountVectorizer(strip_accents='unicode', min_df=3,
- stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer()
-
- Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in training_docs if d.lang == lang])
- Xte,Yte,IDte = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in test_docs if d.lang == lang])
- Xtr = [' '.join(analyzer(d)) for d in Xtr]
- Xte = [' '.join(analyzer(d)) for d in Xte]
- Ytr = mlb.transform(Ytr)
- Yte = mlb.transform(Yte)
- dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)
-
- dataset.save(outpath)
-
-# ----------------------------------------------------------------------------------------------------------------------
-# Dataset Generators
-# ----------------------------------------------------------------------------------------------------------------------
-def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test_years, cat_policy, most_common_cat=-1, max_wiki=5000, run=0):
- from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample
-
-
- """
- Prepare all datasets for JRC-Acquis. The datasets include the "feature-independent" version, the
- "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices.
- In all cases, training documents are strictly non-parallel, and test documents are strictly parallel
- :param jrc_data_home: path to the raw JRC-Acquis documents (it will be downloaded if not found), and the path where
- all splits will be generated
-    :param wiki_data_home: path to the wikipedia dump (see data/reader/wikipedia_tools.py)
- :param langs: the list of languages to consider (as defined in data/languages.py)
- :param train_years: a list of ints containing the years to be considered as training documents
- :param test_years: a list of ints containing the years to be considered as test documents
- :param cat_policy: a string indicating which category selection policy to apply. Valid policies are, e.g., "all"
- (select all categories), "broadest" (select only the broadest concepts in the taxonomy), or "leaves" (select the
-    leaf concepts in the taxonomy). See inspect_eurovoc from data/reader/jrcacquis_reader.py for more details
- :param most_common_cat: the maximum number of most common categories to consider, or -1 to keep them all
- :param max_wiki: the maximum number of wikipedia documents to consider (default 5000)
- :param run: a numeric label naming the random split (useful to keep track of different runs)
- :return: None
- """
-
- name = 'JRCacquis'
- run = '_run' + str(run)
- config_name = 'jrc_nltk_' + __years_to_str(train_years) + \
- 'vs' + __years_to_str(test_years) + \
- '_' + cat_policy + \
- ('_top' + str(most_common_cat) if most_common_cat!=-1 else '') + \
- '_noparallel_processed'
-
- indep_path = join(jrc_data_home, config_name + run + '.pickle')
- upper_path = join(jrc_data_home, config_name + run + '_upper.pickle')
- yuxta_path = join(jrc_data_home, config_name + run + '_yuxtaposed.pickle')
- wiki_path = join(jrc_data_home, config_name + run + '.wiki.pickle')
- wiki_docs_path = join(jrc_data_home, config_name + '.wiki.raw.pickle')
-
- cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
- training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years,
- cat_filter=cat_list, cat_threshold=1, parallel=None,
- most_frequent=most_common_cat)
- test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names,
- parallel='force')
-
- print('Generating feature-independent dataset...')
- training_docs_no_parallel = random_sampling_avoiding_parallel(training_docs)
-
- def _group_by_lang(doc_list, langs):
- return {lang: [(d.text, d.categories, d.parallel_id + '__' + d.id) for d in doc_list if d.lang == lang]
- for lang in langs}
-
- training_docs = _group_by_lang(training_docs, langs)
- training_docs_no_parallel = _group_by_lang(training_docs_no_parallel, langs)
- test_docs = _group_by_lang(test_docs, langs)
- if not exists(indep_path):
- wiki_docs=None
- if max_wiki>0:
- if not exists(wiki_docs_path):
- wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False)
- wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
- pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL)
- else:
- wiki_docs = pickle.load(open(wiki_docs_path, 'rb'))
- wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
-
- if wiki_docs:
- lang_data, wiki_docs = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names, wiki_docs)
- pickle.dump(wiki_docs, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL)
- else:
- lang_data = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names)
-
- lang_data.save(indep_path)
-
- print('Generating upper-bound (English-only) dataset...')
- if not exists(upper_path):
- training_docs_eng_only = {'en':training_docs['en']}
- test_docs_eng_only = {'en':test_docs['en']}
- build_independent_matrices(name, ['en'], training_docs_eng_only, test_docs_eng_only, label_names).save(upper_path)
-
- print('Generating yuxtaposed dataset...')
- if not exists(yuxta_path):
- build_juxtaposed_matrices(name, langs, training_docs_no_parallel, test_docs, label_names).save(yuxta_path)
-
-
-def prepare_rcv_datasets(outpath, rcv1_data_home, rcv2_data_home, wiki_data_home, langs,
- train_for_lang=1000, test_for_lang=1000, max_wiki=5000, preprocess=True, run=0):
- from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample
- """
- Prepare all datasets for RCV1/RCV2. The datasets include the "feature-independent" version, the
- "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices.
-
- :param outpath: path where all splits will be dumped
- :param rcv1_data_home: path to the RCV1-v2 dataset (English only)
- :param rcv2_data_home: path to the RCV2 dataset (all languages other than English)
-    :param wiki_data_home: path to the wikipedia dump (see data/reader/wikipedia_tools.py)
- :param langs: the list of languages to consider (as defined in data/languages.py)
- :param train_for_lang: maximum number of training documents per language
- :param test_for_lang: maximum number of test documents per language
- :param max_wiki: the maximum number of wikipedia documents to consider (default 5000)
- :param preprocess: whether or not to apply language-specific preprocessing (stopwords removal and stemming)
- :param run: a numeric label naming the random split (useful to keep track of different runs)
- :return: None
- """
-
- assert 'en' in langs, 'English is not in requested languages, but is needed for some datasets'
-    assert len(langs)>1, 'the multilingual dataset cannot be built with only one language'
-    assert not preprocess or set(langs).issubset(set(RCV2_LANGS_WITH_NLTK_STEMMING+['en'])), \
-        "languages not in the RCV1-v2/RCV2 scope, or not valid for NLTK's preprocessing"
-
- name = 'RCV1/2'
- run = '_run' + str(run)
- config_name = 'rcv1-2_nltk_trByLang'+str(train_for_lang)+'_teByLang'+str(test_for_lang)+\
- ('_processed' if preprocess else '_raw')
-
- indep_path = join(outpath, config_name + run + '.pickle')
- upper_path = join(outpath, config_name + run +'_upper.pickle')
- yuxta_path = join(outpath, config_name + run +'_yuxtaposed.pickle')
- wiki_path = join(outpath, config_name + run + '.wiki.pickle')
- wiki_docs_path = join(outpath, config_name + '.wiki.raw.pickle')
-
- print('fetching the datasets')
- rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train')
- rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l!='en'])
- filter_by_categories(rcv1_documents, labels_rcv2)
- filter_by_categories(rcv2_documents, labels_rcv1)
-
- label_names = get_active_labels(rcv1_documents+rcv2_documents)
- print('Active labels in RCV1/2 {}'.format(len(label_names)))
-
- print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names)))
- print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents]))
-
- lang_docs = {lang: [d for d in rcv1_documents + rcv2_documents if d.lang == lang] for lang in langs}
-
-    # the upper bound has no parallel versions, so for English we take as many training documents as the whole
-    # multilingual case will use (train_for_lang * len(langs)); only the first train_for_lang of them are then
-    # kept for the multilingual splits
- print('Generating upper-bound (English-only) dataset...')
- train, test = train_test_split(lang_docs['en'], train_size=train_for_lang*len(langs), test_size=test_for_lang, shuffle=True)
- train_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in train]}
- test_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in test]}
- build_independent_matrices(name, ['en'], train_lang_doc_map, test_lang_doc_map, label_names).save(upper_path)
-
- train_lang_doc_map['en'] = train_lang_doc_map['en'][:train_for_lang]
- for lang in langs:
- if lang=='en': continue # already split
- test_take = min(test_for_lang, len(lang_docs[lang])-train_for_lang)
- train, test = train_test_split(lang_docs[lang], train_size=train_for_lang, test_size=test_take, shuffle=True)
- train_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in train]
- test_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in test]
-
- print('Generating feature-independent dataset...')
- wiki_docs=None
- if max_wiki>0:
- if not exists(wiki_docs_path):
- wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False)
- wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
- pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL)
- else:
- wiki_docs = pickle.load(open(wiki_docs_path, 'rb'))
- wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
-
- if wiki_docs:
- lang_data, wiki_docs_matrix = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess)
- pickle.dump(wiki_docs_matrix, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL)
- else:
- lang_data = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess)
-
- lang_data.save(indep_path)
-
- print('Generating yuxtaposed dataset...')
- build_juxtaposed_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, preprocess).save(yuxta_path)
-
-
-# ----------------------------------------------------------------------------------------------------------------------
-# Methods to generate full RCV and JRC datasets
-# ----------------------------------------------------------------------------------------------------------------------
-def full_rcv_(rcv1_data_home, rcv2_data_home, outpath, langs):
-
-
- print('fetching the datasets')
- rcv1_train_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train')
- rcv1_test_documents, labels_rcv1_test = fetch_RCV1(rcv1_data_home, split='test')
- rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en'])
-
- filter_by_categories(rcv1_train_documents, labels_rcv2)
- filter_by_categories(rcv1_test_documents, labels_rcv2)
- filter_by_categories(rcv2_documents, labels_rcv1)
-
- label_names = get_active_labels(rcv1_train_documents + rcv2_documents)
- print('Active labels in RCV1/2 {}'.format(len(label_names)))
-
- print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_train_documents), len(rcv1_test_documents), len(label_names)))
- print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents]))
-
- mlb = MultiLabelBinarizer()
- mlb.fit([label_names])
-
- all_docs = rcv1_train_documents + rcv1_test_documents + rcv2_documents
- lang_docs = {lang: [d for d in all_docs if d.lang == lang] for lang in langs}
-
- def get_ids(doclist):
- return frozenset([d.id for d in doclist])
-
- tr_ids = {'en': get_ids(rcv1_train_documents)}
- te_ids = {'en': get_ids(rcv1_test_documents)}
- for lang in langs:
- if lang == 'en': continue
- tr_ids[lang], te_ids[lang] = train_test_split([d.id for d in lang_docs[lang]], test_size=.3)
-
- dataset = MultilingualDataset()
- dataset.dataset_name = 'RCV1/2-full'
- for lang in langs:
-        print(f'processing {lang} with {len(tr_ids[lang])} training and {len(te_ids[lang])} test documents')
- analyzer = CountVectorizer(
- strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang])
- ).build_analyzer()
-
- Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in tr_ids[lang]])
- Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in te_ids[lang]])
- Xtr = [' '.join(analyzer(d)) for d in Xtr]
- Xte = [' '.join(analyzer(d)) for d in Xte]
- Ytr = mlb.transform(Ytr)
- Yte = mlb.transform(Yte)
- dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte)
-
- dataset.save(outpath)
-
-
-def full_jrc_(jrc_data_home, langs, train_years, test_years, outpath, cat_policy='all', most_common_cat=300):
-
- print('fetching the datasets')
- cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
- training_docs, label_names = fetch_jrcacquis(
- langs=langs, data_path=jrc_data_home, years=train_years, cat_filter=cat_list, cat_threshold=1, parallel=None, most_frequent=most_common_cat
- )
- test_docs, _ = fetch_jrcacquis(
- langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, parallel='force'
- )
-
- def _group_by_lang(doc_list, langs):
- return {lang: [d for d in doc_list if d.lang == lang] for lang in langs}
-
- training_docs = _group_by_lang(training_docs, langs)
- test_docs = _group_by_lang(test_docs, langs)
-
- mlb = MultiLabelBinarizer()
- mlb.fit([label_names])
-
- dataset = MultilingualDataset()
-    dataset.dataset_name = 'JRC-Acquis-full'
- for lang in langs:
- analyzer = CountVectorizer(
- strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang])
- ).build_analyzer()
-
- Xtr, Ytr, IDtr = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in training_docs[lang] if d.lang == lang])
- Xte, Yte, IDte = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in test_docs[lang] if d.lang == lang])
- Xtr = [' '.join(analyzer(d)) for d in Xtr]
- Xte = [' '.join(analyzer(d)) for d in Xte]
- Ytr = mlb.transform(Ytr)
- Yte = mlb.transform(Yte)
- dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte)
-
- dataset.save(outpath)
-
-
-#-----------------------------------------------------------------------------------------------------------------------
-# MAIN BUILDER
-#-----------------------------------------------------------------------------------------------------------------------
-
-if __name__=='__main__':
- import sys
- RCV1_PATH = '../Datasets/RCV1-v2/unprocessed_corpus'
- RCV2_PATH = '../Datasets/RCV2'
- JRC_DATAPATH = "../Datasets/JRC_Acquis_v3"
- full_rcv_(RCV1_PATH, RCV2_PATH, outpath='../rcv2/rcv1-2_doclist_full_processed.pickle', langs=RCV2_LANGS_WITH_NLTK_STEMMING + ['en'])
- # full_jrc_(JRC_DATAPATH, lang_set['JRC_NLTK'], train_years=list(range(1958, 2006)), test_years=[2006], outpath='../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle', cat_policy='all', most_common_cat=300)
- sys.exit(0)
-
- # datasetpath = '../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle' # '../rcv2/rcv1-2_doclist_full_processed.pickle'
- # data = MultilingualDataset.load(datasetpath)
- # data.dataset_name='JRC-Acquis-full'#'RCV1/2-full'
- # for lang in RCV2_LANGS_WITH_NLTK_STEMMING + ['en']:
- # (Xtr, ytr, idtr), (Xte, yte, idte) = data.multiling_dataset[lang]
- # data.multiling_dataset[lang] = ((_mask_numbers(Xtr), ytr, idtr), (_mask_numbers(Xte), yte, idte))
- # data.save('../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle')#'../rcv2/rcv1-2_doclist_full_processed_2.pickle')
- # sys.exit(0)
-
-    assert len(sys.argv) == 5, "wrong number of arguments; required: " \
-                               "<JRC_DATAPATH> <RCV1_PATH> <RCV2_PATH> <WIKI_DATAPATH>"
-
- JRC_DATAPATH = sys.argv[1] # "../Datasets/JRC_Acquis_v3"
- RCV1_PATH = sys.argv[2] #'../Datasets/RCV1-v2/unprocessed_corpus'
- RCV2_PATH = sys.argv[3] #'../Datasets/RCV2'
- WIKI_DATAPATH = sys.argv[4] #"../Datasets/Wikipedia/multilingual_docs_JRC_NLTK"
-
- langs = lang_set['JRC_NLTK']
- max_wiki = 5000
-
- for run in range(0,10):
- print('Building JRC-Acquis datasets run', run)
- prepare_jrc_datasets(JRC_DATAPATH, WIKI_DATAPATH, langs,
- train_years=list(range(1958, 2006)), test_years=[2006], max_wiki=max_wiki,
- cat_policy='all', most_common_cat=300, run=run)
-
- print('Building RCV1-v2/2 datasets run', run)
- prepare_rcv_datasets(RCV2_PATH, RCV1_PATH, RCV2_PATH, WIKI_DATAPATH, RCV2_LANGS_WITH_NLTK_STEMMING + ['en'],
- train_for_lang=1000, test_for_lang=1000, max_wiki=max_wiki, run=run)
-
- # uncomment this code if you want to retrieve the original documents to generate the data splits for PLE
- # (make sure you have not modified the above parameters, or adapt the following paths accordingly...)
- # datasetpath = join(RCV2_PATH,'rcv1-2_nltk_trByLang1000_teByLang1000_processed_run{}.pickle'.format(run))
- # outpath = datasetpath.replace('_nltk_','_doclist_')
- # retrieve_rcv_documents_from_dataset(datasetpath, RCV1_PATH, RCV2_PATH, outpath)
-
- # datasetpath = join(JRC_DATAPATH, 'jrc_nltk_1958-2005vs2006_all_top300_noparallel_processed_run{}.pickle'.format(run))
- # outpath = datasetpath.replace('_nltk_', '_doclist_')
- # retrieve_jrc_documents_from_dataset(datasetpath, JRC_DATAPATH, train_years=list(range(1958, 2006)), test_years=[2006], cat_policy='all', most_common_cat=300, outpath=outpath)
-
-
-
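All of the builders in this file serialize their output as a MultilingualDataset pickle, which the experiment scripts further below consume through load()/training()/test(). A hedged usage sketch follows; the pickle path is hypothetical and must point to a file produced by one of the prepare_* runs above.

    from dataset_builder import MultilingualDataset

    # hypothetical path to a pickle produced by prepare_rcv_datasets or prepare_jrc_datasets
    datasetpath = '../rcv2/rcv1-2_nltk_trByLang1000_teByLang1000_processed_run0.pickle'

    data = MultilingualDataset.load(datasetpath)
    data.show_dimensions()

    lXtr, lytr = data.training()   # {lang: X}, {lang: y}
    lXte, lyte = data.test()
    for lang in data.langs():
        print(lang, lXtr[lang].shape, lytr[lang].shape)
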
diff --git a/src/embeddings/__init__.py b/src/embeddings/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/embeddings/embeddings.py b/src/embeddings/embeddings.py
deleted file mode 100644
index 27367e9..0000000
--- a/src/embeddings/embeddings.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import os
-from torchtext.vocab import Vectors
-import torch
-from abc import ABC, abstractmethod
-from util.SIF_embed import *
-
-
-class PretrainedEmbeddings(ABC):
-
- def __init__(self):
- super().__init__()
-
- @abstractmethod
- def vocabulary(self): pass
-
- @abstractmethod
- def dim(self): pass
-
- @classmethod
- def reindex(cls, words, word2index):
- if isinstance(words, dict):
- words = list(zip(*sorted(words.items(), key=lambda x: x[1])))[0]
-
- source_idx, target_idx = [], []
- for i, word in enumerate(words):
- if word not in word2index: continue
- j = word2index[word]
- source_idx.append(i)
- target_idx.append(j)
- source_idx = np.asarray(source_idx)
- target_idx = np.asarray(target_idx)
- return source_idx, target_idx
-
-
-class FastTextWikiNews(Vectors):
-
-    url_base = 'Cannot auto-download MUSE embeddings'
- path = '../embeddings/wiki.multi.{}.vec'
- _name = '/wiki.multi.{}.vec'
-
- def __init__(self, cache, language="en", **kwargs):
- url = self.url_base.format(language)
- name = cache + self._name.format(language)
- super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
-
-
-class FastTextMUSE(PretrainedEmbeddings):
- def __init__(self, path, lang, limit=None):
- super().__init__()
-        assert os.path.exists(path), f'pre-trained vectors not found in {path}'
- self.embed = FastTextWikiNews(path, lang, max_vectors=limit)
-
- def vocabulary(self):
- return set(self.embed.stoi.keys())
-
- def dim(self):
- return self.embed.dim
-
- def extract(self, words):
- source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
- extraction = torch.zeros((len(words), self.dim()))
- extraction[source_idx] = self.embed.vectors[target_idx]
- return extraction
-
-
-
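A hedged usage sketch for FastTextMUSE as defined above: it loads the aligned MUSE vectors for one language from a local cache and extracts a matrix aligned with an arbitrary word list, leaving out-of-vocabulary words as zero rows. The cache directory and the module path embeddings.embeddings are assumptions.

    import torch
    from embeddings.embeddings import FastTextMUSE  # assumed module path

    # assumes ../embeddings/wiki.multi.en.vec exists locally (MUSE vectors cannot be auto-downloaded)
    muse = FastTextMUSE(path='../embeddings', lang='en', limit=50000)

    words = ['economy', 'policy', 'xyz_not_in_vocab']
    E = muse.extract(words)                 # torch.FloatTensor of shape (len(words), muse.dim())
    print(E.shape, torch.count_nonzero(E))  # the out-of-vocabulary row stays at zero
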
diff --git a/src/embeddings/pretrained.py b/src/embeddings/pretrained.py
deleted file mode 100644
index 026823e..0000000
--- a/src/embeddings/pretrained.py
+++ /dev/null
@@ -1,102 +0,0 @@
-from abc import ABC, abstractmethod
-import torch, torchtext
-# import gensim
-# import os
-import numpy as np
-
-
-# class KeyedVectors:
-#
-# def __init__(self, word2index, weights):
-# assert len(word2index)==weights.shape[0], 'wrong number of dimensions'
-# index2word = {i:w for w,i in word2index.items()}
-# assert len([i for i in range(len(index2word)) if i not in index2word])==0, 'gaps in indexing not allowed'
-# self.word2index = word2index
-# self.index2word = index2word
-# self.weights = weights
-#
-# def extract(self, words):
-# dim = self.weights.shape[1]
-# v_size = len(words)
-#
-# source_idx, target_idx = [], []
-# for i,word in enumerate(words):
-# if word not in self.word2index: continue
-# j = self.word2index[word]
-# source_idx.append(i)
-# target_idx.append(j)
-#
-# extraction = np.zeros((v_size, dim))
-# extraction[np.asarray(source_idx)] = self.weights[np.asarray(target_idx)]
-#
-# return extraction
-
-
-# class PretrainedEmbeddings(ABC):
-#
-# def __init__(self):
-# super().__init__()
-#
-# @abstractmethod
-# def vocabulary(self): pass
-#
-# @abstractmethod
-# def dim(self): pass
-#
-# @classmethod
-# def reindex(cls, words, word2index):
-# source_idx, target_idx = [], []
-# for i, word in enumerate(words):
-# if word not in word2index: continue
-# j = word2index[word]
-# source_idx.append(i)
-# target_idx.append(j)
-# source_idx = np.asarray(source_idx)
-# target_idx = np.asarray(target_idx)
-# return source_idx, target_idx
-
-
-# class GloVe(PretrainedEmbeddings):
-#
-# def __init__(self, setname='840B', path='./vectors_cache', max_vectors=None):
-# super().__init__()
-# print(f'Loading GloVe pretrained vectors from torchtext')
-# self.embed = torchtext.vocab.GloVe(setname, cache=path, max_vectors=max_vectors)
-# print('Done')
-#
-# def vocabulary(self):
-# return set(self.embed.stoi.keys())
-#
-# def dim(self):
-# return self.embed.dim
-#
-# def extract(self, words):
-# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
-# extraction = torch.zeros((len(words), self.dim()))
-# extraction[source_idx] = self.embed.vectors[target_idx]
-# return extraction
-
-
-# class Word2Vec(PretrainedEmbeddings):
-#
-# def __init__(self, path, limit=None):
-# super().__init__()
-# print(f'Loading word2vec pretrained vectors from {path}')
-# assert os.path.exists(path), print(f'pre-trained keyed vectors not found in {path}')
-# self.embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, limit=limit)
-# self.word2index={w:i for i,w in enumerate(self.embed.index2word)}
-# print('Done')
-#
-# def vocabulary(self):
-# return set(self.word2index.keys())
-#
-# def dim(self):
-# return self.embed.vector_size
-#
-# def extract(self, words):
-# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.word2index)
-# extraction = np.zeros((len(words), self.dim()))
-# extraction[source_idx] = self.embed.vectors[target_idx]
-# extraction = torch.from_numpy(extraction).float()
-# return extraction
-
diff --git a/src/embeddings/supervised.py b/src/embeddings/supervised.py
deleted file mode 100755
index f84793e..0000000
--- a/src/embeddings/supervised.py
+++ /dev/null
@@ -1,74 +0,0 @@
-from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
-import numpy as np
-
-
-def zscores(x, axis=0):  # scipy.stats.zscore does not avoid division by 0, which can indeed occur
- std = np.clip(np.std(x, ddof=1, axis=axis), 1e-5, None)
- mean = np.mean(x, axis=axis)
- return (x - mean) / std
-
-
-def supervised_embeddings_tfidf(X,Y):
- tfidf_norm = X.sum(axis=0)
- tfidf_norm[tfidf_norm==0] = 1
- F = (X.T).dot(Y) / tfidf_norm.T
- return F
-
-
-def supervised_embeddings_ppmi(X,Y):
- Xbin = X>0
- D = X.shape[0]
- Pxy = (Xbin.T).dot(Y)/D
- Px = Xbin.sum(axis=0)/D
- Py = Y.sum(axis=0)/D
- F = np.asarray(Pxy/(Px.T*Py))
- F = np.maximum(F, 1.0)
- F = np.log(F)
- return F
-
-
-def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents=25000):
- D = X.shape[0]
- if D>max_documents:
- print(f'sampling {max_documents}')
- random_sample = np.random.permutation(D)[:max_documents]
- X = X[random_sample]
- Y = Y[random_sample]
- cell_matrix = get_supervised_matrix(X, Y)
- F = get_tsr_matrix(cell_matrix, tsr_score_funtion=tsr_function).T
- return F
-
-
-def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True):
- if max_label_space != 0:
- print('computing supervised embeddings...')
- nC = Y.shape[1]
-
- if method=='ppmi':
- F = supervised_embeddings_ppmi(X, Y)
- elif method == 'dotn':
- F = supervised_embeddings_tfidf(X, Y)
- elif method == 'ig':
- F = supervised_embeddings_tsr(X, Y, information_gain)
- elif method == 'chi2':
- F = supervised_embeddings_tsr(X, Y, chi_square)
-
- if dozscore:
- F = zscores(F, axis=0)
-
- # Dumping F-matrix for further studies
- dump_it = False
- if dump_it:
- with open(f'/home/andreapdr/funneling_pdr/src/dumps/WCE_{lang}.tsv', 'w') as outfile:
- np.savetxt(outfile, F, delimiter='\t')
- with open(f'/home/andreapdr/funneling_pdr/src/dumps/dict_WCE_{lang}.tsv', 'w') as outfile:
- for token in voc.keys():
- outfile.write(token+'\n')
-
- return F
-
-
-
-
-
-
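For reference, the 'dotn' option above builds word-class embeddings as F = X^T Y normalized by each term's total tf-idf mass, so each row is a |C|-dimensional profile of how strongly that term co-occurs with each class. A tiny dense numpy sketch of that product (toy matrices only, not the sparse code path used above):

    import numpy as np

    # toy tf-idf matrix X (3 docs x 4 terms) and binary label matrix Y (3 docs x 2 classes)
    X = np.array([[0.5, 0.0, 0.2, 0.0],
                  [0.0, 0.7, 0.0, 0.1],
                  [0.4, 0.0, 0.0, 0.3]])
    Y = np.array([[1, 0],
                  [0, 1],
                  [1, 0]])

    tfidf_norm = X.sum(axis=0)               # total tf-idf mass of each term
    tfidf_norm[tfidf_norm == 0] = 1          # avoid division by zero, as in supervised_embeddings_tfidf
    F = (X.T @ Y) / tfidf_norm[:, None]      # (terms x classes) word-class embedding
    print(F)
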
diff --git a/src/experiment_scripts/10run_dl_jrc.sh b/src/experiment_scripts/10run_dl_jrc.sh
deleted file mode 100644
index ce04aa8..0000000
--- a/src/experiment_scripts/10run_dl_jrc.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/usr/bin/env bash
-
-dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
-logfile=../log/log10run_dl_jrc.csv
-
-runs='0 1 2 3 4 5 6 7 8 9'
-for run in $runs
-do
- dataset=$dataset_path$run.pickle
- python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
-done
\ No newline at end of file
diff --git a/src/experiment_scripts/10run_dl_rcv.sh b/src/experiment_scripts/10run_dl_rcv.sh
deleted file mode 100644
index 51ca64b..0000000
--- a/src/experiment_scripts/10run_dl_rcv.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/usr/bin/env bash
-
-dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
-logfile=../log/log10run_dl_rcv.csv
-
-runs='0 1 2 3 4 5 6 7 8 9'
-for run in $runs
-do
- dataset=$dataset_path$run.pickle
- python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
-done
diff --git a/src/experiment_scripts/10run_jrc.sh b/src/experiment_scripts/10run_jrc.sh
deleted file mode 100644
index 37e3333..0000000
--- a/src/experiment_scripts/10run_jrc.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
-logfile=./results/10run_jrc_final_results.csv
-
-runs='0 1 2 3 4 5 6 7 8 9'
-for run in $runs
-do
- dataset=$dataset_path$run.pickle
- python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2
- python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2
- python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2
-
-done
diff --git a/src/experiment_scripts/10run_jrc_combinations.sh b/src/experiment_scripts/10run_jrc_combinations.sh
deleted file mode 100644
index 156a0a5..0000000
--- a/src/experiment_scripts/10run_jrc_combinations.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
-logfile=./results/funnelling_10run_jrc_CIKM.csv
-
-runs='6 7 8 9' #0 1 2 3 4 5
-for run in $runs
-do
- dataset=$dataset_path$run.pickle
- #python main_gFun.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated (done up to run5)
- python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated
- #python main_gFun.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob
- #python main_gFun.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob
- #python main_gFun.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob
- #python main_gFun.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob
- #python main_gFun.py $dataset -o $logfile -c -P -U -r -z --l2
- #python main_gFun.py $dataset -o $logfile -c -P -U -S -r -z --l2
-done
\ No newline at end of file
diff --git a/src/experiment_scripts/10run_rcv.sh b/src/experiment_scripts/10run_rcv.sh
deleted file mode 100644
index 9d49f94..0000000
--- a/src/experiment_scripts/10run_rcv.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
-logfile=./results/10run_rcv_final_results.csv
-
-runs='0 1 2 3 4 5 6 7 8 9'
-
-for run in $runs
-do
- dataset=$dataset_path$run.pickle
- python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2
- python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2
- python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2
-
-done
-
-
diff --git a/src/experiment_scripts/10run_rcv_combinations.sh b/src/experiment_scripts/10run_rcv_combinations.sh
deleted file mode 100644
index b5d8a3b..0000000
--- a/src/experiment_scripts/10run_rcv_combinations.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
-logfile=./results/funnelling_10run_rcv_CIKM_allprob_concatenated.csv
-
-runs='0 1 2 3 4 5 6 7 8 9'
-for run in $runs
-do
- dataset=$dataset_path$run.pickle
- #python main_gFun.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated
- python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated
- #python main_gFun.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob
- #python main_gFun.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob
- #python main_gFun.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob
- #python main_gFun.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob
- #python main_gFun.py $dataset -o $logfile -c -P -U -r -z --l2
- #python main_gFun.py $dataset -o $logfile -c -P -U -S -r -z --l2
-done
\ No newline at end of file
diff --git a/src/experiment_scripts/extract_features.sh b/src/experiment_scripts/extract_features.sh
deleted file mode 100644
index d0bd3ac..0000000
--- a/src/experiment_scripts/extract_features.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/usr/bin/env bash
-
-dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
-
-runs='1 2 3 4 5 6 7 8 9'
-for run in $runs
-do
- dataset=$dataset_path$run.pickle
-    modelpath=/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run$run
- python main_mbert_extractor.py --dataset $dataset --modelpath $modelpath
-done
-
-dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
-python main_mbert_extractor.py --dataset $dataset --modelpath $modelpath
\ No newline at end of file
diff --git a/src/experiment_scripts/main_deep_learning.py b/src/experiment_scripts/main_deep_learning.py
deleted file mode 100755
index ee56054..0000000
--- a/src/experiment_scripts/main_deep_learning.py
+++ /dev/null
@@ -1,329 +0,0 @@
-import argparse
-import torch.nn as nn
-from torch.optim.lr_scheduler import StepLR
-from dataset_builder import MultilingualDataset
-from learning.transformers import load_muse_embeddings
-from models.lstm_class import RNNMultilingualClassifier
-from util.csv_log import CSVLog
-from util.early_stop import EarlyStopping
-from util.common import *
-from util.file import create_if_not_exist
-from time import time
-from tqdm import tqdm
-from util.evaluation import evaluate
-from util.file import get_file_name
-# import pickle
-
-allowed_nets = {'rnn'}
-
-# instantiates the net, initializes the model parameters, and sets embeddings trainable if requested
-def init_Net(nC, multilingual_index, xavier_uniform=True):
- net=opt.net
- assert net in allowed_nets, f'{net} not supported, valid ones are={allowed_nets}'
-
- # instantiate the required net
- if net=='rnn':
- only_post = opt.posteriors and (not opt.pretrained) and (not opt.supervised)
- if only_post:
- print('working on ONLY POST mode')
- model = RNNMultilingualClassifier(
- output_size=nC,
- hidden_size=opt.hidden,
- lvocab_size=multilingual_index.l_vocabsize(),
- learnable_length=opt.learnable,
- lpretrained=multilingual_index.l_embeddings(),
- drop_embedding_range=multilingual_index.sup_range,
- drop_embedding_prop=opt.sup_drop,
- post_probabilities=opt.posteriors,
- only_post=only_post,
- bert_embeddings=opt.mbert
- )
-
- # weight initialization
- if xavier_uniform:
- for p in model.parameters():
- if p.dim() > 1 and p.requires_grad:
- nn.init.xavier_uniform_(p)
-
- if opt.tunable:
- # this has to be performed *after* Xavier initialization is done,
-            # otherwise the pretrained embedding parameters will be overridden
- model.finetune_pretrained()
-
- return model.cuda()
-
-
-def set_method_name():
- method_name = f'{opt.net}(H{opt.hidden})'
- if opt.pretrained:
- method_name += f'-Muse'
- if opt.supervised:
- method_name += f'-WCE'
- if opt.posteriors:
- method_name += f'-Posteriors'
- if opt.mbert:
- method_name += f'-mBert'
- if (opt.pretrained or opt.supervised) and opt.tunable:
- method_name += '-(trainable)'
- else:
- method_name += '-(static)'
- if opt.learnable > 0:
- method_name += f'-Learnable{opt.learnable}'
- return method_name
-
-
-def init_optimizer(model, lr):
- return torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr, weight_decay=opt.weight_decay)
-
-
-def init_logfile(method_name, opt):
- logfile = CSVLog(opt.log_file, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse'])
- logfile.set_default('dataset', opt.dataset)
- logfile.set_default('run', opt.seed)
- logfile.set_default('method', method_name)
- assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \
- f'and run {opt.seed} already calculated'
- return logfile
-
-
-# loads the MUSE embeddings if requested, or returns empty dictionaries otherwise
-def load_pretrained_embeddings(we_path, langs):
- lpretrained = lpretrained_vocabulary = none_dict(langs)
- if opt.pretrained:
- lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1)
- lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs}
- return lpretrained, lpretrained_vocabulary
-
-
-def get_lr(optimizer):
- for param_group in optimizer.param_groups:
- return param_group['lr']
-
-
-def train(model, batcher, ltrain_index, ltrain_posteriors, ltrain_bert, lytr, tinit, logfile, criterion, optim, epoch, method_name):
- _dataset_path = opt.dataset.split('/')[-1].split('_')
- dataset_id = _dataset_path[0] + _dataset_path[-1]
-
- loss_history = []
- model.train()
- for idx, (batch, post, bert_emb, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)):
- optim.zero_grad()
- # _out = model(batch, post, bert_emb, lang)
- loss = criterion(model(batch, post, bert_emb, lang), target)
- loss.backward()
- clip_gradient(model)
- optim.step()
- loss_history.append(loss.item())
-
- if idx % opt.log_interval == 0:
- interval_loss = np.mean(loss_history[-opt.log_interval:])
- print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
-
-    mean_loss = np.mean(loss_history)  # average training loss over the whole epoch
- logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
- return mean_loss
-
-
-def test(model, batcher, ltest_index, ltest_posteriors, lte_bert, lyte, tinit, epoch, logfile, criterion, measure_prefix):
-
- loss_history = []
- model.eval()
- langs = sorted(ltest_index.keys())
- predictions = {l:[] for l in langs}
- yte_stacked = {l:[] for l in langs}
- batcher.init_offset()
- for batch, post, bert_emb, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lte_bert, lyte), desc='evaluation: '):
- logits = model(batch, post, bert_emb, lang)
- loss = criterion(logits, target).item()
- prediction = predict(logits)
- predictions[lang].append(prediction)
- yte_stacked[lang].append(target.detach().cpu().numpy())
- loss_history.append(loss)
-
- ly = {l:np.vstack(yte_stacked[l]) for l in langs}
- ly_ = {l:np.vstack(predictions[l]) for l in langs}
- l_eval = evaluate(ly, ly_)
- metrics = []
- for lang in langs:
- macrof1, microf1, macrok, microk = l_eval[lang]
- metrics.append([macrof1, microf1, macrok, microk])
- if measure_prefix == 'te':
- print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
- Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
- print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
-
- mean_loss = np.mean(loss_history)
- logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
- logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
- logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
- logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
- logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)
-
- return Mf1
-
-
-# ----------------------------------------------------------------------------------------------------------------------
-def main():
- DEBUGGING = False
-
- method_name = set_method_name()
- logfile = init_logfile(method_name, opt)
-
- # Loading the dataset
- data = MultilingualDataset.load(opt.dataset)
- # data.set_view(languages=['it', 'fr']) # Testing with less langs
- data.show_dimensions()
- langs = data.langs()
- l_devel_raw, l_devel_target = data.training(target_as_csr=True)
- l_test_raw, l_test_target = data.test(target_as_csr=True)
-
- # Loading the MUSE pretrained embeddings (only if requested)
- lpretrained, lpretrained_vocabulary = load_pretrained_embeddings(opt.we_path, langs)
- # lpretrained_vocabulary = none_dict(langs) # do not keep track of words known in pretrained embeddings vocabulary that are also present in test set
-
- # Data preparation: indexing / splitting / embedding matrices (pretrained + supervised) / posterior probs
- multilingual_index = MultilingualIndex()
- multilingual_index.index(l_devel_raw, l_devel_target, l_test_raw, lpretrained_vocabulary)
- multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=opt.seed)
- multilingual_index.embedding_matrices(lpretrained, opt.supervised)
- if opt.posteriors:
- if DEBUGGING:
- import pickle
- with open('/home/andreapdr/funneling_pdr/dumps/posteriors_jrc_run0.pickle', 'rb') as infile:
- data_post = pickle.load(infile)
- lPtr = data_post[0]
- lPva = data_post[1]
- lPte = data_post[2]
- print('## DEBUGGING MODE: loaded dumped posteriors for jrc run0')
- else:
- lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=5000)
- else:
- lPtr, lPva, lPte = None, None, None
-
- if opt.mbert:
- _dataset_path = opt.dataset.split('/')[-1].split('_')
- _model_folder = _dataset_path[0] + '_' + _dataset_path[-1].replace('.pickle', '')
- # print(f'Model Folder: {_model_folder}')
-
- if DEBUGGING:
- with open('/home/andreapdr/funneling_pdr/dumps/mBert_jrc_run0.pickle', 'rb') as infile:
- data_embed = pickle.load(infile)
- tr_bert_embeddings = data_embed[0]
- va_bert_embeddings = data_embed[1]
- te_bert_embeddings = data_embed[2]
- print('## DEBUGGING MODE: loaded dumped mBert embeddings for jrc run0')
- else:
- tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings \
- = multilingual_index.bert_embeddings(f'/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-{_model_folder}/')
- else:
- tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings = None, None, None
-
- # Model initialization
- model = init_Net(data.num_categories(), multilingual_index)
-
- optim = init_optimizer(model, lr=opt.lr)
- criterion = torch.nn.BCEWithLogitsLoss().cuda()
- lr_scheduler = StepLR(optim, step_size=25, gamma=0.5)
- batcher_train = Batch(opt.batch_size, batches_per_epoch=10, languages=langs, lpad=multilingual_index.l_pad())
- batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad())
-
- tinit = time()
- create_if_not_exist(opt.checkpoint_dir)
- early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience,
- checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}')
-
- l_train_index, l_train_target = multilingual_index.l_train()
- l_val_index, l_val_target = multilingual_index.l_val()
- l_test_index = multilingual_index.l_test_index()
-
- print('-'*80)
- print('Start training')
- for epoch in range(1, opt.nepochs + 1):
- train(model, batcher_train, l_train_index, lPtr, tr_bert_embeddings, l_train_target, tinit, logfile, criterion, optim, epoch, method_name)
- lr_scheduler.step() # reduces the learning rate
-
- # validation
- macrof1 = test(model, batcher_eval, l_val_index, lPva, va_bert_embeddings, l_val_target, tinit, epoch, logfile, criterion, 'va')
- early_stop(macrof1, epoch)
-        if opt.test_each>0:
-            if (opt.plotmode and (epoch==1 or epoch%opt.test_each==0)) or (not opt.plotmode and epoch%opt.test_each==0 and epoch<opt.nepochs):
-                test(model, batcher_eval, l_test_index, lPte, te_bert_embeddings, l_test_target, tinit, epoch, logfile, criterion, 'te')
-
-    # once the training loop is over, run the last val_epochs training epochs on the validation set
-    if opt.val_epochs>0:
-        print(f'running last {opt.val_epochs} training epochs on the validation set')
-        for val_epoch in range(1, opt.val_epochs + 1):
-            batcher_train.init_offset()
-            train(model, batcher_train, l_val_index, lPva, va_bert_embeddings, l_val_target, tinit, logfile, criterion, optim, epoch+val_epoch, method_name)
-
- # final test
- print('Training complete: testing')
- test(model, batcher_eval, l_test_index, lPte, te_bert_embeddings, l_test_target, tinit, epoch, logfile, criterion, 'te')
-
-
-# ----------------------------------------------------------------------------------------------------------------------
-if __name__ == '__main__':
-
- parser = argparse.ArgumentParser(description='Neural text classification with Word-Class Embeddings')
- parser.add_argument('dataset', type=str, metavar='datasetpath', help=f'path to the pickled dataset')
-    parser.add_argument('--batch-size', type=int, default=50, metavar='int', help='input batch size (default: 50)')
- parser.add_argument('--batch-size-test', type=int, default=250, metavar='int', help='batch size for testing (default: 250)')
- parser.add_argument('--nepochs', type=int, default=200, metavar='int', help='number of epochs (default: 200)')
- parser.add_argument('--patience', type=int, default=10, metavar='int', help='patience for early-stop (default: 10)')
-    parser.add_argument('--plotmode', action='store_true', default=False, help='in plot mode, executes a long run in order '
-                                                                             'to generate enough data to produce trend plots (requires --test-each > 0); '
-                                                                             'this mode does not perform a final evaluation on the test set.')
- parser.add_argument('--hidden', type=int, default=512, metavar='int', help='hidden lstm size (default: 512)')
- parser.add_argument('--lr', type=float, default=1e-3, metavar='float', help='learning rate (default: 1e-3)')
- parser.add_argument('--weight_decay', type=float, default=0, metavar='float', help='weight decay (default: 0)')
- parser.add_argument('--sup-drop', type=float, default=0.5, metavar='[0.0, 1.0]', help='dropout probability for the supervised matrix (default: 0.5)')
- parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)')
- parser.add_argument('--svm-max-docs', type=int, default=1000, metavar='int', help='maximum number of documents by '
- 'language used to train the calibrated SVMs (only used if --posteriors is active)')
- parser.add_argument('--log-interval', type=int, default=10, metavar='int', help='how many batches to wait before printing training status')
- parser.add_argument('--log-file', type=str, default='../log/log.csv', metavar='str', help='path to the log csv file')
- parser.add_argument('--test-each', type=int, default=0, metavar='int', help='how many epochs to wait before invoking test (default: 0, only at the end)')
- parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str', help='path to the directory containing checkpoints')
- parser.add_argument('--net', type=str, default='rnn', metavar='str', help=f'net, one in {allowed_nets}')
- parser.add_argument('--pretrained', action='store_true', default=False, help='use MUSE pretrained embeddings')
- parser.add_argument('--supervised', action='store_true', default=False, help='use supervised embeddings')
- parser.add_argument('--posteriors', action='store_true', default=False, help='concatenate posterior probabilities to doc embeddings')
- parser.add_argument('--learnable', type=int, default=0, metavar='int', help='dimension of the learnable embeddings (default 0)')
- parser.add_argument('--val-epochs', type=int, default=1, metavar='int', help='number of training epochs to perform on the '
- 'validation set once training is over (default 1)')
- parser.add_argument('--we-path', type=str, default='../embeddings', metavar='str',
- help=f'path to MUSE pretrained embeddings')
-    parser.add_argument('--max-label-space', type=int, default=300, metavar='int', help='largest dimension allowed for the '
-                        'feature-label embedding; if larger, PCA with this number of components is applied '
-                        '(default 300)')
- parser.add_argument('--force', action='store_true', default=False, help='do not check if this experiment has already been run')
- parser.add_argument('--tunable', action='store_true', default=False,
- help='pretrained embeddings are tunable from the beginning (default False, i.e., static)')
- parser.add_argument('--mbert', action='store_true', default=False,
- help='use mBert embeddings')
-
- opt = parser.parse_args()
-
- assert torch.cuda.is_available(), 'CUDA not available'
- assert not opt.plotmode or opt.test_each > 0, 'plot mode implies --test-each>0'
- # if opt.pickle_dir: opt.pickle_path = join(opt.pickle_dir, f'{opt.dataset}.pickle')
- torch.manual_seed(opt.seed)
-
- main()
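For orientation, the driver above boils down to a standard PyTorch loop: Adam over the trainable parameters, BCEWithLogitsLoss on multi-label targets, a StepLR schedule, and early stopping on validation macro-F1. The self-contained sketch below reproduces only that skeleton on random toy data; it does not use the repository's Batch, EarlyStopping or predict helpers, and thresholding the logits at 0 is a simple stand-in for the latter.

    import torch
    import torch.nn as nn
    from torch.optim.lr_scheduler import StepLR
    from sklearn.metrics import f1_score

    # toy multi-label problem: 64 documents, 10 features, 3 classes (random data)
    torch.manual_seed(0)
    X = torch.randn(64, 10)
    Y = (torch.rand(64, 3) > 0.5).float()
    Xva, Yva, Xtr, Ytr = X[:16], Y[:16], X[16:], Y[16:]

    model = nn.Linear(10, 3)
    criterion = nn.BCEWithLogitsLoss()
    optim = torch.optim.Adam((p for p in model.parameters() if p.requires_grad), lr=1e-3)
    scheduler = StepLR(optim, step_size=25, gamma=0.5)

    best_f1, patience, since_best = -1.0, 10, 0
    for epoch in range(1, 201):
        model.train()
        optim.zero_grad()
        loss = criterion(model(Xtr), Ytr)
        loss.backward()
        optim.step()
        scheduler.step()

        model.eval()
        with torch.no_grad():
            pred = (model(Xva) > 0).float().numpy()   # threshold the logits at 0
        f1 = f1_score(Yva.numpy(), pred, average='macro', zero_division=0)
        if f1 > best_f1:
            best_f1, since_best = f1, 0
        else:
            since_best += 1
            if since_best >= patience:                # early stop on validation macro-F1
                break
    print(f'best validation macro-F1: {best_f1:.3f}')
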
diff --git a/src/experiment_scripts/main_embeddings_cls.py b/src/experiment_scripts/main_embeddings_cls.py
deleted file mode 100644
index 08552d3..0000000
--- a/src/experiment_scripts/main_embeddings_cls.py
+++ /dev/null
@@ -1,127 +0,0 @@
-import os
-from dataset_builder import MultilingualDataset
-from util.evaluation import *
-from optparse import OptionParser
-from util.file import exists
-from util.results import PolylingualClassificationResults
-from util.util import get_learner, get_params
-from learning.learners import PolylingualEmbeddingsClassifier  # missing import; assumed to be defined in learning.learners
-
-parser = OptionParser()
-
-parser.add_option("-d", "--dataset", dest="dataset",
- help="Path to the multilingual dataset processed and stored in .pickle format",
- default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
-
-parser.add_option("-o", "--output", dest="output",
- help="Result file", type=str, default='./results/results.csv')
-
-parser.add_option("-e", "--mode-embed", dest="mode_embed",
- help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none')
-
-parser.add_option("-w", "--we-path", dest="we_path",
- help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/')
-
-parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str,
- default='MUSE')
-
-parser.add_option("-s", "--set_c", dest="set_c",type=float,
- help="Set the C parameter", default=1)
-
-parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
- help="Optimize hyperparameters", default=False)
-
-parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
- help="Number of parallel jobs (default is -1, all)", default=-1)
-
-parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
- help="If smaller than number of target classes, PCA will be applied to supervised matrix. "
- "If set to 0 it will automatically search for the best number of components. "
- "If set to -1 it will apply PCA to the vstacked supervised matrix (PCA dim set to 50 atm)",
- default=300)
-
-parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
- help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
- " If set to 0 it will automatically search for the best number of components", default=300)
-
-parser.add_option("-l", dest="lang", type=str)
-
-if __name__ == '__main__':
- (op, args) = parser.parse_args()
-
- assert exists(op.dataset), 'Unable to find file '+str(op.dataset)
- assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
-
- dataset_file = os.path.basename(op.dataset)
-
- results = PolylingualClassificationResults('./results/PLE_results.csv')
-
- data = MultilingualDataset.load(op.dataset)
- data.show_dimensions()
-
- # data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10)))
- # data.set_view(languages=[op.lang])
- # data.set_view(categories=list(range(10)))
- lXtr, lytr = data.training()
- lXte, lyte = data.test()
-
- if op.set_c != -1:
- meta_parameters = None
- else:
- meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}]
-
- # Embeddings and WCE config
- _available_mode = ['none', 'unsupervised', 'supervised', 'both']
- _available_type = ['MUSE', 'FastText']
- assert op.mode_embed in _available_mode, f'{op.mode_embed} not in {_available_mode}'
- assert op.we_type in _available_type, f'{op.we_type} not in {_available_type}'
-
- if op.mode_embed == 'none':
- config = {'unsupervised': False,
- 'supervised': False,
- 'we_type': None}
- _config_id = 'None'
- elif op.mode_embed == 'unsupervised':
- config = {'unsupervised': True,
- 'supervised': False,
- 'we_type': op.we_type}
- _config_id = 'M'
- elif op.mode_embed == 'supervised':
- config = {'unsupervised': False,
- 'supervised': True,
- 'we_type': None}
- _config_id = 'F'
- elif op.mode_embed == 'both':
- config = {'unsupervised': True,
- 'supervised': True,
- 'we_type': op.we_type}
- _config_id = 'M+F'
-
- config['reduction'] = 'PCA'
- config['max_label_space'] = op.max_labels_S
- config['dim_reduction_unsupervised'] = op.max_labels_U
- # config['post_pca'] = op.post_pca
- # config['plot_covariance_matrices'] = True
-
- result_id = dataset_file + 'MLE_andrea' + _config_id + ('_optimC' if op.optimc else '')
-
-    ple = PolylingualEmbeddingsClassifier(wordembeddings_path=op.we_path,
- config = config,
- learner=get_learner(calibrate=False),
- c_parameters=get_params(dense=False),
- n_jobs=op.n_jobs)
-
- print('# Fitting ...')
- ple.fit(lXtr, lytr)
-
- print('# Evaluating ...')
- ple_eval = evaluate_method(ple, lXte, lyte)
-
- metrics = []
- for lang in lXte.keys():
- macrof1, microf1, macrok, microk = ple_eval[lang]
- metrics.append([macrof1, microf1, macrok, microk])
- print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
- results.add_row('MLE', 'svm', _config_id, config['we_type'],
- 'no','no', op.optimc, op.dataset.split('/')[-1], ple.time,
- lang, macrof1, microf1, macrok, microk, '')
- print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
diff --git a/src/experiment_scripts/main_majorityvoting_cls.py b/src/experiment_scripts/main_majorityvoting_cls.py
deleted file mode 100644
index ee5efe5..0000000
--- a/src/experiment_scripts/main_majorityvoting_cls.py
+++ /dev/null
@@ -1,155 +0,0 @@
-import os
-from dataset_builder import MultilingualDataset
-# from learning.learners import *
-# from learning.learners import FunnellingMultimodal
-from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual, WordClassEmbedder, MuseEmbedder, FeatureSet2Posteriors, Voting
-from util.evaluation import *
-from optparse import OptionParser
-from util.file import exists
-from util.results import PolylingualClassificationResults
-from sklearn.svm import SVC
-
-parser = OptionParser()
-
-# parser.add_option("-d", "--dataset", dest="dataset",
-# help="Path to the multilingual dataset processed and stored in .pickle format",
-# default="../rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
-
-parser.add_option("-o", "--output", dest="output",
- help="Result file", type=str, default='./results/results.csv')
-
-parser.add_option("-P", "--probs", dest="posteriors", action='store_true',
- help="Add posterior probabilities to the document embedding representation", default=False)
-
-parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
- help="Add supervised (Word-Class Embeddings) to the document embedding representation", default=False)
-
-parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true',
- help="Add pretrained MUSE embeddings to the document embedding representation", default=False)
-
-parser.add_option("-w", "--we-path", dest="we_path",
- help="Path to the MUSE polylingual word embeddings", default='../embeddings')
-
-parser.add_option("-s", "--set_c", dest="set_c",type=float,
- help="Set the C parameter", default=1)
-
-parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
- help="Optimize hyperparameters", default=False)
-
-parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
- help="Number of parallel jobs (default is -1, all)", default=-1)
-
-parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
- help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
- default=300)
-
-parser.add_option("-r", "--remove-pc", dest="sif", action='store_true',
- help="Remove common component when computing dot product of word embedding matrices", default=False)
-
-# parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
-# help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
-# " If set to 0 it will automatically search for the best number of components", default=300)
-
-# parser.add_option("-a", dest="post_pca",
-# help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with "
-# "embedding space", default=False)
-
-
-def get_learner(calibrate=False, kernel='linear'):
- return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto')
-
-
-def get_params(dense=False):
- if not op.optimc:
- return None
- c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
- kernel = 'rbf' if dense else 'linear'
- return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
-
-#######################################################################################################################
-
-
-if __name__ == '__main__':
- (op, args) = parser.parse_args()
-
- assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)'
- dataset = args[0]
-
- assert exists(dataset), 'Unable to find file '+str(dataset)
- assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
- assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
-
- dataset_file = os.path.basename(dataset)
-
- results = PolylingualClassificationResults(op.output)
-
- data = MultilingualDataset.load(dataset)
- data.show_dimensions()
-
- lXtr, lytr = data.training()
- lXte, lyte = data.test()
-
- meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
-
- # result_id = f'{dataset_file}_Prob{op.posteriors}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}'
- result_id = f'{dataset_file}_ProbPost={op.posteriors}_WCE={op.supervised}(PCA={op.max_labels_S})_' \
- f'MUSE={op.pretrained}_weight={"todo"}_l2={"todo"}_zscore={"todo"}{"_optimC" if op.optimc else ""}'
- print(f'{result_id}')
-
- # text preprocessing
- tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
-
- lXtr = tfidfvectorizer.fit_transform(lXtr, lytr)
- lXte = tfidfvectorizer.transform(lXte)
- lV = tfidfvectorizer.vocabulary()
-
- classifiers = []
- if op.posteriors:
- classifiers.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None))
- if op.supervised:
- classifiers.append(FeatureSet2Posteriors(WordClassEmbedder(max_label_space=op.max_labels_S)))
- if op.pretrained:
- classifiers.append(FeatureSet2Posteriors(MuseEmbedder(op.we_path, lV=lV)))
-
- classifier = Voting(*classifiers)
-
- print('# Fitting ...')
- classifier.fit(lXtr, lytr)
-
- print('\n# Evaluating ...')
- l_eval = evaluate_method(classifier, lXte, lyte)
-
- # renaming arguments to be printed on log
- _id = ''
- _id_conf = [op.posteriors, op.supervised, op.pretrained]
- _id_name = ['+P', '+W', '+M']
- for i, conf in enumerate(_id_conf):
- if conf:
- _id += _id_name[i]
- _id = _id.lstrip('+')
- _dataset_path = dataset.split('/')[-1].split('_')
- dataset_id = _dataset_path[0] + _dataset_path[-1]
-
- metrics = []
- for lang in lXte.keys():
- macrof1, microf1, macrok, microk = l_eval[lang]
- metrics.append([macrof1, microf1, macrok, microk])
- print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
- results.add_row(method='Voting',
- learner='svm',
- optimp=op.optimc,
- sif=op.sif,
- zscore='todo',
- l2='todo',
- wescaler='todo',
- pca=op.max_labels_S,
- id=_id,
- dataset=dataset_id,
- time='todo',
- lang=lang,
- macrof1=macrof1,
- microf1=microf1,
- macrok=macrok,
- microk=microk,
- notes='')
- print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
diff --git a/src/experiment_scripts/main_mbert.py b/src/experiment_scripts/main_mbert.py
deleted file mode 100644
index aa44407..0000000
--- a/src/experiment_scripts/main_mbert.py
+++ /dev/null
@@ -1,390 +0,0 @@
-from dataset_builder import MultilingualDataset
-from transformers import BertTokenizer, BertForSequenceClassification, AdamW
-from torch.utils.data import Dataset, DataLoader
-import numpy as np
-import torch
-from util.common import predict
-from time import time
-from util.csv_log import CSVLog
-from util.evaluation import evaluate
-from util.early_stop import EarlyStopping
-from torch.optim.lr_scheduler import StepLR
-from sklearn.model_selection import train_test_split
-from copy import deepcopy
-import argparse
-# from torch.utils.tensorboard import SummaryWriter
-
-
-def check_sentences(sentences):
- tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
- for sentence in sentences:
- converted = [tokenizer._convert_id_to_token(token) for token in sentence.numpy() if token != 0]
- print(converted)
- return
-
-
-def get_model(n_out):
- print('# Initializing model ...')
- model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_out)
- return model
-
-
-def set_method_name():
- return 'mBERT'
-
-
-def init_optimizer(model, lr):
- # return AdamW(model.parameters(), lr=lr, weight_decay=opt.weight_decay)
- no_decay = ['bias', 'LayerNorm.weight']
- optimizer_grouped_parameters = [
- {'params': [p for n, p in model.named_parameters()
- if not any(nd in n for nd in no_decay)],
- 'weight_decay': opt.weight_decay},
-        {'params': [p for n, p in model.named_parameters()
-                    if any(nd in n for nd in no_decay)],
-         'weight_decay': 0.0}  # bias and LayerNorm weights are conventionally exempted from weight decay
- ]
- optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
- return optimizer
-
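-# Note added for clarity (not in the original script): the two parameter groups above follow the
-# common AdamW convention of exempting bias and LayerNorm weights from weight decay. A minimal,
-# hypothetical usage sketch:
-#
-#   model = get_model(n_out=10)             # 10 is a made-up number of classes
-#   optim = init_optimizer(model, lr=2e-5)  # decay for the remaining parameters comes from opt.weight_decay
-#   # inside the training loop: optim.zero_grad(); loss.backward(); optim.step()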
-
-def init_logfile(method_name, opt):
- logfile = CSVLog(opt.log_file, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse'])
- logfile.set_default('dataset', opt.dataset)
- logfile.set_default('run', opt.seed)
- logfile.set_default('method', method_name)
- assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \
- f'and run {opt.seed} already calculated'
- return logfile
-
-
-def get_lr(optimizer):
- for param_group in optimizer.param_groups:
- return param_group['lr']
-
-
-def get_dataset_name(datapath):
- possible_splits = [str(i) for i in range(10)]
- splitted = datapath.split('_')
- id_split = splitted[-1].split('.')[0][-1]
- if id_split in possible_splits:
- dataset_name = splitted[0].split('/')[-1]
- return f'{dataset_name}_run{id_split}'
- elif splitted[-2].split('.')[0] == 'full':
- dataset_name = splitted[0].split('/')[-1]
- return f'{dataset_name}_fullrun'
-
-
-def load_datasets(datapath):
- data = MultilingualDataset.load(datapath)
- # data.set_view(languages=['it']) #, categories=[0, 1, 2, 3, 4]) # Testing with less langs
- data.show_dimensions()
-
- l_devel_raw, l_devel_target = data.training(target_as_csr=False)
- l_test_raw, l_test_target = data.test(target_as_csr=False)
-
- return l_devel_raw, l_devel_target, l_test_raw, l_test_target
-
-
-def do_tokenization(l_dataset, max_len=512, verbose=True):
- if verbose:
- print('# Starting Tokenization ...')
- tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
- langs = l_dataset.keys()
- l_tokenized = {}
- for lang in langs:
- l_tokenized[lang] = tokenizer(l_dataset[lang],
- truncation=True,
- max_length=max_len,
- padding='max_length')
- return l_tokenized
-
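-# Illustrative note (not in the original script): for each language the tokenizer returns a
-# BatchEncoding whose 'input_ids' (plus 'attention_mask' and 'token_type_ids') are lists of
-# fixed-length id sequences, so the call roughly behaves as:
-#
-#   l_tok = do_tokenization({'en': ['some text', '...'], 'it': ['del testo', '...']})
-#   l_tok['en']['input_ids']   # -> list of 512-long token-id lists, one per document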
-
-class TrainingDataset(Dataset):
- """
- data: dict of lang specific tokenized data
- labels: dict of lang specific targets
- """
-
- def __init__(self, data, labels):
- self.langs = data.keys()
- self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)}
-
- for i, lang in enumerate(self.langs):
- _data = data[lang]['input_ids']
- _data = np.array(_data)
- _labels = labels[lang]
- _lang_value = np.full(len(_data), self.lang_ids[lang])
-
- if i == 0:
- self.data = _data
- self.labels = _labels
- self.lang_index = _lang_value
- else:
- self.data = np.vstack((self.data, _data))
- self.labels = np.vstack((self.labels, _labels))
- self.lang_index = np.concatenate((self.lang_index, _lang_value))
-
- def __len__(self):
- return len(self.data)
-
- def __getitem__(self, idx):
- x = self.data[idx]
- y = self.labels[idx]
- lang = self.lang_index[idx]
-
- return x, torch.tensor(y, dtype=torch.float), lang
-
- def get_lang_ids(self):
- return self.lang_ids
-
- def get_nclasses(self):
- if hasattr(self, 'labels'):
- return len(self.labels[0])
- else:
- print('Method called before init!')
-
-
-def freeze_encoder(model):
- for param in model.base_model.parameters():
- param.requires_grad = False
- return model
-
-
-def check_param_grad_status(model):
- print('#' * 50)
-    print('Model parameter status:')
- for name, child in model.named_children():
- trainable = False
- for param in child.parameters():
- if param.requires_grad:
- trainable = True
- if not trainable:
- print(f'{name} is frozen')
- else:
- print(f'{name} is not frozen')
- print('#' * 50)
-
-
-def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, writer):
- _dataset_path = opt.dataset.split('/')[-1].split('_')
- dataset_id = _dataset_path[0] + _dataset_path[-1]
-
- loss_history = []
- model.train()
-
- for idx, (batch, target, lang_idx) in enumerate(train_dataloader):
- optim.zero_grad()
- out = model(batch.cuda())
- logits = out[0]
- loss = criterion(logits, target.cuda())
- loss.backward()
- # clip_gradient(model)
- optim.step()
- loss_history.append(loss.item())
-
- if writer is not None:
- _n_step = (epoch - 1) * (len(train_dataloader)) + idx
- writer.add_scalar('Loss_step/Train', loss, _n_step)
-
- # Check tokenized sentences consistency
- # check_sentences(batch.cpu())
-
- if idx % opt.log_interval == 0:
- interval_loss = np.mean(loss_history[-opt.log_interval:])
- print(
- f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
-
-    mean_loss = np.mean(loss_history)  # average training loss over the whole epoch
- logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
- return mean_loss
-
-
-def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix, writer):
- print('# Validating model ...')
- loss_history = []
- model.eval()
- langs = lang_ids.keys()
- id_2_lang = {v: k for k, v in lang_ids.items()}
- predictions = {l: [] for l in langs}
- yte_stacked = {l: [] for l in langs}
-
- for batch, target, lang_idx in test_dataloader:
- out = model(batch.cuda())
- logits = out[0]
- loss = criterion(logits, target.cuda()).item()
- prediction = predict(logits)
- loss_history.append(loss)
-
- # Assigning prediction to dict in predictions and yte_stacked according to lang_idx
- for i, pred in enumerate(prediction):
- lang_pred = id_2_lang[lang_idx.numpy()[i]]
- predictions[lang_pred].append(pred)
- yte_stacked[lang_pred].append(target[i].detach().cpu().numpy())
-
- ly = {l: np.vstack(yte_stacked[l]) for l in langs}
- ly_ = {l: np.vstack(predictions[l]) for l in langs}
- l_eval = evaluate(ly, ly_)
- metrics = []
- for lang in langs:
- macrof1, microf1, macrok, microk = l_eval[lang]
- metrics.append([macrof1, microf1, macrok, microk])
- if measure_prefix == 'te':
- print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
- Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
- print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
- if writer is not None:
- writer.add_scalars('Eval Metrics', {'Mf1': Mf1, 'mF1': mF1, 'MK': MK, 'mk':mk}, epoch)
-
- mean_loss = np.mean(loss_history)
- logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
- logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
- logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
- logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
- logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)
-
- return Mf1
-
-
-def get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop, max_val, seed):
- l_split_va = deepcopy(l_tokenized_tr)
- l_split_val_target = {l: [] for l in l_tokenized_tr.keys()}
- l_split_tr = deepcopy(l_tokenized_tr)
- l_split_tr_target = {l: [] for l in l_tokenized_tr.keys()}
-
- for lang in l_tokenized_tr.keys():
- val_size = int(min(len(l_tokenized_tr[lang]['input_ids']) * val_prop, max_val))
-        (l_split_tr[lang]['input_ids'], l_split_va[lang]['input_ids'],
-         l_split_tr_target[lang], l_split_val_target[lang]) = \
-            train_test_split(l_tokenized_tr[lang]['input_ids'], l_devel_target[lang], test_size=val_size,
-                             random_state=seed, shuffle=True)
-
- return l_split_tr, l_split_tr_target, l_split_va, l_split_val_target
-
-
-def main():
- print('Running main ...')
-
- DATAPATH = opt.dataset
- MAX_LEN = 512
- method_name = set_method_name()
- logfile = init_logfile(method_name, opt)
-
- l_devel_raw, l_devel_target, l_test_raw, l_test_target = load_datasets(DATAPATH)
- l_tokenized_tr = do_tokenization(l_devel_raw, max_len=MAX_LEN)
-
- l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, l_devel_target,
- val_prop=0.2, max_val=2000,
- seed=opt.seed)
-
- l_tokenized_te = do_tokenization(l_test_raw, max_len=MAX_LEN)
-
- tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target)
- va_dataset = TrainingDataset(l_split_va, l_split_val_target)
- te_dataset = TrainingDataset(l_tokenized_te, l_test_target)
-
- tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True)
- va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=True)
- te_dataloader = DataLoader(te_dataset, batch_size=2, shuffle=False)
-
-
- # Initializing model
- nC = tr_dataset.get_nclasses()
- model = get_model(nC)
- model = model.cuda()
- criterion = torch.nn.BCEWithLogitsLoss().cuda()
- optim = init_optimizer(model, lr=opt.lr)
- lr_scheduler = StepLR(optim, step_size=25, gamma=0.1)
- early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience,
- checkpoint=f'/home/andreapdr/funneling_pdr/hug_checkpoint/{method_name}-{get_dataset_name(opt.dataset)}',
- is_bert=True)
-
- # Freezing encoder
- # model = freeze_encoder(model)
- check_param_grad_status(model)
-
- # Tensorboard logger
- # writer = SummaryWriter('../log/tensorboard_logs/')
-
- # Training loop
- tinit = time()
- lang_ids = va_dataset.lang_ids
- for epoch in range(1, opt.nepochs + 1):
- print('# Start Training ...')
- train(model, tr_dataloader, epoch, criterion, optim, method_name, tinit, logfile, writer=None)
- lr_scheduler.step() # reduces the learning rate
-
- # Validation
- macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va', writer=None)
- early_stop(macrof1, epoch)
- if opt.test_each > 0:
- if (opt.plotmode and (epoch == 1 or epoch % opt.test_each == 0)) or (
- not opt.plotmode and epoch % opt.test_each == 0 and epoch < opt.nepochs):
- test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te', writer=None)
-
- if early_stop.STOP:
- print('[early-stop] STOP')
- if not opt.plotmode:
- break
-
- if not opt.plotmode:
- print('-' * 80)
- print('Training over. Performing final evaluation')
-
- model = early_stop.restore_checkpoint()
- model = model.cuda()
-
- if opt.val_epochs > 0:
- print(f'running last {opt.val_epochs} training epochs on the validation set')
- for val_epoch in range(1, opt.val_epochs + 1):
- train(model, va_dataloader, epoch + val_epoch, criterion, optim, method_name, tinit, logfile, writer=None)
-
- # final test
- print('Training complete: testing')
- test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te', writer=None)
-
- # writer.flush()
- # writer.close()
- exit('Code Executed!')
-
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser(description='Neural text classification with Word-Class Embeddings - mBert model')
-
- parser.add_argument('--dataset', type=str,
- default='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle',
- metavar='datasetpath', help=f'path to the pickled dataset')
- parser.add_argument('--nepochs', type=int, default=200, metavar='int',
- help='number of epochs (default: 200)')
- parser.add_argument('--lr', type=float, default=2e-5, metavar='float',
- help='learning rate (default: 2e-5)')
- parser.add_argument('--weight_decay', type=float, default=0, metavar='float',
- help='weight decay (default: 0)')
- parser.add_argument('--patience', type=int, default=10, metavar='int',
- help='patience for early-stop (default: 10)')
- parser.add_argument('--log-interval', type=int, default=20, metavar='int',
- help='how many batches to wait before printing training status')
- parser.add_argument('--log-file', type=str, default='../log/log_mBert.csv', metavar='str',
- help='path to the log csv file')
- parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)')
- parser.add_argument('--force', action='store_true', default=False,
- help='do not check if this experiment has already been run')
- parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str',
- help='path to the directory containing checkpoints')
-    parser.add_argument('--plotmode', action='store_true', default=False,
-                        help='in plot mode, executes a long run in order to generate enough data to '
-                             'produce trend plots (test-each should be > 0). This mode is meant to '
-                             'produce plots and does not perform a final evaluation on the test set.')
- parser.add_argument('--test-each', type=int, default=0, metavar='int',
- help='how many epochs to wait before invoking test (default: 0, only at the end)')
- parser.add_argument('--val-epochs', type=int, default=1, metavar='int',
- help='number of training epochs to perform on the validation set once training is over (default 1)')
- opt = parser.parse_args()
-
- # Testing different parameters ...
- opt.weight_decay = 0.01
- opt.lr = 1e-5
- opt.patience = 5
-
- main()
- # TODO: refactor .cuda() -> .to(device) in order to check if the process is faster on CPU given the bigger batch size
diff --git a/src/experiment_scripts/main_mbert_extractor.py b/src/experiment_scripts/main_mbert_extractor.py
deleted file mode 100644
index 16f09d3..0000000
--- a/src/experiment_scripts/main_mbert_extractor.py
+++ /dev/null
@@ -1,110 +0,0 @@
-from experiment_scripts.main_mbert import *
-import pickle
-
-
-class ExtractorDataset(Dataset):
- """
- data: dict of lang specific tokenized data
- labels: dict of lang specific targets
- """
-
- def __init__(self, data):
- self.langs = data.keys()
- self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)}
-
- for i, lang in enumerate(self.langs):
- _data = data[lang]['input_ids']
- _data = np.array(_data)
- _lang_value = np.full(len(_data), self.lang_ids[lang])
-
- if i == 0:
- self.data = _data
- self.lang_index = _lang_value
- else:
- self.data = np.vstack((self.data, _data))
- self.lang_index = np.concatenate((self.lang_index, _lang_value))
-
- def __len__(self):
- return len(self.data)
-
- def __getitem__(self, idx):
- x = self.data[idx]
- lang = self.lang_index[idx]
-
- return x, lang
-
- def get_lang_ids(self):
- return self.lang_ids
-
-
-def feature_extractor(data, lang_ids, model_path='/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run0/'):
- print('# Feature Extractor Mode...')
- from transformers import BertConfig
- config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True, num_labels=300)
- model = BertForSequenceClassification.from_pretrained(model_path,
- config=config).cuda()
-
- """
- Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for
- the output of each layer) of shape (batch_size, sequence_length, hidden_size)
- """
- all_batch_embeddings = {}
- id2lang = {v:k for k,v in lang_ids.items()}
- with torch.no_grad():
- for batch, target, lang_idx in data:
- out = model(batch.cuda())
- last_hidden_state = out[1][-1]
- batch_embeddings = last_hidden_state[:, 0, :]
- for i, l_idx in enumerate(lang_idx.numpy()):
- if id2lang[l_idx] not in all_batch_embeddings.keys():
- all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy()
- else:
- all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
- batch_embeddings[i].detach().cpu().numpy()))
-
- return all_batch_embeddings, id2lang
-
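-# Note added for clarity (not in the original script): all_batch_embeddings maps each language to
-# a matrix of [CLS] document embeddings, presumably of shape (n_docs_lang, 768) since
-# bert-base-multilingual-cased has a hidden size of 768; id2lang maps numeric ids back to codes.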
-
-def main():
- print('Running main ...')
- print(f'Model path: {opt.modelpath}\nDataset path: {opt.dataset}')
- DATAPATH = opt.dataset
- MAX_LEN = 512
-
- l_devel_raw, l_devel_target, l_test_raw, l_test_target = load_datasets(DATAPATH)
- l_tokenized_tr = do_tokenization(l_devel_raw, max_len=MAX_LEN)
- l_tokenized_te = do_tokenization(l_test_raw, max_len=MAX_LEN)
-
- tr_dataset = TrainingDataset(l_tokenized_tr, l_devel_target)
- tr_lang_ids = tr_dataset.lang_ids
-
- te_dataset = TrainingDataset(l_tokenized_te, l_test_target)
- te_lang_ids = te_dataset.lang_ids
-
- tr_dataloader = DataLoader(tr_dataset, batch_size=64, shuffle=False) # Shuffle False to extract doc embeddings
- te_dataloader = DataLoader(te_dataset, batch_size=64, shuffle=False) # Shuffle False to extract doc
-
- tr_all_batch_embeddings, id2lang_tr = feature_extractor(tr_dataloader, tr_lang_ids, opt.modelpath) # Extracting doc embed for devel
- with open(f'{opt.modelpath}/TR_embed_{get_dataset_name(opt.dataset)}.pkl', 'wb') as outfile:
- pickle.dump((tr_all_batch_embeddings, id2lang_tr), outfile)
-
- te_all_batch_embeddings, id2lang_te = feature_extractor(te_dataloader, te_lang_ids, opt.modelpath) # Extracting doc embed for test
- with open(f'{opt.modelpath}/TE_embed_{get_dataset_name(opt.dataset)}.pkl', 'wb') as outfile:
- pickle.dump((te_all_batch_embeddings, id2lang_te), outfile)
-
- exit('Extraction completed!')
-
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser(description='mBert model document embedding extractor')
-
- parser.add_argument('--dataset', type=str,
- default='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle',
- metavar='datasetpath', help=f'path to the pickled dataset')
- parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)')
- parser.add_argument('--modelpath', type=str, default='/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run0',
- metavar='modelpath', help=f'path to pre-trained mBert model')
- opt = parser.parse_args()
-
- main()
-
diff --git a/src/experiment_scripts/main_qualitative_analysis.py b/src/experiment_scripts/main_qualitative_analysis.py
deleted file mode 100644
index aead994..0000000
--- a/src/experiment_scripts/main_qualitative_analysis.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import os
-from dataset_builder import MultilingualDataset
-from optparse import OptionParser
-from util.file import exists
-import numpy as np
-from sklearn.feature_extraction.text import CountVectorizer
-
-parser = OptionParser(usage="usage: %prog datapath [options]")
-
-(op, args) = parser.parse_args()
-assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)'
-dataset = args[0]
-assert exists(dataset), 'Unable to find file '+str(dataset)
-
-dataset_file = os.path.basename(dataset)
-
-data = MultilingualDataset.load(dataset)
-data.set_view(languages=['it'])
-data.show_dimensions()
-lXtr, lytr = data.training()
-lXte, lyte = data.test()
-
-vect_lXtr = dict()
-vectorizer = CountVectorizer()
-vect_lXtr['it'] = vectorizer.fit_transform(lXtr['it'])
-# print(type(vect_lXtr['it']))
-
-corr = vect_lXtr['it'].T.dot(lytr['it'])
-# print(corr.shape)
-sum_correlated_class = corr.sum(axis=0)
-print(len(sum_correlated_class))
-print(sum_correlated_class.max())
-
-
-w2idx = vectorizer.vocabulary_
-idx2w = {v:k for k,v in w2idx.items()}
-
-word_tot_corr = corr.sum(axis=1)
-print(word_tot_corr.shape)
-dict_word_tot_corr = {v:k for k,v in enumerate(word_tot_corr)}
-
-sorted_word_tot_corr = np.sort(word_tot_corr)
-sorted_word_tot_corr = sorted_word_tot_corr[len(sorted_word_tot_corr)-200:]
-
-top_idx = [dict_word_tot_corr[k] for k in sorted_word_tot_corr]
-print([idx2w[idx] for idx in top_idx])
-print([elem for elem in top_idx])
-print(corr[8709])
-print('Finished...')
\ No newline at end of file
diff --git a/src/experiment_scripts/run_combinations_jrc.sh b/src/experiment_scripts/run_combinations_jrc.sh
deleted file mode 100644
index a4aabde..0000000
--- a/src/experiment_scripts/run_combinations_jrc.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/usr/bin/env bash
-
-dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
-logfile=./results/final_combinations_jrc.csv
-#A.2: ensembling feature sets (combinations of posteriors, wce, muse):
-# - exploring different ways of combining the feature sets: concatenation, FeatureSetToPosteriors, averaging, voting, etc.
-# (none of them seems to improve over standard funnelling [the improved version after A.1] with posterior probabilities...)
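-
-# Flag reference (added for readability; inferred from the option parsers of the main scripts, so
-# the exact semantics should be double-checked against main_multimodal_cls.py):
-#   -P posterior probabilities   -S supervised WCE embeddings   -U pretrained MUSE embeddings
-#   -r remove common component (SIF)   -z z-score standardization   --l2 L2 normalization
-#   -a mean aggregation of the feature sets   --allprob map every feature set to posteriors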
-
-# aggregation=concatenation
-#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2
-#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2
-#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2
-#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2
-#
-
-##FeatureSetToPosteriors (aggregation mean)
-python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob
-python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob
-python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob
-python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob
-
-##FeatureSetToPosteriors
-#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2 --allprob
-#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2 --allprob
-#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2 --allprob
-#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob
-
-#MajorityVoting
-#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r
-#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r
-#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r
-#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r
-
-
diff --git a/src/experiment_scripts/run_combinations_rcv.sh b/src/experiment_scripts/run_combinations_rcv.sh
deleted file mode 100644
index 4e1acfb..0000000
--- a/src/experiment_scripts/run_combinations_rcv.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/usr/bin/env bash
-
-dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
-logfile=./results/final_combinations_rcv.csv
-#A.2: ensembling feature sets (combinations of posteriors, wce, muse):
-# - exploring different ways of combining the feature sets: concatenation, FeatureSetToPosteriors, averaging, voting, etc.
-# (none of them seems to improve over standard funnelling [the improved version after A.1] with posterior probabilities...)
-
-# aggregation=concatenation
-#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2
-#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2
-#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2
-#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2
-#
-##FeatureSetToPosteriors (aggregation mean)
-python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob
-python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob
-python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob
-python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob
-
-##FeatureSetToPosteriors
-#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2 --allprob
-#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2 --allprob
-#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2 --allprob
-#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob
-
-#MajorityVoting
-#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r
-#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r
-#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r
-#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r
\ No newline at end of file
diff --git a/src/experiment_scripts/run_dl_jrc.sh b/src/experiment_scripts/run_dl_jrc.sh
deleted file mode 100644
index 1d28e83..0000000
--- a/src/experiment_scripts/run_dl_jrc.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/usr/bin/env bash
-
-logfile=../log/log_pre_jrc.csv
-dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
-python main_deep_learning.py $dataset --log-file $logfile --pretrained --plotmode --test-each 20
-python main_deep_learning.py $dataset --log-file $logfile --pretrained --tunable --plotmode --test-each 20
-python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --plotmode --test-each 20
-python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --tunable --plotmode --test-each 20
-python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --plotmode --test-each 20
-python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --tunable --plotmode --test-each 20
-
-python main_deep_learning.py $dataset --log-file $logfile --supervised --plotmode --test-each 20
-python main_deep_learning.py $dataset --log-file $logfile --supervised --tunable --plotmode --test-each 20
-python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --plotmode --test-each 20
-python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --tunable --plotmode --test-each 20
-python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --plotmode --test-each 20
-python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --tunable --plotmode --test-each 20
-
-python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --plotmode --test-each 20
-python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --tunable --plotmode --test-each 20
-python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --plotmode --test-each 20
-python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20
-python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 256 --plotmode --test-each 20
-python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20
-
-python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --plotmode --test-each 20
-python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
-python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20
-python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --tunable --plotmode --test-each 20
-python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20
-python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20
\ No newline at end of file
diff --git a/src/experiment_scripts/run_dl_rcv.sh b/src/experiment_scripts/run_dl_rcv.sh
deleted file mode 100644
index 4782887..0000000
--- a/src/experiment_scripts/run_dl_rcv.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env bash
-
-dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
-python main_deep_learning.py $dataset --pretrained --plotmode --test-each 20
-python main_deep_learning.py $dataset --pretrained --tunable --plotmode --test-each 20
-python main_deep_learning.py $dataset --pretrained --hidden 128 --plotmode --test-each 20
-python main_deep_learning.py $dataset --pretrained --hidden 128 --tunable --plotmode --test-each 20
-python main_deep_learning.py $dataset --pretrained --hidden 256 --plotmode --test-each 20
-python main_deep_learning.py $dataset --pretrained --hidden 256 --tunable --plotmode --test-each 20
-
-python main_deep_learning.py $dataset --supervised --plotmode --test-each 20
-python main_deep_learning.py $dataset --supervised --tunable --plotmode --test-each 20
-python main_deep_learning.py $dataset --supervised --hidden 128 --plotmode --test-each 20
-python main_deep_learning.py $dataset --supervised --hidden 128 --tunable --plotmode --test-each 20
-python main_deep_learning.py $dataset --supervised --hidden 256 --plotmode --test-each 20
-python main_deep_learning.py $dataset --supervised --hidden 256 --tunable --plotmode --test-each 20
-
-python main_deep_learning.py $dataset --pretrained --supervised --plotmode --test-each 20
-python main_deep_learning.py $dataset --pretrained --supervised --tunable --plotmode --test-each 20
-python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --plotmode --test-each 20
-python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20
-python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --plotmode --test-each 20
-python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20
-
-python main_deep_learning.py $dataset --pretrained --supervised --posteriors --plotmode --test-each 20
-python main_deep_learning.py $dataset --pretrained --supervised --posteriors --tunable --plotmode --test-each 20
-python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20
-python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --tunable --plotmode --test-each 20
-python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20
-python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20
\ No newline at end of file
diff --git a/src/experiment_scripts/run_fulljrc_dl.sh b/src/experiment_scripts/run_fulljrc_dl.sh
deleted file mode 100644
index 4d5eeaa..0000000
--- a/src/experiment_scripts/run_fulljrc_dl.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
-seeds='5' #2 3 4 5 6 7 8 9 10'
-for seed in $seeds
-do
- #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce.csv --supervised --seed $seed
- #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_trainable.csv --supervised --tunable --seed $seed
- python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed --force
-
- #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse.csv --pretrained --seed $seed
- #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed
-
- #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse.csv --supervised --pretrained --seed $seed
- #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse_trainable40000.csv --supervised --pretrained --tunable --seed $seed
- #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed --force
-
-done
\ No newline at end of file
diff --git a/src/experiment_scripts/run_fullrcv_dl.sh b/src/experiment_scripts/run_fullrcv_dl.sh
deleted file mode 100644
index 5894aef..0000000
--- a/src/experiment_scripts/run_fullrcv_dl.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
-seeds='1 ' #2 3 4 5' # 6 7 8 9 10'
-for seed in $seeds
-do
- #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce.csv --supervised --seed $seed
- #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_trainable.csv --supervised --tunable --seed $seed
- python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static_plotmode.csv --posteriors --supervised --pretrained --seed $seed --plotmode --test-each 200
-
-
-
- #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse.csv --pretrained --seed $seed
- #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed
-
- #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse.csv --supervised --pretrained --seed $seed
- #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse_trainable.csv --supervised --pretrained --tunable --seed $seed
-
-# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed
-# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable_plotmode.csv --posteriors --supervised --pretrained --tunable --seed $seed --plotmode --test-each 200
- #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed
-done
\ No newline at end of file
diff --git a/src/experiment_scripts/run_fun_bert_jrc.sh b/src/experiment_scripts/run_fun_bert_jrc.sh
deleted file mode 100644
index fc2e2c3..0000000
--- a/src/experiment_scripts/run_fun_bert_jrc.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/env bash
-
-#dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
-#logfile=../log/log_FunBert_jrc.csv
-#
-#runs='0 1 2 3 4'
-#for run in $runs
-#do
-# dataset=$dataset_path$run.pickle
-# python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile #--tunable
-#done
-
-dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
-logfile=../log/log_FunBert_fulljrc_static.csv
-
-python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile
\ No newline at end of file
diff --git a/src/experiment_scripts/run_fun_bert_rcv.sh b/src/experiment_scripts/run_fun_bert_rcv.sh
deleted file mode 100644
index e27fe54..0000000
--- a/src/experiment_scripts/run_fun_bert_rcv.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/env bash
-
-#dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
-#logfile=../log/log_FunBert_rcv_static.csv
-#
-#runs='0 1 2 3 4'
-#for run in $runs
-#do
-# dataset=$dataset_path$run.pickle
-# python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile
-#done
-
-dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
-logfile=../log/log_FunBert_fullrcv_static.csv
-
-python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile
\ No newline at end of file
diff --git a/src/experiment_scripts/run_mbert_jrc.sh b/src/experiment_scripts/run_mbert_jrc.sh
deleted file mode 100644
index 08733a4..0000000
--- a/src/experiment_scripts/run_mbert_jrc.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/env bash
-
-#dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run
-#logfile=../log/log_mBert_jrc_NEW.csv
-#
-#runs='0 1 2 3 4'
-#for run in $runs
-#do
-# dataset=$dataset_path$run.pickle
-# python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50
-#done
-
-logfile=../log/log_mBert_fulljrc.csv
-dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle
-python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50
\ No newline at end of file
diff --git a/src/experiment_scripts/run_mbert_rcv.sh b/src/experiment_scripts/run_mbert_rcv.sh
deleted file mode 100644
index 66ffba1..0000000
--- a/src/experiment_scripts/run_mbert_rcv.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/env bash
-
-#dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run
-#logfile=../log/log_mBert_rcv_NEW.csv
-#
-#runs='0 1 2 3 4'
-#for run in $runs
-#do
-# dataset=$dataset_path$run.pickle
-# python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50
-#done
-
-logfile=../log/log_mBert_fullrcv.csv
-dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
-python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=30 --patience 3
\ No newline at end of file
diff --git a/src/experiment_scripts/run_traditional_jrc.sh b/src/experiment_scripts/run_traditional_jrc.sh
deleted file mode 100644
index 460c9e8..0000000
--- a/src/experiment_scripts/run_traditional_jrc.sh
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env bash
-
-dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle
-
-######################################## POSTERIORS
- # Posteriors
-python main_multimodal_cls.py $dataset -P # + zscore
-python main_multimodal_cls.py $dataset -P -z # +l2norm
-python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight
-
-
-######################################### WCE
- #WCE supervised
-python main_multimodal_cls.py $dataset -S # + zscore
-python main_multimodal_cls.py $dataset -S -z # +l2norm
-python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight
-python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA
-
-python main_multimodal_cls.py $dataset -S -z -p 250 --l2 # +feature weight + pca
-python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 # + SIF
-
-python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight
-python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig
-python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight ig # + pca
-python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight ig
-
-
-python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight pmi
-python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi
-python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight pmi
-python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight pmi
-
-################################# MUSE
-
- # MUSE unsupervised
-python main_multimodal_cls.py $dataset -U # + zscore
-python main_multimodal_cls.py $dataset -U -z # +l2norm
-python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight
-python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA
-
-python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca
-python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig
-
-python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi
-python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi
diff --git a/src/experiment_scripts/run_traditional_rcv.sh b/src/experiment_scripts/run_traditional_rcv.sh
deleted file mode 100644
index 0dcfa2c..0000000
--- a/src/experiment_scripts/run_traditional_rcv.sh
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env bash
-
-dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle
-
-######################################## POSTERIORS
- # Posteriors
-python main_multimodal_cls.py $dataset -P # + zscore
-python main_multimodal_cls.py $dataset -P -z # +l2norm
-python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight
-
-
-######################################### WCE
- #WCE supervised
-python main_multimodal_cls.py $dataset -S # + zscore
-python main_multimodal_cls.py $dataset -S -z # +l2norm
-python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight
-python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA
-
-python main_multimodal_cls.py $dataset -S -z -p 50 --l2 # +feature weight + pca
-python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 # + SIF
-
-python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight
-python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig
-python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight ig # + pca
-python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight ig
-
-
-python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight pmi
-python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi
-python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight pmi
-python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight pmi
-
-################################# MUSE
-
- # MUSE unsupervised
-python main_multimodal_cls.py $dataset -U # + zscore
-python main_multimodal_cls.py $dataset -U -z # +l2norm
-python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight
-python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA
-
-python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca
-python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig
-
-python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi
-python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi
diff --git a/src/experiment_scripts/time_comparison.sh b/src/experiment_scripts/time_comparison.sh
deleted file mode 100644
index 60e1c25..0000000
--- a/src/experiment_scripts/time_comparison.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle
-seeds='1 2 3 4 5 6 7 8 9 10'
-for seed in $seeds
-do
- python main_deep_learning.py $dataset --log-file ../log/time_GRU.csv --supervised --nepochs 50 --seed $seed
- done
\ No newline at end of file
diff --git a/src/learning/learners.py b/src/learning/learners.py
deleted file mode 100644
index 708eaad..0000000
--- a/src/learning/learners.py
+++ /dev/null
@@ -1,171 +0,0 @@
-import numpy as np
-import time
-from scipy.sparse import issparse
-from sklearn.multiclass import OneVsRestClassifier
-from sklearn.model_selection import GridSearchCV
-from joblib import Parallel, delayed
-
-
-def _sort_if_sparse(X):
- if issparse(X) and not X.has_sorted_indices:
- X.sort_indices()
-
-
-def _joblib_transform_multiling(transformer, lX, n_jobs=-1):
- if n_jobs == 1:
- return {lang:transformer(lX[lang]) for lang in lX.keys()}
- else:
- langs = list(lX.keys())
- transformations = Parallel(n_jobs=n_jobs)(delayed(transformer)(lX[lang]) for lang in langs)
- return {lang: transformations[i] for i, lang in enumerate(langs)}
-
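-# Illustrative usage (comment added, not in the original module): apply any single-language
-# transformation to every entry of a {language: matrix} dictionary in parallel, e.g.
-#
-#   lZ = _joblib_transform_multiling(fitted_pca.transform, lX, n_jobs=-1)
-#
-# where fitted_pca is a hypothetical, already fitted transformer and lX maps language codes to
-# document matrices.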
-
-class TrivialRejector:
- def fit(self, X, y):
- self.cats = y.shape[1]
- return self
-
- def decision_function(self, X): return np.zeros((X.shape[0],self.cats))
-
- def predict(self, X): return np.zeros((X.shape[0],self.cats))
-
- def predict_proba(self, X): return np.zeros((X.shape[0],self.cats))
-
- def best_params(self): return {}
-
-
-class NaivePolylingualClassifier:
- """
-    A mere set of independent MonolingualClassifiers, one per language
- """
- def __init__(self, base_learner, parameters=None, n_jobs=-1):
- self.base_learner = base_learner
- self.parameters = parameters
- self.model = None
- self.n_jobs = n_jobs
-
- def fit(self, lX, ly):
- """
- trains the independent monolingual classifiers
- :param lX: a dictionary {language_label: X csr-matrix}
- :param ly: a dictionary {language_label: y np.array}
- :return: self
- """
- tinit = time.time()
- assert set(lX.keys()) == set(ly.keys()), 'inconsistent language mappings in fit'
- langs = list(lX.keys())
- for lang in langs:
- _sort_if_sparse(lX[lang])
-
- models = Parallel(n_jobs=self.n_jobs)\
- (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]),ly[lang]) for lang in langs)
-
- self.model = {lang: models[i] for i, lang in enumerate(langs)}
- self.empty_categories = {lang:self.model[lang].empty_categories for lang in langs}
- self.time = time.time() - tinit
- return self
-
- def decision_function(self, lX):
- """
- :param lX: a dictionary {language_label: X csr-matrix}
- :return: a dictionary of classification scores for each class
- """
- assert self.model is not None, 'predict called before fit'
- assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function'
- langs=list(lX.keys())
- scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].decision_function)(lX[lang]) for lang in langs)
- return {lang:scores[i] for i,lang in enumerate(langs)}
-
- def predict_proba(self, lX):
- """
- :param lX: a dictionary {language_label: X csr-matrix}
- :return: a dictionary of probabilities that each document belongs to each class
- """
- assert self.model is not None, 'predict called before fit'
- assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function'
- langs=list(lX.keys())
- scores = Parallel(n_jobs=self.n_jobs, max_nbytes=None)(delayed(self.model[lang].predict_proba)(lX[lang]) for lang in langs)
- return {lang:scores[i] for i,lang in enumerate(langs)}
-
- def predict(self, lX):
- """
- :param lX: a dictionary {language_label: X csr-matrix}
- :return: a dictionary of predictions
- """
- assert self.model is not None, 'predict called before fit'
- assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in predict'
- if self.n_jobs == 1:
-            return {lang: self.model[lang].predict(lX[lang]) for lang in lX.keys()}
- else:
- langs = list(lX.keys())
- scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].predict)(lX[lang]) for lang in langs)
- return {lang: scores[i] for i, lang in enumerate(langs)}
-
- def best_params(self):
- return {l:model.best_params() for l,model in self.model.items()}
-
-
-class MonolingualClassifier:
-
- def __init__(self, base_learner, parameters=None, n_jobs=-1):
- self.learner = base_learner
- self.parameters = parameters
- self.model = None
- self.n_jobs = n_jobs
- self.best_params_ = None
-
- def fit(self, X, y):
- if X.shape[0] == 0:
- print('Warning: X has 0 elements, a trivial rejector will be created')
- self.model = TrivialRejector().fit(X,y)
- self.empty_categories = np.arange(y.shape[1])
- return self
-
- tinit = time.time()
- _sort_if_sparse(X)
- self.empty_categories = np.argwhere(np.sum(y, axis=0)==0).flatten()
-
- # multi-class format
- if len(y.shape) == 2:
- if self.parameters is not None:
- self.parameters = [{'estimator__' + key: params[key] for key in params.keys()}
- for params in self.parameters]
- self.model = OneVsRestClassifier(self.learner, n_jobs=self.n_jobs)
- else:
- self.model = self.learner
- raise NotImplementedError('not working as a base-classifier for funneling if there are gaps in '
- 'the labels across languages')
-
- # parameter optimization?
- if self.parameters:
- print('debug: optimizing parameters:', self.parameters)
- self.model = GridSearchCV(self.model, param_grid=self.parameters, refit=True, cv=5, n_jobs=self.n_jobs,
- error_score=0, verbose=10)
-
- # print(f'fitting: {self.model} on matrices of shape X={X.shape} Y={y.shape}')
- print(f'fitting: Mono-lingual Classifier on matrices of shape X={X.shape} Y={y.shape}')
- self.model.fit(X, y)
- if isinstance(self.model, GridSearchCV):
- self.best_params_ = self.model.best_params_
- print('best parameters: ', self.best_params_)
- self.time = time.time()-tinit
- return self
-
- def decision_function(self, X):
- assert self.model is not None, 'predict called before fit'
- _sort_if_sparse(X)
- return self.model.decision_function(X)
-
- def predict_proba(self, X):
- assert self.model is not None, 'predict called before fit'
- assert hasattr(self.model, 'predict_proba'), 'the probability predictions are not enabled in this model'
- _sort_if_sparse(X)
- return self.model.predict_proba(X)
-
- def predict(self, X):
- assert self.model is not None, 'predict called before fit'
- _sort_if_sparse(X)
- return self.model.predict(X)
-
- def best_params(self):
- return self.best_params_
\ No newline at end of file
diff --git a/src/learning/transformers.py b/src/learning/transformers.py
deleted file mode 100644
index 5a76740..0000000
--- a/src/learning/transformers.py
+++ /dev/null
@@ -1,863 +0,0 @@
-from torch.optim.lr_scheduler import StepLR
-from torch.utils.data import DataLoader
-from data.tsr_function__ import get_tsr_matrix, get_supervised_matrix, pointwise_mutual_information, information_gain
-from embeddings.embeddings import FastTextMUSE
-from embeddings.supervised import supervised_embeddings_tfidf, zscores
-from learning.learners import NaivePolylingualClassifier, MonolingualClassifier, _joblib_transform_multiling
-from sklearn.decomposition import PCA
-from scipy.sparse import hstack
-from util_transformers.StandardizeTransformer import StandardizeTransformer
-from util.SIF_embed import remove_pc
-from sklearn.preprocessing import normalize
-from scipy.sparse import csr_matrix
-from models.mBert import *
-from models.lstm_class import *
-from util.csv_log import CSVLog
-from util.file import get_file_name, create_if_not_exist, exists
-from util.early_stop import EarlyStopping
-from util.common import *
-import pickle
-import time
-
-
-# ------------------------------------------------------------------
-# Data Processing
-# ------------------------------------------------------------------
-
-
-class FeatureWeight:
-
- def __init__(self, weight='tfidf', agg='mean'):
-        assert weight in ['tfidf', 'pmi', 'ig'] or callable(weight), \
-            'weight should be one of "tfidf", "pmi", "ig", or a callable function'
- assert agg in ['mean', 'max'], 'aggregation function should either be "mean" or "max"'
- self.weight = weight
- self.agg = agg
- self.fitted = False
- if weight == 'pmi':
- self.weight = pointwise_mutual_information
- elif weight == 'ig':
- self.weight = information_gain
-
- def fit(self, lX, ly):
- if not self.fitted:
- if self.weight == 'tfidf':
- self.lF = {l: np.ones(X.shape[1]) for l, X in lX.items()}
- else:
- self.lF = {}
- for l in lX.keys():
- X, y = lX[l], ly[l]
-
- print(f'getting supervised cell-matrix lang {l}')
- tsr_matrix = get_tsr_matrix(get_supervised_matrix(X, y), tsr_score_funtion=self.weight)
- if self.agg == 'max':
- F = tsr_matrix.max(axis=0)
- elif self.agg == 'mean':
- F = tsr_matrix.mean(axis=0)
- self.lF[l] = F
- self.fitted = True
- return self
-
- def transform(self, lX):
- return {lang: csr_matrix.multiply(lX[lang], self.lF[lang]) for lang in lX.keys()}
-
- def fit_transform(self, lX, ly):
- return self.fit(lX, ly).transform(lX)
-
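-# Usage sketch (illustrative only; variable names are hypothetical): re-weight each language's
-# tf-idf matrix by a per-term relevance score before projecting documents onto embedding matrices.
-#
-#   fw = FeatureWeight(weight='ig', agg='max')
-#   lX_weighted = fw.fit_transform(lX, ly)   # lX: {lang: csr_matrix}, ly: {lang: label matrix}
-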
-# ------------------------------------------------------------------
-# View Generators (aka first-tier learners)
-# ------------------------------------------------------------------
-
-
-class PosteriorProbabilitiesEmbedder:
-
- def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1, is_training=True, storing_path='../dumps/'):
- self.fist_tier_learner = first_tier_learner
- self.fist_tier_parameters = first_tier_parameters
- self.l2 = l2
- self.n_jobs = n_jobs
- self.doc_projector = NaivePolylingualClassifier(
- self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs
- )
- self.requires_tfidf = True
- self.storing_path = storing_path
- self.is_training = is_training
-
- def fit(self, lX, lY, lV=None, called_by_viewgen=False):
- # if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'):
- # print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results')
- # return self
- if not called_by_viewgen:
- # Avoid printing if method is called by another View Gen (e.g., GRU ViewGen)
- print('### Posterior Probabilities View Generator (X)')
- print('fitting the projectors... {}'.format(lX.keys()))
- self.doc_projector.fit(lX, lY)
- return self
-
- def transform(self, lX):
- # if dir exist, load and return already computed results
- # _endpoint = 'tr' if self.is_training else 'te'
- # _actual_path = self.storing_path + '/' + _endpoint
- # if exists(_actual_path):
- # print('NB: loading pre-computed results!')
- # with open(_actual_path + '/X.pickle', 'rb') as infile:
- # self.is_training = False
- # return pickle.load(infile)
-
- lZ = self.predict_proba(lX)
- lZ = _normalize(lZ, self.l2)
- # create dir and dump computed results
- # create_if_not_exist(_actual_path)
- # with open(_actual_path + '/X.pickle', 'wb') as outfile:
- # pickle.dump(lZ, outfile)
- self.is_training = False
- return lZ
-
- def fit_transform(self, lX, ly=None, lV=None):
- return self.fit(lX, ly).transform(lX)
-
- def best_params(self):
- return self.doc_projector.best_params()
-
- def predict(self, lX, ly=None):
- return self.doc_projector.predict(lX)
-
- def predict_proba(self, lX, ly=None):
- print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} documents')
- lZ = self.doc_projector.predict_proba(lX)
- return lZ
-
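-# Note added for clarity: transform() maps each language-specific matrix of shape
-# (n_docs, |V_lang|) onto a language-independent matrix of posterior probabilities of shape
-# (n_docs, n_classes), which is what makes documents comparable across languages. A minimal,
-# hedged sketch:
-#
-#   post = PosteriorProbabilitiesEmbedder(first_tier_learner=SVC(kernel='linear', probability=True))
-#   lZ = post.fit_transform(lX, ly)   # lZ[lang].shape == (n_docs_lang, n_classes)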
-
-class MuseEmbedder:
-
- def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight(), sif=False):
- self.path = path
- self.lV = lV
- self.l2 = l2
- self.n_jobs = n_jobs
- self.featureweight = featureweight
- self.sif = sif
- self.requires_tfidf = True
-
-    def fit(self, lX, ly, lV=None):
-        assert lV is not None or self.lV is not None, 'lV not specified'
-        if lV is None:
-            lV = self.lV  # fall back to the vocabulary provided at construction time
-        print('### MUSE View Generator (M)')
- print(f'Loading fastText pretrained vectors for languages {list(lX.keys())}...')
- self.langs = sorted(lX.keys())
- self.MUSE = load_muse_embeddings(self.path, self.langs, self.n_jobs)
- lWordList = {l: self._get_wordlist_from_word2index(lV[l]) for l in self.langs}
- self.MUSE = {l: Muse.extract(lWordList[l]).numpy() for l, Muse in self.MUSE.items()}
- self.featureweight.fit(lX, ly)
- return self
-
- def transform(self, lX):
- MUSE = self.MUSE
- lX = self.featureweight.transform(lX)
- XdotMUSE = Parallel(n_jobs=self.n_jobs)(
- delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs)
- lMuse = {l: XdotMUSE[i] for i, l in enumerate(self.langs)}
- lMuse = _normalize(lMuse, self.l2)
- return lMuse
-
- def fit_transform(self, lX, ly, lV):
- return self.fit(lX, ly, lV).transform(lX)
-
- def _get_wordlist_from_word2index(self, word2index):
- return list(zip(*sorted(word2index.items(), key=lambda x: x[1])))[0]
-
- def _get_output_dim(self):
- return next(iter(self.MUSE.values())).shape[1]  # embedding dimensionality (same for all languages)
-
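-# Hedged sketch (not from the original code) of what MuseEmbedder computes per language: a
-# document embedding obtained as the tfidf-weighted combination of the (aligned) MUSE vectors
-# of its words, i.e. X (docs x vocab) dot M (vocab x dim), optionally L2-normalized.
-def _sketch_muse_doc_embeddings():
-    import numpy as np
-    from sklearn.preprocessing import normalize
-
-    X = np.array([[0.0, 0.5, 0.5],              # toy tfidf matrix: 2 docs x 3 terms
-                  [1.0, 0.0, 0.0]])
-    M = np.random.RandomState(0).rand(3, 4)     # toy "MUSE" matrix: 3 terms x 4 dimensions
-    return normalize(X.dot(M))                  # 2 docs x 4 dims, one row per document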
-
-class WordClassEmbedder:
-
- def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight(), sif=False):
- self.n_jobs = n_jobs
- self.l2 = l2
- self.max_label_space = max_label_space
- self.featureweight = featureweight
- self.sif = sif
- self.requires_tfidf = True
-
- def fit(self, lX, ly, lV=None):
- print('### WCE View Generator (W)')
- print('Computing supervised embeddings...')
- self.langs = sorted(lX.keys())
- WCE = Parallel(n_jobs=self.n_jobs)(
- delayed(word_class_embedding_matrix)(lX[lang], ly[lang], self.max_label_space) for lang in self.langs
- )
- self.lWCE = {l: WCE[i] for i, l in enumerate(self.langs)}
- self.featureweight.fit(lX, ly)
- return self
-
- def transform(self, lX):
- lWCE = self.lWCE
- lX = self.featureweight.transform(lX)
- XdotWCE = Parallel(n_jobs=self.n_jobs)(
- delayed(XdotM)(lX[lang], lWCE[lang], self.sif) for lang in self.langs
- )
- lwce = {l: XdotWCE[i] for i, l in enumerate(self.langs)}
- lwce = _normalize(lwce, self.l2)
- return lwce
-
- def fit_transform(self, lX, ly, lV=None):
- return self.fit(lX, ly).transform(lX)
-
- def _get_output_dim(self):
- return 73 # TODO !
-
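-# Hedged illustration (not the repository's exact implementation) of the word-class embedding
-# idea used above: each term is represented by its association with every class (here simply
-# the term-document matrix projected onto the document-label matrix), and documents are then
-# embedded as a weighted combination of their terms' class profiles.
-def _sketch_word_class_embeddings():
-    import numpy as np
-
-    X = np.random.RandomState(0).rand(20, 8)           # 20 docs x 8 terms (e.g., tfidf weights)
-    Y = (np.random.RandomState(1).rand(20, 3) > 0.7)   # 20 docs x 3 binary labels
-    WCE = X.T.dot(Y.astype(float))                     # 8 terms x 3 classes
-    return X.dot(WCE)                                  # 20 docs x 3: documents in class space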
-
-class MBertEmbedder:
-
- def __init__(self, doc_embed_path=None, patience=10, checkpoint_dir='../hug_checkpoint/', path_to_model=None,
- nC=None, avoid_loading=False):
- self.doc_embed_path = doc_embed_path
- self.patience = patience
- self.checkpoint_dir = checkpoint_dir
- self.fitted = False
- self.requires_tfidf = False
- self.avoid_loading = avoid_loading
- if path_to_model is None:
- self.model = None
- else:
- config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True,
- num_labels=nC)
- if self.avoid_loading:
- self.model = None
- else:
- self.model = BertForSequenceClassification.from_pretrained(path_to_model, config=config).cuda() # TODO: setting model to None in order to avoid loading it onto gpu if we have already pre-computed results!
- self.fitted = True
-
- def fit(self, lX, ly, lV=None, seed=0, nepochs=200, lr=1e-5, val_epochs=1):
- print('### mBERT View Generator (B)')
- if self.fitted is True:
- print('Bert model already fitted!')
- return self
-
- print('Fine-tune mBert on the given dataset.')
- l_tokenized_tr = do_tokenization(lX, max_len=512)
- l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, ly,
- val_prop=0.2, max_val=2000,
- seed=seed)
-
- tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target)
- va_dataset = TrainingDataset(l_split_va, l_split_val_target)
- tr_dataloader = DataLoader(tr_dataset, batch_size=64, shuffle=True)
- va_dataloader = DataLoader(va_dataset, batch_size=64, shuffle=True)
-
- nC = tr_dataset.get_nclasses()
- model = get_model(nC)
- model = model.cuda()
- criterion = torch.nn.BCEWithLogitsLoss().cuda()
- optim = init_optimizer(model, lr=lr, weight_decay=0.01)
- lr_scheduler = StepLR(optim, step_size=25, gamma=0.1)
- early_stop = EarlyStopping(model, optimizer=optim, patience=self.patience,
- checkpoint=self.checkpoint_dir,
- is_bert=True)
-
- # Training loop
- logfile = '../log/log_mBert_extractor.csv'
- method_name = 'mBert_feature_extractor'
-
- tinit = time()
- lang_ids = va_dataset.lang_ids
- for epoch in range(1, nepochs + 1):
- print('# Start Training ...')
- train(model, tr_dataloader, epoch, criterion, optim, method_name, tinit, logfile)
- lr_scheduler.step() # reduces the learning rate # TODO arg epoch?
-
- # Validation
- macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va')
- early_stop(macrof1, epoch)
-
- if early_stop.STOP:
- print('[early-stop] STOP')
- break
-
- model = early_stop.restore_checkpoint()
- self.model = model.cuda()
-
- if val_epochs > 0:
- print(f'running last {val_epochs} training epochs on the validation set')
- for val_epoch in range(1, val_epochs + 1):
- train(self.model, va_dataloader, epoch + val_epoch, criterion, optim, method_name, tinit, logfile)
-
- self.fitted = True
- return self
-
- def transform(self, lX):
- assert self.fitted is True, 'Calling transform without an initialized model! Call fit first, ' \
- 'or pass the "path_to_model" arg at init.'
- print('Obtaining document embeddings from pretrained mBert ')
- l_tokenized_X = do_tokenization(lX, max_len=512, verbose=True)
- feat_dataset = ExtractorDataset(l_tokenized_X)
- feat_lang_ids = feat_dataset.lang_ids
- dataloader = DataLoader(feat_dataset, batch_size=64)
- all_batch_embeddings, id2lang = feature_extractor(dataloader, feat_lang_ids, self.model)
- return all_batch_embeddings
-
- def fit_transform(self, lX, ly, lV=None):
- return self.fit(lX, ly).transform(lX)
-
-
-class RecurrentEmbedder:
-
- def __init__(self, pretrained, supervised, multilingual_dataset, options, concat=False, lr=1e-3,
- we_path='../embeddings', hidden_size=512, sup_drop=0.5, posteriors=False, patience=10,
- test_each=0, checkpoint_dir='../checkpoint', model_path=None, n_jobs=-1):
- self.pretrained = pretrained
- self.supervised = supervised
- self.concat = concat
- self.requires_tfidf = False
- self.multilingual_dataset = multilingual_dataset
- self.model = None
- self.we_path = we_path
- self.langs = multilingual_dataset.langs()
- self.hidden_size = hidden_size
- self.sup_drop = sup_drop
- self.posteriors = posteriors
- self.patience = patience
- self.checkpoint_dir = checkpoint_dir
- self.test_each = test_each
- self.options = options
- self.seed = options.seed
- self.model_path = model_path
- self.n_jobs = n_jobs
- self.is_trained = False
-
- ## INIT MODEL for training
- self.lXtr, self.lytr = self.multilingual_dataset.training(target_as_csr=True)
- self.lXte, self.lyte = self.multilingual_dataset.test(target_as_csr=True)
- self.nC = self.lyte[self.langs[0]].shape[1]
- lpretrained, self.lpretrained_vocabulary = self._load_pretrained_embeddings(self.we_path, self.langs)
- self.multilingual_index = MultilingualIndex()
- self.multilingual_index.index(self.lXtr, self.lytr, self.lXte, self.lpretrained_vocabulary)
- self.multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=self.seed)
- self.multilingual_index.embedding_matrices(lpretrained, self.supervised)
-
- if model_path is not None:
- self.is_trained = True
- self.model = torch.load(model_path)
- else:
- self.model = self._init_Net()
-
- self.optim = init_optimizer(self.model, lr=lr)
- self.criterion = torch.nn.BCEWithLogitsLoss().cuda()
- self.lr_scheduler = StepLR(self.optim, step_size=25, gamma=0.5)
- self.early_stop = EarlyStopping(self.model, optimizer=self.optim, patience=self.patience,
- checkpoint=f'{self.checkpoint_dir}/gru_viewgen_-{get_file_name(self.options.dataset)}')
-
- def fit(self, lX, ly, lV=None, batch_size=128, nepochs=200, val_epochs=1):
- print('### Gated Recurrent Unit View Generator (G)')
- if self.model is None:
- print('TODO: Init model!')
- if not self.is_trained:
- # Batchify input
- self.multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=self.seed)
- l_train_index, l_train_target = self.multilingual_index.l_train()
- l_val_index, l_val_target = self.multilingual_index.l_val()
- l_test_index = self.multilingual_index.l_test_index()
- batcher_train = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs,
- lpad=self.multilingual_index.l_pad())
- batcher_eval = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs,
- lpad=self.multilingual_index.l_pad())
-
- # Train loop
- print('Start training')
- method_name = 'gru_view_generator'
- logfile = init_logfile_nn(method_name, self.options)
- tinit = time.time()
- for epoch in range(1, nepochs + 1):
- train_gru(model=self.model, batcher=batcher_train, ltrain_index=l_train_index, lytr=l_train_target,
- tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim,
- epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None,
- ltrain_bert=None)
- self.lr_scheduler.step()
-
- # validation step
- macrof1 = test_gru(self.model, batcher_eval, l_val_index, None, None, l_val_target, tinit, epoch,
- logfile, self.criterion, 'va')
-
- self.early_stop(macrof1, epoch)
- if self.test_each > 0:
- test_gru(self.model, batcher_eval, l_test_index, None, None, self.lyte, tinit, epoch,
- logfile, self.criterion, 'te')
-
- if self.early_stop.STOP:
- print('[early-stop] STOP')
- print('Restoring best model...')
- break
-
- self.model = self.early_stop.restore_checkpoint()
- print(f'running last {val_epochs} training epochs on the validation set')
- for val_epoch in range(1, val_epochs+1):
- batcher_train.init_offset()
- train_gru(model=self.model, batcher=batcher_train, ltrain_index=l_train_index, lytr=l_train_target,
- tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim,
- epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None,
- ltrain_bert=None)
- self.is_trained = True
-
- return self
-
- def transform(self, lX, batch_size=64):
- lX = self.multilingual_index.get_indexed(lX, self.lpretrained_vocabulary)
- lX = self._get_doc_embeddings(lX, batch_size)
- return lX
-
- def fit_transform(self, lX, ly, lV=None):
- return self.fit(lX, ly).transform(lX)
-
- def _get_doc_embeddings(self, lX, batch_size):
- assert self.is_trained, 'Model is not trained, cannot call transform before fitting the model!'
- print('Generating document embeddings via GRU')
- _lX = {}
-
- l_devel_target = self.multilingual_index.l_devel_target()
-
- # show_gpu('RNN init at extraction')
- for idx, (batch, post, target, lang) in enumerate(batchify(lX, None, l_devel_target,
- batch_size, self.multilingual_index.l_pad())):
- if lang not in _lX.keys():
- _lX[lang] = self.model.get_embeddings(batch, lang)
- else:
- _lX[lang] = np.concatenate((_lX[lang], self.model.get_embeddings(batch, lang)), axis=0)
- # show_gpu('RNN after batch pred at extraction')
- return _lX
-
- # loads the MUSE embeddings and the corresponding vocabularies for all requested languages
- def _load_pretrained_embeddings(self, we_path, langs):
- lpretrained = load_muse_embeddings(we_path, langs, n_jobs=self.n_jobs)
- lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs}
- return lpretrained, lpretrained_vocabulary
- return lpretrained, lpretrained_vocabulary
-
- def _none_dict(self, langs):
- return {l:None for l in langs}
-
- # instantiates the net, initializes the model parameters, and sets embeddings trainable if requested
- def _init_Net(self, xavier_uniform=True):
- model = RNNMultilingualClassifier(
- output_size=self.nC,
- hidden_size=self.hidden_size,
- lvocab_size=self.multilingual_index.l_vocabsize(),
- learnable_length=0,
- lpretrained=self.multilingual_index.l_embeddings(),
- drop_embedding_range=self.multilingual_index.sup_range,
- drop_embedding_prop=self.sup_drop,
- post_probabilities=self.posteriors
- )
- return model.cuda()
-
-
-class DocEmbedderList:
-
- def __init__(self, *embedder_list, aggregation='concat'):
- assert aggregation in {'concat', 'mean'}, 'unknown aggregation mode, valid are "concat" and "mean"'
- self.embedders = list(embedder_list)  # ensure a mutable list so that append() works
- self.aggregation = aggregation
- print(f'Aggregation mode: {self.aggregation}')
-
- def fit(self, lX, ly, lV=None, tfidf=None):
- for transformer in self.embedders:
- _lX = lX
- if transformer.requires_tfidf:
- _lX = tfidf
- transformer.fit(_lX, ly, lV)
- return self
-
- def transform(self, lX, tfidf=None):
- if self.aggregation == 'concat':
- return self.transform_concat(lX, tfidf)
- elif self.aggregation == 'mean':
- return self.transform_mean(lX, tfidf)
-
- def transform_concat(self, lX, tfidf):
- if len(self.embedders) == 1:
- if self.embedders[0].requires_tfidf:
- lX = tfidf
- return self.embedders[0].transform(lX)
-
- some_sparse = False
- langs = sorted(lX.keys())
-
- lZparts = {l: [] for l in langs}
- for transformer in self.embedders:
- _lX = lX
- if transformer.requires_tfidf:
- _lX = tfidf
- lZ = transformer.transform(_lX)
- for l in langs:
- Z = lZ[l]
- some_sparse = some_sparse or issparse(Z)
- lZparts[l].append(Z)
-
- hstacker = hstack if some_sparse else np.hstack
- return {l: hstacker(lZparts[l]) for l in langs}
-
- def transform_mean(self, lX, tfidf):
- if len(self.embedders) == 1:
- if self.embedders[0].requires_tfidf:
- lX = tfidf
- return self.embedders[0].transform(lX)
-
- langs = sorted(lX.keys())
- lZparts = {l: None for l in langs}
-
- for transformer in self.embedders:
- _lX = lX
- if transformer.requires_tfidf:
- _lX = tfidf
- lZ = transformer.transform(_lX)
- for l in langs:
- Z = lZ[l]
- if lZparts[l] is None:
- lZparts[l] = Z
- else:
- lZparts[l] += Z
-
- n_transformers = len(self.embedders)
-
- return {l: lZparts[l] / n_transformers for l in langs}
-
- def fit_transform(self, lX, ly, lV=None, tfidf=None):
- return self.fit(lX, ly, lV, tfidf).transform(lX, tfidf)
-
- def best_params(self):
- return {'todo'}
-
- def append(self, embedder):
- self.embedders.append(embedder)
-
-
-class FeatureSet2Posteriors:
- def __init__(self, transformer, method_id, requires_tfidf=False, l2=True, n_jobs=-1, storing_path='../dumps/'):
- self.transformer = transformer
- self.l2 = l2
- self.n_jobs = n_jobs
- self.prob_classifier = MetaClassifier(
- SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
- self.requires_tfidf = requires_tfidf
-
- self.storing_path = storing_path
- self.is_training = True
- self.method_id = method_id
-
- def fit(self, lX, ly, lV=None):
- if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'):
- print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results')
- return self
-
- if lV is None and hasattr(self.transformer, 'lV'):
- lV = self.transformer.lV
- lZ = self.transformer.fit_transform(lX, ly, lV)
- self.prob_classifier.fit(lZ, ly)
- return self
-
- def transform(self, lX):
- # if dir exist, load and return already computed results
- # _endpoint = 'tr' if self.is_training else 'te'
- # _actual_path = self.storing_path + '/' + _endpoint
- # if exists(_actual_path):
- # print('NB: loading pre-computed results!')
- # with open(_actual_path + '/' + self.method_id + '.pickle', 'rb') as infile:
- # self.is_training = False
- # return pickle.load(infile)
-
- lP = self.predict_proba(lX)
- lP = _normalize(lP, self.l2)
- # create dir and dump computed results
- # create_if_not_exist(_actual_path)
- # with open(_actual_path + '/' + self.method_id + '.pickle', 'wb') as outfile:
- # pickle.dump(lP, outfile)
- self.is_training = False
- return lP
-
- def fit_transform(self, lX, ly, lV):
- return self.fit(lX, ly, lV).transform(lX)
-
- def predict(self, lX, ly=None):
- lZ = self.transformer.transform(lX)
- return self.prob_classifier.predict(lZ)
-
- def predict_proba(self, lX, ly=None):
- lZ = self.transformer.transform(lX)
- return self.prob_classifier.predict_proba(lZ)
-
-
-# ------------------------------------------------------------------
-# Meta-Classifier (aka second-tier learner)
-# ------------------------------------------------------------------
-class MetaClassifier:
-
- def __init__(self, meta_learner, meta_parameters=None, n_jobs=-1, standardize_range=None):
- self.n_jobs = n_jobs
- self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs)
- self.standardize_range = standardize_range
-
- def fit(self, lZ, ly):
- tinit = time.time()
- Z, y = self.stack(lZ, ly)
-
- self.standardizer = StandardizeTransformer(range=self.standardize_range)
- Z = self.standardizer.fit_transform(Z)
-
- print('fitting the Z-space of shape={}'.format(Z.shape))
- self.model.fit(Z, y)
- self.time = time.time() - tinit
-
- def stack(self, lZ, ly=None):
- langs = list(lZ.keys())
- Z = np.vstack([lZ[lang] for lang in langs]) # Z is the language independent space
- if ly is not None:
- y = np.vstack([ly[lang] for lang in langs])
- return Z, y
- else:
- return Z
-
- def predict(self, lZ, ly=None):
- lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs)
- return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
-
- def predict_proba(self, lZ, ly=None):
- lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs)
- return _joblib_transform_multiling(self.model.predict_proba, lZ, n_jobs=self.n_jobs)
-
- def best_params(self):
- return self.model.best_params()
-
-
-# ------------------------------------------------------------------
-# Ensembling (aka Funnelling)
-# ------------------------------------------------------------------
-class Funnelling:
- def __init__(self,
- vectorizer: TfidfVectorizerMultilingual,
- first_tier: DocEmbedderList,
- meta: MetaClassifier):
- self.vectorizer = vectorizer
- self.first_tier = first_tier
- self.meta = meta
- self.n_jobs = meta.n_jobs
-
- def fit(self, lX, ly, target_lang=None):
- if target_lang is not None:
- LX = lX.copy()
- LX.update(target_lang)
- self.vectorizer.fit(LX)
- tfidf_lX = self.vectorizer.transform(lX)
- else:
- tfidf_lX = self.vectorizer.fit_transform(lX, ly)
- lV = self.vectorizer.vocabulary()
- print('## Fitting first-tier learners!')
- lZ = self.first_tier.fit_transform(lX, ly, lV, tfidf=tfidf_lX)
- print('## Fitting meta-learner!')
- self.meta.fit(lZ, ly)
-
- def predict(self, lX, ly=None):
- tfidf_lX = self.vectorizer.transform(lX)
- lZ = self.first_tier.transform(lX, tfidf=tfidf_lX)
- ly_ = self.meta.predict(lZ)
- return ly_
-
- def best_params(self):
- return {'1st-tier': self.first_tier.best_params(),
- 'meta': self.meta.best_params()}
-
-
-class Voting:
- def __init__(self, *prob_classifiers):
- assert all([hasattr(p, 'predict_proba') for p in prob_classifiers]), 'not all classifiers are probabilistic'
- self.prob_classifiers = prob_classifiers
-
- def fit(self, lX, ly, lV=None):
- for classifier in self.prob_classifiers:
- classifier.fit(lX, ly, lV)
-
- def predict(self, lX, ly=None):
- lP = {l: [] for l in lX.keys()}
- for classifier in self.prob_classifiers:
- lPi = classifier.predict_proba(lX)
- for l in lX.keys():
- lP[l].append(lPi[l])
-
- lP = {l: np.stack(Plist).mean(axis=0) for l, Plist in lP.items()}
- ly = {l: P > 0.5 for l, P in lP.items()}
-
- return ly
-
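-# Small numerical illustration (synthetic values, not from the original code) of the soft-voting
-# rule implemented above: the per-language posterior matrices produced by each member are
-# averaged and then thresholded at 0.5 to yield multilabel predictions.
-def _sketch_soft_voting():
-    import numpy as np
-
-    lP_member1 = {'en': np.array([[0.9, 0.2], [0.4, 0.6]])}
-    lP_member2 = {'en': np.array([[0.7, 0.4], [0.2, 0.8]])}
-    lP_avg = {l: np.stack([lP_member1[l], lP_member2[l]]).mean(axis=0) for l in lP_member1}
-    return {l: P > 0.5 for l, P in lP_avg.items()}   # {'en': [[True, False], [False, True]]}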
-
-# ------------------------------------------------------------------------------
-# HELPERS
-# ------------------------------------------------------------------------------
-
-def load_muse_embeddings(we_path, langs, n_jobs=-1):
- MUSE = Parallel(n_jobs=n_jobs)(
- delayed(FastTextMUSE)(we_path, lang) for lang in langs
- )
- return {l: MUSE[i] for i, l in enumerate(langs)}
-
-
-def word_class_embedding_matrix(X, Y, max_label_space=300):
- WCE = supervised_embeddings_tfidf(X, Y)
- WCE = zscores(WCE, axis=0)
-
- nC = Y.shape[1]
- if nC > max_label_space:
- print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
- f'Applying PCA(n_components={max_label_space})')
- pca = PCA(n_components=max_label_space)
- WCE = pca.fit(WCE).transform(WCE)
-
- return WCE
-
-
-def XdotM(X, M, sif):
- E = X.dot(M)
- if sif:
- # print("removing pc...")
- E = remove_pc(E, npc=1)
- return E
-
-
-def _normalize(lX, l2=True):
- return {l: normalize(X) for l, X in lX.items()} if l2 else lX
-
-
-class BatchGRU:
- def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500):
- self.batchsize = batchsize
- self.batches_per_epoch = batches_per_epoch
- self.languages = languages
- self.lpad = lpad
- self.max_pad_length = max_pad_length
- self.init_offset()
-
- def init_offset(self):
- self.offset = {lang: 0 for lang in self.languages}
-
- def batchify(self, l_index, l_post, l_bert, llabels, extractor=False):
- langs = self.languages
- l_num_samples = {l: len(l_index[l]) for l in langs}
-
- max_samples = max(l_num_samples.values())
- n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0)
- if self.batches_per_epoch != -1 and self.batches_per_epoch < n_batches:
- n_batches = self.batches_per_epoch
-
- for b in range(n_batches):
- for lang in langs:
- index, labels = l_index[lang], llabels[lang]
- offset = self.offset[lang]
- if offset >= l_num_samples[lang]:
- offset = 0
- limit = offset+self.batchsize
-
- batch_slice = slice(offset, limit)
- batch = index[batch_slice]
- batch_labels = labels[batch_slice].toarray()
-
- post = None
- bert_emb = None
-
- batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length)
- batch = torch.LongTensor(batch).cuda()
- target = torch.FloatTensor(batch_labels).cuda()
-
- self.offset[lang] = limit
-
- yield batch, post, bert_emb, target, lang
-
-
-def pad(index_list, pad_index, max_pad_length=None):
- pad_length = np.max([len(index) for index in index_list])
- if max_pad_length is not None:
- pad_length = min(pad_length, max_pad_length)
- for i,indexes in enumerate(index_list):
- index_list[i] = [pad_index]*(pad_length-len(indexes)) + indexes[:pad_length]
- return index_list
-
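-# Usage note (illustrative, not part of the original file): pad() left-pads each index list with
-# pad_index up to the (optionally capped) maximum length, while longer documents keep only their
-# first max_pad_length tokens, e.g.:
-#   pad([[1, 2, 3], [4]], pad_index=0, max_pad_length=2)  ->  [[1, 2], [0, 4]]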
-
-def train_gru(model, batcher, ltrain_index, lytr, tinit, logfile, criterion, optim, epoch, method_name, opt,
- ltrain_posteriors=None, ltrain_bert=None, log_interval=10):
- _dataset_path = opt.dataset.split('/')[-1].split('_')
- dataset_id = _dataset_path[0] + _dataset_path[-1]
-
- # show_gpu('RNN init pre-training')
- loss_history = []
- model.train()
- for idx, (batch, post, bert_emb, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)):
- optim.zero_grad()
- loss = criterion(model(batch, post, bert_emb, lang), target)
- loss.backward()
- clip_gradient(model)
- optim.step()
- loss_history.append(loss.item())
- # show_gpu('RNN after batch prediction')
-
- if idx % log_interval == 0:
- interval_loss = np.mean(loss_history[-log_interval:])
- print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, '
- f'Training Loss: {interval_loss:.6f}')
-
- mean_loss = np.mean(loss_history)  # epoch-level mean of the training loss
- logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time.time() - tinit)
- return mean_loss
-
-
-def test_gru(model, batcher, ltest_index, ltest_posteriors, lte_bert, lyte, tinit, epoch, logfile, criterion, measure_prefix):
- loss_history = []
- model.eval()
- langs = sorted(ltest_index.keys())
- predictions = {l: [] for l in langs}
- yte_stacked = {l: [] for l in langs}
- batcher.init_offset()
- for batch, post, bert_emb, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lte_bert, lyte),
- desc='evaluation: '):
- logits = model(batch, post, bert_emb, lang)
- loss = criterion(logits, target).item()
- prediction = predict(logits)
- predictions[lang].append(prediction)
- yte_stacked[lang].append(target.detach().cpu().numpy())
- loss_history.append(loss)
-
- ly = {l:np.vstack(yte_stacked[l]) for l in langs}
- ly_ = {l:np.vstack(predictions[l]) for l in langs}
- l_eval = evaluate(ly, ly_)
- metrics = []
- for lang in langs:
- macrof1, microf1, macrok, microk = l_eval[lang]
- metrics.append([macrof1, microf1, macrok, microk])
- if measure_prefix == 'te':
- print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
- Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
- print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
-
- mean_loss = np.mean(loss_history)
- logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time.time() - tinit)
- logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time.time() - tinit)
- logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time.time() - tinit)
- logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time.time() - tinit)
- logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time.time() - tinit)
-
- return Mf1
-
-
-def clip_gradient(model, clip_value=1e-1):
- params = list(filter(lambda p: p.grad is not None, model.parameters()))
- for p in params:
- p.grad.data.clamp_(-clip_value, clip_value)
-
-
-def init_logfile_nn(method_name, opt):
- import os
- logfile = CSVLog(opt.logfile_gru, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse'])
- logfile.set_default('dataset', opt.dataset)
- logfile.set_default('run', opt.seed)
- logfile.set_default('method', get_method_name(os.path.basename(opt.dataset), opt.posteriors, opt.supervised, opt.pretrained, opt.mbert,
- opt.gruViewGenerator, opt.gruMUSE, opt.gruWCE, opt.agg, opt.allprob))
- assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \
- f'and run {opt.seed} already calculated'
- return logfile
diff --git a/src/main_gFun.py b/src/main_gFun.py
deleted file mode 100644
index 8694087..0000000
--- a/src/main_gFun.py
+++ /dev/null
@@ -1,166 +0,0 @@
-import os
-from dataset_builder import MultilingualDataset
-from learning.transformers import *
-from util.evaluation import *
-from util.file import exists
-from util.results import PolylingualClassificationResults
-from util.common import *
-from util.parser_options import *
-
-if __name__ == '__main__':
- (op, args) = parser.parse_args()
- dataset = op.dataset
- assert exists(dataset), 'Unable to find file '+str(dataset)
- assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
- assert op.posteriors or op.supervised or op.pretrained or op.mbert or op.gruViewGenerator, \
- 'empty set of document embeddings is not allowed'
- if op.gruViewGenerator:
- assert op.gruWCE or op.gruMUSE, 'The GRU View Generator requires its embedding layer to be ' \
- 'explicitly initialized with MUSE and/or WCE embeddings (gruMUSE/gruWCE)'
-
- l2 = op.l2
- dataset_file = os.path.basename(dataset)
- results = PolylingualClassificationResults('../log/' + op.output)
- allprob = 'Prob' if op.allprob else ''
-
- method_name, dataset_name = get_method_name(dataset, op.posteriors, op.supervised, op.pretrained, op.mbert,
- op.gruViewGenerator, op.gruMUSE, op.gruWCE, op.agg, op.allprob)
-
- print(f'Method: gFun{method_name}\nDataset: {dataset_name}')
- print('-'*50)
-
- n_jobs = -1 # TODO SETTING n_JOBS
-
- standardize_range = slice(0, 0)
- if op.zscore:
- standardize_range = None
-
- # load dataset
- data = MultilingualDataset.load(dataset)
- # data.set_view(languages=['it']) # TODO: DEBUG SETTING
- data.show_dimensions()
- lXtr, lytr = data.training()
- lXte, lyte = data.test()
-
- # text preprocessing
- tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
-
- # feature weighting (for word embeddings average)
- feat_weighting = FeatureWeight(op.feat_weight, agg='mean')
-
- # document embedding modules aka View Generators
- doc_embedder = DocEmbedderList(aggregation='mean' if op.agg else 'concat')
-
- # init View Generators
- if op.posteriors:
- """
- View Generator (-X): cast document representations encoded via TFIDF into posterior probabilities by means
- of a set of SVM.
- """
- # Check if we already have VG outputs from previous runs
- VG_name = 'X'
- storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
- exist = exists(storing_path)
- doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True,
- kernel='linear',
- C=op.set_c),
- l2=l2, storing_path=storing_path, n_jobs=n_jobs))
-
- if op.supervised:
- """
- View Generator (-W): generates document representation via Word-Class-Embeddings.
- Document embeddings are obtained via weighted sum of document's constituent embeddings.
- """
- VG_name = 'W'
- storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
- exist = exists(storing_path)
- wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting,
- sif=op.sif, n_jobs=n_jobs)
- if op.allprob:
- wce = FeatureSet2Posteriors(wce, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path,
- n_jobs=n_jobs)
- doc_embedder.append(wce)
-
- if op.pretrained:
- """
- View Generator (-M): generates document representation via MUSE embeddings (Fasttext multilingual word
- embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings.
- """
- VG_name = 'M'
- storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
- exist = exists(storing_path)
- muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif, n_jobs=n_jobs)
- if op.allprob:
- muse = FeatureSet2Posteriors(muse, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path,
- n_jobs=n_jobs)
- doc_embedder.append(muse)
-
- if op.gruViewGenerator:
- """
- View Generator (-G): generates document embedding by means of a Gated Recurrent Units. The model can be
- initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,).
- Output dimension is (n_docs, 512). If --allprob output will be casted to posterior prob space via SVM.
- """
- VG_name = 'G'
- VG_name += '_muse' if op.gruMUSE else ''
- VG_name += '_wce' if op.gruWCE else ''
- storing_path = 'Nope' # f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
- rnn_embedder = RecurrentEmbedder(pretrained=op.gruMUSE, supervised=op.gruWCE, multilingual_dataset=data,
- options=op, model_path=None, n_jobs=n_jobs)
- if op.allprob:
- rnn_embedder = FeatureSet2Posteriors(rnn_embedder, method_id=VG_name, requires_tfidf=False,
- storing_path=storing_path, n_jobs=n_jobs)
- doc_embedder.append(rnn_embedder)
-
- if op.mbert:
- """
- View generator (-B): generates document embedding via mBERT model.
- """
- VG_name = 'B'
- storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}'
- avoid_loading = not op.avoid_loading  # TODO research setting: if False, mBERT is loaded onto the GPU to compute document embeddings (i.e., only on the first run of each experiment)
-
- mbert = MBertEmbedder(path_to_model=op.bert_path, nC=data.num_categories(), avoid_loading=avoid_loading)
- if op.allprob:
- mbert = FeatureSet2Posteriors(mbert, method_id=VG_name, l2=l2, storing_path=storing_path)
- doc_embedder.append(mbert)
-
- # metaclassifier
- meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
- meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf', C=op.set_c),
- meta_parameters=get_params(op.optimc), standardize_range=standardize_range, n_jobs=n_jobs)
-
- # ensembling the modules
- classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)
-
- print('\n# Fitting Funnelling Architecture...')
- tinit = time.time()
- classifier.fit(lXtr, lytr)
- train_time = time.time() - tinit  # renamed so as not to shadow the time module
-
- print('\n# Evaluating ...')
- l_eval = evaluate_method(classifier, lXte, lyte)
-
- metrics = []
- for lang in lXte.keys():
- macrof1, microf1, macrok, microk = l_eval[lang]
- metrics.append([macrof1, microf1, macrok, microk])
- print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
- results.add_row(method='MultiModal',
- learner='SVM',
- optimp=op.optimc,
- sif=op.sif,
- zscore=op.zscore,
- l2=op.l2,
- wescaler=op.feat_weight,
- pca=op.max_labels_S,
- id=method_name,
- dataset=dataset_name,
- time=train_time,
- lang=lang,
- macrof1=macrof1,
- microf1=microf1,
- macrok=macrok,
- microk=microk,
- notes='')
- print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3))
diff --git a/src/models/cnn_class_bu.py b/src/models/cnn_class_bu.py
deleted file mode 100644
index a47d5fc..0000000
--- a/src/models/cnn_class_bu.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import torch.nn as nn
-from torch.nn import functional as F
-import torch
-
-class CNN_pdr(nn.Module):
-
- def __init__(self, output_size, out_channels, compositional_dim, vocab_size, emb_dim, embeddings=None, drop_embedding_range=None,
- drop_embedding_prop=0, drop_prob=0.5):
- super(CNN_pdr, self).__init__()
- self.vocab_size = vocab_size
- self.emb_dim = emb_dim
- self.embeddings = torch.FloatTensor(embeddings)
- self.embedding_layer = nn.Embedding(vocab_size, emb_dim, _weight=self.embeddings)
- self.kernel_heights = kernel_heights = [3, 5, 7]
- self.stride = 1
- self.padding = 0
- self.drop_embedding_range = drop_embedding_range
- self.drop_embedding_prop = drop_embedding_prop
- assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range'
- self.nC = 73
-
- self.conv1 = nn.Conv2d(1, compositional_dim, (self.kernel_heights[0], self.emb_dim), self.stride, self.padding)
- self.dropout = nn.Dropout(drop_prob)
- self.label = nn.Linear(len(kernel_heights) * out_channels, output_size)
- self.fC = nn.Linear(compositional_dim + self.nC, self.nC)
-
-
- def forward(self, x, svm_output):
- x = torch.LongTensor(x)
- svm_output = torch.FloatTensor(svm_output)
- x = self.embedding_layer(x)
- x = self.conv1(x.unsqueeze(1))
- x = F.relu(x.squeeze(3))
- x = F.max_pool1d(x, x.size()[2]).squeeze(2)
- x = torch.cat((x, svm_output), 1)
- x = torch.sigmoid(self.fC(x))  # F.sigmoid is deprecated in recent PyTorch versions
- return x #.detach().numpy()
-
- # logits = self.label(x)
- # return logits
-
-
diff --git a/src/models/helpers.py b/src/models/helpers.py
deleted file mode 100755
index 93e5805..0000000
--- a/src/models/helpers.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import torch
-import torch.nn as nn
-from torch.nn import functional as F
-
-
-
-def init_embeddings(pretrained, vocab_size, learnable_length, device='cuda'):
- pretrained_embeddings = None
- pretrained_length = 0
- if pretrained is not None:
- pretrained_length = pretrained.shape[1]
- assert pretrained.shape[0] == vocab_size, 'pre-trained matrix does not match with the vocabulary size'
- pretrained_embeddings = nn.Embedding(vocab_size, pretrained_length)
- pretrained_embeddings.weight = nn.Parameter(pretrained, requires_grad=False)
- # pretrained_embeddings.to(device)
-
- learnable_embeddings = None
- if learnable_length > 0:
- learnable_embeddings = nn.Embedding(vocab_size, learnable_length)
- # learnable_embeddings.to(device)
-
- embedding_length = learnable_length + pretrained_length
- assert embedding_length > 0, '0-size embeddings'
-
- return pretrained_embeddings, learnable_embeddings, embedding_length
-
-
-def embed(model, input, lang):
- input_list = []
- if model.lpretrained_embeddings[lang]:
- input_list.append(model.lpretrained_embeddings[lang](input))
- if model.llearnable_embeddings[lang]:
- input_list.append(model.llearnable_embeddings[lang](input))
- return torch.cat(tensors=input_list, dim=2)
-
-
-def embedding_dropout(input, drop_range, p_drop=0.5, training=True):
- if p_drop > 0 and training and drop_range is not None:
- p = p_drop
- drop_from, drop_to = drop_range
- m = drop_to - drop_from #length of the supervised embedding
- l = input.shape[2] #total embedding length
- corr = (1 - p)
- input[:, :, drop_from:drop_to] = corr * F.dropout(input[:, :, drop_from:drop_to], p=p)
- input /= (1 - (p * m / l))
-
- return input
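-
-# Hedged usage note (not part of the original file): embedding_dropout applies dropout only to
-# the slice of the embedding that holds the supervised (WCE) dimensions and then rescales the
-# whole vector, so the expected magnitude of the embedding is preserved. Assumed shapes:
-#   x = torch.rand(2, 5, 10)                                  # batch x tokens x embedding dims
-#   x = embedding_dropout(x, drop_range=(6, 10), p_drop=0.5)  # last 4 dims are the supervised range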
diff --git a/src/models/lstm_class.py b/src/models/lstm_class.py
deleted file mode 100755
index 98424f1..0000000
--- a/src/models/lstm_class.py
+++ /dev/null
@@ -1,114 +0,0 @@
-#taken from https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM.py
-import torch
-import torch.nn as nn
-from torch.autograd import Variable
-from models.helpers import *
-
-
-class RNNMultilingualClassifier(nn.Module):
-
- def __init__(self, output_size, hidden_size, lvocab_size, learnable_length, lpretrained=None,
- drop_embedding_range=None, drop_embedding_prop=0, post_probabilities=True, only_post=False,
- bert_embeddings=False):
-
- super(RNNMultilingualClassifier, self).__init__()
- self.output_size = output_size
- self.hidden_size = hidden_size
- self.drop_embedding_range = drop_embedding_range
- self.drop_embedding_prop = drop_embedding_prop
- self.post_probabilities = post_probabilities
- self.bert_embeddings = bert_embeddings
- assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range'
-
- self.lpretrained_embeddings = nn.ModuleDict()
- self.llearnable_embeddings = nn.ModuleDict()
- self.embedding_length = None
- self.langs = sorted(lvocab_size.keys())
- self.only_post = only_post
-
- self.n_layers = 1
- self.n_directions = 1
-
- self.dropout = nn.Dropout(0.6)
-
- lstm_out = 256
- ff1 = 512
- ff2 = 256
-
- lpretrained_embeddings = {}
- llearnable_embeddings = {}
- if only_post==False:
- for l in self.langs:
- pretrained = lpretrained[l] if lpretrained else None
- pretrained_embeddings, learnable_embeddings, embedding_length = init_embeddings(
- pretrained, lvocab_size[l], learnable_length
- )
- lpretrained_embeddings[l] = pretrained_embeddings
- llearnable_embeddings[l] = learnable_embeddings
- self.embedding_length = embedding_length
-
- # self.lstm = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2))
- self.rnn = nn.GRU(self.embedding_length, hidden_size)
- self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out)
- self.lpretrained_embeddings.update(lpretrained_embeddings)
- self.llearnable_embeddings.update(llearnable_embeddings)
-
- self.linear1 = nn.Linear(lstm_out, ff1)
- self.linear2 = nn.Linear(ff1, ff2)
-
- if only_post:
- self.label = nn.Linear(output_size, output_size)
- elif post_probabilities and not bert_embeddings:
- self.label = nn.Linear(ff2 + output_size, output_size)
- elif bert_embeddings and not post_probabilities:
- self.label = nn.Linear(ff2 + 768, output_size)
- elif post_probabilities and bert_embeddings:
- self.label = nn.Linear(ff2 + output_size + 768, output_size)
- else:
- self.label = nn.Linear(ff2, output_size)
-
- def forward(self, input, post, bert_embed, lang):
- if self.only_post:
- doc_embedding = post
- else:
- doc_embedding = self.transform(input, lang)
- if self.post_probabilities:
- doc_embedding = torch.cat([doc_embedding, post], dim=1)
- if self.bert_embeddings:
- doc_embedding = torch.cat([doc_embedding, bert_embed], dim=1)
-
- logits = self.label(doc_embedding)
- return logits
-
- def transform(self, input, lang):
- batch_size = input.shape[0]
- input = embed(self, input, lang)
- input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop,
- training=self.training)
- input = input.permute(1, 0, 2)
- h_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda())
- # c_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda())
- # output, (_, _) = self.lstm(input, (h_0, c_0))
- output, _ = self.rnn(input, h_0)
- output = output[-1, :, :]
- output = F.relu(self.linear0(output))
- output = self.dropout(F.relu(self.linear1(output)))
- output = self.dropout(F.relu(self.linear2(output)))
- return output
-
- def finetune_pretrained(self):
- for l in self.langs:
- self.lpretrained_embeddings[l].requires_grad = True
- self.lpretrained_embeddings[l].weight.requires_grad = True
-
- def get_embeddings(self, input, lang):
- batch_size = input.shape[0]
- input = embed(self, input, lang)
- input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop,
- training=self.training)
- input = input.permute(1, 0, 2)
- h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size).cuda())
- output, _ = self.rnn(input, h_0)
- output = output[-1, :, :]
- return output.cpu().detach().numpy()
-
diff --git a/src/models/mBert.py b/src/models/mBert.py
deleted file mode 100644
index 56695a6..0000000
--- a/src/models/mBert.py
+++ /dev/null
@@ -1,247 +0,0 @@
-from copy import deepcopy
-import torch
-from torch.utils.data import Dataset
-from transformers import BertForSequenceClassification, BertTokenizer, AdamW, BertConfig
-from sklearn.model_selection import train_test_split
-from util.evaluation import *
-from time import time
-from util.common import show_gpu
-
-
-def predict(logits, classification_type='multilabel'):
- if classification_type == 'multilabel':
- prediction = torch.sigmoid(logits) > 0.5
- elif classification_type == 'singlelabel':
- prediction = torch.argmax(logits, dim=1).view(-1, 1)
- else:
- raise ValueError(f'unknown classification type: {classification_type}')
-
- return prediction.detach().cpu().numpy()
-
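-# Small worked example (not part of the original file) of the multilabel rule above: logits go
-# through a sigmoid and each class is thresholded independently at 0.5, e.g.:
-#   predict(torch.tensor([[2.0, -1.0], [-0.5, 0.3]]))   # -> [[True, False], [False, True]]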
-
-class TrainingDataset(Dataset):
-
- def __init__(self, data, labels):
- self.langs = data.keys()
- self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)}
-
- for i, lang in enumerate(self.langs):
- _data = data[lang]['input_ids']
- _data = np.array(_data)
- _labels = labels[lang]
- _lang_value = np.full(len(_data), self.lang_ids[lang])
-
- if i == 0:
- self.data = _data
- self.labels = _labels
- self.lang_index = _lang_value
- else:
- self.data = np.vstack((self.data, _data))
- self.labels = np.vstack((self.labels, _labels))
- self.lang_index = np.concatenate((self.lang_index, _lang_value))
-
- def __len__(self):
- return len(self.data)
-
- def __getitem__(self, idx):
- x = self.data[idx]
- y = self.labels[idx]
- lang = self.lang_index[idx]
-
- return x, torch.tensor(y, dtype=torch.float), lang
-
- def get_lang_ids(self):
- return self.lang_ids
-
- def get_nclasses(self):
- if hasattr(self, 'labels'):
- return len(self.labels[0])
- else:
- print('Method called before init!')
-
-
-class ExtractorDataset(Dataset):
- """
- data: dict of lang specific tokenized data
- labels: dict of lang specific targets
- """
-
- def __init__(self, data):
- self.langs = data.keys()
- self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)}
-
- for i, lang in enumerate(self.langs):
- _data = data[lang]['input_ids']
- _data = np.array(_data)
- _lang_value = np.full(len(_data), self.lang_ids[lang])
-
- if i == 0:
- self.data = _data
- self.lang_index = _lang_value
- else:
- self.data = np.vstack((self.data, _data))
- self.lang_index = np.concatenate((self.lang_index, _lang_value))
-
- def __len__(self):
- return len(self.data)
-
- def __getitem__(self, idx):
- x = self.data[idx]
- lang = self.lang_index[idx]
-
- return x, lang
-
- def get_lang_ids(self):
- return self.lang_ids
-
-
-def get_model(n_out):
- print('# Initializing model ...')
- model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_out)
- return model
-
-
-def init_optimizer(model, lr, weight_decay=0):
- no_decay = ['bias', 'LayerNorm.weight']
- optimizer_grouped_parameters = [
- {'params': [p for n, p in model.named_parameters()
- if not any(nd in n for nd in no_decay)],
- 'weight_decay': weight_decay},
- {'params': [p for n, p in model.named_parameters()
- if any(nd in n for nd in no_decay)],
- 'weight_decay': 0.0}  # biases and LayerNorm weights are excluded from weight decay
- ]
- optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
- return optimizer
-
-
-def get_lr(optimizer):
- for param_group in optimizer.param_groups:
- return param_group['lr']
-
-
-def get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop, max_val, seed):
- l_split_va = deepcopy(l_tokenized_tr)
- l_split_val_target = {l: [] for l in l_tokenized_tr.keys()}
- l_split_tr = deepcopy(l_tokenized_tr)
- l_split_tr_target = {l: [] for l in l_tokenized_tr.keys()}
-
- for lang in l_tokenized_tr.keys():
- val_size = int(min(len(l_tokenized_tr[lang]['input_ids']) * val_prop, max_val))
- l_split_tr[lang]['input_ids'], l_split_va[lang]['input_ids'], l_split_tr_target[lang], l_split_val_target[
- lang] = \
- train_test_split(l_tokenized_tr[lang]['input_ids'], l_devel_target[lang], test_size=val_size,
- random_state=seed, shuffle=True)
-
- return l_split_tr, l_split_tr_target, l_split_va, l_split_val_target
-
-
-def do_tokenization(l_dataset, max_len=512, verbose=True):
- if verbose:
- print('# Starting Tokenization ...')
- tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
- langs = l_dataset.keys()
- l_tokenized = {}
- for lang in langs:
- l_tokenized[lang] = tokenizer(l_dataset[lang],
- truncation=True,
- max_length=max_len,
- padding='max_length')
- return l_tokenized
-
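-# Usage note (illustrative, not part of the original file): the input is a dict mapping each
-# language to a list of raw documents; the output keeps the same keys and holds the fields
-# produced by the HuggingFace tokenizer (e.g., 'input_ids'), already truncated and padded
-# to max_len, e.g.:
-#   l_tokenized = do_tokenization({'en': ['a short document'], 'it': ['un breve documento']})
-#   len(l_tokenized['en']['input_ids'][0])   # == 512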
-
-def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, log_interval=10):
- # _dataset_path = opt.dataset.split('/')[-1].split('_')
- # dataset_id = _dataset_path[0] + _dataset_path[-1]
- dataset_id = 'TODO fix this!' # TODO
-
- loss_history = []
- model.train()
-
- for idx, (batch, target, lang_idx) in enumerate(train_dataloader):
- optim.zero_grad()
- out = model(batch.cuda())
- logits = out[0]
- loss = criterion(logits, target.cuda())
- loss.backward()
- # clip_gradient(model)
- optim.step()
- loss_history.append(loss.item())
-
- if idx % log_interval == 0:
- interval_loss = np.mean(loss_history[-log_interval:])  # mean over the last log_interval steps
- print(
- f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
-
- mean_loss = np.mean(loss_history)  # epoch-level mean of the training loss
- logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
- return mean_loss
-
-
-def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix):
- print('# Validating model ...')
- loss_history = []
- model.eval()
- langs = lang_ids.keys()
- id_2_lang = {v: k for k, v in lang_ids.items()}
- predictions = {l: [] for l in langs}
- yte_stacked = {l: [] for l in langs}
-
- for batch, target, lang_idx in test_dataloader:
- out = model(batch.cuda())
- logits = out[0]
- loss = criterion(logits, target.cuda()).item()
- prediction = predict(logits)
- loss_history.append(loss)
-
- # Assigning prediction to dict in predictions and yte_stacked according to lang_idx
- for i, pred in enumerate(prediction):
- lang_pred = id_2_lang[lang_idx.numpy()[i]]
- predictions[lang_pred].append(pred)
- yte_stacked[lang_pred].append(target[i].detach().cpu().numpy())
-
- ly = {l: np.vstack(yte_stacked[l]) for l in langs}
- ly_ = {l: np.vstack(predictions[l]) for l in langs}
- l_eval = evaluate(ly, ly_)
- metrics = []
- for lang in langs:
- macrof1, microf1, macrok, microk = l_eval[lang]
- metrics.append([macrof1, microf1, macrok, microk])
- if measure_prefix == 'te':
- print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
- Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
- print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
-
- mean_loss = np.mean(loss_history)
- logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit)
- logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit)
- logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit)
- logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit)
- logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit)
-
- return Mf1
-
-
-def feature_extractor(data, lang_ids, model):
- """
- Hidden states: tuple of torch.FloatTensor (one for the output of the embeddings plus one for
- the output of each layer), each of shape (batch_size, sequence_length, hidden_size).
- """
- print('# Feature Extractor Mode...')
- # show_gpu('Before Training')
- all_batch_embeddings = {}
- id2lang = {v: k for k, v in lang_ids.items()}
- with torch.no_grad():
- for batch, lang_idx in data:
- out = model(batch.cuda())
- # show_gpu('After Batch Prediction')
- last_hidden_state = out[1][-1]
- batch_embeddings = last_hidden_state[:, 0, :]
- for i, l_idx in enumerate(lang_idx.numpy()):
- if id2lang[l_idx] not in all_batch_embeddings.keys():
- all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy()
- else:
- all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
- batch_embeddings[i].detach().cpu().numpy()))
- # show_gpu('After Full Prediction')
- return all_batch_embeddings, id2lang
diff --git a/src/results/results_manager.py b/src/results/results_manager.py
deleted file mode 100644
index 1fe57dd..0000000
--- a/src/results/results_manager.py
+++ /dev/null
@@ -1,11 +0,0 @@
-import pandas as pd
-import numpy as np
-
-# df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/final_results.csv", delimiter='\t')
-df = pd.read_csv("10run_rcv_final_results.csv", delimiter='\t')
-pivot = pd.pivot_table(df, values=['macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'id', 'optimp', 'zscore', 'l2', 'wescaler', 'pca', 'sif'], aggfunc=[np.mean, np.std])
-with pd.option_context('display.max_rows', None):
- print(pivot.round(3))
-print('Finished ...')
-
-
diff --git a/src/util/SIF_embed.py b/src/util/SIF_embed.py
deleted file mode 100644
index cfe096e..0000000
--- a/src/util/SIF_embed.py
+++ /dev/null
@@ -1,56 +0,0 @@
-import numpy as np
-from sklearn.decomposition import TruncatedSVD
-
-def get_weighted_average(We, x, w):
- """
- Compute the weighted average vectors
- :param We: We[i,:] is the vector for word i
- :param x: x[i, :] are the indices of the words in sentence i
- :param w: w[i, :] are the weights for the words in sentence i
- :return: emb[i, :] are the weighted average vector for sentence i
- """
- n_samples = x.shape[0]
- emb = np.zeros((n_samples, We.shape[1]))
- for i in range(n_samples):
- emb[i,:] = w[i,:].dot(We[x[i,:],:]) / np.count_nonzero(w[i,:])
- return emb
-
-def compute_pc(X,npc=1):
- """
- Compute the principal components.
- :param X: X[i,:] is a data point
- :param npc: number of principal components to remove
- :return: component_[i,:] is the i-th pc
- """
- svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
- svd.fit(X)
- return svd.components_
-
-def remove_pc(X, npc=1):
- """
- Remove the projection on the principal components
- :param X: X[i,:] is a data point
- :param npc: number of principal components to remove
- :return: XX[i, :] is the data point after removing its projection
- """
- pc = compute_pc(X, npc)
- if npc==1:
- XX = X - X.dot(pc.transpose()) * pc
- else:
- XX = X - X.dot(pc.transpose()).dot(pc)
- return XX
-
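-# Self-contained sketch (not part of the original file, reusing this module's numpy and
-# TruncatedSVD imports) of what remove_pc does: project the data onto its first principal
-# direction and subtract that projection, which typically removes the dominant "common"
-# component shared by all sentence embeddings.
-def _sketch_remove_first_pc():
-    X = np.random.RandomState(0).rand(100, 20)
-    pc = TruncatedSVD(n_components=1, n_iter=7, random_state=0).fit(X).components_
-    X_clean = X - X.dot(pc.transpose()) * pc   # same recipe as remove_pc(X, npc=1)
-    return X_clean                             # the removed direction is numerically gone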
-
-def SIF_embedding(We, x, w, params):
- """
- Compute the scores between pairs of sentences using weighted average + removing the projection on the first principal component
- :param We: We[i,:] is the vector for word i
- :param x: x[i, :] are the indices of the words in the i-th sentence
- :param w: w[i, :] are the weights for the words in the i-th sentence
- :param params.rmpc: if >0, remove the projections of the sentence embeddings to their first principal component
- :return: emb, emb[i, :] is the embedding for sentence i
- """
- emb = get_weighted_average(We, x, w)
- if params.rmpc > 0:
- emb = remove_pc(emb, params.rmpc)
- return emb
\ No newline at end of file
diff --git a/src/util/common.py b/src/util/common.py
deleted file mode 100755
index 48a0525..0000000
--- a/src/util/common.py
+++ /dev/null
@@ -1,542 +0,0 @@
-import subprocess
-import warnings
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.svm import SVC
-from sklearn.model_selection import train_test_split
-from embeddings.supervised import get_supervised_embeddings
-import numpy as np
-from tqdm import tqdm
-import torch
-warnings.filterwarnings("ignore", category=DeprecationWarning)
-
-
-def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
- """
- Index (i.e., replaces word strings with numerical indexes) a list of string documents
- :param data: list of string documents
- :param vocab: a fixed mapping [str]->[int] of words to indexes
- :param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be retained
- because they are anyway contained in a pre-trained embedding set that we know in advance)
- :param analyzer: the preprocessor in charge of transforming the document string into a chain of string words
- :param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep
- :param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words that
- are not in the original vocab but that are in the known_words
- :return:
- """
- indexes=[]
- vocabsize = len(vocab)
- unk_count = 0
- knw_count = 0
- out_count = 0
- pbar = tqdm(data, desc=f'indexing documents')
- for text in pbar:
- words = analyzer(text)
- index = []
- for word in words:
- if word in vocab:
- idx = vocab[word]
- else:
- if word in known_words:
- if word not in out_of_vocabulary:
- out_of_vocabulary[word] = vocabsize+len(out_of_vocabulary)
- idx = out_of_vocabulary[word]
- out_count += 1
- else:
- idx = unk_index
- unk_count += 1
- index.append(idx)
- indexes.append(index)
- knw_count += len(index)
- pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]'
- f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]')
- return indexes
-
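-# Illustrative, self-contained example (not part of the original file) of how index() maps raw
-# documents to integer ids: in-vocabulary words get their vocab index, unknown-but-embeddable
-# words are appended to the out_of_vocabulary map, and everything else becomes the UNK index.
-def _sketch_index_usage():
-    vocab = {'good': 0, 'movie': 1, 'UNKTOKEN': 2}
-    oov = dict()
-    indexed = index(data=['good movie', 'good film'],
-                    vocab=vocab,
-                    known_words={'film'},        # e.g., covered by the pretrained embeddings
-                    analyzer=str.split,
-                    unk_index=vocab['UNKTOKEN'],
-                    out_of_vocabulary=oov)
-    return indexed, oov                          # ([[0, 1], [0, 3]], {'film': 3})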
-
-def define_pad_length(index_list):
- lengths = [len(index) for index in index_list]
- return int(np.mean(lengths)+np.std(lengths))
-
-
-def pad(index_list, pad_index, max_pad_length=None):
- pad_length = np.max([len(index) for index in index_list])
- if max_pad_length is not None:
- pad_length = min(pad_length, max_pad_length)
- for i,indexes in enumerate(index_list):
- index_list[i] = [pad_index]*(pad_length-len(indexes)) + indexes[:pad_length]
- return index_list
-
-
-class Index:
- def __init__(self, devel_raw, devel_target, test_raw, lang):
- self.lang = lang
- self.devel_raw = devel_raw
- self.devel_target = devel_target
- self.test_raw = test_raw
-
- def index(self, pretrained_vocabulary, analyzer, vocabulary):
- self.word2index = dict(vocabulary) # word2idx
- known_words = set(self.word2index.keys())
- if pretrained_vocabulary is not None:
- known_words.update(pretrained_vocabulary)
-
- self.word2index['UNKTOKEN'] = len(self.word2index)
- self.word2index['PADTOKEN'] = len(self.word2index)
- self.unk_index = self.word2index['UNKTOKEN']
- self.pad_index = self.word2index['PADTOKEN']
-
- # index documents and keep track of test terms outside the development vocabulary that are in Muse (if available)
- self.out_of_vocabulary = dict()
- self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)
- self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)
-
- self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary)
-
- print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}')
-
- def train_val_split(self, val_prop, max_val, seed):
- devel = self.devel_index
- target = self.devel_target
- devel_raw = self.devel_raw
-
- val_size = int(min(len(devel) * val_prop, max_val))
-
- self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \
- train_test_split(
- devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True
- )
-
- print(f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}')
-
- def get_word_list(self):
- def extract_word_list(word2index):
- return [w for w,i in sorted(word2index.items(), key=lambda x: x[1])]
-
- word_list = extract_word_list(self.word2index)
- word_list += extract_word_list(self.out_of_vocabulary)
- return word_list
-
- def compose_embedding_matrix(self, pretrained, supervised, Xtr=None, Ytr=None):
- print(f'[generating embedding matrix for lang {self.lang}]')
-
- self.wce_range = None
- embedding_parts = []
-
- if pretrained is not None:
- print('\t[pretrained-matrix]')
- word_list = self.get_word_list()
- muse_embeddings = pretrained.extract(word_list)
- embedding_parts.append(muse_embeddings)
- del pretrained
-
- if supervised:
- print('\t[supervised-matrix]')
- F = get_supervised_embeddings(Xtr, Ytr, reduction=None, method='dotn')
- num_missing_rows = self.vocabsize - F.shape[0]
- F = np.vstack((F, np.zeros(shape=(num_missing_rows, F.shape[1]))))
- F = torch.from_numpy(F).float()
-
- offset = 0
- if embedding_parts:
- offset = embedding_parts[0].shape[1]
- self.wce_range = [offset, offset + F.shape[1]]
-
- embedding_parts.append(F)
-
- make_dumps = False
- if make_dumps:
- print(f'Dumping Embedding Matrices ...')
- import pickle
- with open(f'../dumps/dump_{self.lang}_rcv.pkl', 'wb') as outfile:
- pickle.dump((self.lang, embedding_parts, self.word2index), outfile)
- with open(f'../dumps/corpus_{self.lang}_rcv.pkl', 'wb') as outfile2:
- pickle.dump((self.lang, self.devel_raw, self.devel_target), outfile2)
-
- self.embedding_matrix = torch.cat(embedding_parts, dim=1)
-
- print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]')
-
-
-def none_dict(langs):
- return {l:None for l in langs}
-
-
-class MultilingualIndex:
- def __init__(self): #, add_language_trace=False):
- self.l_index = {}
- self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
- # self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True, max_features=25000)
- # self.add_language_trace=add_language_trace}
-
- def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary):
- self.langs = sorted(l_devel_raw.keys())
-
- #build the vocabularies
- self.l_vectorizer.fit(l_devel_raw)
- l_vocabulary = self.l_vectorizer.vocabulary()
- l_analyzer = self.l_vectorizer.get_analyzer()
-
- for l in self.langs:
- self.l_index[l] = Index(l_devel_raw[l], l_devel_target[l], l_test_raw[l], l)
- self.l_index[l].index(l_pretrained_vocabulary[l], l_analyzer[l], l_vocabulary[l])
-
- def get_indexed(self, l_texts, pretrained_vocabulary=None):
-        assert len(self.l_index) != 0, 'cannot index new data before calling index() on the MultilingualIndex'
- l_indexed = {}
- for l, texts in l_texts.items():
- if l in self.langs:
- word2index = self.l_index[l].word2index
- known_words = set(word2index.keys())
- if pretrained_vocabulary[l] is not None:
- known_words.update(pretrained_vocabulary[l])
- l_indexed[l] = index(texts,
- vocab=word2index,
- known_words=known_words,
- analyzer=self.l_vectorizer.get_analyzer(l),
- unk_index=word2index['UNKTOKEN'],
- out_of_vocabulary=dict())
- return l_indexed
-
- def train_val_split(self, val_prop=0.2, max_val=2000, seed=42):
- for l,index in self.l_index.items():
- index.train_val_split(val_prop, max_val, seed=seed)
-
- def embedding_matrices(self, lpretrained, supervised):
- lXtr = self.get_lXtr() if supervised else none_dict(self.langs)
- lYtr = self.l_train_target() if supervised else none_dict(self.langs)
- for l,index in self.l_index.items():
- index.compose_embedding_matrix(lpretrained[l], supervised, lXtr[l], lYtr[l])
- self.sup_range = index.wce_range
-
-
- def bert_embeddings(self, bert_path, max_len=512, batch_size=64, stored_embeddings=False):
- show_gpu('GPU memory before initializing mBert model:')
- # TODO: load dumped embeddings?
- from experiment_scripts.main_mbert_extractor import do_tokenization, ExtractorDataset, DataLoader
- from transformers import BertConfig, BertForSequenceClassification
-
- print('[mBERT] generating mBERT doc embeddings')
- lXtr_raw = self.get_raw_lXtr()
- lXva_raw = self.get_raw_lXva()
- lXte_raw = self.get_raw_lXte()
-
- print('# Tokenizing datasets')
- l_tokenized_tr = do_tokenization(lXtr_raw, max_len=max_len, verbose=False)
- tr_dataset = ExtractorDataset(l_tokenized_tr)
- tr_lang_ids = tr_dataset.lang_ids
- tr_dataloader = DataLoader(tr_dataset, batch_size=batch_size, shuffle=False)
-
- l_tokenized_va = do_tokenization(lXva_raw, max_len=max_len, verbose=False)
- va_dataset = ExtractorDataset(l_tokenized_va)
- va_lang_ids = va_dataset.lang_ids
- va_dataloader = DataLoader(va_dataset, batch_size=batch_size, shuffle=False)
-
- l_tokenized_te = do_tokenization(lXte_raw, max_len=max_len, verbose=False)
- te_dataset = ExtractorDataset(l_tokenized_te)
- te_lang_ids = te_dataset.lang_ids
- te_dataloader = DataLoader(te_dataset, batch_size=batch_size, shuffle=False)
-
- num_labels = self.l_index[self.langs[0]].val_target.shape[1]
- config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True,
- num_labels=num_labels)
- model = BertForSequenceClassification.from_pretrained(bert_path,
- config=config).cuda()
- print('# Extracting document embeddings')
- tr_bert_embeddings, id2lang_tr = self.do_bert_embeddings(model, tr_dataloader, tr_lang_ids, verbose=False)
- va_bert_embeddings, id2lang_va = self.do_bert_embeddings(model, va_dataloader, va_lang_ids, verbose=False)
- te_bert_embeddings, id2lang_te = self.do_bert_embeddings(model, te_dataloader, te_lang_ids, verbose=False)
-
-        show_gpu('GPU memory after mBert model inference:')
- # Freeing GPU's memory
- import gc
- del model, tr_dataloader, va_dataloader, te_dataloader
- gc.collect()
- torch.cuda.empty_cache()
- show_gpu('GPU memory after clearing cache:')
- return tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings
-
-
- @staticmethod
- def do_bert_embeddings(model, data, lang_ids, verbose=True):
- if verbose:
- print('# Feature Extractor Mode...')
- all_batch_embeddings = {}
- id2lang = {v: k for k, v in lang_ids.items()}
- with torch.no_grad():
- for batch, lang_idx in data:
- out = model(batch.cuda())
- last_hidden_state = out[1][-1]
- batch_embeddings = last_hidden_state[:, 0, :]
- for i, l_idx in enumerate(lang_idx.numpy()):
- if id2lang[l_idx] not in all_batch_embeddings.keys():
- all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy()
- else:
- all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
- batch_embeddings[i].detach().cpu().numpy()))
-
- return all_batch_embeddings, id2lang
-
-    def get_raw_lXtr(self):
-        lXtr_raw = {k: [] for k in self.langs}
-        for lang in self.langs:
-            lXtr_raw[lang] = self.l_index[lang].train_raw
-        return lXtr_raw
-
- def get_raw_lXva(self):
- lXva_raw = {k: [] for k in self.langs}
- for lang in self.langs:
- lXva_raw[lang] = self.l_index[lang].val_raw
-
- return lXva_raw
-
- def get_raw_lXte(self):
- lXte_raw = {k: [] for k in self.langs}
- for lang in self.langs:
- lXte_raw[lang] = self.l_index[lang].test_raw
-
- return lXte_raw
-
- def get_lXtr(self):
- if not hasattr(self, 'lXtr'):
- self.lXtr = self.l_vectorizer.transform({l: index.train_raw for l, index in self.l_index.items()})
- return self.lXtr
-
- def get_lXva(self):
- if not hasattr(self, 'lXva'):
- self.lXva = self.l_vectorizer.transform({l: index.val_raw for l, index in self.l_index.items()})
- return self.lXva
-
- def get_lXte(self):
- if not hasattr(self, 'lXte'):
- self.lXte = self.l_vectorizer.transform({l: index.test_raw for l, index in self.l_index.items()})
- return self.lXte
-
- def l_vocabsize(self):
- return {l:index.vocabsize for l,index in self.l_index.items()}
-
- def l_embeddings(self):
- return {l:index.embedding_matrix for l,index in self.l_index.items()}
-
- def l_pad(self):
- return {l: index.pad_index for l, index in self.l_index.items()}
-
- def l_train_index(self):
- return {l: index.train_index for l, index in self.l_index.items()}
-
- def l_train_target(self):
- return {l: index.train_target for l, index in self.l_index.items()}
-
- def l_val_index(self):
- return {l: index.val_index for l, index in self.l_index.items()}
-
- def l_val_target(self):
- return {l: index.val_target for l, index in self.l_index.items()}
-
- def l_test_index(self):
- return {l: index.test_index for l, index in self.l_index.items()}
-
- def l_devel_index(self):
- return {l: index.devel_index for l, index in self.l_index.items()}
-
- def l_devel_target(self):
- return {l: index.devel_target for l, index in self.l_index.items()}
-
- def l_train(self):
- return self.l_train_index(), self.l_train_target()
-
- def l_val(self):
- return self.l_val_index(), self.l_val_target()
-
-
-class Batch:
- def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500):
- self.batchsize = batchsize
- self.batches_per_epoch = batches_per_epoch
- self.languages = languages
- self.lpad=lpad
- self.max_pad_length=max_pad_length
- self.init_offset()
-
- def init_offset(self):
- self.offset = {lang: 0 for lang in self.languages}
-
- def batchify(self, l_index, l_post, l_bert, llabels):
- langs = self.languages
- l_num_samples = {l:len(l_index[l]) for l in langs}
-
- max_samples = max(l_num_samples.values())
- n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0)
- if self.batches_per_epoch != -1 and self.batches_per_epoch < n_batches:
- n_batches = self.batches_per_epoch
-
- for b in range(n_batches):
- for lang in langs:
- index, labels = l_index[lang], llabels[lang]
- offset = self.offset[lang]
- if offset >= l_num_samples[lang]:
- offset = 0
- limit = offset+self.batchsize
-
- batch_slice = slice(offset, limit)
- batch = index[batch_slice]
- batch_labels = labels[batch_slice].toarray()
-
- post = None
- if l_post is not None:
- post = torch.FloatTensor(l_post[lang][batch_slice]).cuda()
-
- bert_emb = None
- if l_bert is not None:
- bert_emb = torch.FloatTensor(l_bert[lang][batch_slice]).cuda()
-
- batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length)
-
- batch = torch.LongTensor(batch).cuda()
- target = torch.FloatTensor(batch_labels).cuda()
-
- self.offset[lang] = limit
-
- yield batch, post, bert_emb, target, lang
-
-
-def batchify(l_index, l_post, llabels, batchsize, lpad, max_pad_length=500):
- langs = sorted(l_index.keys())
- nsamples = max([len(l_index[l]) for l in langs])
- nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0)
- for b in range(nbatches):
- for lang in langs:
- index, labels = l_index[lang], llabels[lang]
-
- if b * batchsize >= len(index):
- continue
- batch = index[b*batchsize:(b+1)*batchsize]
- batch_labels = labels[b*batchsize:(b+1)*batchsize].toarray()
- post = None
- if l_post is not None:
- post = torch.FloatTensor(l_post[lang][b*batchsize:(b+1)*batchsize]).cuda()
- batch = pad(batch, pad_index=lpad[lang], max_pad_length=max_pad_length)
- batch = torch.LongTensor(batch)
- target = torch.FloatTensor(batch_labels)
- yield batch.cuda(), post, target.cuda(), lang
-
-
-def batchify_unlabelled(index_list, batchsize, pad_index, max_pad_length=500):
- nsamples = len(index_list)
- nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0)
- for b in range(nbatches):
- batch = index_list[b*batchsize:(b+1)*batchsize]
- batch = pad(batch, pad_index=pad_index, max_pad_length=max_pad_length)
- batch = torch.LongTensor(batch)
- yield batch.cuda()
-
-
-def clip_gradient(model, clip_value=1e-1):
- params = list(filter(lambda p: p.grad is not None, model.parameters()))
- for p in params:
- p.grad.data.clamp_(-clip_value, clip_value)
-
-
-def predict(logits, classification_type='multilabel'):
- if classification_type == 'multilabel':
- prediction = torch.sigmoid(logits) > 0.5
- elif classification_type == 'singlelabel':
- prediction = torch.argmax(logits, dim=1).view(-1, 1)
-    else:
-        raise ValueError(f'unknown classification type {classification_type}')
-
- return prediction.detach().cpu().numpy()
-
-
-def count_parameters(model):
- return sum(p.numel() for p in model.parameters() if p.requires_grad)
-
-
-def show_gpu(msg):
- """
- ref: https://discuss.pytorch.org/t/access-gpu-memory-usage-in-pytorch/3192/4
- """
-
- def query(field):
- return (subprocess.check_output(
- ['nvidia-smi', f'--query-gpu={field}',
- '--format=csv,nounits,noheader'],
- encoding='utf-8'))
-
- def to_int(result):
- return int(result.strip().split('\n')[0])
-
- used = to_int(query('memory.used'))
- total = to_int(query('memory.total'))
- pct = used / total
- print('\n' + msg, f'{100 * pct:2.1f}% ({used} out of {total})')
-
-
-class TfidfVectorizerMultilingual:
-
- def __init__(self, **kwargs):
- self.kwargs = kwargs
-
- def fit(self, lX, ly=None):
- self.langs = sorted(lX.keys())
- self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs}
- # self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in lX.keys()}
- return self
-
- def transform(self, lX):
- return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs}
- # return {l: self.vectorizer[l].transform(lX[l]) for l in lX.keys()}
-
- def fit_transform(self, lX, ly=None):
- return self.fit(lX, ly).transform(lX)
-
- def vocabulary(self, l=None):
- if l is None:
- return {l: self.vectorizer[l].vocabulary_ for l in self.langs}
- else:
- return self.vectorizer[l].vocabulary_
-
- def get_analyzer(self, l=None):
- if l is None:
- return {l: self.vectorizer[l].build_analyzer() for l in self.langs}
- else:
- return self.vectorizer[l].build_analyzer()
-
-
-def get_learner(calibrate=False, kernel='linear', C=1):
- return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=C, random_state=1, gamma='auto', verbose=False)
-
-
-def get_params(optimc=False):
- if not optimc:
- return None
- c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
- kernel = 'rbf'
- return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
-
-
-def get_method_name(dataset, posteriors, supervised, pretrained, mbert, gru,
- gruMUSE, gruWCE, agg, allprob):
- _id = '-'
- _id_conf = [posteriors, supervised, pretrained, mbert, gru]
- _id_name = ['X', 'W', 'M', 'B', 'G']
- for i, conf in enumerate(_id_conf):
- if conf:
- _id += _id_name[i]
- _id = _id if not gruMUSE else _id + '_muse'
- _id = _id if not gruWCE else _id + '_wce'
- _id = _id if not agg else _id + '_mean'
- _id = _id if not allprob else _id + '_allprob'
-
- _dataset_path = dataset.split('/')[-1].split('_')
- dataset_id = _dataset_path[0] + _dataset_path[-1]
- return _id, dataset_id
-
-
-def get_zscl_setting(langs):
- settings = []
- for elem in langs:
- for tar in langs:
- if elem != tar:
- settings.append((elem, tar))
- return settings
\ No newline at end of file
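The padding/batching helpers in the file above left-pad (and truncate) every document of token indexes to a fixed length and then slice per-language mini-batches. The following is a minimal, self-contained sketch of that mechanic; it is illustrative only, not part of the deleted file, and the names toy_pad/toy_batchify are made up:

    import torch

    def toy_pad(index_list, pad_index, max_pad_length=500):
        # left-pad (and truncate) every document of token indexes to a common length
        pad_length = min(max(len(d) for d in index_list), max_pad_length)
        return [[pad_index] * (pad_length - len(d)) + d[:pad_length] for d in index_list]

    def toy_batchify(index_list, batchsize, pad_index):
        # yield LongTensor batches, padding each batch independently
        for b in range(0, len(index_list), batchsize):
            yield torch.LongTensor(toy_pad(index_list[b:b + batchsize], pad_index))

    docs = [[5, 3, 8], [2], [7, 7, 7, 7, 1]]
    for batch in toy_batchify(docs, batchsize=2, pad_index=0):
        print(batch.shape)  # torch.Size([2, 3]), then torch.Size([1, 5])

Unlike the deleted pad(), this sketch pads each batch to its own length rather than to one global pad_length.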
diff --git a/src/util/csv_log.py b/src/util/csv_log.py
deleted file mode 100755
index 8c11e36..0000000
--- a/src/util/csv_log.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import os
-import pandas as pd
-pd.set_option('display.max_rows', 500)
-pd.set_option('display.max_columns', 500)
-pd.set_option('display.width', 1000)
-
-
-class CSVLog:
-
- def __init__(self, file, columns=None, autoflush=True, verbose=False, overwrite=False):
- self.file = file
- self.autoflush = autoflush
- self.verbose = verbose
- if os.path.exists(file) and not overwrite:
- self.tell('Loading existing file from {}'.format(file))
- self.df = pd.read_csv(file, sep='\t')
- self.columns = sorted(self.df.columns.values.tolist())
- else:
- self.tell('File {} does not exist or overwrite=True. Creating new frame.'.format(file))
- assert columns is not None, 'columns cannot be None'
- self.columns = sorted(columns)
- dir = os.path.dirname(self.file)
- if dir and not os.path.exists(dir): os.makedirs(dir)
- self.df = pd.DataFrame(columns=self.columns)
- self.defaults={}
-
- def already_calculated(self, **kwargs):
- df = self.df
- if df.shape[0]==0:
- return False
- if len(kwargs)==0:
- kwargs = self.defaults
- for key,val in kwargs.items():
- df = df.loc[df[key]==val]
- if df.shape[0]==0: return False
- return True
-
- def set_default(self, param, value):
- self.defaults[param]=value
-
- def add_row(self, **kwargs):
- for key in self.defaults.keys():
- if key not in kwargs:
- kwargs[key]=self.defaults[key]
-        columns = sorted(kwargs.keys())
-        values = [kwargs[col_i] for col_i in columns]
- s = pd.Series(values, index=self.columns)
- self.df = self.df.append(s, ignore_index=True)
- if self.autoflush: self.flush()
- # self.tell(s.to_string())
- self.tell(kwargs)
-
- def flush(self):
- self.df.to_csv(self.file, index=False, sep='\t')
-
- def tell(self, msg):
- if self.verbose: print(msg)
-
-
-
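CSVLog above keeps a tab-separated results frame on disk and appends one row per experiment. A rough, self-contained sketch of the same idea follows; it is not the original API, the file path is hypothetical, and pd.concat replaces DataFrame.append (which was removed in pandas 2.x):

    import os
    import pandas as pd

    LOGFILE = 'log/results.tsv'                      # hypothetical path
    COLUMNS = ['method', 'lang', 'macrof1']

    def append_row(**kwargs):
        row = pd.DataFrame([kwargs], columns=COLUMNS)
        if os.path.exists(LOGFILE):
            df = pd.concat([pd.read_csv(LOGFILE, sep='\t'), row], ignore_index=True)
        else:
            os.makedirs(os.path.dirname(LOGFILE), exist_ok=True)
            df = row
        df.to_csv(LOGFILE, index=False, sep='\t')

    append_row(method='X-W', lang='en', macrof1=0.81)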
diff --git a/src/util/decompositions.py b/src/util/decompositions.py
deleted file mode 100644
index 9d14a0c..0000000
--- a/src/util/decompositions.py
+++ /dev/null
@@ -1,50 +0,0 @@
-from sklearn.decomposition import PCA
-import numpy as np
-import matplotlib.pyplot as plt
-
-
-def run_pca(dim, X):
- """
- :param dim: number of pca components to keep
- :param X: dictionary str(lang): matrix
- :return: dict lang: reduced matrix
- """
- r = dict()
- pca = PCA(n_components=dim)
- for lang in X.keys():
- r[lang] = pca.fit_transform(X[lang])
- return r
-
-
-def get_optimal_dim(X, embed_type):
- """
-    :param X: dict str(lang) : csr_matrix of embeddings, either supervised (WCE) or unsupervised (MUSE/fastText)
-    :param embed_type: (str) embedding matrix type: 'S' (supervised, WCE) or 'U' (unsupervised, MUSE/fastText)
-    :return: the largest number of components (across languages) at which the cumulative explained variance still increases
- """
- _idx = []
-
- plt.figure(figsize=(15, 10))
- if embed_type == 'U':
- plt.title(f'Unsupervised Embeddings {"TODO"} Explained Variance')
- else:
- plt.title(f'WCE Explained Variance')
- plt.xlabel('Number of Components')
- plt.ylabel('Variance (%)')
-
- for lang in X.keys():
- pca = PCA(n_components=X[lang].shape[1])
- pca.fit(X[lang])
- _r = pca.explained_variance_ratio_
- _r = np.cumsum(_r)
- plt.plot(_r, label=lang)
- for i in range(len(_r) - 1, 1, -1):
- delta = _r[i] - _r[i - 1]
- if delta > 0:
- _idx.append(i)
- break
- best_n = max(_idx)
- plt.axvline(best_n, color='r', label='optimal N')
- plt.legend()
- plt.show()
- return best_n
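get_optimal_dim above plots the per-language cumulative explained-variance curves and keeps the largest component index at which the curve is still growing. The sketch below conveys the same idea without plotting, but it uses an explicit variance threshold instead of the last-positive-delta rule, which is an assumption of this example rather than the deleted code:

    import numpy as np
    from sklearn.decomposition import PCA

    def components_for_variance(X, threshold=0.99):
        # smallest number of PCA components whose cumulative explained variance reaches the threshold
        pca = PCA(n_components=min(X.shape)).fit(X)
        cumvar = np.cumsum(pca.explained_variance_ratio_)
        return int(np.searchsorted(cumvar, threshold) + 1)

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 50)) @ rng.normal(size=(50, 50))
    print(components_for_variance(X, threshold=0.95))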
diff --git a/src/util/early_stop.py b/src/util/early_stop.py
deleted file mode 100755
index 7d72cde..0000000
--- a/src/util/early_stop.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#adapted from https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py
-import torch
-from transformers import BertForSequenceClassification
-from time import time
-from util.file import create_if_not_exist
-import warnings
-
-class EarlyStopping:
-
- def __init__(self, model, optimizer, patience=20, verbose=True, checkpoint='./checkpoint.pt', is_bert=False):
- # set patience to 0 or -1 to avoid stopping, but still keeping track of the best value and model parameters
- self.patience_limit = patience
- self.patience = patience
- self.verbose = verbose
- self.best_score = None
- self.best_epoch = None
- self.stop_time = None
- self.checkpoint = checkpoint
- self.model = model
- self.optimizer = optimizer
- self.STOP = False
- self.is_bert = is_bert
-
- def __call__(self, watch_score, epoch):
-
- if self.STOP:
- return
-
- if self.best_score is None or watch_score >= self.best_score:
- self.best_score = watch_score
- self.best_epoch = epoch
- self.stop_time = time()
- if self.checkpoint:
- self.print(f'[early-stop] improved, saving model in {self.checkpoint}')
- if self.is_bert:
- print(f'Serializing Huggingface model...')
- create_if_not_exist(self.checkpoint)
- self.model.save_pretrained(self.checkpoint)
- else:
- with warnings.catch_warnings():
- warnings.simplefilter("ignore")
- torch.save(self.model, self.checkpoint)
- # with open(self.checkpoint)
- # torch.save({'state_dict': self.model.state_dict(),
- # 'optimizer_state_dict': self.optimizer.state_dict()}, self.checkpoint)
- else:
- self.print(f'[early-stop] improved')
- self.patience = self.patience_limit
- else:
- self.patience -= 1
- if self.patience == 0:
- self.STOP = True
- self.print(f'[early-stop] patience exhausted')
- else:
- if self.patience>0: # if negative, then early-stop is ignored
- self.print(f'[early-stop] patience={self.patience}')
-
- def reinit_counter(self):
- self.STOP = False
- self.patience=self.patience_limit
-
- def restore_checkpoint(self):
- print(f'restoring best model from epoch {self.best_epoch}...')
- if self.is_bert:
- return BertForSequenceClassification.from_pretrained(self.checkpoint)
- else:
- return torch.load(self.checkpoint)
-
- def print(self, msg):
- if self.verbose:
- print(msg)
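EarlyStopping above tracks the best validation score, saves a checkpoint on every improvement, and stops once `patience` epochs pass without progress. A stripped-down sketch of the same control flow, with no model, no checkpointing, and toy scores:

    def train_with_early_stopping(val_scores, patience=3):
        best, best_epoch, budget = None, None, patience
        for epoch, score in enumerate(val_scores):
            if best is None or score >= best:
                best, best_epoch, budget = score, epoch, patience  # improvement: reset the patience budget
            else:
                budget -= 1
                if budget == 0:                                    # patience exhausted: stop training
                    break
        return best, best_epoch

    print(train_with_early_stopping([0.60, 0.72, 0.71, 0.71, 0.70, 0.69]))  # (0.72, 1)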
diff --git a/src/util/evaluation.py b/src/util/evaluation.py
deleted file mode 100644
index 41a2813..0000000
--- a/src/util/evaluation.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# from sklearn.externals.joblib import Parallel, delayed
-from joblib import Parallel, delayed
-from util.metrics import *
-from sklearn.metrics import f1_score
-import numpy as np
-import time
-
-
-def evaluation_metrics(y, y_):
- if len(y.shape)==len(y_.shape)==1 and len(np.unique(y))>2: #single-label
- raise NotImplementedError()#return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro')
- else: #the metrics I implemented assume multiclass multilabel classification as binary classifiers
- return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_)
-
-
-def soft_evaluation_metrics(y, y_):
- if len(y.shape)==len(y_.shape)==1 and len(np.unique(y))>2: #single-label
- raise NotImplementedError()#return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro')
- else: #the metrics I implemented assume multiclass multilabel classification as binary classifiers
- return smoothmacroF1(y, y_), smoothmicroF1(y, y_), smoothmacroK(y, y_), smoothmicroK(y, y_)
-
-
-def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1):
- print('evaluation (n_jobs={})'.format(n_jobs))
- if n_jobs == 1:
- return {lang: metrics(ly_true[lang], ly_pred[lang]) for lang in ly_true.keys()}
- else:
- langs = list(ly_true.keys())
- evals = Parallel(n_jobs=n_jobs)(delayed(metrics)(ly_true[lang], ly_pred[lang]) for lang in langs)
- return {lang: evals[i] for i, lang in enumerate(langs)}
-
-
-def average_results(l_eval, show=True):
- metrics = []
- for lang in l_eval.keys():
- macrof1, microf1, macrok, microk = l_eval[lang]
- metrics.append([macrof1, microf1, macrok, microk])
- if show:
- print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
-
- ave = np.mean(np.array(metrics), axis=0)
- if show:
- print('Averages: MF1, mF1, MK, mK', ave)
- return ave
-
-
-def evaluate_method(polylingual_method, lX, ly, predictor=None, soft=False, return_time=False):
- tinit = time.time()
- print('prediction for test')
- assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate'
- n_jobs = polylingual_method.n_jobs if hasattr(polylingual_method, 'n_jobs') else -1
-
- if predictor is None:
- predictor = polylingual_method.predict
-
- metrics = evaluation_metrics
- if soft is True:
- metrics = soft_evaluation_metrics
- ly_ = predictor(lX, ly)
-
- eval_ = evaluate(ly, ly_, metrics=metrics, n_jobs=n_jobs)
- if return_time:
- return eval_, time.time()-tinit
- else:
- return eval_
-
-
-def evaluate_single_lang(polylingual_method, X, y, lang, predictor=None, soft=False):
- print('prediction for test in a single language')
- if predictor is None:
- predictor = polylingual_method.predict
-
- metrics = evaluation_metrics
- if soft is True:
- metrics = soft_evaluation_metrics
-
- ly_ = predictor({lang:X})
- return metrics(y, ly_[lang])
-
-
-def get_binary_counters(polylingual_method, lX, ly, predictor=None):
- print('prediction for test')
- assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate'
- n_jobs = polylingual_method.n_jobs
- if predictor is None:
- predictor = polylingual_method.predict
- ly_ = predictor(lX)
- print('evaluation (n_jobs={})'.format(n_jobs))
- if n_jobs == 1:
- return {lang: binary_counters(ly[lang], ly_[lang]) for lang in ly.keys()}
- else:
- langs = list(ly.keys())
- evals = Parallel(n_jobs=n_jobs)(delayed(binary_counters)(ly[lang], ly_[lang]) for lang in langs)
- return {lang: evals[i] for i, lang in enumerate(langs)}
-
-
-def binary_counters(y, y_):
- y = np.reshape(y, (-1))
- assert y.shape==y_.shape and len(y.shape)==1, 'error, binary vector expected'
- counters = hard_single_metric_statistics(y, y_)
- return counters.tp, counters.tn, counters.fp, counters.fn
-
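evaluate() above computes the metrics independently for each language and parallelizes over languages with joblib. The self-contained sketch below reproduces that pattern with sklearn's macro-F1 as a stand-in metric (zero_division=1 mirrors the convention documented in util/metrics.py):

    import numpy as np
    from joblib import Parallel, delayed
    from sklearn.metrics import f1_score

    def macro_f1(y_true, y_pred):
        return f1_score(y_true, y_pred, average='macro', zero_division=1)

    ly_true = {'en': np.array([[1, 0], [0, 1]]), 'it': np.array([[1, 1], [0, 0]])}
    ly_pred = {'en': np.array([[1, 0], [0, 0]]), 'it': np.array([[1, 1], [0, 1]])}

    langs = list(ly_true.keys())
    scores = Parallel(n_jobs=2)(delayed(macro_f1)(ly_true[l], ly_pred[l]) for l in langs)
    print(dict(zip(langs, scores)))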
diff --git a/src/util/file.py b/src/util/file.py
deleted file mode 100644
index a3d0a3a..0000000
--- a/src/util/file.py
+++ /dev/null
@@ -1,44 +0,0 @@
-from os import listdir, makedirs
-from os.path import isdir, isfile, join, exists, dirname
-#from sklearn.externals.six.moves import urllib
-import urllib
-from pathlib import Path
-
-
-def download_file(url, archive_filename):
- def progress(blocknum, bs, size):
- total_sz_mb = '%.2f MB' % (size / 1e6)
- current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
- print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='')
- print("Downloading %s" % url)
- urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress)
- print("")
-
-def download_file_if_not_exists(url, archive_path):
- if exists(archive_path): return
- makedirs_if_not_exist(dirname(archive_path))
- download_file(url,archive_path)
-
-
-def ls(dir, typecheck):
- el = [f for f in listdir(dir) if typecheck(join(dir, f))]
- el.sort()
- return el
-
-def list_dirs(dir):
- return ls(dir, typecheck=isdir)
-
-def list_files(dir):
- return ls(dir, typecheck=isfile)
-
-def makedirs_if_not_exist(path):
- if not exists(path): makedirs(path)
-
-def create_if_not_exist(path):
- if not exists(path): makedirs(path)
-
-def get_parent_name(path):
- return Path(path).parent
-
-def get_file_name(path):
- return Path(path).name
diff --git a/src/util/metrics.py b/src/util/metrics.py
deleted file mode 100644
index ca688b7..0000000
--- a/src/util/metrics.py
+++ /dev/null
@@ -1,255 +0,0 @@
-import numpy as np
-from scipy.sparse import lil_matrix, issparse
-from sklearn.metrics import f1_score, accuracy_score
-
-
-
-"""
-Scikit-learn provides a full set of evaluation metrics, but it treats special cases differently.
-I.e., when the number of true positives, false positives, and false negatives amount to 0, all
-affected metrics (precision, recall, and thus F1) output 0 in scikit-learn.
-We adhere to the common practice of outputting 1 in this case, since the classifier has correctly
-classified all examples as negatives.
-"""
-
-class ContTable:
- def __init__(self, tp=0, tn=0, fp=0, fn=0):
- self.tp=tp
- self.tn=tn
- self.fp=fp
- self.fn=fn
-
- def get_d(self): return self.tp + self.tn + self.fp + self.fn
-
- def get_c(self): return self.tp + self.fn
-
- def get_not_c(self): return self.tn + self.fp
-
- def get_f(self): return self.tp + self.fp
-
- def get_not_f(self): return self.tn + self.fn
-
- def p_c(self): return (1.0*self.get_c())/self.get_d()
-
- def p_not_c(self): return 1.0-self.p_c()
-
- def p_f(self): return (1.0*self.get_f())/self.get_d()
-
- def p_not_f(self): return 1.0-self.p_f()
-
- def p_tp(self): return (1.0*self.tp) / self.get_d()
-
- def p_tn(self): return (1.0*self.tn) / self.get_d()
-
- def p_fp(self): return (1.0*self.fp) / self.get_d()
-
- def p_fn(self): return (1.0*self.fn) / self.get_d()
-
- def tpr(self):
- c = 1.0*self.get_c()
- return self.tp / c if c > 0.0 else 0.0
-
- def fpr(self):
- _c = 1.0*self.get_not_c()
- return self.fp / _c if _c > 0.0 else 0.0
-
- def __add__(self, other):
- return ContTable(tp=self.tp + other.tp, tn=self.tn + other.tn, fp=self.fp + other.fp, fn=self.fn + other.fn)
-
-def accuracy(cell):
- return (cell.tp + cell.tn)*1.0 / (cell.tp + cell.fp + cell.fn + cell.tn)
-
-def f1(cell):
- num = 2.0 * cell.tp
- den = 2.0 * cell.tp + cell.fp + cell.fn
- if den>0: return num / den
- #we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative
- return 1.0
-
-def K(cell):
- specificity, recall = 0., 0.
-
- AN = cell.tn + cell.fp
- if AN != 0:
- specificity = cell.tn*1. / AN
-
- AP = cell.tp + cell.fn
- if AP != 0:
- recall = cell.tp*1. / AP
-
- if AP == 0:
- return 2. * specificity - 1.
- elif AN == 0:
- return 2. * recall - 1.
- else:
- return specificity + recall - 1.
-
-#computes the (hard) counters tp, fp, fn, and tn from the true and predicted vectors of hard decisions
-#true_labels and predicted_labels are two vectors of shape (number_documents,)
-def hard_single_metric_statistics(true_labels, predicted_labels):
- assert len(true_labels)==len(predicted_labels), "Format not consistent between true and predicted labels."
- nd = len(true_labels)
- tp = np.sum(predicted_labels[true_labels==1])
- fp = np.sum(predicted_labels[true_labels == 0])
- fn = np.sum(true_labels[predicted_labels == 0])
- tn = nd - (tp+fp+fn)
- return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)
-
-#computes the (soft) contingency table where tp, fp, fn, and tn are the cumulative masses of the posterior
-#probabilities with respect to the true binary labels
-#true_labels and posterior_probabilities are two vectors of shape (number_documents,)
-def soft_single_metric_statistics(true_labels, posterior_probabilities):
- assert len(true_labels)==len(posterior_probabilities), "Format not consistent between true and predicted labels."
- tp = np.sum(posterior_probabilities[true_labels == 1])
- fn = np.sum(1. - posterior_probabilities[true_labels == 1])
- fp = np.sum(posterior_probabilities[true_labels == 0])
- tn = np.sum(1. - posterior_probabilities[true_labels == 0])
- return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)
-
-#if the classifier is single class, then the prediction is a vector of shape=(nD,) which causes issues when compared
-#to the true labels (of shape=(nD,1)). This method increases the dimensions of the predictions.
-def __check_consistency_and_adapt(true_labels, predictions):
- if predictions.ndim == 1:
- return __check_consistency_and_adapt(true_labels, np.expand_dims(predictions, axis=1))
- if true_labels.ndim == 1:
- return __check_consistency_and_adapt(np.expand_dims(true_labels, axis=1),predictions)
- if true_labels.shape != predictions.shape:
- raise ValueError("True and predicted label matrices shapes are inconsistent %s %s."
- % (true_labels.shape, predictions.shape))
- _,nC = true_labels.shape
- return true_labels, predictions, nC
-
-def macro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics):
- true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels)
- return np.mean([metric(metric_statistics(true_labels[:, c], predicted_labels[:, c])) for c in range(nC)])
-
-def micro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics):
- true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels)
-
- accum = ContTable()
- for c in range(nC):
- other = metric_statistics(true_labels[:, c], predicted_labels[:, c])
- accum = accum + other
-
- return metric(accum)
-
-#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
-def macroF1(true_labels, predicted_labels):
- return macro_average(true_labels,predicted_labels, f1)
-
-#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
-def microF1(true_labels, predicted_labels):
- return micro_average(true_labels, predicted_labels, f1)
-
-#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
-def macroK(true_labels, predicted_labels):
- return macro_average(true_labels,predicted_labels, K)
-
-#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
-def microK(true_labels, predicted_labels):
- return micro_average(true_labels, predicted_labels, K)
-
-#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
-#of the same shape containing real values in [0,1]
-def smoothmacroF1(true_labels, posterior_probabilities):
- return macro_average(true_labels,posterior_probabilities, f1, metric_statistics=soft_single_metric_statistics)
-
-#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
-#of the same shape containing real values in [0,1]
-def smoothmicroF1(true_labels, posterior_probabilities):
- return micro_average(true_labels, posterior_probabilities, f1, metric_statistics=soft_single_metric_statistics)
-
-#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
-#of the same shape containing real values in [0,1]
-def smoothmacroK(true_labels, posterior_probabilities):
- return macro_average(true_labels,posterior_probabilities, K, metric_statistics=soft_single_metric_statistics)
-
-#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
-#of the same shape containing real values in [0,1]
-def smoothmicroK(true_labels, posterior_probabilities):
- return micro_average(true_labels, posterior_probabilities, K, metric_statistics=soft_single_metric_statistics)
-
-
-def evaluation(y_true, y_pred, classification_type):
-
- if classification_type == 'multilabel':
- eval_function = multilabel_eval
-    elif classification_type == 'singlelabel':
-        eval_function = singlelabel_eval
-    else:
-        raise ValueError(f'unknown classification type {classification_type}')
-
- Mf1, mf1, accuracy = eval_function(y_true, y_pred)
-
- return Mf1, mf1, accuracy
-
-
-def multilabel_eval(y, y_):
-
- tp = y.multiply(y_)
-
- fn = lil_matrix(y.shape)
- true_ones = y==1
- fn[true_ones]=1-tp[true_ones]
-
- fp = lil_matrix(y.shape)
- pred_ones = y_==1
- if pred_ones.nnz>0:
- fp[pred_ones]=1-tp[pred_ones]
-
- #macro-f1
- tp_macro = np.asarray(tp.sum(axis=0), dtype=int).flatten()
- fn_macro = np.asarray(fn.sum(axis=0), dtype=int).flatten()
- fp_macro = np.asarray(fp.sum(axis=0), dtype=int).flatten()
-
- pos_pred = tp_macro+fp_macro
- pos_true = tp_macro+fn_macro
- prec=np.zeros(shape=tp_macro.shape,dtype=float)
- rec=np.zeros(shape=tp_macro.shape,dtype=float)
- np.divide(tp_macro, pos_pred, out=prec, where=pos_pred>0)
- np.divide(tp_macro, pos_true, out=rec, where=pos_true>0)
- den=prec+rec
-
- macrof1=np.zeros(shape=tp_macro.shape,dtype=float)
- np.divide(np.multiply(prec,rec),den,out=macrof1,where=den>0)
- macrof1 *=2
-
- macrof1[(pos_pred==0)*(pos_true==0)]=1
- macrof1 = np.mean(macrof1)
-
- #micro-f1
- tp_micro = tp_macro.sum()
- fn_micro = fn_macro.sum()
- fp_micro = fp_macro.sum()
- pos_pred = tp_micro + fp_micro
- pos_true = tp_micro + fn_micro
- prec = (tp_micro / pos_pred) if pos_pred>0 else 0
- rec = (tp_micro / pos_true) if pos_true>0 else 0
- den = prec+rec
- microf1 = 2*prec*rec/den if den>0 else 0
- if pos_pred==pos_true==0:
- microf1=1
-
- #accuracy
- ndecisions = np.multiply(*y.shape)
- tn = ndecisions - (tp_micro+fn_micro+fp_micro)
- acc = (tp_micro+tn)/ndecisions
-
- return macrof1,microf1,acc
-
-
-def singlelabel_eval(y, y_):
- if issparse(y_): y_ = y_.toarray().flatten()
- macrof1 = f1_score(y, y_, average='macro')
- microf1 = f1_score(y, y_, average='micro')
- acc = accuracy_score(y, y_)
- return macrof1,microf1,acc
-
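To make the special-case convention documented at the top of metrics.py concrete: for a class with no positive examples and no positive predictions (tp = fp = fn = 0), F1 is defined to be 1 rather than 0. The self-contained check below re-implements the F1-from-counts rule rather than importing the deleted module:

    import numpy as np

    def f1_from_counts(tp, fp, fn):
        den = 2 * tp + fp + fn
        return 2 * tp / den if den > 0 else 1.0   # all-negative class correctly rejected -> 1

    y_true = np.array([0, 0, 0, 0])   # the class never occurs
    y_pred = np.array([0, 0, 0, 0])   # and the classifier never predicts it
    tp = int(np.sum(y_pred[y_true == 1]))
    fp = int(np.sum(y_pred[y_true == 0]))
    fn = int(np.sum(y_true[y_pred == 0]))
    print(f1_from_counts(tp, fp, fn))  # 1.0, whereas sklearn's default f1_score reports 0.0 here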
diff --git a/src/util/parser_options.py b/src/util/parser_options.py
deleted file mode 100644
index 14d827c..0000000
--- a/src/util/parser_options.py
+++ /dev/null
@@ -1,94 +0,0 @@
-from optparse import OptionParser
-
-parser = OptionParser(usage="usage: %prog datapath [options]")
-
-parser.add_option("-d", dest='dataset', type=str, metavar='datasetpath', help=f'path to the pickled dataset')
-
-parser.add_option("-o", "--output", dest="output",
- help="Result file", type=str, default='../log/multiModal_log.csv')
-
-parser.add_option("-X", "--posteriors", dest="posteriors", action='store_true',
- help="Add posterior probabilities to the document embedding representation", default=False)
-
-parser.add_option("-W", "--supervised", dest="supervised", action='store_true',
- help="Add supervised (Word-Class Embeddings) to the document embedding representation", default=False)
-
-parser.add_option("-M", "--pretrained", dest="pretrained", action='store_true',
- help="Add pretrained MUSE embeddings to the document embedding representation", default=False)
-
-parser.add_option("-B", "--mbert", dest="mbert", action='store_true',
- help="Add multilingual Bert (mBert) document embedding representation", default=False)
-
-parser.add_option('-G', dest='gruViewGenerator', action='store_true',
- help="Add document embedding generated via recurrent net (GRU)", default=False)
-
-parser.add_option("--l2", dest="l2", action='store_true',
- help="Activates l2 normalization as a post-processing for the document embedding views",
- default=True)
-
-parser.add_option("--allprob", dest="allprob", action='store_true',
-                  help="All views are generated as posterior probabilities. This affects the supervised and pretrained "
-                       "embeddings, for which a calibrated classifier is trained to produce the posteriors",
- default=True)
-
-parser.add_option("--feat-weight", dest="feat_weight",
- help="Term weighting function to weight the averaged embeddings", type=str, default='tfidf')
-
-parser.add_option("-w", "--we-path", dest="we_path",
- help="Path to the MUSE polylingual word embeddings", default='../embeddings')
-
-parser.add_option("-s", "--set_c", dest="set_c", type=float,
- help="Set the C parameter", default=1)
-
-parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
- help="Optimize hyperparameters", default=False)
-
-parser.add_option("-j", "--n_jobs", dest="n_jobs", type=int,
- help="Number of parallel jobs (default is -1, all)", default=-1)
-
-parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
- help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
- default=300)
-
-parser.add_option("-r", "--remove-pc", dest="sif", action='store_true',
- help="Remove common component when computing dot product of word embedding matrices", default=True)
-
-parser.add_option("-z", "--zscore", dest="zscore", action='store_true',
- help="Z-score normalize matrices (WCE and MUSE)", default=True)
-
-parser.add_option("-a", "--agg", dest="agg", action='store_true',
- help="Set aggregation function of the common Z-space to average (Default: concatenation)",
- default=True)
-
-parser.add_option("-l", dest="avoid_loading", action="store_true",
- help="TODO", default=False)
-
-# ------------------------------------------------------------------------------------
-
-parser.add_option('--hidden', type=int, default=512, metavar='int',
- help='hidden lstm size (default: 512)')
-
-parser.add_option('--sup-drop', type=float, default=0.5, metavar='[0.0, 1.0]',
- help='dropout probability for the supervised matrix (default: 0.5)')
-
-parser.add_option('--tunable', action='store_true', default=False,
- help='pretrained embeddings are tunable from the beginning (default False, i.e., static)')
-
-parser.add_option('--logfile_gru', dest='logfile_gru', default='../log/log_gru_viewgenerator.csv')
-
-parser.add_option('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)')
-
-parser.add_option('--force', action='store_true', default=False,
- help='do not check if this experiment has already been run')
-
-parser.add_option('--gruMuse', dest='gruMUSE', action='store_true', default=False,
- help='Deploy MUSE embedding as embedding layer of the GRU View Generator')
-
-parser.add_option('--gruWce', dest='gruWCE', action='store_true', default=False,
- help='Deploy WCE embedding as embedding layer of the GRU View Generator')
-
-parser.add_option('--gru-path', dest='gru_path', default=None,
- help='Set the path to a pretrained GRU model (aka, -G view generator)')
-
-parser.add_option('--bert-path', dest='bert_path', default=None,
- help='Set the path to a pretrained mBERT model (aka, -B view generator)')
diff --git a/src/util/results.py b/src/util/results.py
deleted file mode 100644
index 6526303..0000000
--- a/src/util/results.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import os
-import pandas as pd
-import numpy as np
-
-class PolylingualClassificationResults:
- def __init__(self, file, autoflush=True, verbose=False):
- self.file = file
- self.columns = ['method',
- 'learner',
- 'optimp',
- 'sif',
- 'zscore',
- 'l2',
- 'wescaler',
- 'pca',
- 'id',
- 'dataset',
- 'time',
- 'lang',
- 'macrof1',
- 'microf1',
- 'macrok',
- 'microk',
- 'notes']
- self.autoflush = autoflush
- self.verbose = verbose
- if os.path.exists(file):
- self.tell('Loading existing file from {}'.format(file))
- self.df = pd.read_csv(file, sep='\t')
- else:
- self.tell('File {} does not exist. Creating new frame.'.format(file))
- dir = os.path.dirname(self.file)
- if dir and not os.path.exists(dir): os.makedirs(dir)
- self.df = pd.DataFrame(columns=self.columns)
-
- def already_calculated(self, id):
- return (self.df['id'] == id).any()
-
- def add_row(self, method, learner, optimp, sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
- s = pd.Series([method, learner, optimp,sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
- self.df = self.df.append(s, ignore_index=True)
- if self.autoflush: self.flush()
- self.tell(s.to_string())
-
- def flush(self):
- self.df.to_csv(self.file, index=False, sep='\t')
-
- def tell(self, msg):
- if self.verbose: print(msg)
-
-
-class ZSCLResults:
- def __init__(self, file, autoflush=True, verbose=False):
- self.file = file
- self.columns = ['method',
- 'optimp',
- 'source',
- 'target',
- 'id',
- 'dataset',
- 'time',
- 'lang',
- 'macrof1',
- 'microf1',
- 'macrok',
- 'microk',
- 'notes']
- self.autoflush = autoflush
- self.verbose = verbose
- if os.path.exists(file):
- self.tell('Loading existing file from {}'.format(file))
- self.df = pd.read_csv(file, sep='\t')
- else:
- self.tell('File {} does not exist. Creating new frame.'.format(file))
- dir = os.path.dirname(self.file)
- if dir and not os.path.exists(dir): os.makedirs(dir)
- self.df = pd.DataFrame(columns=self.columns)
-
- def already_calculated(self, id):
- return (self.df['id'] == id).any()
-
- def add_row(self, method, optimp, id, source, target, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
- s = pd.Series([method, optimp, id, source, target, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
- self.df = self.df.append(s, ignore_index=True)
- if self.autoflush: self.flush()
- self.tell(s.to_string())
-
- def flush(self):
- self.df.to_csv(self.file, index=False, sep='\t')
-
- def tell(self, msg):
- if self.verbose: print(msg)
diff --git a/src/util/util.py b/src/util/util.py
deleted file mode 100644
index 823c82d..0000000
--- a/src/util/util.py
+++ /dev/null
@@ -1,29 +0,0 @@
-from sklearn.svm import SVC
-from tqdm import tqdm
-import re
-import sys
-
-
-def mask_numbers(data, number_mask='numbermask'):
- mask = re.compile(r'\b[0-9][0-9.,-]*\b')
- masked = []
- for text in tqdm(data, desc='masking numbers'):
- masked.append(mask.sub(number_mask, text))
- return masked
-
-
-def fill_missing_classes(lXtr, lytr):
- pass
-
-
-def get_learner(calibrate=False, kernel='linear', C=1):
-    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=C, random_state=1, class_weight='balanced', gamma='auto')
-
-
-def get_params(optimc=False, dense=False):
-    if not optimc:
- return None
- c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
- kernel = 'rbf' if dense else 'linear'
- return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
-
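mask_numbers above replaces every standalone number with a single placeholder token before vectorization. A quick self-contained illustration of the same regex:

    import re

    mask = re.compile(r'\b[0-9][0-9.,-]*\b')
    text = 'Regulation 1234/2006 of 20 December 2006'
    print(mask.sub('numbermask', text))
    # Regulation numbermask/numbermask of numbermask December numbermask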
diff --git a/src/util_transformers/StandardizeTransformer.py b/src/util_transformers/StandardizeTransformer.py
deleted file mode 100644
index e1a10cf..0000000
--- a/src/util_transformers/StandardizeTransformer.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import numpy as np
-
-
-class StandardizeTransformer:
-
- def __init__(self, axis=0, range=None):
- assert range is None or isinstance(range, slice), 'wrong format for range, should either be None or a slice'
- self.axis = axis
- self.yetfit = False
- self.range = range
-
- def fit(self, X):
- print('fitting Standardizer...')
- std=np.std(X, axis=self.axis, ddof=1)
- self.std = np.clip(std, 1e-5, None)
- self.mean = np.mean(X, axis=self.axis)
- if self.range is not None:
- ones = np.ones_like(self.std)
- zeros = np.zeros_like(self.mean)
- ones[self.range] = self.std[self.range]
- zeros[self.range] = self.mean[self.range]
- self.std = ones
- self.mean = zeros
- self.yetfit=True
- return self
-
- def transform(self, X):
-        assert self.yetfit, 'transform called before fit'
- return (X - self.mean) / self.std
-
- def fit_transform(self, X):
- return self.fit(X).transform(X)
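StandardizeTransformer above z-scores the matrix column-wise and, when a range is given, standardizes only the columns inside that slice (the remaining columns keep mean 0 and std 1, so they pass through unchanged). A compact numpy sketch of that selective standardization, not the original class:

    import numpy as np

    def standardize_slice(X, cols=slice(None)):
        # z-score only the columns selected by `cols`; leave the rest unchanged
        mean = np.zeros(X.shape[1])
        std = np.ones(X.shape[1])
        mean[cols] = X[:, cols].mean(axis=0)
        std[cols] = np.clip(X[:, cols].std(axis=0, ddof=1), 1e-5, None)
        return (X - mean) / std

    rng = np.random.default_rng(0)
    X = np.hstack([rng.normal(10, 5, size=(100, 3)), rng.uniform(size=(100, 2))])
    Z = standardize_slice(X, cols=slice(0, 3))    # standardize only the first three columns
    print(Z[:, :3].std(axis=0, ddof=1).round(2), float(Z[:, 3:].max()) <= 1.0)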
diff --git a/src/util_transformers/__init__.py b/src/util_transformers/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/util_transformers/clesa.py b/src/util_transformers/clesa.py
deleted file mode 100644
index da17393..0000000
--- a/src/util_transformers/clesa.py
+++ /dev/null
@@ -1,110 +0,0 @@
-import numpy as np
-import sklearn
-# from sklearn.externals.joblib import Parallel, delayed
-from joblib import Parallel, delayed
-
-class ESA(object):
- """
-    Implementation of Explicit Semantic Analysis (ESA) in its mono-lingual version, as a transformer
- """
- supported_similarity = ['dot', 'cosine']
-
- def __init__(self, similarity='dot', centered=False, post=None):
- """
- :param similarity: the similarity measure between documents to be used
- :param centered: set to True to subtract the expected similarity due to randomness (experimental)
- :param post: any valid sklearn normalization method to be applied to the resulting doc embeddings, or None (default)
- """
- assert similarity in self.supported_similarity, ("Similarity method %s is not supported" % similarity)
- self.similarity = similarity
- self.centered = centered
- self.post_processing = post
- self.W = None
-
- def fit(self, W):
- """
- :param W: doc-by-term already processed matrix of wikipedia documents
- :return: self
- """
- self.W = W
- return self
-
- def transform(self, X):
- """
- :param X: doc-by-term matrix that is to be transformed into the ESA space.
- :return: the matrix X transformed into the ESA space in numpy format
- """
- assert self.W is not None, 'transform method called before fit'
-
- W = self.W
- assert X.shape[1] == W.shape[1], ('the feature spaces for X=%s and W=%s do not agree' % (str(X.shape), str(W.shape)))
-
- if self.similarity in ['dot', 'cosine']:
- if self.similarity == 'cosine':
- X = sklearn.preprocessing.normalize(X, norm='l2', axis=1, copy=True)
- W = sklearn.preprocessing.normalize(W, norm='l2', axis=1, copy=True)
-
- esa = (X.dot(W.T)).toarray()
- if self.centered:
- pX = (X > 0).sum(1) / float(X.shape[1])
- pW = (W > 0).sum(1) / float(W.shape[1])
- pXpW = np.sqrt(pX.dot(pW.transpose()))
- esa = esa - pXpW
-
- if self.post_processing:
- esa = sklearn.preprocessing.normalize(esa, norm=self.post_processing, axis=1, copy=True)
-
- return esa
-
- def fit_transform(self, W, X, Y=None):
- self.fit(W)
-        return self.transform(X)
-
- def dimensionality(self):
- return self.W.shape[0]
-
-
-
-class CLESA(ESA):
- """
-    Implementation of Cross-Lingual Explicit Semantic Analysis (CL-ESA) as a transformer
- """
-
- def __init__(self, similarity='dot', centered=False, post=False, n_jobs=-1):
- super(CLESA, self).__init__(similarity, centered, post)
- self.lESA = None
- self.langs = None
- self.n_jobs = n_jobs
-
- def fit(self, lW):
- """
- :param lW: a dictionary of {language: doc-by-term wiki matrix}
- :return: self
- """
- assert len(np.unique([W.shape[0] for W in lW.values()])) == 1, "inconsistent dimensions across languages"
-
- self.dimensions = list(lW.values())[0].shape[0]
- self.langs = list(lW.keys())
- self.lESA = {lang:ESA(self.similarity, self.centered, self.post_processing).fit(lW[lang]) for lang in self.langs}
- return self
-
- def transform(self, lX):
- """
- :param lX: dictionary of {language : doc-by-term matrix} that is to be transformed into the CL-ESA space
- :return: a dictionary {language : doc-by-dim matrix} containing the matrix-transformed versions
- """
- assert self.lESA is not None, 'transform method called before fit'
-        assert set(lX.keys()).issubset(set(self.langs)), 'languages in lX are not in scope'
- langs = list(lX.keys())
- trans = Parallel(n_jobs=self.n_jobs)(delayed(self.lESA[lang].transform)(lX[lang]) for lang in langs)
- return {lang:trans[i] for i,lang in enumerate(langs)}
-
- def fit_transform(self, lW, lX):
- return self.fit(lW).transform(lX)
-
- def languages(self):
- return list(self.lESA.keys())
-
-
-
-
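The ESA transformer above represents each document by its (dot or cosine) similarity to every document of a background collection (Wikipedia pages in the original formulation), so the projected space has one dimension per background article. A toy, self-contained version of the monolingual case, with made-up sentences:

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.preprocessing import normalize

    wiki = ['football players score goals',
            'banks lend money and set interest rates',
            'parliament approves new laws']
    docs = ['the striker scored two goals',
            'the law on interest rates was approved']

    vect = TfidfVectorizer().fit(wiki)
    W = normalize(vect.transform(wiki), norm='l2', axis=1)   # background (concept) matrix
    X = normalize(vect.transform(docs), norm='l2', axis=1)   # documents to project

    esa = X.dot(W.T).toarray()                               # one column per background article
    print(esa.shape)                                         # (2, 3)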
diff --git a/src/util_transformers/dci.py b/src/util_transformers/dci.py
deleted file mode 100644
index 6e84ed9..0000000
--- a/src/util_transformers/dci.py
+++ /dev/null
@@ -1,154 +0,0 @@
-import numpy as np
-from sklearn.preprocessing import normalize
-from scipy.sparse import csr_matrix, issparse
-from scipy.spatial.distance import cosine
-import operator
-import functools
-import math, sys
-# from sklearn.externals.joblib import Parallel, delayed
-from joblib import Parallel, delayed
-
-
-class DistributionalCorrespondenceIndexing:
-
- prob_dcf = ['linear', 'pmi']
- vect_dcf = ['cosine']
- valid_dcf = prob_dcf + vect_dcf
- valid_post = ['normal', 'l2', None]
-
- def __init__(self, dcf='cosine', post='normal', n_jobs=-1):
- """
- :param dcf: a distributional correspondence function name (e.g., 'cosine') or a callable f(u,v) which measures
-        the distributional correspondence between vectors u and v
- :param post: post-processing function to apply to document embeddings. Default is to standardize it into a
- normal distribution; other functions allowed are 'l2' or None
- """
- if post not in self.valid_post:
- raise ValueError("unknown post processing function; valid ones are [%s]" % ', '.join(self.valid_post))
-
- if isinstance(dcf, str):
- if dcf not in self.valid_dcf:
- raise ValueError("unknown dcf; use any in [%s]" % ', '.join(self.valid_dcf))
- self.dcf = getattr(DistributionalCorrespondenceIndexing, dcf)
- elif hasattr(dcf, '__call__'):
- self.dcf = dcf
- else:
-            raise ValueError('param dcf should either be a valid dcf name in [%s] or a callable comparing two vectors' % ', '.join(self.valid_dcf))
- #self.dcf = lambda u,v:dcf(u,v)
- self.post = post
- self.domains = None
- self.dFP = None
- self.n_jobs = n_jobs
-
- def fit(self, dU, dP):
- """
- :param dU: a dictionary of {domain:dsm_matrix}, where dsm is a document-by-term matrix representing the
- distributional semantic model for a specific domain
- :param dP: a dictionary {domain:pivot_matrix} where domain is a string representing each domain,
- and pivot_matrix has shape (d,p) with d the dimensionality of the distributional space, and p the
- number of pivots
- :return: self
- """
- self.domains = list(dP.keys())
- assert len(np.unique([P.shape[1] for P in dP.values()]))==1, "inconsistent number of pivots across domains"
- assert set(dU.keys())==set(self.domains), "inconsistent domains in dU and dP"
- assert not [1 for d in self.domains if dU[d].shape[0]!=dP[d].shape[0]], \
- "inconsistent dimensions between distributional and pivot spaces"
- self.dimensions = list(dP.values())[0].shape[1]
- # embed the feature space from each domain using the pivots of that domain
- #self.dFP = {d:self.dcf_dist(dU[d].transpose(), dP[d].transpose()) for d in self.domains}
- transformations = Parallel(n_jobs=self.n_jobs)(delayed(self.dcf_dist)(dU[d].transpose(),dP[d].transpose()) for d in self.domains)
- self.dFP = {d: transformations[i] for i, d in enumerate(self.domains)}
-
- def _dom_transform(self, X, FP):
- _X = X.dot(FP)
- if self.post == 'l2':
- _X = normalize(_X, norm='l2', axis=1)
- elif self.post == 'normal':
- std = np.clip(np.std(_X, axis=0), 1e-5, None)
- _X = (_X - np.mean(_X, axis=0)) / std
- return _X
-
- # dX is a dictionary of {domain:dsm}, where dsm (distributional semantic model) is, e.g., a document-by-term csr_matrix
- def transform(self, dX):
- assert self.dFP is not None, 'transform method called before fit'
-        assert set(dX.keys()).issubset(self.domains), 'domains in dX are not in scope'
- domains = list(dX.keys())
- transformations = Parallel(n_jobs=self.n_jobs)(delayed(self._dom_transform)(dX[d], self.dFP[d]) for d in domains)
- return {d: transformations[i] for i, d in enumerate(domains)}
-
- def fit_transform(self, dU, dP, dX):
- return self.fit(dU, dP).transform(dX)
-
- def _prevalence(self, v):
- if issparse(v):
- return float(v.nnz) / functools.reduce(operator.mul, v.shape, 1) #this works for arrays of any rank
- elif isinstance(v, np.ndarray):
- return float(v[v>0].size) / v.size
-
- def linear(self, u, v, D):
- tp, fp, fn, tn = self._get_4cellcounters(u, v, D)
- den1=tp+fn
- den2=tn+fp
- tpr = (tp*1./den1) if den1!=0 else 0.
- tnr = (tn*1./den2) if den2!=0 else 0.
- return tpr + tnr - 1
-
- def pmi(self, u, v, D):
- tp, fp, fn, tn = self._get_4cellcounters(u, v, D)
-
- Pxy = tp * 1. / D
- Pxny = fp * 1. / D
- Pnxy = fn * 1. / D
- Px = Pxy + Pxny
- Py = Pxy + Pnxy
-
- if (Px == 0 or Py == 0 or Pxy == 0):
- return 0.0
-
- score = math.log2(Pxy / (Px * Py))
- if np.isnan(score) or np.isinf(score):
- print('NAN')
- sys.exit()
- return score
-
- def cosine(self, u, v):
- pu = self._prevalence(u)
- pv = self._prevalence(v)
- return cosine(u, v) - np.sqrt(pu * pv)
-
- def _get_4cellcounters(self, u, v, D):
- """
- :param u: a set of indexes with a non-zero value
- :param v: a set of indexes with a non-zero value
-        :param D: the number of events (i.e., all possible indexes)
-        :return: the 4-cell contingency values (tp, fp, fn, tn)
- """
- common=u.intersection(v)
- tp = len(common)
- fp = len(u) - len(common)
- fn = len(v) - len(common)
- tn = D - (tp + fp + fn)
- return tp, fp, fn, tn
-
- def dcf_dist(self, U, V):
- nU,D = U.shape
- nV = V.shape[0]
- if issparse(U): U = U.toarray()
- if issparse(V): V = V.toarray()
-
- dists = np.zeros((nU, nV))
- if self.dcf.__name__ in self.prob_dcf:
- def hits_index(v):
- return set(np.argwhere(v>0).reshape(-1).tolist())
- Vhits = {i:hits_index(V[i]) for i in range(nV)}
- for i in range(nU):
- Ui_hits = hits_index(U[i])
- for j in range(nV):
- dists[i, j] = self.dcf(self, Ui_hits, Vhits[j], D)
- else:
- for i in range(nU):
- for j in range(nV):
- dists[i, j] = self.dcf(self, U[i], V[j])
- return dists
-
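DistributionalCorrespondenceIndexing above profiles each feature by its correspondence to a small set of pivot features; the 'cosine' variant additionally subtracts the square root of the product of the two feature prevalences. The numpy sketch below shows that feature-by-pivot profiling using plain cosine similarity (whether the scipy cosine distance in the deleted code was intended as a similarity is left aside here):

    import numpy as np

    def prevalence(v):
        return np.count_nonzero(v) / v.size

    def cosine_sim(u, v):
        den = np.linalg.norm(u) * np.linalg.norm(v)
        return float(u @ v) / den if den > 0 else 0.0

    def dcf_profile(U, P):
        # U: feature-by-document matrix, P: pivot-by-document matrix -> feature-by-pivot profile
        return np.array([[cosine_sim(u, p) - np.sqrt(prevalence(u) * prevalence(p)) for p in P] for u in U])

    rng = np.random.default_rng(0)
    U = (rng.random((6, 20)) > 0.7).astype(float)   # 6 features observed over 20 documents
    P = U[:2]                                       # pretend the first two features are the pivots
    print(dcf_profile(U, P).shape)                  # (6, 2)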
diff --git a/src/util_transformers/riboc.py b/src/util_transformers/riboc.py
deleted file mode 100644
index 7dfbc42..0000000
--- a/src/util_transformers/riboc.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import math
-import numpy as np
-from scipy.sparse import csr_matrix, issparse
-
-class RandomIndexingBoC(object):
-
- def __init__(self, latent_dimensions, non_zeros=2):
- self.latent_dimensions = latent_dimensions
- self.k = non_zeros
- self.ri_dict = None
-
- def fit_transform(self, X):
- return self.fit(X).transform(X)
-
- def fit(self, X):
- nF = X.shape[1]
- nL = self.latent_dimensions
- format = 'csr' if issparse(X) else 'np'
- self.ri_dict = _create_random_index_dictionary(shape=(nF, nL), k=self.k, normalized=True, format=format)
- return self
-
- def transform(self, X):
-        if self.ri_dict is None:
-            raise ValueError("transform method called before fit")
-        assert X.shape[1] == self.ri_dict.shape[0], 'feature space is inconsistent with the RI dictionary'
- P = X.dot(self.ri_dict)
- if issparse(P):
- P.sort_indices()
- return P
-
-
-def _create_random_index_dictionary(shape, k, normalized=False, format='csr', positive=False):
- assert format in ['csr', 'np'], 'Format should be in "[csr, np]"'
- nF, latent_dimensions = shape
- print("Creating the random index dictionary for |V|={} with {} dimensions".format(nF,latent_dimensions))
- val = 1.0 if not normalized else 1.0/math.sqrt(k)
- #ri_dict = csr_matrix((nF, latent_dimensions)) if format == 'csr' else np.zeros((nF, latent_dimensions))
- ri_dict = np.zeros((nF, latent_dimensions))
-
- #TODO: optimize
- for t in range(nF):
- dims = np.zeros(k, dtype=np.int32)
-        dims[0] = t % latent_dimensions #the first dimension is chosen in a round-robin manner (prevents gaps)
- dims[1:] = np.random.choice(latent_dimensions, size=k-1, replace=False)
- values = (np.random.randint(0,2, size=k)*2.0-1.0) * val if not positive else np.array([+val]*k)
- ri_dict[t,dims]=values
- print("\rprogress [%.2f%% complete]" % (t * 100.0 / nF), end='')
- print('\nDone')
-
- if format=='csr':
- ri_dict = csr_matrix(ri_dict)
- return ri_dict
-
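RandomIndexingBoC above projects a bag-of-words matrix through a random dictionary in which every term receives k non-zero entries of value +/- 1/sqrt(k). A compact self-contained sketch of building such a dictionary and projecting a sparse matrix (not the deleted implementation, which also enforces a round-robin first dimension):

    import numpy as np
    from scipy.sparse import random as sparse_random

    def random_index_dictionary(n_features, latent_dim, k=2, seed=0):
        rng = np.random.default_rng(seed)
        R = np.zeros((n_features, latent_dim))
        val = 1.0 / np.sqrt(k)
        for t in range(n_features):
            dims = rng.choice(latent_dim, size=k, replace=False)
            R[t, dims] = rng.choice([-val, +val], size=k)
        return R

    X = sparse_random(5, 1000, density=0.01, format='csr', random_state=0)  # 5 docs, 1000 terms
    R = random_index_dictionary(n_features=1000, latent_dim=64)
    print(X.dot(R).shape)                                                   # (5, 64)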