diff --git a/src/data/__init__.py b/src/data/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/data/languages.py b/src/data/languages.py deleted file mode 100644 index 2d03d8e..0000000 --- a/src/data/languages.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -bg = Bulgarian -cs = Czech -da = Danish -de = German -el = Greek -en = English -es = Spanish -et = Estonian -fi = Finnish -fr = French -hu = Hungarian -it = Italian -lt = Lithuanian -lv = Latvian -nl = Dutch -mt = Maltese -pl = Polish -pt = Portuguese -ro = Romanian -sk = Slovak -sl = Slovene -sv = Swedish -""" - -NLTK_LANGMAP = {'da': 'danish', 'nl': 'dutch', 'en': 'english', 'fi': 'finnish', 'fr': 'french', 'de': 'german', - 'hu': 'hungarian', 'it': 'italian', 'pt': 'portuguese', 'ro': 'romanian', 'es': 'spanish', 'sv': 'swedish'} - - -#top 10 languages in wikipedia order by the number of articles -#LANGS_10_MOST_WIKI = ['en','fr','sv','de','es','it','pt','nl','pl','ro'] - -#all languages in JRC-acquis v3 -JRC_LANGS = ['bg','cs','da','de','el','en','es','et','fi','fr','hu','it','lt','lv','mt','nl','pl','pt','ro','sk','sl','sv'] -JRC_LANGS_WITH_NLTK_STEMMING = ['da', 'nl', 'en', 'fi', 'fr', 'de', 'hu', 'it', 'pt', 'es', 'sv'] # 'romanian deleted for incompatibility issues' - -RCV2_LANGS = ['ru', 'de', 'fr', 'sv', 'no', 'da', 'pt', 'it', 'es', 'jp', 'htw', 'nl'] -RCV2_LANGS_WITH_NLTK_STEMMING = ['de', 'fr', 'sv', 'da', 'pt', 'it', 'es', 'nl'] - -lang_set = {'JRC_NLTK':JRC_LANGS_WITH_NLTK_STEMMING, 'JRC':JRC_LANGS, - 'RCV2_NLTK':RCV2_LANGS_WITH_NLTK_STEMMING, 'RCV2':RCV2_LANGS} - diff --git a/src/data/reader/__init__.py b/src/data/reader/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/data/reader/jrcacquis_reader.py b/src/data/reader/jrcacquis_reader.py deleted file mode 100644 index c0441ed..0000000 --- a/src/data/reader/jrcacquis_reader.py +++ /dev/null @@ -1,321 +0,0 @@ -from __future__ import print_function -import os, sys -from os.path import join -import tarfile -import xml.etree.ElementTree as ET -from sklearn.datasets import get_data_home -import pickle -from util.file import download_file, list_dirs, list_files -import rdflib -from rdflib.namespace import RDF, SKOS -from rdflib import URIRef -import zipfile -from data.languages import JRC_LANGS -from collections import Counter -from random import shuffle -from data.languages import lang_set - -""" -JRC Acquis' Nomenclature: -bg = Bulgarian -cs = Czech -da = Danish -de = German -el = Greek -en = English -es = Spanish -et = Estonian -fi = Finnish -fr = French -hu = Hungarian -it = Italian -lt = Lithuanian -lv = Latvian -nl = Dutch -mt = Maltese -pl = Polish -pt = Portuguese -ro = Romanian -sk = Slovak -sl = Slovene -sv = Swedish -""" - -class JRCAcquis_Document: - def __init__(self, id, name, lang, year, head, body, categories): - self.id = id - self.parallel_id = name - self.lang = lang - self.year = year - self.text = body if not head else head + "\n" + body - self.categories = categories - -# this is a workaround... 
for some reason, acutes are codified in a non-standard manner in titles -# however, it seems that the title is often appearing as the first paragraph in the text/body (with -# standard codification), so it might be preferable not to read the header after all (as here by default) -def _proc_acute(text): - for ch in ['a','e','i','o','u']: - text = text.replace('%'+ch+'acute%',ch) - return text - -def parse_document(file, year, head=False): - root = ET.parse(file).getroot() - - doc_name = root.attrib['n'] # e.g., '22006A0211(01)' - doc_lang = root.attrib['lang'] # e.g., 'es' - doc_id = root.attrib['id'] # e.g., 'jrc22006A0211_01-es' - doc_categories = [cat.text for cat in root.findall('.//teiHeader/profileDesc/textClass/classCode[@scheme="eurovoc"]')] - doc_head = _proc_acute(root.find('.//text/body/head').text) if head else '' - doc_body = '\n'.join([p.text for p in root.findall('.//text/body/div[@type="body"]/p')]) - - def raise_if_empty(field, from_file): - if isinstance(field, str): - if not field.strip(): - raise ValueError("Empty field in file %s" % from_file) - - raise_if_empty(doc_name, file) - raise_if_empty(doc_lang, file) - raise_if_empty(doc_id, file) - if head: raise_if_empty(doc_head, file) - raise_if_empty(doc_body, file) - - return JRCAcquis_Document(id=doc_id, name=doc_name, lang=doc_lang, year=year, head=doc_head, body=doc_body, categories=doc_categories) - -# removes documents without a counterpart in all other languages -def _force_parallel(doclist, langs): - n_langs = len(langs) - par_id_count = Counter([d.parallel_id for d in doclist]) - parallel_doc_ids = set([id for id,count in par_id_count.items() if count==n_langs]) - return [doc for doc in doclist if doc.parallel_id in parallel_doc_ids] - -def random_sampling_avoiding_parallel(doclist): - random_order = list(range(len(doclist))) - shuffle(random_order) - sampled_request = [] - parallel_ids = set() - for ind in random_order: - pid = doclist[ind].parallel_id - if pid not in parallel_ids: - sampled_request.append(doclist[ind]) - parallel_ids.add(pid) - print('random_sampling_no_parallel:: from {} documents to {} documents'.format(len(doclist), len(sampled_request))) - return sampled_request - - -#filters out documents which do not contain any category in the cat_filter list, and filter all labels not in cat_filter -def _filter_by_category(doclist, cat_filter): - if not isinstance(cat_filter, frozenset): - cat_filter = frozenset(cat_filter) - filtered = [] - for doc in doclist: - doc.categories = list(cat_filter & set(doc.categories)) - if doc.categories: - doc.categories.sort() - filtered.append(doc) - print("filtered %d documents out without categories in the filter list" % (len(doclist) - len(filtered))) - return filtered - -#filters out categories with less than cat_threshold documents (and filters documents containing those categories) -def _filter_by_frequency(doclist, cat_threshold): - cat_count = Counter() - for d in doclist: - cat_count.update(d.categories) - - freq_categories = [cat for cat,count in cat_count.items() if count>cat_threshold] - freq_categories.sort() - return _filter_by_category(doclist, freq_categories), freq_categories - -#select top most_frequent categories (and filters documents containing those categories) -def _most_common(doclist, most_frequent): - cat_count = Counter() - for d in doclist: - cat_count.update(d.categories) - - freq_categories = [cat for cat,count in cat_count.most_common(most_frequent)] - freq_categories.sort() - return _filter_by_category(doclist, freq_categories), 
freq_categories - -def _get_categories(request): - final_cats = set() - for d in request: - final_cats.update(d.categories) - return list(final_cats) - -def fetch_jrcacquis(langs=None, data_path=None, years=None, ignore_unclassified=True, cat_filter=None, cat_threshold=0, - parallel=None, most_frequent=-1, DOWNLOAD_URL_BASE ='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'): - - assert parallel in [None, 'force', 'avoid'], 'parallel mode not supported' - if not langs: - langs = JRC_LANGS - else: - if isinstance(langs, str): langs = [langs] - for l in langs: - if l not in JRC_LANGS: - raise ValueError('Language %s is not among the valid languages in JRC-Acquis v3' % l) - - if not data_path: - data_path = get_data_home() - - if not os.path.exists(data_path): - os.mkdir(data_path) - - request = [] - total_read = 0 - for l in langs: - file_name = 'jrc-'+l+'.tgz' - archive_path = join(data_path, file_name) - - if not os.path.exists(archive_path): - print("downloading language-specific dataset (once and for all) into %s" % data_path) - DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name) - download_file(DOWNLOAD_URL, archive_path) - print("untarring dataset...") - tarfile.open(archive_path, 'r:gz').extractall(data_path) - - documents_dir = join(data_path, l) - - print("Reading documents...") - read = 0 - for dir in list_dirs(documents_dir): - year = int(dir) - if years==None or year in years: - year_dir = join(documents_dir,dir) - pickle_name = join(data_path, 'jrc_' + l + '_' + dir + '.pickle') - if os.path.exists(pickle_name): - print("loading from file %s" % pickle_name) - l_y_documents = pickle.load(open(pickle_name, "rb")) - read += len(l_y_documents) - else: - l_y_documents = [] - all_documents = list_files(year_dir) - empty = 0 - for i,doc_file in enumerate(all_documents): - try: - jrc_doc = parse_document(join(year_dir, doc_file), year) - except ValueError: - jrc_doc = None - - if jrc_doc and (not ignore_unclassified or jrc_doc.categories): - l_y_documents.append(jrc_doc) - else: empty += 1 - if len(all_documents)>50 and ((i+1) % (len(all_documents)/50) == 0): - print('\r\tfrom %s: completed %d%%' % (year_dir, int((i+1)*100.0/len(all_documents))), end='') - read+=1 - print('\r\tfrom %s: completed 100%% read %d documents (discarded %d without categories or empty fields)\n' % (year_dir, i+1, empty), end='') - print("\t\t(Pickling object for future runs in %s)" % pickle_name) - pickle.dump(l_y_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL) - request += l_y_documents - print("Read %d documents for language %s\n" % (read, l)) - total_read += read - print("Read %d documents in total" % (total_read)) - - if parallel=='force': - request = _force_parallel(request, langs) - elif parallel == 'avoid': - request = random_sampling_avoiding_parallel(request) - - final_cats = _get_categories(request) - - if cat_filter: - request = _filter_by_category(request, cat_filter) - final_cats = _get_categories(request) - if cat_threshold > 0: - request, final_cats = _filter_by_frequency(request, cat_threshold) - if most_frequent != -1 and len(final_cats) > most_frequent: - request, final_cats = _most_common(request, most_frequent) - - return request, final_cats - -def print_cat_analysis(request): - cat_count = Counter() - for d in request: - cat_count.update(d.categories) - print("Number of active categories: {}".format(len(cat_count))) - print(cat_count.most_common()) - -# inspects the Eurovoc thesaurus in order to select a subset of categories -# currently, only 'broadest' policy (i.e., take all 
categories with no parent category), and 'all' is implemented -def inspect_eurovoc(data_path, eurovoc_skos_core_concepts_filename='eurovoc_in_skos_core_concepts.rdf', - eurovoc_url="http://publications.europa.eu/mdr/resource/thesaurus/eurovoc-20160630-0/skos/eurovoc_in_skos_core_concepts.zip", - select="broadest"): - - fullpath_pickle = join(data_path, select+'_concepts.pickle') - if os.path.exists(fullpath_pickle): - print("Pickled object found in %s. Loading it." % fullpath_pickle) - return pickle.load(open(fullpath_pickle,'rb')) - - fullpath = join(data_path, eurovoc_skos_core_concepts_filename) - if not os.path.exists(fullpath): - print("Path %s does not exist. Trying to download the skos EuroVoc file from %s" % (data_path, eurovoc_url)) - download_file(eurovoc_url, fullpath) - print("Unzipping file...") - zipped = zipfile.ZipFile(data_path + '.zip', 'r') - zipped.extract("eurovoc_in_skos_core_concepts.rdf", data_path) - zipped.close() - - print("Parsing %s" %fullpath) - g = rdflib.Graph() - g.parse(location=fullpath, format="application/rdf+xml") - - if select == "all": - print("Selecting all concepts") - all_concepts = list(g.subjects(RDF.type, SKOS.Concept)) - all_concepts = [c.toPython().split('/')[-1] for c in all_concepts] - all_concepts.sort() - selected_concepts = all_concepts - elif select=="broadest": - print("Selecting broadest concepts (those without any other broader concept linked to it)") - all_concepts = set(g.subjects(RDF.type, SKOS.Concept)) - narrower_concepts = set(g.subjects(SKOS.broader, None)) - broadest_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - narrower_concepts)] - broadest_concepts.sort() - selected_concepts = broadest_concepts - elif select=="leaves": - print("Selecting leaves concepts (those not linked as broader of any other concept)") - all_concepts = set(g.subjects(RDF.type, SKOS.Concept)) - broad_concepts = set(g.objects(None, SKOS.broader)) - leave_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - broad_concepts)] - leave_concepts.sort() - selected_concepts = leave_concepts - else: - raise ValueError("Selection policy %s is not currently supported" % select) - - print("%d %s concepts found" % (len(selected_concepts), leave_concepts)) - print("Pickling concept list for faster further requests in %s" % fullpath_pickle) - pickle.dump(selected_concepts, open(fullpath_pickle, 'wb'), pickle.HIGHEST_PROTOCOL) - - return selected_concepts - -if __name__ == '__main__': - - def single_label_fragment(doclist): - single = [d for d in doclist if len(d.categories) < 2] - final_categories = set([d.categories[0] if d.categories else [] for d in single]) - print('{} single-label documents ({} categories) from the original {} documents'.format(len(single), - len(final_categories), - len(doclist))) - return single, list(final_categories) - - train_years = list(range(1986, 2006)) - test_years = [2006] - cat_policy = 'leaves' - most_common_cat = 300 - # JRC_DATAPATH = "/media/moreo/1TB Volume/Datasets/JRC_Acquis_v3" - JRC_DATAPATH = "/storage/andrea/FUNNELING/data/JRC_Acquis_v3" - langs = lang_set['JRC_NLTK'] - cat_list = inspect_eurovoc(JRC_DATAPATH, select=cat_policy) - sys.exit() - - training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=train_years,cat_filter=cat_list, cat_threshold=1, parallel=None,most_frequent=most_common_cat) - test_docs, label_namestest = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=test_years, cat_filter=label_names,parallel='force') - - print('JRC-train: {} 
documents, {} labels'.format(len(training_docs), len(label_names))) - print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest))) - - training_docs, label_names = single_label_fragment(training_docs) - test_docs, label_namestest = single_label_fragment(test_docs) - - print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names))) - print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest))) - - diff --git a/src/data/reader/rcv_reader.py b/src/data/reader/rcv_reader.py deleted file mode 100644 index cd4b416..0000000 --- a/src/data/reader/rcv_reader.py +++ /dev/null @@ -1,225 +0,0 @@ -from zipfile import ZipFile -import xml.etree.ElementTree as ET -from data.languages import RCV2_LANGS_WITH_NLTK_STEMMING, RCV2_LANGS -from util.file import list_files -from sklearn.datasets import get_data_home -import gzip -from os.path import join, exists -from util.file import download_file_if_not_exists -import re -from collections import Counter -import numpy as np -import sys - -""" -RCV2's Nomenclature: -ru = Russian -da = Danish -de = German -es = Spanish -lat = Spanish Latin-American (actually is also 'es' in the collection) -fr = French -it = Italian -nl = Dutch -pt = Portuguese -sv = Swedish -ja = Japanese -htw = Chinese -no = Norwegian -""" - -RCV1_TOPICHIER_URL = "http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a02-orig-topics-hierarchy/rcv1.topics.hier.orig" -RCV1PROC_BASE_URL= 'http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files' -RCV1_BASE_URL = "http://www.daviddlewis.com/resources/testcollections/rcv1/" -RCV2_BASE_URL = "http://trec.nist.gov/data/reuters/reuters.html" - -rcv1_test_data_gz = ['lyrl2004_tokens_test_pt0.dat.gz', - 'lyrl2004_tokens_test_pt1.dat.gz', - 'lyrl2004_tokens_test_pt2.dat.gz', - 'lyrl2004_tokens_test_pt3.dat.gz'] - -rcv1_train_data_gz = ['lyrl2004_tokens_train.dat.gz'] - -rcv1_doc_cats_data_gz = 'rcv1-v2.topics.qrels.gz' - -RCV2_LANG_DIR = {'ru':'REUTE000', - 'de':'REUTE00A', - 'fr':'REUTE00B', - 'sv':'REUTE001', - 'no':'REUTE002', - 'da':'REUTE003', - 'pt':'REUTE004', - 'it':'REUTE005', - 'es':'REUTE006', - 'lat':'REUTE007', - 'jp':'REUTE008', - 'htw':'REUTE009', - 'nl':'REUTERS_'} - - -class RCV_Document: - - def __init__(self, id, text, categories, date='', lang=None): - self.id = id - self.date = date - self.lang = lang - self.text = text - self.categories = categories - - -class ExpectedLanguageException(Exception): pass -class IDRangeException(Exception): pass - - -nwords = [] - -def parse_document(xml_content, assert_lang=None, valid_id_range=None): - root = ET.fromstring(xml_content) - if assert_lang: - if assert_lang not in root.attrib.values(): - if assert_lang != 'jp' or 'ja' not in root.attrib.values(): # some documents are attributed to 'ja', others to 'jp' - raise ExpectedLanguageException('error: document of a different language') - - doc_id = root.attrib['itemid'] - if valid_id_range is not None: - if not valid_id_range[0] <= int(doc_id) <= valid_id_range[1]: - raise IDRangeException - - doc_categories = [cat.attrib['code'] for cat in - root.findall('.//metadata/codes[@class="bip:topics:1.0"]/code')] - - doc_date = root.attrib['date'] - doc_title = root.find('.//title').text - doc_headline = root.find('.//headline').text - doc_body = '\n'.join([p.text for p in root.findall('.//text/p')]) - - if not doc_body: - raise ValueError('Empty document') - - if doc_title is None: doc_title = '' - if doc_headline is None or doc_headline in doc_title: 
doc_headline = '' - text = '\n'.join([doc_title, doc_headline, doc_body]).strip() - - text_length = len(text.split()) - global nwords - nwords.append(text_length) - - return RCV_Document(id=doc_id, text=text, categories=doc_categories, date=doc_date, lang=assert_lang) - - -def fetch_RCV1(data_path, split='all'): - - assert split in ['train', 'test', 'all'], 'split should be "train", "test", or "all"' - - request = [] - labels = set() - read_documents = 0 - lang = 'en' - - training_documents = 23149 - test_documents = 781265 - - if split == 'all': - split_range = (2286, 810596) - expected = training_documents+test_documents - elif split == 'train': - split_range = (2286, 26150) - expected = training_documents - else: - split_range = (26151, 810596) - expected = test_documents - - global nwords - nwords=[] - for part in list_files(data_path): - if not re.match('\d+\.zip', part): continue - target_file = join(data_path, part) - assert exists(target_file), \ - "You don't seem to have the file "+part+" in " + data_path + ", and the RCV1 corpus can not be downloaded"+\ - " w/o a formal permission. Please, refer to " + RCV1_BASE_URL + " for more information." - zipfile = ZipFile(target_file) - for xmlfile in zipfile.namelist(): - xmlcontent = zipfile.open(xmlfile).read() - try: - doc = parse_document(xmlcontent, assert_lang=lang, valid_id_range=split_range) - labels.update(doc.categories) - request.append(doc) - read_documents += 1 - except ValueError: - print('\n\tskipping document {} with inconsistent language label: expected language {}'.format(part+'/'+xmlfile, lang)) - except (IDRangeException, ExpectedLanguageException) as e: - pass - print('\r[{}] read {} documents'.format(part, len(request)), end='') - if read_documents == expected: break - if read_documents == expected: break - print() - print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords))) - return request, list(labels) - - -def fetch_RCV2(data_path, languages=None): - - if not languages: - languages = list(RCV2_LANG_DIR.keys()) - else: - assert set(languages).issubset(set(RCV2_LANG_DIR.keys())), 'languages not in scope' - - request = [] - labels = set() - global nwords - nwords=[] - for lang in languages: - path = join(data_path, RCV2_LANG_DIR[lang]) - lang_docs_read = 0 - for part in list_files(path): - target_file = join(path, part) - assert exists(target_file), \ - "You don't seem to have the file "+part+" in " + path + ", and the RCV2 corpus can not be downloaded"+\ - " w/o a formal permission. Please, refer to " + RCV2_BASE_URL + " for more information." 
- zipfile = ZipFile(target_file) - for xmlfile in zipfile.namelist(): - xmlcontent = zipfile.open(xmlfile).read() - try: - doc = parse_document(xmlcontent, assert_lang=lang) - labels.update(doc.categories) - request.append(doc) - lang_docs_read += 1 - except ValueError: - print('\n\tskipping document {} with inconsistent language label: expected language {}'.format(RCV2_LANG_DIR[lang]+'/'+part+'/'+xmlfile, lang)) - except (IDRangeException, ExpectedLanguageException) as e: - pass - print('\r[{}] read {} documents, {} for language {}'.format(RCV2_LANG_DIR[lang]+'/'+part, len(request), lang_docs_read, lang), end='') - print() - print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords))) - return request, list(labels) - - -def fetch_topic_hierarchy(path, topics='all'): - assert topics in ['all', 'leaves'] - - download_file_if_not_exists(RCV1_TOPICHIER_URL, path) - hierarchy = {} - for line in open(path, 'rt'): - parts = line.strip().split() - parent,child = parts[1],parts[3] - if parent not in hierarchy: - hierarchy[parent]=[] - hierarchy[parent].append(child) - - del hierarchy['None'] - del hierarchy['Root'] - print(hierarchy) - - if topics=='all': - topics = set(hierarchy.keys()) - for parent in hierarchy.keys(): - topics.update(hierarchy[parent]) - return list(topics) - elif topics=='leaves': - parents = set(hierarchy.keys()) - childs = set() - for parent in hierarchy.keys(): - childs.update(hierarchy[parent]) - return list(childs.difference(parents)) - - diff --git a/src/data/reader/wikipedia_tools.py b/src/data/reader/wikipedia_tools.py deleted file mode 100644 index 83e11e3..0000000 --- a/src/data/reader/wikipedia_tools.py +++ /dev/null @@ -1,304 +0,0 @@ -from __future__ import print_function -# import ijson -# from ijson.common import ObjectBuilder -import os, sys -from os.path import join -from bz2 import BZ2File -import pickle -from util.file import list_dirs, list_files, makedirs_if_not_exist -from itertools import islice -import re -from xml.sax.saxutils import escape -import numpy as np - -policies = ["IN_ALL_LANGS", "IN_ANY_LANG"] - -""" -This file contains a set of tools for processing the Wikipedia multilingual documents. -In what follows, it is assumed that you have already downloaded a Wikipedia dump (https://dumps.wikimedia.org/) -and have processed each document to clean their texts with one of the tools: - - https://github.com/aesuli/wikipediatools (Python 2) - - https://github.com/aesuli/wikipedia-extractor (Python 3) -It is also assumed you have dowloaded the all-entities json file (e.g., https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2) - -This tools help you in: - - Processes the huge json file as a stream, and create a multilingual map of corresponding titles for each language. - Set the policy = "IN_ALL_LANGS" will extract only titles which appear in all (AND) languages, whereas "IN_ANY_LANG" - extracts all titles appearing in at least one (OR) language (warning: this will creates a huge dictionary). - Note: This version is quite slow. Although it is run once for all, you might be prefer to take a look at "Wikidata in BigQuery". - - Processes the huge json file as a stream a creates a simplified file which occupies much less and is far faster to be processed. - - Use the multilingual map to extract, from the clean text versions, individual xml documents containing all - language-specific versions from the document. 
- - Fetch the multilingual documents to create, for each of the specified languages, a list containing all documents, - in a way that the i-th element from any list refers to the same element in the respective language. -""" - -def _doc_generator(text_path, langs): - dotspace = re.compile(r'\.(?!\s)') - for l,lang in enumerate(langs): - print("Processing language <%s> (%d/%d)" % (lang, l, len(langs))) - lang_dir = join(text_path, lang) - split_dirs = list_dirs(lang_dir) - for sd,split_dir in enumerate(split_dirs): - print("\tprocessing split_dir <%s> (%d/%d)" % (split_dir, sd, len(split_dirs))) - split_files = list_files(join(lang_dir, split_dir)) - for sf,split_file in enumerate(split_files): - print("\t\tprocessing split_file <%s> (%d/%d)" % (split_file, sf, len(split_files))) - with BZ2File(join(lang_dir, split_dir, split_file), 'r', buffering=1024*1024) as fi: - while True: - doc_lines = list(islice(fi, 3)) - if doc_lines: - # some sentences are not followed by a space after the dot - doc_lines[1] = dotspace.sub('. ', doc_lines[1]) - # [workaround] I found   html symbol was not treated, and unescaping it now might not help... - doc_lines[1] = escape(doc_lines[1].replace(" ", " ")) - yield doc_lines, lang - else: break - -def _extract_title(doc_lines): - m = re.search('title="(.+?)"', doc_lines[0]) - if m: return m.group(1).decode('utf-8') - else: raise ValueError("Error in xml format: document head is %s" % doc_lines[0]) - -def _create_doc(target_file, id, doc, lang): - doc[0] = doc[0][:-2] + (' lang="%s">\n'%lang) - with open(target_file, 'w') as fo: - fo.write('\n'%id) - [fo.write(line) for line in doc] - fo.write('') - -def _append_doc(target_file, doc, lang): - doc[0] = doc[0][:-2] + (' lang="%s">\n' % lang) - with open(target_file, 'r', buffering=1024*1024) as fi: - lines = fi.readlines() - if doc[0] in lines[1::3]: - return - lines[-1:-1]=doc - with open(target_file, 'w', buffering=1024*1024) as fo: - [fo.write(line) for line in lines] - -def extract_multilingual_documents(inv_dict, langs, text_path, out_path): - if not os.path.exists(out_path): - os.makedirs(out_path) - for lang in langs: - if lang not in inv_dict: - raise ValueError("Lang %s is not in the dictionary" % lang) - - docs_created = len(list_files(out_path)) - print("%d multilingual documents found." % docs_created) - for doc,lang in _doc_generator(text_path, langs): - title = _extract_title(doc) - - if title in inv_dict[lang]: - #pass - ids = inv_dict[lang][title] - for id in ids: - target_file = join(out_path, id) + ".xml" - if os.path.exists(target_file): - _append_doc(target_file, doc, lang) - else: - _create_doc(target_file, id, doc, lang) - docs_created+=1 - else: - if not re.match('[A-Za-z]+', title): - print("Title <%s> for lang <%s> not in dictionary" % (title, lang)) - - - -def extract_multilingual_titles_from_simplefile(data_dir, filename, langs, policy="IN_ALL_LANGS", return_both=True): - simplified_file = join(data_dir,filename) - - if policy not in policies: - raise ValueError("Policy %s not supported." % policy) - print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs))) - - lang_prefix = list(langs) - lang_prefix.sort() - pickle_prefix = "extraction_" + "_".join(lang_prefix) + "." + policy - pickle_dict = join(data_dir, pickle_prefix+".multi_dict.pickle") - pickle_invdict = join(data_dir, pickle_prefix+".multi_invdict.pickle") - if os.path.exists(pickle_invdict): - if return_both and os.path.exists(pickle_dict): - print("Pickled files found in %s. 
Loading both (direct and inverse dictionaries)." % data_dir) - return pickle.load(open(pickle_dict, 'rb')), pickle.load(open(pickle_invdict, 'rb')) - elif return_both==False: - print("Pickled file found in %s. Loading inverse dictionary only." % pickle_invdict) - return pickle.load(open(pickle_invdict, 'rb')) - - multiling_titles = {} - inv_dict = {lang:{} for lang in langs} - - def process_entry(line): - parts = line.strip().split('\t') - id = parts[0] - if id in multiling_titles: - raise ValueError("id <%s> already indexed" % id) - - titles = dict(((lang_title[:lang_title.find(':')],lang_title[lang_title.find(':')+1:].decode('utf-8')) for lang_title in parts[1:])) - for lang in titles.keys(): - if lang not in langs: - del titles[lang] - - if (policy == "IN_ALL_LANGS" and len(titles) == len(langs))\ - or (policy == "IN_ANY_LANG" and len(titles) > 0): - multiling_titles[id] = titles - for lang, title in titles.items(): - if title in inv_dict[lang]: - inv_dict[lang][title].append(id) - inv_dict[lang][title] = [id] - - with BZ2File(simplified_file, 'r', buffering=1024*1024*16) as fi: - completed = 0 - try: - for line in fi: - process_entry(line) - completed += 1 - if completed % 10 == 0: - print("\rCompleted %d\ttitles %d" % (completed,len(multiling_titles)), end="") - print("\rCompleted %d\t\ttitles %d" % (completed, len(multiling_titles)), end="\n") - except EOFError: - print("\nUnexpected file ending... saving anyway") - - print("Pickling dictionaries in %s" % data_dir) - pickle.dump(multiling_titles, open(pickle_dict,'wb'), pickle.HIGHEST_PROTOCOL) - pickle.dump(inv_dict, open(pickle_invdict, 'wb'), pickle.HIGHEST_PROTOCOL) - print("Done") - - return (multiling_titles, inv_dict) if return_both else inv_dict - - -# in https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2 -def simplify_json_file(data_dir, langs, policy="IN_ALL_LANGS", json_file = "latest-all.json.bz2"): - latest_all_json_file = join(data_dir,json_file) - - if policy not in policies: - raise ValueError("Policy %s not supported." % policy) - - print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs))) - - lang_prefix = list(langs) - lang_prefix.sort() - simple_titles_path = join(data_dir, "extraction_" + "_".join(lang_prefix) + "." 
+ policy) - - def process_entry(last, fo): - global written - id = last["id"] - titles = None - if policy == "IN_ALL_LANGS" and langs.issubset(last["labels"].keys()): - titles = {lang: last["labels"][lang]["value"] for lang in langs} - elif policy == "IN_ANY_LANG": - titles = {lang: last["labels"][lang]["value"] for lang in langs if lang in last["labels"]} - - if titles: - fo.write((id+'\t'+'\t'.join([lang+':'+titles[lang] for lang in titles.keys()])+'\n').encode('utf-8')) - return True - else: - return False - - written = 0 - with BZ2File(latest_all_json_file, 'r', buffering=1024*1024*16) as fi, \ - BZ2File(join(data_dir,simple_titles_path+".simple.bz2"),'w') as fo: - builder = ObjectBuilder() - completed = 0 - for event, value in ijson.basic_parse(fi, buf_size=1024*1024*16): - builder.event(event, value) - if len(builder.value)>1: - if process_entry(builder.value.pop(0), fo): written += 1 - completed += 1 - print("\rCompleted %d\ttitles %d" % (completed,written), end="") - print("") - - #process the last entry - process_entry(builder.value.pop(0)) - - return simple_titles_path - -""" -Reads all multi-lingual documents in a folder (see wikipedia_tools.py to generate them) and generates, for each of the -specified languages, a list contanining all its documents, so that the i-th element of any list refers to the language- -specific version of the same document. Documents are forced to contain version in all specified languages and to contain -a minimum number of words; otherwise it is discarded. -""" -class MinWordsNotReached(Exception): pass -class WrongDocumentFormat(Exception): pass - -def _load_multilang_doc(path, langs, min_words=100): - import xml.etree.ElementTree as ET - from xml.etree.ElementTree import Element, ParseError - try: - root = ET.parse(path).getroot() - doc = {} - for lang in langs: - doc_body = root.find('.//doc[@lang="' + lang + '"]') - if isinstance(doc_body, Element): - n_words = len(doc_body.text.split(' ')) - if n_words >= min_words: - doc[lang] = doc_body.text - else: - raise MinWordsNotReached - else: - raise WrongDocumentFormat - except ParseError: - raise WrongDocumentFormat - return doc - -#returns the multilingual documents mapped by language, and a counter with the number of documents readed -def fetch_wikipedia_multilingual(wiki_multi_path, langs, min_words=100, deletions=False, max_documents=-1, pickle_name=None): - if pickle_name and os.path.exists(pickle_name): - print("unpickling %s" % pickle_name) - return pickle.load(open(pickle_name, 'rb')) - - multi_docs = list_files(wiki_multi_path) - mling_documents = {l:[] for l in langs} - valid_documents = 0 - minwords_exception = 0 - wrongdoc_exception = 0 - for d,multi_doc in enumerate(multi_docs): - print("\rProcessed %d/%d documents, valid %d/%d, few_words=%d, few_langs=%d" % - (d, len(multi_docs), valid_documents, len(multi_docs), minwords_exception, wrongdoc_exception),end="") - doc_path = join(wiki_multi_path, multi_doc) - try: - m_doc = _load_multilang_doc(doc_path, langs, min_words) - valid_documents += 1 - for l in langs: - mling_documents[l].append(m_doc[l]) - except MinWordsNotReached: - minwords_exception += 1 - if deletions: os.remove(doc_path) - except WrongDocumentFormat: - wrongdoc_exception += 1 - if deletions: os.remove(doc_path) - if max_documents>0 and valid_documents>=max_documents: - break - - if pickle_name: - print("Pickling wikipedia documents object in %s" % pickle_name) - pickle.dump(mling_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL) - - return mling_documents - 
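# Editor's note: a minimal, hedged usage sketch chaining fetch_wikipedia_multilingual
# (defined above) with random_wiki_sample (defined below). The path, language list and
# sample size are illustrative assumptions, not values taken from this repository.
def _example_fetch_and_sample_wiki(wiki_multi_path='../Datasets/Wikipedia/multilingual_docs',
                                   langs=('en', 'es', 'it'), max_documents=5000):
    # read the language-aligned documents, requiring at least 50 words per language version
    l_wiki = fetch_wikipedia_multilingual(wiki_multi_path, list(langs), min_words=50, deletions=False)
    # keep a random subsample while preserving the cross-language alignment
    l_wiki = random_wiki_sample(l_wiki, max_documents)
    # l_wiki maps each language code to a list of texts; the i-th element of every
    # list refers to the same article in its language-specific version
    return l_wiki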
-def random_wiki_sample(l_wiki, max_documents): - if max_documents == 0: return None - langs = list(l_wiki.keys()) - assert len(np.unique([len(l_wiki[l]) for l in langs])) == 1, 'documents across languages do not seem to be aligned' - ndocs_per_lang = len(l_wiki[langs[0]]) - if ndocs_per_lang > max_documents: - sel = set(np.random.choice(list(range(ndocs_per_lang)), max_documents, replace=False)) - for lang in langs: - l_wiki[lang] = [d for i, d in enumerate(l_wiki[lang]) if i in sel] - return l_wiki - - -if __name__ == "__main__": - - wikipedia_home = "../Datasets/Wikipedia" - - from data.languages import JRC_LANGS_WITH_NLTK_STEMMING as langs - langs = frozenset(langs) - - simple_titles_path = simplify_json_file(wikipedia_home, langs, policy="IN_ALL_LANGS", json_file="latest-all.json.bz2") - _, inv_dict = extract_multilingual_titles_from_simplefile(wikipedia_home, simple_titles_path, langs, policy='IN_ALL_LANGS') - extract_multilingual_documents(inv_dict, langs, join(wikipedia_home,'text'), - out_path=join(wikipedia_home, 'multilingual_docs_JRC_NLTK')) - - diff --git a/src/data/text_preprocessor.py b/src/data/text_preprocessor.py deleted file mode 100644 index 1a6e3ae..0000000 --- a/src/data/text_preprocessor.py +++ /dev/null @@ -1,33 +0,0 @@ -from nltk.corpus import stopwords -from data.languages import NLTK_LANGMAP -from nltk import word_tokenize -from nltk.stem import SnowballStemmer - - -def preprocess_documents(documents, lang): - tokens = NLTKStemTokenizer(lang, verbose=True) - sw = stopwords.words(NLTK_LANGMAP[lang]) - return [' '.join([w for w in tokens(doc) if w not in sw]) for doc in documents] - - -class NLTKStemTokenizer(object): - - def __init__(self, lang, verbose=False): - if lang not in NLTK_LANGMAP: - raise ValueError('Language %s is not supported in NLTK' % lang) - self.verbose=verbose - self.called = 0 - self.wnl = SnowballStemmer(NLTK_LANGMAP[lang]) - self.cache = {} - - def __call__(self, doc): - self.called += 1 - if self.verbose: - print("\r\t\t[documents processed %d]" % (self.called), end="") - tokens = word_tokenize(doc) - stems = [] - for t in tokens: - if t not in self.cache: - self.cache[t] = self.wnl.stem(t) - stems.append(self.cache[t]) - return stems \ No newline at end of file diff --git a/src/data/tsr_function__.py b/src/data/tsr_function__.py deleted file mode 100755 index 0af8690..0000000 --- a/src/data/tsr_function__.py +++ /dev/null @@ -1,270 +0,0 @@ -import math -import numpy as np -from scipy.stats import t -from joblib import Parallel, delayed -from scipy.sparse import csr_matrix, csc_matrix - - -def get_probs(tpr, fpr, pc): - # tpr = p(t|c) = p(tp)/p(c) = p(tp)/(p(tp)+p(fn)) - # fpr = p(t|_c) = p(fp)/p(_c) = p(fp)/(p(fp)+p(tn)) - pnc = 1.0 - pc - tp = tpr * pc - fn = pc - tp - fp = fpr * pnc - tn = pnc - fp - return ContTable(tp=tp, fn=fn, fp=fp, tn=tn) - - -def apply_tsr(tpr, fpr, pc, tsr): - cell = get_probs(tpr, fpr, pc) - return tsr(cell) - - -def positive_information_gain(cell): - if cell.tpr() < cell.fpr(): - return 0.0 - else: - return information_gain(cell) - - -def posneg_information_gain(cell): - ig = information_gain(cell) - if cell.tpr() < cell.fpr(): - return -ig - else: - return ig - - -def __ig_factor(p_tc, p_t, p_c): - den = p_t * p_c - if den != 0.0 and p_tc != 0: - return p_tc * math.log(p_tc / den, 2) - else: - return 0.0 - - -def information_gain(cell): - return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \ - __ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) +\ - __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + 
\ - __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c()) - - -def information_gain_mod(cell): - return (__ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())) \ - - (__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) + __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c())) - - -def pointwise_mutual_information(cell): - return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) - - -def gain_ratio(cell): - pc = cell.p_c() - pnc = 1.0 - pc - norm = pc * math.log(pc, 2) + pnc * math.log(pnc, 2) - return information_gain(cell) / (-norm) - - -def chi_square(cell): - den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c() - if den==0.0: return 0.0 - num = gss(cell)**2 - return num / den - - -def relevance_frequency(cell): - a = cell.tp - c = cell.fp - if c == 0: c = 1 - return math.log(2.0 + (a * 1.0 / c), 2) - - -def idf(cell): - if cell.p_f()>0: - return math.log(1.0 / cell.p_f()) - return 0.0 - - -def gss(cell): - return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn() - - -def conf_interval(xt, n): - if n>30: - z2 = 3.84145882069 # norm.ppf(0.5+0.95/2.0)**2 - else: - z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2 - p = (xt + 0.5 * z2) / (n + z2) - amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2)) - return p, amplitude - -def strength(minPosRelFreq, minPos, maxNeg): - if minPos > maxNeg: - return math.log(2.0 * minPosRelFreq, 2.0) - else: - return 0.0 - - -#set cancel_features=True to allow some features to be weighted as 0 (as in the original article) -#however, for some extremely imbalanced dataset caused all documents to be 0 -def conf_weight(cell, cancel_features=False): - c = cell.get_c() - not_c = cell.get_not_c() - tp = cell.tp - fp = cell.fp - - pos_p, pos_amp = conf_interval(tp, c) - neg_p, neg_amp = conf_interval(fp, not_c) - - min_pos = pos_p-pos_amp - max_neg = neg_p+neg_amp - den = (min_pos + max_neg) - minpos_relfreq = min_pos / (den if den != 0 else 1) - - str_tplus = strength(minpos_relfreq, min_pos, max_neg); - - if str_tplus == 0 and not cancel_features: - return 1e-20 - - return str_tplus; - - -class ContTable: - - def __init__(self, tp=0, tn=0, fp=0, fn=0): - self.tp=tp - self.tn=tn - self.fp=fp - self.fn=fn - - def get_d(self): return self.tp + self.tn + self.fp + self.fn - - def get_c(self): return self.tp + self.fn - - def get_not_c(self): return self.tn + self.fp - - def get_f(self): return self.tp + self.fp - - def get_not_f(self): return self.tn + self.fn - - def p_c(self): return (1.0*self.get_c())/self.get_d() - - def p_not_c(self): return 1.0-self.p_c() - - def p_f(self): return (1.0*self.get_f())/self.get_d() - - def p_not_f(self): return 1.0-self.p_f() - - def p_tp(self): return (1.0*self.tp) / self.get_d() - - def p_tn(self): return (1.0*self.tn) / self.get_d() - - def p_fp(self): return (1.0*self.fp) / self.get_d() - - def p_fn(self): return (1.0*self.fn) / self.get_d() - - def tpr(self): - c = 1.0*self.get_c() - return self.tp / c if c > 0.0 else 0.0 - - def fpr(self): - _c = 1.0*self.get_not_c() - return self.fp / _c if _c > 0.0 else 0.0 - - -def round_robin_selection(X, Y, k, tsr_function=positive_information_gain): - print(f'[selectiong {k} terms]') - nC = Y.shape[1] - FC = get_tsr_matrix(get_supervised_matrix(X, Y), tsr_function).T - best_features_idx = np.argsort(-FC, axis=0).flatten() - tsr_values = FC.flatten() - selected_indexes_set = set() - selected_indexes = list() - selected_value = list() - from_category = list() - round_robin = iter(best_features_idx) - values_iter = 
iter(tsr_values) - round=0 - while len(selected_indexes) < k: - term_idx = next(round_robin) - term_val = next(values_iter) - if term_idx not in selected_indexes_set: - selected_indexes_set.add(term_idx) - selected_indexes.append(term_idx) - selected_value.append(term_val) - from_category.append(round) - round = (round + 1) % nC - return np.asarray(selected_indexes, dtype=int), np.asarray(selected_value, dtype=float), np.asarray(from_category) - - -def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD): - tp_ = len(positive_document_indexes & feature_document_indexes) - fp_ = len(feature_document_indexes - positive_document_indexes) - fn_ = len(positive_document_indexes - feature_document_indexes) - tn_ = nD - (tp_ + fp_ + fn_) - return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_) - - -def category_tables(feature_sets, category_sets, c, nD, nF): - return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)] - - -""" -Computes the nC x nF supervised matrix M where Mcf is the 4-cell contingency table for feature f and class c. -Efficiency O(nF x nC x log(S)) where S is the sparse factor -""" -def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1): - nD, nF = coocurrence_matrix.shape - nD2, nC = label_matrix.shape - - if nD != nD2: - raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' % - (coocurrence_matrix.shape,label_matrix.shape)) - - def nonzero_set(matrix, col): - return set(matrix[:, col].nonzero()[0]) - - if isinstance(coocurrence_matrix, csr_matrix): - coocurrence_matrix = csc_matrix(coocurrence_matrix) - feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)] - category_sets = [nonzero_set(label_matrix, c) for c in range(nC)] - cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC)) - return np.array(cell_matrix) - -# obtains the matrix T where Tcf=tsr(f,c) is the tsr score for category c and feature f -def get_tsr_matrix(cell_matrix, tsr_score_funtion): - nC,nF = cell_matrix.shape - tsr_matrix = [[tsr_score_funtion(cell_matrix[c,f]) for f in range(nF)] for c in range(nC)] - return np.array(tsr_matrix) - - -""" The Fisher-score [1] is not computed on the 4-cell contingency table, but can -take as input any real-valued feature column (e.g., tf-idf weights). -feat is the feature vector, and c is a binary classification vector. -This implementation covers only the binary case, while the formula is defined for multiclass -single-label scenarios, for which the version [2] might be preferred. -[1] R.O. Duda, P.E. Hart, and D.G. Stork. Pattern classification. Wiley-interscience, 2012. -[2] Gu, Q., Li, Z., & Han, J. (2012). Generalized fisher score for feature selection. arXiv preprint arXiv:1202.3725. 
-""" -def fisher_score_binary(feat, c): - neg = np.ones_like(c) - c - - npos = np.sum(c) - nneg = np.sum(neg) - - mupos = np.mean(feat[c == 1]) - muneg = np.mean(feat[neg == 1]) - mu = np.mean(feat) - - stdpos = np.std(feat[c == 1]) - stdneg = np.std(feat[neg == 1]) - - num = npos * ((mupos - mu) ** 2) + nneg * ((muneg - mu) ** 2) - den = npos * (stdpos ** 2) + nneg * (stdneg ** 2) - - if den>0: - return num / den - else: - return num diff --git a/src/dataset_builder.py b/src/dataset_builder.py deleted file mode 100644 index b9650c7..0000000 --- a/src/dataset_builder.py +++ /dev/null @@ -1,710 +0,0 @@ -from os.path import join, exists -from nltk.corpus import stopwords -from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer -from sklearn.preprocessing import MultiLabelBinarizer -from data.reader.jrcacquis_reader import * -from data.languages import lang_set, NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING -from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2, fetch_topic_hierarchy -from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents -import pickle -import numpy as np -from sklearn.model_selection import train_test_split -from scipy.sparse import issparse -import itertools -from tqdm import tqdm -import re -from scipy.sparse import csr_matrix - - -class MultilingualDataset: - """ - A multilingual dataset is a dictionary of training and test documents indexed by language code. - Train and test sets are represented as tuples of the type (X,Y,ids), where X is a matrix representation of the - documents (e.g., a document-by-term sparse csr_matrix), Y is a document-by-label binary np.array indicating the - labels of each document, and ids is a list of document-identifiers from the original collection. - """ - - def __init__(self): - self.dataset_name = "" - self.multiling_dataset = {} - - def add(self, lang, Xtr, Ytr, Xte, Yte, tr_ids=None, te_ids=None): - self.multiling_dataset[lang] = ((Xtr, Ytr, tr_ids), (Xte, Yte, te_ids)) - - def save(self, file): - self.sort_indexes() - pickle.dump(self, open(file, 'wb'), pickle.HIGHEST_PROTOCOL) - return self - - def __getitem__(self, item): - if item in self.langs(): - return self.multiling_dataset[item] - return None - - @classmethod - def load(cls, file): - data = pickle.load(open(file, 'rb')) - data.sort_indexes() - return data - - @classmethod - def load_ids(cls, file): - data = pickle.load(open(file, 'rb')) - tr_ids = {lang:tr_ids for (lang,((_,_,tr_ids), (_,_,_))) in data.multiling_dataset.items()} - te_ids = {lang: te_ids for (lang, ((_, _, _), (_, _, te_ids))) in data.multiling_dataset.items()} - return tr_ids, te_ids - - def sort_indexes(self): - for (lang, ((Xtr,_,_),(Xte,_,_))) in self.multiling_dataset.items(): - if issparse(Xtr): Xtr.sort_indices() - if issparse(Xte): Xte.sort_indices() - - def set_view(self, categories=None, languages=None): - if categories is not None: - if isinstance(categories, int): - categories = np.array([categories]) - elif isinstance(categories, list): - categories = np.array(categories) - self.categories_view = categories - if languages is not None: - self.languages_view = languages - - def training(self, mask_numbers=False, target_as_csr=False): - return self.lXtr(mask_numbers), self.lYtr(as_csr=target_as_csr) - - def test(self, mask_numbers=False, target_as_csr=False): - return self.lXte(mask_numbers), self.lYte(as_csr=target_as_csr) - - def lXtr(self, mask_numbers=False): - proc = lambda x:_mask_numbers(x) if mask_numbers else x - # return {lang: Xtr for (lang, ((Xtr, 
_, _), _)) in self.multiling_dataset.items() if lang in self.langs()} - return {lang:proc(Xtr) for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()} - - def lXte(self, mask_numbers=False): - proc = lambda x: _mask_numbers(x) if mask_numbers else x - # return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if lang in self.langs()} - return {lang:proc(Xte) for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()} - - def lYtr(self, as_csr=False): - lY = {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()} - if as_csr: - lY = {l:csr_matrix(Y) for l,Y in lY.items()} - return lY - - def lYte(self, as_csr=False): - lY = {lang:self.cat_view(Yte) for (lang, (_,(_,Yte,_))) in self.multiling_dataset.items() if lang in self.langs()} - if as_csr: - lY = {l:csr_matrix(Y) for l,Y in lY.items()} - return lY - - def cat_view(self, Y): - if hasattr(self, 'categories_view'): - return Y[:,self.categories_view] - else: - return Y - - def langs(self): - if hasattr(self, 'languages_view'): - langs = self.languages_view - else: - langs = sorted(self.multiling_dataset.keys()) - return langs - - def num_categories(self): - return self.lYtr()[self.langs()[0]].shape[1] - - def show_dimensions(self): - def shape(X): - return X.shape if hasattr(X, 'shape') else len(X) - for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items(): - if lang not in self.langs(): continue - print("Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(lang, shape(Xtr), self.cat_view(Ytr).shape, shape(Xte), self.cat_view(Yte).shape)) - - def show_category_prevalences(self): - nC = self.num_categories() - accum_tr = np.zeros(nC, dtype=np.int) - accum_te = np.zeros(nC, dtype=np.int) - in_langs = np.zeros(nC, dtype=np.int) # count languages with at least one positive example (per category) - for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items(): - if lang not in self.langs(): continue - prev_train = np.sum(self.cat_view(Ytr), axis=0) - prev_test = np.sum(self.cat_view(Yte), axis=0) - accum_tr += prev_train - accum_te += prev_test - in_langs += (prev_train>0)*1 - print(lang+'-train', prev_train) - print(lang+'-test', prev_test) - print('all-train', accum_tr) - print('all-test', accum_te) - - return accum_tr, accum_te, in_langs - - def set_labels(self, labels): - self.labels = labels - -def _mask_numbers(data): - mask_moredigit = re.compile(r'\s[\+-]?\d{5,}([\.,]\d*)*\b') - mask_4digit = re.compile(r'\s[\+-]?\d{4}([\.,]\d*)*\b') - mask_3digit = re.compile(r'\s[\+-]?\d{3}([\.,]\d*)*\b') - mask_2digit = re.compile(r'\s[\+-]?\d{2}([\.,]\d*)*\b') - mask_1digit = re.compile(r'\s[\+-]?\d{1}([\.,]\d*)*\b') - masked = [] - for text in tqdm(data, desc='masking numbers'): - text = ' ' + text - text = mask_moredigit.sub(' MoreDigitMask', text) - text = mask_4digit.sub(' FourDigitMask', text) - text = mask_3digit.sub(' ThreeDigitMask', text) - text = mask_2digit.sub(' TwoDigitMask', text) - text = mask_1digit.sub(' OneDigitMask', text) - masked.append(text.replace('.','').replace(',','').strip()) - return masked - - - - -# ---------------------------------------------------------------------------------------------------------------------- -# Helpers -# ---------------------------------------------------------------------------------------------------------------------- -def get_active_labels(doclist): - cat_list = set() - for d in doclist: - cat_list.update(d.categories) 
- return list(cat_list) - -def filter_by_categories(doclist, keep_categories): - catset = frozenset(keep_categories) - for d in doclist: - d.categories = list(set(d.categories).intersection(catset)) - -def __years_to_str(years): - if isinstance(years, list): - if len(years) > 1: - return str(years[0])+'-'+str(years[-1]) - return str(years[0]) - return str(years) - - -# ---------------------------------------------------------------------------------------------------------------------- -# Matrix builders -# ---------------------------------------------------------------------------------------------------------------------- -def build_independent_matrices(dataset_name, langs, training_docs, test_docs, label_names, wiki_docs=[], preprocess=True): - """ - Builds the document-by-term weighted matrices for each language. Representations are independent of each other, - i.e., each language-specific matrix lies in a dedicate feature space. - :param dataset_name: the name of the dataset (str) - :param langs: list of languages (str) - :param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) - :param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) - :param label_names: list of names of labels (str) - :param wiki_docs: doc-list (optional), if specified, project all wiki docs in the feature spaces built for the languages - :param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming) - :return: a MultilingualDataset. If wiki_docs has been specified, a dictionary lW is also returned, which indexes - by language the processed wikipedia documents in their respective language-specific feature spaces - """ - - mlb = MultiLabelBinarizer() - mlb.fit([label_names]) - - lW = {} - - multilingual_dataset = MultilingualDataset() - multilingual_dataset.dataset_name = dataset_name - multilingual_dataset.set_labels(mlb.classes_) - for lang in langs: - print("\nprocessing %d training, %d test, %d wiki for language <%s>" % - (len(training_docs[lang]), len(test_docs[lang]), len(wiki_docs[lang]) if wiki_docs else 0, lang)) - - tr_data, tr_labels, IDtr = zip(*training_docs[lang]) - te_data, te_labels, IDte = zip(*test_docs[lang]) - - if preprocess: - tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True, - tokenizer=NLTKStemTokenizer(lang, verbose=True), - stop_words=stopwords.words(NLTK_LANGMAP[lang])) - else: - tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True) - - Xtr = tfidf.fit_transform(tr_data) - Xte = tfidf.transform(te_data) - if wiki_docs: - lW[lang] = tfidf.transform(wiki_docs[lang]) - - Ytr = mlb.transform(tr_labels) - Yte = mlb.transform(te_labels) - - multilingual_dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte) - - multilingual_dataset.show_dimensions() - multilingual_dataset.show_category_prevalences() - - if wiki_docs: - return multilingual_dataset, lW - else: - return multilingual_dataset - - -# creates a MultilingualDataset where matrices shares a single yuxtaposed feature space -def build_juxtaposed_matrices(dataset_name, langs, training_docs, test_docs, label_names, preprocess=True): - """ - Builds the document-by-term weighted matrices for each language. Representations are not independent of each other, - since all of them lie on the same yuxtaposed feature space. 
- :param dataset_name: the name of the dataset (str) - :param langs: list of languages (str) - :param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) - :param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) - :param label_names: list of names of labels (str) - :param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming) - :return: a MultilingualDataset. If wiki_docs has been specified, a dictionary lW is also returned, which indexes - by language the processed wikipedia documents in their respective language-specific feature spaces - """ - - multiling_dataset = MultilingualDataset() - multiling_dataset.dataset_name = dataset_name - - mlb = MultiLabelBinarizer() - mlb.fit([label_names]) - - multiling_dataset.set_labels(mlb.classes_) - - tr_data_stack = [] - for lang in langs: - print("\nprocessing %d training and %d test for language <%s>" % (len(training_docs[lang]), len(test_docs[lang]), lang)) - tr_data, tr_labels, tr_ID = zip(*training_docs[lang]) - te_data, te_labels, te_ID = zip(*test_docs[lang]) - if preprocess: - tr_data = preprocess_documents(tr_data, lang) - te_data = preprocess_documents(te_data, lang) - tr_data_stack.extend(tr_data) - multiling_dataset.add(lang, tr_data, tr_labels, te_data, te_labels, tr_ID, te_ID) - - tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True) - tfidf.fit(tr_data_stack) - - for lang in langs: - print("\nweighting documents for language <%s>" % (lang)) - (tr_data, tr_labels, tr_ID), (te_data, te_labels, te_ID) = multiling_dataset[lang] - Xtr = tfidf.transform(tr_data) - Xte = tfidf.transform(te_data) - Ytr = mlb.transform(tr_labels) - Yte = mlb.transform(te_labels) - multiling_dataset.add(lang,Xtr,Ytr,Xte,Yte,tr_ID,te_ID) - - multiling_dataset.show_dimensions() - return multiling_dataset - - -# ---------------------------------------------------------------------------------------------------------------------- -# Methods to recover the original documents from the MultilingualDataset's ids -# ---------------------------------------------------------------------------------------------------------------------- -""" -This method has been added a posteriori, to create document embeddings using the polylingual embeddings of the recent -article 'Word Translation without Parallel Data'; basically, it takes one of the splits and retrieves the RCV documents -from the doc ids and then pickles an object (tr_docs, te_docs, label_names) in the outpath -""" -def retrieve_rcv_documents_from_dataset(datasetpath, rcv1_data_home, rcv2_data_home, outpath): - - tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath) - assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te' - langs = list(tr_ids.keys()) - - print('fetching the datasets') - rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train') - rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en']) - - filter_by_categories(rcv1_documents, labels_rcv2) - filter_by_categories(rcv2_documents, labels_rcv1) - - label_names = get_active_labels(rcv1_documents + rcv2_documents) - print('Active labels in RCV1/2 {}'.format(len(label_names))) - - print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names))) - print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents])) - - all_docs = rcv1_documents + rcv2_documents - mlb = MultiLabelBinarizer() - 
mlb.fit([label_names]) - - dataset = MultilingualDataset() - for lang in langs: - analyzer = CountVectorizer(strip_accents='unicode', min_df=3, - stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer() - - Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in tr_ids[lang]]) - Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in te_ids[lang]]) - Xtr = [' '.join(analyzer(d)) for d in Xtr] - Xte = [' '.join(analyzer(d)) for d in Xte] - Ytr = mlb.transform(Ytr) - Yte = mlb.transform(Yte) - dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte) - - dataset.save(outpath) - -""" -Same thing but for JRC-Acquis -""" -def retrieve_jrc_documents_from_dataset(datasetpath, jrc_data_home, train_years, test_years, cat_policy, most_common_cat, outpath): - - tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath) - assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te' - langs = list(tr_ids.keys()) - - print('fetching the datasets') - - cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy) - training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years, - cat_filter=cat_list, cat_threshold=1, parallel=None, - most_frequent=most_common_cat) - test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, - parallel='force') - - def filter_by_id(doclist, ids): - ids_set = frozenset(itertools.chain.from_iterable(ids.values())) - return [x for x in doclist if (x.parallel_id+'__'+x.id) in ids_set] - - training_docs = filter_by_id(training_docs, tr_ids) - test_docs = filter_by_id(test_docs, te_ids) - - print('jrc: {} train, {} test, {} categories'.format(len(training_docs), len(test_docs), len(label_names))) - - mlb = MultiLabelBinarizer() - mlb.fit([label_names]) - - dataset = MultilingualDataset() - for lang in langs: - analyzer = CountVectorizer(strip_accents='unicode', min_df=3, - stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer() - - Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in training_docs if d.lang == lang]) - Xte,Yte,IDte = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in test_docs if d.lang == lang]) - Xtr = [' '.join(analyzer(d)) for d in Xtr] - Xte = [' '.join(analyzer(d)) for d in Xte] - Ytr = mlb.transform(Ytr) - Yte = mlb.transform(Yte) - dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte) - - dataset.save(outpath) - -# ---------------------------------------------------------------------------------------------------------------------- -# Dataset Generators -# ---------------------------------------------------------------------------------------------------------------------- -def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test_years, cat_policy, most_common_cat=-1, max_wiki=5000, run=0): - from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample - - - """ - Prepare all datasets for JRC-Acquis. The datasets include the "feature-independent" version, the - "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices. 
- In all cases, training documents are strictly non-parallel, and test documents are strictly parallel - :param jrc_data_home: path to the raw JRC-Acquis documents (it will be downloaded if not found), and the path where - all splits will be generated - :param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py) - :param langs: the list of languages to consider (as defined in data/languages.py) - :param train_years: a list of ints containing the years to be considered as training documents - :param test_years: a list of ints containing the years to be considered as test documents - :param cat_policy: a string indicating which category selection policy to apply. Valid policies are, e.g., "all" - (select all categories), "broadest" (select only the broadest concepts in the taxonomy), or "leaves" (select the - leaves concepts in the taxonomy). See inspect_eurovoc from data/reader/jrcacquis_reader.py for more details - :param most_common_cat: the maximum number of most common categories to consider, or -1 to keep them all - :param max_wiki: the maximum number of wikipedia documents to consider (default 5000) - :param run: a numeric label naming the random split (useful to keep track of different runs) - :return: None - """ - - name = 'JRCacquis' - run = '_run' + str(run) - config_name = 'jrc_nltk_' + __years_to_str(train_years) + \ - 'vs' + __years_to_str(test_years) + \ - '_' + cat_policy + \ - ('_top' + str(most_common_cat) if most_common_cat!=-1 else '') + \ - '_noparallel_processed' - - indep_path = join(jrc_data_home, config_name + run + '.pickle') - upper_path = join(jrc_data_home, config_name + run + '_upper.pickle') - yuxta_path = join(jrc_data_home, config_name + run + '_yuxtaposed.pickle') - wiki_path = join(jrc_data_home, config_name + run + '.wiki.pickle') - wiki_docs_path = join(jrc_data_home, config_name + '.wiki.raw.pickle') - - cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy) - training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years, - cat_filter=cat_list, cat_threshold=1, parallel=None, - most_frequent=most_common_cat) - test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, - parallel='force') - - print('Generating feature-independent dataset...') - training_docs_no_parallel = random_sampling_avoiding_parallel(training_docs) - - def _group_by_lang(doc_list, langs): - return {lang: [(d.text, d.categories, d.parallel_id + '__' + d.id) for d in doc_list if d.lang == lang] - for lang in langs} - - training_docs = _group_by_lang(training_docs, langs) - training_docs_no_parallel = _group_by_lang(training_docs_no_parallel, langs) - test_docs = _group_by_lang(test_docs, langs) - if not exists(indep_path): - wiki_docs=None - if max_wiki>0: - if not exists(wiki_docs_path): - wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False) - wiki_docs = random_wiki_sample(wiki_docs, max_wiki) - pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL) - else: - wiki_docs = pickle.load(open(wiki_docs_path, 'rb')) - wiki_docs = random_wiki_sample(wiki_docs, max_wiki) - - if wiki_docs: - lang_data, wiki_docs = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names, wiki_docs) - pickle.dump(wiki_docs, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL) - else: - lang_data = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names) - - 
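# For the defaults used in the __main__ block below (train_years=1958..2005, test_years=[2006],
# cat_policy='all', most_common_cat=300, run=0), and assuming __years_to_str renders a list of years
# as 'first-last', the pickles saved below would be named:
#   jrc_nltk_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle             (feature-independent)
#   jrc_nltk_1958-2005vs2006_all_top300_noparallel_processed_run0_upper.pickle       (English-only upper bound)
#   jrc_nltk_1958-2005vs2006_all_top300_noparallel_processed_run0_yuxtaposed.pickle  (juxtaposed features)
#   jrc_nltk_1958-2005vs2006_all_top300_noparallel_processed_run0.wiki.pickle        (wikipedia matrices)
# (the '_doclist_' variants referenced by the experiment scripts are derived from the first of these
# via retrieve_jrc_documents_from_dataset).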
lang_data.save(indep_path) - - print('Generating upper-bound (English-only) dataset...') - if not exists(upper_path): - training_docs_eng_only = {'en':training_docs['en']} - test_docs_eng_only = {'en':test_docs['en']} - build_independent_matrices(name, ['en'], training_docs_eng_only, test_docs_eng_only, label_names).save(upper_path) - - print('Generating yuxtaposed dataset...') - if not exists(yuxta_path): - build_juxtaposed_matrices(name, langs, training_docs_no_parallel, test_docs, label_names).save(yuxta_path) - - -def prepare_rcv_datasets(outpath, rcv1_data_home, rcv2_data_home, wiki_data_home, langs, - train_for_lang=1000, test_for_lang=1000, max_wiki=5000, preprocess=True, run=0): - from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample - """ - Prepare all datasets for RCV1/RCV2. The datasets include the "feature-independent" version, the - "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices. - - :param outpath: path where all splits will be dumped - :param rcv1_data_home: path to the RCV1-v2 dataset (English only) - :param rcv2_data_home: path to the RCV2 dataset (all languages other than English) - :param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py) - :param langs: the list of languages to consider (as defined in data/languages.py) - :param train_for_lang: maximum number of training documents per language - :param test_for_lang: maximum number of test documents per language - :param max_wiki: the maximum number of wikipedia documents to consider (default 5000) - :param preprocess: whether or not to apply language-specific preprocessing (stopwords removal and stemming) - :param run: a numeric label naming the random split (useful to keep track of different runs) - :return: None - """ - - assert 'en' in langs, 'English is not in requested languages, but is needed for some datasets' - assert len(langs)>1, 'the multilingual dataset cannot be built with only one dataset' - assert not preprocess or set(langs).issubset(set(RCV2_LANGS_WITH_NLTK_STEMMING+['en'])), \ - "languages not in RCV1-v2/RCV2 scope or not in valid for NLTK's processing" - - name = 'RCV1/2' - run = '_run' + str(run) - config_name = 'rcv1-2_nltk_trByLang'+str(train_for_lang)+'_teByLang'+str(test_for_lang)+\ - ('_processed' if preprocess else '_raw') - - indep_path = join(outpath, config_name + run + '.pickle') - upper_path = join(outpath, config_name + run +'_upper.pickle') - yuxta_path = join(outpath, config_name + run +'_yuxtaposed.pickle') - wiki_path = join(outpath, config_name + run + '.wiki.pickle') - wiki_docs_path = join(outpath, config_name + '.wiki.raw.pickle') - - print('fetching the datasets') - rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train') - rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l!='en']) - filter_by_categories(rcv1_documents, labels_rcv2) - filter_by_categories(rcv2_documents, labels_rcv1) - - label_names = get_active_labels(rcv1_documents+rcv2_documents) - print('Active labels in RCV1/2 {}'.format(len(label_names))) - - print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names))) - print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents])) - - lang_docs = {lang: [d for d in rcv1_documents + rcv2_documents if d.lang == lang] for lang in langs} - - # for the upper bound there are no parallel versions, so for the English case, we take as 
many documents as there - # would be in the multilingual case -- then we will extract from them only train_for_lang for the other cases - print('Generating upper-bound (English-only) dataset...') - train, test = train_test_split(lang_docs['en'], train_size=train_for_lang*len(langs), test_size=test_for_lang, shuffle=True) - train_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in train]} - test_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in test]} - build_independent_matrices(name, ['en'], train_lang_doc_map, test_lang_doc_map, label_names).save(upper_path) - - train_lang_doc_map['en'] = train_lang_doc_map['en'][:train_for_lang] - for lang in langs: - if lang=='en': continue # already split - test_take = min(test_for_lang, len(lang_docs[lang])-train_for_lang) - train, test = train_test_split(lang_docs[lang], train_size=train_for_lang, test_size=test_take, shuffle=True) - train_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in train] - test_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in test] - - print('Generating feature-independent dataset...') - wiki_docs=None - if max_wiki>0: - if not exists(wiki_docs_path): - wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False) - wiki_docs = random_wiki_sample(wiki_docs, max_wiki) - pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL) - else: - wiki_docs = pickle.load(open(wiki_docs_path, 'rb')) - wiki_docs = random_wiki_sample(wiki_docs, max_wiki) - - if wiki_docs: - lang_data, wiki_docs_matrix = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess) - pickle.dump(wiki_docs_matrix, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL) - else: - lang_data = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess) - - lang_data.save(indep_path) - - print('Generating yuxtaposed dataset...') - build_juxtaposed_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, preprocess).save(yuxta_path) - - -# ---------------------------------------------------------------------------------------------------------------------- -# Methods to generate full RCV and JRC datasets -# ---------------------------------------------------------------------------------------------------------------------- -def full_rcv_(rcv1_data_home, rcv2_data_home, outpath, langs): - - - print('fetching the datasets') - rcv1_train_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train') - rcv1_test_documents, labels_rcv1_test = fetch_RCV1(rcv1_data_home, split='test') - rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en']) - - filter_by_categories(rcv1_train_documents, labels_rcv2) - filter_by_categories(rcv1_test_documents, labels_rcv2) - filter_by_categories(rcv2_documents, labels_rcv1) - - label_names = get_active_labels(rcv1_train_documents + rcv2_documents) - print('Active labels in RCV1/2 {}'.format(len(label_names))) - - print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_train_documents), len(rcv1_test_documents), len(label_names))) - print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents])) - - mlb = MultiLabelBinarizer() - mlb.fit([label_names]) - - all_docs = rcv1_train_documents + rcv1_test_documents + rcv2_documents - lang_docs = {lang: [d for d in all_docs if d.lang == lang] for lang in langs} - - def get_ids(doclist): - return 
frozenset([d.id for d in doclist]) - - tr_ids = {'en': get_ids(rcv1_train_documents)} - te_ids = {'en': get_ids(rcv1_test_documents)} - for lang in langs: - if lang == 'en': continue - tr_ids[lang], te_ids[lang] = train_test_split([d.id for d in lang_docs[lang]], test_size=.3) - - dataset = MultilingualDataset() - dataset.dataset_name = 'RCV1/2-full' - for lang in langs: - print(f'processing {lang} with {len(tr_ids[lang])} training documents and {len(te_ids[lang])} documents') - analyzer = CountVectorizer( - strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang]) - ).build_analyzer() - - Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in tr_ids[lang]]) - Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in te_ids[lang]]) - Xtr = [' '.join(analyzer(d)) for d in Xtr] - Xte = [' '.join(analyzer(d)) for d in Xte] - Ytr = mlb.transform(Ytr) - Yte = mlb.transform(Yte) - dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte) - - dataset.save(outpath) - - -def full_jrc_(jrc_data_home, langs, train_years, test_years, outpath, cat_policy='all', most_common_cat=300): - - print('fetching the datasets') - cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy) - training_docs, label_names = fetch_jrcacquis( - langs=langs, data_path=jrc_data_home, years=train_years, cat_filter=cat_list, cat_threshold=1, parallel=None, most_frequent=most_common_cat - ) - test_docs, _ = fetch_jrcacquis( - langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, parallel='force' - ) - - def _group_by_lang(doc_list, langs): - return {lang: [d for d in doc_list if d.lang == lang] for lang in langs} - - training_docs = _group_by_lang(training_docs, langs) - test_docs = _group_by_lang(test_docs, langs) - - mlb = MultiLabelBinarizer() - mlb.fit([label_names]) - - dataset = MultilingualDataset() - data.dataset_name = 'JRC-Acquis-full' - for lang in langs: - analyzer = CountVectorizer( - strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang]) - ).build_analyzer() - - Xtr, Ytr, IDtr = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in training_docs[lang] if d.lang == lang]) - Xte, Yte, IDte = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in test_docs[lang] if d.lang == lang]) - Xtr = [' '.join(analyzer(d)) for d in Xtr] - Xte = [' '.join(analyzer(d)) for d in Xte] - Ytr = mlb.transform(Ytr) - Yte = mlb.transform(Yte) - dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte) - - dataset.save(outpath) - - -#----------------------------------------------------------------------------------------------------------------------- -# MAIN BUILDER -#----------------------------------------------------------------------------------------------------------------------- - -if __name__=='__main__': - import sys - RCV1_PATH = '../Datasets/RCV1-v2/unprocessed_corpus' - RCV2_PATH = '../Datasets/RCV2' - JRC_DATAPATH = "../Datasets/JRC_Acquis_v3" - full_rcv_(RCV1_PATH, RCV2_PATH, outpath='../rcv2/rcv1-2_doclist_full_processed.pickle', langs=RCV2_LANGS_WITH_NLTK_STEMMING + ['en']) - # full_jrc_(JRC_DATAPATH, lang_set['JRC_NLTK'], train_years=list(range(1958, 2006)), test_years=[2006], outpath='../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle', cat_policy='all', most_common_cat=300) - sys.exit(0) - - # datasetpath = '../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle' # 
'../rcv2/rcv1-2_doclist_full_processed.pickle' - # data = MultilingualDataset.load(datasetpath) - # data.dataset_name='JRC-Acquis-full'#'RCV1/2-full' - # for lang in RCV2_LANGS_WITH_NLTK_STEMMING + ['en']: - # (Xtr, ytr, idtr), (Xte, yte, idte) = data.multiling_dataset[lang] - # data.multiling_dataset[lang] = ((_mask_numbers(Xtr), ytr, idtr), (_mask_numbers(Xte), yte, idte)) - # data.save('../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle')#'../rcv2/rcv1-2_doclist_full_processed_2.pickle') - # sys.exit(0) - - assert len(sys.argv) == 5, "wrong number of arguments; required: " \ - " " - - JRC_DATAPATH = sys.argv[1] # "../Datasets/JRC_Acquis_v3" - RCV1_PATH = sys.argv[2] #'../Datasets/RCV1-v2/unprocessed_corpus' - RCV2_PATH = sys.argv[3] #'../Datasets/RCV2' - WIKI_DATAPATH = sys.argv[4] #"../Datasets/Wikipedia/multilingual_docs_JRC_NLTK" - - langs = lang_set['JRC_NLTK'] - max_wiki = 5000 - - for run in range(0,10): - print('Building JRC-Acquis datasets run', run) - prepare_jrc_datasets(JRC_DATAPATH, WIKI_DATAPATH, langs, - train_years=list(range(1958, 2006)), test_years=[2006], max_wiki=max_wiki, - cat_policy='all', most_common_cat=300, run=run) - - print('Building RCV1-v2/2 datasets run', run) - prepare_rcv_datasets(RCV2_PATH, RCV1_PATH, RCV2_PATH, WIKI_DATAPATH, RCV2_LANGS_WITH_NLTK_STEMMING + ['en'], - train_for_lang=1000, test_for_lang=1000, max_wiki=max_wiki, run=run) - - # uncomment this code if you want to retrieve the original documents to generate the data splits for PLE - # (make sure you have not modified the above parameters, or adapt the following paths accordingly...) - # datasetpath = join(RCV2_PATH,'rcv1-2_nltk_trByLang1000_teByLang1000_processed_run{}.pickle'.format(run)) - # outpath = datasetpath.replace('_nltk_','_doclist_') - # retrieve_rcv_documents_from_dataset(datasetpath, RCV1_PATH, RCV2_PATH, outpath) - - # datasetpath = join(JRC_DATAPATH, 'jrc_nltk_1958-2005vs2006_all_top300_noparallel_processed_run{}.pickle'.format(run)) - # outpath = datasetpath.replace('_nltk_', '_doclist_') - # retrieve_jrc_documents_from_dataset(datasetpath, JRC_DATAPATH, train_years=list(range(1958, 2006)), test_years=[2006], cat_policy='all', most_common_cat=300, outpath=outpath) - - - diff --git a/src/embeddings/__init__.py b/src/embeddings/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/embeddings/embeddings.py b/src/embeddings/embeddings.py deleted file mode 100644 index 27367e9..0000000 --- a/src/embeddings/embeddings.py +++ /dev/null @@ -1,66 +0,0 @@ -import os -from torchtext.vocab import Vectors -import torch -from abc import ABC, abstractmethod -from util.SIF_embed import * - - -class PretrainedEmbeddings(ABC): - - def __init__(self): - super().__init__() - - @abstractmethod - def vocabulary(self): pass - - @abstractmethod - def dim(self): pass - - @classmethod - def reindex(cls, words, word2index): - if isinstance(words, dict): - words = list(zip(*sorted(words.items(), key=lambda x: x[1])))[0] - - source_idx, target_idx = [], [] - for i, word in enumerate(words): - if word not in word2index: continue - j = word2index[word] - source_idx.append(i) - target_idx.append(j) - source_idx = np.asarray(source_idx) - target_idx = np.asarray(target_idx) - return source_idx, target_idx - - -class FastTextWikiNews(Vectors): - - url_base = 'Cant auto-download MUSE embeddings' - path = '../embeddings/wiki.multi.{}.vec' - _name = '/wiki.multi.{}.vec' - - def __init__(self, cache, language="en", **kwargs): - url = self.url_base.format(language) - 
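# e.g., with cache='../embeddings' and language='en', the next line resolves name to
# '../embeddings/wiki.multi.en.vec'. Note that url_base above is only a placeholder message,
# not a real URL, so torchtext's Vectors cannot auto-download missing files: the aligned MUSE
# vectors must already be present in the cache directory.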
name = cache + self._name.format(language) - super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs) - - -class FastTextMUSE(PretrainedEmbeddings): - def __init__(self, path, lang, limit=None): - super().__init__() - assert os.path.exists(path), print(f'pre-trained vectors not found in {path}') - self.embed = FastTextWikiNews(path, lang, max_vectors=limit) - - def vocabulary(self): - return set(self.embed.stoi.keys()) - - def dim(self): - return self.embed.dim - - def extract(self, words): - source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi) - extraction = torch.zeros((len(words), self.dim())) - extraction[source_idx] = self.embed.vectors[target_idx] - return extraction - - - diff --git a/src/embeddings/pretrained.py b/src/embeddings/pretrained.py deleted file mode 100644 index 026823e..0000000 --- a/src/embeddings/pretrained.py +++ /dev/null @@ -1,102 +0,0 @@ -from abc import ABC, abstractmethod -import torch, torchtext -# import gensim -# import os -import numpy as np - - -# class KeyedVectors: -# -# def __init__(self, word2index, weights): -# assert len(word2index)==weights.shape[0], 'wrong number of dimensions' -# index2word = {i:w for w,i in word2index.items()} -# assert len([i for i in range(len(index2word)) if i not in index2word])==0, 'gaps in indexing not allowed' -# self.word2index = word2index -# self.index2word = index2word -# self.weights = weights -# -# def extract(self, words): -# dim = self.weights.shape[1] -# v_size = len(words) -# -# source_idx, target_idx = [], [] -# for i,word in enumerate(words): -# if word not in self.word2index: continue -# j = self.word2index[word] -# source_idx.append(i) -# target_idx.append(j) -# -# extraction = np.zeros((v_size, dim)) -# extraction[np.asarray(source_idx)] = self.weights[np.asarray(target_idx)] -# -# return extraction - - -# class PretrainedEmbeddings(ABC): -# -# def __init__(self): -# super().__init__() -# -# @abstractmethod -# def vocabulary(self): pass -# -# @abstractmethod -# def dim(self): pass -# -# @classmethod -# def reindex(cls, words, word2index): -# source_idx, target_idx = [], [] -# for i, word in enumerate(words): -# if word not in word2index: continue -# j = word2index[word] -# source_idx.append(i) -# target_idx.append(j) -# source_idx = np.asarray(source_idx) -# target_idx = np.asarray(target_idx) -# return source_idx, target_idx - - -# class GloVe(PretrainedEmbeddings): -# -# def __init__(self, setname='840B', path='./vectors_cache', max_vectors=None): -# super().__init__() -# print(f'Loading GloVe pretrained vectors from torchtext') -# self.embed = torchtext.vocab.GloVe(setname, cache=path, max_vectors=max_vectors) -# print('Done') -# -# def vocabulary(self): -# return set(self.embed.stoi.keys()) -# -# def dim(self): -# return self.embed.dim -# -# def extract(self, words): -# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi) -# extraction = torch.zeros((len(words), self.dim())) -# extraction[source_idx] = self.embed.vectors[target_idx] -# return extraction - - -# class Word2Vec(PretrainedEmbeddings): -# -# def __init__(self, path, limit=None): -# super().__init__() -# print(f'Loading word2vec pretrained vectors from {path}') -# assert os.path.exists(path), print(f'pre-trained keyed vectors not found in {path}') -# self.embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, limit=limit) -# self.word2index={w:i for i,w in enumerate(self.embed.index2word)} -# print('Done') -# -# def vocabulary(self): -# return 
set(self.word2index.keys()) -# -# def dim(self): -# return self.embed.vector_size -# -# def extract(self, words): -# source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.word2index) -# extraction = np.zeros((len(words), self.dim())) -# extraction[source_idx] = self.embed.vectors[target_idx] -# extraction = torch.from_numpy(extraction).float() -# return extraction - diff --git a/src/embeddings/supervised.py b/src/embeddings/supervised.py deleted file mode 100755 index f84793e..0000000 --- a/src/embeddings/supervised.py +++ /dev/null @@ -1,74 +0,0 @@ -from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square -import numpy as np - - -def zscores(x, axis=0): #scipy.stats.zscores does not avoid division by 0, which can indeed occur - std = np.clip(np.std(x, ddof=1, axis=axis), 1e-5, None) - mean = np.mean(x, axis=axis) - return (x - mean) / std - - -def supervised_embeddings_tfidf(X,Y): - tfidf_norm = X.sum(axis=0) - tfidf_norm[tfidf_norm==0] = 1 - F = (X.T).dot(Y) / tfidf_norm.T - return F - - -def supervised_embeddings_ppmi(X,Y): - Xbin = X>0 - D = X.shape[0] - Pxy = (Xbin.T).dot(Y)/D - Px = Xbin.sum(axis=0)/D - Py = Y.sum(axis=0)/D - F = np.asarray(Pxy/(Px.T*Py)) - F = np.maximum(F, 1.0) - F = np.log(F) - return F - - -def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents=25000): - D = X.shape[0] - if D>max_documents: - print(f'sampling {max_documents}') - random_sample = np.random.permutation(D)[:max_documents] - X = X[random_sample] - Y = Y[random_sample] - cell_matrix = get_supervised_matrix(X, Y) - F = get_tsr_matrix(cell_matrix, tsr_score_funtion=tsr_function).T - return F - - -def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True): - if max_label_space != 0: - print('computing supervised embeddings...') - nC = Y.shape[1] - - if method=='ppmi': - F = supervised_embeddings_ppmi(X, Y) - elif method == 'dotn': - F = supervised_embeddings_tfidf(X, Y) - elif method == 'ig': - F = supervised_embeddings_tsr(X, Y, information_gain) - elif method == 'chi2': - F = supervised_embeddings_tsr(X, Y, chi_square) - - if dozscore: - F = zscores(F, axis=0) - - # Dumping F-matrix for further studies - dump_it = False - if dump_it: - with open(f'/home/andreapdr/funneling_pdr/src/dumps/WCE_{lang}.tsv', 'w') as outfile: - np.savetxt(outfile, F, delimiter='\t') - with open(f'/home/andreapdr/funneling_pdr/src/dumps/dict_WCE_{lang}.tsv', 'w') as outfile: - for token in voc.keys(): - outfile.write(token+'\n') - - return F - - - - - - diff --git a/src/experiment_scripts/10run_dl_jrc.sh b/src/experiment_scripts/10run_dl_jrc.sh deleted file mode 100644 index ce04aa8..0000000 --- a/src/experiment_scripts/10run_dl_jrc.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash - -dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run -logfile=../log/log10run_dl_jrc.csv - -runs='0 1 2 3 4 5 6 7 8 9' -for run in $runs -do - dataset=$dataset_path$run.pickle - python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20 -done \ No newline at end of file diff --git a/src/experiment_scripts/10run_dl_rcv.sh b/src/experiment_scripts/10run_dl_rcv.sh deleted file mode 100644 index 51ca64b..0000000 --- a/src/experiment_scripts/10run_dl_rcv.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash - 
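# Each iteration below runs main_deep_learning.py on one of the 10 pre-generated RCV1/RCV2 splits,
# enabling MUSE embeddings (--pretrained), word-class embeddings (--supervised) and calibrated
# posterior probabilities (--posteriors), with the pretrained embeddings kept trainable (--tunable);
# --plotmode together with --test-each 20 logs test measures every 20 epochs to draw learning-curve
# plots rather than performing a single final evaluation.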
-dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run -logfile=../log/log10run_dl_rcv.csv - -runs='0 1 2 3 4 5 6 7 8 9' -for run in $runs -do - dataset=$dataset_path$run.pickle - python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20 -done diff --git a/src/experiment_scripts/10run_jrc.sh b/src/experiment_scripts/10run_jrc.sh deleted file mode 100644 index 37e3333..0000000 --- a/src/experiment_scripts/10run_jrc.sh +++ /dev/null @@ -1,12 +0,0 @@ -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle -logfile=./results/10run_jrc_final_results.csv - -runs='0 1 2 3 4 5 6 7 8 9' -for run in $runs -do - dataset=$dataset_path$run.pickle - python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2 - python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2 - python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2 - -done diff --git a/src/experiment_scripts/10run_jrc_combinations.sh b/src/experiment_scripts/10run_jrc_combinations.sh deleted file mode 100644 index 156a0a5..0000000 --- a/src/experiment_scripts/10run_jrc_combinations.sh +++ /dev/null @@ -1,16 +0,0 @@ -dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run -logfile=./results/funnelling_10run_jrc_CIKM.csv - -runs='6 7 8 9' #0 1 2 3 4 5 -for run in $runs -do - dataset=$dataset_path$run.pickle - #python main_gFun.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated (done up to run5) - python main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated - #python main_gFun.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -c -P -U -r -z --l2 - #python main_gFun.py $dataset -o $logfile -c -P -U -S -r -z --l2 -done \ No newline at end of file diff --git a/src/experiment_scripts/10run_rcv.sh b/src/experiment_scripts/10run_rcv.sh deleted file mode 100644 index 9d49f94..0000000 --- a/src/experiment_scripts/10run_rcv.sh +++ /dev/null @@ -1,15 +0,0 @@ -dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run -logfile=./results/10run_rcv_final_results.csv - -runs='0 1 2 3 4 5 6 7 8 9' - -for run in $runs -do - dataset=$dataset_path$run.pickle - python main_multimodal_cls.py $dataset -o $logfile -P -z -c --l2 - python main_multimodal_cls.py $dataset -o $logfile -S -z -c --l2 - python main_multimodal_cls.py $dataset -o $logfile -U -z -c --l2 - -done - - diff --git a/src/experiment_scripts/10run_rcv_combinations.sh b/src/experiment_scripts/10run_rcv_combinations.sh deleted file mode 100644 index b5d8a3b..0000000 --- a/src/experiment_scripts/10run_rcv_combinations.sh +++ /dev/null @@ -1,16 +0,0 @@ -dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run -logfile=./results/funnelling_10run_rcv_CIKM_allprob_concatenated.csv - -runs='0 1 2 3 4 5 6 7 8 9' -for run in $runs -do - dataset=$dataset_path$run.pickle - #python main_gFun.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 Pr(views) concatenated - python 
main_multimodal_cls.py $dataset -o $logfile -P -U -S -c -r -z --l2 --allprob # last combination for CIKM 3 views concatenated - #python main_gFun.py $dataset -o $logfile -P -U -c -r -a -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -P -U -S -c -r -a -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -P -S -c -r -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -P -U -c -r -z --l2 --allprob - #python main_gFun.py $dataset -o $logfile -c -P -U -r -z --l2 - #python main_gFun.py $dataset -o $logfile -c -P -U -S -r -z --l2 -done \ No newline at end of file diff --git a/src/experiment_scripts/extract_features.sh b/src/experiment_scripts/extract_features.sh deleted file mode 100644 index d0bd3ac..0000000 --- a/src/experiment_scripts/extract_features.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env bash - -dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run# - -runs='1 2 3 4 5 6 7 8 9' -for run in $runs -do - dataset=$dataset_path$run.pickle - modelpath=/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run$runs - python main_mbert_extractor.py --dataset $dataset --modelpath $modelpath -done - -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle -python main_mbert_extractor.py --dataset $dataset --modelpath $modelpath \ No newline at end of file diff --git a/src/experiment_scripts/main_deep_learning.py b/src/experiment_scripts/main_deep_learning.py deleted file mode 100755 index ee56054..0000000 --- a/src/experiment_scripts/main_deep_learning.py +++ /dev/null @@ -1,329 +0,0 @@ -import argparse -import torch.nn as nn -from torch.optim.lr_scheduler import StepLR -from dataset_builder import MultilingualDataset -from learning.transformers import load_muse_embeddings -from models.lstm_class import RNNMultilingualClassifier -from util.csv_log import CSVLog -from util.early_stop import EarlyStopping -from util.common import * -from util.file import create_if_not_exist -from time import time -from tqdm import tqdm -from util.evaluation import evaluate -from util.file import get_file_name -# import pickle - -allowed_nets = {'rnn'} - -# instantiates the net, initializes the model parameters, and sets embeddings trainable if requested -def init_Net(nC, multilingual_index, xavier_uniform=True): - net=opt.net - assert net in allowed_nets, f'{net} not supported, valid ones are={allowed_nets}' - - # instantiate the required net - if net=='rnn': - only_post = opt.posteriors and (not opt.pretrained) and (not opt.supervised) - if only_post: - print('working on ONLY POST mode') - model = RNNMultilingualClassifier( - output_size=nC, - hidden_size=opt.hidden, - lvocab_size=multilingual_index.l_vocabsize(), - learnable_length=opt.learnable, - lpretrained=multilingual_index.l_embeddings(), - drop_embedding_range=multilingual_index.sup_range, - drop_embedding_prop=opt.sup_drop, - post_probabilities=opt.posteriors, - only_post=only_post, - bert_embeddings=opt.mbert - ) - - # weight initialization - if xavier_uniform: - for p in model.parameters(): - if p.dim() > 1 and p.requires_grad: - nn.init.xavier_uniform_(p) - - if opt.tunable: - # this has to be performed *after* Xavier initialization is done, - # otherwise the pretrained embedding parameters will be overrided - model.finetune_pretrained() - - return model.cuda() - - -def set_method_name(): - method_name = f'{opt.net}(H{opt.hidden})' - if opt.pretrained: - method_name += f'-Muse' - if opt.supervised: - method_name += f'-WCE' 
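# e.g., running with --pretrained --supervised --posteriors --tunable and the default --hidden 512
# yields the identifier 'rnn(H512)-Muse-WCE-Posteriors-(trainable)', which is the value logged in
# the 'method' column of the CSV log file.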
- if opt.posteriors: - method_name += f'-Posteriors' - if opt.mbert: - method_name += f'-mBert' - if (opt.pretrained or opt.supervised) and opt.tunable: - method_name += '-(trainable)' - else: - method_name += '-(static)' - if opt.learnable > 0: - method_name += f'-Learnable{opt.learnable}' - return method_name - - -def init_optimizer(model, lr): - return torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr, weight_decay=opt.weight_decay) - - -def init_logfile(method_name, opt): - logfile = CSVLog(opt.log_file, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse']) - logfile.set_default('dataset', opt.dataset) - logfile.set_default('run', opt.seed) - logfile.set_default('method', method_name) - assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \ - f'and run {opt.seed} already calculated' - return logfile - - -# loads the MUSE embeddings if requested, or returns empty dictionaries otherwise -def load_pretrained_embeddings(we_path, langs): - lpretrained = lpretrained_vocabulary = none_dict(langs) - if opt.pretrained: - lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1) - lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs} - return lpretrained, lpretrained_vocabulary - - -def get_lr(optimizer): - for param_group in optimizer.param_groups: - return param_group['lr'] - - -def train(model, batcher, ltrain_index, ltrain_posteriors, ltrain_bert, lytr, tinit, logfile, criterion, optim, epoch, method_name): - _dataset_path = opt.dataset.split('/')[-1].split('_') - dataset_id = _dataset_path[0] + _dataset_path[-1] - - loss_history = [] - model.train() - for idx, (batch, post, bert_emb, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)): - optim.zero_grad() - # _out = model(batch, post, bert_emb, lang) - loss = criterion(model(batch, post, bert_emb, lang), target) - loss.backward() - clip_gradient(model) - optim.step() - loss_history.append(loss.item()) - - if idx % opt.log_interval == 0: - interval_loss = np.mean(loss_history[-opt.log_interval:]) - print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}') - - mean_loss = np.mean(interval_loss) - logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit) - return mean_loss - - -def test(model, batcher, ltest_index, ltest_posteriors, lte_bert, lyte, tinit, epoch, logfile, criterion, measure_prefix): - - loss_history = [] - model.eval() - langs = sorted(ltest_index.keys()) - predictions = {l:[] for l in langs} - yte_stacked = {l:[] for l in langs} - batcher.init_offset() - for batch, post, bert_emb, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lte_bert, lyte), desc='evaluation: '): - logits = model(batch, post, bert_emb, lang) - loss = criterion(logits, target).item() - prediction = predict(logits) - predictions[lang].append(prediction) - yte_stacked[lang].append(target.detach().cpu().numpy()) - loss_history.append(loss) - - ly = {l:np.vstack(yte_stacked[l]) for l in langs} - ly_ = {l:np.vstack(predictions[l]) for l in langs} - l_eval = evaluate(ly, ly_) - metrics = [] - for lang in langs: - macrof1, microf1, macrok, microk = l_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - if measure_prefix == 'te': - print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') - Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0) - 
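# metrics holds one [macro-F1, micro-F1, macro-K, micro-K] row per language, so the mean over
# axis=0 reported below is the unweighted per-language average of each measure.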
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]') - - mean_loss = np.mean(loss_history) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit) - - return Mf1 - - -# ---------------------------------------------------------------------------------------------------------------------- -def main(): - DEBUGGING = False - - method_name = set_method_name() - logfile = init_logfile(method_name, opt) - - # Loading the dataset - data = MultilingualDataset.load(opt.dataset) - # data.set_view(languages=['it', 'fr']) # Testing with less langs - data.show_dimensions() - langs = data.langs() - l_devel_raw, l_devel_target = data.training(target_as_csr=True) - l_test_raw, l_test_target = data.test(target_as_csr=True) - - # Loading the MUSE pretrained embeddings (only if requested) - lpretrained, lpretrained_vocabulary = load_pretrained_embeddings(opt.we_path, langs) - # lpretrained_vocabulary = none_dict(langs) # do not keep track of words known in pretrained embeddings vocabulary that are also present in test set - - # Data preparation: indexing / splitting / embedding matrices (pretrained + supervised) / posterior probs - multilingual_index = MultilingualIndex() - multilingual_index.index(l_devel_raw, l_devel_target, l_test_raw, lpretrained_vocabulary) - multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=opt.seed) - multilingual_index.embedding_matrices(lpretrained, opt.supervised) - if opt.posteriors: - if DEBUGGING: - import pickle - with open('/home/andreapdr/funneling_pdr/dumps/posteriors_jrc_run0.pickle', 'rb') as infile: - data_post = pickle.load(infile) - lPtr = data_post[0] - lPva = data_post[1] - lPte = data_post[2] - print('## DEBUGGING MODE: loaded dumped posteriors for jrc run0') - else: - lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=5000) - else: - lPtr, lPva, lPte = None, None, None - - if opt.mbert: - _dataset_path = opt.dataset.split('/')[-1].split('_') - _model_folder = _dataset_path[0] + '_' + _dataset_path[-1].replace('.pickle', '') - # print(f'Model Folder: {_model_folder}') - - if DEBUGGING: - with open('/home/andreapdr/funneling_pdr/dumps/mBert_jrc_run0.pickle', 'rb') as infile: - data_embed = pickle.load(infile) - tr_bert_embeddings = data_embed[0] - va_bert_embeddings = data_embed[1] - te_bert_embeddings = data_embed[2] - print('## DEBUGGING MODE: loaded dumped mBert embeddings for jrc run0') - else: - tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings \ - = multilingual_index.bert_embeddings(f'/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-{_model_folder}/') - else: - tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings = None, None, None - - # Model initialization - model = init_Net(data.num_categories(), multilingual_index) - - optim = init_optimizer(model, lr=opt.lr) - criterion = torch.nn.BCEWithLogitsLoss().cuda() - lr_scheduler = StepLR(optim, step_size=25, gamma=0.5) - batcher_train = Batch(opt.batch_size, batches_per_epoch=10, languages=langs, lpad=multilingual_index.l_pad()) - 
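# batches_per_epoch=10 caps how many batches are drawn per training epoch, whereas -1 (used for the
# evaluation batcher just below) presumably iterates over the whole set; the Batch helper itself
# comes in through the wildcard import from util.common and is not shown in this diff.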
batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad())
-
-    tinit = time()
-    create_if_not_exist(opt.checkpoint_dir)
-    early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience,
-                               checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}')
-
-    l_train_index, l_train_target = multilingual_index.l_train()
-    l_val_index, l_val_target = multilingual_index.l_val()
-    l_test_index = multilingual_index.l_test_index()
-
-    print('-'*80)
-    print('Start training')
-    for epoch in range(1, opt.nepochs + 1):
-        train(model, batcher_train, l_train_index, lPtr, tr_bert_embeddings, l_train_target, tinit, logfile, criterion, optim, epoch, method_name)
-        lr_scheduler.step() # reduces the learning rate
-
-        # validation
-        macrof1 = test(model, batcher_eval, l_val_index, lPva, va_bert_embeddings, l_val_target, tinit, epoch, logfile, criterion, 'va')
-        early_stop(macrof1, epoch)
-        if opt.test_each>0:
-            if (opt.plotmode and (epoch==1 or epoch%opt.test_each==0)) or (not opt.plotmode and epoch%opt.test_each==0 and epoch<opt.nepochs):
-                test(model, batcher_eval, l_test_index, lPte, te_bert_embeddings, l_test_target, tinit, epoch, logfile, criterion, 'te')
-
-        if early_stop.STOP:
-            print('[early-stop]')
-            if not opt.plotmode:
-                break
-
-    if opt.val_epochs>0:
-        print(f'running last {opt.val_epochs} training epochs on the validation set')
-        for val_epoch in range(1, opt.val_epochs + 1):
-            batcher_train.init_offset()
-            train(model, batcher_train, l_val_index, lPva, va_bert_embeddings, l_val_target, tinit, logfile, criterion, optim, epoch+val_epoch, method_name)
-
-    # final test
-    print('Training complete: testing')
-    test(model, batcher_eval, l_test_index, lPte, te_bert_embeddings, l_test_target, tinit, epoch, logfile, criterion, 'te')
-
-
-# ----------------------------------------------------------------------------------------------------------------------
-if __name__ == '__main__':
-
-    parser = argparse.ArgumentParser(description='Neural text classification with Word-Class Embeddings')
-    parser.add_argument('dataset', type=str, metavar='datasetpath', help=f'path to the pickled dataset')
-    parser.add_argument('--batch-size', type=int, default=50, metavar='int', help='input batch size (default: 50)')
-    parser.add_argument('--batch-size-test', type=int, default=250, metavar='int', help='batch size for testing (default: 250)')
-    parser.add_argument('--nepochs', type=int, default=200, metavar='int', help='number of epochs (default: 200)')
-    parser.add_argument('--patience', type=int, default=10, metavar='int', help='patience for early-stop (default: 10)')
-    parser.add_argument('--plotmode', action='store_true', default=False, help='in plot mode executes a long run in order '
-                        'to generate enough data to produce trend plots (test-each should be >0.
This mode is ' - 'used to produce plots, and does not perform an evaluation on the test set.') - parser.add_argument('--hidden', type=int, default=512, metavar='int', help='hidden lstm size (default: 512)') - parser.add_argument('--lr', type=float, default=1e-3, metavar='float', help='learning rate (default: 1e-3)') - parser.add_argument('--weight_decay', type=float, default=0, metavar='float', help='weight decay (default: 0)') - parser.add_argument('--sup-drop', type=float, default=0.5, metavar='[0.0, 1.0]', help='dropout probability for the supervised matrix (default: 0.5)') - parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)') - parser.add_argument('--svm-max-docs', type=int, default=1000, metavar='int', help='maximum number of documents by ' - 'language used to train the calibrated SVMs (only used if --posteriors is active)') - parser.add_argument('--log-interval', type=int, default=10, metavar='int', help='how many batches to wait before printing training status') - parser.add_argument('--log-file', type=str, default='../log/log.csv', metavar='str', help='path to the log csv file') - parser.add_argument('--test-each', type=int, default=0, metavar='int', help='how many epochs to wait before invoking test (default: 0, only at the end)') - parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str', help='path to the directory containing checkpoints') - parser.add_argument('--net', type=str, default='rnn', metavar='str', help=f'net, one in {allowed_nets}') - parser.add_argument('--pretrained', action='store_true', default=False, help='use MUSE pretrained embeddings') - parser.add_argument('--supervised', action='store_true', default=False, help='use supervised embeddings') - parser.add_argument('--posteriors', action='store_true', default=False, help='concatenate posterior probabilities to doc embeddings') - parser.add_argument('--learnable', type=int, default=0, metavar='int', help='dimension of the learnable embeddings (default 0)') - parser.add_argument('--val-epochs', type=int, default=1, metavar='int', help='number of training epochs to perform on the ' - 'validation set once training is over (default 1)') - parser.add_argument('--we-path', type=str, default='../embeddings', metavar='str', - help=f'path to MUSE pretrained embeddings') - parser.add_argument('--max-label-space', type=int, default=300, metavar='int', help='larger dimension allowed for the ' - 'feature-label embedding (if larger, then PCA with this number of components is applied ' - '(default 300)') - parser.add_argument('--force', action='store_true', default=False, help='do not check if this experiment has already been run') - parser.add_argument('--tunable', action='store_true', default=False, - help='pretrained embeddings are tunable from the beginning (default False, i.e., static)') - parser.add_argument('--mbert', action='store_true', default=False, - help='use mBert embeddings') - - opt = parser.parse_args() - - assert torch.cuda.is_available(), 'CUDA not available' - assert not opt.plotmode or opt.test_each > 0, 'plot mode implies --test-each>0' - # if opt.pickle_dir: opt.pickle_path = join(opt.pickle_dir, f'{opt.dataset}.pickle') - torch.manual_seed(opt.seed) - - main() diff --git a/src/experiment_scripts/main_embeddings_cls.py b/src/experiment_scripts/main_embeddings_cls.py deleted file mode 100644 index 08552d3..0000000 --- a/src/experiment_scripts/main_embeddings_cls.py +++ /dev/null @@ -1,127 +0,0 @@ -import os -from 
dataset_builder import MultilingualDataset -from util.evaluation import * -from optparse import OptionParser -from util.file import exists -from util.results import PolylingualClassificationResults -from util.util import get_learner, get_params - -parser = OptionParser() - -parser.add_option("-d", "--dataset", dest="dataset", - help="Path to the multilingual dataset processed and stored in .pickle format", - default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle") - -parser.add_option("-o", "--output", dest="output", - help="Result file", type=str, default='./results/results.csv') - -parser.add_option("-e", "--mode-embed", dest="mode_embed", - help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none') - -parser.add_option("-w", "--we-path", dest="we_path", - help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/') - -parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str, - default='MUSE') - -parser.add_option("-s", "--set_c", dest="set_c",type=float, - help="Set the C parameter", default=1) - -parser.add_option("-c", "--optimc", dest="optimc", action='store_true', - help="Optimize hyperparameters", default=False) - -parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int, - help="Number of parallel jobs (default is -1, all)", default=-1) - -parser.add_option("-p", "--pca", dest="max_labels_S", type=int, - help="If smaller than number of target classes, PCA will be applied to supervised matrix. " - "If set to 0 it will automatically search for the best number of components. " - "If set to -1 it will apply PCA to the vstacked supervised matrix (PCA dim set to 50 atm)", - default=300) - -parser.add_option("-u", "--upca", dest="max_labels_U", type=int, - help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix." - " If set to 0 it will automatically search for the best number of components", default=300) - -parser.add_option("-l", dest="lang", type=str) - -if __name__ == '__main__': - (op, args) = parser.parse_args() - - assert exists(op.dataset), 'Unable to find file '+str(op.dataset) - assert not (op.set_c != 1. 
and op.optimc), 'Parameter C cannot be defined along with optim_c option' - - dataset_file = os.path.basename(op.dataset) - - results = PolylingualClassificationResults('./results/PLE_results.csv') - - data = MultilingualDataset.load(op.dataset) - data.show_dimensions() - - # data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10))) - # data.set_view(languages=[op.lang]) - # data.set_view(categories=list(range(10))) - lXtr, lytr = data.training() - lXte, lyte = data.test() - - if op.set_c != -1: - meta_parameters = None - else: - meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}] - - # Embeddings and WCE config - _available_mode = ['none', 'unsupervised', 'supervised', 'both'] - _available_type = ['MUSE', 'FastText'] - assert op.mode_embed in _available_mode, f'{op.mode_embed} not in {_available_mode}' - assert op.we_type in _available_type, f'{op.we_type} not in {_available_type}' - - if op.mode_embed == 'none': - config = {'unsupervised': False, - 'supervised': False, - 'we_type': None} - _config_id = 'None' - elif op.mode_embed == 'unsupervised': - config = {'unsupervised': True, - 'supervised': False, - 'we_type': op.we_type} - _config_id = 'M' - elif op.mode_embed == 'supervised': - config = {'unsupervised': False, - 'supervised': True, - 'we_type': None} - _config_id = 'F' - elif op.mode_embed == 'both': - config = {'unsupervised': True, - 'supervised': True, - 'we_type': op.we_type} - _config_id = 'M+F' - - config['reduction'] = 'PCA' - config['max_label_space'] = op.max_labels_S - config['dim_reduction_unsupervised'] = op.max_labels_U - # config['post_pca'] = op.post_pca - # config['plot_covariance_matrices'] = True - - result_id = dataset_file + 'MLE_andrea' + _config_id + ('_optimC' if op.optimc else '') - - ple = PolylingualEmbeddingsClassifier(wordembeddings_path='/home/andreapdr/CLESA/', - config = config, - learner=get_learner(calibrate=False), - c_parameters=get_params(dense=False), - n_jobs=op.n_jobs) - - print('# Fitting ...') - ple.fit(lXtr, lytr) - - print('# Evaluating ...') - ple_eval = evaluate_method(ple, lXte, lyte) - - metrics = [] - for lang in lXte.keys(): - macrof1, microf1, macrok, microk = ple_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1)) - results.add_row('MLE', 'svm', _config_id, config['we_type'], - 'no','no', op.optimc, op.dataset.split('/')[-1], ple.time, - lang, macrof1, microf1, macrok, microk, '') - print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) diff --git a/src/experiment_scripts/main_majorityvoting_cls.py b/src/experiment_scripts/main_majorityvoting_cls.py deleted file mode 100644 index ee5efe5..0000000 --- a/src/experiment_scripts/main_majorityvoting_cls.py +++ /dev/null @@ -1,155 +0,0 @@ -import os -from dataset_builder import MultilingualDataset -# from learning.learners import * -# from learning.learners import FunnellingMultimodal -from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual, WordClassEmbedder, MuseEmbedder, FeatureSet2Posteriors, Voting -from util.evaluation import * -from optparse import OptionParser -from util.file import exists -from util.results import PolylingualClassificationResults -from sklearn.svm import SVC - -parser = OptionParser() - -# parser.add_option("-d", "--dataset", dest="dataset", -# help="Path to the multilingual dataset processed and stored in .pickle format", -# 
default="../rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle") - -parser.add_option("-o", "--output", dest="output", - help="Result file", type=str, default='./results/results.csv') - -parser.add_option("-P", "--probs", dest="posteriors", action='store_true', - help="Add posterior probabilities to the document embedding representation", default=False) - -parser.add_option("-S", "--supervised", dest="supervised", action='store_true', - help="Add supervised (Word-Class Embeddings) to the document embedding representation", default=False) - -parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true', - help="Add pretrained MUSE embeddings to the document embedding representation", default=False) - -parser.add_option("-w", "--we-path", dest="we_path", - help="Path to the MUSE polylingual word embeddings", default='../embeddings') - -parser.add_option("-s", "--set_c", dest="set_c",type=float, - help="Set the C parameter", default=1) - -parser.add_option("-c", "--optimc", dest="optimc", action='store_true', - help="Optimize hyperparameters", default=False) - -parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int, - help="Number of parallel jobs (default is -1, all)", default=-1) - -parser.add_option("-p", "--pca", dest="max_labels_S", type=int, - help="If smaller than number of target classes, PCA will be applied to supervised matrix. ", - default=300) - -parser.add_option("-r", "--remove-pc", dest="sif", action='store_true', - help="Remove common component when computing dot product of word embedding matrices", default=False) - -# parser.add_option("-u", "--upca", dest="max_labels_U", type=int, -# help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix." -# " If set to 0 it will automatically search for the best number of components", default=300) - -# parser.add_option("-a", dest="post_pca", -# help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with " -# "embedding space", default=False) - - -def get_learner(calibrate=False, kernel='linear'): - return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto') - - -def get_params(dense=False): - if not op.optimc: - return None - c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] - kernel = 'rbf' if dense else 'linear' - return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] - -####################################################################################################################### - - -if __name__ == '__main__': - (op, args) = parser.parse_args() - - assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)' - dataset = args[0] - - assert exists(dataset), 'Unable to find file '+str(dataset) - assert not (op.set_c != 1. 
and op.optimc), 'Parameter C cannot be defined along with optim_c option' - assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed' - - dataset_file = os.path.basename(dataset) - - results = PolylingualClassificationResults(op.output) - - data = MultilingualDataset.load(dataset) - data.show_dimensions() - - lXtr, lytr = data.training() - lXte, lyte = data.test() - - meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}] - - # result_id = f'{dataset_file}_Prob{op.posteriors}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}' - result_id = f'{dataset_file}_ProbPost={op.posteriors}_WCE={op.supervised}(PCA={op.max_labels_S})_' \ - f'MUSE={op.pretrained}_weight={"todo"}_l2={"todo"}_zscore={"todo"}{"_optimC" if op.optimc else ""}' - print(f'{result_id}') - - # text preprocessing - tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) - - lXtr = tfidfvectorizer.fit_transform(lXtr, lytr) - lXte = tfidfvectorizer.transform(lXte) - lV = tfidfvectorizer.vocabulary() - - classifiers = [] - if op.posteriors: - classifiers.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None)) - if op.supervised: - classifiers.append(FeatureSet2Posteriors(WordClassEmbedder(max_label_space=op.max_labels_S))) - if op.pretrained: - classifiers.append(FeatureSet2Posteriors(MuseEmbedder(op.we_path, lV=lV))) - - classifier = Voting(*classifiers) - - print('# Fitting ...') - classifier.fit(lXtr, lytr) - - print('\n# Evaluating ...') - l_eval = evaluate_method(classifier, lXte, lyte) - - # renaming arguments to be printed on log - _id = '' - _id_conf = [op.posteriors, op.supervised, op.pretrained] - _id_name = ['+P', '+W', '+M'] - for i, conf in enumerate(_id_conf): - if conf: - _id += _id_name[i] - _id = _id.lstrip('+') - _dataset_path = dataset.split('/')[-1].split('_') - dataset_id = _dataset_path[0] + _dataset_path[-1] - - metrics = [] - for lang in lXte.keys(): - macrof1, microf1, macrok, microk = l_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') - results.add_row(method='Voting', - learner='svm', - optimp=op.optimc, - sif=op.sif, - zscore='todo', - l2='todo', - wescaler='todo', - pca=op.max_labels_S, - id=_id, - dataset=dataset_id, - time='todo', - lang=lang, - macrof1=macrof1, - microf1=microf1, - macrok=macrok, - microk=microk, - notes='') - print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) diff --git a/src/experiment_scripts/main_mbert.py b/src/experiment_scripts/main_mbert.py deleted file mode 100644 index aa44407..0000000 --- a/src/experiment_scripts/main_mbert.py +++ /dev/null @@ -1,390 +0,0 @@ -from dataset_builder import MultilingualDataset -from transformers import BertTokenizer, BertForSequenceClassification, AdamW -from torch.utils.data import Dataset, DataLoader -import numpy as np -import torch -from util.common import predict -from time import time -from util.csv_log import CSVLog -from util.evaluation import evaluate -from util.early_stop import EarlyStopping -from torch.optim.lr_scheduler import StepLR -from sklearn.model_selection import train_test_split -from copy import deepcopy -import argparse -# from torch.utils.tensorboard import SummaryWriter - - -def check_sentences(sentences): - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') - for sentence in sentences: - 
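# decode each input id back to its wordpiece (id 0 is mBERT's [PAD] token, hence skipped),
# so that the tokenized batch can be inspected by eye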
converted = [tokenizer._convert_id_to_token(token) for token in sentence.numpy() if token != 0] - print(converted) - return - - -def get_model(n_out): - print('# Initializing model ...') - model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_out) - return model - - -def set_method_name(): - return 'mBERT' - - -def init_optimizer(model, lr): - # return AdamW(model.parameters(), lr=lr, weight_decay=opt.weight_decay) - no_decay = ['bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() - if not any(nd in n for nd in no_decay)], - 'weight_decay': opt.weight_decay}, - {'params': [p for n, p in model.named_parameters() - if any(nd in n for nd in no_decay)], - 'weight_decay': opt.weight_decay} - ] - optimizer = AdamW(optimizer_grouped_parameters, lr=lr) - return optimizer - - -def init_logfile(method_name, opt): - logfile = CSVLog(opt.log_file, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse']) - logfile.set_default('dataset', opt.dataset) - logfile.set_default('run', opt.seed) - logfile.set_default('method', method_name) - assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \ - f'and run {opt.seed} already calculated' - return logfile - - -def get_lr(optimizer): - for param_group in optimizer.param_groups: - return param_group['lr'] - - -def get_dataset_name(datapath): - possible_splits = [str(i) for i in range(10)] - splitted = datapath.split('_') - id_split = splitted[-1].split('.')[0][-1] - if id_split in possible_splits: - dataset_name = splitted[0].split('/')[-1] - return f'{dataset_name}_run{id_split}' - elif splitted[-2].split('.')[0] == 'full': - dataset_name = splitted[0].split('/')[-1] - return f'{dataset_name}_fullrun' - - -def load_datasets(datapath): - data = MultilingualDataset.load(datapath) - # data.set_view(languages=['it']) #, categories=[0, 1, 2, 3, 4]) # Testing with less langs - data.show_dimensions() - - l_devel_raw, l_devel_target = data.training(target_as_csr=False) - l_test_raw, l_test_target = data.test(target_as_csr=False) - - return l_devel_raw, l_devel_target, l_test_raw, l_test_target - - -def do_tokenization(l_dataset, max_len=512, verbose=True): - if verbose: - print('# Starting Tokenization ...') - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') - langs = l_dataset.keys() - l_tokenized = {} - for lang in langs: - l_tokenized[lang] = tokenizer(l_dataset[lang], - truncation=True, - max_length=max_len, - padding='max_length') - return l_tokenized - - -class TrainingDataset(Dataset): - """ - data: dict of lang specific tokenized data - labels: dict of lang specific targets - """ - - def __init__(self, data, labels): - self.langs = data.keys() - self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)} - - for i, lang in enumerate(self.langs): - _data = data[lang]['input_ids'] - _data = np.array(_data) - _labels = labels[lang] - _lang_value = np.full(len(_data), self.lang_ids[lang]) - - if i == 0: - self.data = _data - self.labels = _labels - self.lang_index = _lang_value - else: - self.data = np.vstack((self.data, _data)) - self.labels = np.vstack((self.labels, _labels)) - self.lang_index = np.concatenate((self.lang_index, _lang_value)) - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - x = self.data[idx] - y = self.labels[idx] - lang = self.lang_index[idx] - - return x, torch.tensor(y, 
dtype=torch.float), lang - - def get_lang_ids(self): - return self.lang_ids - - def get_nclasses(self): - if hasattr(self, 'labels'): - return len(self.labels[0]) - else: - print('Method called before init!') - - -def freeze_encoder(model): - for param in model.base_model.parameters(): - param.requires_grad = False - return model - - -def check_param_grad_status(model): - print('#' * 50) - print('Model paramater status:') - for name, child in model.named_children(): - trainable = False - for param in child.parameters(): - if param.requires_grad: - trainable = True - if not trainable: - print(f'{name} is frozen') - else: - print(f'{name} is not frozen') - print('#' * 50) - - -def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, writer): - _dataset_path = opt.dataset.split('/')[-1].split('_') - dataset_id = _dataset_path[0] + _dataset_path[-1] - - loss_history = [] - model.train() - - for idx, (batch, target, lang_idx) in enumerate(train_dataloader): - optim.zero_grad() - out = model(batch.cuda()) - logits = out[0] - loss = criterion(logits, target.cuda()) - loss.backward() - # clip_gradient(model) - optim.step() - loss_history.append(loss.item()) - - if writer is not None: - _n_step = (epoch - 1) * (len(train_dataloader)) + idx - writer.add_scalar('Loss_step/Train', loss, _n_step) - - # Check tokenized sentences consistency - # check_sentences(batch.cpu()) - - if idx % opt.log_interval == 0: - interval_loss = np.mean(loss_history[-opt.log_interval:]) - print( - f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}') - - mean_loss = np.mean(interval_loss) - logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit) - return mean_loss - - -def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix, writer): - print('# Validating model ...') - loss_history = [] - model.eval() - langs = lang_ids.keys() - id_2_lang = {v: k for k, v in lang_ids.items()} - predictions = {l: [] for l in langs} - yte_stacked = {l: [] for l in langs} - - for batch, target, lang_idx in test_dataloader: - out = model(batch.cuda()) - logits = out[0] - loss = criterion(logits, target.cuda()).item() - prediction = predict(logits) - loss_history.append(loss) - - # Assigning prediction to dict in predictions and yte_stacked according to lang_idx - for i, pred in enumerate(prediction): - lang_pred = id_2_lang[lang_idx.numpy()[i]] - predictions[lang_pred].append(pred) - yte_stacked[lang_pred].append(target[i].detach().cpu().numpy()) - - ly = {l: np.vstack(yte_stacked[l]) for l in langs} - ly_ = {l: np.vstack(predictions[l]) for l in langs} - l_eval = evaluate(ly, ly_) - metrics = [] - for lang in langs: - macrof1, microf1, macrok, microk = l_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - if measure_prefix == 'te': - print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') - Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0) - print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]') - if writer is not None: - writer.add_scalars('Eval Metrics', {'Mf1': Mf1, 'mF1': mF1, 'MK': MK, 'mk':mk}, epoch) - - mean_loss = np.mean(loss_history) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, 
measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit) - - return Mf1 - - -def get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop, max_val, seed): - l_split_va = deepcopy(l_tokenized_tr) - l_split_val_target = {l: [] for l in l_tokenized_tr.keys()} - l_split_tr = deepcopy(l_tokenized_tr) - l_split_tr_target = {l: [] for l in l_tokenized_tr.keys()} - - for lang in l_tokenized_tr.keys(): - val_size = int(min(len(l_tokenized_tr[lang]['input_ids']) * val_prop, max_val)) - l_split_tr[lang]['input_ids'], l_split_va[lang]['input_ids'], l_split_tr_target[lang], l_split_val_target[ - lang] = \ - train_test_split(l_tokenized_tr[lang]['input_ids'], l_devel_target[lang], test_size=val_size, - random_state=seed, shuffle=True) - - return l_split_tr, l_split_tr_target, l_split_va, l_split_val_target - - -def main(): - print('Running main ...') - - DATAPATH = opt.dataset - MAX_LEN = 512 - method_name = set_method_name() - logfile = init_logfile(method_name, opt) - - l_devel_raw, l_devel_target, l_test_raw, l_test_target = load_datasets(DATAPATH) - l_tokenized_tr = do_tokenization(l_devel_raw, max_len=MAX_LEN) - - l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, l_devel_target, - val_prop=0.2, max_val=2000, - seed=opt.seed) - - l_tokenized_te = do_tokenization(l_test_raw, max_len=MAX_LEN) - - tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target) - va_dataset = TrainingDataset(l_split_va, l_split_val_target) - te_dataset = TrainingDataset(l_tokenized_te, l_test_target) - - tr_dataloader = DataLoader(tr_dataset, batch_size=4, shuffle=True) - va_dataloader = DataLoader(va_dataset, batch_size=2, shuffle=True) - te_dataloader = DataLoader(te_dataset, batch_size=2, shuffle=False) - - - # Initializing model - nC = tr_dataset.get_nclasses() - model = get_model(nC) - model = model.cuda() - criterion = torch.nn.BCEWithLogitsLoss().cuda() - optim = init_optimizer(model, lr=opt.lr) - lr_scheduler = StepLR(optim, step_size=25, gamma=0.1) - early_stop = EarlyStopping(model, optimizer=optim, patience=opt.patience, - checkpoint=f'/home/andreapdr/funneling_pdr/hug_checkpoint/{method_name}-{get_dataset_name(opt.dataset)}', - is_bert=True) - - # Freezing encoder - # model = freeze_encoder(model) - check_param_grad_status(model) - - # Tensorboard logger - # writer = SummaryWriter('../log/tensorboard_logs/') - - # Training loop - tinit = time() - lang_ids = va_dataset.lang_ids - for epoch in range(1, opt.nepochs + 1): - print('# Start Training ...') - train(model, tr_dataloader, epoch, criterion, optim, method_name, tinit, logfile, writer=None) - lr_scheduler.step() # reduces the learning rate - - # Validation - macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va', writer=None) - early_stop(macrof1, epoch) - if opt.test_each > 0: - if (opt.plotmode and (epoch == 1 or epoch % opt.test_each == 0)) or ( - not opt.plotmode and epoch % opt.test_each == 0 and epoch < opt.nepochs): - test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te', writer=None) - - if early_stop.STOP: - print('[early-stop] STOP') - if not opt.plotmode: - break - - if not opt.plotmode: - print('-' * 80) - print('Training over. 
Performing final evaluation') - - model = early_stop.restore_checkpoint() - model = model.cuda() - - if opt.val_epochs > 0: - print(f'running last {opt.val_epochs} training epochs on the validation set') - for val_epoch in range(1, opt.val_epochs + 1): - train(model, va_dataloader, epoch + val_epoch, criterion, optim, method_name, tinit, logfile, writer=None) - - # final test - print('Training complete: testing') - test(model, te_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'te', writer=None) - - # writer.flush() - # writer.close() - exit('Code Executed!') - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Neural text classification with Word-Class Embeddings - mBert model') - - parser.add_argument('--dataset', type=str, - default='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle', - metavar='datasetpath', help=f'path to the pickled dataset') - parser.add_argument('--nepochs', type=int, default=200, metavar='int', - help='number of epochs (default: 200)') - parser.add_argument('--lr', type=float, default=2e-5, metavar='float', - help='learning rate (default: 2e-5)') - parser.add_argument('--weight_decay', type=float, default=0, metavar='float', - help='weight decay (default: 0)') - parser.add_argument('--patience', type=int, default=10, metavar='int', - help='patience for early-stop (default: 10)') - parser.add_argument('--log-interval', type=int, default=20, metavar='int', - help='how many batches to wait before printing training status') - parser.add_argument('--log-file', type=str, default='../log/log_mBert.csv', metavar='str', - help='path to the log csv file') - parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)') - parser.add_argument('--force', action='store_true', default=False, - help='do not check if this experiment has already been run') - parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str', - help='path to the directory containing checkpoints') - parser.add_argument('--plotmode', action='store_true', default=False, - help='in plot mode executes a long run in order ' - 'to generate enough data to produce trend plots (test-each should be >0. This mode is ' - 'used to produce plots, and does not perform an evaluation on the test set.') - parser.add_argument('--test-each', type=int, default=0, metavar='int', - help='how many epochs to wait before invoking test (default: 0, only at the end)') - parser.add_argument('--val-epochs', type=int, default=1, metavar='int', - help='number of training epochs to perform on the validation set once training is over (default 1)') - opt = parser.parse_args() - - # Testing different parameters ... 
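# (These hard-coded assignments override the argparse defaults declared above for this particular run:
#  weight_decay becomes 0.01 instead of 0, lr becomes 1e-5 instead of 2e-5, and patience becomes 5 instead of 10.)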
- opt.weight_decay = 0.01 - opt.lr = 1e-5 - opt.patience = 5 - - main() - # TODO: refactor .cuda() -> .to(device) in order to check if the process is faster on CPU given the bigger batch size diff --git a/src/experiment_scripts/main_mbert_extractor.py b/src/experiment_scripts/main_mbert_extractor.py deleted file mode 100644 index 16f09d3..0000000 --- a/src/experiment_scripts/main_mbert_extractor.py +++ /dev/null @@ -1,110 +0,0 @@ -from experiment_scripts.main_mbert import * -import pickle - - -class ExtractorDataset(Dataset): - """ - data: dict of lang specific tokenized data - labels: dict of lang specific targets - """ - - def __init__(self, data): - self.langs = data.keys() - self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)} - - for i, lang in enumerate(self.langs): - _data = data[lang]['input_ids'] - _data = np.array(_data) - _lang_value = np.full(len(_data), self.lang_ids[lang]) - - if i == 0: - self.data = _data - self.lang_index = _lang_value - else: - self.data = np.vstack((self.data, _data)) - self.lang_index = np.concatenate((self.lang_index, _lang_value)) - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - x = self.data[idx] - lang = self.lang_index[idx] - - return x, lang - - def get_lang_ids(self): - return self.lang_ids - - -def feature_extractor(data, lang_ids, model_path='/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run0/'): - print('# Feature Extractor Mode...') - from transformers import BertConfig - config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True, num_labels=300) - model = BertForSequenceClassification.from_pretrained(model_path, - config=config).cuda() - - """ - Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for - the output of each layer) of shape (batch_size, sequence_length, hidden_size) - """ - all_batch_embeddings = {} - id2lang = {v:k for k,v in lang_ids.items()} - with torch.no_grad(): - for batch, target, lang_idx in data: - out = model(batch.cuda()) - last_hidden_state = out[1][-1] - batch_embeddings = last_hidden_state[:, 0, :] - for i, l_idx in enumerate(lang_idx.numpy()): - if id2lang[l_idx] not in all_batch_embeddings.keys(): - all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy() - else: - all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]], - batch_embeddings[i].detach().cpu().numpy())) - - return all_batch_embeddings, id2lang - - -def main(): - print('Running main ...') - print(f'Model path: {opt.modelpath}\nDataset path: {opt.dataset}') - DATAPATH = opt.dataset - MAX_LEN = 512 - - l_devel_raw, l_devel_target, l_test_raw, l_test_target = load_datasets(DATAPATH) - l_tokenized_tr = do_tokenization(l_devel_raw, max_len=MAX_LEN) - l_tokenized_te = do_tokenization(l_test_raw, max_len=MAX_LEN) - - tr_dataset = TrainingDataset(l_tokenized_tr, l_devel_target) - tr_lang_ids = tr_dataset.lang_ids - - te_dataset = TrainingDataset(l_tokenized_te, l_test_target) - te_lang_ids = te_dataset.lang_ids - - tr_dataloader = DataLoader(tr_dataset, batch_size=64, shuffle=False) # Shuffle False to extract doc embeddings - te_dataloader = DataLoader(te_dataset, batch_size=64, shuffle=False) # Shuffle False to extract doc - - tr_all_batch_embeddings, id2lang_tr = feature_extractor(tr_dataloader, tr_lang_ids, opt.modelpath) # Extracting doc embed for devel - with open(f'{opt.modelpath}/TR_embed_{get_dataset_name(opt.dataset)}.pkl', 'wb') as outfile: - 
pickle.dump((tr_all_batch_embeddings, id2lang_tr), outfile) - - te_all_batch_embeddings, id2lang_te = feature_extractor(te_dataloader, te_lang_ids, opt.modelpath) # Extracting doc embed for test - with open(f'{opt.modelpath}/TE_embed_{get_dataset_name(opt.dataset)}.pkl', 'wb') as outfile: - pickle.dump((te_all_batch_embeddings, id2lang_te), outfile) - - exit('Extraction completed!') - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='mBert model document embedding extractor') - - parser.add_argument('--dataset', type=str, - default='/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle', - metavar='datasetpath', help=f'path to the pickled dataset') - parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)') - parser.add_argument('--modelpath', type=str, default='/home/andreapdr/funneling_pdr/hug_checkpoint/mBERT-jrc_run0', - metavar='modelpath', help=f'path to pre-trained mBert model') - opt = parser.parse_args() - - main() - diff --git a/src/experiment_scripts/main_qualitative_analysis.py b/src/experiment_scripts/main_qualitative_analysis.py deleted file mode 100644 index aead994..0000000 --- a/src/experiment_scripts/main_qualitative_analysis.py +++ /dev/null @@ -1,49 +0,0 @@ -import os -from dataset_builder import MultilingualDataset -from optparse import OptionParser -from util.file import exists -import numpy as np -from sklearn.feature_extraction.text import CountVectorizer - -parser = OptionParser(usage="usage: %prog datapath [options]") - -(op, args) = parser.parse_args() -assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)' -dataset = args[0] -assert exists(dataset), 'Unable to find file '+str(dataset) - -dataset_file = os.path.basename(dataset) - -data = MultilingualDataset.load(dataset) -data.set_view(languages=['it']) -data.show_dimensions() -lXtr, lytr = data.training() -lXte, lyte = data.test() - -vect_lXtr = dict() -vectorizer = CountVectorizer() -vect_lXtr['it'] = vectorizer.fit_transform(lXtr['it']) -# print(type(vect_lXtr['it'])) - -corr = vect_lXtr['it'].T.dot(lytr['it']) -# print(corr.shape) -sum_correlated_class = corr.sum(axis=0) -print(len(sum_correlated_class)) -print(sum_correlated_class.max()) - - -w2idx = vectorizer.vocabulary_ -idx2w = {v:k for k,v in w2idx.items()} - -word_tot_corr = corr.sum(axis=1) -print(word_tot_corr.shape) -dict_word_tot_corr = {v:k for k,v in enumerate(word_tot_corr)} - -sorted_word_tot_corr = np.sort(word_tot_corr) -sorted_word_tot_corr = sorted_word_tot_corr[len(sorted_word_tot_corr)-200:] - -top_idx = [dict_word_tot_corr[k] for k in sorted_word_tot_corr] -print([idx2w[idx] for idx in top_idx]) -print([elem for elem in top_idx]) -print(corr[8709]) -print('Finished...') \ No newline at end of file diff --git a/src/experiment_scripts/run_combinations_jrc.sh b/src/experiment_scripts/run_combinations_jrc.sh deleted file mode 100644 index a4aabde..0000000 --- a/src/experiment_scripts/run_combinations_jrc.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle -logfile=./results/final_combinations_jrc.csv -#A.2: ensembling feature sets (combinations of posteriors, wce, muse): -# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc... 
-# (no one seems to improve over standard funnelling [the improved version after A.1] with posteriors probabilities...) - -# aggregation=concatenation -#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2 -#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2 -#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2 -#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2 -# - -##FeatureSetToPosteriors (aggregation mean) -python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob -python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob -python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob -python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob - -##FeatureSetToPosteriors -#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2 --allprob -#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2 --allprob -#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2 --allprob -#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob - -#MajorityVoting -#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r -#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r -#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r -#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r - - diff --git a/src/experiment_scripts/run_combinations_rcv.sh b/src/experiment_scripts/run_combinations_rcv.sh deleted file mode 100644 index 4e1acfb..0000000 --- a/src/experiment_scripts/run_combinations_rcv.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash - -dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -logfile=./results/final_combinations_rcv.csv -#A.2: ensembling feature sets (combinations of posteriors, wce, muse): -# - exploring different ways of putting different feature sets together: concatenation, FeatureSetToPosteriors, averaging, voting, etc... -# (no one seems to improve over standard funnelling [the improved version after A.1] with posteriors probabilities...) 
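The mean-aggregation and majority-voting strategies mentioned in the comment above (exercised by the --allprob runs and the commented-out MajorityVoting runs below) can be sketched roughly as follows. This is only an illustrative sketch: it assumes every view generator exposes a predict_proba that returns one posterior matrix per language, and the helper names average_posteriors and majority_vote are invented for the example rather than taken from the repository.

    import numpy as np

    def average_posteriors(views):
        # views: list of dicts {lang: (n_docs, n_classes) array of posterior probabilities}
        langs = views[0].keys()
        return {lang: np.mean([v[lang] for v in views], axis=0) for lang in langs}

    def majority_vote(views, threshold=0.5):
        # binarize each view's posteriors, then keep the classes predicted by at least half of the views
        langs = views[0].keys()
        stacked = {lang: np.stack([(v[lang] > threshold).astype(int) for v in views]) for lang in langs}
        return {lang: (stacked[lang].mean(axis=0) >= 0.5).astype(int) for lang in langs}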
- -# aggregation=concatenation -#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2 -#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2 -#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2 -#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2 -# -##FeatureSetToPosteriors (aggregation mean) -python main_multimodal_cls.py $dataset -o $logfile -P -U -r -a -z --l2 --allprob -python main_multimodal_cls.py $dataset -o $logfile -P -S -r -a -z --l2 --allprob -python main_multimodal_cls.py $dataset -o $logfile -U -S -r -a -z --l2 --allprob -python main_multimodal_cls.py $dataset -o $logfile -P -U -S -r -a -z --l2 --allprob - -##FeatureSetToPosteriors -#python main_gFun.py $dataset -o $logfile -P -U -r -z --l2 --allprob -#python main_gFun.py $dataset -o $logfile -P -S -r -z --l2 --allprob -#python main_gFun.py $dataset -o $logfile -U -S -r -z --l2 --allprob -#python main_gFun.py $dataset -o $logfile -P -U -S -r -z --l2 --allprob - -#MajorityVoting -#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -r -#python main_majorityvoting_cls.py $dataset -o $logfile -P -S -r -#python main_majorityvoting_cls.py $dataset -o $logfile -U -S -r -#python main_majorityvoting_cls.py $dataset -o $logfile -P -U -S -r \ No newline at end of file diff --git a/src/experiment_scripts/run_dl_jrc.sh b/src/experiment_scripts/run_dl_jrc.sh deleted file mode 100644 index 1d28e83..0000000 --- a/src/experiment_scripts/run_dl_jrc.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash - -logfile=../log/log_pre_jrc.csv -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle -python main_deep_learning.py $dataset --log-file $logfile --pretrained --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 128 --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --hidden 256 --tunable --plotmode --test-each 20 - -python main_deep_learning.py $dataset --log-file $logfile --supervised --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --supervised --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 128 --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --supervised --hidden 256 --tunable --plotmode --test-each 20 - -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile 
--pretrained --supervised --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20 - -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 128 --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --log-file $logfile --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20 \ No newline at end of file diff --git a/src/experiment_scripts/run_dl_rcv.sh b/src/experiment_scripts/run_dl_rcv.sh deleted file mode 100644 index 4782887..0000000 --- a/src/experiment_scripts/run_dl_rcv.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash - -dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle -python main_deep_learning.py $dataset --pretrained --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --hidden 128 --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --hidden 256 --tunable --plotmode --test-each 20 - -python main_deep_learning.py $dataset --supervised --plotmode --test-each 20 -python main_deep_learning.py $dataset --supervised --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --supervised --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --supervised --hidden 128 --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --supervised --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --supervised --hidden 256 --tunable --plotmode --test-each 20 - -python main_deep_learning.py $dataset --pretrained --supervised --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --hidden 128 --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --hidden 256 --tunable --plotmode --test-each 20 - -python main_deep_learning.py $dataset --pretrained --supervised --posteriors --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --posteriors --tunable --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 128 --tunable 
--plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --plotmode --test-each 20 -python main_deep_learning.py $dataset --pretrained --supervised --posteriors --hidden 256 --tunable --plotmode --test-each 20 \ No newline at end of file diff --git a/src/experiment_scripts/run_fulljrc_dl.sh b/src/experiment_scripts/run_fulljrc_dl.sh deleted file mode 100644 index 4d5eeaa..0000000 --- a/src/experiment_scripts/run_fulljrc_dl.sh +++ /dev/null @@ -1,16 +0,0 @@ -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle -seeds='5' #2 3 4 5 6 7 8 9 10' -for seed in $seeds -do - #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce.csv --supervised --seed $seed - #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_trainable.csv --supervised --tunable --seed $seed - python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed --force - - #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse.csv --pretrained --seed $seed - #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed - - #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse.csv --supervised --pretrained --seed $seed - #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_wce_muse_trainable40000.csv --supervised --pretrained --tunable --seed $seed - #python main_deep_learning.py $dataset --log-file ../log/jrc_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed --force - -done \ No newline at end of file diff --git a/src/experiment_scripts/run_fullrcv_dl.sh b/src/experiment_scripts/run_fullrcv_dl.sh deleted file mode 100644 index 5894aef..0000000 --- a/src/experiment_scripts/run_fullrcv_dl.sh +++ /dev/null @@ -1,20 +0,0 @@ -dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle -seeds='1 ' #2 3 4 5' # 6 7 8 9 10' -for seed in $seeds -do - #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce.csv --supervised --seed $seed - #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_trainable.csv --supervised --tunable --seed $seed - python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static_plotmode.csv --posteriors --supervised --pretrained --seed $seed --plotmode --test-each 200 - - - - #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse.csv --pretrained --seed $seed - #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_muse_trainable.csv --pretrained --tunable --seed $seed - - #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse.csv --supervised --pretrained --seed $seed - #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_wce_muse_trainable.csv --supervised --pretrained --tunable --seed $seed - -# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_static.csv --posteriors --supervised --pretrained --seed $seed -# python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable_plotmode.csv --posteriors --supervised --pretrained --tunable --seed $seed --plotmode --test-each 200 - #python main_deep_learning.py $dataset --log-file ../log/rcv_fullrun_post_wce_muse_trainable.csv --posteriors --supervised --pretrained --tunable --seed $seed 
-done \ No newline at end of file diff --git a/src/experiment_scripts/run_fun_bert_jrc.sh b/src/experiment_scripts/run_fun_bert_jrc.sh deleted file mode 100644 index fc2e2c3..0000000 --- a/src/experiment_scripts/run_fun_bert_jrc.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash - -#dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run -#logfile=../log/log_FunBert_jrc.csv -# -#runs='0 1 2 3 4' -#for run in $runs -#do -# dataset=$dataset_path$run.pickle -# python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile #--tunable -#done - -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle -logfile=../log/log_FunBert_fulljrc_static.csv - -python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile \ No newline at end of file diff --git a/src/experiment_scripts/run_fun_bert_rcv.sh b/src/experiment_scripts/run_fun_bert_rcv.sh deleted file mode 100644 index e27fe54..0000000 --- a/src/experiment_scripts/run_fun_bert_rcv.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash - -#dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run -#logfile=../log/log_FunBert_rcv_static.csv -# -#runs='0 1 2 3 4' -#for run in $runs -#do -# dataset=$dataset_path$run.pickle -# python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile -#done - -dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle -logfile=../log/log_FunBert_fullrcv_static.csv - -python main_deep_learning.py $dataset --supervised --pretrained --posteriors --mbert --log-file $logfile \ No newline at end of file diff --git a/src/experiment_scripts/run_mbert_jrc.sh b/src/experiment_scripts/run_mbert_jrc.sh deleted file mode 100644 index 08733a4..0000000 --- a/src/experiment_scripts/run_mbert_jrc.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env bash - -#dataset_path=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run -#logfile=../log/log_mBert_jrc_NEW.csv -# -#runs='0 1 2 3 4' -#for run in $runs -#do -# dataset=$dataset_path$run.pickle -# python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50 -#done - -logfile=../log/log_mBert_fulljrc.csv -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle -python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50 \ No newline at end of file diff --git a/src/experiment_scripts/run_mbert_rcv.sh b/src/experiment_scripts/run_mbert_rcv.sh deleted file mode 100644 index 66ffba1..0000000 --- a/src/experiment_scripts/run_mbert_rcv.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env bash - -#dataset_path=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run -#logfile=../log/log_mBert_rcv_NEW.csv -# -#runs='0 1 2 3 4' -#for run in $runs -#do -# dataset=$dataset_path$run.pickle -# python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=50 -#done - -logfile=../log/log_mBert_fullrcv.csv -dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle -python main_mbert.py --dataset $dataset --log-file $logfile --nepochs=30 --patience 3 \ No newline at end of file diff --git a/src/experiment_scripts/run_traditional_jrc.sh b/src/experiment_scripts/run_traditional_jrc.sh deleted file mode 100644 index 460c9e8..0000000 --- a/src/experiment_scripts/run_traditional_jrc.sh +++ /dev/null @@ -1,45 
+0,0 @@ -#!/usr/bin/env bash - -dataset=/home/moreo/CLESA/jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle - -######################################## POSTERIORS - # Posteriors -python main_multimodal_cls.py $dataset -P # + zscore -python main_multimodal_cls.py $dataset -P -z # +l2norm -python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight - - -######################################### WCE - #WCE supervised -python main_multimodal_cls.py $dataset -S # + zscore -python main_multimodal_cls.py $dataset -S -z # +l2norm -python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight -python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA - -python main_multimodal_cls.py $dataset -S -z -p 250 --l2 # +feature weight + pca -python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 # + SIF - -python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight -python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig -python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight ig # + pca -python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight ig - - -python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight pmi -python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi -python main_multimodal_cls.py $dataset -S -z -p 250 --l2 --feat-weight pmi -python main_multimodal_cls.py $dataset -S -z -r -p 250 --l2 --feat-weight pmi - -################################# MUSE - - # MUSE unsupervised -python main_multimodal_cls.py $dataset -U # + zscore -python main_multimodal_cls.py $dataset -U -z # +l2norm -python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight -python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA - -python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca -python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig - -python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi -python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi diff --git a/src/experiment_scripts/run_traditional_rcv.sh b/src/experiment_scripts/run_traditional_rcv.sh deleted file mode 100644 index 0dcfa2c..0000000 --- a/src/experiment_scripts/run_traditional_rcv.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env bash - -dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle - -######################################## POSTERIORS - # Posteriors -python main_multimodal_cls.py $dataset -P # + zscore -python main_multimodal_cls.py $dataset -P -z # +l2norm -python main_multimodal_cls.py $dataset -P -z --l2 # +feature weight - - -######################################### WCE - #WCE supervised -python main_multimodal_cls.py $dataset -S # + zscore -python main_multimodal_cls.py $dataset -S -z # +l2norm -python main_multimodal_cls.py $dataset -S -z --l2 # +feature weight -python main_multimodal_cls.py $dataset -S -z -r --l2 # + SIF - PCA - -python main_multimodal_cls.py $dataset -S -z -p 50 --l2 # +feature weight + pca -python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 # + SIF - -python main_multimodal_cls.py $dataset -S -z --l2 --feat-weight ig # -feature weight -python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight ig -python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight ig # + pca -python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight ig - - -python main_multimodal_cls.py $dataset -S -z --l2 
--feat-weight pmi -python main_multimodal_cls.py $dataset -S -z -r --l2 --feat-weight pmi -python main_multimodal_cls.py $dataset -S -z -p 50 --l2 --feat-weight pmi -python main_multimodal_cls.py $dataset -S -z -r -p 50 --l2 --feat-weight pmi - -################################# MUSE - - # MUSE unsupervised -python main_multimodal_cls.py $dataset -U # + zscore -python main_multimodal_cls.py $dataset -U -z # +l2norm -python main_multimodal_cls.py $dataset -U -z --l2 # +feature weight -python main_multimodal_cls.py $dataset -U -z -r --l2 # + SIF - PCA - -python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight ig # -feature weight + pca -python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight ig - -python main_multimodal_cls.py $dataset -U -z --l2 --feat-weight pmi -python main_multimodal_cls.py $dataset -U -z -r --l2 --feat-weight pmi diff --git a/src/experiment_scripts/time_comparison.sh b/src/experiment_scripts/time_comparison.sh deleted file mode 100644 index 60e1c25..0000000 --- a/src/experiment_scripts/time_comparison.sh +++ /dev/null @@ -1,6 +0,0 @@ -dataset=/home/moreo/CLESA/rcv2/rcv1-2_doclist_full_processed.pickle -seeds='1 2 3 4 5 6 7 8 9 10' -for seed in $seeds -do - python main_deep_learning.py $dataset --log-file ../log/time_GRU.csv --supervised --nepochs 50 --seed $seed - done \ No newline at end of file diff --git a/src/learning/learners.py b/src/learning/learners.py deleted file mode 100644 index 708eaad..0000000 --- a/src/learning/learners.py +++ /dev/null @@ -1,171 +0,0 @@ -import numpy as np -import time -from scipy.sparse import issparse -from sklearn.multiclass import OneVsRestClassifier -from sklearn.model_selection import GridSearchCV -from joblib import Parallel, delayed - - -def _sort_if_sparse(X): - if issparse(X) and not X.has_sorted_indices: - X.sort_indices() - - -def _joblib_transform_multiling(transformer, lX, n_jobs=-1): - if n_jobs == 1: - return {lang:transformer(lX[lang]) for lang in lX.keys()} - else: - langs = list(lX.keys()) - transformations = Parallel(n_jobs=n_jobs)(delayed(transformer)(lX[lang]) for lang in langs) - return {lang: transformations[i] for i, lang in enumerate(langs)} - - -class TrivialRejector: - def fit(self, X, y): - self.cats = y.shape[1] - return self - - def decision_function(self, X): return np.zeros((X.shape[0],self.cats)) - - def predict(self, X): return np.zeros((X.shape[0],self.cats)) - - def predict_proba(self, X): return np.zeros((X.shape[0],self.cats)) - - def best_params(self): return {} - - -class NaivePolylingualClassifier: - """ - Is a mere set of independet MonolingualClassifiers - """ - def __init__(self, base_learner, parameters=None, n_jobs=-1): - self.base_learner = base_learner - self.parameters = parameters - self.model = None - self.n_jobs = n_jobs - - def fit(self, lX, ly): - """ - trains the independent monolingual classifiers - :param lX: a dictionary {language_label: X csr-matrix} - :param ly: a dictionary {language_label: y np.array} - :return: self - """ - tinit = time.time() - assert set(lX.keys()) == set(ly.keys()), 'inconsistent language mappings in fit' - langs = list(lX.keys()) - for lang in langs: - _sort_if_sparse(lX[lang]) - - models = Parallel(n_jobs=self.n_jobs)\ - (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]),ly[lang]) for lang in langs) - - self.model = {lang: models[i] for i, lang in enumerate(langs)} - self.empty_categories = {lang:self.model[lang].empty_categories for lang in langs} - self.time = time.time() - tinit - return 
self - - def decision_function(self, lX): - """ - :param lX: a dictionary {language_label: X csr-matrix} - :return: a dictionary of classification scores for each class - """ - assert self.model is not None, 'predict called before fit' - assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function' - langs=list(lX.keys()) - scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].decision_function)(lX[lang]) for lang in langs) - return {lang:scores[i] for i,lang in enumerate(langs)} - - def predict_proba(self, lX): - """ - :param lX: a dictionary {language_label: X csr-matrix} - :return: a dictionary of probabilities that each document belongs to each class - """ - assert self.model is not None, 'predict called before fit' - assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function' - langs=list(lX.keys()) - scores = Parallel(n_jobs=self.n_jobs, max_nbytes=None)(delayed(self.model[lang].predict_proba)(lX[lang]) for lang in langs) - return {lang:scores[i] for i,lang in enumerate(langs)} - - def predict(self, lX): - """ - :param lX: a dictionary {language_label: X csr-matrix} - :return: a dictionary of predictions - """ - assert self.model is not None, 'predict called before fit' - assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in predict' - if self.n_jobs == 1: - return {lang:self.model[lang].transform(lX[lang]) for lang in lX.keys()} - else: - langs = list(lX.keys()) - scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].predict)(lX[lang]) for lang in langs) - return {lang: scores[i] for i, lang in enumerate(langs)} - - def best_params(self): - return {l:model.best_params() for l,model in self.model.items()} - - -class MonolingualClassifier: - - def __init__(self, base_learner, parameters=None, n_jobs=-1): - self.learner = base_learner - self.parameters = parameters - self.model = None - self.n_jobs = n_jobs - self.best_params_ = None - - def fit(self, X, y): - if X.shape[0] == 0: - print('Warning: X has 0 elements, a trivial rejector will be created') - self.model = TrivialRejector().fit(X,y) - self.empty_categories = np.arange(y.shape[1]) - return self - - tinit = time.time() - _sort_if_sparse(X) - self.empty_categories = np.argwhere(np.sum(y, axis=0)==0).flatten() - - # multi-class format - if len(y.shape) == 2: - if self.parameters is not None: - self.parameters = [{'estimator__' + key: params[key] for key in params.keys()} - for params in self.parameters] - self.model = OneVsRestClassifier(self.learner, n_jobs=self.n_jobs) - else: - self.model = self.learner - raise NotImplementedError('not working as a base-classifier for funneling if there are gaps in ' - 'the labels across languages') - - # parameter optimization? 
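# (When a parameter grid is supplied, the model built above is wrapped in a 5-fold GridSearchCV below;
#  in the multilabel branch the grid keys were rewritten with the 'estimator__' prefix, which is how
#  scikit-learn lets the search reach the base learner nested inside OneVsRestClassifier, and
#  refit=True retrains the best configuration on the full training set.)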
- if self.parameters: - print('debug: optimizing parameters:', self.parameters) - self.model = GridSearchCV(self.model, param_grid=self.parameters, refit=True, cv=5, n_jobs=self.n_jobs, - error_score=0, verbose=10) - - # print(f'fitting: {self.model} on matrices of shape X={X.shape} Y={y.shape}') - print(f'fitting: Mono-lingual Classifier on matrices of shape X={X.shape} Y={y.shape}') - self.model.fit(X, y) - if isinstance(self.model, GridSearchCV): - self.best_params_ = self.model.best_params_ - print('best parameters: ', self.best_params_) - self.time = time.time()-tinit - return self - - def decision_function(self, X): - assert self.model is not None, 'predict called before fit' - _sort_if_sparse(X) - return self.model.decision_function(X) - - def predict_proba(self, X): - assert self.model is not None, 'predict called before fit' - assert hasattr(self.model, 'predict_proba'), 'the probability predictions are not enabled in this model' - _sort_if_sparse(X) - return self.model.predict_proba(X) - - def predict(self, X): - assert self.model is not None, 'predict called before fit' - _sort_if_sparse(X) - return self.model.predict(X) - - def best_params(self): - return self.best_params_ \ No newline at end of file diff --git a/src/learning/transformers.py b/src/learning/transformers.py deleted file mode 100644 index 5a76740..0000000 --- a/src/learning/transformers.py +++ /dev/null @@ -1,863 +0,0 @@ -from torch.optim.lr_scheduler import StepLR -from torch.utils.data import DataLoader -from data.tsr_function__ import get_tsr_matrix, get_supervised_matrix, pointwise_mutual_information, information_gain -from embeddings.embeddings import FastTextMUSE -from embeddings.supervised import supervised_embeddings_tfidf, zscores -from learning.learners import NaivePolylingualClassifier, MonolingualClassifier, _joblib_transform_multiling -from sklearn.decomposition import PCA -from scipy.sparse import hstack -from util_transformers.StandardizeTransformer import StandardizeTransformer -from util.SIF_embed import remove_pc -from sklearn.preprocessing import normalize -from scipy.sparse import csr_matrix -from models.mBert import * -from models.lstm_class import * -from util.csv_log import CSVLog -from util.file import get_file_name, create_if_not_exist, exists -from util.early_stop import EarlyStopping -from util.common import * -import pickle -import time - - -# ------------------------------------------------------------------ -# Data Processing -# ------------------------------------------------------------------ - - -class FeatureWeight: - - def __init__(self, weight='tfidf', agg='mean'): - assert weight in ['tfidf', 'pmi', 'ig'] or callable( - weight), 'weight should either be "tfidf" or a callable function' - assert agg in ['mean', 'max'], 'aggregation function should either be "mean" or "max"' - self.weight = weight - self.agg = agg - self.fitted = False - if weight == 'pmi': - self.weight = pointwise_mutual_information - elif weight == 'ig': - self.weight = information_gain - - def fit(self, lX, ly): - if not self.fitted: - if self.weight == 'tfidf': - self.lF = {l: np.ones(X.shape[1]) for l, X in lX.items()} - else: - self.lF = {} - for l in lX.keys(): - X, y = lX[l], ly[l] - - print(f'getting supervised cell-matrix lang {l}') - tsr_matrix = get_tsr_matrix(get_supervised_matrix(X, y), tsr_score_funtion=self.weight) - if self.agg == 'max': - F = tsr_matrix.max(axis=0) - elif self.agg == 'mean': - F = tsr_matrix.mean(axis=0) - self.lF[l] = F - self.fitted = True - return self - - def transform(self, 
lX): - return {lang: csr_matrix.multiply(lX[lang], self.lF[lang]) for lang in lX.keys()} - - def fit_transform(self, lX, ly): - return self.fit(lX, ly).transform(lX) - -# ------------------------------------------------------------------ -# View Generators (aka first-tier learners) -# ------------------------------------------------------------------ - - -class PosteriorProbabilitiesEmbedder: - - def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1, is_training=True, storing_path='../dumps/'): - self.fist_tier_learner = first_tier_learner - self.fist_tier_parameters = first_tier_parameters - self.l2 = l2 - self.n_jobs = n_jobs - self.doc_projector = NaivePolylingualClassifier( - self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs - ) - self.requires_tfidf = True - self.storing_path = storing_path - self.is_training = is_training - - def fit(self, lX, lY, lV=None, called_by_viewgen=False): - # if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'): - # print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results') - # return self - if not called_by_viewgen: - # Avoid printing if method is called by another View Gen (e.g., GRU ViewGen) - print('### Posterior Probabilities View Generator (X)') - print('fitting the projectors... {}'.format(lX.keys())) - self.doc_projector.fit(lX, lY) - return self - - def transform(self, lX): - # if dir exist, load and return already computed results - # _endpoint = 'tr' if self.is_training else 'te' - # _actual_path = self.storing_path + '/' + _endpoint - # if exists(_actual_path): - # print('NB: loading pre-computed results!') - # with open(_actual_path + '/X.pickle', 'rb') as infile: - # self.is_training = False - # return pickle.load(infile) - - lZ = self.predict_proba(lX) - lZ = _normalize(lZ, self.l2) - # create dir and dump computed results - # create_if_not_exist(_actual_path) - # with open(_actual_path + '/X.pickle', 'wb') as outfile: - # pickle.dump(lZ, outfile) - self.is_training = False - return lZ - - def fit_transform(self, lX, ly=None, lV=None): - return self.fit(lX, ly).transform(lX) - - def best_params(self): - return self.doc_projector.best_params() - - def predict(self, lX, ly=None): - return self.doc_projector.predict(lX) - - def predict_proba(self, lX, ly=None): - print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} documents') - lZ = self.doc_projector.predict_proba(lX) - return lZ - - -class MuseEmbedder: - - def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight(), sif=False): - self.path = path - self.lV = lV - self.l2 = l2 - self.n_jobs = n_jobs - self.featureweight = featureweight - self.sif = sif - self.requires_tfidf = True - - def fit(self, lX, ly, lV=None): - assert lV is not None or self.lV is not None, 'lV not specified' - print('### MUSE View Generator (M)') - print(f'Loading fastText pretrained vectors for languages {list(lX.keys())}...') - self.langs = sorted(lX.keys()) - self.MUSE = load_muse_embeddings(self.path, self.langs, self.n_jobs) - lWordList = {l: self._get_wordlist_from_word2index(lV[l]) for l in self.langs} - self.MUSE = {l: Muse.extract(lWordList[l]).numpy() for l, Muse in self.MUSE.items()} - self.featureweight.fit(lX, ly) - return self - - def transform(self, lX): - MUSE = self.MUSE - lX = self.featureweight.transform(lX) - XdotMUSE = Parallel(n_jobs=self.n_jobs)( - delayed(XdotM)(lX[lang], MUSE[lang], self.sif) for lang in self.langs) - lMuse 
= {l: XdotMUSE[i] for i, l in enumerate(self.langs)} - lMuse = _normalize(lMuse, self.l2) - return lMuse - - def fit_transform(self, lX, ly, lV): - return self.fit(lX, ly, lV).transform(lX) - - def _get_wordlist_from_word2index(self, word2index): - return list(zip(*sorted(word2index.items(), key=lambda x: x[1])))[0] - - def _get_output_dim(self): - return self.MUSE['da'].shape[1] - - -class WordClassEmbedder: - - def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight(), sif=False): - self.n_jobs = n_jobs - self.l2 = l2 - self.max_label_space = max_label_space - self.featureweight = featureweight - self.sif = sif - self.requires_tfidf = True - - def fit(self, lX, ly, lV=None): - print('### WCE View Generator (M)') - print('Computing supervised embeddings...') - self.langs = sorted(lX.keys()) - WCE = Parallel(n_jobs=self.n_jobs)( - delayed(word_class_embedding_matrix)(lX[lang], ly[lang], self.max_label_space) for lang in self.langs - ) - self.lWCE = {l: WCE[i] for i, l in enumerate(self.langs)} - self.featureweight.fit(lX, ly) - return self - - def transform(self, lX): - lWCE = self.lWCE - lX = self.featureweight.transform(lX) - XdotWCE = Parallel(n_jobs=self.n_jobs)( - delayed(XdotM)(lX[lang], lWCE[lang], self.sif) for lang in self.langs - ) - lwce = {l: XdotWCE[i] for i, l in enumerate(self.langs)} - lwce = _normalize(lwce, self.l2) - return lwce - - def fit_transform(self, lX, ly, lV=None): - return self.fit(lX, ly).transform(lX) - - def _get_output_dim(self): - return 73 # TODO ! - - -class MBertEmbedder: - - def __init__(self, doc_embed_path=None, patience=10, checkpoint_dir='../hug_checkpoint/', path_to_model=None, - nC=None, avoid_loading=False): - self.doc_embed_path = doc_embed_path - self.patience = patience - self.checkpoint_dir = checkpoint_dir - self.fitted = False - self.requires_tfidf = False - self.avoid_loading = avoid_loading - if path_to_model is None: - self.model = None - else: - config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True, - num_labels=nC) - if self.avoid_loading: - self.model = None - else: - self.model = BertForSequenceClassification.from_pretrained(path_to_model, config=config).cuda() # TODO: setting model to None in order to avoid loading it onto gpu if we have already pre-computed results! 
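# (When path_to_model is supplied, the embedder is flagged as already fitted just below, so the
#  fine-tuning branch of fit() is skipped and transform() can be called directly on the loaded checkpoint.)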
- self.fitted = True - - def fit(self, lX, ly, lV=None, seed=0, nepochs=200, lr=1e-5, val_epochs=1): - print('### mBERT View Generator (B)') - if self.fitted is True: - print('Bert model already fitted!') - return self - - print('Fine-tune mBert on the given dataset.') - l_tokenized_tr = do_tokenization(lX, max_len=512) - l_split_tr, l_split_tr_target, l_split_va, l_split_val_target = get_tr_val_split(l_tokenized_tr, ly, - val_prop=0.2, max_val=2000, - seed=seed) - - tr_dataset = TrainingDataset(l_split_tr, l_split_tr_target) - va_dataset = TrainingDataset(l_split_va, l_split_val_target) - tr_dataloader = DataLoader(tr_dataset, batch_size=64, shuffle=True) - va_dataloader = DataLoader(va_dataset, batch_size=64, shuffle=True) - - nC = tr_dataset.get_nclasses() - model = get_model(nC) - model = model.cuda() - criterion = torch.nn.BCEWithLogitsLoss().cuda() - optim = init_optimizer(model, lr=lr, weight_decay=0.01) - lr_scheduler = StepLR(optim, step_size=25, gamma=0.1) - early_stop = EarlyStopping(model, optimizer=optim, patience=self.patience, - checkpoint=self.checkpoint_dir, - is_bert=True) - - # Training loop - logfile = '../log/log_mBert_extractor.csv' - method_name = 'mBert_feature_extractor' - - tinit = time() - lang_ids = va_dataset.lang_ids - for epoch in range(1, nepochs + 1): - print('# Start Training ...') - train(model, tr_dataloader, epoch, criterion, optim, method_name, tinit, logfile) - lr_scheduler.step() # reduces the learning rate # TODO arg epoch? - - # Validation - macrof1 = test(model, va_dataloader, lang_ids, tinit, epoch, logfile, criterion, 'va') - early_stop(macrof1, epoch) - - if early_stop.STOP: - print('[early-stop] STOP') - break - - model = early_stop.restore_checkpoint() - self.model = model.cuda() - - if val_epochs > 0: - print(f'running last {val_epochs} training epochs on the validation set') - for val_epoch in range(1, val_epochs + 1): - train(self.model, va_dataloader, epoch + val_epoch, criterion, optim, method_name, tinit, logfile) - - self.fitted = True - return self - - def transform(self, lX): - assert self.fitted is True, 'Calling transform without any initialized model! - call init first or on init' \ - 'pass the "path_to_model" arg.' 
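# (The feature_extractor imported from models.mBert is assumed to behave like the standalone one in
#  main_mbert_extractor.py above: it keeps the [CLS] position of the last hidden layer, so each
#  document is mapped to a single 768-dimensional vector of the fine-tuned mBERT model.)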
- print('Obtaining document embeddings from pretrained mBert ') - l_tokenized_X = do_tokenization(lX, max_len=512, verbose=True) - feat_dataset = ExtractorDataset(l_tokenized_X) - feat_lang_ids = feat_dataset.lang_ids - dataloader = DataLoader(feat_dataset, batch_size=64) - all_batch_embeddings, id2lang = feature_extractor(dataloader, feat_lang_ids, self.model) - return all_batch_embeddings - - def fit_transform(self, lX, ly, lV=None): - return self.fit(lX, ly).transform(lX) - - -class RecurrentEmbedder: - - def __init__(self, pretrained, supervised, multilingual_dataset, options, concat=False, lr=1e-3, - we_path='../embeddings', hidden_size=512, sup_drop=0.5, posteriors=False, patience=10, - test_each=0, checkpoint_dir='../checkpoint', model_path=None, n_jobs=-1): - self.pretrained = pretrained - self.supervised = supervised - self.concat = concat - self.requires_tfidf = False - self.multilingual_dataset = multilingual_dataset - self.model = None - self.we_path = we_path - self.langs = multilingual_dataset.langs() - self.hidden_size = hidden_size - self.sup_drop = sup_drop - self.posteriors = posteriors - self.patience = patience - self.checkpoint_dir = checkpoint_dir - self.test_each = test_each - self.options = options - self.seed = options.seed - self.model_path = model_path - self.n_jobs = n_jobs - self.is_trained = False - - ## INIT MODEL for training - self.lXtr, self.lytr = self.multilingual_dataset.training(target_as_csr=True) - self.lXte, self.lyte = self.multilingual_dataset.test(target_as_csr=True) - self.nC = self.lyte[self.langs[0]].shape[1] - lpretrained, self.lpretrained_vocabulary = self._load_pretrained_embeddings(self.we_path, self.langs) - self.multilingual_index = MultilingualIndex() - self.multilingual_index.index(self.lXtr, self.lytr, self.lXte, self.lpretrained_vocabulary) - self.multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=self.seed) - self.multilingual_index.embedding_matrices(lpretrained, self.supervised) - - if model_path is not None: - self.is_trained = True - self.model = torch.load(model_path) - else: - self.model = self._init_Net() - - self.optim = init_optimizer(self.model, lr=lr) - self.criterion = torch.nn.BCEWithLogitsLoss().cuda() - self.lr_scheduler = StepLR(self.optim, step_size=25, gamma=0.5) - self.early_stop = EarlyStopping(self.model, optimizer=self.optim, patience=self.patience, - checkpoint=f'{self.checkpoint_dir}/gru_viewgen_-{get_file_name(self.options.dataset)}') - - def fit(self, lX, ly, lV=None, batch_size=128, nepochs=200, val_epochs=1): - print('### Gated Recurrent Unit View Generator (G)') - if self.model is None: - print('TODO: Init model!') - if not self.is_trained: - # Batchify input - self.multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=self.seed) - l_train_index, l_train_target = self.multilingual_index.l_train() - l_val_index, l_val_target = self.multilingual_index.l_val() - l_test_index = self.multilingual_index.l_test_index() - batcher_train = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs, - lpad=self.multilingual_index.l_pad()) - batcher_eval = BatchGRU(batch_size, batches_per_epoch=batch_size, languages=self.langs, - lpad=self.multilingual_index.l_pad()) - - # Train loop - print('Start training') - method_name = 'gru_view_generator' - logfile = init_logfile_nn(method_name, self.options) - tinit = time.time() - for epoch in range(1, nepochs + 1): - train_gru(model=self.model, batcher=batcher_train, ltrain_index=l_train_index, lytr=l_train_target, - tinit=tinit, 
logfile=logfile, criterion=self.criterion, optim=self.optim, - epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None, - ltrain_bert=None) - self.lr_scheduler.step() - - # validation step - macrof1 = test_gru(self.model, batcher_eval, l_val_index, None, None, l_val_target, tinit, epoch, - logfile, self.criterion, 'va') - - self.early_stop(macrof1, epoch) - if self.test_each > 0: - test_gru(self.model, batcher_eval, l_test_index, None, None, self.lyte, tinit, epoch, - logfile, self.criterion, 'te') - - if self.early_stop.STOP: - print('[early-stop] STOP') - print('Restoring best model...') - break - - self.model = self.early_stop.restore_checkpoint() - print(f'running last {val_epochs} training epochs on the validation set') - for val_epoch in range(1, val_epochs+1): - batcher_train.init_offset() - train_gru(model=self.model, batcher=batcher_train, ltrain_index=l_train_index, lytr=l_train_target, - tinit=tinit, logfile=logfile, criterion=self.criterion, optim=self.optim, - epoch=epoch, method_name=method_name, opt=self.options, ltrain_posteriors=None, - ltrain_bert=None) - self.is_trained = True - - return self - - def transform(self, lX, batch_size=64): - lX = self.multilingual_index.get_indexed(lX, self.lpretrained_vocabulary) - lX = self._get_doc_embeddings(lX, batch_size) - return lX - - def fit_transform(self, lX, ly, lV=None): - return self.fit(lX, ly).transform(lX) - - def _get_doc_embeddings(self, lX, batch_size): - assert self.is_trained, 'Model is not trained, cannot call transform before fitting the model!' - print('Generating document embeddings via GRU') - _lX = {} - - l_devel_target = self.multilingual_index.l_devel_target() - - # show_gpu('RNN init at extraction') - for idx, (batch, post, target, lang) in enumerate(batchify(lX, None, l_devel_target, - batch_size, self.multilingual_index.l_pad())): - if lang not in _lX.keys(): - _lX[lang] = self.model.get_embeddings(batch, lang) - else: - _lX[lang] = np.concatenate((_lX[lang], self.model.get_embeddings(batch, lang)), axis=0) - # show_gpu('RNN after batch pred at extraction') - return _lX - - # loads the MUSE embeddings if requested, or returns empty dictionaries otherwise - def _load_pretrained_embeddings(self, we_path, langs): - lpretrained = lpretrained_vocabulary = self._none_dict(langs) - lpretrained = load_muse_embeddings(we_path, langs, n_jobs=self.n_jobs) - lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs} - return lpretrained, lpretrained_vocabulary - - def _none_dict(self, langs): - return {l:None for l in langs} - - # instantiates the net, initializes the model parameters, and sets embeddings trainable if requested - def _init_Net(self, xavier_uniform=True): - model = RNNMultilingualClassifier( - output_size=self.nC, - hidden_size=self.hidden_size, - lvocab_size=self.multilingual_index.l_vocabsize(), - learnable_length=0, - lpretrained=self.multilingual_index.l_embeddings(), - drop_embedding_range=self.multilingual_index.sup_range, - drop_embedding_prop=self.sup_drop, - post_probabilities=self.posteriors - ) - return model.cuda() - - -class DocEmbedderList: - - def __init__(self, *embedder_list, aggregation='concat'): - assert aggregation in {'concat', 'mean'}, 'unknown aggregation mode, valid are "concat" and "mean"' - if len(embedder_list) == 0: - embedder_list = [] - self.embedders = embedder_list - self.aggregation = aggregation - print(f'Aggregation mode: {self.aggregation}') - - def fit(self, lX, ly, lV=None, tfidf=None): - for transformer in self.embedders: - _lX 
= lX - if transformer.requires_tfidf: - _lX = tfidf - transformer.fit(_lX, ly, lV) - return self - - def transform(self, lX, tfidf=None): - if self.aggregation == 'concat': - return self.transform_concat(lX, tfidf) - elif self.aggregation == 'mean': - return self.transform_mean(lX, tfidf) - - def transform_concat(self, lX, tfidf): - if len(self.embedders) == 1: - if self.embedders[0].requires_tfidf: - lX = tfidf - return self.embedders[0].transform(lX) - - some_sparse = False - langs = sorted(lX.keys()) - - lZparts = {l: [] for l in langs} - for transformer in self.embedders: - _lX = lX - if transformer.requires_tfidf: - _lX = tfidf - lZ = transformer.transform(_lX) - for l in langs: - Z = lZ[l] - some_sparse = some_sparse or issparse(Z) - lZparts[l].append(Z) - - hstacker = hstack if some_sparse else np.hstack - return {l: hstacker(lZparts[l]) for l in langs} - - def transform_mean(self, lX, tfidf): - if len(self.embedders) == 1: - if self.embedders[0].requires_tfidf: - lX = tfidf - return self.embedders[0].transform(lX) - - langs = sorted(lX.keys()) - lZparts = {l: None for l in langs} - - for transformer in self.embedders: - _lX = lX - if transformer.requires_tfidf: - _lX = tfidf - lZ = transformer.transform(_lX) - for l in langs: - Z = lZ[l] - if lZparts[l] is None: - lZparts[l] = Z - else: - lZparts[l] += Z - - n_transformers = len(self.embedders) - - return {l: lZparts[l] / n_transformers for l in langs} - - def fit_transform(self, lX, ly, lV=None, tfidf=None): - return self.fit(lX, ly, lV, tfidf).transform(lX, tfidf) - - def best_params(self): - return {'todo'} - - def append(self, embedder): - self.embedders.append(embedder) - - -class FeatureSet2Posteriors: - def __init__(self, transformer, method_id, requires_tfidf=False, l2=True, n_jobs=-1, storing_path='../dumps/'): - self.transformer = transformer - self.l2 = l2 - self.n_jobs = n_jobs - self.prob_classifier = MetaClassifier( - SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs) - self.requires_tfidf = requires_tfidf - - self.storing_path = storing_path - self.is_training = True - self.method_id = method_id - - def fit(self, lX, ly, lV=None): - if exists(self.storing_path + '/tr') or exists(self.storing_path + '/te'): - print(f'NB: Avoid fitting {self.storing_path.split("/")[2]} since we have already pre-computed results') - return self - - if lV is None and hasattr(self.transformer, 'lV'): - lV = self.transformer.lV - lZ = self.transformer.fit_transform(lX, ly, lV) - self.prob_classifier.fit(lZ, ly) - return self - - def transform(self, lX): - # if dir exist, load and return already computed results - # _endpoint = 'tr' if self.is_training else 'te' - # _actual_path = self.storing_path + '/' + _endpoint - # if exists(_actual_path): - # print('NB: loading pre-computed results!') - # with open(_actual_path + '/' + self.method_id + '.pickle', 'rb') as infile: - # self.is_training = False - # return pickle.load(infile) - - lP = self.predict_proba(lX) - lP = _normalize(lP, self.l2) - # create dir and dump computed results - # create_if_not_exist(_actual_path) - # with open(_actual_path + '/' + self.method_id + '.pickle', 'wb') as outfile: - # pickle.dump(lP, outfile) - self.is_training = False - return lP - - def fit_transform(self, lX, ly, lV): - return self.fit(lX, ly, lV).transform(lX) - - def predict(self, lX, ly=None): - lZ = self.transformer.transform(lX) - return self.prob_classifier.predict(lZ) - - def predict_proba(self, lX, ly=None): - lZ = self.transformer.transform(lX) - 
return self.prob_classifier.predict_proba(lZ) - - -# ------------------------------------------------------------------ -# Meta-Classifier (aka second-tier learner) -# ------------------------------------------------------------------ -class MetaClassifier: - - def __init__(self, meta_learner, meta_parameters=None, n_jobs=-1, standardize_range=None): - self.n_jobs = n_jobs - self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs) - self.standardize_range = standardize_range - - def fit(self, lZ, ly): - tinit = time.time() - Z, y = self.stack(lZ, ly) - - self.standardizer = StandardizeTransformer(range=self.standardize_range) - Z = self.standardizer.fit_transform(Z) - - print('fitting the Z-space of shape={}'.format(Z.shape)) - self.model.fit(Z, y) - self.time = time.time() - tinit - - def stack(self, lZ, ly=None): - langs = list(lZ.keys()) - Z = np.vstack([lZ[lang] for lang in langs]) # Z is the language independent space - if ly is not None: - y = np.vstack([ly[lang] for lang in langs]) - return Z, y - else: - return Z - - def predict(self, lZ, ly=None): - lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs) - return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs) - - def predict_proba(self, lZ, ly=None): - lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs) - return _joblib_transform_multiling(self.model.predict_proba, lZ, n_jobs=self.n_jobs) - - def best_params(self): - return self.model.best_params() - - -# ------------------------------------------------------------------ -# Ensembling (aka Funnelling) -# ------------------------------------------------------------------ -class Funnelling: - def __init__(self, - vectorizer: TfidfVectorizerMultilingual, - first_tier: DocEmbedderList, - meta: MetaClassifier): - self.vectorizer = vectorizer - self.first_tier = first_tier - self.meta = meta - self.n_jobs = meta.n_jobs - - def fit(self, lX, ly, target_lang=None): - if target_lang is not None: - LX = lX.copy() - LX.update(target_lang) - self.vectorizer.fit(LX) - tfidf_lX = self.vectorizer.transform(lX) - else: - tfidf_lX = self.vectorizer.fit_transform(lX, ly) - lV = self.vectorizer.vocabulary() - print('## Fitting first-tier learners!') - lZ = self.first_tier.fit_transform(lX, ly, lV, tfidf=tfidf_lX) - print('## Fitting meta-learner!') - self.meta.fit(lZ, ly) - - def predict(self, lX, ly=None): - tfidf_lX = self.vectorizer.transform(lX) - lZ = self.first_tier.transform(lX, tfidf=tfidf_lX) - ly_ = self.meta.predict(lZ) - return ly_ - - def best_params(self): - return {'1st-tier': self.first_tier.best_params(), - 'meta': self.meta.best_params()} - - -class Voting: - def __init__(self, *prob_classifiers): - assert all([hasattr(p, 'predict_proba') for p in prob_classifiers]), 'not all classifiers are probabilistic' - self.prob_classifiers = prob_classifiers - - def fit(self, lX, ly, lV=None): - for classifier in self.prob_classifiers: - classifier.fit(lX, ly, lV) - - def predict(self, lX, ly=None): - lP = {l: [] for l in lX.keys()} - for classifier in self.prob_classifiers: - lPi = classifier.predict_proba(lX) - for l in lX.keys(): - lP[l].append(lPi[l]) - - lP = {l: np.stack(Plist).mean(axis=0) for l, Plist in lP.items()} - ly = {l: P > 0.5 for l, P in lP.items()} - - return ly - - -# ------------------------------------------------------------------------------ -# HELPERS -# ------------------------------------------------------------------------------ - 
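The Funnelling class above wires the two tiers together: language-specific first-tier learners map documents into a shared space (here, posterior probabilities over the classes), MetaClassifier.stack vertically stacks those language-independent representations, and a single meta-learner is trained on the stacked space. A minimal self-contained sketch of that flow on toy single-label data (the real pipeline is multilabel and uses the project's own classes):

import numpy as np
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV

rng = np.random.default_rng(0)
# toy per-language feature spaces with different dimensionalities, same 3 classes
lX = {'en': rng.random((30, 50)), 'it': rng.random((30, 40))}
ly = {l: rng.integers(0, 3, 30) for l in lX}

# first tier: one calibrated SVM per language, producing posterior probabilities
first_tier = {l: CalibratedClassifierCV(SVC(kernel='linear')).fit(lX[l], ly[l]) for l in lX}
lZ = {l: first_tier[l].predict_proba(lX[l]) for l in lX}  # each is (n_docs, n_classes)

# meta level: stack the language-independent Z spaces and fit a single classifier
Z = np.vstack([lZ[l] for l in sorted(lZ)])
y = np.concatenate([ly[l] for l in sorted(lZ)])
meta = SVC(kernel='rbf', gamma='auto').fit(Z, y)

# a new Italian document goes through its own first tier, then the shared meta-classifier
z_new = first_tier['it'].predict_proba(rng.random((1, 40)))
print(meta.predict(z_new))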
-def load_muse_embeddings(we_path, langs, n_jobs=-1): - MUSE = Parallel(n_jobs=n_jobs)( - delayed(FastTextMUSE)(we_path, lang) for lang in langs - ) - return {l: MUSE[i] for i, l in enumerate(langs)} - - -def word_class_embedding_matrix(X, Y, max_label_space=300): - WCE = supervised_embeddings_tfidf(X, Y) - WCE = zscores(WCE, axis=0) - - nC = Y.shape[1] - if nC > max_label_space: - print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. ' - f'Applying PCA(n_components={max_label_space})') - pca = PCA(n_components=max_label_space) - WCE = pca.fit(WCE).transform(WCE) - - return WCE - - -def XdotM(X, M, sif): - E = X.dot(M) - if sif: - # print("removing pc...") - E = remove_pc(E, npc=1) - return E - - -def _normalize(lX, l2=True): - return {l: normalize(X) for l, X in lX.items()} if l2 else lX - - -class BatchGRU: - def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500): - self.batchsize = batchsize - self.batches_per_epoch = batches_per_epoch - self.languages = languages - self.lpad = lpad - self.max_pad_length = max_pad_length - self.init_offset() - - def init_offset(self): - self.offset = {lang: 0 for lang in self.languages} - - def batchify(self, l_index, l_post, l_bert, llabels, extractor=False): - langs = self.languages - l_num_samples = {l: len(l_index[l]) for l in langs} - - max_samples = max(l_num_samples.values()) - n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0) - if self.batches_per_epoch != -1 and self.batches_per_epoch < n_batches: - n_batches = self.batches_per_epoch - - for b in range(n_batches): - for lang in langs: - index, labels = l_index[lang], llabels[lang] - offset = self.offset[lang] - if offset >= l_num_samples[lang]: - offset = 0 - limit = offset+self.batchsize - - batch_slice = slice(offset, limit) - batch = index[batch_slice] - batch_labels = labels[batch_slice].toarray() - - post = None - bert_emb = None - - batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length) - batch = torch.LongTensor(batch).cuda() - target = torch.FloatTensor(batch_labels).cuda() - - self.offset[lang] = limit - - yield batch, post, bert_emb, target, lang - - -def pad(index_list, pad_index, max_pad_length=None): - pad_length = np.max([len(index) for index in index_list]) - if max_pad_length is not None: - pad_length = min(pad_length, max_pad_length) - for i,indexes in enumerate(index_list): - index_list[i] = [pad_index]*(pad_length-len(indexes)) + indexes[:pad_length] - return index_list - - -def train_gru(model, batcher, ltrain_index, lytr, tinit, logfile, criterion, optim, epoch, method_name, opt, - ltrain_posteriors=None, ltrain_bert=None, log_interval=10): - _dataset_path = opt.dataset.split('/')[-1].split('_') - dataset_id = _dataset_path[0] + _dataset_path[-1] - - # show_gpu('RNN init pre-training') - loss_history = [] - model.train() - for idx, (batch, post, bert_emb, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, ltrain_bert, lytr)): - optim.zero_grad() - loss = criterion(model(batch, post, bert_emb, lang), target) - loss.backward() - clip_gradient(model) - optim.step() - loss_history.append(loss.item()) - # show_gpu('RNN after batch prediction') - - if idx % log_interval == 0: - interval_loss = np.mean(loss_history[-log_interval:]) - print(f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, ' - f'Training Loss: {interval_loss:.6f}') - - mean_loss = np.mean(interval_loss) - 
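XdotM above averages word vectors via X.dot(M) and, when sif is enabled, removes the projection on the first principal component (the SIF trick; remove_pc/compute_pc are defined in src/util/SIF_embed.py further down in this patch). A minimal sketch of that removal step on a toy embedding matrix, mirroring the npc=1 case:

import numpy as np
from sklearn.decomposition import TruncatedSVD

rng = np.random.default_rng(0)
E = rng.random((100, 300))  # toy document embeddings, e.g. the result of X.dot(MUSE)

# fit the first principal component and subtract each row's projection on it
svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0).fit(E)
pc = svd.components_                 # shape (1, 300)
E_sif = E - E.dot(pc.T) * pc         # (100,1) * (1,300) broadcasts to (100,300)
print(E_sif.shape)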
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time.time() - tinit) - return mean_loss - - -def test_gru(model, batcher, ltest_index, ltest_posteriors, lte_bert, lyte, tinit, epoch, logfile, criterion, measure_prefix): - loss_history = [] - model.eval() - langs = sorted(ltest_index.keys()) - predictions = {l: [] for l in langs} - yte_stacked = {l: [] for l in langs} - batcher.init_offset() - for batch, post, bert_emb, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lte_bert, lyte), - desc='evaluation: '): - logits = model(batch, post, bert_emb, lang) - loss = criterion(logits, target).item() - prediction = predict(logits) - predictions[lang].append(prediction) - yte_stacked[lang].append(target.detach().cpu().numpy()) - loss_history.append(loss) - - ly = {l:np.vstack(yte_stacked[l]) for l in langs} - ly_ = {l:np.vstack(predictions[l]) for l in langs} - l_eval = evaluate(ly, ly_) - metrics = [] - for lang in langs: - macrof1, microf1, macrok, microk = l_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - if measure_prefix == 'te': - print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') - Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0) - print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]') - - mean_loss = np.mean(loss_history) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time.time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time.time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time.time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time.time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time.time() - tinit) - - return Mf1 - - -def clip_gradient(model, clip_value=1e-1): - params = list(filter(lambda p: p.grad is not None, model.parameters())) - for p in params: - p.grad.data.clamp_(-clip_value, clip_value) - - -def init_logfile_nn(method_name, opt): - import os - logfile = CSVLog(opt.logfile_gru, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse']) - logfile.set_default('dataset', opt.dataset) - logfile.set_default('run', opt.seed) - logfile.set_default('method', get_method_name(os.path.basename(opt.dataset), opt.posteriors, opt.supervised, opt.pretrained, opt.mbert, - opt.gruViewGenerator, opt.gruMUSE, opt.gruWCE, opt.agg, opt.allprob)) - assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} ' \ - f'and run {opt.seed} already calculated' - return logfile diff --git a/src/main_gFun.py b/src/main_gFun.py deleted file mode 100644 index 8694087..0000000 --- a/src/main_gFun.py +++ /dev/null @@ -1,166 +0,0 @@ -import os -from dataset_builder import MultilingualDataset -from learning.transformers import * -from util.evaluation import * -from util.file import exists -from util.results import PolylingualClassificationResults -from util.common import * -from util.parser_options import * - -if __name__ == '__main__': - (op, args) = parser.parse_args() - dataset = op.dataset - assert exists(dataset), 'Unable to find file '+str(dataset) - assert not (op.set_c != 1. 
and op.optimc), 'Parameter C cannot be defined along with optim_c option' - assert op.posteriors or op.supervised or op.pretrained or op.mbert or op.gruViewGenerator, \ - 'empty set of document embeddings is not allowed' - if op.gruViewGenerator: - assert op.gruWCE or op.gruMUSE, 'Initializing Gated Recurrent embedding layer without ' \ - 'explicit initialization of GRU View Generator' - - l2 = op.l2 - dataset_file = os.path.basename(dataset) - results = PolylingualClassificationResults('../log/' + op.output) - allprob = 'Prob' if op.allprob else '' - - method_name, dataset_name = get_method_name(dataset, op.posteriors, op.supervised, op.pretrained, op.mbert, - op.gruViewGenerator, op.gruMUSE, op.gruWCE, op.agg, op.allprob) - - print(f'Method: gFun{method_name}\nDataset: {dataset_name}') - print('-'*50) - - n_jobs = -1 # TODO SETTING n_JOBS - - standardize_range = slice(0, 0) - if op.zscore: - standardize_range = None - - # load dataset - data = MultilingualDataset.load(dataset) - # data.set_view(languages=['it']) # TODO: DEBUG SETTING - data.show_dimensions() - lXtr, lytr = data.training() - lXte, lyte = data.test() - - # text preprocessing - tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) - - # feature weighting (for word embeddings average) - feat_weighting = FeatureWeight(op.feat_weight, agg='mean') - - # document embedding modules aka View Generators - doc_embedder = DocEmbedderList(aggregation='mean' if op.agg else 'concat') - - # init View Generators - if op.posteriors: - """ - View Generator (-X): cast document representations encoded via TFIDF into posterior probabilities by means - of a set of SVM. - """ - # Check if we already have VG outputs from previous runs - VG_name = 'X' - storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' - exist = exists(storing_path) - doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True, - kernel='linear', - C=op.set_c), - l2=l2, storing_path=storing_path, n_jobs=n_jobs)) - - if op.supervised: - """ - View Generator (-W): generates document representation via Word-Class-Embeddings. - Document embeddings are obtained via weighted sum of document's constituent embeddings. - """ - VG_name = 'W' - storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' - exist = exists(storing_path) - wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting, - sif=op.sif, n_jobs=n_jobs) - if op.allprob: - wce = FeatureSet2Posteriors(wce, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path, - n_jobs=n_jobs) - doc_embedder.append(wce) - - if op.pretrained: - """ - View Generator (-M): generates document representation via MUSE embeddings (Fasttext multilingual word - embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings. - """ - VG_name = 'M' - storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' - exist = exists(storing_path) - muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting, sif=op.sif, n_jobs=n_jobs) - if op.allprob: - muse = FeatureSet2Posteriors(muse, method_id=VG_name, requires_tfidf=True, l2=l2, storing_path=storing_path, - n_jobs=n_jobs) - doc_embedder.append(muse) - - if op.gruViewGenerator: - """ - View Generator (-G): generates document embedding by means of a Gated Recurrent Units. The model can be - initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). - Output dimension is (n_docs, 512). 
If --allprob output will be casted to posterior prob space via SVM. - """ - VG_name = 'G' - VG_name += '_muse' if op.gruMUSE else '' - VG_name += '_wce' if op.gruWCE else '' - storing_path = 'Nope' # f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' - rnn_embedder = RecurrentEmbedder(pretrained=op.gruMUSE, supervised=op.gruWCE, multilingual_dataset=data, - options=op, model_path=None, n_jobs=n_jobs) - if op.allprob: - rnn_embedder = FeatureSet2Posteriors(rnn_embedder, method_id=VG_name, requires_tfidf=False, - storing_path=storing_path, n_jobs=n_jobs) - doc_embedder.append(rnn_embedder) - - if op.mbert: - """ - View generator (-B): generates document embedding via mBERT model. - """ - VG_name = 'B' - storing_path = f'../dumps/{VG_name}/{dataset_name.split(".")[0]}' - avoid_loading = False if op.avoid_loading else True # TODO research setting (set to false mBert will be loaded into gpu to get doc emebds (aka, only the first time for each run)) - - mbert = MBertEmbedder(path_to_model=op.bert_path, nC=data.num_categories(), avoid_loading=avoid_loading) - if op.allprob: - mbert = FeatureSet2Posteriors(mbert, method_id=VG_name, l2=l2, storing_path=storing_path) - doc_embedder.append(mbert) - - # metaclassifier - meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}] - meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf', C=op.set_c), - meta_parameters=get_params(op.optimc), standardize_range=standardize_range, n_jobs=n_jobs) - - # ensembling the modules - classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta) - - print('\n# Fitting Funnelling Architecture...') - tinit = time.time() - classifier.fit(lXtr, lytr) - time = time.time()-tinit - - print('\n# Evaluating ...') - l_eval = evaluate_method(classifier, lXte, lyte) - - metrics = [] - for lang in lXte.keys(): - macrof1, microf1, macrok, microk = l_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') - results.add_row(method='MultiModal', - learner='SVM', - optimp=op.optimc, - sif=op.sif, - zscore=op.zscore, - l2=op.l2, - wescaler=op.feat_weight, - pca=op.max_labels_S, - id=method_name, - dataset=dataset_name, - time=time, - lang=lang, - macrof1=macrof1, - microf1=microf1, - macrok=macrok, - microk=microk, - notes='') - print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3)) diff --git a/src/models/cnn_class_bu.py b/src/models/cnn_class_bu.py deleted file mode 100644 index a47d5fc..0000000 --- a/src/models/cnn_class_bu.py +++ /dev/null @@ -1,42 +0,0 @@ -import torch.nn as nn -from torch.nn import functional as F -import torch - -class CNN_pdr(nn.Module): - - def __init__(self, output_size, out_channels, compositional_dim, vocab_size, emb_dim, embeddings=None, drop_embedding_range=None, - drop_embedding_prop=0, drop_prob=0.5): - super(CNN_pdr, self).__init__() - self.vocab_size = vocab_size - self.emb_dim = emb_dim - self.embeddings = torch.FloatTensor(embeddings) - self.embedding_layer = nn.Embedding(vocab_size, emb_dim, _weight=self.embeddings) - self.kernel_heights = kernel_heights=[3,5,7] - self.stride = 1 - self.padding = 0 - self.drop_embedding_range = drop_embedding_range - self.drop_embedding_prop = drop_embedding_prop - assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range' - self.nC = 73 - - self.conv1 = nn.Conv2d(1, compositional_dim, (self.kernel_heights[0], self.emb_dim), self.stride, self.padding) - self.dropout = 
nn.Dropout(drop_prob) - self.label = nn.Linear(len(kernel_heights) * out_channels, output_size) - self.fC = nn.Linear(compositional_dim + self.nC, self.nC) - - - def forward(self, x, svm_output): - x = torch.LongTensor(x) - svm_output = torch.FloatTensor(svm_output) - x = self.embedding_layer(x) - x = self.conv1(x.unsqueeze(1)) - x = F.relu(x.squeeze(3)) - x = F.max_pool1d(x, x.size()[2]).squeeze(2) - x = torch.cat((x, svm_output), 1) - x = F.sigmoid(self.fC(x)) - return x #.detach().numpy() - - # logits = self.label(x) - # return logits - - diff --git a/src/models/helpers.py b/src/models/helpers.py deleted file mode 100755 index 93e5805..0000000 --- a/src/models/helpers.py +++ /dev/null @@ -1,47 +0,0 @@ -import torch -import torch.nn as nn -from torch.nn import functional as F - - - -def init_embeddings(pretrained, vocab_size, learnable_length, device='cuda'): - pretrained_embeddings = None - pretrained_length = 0 - if pretrained is not None: - pretrained_length = pretrained.shape[1] - assert pretrained.shape[0] == vocab_size, 'pre-trained matrix does not match with the vocabulary size' - pretrained_embeddings = nn.Embedding(vocab_size, pretrained_length) - pretrained_embeddings.weight = nn.Parameter(pretrained, requires_grad=False) - # pretrained_embeddings.to(device) - - learnable_embeddings = None - if learnable_length > 0: - learnable_embeddings = nn.Embedding(vocab_size, learnable_length) - # learnable_embeddings.to(device) - - embedding_length = learnable_length + pretrained_length - assert embedding_length > 0, '0-size embeddings' - - return pretrained_embeddings, learnable_embeddings, embedding_length - - -def embed(model, input, lang): - input_list = [] - if model.lpretrained_embeddings[lang]: - input_list.append(model.lpretrained_embeddings[lang](input)) - if model.llearnable_embeddings[lang]: - input_list.append(model.llearnable_embeddings[lang](input)) - return torch.cat(tensors=input_list, dim=2) - - -def embedding_dropout(input, drop_range, p_drop=0.5, training=True): - if p_drop > 0 and training and drop_range is not None: - p = p_drop - drop_from, drop_to = drop_range - m = drop_to - drop_from #length of the supervised embedding - l = input.shape[2] #total embedding length - corr = (1 - p) - input[:, :, drop_from:drop_to] = corr * F.dropout(input[:, :, drop_from:drop_to], p=p) - input /= (1 - (p * m / l)) - - return input diff --git a/src/models/lstm_class.py b/src/models/lstm_class.py deleted file mode 100755 index 98424f1..0000000 --- a/src/models/lstm_class.py +++ /dev/null @@ -1,114 +0,0 @@ -#taken from https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM.py -import torch -import torch.nn as nn -from torch.autograd import Variable -from models.helpers import * - - -class RNNMultilingualClassifier(nn.Module): - - def __init__(self, output_size, hidden_size, lvocab_size, learnable_length, lpretrained=None, - drop_embedding_range=None, drop_embedding_prop=0, post_probabilities=True, only_post=False, - bert_embeddings=False): - - super(RNNMultilingualClassifier, self).__init__() - self.output_size = output_size - self.hidden_size = hidden_size - self.drop_embedding_range = drop_embedding_range - self.drop_embedding_prop = drop_embedding_prop - self.post_probabilities = post_probabilities - self.bert_embeddings = bert_embeddings - assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range' - - self.lpretrained_embeddings = nn.ModuleDict() - self.llearnable_embeddings = nn.ModuleDict() - self.embedding_length = None - self.langs 
= sorted(lvocab_size.keys()) - self.only_post = only_post - - self.n_layers = 1 - self.n_directions = 1 - - self.dropout = nn.Dropout(0.6) - - lstm_out = 256 - ff1 = 512 - ff2 = 256 - - lpretrained_embeddings = {} - llearnable_embeddings = {} - if only_post==False: - for l in self.langs: - pretrained = lpretrained[l] if lpretrained else None - pretrained_embeddings, learnable_embeddings, embedding_length = init_embeddings( - pretrained, lvocab_size[l], learnable_length - ) - lpretrained_embeddings[l] = pretrained_embeddings - llearnable_embeddings[l] = learnable_embeddings - self.embedding_length = embedding_length - - # self.lstm = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2)) - self.rnn = nn.GRU(self.embedding_length, hidden_size) - self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out) - self.lpretrained_embeddings.update(lpretrained_embeddings) - self.llearnable_embeddings.update(llearnable_embeddings) - - self.linear1 = nn.Linear(lstm_out, ff1) - self.linear2 = nn.Linear(ff1, ff2) - - if only_post: - self.label = nn.Linear(output_size, output_size) - elif post_probabilities and not bert_embeddings: - self.label = nn.Linear(ff2 + output_size, output_size) - elif bert_embeddings and not post_probabilities: - self.label = nn.Linear(ff2 + 768, output_size) - elif post_probabilities and bert_embeddings: - self.label = nn.Linear(ff2 + output_size + 768, output_size) - else: - self.label = nn.Linear(ff2, output_size) - - def forward(self, input, post, bert_embed, lang): - if self.only_post: - doc_embedding = post - else: - doc_embedding = self.transform(input, lang) - if self.post_probabilities: - doc_embedding = torch.cat([doc_embedding, post], dim=1) - if self.bert_embeddings: - doc_embedding = torch.cat([doc_embedding, bert_embed], dim=1) - - logits = self.label(doc_embedding) - return logits - - def transform(self, input, lang): - batch_size = input.shape[0] - input = embed(self, input, lang) - input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, - training=self.training) - input = input.permute(1, 0, 2) - h_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda()) - # c_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda()) - # output, (_, _) = self.lstm(input, (h_0, c_0)) - output, _ = self.rnn(input, h_0) - output = output[-1, :, :] - output = F.relu(self.linear0(output)) - output = self.dropout(F.relu(self.linear1(output))) - output = self.dropout(F.relu(self.linear2(output))) - return output - - def finetune_pretrained(self): - for l in self.langs: - self.lpretrained_embeddings[l].requires_grad = True - self.lpretrained_embeddings[l].weight.requires_grad = True - - def get_embeddings(self, input, lang): - batch_size = input.shape[0] - input = embed(self, input, lang) - input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, - training=self.training) - input = input.permute(1, 0, 2) - h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size).cuda()) - output, _ = self.rnn(input, h_0) - output = output[-1, :, :] - return output.cpu().detach().numpy() - diff --git a/src/models/mBert.py b/src/models/mBert.py deleted file mode 100644 index 56695a6..0000000 --- a/src/models/mBert.py +++ /dev/null @@ -1,247 +0,0 @@ -from copy import deepcopy -import torch -from 
torch.utils.data import Dataset -from transformers import BertForSequenceClassification, BertTokenizer, AdamW, BertConfig -from sklearn.model_selection import train_test_split -from util.evaluation import * -from time import time -from util.common import show_gpu - - -def predict(logits, classification_type='multilabel'): - if classification_type == 'multilabel': - prediction = torch.sigmoid(logits) > 0.5 - elif classification_type == 'singlelabel': - prediction = torch.argmax(logits, dim=1).view(-1, 1) - else: - print('unknown classification type') - - return prediction.detach().cpu().numpy() - - -class TrainingDataset(Dataset): - - def __init__(self, data, labels): - self.langs = data.keys() - self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)} - - for i, lang in enumerate(self.langs): - _data = data[lang]['input_ids'] - _data = np.array(_data) - _labels = labels[lang] - _lang_value = np.full(len(_data), self.lang_ids[lang]) - - if i == 0: - self.data = _data - self.labels = _labels - self.lang_index = _lang_value - else: - self.data = np.vstack((self.data, _data)) - self.labels = np.vstack((self.labels, _labels)) - self.lang_index = np.concatenate((self.lang_index, _lang_value)) - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - x = self.data[idx] - y = self.labels[idx] - lang = self.lang_index[idx] - - return x, torch.tensor(y, dtype=torch.float), lang - - def get_lang_ids(self): - return self.lang_ids - - def get_nclasses(self): - if hasattr(self, 'labels'): - return len(self.labels[0]) - else: - print('Method called before init!') - - -class ExtractorDataset(Dataset): - """ - data: dict of lang specific tokenized data - labels: dict of lang specific targets - """ - - def __init__(self, data): - self.langs = data.keys() - self.lang_ids = {lang: identifier for identifier, lang in enumerate(self.langs)} - - for i, lang in enumerate(self.langs): - _data = data[lang]['input_ids'] - _data = np.array(_data) - _lang_value = np.full(len(_data), self.lang_ids[lang]) - - if i == 0: - self.data = _data - self.lang_index = _lang_value - else: - self.data = np.vstack((self.data, _data)) - self.lang_index = np.concatenate((self.lang_index, _lang_value)) - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - x = self.data[idx] - lang = self.lang_index[idx] - - return x, lang - - def get_lang_ids(self): - return self.lang_ids - - -def get_model(n_out): - print('# Initializing model ...') - model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=n_out) - return model - - -def init_optimizer(model, lr, weight_decay=0): - no_decay = ['bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() - if not any(nd in n for nd in no_decay)], - 'weight_decay': weight_decay}, - {'params': [p for n, p in model.named_parameters() - if any(nd in n for nd in no_decay)], - 'weight_decay': weight_decay} - ] - optimizer = AdamW(optimizer_grouped_parameters, lr=lr) - return optimizer - - -def get_lr(optimizer): - for param_group in optimizer.param_groups: - return param_group['lr'] - - -def get_tr_val_split(l_tokenized_tr, l_devel_target, val_prop, max_val, seed): - l_split_va = deepcopy(l_tokenized_tr) - l_split_val_target = {l: [] for l in l_tokenized_tr.keys()} - l_split_tr = deepcopy(l_tokenized_tr) - l_split_tr_target = {l: [] for l in l_tokenized_tr.keys()} - - for lang in l_tokenized_tr.keys(): - val_size = 
int(min(len(l_tokenized_tr[lang]['input_ids']) * val_prop, max_val)) - l_split_tr[lang]['input_ids'], l_split_va[lang]['input_ids'], l_split_tr_target[lang], l_split_val_target[ - lang] = \ - train_test_split(l_tokenized_tr[lang]['input_ids'], l_devel_target[lang], test_size=val_size, - random_state=seed, shuffle=True) - - return l_split_tr, l_split_tr_target, l_split_va, l_split_val_target - - -def do_tokenization(l_dataset, max_len=512, verbose=True): - if verbose: - print('# Starting Tokenization ...') - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') - langs = l_dataset.keys() - l_tokenized = {} - for lang in langs: - l_tokenized[lang] = tokenizer(l_dataset[lang], - truncation=True, - max_length=max_len, - padding='max_length') - return l_tokenized - - -def train(model, train_dataloader, epoch, criterion, optim, method_name, tinit, logfile, log_interval=10): - # _dataset_path = opt.dataset.split('/')[-1].split('_') - # dataset_id = _dataset_path[0] + _dataset_path[-1] - dataset_id = 'TODO fix this!' # TODO - - loss_history = [] - model.train() - - for idx, (batch, target, lang_idx) in enumerate(train_dataloader): - optim.zero_grad() - out = model(batch.cuda()) - logits = out[0] - loss = criterion(logits, target.cuda()) - loss.backward() - # clip_gradient(model) - optim.step() - loss_history.append(loss.item()) - - if idx % log_interval == 0: - interval_loss = np.mean(loss_history[log_interval:]) - print( - f'{dataset_id} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}') - - mean_loss = np.mean(interval_loss) - logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit) - return mean_loss - - -def test(model, test_dataloader, lang_ids, tinit, epoch, logfile, criterion, measure_prefix): - print('# Validating model ...') - loss_history = [] - model.eval() - langs = lang_ids.keys() - id_2_lang = {v: k for k, v in lang_ids.items()} - predictions = {l: [] for l in langs} - yte_stacked = {l: [] for l in langs} - - for batch, target, lang_idx in test_dataloader: - out = model(batch.cuda()) - logits = out[0] - loss = criterion(logits, target.cuda()).item() - prediction = predict(logits) - loss_history.append(loss) - - # Assigning prediction to dict in predictions and yte_stacked according to lang_idx - for i, pred in enumerate(prediction): - lang_pred = id_2_lang[lang_idx.numpy()[i]] - predictions[lang_pred].append(pred) - yte_stacked[lang_pred].append(target[i].detach().cpu().numpy()) - - ly = {l: np.vstack(yte_stacked[l]) for l in langs} - ly_ = {l: np.vstack(predictions[l]) for l in langs} - l_eval = evaluate(ly, ly_) - metrics = [] - for lang in langs: - macrof1, microf1, macrok, microk = l_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - if measure_prefix == 'te': - print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') - Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0) - print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]') - - mean_loss = np.mean(loss_history) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mF1, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-K', value=MK, timelapse=time() - tinit) - logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-K', value=mk, timelapse=time() - tinit) - 
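The test routine that continues below evaluates each language separately and then averages the metrics across languages. A small sketch of that per-language macro/micro-F1 computation and averaging, with sklearn metrics standing in for util.evaluation.evaluate (which is not part of this patch):

import numpy as np
from sklearn.metrics import f1_score

rng = np.random.default_rng(0)
# toy multilabel targets and predictions for two languages (n_docs x n_classes)
ly  = {l: rng.integers(0, 2, (20, 5)) for l in ('en', 'it')}
ly_ = {l: rng.integers(0, 2, (20, 5)) for l in ('en', 'it')}

per_lang = {l: (f1_score(ly[l], ly_[l], average='macro', zero_division=0),
                f1_score(ly[l], ly_[l], average='micro', zero_division=0))
            for l in ly}
Mf1, mF1 = np.mean(np.array(list(per_lang.values())), axis=0)
print(f'Averages: MF1={Mf1:.5f}, mF1={mF1:.5f}')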
logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=mean_loss, timelapse=time() - tinit) - - return Mf1 - - -def feature_extractor(data, lang_ids, model): - print('# Feature Extractor Mode...') - """ - Hidden State = Tuple of torch.FloatTensor (one for the output of the embeddings + one for - the output of each layer) of shape (batch_size, sequence_length, hidden_size) - """ - # show_gpu('Before Training') - all_batch_embeddings = {} - id2lang = {v: k for k, v in lang_ids.items()} - with torch.no_grad(): - for batch, lang_idx in data: - out = model(batch.cuda()) - # show_gpu('After Batch Prediction') - last_hidden_state = out[1][-1] - batch_embeddings = last_hidden_state[:, 0, :] - for i, l_idx in enumerate(lang_idx.numpy()): - if id2lang[l_idx] not in all_batch_embeddings.keys(): - all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy() - else: - all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]], - batch_embeddings[i].detach().cpu().numpy())) - # show_gpu('After Full Prediction') - return all_batch_embeddings, id2lang diff --git a/src/results/results_manager.py b/src/results/results_manager.py deleted file mode 100644 index 1fe57dd..0000000 --- a/src/results/results_manager.py +++ /dev/null @@ -1,11 +0,0 @@ -import pandas as pd -import numpy as np - -# df = pd.read_csv("/home/andreapdr/funneling_pdr/src/results/final_results.csv", delimiter='\t') -df = pd.read_csv("10run_rcv_final_results.csv", delimiter='\t') -pivot = pd.pivot_table(df, values=['macrof1', 'microf1', 'macrok', 'microk'], index=['method', 'id', 'optimp', 'zscore', 'l2', 'wescaler', 'pca', 'sif'], aggfunc=[np.mean, np.std]) -with pd.option_context('display.max_rows', None): - print(pivot.round(3)) -print('Finished ...') - - diff --git a/src/util/SIF_embed.py b/src/util/SIF_embed.py deleted file mode 100644 index cfe096e..0000000 --- a/src/util/SIF_embed.py +++ /dev/null @@ -1,56 +0,0 @@ -import numpy as np -from sklearn.decomposition import TruncatedSVD - -def get_weighted_average(We, x, w): - """ - Compute the weighted average vectors - :param We: We[i,:] is the vector for word i - :param x: x[i, :] are the indices of the words in sentence i - :param w: w[i, :] are the weights for the words in sentence i - :return: emb[i, :] are the weighted average vector for sentence i - """ - n_samples = x.shape[0] - emb = np.zeros((n_samples, We.shape[1])) - for i in range(n_samples): - emb[i,:] = w[i,:].dot(We[x[i,:],:]) / np.count_nonzero(w[i,:]) - return emb - -def compute_pc(X,npc=1): - """ - Compute the principal components. 
- :param X: X[i,:] is a data point - :param npc: number of principal components to remove - :return: component_[i,:] is the i-th pc - """ - svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0) - svd.fit(X) - return svd.components_ - -def remove_pc(X, npc=1): - """ - Remove the projection on the principal components - :param X: X[i,:] is a data point - :param npc: number of principal components to remove - :return: XX[i, :] is the data point after removing its projection - """ - pc = compute_pc(X, npc) - if npc==1: - XX = X - X.dot(pc.transpose()) * pc - else: - XX = X - X.dot(pc.transpose()).dot(pc) - return XX - - -def SIF_embedding(We, x, w, params): - """ - Compute the scores between pairs of sentences using weighted average + removing the projection on the first principal component - :param We: We[i,:] is the vector for word i - :param x: x[i, :] are the indices of the words in the i-th sentence - :param w: w[i, :] are the weights for the words in the i-th sentence - :param params.rmpc: if >0, remove the projections of the sentence embeddings to their first principal component - :return: emb, emb[i, :] is the embedding for sentence i - """ - emb = get_weighted_average(We, x, w) - if params.rmpc > 0: - emb = remove_pc(emb, params.rmpc) - return emb \ No newline at end of file diff --git a/src/util/common.py b/src/util/common.py deleted file mode 100755 index 48a0525..0000000 --- a/src/util/common.py +++ /dev/null @@ -1,542 +0,0 @@ -import subprocess -import warnings -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.svm import SVC -from sklearn.model_selection import train_test_split -from embeddings.supervised import get_supervised_embeddings -import numpy as np -from tqdm import tqdm -import torch -warnings.filterwarnings("ignore", category=DeprecationWarning) - - -def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary): - """ - Index (i.e., replaces word strings with numerical indexes) a list of string documents - :param data: list of string documents - :param vocab: a fixed mapping [str]->[int] of words to indexes - :param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be retained - because they are anyway contained in a pre-trained embedding set that we know in advance) - :param analyzer: the preprocessor in charge of transforming the document string into a chain of string words - :param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep - :param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words that - are not in the original vocab but that are in the known_words - :return: - """ - indexes=[] - vocabsize = len(vocab) - unk_count = 0 - knw_count = 0 - out_count = 0 - pbar = tqdm(data, desc=f'indexing documents') - for text in pbar: - words = analyzer(text) - index = [] - for word in words: - if word in vocab: - idx = vocab[word] - else: - if word in known_words: - if word not in out_of_vocabulary: - out_of_vocabulary[word] = vocabsize+len(out_of_vocabulary) - idx = out_of_vocabulary[word] - out_count += 1 - else: - idx = unk_index - unk_count += 1 - index.append(idx) - indexes.append(index) - knw_count += len(index) - pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]' - f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]') - return indexes - - -def define_pad_length(index_list): - lengths = [len(index) for index 
in index_list] - return int(np.mean(lengths)+np.std(lengths)) - - -def pad(index_list, pad_index, max_pad_length=None): - pad_length = np.max([len(index) for index in index_list]) - if max_pad_length is not None: - pad_length = min(pad_length, max_pad_length) - for i,indexes in enumerate(index_list): - index_list[i] = [pad_index]*(pad_length-len(indexes)) + indexes[:pad_length] - return index_list - - -class Index: - def __init__(self, devel_raw, devel_target, test_raw, lang): - self.lang = lang - self.devel_raw = devel_raw - self.devel_target = devel_target - self.test_raw = test_raw - - def index(self, pretrained_vocabulary, analyzer, vocabulary): - self.word2index = dict(vocabulary) # word2idx - known_words = set(self.word2index.keys()) - if pretrained_vocabulary is not None: - known_words.update(pretrained_vocabulary) - - self.word2index['UNKTOKEN'] = len(self.word2index) - self.word2index['PADTOKEN'] = len(self.word2index) - self.unk_index = self.word2index['UNKTOKEN'] - self.pad_index = self.word2index['PADTOKEN'] - - # index documents and keep track of test terms outside the development vocabulary that are in Muse (if available) - self.out_of_vocabulary = dict() - self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary) - self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary) - - self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary) - - print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}') - - def train_val_split(self, val_prop, max_val, seed): - devel = self.devel_index - target = self.devel_target - devel_raw = self.devel_raw - - val_size = int(min(len(devel) * val_prop, max_val)) - - self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \ - train_test_split( - devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True - ) - - print(f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}') - - def get_word_list(self): - def extract_word_list(word2index): - return [w for w,i in sorted(word2index.items(), key=lambda x: x[1])] - - word_list = extract_word_list(self.word2index) - word_list += extract_word_list(self.out_of_vocabulary) - return word_list - - def compose_embedding_matrix(self, pretrained, supervised, Xtr=None, Ytr=None): - print(f'[generating embedding matrix for lang {self.lang}]') - - self.wce_range = None - embedding_parts = [] - - if pretrained is not None: - print('\t[pretrained-matrix]') - word_list = self.get_word_list() - muse_embeddings = pretrained.extract(word_list) - embedding_parts.append(muse_embeddings) - del pretrained - - if supervised: - print('\t[supervised-matrix]') - F = get_supervised_embeddings(Xtr, Ytr, reduction=None, method='dotn') - num_missing_rows = self.vocabsize - F.shape[0] - F = np.vstack((F, np.zeros(shape=(num_missing_rows, F.shape[1])))) - F = torch.from_numpy(F).float() - - offset = 0 - if embedding_parts: - offset = embedding_parts[0].shape[1] - self.wce_range = [offset, offset + F.shape[1]] - - embedding_parts.append(F) - - make_dumps = False - if make_dumps: - print(f'Dumping Embedding Matrices ...') - import pickle - with open(f'../dumps/dump_{self.lang}_rcv.pkl', 'wb') as outfile: - pickle.dump((self.lang, embedding_parts, self.word2index), outfile) - with open(f'../dumps/corpus_{self.lang}_rcv.pkl', 'wb') as outfile2: - 
pickle.dump((self.lang, self.devel_raw, self.devel_target), outfile2) - - self.embedding_matrix = torch.cat(embedding_parts, dim=1) - - print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]') - - -def none_dict(langs): - return {l:None for l in langs} - - -class MultilingualIndex: - def __init__(self): #, add_language_trace=False): - self.l_index = {} - self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) - # self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True, max_features=25000) - # self.add_language_trace=add_language_trace} - - def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary): - self.langs = sorted(l_devel_raw.keys()) - - #build the vocabularies - self.l_vectorizer.fit(l_devel_raw) - l_vocabulary = self.l_vectorizer.vocabulary() - l_analyzer = self.l_vectorizer.get_analyzer() - - for l in self.langs: - self.l_index[l] = Index(l_devel_raw[l], l_devel_target[l], l_test_raw[l], l) - self.l_index[l].index(l_pretrained_vocabulary[l], l_analyzer[l], l_vocabulary[l]) - - def get_indexed(self, l_texts, pretrained_vocabulary=None): - assert len(self.l_index) != 0, 'Cannot index data without first index call to multilingual index!' - l_indexed = {} - for l, texts in l_texts.items(): - if l in self.langs: - word2index = self.l_index[l].word2index - known_words = set(word2index.keys()) - if pretrained_vocabulary[l] is not None: - known_words.update(pretrained_vocabulary[l]) - l_indexed[l] = index(texts, - vocab=word2index, - known_words=known_words, - analyzer=self.l_vectorizer.get_analyzer(l), - unk_index=word2index['UNKTOKEN'], - out_of_vocabulary=dict()) - return l_indexed - - def train_val_split(self, val_prop=0.2, max_val=2000, seed=42): - for l,index in self.l_index.items(): - index.train_val_split(val_prop, max_val, seed=seed) - - def embedding_matrices(self, lpretrained, supervised): - lXtr = self.get_lXtr() if supervised else none_dict(self.langs) - lYtr = self.l_train_target() if supervised else none_dict(self.langs) - for l,index in self.l_index.items(): - index.compose_embedding_matrix(lpretrained[l], supervised, lXtr[l], lYtr[l]) - self.sup_range = index.wce_range - - - def bert_embeddings(self, bert_path, max_len=512, batch_size=64, stored_embeddings=False): - show_gpu('GPU memory before initializing mBert model:') - # TODO: load dumped embeddings? 
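compose_embedding_matrix above builds each language's embedding matrix by horizontally concatenating the pretrained (MUSE) block and the supervised (WCE) block, recording the column range occupied by the supervised part (wce_range, later exposed as sup_range and used by embedding_dropout). A toy sketch of that composition with made-up dimensions:

import torch

vocabsize, muse_dim, n_classes = 1000, 300, 73
pretrained_part = torch.rand(vocabsize, muse_dim)    # stand-in for the extracted MUSE rows
supervised_part = torch.rand(vocabsize, n_classes)   # stand-in for the WCE matrix F

offset = pretrained_part.shape[1]
wce_range = [offset, offset + supervised_part.shape[1]]  # columns holding the WCE block

embedding_matrix = torch.cat([pretrained_part, supervised_part], dim=1)
print(embedding_matrix.shape, wce_range)  # torch.Size([1000, 373]) [300, 373]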
- from experiment_scripts.main_mbert_extractor import do_tokenization, ExtractorDataset, DataLoader - from transformers import BertConfig, BertForSequenceClassification - - print('[mBERT] generating mBERT doc embeddings') - lXtr_raw = self.get_raw_lXtr() - lXva_raw = self.get_raw_lXva() - lXte_raw = self.get_raw_lXte() - - print('# Tokenizing datasets') - l_tokenized_tr = do_tokenization(lXtr_raw, max_len=max_len, verbose=False) - tr_dataset = ExtractorDataset(l_tokenized_tr) - tr_lang_ids = tr_dataset.lang_ids - tr_dataloader = DataLoader(tr_dataset, batch_size=batch_size, shuffle=False) - - l_tokenized_va = do_tokenization(lXva_raw, max_len=max_len, verbose=False) - va_dataset = ExtractorDataset(l_tokenized_va) - va_lang_ids = va_dataset.lang_ids - va_dataloader = DataLoader(va_dataset, batch_size=batch_size, shuffle=False) - - l_tokenized_te = do_tokenization(lXte_raw, max_len=max_len, verbose=False) - te_dataset = ExtractorDataset(l_tokenized_te) - te_lang_ids = te_dataset.lang_ids - te_dataloader = DataLoader(te_dataset, batch_size=batch_size, shuffle=False) - - num_labels = self.l_index[self.langs[0]].val_target.shape[1] - config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True, - num_labels=num_labels) - model = BertForSequenceClassification.from_pretrained(bert_path, - config=config).cuda() - print('# Extracting document embeddings') - tr_bert_embeddings, id2lang_tr = self.do_bert_embeddings(model, tr_dataloader, tr_lang_ids, verbose=False) - va_bert_embeddings, id2lang_va = self.do_bert_embeddings(model, va_dataloader, va_lang_ids, verbose=False) - te_bert_embeddings, id2lang_te = self.do_bert_embeddings(model, te_dataloader, te_lang_ids, verbose=False) - - show_gpu('GPU memory before after mBert model:') - # Freeing GPU's memory - import gc - del model, tr_dataloader, va_dataloader, te_dataloader - gc.collect() - torch.cuda.empty_cache() - show_gpu('GPU memory after clearing cache:') - return tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings - - - @staticmethod - def do_bert_embeddings(model, data, lang_ids, verbose=True): - if verbose: - print('# Feature Extractor Mode...') - all_batch_embeddings = {} - id2lang = {v: k for k, v in lang_ids.items()} - with torch.no_grad(): - for batch, lang_idx in data: - out = model(batch.cuda()) - last_hidden_state = out[1][-1] - batch_embeddings = last_hidden_state[:, 0, :] - for i, l_idx in enumerate(lang_idx.numpy()): - if id2lang[l_idx] not in all_batch_embeddings.keys(): - all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy() - else: - all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]], - batch_embeddings[i].detach().cpu().numpy())) - - return all_batch_embeddings, id2lang - - def get_raw_lXtr(self): - lXtr_raw = {k:[] for k in self.langs} - lYtr_raw = {k: [] for k in self.langs} - for lang in self.langs: - lXtr_raw[lang] = self.l_index[lang].train_raw - lYtr_raw[lang] = self.l_index[lang].train_raw - return lXtr_raw - - def get_raw_lXva(self): - lXva_raw = {k: [] for k in self.langs} - for lang in self.langs: - lXva_raw[lang] = self.l_index[lang].val_raw - - return lXva_raw - - def get_raw_lXte(self): - lXte_raw = {k: [] for k in self.langs} - for lang in self.langs: - lXte_raw[lang] = self.l_index[lang].test_raw - - return lXte_raw - - def get_lXtr(self): - if not hasattr(self, 'lXtr'): - self.lXtr = self.l_vectorizer.transform({l: index.train_raw for l, index in self.l_index.items()}) - return self.lXtr - - def get_lXva(self): 
- if not hasattr(self, 'lXva'): - self.lXva = self.l_vectorizer.transform({l: index.val_raw for l, index in self.l_index.items()}) - return self.lXva - - def get_lXte(self): - if not hasattr(self, 'lXte'): - self.lXte = self.l_vectorizer.transform({l: index.test_raw for l, index in self.l_index.items()}) - return self.lXte - - def l_vocabsize(self): - return {l:index.vocabsize for l,index in self.l_index.items()} - - def l_embeddings(self): - return {l:index.embedding_matrix for l,index in self.l_index.items()} - - def l_pad(self): - return {l: index.pad_index for l, index in self.l_index.items()} - - def l_train_index(self): - return {l: index.train_index for l, index in self.l_index.items()} - - def l_train_target(self): - return {l: index.train_target for l, index in self.l_index.items()} - - def l_val_index(self): - return {l: index.val_index for l, index in self.l_index.items()} - - def l_val_target(self): - return {l: index.val_target for l, index in self.l_index.items()} - - def l_test_index(self): - return {l: index.test_index for l, index in self.l_index.items()} - - def l_devel_index(self): - return {l: index.devel_index for l, index in self.l_index.items()} - - def l_devel_target(self): - return {l: index.devel_target for l, index in self.l_index.items()} - - def l_train(self): - return self.l_train_index(), self.l_train_target() - - def l_val(self): - return self.l_val_index(), self.l_val_target() - - -class Batch: - def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500): - self.batchsize = batchsize - self.batches_per_epoch = batches_per_epoch - self.languages = languages - self.lpad=lpad - self.max_pad_length=max_pad_length - self.init_offset() - - def init_offset(self): - self.offset = {lang: 0 for lang in self.languages} - - def batchify(self, l_index, l_post, l_bert, llabels): - langs = self.languages - l_num_samples = {l:len(l_index[l]) for l in langs} - - max_samples = max(l_num_samples.values()) - n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0) - if self.batches_per_epoch != -1 and self.batches_per_epoch < n_batches: - n_batches = self.batches_per_epoch - - for b in range(n_batches): - for lang in langs: - index, labels = l_index[lang], llabels[lang] - offset = self.offset[lang] - if offset >= l_num_samples[lang]: - offset = 0 - limit = offset+self.batchsize - - batch_slice = slice(offset, limit) - batch = index[batch_slice] - batch_labels = labels[batch_slice].toarray() - - post = None - if l_post is not None: - post = torch.FloatTensor(l_post[lang][batch_slice]).cuda() - - bert_emb = None - if l_bert is not None: - bert_emb = torch.FloatTensor(l_bert[lang][batch_slice]).cuda() - - batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length) - - batch = torch.LongTensor(batch).cuda() - target = torch.FloatTensor(batch_labels).cuda() - - self.offset[lang] = limit - - yield batch, post, bert_emb, target, lang - - -def batchify(l_index, l_post, llabels, batchsize, lpad, max_pad_length=500): - langs = sorted(l_index.keys()) - nsamples = max([len(l_index[l]) for l in langs]) - nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0) - for b in range(nbatches): - for lang in langs: - index, labels = l_index[lang], llabels[lang] - - if b * batchsize >= len(index): - continue - batch = index[b*batchsize:(b+1)*batchsize] - batch_labels = labels[b*batchsize:(b+1)*batchsize].toarray() - post = None - if l_post is not None: - post = 
torch.FloatTensor(l_post[lang][b*batchsize:(b+1)*batchsize]).cuda() - batch = pad(batch, pad_index=lpad[lang], max_pad_length=max_pad_length) - batch = torch.LongTensor(batch) - target = torch.FloatTensor(batch_labels) - yield batch.cuda(), post, target.cuda(), lang - - -def batchify_unlabelled(index_list, batchsize, pad_index, max_pad_length=500): - nsamples = len(index_list) - nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0) - for b in range(nbatches): - batch = index_list[b*batchsize:(b+1)*batchsize] - batch = pad(batch, pad_index=pad_index, max_pad_length=max_pad_length) - batch = torch.LongTensor(batch) - yield batch.cuda() - - -def clip_gradient(model, clip_value=1e-1): - params = list(filter(lambda p: p.grad is not None, model.parameters())) - for p in params: - p.grad.data.clamp_(-clip_value, clip_value) - - -def predict(logits, classification_type='multilabel'): - if classification_type == 'multilabel': - prediction = torch.sigmoid(logits) > 0.5 - elif classification_type == 'singlelabel': - prediction = torch.argmax(logits, dim=1).view(-1, 1) - else: - print('unknown classification type') - - return prediction.detach().cpu().numpy() - - -def count_parameters(model): - return sum(p.numel() for p in model.parameters() if p.requires_grad) - - -def show_gpu(msg): - """ - ref: https://discuss.pytorch.org/t/access-gpu-memory-usage-in-pytorch/3192/4 - """ - - def query(field): - return (subprocess.check_output( - ['nvidia-smi', f'--query-gpu={field}', - '--format=csv,nounits,noheader'], - encoding='utf-8')) - - def to_int(result): - return int(result.strip().split('\n')[0]) - - used = to_int(query('memory.used')) - total = to_int(query('memory.total')) - pct = used / total - print('\n' + msg, f'{100 * pct:2.1f}% ({used} out of {total})') - - -class TfidfVectorizerMultilingual: - - def __init__(self, **kwargs): - self.kwargs = kwargs - - def fit(self, lX, ly=None): - self.langs = sorted(lX.keys()) - self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs} - # self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in lX.keys()} - return self - - def transform(self, lX): - return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs} - # return {l: self.vectorizer[l].transform(lX[l]) for l in lX.keys()} - - def fit_transform(self, lX, ly=None): - return self.fit(lX, ly).transform(lX) - - def vocabulary(self, l=None): - if l is None: - return {l: self.vectorizer[l].vocabulary_ for l in self.langs} - else: - return self.vectorizer[l].vocabulary_ - - def get_analyzer(self, l=None): - if l is None: - return {l: self.vectorizer[l].build_analyzer() for l in self.langs} - else: - return self.vectorizer[l].build_analyzer() - - -def get_learner(calibrate=False, kernel='linear', C=1): - return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=C, random_state=1, gamma='auto', verbose=False) - - -def get_params(optimc=False): - if not optimc: - return None - c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] - kernel = 'rbf' - return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] - - -def get_method_name(dataset, posteriors, supervised, pretrained, mbert, gru, - gruMUSE, gruWCE, agg, allprob): - _id = '-' - _id_conf = [posteriors, supervised, pretrained, mbert, gru] - _id_name = ['X', 'W', 'M', 'B', 'G'] - for i, conf in enumerate(_id_conf): - if conf: - _id += _id_name[i] - _id = _id if not gruMUSE else _id + '_muse' - _id = _id if not gruWCE else _id + '_wce' - _id = _id if not agg else _id + '_mean' - _id = _id if not allprob else 
_id + '_allprob' - - _dataset_path = dataset.split('/')[-1].split('_') - dataset_id = _dataset_path[0] + _dataset_path[-1] - return _id, dataset_id - - -def get_zscl_setting(langs): - settings = [] - for elem in langs: - for tar in langs: - if elem != tar: - settings.append((elem, tar)) - return settings \ No newline at end of file diff --git a/src/util/csv_log.py b/src/util/csv_log.py deleted file mode 100755 index 8c11e36..0000000 --- a/src/util/csv_log.py +++ /dev/null @@ -1,60 +0,0 @@ -import os -import pandas as pd -pd.set_option('display.max_rows', 500) -pd.set_option('display.max_columns', 500) -pd.set_option('display.width', 1000) - - -class CSVLog: - - def __init__(self, file, columns=None, autoflush=True, verbose=False, overwrite=False): - self.file = file - self.autoflush = autoflush - self.verbose = verbose - if os.path.exists(file) and not overwrite: - self.tell('Loading existing file from {}'.format(file)) - self.df = pd.read_csv(file, sep='\t') - self.columns = sorted(self.df.columns.values.tolist()) - else: - self.tell('File {} does not exist or overwrite=True. Creating new frame.'.format(file)) - assert columns is not None, 'columns cannot be None' - self.columns = sorted(columns) - dir = os.path.dirname(self.file) - if dir and not os.path.exists(dir): os.makedirs(dir) - self.df = pd.DataFrame(columns=self.columns) - self.defaults={} - - def already_calculated(self, **kwargs): - df = self.df - if df.shape[0]==0: - return False - if len(kwargs)==0: - kwargs = self.defaults - for key,val in kwargs.items(): - df = df.loc[df[key]==val] - if df.shape[0]==0: return False - return True - - def set_default(self, param, value): - self.defaults[param]=value - - def add_row(self, **kwargs): - for key in self.defaults.keys(): - if key not in kwargs: - kwargs[key]=self.defaults[key] - colums = sorted(list(kwargs.keys())) - values = [kwargs[col_i] for col_i in colums] - s = pd.Series(values, index=self.columns) - self.df = self.df.append(s, ignore_index=True) - if self.autoflush: self.flush() - # self.tell(s.to_string()) - self.tell(kwargs) - - def flush(self): - self.df.to_csv(self.file, index=False, sep='\t') - - def tell(self, msg): - if self.verbose: print(msg) - - - diff --git a/src/util/decompositions.py b/src/util/decompositions.py deleted file mode 100644 index 9d14a0c..0000000 --- a/src/util/decompositions.py +++ /dev/null @@ -1,50 +0,0 @@ -from sklearn.decomposition import PCA -import numpy as np -import matplotlib.pyplot as plt - - -def run_pca(dim, X): - """ - :param dim: number of pca components to keep - :param X: dictionary str(lang): matrix - :return: dict lang: reduced matrix - """ - r = dict() - pca = PCA(n_components=dim) - for lang in X.keys(): - r[lang] = pca.fit_transform(X[lang]) - return r - - -def get_optimal_dim(X, embed_type): - """ - :param X: dict str(lang) : csr_matrix of embeddings unsupervised or supervised - :param embed_type: (str) embedding matrix type: S or U (WCE supervised or U unsupervised MUSE/FASTTEXT) - :return: - """ - _idx = [] - - plt.figure(figsize=(15, 10)) - if embed_type == 'U': - plt.title(f'Unsupervised Embeddings {"TODO"} Explained Variance') - else: - plt.title(f'WCE Explained Variance') - plt.xlabel('Number of Components') - plt.ylabel('Variance (%)') - - for lang in X.keys(): - pca = PCA(n_components=X[lang].shape[1]) - pca.fit(X[lang]) - _r = pca.explained_variance_ratio_ - _r = np.cumsum(_r) - plt.plot(_r, label=lang) - for i in range(len(_r) - 1, 1, -1): - delta = _r[i] - _r[i - 1] - if delta > 0: - _idx.append(i) - break - 
best_n = max(_idx) - plt.axvline(best_n, color='r', label='optimal N') - plt.legend() - plt.show() - return best_n diff --git a/src/util/early_stop.py b/src/util/early_stop.py deleted file mode 100755 index 7d72cde..0000000 --- a/src/util/early_stop.py +++ /dev/null @@ -1,71 +0,0 @@ -#adapted from https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py -import torch -from transformers import BertForSequenceClassification -from time import time -from util.file import create_if_not_exist -import warnings - -class EarlyStopping: - - def __init__(self, model, optimizer, patience=20, verbose=True, checkpoint='./checkpoint.pt', is_bert=False): - # set patience to 0 or -1 to avoid stopping, but still keeping track of the best value and model parameters - self.patience_limit = patience - self.patience = patience - self.verbose = verbose - self.best_score = None - self.best_epoch = None - self.stop_time = None - self.checkpoint = checkpoint - self.model = model - self.optimizer = optimizer - self.STOP = False - self.is_bert = is_bert - - def __call__(self, watch_score, epoch): - - if self.STOP: - return - - if self.best_score is None or watch_score >= self.best_score: - self.best_score = watch_score - self.best_epoch = epoch - self.stop_time = time() - if self.checkpoint: - self.print(f'[early-stop] improved, saving model in {self.checkpoint}') - if self.is_bert: - print(f'Serializing Huggingface model...') - create_if_not_exist(self.checkpoint) - self.model.save_pretrained(self.checkpoint) - else: - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - torch.save(self.model, self.checkpoint) - # with open(self.checkpoint) - # torch.save({'state_dict': self.model.state_dict(), - # 'optimizer_state_dict': self.optimizer.state_dict()}, self.checkpoint) - else: - self.print(f'[early-stop] improved') - self.patience = self.patience_limit - else: - self.patience -= 1 - if self.patience == 0: - self.STOP = True - self.print(f'[early-stop] patience exhausted') - else: - if self.patience>0: # if negative, then early-stop is ignored - self.print(f'[early-stop] patience={self.patience}') - - def reinit_counter(self): - self.STOP = False - self.patience=self.patience_limit - - def restore_checkpoint(self): - print(f'restoring best model from epoch {self.best_epoch}...') - if self.is_bert: - return BertForSequenceClassification.from_pretrained(self.checkpoint) - else: - return torch.load(self.checkpoint) - - def print(self, msg): - if self.verbose: - print(msg) diff --git a/src/util/evaluation.py b/src/util/evaluation.py deleted file mode 100644 index 41a2813..0000000 --- a/src/util/evaluation.py +++ /dev/null @@ -1,102 +0,0 @@ -# from sklearn.externals.joblib import Parallel, delayed -from joblib import Parallel, delayed -from util.metrics import * -from sklearn.metrics import f1_score -import numpy as np -import time - - -def evaluation_metrics(y, y_): - if len(y.shape)==len(y_.shape)==1 and len(np.unique(y))>2: #single-label - raise NotImplementedError()#return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro') - else: #the metrics I implemented assume multiclass multilabel classification as binary classifiers - return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_) - - -def soft_evaluation_metrics(y, y_): - if len(y.shape)==len(y_.shape)==1 and len(np.unique(y))>2: #single-label - raise NotImplementedError()#return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro') - else: #the metrics I implemented assume multiclass multilabel 
classification as binary classifiers - return smoothmacroF1(y, y_), smoothmicroF1(y, y_), smoothmacroK(y, y_), smoothmicroK(y, y_) - - -def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1): - print('evaluation (n_jobs={})'.format(n_jobs)) - if n_jobs == 1: - return {lang: metrics(ly_true[lang], ly_pred[lang]) for lang in ly_true.keys()} - else: - langs = list(ly_true.keys()) - evals = Parallel(n_jobs=n_jobs)(delayed(metrics)(ly_true[lang], ly_pred[lang]) for lang in langs) - return {lang: evals[i] for i, lang in enumerate(langs)} - - -def average_results(l_eval, show=True): - metrics = [] - for lang in l_eval.keys(): - macrof1, microf1, macrok, microk = l_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - if show: - print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1)) - - ave = np.mean(np.array(metrics), axis=0) - if show: - print('Averages: MF1, mF1, MK, mK', ave) - return ave - - -def evaluate_method(polylingual_method, lX, ly, predictor=None, soft=False, return_time=False): - tinit = time.time() - print('prediction for test') - assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate' - n_jobs = polylingual_method.n_jobs if hasattr(polylingual_method, 'n_jobs') else -1 - - if predictor is None: - predictor = polylingual_method.predict - - metrics = evaluation_metrics - if soft is True: - metrics = soft_evaluation_metrics - ly_ = predictor(lX, ly) - - eval_ = evaluate(ly, ly_, metrics=metrics, n_jobs=n_jobs) - if return_time: - return eval_, time.time()-tinit - else: - return eval_ - - -def evaluate_single_lang(polylingual_method, X, y, lang, predictor=None, soft=False): - print('prediction for test in a single language') - if predictor is None: - predictor = polylingual_method.predict - - metrics = evaluation_metrics - if soft is True: - metrics = soft_evaluation_metrics - - ly_ = predictor({lang:X}) - return metrics(y, ly_[lang]) - - -def get_binary_counters(polylingual_method, lX, ly, predictor=None): - print('prediction for test') - assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate' - n_jobs = polylingual_method.n_jobs - if predictor is None: - predictor = polylingual_method.predict - ly_ = predictor(lX) - print('evaluation (n_jobs={})'.format(n_jobs)) - if n_jobs == 1: - return {lang: binary_counters(ly[lang], ly_[lang]) for lang in ly.keys()} - else: - langs = list(ly.keys()) - evals = Parallel(n_jobs=n_jobs)(delayed(binary_counters)(ly[lang], ly_[lang]) for lang in langs) - return {lang: evals[i] for i, lang in enumerate(langs)} - - -def binary_counters(y, y_): - y = np.reshape(y, (-1)) - assert y.shape==y_.shape and len(y.shape)==1, 'error, binary vector expected' - counters = hard_single_metric_statistics(y, y_) - return counters.tp, counters.tn, counters.fp, counters.fn - diff --git a/src/util/file.py b/src/util/file.py deleted file mode 100644 index a3d0a3a..0000000 --- a/src/util/file.py +++ /dev/null @@ -1,44 +0,0 @@ -from os import listdir, makedirs -from os.path import isdir, isfile, join, exists, dirname -#from sklearn.externals.six.moves import urllib -import urllib -from pathlib import Path - - -def download_file(url, archive_filename): - def progress(blocknum, bs, size): - total_sz_mb = '%.2f MB' % (size / 1e6) - current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6) - print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='') - print("Downloading %s" % url) - urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress) - print("") - -def 
download_file_if_not_exists(url, archive_path): - if exists(archive_path): return - makedirs_if_not_exist(dirname(archive_path)) - download_file(url,archive_path) - - -def ls(dir, typecheck): - el = [f for f in listdir(dir) if typecheck(join(dir, f))] - el.sort() - return el - -def list_dirs(dir): - return ls(dir, typecheck=isdir) - -def list_files(dir): - return ls(dir, typecheck=isfile) - -def makedirs_if_not_exist(path): - if not exists(path): makedirs(path) - -def create_if_not_exist(path): - if not exists(path): makedirs(path) - -def get_parent_name(path): - return Path(path).parent - -def get_file_name(path): - return Path(path).name diff --git a/src/util/metrics.py b/src/util/metrics.py deleted file mode 100644 index ca688b7..0000000 --- a/src/util/metrics.py +++ /dev/null @@ -1,255 +0,0 @@ -import numpy as np -from scipy.sparse import lil_matrix, issparse -from sklearn.metrics import f1_score, accuracy_score - - - -""" -Scikit learn provides a full set of evaluation metrics, but they treat special cases differently. -I.e., when the number of true positives, false positives, and false negatives ammount to 0, all -affected metrices (precision, recall, and thus f1) output 0 in Scikit learn. -We adhere to the common practice of outputting 1 in this case since the classifier has correctly -classified all examples as negatives. -""" - -class ContTable: - def __init__(self, tp=0, tn=0, fp=0, fn=0): - self.tp=tp - self.tn=tn - self.fp=fp - self.fn=fn - - def get_d(self): return self.tp + self.tn + self.fp + self.fn - - def get_c(self): return self.tp + self.fn - - def get_not_c(self): return self.tn + self.fp - - def get_f(self): return self.tp + self.fp - - def get_not_f(self): return self.tn + self.fn - - def p_c(self): return (1.0*self.get_c())/self.get_d() - - def p_not_c(self): return 1.0-self.p_c() - - def p_f(self): return (1.0*self.get_f())/self.get_d() - - def p_not_f(self): return 1.0-self.p_f() - - def p_tp(self): return (1.0*self.tp) / self.get_d() - - def p_tn(self): return (1.0*self.tn) / self.get_d() - - def p_fp(self): return (1.0*self.fp) / self.get_d() - - def p_fn(self): return (1.0*self.fn) / self.get_d() - - def tpr(self): - c = 1.0*self.get_c() - return self.tp / c if c > 0.0 else 0.0 - - def fpr(self): - _c = 1.0*self.get_not_c() - return self.fp / _c if _c > 0.0 else 0.0 - - def __add__(self, other): - return ContTable(tp=self.tp + other.tp, tn=self.tn + other.tn, fp=self.fp + other.fp, fn=self.fn + other.fn) - -def accuracy(cell): - return (cell.tp + cell.tn)*1.0 / (cell.tp + cell.fp + cell.fn + cell.tn) - -def f1(cell): - num = 2.0 * cell.tp - den = 2.0 * cell.tp + cell.fp + cell.fn - if den>0: return num / den - #we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative - return 1.0 - -def K(cell): - specificity, recall = 0., 0. - - AN = cell.tn + cell.fp - if AN != 0: - specificity = cell.tn*1. / AN - - AP = cell.tp + cell.fn - if AP != 0: - recall = cell.tp*1. / AP - - if AP == 0: - return 2. * specificity - 1. - elif AN == 0: - return 2. * recall - 1. - else: - return specificity + recall - 1. - -#computes the (hard) counters tp, fp, fn, and tn fron a true and predicted vectors of hard decisions -#true_labels and predicted_labels are two vectors of shape (number_documents,) -def hard_single_metric_statistics(true_labels, predicted_labels): - assert len(true_labels)==len(predicted_labels), "Format not consistent between true and predicted labels." 
- nd = len(true_labels) - tp = np.sum(predicted_labels[true_labels==1]) - fp = np.sum(predicted_labels[true_labels == 0]) - fn = np.sum(true_labels[predicted_labels == 0]) - tn = nd - (tp+fp+fn) - return ContTable(tp=tp, tn=tn, fp=fp, fn=fn) - -#computes the (soft) contingency table where tp, fp, fn, and tn are the cumulative masses for the posterioir -# probabilitiesfron with respect to the true binary labels -#true_labels and posterior_probabilities are two vectors of shape (number_documents,) -def soft_single_metric_statistics(true_labels, posterior_probabilities): - assert len(true_labels)==len(posterior_probabilities), "Format not consistent between true and predicted labels." - tp = np.sum(posterior_probabilities[true_labels == 1]) - fn = np.sum(1. - posterior_probabilities[true_labels == 1]) - fp = np.sum(posterior_probabilities[true_labels == 0]) - tn = np.sum(1. - posterior_probabilities[true_labels == 0]) - return ContTable(tp=tp, tn=tn, fp=fp, fn=fn) - -#if the classifier is single class, then the prediction is a vector of shape=(nD,) which causes issues when compared -#to the true labels (of shape=(nD,1)). This method increases the dimensions of the predictions. -def __check_consistency_and_adapt(true_labels, predictions): - if predictions.ndim == 1: - return __check_consistency_and_adapt(true_labels, np.expand_dims(predictions, axis=1)) - if true_labels.ndim == 1: - return __check_consistency_and_adapt(np.expand_dims(true_labels, axis=1),predictions) - if true_labels.shape != predictions.shape: - raise ValueError("True and predicted label matrices shapes are inconsistent %s %s." - % (true_labels.shape, predictions.shape)) - _,nC = true_labels.shape - return true_labels, predictions, nC - -def macro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics): - true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels) - return np.mean([metric(metric_statistics(true_labels[:, c], predicted_labels[:, c])) for c in range(nC)]) - -def micro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics): - true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels) - - accum = ContTable() - for c in range(nC): - other = metric_statistics(true_labels[:, c], predicted_labels[:, c]) - accum = accum + other - - return metric(accum) - -#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format -def macroF1(true_labels, predicted_labels): - return macro_average(true_labels,predicted_labels, f1) - -#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format -def microF1(true_labels, predicted_labels): - return micro_average(true_labels, predicted_labels, f1) - -#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format -def macroK(true_labels, predicted_labels): - return macro_average(true_labels,predicted_labels, K) - -#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format -def microK(true_labels, predicted_labels): - return micro_average(true_labels, predicted_labels, K) - -#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix -#of the same shape containing real values in [0,1] -def smoothmacroF1(true_labels, posterior_probabilities): - return macro_average(true_labels,posterior_probabilities, f1, 
metric_statistics=soft_single_metric_statistics) - -#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix -#of the same shape containing real values in [0,1] -def smoothmicroF1(true_labels, posterior_probabilities): - return micro_average(true_labels, posterior_probabilities, f1, metric_statistics=soft_single_metric_statistics) - -#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix -#of the same shape containing real values in [0,1] -def smoothmacroK(true_labels, posterior_probabilities): - return macro_average(true_labels,posterior_probabilities, K, metric_statistics=soft_single_metric_statistics) - -#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix -#of the same shape containing real values in [0,1] -def smoothmicroK(true_labels, posterior_probabilities): - return micro_average(true_labels, posterior_probabilities, K, metric_statistics=soft_single_metric_statistics) - - - - -""" -Scikit learn provides a full set of evaluation metrics, but they treat special cases differently. -I.e., when the number of true positives, false positives, and false negatives ammount to 0, all -affected metrices (precision, recall, and thus f1) output 0 in Scikit learn. -We adhere to the common practice of outputting 1 in this case since the classifier has correctly -classified all examples as negatives. -""" - -def evaluation(y_true, y_pred, classification_type): - - if classification_type == 'multilabel': - eval_function = multilabel_eval - elif classification_type == 'singlelabel': - eval_function = singlelabel_eval - - Mf1, mf1, accuracy = eval_function(y_true, y_pred) - - return Mf1, mf1, accuracy - - -def multilabel_eval(y, y_): - - tp = y.multiply(y_) - - fn = lil_matrix(y.shape) - true_ones = y==1 - fn[true_ones]=1-tp[true_ones] - - fp = lil_matrix(y.shape) - pred_ones = y_==1 - if pred_ones.nnz>0: - fp[pred_ones]=1-tp[pred_ones] - - #macro-f1 - tp_macro = np.asarray(tp.sum(axis=0), dtype=int).flatten() - fn_macro = np.asarray(fn.sum(axis=0), dtype=int).flatten() - fp_macro = np.asarray(fp.sum(axis=0), dtype=int).flatten() - - pos_pred = tp_macro+fp_macro - pos_true = tp_macro+fn_macro - prec=np.zeros(shape=tp_macro.shape,dtype=float) - rec=np.zeros(shape=tp_macro.shape,dtype=float) - np.divide(tp_macro, pos_pred, out=prec, where=pos_pred>0) - np.divide(tp_macro, pos_true, out=rec, where=pos_true>0) - den=prec+rec - - macrof1=np.zeros(shape=tp_macro.shape,dtype=float) - np.divide(np.multiply(prec,rec),den,out=macrof1,where=den>0) - macrof1 *=2 - - macrof1[(pos_pred==0)*(pos_true==0)]=1 - macrof1 = np.mean(macrof1) - - #micro-f1 - tp_micro = tp_macro.sum() - fn_micro = fn_macro.sum() - fp_micro = fp_macro.sum() - pos_pred = tp_micro + fp_micro - pos_true = tp_micro + fn_micro - prec = (tp_micro / pos_pred) if pos_pred>0 else 0 - rec = (tp_micro / pos_true) if pos_true>0 else 0 - den = prec+rec - microf1 = 2*prec*rec/den if den>0 else 0 - if pos_pred==pos_true==0: - microf1=1 - - #accuracy - ndecisions = np.multiply(*y.shape) - tn = ndecisions - (tp_micro+fn_micro+fp_micro) - acc = (tp_micro+tn)/ndecisions - - return macrof1,microf1,acc - - -def singlelabel_eval(y, y_): - if issparse(y_): y_ = y_.toarray().flatten() - macrof1 = f1_score(y, y_, average='macro') - microf1 = f1_score(y, y_, average='micro') - acc = accuracy_score(y, y_) - return macrof1,microf1,acc - diff --git a/src/util/parser_options.py 
b/src/util/parser_options.py deleted file mode 100644 index 14d827c..0000000 --- a/src/util/parser_options.py +++ /dev/null @@ -1,94 +0,0 @@ -from optparse import OptionParser - -parser = OptionParser(usage="usage: %prog datapath [options]") - -parser.add_option("-d", dest='dataset', type=str, metavar='datasetpath', help=f'path to the pickled dataset') - -parser.add_option("-o", "--output", dest="output", - help="Result file", type=str, default='../log/multiModal_log.csv') - -parser.add_option("-X", "--posteriors", dest="posteriors", action='store_true', - help="Add posterior probabilities to the document embedding representation", default=False) - -parser.add_option("-W", "--supervised", dest="supervised", action='store_true', - help="Add supervised (Word-Class Embeddings) to the document embedding representation", default=False) - -parser.add_option("-M", "--pretrained", dest="pretrained", action='store_true', - help="Add pretrained MUSE embeddings to the document embedding representation", default=False) - -parser.add_option("-B", "--mbert", dest="mbert", action='store_true', - help="Add multilingual Bert (mBert) document embedding representation", default=False) - -parser.add_option('-G', dest='gruViewGenerator', action='store_true', - help="Add document embedding generated via recurrent net (GRU)", default=False) - -parser.add_option("--l2", dest="l2", action='store_true', - help="Activates l2 normalization as a post-processing for the document embedding views", - default=True) - -parser.add_option("--allprob", dest="allprob", action='store_true', - help="All views are generated as posterior probabilities. This affects the supervised and pretrained" - "embeddings, for which a calibrated classifier is generated, which generates the posteriors", - default=True) - -parser.add_option("--feat-weight", dest="feat_weight", - help="Term weighting function to weight the averaged embeddings", type=str, default='tfidf') - -parser.add_option("-w", "--we-path", dest="we_path", - help="Path to the MUSE polylingual word embeddings", default='../embeddings') - -parser.add_option("-s", "--set_c", dest="set_c", type=float, - help="Set the C parameter", default=1) - -parser.add_option("-c", "--optimc", dest="optimc", action='store_true', - help="Optimize hyperparameters", default=False) - -parser.add_option("-j", "--n_jobs", dest="n_jobs", type=int, - help="Number of parallel jobs (default is -1, all)", default=-1) - -parser.add_option("-p", "--pca", dest="max_labels_S", type=int, - help="If smaller than number of target classes, PCA will be applied to supervised matrix. 
", - default=300) - -parser.add_option("-r", "--remove-pc", dest="sif", action='store_true', - help="Remove common component when computing dot product of word embedding matrices", default=True) - -parser.add_option("-z", "--zscore", dest="zscore", action='store_true', - help="Z-score normalize matrices (WCE and MUSE)", default=True) - -parser.add_option("-a", "--agg", dest="agg", action='store_true', - help="Set aggregation function of the common Z-space to average (Default: concatenation)", - default=True) - -parser.add_option("-l", dest="avoid_loading", action="store_true", - help="TODO", default=False) - -# ------------------------------------------------------------------------------------ - -parser.add_option('--hidden', type=int, default=512, metavar='int', - help='hidden lstm size (default: 512)') - -parser.add_option('--sup-drop', type=float, default=0.5, metavar='[0.0, 1.0]', - help='dropout probability for the supervised matrix (default: 0.5)') - -parser.add_option('--tunable', action='store_true', default=False, - help='pretrained embeddings are tunable from the beginning (default False, i.e., static)') - -parser.add_option('--logfile_gru', dest='logfile_gru', default='../log/log_gru_viewgenerator.csv') - -parser.add_option('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)') - -parser.add_option('--force', action='store_true', default=False, - help='do not check if this experiment has already been run') - -parser.add_option('--gruMuse', dest='gruMUSE', action='store_true', default=False, - help='Deploy MUSE embedding as embedding layer of the GRU View Generator') - -parser.add_option('--gruWce', dest='gruWCE', action='store_true', default=False, - help='Deploy WCE embedding as embedding layer of the GRU View Generator') - -parser.add_option('--gru-path', dest='gru_path', default=None, - help='Set the path to a pretrained GRU model (aka, -G view generator)') - -parser.add_option('--bert-path', dest='bert_path', default=None, - help='Set the path to a pretrained mBERT model (aka, -B view generator)') diff --git a/src/util/results.py b/src/util/results.py deleted file mode 100644 index 6526303..0000000 --- a/src/util/results.py +++ /dev/null @@ -1,92 +0,0 @@ -import os -import pandas as pd -import numpy as np - -class PolylingualClassificationResults: - def __init__(self, file, autoflush=True, verbose=False): - self.file = file - self.columns = ['method', - 'learner', - 'optimp', - 'sif', - 'zscore', - 'l2', - 'wescaler', - 'pca', - 'id', - 'dataset', - 'time', - 'lang', - 'macrof1', - 'microf1', - 'macrok', - 'microk', - 'notes'] - self.autoflush = autoflush - self.verbose = verbose - if os.path.exists(file): - self.tell('Loading existing file from {}'.format(file)) - self.df = pd.read_csv(file, sep='\t') - else: - self.tell('File {} does not exist. 
Creating new frame.'.format(file)) - dir = os.path.dirname(self.file) - if dir and not os.path.exists(dir): os.makedirs(dir) - self.df = pd.DataFrame(columns=self.columns) - - def already_calculated(self, id): - return (self.df['id'] == id).any() - - def add_row(self, method, learner, optimp, sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): - s = pd.Series([method, learner, optimp,sif, zscore, l2, wescaler, pca, id, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns) - self.df = self.df.append(s, ignore_index=True) - if self.autoflush: self.flush() - self.tell(s.to_string()) - - def flush(self): - self.df.to_csv(self.file, index=False, sep='\t') - - def tell(self, msg): - if self.verbose: print(msg) - - -class ZSCLResults: - def __init__(self, file, autoflush=True, verbose=False): - self.file = file - self.columns = ['method', - 'optimp', - 'source', - 'target', - 'id', - 'dataset', - 'time', - 'lang', - 'macrof1', - 'microf1', - 'macrok', - 'microk', - 'notes'] - self.autoflush = autoflush - self.verbose = verbose - if os.path.exists(file): - self.tell('Loading existing file from {}'.format(file)) - self.df = pd.read_csv(file, sep='\t') - else: - self.tell('File {} does not exist. Creating new frame.'.format(file)) - dir = os.path.dirname(self.file) - if dir and not os.path.exists(dir): os.makedirs(dir) - self.df = pd.DataFrame(columns=self.columns) - - def already_calculated(self, id): - return (self.df['id'] == id).any() - - def add_row(self, method, optimp, id, source, target, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): - s = pd.Series([method, optimp, id, source, target, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns) - self.df = self.df.append(s, ignore_index=True) - if self.autoflush: self.flush() - self.tell(s.to_string()) - - def flush(self): - self.df.to_csv(self.file, index=False, sep='\t') - - def tell(self, msg): - if self.verbose: print(msg) diff --git a/src/util/util.py b/src/util/util.py deleted file mode 100644 index 823c82d..0000000 --- a/src/util/util.py +++ /dev/null @@ -1,29 +0,0 @@ -from sklearn.svm import SVC -from tqdm import tqdm -import re -import sys - - -def mask_numbers(data, number_mask='numbermask'): - mask = re.compile(r'\b[0-9][0-9.,-]*\b') - masked = [] - for text in tqdm(data, desc='masking numbers'): - masked.append(mask.sub(number_mask, text)) - return masked - - -def fill_missing_classes(lXtr, lytr): - pass - - -def get_learner(calibrate=False, kernel='linear'): - return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto') - - -def get_params(dense=False): - if not op.optimc: - return None - c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] - kernel = 'rbf' if dense else 'linear' - return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] - diff --git a/src/util_transformers/StandardizeTransformer.py b/src/util_transformers/StandardizeTransformer.py deleted file mode 100644 index e1a10cf..0000000 --- a/src/util_transformers/StandardizeTransformer.py +++ /dev/null @@ -1,32 +0,0 @@ -import numpy as np - - -class StandardizeTransformer: - - def __init__(self, axis=0, range=None): - assert range is None or isinstance(range, slice), 'wrong format for range, should either be None or a slice' - self.axis = axis - self.yetfit = False - self.range = range - - def fit(self, X): - print('fitting Standardizer...') - std=np.std(X, 
axis=self.axis, ddof=1)
-        self.std = np.clip(std, 1e-5, None)
-        self.mean = np.mean(X, axis=self.axis)
-        if self.range is not None:
-            ones = np.ones_like(self.std)
-            zeros = np.zeros_like(self.mean)
-            ones[self.range] = self.std[self.range]
-            zeros[self.range] = self.mean[self.range]
-            self.std = ones
-            self.mean = zeros
-        self.yetfit = True
-        return self
-
-    def transform(self, X):
-        assert self.yetfit, 'transform called before fit'
-        return (X - self.mean) / self.std
-
-    def fit_transform(self, X):
-        return self.fit(X).transform(X)
diff --git a/src/util_transformers/__init__.py b/src/util_transformers/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/util_transformers/clesa.py b/src/util_transformers/clesa.py
deleted file mode 100644
index da17393..0000000
--- a/src/util_transformers/clesa.py
+++ /dev/null
@@ -1,110 +0,0 @@
-import numpy as np
-import sklearn
-# from sklearn.externals.joblib import Parallel, delayed
-from joblib import Parallel, delayed
-
-class ESA(object):
-    """
-    Implementation of Explicit Semantic Analysis (ESA) in its mono-lingual version, as a transformer
-    """
-    supported_similarity = ['dot', 'cosine']
-
-    def __init__(self, similarity='dot', centered=False, post=None):
-        """
-        :param similarity: the similarity measure between documents to be used
-        :param centered: set to True to subtract the expected similarity due to randomness (experimental)
-        :param post: any valid sklearn normalization method to be applied to the resulting doc embeddings, or None (default)
-        """
-        assert similarity in self.supported_similarity, ("Similarity method %s is not supported" % similarity)
-        self.similarity = similarity
-        self.centered = centered
-        self.post_processing = post
-        self.W = None
-
-    def fit(self, W):
-        """
-        :param W: doc-by-term already processed matrix of wikipedia documents
-        :return: self
-        """
-        self.W = W
-        return self
-
-    def transform(self, X):
-        """
-        :param X: doc-by-term matrix that is to be transformed into the ESA space.
- :return: the matrix X transformed into the ESA space in numpy format - """ - assert self.W is not None, 'transform method called before fit' - - W = self.W - assert X.shape[1] == W.shape[1], ('the feature spaces for X=%s and W=%s do not agree' % (str(X.shape), str(W.shape))) - - if self.similarity in ['dot', 'cosine']: - if self.similarity == 'cosine': - X = sklearn.preprocessing.normalize(X, norm='l2', axis=1, copy=True) - W = sklearn.preprocessing.normalize(W, norm='l2', axis=1, copy=True) - - esa = (X.dot(W.T)).toarray() - if self.centered: - pX = (X > 0).sum(1) / float(X.shape[1]) - pW = (W > 0).sum(1) / float(W.shape[1]) - pXpW = np.sqrt(pX.dot(pW.transpose())) - esa = esa - pXpW - - if self.post_processing: - esa = sklearn.preprocessing.normalize(esa, norm=self.post_processing, axis=1, copy=True) - - return esa - - def fit_transform(self, W, X, Y=None): - self.fit(W) - return self.transform(X, Y) - - def dimensionality(self): - return self.W.shape[0] - - - -class CLESA(ESA): - """ - Implementation of Cross-Lingual Explicit Sematic Analysis (ESA) as a transformer - """ - - def __init__(self, similarity='dot', centered=False, post=False, n_jobs=-1): - super(CLESA, self).__init__(similarity, centered, post) - self.lESA = None - self.langs = None - self.n_jobs = n_jobs - - def fit(self, lW): - """ - :param lW: a dictionary of {language: doc-by-term wiki matrix} - :return: self - """ - assert len(np.unique([W.shape[0] for W in lW.values()])) == 1, "inconsistent dimensions across languages" - - self.dimensions = list(lW.values())[0].shape[0] - self.langs = list(lW.keys()) - self.lESA = {lang:ESA(self.similarity, self.centered, self.post_processing).fit(lW[lang]) for lang in self.langs} - return self - - def transform(self, lX): - """ - :param lX: dictionary of {language : doc-by-term matrix} that is to be transformed into the CL-ESA space - :return: a dictionary {language : doc-by-dim matrix} containing the matrix-transformed versions - """ - assert self.lESA is not None, 'transform method called before fit' - assert set(lX.keys()).issubset(set(self.langs)), 'languages in lX are not scope' - langs = list(lX.keys()) - trans = Parallel(n_jobs=self.n_jobs)(delayed(self.lESA[lang].transform)(lX[lang]) for lang in langs) - return {lang:trans[i] for i,lang in enumerate(langs)} - - def fit_transform(self, lW, lX): - return self.fit(lW).transform(lX) - - def languages(self): - return list(self.lESA.keys()) - - - - diff --git a/src/util_transformers/dci.py b/src/util_transformers/dci.py deleted file mode 100644 index 6e84ed9..0000000 --- a/src/util_transformers/dci.py +++ /dev/null @@ -1,154 +0,0 @@ -import numpy as np -from sklearn.preprocessing import normalize -from scipy.sparse import csr_matrix, issparse -from scipy.spatial.distance import cosine -import operator -import functools -import math, sys -# from sklearn.externals.joblib import Parallel, delayed -from joblib import Parallel, delayed - - -class DistributionalCorrespondenceIndexing: - - prob_dcf = ['linear', 'pmi'] - vect_dcf = ['cosine'] - valid_dcf = prob_dcf + vect_dcf - valid_post = ['normal', 'l2', None] - - def __init__(self, dcf='cosine', post='normal', n_jobs=-1): - """ - :param dcf: a distributional correspondence function name (e.g., 'cosine') or a callable f(u,v) which measures - the distribucional correspondence between vectors u and v - :param post: post-processing function to apply to document embeddings. 
Default is to standardize it into a - normal distribution; other functions allowed are 'l2' or None - """ - if post not in self.valid_post: - raise ValueError("unknown post processing function; valid ones are [%s]" % ', '.join(self.valid_post)) - - if isinstance(dcf, str): - if dcf not in self.valid_dcf: - raise ValueError("unknown dcf; use any in [%s]" % ', '.join(self.valid_dcf)) - self.dcf = getattr(DistributionalCorrespondenceIndexing, dcf) - elif hasattr(dcf, '__call__'): - self.dcf = dcf - else: - raise ValueError('param dcf should either be a valid dcf name in [%s] or a callable comparing two vectors') - #self.dcf = lambda u,v:dcf(u,v) - self.post = post - self.domains = None - self.dFP = None - self.n_jobs = n_jobs - - def fit(self, dU, dP): - """ - :param dU: a dictionary of {domain:dsm_matrix}, where dsm is a document-by-term matrix representing the - distributional semantic model for a specific domain - :param dP: a dictionary {domain:pivot_matrix} where domain is a string representing each domain, - and pivot_matrix has shape (d,p) with d the dimensionality of the distributional space, and p the - number of pivots - :return: self - """ - self.domains = list(dP.keys()) - assert len(np.unique([P.shape[1] for P in dP.values()]))==1, "inconsistent number of pivots across domains" - assert set(dU.keys())==set(self.domains), "inconsistent domains in dU and dP" - assert not [1 for d in self.domains if dU[d].shape[0]!=dP[d].shape[0]], \ - "inconsistent dimensions between distributional and pivot spaces" - self.dimensions = list(dP.values())[0].shape[1] - # embed the feature space from each domain using the pivots of that domain - #self.dFP = {d:self.dcf_dist(dU[d].transpose(), dP[d].transpose()) for d in self.domains} - transformations = Parallel(n_jobs=self.n_jobs)(delayed(self.dcf_dist)(dU[d].transpose(),dP[d].transpose()) for d in self.domains) - self.dFP = {d: transformations[i] for i, d in enumerate(self.domains)} - - def _dom_transform(self, X, FP): - _X = X.dot(FP) - if self.post == 'l2': - _X = normalize(_X, norm='l2', axis=1) - elif self.post == 'normal': - std = np.clip(np.std(_X, axis=0), 1e-5, None) - _X = (_X - np.mean(_X, axis=0)) / std - return _X - - # dX is a dictionary of {domain:dsm}, where dsm (distributional semantic model) is, e.g., a document-by-term csr_matrix - def transform(self, dX): - assert self.dFP is not None, 'transform method called before fit' - assert set(dX.keys()).issubset(self.domains), 'domains in dX are not scope' - domains = list(dX.keys()) - transformations = Parallel(n_jobs=self.n_jobs)(delayed(self._dom_transform)(dX[d], self.dFP[d]) for d in domains) - return {d: transformations[i] for i, d in enumerate(domains)} - - def fit_transform(self, dU, dP, dX): - return self.fit(dU, dP).transform(dX) - - def _prevalence(self, v): - if issparse(v): - return float(v.nnz) / functools.reduce(operator.mul, v.shape, 1) #this works for arrays of any rank - elif isinstance(v, np.ndarray): - return float(v[v>0].size) / v.size - - def linear(self, u, v, D): - tp, fp, fn, tn = self._get_4cellcounters(u, v, D) - den1=tp+fn - den2=tn+fp - tpr = (tp*1./den1) if den1!=0 else 0. - tnr = (tn*1./den2) if den2!=0 else 0. - return tpr + tnr - 1 - - def pmi(self, u, v, D): - tp, fp, fn, tn = self._get_4cellcounters(u, v, D) - - Pxy = tp * 1. / D - Pxny = fp * 1. / D - Pnxy = fn * 1. 
/ D
-        Px = Pxy + Pxny
-        Py = Pxy + Pnxy
-
-        if (Px == 0 or Py == 0 or Pxy == 0):
-            return 0.0
-
-        score = math.log2(Pxy / (Px * Py))
-        if np.isnan(score) or np.isinf(score):
-            print('NAN')
-            sys.exit()
-        return score
-
-    def cosine(self, u, v):
-        pu = self._prevalence(u)
-        pv = self._prevalence(v)
-        return cosine(u, v) - np.sqrt(pu * pv)
-
-    def _get_4cellcounters(self, u, v, D):
-        """
-        :param u: a set of indexes with a non-zero value
-        :param v: a set of indexes with a non-zero value
-        :param D: the number of events (i.e., all possible indexes)
-        :return: the 4-cell contingency values (tp, fp, fn, tn)
-        """
-        common = u.intersection(v)
-        tp = len(common)
-        fp = len(u) - len(common)
-        fn = len(v) - len(common)
-        tn = D - (tp + fp + fn)
-        return tp, fp, fn, tn
-
-    def dcf_dist(self, U, V):
-        nU, D = U.shape
-        nV = V.shape[0]
-        if issparse(U): U = U.toarray()
-        if issparse(V): V = V.toarray()
-
-        dists = np.zeros((nU, nV))
-        if self.dcf.__name__ in self.prob_dcf:
-            def hits_index(v):
-                return set(np.argwhere(v > 0).reshape(-1).tolist())
-            Vhits = {i: hits_index(V[i]) for i in range(nV)}
-            for i in range(nU):
-                Ui_hits = hits_index(U[i])
-                for j in range(nV):
-                    dists[i, j] = self.dcf(self, Ui_hits, Vhits[j], D)
-        else:
-            for i in range(nU):
-                for j in range(nV):
-                    dists[i, j] = self.dcf(self, U[i], V[j])
-        return dists
-
diff --git a/src/util_transformers/riboc.py b/src/util_transformers/riboc.py
deleted file mode 100644
index 7dfbc42..0000000
--- a/src/util_transformers/riboc.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import math
-import numpy as np
-from scipy.sparse import csr_matrix, issparse
-
-class RandomIndexingBoC(object):
-
-    def __init__(self, latent_dimensions, non_zeros=2):
-        self.latent_dimensions = latent_dimensions
-        self.k = non_zeros
-        self.ri_dict = None
-
-    def fit_transform(self, X):
-        return self.fit(X).transform(X)
-
-    def fit(self, X):
-        nF = X.shape[1]
-        nL = self.latent_dimensions
-        format = 'csr' if issparse(X) else 'np'
-        self.ri_dict = _create_random_index_dictionary(shape=(nF, nL), k=self.k, normalized=True, format=format)
-        return self
-
-    def transform(self, X):
-        if self.ri_dict is None:
-            raise ValueError("Error: transform method called before fit.")
-        assert X.shape[1] == self.ri_dict.shape[0], 'feature space is inconsistent with the RI dictionary'
-        P = X.dot(self.ri_dict)
-        if issparse(P):
-            P.sort_indices()
-        return P
-
-
-def _create_random_index_dictionary(shape, k, normalized=False, format='csr', positive=False):
-    assert format in ['csr', 'np'], 'Format should be in "[csr, np]"'
-    nF, latent_dimensions = shape
-    print("Creating the random index dictionary for |V|={} with {} dimensions".format(nF, latent_dimensions))
-    val = 1.0 if not normalized else 1.0/math.sqrt(k)
-    #ri_dict = csr_matrix((nF, latent_dimensions)) if format == 'csr' else np.zeros((nF, latent_dimensions))
-    ri_dict = np.zeros((nF, latent_dimensions))
-
-    #TODO: optimize
-    for t in range(nF):
-        dims = np.zeros(k, dtype=np.int32)
-        dims[0] = t % latent_dimensions  # the first dimension is chosen in a round-robin manner (prevents gaps)
-        dims[1:] = np.random.choice(latent_dimensions, size=k-1, replace=False)
-        values = (np.random.randint(0, 2, size=k)*2.0-1.0) * val if not positive else np.array([+val]*k)
-        ri_dict[t, dims] = values
-        print("\rprogress [%.2f%% complete]" % (t * 100.0 / nF), end='')
-    print('\nDone')
-
-    if format == 'csr':
-        ri_dict = csr_matrix(ri_dict)
-    return ri_dict
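For reference, a minimal usage sketch of the RandomIndexingBoC transformer deleted just above; the matrix sizes, density and random seed are toy values chosen only for illustration, and the commented import path assumes the module layout shown in this diff (src/util_transformers/riboc.py):

import numpy as np
from scipy.sparse import csr_matrix
# from util_transformers.riboc import RandomIndexingBoC  # assumed import path

rng = np.random.default_rng(0)
# hypothetical bag-of-words matrix: 20 documents over a 1000-term vocabulary
X = csr_matrix((rng.random((20, 1000)) > 0.99).astype(float))

ribo = RandomIndexingBoC(latent_dimensions=100, non_zeros=2)
Z = ribo.fit_transform(X)   # random projection of the BoW space
print(Z.shape)              # (20, 100), sparse csr matrix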
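Likewise, a sketch of how the DistributionalCorrespondenceIndexing transformer from dci.py is meant to be driven; the domain names, shapes and densities are invented, the 'linear' DCF is picked here only because it operates on occurrence sets, and fit() is called separately from transform() because fit() does not return self:

import numpy as np
from scipy.sparse import csr_matrix
# from util_transformers.dci import DistributionalCorrespondenceIndexing  # assumed import path

rng = np.random.default_rng(1)
n_docs, n_terms, n_pivots = 30, 200, 10

def toy_matrix(rows, cols, density=0.05):
    # sparse 0/1 matrix standing in for a document-by-term (or document-by-pivot) DSM
    return csr_matrix((rng.random((rows, cols)) < density).astype(float))

dU = {'books': toy_matrix(n_docs, n_terms), 'music': toy_matrix(n_docs, n_terms)}    # doc-by-term per domain
dP = {'books': toy_matrix(n_docs, n_pivots), 'music': toy_matrix(n_docs, n_pivots)}  # doc-by-pivot per domain

dci = DistributionalCorrespondenceIndexing(dcf='linear', post='l2', n_jobs=1)
dci.fit(dU, dP)
dZ = dci.transform({'books': toy_matrix(5, n_terms)})
print(dZ['books'].shape)    # (5, 10): documents projected onto the pivot dimensions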