bulk upload after refactoring

This commit is contained in:
Andrea Pedrotti 2023-02-07 18:40:17 +01:00
commit 6b75483b55
19 changed files with 2797 additions and 0 deletions

179
.gitignore vendored Normal file

@@ -0,0 +1,179 @@
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets
# Local History for Visual Studio Code
.history/
# Built Visual Studio Code Extensions
*.vsix
# Byte-compiled / optimized / DLL files
__pycache__/
__pycache__
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# user defined
out/*
amazon_cateogories.bu.txt
models/*

21
amazon_categories.txt Executable file

@@ -0,0 +1,21 @@
Appliances
Arts Crafts and Sewing
Automotive
CDs and Vinyl
Cell Phones and Accessories
Electronics
Grocery and Gourmet Food
Home and Kitchen
Industrial and Scientific
Luxury Beauty
Magazine Subscriptions
Movies and TV
Musical Instruments
Office Products
Patio Lawn and Garden
Pet Supplies
Software
Sports and Outdoors
Tools and Home Improvement
Toys and Games
Video Games


@@ -0,0 +1,370 @@
import gzip
import os
import re
import warnings
from argparse import ArgumentParser
from collections import Counter
import numpy as np
from bs4 import BeautifulSoup
from sklearn.preprocessing import MultiLabelBinarizer
from plotters.distributions import plot_distribution
# TODO: AmazonDataset should be an instance of MultimodalDataset
warnings.filterwarnings("ignore", category=UserWarning, module="bs4")
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
BASEPATH = "/home/moreo/Datasets/raw"
with open("dataManager/excluded.csv", "r") as f:
EXCLUDED = f.read().splitlines()
REGEX = re.compile(r"\s{2,}", re.MULTILINE)
def parse(dataset_name, ext="json.gz", nrows=0):
dataset_name = dataset_name.replace(" ", "_")
meta_path = os.path.join(BASEPATH, f"meta_{dataset_name}.{ext}")
path = os.path.join(BASEPATH, f"{dataset_name}.{ext}")
mapper = {"false": False, "true": True}
data = []
metadata = []
_data = gzip.open(path, "r")
_metadata = gzip.open(meta_path, "r")
for i, (d, m) in enumerate(zip(_data, _metadata)):
data.append(eval(d.replace(b"&amp;", b"&"), mapper))
metadata.append(eval(m.replace(b"&amp;", b"&"), mapper))
if i + 1 == nrows:
break
return data, metadata
def get_categories(data, min_count=0):
if data[0].get("category", None) is None:
return [], set()
categories = []
for item in data:
if item["category"] != "":
categories.extend(item["category"])
categories = list(filter(lambda x: x not in EXCLUDED, categories))
# return categories, sorted(set(categories))
return categories, _filter_counter(Counter(categories), min_count)
def _filter_counter(counter, min_count):
return {k: v for k, v in counter.items() if v >= min_count}
def get_main_cat(data, min_count=0):
if data[0].get("main_cat", None) is None:
return [], set()
main_cats = [item["main_cat"] for item in data if item["main_cat"] != ""]
main_cats = list(filter(lambda x: x not in EXCLUDED, main_cats))
# return main_cats, sorted(set(main_cats))
return main_cats, _filter_counter(Counter(main_cats), min_count)
def filter_sample_with_images(metadata):
# TODO: check whether images are really available and store them locally
# print(f"(Pre-filter) Total items: {len(metadata)}")
data = []
for i, m in enumerate(metadata):
if "imageURL" not in m.keys():
continue
if len(m["imageURL"]) != 0 or len(m["imageURLHighRes"]) != 0:
data.append(m)
# print(f"(Post-filter) Total items: {len(data)}")
return data
def select_description(descriptions):
"""
Some items have multiple descriptions (len(item["description"]) > 1).
Most of these descriptions are just empty strings, while some items actually have
multiple strings describing them.
At the moment, we rely on a simple heuristic: select the longest string and use it
as the only description.
"""
if len(descriptions) == 0:
return [""]
return [max(descriptions, key=len)]
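# Illustrative example (hypothetical values):
#   select_description(["", "Short blurb.", "A much longer product description."])
#   -> ["A much longer product description."]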
def build_product_json(metadata, binarizer):
data = []
for item in metadata:
if len(item["description"]) != 1:
item["description"] = select_description(item["description"])
product = {
"asin": item["asin"],
"title": item["title"],
"description": item["description"],
# TODO: some items have multiple descriptions (len(item["description"]) > 1)
"cleaned_description": clean_description(
BeautifulSoup(
item["title"] + ". " + item["description"][0],
features="html.parser",
).text
),
# TODO: is it faster to call transform on the whole dataset?
"main_category": item["main_cat"],
"categories": item["category"],
"all_categories": _get_cats(item["main_cat"], item["category"]),
"vect_categories": binarizer.transform(
[_get_cats(item["main_cat"], item["category"])]
)[0],
}
data.append(product)
return data
def _get_cats(main_cat, cats):
return [main_cat] + cats
def get_label_binarizer(cats):
mlb = MultiLabelBinarizer()
mlb.fit([cats])
return mlb
def clean_description(description):
description = re.sub(REGEX, " ", description)
description = description.rstrip()
description = description.replace("\t", "")
description = description.replace("\n", " ")
return description
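# Illustrative example (hypothetical input):
#   clean_description("Great   product\t\nwith  extra   whitespace ")
#   -> "Great product with extra whitespace"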
def construct_target_matrix(data):
return np.stack([d["vect_categories"] for d in data], axis=0)
def get_all_classes(counter_cats, counter_sub_cats):
if len(counter_cats) == 0:
return counter_sub_cats.keys()
elif len(counter_sub_cats) == 0:
return counter_cats.keys()
else:
return list(counter_cats.keys()) + list(counter_sub_cats.keys())
class AmazonDataset:
def __init__(
self,
domains=["Appliances", "Automotive", "Movies and TV"],
basepath="/home/moreo/Datasets/raw",
min_count=10,
max_labels=50,
nrows=1000,
):
print(f"[Init AmazonDataset]")
print(f"- Domains: {domains}")
self.REGEX = re.compile(r"\s{2,}", re.MULTILINE)
with open("dataManager/excluded.csv", "r") as f:
self.EXCLUDED = f.read().splitlines()
self.basepath = basepath
self.domains = self.parse_domains(domains)
self.nrows = nrows
self.min_count = min_count
self.max_labels = max_labels
self.len_data = 0
self.domain_data = self.load_data()
self.labels, self.domain_labels = self.get_all_cats()
self.label_binarizer = get_label_binarizer(self.labels)
self.vectorized_labels = self.vecorize_labels()
self.dX = self.construct_data_matrix()
self.dY = self.construct_target_matrix()
self.langs = ["en"]
def parse_domains(self, domains):
with open("amazon_categories.txt", "r") as f:
all_domains = f.read().splitlines()
if domains == "all":
return all_domains
else:
assert all([d in all_domains for d in domains]), "Invalid domain name"
return domains
def parse(self, dataset_name, nrows, ext="json.gz"):
dataset_name = dataset_name.replace(" ", "_")
meta_path = os.path.join(self.basepath, f"meta_{dataset_name}.{ext}")
path = os.path.join(self.basepath, f"{dataset_name}.{ext}")
mapper = {"false": False, "true": True}
data = []
metadata = []
_data = gzip.open(path, "r")
_metadata = gzip.open(meta_path, "r")
for i, (d, m) in enumerate(zip(_data, _metadata)):
data.append(eval(d.replace(b"&amp;", b"&"), mapper))
metadata.append(eval(m.replace(b"&amp;", b"&"), mapper))
if i + 1 == nrows:
break
return data, metadata
def load_data(self):
print(f"- Loading up to {self.nrows} items per domain")
domain_data = {}
for domain in self.domains:
_, metadata = self.parse(domain, nrows=self.nrows)
metadata = filter_sample_with_images(metadata)
domain_data[domain] = self.build_product_scheme(metadata)
self.len_data += len(metadata)
print(f"- Loaded {self.len_data} items")
return domain_data
def get_all_cats(self):
assert len(self.domain_data) != 0, "Load data first"
labels = set()
domain_labels = {}
for domain, data in self.domain_data.items():
_, counter_cats = self._get_counter_cats(data, self.min_count)
labels.update(counter_cats.keys())
domain_labels[domain] = counter_cats
print(f"- Found {len(labels)} labels")
return labels, domain_labels
def export_to_torch(self):
pass
def get_label_binarizer(self):
mlb = MultiLabelBinarizer()
mlb.fit([self.labels])
return mlb
def vecorize_labels(self):
for domain, data in self.domain_data.items():
for item in data:
item["vect_categories"] = self.label_binarizer.transform(
[item["all_categories"]]
)[0]
def build_product_scheme(self, metadata):
data = []
for item in metadata:
if len(item["description"]) != 1:
_desc = self._select_description(item["description"])
else:
_desc = item["description"][0]
product = {
"asin": item["asin"],
"title": item["title"],
"description": _desc,
# TODO: some items have multiple descriptions (len(item["description"]) > 1)
"cleaned_text": self._clean_description(
BeautifulSoup(
item["title"] + ". " + _desc,
features="html.parser",
).text
),
# TODO: is it faster to call transform on the whole dataset?
"main_category": item["main_cat"],
"categories": item["category"],
"all_categories": self._get_cats(item["main_cat"], item["category"]),
# "vect_categories": binarizer.transform(
# [_get_cats(item["main_cat"], item["category"])]
# )[0],
}
data.append(product)
return data
def construct_data_matrix(self):
dX = {}
for domain, data in self.domain_data.items():
dX[domain] = [d["cleaned_text"] for d in data]
return dX
def construct_target_matrix(self):
dY = {}
for domain, data in self.domain_data.items():
dY[domain] = np.stack([d["vect_categories"] for d in data], axis=0)
return dY
def get_overall_label_matrix(self):
assert hasattr(self, "label_matrices"), "Init label matrices first"
return np.vstack([x for x in self.dY.values()])
def _get_counter_cats(self, data, min_count):
cats = []
for item in data:
cats.extend(item["all_categories"])
cats = list(filter(lambda x: x not in self.EXCLUDED, cats))
return cats, self._filter_counter(Counter(cats), min_count)
def _filter_counter(self, counter, min_count):
return {k: v for k, v in counter.items() if v >= min_count}
def _clean_description(self, description):
description = re.sub(self.REGEX, " ", description)
description = description.rstrip()
description = description.replace("\t", "")
description = description.replace("\n", " ")
return description
def _get_cats(self, main_cat, cats):
return [main_cat] + cats
def _select_description(self, descriptions) -> str:
"""
Some items have multiple descriptions (len(item["description"]) > 1).
Most of these descriptions are just empty strings, while some items actually have
multiple strings describing them.
At the moment, we rely on a simple heuristic: select the longest string and use it
as the only description.
"""
if len(descriptions) == 0:
return ""
return max(descriptions, key=len)
def plot_label_distribution(self):
overall_mat = self.get_overall_label_matrix()
plot_distribution(
np.arange(len(self.labels)),
np.sum(overall_mat, axis=0),
title="Amazon Dataset",
labels=self.labels,
notes=overall_mat.shape,
max_labels=self.max_labels,
figsize=(10, 10),
save=True,
path="out",
)
def plot_per_domain_label_distribution(self):
for domain, matrix in self.dY.items():
pass
def main(args):
dataset = AmazonDataset(
domains=args.domains,
nrows=args.nrows,
min_count=args.min_count,
max_labels=args.max_labels,
)
dataset.plot_label_distribution()
exit()
if __name__ == "__main__":
import sys
sys.path.append("/home/andreapdr/devel/gFunMultiModal/")
parser = ArgumentParser()
parser.add_argument("--domains", type=str, default="all")
parser.add_argument("--nrows", type=int, default=10000)
parser.add_argument("--min_count", type=int, default=10)
parser.add_argument("--max_labels", type=int, default=50)
args = parser.parse_args()
main(args)

27
dataManager/excluded.csv Normal file

@@ -0,0 +1,27 @@
</span></span></span>
</span></span></span>
<img src="https://m.media-amazon.com/images/G/01/nav2/images/gui/amazon-fashion-store-new._CB520838675_.png" class="nav-categ-image" alt="AMAZON FASHION"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/nav2/images/gui/amazon-fashion-store-new._CB520838675_.png" class="nav-categ-image" alt="AMAZON FASHION"/>
<img src="https://m.media-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music" />
<img src="https://m.media-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music" />
<img src="https://images-na.ssl-images-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry"/>
<img src="https://m.media-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry" />
<img src="https://images-na.ssl-images-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry" />
<img src="https://m.media-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/handmade/brand/logos/2018/subnav_logo._CB502360610_.png" class="nav-categ-image" alt="Handmade"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/handmade/brand/logos/2018/subnav_logo._CB502360610_.png" class="nav-categ-image" alt="Handmade"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/nav2/images/gui/amazon-fashion-store-new._CB520838675_.png" class="nav-categ-image" alt="AMAZON FASHION" />
<img src="https://m.media-amazon.com/images/G/01/nav2/images/gui/amazon-fashion-store-new._CB520838675_.png" class="nav-categ-image" alt="AMAZON FASHION" />
<img src="https://m.media-amazon.com/images/G/01/nav2/images/gui/amazon-fashion-store-new._CB520838675_.png" class="nav-categ-image" alt="AMAZON FASHION" />
<img src="https://m.media-amazon.com/images/G/01/nav2/images/gui/amazon-fashion-store-new._CB520838675_.png" class="nav-categ-image" alt="AMAZON FASHION" />
<img src="https://images-na.ssl-images-amazon.com/images/G/01/nav2/images/gui/amazon-fashion-store-new._CB520838675_.png" class="nav-categ-image" alt="AMAZON FASHION"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music" />
<img src="https://m.media-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music"/>
<img src="https://m.media-amazon.com/images/G/01/digital/music/logos/amzn_music_logo_subnav._CB471835632_.png" class="nav-categ-image" alt="Digital Music" />
<img src="https://m.media-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry"/>
<img src="https://images-na.ssl-images-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry" />
<img src="https://m.media-amazon.com/images/G/01/pantry/subnav/pantry-subnav-logo._CB474181323_.png" class="nav-categ-image" alt="Prime Pantry" />


@@ -0,0 +1,142 @@
import re
from os import listdir
from os.path import isdir, join
from dataManager.torchDataset import TorchMultiNewsDataset
# TODO: labels must be aligned between languages
# TODO: remove copyright and also tags (doc.split("More about:")[0])
# TODO: define fn to represent the dataset as a torch Dataset
# TODO: this should be an instance of an abstract MultimodalMultilingualDataset
class MultiNewsDataset:
def __init__(self, data_dir, excluded_langs=[], debug=False):
self.debug = debug
self.data_dir = data_dir
self.langs = self.get_langs()
self.excluded_langs = excluded_langs
self.lang_multiModalDataset = {}
print(
f"[{'DEBUG MODE: ' if debug else ''}Loaded MultiNewsDataset - langs: {self.langs}]"
)
self.load_data()
self.print_stats()
def load_data(self):
for lang in self.langs:
if lang not in self.excluded_langs:
self.lang_multiModalDataset[lang] = MultiModalDataset(
lang, join(self.data_dir, lang)
)
def get_langs(self):
from os import listdir
if self.debug:
return ["it", "en"]
return tuple(sorted([folder for folder in listdir(self.data_dir)]))
def print_stats(self):
print(f"[MultiNewsDataset stats]")
# print(f" - langs: {self.langs}")
total_docs = 0
for lang in self.langs:
_len = len(self.lang_multiModalDataset[lang].data)
total_docs += _len
print(
f" - {lang} docs: {_len}\t- labels: {self._count_lang_labels(self.lang_multiModalDataset[lang].data)}"
)
print(f" - total docs: {total_docs}")
def _count_lang_labels(self, data):
lang_labels = set()
for sample in data:
lang_labels.update(sample[-1])
return len(lang_labels)
def export_to_torch_dataset(self, tokenizer_id):
raise NotImplementedError
# torch_datasets = []
# for lang, multimodal_dataset in self.lang_multiModalDataset.keys():
# dataset = TorchMultiNewsDataset(
# lang=lang,
# data=multimodal_dataset.get_docs(),
# ids=multimodal_dataset.get_ids(),
# imgs=multimodal_dataset.get_imgs(),
# labels=multimodal_dataset.get_labels(),
# tokenizer_id=tokenizer_id,
# )
# torch_datasets.append(dataset)
# raise NotImplementedError
def save_to_disk(self):
raise NotImplementedError
class MultiModalDataset:
def __init__(self, lang, data_dir):
self.lang = lang
self.data_dir = data_dir
self.re_labels = re.compile(r"<a rel=\"tag\" href=\"\/tag\/.+?\/\">(.+?)<\/a>")
self.re_cleaner = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
self.re_white = re.compile(r" +")
self.data = self.get_docs()
def get_docs(self):
raise NotImplementedError
def get_imgs(self):
raise NotImplementedError
def get_labels(self):
raise NotImplementedError
def get_ids(self):
raise NotImplementedError
def get_docs(self):
data = []
news_folders = [doc_folder for doc_folder in listdir(self.data_dir)]
for news_folder in news_folders:
if isdir(join(self.data_dir, news_folder)):
fname_doc = f"text.{news_folder.split('.')[-1]}"
with open(join(self.data_dir, news_folder, fname_doc)) as f:
html_doc = f.read()
img = self.get_image()
clean_doc, labels = self.preprocess_html(html_doc)
data.append((fname_doc, clean_doc, html_doc, img, labels))
return data
def preprocess_html(self, html_doc):
labels = self._extract_labels(html_doc)
cleaned = self._clean_up_str(self._remove_html_tags(html_doc))
return cleaned, labels
def _extract_labels(self, data):
return re.findall(self.re_labels, data)
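# Illustrative example (hypothetical HTML snippet):
#   applying self.re_labels to '<a rel="tag" href="/tag/sports/">Sports</a>' yields ["Sports"]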
def _remove_html_tags(self, data):
cleaned = re.sub(self.re_cleaner, "", data)
return cleaned
def _clean_up_str(self, doc):
doc = re.sub(self.re_white, " ", doc)
doc = doc.lstrip()
doc = doc.rstrip()
doc = doc.replace("\n", " ")
doc = doc.replace("\t", " ")
return doc
def get_image(self):
# TODO: implement
pass
if __name__ == "__main__":
from os.path import expanduser
_dataset_path_hardcoded = "~/datasets/MultiNews/20110730/"
dataset = MultiNewsDataset(expanduser(_dataset_path_hardcoded), debug=True)


@@ -0,0 +1,270 @@
# TODO: this should be an instance of an abstract MultilingualDataset
from abc import ABC, abstractmethod
from scipy.sparse import issparse, csr_matrix
from os.path import join, expanduser
import pickle
import re
import numpy as np
from tqdm import tqdm
class NewMultilingualDataset(ABC):
@abstractmethod
def get_training(self):
pass
@abstractmethod
def get_validation(self):
pass
@abstractmethod
def get_test(self):
pass
@abstractmethod
def mask_numbers(self):
pass
@abstractmethod
def save(self):
pass
@abstractmethod
def load(self):
pass
# class RcvMultilingualDataset(MultilingualDataset):
class RcvMultilingualDataset:
def __init__(
self,
run="0",
):
self.dataset_name = "rcv1-2"
self.dataset_path = expanduser(
f"~/datasets/rcv1-2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run{run}.pickle"
)
def load(self):
self.data = pickle.load(open(self.dataset_path, "rb"))
return self
class MultilingualDataset:
"""
A multilingual dataset is a dictionary of training and test documents indexed by language code.
Train and test sets are represented as tuples of the type (X,Y,ids), where X is a matrix representation of the
documents (e.g., a document-by-term sparse csr_matrix), Y is a document-by-label binary np.array indicating the
labels of each document, and ids is a list of document-identifiers from the original collection.
"""
def __init__(self, dataset_name):
self.dataset_name = dataset_name
self.multiling_dataset = {}
print(f"[Init Multilingual Dataset: {self.dataset_name}]")
def add(self, lang, Xtr, Ytr, Xte, Yte, tr_ids=None, te_ids=None):
self.multiling_dataset[lang] = ((Xtr, Ytr, tr_ids), (Xte, Yte, te_ids))
def save(self, file):
self.sort_indexes()
pickle.dump(self, open(file, "wb"), pickle.HIGHEST_PROTOCOL)
return self
def __getitem__(self, item):
if item in self.langs():
return self.multiling_dataset[item]
return None
@classmethod
def load(cls, file):
data = pickle.load(open(file, "rb"))
data.sort_indexes()
return data
@classmethod
def load_ids(cls, file):
data = pickle.load(open(file, "rb"))
tr_ids = {
lang: tr_ids
for (lang, ((_, _, tr_ids), (_, _, _))) in data.multiling_dataset.items()
}
te_ids = {
lang: te_ids
for (lang, ((_, _, _), (_, _, te_ids))) in data.multiling_dataset.items()
}
return tr_ids, te_ids
def sort_indexes(self):
for lang, ((Xtr, _, _), (Xte, _, _)) in self.multiling_dataset.items():
if issparse(Xtr):
Xtr.sort_indices()
if issparse(Xte):
Xte.sort_indices()
def set_view(self, categories=None, languages=None):
if categories is not None:
if isinstance(categories, int):
categories = np.array([categories])
elif isinstance(categories, list):
categories = np.array(categories)
self.categories_view = categories
if languages is not None:
self.languages_view = languages
def training(self, mask_numbers=False, target_as_csr=False):
return self.lXtr(mask_numbers), self.lYtr(as_csr=target_as_csr)
def test(self, mask_numbers=False, target_as_csr=False):
return self.lXte(mask_numbers), self.lYte(as_csr=target_as_csr)
def lXtr(self, mask_numbers=False):
proc = lambda x: _mask_numbers(x) if mask_numbers else x
# return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if lang in self.langs()}
return {
lang: proc(Xtr)
for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items()
if lang in self.langs()
}
def lXte(self, mask_numbers=False):
proc = lambda x: _mask_numbers(x) if mask_numbers else x
# return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if lang in self.langs()}
return {
lang: proc(Xte)
for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items()
if lang in self.langs()
}
def lYtr(self, as_csr=False):
lY = {
lang: self.cat_view(Ytr)
for (lang, ((_, Ytr, _), _)) in self.multiling_dataset.items()
if lang in self.langs()
}
if as_csr:
lY = {l: csr_matrix(Y) for l, Y in lY.items()}
return lY
def lYte(self, as_csr=False):
lY = {
lang: self.cat_view(Yte)
for (lang, (_, (_, Yte, _))) in self.multiling_dataset.items()
if lang in self.langs()
}
if as_csr:
lY = {l: csr_matrix(Y) for l, Y in lY.items()}
return lY
def cat_view(self, Y):
if hasattr(self, "categories_view"):
return Y[:, self.categories_view]
else:
return Y
def langs(self):
if hasattr(self, "languages_view"):
langs = self.languages_view
else:
langs = sorted(self.multiling_dataset.keys())
return langs
def num_categories(self):
return self.lYtr()[self.langs()[0]].shape[1]
def show_dimensions(self):
def shape(X):
return X.shape if hasattr(X, "shape") else len(X)
for lang, (
(Xtr, Ytr, IDtr),
(Xte, Yte, IDte),
) in self.multiling_dataset.items():
if lang not in self.langs():
continue
print(
"Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(
lang,
shape(Xtr),
self.cat_view(Ytr).shape,
shape(Xte),
self.cat_view(Yte).shape,
)
)
def show_category_prevalences(self):
nC = self.num_categories()
accum_tr = np.zeros(nC, dtype=int)
accum_te = np.zeros(nC, dtype=int)
in_langs = np.zeros(
nC, dtype=int
) # count languages with at least one positive example (per category)
for lang, (
(Xtr, Ytr, IDtr),
(Xte, Yte, IDte),
) in self.multiling_dataset.items():
if lang not in self.langs():
continue
prev_train = np.sum(self.cat_view(Ytr), axis=0)
prev_test = np.sum(self.cat_view(Yte), axis=0)
accum_tr += prev_train
accum_te += prev_test
in_langs += (prev_train > 0) * 1
print(lang + "-train", prev_train)
print(lang + "-test", prev_test)
print("all-train", accum_tr)
print("all-test", accum_te)
return accum_tr, accum_te, in_langs
def set_labels(self, labels):
self.labels = labels
def reduce_data(self, langs=["it", "en"], maxn=50):
print(f"- Reducing data: {langs} with max {maxn} documents...")
self.set_view(languages=langs)
data = {
lang: self._reduce(data, maxn)
for lang, data in self.multiling_dataset.items()
if lang in langs
}
self.multiling_dataset = data
return self
def _reduce(self, multilingual_dataset, maxn):
new_data = []
for split in multilingual_dataset:
docs, labels, ids = split
new_data.append((docs[:maxn], labels[:maxn], ids[:maxn]))
return new_data
def _mask_numbers(data):
mask_moredigit = re.compile(r"\s[\+-]?\d{5,}([\.,]\d*)*\b")
mask_4digit = re.compile(r"\s[\+-]?\d{4}([\.,]\d*)*\b")
mask_3digit = re.compile(r"\s[\+-]?\d{3}([\.,]\d*)*\b")
mask_2digit = re.compile(r"\s[\+-]?\d{2}([\.,]\d*)*\b")
mask_1digit = re.compile(r"\s[\+-]?\d{1}([\.,]\d*)*\b")
masked = []
for text in tqdm(data, desc="masking numbers"):
text = " " + text
text = mask_moredigit.sub(" MoreDigitMask", text)
text = mask_4digit.sub(" FourDigitMask", text)
text = mask_3digit.sub(" ThreeDigitMask", text)
text = mask_2digit.sub(" TwoDigitMask", text)
text = mask_1digit.sub(" OneDigitMask", text)
masked.append(text.replace(".", "").replace(",", "").strip())
return masked
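# Illustrative example (hypothetical input):
#   _mask_numbers(["revenue grew 1234 units in 2019"])
#   -> ["revenue grew FourDigitMask units in FourDigitMask"]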
if __name__ == "__main__":
DATAPATH = expanduser(
"~/datasets/rcv1-2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle"
)
print(DATAPATH)
dataset = MultilingualDataset.load(DATAPATH)
print(dataset.show_dimensions())


@@ -0,0 +1,2 @@
class TorchMultiNewsDataset:
pass

40
evaluation/evaluate.py Normal file

@@ -0,0 +1,40 @@
from joblib import Parallel, delayed
from evaluation.metrics import *
def evaluation_metrics(y, y_):
if len(y.shape) == len(y_.shape) == 1 and len(np.unique(y)) > 2: # single-label
raise NotImplementedError() # return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro')
else: # the metrics I implemented assume multiclass multilabel classification as binary classifiers
# return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_), macroP(y, y_), microP(y, y_), macroR(y, y_), microR(y, y_)
# return macroF1(y, y_), microF1(y, y_), macroAcc(y, y_), microAcc(y, y_), macroP(y, y_), microP(y, y_), macroR(y, y_), microR(y, y_), macroAcc(y, y_)
return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_)
def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1):
if n_jobs == 1:
return {lang: metrics(ly_true[lang], ly_pred[lang]) for lang in ly_true.keys()}
else:
langs = list(ly_true.keys())
evals = Parallel(n_jobs=n_jobs)(
delayed(metrics)(ly_true[lang], ly_pred[lang]) for lang in langs
)
return {lang: evals[i] for i, lang in enumerate(langs)}
def log_eval(l_eval, phase="training"):
print(f"\n[Results {phase}]")
metrics = []
for lang in l_eval.keys():
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
if phase != "validation":
print(f"Lang {lang}: macro-F1 = {macrof1:.3f} micro-F1 = {microf1:.3f}")
averages = np.mean(np.array(metrics), axis=0)
print(
"Averages: MF1, mF1, MK, mK",
np.round(averages, 3),
"\n",
)
return averages

237
evaluation/metrics.py Normal file

@@ -0,0 +1,237 @@
import numpy as np
class ContTable:
def __init__(self, tp=0, tn=0, fp=0, fn=0):
self.tp = tp
self.tn = tn
self.fp = fp
self.fn = fn
def get_d(self):
return self.tp + self.tn + self.fp + self.fn
def get_c(self):
return self.tp + self.fn
def get_not_c(self):
return self.tn + self.fp
def get_f(self):
return self.tp + self.fp
def get_not_f(self):
return self.tn + self.fn
def p_c(self):
return (1.0 * self.get_c()) / self.get_d()
def p_not_c(self):
return 1.0 - self.p_c()
def p_f(self):
return (1.0 * self.get_f()) / self.get_d()
def p_not_f(self):
return 1.0 - self.p_f()
def p_tp(self):
return (1.0 * self.tp) / self.get_d()
def p_tn(self):
return (1.0 * self.tn) / self.get_d()
def p_fp(self):
return (1.0 * self.fp) / self.get_d()
def p_fn(self):
return (1.0 * self.fn) / self.get_d()
def tpr(self):
c = 1.0 * self.get_c()
return self.tp / c if c > 0.0 else 0.0
def fpr(self):
_c = 1.0 * self.get_not_c()
return self.fp / _c if _c > 0.0 else 0.0
def __add__(self, other):
return ContTable(
tp=self.tp + other.tp,
tn=self.tn + other.tn,
fp=self.fp + other.fp,
fn=self.fn + other.fn,
)
def accuracy(cell):
return (cell.tp + cell.tn) * 1.0 / (cell.tp + cell.fp + cell.fn + cell.tn)
def precision(cell):
num = cell.tp
den = cell.tp + cell.fp
if den > 0:
return num / den
return 1.0
num = cell.tn
den = cell.tn + cell.fn
return num / den
def recall(cell):
num = cell.tp
den = cell.tp + cell.fn
if den > 0:
return num / den
return 1.0
num = cell.tn
den = cell.tn + cell.fp
return num / den
def f1(cell):
num = 2.0 * cell.tp
den = 2.0 * cell.tp + cell.fp + cell.fn
if den > 0:
return num / den
# we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative
return 1.0
def K(cell):
specificity, recall = 0.0, 0.0
AN = cell.tn + cell.fp
if AN != 0:
specificity = cell.tn * 1.0 / AN
AP = cell.tp + cell.fn
if AP != 0:
recall = cell.tp * 1.0 / AP
if AP == 0:
return 2.0 * specificity - 1.0
elif AN == 0:
return 2.0 * recall - 1.0
else:
return specificity + recall - 1.0
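# Illustrative examples (hypothetical contingency tables):
#   K(ContTable(tp=5, tn=5, fp=0, fn=0)) ->  1.0  (perfect predictions)
#   K(ContTable(tp=0, tn=0, fp=5, fn=5)) -> -1.0  (all predictions wrong)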
# if the classifier is single class, then the prediction is a vector of shape=(nD,) which causes issues when compared
# to the true labels (of shape=(nD,1)). This method increases the dimensions of the predictions.
def __check_consistency_and_adapt(true_labels, predictions):
if predictions.ndim == 1:
return __check_consistency_and_adapt(
true_labels, np.expand_dims(predictions, axis=1)
)
if true_labels.ndim == 1:
return __check_consistency_and_adapt(
np.expand_dims(true_labels, axis=1), predictions
)
if true_labels.shape != predictions.shape:
raise ValueError(
"True and predicted label matrices shapes are inconsistent %s %s."
% (true_labels.shape, predictions.shape)
)
_, nC = true_labels.shape
return true_labels, predictions, nC
# computes the (soft) contingency table where tp, fp, fn, and tn are the cumulative masses of the posterior
# probabilities with respect to the true binary labels
# true_labels and posterior_probabilities are two vectors of shape (number_documents,)
def soft_single_metric_statistics(true_labels, posterior_probabilities):
assert len(true_labels) == len(
posterior_probabilities
), "Format not consistent between true and predicted labels."
tp = np.sum(posterior_probabilities[true_labels == 1])
fn = np.sum(1.0 - posterior_probabilities[true_labels == 1])
fp = np.sum(posterior_probabilities[true_labels == 0])
tn = np.sum(1.0 - posterior_probabilities[true_labels == 0])
return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)
# computes the (hard) counters tp, fp, fn, and tn from the true and predicted vectors of hard decisions
# true_labels and predicted_labels are two vectors of shape (number_documents,)
def hard_single_metric_statistics(true_labels, predicted_labels):
assert len(true_labels) == len(
predicted_labels
), "Format not consistent between true and predicted labels."
nd = len(true_labels)
tp = np.sum(predicted_labels[true_labels == 1])
fp = np.sum(predicted_labels[true_labels == 0])
fn = np.sum(true_labels[predicted_labels == 0])
tn = nd - (tp + fp + fn)
return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)
def macro_average(
true_labels,
predicted_labels,
metric,
metric_statistics=hard_single_metric_statistics,
):
true_labels, predicted_labels, nC = __check_consistency_and_adapt(
true_labels, predicted_labels
)
return np.mean(
[
metric(metric_statistics(true_labels[:, c], predicted_labels[:, c]))
for c in range(nC)
]
)
def micro_average(
true_labels,
predicted_labels,
metric,
metric_statistics=hard_single_metric_statistics,
):
true_labels, predicted_labels, nC = __check_consistency_and_adapt(
true_labels, predicted_labels
)
accum = ContTable()
for c in range(nC):
other = metric_statistics(true_labels[:, c], predicted_labels[:, c])
accum = accum + other
return metric(accum)
def macroP(true_labels, predicted_labels):
return macro_average(true_labels, predicted_labels, precision)
def microP(true_labels, predicted_labels):
return micro_average(true_labels, predicted_labels, precision)
def macroR(true_labels, predicted_labels):
return macro_average(true_labels, predicted_labels, recall)
def microR(true_labels, predicted_labels):
return micro_average(true_labels, predicted_labels, recall)
# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def macroF1(true_labels, predicted_labels):
return macro_average(true_labels, predicted_labels, f1)
# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def microF1(true_labels, predicted_labels):
return micro_average(true_labels, predicted_labels, f1)
# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def macroK(true_labels, predicted_labels):
return macro_average(true_labels, predicted_labels, K)
# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def microK(true_labels, predicted_labels):
return micro_average(true_labels, predicted_labels, K)
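# Minimal sketch of how these metrics are typically invoked (hypothetical values):
#   y_true = np.array([[1, 0], [0, 1]])
#   y_pred = np.array([[1, 0], [1, 1]])
#   macroF1(y_true, y_pred)  # class 0: F1=2/3, class 1: F1=1.0 -> 0.833...
#   microF1(y_true, y_pred)  # pooled tp=2, fp=1, fn=0 -> 0.8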


@@ -0,0 +1,182 @@
import os
import sys
sys.path.append(os.path.join(os.getcwd(), "gfun"))
import pickle
import numpy as np
from vgfs.commons import TfidfVectorizerMultilingual
from vgfs.learners.svms import MetaClassifier, get_learner
from vgfs.multilingualGen import MultilingualGen
from vgfs.transformerGen import TransformerGen
from vgfs.vanillaFun import VanillaFunGen
from vgfs.wceGen import WceGen
# TODO: save and load gfun model
class GeneralizedFunnelling:
def __init__(
self,
posterior,
wce,
multilingual,
transformer,
langs,
embed_dir,
n_jobs,
batch_size,
max_length,
lr,
epochs,
patience,
evaluate_step,
transformer_name,
):
# Forcing VFGs -----------
self.posteriors_vgf = posterior
self.wce_vgf = wce
self.multilingual_vgf = multilingual
self.trasformer_vgf = transformer
# ------------------------
self.langs = langs
self.embed_dir = embed_dir
self.cached = True
# Transformer VGF params
self.transformer_name = transformer_name
self.epochs = epochs
self.lr_transformer = lr
self.batch_size_transformer = batch_size
self.max_length = max_length
self.early_stopping = True
self.patience = patience
self.evaluate_step = evaluate_step
# -------------------
self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
self.n_jobs = n_jobs
self.first_tier_learners = []
self.metaclassifier = None
self.aggfunc = "mean"
self.init()
def init(self):
print("[Init GeneralizedFunnelling]")
if self.posteriors_vgf:
fun = VanillaFunGen(
base_learner=get_learner(calibrate=True),
first_tier_parameters=None,
n_jobs=self.n_jobs,
)
self.first_tier_learners.append(fun)
if self.multilingual_vgf:
multilingual_vgf = MultilingualGen(
embed_dir=self.embed_dir,
langs=self.langs,
n_jobs=self.n_jobs,
cached=self.cached,
probabilistic=True,
)
self.first_tier_learners.append(multilingual_vgf)
if self.wce_vgf:
wce_vgf = WceGen(n_jobs=self.n_jobs)
self.first_tier_learners.append(wce_vgf)
if self.trasformer_vgf:
transformer_vgf = TransformerGen(
model_name=self.transformer_name,
lr=self.lr_transformer,
epochs=self.epochs,
batch_size=self.batch_size_transformer,
max_length=self.max_length,
device="cuda",
print_steps=50,
probabilistic=True,
evaluate_step=self.evaluate_step,
verbose=True,
patience=self.patience,
)
self.first_tier_learners.append(transformer_vgf)
self.metaclassifier = MetaClassifier(
meta_learner=get_learner(calibrate=True, kernel="rbf"),
meta_parameters=get_params(),
n_jobs=self.n_jobs,
)
def init_vgfs_vectorizers(self):
for vgf in self.first_tier_learners:
if isinstance(vgf, (VanillaFunGen, MultilingualGen, WceGen)):
vgf.vectorizer = self.vectorizer
def fit(self, lX, lY):
print("[Fitting GeneralizedFunnelling]")
self.vectorizer.fit(lX)
self.init_vgfs_vectorizers()
projections = []
print("- fitting first tier learners")
for vgf in self.first_tier_learners:
l_posteriors = vgf.fit_transform(lX, lY)
projections.append(l_posteriors)
agg = self.aggregate(projections)
self.metaclassifier.fit(agg, lY)
return self
def transform(self, lX):
projections = []
for vgf in self.first_tier_learners:
l_posteriors = vgf.transform(lX)
projections.append(l_posteriors)
agg = self.aggregate(projections)
l_out = self.metaclassifier.predict_proba(agg)
return l_out
def fit_transform(self, lX, lY):
return self.fit(lX, lY).transform(lX)
def aggregate(self, first_tier_projections):
if self.aggfunc == "mean":
aggregated = self._aggregate_mean(first_tier_projections)
else:
raise NotImplementedError
return aggregated
def _aggregate_mean(self, first_tier_projections):
# TODO: default dict for one-liner?
aggregated = {
lang: np.zeros(data.shape)
for lang, data in first_tier_projections[0].items()
}
for lang_projections in first_tier_projections:
for lang, projection in lang_projections.items():
aggregated[lang] += projection
# Computing mean
for lang, projection in aggregated.items():
aggregated[lang] /= len(first_tier_projections)
return aggregated
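# Illustrative example (hypothetical posteriors from two VGFs for one language):
#   given projections [{"en": [[0.2, 0.8]]}, {"en": [[0.4, 0.6]]}],
#   the element-wise mean yields {"en": [[0.3, 0.7]]}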
def get_config(self):
from pprint import pprint
# TODO
print("[GeneralizedFunnelling config]")
print(f"- langs: {self.langs}")
print("-- vgfs:")
for vgf in self.first_tier_learners:
pprint(vgf.get_config())
def get_params(optimc=False):
if not optimc:
return None
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
kernel = "rbf"
return [{"kernel": [kernel], "C": c_range, "gamma": ["auto"]}]

74
gfun/vgfs/commons.py Normal file

@@ -0,0 +1,74 @@
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import numpy as np
def _normalize(lX, l2=True):
return {lang: normalize(np.asarray(X)) for lang, X in lX.items()} if l2 else lX
def XdotM(X, M, sif):
E = X.dot(M)
if sif:
E = remove_pc(E, npc=1)
return E
def remove_pc(X, npc=1):
"""
Remove the projection on the principal components
:param X: X[i,:] is a data point
:param npc: number of principal components to remove
:return: XX[i, :] is the data point after removing its projection
"""
pc = compute_pc(X, npc)
if npc == 1:
XX = X - X.dot(pc.transpose()) * pc
else:
XX = X - X.dot(pc.transpose()).dot(pc)
return XX
class TfidfVectorizerMultilingual:
def __init__(self, **kwargs):
self.kwargs = kwargs
def fit(self, lX, ly=None):
self.langs = sorted(lX.keys())
self.vectorizer = {
l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs
}
return self
def transform(self, lX):
return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs}
def fit_transform(self, lX, ly=None):
return self.fit(lX, ly).transform(lX)
def vocabulary(self, l=None):
if l is None:
return {l: self.vectorizer[l].vocabulary_ for l in self.langs}
else:
return self.vectorizer[l].vocabulary_
def get_analyzer(self, l=None):
if l is None:
return {l: self.vectorizer[l].build_analyzer() for l in self.langs}
else:
return self.vectorizer[l].build_analyzer()
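# Hypothetical usage sketch:
#   vec = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
#   lX = {"en": ["first document", "second document"], "it": ["primo documento"]}
#   lT = vec.fit_transform(lX)   # {"en": csr_matrix, "it": csr_matrix}
#   vec.vocabulary("en")         # {"first": ..., "document": ..., "second": ...}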
def compute_pc(X, npc=1):
"""
Compute the principal components.
:param X: X[i,:] is a data point
:param npc: number of principal components to remove
:return: component_[i,:] is the i-th pc
"""
if isinstance(X, np.matrix):
X = np.asarray(X)
svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
svd.fit(X)
return svd.components_

354
gfun/vgfs/learners/svms.py Normal file

@@ -0,0 +1,354 @@
import time
import numpy as np
from joblib import Parallel, delayed
from scipy.sparse import issparse
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import normalize
from sklearn.svm import SVC
def _sort_if_sparse(X):
if issparse(X) and not X.has_sorted_indices:
X.sort_indices()
def get_learner(calibrate=False, kernel="linear", C=1):
"""
instantiate scikit Support Vector Classifier
:param calibrate: boolean, whether to return posterior probabilities or not
:param kernel: string,kernel to be applied to the SVC
:param C: int or dict {'C': list of integer}, Regularization parameter
:return: Support Vector Classifier
"""
return SVC(
kernel=kernel,
probability=calibrate,
cache_size=1000,
C=C,
random_state=1,
gamma="auto",
verbose=False,
)
def _joblib_transform_multiling(transformer, lX, n_jobs=-1):
if n_jobs == 1:
return {lang: transformer(lX[lang]) for lang in lX.keys()}
else:
langs = list(lX.keys())
transformations = Parallel(n_jobs=n_jobs)(
delayed(transformer)(lX[lang]) for lang in langs
)
return {lang: transformations[i] for i, lang in enumerate(langs)}
class MonolingualClassifier:
def __init__(self, base_learner, parameters=None, n_jobs=-1):
self.learner = base_learner
self.parameters = parameters
self.model = None
self.best_params_ = None
self.n_jobs = n_jobs
def fit(self, X, y):
tinit = time.time()
_sort_if_sparse(X)
self.empty_categories = np.argwhere(np.sum(y, axis=0) == 0).flatten()
# multi-class format
if len(y.shape) == 2:
if self.parameters is not None:
self.parameters = [
{"estimator__" + key: params[key] for key in params.keys()}
for params in self.parameters
]
self.model = OneVsRestClassifier(self.learner, n_jobs=self.n_jobs)
else:
self.model = self.learner
raise NotImplementedError(
"not working as a base-classifier for funneling if there are gaps in "
"the labels across languages"
)
# parameter optimization?
if self.parameters:
print("debug: optimizing parameters:", self.parameters)
self.model = GridSearchCV(
self.model,
param_grid=self.parameters,
refit=True,
cv=5,
n_jobs=self.n_jobs,
error_score=0,
verbose=10,
)
# print(f"-- Fitting learner on matrices X={X.shape} Y={y.shape}")
self.model.fit(X, y)
if isinstance(self.model, GridSearchCV):
self.best_params_ = self.model.best_params_
print("best parameters: ", self.best_params_)
self.time = time.time() - tinit
return self
def decision_function(self, X):
assert self.model is not None, "predict called before fit"
_sort_if_sparse(X)
return self.model.decision_function(X)
def predict_proba(self, X):
assert self.model is not None, "predict called before fit"
assert hasattr(
self.model, "predict_proba"
), "the probability predictions are not enabled in this model"
_sort_if_sparse(X)
return self.model.predict_proba(X)
def predict(self, X):
assert self.model is not None, "predict called before fit"
_sort_if_sparse(X)
return self.model.predict(X)
def best_params(self):
raise NotImplementedError
class NaivePolylingualClassifier:
"""
A mere set of independent MonolingualClassifiers.
"""
def __init__(self, base_learner, parameters=None, n_jobs=-1):
self.base_learner = base_learner
self.parameters = parameters
self.model = None
self.n_jobs = n_jobs
def fit(self, lX, ly):
"""
trains the independent monolingual classifiers
:param lX: a dictionary {language_label: X csr-matrix}
:param ly: a dictionary {language_label: y np.array}
:return: self
"""
tinit = time.time()
assert set(lX.keys()) == set(ly.keys()), "inconsistent language mappings in fit"
langs = list(lX.keys())
for lang in langs:
_sort_if_sparse(lX[lang])
models = Parallel(n_jobs=self.n_jobs)(
delayed(
MonolingualClassifier(self.base_learner, parameters=self.parameters).fit
)((lX[lang]), ly[lang])
for lang in langs
)
self.model = {lang: models[i] for i, lang in enumerate(langs)}
self.empty_categories = {
lang: self.model[lang].empty_categories for lang in langs
}
self.time = time.time() - tinit
return self
def decision_function(self, lX):
"""
:param lX: a dictionary {language_label: X csr-matrix}
:return: a dictionary of classification scores for each class
"""
assert self.model is not None, "predict called before fit"
assert set(lX.keys()).issubset(
set(self.model.keys())
), "unknown languages requested in decision function"
langs = list(lX.keys())
scores = Parallel(n_jobs=self.n_jobs)(
delayed(self.model[lang].decision_function)(lX[lang]) for lang in langs
)
return {lang: scores[i] for i, lang in enumerate(langs)}
def predict_proba(self, lX):
"""
:param lX: a dictionary {language_label: X csr-matrix}
:return: a dictionary of probabilities that each document belongs to each class
"""
assert self.model is not None, "predict called before fit"
assert set(lX.keys()).issubset(
set(self.model.keys())
), "unknown languages requested in decision function"
langs = list(lX.keys())
scores = Parallel(n_jobs=self.n_jobs, max_nbytes=None)(
delayed(self.model[lang].predict_proba)(lX[lang]) for lang in langs
)
return {lang: scores[i] for i, lang in enumerate(langs)}
def predict(self, lX):
"""
:param lX: a dictionary {language_label: X csr-matrix}
:return: a dictionary of predictions
"""
assert self.model is not None, "predict called before fit"
assert set(lX.keys()).issubset(
set(self.model.keys())
), "unknown languages requested in predict"
if self.n_jobs == 1:
return {lang: self.model[lang].predict(lX[lang]) for lang in lX.keys()}
else:
langs = list(lX.keys())
scores = Parallel(n_jobs=self.n_jobs)(
delayed(self.model[lang].predict)(lX[lang]) for lang in langs
)
return {lang: scores[i] for i, lang in enumerate(langs)}
def best_params(self):
return {lang: model.best_params() for lang, model in self.model.items()}
class MetaClassifier:
def __init__(
self,
meta_learner,
meta_parameters=None,
n_jobs=-1,
standardize_range=None,
verbose=True,
):
self.n_jobs = n_jobs
self.model = MonolingualClassifier(
base_learner=meta_learner, parameters=meta_parameters, n_jobs=self.n_jobs
)
self.standardize_range = standardize_range
self.verbose = verbose
def fit(self, lZ, lY):
tinit = time.time()
Z, y = self.stack(lZ, lY)
self.standardizer = StandardizeTransformer(range=self.standardize_range)
Z = self.standardizer.fit_transform(Z)
if self.verbose:
print(f"- fitting the metaclassifier on data shape: {Z.shape}")
self.model.fit(Z, y)
self.time = time.time() - tinit
def stack(self, lZ, lY=None):
langs = list(lZ.keys())
Z = np.vstack([lZ[lang] for lang in langs])
if lY is not None:
y = np.vstack([lY[lang] for lang in langs])
return Z, y
else:
return Z
# def stack(self, lZ, lY=None):
# X_stacked = np.vstack(list(lZ.values()))
# if lY is not None:
# Y_stacked = np.vstack(list(lY.values()))
# return X_stacked, Y_stacked
# else:
# return X_stacked
def predict(self, lZ):
lZ = _joblib_transform_multiling(
self.standardizer.transform, lZ, n_jobs=self.n_jobs
)
return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
def predict_proba(self, lZ):
lZ = _joblib_transform_multiling(
self.standardizer.transform, lZ, n_jobs=self.n_jobs
)
return _joblib_transform_multiling(
self.model.predict_proba, lZ, n_jobs=self.n_jobs
)
class StandardizeTransformer:
def __init__(self, axis=0, range=None):
"""
:param axis:
:param range:
"""
assert range is None or isinstance(
range, slice
), "wrong format for range, should either be None or a slice"
self.axis = axis
self.yetfit = False
self.range = range
def fit(self, X):
# print("Applying z-score standardization...")
std = np.std(X, axis=self.axis, ddof=1)
self.std = np.clip(std, 1e-5, None)
self.mean = np.mean(X, axis=self.axis)
if self.range is not None:
ones = np.ones_like(self.std)
zeros = np.zeros_like(self.mean)
ones[self.range] = self.std[self.range]
zeros[self.range] = self.mean[self.range]
self.std = ones
self.mean = zeros
self.yetfit = True
return self
def transform(self, X):
assert self.yetfit, "transform called before fit"
return (X - self.mean) / self.std
def fit_transform(self, X):
return self.fit(X).transform(X)
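# Illustrative example (hypothetical matrix):
#   StandardizeTransformer().fit_transform(np.array([[1., 2.], [3., 4.]]))
#   -> column-wise z-scores, approximately [[-0.707, -0.707], [0.707, 0.707]]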
class FeatureSet2Posteriors:
"""
Takes care of recasting the features output by the embedders into vectors of posterior probabilities by means of
a multiclass SVM.
"""
def __init__(self, verbose=True, l2=True, n_jobs=-1):
"""
Init the class.
:param embedder: ViewGen, a view generator that does not natively output posterior probabilities.
:param l2: bool, whether to apply or not L2 normalization to the projection
:param n_jobs: int, number of concurrent workers.
"""
# self.embedder = embedder
self.l2 = l2
self.n_jobs = n_jobs
self.prob_classifier = MetaClassifier(
SVC(
kernel="rbf",
gamma="auto",
probability=True,
cache_size=1000,
random_state=1,
),
n_jobs=n_jobs,
verbose=verbose,
)
def fit(self, lX, lY):
self.prob_classifier.fit(lX, lY)
return self
def transform(self, lX):
lP = self.predict_proba(lX)
lP = _normalize(lP, self.l2)
return lP
def fit_transform(self, lX, lY):
return self.fit(lX, lY).transform(lX)
def predict(self, lX):
return self.prob_classifier.predict(lX)
def predict_proba(self, lX):
return self.prob_classifier.predict_proba(lX)
def _normalize(lX, l2=True):
return {lang: normalize(np.asarray(X)) for lang, X in lX.items()} if l2 else lX


@@ -0,0 +1,176 @@
from os.path import expanduser, join
import torch
import numpy as np
from torchtext.vocab import Vectors
from joblib import Parallel, delayed
from vgfs.viewGen import ViewGen
from vgfs.commons import _normalize, XdotM
from vgfs.learners.svms import FeatureSet2Posteriors
class MultilingualGen(ViewGen):
def __init__(
self,
cached=False,
langs=["en", "it"],
embed_dir="~/embeddings",
n_jobs=-1,
probabilistic=False,
):
print("- init Multilingual View Generating Function")
self.embed_dir = embed_dir
self.langs = langs
self.n_jobs = n_jobs
self.cached = cached
self.vectorizer = None
self.sif = True
self.probabilistic = probabilistic
self.fitted = False
self._init()
def _init(self):
if self.probabilistic:
self.feature2posterior_projector = FeatureSet2Posteriors(
n_jobs=self.n_jobs, verbose=False
)
def fit(self, lX, lY):
"""
Fitting the Multilingual View Generating Function consists of
building/extracting the word embedding matrix for
each language.
"""
print("- fitting Multilingual View Generating Function")
self.l_vocab = self.vectorizer.vocabulary()
self.multi_embeddings, self.langs = self._load_embeddings(
self.embed_dir, self.cached
)
if self.probabilistic:
self.feature2posterior_projector.fit(self.transform(lX), lY)
self.fitted = True
return self
def transform(self, lX):
lX = self.vectorizer.transform(lX)
XdotMulti = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], self.multi_embeddings[lang], sif=self.sif)
for lang in self.langs
)
lZ = {lang: XdotMulti[i] for i, lang in enumerate(self.langs)}
lZ = _normalize(lZ, l2=True)
if self.probabilistic and self.fitted:
lZ = self.feature2posterior_projector.transform(lZ)
return lZ
def fit_transform(self, lX, lY):
return self.fit(lX, lY).transform(lX)
def _load_embeddings(self, embed_dir, cached):
if "muse" in self.embed_dir.lower():
multi_embeddings = load_MUSEs(
langs=self.langs,
l_vocab=self.vectorizer.vocabulary(),
dir_path=embed_dir,
cached=cached,
)
return multi_embeddings, sorted(multi_embeddings.keys())
def get_config(self):
return {
"name": "Multilingual VGF",
"embed_dir": self.embed_dir,
"langs": self.langs,
"n_jobs": self.n_jobs,
"cached": self.cached,
"sif": self.sif,
"probabilistic": self.probabilistic,
}
def load_MUSEs(langs, l_vocab, dir_path, cached=False):
dir_path = expanduser(dir_path)
cached_dir = join(dir_path, "cached")
nmax = 50000
l_embeddings = {}
for lang in langs:
embed_path = f"wiki.multi.{lang}.vec"
if cached:
l_embeddings[lang] = Vectors(embed_path, cache=cached_dir)
print(f"-- Loaded cached {lang} embeddings")
else:
(
_embed_matrix,
_,
_,
) = _load_vec(join(dir_path, embed_path), nmax)
l_embeddings[lang] = _embed_matrix
print(f"-- Loaded {nmax} {lang} embeddings")
# print("-- Extracting embeddings")
l_embeddings = extract(l_vocab, l_embeddings)
return l_embeddings
def _load_vec(emb_path, nmax=50000):
import io
import numpy as np
vectors = []
word2id = {}
with io.open(emb_path, "r", encoding="utf-8", newline="\n", errors="ignore") as f:
next(f)
for i, line in enumerate(f):
word, vect = line.rstrip().split(" ", 1)
vect = np.fromstring(vect, sep=" ")
assert word not in word2id, "word found twice"
vectors.append(vect)
word2id[word] = len(word2id)
if len(word2id) == nmax:
break
id2word = {v: k for k, v in word2id.items()}
embeddings = np.vstack(vectors)
return embeddings, id2word, word2id
def extract(l_voc, l_embeddings):
"""
Reindex the loaded pretrained embeddings in order to match the indexes
assigned by the scikit vectorizer. Such indexes are consistent with
those used by Word Class Embeddings (since we deploy the same vectorizer).
:param l_voc: dict {lang : {word : id}}
:return: torch embedding matrix of the extracted embeddings, i.e., words in l_voc
"""
l_extracted = {}
for lang, words in l_voc.items():
source_id, target_id = reindex(words, l_embeddings[lang].stoi)
extraction = torch.zeros((len(words), l_embeddings[lang].vectors.shape[-1]))
extraction[source_id] = l_embeddings[lang].vectors[target_id]
l_extracted[lang] = extraction
return l_extracted
def reindex(vectorizer_words, pretrained_word2index):
if isinstance(vectorizer_words, dict):
vectorizer_words = list(
zip(*sorted(vectorizer_words.items(), key=lambda x: x[1]))
)[0]
source_idx, target_idx = [], []
for i, word in enumerate(vectorizer_words):
if word not in pretrained_word2index:
continue
j = pretrained_word2index[word]
source_idx.append(i)
target_idx.append(j)
source_idx = np.asarray(source_idx)
target_idx = np.asarray(target_idx)
return source_idx, target_idx
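# Illustrative example (hypothetical vocabularies):
#   reindex({"cat": 0, "dog": 1, "zebu": 2}, {"dog": 10, "cat": 42})
#   -> (array([0, 1]), array([42, 10]))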

390
gfun/vgfs/transformerGen.py Normal file

@@ -0,0 +1,390 @@
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"
from collections import defaultdict
import numpy as np
import torch
import transformers
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from vgfs.learners.svms import FeatureSet2Posteriors
from evaluation.evaluate import evaluate, log_eval
transformers.logging.set_verbosity_error()
# TODO: early stopping, checkpointing, logging, model loading
# TODO: experiment name
class TransformerGen:
def __init__(
self,
model_name,
epochs=10,
lr=1e-5,
batch_size=4,
max_length=512,
print_steps=50,
device="cpu",
probabilistic=False,
n_jobs=-1,
evaluate_step=10,
verbose=False,
patience=5,
):
self.model_name = model_name
self.device = device
self.model = None
self.lr = lr
self.epochs = epochs
self.tokenizer = None
self.max_length = max_length
self.batch_size = batch_size
self.print_steps = print_steps
self.probabilistic = probabilistic
self.n_jobs = n_jobs
self.fitted = False
self.datasets = {}
self.evaluate_step = evaluate_step
self.verbose = verbose
self.patience = patience
self._init()
def _init(self):
if self.probabilistic:
self.feature2posterior_projector = FeatureSet2Posteriors(
n_jobs=self.n_jobs, verbose=False
)
self.model_name = self._get_model_name(self.model_name)
print(
f"- init TransformerModel model_name: {self.model_name}, device: {self.device}]"
)
def _get_model_name(self, name):
if "bert" == name:
name_model = "bert-base-uncased"
elif "mbert" == name:
name_model = "bert-base-multilingual-uncased"
elif "xlm" == name:
name_model = "xlm-roberta-base"
else:
raise NotImplementedError
return name_model
def load_pretrained_model(self, model_name, num_labels):
return AutoModelForSequenceClassification.from_pretrained(
model_name, num_labels=num_labels, output_hidden_states=True
)
def load_tokenizer(self, model_name):
return AutoTokenizer.from_pretrained(model_name)
def init_model(self, model_name, num_labels):
return self.load_pretrained_model(model_name, num_labels), self.load_tokenizer(
model_name
)
def get_train_val_data(self, lX, lY, split=0.2, seed=42):
tr_lX, tr_lY, val_lX, val_lY = {}, {}, {}, {}
for lang in lX.keys():
tr_X, val_X, tr_Y, val_Y = train_test_split(
lX[lang], lY[lang], test_size=split, random_state=seed, shuffle=False
)
tr_lX[lang] = tr_X
tr_lY[lang] = tr_Y
val_lX[lang] = val_X
val_lY[lang] = val_Y
return tr_lX, tr_lY, val_lX, val_lY
def build_dataloader(self, lX, lY, batch_size, split="train", shuffle=True):
l_tokenized = {lang: self._tokenize(data) for lang, data in lX.items()}
self.datasets[split] = MultilingualDatasetTorch(l_tokenized, lY, split=split)
return DataLoader(self.datasets[split], batch_size=batch_size, shuffle=shuffle)
def _tokenize(self, X):
return self.tokenizer(
X,
return_tensors="pt",
padding="max_length",
truncation=True,
max_length=self.max_length,
)
def fit(self, lX, lY):
if self.fitted:
return self
print("- fitting Transformer View Generating Function")
_l = list(lX.keys())[0]
self.num_labels = lY[_l].shape[-1]
self.model, self.tokenizer = self.init_model(
self.model_name, num_labels=self.num_labels
)
tr_lX, tr_lY, val_lX, val_lY = self.get_train_val_data(
lX, lY, split=0.2, seed=42
)
tra_dataloader = self.build_dataloader(
tr_lX, tr_lY, self.batch_size, split="train", shuffle=True
)
val_dataloader = self.build_dataloader(
val_lX, val_lY, self.batch_size, split="val", shuffle=False
)
experiment_name = f"{self.model_name}-{self.epochs}-{self.batch_size}" # TODO: add more params
trainer = Trainer(
model=self.model,
optimizer_name="adamW",
lr=self.lr,
device=self.device,
loss_fn=torch.nn.CrossEntropyLoss(),
print_steps=self.print_steps,
evaluate_step=self.evaluate_step,
patience=self.patience,
experiment_name=experiment_name,
)
trainer.train(
train_dataloader=tra_dataloader,
eval_dataloader=val_dataloader,
epochs=self.epochs,
)
if self.probabilistic:
self.feature2posterior_projector.fit(self.transform(lX), lY)
self.fitted = True
# self.save_vgf(path="models/vgf/transformers/")
return self
def transform(self, lX):
_embeds = []
l_embeds = defaultdict(list)
dataloader = self.build_dataloader(
lX, lY=None, batch_size=self.batch_size, split="whole", shuffle=False
)
self.model.eval()
with torch.no_grad():
for input_ids, lang in dataloader:
input_ids = input_ids.to(self.device)
out = self.model(input_ids).hidden_states[-1]
batch_embeddings = out[:, 0, :].cpu().numpy()
_embeds.append((batch_embeddings, lang))
for embed, lang in _embeds:
for sample_embed, sample_lang in zip(embed, lang):
l_embeds[sample_lang].append(sample_embed)
if self.probabilistic and self.fitted:
l_embeds = self.feature2posterior_projector.transform(l_embeds)
return l_embeds
def fit_transform(self, lX, lY):
return self.fit(lX, lY).transform(lX)
def save_vgf(self, path):
print(f"- saving Transformer View Generating Function to {path}")
return
def get_config(self):
return {
"name": "Transformer VGF",
"model_name": self.model_name,
"max_length": self.max_length,
"batch_size": self.batch_size,
"lr": self.lr,
"epochs": self.epochs,
"device": self.device,
"print_steps": self.print_steps,
"evaluate_step": self.evaluate_step,
"patience": self.patience,
"probabilistic": self.probabilistic,
}
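# Usage sketch (assumption: lX and lY follow the gFun convention of
# {lang: list of raw documents} and {lang: one-hot label matrix}); this is an
# illustration, not part of the original file:
#
#   vgf = TransformerGen("mbert", epochs=5, batch_size=8, device="cuda", probabilistic=True)
#   l_embeds = vgf.fit_transform(lX, lY)   # {lang: [CLS] embeddings, or posteriors if probabilistic}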
class MultilingualDatasetTorch(Dataset):
def __init__(self, lX, lY, split="train"):
self.lX = lX
self.lY = lY
self.split = split
self.langs = []
self.init()
def init(self):
self.X = torch.vstack([data.input_ids for data in self.lX.values()])
if self.split != "whole":
self.Y = torch.vstack([torch.Tensor(data) for data in self.lY.values()])
self.langs = [
lang for lang, data in self.lX.items() for _ in range(len(data.input_ids))
]
return self
def __len__(self):
return len(self.X)
def __getitem__(self, index):
if self.split == "whole":
return self.X[index], self.langs[index]
return self.X[index], self.Y[index], self.langs[index]
class Trainer:
def __init__(
self,
model,
optimizer_name,
device,
loss_fn,
lr,
print_steps,
evaluate_step,
patience,
experiment_name,
):
self.device = device
self.model = model.to(device)
self.optimizer = self.init_optimizer(optimizer_name, lr)
self.evaluate_steps = evaluate_step
self.loss_fn = loss_fn.to(device)
self.print_steps = print_steps
self.earlystopping = EarlyStopping(
patience=patience,
checkpoint_path="models/vgfs/transformers/",
verbose=True,
experiment_name=experiment_name,
)
def init_optimizer(self, optimizer_name, lr):
if optimizer_name.lower() == "adamw":
return AdamW(self.model.parameters(), lr=lr)
else:
raise ValueError(f"Optimizer {optimizer_name} not supported")
def train(self, train_dataloader, eval_dataloader, epochs=10):
print(
f"""- Training params:
- epochs: {epochs}
- learning rate: {self.optimizer.defaults['lr']}
- train batch size: {train_dataloader.batch_size}
- eval batch size: {'TODO'}
- max len: {train_dataloader.dataset.X.shape[-1]}\n""",
)
for epoch in range(epochs):
self.train_epoch(train_dataloader, epoch)
if (epoch + 1) % self.evaluate_steps == 0:
metric_watcher = self.evaluate(eval_dataloader)
stop = self.earlystopping(metric_watcher, self.model, epoch + 1)
if stop:
break
return self.model
def train_epoch(self, dataloader, epoch):
self.model.train()
for b_idx, (x, y, lang) in enumerate(dataloader):
self.optimizer.zero_grad()
y_hat = self.model(x.to(self.device))
loss = self.loss_fn(y_hat.logits, y.to(self.device))
loss.backward()
self.optimizer.step()
if b_idx % self.print_steps == 0:
print(f"Epoch: {epoch+1} Step: {b_idx+1} Loss: {loss:.4f}")
return self
def evaluate(self, dataloader):
self.model.eval()
lY = defaultdict(list)
lY_hat = defaultdict(list)
for b_idx, (x, y, lang) in enumerate(dataloader):
y_hat = self.model(x.to(self.device))
loss = self.loss_fn(y_hat.logits, y.to(self.device))
predictions = predict(y_hat.logits, classification_type="multilabel")
for l, _true, _pred in zip(lang, y, predictions):
lY[l].append(_true.detach().cpu().numpy())
lY_hat[l].append(_pred)
for lang in lY:
lY[lang] = np.vstack(lY[lang])
lY_hat[lang] = np.vstack(lY_hat[lang])
l_eval = evaluate(lY, lY_hat)
average_metrics = log_eval(l_eval, phase="validation")
return average_metrics[0] # macro-F1
class EarlyStopping:
def __init__(
self,
patience=5,
min_delta=0,
verbose=True,
checkpoint_path="checkpoint.pt",
experiment_name="experiment",
):
self.patience = patience
self.min_delta = min_delta
self.counter = 0
self.best_score = 0
self.best_epoch = None
self.verbose = verbose
self.checkpoint_path = checkpoint_path
self.experiment_name = experiment_name
def __call__(self, validation, model, epoch):
if validation > self.best_score:
print(
f"- earlystopping: Validation score improved from {self.best_score:.3f} to {validation:.3f}"
)
self.best_score = validation
self.counter = 0
# self.save_model(model)
elif validation < (self.best_score + self.min_delta):
self.counter += 1
print(
f"- earlystopping: Validation score decreased from {self.best_score:.3f} to {validation:.3f}, current patience: {self.patience - self.counter}"
)
if self.counter >= self.patience:
if self.verbose:
print(f"- earlystopping: Early stopping at epoch {epoch}")
return True
def save_model(self, model):
_checkpoint_dir = os.path.join(self.checkpoint_path, self.experiment_name)
print(f"- saving model to {_checkpoint_dir}")
os.makedirs(_checkpoint_dir, exist_ok=True)
model.save_pretrained(_checkpoint_dir)
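# Sketch of the early-stopping contract (hypothetical scores, not part of the
# original file): __call__ returns True only once `patience` consecutive
# evaluations fail to improve on the best validation macro-F1 seen so far.
#
#   es = EarlyStopping(patience=2)
#   es(0.50, model=None, epoch=1)   # new best -> counter reset
#   es(0.48, model=None, epoch=2)   # no improvement -> counter = 1
#   es(0.47, model=None, epoch=3)   # no improvement -> counter = 2 -> returns True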
def predict(logits, classification_type="multilabel"):
"""
Converts soft predictions to hard predictions in {0, 1}
"""
if classification_type == "multilabel":
prediction = torch.sigmoid(logits) > 0.5
elif classification_type == "singlelabel":
prediction = torch.argmax(logits, dim=1).view(-1, 1)
else:
raise ValueError(f"Unknown classification type: {classification_type}")
return prediction.detach().cpu().numpy()
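# Example (illustrative, not part of the original file): with multilabel logits
#   logits = torch.tensor([[2.0, -1.0, 0.3]])
# predict(logits) applies a sigmoid and a 0.5 threshold, returning
#   array([[ True, False,  True]])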

59
gfun/vgfs/vanillaFun.py Normal file
View File

@ -0,0 +1,59 @@
from vgfs.viewGen import ViewGen
from vgfs.learners.svms import NaivePolylingualClassifier
from vgfs.commons import _normalize
class VanillaFunGen(ViewGen):
"""
View Generator (x): original funnelling architecture proposed by Moreo, Esuli and
Sebastiani in DOI: https://doi.org/10.1145/3326065
"""
def __init__(self, base_learner, first_tier_parameters=None, n_jobs=-1):
"""
Init the Posterior-Probabilities embedder (i.e., VanillaFunGen).
:param base_learner: naive monolingual learner to be deployed as first-tier
learner. Should be able to return posterior probabilities.
:param first_tier_parameters: optional parameter grid for the first-tier learners.
:param n_jobs: integer, number of concurrent workers.
"""
print("- init VanillaFun View Generating Function")
self.learners = base_learner
self.first_tier_parameters = first_tier_parameters
self.n_jobs = n_jobs
self.doc_projector = NaivePolylingualClassifier(
base_learner=self.learners,
parameters=self.first_tier_parameters,
n_jobs=self.n_jobs,
)
self.vectorizer = None
def fit(self, lX, lY):
print("- fitting VanillaFun View Generating Function")
lX = self.vectorizer.transform(lX)
self.doc_projector.fit(lX, lY)
return self
def transform(self, lX):
"""
(1) Vectorize documents;
(2) Project them according to the first-tier learners;
(3) Apply L2 normalization to the projection and return it.
:param lX: dict {lang: indexed documents}
:return: document projections to the common latent space.
"""
lX = self.vectorizer.transform(lX)
lZ = self.doc_projector.predict_proba(lX)
lZ = _normalize(lZ, l2=True)
return lZ
def fit_transform(self, lX, lY):
return self.fit(lX, lY).transform(lX)
def get_config(self):
return {
"name": "VanillaFunnelling VGF",
"base_learner": self.learners,
"first_tier_parameters": self.first_tier_parameters,
"n_jobs": self.n_jobs,
}
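# Usage sketch (assumption: the shared multilingual vectorizer is injected
# externally before fit is called, as for the other VGFs); illustrative only,
# not part of the original file:
#
#   from sklearn.svm import SVC
#   vgf = VanillaFunGen(base_learner=SVC(kernel="linear", probability=True), n_jobs=-1)
#   vgf.vectorizer = shared_vectorizer          # multilingual TF-IDF vectorizer
#   lZ = vgf.fit_transform(lX, lY)              # {lang: L2-normalised posterior probabilities}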

20
gfun/vgfs/viewGen.py Normal file
View File

@ -0,0 +1,20 @@
from abc import ABC, abstractmethod
class ViewGen(ABC):
"""
Abstract class for View Generating Functions (VGFs) implementations. Every ViewGen should implement these three methods in order to
be seamlessly integrated in the overall architecture.
"""
@abstractmethod
def fit(self, lX, lY):
pass
@abstractmethod
def transform(self, lX):
pass
@abstractmethod
def fit_transform(self, lX, lY):
pass
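# Minimal sketch of a conforming VGF (illustrative only, not part of the original file):
#
#   class IdentityGen(ViewGen):
#       def fit(self, lX, lY):
#           return self
#
#       def transform(self, lX):
#           return lX
#
#       def fit_transform(self, lX, lY):
#           return self.fit(lX, lY).transform(lX)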

66
gfun/vgfs/wceGen.py Normal file
View File

@ -0,0 +1,66 @@
import numpy as np
from joblib import Parallel, delayed
from vgfs.commons import XdotM, _normalize
from vgfs.viewGen import ViewGen
class WceGen(ViewGen):
def __init__(self, n_jobs=-1):
print("- init Word-Class-Embeddings View Generating Function")
self.n_jobs = n_jobs
self.sif = True
def fit(self, lX, lY):
print("- fitting Word-Class-Embeddings View Generating Function")
lX = self.vectorizer.transform(lX)
self.langs = sorted(lX.keys())
wce = Parallel(n_jobs=self.n_jobs)(
delayed(wce_matrix)(lX[lang], lY[lang]) for lang in self.langs
)
self.l_wce = {lang: wce[i] for i, lang in enumerate(self.langs)}
return self
def transform(self, lX):
lX = self.vectorizer.transform(lX)
XdotWce = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], self.l_wce[lang], sif=self.sif)
for lang in self.langs
)
lZ = {l: XdotWce[i] for i, l in enumerate(self.langs)}
lZ = _normalize(lZ, l2=True)
return lZ
def fit_transform(self, lX, lY):
return self.fit(lX, lY).transform(lX)
def get_config(self):
return {
"name": "Word-Class Embeddings VGF",
"n_jobs": self.n_jobs,
"sif": self.sif,
}
def wce_matrix(X, Y):
wce = supervised_embeddings_tfidf(X, Y)
wce = zscores(wce, axis=0)
return wce
def supervised_embeddings_tfidf(X, Y):
tfidf_norm = X.sum(axis=0)
tfidf_norm[tfidf_norm == 0] = 1
F = (X.T).dot(Y) / tfidf_norm.T
return np.asarray(F)
def zscores(X, axis=0):
"""
scipy.stats.zscore does not guard against division by 0, which can indeed occur.
:param X: 2-d array of term scores
:param axis: axis along which to standardise
:return: z-scored array, with the standard deviation clipped away from 0
"""
std = np.clip(np.std(X, ddof=1, axis=axis), 1e-5, None)
mean = np.mean(X, axis=axis)
return (X - mean) / std
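# Worked note (illustrative, not part of the original file): for a tf-idf matrix
# X of shape (n_docs, n_terms) and a label matrix Y of shape (n_docs, n_classes),
# supervised_embeddings_tfidf returns F = X^T Y / colsum(X), i.e. a
# (n_terms, n_classes) matrix whose row for term w is the label distribution of w
# weighted by its tf-idf mass; zscores then standardises each column, clipping the
# standard deviation at 1e-5 to avoid division by 0.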

128
main.py Normal file
View File

@ -0,0 +1,128 @@
from os.path import expanduser
from argparse import ArgumentParser
from dataManager.multiNewsDataset import MultiNewsDataset
from dataManager.amazonDataset import AmazonDataset
from dataManager.multilingualDatset import MultilingualDataset
from gfun.generalizedFunnelling import GeneralizedFunnelling
from evaluation.evaluate import evaluate, log_eval
from time import time
import pickle
# TODO: a cleaner way to save the model?
def main(args):
# Loading dataset ------------------------
RCV_DATAPATH = expanduser(
"~/datasets/rcv1-2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle"
)
# dataset = MultiNewsDataset(expanduser(args.dataset_path))
# dataset = AmazonDataset(domains=args.domains,nrows=args.nrows,min_count=args.min_count,max_labels=args.max_labels)
dataset = (
MultilingualDataset(dataset_name="rcv1-2")
.load(RCV_DATAPATH)
.reduce_data(langs=["en", "it", "fr"], maxn=250)
)
if isinstance(dataset, MultilingualDataset):
lX, lY = dataset.training()
lX_te, lY_te = dataset.test()
else:
_lX = dataset.dX
_lY = dataset.dY
# ----------------------------------------
tinit = time()
if args.load_pretrained is None:
assert any(
[
args.posteriors,
args.wce,
args.multilingual,
args.transformer,
]
), "At least one VGF must be enabled"
gfun = GeneralizedFunnelling(
posterior=args.posteriors,
multilingual=args.multilingual,
wce=args.wce,
transformer=args.transformer,
langs=dataset.langs(),
embed_dir="~/resources/muse_embeddings",
n_jobs=args.n_jobs,
max_length=args.max_length,
batch_size=args.batch_size,
epochs=args.epochs,
lr=args.lr,
patience=args.patience,
evaluate_step=args.evaluate_step,
transformer_name=args.transformer_name,
)
gfun.get_config()
gfun.fit(lX, lY)
# Saving Model ------------------------
with open("models/gfun/gfun_model.pkl", "wb") as f:
print(f"- saving model to {f.name}")
pickle.dump(gfun, f)
# -------------------------------------
preds = gfun.transform(lX)
train_eval = evaluate(lY, preds)
log_eval(train_eval, phase="train")
timetr = time()
print(f"- training completed in {timetr - tinit:.2f} seconds")
# Loading Model ------------------------
if args.load_pretrained is not None:
with open("models/gfun/gfun_model.pkl", "rb") as f:
print(f"- loading model from {f.name}")
gfun = pickle.load(f)
timetr = time()
# --------------------------------------
test_eval = evaluate(lY_te, gfun.transform(lX_te))
log_eval(test_eval, phase="test")
timeval = time()
print(f"- testing completed in {timeval - timetr:.2f} seconds")
if __name__ == "__main__":
parser = ArgumentParser()
parser.add_argument("--load_pretrained", type=str, default=None)
# Dataset parameters -------------------
parser.add_argument("--domains", type=str, default="all")
parser.add_argument("--nrows", type=int, default=10000)
parser.add_argument("--min_count", type=int, default=10)
parser.add_argument("--max_labels", type=int, default=50)
# gFUN parameters ----------------------
parser.add_argument("-p", "--posteriors", action="store_true")
parser.add_argument("-m", "--multilingual", action="store_true")
parser.add_argument("-w", "--wce", action="store_true")
parser.add_argument("-t", "--transformer", action="store_true")
parser.add_argument("--n_jobs", type=int, default=1)
# transformer parameters ---------------
parser.add_argument("--transformer_name", type=str, default="mbert")
parser.add_argument("--batch_size", type=int, default=32)
parser.add_argument("--epochs", type=int, default=10)
parser.add_argument("--lr", type=float, default=1e-5)
parser.add_argument("--max_length", type=int, default=512)
parser.add_argument("--patience", type=int, default=5)
parser.add_argument("--evaluate_step", type=int, default=10)
args = parser.parse_args()
main(args)
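# Example invocation (illustrative; the dataset path is hard-coded above):
#   python main.py -p -m -w -t --transformer_name mbert --batch_size 32 \
#       --epochs 10 --n_jobs -1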

60
plotters/distributions.py Normal file
View File

@ -0,0 +1,60 @@
import matplotlib.pyplot as plt
import datetime
def plot_distribution(
x,
y,
labels,
title,
figsize=(10, 5),
logscale=False,
notes="",
max_labels=-1,
save=False,
path=None,
):
# sort values and labels accordingly
y, labels = zip(*sorted(zip(y, labels), reverse=True))
if max_labels != -1:
x = x[:max_labels]
y = y[:max_labels]
labels = labels[:max_labels]
plt.figure(figsize=figsize)
plt.bar(x, y)
plt.xticks(x, labels, rotation=90)
if len(notes) != 0:
_title = f"{title} - {notes}"
if max_labels != -1:
_title += f" - Showing {max_labels} top labels"
plt.title(_title)
if logscale:
plt.yscale("symlog")
plt.tight_layout()
# plt.show()
if save:
now = datetime.datetime.now()
path = f"{path}/{title}_{now.strftime('%m%d_%H%M')}.png"
plt.savefig(path)
plt.close()
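# Usage sketch (hypothetical label counts; assumes an existing "out" directory);
# illustrative only, not part of the original file:
#
#   counts = [120, 45, 8]
#   labels = ["books", "music", "toys"]
#   plot_distribution(
#       x=range(len(counts)), y=counts, labels=labels,
#       title="label_distribution", logscale=True, save=True, path="out",
#   )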
def plot_histogram(x, title, figsize=(10, 5), save=False, path=None):
plt.figure(figsize=figsize)
plt.hist(x)
# plt.xticks(x, lables, rotation=90)
plt.yscale("symlog")
plt.title(title)
# plt.show()
if save:
now = datetime.datetime.now()
path = f"{path}/{title}_{now.strftime('%m%d_%H%M')}.png"
plt.savefig(path)
plt.close()