Compare commits

...

22 Commits

Author SHA1 Message Date
Alejandro Moreo Fernandez 6f3f103b3b committing last changes before creating a branch 2021-10-13 11:53:19 +02:00
Alejandro Moreo Fernandez 4572ec266d adding multi-label classification methods 2021-09-02 11:07:33 +02:00
Alejandro Moreo Fernandez dc2fa05cf8 launching experiments 2021-08-29 11:03:51 +02:00
Alejandro Moreo Fernandez 13eb682e53 adding tables 2021-08-27 14:01:01 +02:00
Alejandro Moreo Fernandez daba2c9fb4 adding tables generation 2021-08-27 13:57:33 +02:00
Alejandro Moreo Fernandez aeb0fcf84b adding tables generation 2021-08-27 13:57:26 +02:00
Alejandro Moreo Fernandez db1dbe2534 parallelizing stuff 2021-08-27 12:21:53 +02:00
Alejandro Moreo Fernandez b941c0665e preparing some experiments 2021-08-26 17:57:01 +02:00
Alejandro Moreo Fernandez d6abc7ac85 refactor 2021-08-26 15:52:35 +02:00
Alejandro Moreo Fernandez d040b2acb6 merged! 2021-08-25 17:10:24 +02:00
Alejandro Moreo Fernandez c6de5a043d mlq 2021-08-25 17:08:06 +02:00
Alejandro Moreo Fernandez ab746eed8d last updates 2021-08-02 11:08:52 +02:00
Alejandro Moreo Fernandez 60b6fa3c12 new methods, some experiments added 2021-07-06 18:26:05 +02:00
Alejandro Moreo Fernandez 7b8e6462ff refactoring, chain-classifiers, speeding up for aggregative methods, evaluation modularized 2021-07-06 16:56:54 +02:00
Alejandro Moreo Fernandez a4fea89122 trying stuff with multilabels 2021-07-05 19:17:29 +02:00
Alejandro Moreo Fernandez 6eac620f22 merging 2021-07-05 09:36:31 +02:00
Alejandro Moreo Fernandez 977599b9b1 cleaning branch 2021-07-05 09:15:36 +02:00
Alejandro Moreo Fernandez b94dc11ea8 ensembles runing with gridsearchQ in mode npp 2021-07-04 11:30:39 +02:00
Alejandro Moreo Fernandez f96469da18 adding tweetsentnnp a gitea 2021-07-04 11:24:26 +02:00
Alejandro Moreo Fernandez 1b20bf14ea exploring multilabel quantification 2021-07-02 17:33:05 +02:00
Alejandro Moreo Fernandez f0e93692cc fixing quanet 2021-07-02 10:19:00 +02:00
Alejandro Moreo Fernandez 75a95adfa6 copying files from tweetsent branch 2021-06-29 14:37:26 +02:00
36 changed files with 3579 additions and 113 deletions

38
MultiLabel/NOTES.txt Normal file

@@ -0,0 +1,38 @@
Classifiers
- Binary, single-label classifiers, wrapped with OneVsRest or MultiOutput (see the sketch below):
- LR
- LinearSVC (?)
- Classifiers natively multi-label:
- from scikit-multilearn (x11)
-
Protocols:
- NPP
- APP (for each class)
Things to test:
- MultiChain for classification, MultiChain for regression...
- Reimplement stacking with sklearn.ensemble.StackingClassifier? It does not seem easy.
- Independent classifiers + independent quantifiers
- Stacking + independent quantifiers
- ClassifierChain + independent quantifiers
- Independent quantifiers + cross-class regression (independent?)
- Stacking + cross-class regression
- ClassifierChain + cross-class regression
- Covariates (Means, CovMatrix from samples) + multioutput regression?
- Covariates concatenated with quantifier predictions + cross-class regression?
- Model Selection for specific protocols?
TODO:
- decide methods
- decide classifiers binary
- decide classifiers multi-label
- decide quantifiers naive
- decide quantifiers multi-label
- decide datasets
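
Illustrative aside (not part of NOTES.txt): a minimal scikit-learn sketch of the two classifier families listed above; the dataset and solver settings are placeholders.

# illustrative only: one-vs-rest vs. classifier chains over a binary base learner
from sklearn.datasets import make_multilabel_classification
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import ClassifierChain

X, Y = make_multilabel_classification(n_samples=200, n_classes=5, random_state=0)

# family 1: one independent binary problem per class (LR as base classifier)
ovr = OneVsRestClassifier(LogisticRegression(max_iter=1000)).fit(X, Y)

# family 2: chained binary problems, each also fed the previous labels' predictions
chain = ClassifierChain(LogisticRegression(max_iter=1000)).fit(X, Y)

print(ovr.predict(X[:3]))
print(chain.predict(X[:3]))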

0
MultiLabel/__init__.py Normal file

0
MultiLabel/data/__init__.py Executable file

229
MultiLabel/data/dataset.py Executable file

@@ -0,0 +1,229 @@
import os,sys
from sklearn.datasets import get_data_home, fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from MultiLabel.data.jrcacquis_reader import fetch_jrcacquis, JRCAcquis_Document
from MultiLabel.data.ohsumed_reader import fetch_ohsumed50k
from MultiLabel.data.reuters21578_reader import fetch_reuters21578
from MultiLabel.data.rcv_reader import fetch_RCV1
from MultiLabel.data.wipo_reader import fetch_WIPOgamma, WipoGammaDocument
import pickle
import numpy as np
from tqdm import tqdm
from os.path import join
import re
def init_vectorizer():
return TfidfVectorizer(min_df=5, sublinear_tf=True)
class Dataset:
dataset_available = {'reuters21578', '20newsgroups', 'ohsumed', 'rcv1', 'jrcall',
'wipo-sl-mg','wipo-ml-mg','wipo-sl-sc','wipo-ml-sc'}
def __init__(self, name):
assert name in Dataset.dataset_available, f'dataset {name} is not available'
if name=='reuters21578':
self._load_reuters()
elif name == '20newsgroups':
self._load_20news()
elif name == 'rcv1':
self._load_rcv1()
elif name == 'ohsumed':
self._load_ohsumed()
elif name == 'jrcall':
self._load_jrc(version='all')
elif name == 'wipo-sl-mg':
self._load_wipo('singlelabel', 'maingroup')
elif name == 'wipo-ml-mg':
self._load_wipo('multilabel', 'maingroup')
elif name == 'wipo-sl-sc':
self._load_wipo('singlelabel', 'subclass')
elif name == 'wipo-ml-sc':
self._load_wipo('multilabel', 'subclass')
self.nC = self.devel_labelmatrix.shape[1]
self._vectorizer = init_vectorizer()
self._vectorizer.fit(self.devel_raw)
self.vocabulary = self._vectorizer.vocabulary_
def show(self):
nTr_docs = len(self.devel_raw)
nTe_docs = len(self.test_raw)
nfeats = len(self._vectorizer.vocabulary_)
nC = self.devel_labelmatrix.shape[1]
nD=nTr_docs+nTe_docs
print(f'{self.classification_type}, nD={nD}=({nTr_docs}+{nTe_docs}), nF={nfeats}, nC={nC}')
return self
def _load_reuters(self):
data_path = os.path.join(get_data_home(), 'reuters21578')
devel = fetch_reuters21578(subset='train', data_path=data_path)
test = fetch_reuters21578(subset='test', data_path=data_path)
self.classification_type = 'multilabel'
self.devel_raw, self.test_raw = mask_numbers(devel.data), mask_numbers(test.data)
self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel.target, test.target)
self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix
def _load_rcv1(self):
data_path = '../datasets/RCV1-v2/unprocessed_corpus' #TODO: check when missing
devel = fetch_RCV1(subset='train', data_path=data_path)
test = fetch_RCV1(subset='test', data_path=data_path)
self.classification_type = 'multilabel'
self.devel_raw, self.test_raw = mask_numbers(devel.data), mask_numbers(test.data)
self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel.target, test.target)
self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix
def _load_jrc(self, version):
assert version in ['300','all'], 'allowed versions are "300" or "all"'
data_path = "../datasets/JRC_Acquis_v3"
tr_years=list(range(1986, 2006))
te_years=[2006]
if version=='300':
training_docs, tr_cats = fetch_jrcacquis(data_path=data_path, years=tr_years, cat_threshold=1,most_frequent=300)
test_docs, te_cats = fetch_jrcacquis(data_path=data_path, years=te_years, cat_filter=tr_cats)
else:
training_docs, tr_cats = fetch_jrcacquis(data_path=data_path, years=tr_years, cat_threshold=1)
test_docs, te_cats = fetch_jrcacquis(data_path=data_path, years=te_years, cat_filter=tr_cats)
print(f'load jrc-acquis (English) with {len(tr_cats)} tr categories ({len(te_cats)} te categories)')
devel_data = JRCAcquis_Document.get_text(training_docs)
test_data = JRCAcquis_Document.get_text(test_docs)
devel_target = JRCAcquis_Document.get_target(training_docs)
test_target = JRCAcquis_Document.get_target(test_docs)
self.classification_type = 'multilabel'
self.devel_raw, self.test_raw = mask_numbers(devel_data), mask_numbers(test_data)
self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel_target, test_target)
self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix
def _load_ohsumed(self):
data_path = os.path.join(get_data_home(), 'ohsumed50k')
devel = fetch_ohsumed50k(subset='train', data_path=data_path)
test = fetch_ohsumed50k(subset='test', data_path=data_path)
self.classification_type = 'multilabel'
self.devel_raw, self.test_raw = mask_numbers(devel.data), mask_numbers(test.data)
self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel.target, test.target)
self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix
def _load_20news(self):
metadata = ('headers', 'footers', 'quotes')
devel = fetch_20newsgroups(subset='train', remove=metadata)
test = fetch_20newsgroups(subset='test', remove=metadata)
self.classification_type = 'singlelabel'
self.devel_raw, self.test_raw = mask_numbers(devel.data), mask_numbers(test.data)
self.devel_target, self.test_target = devel.target, test.target
self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(self.devel_target.reshape(-1,1), self.test_target.reshape(-1,1))
def _load_fasttext_data(self,name):
data_path='../datasets/fastText'
self.classification_type = 'singlelabel'
name=name.replace('-','_')
train_file = join(data_path,f'{name}.train')
assert os.path.exists(train_file), f'file {name} not found, please place the fasttext data in {data_path}' #' or specify the path' #todo
self.devel_raw, self.devel_target = load_fasttext_format(train_file)
self.test_raw, self.test_target = load_fasttext_format(join(data_path, f'{name}.test'))
self.devel_raw = mask_numbers(self.devel_raw)
self.test_raw = mask_numbers(self.test_raw)
self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(self.devel_target.reshape(-1, 1), self.test_target.reshape(-1, 1))
def _load_wipo(self, classmode, classlevel):
assert classmode in {'singlelabel', 'multilabel'}, 'available class_mode are sl (single-label) or ml (multi-label)'
data_path = '../datasets/WIPO/wipo-gamma/en'
data_proc = '../datasets/WIPO-extracted'
devel = fetch_WIPOgamma(subset='train', classification_level=classlevel, data_home=data_path, extracted_path=data_proc, text_fields=['abstract'])
test = fetch_WIPOgamma(subset='test', classification_level=classlevel, data_home=data_path, extracted_path=data_proc, text_fields=['abstract'])
devel_data = [d.text for d in devel]
test_data = [d.text for d in test]
self.devel_raw, self.test_raw = mask_numbers(devel_data), mask_numbers(test_data)
self.classification_type = classmode
if classmode== 'multilabel':
devel_target = [d.all_labels for d in devel]
test_target = [d.all_labels for d in test]
self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(devel_target, test_target)
self.devel_target, self.test_target = self.devel_labelmatrix, self.test_labelmatrix
else:
devel_target = [d.main_label for d in devel]
test_target = [d.main_label for d in test]
# only for labels with at least one training document
class_id = {labelname:index for index,labelname in enumerate(sorted(set(devel_target)))}
devel_target = np.array([class_id[id] for id in devel_target]).astype(int)
test_target = np.array([class_id.get(id,None) for id in test_target])
if None in test_target:
print(f'deleting {(test_target==None).sum()} test documents without valid categories')
keep_pos = test_target!=None
self.test_raw = (np.asarray(self.test_raw)[keep_pos]).tolist()
test_target = test_target[keep_pos]
test_target=test_target.astype(int)
self.devel_target, self.test_target = devel_target, test_target
self.devel_labelmatrix, self.test_labelmatrix = _label_matrix(self.devel_target.reshape(-1, 1), self.test_target.reshape(-1, 1))
def vectorize(self):
if not hasattr(self, 'Xtr') or not hasattr(self, 'Xte'):
self.Xtr = self._vectorizer.transform(self.devel_raw)
self.Xte = self._vectorizer.transform(self.test_raw)
self.Xtr.sort_indices()
self.Xte.sort_indices()
return self.Xtr, self.Xte
def analyzer(self):
return self._vectorizer.build_analyzer()
@classmethod
def load(cls, dataset_name, pickle_path=None):
if pickle_path:
if os.path.exists(pickle_path):
print(f'loading pickled dataset from {pickle_path}')
dataset = pickle.load(open(pickle_path, 'rb'))
else:
print(f'fetching dataset and dumping it into {pickle_path}')
dataset = Dataset(name=dataset_name)
print('vectorizing for faster processing')
dataset.vectorize()
print('dumping')
pickle.dump(dataset, open(pickle_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
else:
print(f'loading dataset {dataset_name}')
dataset = Dataset(name=dataset_name)
print('[Done]')
return dataset
def _label_matrix(tr_target, te_target):
mlb = MultiLabelBinarizer(sparse_output=True)
ytr = mlb.fit_transform(tr_target)
yte = mlb.transform(te_target)
print(mlb.classes_)
return ytr, yte
def load_fasttext_format(path):
print(f'loading {path}')
labels,docs=[],[]
for line in tqdm(open(path, 'rt').readlines()):
space = line.strip().find(' ')
label = int(line[:space].replace('__label__',''))-1
labels.append(label)
docs.append(line[space+1:])
labels=np.asarray(labels,dtype=int)
return docs,labels
def mask_numbers(data, number_mask='numbermask'):
mask = re.compile(r'\b[0-9][0-9.,-]*\b')
masked = []
for text in tqdm(data, desc='masking numbers'):
masked.append(mask.sub(number_mask, text))
return masked
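
Illustrative aside (not part of the diff): typical usage of the Dataset class above; the pickle path is a placeholder.

from MultiLabel.data.dataset import Dataset

# fetch (or unpickle) the corpus, fit the tf-idf vectorizer on the devel set,
# and obtain the vectorized train/test matrices
dataset = Dataset.load('reuters21578', pickle_path='./pickles/reuters21578.pickle')
Xtr, Xte = dataset.vectorize()
dataset.show()  # prints classification type, nD, nF, nC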

263
MultiLabel/data/jrcacquis_reader.py Executable file

@@ -0,0 +1,263 @@
import os, sys
from os.path import join
import tarfile
import xml.etree.ElementTree as ET
from sklearn.datasets import get_data_home
import pickle
import rdflib
from rdflib.namespace import RDF, SKOS
from rdflib import URIRef
import zipfile
from collections import Counter
from tqdm import tqdm
from random import shuffle
from util.file import *
class JRCAcquis_Document:
def __init__(self, id, name, lang, year, head, body, categories):
self.id = id
self.parallel_id = name
self.lang = lang
self.year = year
self.text = body if not head else head + "\n" + body
self.categories = categories
@classmethod
def get_text(cls, jrc_documents):
return [d.text for d in jrc_documents]
@classmethod
def get_target(cls, jrc_documents):
return [d.categories for d in jrc_documents]
# this is a workaround... for some reason, acute accents are encoded in a non-standard manner in titles
# however, it seems that the title is often appearing as the first paragraph in the text/body (with
# standard codification), so it might be preferable not to read the header after all (as here by default)
def _proc_acute(text):
for ch in ['a','e','i','o','u']:
text = text.replace('%'+ch+'acute%',ch)
return text
def parse_document(file, year, head=False):
root = ET.parse(file).getroot()
doc_name = root.attrib['n'] # e.g., '22006A0211(01)'
doc_lang = root.attrib['lang'] # e.g., 'es'
doc_id = root.attrib['id'] # e.g., 'jrc22006A0211_01-es'
doc_categories = [cat.text for cat in root.findall('.//teiHeader/profileDesc/textClass/classCode[@scheme="eurovoc"]')]
doc_head = _proc_acute(root.find('.//text/body/head').text) if head else ''
doc_body = '\n'.join([p.text for p in root.findall('.//text/body/div[@type="body"]/p')])
def raise_if_empty(field, from_file):
if isinstance(field, str):
if not field.strip():
raise ValueError("Empty field in file %s" % from_file)
raise_if_empty(doc_name, file)
raise_if_empty(doc_lang, file)
raise_if_empty(doc_id, file)
if head: raise_if_empty(doc_head, file)
raise_if_empty(doc_body, file)
return JRCAcquis_Document(id=doc_id, name=doc_name, lang=doc_lang, year=year, head=doc_head, body=doc_body, categories=doc_categories)
#filters out documents that do not contain any category in the cat_filter list, and removes all labels not in cat_filter
def _filter_by_category(doclist, cat_filter):
if not isinstance(cat_filter, frozenset):
cat_filter = frozenset(cat_filter)
filtered = []
for doc in doclist:
doc.categories = list(cat_filter & set(doc.categories))
if doc.categories:
doc.categories.sort()
filtered.append(doc)
print("filtered %d documents out without categories in the filter list" % (len(doclist) - len(filtered)))
return filtered
#filters out categories with fewer than cat_threshold documents (and drops documents left with no remaining category)
def _filter_by_frequency(doclist, cat_threshold):
cat_count = Counter()
for d in doclist:
cat_count.update(d.categories)
freq_categories = [cat for cat,count in cat_count.items() if count>cat_threshold]
freq_categories.sort()
return _filter_by_category(doclist, freq_categories), freq_categories
#selects the most_frequent top categories (and drops documents left with no remaining category)
def _most_common(doclist, most_frequent):
cat_count = Counter()
for d in doclist:
cat_count.update(d.categories)
freq_categories = [cat for cat,count in cat_count.most_common(most_frequent)]
freq_categories.sort()
return _filter_by_category(doclist, freq_categories), freq_categories
def _get_categories(request):
final_cats = set()
for d in request:
final_cats.update(d.categories)
return list(final_cats)
def fetch_jrcacquis(lang='en', data_path=None, years=None, ignore_unclassified=True,
cat_filter=None, cat_threshold=0, most_frequent=-1,
DOWNLOAD_URL_BASE ='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'):
if not data_path:
data_path = get_data_home()
if not os.path.exists(data_path):
os.mkdir(data_path)
request = []
total_read = 0
file_name = 'jrc-' + lang + '.tgz'
archive_path = join(data_path, file_name)
if not os.path.exists(archive_path):
print("downloading language-specific dataset (once and for all) into %s" % data_path)
DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name)
download_file(DOWNLOAD_URL, archive_path)
print("untarring dataset...")
tarfile.open(archive_path, 'r:gz').extractall(data_path)
documents_dir = join(data_path, lang)
print("Reading documents...")
read = 0
for dir in list_dirs(documents_dir):
year = int(dir)
if years is None or year in years:
year_dir = join(documents_dir,dir)
l_y_documents = []
all_documents = list_files(year_dir)
empty = 0
pbar = tqdm(enumerate(all_documents))
for i,doc_file in pbar:
try:
jrc_doc = parse_document(join(year_dir, doc_file), year)
except ValueError:
jrc_doc = None
if jrc_doc and (not ignore_unclassified or jrc_doc.categories):
l_y_documents.append(jrc_doc)
else: empty += 1
read+=1
pbar.set_description(f'from {year_dir}: discarded {empty} without categories or empty fields')
request += l_y_documents
print("Read %d documents for language %s\n" % (read, lang))
total_read += read
final_cats = _get_categories(request)
if cat_filter:
request = _filter_by_category(request, cat_filter)
final_cats = _get_categories(request)
if cat_threshold > 0:
request, final_cats = _filter_by_frequency(request, cat_threshold)
if most_frequent != -1 and len(final_cats) > most_frequent:
request, final_cats = _most_common(request, most_frequent)
return request, final_cats
def print_cat_analysis(request):
cat_count = Counter()
for d in request:
cat_count.update(d.categories)
print("Number of active categories: {}".format(len(cat_count)))
print(cat_count.most_common())
# inspects the Eurovoc thesaurus in order to select a subset of categories
# implemented policies: 'all', 'broadest' (categories with no parent category), and 'leaves' (categories not broader than any other concept)
def inspect_eurovoc(data_path, eurovoc_skos_core_concepts_filename='eurovoc_in_skos_core_concepts.rdf',
eurovoc_url="http://publications.europa.eu/mdr/resource/thesaurus/eurovoc-20160630-0/skos/eurovoc_in_skos_core_concepts.zip",
select="broadest"):
fullpath_pickle = join(data_path, select+'_concepts.pickle')
if os.path.exists(fullpath_pickle):
print("Pickled object found in %s. Loading it." % fullpath_pickle)
return pickle.load(open(fullpath_pickle,'rb'))
fullpath = join(data_path, eurovoc_skos_core_concepts_filename)
if not os.path.exists(fullpath):
zip_path = fullpath + '.zip'
print("Path %s does not exist. Trying to download the SKOS EuroVoc file from %s" % (fullpath, eurovoc_url))
download_file(eurovoc_url, zip_path)
print("Unzipping file...")
zipped = zipfile.ZipFile(zip_path, 'r')
zipped.extract("eurovoc_in_skos_core_concepts.rdf", data_path)
zipped.close()
print("Parsing %s" %fullpath)
g = rdflib.Graph()
g.parse(location=fullpath, format="application/rdf+xml")
if select == "all":
print("Selecting all concepts")
all_concepts = list(g.subjects(RDF.type, SKOS.Concept))
all_concepts = [c.toPython().split('/')[-1] for c in all_concepts]
all_concepts.sort()
selected_concepts = all_concepts
elif select=="broadest":
print("Selecting broadest concepts (those without any other broader concept linked to it)")
all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
narrower_concepts = set(g.subjects(SKOS.broader, None))
broadest_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - narrower_concepts)]
broadest_concepts.sort()
selected_concepts = broadest_concepts
elif select=="leaves":
print("Selecting leaves concepts (those not linked as broader of any other concept)")
all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
broad_concepts = set(g.objects(None, SKOS.broader))
leave_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - broad_concepts)]
leave_concepts.sort()
selected_concepts = leave_concepts
else:
raise ValueError("Selection policy %s is not currently supported" % select)
print("%d %s concepts found" % (len(selected_concepts), leave_concepts))
print("Pickling concept list for faster further requests in %s" % fullpath_pickle)
pickle.dump(selected_concepts, open(fullpath_pickle, 'wb'), pickle.HIGHEST_PROTOCOL)
return selected_concepts
if __name__ == '__main__':
# example code
train_years = list(range(1986, 2006))
test_years = [2006]
cat_policy = 'all' #'leaves'
most_common_cat = 300
JRC_DATAPATH = "../datasets/JRC_Acquis_v3"
cat_list = inspect_eurovoc(JRC_DATAPATH, select=cat_policy)
training_docs, tr_cats = fetch_jrcacquis(lang='en', data_path=JRC_DATAPATH, years=train_years,
cat_filter=None, cat_threshold=1,
most_frequent=most_common_cat)
test_docs, te_cats = fetch_jrcacquis(lang='en', data_path=JRC_DATAPATH, years=test_years,
cat_filter=tr_cats, cat_threshold=1)
# training_cats = jrc_get_categories(training_docs)
# test_cats = jrc_get_categories(test_docs)
# intersection_cats = [c for c in training_cats if c in test_cats]
# training_docs = jrc_filter_by_category(training_docs, intersection_cats)
# test_docs = jrc_filter_by_category(test_docs, intersection_cats)
print(f'JRC-train: {len(training_docs)} documents')
print(f'JRC-test: {len(test_docs)} documents')
print_cat_analysis(training_docs)
print_cat_analysis(test_docs)
"""
JRC-train: 12615 documents, 300 cats
JRC-test: 7055 documents, 300 cats
"""

5
MultiLabel/data/labeled.py Executable file

@@ -0,0 +1,5 @@
class LabelledDocuments:
def __init__(self, data, target, target_names):
self.data=data
self.target=target
self.target_names=target_names
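
Illustrative aside (not part of the diff): the container mirrors the Bunch-style objects returned by scikit-learn loaders; a hypothetical instantiation:

# hypothetical example: two documents over three categories
docs = LabelledDocuments(
    data=['oil prices rise', 'corn harvest starts'],
    target=[[2], [0, 1]],                # per-document lists of class indices
    target_names=['acq', 'corn', 'earn'])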

63
MultiLabel/data/ohsumed_reader.py Executable file

@@ -0,0 +1,63 @@
import os
import pickle
import tarfile
from os.path import join
import urllib.request
from data.labeled import LabelledDocuments
from util.file import create_if_not_exist, download_file_if_not_exists
import math
def fetch_ohsumed50k(data_path=None, subset='train', train_test_split=0.7):
_dataname = 'ohsumed50k'
if data_path is None:
data_path = join(os.path.expanduser('~'), _dataname)
create_if_not_exist(data_path)
pickle_file = join(data_path, _dataname + '.' + subset + str(train_test_split) + '.pickle')
if not os.path.exists(pickle_file):
DOWNLOAD_URL = ('http://disi.unitn.it/moschitti/corpora/ohsumed-all-docs.tar.gz')
archive_path = os.path.join(data_path, 'ohsumed-all-docs.tar.gz')
download_file_if_not_exists(DOWNLOAD_URL, archive_path)
untardir = 'ohsumed-all'
if not os.path.exists(os.path.join(data_path, untardir)):
print("untarring ohsumed...")
tarfile.open(archive_path, 'r:gz').extractall(data_path)
target_names = []
doc_classes = dict()
class_docs = dict()
content = dict()
doc_ids = set()
for cat_id in os.listdir(join(data_path, untardir)):
target_names.append(cat_id)
class_docs[cat_id] = []
for doc_id in os.listdir(join(data_path, untardir, cat_id)):
doc_ids.add(doc_id)
text_content = open(join(data_path, untardir, cat_id, doc_id), 'r').read()
if doc_id not in doc_classes: doc_classes[doc_id] = []
doc_classes[doc_id].append(cat_id)
if doc_id not in content: content[doc_id] = text_content
class_docs[cat_id].append(doc_id)
target_names.sort()
print('Read %d different documents' % len(doc_ids))
splitdata = dict({'train': [], 'test': []})
for cat_id in target_names:
free_docs = [d for d in class_docs[cat_id] if (d not in splitdata['train'] and d not in splitdata['test'])]
if len(free_docs) > 0:
split_point = int(math.floor(len(free_docs) * train_test_split))
splitdata['train'].extend(free_docs[:split_point])
splitdata['test'].extend(free_docs[split_point:])
for split in ['train', 'test']:
dataset = LabelledDocuments([], [], target_names)
for doc_id in splitdata[split]:
dataset.data.append(content[doc_id])
dataset.target.append([target_names.index(cat_id) for cat_id in doc_classes[doc_id]])
pickle.dump(dataset,
open(join(data_path, _dataname + '.' + split + str(train_test_split) + '.pickle'), 'wb'),
protocol=pickle.HIGHEST_PROTOCOL)
print(pickle_file)
return pickle.load(open(pickle_file, 'rb'))
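
Illustrative aside (not part of the diff): typical usage of the reader above; the first call downloads the archive and builds the pickles under the user's home directory.

train = fetch_ohsumed50k(subset='train')  # 70/30 train/test split by default
test = fetch_ohsumed50k(subset='test')
print(f'{len(train.data)} training docs, {len(train.target_names)} categories')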

152
MultiLabel/data/rcv_reader.py Executable file

@@ -0,0 +1,152 @@
from zipfile import ZipFile
import xml.etree.ElementTree as ET
from data.labeled import LabelledDocuments
from util.file import list_files
from os.path import join, exists
from util.file import download_file_if_not_exists
import re
from collections import Counter
RCV1_TOPICHIER_URL = "http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a02-orig-topics-hierarchy/rcv1.topics.hier.orig"
RCV1_BASE_URL = "http://www.daviddlewis.com/resources/testcollections/rcv1/"
rcv1_test_data_gz = ['lyrl2004_tokens_test_pt0.dat.gz',
'lyrl2004_tokens_test_pt1.dat.gz',
'lyrl2004_tokens_test_pt2.dat.gz',
'lyrl2004_tokens_test_pt3.dat.gz']
rcv1_train_data_gz = ['lyrl2004_tokens_train.dat.gz']
rcv1_doc_cats_data_gz = 'rcv1-v2.topics.qrels.gz'
class RCV_Document:
def __init__(self, id, text, categories, date=''):
self.id = id
self.date = date
self.text = text
self.categories = categories
class IDRangeException(Exception): pass
nwords = []
def parse_document(xml_content, valid_id_range=None):
root = ET.fromstring(xml_content)
doc_id = root.attrib['itemid']
if valid_id_range is not None:
if not valid_id_range[0] <= int(doc_id) <= valid_id_range[1]:
raise IDRangeException
doc_categories = [cat.attrib['code'] for cat in
root.findall('.//metadata/codes[@class="bip:topics:1.0"]/code')]
doc_date = root.attrib['date']
doc_title = root.find('.//title').text
doc_headline = root.find('.//headline').text
doc_body = '\n'.join([p.text for p in root.findall('.//text/p')])
if not doc_body:
raise ValueError('Empty document')
if doc_title is None: doc_title = ''
if doc_headline is None or doc_headline in doc_title: doc_headline = ''
text = '\n'.join([doc_title, doc_headline, doc_body]).strip()
return RCV_Document(id=doc_id, text=text, categories=doc_categories, date=doc_date)
def fetch_RCV1(data_path, subset='all'):
assert subset in ['train', 'test', 'all'], 'split should either be "train", "test", or "all"'
request = []
labels = set()
read_documents = 0
training_documents = 23149
test_documents = 781265
if subset == 'all':
split_range = (2286, 810596)
expected = training_documents+test_documents
elif subset == 'train':
split_range = (2286, 26150)
expected = training_documents
else:
split_range = (26151, 810596)
expected = test_documents
# global nwords
# nwords=[]
for part in list_files(data_path):
if not re.match(r'\d+\.zip', part): continue
target_file = join(data_path, part)
assert exists(target_file), \
"You don't seem to have the file "+part+" in " + data_path + ", and the RCV1 corpus can not be downloaded"+\
" w/o a formal permission. Please, refer to " + RCV1_BASE_URL + " for more information."
zipfile = ZipFile(target_file)
for xmlfile in zipfile.namelist():
xmlcontent = zipfile.open(xmlfile).read()
try:
doc = parse_document(xmlcontent, valid_id_range=split_range)
labels.update(doc.categories)
request.append(doc)
read_documents += 1
except (IDRangeException,ValueError) as e:
pass
print('\r[{}] read {} documents'.format(part, len(request)), end='')
if read_documents == expected: break
if read_documents == expected: break
print()
# print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords)))
return LabelledDocuments(data=[d.text for d in request], target=[d.categories for d in request], target_names=list(labels))
def fetch_topic_hierarchy(path, topics='all'):
assert topics in ['all', 'leaves']
download_file_if_not_exists(RCV1_TOPICHIER_URL, path)
hierarchy = {}
for line in open(path, 'rt'):
parts = line.strip().split()
parent,child = parts[1],parts[3]
if parent not in hierarchy:
hierarchy[parent]=[]
hierarchy[parent].append(child)
del hierarchy['None']
del hierarchy['Root']
print(hierarchy)
if topics=='all':
topics = set(hierarchy.keys())
for parent in hierarchy.keys():
topics.update(hierarchy[parent])
return list(topics)
elif topics=='leaves':
parents = set(hierarchy.keys())
childs = set()
for parent in hierarchy.keys():
childs.update(hierarchy[parent])
return list(childs.difference(parents))
if __name__=='__main__':
# example
RCV1_PATH = '../../datasets/RCV1-v2/unprocessed_corpus'
rcv1_train = fetch_RCV1(RCV1_PATH, subset='train')
rcv1_test = fetch_RCV1(RCV1_PATH, subset='test')
print('read {} documents in rcv1-train, and {} labels'.format(len(rcv1_train.data), len(rcv1_train.target_names)))
print('read {} documents in rcv1-test, and {} labels'.format(len(rcv1_test.data), len(rcv1_test.target_names)))
cats = Counter()
for doc_cats in rcv1_train.target: cats.update(doc_cats)
print('RCV1', cats)

189
MultiLabel/data/reuters21578_reader.py Executable file

@@ -0,0 +1,189 @@
# Modified version of the code originally implemented by Eustache Diemert <eustache@diemert.fr>
# @FedericoV <https://github.com/FedericoV/>
# with License: BSD 3 clause
import os.path
import re
import tarfile
from sklearn.datasets import get_data_home
from six.moves import html_parser
from six.moves import urllib
import pickle
from glob import glob
import numpy as np
from data.labeled import LabelledDocuments
def _not_in_sphinx():
# Hack to detect whether we are running by the sphinx builder
return '__file__' in globals()
class ReutersParser(html_parser.HTMLParser):
"""Utility class to parse a SGML file and yield documents one at a time."""
def __init__(self, encoding='latin-1', data_path=None):
self.data_path = data_path
self.download_if_not_exist()
self.tr_docs = []
self.te_docs = []
html_parser.HTMLParser.__init__(self)
self._reset()
self.encoding = encoding
self.empty_docs = 0
def handle_starttag(self, tag, attrs):
method = 'start_' + tag
getattr(self, method, lambda x: None)(attrs)
def handle_endtag(self, tag):
method = 'end_' + tag
getattr(self, method, lambda: None)()
def _reset(self):
self.in_title = 0
self.in_body = 0
self.in_topics = 0
self.in_topic_d = 0
self.in_unproc_text = 0
self.title = ""
self.body = ""
self.topics = []
self.topic_d = ""
self.text = ""
def parse(self, fd):
for chunk in fd:
self.feed(chunk.decode(self.encoding))
self.close()
def handle_data(self, data):
if self.in_body:
self.body += data
elif self.in_title:
self.title += data
elif self.in_topic_d:
self.topic_d += data
elif self.in_unproc_text:
self.text += data
def start_reuters(self, attributes):
topic_attr = attributes[0][1]
lewissplit_attr = attributes[1][1]
self.lewissplit = u'unused'
if topic_attr==u'YES':
if lewissplit_attr == u'TRAIN':
self.lewissplit = 'train'
elif lewissplit_attr == u'TEST':
self.lewissplit = 'test'
pass
def end_reuters(self):
self.body = re.sub(r'\s+', r' ', self.body)
if self.lewissplit != u'unused':
parsed_doc = {'title': self.title, 'body': self.body, 'unproc':self.text, 'topics': self.topics}
if (self.title+self.body+self.text).strip() == '':
self.empty_docs += 1
if self.lewissplit == u'train':
self.tr_docs.append(parsed_doc)
elif self.lewissplit == u'test':
self.te_docs.append(parsed_doc)
self._reset()
def start_title(self, attributes):
self.in_title = 1
def end_title(self):
self.in_title = 0
def start_body(self, attributes):
self.in_body = 1
def end_body(self):
self.in_body = 0
def start_topics(self, attributes):
self.in_topics = 1
def end_topics(self):
self.in_topics = 0
def start_text(self, attributes):
if len(attributes)>0 and attributes[0][1] == u'UNPROC':
self.in_unproc_text = 1
def end_text(self):
self.in_unproc_text = 0
def start_d(self, attributes):
self.in_topic_d = 1
def end_d(self):
if self.in_topics:
self.topics.append(self.topic_d)
self.in_topic_d = 0
self.topic_d = ""
def download_if_not_exist(self):
DOWNLOAD_URL = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
'reuters21578-mld/reuters21578.tar.gz')
ARCHIVE_FILENAME = 'reuters21578.tar.gz'
if self.data_path is None:
self.data_path = os.path.join(get_data_home(), "reuters")
if not os.path.exists(self.data_path):
"""Download the dataset."""
print("downloading dataset (once and for all) into %s" % self.data_path)
os.mkdir(self.data_path)
def progress(blocknum, bs, size):
total_sz_mb = '%.2f MB' % (size / 1e6)
current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
if _not_in_sphinx():
print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='')
archive_path = os.path.join(self.data_path, ARCHIVE_FILENAME)
urllib.request.urlretrieve(DOWNLOAD_URL, filename=archive_path,
reporthook=progress)
if _not_in_sphinx():
print('\r', end='')
print("untarring Reuters dataset...")
tarfile.open(archive_path, 'r:gz').extractall(self.data_path)
print("done.")
def fetch_reuters21578(data_path=None, subset='train'):
if data_path is None:
data_path = os.path.join(get_data_home(), 'reuters21578')
reuters_pickle_path = os.path.join(data_path, "reuters." + subset + ".pickle")
if not os.path.exists(reuters_pickle_path):
parser = ReutersParser(data_path=data_path)
for filename in glob(os.path.join(data_path, "*.sgm")):
parser.parse(open(filename, 'rb'))
# index category names with a unique numerical code (only considering categories with training examples)
tr_categories = np.unique(np.concatenate([doc['topics'] for doc in parser.tr_docs])).tolist()
def pickle_documents(docs, subset):
for doc in docs:
doc['topics'] = [tr_categories.index(t) for t in doc['topics'] if t in tr_categories]
pickle_docs = {'categories': tr_categories, 'documents': docs}
pickle.dump(pickle_docs, open(os.path.join(data_path, "reuters." + subset + ".pickle"), 'wb'),
protocol=pickle.HIGHEST_PROTOCOL)
return pickle_docs
pickle_tr = pickle_documents(parser.tr_docs, "train")
pickle_te = pickle_documents(parser.te_docs, "test")
# self.sout('Empty docs %d' % parser.empty_docs)
requested_subset = pickle_tr if subset == 'train' else pickle_te
else:
requested_subset = pickle.load(open(reuters_pickle_path, 'rb'))
data = [(u'{title}\n{body}\n{unproc}'.format(**doc), doc['topics']) for doc in requested_subset['documents']]
text_data, topics = zip(*data)
return LabelledDocuments(data=text_data, target=topics, target_names=requested_subset['categories'])
if __name__=='__main__':
reuters_train = fetch_reuters21578(subset='train')
print(reuters_train.data)

280
MultiLabel/data/tsr_function__.py Executable file

@@ -0,0 +1,280 @@
import math
import numpy as np
from scipy.stats import t
from scipy.stats import norm
from joblib import Parallel, delayed
import time
from scipy.sparse import csr_matrix, csc_matrix
STWFUNCTIONS = ['dotn', 'ppmi', 'ig', 'chi2', 'cw', 'wp']
def get_probs(tpr, fpr, pc):
# tpr = p(t|c) = p(tp)/p(c) = p(tp)/(p(tp)+p(fn))
# fpr = p(t|_c) = p(fp)/p(_c) = p(fp)/(p(fp)+p(tn))
pnc = 1.0 - pc
tp = tpr * pc
fn = pc - tp
fp = fpr * pnc
tn = pnc - fp
return ContTable(tp=tp, fn=fn, fp=fp, tn=tn)
def apply_tsr(tpr, fpr, pc, tsr):
cell = get_probs(tpr, fpr, pc)
return tsr(cell)
def positive_information_gain(cell):
if cell.tpr() < cell.fpr():
return 0.0
else:
return information_gain(cell)
def posneg_information_gain(cell):
ig = information_gain(cell)
if cell.tpr() < cell.fpr():
return -ig
else:
return ig
def __ig_factor(p_tc, p_t, p_c):
den = p_t * p_c
if den != 0.0 and p_tc != 0:
return p_tc * math.log(p_tc / den, 2)
else:
return 0.0
def information_gain(cell):
return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \
__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) +\
__ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + \
__ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())
def information_gain_mod(cell):
return (__ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())) \
- (__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) + __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()))
def pointwise_mutual_information(cell):
return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c())
def gain_ratio(cell):
pc = cell.p_c()
pnc = 1.0 - pc
norm = pc * math.log(pc, 2) + pnc * math.log(pnc, 2)
return information_gain(cell) / (-norm)
def chi_square(cell):
den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c()
if den==0.0: return 0.0
num = gss(cell)**2
return num / den
def relevance_frequency(cell):
a = cell.tp
c = cell.fp
if c == 0: c = 1
return math.log(2.0 + (a * 1.0 / c), 2)
def idf(cell):
if cell.p_f()>0:
return math.log(1.0 / cell.p_f())
return 0.0
def gss(cell):
return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn()
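# confidence interval for a proportion (Wilson-like score interval: normal approximation for n>30, Student's t otherwise); used by conf_weight below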
def conf_interval(xt, n):
if n>30:
z2 = 3.84145882069 # norm.ppf(0.5+0.95/2.0)**2
else:
z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2
p = (xt + 0.5 * z2) / (n + z2)
amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2))
return p, amplitude
def strength(minPosRelFreq, minPos, maxNeg):
if minPos > maxNeg:
return math.log(2.0 * minPosRelFreq, 2.0)
else:
return 0.0
#set cancel_features=True to allow some features to be weighted as 0 (as in the original article)
#however, for some extremely imbalanced datasets this caused all documents to be weighted 0
def conf_weight(cell, cancel_features=False):
c = cell.get_c()
not_c = cell.get_not_c()
tp = cell.tp
fp = cell.fp
pos_p, pos_amp = conf_interval(tp, c)
neg_p, neg_amp = conf_interval(fp, not_c)
min_pos = pos_p-pos_amp
max_neg = neg_p+neg_amp
den = (min_pos + max_neg)
minpos_relfreq = min_pos / (den if den != 0 else 1)
str_tplus = strength(minpos_relfreq, min_pos, max_neg)
if str_tplus == 0 and not cancel_features:
return 1e-20
return str_tplus
def word_prob(cell):
return cell.tpr()
class ContTable:
def __init__(self, tp=0, tn=0, fp=0, fn=0):
self.tp=tp
self.tn=tn
self.fp=fp
self.fn=fn
def get_d(self): return self.tp + self.tn + self.fp + self.fn
def get_c(self): return self.tp + self.fn
def get_not_c(self): return self.tn + self.fp
def get_f(self): return self.tp + self.fp
def get_not_f(self): return self.tn + self.fn
def p_c(self): return (1.0*self.get_c())/self.get_d()
def p_not_c(self): return 1.0-self.p_c()
def p_f(self): return (1.0*self.get_f())/self.get_d()
def p_not_f(self): return 1.0-self.p_f()
def p_tp(self): return (1.0*self.tp) / self.get_d()
def p_tn(self): return (1.0*self.tn) / self.get_d()
def p_fp(self): return (1.0*self.fp) / self.get_d()
def p_fn(self): return (1.0*self.fn) / self.get_d()
def tpr(self):
c = 1.0*self.get_c()
return self.tp / c if c > 0.0 else 0.0
def fpr(self):
_c = 1.0*self.get_not_c()
return self.fp / _c if _c > 0.0 else 0.0
def round_robin_selection(X, Y, k, tsr_function=positive_information_gain):
print(f'[selecting {k} terms]')
nC = Y.shape[1]
FC = get_tsr_matrix(get_supervised_matrix(X, Y), tsr_function).T
best_features_idx = np.argsort(-FC, axis=0).flatten()
tsr_values = FC.flatten()
selected_indexes_set = set()
selected_indexes = list()
selected_value = list()
from_category = list()
round_robin = iter(best_features_idx)
values_iter = iter(tsr_values)
round=0
while len(selected_indexes) < k:
term_idx = next(round_robin)
term_val = next(values_iter)
if term_idx not in selected_indexes_set:
selected_indexes_set.add(term_idx)
selected_indexes.append(term_idx)
selected_value.append(term_val)
from_category.append(round)
round = (round + 1) % nC
return np.asarray(selected_indexes, dtype=int), np.asarray(selected_value, dtype=float), np.asarray(from_category)
def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD):
tp_ = len(positive_document_indexes & feature_document_indexes)
fp_ = len(feature_document_indexes - positive_document_indexes)
fn_ = len(positive_document_indexes - feature_document_indexes)
tn_ = nD - (tp_ + fp_ + fn_)
return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_)
def category_tables(feature_sets, category_sets, c, nD, nF):
return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)]
"""
Computes the nC x nF supervised matrix M where Mcf is the 4-cell contingency table for feature f and class c.
Efficiency O(nF x nC x log(S)) where S is the sparse factor
"""
def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1):
nD, nF = coocurrence_matrix.shape
nD2, nC = label_matrix.shape
if nD != nD2:
raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' %
(coocurrence_matrix.shape,label_matrix.shape))
def nonzero_set(matrix, col):
return set(matrix[:, col].nonzero()[0])
if isinstance(coocurrence_matrix, csr_matrix):
coocurrence_matrix = csc_matrix(coocurrence_matrix)
feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)]
category_sets = [nonzero_set(label_matrix, c) for c in range(nC)]
cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC))
return np.array(cell_matrix)
# obtains the matrix T where Tcf=tsr(f,c) is the tsr score for category c and feature f
def get_tsr_matrix(cell_matrix, tsr_score_function):
nC,nF = cell_matrix.shape
tsr_matrix = [[tsr_score_function(cell_matrix[c,f]) for f in range(nF)] for c in range(nC)]
return np.array(tsr_matrix)
""" The Fisher-score [1] is not computed on the 4-cell contingency table, but can
take as input any real-valued feature column (e.g., tf-idf weights).
feat is the feature vector, and c is a binary classification vector.
This implementation covers only the binary case, while the formula is defined for multiclass
single-label scenarios, for which the version [2] might be preferred.
[1] R.O. Duda, P.E. Hart, and D.G. Stork. Pattern classification. Wiley-interscience, 2012.
[2] Gu, Q., Li, Z., & Han, J. (2012). Generalized fisher score for feature selection. arXiv preprint arXiv:1202.3725.
"""
def fisher_score_binary(feat, c):
neg = np.ones_like(c) - c
npos = np.sum(c)
nneg = np.sum(neg)
mupos = np.mean(feat[c == 1])
muneg = np.mean(feat[neg == 1])
mu = np.mean(feat)
stdpos = np.std(feat[c == 1])
stdneg = np.std(feat[neg == 1])
num = npos * ((mupos - mu) ** 2) + nneg * ((muneg - mu) ** 2)
den = npos * (stdpos ** 2) + nneg * (stdneg ** 2)
if den>0:
return num / den
else:
return num
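
Illustrative aside (not part of the diff): a small synthetic example of how the utilities above compose, selecting the top-k features round-robin across classes by positive information gain.

import numpy as np
from scipy.sparse import csr_matrix

rng = np.random.default_rng(0)
X = csr_matrix(rng.integers(0, 2, size=(100, 50)))  # 100 docs x 50 features
Y = csr_matrix(rng.integers(0, 2, size=(100, 4)))   # 100 docs x 4 classes

# pick the 10 best features, alternating across the 4 classes
idx, scores, classes = round_robin_selection(X, Y, k=10)
print(idx, scores, classes, sep='\n')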

212
MultiLabel/data/wipo_reader.py Executable file

@@ -0,0 +1,212 @@
#https://www.wipo.int/classifications/ipc/en/ITsupport/Categorization/dataset/
import os, sys
from os.path import exists, join
from util.file import *
from zipfile import ZipFile
import xml.etree.ElementTree as ET
from tqdm import tqdm
import numpy as np
import pickle
from joblib import Parallel, delayed
WIPO_URL= 'https://www.wipo.int/classifications/ipc/en/ITsupport/Categorization/dataset/'
class WipoGammaDocument:
def __init__(self, id, text, main_label, all_labels):
self.id = id
self.text = text
self.main_label = main_label
self.all_labels = all_labels
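# strips nested <claim-text> tags inside the <claims>...</claims> span so that the claims' text is not fragmented during XML parsing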
def remove_nested_claimtext_tags(xmlcontent):
from_pos = xmlcontent.find(b'<claims')
to_pos = xmlcontent.find(b'</claims>')
if from_pos > -1 and to_pos > -1:
in_between = xmlcontent[from_pos:to_pos].replace(b'<claim-text>',b'').replace(b'</claim-text>',b'')
xmlcontent = (xmlcontent[:from_pos]+in_between+xmlcontent[to_pos:]).strip()
return xmlcontent
def parse_document(xml_content, text_fields, limit_description):
root = ET.fromstring(remove_nested_claimtext_tags(xml_content))
doc_id = root.attrib['ucid']
lang = root.attrib['lang']
#take categories from the categorization up to the "subclass" level
main_group = set(t.text[:6] for t in root.findall('.//bibliographic-data/technical-data/classifications-ipcr/classification-ipcr[@computed="from_ecla_to_ipc_SG"][@generated_main_IPC="true"]'))
sec_groups = set(t.text[:6] for t in root.findall('.//bibliographic-data/technical-data/classifications-ipcr/classification-ipcr[@computed="from_ecla_to_ipc_SG"][@generated_main_IPC="false"]'))
sec_groups.update(main_group)
assert len(main_group) == 1, 'more than one main group'
main_group = list(main_group)[0]
sec_groups = sorted(list(sec_groups))
assert lang == 'EN', f'only English documents allowed (doc {doc_id})'
doc_text_fields=[]
if 'abstract' in text_fields:
abstract = '\n'.join(filter(None, [t.text for t in root.findall('.//abstract[@lang="EN"]/p')]))
doc_text_fields.append(abstract)
if 'description' in text_fields:
description = '\n'.join(filter(None, [t.text for t in root.findall('.//description[@lang="EN"]/p')]))
if limit_description>-1:
description=' '.join(description.split()[:limit_description])
doc_text_fields.append(description)
if 'claims' in text_fields:
claims = '\n'.join(filter(None, [t.text for t in root.findall('.//claims[@lang="EN"]/claim')]))
doc_text_fields.append(claims)
text = '\n'.join(doc_text_fields)
if text:
return WipoGammaDocument(doc_id, text, main_group, sec_groups)
else:
return None
def extract(fin, fout, text_fields, limit_description):
zipfile = ZipFile(fin)
ndocs=0
with open(fout, 'wt') as out:
for xmlfile in tqdm(zipfile.namelist()):
if xmlfile.endswith('.xml'):
xmlcontent = zipfile.open(xmlfile).read()
document = parse_document(xmlcontent, text_fields, limit_description)
if document:
line_text = document.text.replace('\n', ' ').replace('\t', ' ').strip()
assert line_text, f'empty document in {xmlfile}'
all_labels = ' '.join(document.all_labels)
out.write('\t'.join([document.id, document.main_label, all_labels, line_text]))
out.write('\n')
ndocs+=1
out.flush()
def read_classification_file(data_path, classification_level):
assert classification_level in ['subclass', 'maingroup'], 'wrong classification requested'
z = ZipFile(join(data_path,'EnglishWipoGamma1.zip'))
inpath='Wipo_Gamma/English/TrainTestSpits'
document_labels = dict()
train_ids, test_ids = set(), set()
labelcut = LabelCut(classification_level)
for subset in tqdm(['train', 'test'], desc='loading classification file'):
target_subset = train_ids if subset=='train' else test_ids
if classification_level == 'subclass':
file = f'{subset}set_en_sc.parts' #sub-class level
else:
file = f'{subset}set_en_mg.parts' #main-group level
for line in z.open(f'{inpath}/{file}').readlines():
line = line.decode().strip().split(',')
id = line[0]
id = id[id.rfind('/')+1:].replace('.xml','')
labels = labelcut.trim(line[1:])
document_labels[id]=labels
target_subset.add(id)
return document_labels, train_ids, test_ids
class LabelCut:
"""
Labels consist of 1 char for the section, 2 chars for the class, 1 char for the subclass, 2 chars for the main group, and so on.
This class cuts the label at the desired level (4 chars for subclass, or 6 for maingroup)
"""
def __init__(self, classification_level):
assert classification_level in {'subclass','maingroup'}, 'unknown classification level'
if classification_level == 'subclass': self.cut = 4
else: self.cut = 6
def trim(self, label):
if isinstance(label, list):
return sorted(set([l[:self.cut] for l in label]))
else:
return label[:self.cut]
def fetch_WIPOgamma(subset, classification_level, data_home, extracted_path, text_fields = ['abstract', 'description'], limit_description=300):
"""
Fetches the WIPO-gamma dataset
:param subset: 'train' or 'test' split
:param classification_level: the classification level, either 'subclass' or 'maingroup'
:param data_home: directory containing the original 11 English zips
:param extracted_path: directory used to extract and process the original files
:param text_fields: the fields to extract, among 'abstract', 'description', and 'claims'
:param limit_description: the maximum number of words to take from the description field (default 300); set to -1 for all
:return:
"""
assert subset in {"train", "test"}, 'unknown target request (valid ones are "train" or "test")'
assert len(text_fields)>0, 'at least some text field should be indicated'
if not exists(data_home):
raise ValueError(f'{data_home} does not exist, and the dataset cannot be downloaded automatically, '
f'since you need to request permission. Please refer to {WIPO_URL}')
create_if_not_exist(extracted_path)
config = f'{"-".join(text_fields)}'
if 'description' in text_fields: config += f'-{limit_description}'
pickle_path=join(extracted_path, f'wipo-{subset}-{classification_level}-{config}.pickle')
if exists(pickle_path):
print(f'loading pickled file in {pickle_path}')
return pickle.load(open(pickle_path,'rb'))
print('pickle file not found, processing...(this will take some minutes)')
extracted = sum([exists(f'{extracted_path}/EnglishWipoGamma{(i+1)}-{config}.txt') for i in range(11)])==11
if not extracted:
print(f'extraction files not found, extracting files in {data_home}... (this will take some additional minutes)')
Parallel(n_jobs=-1)(
delayed(extract)(
join(data_home, file), join(extracted_path, file.replace('.zip', f'-{config}.txt')), text_fields, limit_description
)
for file in list_files(data_home)
)
doc_labels, train_ids, test_ids = read_classification_file(data_home, classification_level=classification_level) # or maingroup
print(f'{len(doc_labels)} documents classified split in {len(train_ids)} train and {len(test_ids)} test documents')
train_request = []
test_request = []
pbar = tqdm([filename for filename in list_files(extracted_path) if filename.endswith(f'-{config}.txt')])
labelcut = LabelCut(classification_level)
errors=0
for proc_file in pbar:
pbar.set_description(f'processing {proc_file} [errors={errors}]')
if not proc_file.endswith(f'-{config}.txt'): continue
lines = open(f'{extracted_path}/{proc_file}', 'rt').readlines()
for lineno,line in enumerate(lines):
parts = line.split('\t')
assert len(parts)==4, f'wrong format in {extracted_path}/{proc_file} line {lineno}'
id,mainlabel,alllabels,text=parts
mainlabel = labelcut.trim(mainlabel)
alllabels = labelcut.trim(alllabels.split())
# assert id in train_ids or id in test_ids, f'id {id} out of scope'
if id not in train_ids and id not in test_ids:
errors+=1
else:
# assert mainlabel == doc_labels[id][0], 'main label not consistent'
request = train_request if id in train_ids else test_request
request.append(WipoGammaDocument(id, text, mainlabel, alllabels))
print('pickling requests for faster subsequent runs')
pickle.dump(train_request, open(join(extracted_path, f'wipo-train-{classification_level}-{config}.pickle'), 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
pickle.dump(test_request, open(join(extracted_path, f'wipo-test-{classification_level}-{config}.pickle'), 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
if subset== 'train':
return train_request
else:
return test_request
if __name__=='__main__':
data_home = '../../datasets/WIPO/wipo-gamma/en'
extracted_path = '../../datasets/WIPO-extracted'
train = fetch_WIPOgamma(subset='train', classification_level='subclass', data_home=data_home, extracted_path=extracted_path, text_fields=['abstract'])
test = fetch_WIPOgamma(subset='test', classification_level='subclass', data_home=data_home, extracted_path=extracted_path, text_fields=['abstract'])
# train = fetch_WIPOgamma(subset='train', classification_level='maingroup', data_home=data_home, extracted_path=extracted_path)
# test = fetch_WIPOgamma(subset='test', classification_level='maingroup', data_home=data_home, extracted_path=extracted_path)
print('Done')

118
MultiLabel/gentables.py Normal file

@@ -0,0 +1,118 @@
import argparse
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
import itertools
from sklearn.multioutput import ClassifierChain
from tqdm import tqdm
from skmultilearn.dataset import load_dataset, available_data_sets
from scipy.sparse import csr_matrix
import quapy as qp
from MultiLabel.main import load_results, SKMULTILEARN_RED_DATASETS, TC_DATASETS, sample_size
from MultiLabel.mlclassification import MLStackedClassifier
from MultiLabel.mldata import MultilabelledCollection
from MultiLabel.mlquantification import MLNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
MLACC, \
MLPACC, MLNaiveAggregativeQuantifier
from MultiLabel.tabular import Table
from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy
import numpy as np
from data.dataset import Dataset
from mlevaluation import ml_natural_prevalence_prediction, ml_artificial_prevalence_prediction, check_error_str
import sys
import os
import pickle
models = [#'MLPE',
'NaiveCC', 'NaivePCC', 'NaivePCCcal', 'NaiveACC', 'NaivePACC', 'NaivePACCcal', 'NaiveACCit', 'NaivePACCit',
#'NaiveHDy', 'NaiveSLD',
'ChainCC', 'ChainPCC', 'ChainACC', 'ChainPACC',
'StackCC', 'StackPCC', 'StackPCCcal', 'StackACC', 'StackPACC', 'StackPACCcal', 'StackACCit', 'StackPACCit',
'MRQ-CC', 'MRQ-PCC', 'MRQ-ACC', 'MRQ-PACC', 'MRQ-ACCit', 'MRQ-PACCit',
'StackMRQ-CC', 'StackMRQ-PCC', 'StackMRQ-ACC', 'StackMRQ-PACC',
'MRQ-StackCC', 'MRQ-StackPCC', 'MRQ-StackACC', 'MRQ-StackPACC',
'StackMRQ-StackCC', 'StackMRQ-StackPCC', 'StackMRQ-StackACC', 'StackMRQ-StackPACC',
'MRQ-StackCC-app', 'MRQ-StackPCC-app', 'MRQ-StackACC-app', 'MRQ-StackPACC-app',
'StackMRQ-StackCC-app', 'StackMRQ-StackPCC-app', 'StackMRQ-StackACC-app', 'StackMRQ-StackPACC-app',
'LSP-CC', 'LSP-ACC', 'MLKNN-CC', 'MLKNN-ACC',
'MLAdjustedC', 'MLStackAdjustedC', 'MLprobAdjustedC', 'MLStackProbAdjustedC'
]
# datasets = sorted(set([x[0] for x in available_data_sets().keys()]))
datasets = TC_DATASETS
def generate_table(path, protocol, error):
def compute_score_job(args):
dataset, model = args
result_path = f'{opt.results}/{dataset}_{model}.pkl'
if os.path.exists(result_path):
print('+', end='')
sys.stdout.flush()
result = load_results(result_path)
true_prevs, estim_prevs = result[protocol]
scores = np.asarray([error(trues, estims) for trues, estims in zip(true_prevs, estim_prevs)]).flatten()
return dataset, model, scores
print('-', end='')
sys.stdout.flush()
return None
print(f'\ngenerating {path}')
table = Table(datasets, models, prec_mean=4, significance_test='wilcoxon')
results = qp.util.parallel(compute_score_job, list(itertools.product(datasets, models)), n_jobs=-1)
print()
for r in results:
if r is not None:
dataset, model, scores = r
table.add(dataset, model, scores)
save_table(table, path)
save_table(table.getRankTable(), path.replace('.tex','.rank.tex'))
def save_table(table, path):
tabular = """
\\resizebox{\\textwidth}{!}{%
\\begin{tabular}{|c||""" + ('c|' * len(models)) + """} \hline
"""
dataset_replace = {'tmc2007_500': r'tmc2007\_500', 'tmc2007_500-red': r'tmc2007\_500-red'}
method_replace = {}
tabular += table.latexTabularT(benchmark_replace=dataset_replace, method_replace=method_replace, side=True)
tabular += """
\end{tabular}%
}
"""
with open(path, 'wt') as foo:
foo.write(tabular)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Experiments for multi-label quantification')
parser.add_argument('--results', type=str, default='./results', metavar='str',
help=f'path where to store the results')
parser.add_argument('--tablepath', type=str, default='./tables', metavar='str',
help=f'path where to store the tables')
opt = parser.parse_args()
assert os.path.exists(opt.results), f'result directory {opt.results} does not exist'
os.makedirs(opt.tablepath, exist_ok=True)
qp.environ["SAMPLE_SIZE"] = sample_size
absolute_error = qp.error.ae
relative_absolute_error = qp.error.rae
generate_table(f'{opt.tablepath}/npp.ae.tex', protocol='npp', error=absolute_error)
generate_table(f'{opt.tablepath}/app.ae.tex', protocol='app', error=absolute_error)
generate_table(f'{opt.tablepath}/npp.rae.tex', protocol='npp', error=relative_absolute_error)
generate_table(f'{opt.tablepath}/app.rae.tex', protocol='app', error=relative_absolute_error)

290
MultiLabel/main.py Normal file

@@ -0,0 +1,290 @@
import argparse
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
import itertools
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import ClassifierChain
from tqdm import tqdm
from skmultilearn.dataset import load_dataset, available_data_sets
from scipy.sparse import csr_matrix
import quapy as qp
from MultiLabel.mlclassification import MLStackedClassifier, LabelSpacePartion, MLTwinSVM, MLknn
from MultiLabel.mldata import MultilabelledCollection
from MultiLabel.mlquantification import MLNaiveQuantifier, MLCC, MLPCC, MLRegressionQuantification, \
MLACC, \
MLPACC, MLNaiveAggregativeQuantifier, MLMLPE, StackMLRQuantifier, MLadjustedCount, MLprobAdjustedCount
from method.aggregative import PACC, CC, EMQ, PCC, ACC, HDy
import numpy as np
from data.dataset import Dataset
from mlevaluation import ml_natural_prevalence_prediction, ml_artificial_prevalence_prediction
import sys
import os
import pickle
def cls():
# return LinearSVC()
return LogisticRegression(max_iter=1000, solver='lbfgs')
def calibratedCls():
return CalibratedClassifierCV(cls())
# DEBUG=True
# if DEBUG:
sample_size = 100
n_samples = 5000
SKMULTILEARN_ALL_DATASETS = sorted(set([x[0] for x in available_data_sets().keys()]))
SKMULTILEARN_RED_DATASETS = [x+'-red' for x in SKMULTILEARN_ALL_DATASETS]
TC_DATASETS = ['reuters21578', 'jrcall', 'ohsumed', 'rcv1']
DATASETS = TC_DATASETS
def models():
yield 'MLPE', MLMLPE()
yield 'NaiveCC', MLNaiveAggregativeQuantifier(CC(cls()))
yield 'NaivePCC', MLNaiveAggregativeQuantifier(PCC(cls()))
yield 'NaivePCCcal', MLNaiveAggregativeQuantifier(PCC(calibratedCls()))
yield 'NaiveACC', MLNaiveAggregativeQuantifier(ACC(cls()))
yield 'NaivePACC', MLNaiveAggregativeQuantifier(PACC(cls()))
yield 'NaivePACCcal', MLNaiveAggregativeQuantifier(PACC(calibratedCls()))
yield 'NaiveACCit', MLNaiveAggregativeQuantifier(ACC(cls()))
yield 'NaivePACCit', MLNaiveAggregativeQuantifier(PACC(cls()))
# yield 'NaiveHDy', MLNaiveAggregativeQuantifier(HDy(cls()))
# yield 'NaiveSLD', MLNaiveAggregativeQuantifier(EMQ(calibratedCls()))
yield 'StackCC', MLCC(MLStackedClassifier(cls()))
yield 'StackPCC', MLPCC(MLStackedClassifier(cls()))
yield 'StackPCCcal', MLPCC(MLStackedClassifier(calibratedCls()))
yield 'StackACC', MLACC(MLStackedClassifier(cls()))
yield 'StackPACC', MLPACC(MLStackedClassifier(cls()))
yield 'StackPACCcal', MLPACC(MLStackedClassifier(calibratedCls()))
yield 'StackACCit', MLACC(MLStackedClassifier(cls()))
yield 'StackPACCit', MLPACC(MLStackedClassifier(cls()))
# yield 'ChainCC', MLCC(ClassifierChain(cls(), cv=None))
# yield 'ChainPCC', MLPCC(ClassifierChain(cls(), cv=None))
# yield 'ChainACC', MLACC(ClassifierChain(cls(), cv=None))
# yield 'ChainPACC', MLPACC(ClassifierChain(cls(), cv=None))
common={'sample_size':sample_size, 'n_samples': n_samples, 'norm': True, 'means':False, 'stds':False, 'regression':'svr'}
yield 'MRQ-CC', MLRegressionQuantification(MLNaiveQuantifier(CC(cls())), **common)
yield 'MRQ-PCC', MLRegressionQuantification(MLNaiveQuantifier(PCC(cls())), **common)
yield 'MRQ-ACC', MLRegressionQuantification(MLNaiveQuantifier(ACC(cls())), **common)
yield 'MRQ-PACC', MLRegressionQuantification(MLNaiveQuantifier(PACC(cls())), **common)
yield 'MRQ-ACCit', MLRegressionQuantification(MLNaiveQuantifier(ACC(cls())), **common)
yield 'MRQ-PACCit', MLRegressionQuantification(MLNaiveQuantifier(PACC(cls())), **common)
yield 'MRQ-StackCC', MLRegressionQuantification(MLCC(MLStackedClassifier(cls())), **common)
yield 'MRQ-StackPCC', MLRegressionQuantification(MLPCC(MLStackedClassifier(cls())), **common)
yield 'MRQ-StackACC', MLRegressionQuantification(MLACC(MLStackedClassifier(cls())), **common)
yield 'MRQ-StackPACC', MLRegressionQuantification(MLPACC(MLStackedClassifier(cls())), **common)
yield 'MRQ-StackCC-app', MLRegressionQuantification(MLCC(MLStackedClassifier(cls())), protocol='app', **common)
yield 'MRQ-StackPCC-app', MLRegressionQuantification(MLPCC(MLStackedClassifier(cls())), protocol='app', **common)
yield 'MRQ-StackACC-app', MLRegressionQuantification(MLACC(MLStackedClassifier(cls())), protocol='app', **common)
yield 'MRQ-StackPACC-app', MLRegressionQuantification(MLPACC(MLStackedClassifier(cls())), protocol='app', **common)
yield 'StackMRQ-CC', StackMLRQuantifier(MLNaiveQuantifier(CC(cls())), **common)
yield 'StackMRQ-PCC', StackMLRQuantifier(MLNaiveQuantifier(PCC(cls())), **common)
yield 'StackMRQ-ACC', StackMLRQuantifier(MLNaiveQuantifier(ACC(cls())), **common)
yield 'StackMRQ-PACC', StackMLRQuantifier(MLNaiveQuantifier(PACC(cls())), **common)
yield 'StackMRQ-StackCC', StackMLRQuantifier(MLCC(MLStackedClassifier(cls())), **common)
yield 'StackMRQ-StackPCC', StackMLRQuantifier(MLPCC(MLStackedClassifier(cls())), **common)
yield 'StackMRQ-StackACC', StackMLRQuantifier(MLACC(MLStackedClassifier(cls())), **common)
yield 'StackMRQ-StackPACC', StackMLRQuantifier(MLPACC(MLStackedClassifier(cls())), **common)
yield 'StackMRQ-StackCC-app', StackMLRQuantifier(MLCC(MLStackedClassifier(cls())), protocol='app', **common)
yield 'StackMRQ-StackPCC-app', StackMLRQuantifier(MLPCC(MLStackedClassifier(cls())), protocol='app', **common)
yield 'StackMRQ-StackACC-app', StackMLRQuantifier(MLACC(MLStackedClassifier(cls())), protocol='app', **common)
yield 'StackMRQ-StackPACC-app', StackMLRQuantifier(MLPACC(MLStackedClassifier(cls())), protocol='app', **common)
yield 'MLAdjustedC', MLadjustedCount(OneVsRestClassifier(cls()))
yield 'MLStackAdjustedC', MLadjustedCount(MLStackedClassifier(cls()))
# yield 'MLprobAdjustedC', MLprobAdjustedCount(OneVsRestClassifier(calibratedCls()))
# yield 'MLStackProbAdjustedC', MLprobAdjustedCount(MLStackedClassifier(calibratedCls()))
# yield 'MRQ-ChainCC', MLRegressionQuantification(MLCC(ClassifierChain(cls())), **common)
# yield 'MRQ-ChainPCC', MLRegressionQuantification(MLPCC(ClassifierChain(cls())), **common)
# yield 'MRQ-ChainACC', MLRegressionQuantification(MLACC(ClassifierChain(cls())), **common)
# yield 'MRQ-ChainPACC', MLRegressionQuantification(MLPACC(ClassifierChain(cls())), **common)
# yield 'LSP-CC', MLCC(LabelSpacePartion(cls()))
# yield 'LSP-ACC', MLACC(LabelSpacePartion(cls()))
# yield 'TwinSVM-CC', MLCC(MLTwinSVM())
# yield 'TwinSVM-ACC', MLACC(MLTwinSVM())
# yield 'MLKNN-CC', MLCC(MLknn())
#yield 'MLKNN-PCC', MLPCC(MLknn())
# yield 'MLKNN-ACC', MLACC(MLknn())
#yield 'MLKNN-PACC', MLPACC(MLknn())
def get_dataset(dataset_name, dopickle=True):
datadir = f'{qp.util.get_quapy_home()}/pickles'
datapath = f'{datadir}/{dataset_name}.pkl'
if dopickle:
if os.path.exists(datapath):
print(f'returning pickled object in {datapath}')
return pickle.load(open(datapath, 'rb'))
if dataset_name in SKMULTILEARN_ALL_DATASETS + SKMULTILEARN_RED_DATASETS:
clean_name = dataset_name.replace('-red','')
Xtr, ytr, feature_names, label_names = load_dataset(clean_name, 'train')
Xte, yte, _, _ = load_dataset(clean_name, 'test')
print(f'n-labels = {len(label_names)}')
Xtr = csr_matrix(Xtr)
Xte = csr_matrix(Xte)
ytr = ytr.todense().getA()
yte = yte.todense().getA()
if dataset_name.endswith('-red'):
TO_SELECT = 10
nC = ytr.shape[1]
tr_counts = ytr.sum(axis=0)
te_counts = yte.sum(axis=0)
if nC > TO_SELECT:
Y = ytr.T.dot(ytr) # class-class coincidence matrix
Y[np.triu_indices(nC)] = 0 # zeroing all duplicate entries and the diagonal
order_ij = np.argsort(-Y, axis=None)
selected = set()
p=0
while len(selected) < TO_SELECT:
highest_index = order_ij[p]
class_i = highest_index // nC
class_j = highest_index % nC
# if there is only one class to go, then add the most populated one
most_populated, least_populated = (class_i, class_j) if tr_counts[class_i] > tr_counts[class_j] else (class_j, class_i)
if te_counts[most_populated]>0:
selected.add(most_populated)
if len(selected) < TO_SELECT:
if te_counts[least_populated]>0:
selected.add(least_populated)
p+=1
selected = np.asarray(sorted(selected))
ytr = ytr[:,selected]
yte = yte[:, selected]
# else:
# remove categories without positives in the training or test splits
# valid_categories = np.logical_and(ytr.sum(axis=0)>5, yte.sum(axis=0)>5)
# ytr = ytr[:, valid_categories]
# yte = yte[:, valid_categories]
elif dataset_name in TC_DATASETS:
picklepath = '/home/moreo/word-class-embeddings/pickles'
data = Dataset.load(dataset_name, pickle_path=f'{picklepath}/{dataset_name}.pickle')
Xtr, Xte = data.vectorize()
ytr = data.devel_labelmatrix.todense().getA()
yte = data.test_labelmatrix.todense().getA()
# remove categories with < 50 training or test documents
# to_keep = np.logical_and(ytr.sum(axis=0)>=50, yte.sum(axis=0)>=50)
# keep the 10 most populated categories
to_keep = np.argsort(ytr.sum(axis=0))[-10:]
ytr = ytr[:, to_keep]
yte = yte[:, to_keep]
print(f'num categories = {ytr.shape[1]}')
else:
raise ValueError(f'unknown dataset {dataset_name}')
train = MultilabelledCollection(Xtr, ytr)
test = MultilabelledCollection(Xte, yte)
if dopickle:
os.makedirs(datadir, exist_ok=True)
pickle.dump((train, test), open(datapath, 'wb'), pickle.HIGHEST_PROTOCOL)
return train, test
def already_run(result_path):
if os.path.exists(result_path):
print(f'{result_path} already computed. Skipping')
return True
return False
def print_info(train, test):
# print((np.abs(np.corrcoef(ytr, rowvar=False))>0.1).sum())
# sys.exit(0)
print(f'Tr documents {len(train)}')
print(f'Te documents {len(test)}')
print(f'#features {train.instances.shape[1]}')
print(f'#classes {train.labels.shape[1]}')
# print(f'Train-prev: {train.prevalence()[:,1]}')
print(f'Train-counts: {train.counts()}')
# print(f'Test-prev: {test.prevalence()[:,1]}')
print(f'Test-counts: {test.counts()}')
print(f'MLPE: {qp.error.mae(train.prevalence(), test.prevalence()):.5f}')
def save_results(npp_results, app_results, result_path):
# results are lists of tuples of (true_prevs, estim_prevs)
# each true_prevs is an ndarray of ndim=2, but the second dimension is constrained
def _prepare_result_lot(lot_results):
true_prevs, estim_prevs = lot_results
return {
'true_prevs': [true_i[:,0].flatten() for true_i in true_prevs], # removes the constrained prevalence
'estim_prevs': [estim_i[:,0].flatten() for estim_i in estim_prevs] # removes the constrained prevalence
}
results = {
'npp': _prepare_result_lot(npp_results),
'app': _prepare_result_lot(app_results),
}
pickle.dump(results, open(result_path, 'wb'), pickle.HIGHEST_PROTOCOL)
def load_results(result_path):
def _unpack_result_lot(lot_result):
true_prevs = lot_result['true_prevs']
true_prevs = [np.vstack([true_i, 1 - true_i]).T for true_i in true_prevs] # add the constrained prevalence
estim_prevs = lot_result['estim_prevs']
estim_prevs = [np.vstack([estim_i, 1 - estim_i]).T for estim_i in estim_prevs] # add the constrained prevalence
return true_prevs, estim_prevs
results = pickle.load(open(result_path, 'rb'))
results = {
'npp': _unpack_result_lot(results['npp']),
'app': _unpack_result_lot(results['app']),
}
return results
# results_npp = _unpack_result_lot(results['npp'])
# results_app = _unpack_result_lot(results['app'])
# return results_npp, results_app
def run_experiment(dataset_name, model_name, model):
result_path = f'{opt.results}/{dataset_name}_{model_name}.pkl'
if already_run(result_path):
return
print(f'running experiment {dataset_name} x {model_name}')
train, test = get_dataset(dataset_name)
# if train.n_classes>100:
# return
print_info(train, test)
model.fit(train)
results_npp = ml_natural_prevalence_prediction(model, test, sample_size, repeats=100)
results_app = ml_artificial_prevalence_prediction(model, test, sample_size, n_prevalences=11, repeats=5)
save_results(results_npp, results_app, result_path)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Experiments for multi-label quantification')
parser.add_argument('--results', type=str, default='./results', metavar='str',
help=f'path where to store the results')
opt = parser.parse_args()
os.makedirs(opt.results, exist_ok=True)
for datasetname, (modelname,model) in itertools.product(DATASETS, models()):
run_experiment(datasetname, modelname, model)
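# Example invocation (an illustrative sketch; the paths are assumptions):
#
#   python main.py --results ./results
#
# Results are pickled as <results>/<dataset>_<model>.pkl; since already_run()
# skips existing pickles, the loop over (dataset, model) pairs can be resumed
# after an interruption.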

110
MultiLabel/mlclassification.py Normal file
View File

@ -0,0 +1,110 @@
from copy import deepcopy
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from skmultilearn.adapt import MLTSVM
from skmultilearn.ensemble import LabelSpacePartitioningClassifier
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.cluster import NetworkXLabelGraphClusterer, LabelCooccurrenceGraphBuilder
from skmultilearn.embedding import SKLearnEmbedder, EmbeddingClassifier
from sklearn.manifold import SpectralEmbedding
from sklearn.ensemble import RandomForestRegressor
from skmultilearn.adapt import MLkNN
class MLStackedClassifier: # aka Funnelling Monolingual
def __init__(self, base_estimator=LogisticRegression()):
if not hasattr(base_estimator, 'predict_proba'):
print('the estimator does not seem to be probabilistic: calibrating')
base_estimator = CalibratedClassifierCV(base_estimator)
self.base = deepcopy(OneVsRestClassifier(base_estimator))
self.meta = deepcopy(OneVsRestClassifier(base_estimator))
self.norm = StandardScaler()
def fit(self, X, y):
assert y.ndim==2, 'the dataset does not seem to be multi-label'
self.base.fit(X, y)
P = self.base.predict_proba(X)
P = self.norm.fit_transform(P)
self.meta.fit(P, y)
return self
def predict(self, X):
P = self.base.predict_proba(X)
P = self.norm.transform(P)
return self.meta.predict(P)
def predict_proba(self, X):
P = self.base.predict_proba(X)
P = self.norm.transform(P)
return self.meta.predict_proba(P)
class MLStackedRegressor:
def __init__(self, base_regressor=Ridge(normalize=True)):
self.base = deepcopy(base_regressor)
self.meta = deepcopy(base_regressor)
def fit(self, X, y):
assert y.ndim==2, 'the dataset does not seem to be multi-label'
self.base.fit(X, y)
R = self.base.predict(X)
# R = self.norm.fit_transform(R)
self.meta.fit(R, y)
return self
def predict(self, X):
R = self.base.predict(X)
# R = self.norm.transform(R)
return self.meta.predict(R)
class LabelSpacePartion:
def __init__(self, base_estimator=LogisticRegression()):
graph_builder = LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False)
self.classifier = LabelSpacePartitioningClassifier(
classifier=LabelPowerset(classifier=base_estimator),
clusterer=NetworkXLabelGraphClusterer(graph_builder, method='louvain')
)
def fit(self, X, y):
return self.classifier.fit(X, y)
def predict(self, X):
return self.classifier.predict(X).todense().getA()
class MLTwinSVM:
def __init__(self):
self.classifier = MLTSVM()
def fit(self, X, y):
return self.classifier.fit(X, y)
def predict(self, X):
return self.classifier.predict(X).todense().getA()
class MLknn:
#http://scikit.ml/api/skmultilearn.embedding.classifier.html#skmultilearn.embedding.EmbeddingClassifier
#notes: need to install package openne
def __init__(self):
self.classifier = EmbeddingClassifier(
SKLearnEmbedder(SpectralEmbedding(n_components=10)),
RandomForestRegressor(n_estimators=10),
MLkNN(k=5)
)
def fit(self, X, y):
return self.classifier.fit(X, y)
def predict(self, X):
return self.classifier.predict(X).todense().getA()
def predict_proba(self, X):
return self.classifier.predict_proba(X)
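if __name__ == '__main__':
    # Minimal smoke test (an illustrative sketch, not part of the original
    # module): exercises MLStackedClassifier on synthetic multi-label data;
    # the data generator and the sizes are arbitrary assumptions.
    from sklearn.datasets import make_multilabel_classification

    X, y = make_multilabel_classification(n_samples=200, n_features=20, n_classes=5, random_state=0)
    stacked = MLStackedClassifier(LogisticRegression(max_iter=1000))
    stacked.fit(X, y)
    P = stacked.predict_proba(X)
    print(f'posteriors shape: {P.shape}')  # expected: (200, 5), one column per class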

209
MultiLabel/mldata.py Normal file
View File

@ -0,0 +1,209 @@
from typing import List, Union
import numpy as np
from scipy.sparse import issparse, vstack
from sklearn.model_selection import train_test_split
from quapy.data import LabelledCollection
from quapy.functional import artificial_prevalence_sampling
from skmultilearn.model_selection import iterative_train_test_split
class MultilabelledCollection:
def __init__(self, instances, labels):
assert labels.ndim==2, f'data does not seem to be multilabel {labels}'
self.instances = instances
self.labels = labels
self.classes_ = np.arange(labels.shape[1])
@classmethod
def load(cls, path: str, loader_func: callable):
return MultilabelledCollection(*loader_func(path))
def __len__(self):
return self.instances.shape[0]
def prevalence(self):
# return self.labels.mean(axis=0)
pos = self.labels.mean(axis=0)
neg = 1-pos
return np.asarray([neg, pos]).T
def counts(self):
return self.labels.sum(axis=0)
@property
def n_classes(self):
return len(self.classes_)
@property
def n_features(self):
return self.instances.shape[1]
@property
def binary(self):
return False
def __gen_index(self):
return np.arange(len(self))
def sampling_multi_index(self, size, cat, prev=None):
if prev is None: # no prevalence was indicated; returns an index for uniform sampling
return np.random.choice(len(self), size, replace=size > len(self))
aux = LabelledCollection(self.__gen_index(), self.labels[:, cat])
return aux.sampling_index(size, *[1-prev, prev])
def uniform_sampling_multi_index(self, size):
return np.random.choice(len(self), size, replace=size>len(self))
def uniform_sampling(self, size):
unif_index = self.uniform_sampling_multi_index(size)
return self.sampling_from_index(unif_index)
def sampling(self, size, category, prev=None):
prev_index = self.sampling_multi_index(size, category, prev)
return self.sampling_from_index(prev_index)
def sampling_from_index(self, index):
documents = self.instances[index]
labels = self.labels[index]
return MultilabelledCollection(documents, labels)
def train_test_split(self, train_prop=0.6, random_state=None, iterative=False):
if iterative:
tr_docs, tr_labels, te_docs, te_labels = \
iterative_train_test_split(self.instances, self.labels, test_size=1-train_prop)
else:
tr_docs, te_docs, tr_labels, te_labels = \
train_test_split(self.instances, self.labels, train_size=train_prop, random_state=random_state)
return MultilabelledCollection(tr_docs, tr_labels), MultilabelledCollection(te_docs, te_labels)
def artificial_sampling_generator(self, sample_size, category, n_prevalences=101, repeats=1):
dimensions = 2
for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats).flatten():
yield self.sampling(sample_size, category, prevs)
def artificial_sampling_index_generator(self, sample_size, category, n_prevalences=101, repeats=1):
dimensions = 2
for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats).flatten():
yield self.sampling_multi_index(sample_size, category, prevs)
def natural_sampling_generator(self, sample_size, repeats=100):
for _ in range(repeats):
yield self.uniform_sampling(sample_size)
def natural_sampling_index_generator(self, sample_size, repeats=100):
for _ in range(repeats):
yield self.uniform_sampling_multi_index(sample_size)
def asLabelledCollection(self, category):
return LabelledCollection(self.instances, self.labels[:,category])
def genLabelledCollections(self):
for c in self.classes_:
yield self.asLabelledCollection(c)
# @property
# def label_cardinality(self):
# return self.labels.sum()/len(self)
@property
def Xy(self):
return self.instances, self.labels
class MultilingualLabelledCollection:
def __init__(self, langs:List[str], labelledCollections:List[Union[LabelledCollection, MultilabelledCollection]]):
assert len(langs) == len(labelledCollections), 'length mismatch for langs and labelledCollection lists'
assert all(isinstance(lc, (LabelledCollection, MultilabelledCollection)) for lc in labelledCollections), \
'unexpected type for labelledCollections'
assert all(np.array_equal(labelledCollections[0].classes_, lc_i.classes_) for lc_i in labelledCollections[1:]), \
'inconsistent classes found for some labelled collections'
self.llc = {l: lc for l, lc in zip(langs, labelledCollections)}
self.classes_=labelledCollections[0].classes_
@classmethod
def fromLangDict(cls, lang_labelledCollection:dict):
return MultilingualLabelledCollection(*list(zip(*list(lang_labelledCollection.items()))))
def langs(self):
return list(sorted(self.llc.keys()))
def __getitem__(self, lang)->LabelledCollection:
return self.llc[lang]
@classmethod
def load(cls, path: str, loader_func: callable):
return MultilingualLabelledCollection(*loader_func(path))
def __len__(self):
return sum(map(len, self.llc.values()))
def prevalence(self):
prev = np.asarray([lc.prevalence() * len(lc) for lc in self.llc.values()]).sum(axis=0)
return prev / prev.sum()
def language_prevalence(self):
lang_count = np.asarray([len(self.llc[l]) for l in self.langs()])
return lang_count / lang_count.sum()
def counts(self):
return np.asarray([lc.counts() for lc in self.llc.values()]).sum(axis=0)
@property
def n_classes(self):
return len(self.classes_)
@property
def binary(self):
return self.n_classes == 2
def __check_langs(self, l_dict:dict):
assert len(l_dict)==len(self.langs()), 'wrong number of languages'
assert all(l in l_dict for l in self.langs()), 'missing languages in l_dict'
def __check_sizes(self, l_sizes: Union[int,dict]):
assert isinstance(l_sizes, int) or isinstance(l_sizes, dict), 'unexpected type for l_sizes'
if isinstance(l_sizes, int):
return {l:l_sizes for l in self.langs()}
self.__check_langs(l_sizes)
return l_sizes
def sampling_index(self, l_sizes: Union[int,dict], *prevs, shuffle=True):
l_sizes = self.__check_sizes(l_sizes)
return {l:lc.sampling_index(l_sizes[l], *prevs, shuffle=shuffle) for l,lc in self.llc.items()}
def uniform_sampling_index(self, l_sizes: Union[int, dict]):
l_sizes = self.__check_sizes(l_sizes)
return {l: lc.uniform_sampling_index(l_sizes[l]) for l,lc in self.llc.items()}
def uniform_sampling(self, l_sizes: Union[int, dict]):
l_sizes = self.__check_sizes(l_sizes)
return MultilingualLabelledCollection.fromLangDict(
{l: lc.uniform_sampling(l_sizes[l]) for l,lc in self.llc.items()}
)
def sampling(self, l_sizes: Union[int, dict], *prevs, shuffle=True):
l_sizes = self.__check_sizes(l_sizes)
return MultilingualLabelledCollection.fromLangDict(
{l: lc.sampling(l_sizes[l], *prevs, shuffle=shuffle) for l,lc in self.llc.items()}
)
def sampling_from_index(self, l_index:dict):
self.__check_langs(l_index)
return MultilingualLabelledCollection.fromLangDict(
{l: lc.sampling_from_index(l_index[l]) for l,lc in self.llc.items()}
)
def split_stratified(self, train_prop=0.6, random_state=None):
train, test = list(zip(*[self[l].split_stratified(train_prop, random_state) for l in self.langs()]))
return MultilingualLabelledCollection(self.langs(), train), MultilingualLabelledCollection(self.langs(), test)
def asLabelledCollection(self, return_langs=False):
lXy_list = [([l]*len(lc),*lc.Xy) for l, lc in self.llc.items()] # a list with (lang_i, Xi, yi)
ls,Xs,ys = list(zip(*lXy_list))
ls = np.concatenate(ls)
vertstack = vstack if issparse(Xs[0]) else np.vstack
Xs = vertstack(Xs)
ys = np.concatenate(ys)
lc = LabelledCollection(Xs, ys, classes_=self.classes_)
return (lc, ls) if return_langs else lc
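if __name__ == '__main__':
    # Illustrative sketch (not part of the original module): shows the
    # multi-label sampling API on random data; sizes are arbitrary assumptions.
    X = np.random.rand(100, 10)
    y = (np.random.rand(100, 4) > 0.7).astype(int)
    data = MultilabelledCollection(X, y)
    print(data.prevalence().shape)  # (4, 2): one [neg, pos] row per class
    sample = data.sampling(20, category=0, prev=0.5)  # APP-style draw for class 0
    print(len(sample), sample.prevalence()[0])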

117
MultiLabel/mlevaluation.py Normal file
View File

@ -0,0 +1,117 @@
from typing import Union, Callable
import numpy as np
import quapy as qp
from MultiLabel.mlquantification import MLAggregativeQuantifier
from mldata import MultilabelledCollection
import itertools
from tqdm import tqdm
def check_error_str(error_metric):
if isinstance(error_metric, str):
error_metric = qp.error.from_name(error_metric)
assert hasattr(error_metric, '__call__'), 'invalid error function'
return error_metric
def _ml_prevalence_predictions(model,
test: MultilabelledCollection,
test_indexes):
predict_batch_fn = _predict_quantification_batch
if isinstance(model, MLAggregativeQuantifier):
test = MultilabelledCollection(model.preclassify(test.instances), test.labels)
predict_batch_fn = _predict_aggregative_batch
args = tuple([model, test, test_indexes])
true_prevs, estim_prevs = predict_batch_fn(args)
return true_prevs, estim_prevs
def ml_natural_prevalence_prediction(model,
test:MultilabelledCollection,
sample_size,
repeats=100,
random_seed=42):
with qp.util.temp_seed(random_seed):
test_indexes = list(test.natural_sampling_index_generator(sample_size=sample_size, repeats=repeats))
return _ml_prevalence_predictions(model, test, test_indexes)
def ml_natural_prevalence_evaluation(model,
test:MultilabelledCollection,
sample_size,
repeats=100,
error_metric:Union[str,Callable]='mae',
random_seed=42):
error_metric = check_error_str(error_metric)
true_prevs, estim_prevs = ml_natural_prevalence_prediction(model, test, sample_size, repeats, random_seed)
errs = [error_metric(true_prev_i, estim_prev_i) for true_prev_i, estim_prev_i in zip(true_prevs, estim_prevs)]
return np.mean(errs)
def ml_artificial_prevalence_prediction(model,
test:MultilabelledCollection,
sample_size,
n_prevalences=21,
repeats=10,
random_seed=42):
nested_test_indexes = []
with qp.util.temp_seed(random_seed):
for cat in test.classes_:
nested_test_indexes.append(list(test.artificial_sampling_index_generator(sample_size=sample_size,
category=cat,
n_prevalences=n_prevalences,
repeats=repeats)))
def _predict_batch(test_indexes):
return _ml_prevalence_predictions(model, test, test_indexes)
predictions = qp.util.parallel(_predict_batch, nested_test_indexes, n_jobs=-1)
true_prevs = list(itertools.chain.from_iterable(trues for trues, estims in predictions))
estim_prevs = list(itertools.chain.from_iterable(estims for trues, estims in predictions))
return true_prevs, estim_prevs
def ml_artificial_prevalence_evaluation(model,
test:MultilabelledCollection,
sample_size,
n_prevalences=21,
repeats=10,
error_metric:Union[str,Callable]='mae',
random_seed=42):
error_metric = check_error_str(error_metric)
true_prevs, estim_prevs = ml_artificial_prevalence_prediction(model, test, sample_size, n_prevalences, repeats, random_seed)
errs = [error_metric(true_prev_i, estim_prev_i) for true_prev_i, estim_prev_i in zip(true_prevs, estim_prevs)]
return np.mean(errs)
def _predict_quantification_batch(args):
model, test, indexes = args
return __predict_batch_fn(args, model.quantify)
def _predict_aggregative_batch(args):
model, test, indexes = args
return __predict_batch_fn(args, model.aggregate)
def __predict_batch_fn(args, quant_fn):
model, test, indexes = args
trues, estims = [], []
for index in indexes:
sample = test.sampling_from_index(index)
estims.append(quant_fn(sample.instances))
trues.append(sample.prevalence())
return trues, estims
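if __name__ == '__main__':
    # Illustrative end-to-end sketch (not part of the original module): runs
    # the NPP evaluation on random data, so the reported error is only a smoke
    # test of the protocol code, not a meaningful result.
    from sklearn.linear_model import LogisticRegression
    from MultiLabel.mlquantification import MLNaiveAggregativeQuantifier
    from method.aggregative import CC

    X = np.random.rand(500, 10)
    y = (np.random.rand(500, 3) > 0.5).astype(int)
    train = MultilabelledCollection(X[:300], y[:300])
    test = MultilabelledCollection(X[300:], y[300:])
    model = MLNaiveAggregativeQuantifier(CC(LogisticRegression())).fit(train)
    print(ml_natural_prevalence_evaluation(model, test, sample_size=50, repeats=10))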

361
MultiLabel/mlquantification.py Normal file
View File

@ -0,0 +1,361 @@
import numpy as np
from copy import deepcopy
import sklearn.preprocessing
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import confusion_matrix
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, LassoCV, MultiTaskLassoCV, LassoLars, LassoLarsCV, \
ElasticNet, MultiTaskElasticNetCV, MultiTaskElasticNet, LinearRegression, ARDRegression, BayesianRidge, SGDRegressor
import quapy as qp
from MultiLabel.mlclassification import MLStackedClassifier, MLStackedRegressor
from MultiLabel.mldata import MultilabelledCollection
from method.aggregative import CC, ACC, PACC, AggregativeQuantifier
from method.base import BaseQuantifier
from abc import abstractmethod
class MLQuantifier:
@abstractmethod
def fit(self, data: MultilabelledCollection): ...
@abstractmethod
def quantify(self, instances): ...
class MLMLPE(MLQuantifier):
def fit(self, data: MultilabelledCollection):
self.tr_prev = data.prevalence()
return self
def quantify(self, instances):
return self.tr_prev
class MLAggregativeQuantifier(MLQuantifier):
def __init__(self, mlcls):
self.learner = mlcls
def fit(self, data:MultilabelledCollection):
self.learner.fit(*data.Xy)
return self
@abstractmethod
def preclassify(self, instances): ...
@abstractmethod
def aggregate(self, predictions): ...
def quantify(self, instances):
predictions = self.preclassify(instances)
return self.aggregate(predictions)
class MLCC(MLAggregativeQuantifier):
def preclassify(self, instances):
return self.learner.predict(instances)
def aggregate(self, predictions):
pos_prev = predictions.mean(axis=0)
neg_prev = 1 - pos_prev
return np.asarray([neg_prev, pos_prev]).T
class MLPCC(MLCC):
def preclassify(self, instances):
return self.learner.predict_proba(instances)
class MLACC(MLCC):
def fit(self, data:MultilabelledCollection, train_prop=0.6):
self.classes_ = data.classes_
train, val = data.train_test_split(train_prop=train_prop)
self.learner.fit(*train.Xy)
val_predictions = self.preclassify(val.instances)
self.Pte_cond_estim_ = []
for c in data.classes_:
pos_c = val.labels[:,c].sum()
neg_c = len(val) - pos_c
self.Pte_cond_estim_.append(confusion_matrix(val.labels[:,c], val_predictions[:,c]).T / np.array([neg_c, pos_c]))
return self
def preclassify(self, instances):
return self.learner.predict(instances)
def aggregate(self, predictions):
cc_prevs = super(MLACC, self).aggregate(predictions)
acc_prevs = np.asarray([ACC.solve_adjustment(self.Pte_cond_estim_[c], cc_prevs[c]) for c in self.classes_])
return acc_prevs
class MLPACC(MLPCC):
def fit(self, data:MultilabelledCollection, train_prop=0.6):
self.classes_ = data.classes_
train, val = data.train_test_split(train_prop=train_prop)
self.learner.fit(*train.Xy)
val_posteriors = self.preclassify(val.instances)
self.Pte_cond_estim_ = []
for c in data.classes_:
pos_posteriors = val_posteriors[:,c]
c_posteriors = np.asarray([1-pos_posteriors, pos_posteriors]).T
self.Pte_cond_estim_.append(PACC.getPteCondEstim([0,1], val.labels[:,c], c_posteriors))
return self
def aggregate(self, posteriors):
pcc_prevs = super(MLPACC, self).aggregate(posteriors)
pacc_prevs = np.asarray([ACC.solve_adjustment(self.Pte_cond_estim_[c], pcc_prevs[c]) for c in self.classes_])
return pacc_prevs
class MLNaiveQuantifier(MLQuantifier):
def __init__(self, q:BaseQuantifier, n_jobs=-1):
self.q = q
self.estimators = None
self.n_jobs = n_jobs
def fit(self, data:MultilabelledCollection):
self.classes_ = data.classes_
def cat_job(lc):
return deepcopy(self.q).fit(lc)
self.estimators = qp.util.parallel(cat_job, data.genLabelledCollections(), n_jobs=self.n_jobs)
return self
def quantify(self, instances):
pos_prevs = np.zeros(len(self.classes_), dtype=float)
for c in self.classes_:
pos_prevs[c] = self.estimators[c].quantify(instances)[1]
neg_prevs = 1-pos_prevs
return np.asarray([neg_prevs, pos_prevs]).T
class MLNaiveAggregativeQuantifier(MLNaiveQuantifier, MLAggregativeQuantifier):
def __init__(self, q:AggregativeQuantifier, n_jobs=-1):
assert isinstance(q, AggregativeQuantifier), 'the quantifier is not of type aggregative!'
self.q = q
self.estimators = None
self.n_jobs = n_jobs
def preclassify(self, instances):
return np.asarray([q.preclassify(instances) for q in self.estimators]).swapaxes(0,1)
def aggregate(self, predictions):
pos_prevs = np.zeros(len(self.classes_), dtype=float)
for c in self.classes_:
pos_prevs[c] = self.estimators[c].aggregate(predictions[:,c])[1]
neg_prevs = 1 - pos_prevs
return np.asarray([neg_prevs, pos_prevs]).T
def quantify(self, instances):
predictions = self.preclassify(instances)
return self.aggregate(predictions)
class MLRegressionQuantification:
def __init__(self,
mlquantifier=MLNaiveQuantifier(CC(LinearSVC())),
regression='ridge',
protocol='npp',
n_samples=500,
sample_size=500,
norm=True,
means=True,
stds=True):
assert protocol in ['npp', 'app'], 'unknown protocol'
self.estimator = mlquantifier
if isinstance(regression, str):
assert regression in ['ridge', 'svr'], 'unknown regression model'
if regression == 'ridge':
self.reg = Ridge(normalize=norm)
elif regression == 'svr':
self.reg = MultiOutputRegressor(LinearSVR())
else:
self.reg = regression
self.protocol = protocol
# self.reg = MultiTaskLassoCV(normalize=norm)
# self.reg = KernelRidge(kernel='rbf')
# self.reg = LassoLarsCV(normalize=norm)
# self.reg = MultiTaskElasticNetCV(normalize=norm) <- good
# self.reg = LinearRegression(normalize=norm) # <- good
# self.reg = MultiOutputRegressor(ARDRegression(normalize=norm)) # <- quite good, even without norm
# self.reg = MultiOutputRegressor(BayesianRidge(normalize=False)) # <- quite good, even without norm
# self.reg = MultiOutputRegressor(SGDRegressor()) # slow, does not work
self.regression = regression
self.n_samples = n_samples
self.sample_size = sample_size
# self.norm = StandardScaler()
self.means = means
self.stds = stds
# self.covs = covs
def _prepare_arrays(self, Xs, ys, samples_mean, samples_std):
Xs = np.asarray(Xs)
ys = np.asarray(ys)
if self.means:
samples_mean = np.asarray(samples_mean)
Xs = np.hstack([Xs, samples_mean])
if self.stds:
samples_std = np.asarray(samples_std)
Xs = np.hstack([Xs, samples_std])
# if self.covs:
return Xs, ys
def _extract_features(self, sample, Xs, ys, samples_mean, samples_std):
ys.append(sample.prevalence()[:, 1])
Xs.append(self.estimator.quantify(sample.instances)[:, 1])
if self.means:
samples_mean.append(sample.instances.mean(axis=0).getA().flatten())
if self.stds:
samples_std.append(sample.instances.todense().std(axis=0).getA().flatten())
def generate_samples_npp(self, val):
Xs, ys = [], []
samples_mean, samples_std = [], []
for sample in val.natural_sampling_generator(sample_size=self.sample_size, repeats=self.n_samples):
self._extract_features(sample, Xs, ys, samples_mean, samples_std)
return self._prepare_arrays(Xs, ys, samples_mean, samples_std)
def generate_samples_app(self, val):
Xs, ys = [], []
samples_mean, samples_std = [], []
ncats = len(self.classes_)
nprevs = 21
repeats = max(self.n_samples // (ncats * nprevs), 1)
for cat in self.classes_:
for sample in val.artificial_sampling_generator(sample_size=self.sample_size, category=cat, n_prevalences=nprevs, repeats=repeats):
self._extract_features(sample, Xs, ys, samples_mean, samples_std)
return self._prepare_arrays(Xs, ys, samples_mean, samples_std)
def fit(self, data:MultilabelledCollection):
self.classes_ = data.classes_
tr, val = data.train_test_split()
self.estimator.fit(tr)
if self.protocol == 'npp':
Xs, ys = self.generate_samples_npp(val)
elif self.protocol == 'app':
Xs, ys = self.generate_samples_app(val)
# Xs = self.norm.fit_transform(Xs)
self.reg.fit(Xs, ys)
return self
def quantify(self, instances):
Xs = self.estimator.quantify(instances)[:,1].reshape(1,-1)
if self.means:
sample_mean = instances.mean(axis=0).getA()
Xs = np.hstack([Xs, sample_mean])
if self.stds:
sample_std = instances.todense().std(axis=0).getA()
Xs = np.hstack([Xs, sample_std])
# Xs = self.norm.transform(Xs)
Xs = self.reg.predict(Xs)
# Xs = self.norm.inverse_transform(Xs)
adjusted = np.clip(Xs, 0, 1)
adjusted = adjusted.flatten()
neg_prevs = 1-adjusted
return np.asarray([neg_prevs, adjusted]).T
class StackMLRQuantifier:
def __init__(self,
mlquantifier=MLNaiveQuantifier(CC(LinearSVC())),
regression='ridge',
protocol='npp',
n_samples=500,
sample_size=500,
norm=True,
means=True,
stds=True):
if regression == 'ridge':
reg = MLStackedRegressor(Ridge(normalize=True))
elif regression == 'svr':
reg = MLStackedRegressor(MultiOutputRegressor(LinearSVR()))
else:
raise ValueError(f'unknown regressor {regression}')
self.base = MLRegressionQuantification(
mlquantifier=mlquantifier,
regression=reg,
protocol=protocol,
n_samples=n_samples,
sample_size=sample_size,
norm=norm,
means=means,
stds=stds)
def fit(self, data:MultilabelledCollection):
self.classes_ = data.classes_
self.base.fit(data)
return self
def quantify(self, instances):
return self.base.quantify(instances)
class MLadjustedCount(MLAggregativeQuantifier):
def __init__(self, learner):
self.learner = learner
def preclassify(self, instances):
return self.learner.predict(instances)
def fit(self, data: MultilabelledCollection, train_prop=0.6):
self.classes_ = data.classes_
train, val = data.train_test_split(train_prop=train_prop)
self.learner.fit(*train.Xy)
val_predictions = self.preclassify(val.instances)
val_true = val.labels
N = len(val)
C = val_predictions.T.dot(val_true) / N # joint probabilities [[P(y1,\hat{y}1), P(y2,\hat{y}1)], ... ]
priorP = val_predictions.mean(axis=0).reshape(-1,1) # priors [P(hat{y}1), P(hat{y}2), ...]
self.Pte_cond_estim_ = np.true_divide(C, priorP, where=priorP>0) # cond probabilities [[P(y1|\hat{y}1), P(y2|\hat{y}1)], ... ]
return self
def aggregate(self, predictions):
P = sklearn.preprocessing.normalize(predictions, norm='l1')
correction = P.dot(self.Pte_cond_estim_)
adjusted = correction.mean(axis=0)
return np.asarray([1-adjusted, adjusted]).T
class MLprobAdjustedCount(MLAggregativeQuantifier):
def __init__(self, learner):
self.learner = learner
def preclassify(self, instances):
return self.learner.predict_proba(instances)
def fit(self, data: MultilabelledCollection, train_prop=0.6):
self.classes_ = data.classes_
train, val = data.train_test_split(train_prop=train_prop)
self.learner.fit(*train.Xy)
val_predictions = self.preclassify(val.instances)
val_true = val.labels
N = len(val)
C = (val_predictions>0.5).T.dot(val_true) / N # joint probabilities [[P(y1,\hat{y}1), P(y2,\hat{y}1)], ... ]
# not sure...
priorP = val_predictions.mean(axis=0).reshape(-1,1) # priors [P(hat{y}1), P(hat{y}2), ...]
self.Pte_cond_estim_ = np.true_divide(C, priorP, where=priorP>0) # cond probabilities [[P(y1|\hat{y}1), P(y2|\hat{y}1)], ... ]
return self
def aggregate(self, predictions):
P = sklearn.preprocessing.normalize(predictions, norm='l1')
correction = P.dot(self.Pte_cond_estim_)
adjusted = correction.mean(axis=0)
return np.asarray([1-adjusted, adjusted]).T
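if __name__ == '__main__':
    # Worked toy example (illustrative, not part of the original module) of the
    # correction applied by MLadjustedCount.aggregate: l1-normalized predictions
    # are mapped through the estimated matrix of P(true|predicted) and averaged.
    # The matrix M below is an arbitrary stand-in for Pte_cond_estim_.
    M = np.array([[0.9, 0.2],
                  [0.1, 0.8]])
    preds = np.array([[1, 0], [1, 0], [0, 1], [1, 1]], dtype=float)
    P = preds / preds.sum(axis=1, keepdims=True)  # l1 normalization, as in aggregate()
    adjusted = P.dot(M).mean(axis=0)
    print(np.asarray([1 - adjusted, adjusted]).T)  # one [neg, pos] row per class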

View File

@ -0,0 +1,79 @@
num categories = 10
Train-counts: [1650 181 389 2877 433 347 538 197 369 212]
Test-counts: [ 719 56 189 1087 149 131 179 89 117 71]
MLPE: 0.01101
NPP:
NaiveCC mae=0.01718
NaivePCC mae=0.00898
NaiveACC mae=0.01560
NaivePACC mae=0.01062
StackCC mae=0.00790
StackPCC mae=0.00659 **
StackACC mae=0.00913
StackPACC mae=0.00771
ChainCC mae=0.01644
ChainPCC mae=0.00924
ChainACC mae=0.01767
ChainPACC mae=0.01140
MRQ-CC mae=0.01130
MRQ-PCC mae=0.00941
MRQ-ACC mae=0.01153
MRQ-PACC mae=0.01000
MRQ-StackCC mae=0.00757
MRQ-StackPCC mae=0.00652 **
MRQ-StackACC mae=0.00799
MRQ-StackPACC mae=0.00763
MRQ-StackCC-app mae=0.00791
MRQ-StackPCC-app mae=0.00840
MRQ-StackACC-app mae=0.00910
MRQ-StackPACC-app mae=0.00941
MRQ-ChainCC mae=0.00989
MRQ-ChainPCC mae=0.00916
MRQ-ChainACC mae=0.01251
MRQ-ChainPACC mae=0.00954
APP:
NaiveCC mae=0.04120
NaivePCC mae=0.03741
NaiveACC mae=0.03202
NaivePACC mae=0.02293
StackCC mae=0.01969
StackPCC mae=0.01871
StackACC mae=0.01386 **
StackPACC mae=0.01267 **
ChainCC mae=0.04136
ChainPCC mae=0.03571
ChainACC mae=0.03622
ChainPACC mae=0.02659
MRQ-CC mae=0.04356
MRQ-PCC mae=0.02532
MRQ-ACC mae=0.05716
MRQ-PACC mae=0.02936
MRQ-StackCC mae=0.02448
MRQ-StackPCC mae=0.02090
MRQ-StackACC mae=0.02579
MRQ-StackPACC mae=0.02388
MRQ-StackCC-app mae=0.01535
MRQ-StackPCC-app mae=0.01457
MRQ-StackACC-app mae=0.01441
MRQ-StackPACC-app mae=0.01633
MRQ-ChainCC mae=0.04874
MRQ-ChainPCC mae=0.02537
MRQ-ChainACC mae=0.06262
MRQ-ChainPACC mae=0.02906

347
MultiLabel/tabular.py Normal file
View File

@ -0,0 +1,347 @@
import numpy as np
import itertools
from scipy.stats import ttest_ind_from_stats, wilcoxon
class Table:
VALID_TESTS = [None, "wilcoxon", "ttest"]
def __init__(self, benchmarks, methods, lower_is_better=True, significance_test='ttest', prec_mean=3,
clean_zero=False, show_std=False, prec_std=3, average=True, missing=None, missing_str='--',
color=True):
assert significance_test in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'
self.benchmarks = np.asarray(benchmarks)
self.benchmark_index = {row: i for i, row in enumerate(benchmarks)}
self.methods = np.asarray(methods)
self.method_index = {col: j for j, col in enumerate(methods)}
self.map = {}
# keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
self._addmap('values', dtype=object)
self.lower_is_better = lower_is_better
self.ttest = significance_test
self.prec_mean = prec_mean
self.clean_zero = clean_zero
self.show_std = show_std
self.prec_std = prec_std
self.add_average = average
self.missing = missing
self.missing_str = missing_str
self.color = color
self.touch()
@property
def nbenchmarks(self):
return len(self.benchmarks)
@property
def nmethods(self):
return len(self.methods)
def touch(self):
self._modif = True
def update(self):
if self._modif:
self.compute()
def _getfilled(self):
return np.argwhere(self.map['fill'])
@property
def values(self):
return self.map['values']
def _indexes(self):
return itertools.product(range(self.nbenchmarks), range(self.nmethods))
def _addmap(self, map, dtype, func=None):
self.map[map] = np.empty((self.nbenchmarks, self.nmethods), dtype=dtype)
if func is None:
return
m = self.map[map]
f = func
indexes = self._indexes() if map == 'fill' else self._getfilled()
for i, j in indexes:
m[i, j] = f(self.values[i, j])
def _addrank(self):
for i in range(self.nbenchmarks):
filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
ranked_cols_idx = filled_cols_idx[np.argsort(col_means)]
if not self.lower_is_better:
ranked_cols_idx = ranked_cols_idx[::-1]
self.map['rank'][i, ranked_cols_idx] = np.arange(1, len(filled_cols_idx) + 1)
def _addcolor(self):
for i in range(self.nbenchmarks):
filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
if filled_cols_idx.size == 0:
continue
col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
minval = min(col_means)
maxval = max(col_means)
for col_idx in filled_cols_idx:
val = self.map['mean'][i, col_idx]
norm = (maxval - minval)
if norm > 0:
normval = (val - minval) / norm
else:
normval = 0.5
if self.lower_is_better:
normval = 1 - normval
self.map['color'][i, col_idx] = color_red2green_01(normval)
def _run_ttest(self, row, col1, col2):
mean1 = self.map['mean'][row, col1]
std1 = self.map['std'][row, col1]
nobs1 = self.map['nobs'][row, col1]
mean2 = self.map['mean'][row, col2]
std2 = self.map['std'][row, col2]
nobs2 = self.map['nobs'][row, col2]
_, p_val = ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2)
return p_val
def _run_wilcoxon(self, row, col1, col2):
values1 = self.map['values'][row, col1]
values2 = self.map['values'][row, col2]
_, p_val = wilcoxon(values1, values2)
return p_val
def _add_statistical_test(self):
if self.ttest is None:
return
self.some_similar = [False] * self.nmethods
for i in range(self.nbenchmarks):
filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
if len(filled_cols_idx) <= 1:
continue
col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
best_pos = filled_cols_idx[np.argmin(col_means)]
for j in filled_cols_idx:
if j == best_pos:
continue
if self.ttest == 'ttest':
p_val = self._run_ttest(i, best_pos, j)
else:
p_val = self._run_wilcoxon(i, best_pos, j)
pval_outcome = pval_interpretation(p_val)
self.map['ttest'][i, j] = pval_outcome
if pval_outcome != 'Diff':
self.some_similar[j] = True
def compute(self):
self._addmap('fill', dtype=bool, func=lambda x: x is not None)
self._addmap('mean', dtype=float, func=np.mean)
self._addmap('std', dtype=float, func=np.std)
self._addmap('nobs', dtype=float, func=len)
self._addmap('rank', dtype=int, func=None)
self._addmap('color', dtype=object, func=None)
self._addmap('ttest', dtype=object, func=None)
self._addmap('latex', dtype=object, func=None)
self._addrank()
self._addcolor()
self._add_statistical_test()
if self.add_average:
self._addave()
self._modif = False
def _is_column_full(self, col):
return all(self.map['fill'][:, self.method_index[col]])
def _addave(self):
ave = Table(['ave'], self.methods, lower_is_better=self.lower_is_better, significance_test=self.ttest, average=False,
missing=self.missing, missing_str=self.missing_str, prec_mean=self.prec_mean, prec_std=self.prec_std,
show_std=self.show_std)
for col in self.methods:
values = None
if self._is_column_full(col):
if self.ttest == 'ttest':
values = np.asarray(self.map['mean'][:, self.method_index[col]])
else: # wilcoxon
values = np.concatenate(self.values[:, self.method_index[col]])
ave.add('ave', col, values)
self.average = ave
def add(self, benchmark, method, values):
if values is not None:
values = np.asarray(values)
if values.ndim == 0:
values = values.flatten()
rid, cid = self._coordinates(benchmark, method)
if self.map['values'][rid, cid] is None:
self.map['values'][rid, cid] = values
elif values is not None:
self.map['values'][rid, cid] = np.concatenate([self.map['values'][rid, cid], values])
self.touch()
def get(self, benchmark, method, attr='mean'):
self.update()
assert attr in self.map, f'unknown attribute {attr}'
rid, cid = self._coordinates(benchmark, method)
if self.map['fill'][rid, cid]:
v = self.map[attr][rid, cid]
if v is None or (isinstance(v, float) and np.isnan(v)):
return self.missing
return v
else:
return self.missing
def _coordinates(self, benchmark, method):
assert benchmark in self.benchmark_index, f'benchmark {benchmark} out of range'
assert method in self.method_index, f'method {method} out of range'
rid = self.benchmark_index[benchmark]
cid = self.method_index[method]
return rid, cid
def get_average(self, method, attr='mean'):
self.update()
if self.add_average:
return self.average.get('ave', method, attr=attr)
return None
def get_color(self, benchmark, method):
color = self.get(benchmark, method, attr='color')
if color is None:
return ''
return color
def latexCell(self, benchmark, method):
self.update()
i, j = self._coordinates(benchmark, method)
if self.map['fill'][i, j] == False:
return self.missing_str
mean = self.map['mean'][i, j]
l = f" {mean:.{self.prec_mean}f}"
if self.clean_zero:
l = l.replace(' 0.', '.')
isbest = self.map['rank'][i, j] == 1
if isbest:
l = "\\textbf{" + l.strip() + "}"
stat = ''
if self.ttest is not None and self.some_similar[j]:
test_label = self.map['ttest'][i, j]
if test_label == 'Sim':
stat = '^{\dag\phantom{\dag}}'
elif test_label == 'Same':
stat = '^{\ddag}'
elif isbest or test_label == 'Diff':
stat = '^{\phantom{\ddag}}'
std = ''
if self.show_std:
std = self.map['std'][i, j]
std = f" {std:.{self.prec_std}f}"
if self.clean_zero:
std = std.replace(' 0.', '.')
std = f" \pm {std:{self.prec_std}}"
if stat != '' or std != '':
l = f'{l}${stat}{std}$'
if self.color:
l += ' ' + self.map['color'][i, j]
return l
def latexTabular(self, benchmark_replace={}, method_replace={}, average=True):
tab = ' & '
tab += ' & '.join([method_replace.get(col, col) for col in self.methods])
tab += ' \\\\\hline\n'
for row in self.benchmarks:
rowname = benchmark_replace.get(row, row)
tab += rowname + ' & '
tab += self.latexRow(row)
if average:
tab += '\hline\n'
tab += 'Average & '
tab += self.latexAverage()
return tab
def latexTabularT(self, benchmark_replace={}, method_replace={}, average=True, side=False):
def withside(label):
return '\side{'+label+'}' if side else label
tab = ' & '
tab += ' & '.join([withside(benchmark_replace.get(col, col)) for col in self.benchmarks])
if average:
tab += ' & ' + withside('Ave')
tab += ' \\\\\hline\n'
for row in self.methods:
rowname = method_replace.get(row, row)
tab += rowname + ' & '
tab += self.latexRowT(row, endl='')
if average:
tab += ' & '
tab += self.average.latexCell('ave', row)
tab += '\\\\\hline\n'
return tab
def latexRow(self, benchmark, endl='\\\\\hline\n'):
s = [self.latexCell(benchmark, col) for col in self.methods]
s = ' & '.join(s)
s += ' ' + endl
return s
def latexRowT(self, method, endl='\\\\\hline\n'):
s = [self.latexCell(benchmark, method) for benchmark in self.benchmarks]
s = ' & '.join(s)
s += ' ' + endl
return s
def latexAverage(self, endl='\\\\\hline\n'):
if self.add_average:
return self.average.latexRow('ave', endl=endl)
def getRankTable(self):
t = Table(benchmarks=self.benchmarks, methods=self.methods, prec_mean=0, average=True)
for rid, cid in self._getfilled():
row = self.benchmarks[rid]
col = self.methods[cid]
t.add(row, col, self.get(row, col, 'rank'))
t.compute()
return t
def dropMethods(self, methods):
drop_index = [self.method_index[m] for m in methods]
new_methods = np.delete(self.methods, drop_index)
new_index = {col: j for j, col in enumerate(new_methods)}
self.map['values'] = self.values[:, np.asarray([self.method_index[m] for m in new_methods], dtype=int)]
self.methods = new_methods
self.method_index = new_index
self.touch()
def pval_interpretation(p_val):
if 0.005 >= p_val:
return 'Diff'
elif 0.05 >= p_val > 0.005:
return 'Sim'
elif p_val > 0.05:
return 'Same'
def color_red2green_01(val, maxtone=50):
if np.isnan(val): return None
assert 0 <= val <= 1, f'val {val} out of range [0,1]'
# rescale to [-1,1]
val = val * 2 - 1
if val < 0:
color = 'red'
tone = maxtone * (-val)
else:
color = 'green'
tone = maxtone * val
return '\cellcolor{' + color + f'!{int(tone)}' + '}'
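if __name__ == '__main__':
    # Minimal usage sketch (not part of the original module): fills a 2x2 table
    # with random per-sample errors and prints the LaTeX tabular; the benchmark
    # and method names are placeholders.
    rng = np.random.RandomState(0)
    table = Table(benchmarks=['d1', 'd2'], methods=['m1', 'm2'], significance_test='wilcoxon', prec_mean=4)
    for b in ['d1', 'd2']:
        for m in ['m1', 'm2']:
            table.add(b, m, rng.rand(50))
    print(table.latexTabular())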

0
MultiLabel/util/__init__.py Executable file
View File

145
MultiLabel/util/common.py Executable file
View File

@ -0,0 +1,145 @@
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import numpy as np
from tqdm import tqdm
import torch
from scipy.sparse import vstack, issparse
from joblib import Parallel, delayed
import multiprocessing
import itertools
def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
"""
Index (i.e., replaces word strings with numerical indexes) a list of string documents
:param data: list of string documents
:param vocab: a fixed mapping [str]->[int] of words to indexes
:param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be retained
because they are anyway contained in a pre-trained embedding set that we know in advance)
:param analyzer: the preprocessor in charge of transforming the document string into a chain of string words
:param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep
:param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words that
are not in the original vocab but that are in the known_words
:return:
"""
indexes=[]
vocabsize = len(vocab)
unk_count = 0
knw_count = 0
out_count = 0
pbar = tqdm(data, desc=f'indexing documents')
for text in pbar:
words = analyzer(text)
index = []
for word in words:
if word in vocab:
idx = vocab[word]
else:
if word in known_words:
if word not in out_of_vocabulary:
out_of_vocabulary[word] = vocabsize+len(out_of_vocabulary)
idx = out_of_vocabulary[word]
out_count += 1
else:
idx = unk_index
unk_count += 1
index.append(idx)
indexes.append(index)
knw_count += len(index)
pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]'
f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]')
return indexes
def define_pad_length(index_list):
lengths = [len(index) for index in index_list]
return int(np.mean(lengths)+np.std(lengths))
def pad(index_list, pad_index, max_pad_length=None):
pad_length = np.max([len(index) for index in index_list])
if max_pad_length is not None:
pad_length = min(pad_length, max_pad_length)
for i,indexes in enumerate(index_list):
index_list[i] = [pad_index]*(pad_length-len(indexes)) + indexes[:pad_length]
return index_list
def get_word_list(word2index1, word2index2=None): #TODO: redo
def extract_word_list(word2index):
return [w for w,i in sorted(word2index.items(), key=lambda x: x[1])]
word_list = extract_word_list(word2index1)
if word2index2 is not None:
word_list += extract_word_list(word2index2)
return word_list
def batchify(index_list, labels, batchsize, pad_index, device, target_long=False, max_pad_length=500):
nsamples = len(index_list)
nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0)
for b in range(nbatches):
batch = index_list[b*batchsize:(b+1)*batchsize]
batch_labels = labels[b*batchsize:(b+1)*batchsize]
if issparse(batch_labels):
batch_labels = batch_labels.toarray()
batch = pad(batch, pad_index=pad_index, max_pad_length=max_pad_length)
batch = torch.LongTensor(batch)
totype = torch.LongTensor if target_long else torch.FloatTensor
target = totype(batch_labels)
yield batch.to(device), target.to(device)
def batchify_unlabelled(index_list, batchsize, pad_index, device, max_pad_length=500):
nsamples = len(index_list)
nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0)
for b in range(nbatches):
batch = index_list[b*batchsize:(b+1)*batchsize]
batch = pad(batch, pad_index=pad_index, max_pad_length=max_pad_length)
batch = torch.LongTensor(batch)
yield batch.to(device)
def clip_gradient(model, clip_value=1e-1):
params = list(filter(lambda p: p.grad is not None, model.parameters()))
for p in params:
p.grad.data.clamp_(-clip_value, clip_value)
def predict(logits, classification_type='singlelabel'):
if classification_type == 'multilabel':
prediction = torch.sigmoid(logits) > 0.5
elif classification_type == 'singlelabel':
prediction = torch.argmax(logits, dim=1).view(-1, 1)
else:
raise ValueError(f'unknown classification type {classification_type}')
return prediction.detach().cpu().numpy()
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
def get_parallel_slices(n_tasks, n_jobs=-1):
if n_jobs==-1:
n_jobs = multiprocessing.cpu_count()
batch = int(n_tasks / n_jobs)
remainder = n_tasks % n_jobs
return [slice(job*batch, (job+1)*batch+ (remainder if job == n_jobs - 1 else 0)) for job in range(n_jobs)]
def tokenize_job(documents, tokenizer, max_tokens, job):
return [tokenizer(d)[:max_tokens] for d in tqdm(documents, desc=f'tokenizing [job: {job}]')]
def tokenize_parallel(documents, tokenizer, max_tokens, n_jobs=-1):
slices = get_parallel_slices(n_tasks=len(documents), n_jobs=n_jobs)
tokens = Parallel(n_jobs=n_jobs)(
delayed(tokenize_job)(
documents[slice_i], tokenizer, max_tokens, job
)
for job, slice_i in enumerate(slices)
)
return list(itertools.chain.from_iterable(tokens))
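if __name__ == '__main__':
    # Illustrative sketch (not part of the original module): pads three indexed
    # documents to a common length; pad() prepends the pad symbol.
    docs = [[1, 2, 3], [4], [5, 6]]
    print(define_pad_length(docs))  # mean length plus one std, as an int
    print(pad([list(d) for d in docs], pad_index=0))  # [[1, 2, 3], [0, 0, 4], [0, 5, 6]]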

60
MultiLabel/util/csv_log.py Executable file
View File

@ -0,0 +1,60 @@
import os
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
class CSVLog:
def __init__(self, file, columns=None, autoflush=True, verbose=False, overwrite=False):
self.file = file
self.autoflush = autoflush
self.verbose = verbose
if os.path.exists(file) and not overwrite:
self.tell('Loading existing file from {}'.format(file))
self.df = pd.read_csv(file, sep='\t')
self.columns = sorted(self.df.columns.values.tolist())
else:
self.tell('File {} does not exist or overwrite=True. Creating new frame.'.format(file))
assert columns is not None, 'columns cannot be None'
self.columns = sorted(columns)
dir = os.path.dirname(self.file)
if dir and not os.path.exists(dir): os.makedirs(dir)
self.df = pd.DataFrame(columns=self.columns)
self.defaults = {}
def already_calculated(self, **kwargs):
df = self.df
if df.shape[0] == 0:
return False
if len(kwargs) == 0:
kwargs = self.defaults
for key,val in kwargs.items():
df = df.loc[df[key] == val]
if df.shape[0] == 0:
return False
return True
def set_default(self, param, value):
self.defaults[param] = value
def add_row(self, **kwargs):
for key in self.defaults.keys():
if key not in kwargs:
kwargs[key]=self.defaults[key]
columns = sorted(list(kwargs.keys()))
values = [kwargs[col_i] for col_i in columns]
s = pd.Series(values, index=self.columns)
self.df = self.df.append(s, ignore_index=True)
if self.autoflush: self.flush()
self.tell(kwargs)
def flush(self):
self.df.to_csv(self.file, index=False, sep='\t')
def tell(self, msg):
if self.verbose: print(msg)
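if __name__ == '__main__':
    # Usage sketch (not part of the original module); the file name and column
    # names are arbitrary. Note that DataFrame.append requires pandas < 2.0.
    log = CSVLog('demo_log.csv', columns=['dataset', 'method', 'mae'], overwrite=True, verbose=True)
    log.set_default('dataset', 'toy')
    if not log.already_calculated(dataset='toy', method='CC'):
        log.add_row(method='CC', mae=0.05)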

View File

@ -0,0 +1,33 @@
from data.dataset import Dataset
from tqdm import tqdm
import os
import numpy as np
def write_data(documents, labels, fout):
print(f'there are {len(documents)} documents')
written, empty = 0, 0
with open(fout, 'wt') as foo:
for doc, label in tqdm(list(zip(documents, labels))):
doc = doc.replace('\t', ' ').replace('\n', ' ').strip()
label = np.squeeze(np.asarray(label.todense()))
label = ' '.join([f'{x}' for x in label])
if doc:
foo.write(f'{label}\t{doc}\n')
written += 1
else:
foo.write(f'{label}\tempty document\n')
empty += 1
print(f'written = {written}')
print(f'empty = {empty}')
for dataset_name in ['reuters21578', 'ohsumed', 'jrcall', 'rcv1', 'wipo-sl-sc']: #'20newsgroups'
dataset = Dataset.load(dataset_name=dataset_name, pickle_path=f'../pickles/{dataset_name}.pickle').show()
os.makedirs(f'../leam/{dataset_name}', exist_ok=True)
write_data(dataset.devel_raw, dataset.devel_labelmatrix, f'../leam/{dataset_name}/train.csv')
#write_data(dataset.test_raw, dataset.test_labelmatrix, f'../leam/{dataset_name}/test.csv')
print('done')

View File

@ -0,0 +1,3 @@
def warn(*args, **kwargs): pass
import warnings
warnings.warn = warn

54
MultiLabel/util/early_stop.py Executable file
View File

@ -0,0 +1,54 @@
#adapted from https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py
import torch
from time import time
from util.file import create_if_not_exist
class EarlyStopping:
def __init__(self, model, patience=20, verbose=True, checkpoint='./checkpoint.pt'):
# set patience to 0 or -1 to avoid stopping, but still keeping track of the best value and model parameters
self.patience_limit = patience
self.patience = patience
self.verbose = verbose
self.best_score = None
self.best_epoch = None
self.stop_time = None
self.checkpoint = checkpoint
self.model = model
self.STOP = False
def __call__(self, watch_score, epoch):
if self.STOP:
return #done
if self.best_score is None or watch_score >= self.best_score:
self.best_score = watch_score
self.best_epoch = epoch
self.stop_time = time()
if self.checkpoint:
self.print(f'[early-stop] improved, saving model in {self.checkpoint}')
torch.save(self.model, self.checkpoint)
else:
self.print(f'[early-stop] improved')
self.patience = self.patience_limit
else:
self.patience -= 1
if self.patience == 0:
self.STOP = True
self.print(f'[early-stop] patience exhausted')
else:
if self.patience>0: # if negative, then early-stop is ignored
self.print(f'[early-stop] patience={self.patience}')
def reinit_counter(self):
self.STOP = False
self.patience=self.patience_limit
def restore_checkpoint(self):
return torch.load(self.checkpoint)
def print(self, msg):
if self.verbose:
print(msg)
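if __name__ == '__main__':
    # Illustrative training-loop sketch (not part of the original module): the
    # model and the watched validation scores are synthetic placeholders.
    model = torch.nn.Linear(4, 2)
    early_stop = EarlyStopping(model, patience=3, checkpoint='./checkpoint.pt')
    for epoch, score in enumerate([0.50, 0.60, 0.58, 0.57, 0.55]):
        early_stop(score, epoch)
        if early_stop.STOP:
            break
    print(f'best score {early_stop.best_score} at epoch {early_stop.best_epoch}')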

38
MultiLabel/util/file.py Executable file
View File

@ -0,0 +1,38 @@
import urllib.request
from os import listdir, makedirs
from os.path import isdir, isfile, join, exists, dirname
def download_file(url, archive_filename):
def progress(blocknum, bs, size):
total_sz_mb = '%.2f MB' % (size / 1e6)
current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='')
print("Downloading %s" % url)
urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress)
print("")
def download_file_if_not_exists(url, archive_path):
if exists(archive_path): return
create_if_not_exist(dirname(archive_path))
download_file(url,archive_path)
def ls(dir, typecheck):
el = [f for f in listdir(dir) if typecheck(join(dir, f))]
el.sort()
return el
def list_dirs(dir):
return ls(dir, typecheck=isdir)
def list_files(dir):
return ls(dir, typecheck=isfile)
def create_if_not_exist(path):
if not exists(path): makedirs(path)
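Typical usage of the download helper, with an illustrative URL and target path:

download_file_if_not_exists('https://example.com/corpus.zip', '../archives/corpus.zip')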

86
MultiLabel/util/metrics.py Executable file
View File

@ -0,0 +1,86 @@
import numpy as np
from scipy.sparse import lil_matrix, issparse
from sklearn.metrics import f1_score, accuracy_score
"""
Scikit learn provides a full set of evaluation metrics, but they treat special cases differently.
I.e., when the number of true positives, false positives, and false negatives ammount to 0, all
affected metrices (precision, recall, and thus f1) output 0 in Scikit learn.
We adhere to the common practice of outputting 1 in this case since the classifier has correctly
classified all examples as negatives.
"""
def evaluation(y_true, y_pred, classification_type):
    if classification_type == 'multilabel':
        eval_function = multilabel_eval
    elif classification_type == 'singlelabel':
        eval_function = singlelabel_eval
    else:
        raise ValueError(f'unexpected classification_type: {classification_type}')
    Mf1, mf1, accuracy = eval_function(y_true, y_pred)
    return Mf1, mf1, accuracy
def multilabel_eval(y, y_):
tp = y.multiply(y_)
fn = lil_matrix(y.shape)
true_ones = y==1
fn[true_ones]=1-tp[true_ones]
fp = lil_matrix(y.shape)
pred_ones = y_==1
if pred_ones.nnz>0:
fp[pred_ones]=1-tp[pred_ones]
#macro-f1
tp_macro = np.asarray(tp.sum(axis=0), dtype=int).flatten()
fn_macro = np.asarray(fn.sum(axis=0), dtype=int).flatten()
fp_macro = np.asarray(fp.sum(axis=0), dtype=int).flatten()
pos_pred = tp_macro+fp_macro
pos_true = tp_macro+fn_macro
prec=np.zeros(shape=tp_macro.shape,dtype=float)
rec=np.zeros(shape=tp_macro.shape,dtype=float)
np.divide(tp_macro, pos_pred, out=prec, where=pos_pred>0)
np.divide(tp_macro, pos_true, out=rec, where=pos_true>0)
den=prec+rec
macrof1=np.zeros(shape=tp_macro.shape,dtype=float)
np.divide(np.multiply(prec,rec),den,out=macrof1,where=den>0)
macrof1 *=2
macrof1[(pos_pred==0)*(pos_true==0)]=1
macrof1 = np.mean(macrof1)
#micro-f1
tp_micro = tp_macro.sum()
fn_micro = fn_macro.sum()
fp_micro = fp_macro.sum()
pos_pred = tp_micro + fp_micro
pos_true = tp_micro + fn_micro
prec = (tp_micro / pos_pred) if pos_pred>0 else 0
rec = (tp_micro / pos_true) if pos_true>0 else 0
den = prec+rec
microf1 = 2*prec*rec/den if den>0 else 0
if pos_pred==pos_true==0:
microf1=1
#accuracy
ndecisions = np.multiply(*y.shape)
tn = ndecisions - (tp_micro+fn_micro+fp_micro)
acc = (tp_micro+tn)/ndecisions
return macrof1,microf1,acc
def singlelabel_eval(y, y_):
if issparse(y_): y_ = y_.toarray().flatten()
macrof1 = f1_score(y, y_, average='macro')
microf1 = f1_score(y, y_, average='micro')
acc = accuracy_score(y, y_)
return macrof1,microf1,acc
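A toy check of the special-case policy described above; this is a sketch assuming binary indicator matrices in sparse format, as used elsewhere in this module:

import numpy as np
from scipy.sparse import csr_matrix

y_true = csr_matrix(np.array([[1, 0], [0, 0]]))  # the second class has no positive examples
y_pred = csr_matrix(np.array([[1, 0], [0, 0]]))  # and none are predicted
Mf1, mf1, acc = evaluation(y_true, y_pred, 'multilabel')
# for the second class tp=fp=fn=0, so its F1 is taken to be 1 rather than 0;
# hence Mf1 == mf1 == acc == 1.0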

View File

@ -0,0 +1,65 @@
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
import numpy as np
from joblib import Parallel, delayed
from time import time
class MLSVC:
"""
Multi-Label Support Vector Machine, with individual optimizations per binary problem.
"""
def __init__(self, n_jobs=1, estimator=LinearSVC, *args, **kwargs):
self.n_jobs = n_jobs
self.args = args
self.kwargs = kwargs
        self.verbose = self.kwargs.get('verbose', False)
self.estimator = estimator
def fit(self, X, y, **grid_search_params):
tini = time()
assert len(y.shape)==2 and set(np.unique(y).tolist()) == {0,1}, 'data format is not multi-label'
nD,nC = y.shape
prevalence = np.sum(y, axis=0)
self.svms = np.array([self.estimator(*self.args, **self.kwargs) for _ in range(nC)])
        if grid_search_params and grid_search_params.get('param_grid'):
            self._print('grid_search activated with: {}'.format(grid_search_params))
            # grid search cannot be performed if the category prevalence is lower than the cv parameter;
            # in those cases we place a plain SVM instead of a GridSearchCV
cv = 5 if 'cv' not in grid_search_params else grid_search_params['cv']
assert isinstance(cv, int), 'cv must be an int (other policies are not supported yet)'
self.svms = [GridSearchCV(svm_i, refit=True, **grid_search_params) if prevalence[i]>=cv else svm_i
for i,svm_i in enumerate(self.svms)]
for i in np.argwhere(prevalence==0).flatten():
self.svms[i] = TrivialRejector()
        self.svms = Parallel(n_jobs=self.n_jobs)(
            delayed(svm_i.fit)(X, y[:, c]) for c, svm_i in enumerate(self.svms)
        )
self.training_time = time() - tini
def predict(self, X):
return np.vstack(list(map(lambda svmi: svmi.predict(X), self.svms))).T
def predict_proba(self, X):
        return np.vstack([svmi.predict_proba(X)[:, np.argwhere(svmi.classes_ == 1)[0, 0]] for svmi in self.svms]).T
def _print(self, msg):
if self.verbose>0:
print(msg)
def best_params(self):
return [svmi.best_params_ if isinstance(svmi, GridSearchCV) else None for svmi in self.svms]
class TrivialRejector:
def fit(self,*args,**kwargs): return self
def predict(self, X): return np.zeros(X.shape[0])
def predict_proba(self, X): return np.zeros(X.shape[0])
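A hedged usage sketch for MLSVC; the data are synthetic and, since LinearSVC is the default base estimator, predict_proba is unavailable unless a probabilistic estimator is passed instead:

from sklearn.datasets import make_multilabel_classification

X, Y = make_multilabel_classification(n_samples=200, n_classes=5, random_state=0)
mlsvc = MLSVC(n_jobs=-1)
mlsvc.fit(X, Y, param_grid={'C': [0.1, 1, 10]}, cv=3)  # one grid search per class
Y_hat = mlsvc.predict(X)                               # (200, 5) binary matrix
print(mlsvc.best_params())                             # best C per class; None where no search ran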

View File

@ -176,104 +176,6 @@ class LabelledCollection:
yield train, test
class MultilingualLabelledCollection:
def __init__(self, langs:List[str], labelledCollections:List[LabelledCollection]):
assert len(langs) == len(labelledCollections), 'length mismatch for langs and labelledCollection lists'
assert all(isinstance(lc, LabelledCollection) for lc in labelledCollections), 'unexpected type for labelledCollections'
assert all(labelledCollections[0].classes_ == lc_i.classes_ for lc_i in labelledCollections[1:]), \
'inconsistent classes found for some labelled collections'
self.llc = {l: lc for l, lc in zip(langs, labelledCollections)}
self.classes_=labelledCollections[0].classes_
@classmethod
def fromLangDict(cls, lang_labelledCollection:dict):
return MultilingualLabelledCollection(*list(zip(*list(lang_labelledCollection.items()))))
def langs(self):
return list(sorted(self.llc.keys()))
def __getitem__(self, lang)->LabelledCollection:
return self.llc[lang]
@classmethod
def load(cls, path: str, loader_func: callable):
return MultilingualLabelledCollection(*loader_func(path))
def __len__(self):
return sum(map(len, self.llc.values()))
def prevalence(self):
prev = np.asarray([lc.prevalence() * len(lc) for lc in self.llc.values()]).sum(axis=0)
return prev / prev.sum()
def language_prevalence(self):
lang_count = np.asarray([len(self.llc[l]) for l in self.langs()])
return lang_count / lang_count.sum()
def counts(self):
return np.asarray([lc.counts() for lc in self.llc.values()]).sum(axis=0)
@property
def n_classes(self):
return len(self.classes_)
@property
def binary(self):
return self.n_classes == 2
def __check_langs(self, l_dict:dict):
assert len(l_dict)==len(self.langs()), 'wrong number of languages'
assert all(l in l_dict for l in self.langs()), 'missing languages in l_sizes'
def __check_sizes(self, l_sizes: Union[int,dict]):
assert isinstance(l_sizes, int) or isinstance(l_sizes, dict), 'unexpected type for l_sizes'
if isinstance(l_sizes, int):
return {l:l_sizes for l in self.langs()}
self.__check_langs(l_sizes)
return l_sizes
def sampling_index(self, l_sizes: Union[int,dict], *prevs, shuffle=True):
l_sizes = self.__check_sizes(l_sizes)
return {l:lc.sampling_index(l_sizes[l], *prevs, shuffle=shuffle) for l,lc in self.llc.items()}
def uniform_sampling_index(self, l_sizes: Union[int, dict]):
l_sizes = self.__check_sizes(l_sizes)
return {l: lc.uniform_sampling_index(l_sizes[l]) for l,lc in self.llc.items()}
def uniform_sampling(self, l_sizes: Union[int, dict]):
l_sizes = self.__check_sizes(l_sizes)
return MultilingualLabelledCollection.fromLangDict(
{l: lc.uniform_sampling(l_sizes[l]) for l,lc in self.llc.items()}
)
def sampling(self, l_sizes: Union[int, dict], *prevs, shuffle=True):
l_sizes = self.__check_sizes(l_sizes)
return MultilingualLabelledCollection.fromLangDict(
{l: lc.sampling(l_sizes[l], *prevs, shuffle=shuffle) for l,lc in self.llc.items()}
)
def sampling_from_index(self, l_index:dict):
self.__check_langs(l_index)
return MultilingualLabelledCollection.fromLangDict(
{l: lc.sampling_from_index(l_index[l]) for l,lc in self.llc.items()}
)
def split_stratified(self, train_prop=0.6, random_state=None):
train, test = list(zip(*[self[l].split_stratified(train_prop, random_state) for l in self.langs()]))
return MultilingualLabelledCollection(self.langs(), train), MultilingualLabelledCollection(self.langs(), test)
def asLabelledCollection(self, return_langs=False):
lXy_list = [([l]*len(lc),*lc.Xy) for l, lc in self.llc.items()] # a list with (lang_i, Xi, yi)
ls,Xs,ys = list(zip(*lXy_list))
ls = np.concatenate(ls)
vertstack = vstack if issparse(Xs[0]) else np.vstack
Xs = vertstack(Xs)
ys = np.concatenate(ys)
lc = LabelledCollection(Xs, ys, classes_=self.classes_)
        return (lc, ls) if return_langs else lc
class Dataset:
def __init__(self, training: LabelledCollection, test: LabelledCollection, vocabulary: dict = None, name=''):

View File

@ -3,6 +3,13 @@ from scipy.sparse import dok_matrix
from tqdm import tqdm
def from_rcv2_lang_file(path, encoding='utf-8'):
lines = open(path, 'rt', encoding=encoding).readlines()
parts = [l.split('\t') for l in lines]
docs, cats = list(zip(*[(parts_i[1], parts_i[2]) for parts_i in parts]))
return docs, cats
def from_text(path, encoding='utf-8'):
"""
    Reads a labelled collection of documents.

View File

@ -105,7 +105,7 @@ def _predict_from_indexes(
estim_prevalence = quantification_func(sample.instances)
return true_prevalence, estim_prevalence
pbar = tqdm(indexes, desc='[artificial sampling protocol] generating predictions') if verbose else indexes
pbar = tqdm(indexes, desc='[sampling protocol] generating predictions') if verbose else indexes
results = qp.util.parallel(_predict_prevalences, pbar, n_jobs=n_jobs)
true_prevalences, estim_prevalences = zip(*results)

View File

@ -37,6 +37,9 @@ class AggregativeQuantifier(BaseQuantifier):
def learner(self, value):
self.learner_ = value
def preclassify(self, instances):
return self.classify(instances)
def classify(self, instances):
return self.learner.predict(instances)
@ -74,6 +77,9 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
probabilities.
"""
def preclassify(self, instances):
return self.predict_proba(instances)
def posterior_probabilities(self, instances):
return self.learner.predict_proba(instances)
@ -316,6 +322,12 @@ class PACC(AggregativeProbabilisticQuantifier):
self.pcc = PCC(self.learner)
self.Pte_cond_estim_ = self.getPteCondEstim(classes, y, y_)
return self
@classmethod
def getPteCondEstim(cls, classes, y, y_):
# estimate the matrix with entry (i,j) being the estimate of P(yi|yj), that is, the probability that a
# document that belongs to yj ends up being classified as belonging to yi
n_classes = len(classes)
@ -323,9 +335,7 @@ class PACC(AggregativeProbabilisticQuantifier):
for i, class_ in enumerate(classes):
confusion[i] = y_[y == class_].mean(axis=0)
self.Pte_cond_estim_ = confusion.T
return self
return confusion.T
def aggregate(self, classif_posteriors):
prevs_estim = self.pcc.aggregate(classif_posteriors)
@ -785,7 +795,7 @@ class OneVsAll(AggregativeQuantifier):
return self.binary_quantifier.get_params()
def _delayed_binary_classification(self, c, X):
return self.dict_binary_quantifiers[c].classify(X)
return self.dict_binary_quantifiers[c].preclassify(X)
def _delayed_binary_posteriors(self, c, X):
return self.dict_binary_quantifiers[c].posterior_probabilities(X)
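To make the getPteCondEstim comment above concrete, here is a standalone sketch of the estimate on hypothetical toy data, where y_ holds the posterior probabilities obtained on held-out documents:

import numpy as np

classes = [0, 1]
y = np.array([0, 0, 1, 1])                         # true labels
y_ = np.array([[.9, .1], [.7, .3],                 # posteriors for documents of class 0
               [.4, .6], [.2, .8]])                # posteriors for documents of class 1
confusion = np.empty((len(classes), len(classes)))
for i, class_ in enumerate(classes):
    confusion[i] = y_[y == class_].mean(axis=0)    # mean posterior given the true class
Pte_cond_estim = confusion.T                       # entry (i, j) estimates P(predicted=i | true=j)
print(Pte_cond_estim)                              # [[0.8 0.3]
                                                   #  [0.2 0.7]]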

View File

@ -27,7 +27,7 @@ class BaseQuantifier(metaclass=ABCMeta):
# based on class structure
@property
def binary(self):
return False
return len(self.classes_)==2
@property
def aggregative(self):

View File

@ -227,7 +227,7 @@ def _delayed_new_instance(args):
if val_split is not None:
if isinstance(val_split, float):
assert 0 < val_split < 1, 'val_split should be in (0,1)'
data, val_split = data.split_stratified(train_prop=1 - val_split)
data, val_split = data.train_test_split(train_prop=1 - val_split)
sample_index = data.sampling_index(sample_size, *prev)
sample = data.sampling_from_index(sample_index)

View File

@ -73,7 +73,7 @@ class QuaNetTrainer(BaseQuantifier):
if fit_learner:
classifier_data, unused_data = data.split_stratified(0.4)
train_data, valid_data = unused_data.split_stratified(0.66) # 0.66 split of 60% makes 40% and 20%
train_data, valid_data = unused_data.train_test_split(0.66) # 0.66 split of 60% makes 40% and 20%
self.learner.fit(*classifier_data.Xy)
else:
classifier_data = None
@ -87,8 +87,9 @@ class QuaNetTrainer(BaseQuantifier):
train_posteriors = self.learner.predict_proba(train_data.instances)
# turn instances' original representations into embeddings
valid_data.instances = self.learner.transform(valid_data.instances)
train_data.instances = self.learner.transform(train_data.instances)
valid_data_embed = LabelledCollection(self.learner.transform(valid_data.instances), valid_data.labels, self._classes_)
train_data_embed = LabelledCollection(self.learner.transform(train_data.instances), train_data.labels, self._classes_)
self.quantifiers = {
'cc': CC(self.learner).fit(None, fit_learner=False),
@ -110,9 +111,9 @@ class QuaNetTrainer(BaseQuantifier):
nQ = len(self.quantifiers)
nC = data.n_classes
self.quanet = QuaNetModule(
doc_embedding_size=train_data.instances.shape[1],
doc_embedding_size=train_data_embed.instances.shape[1],
n_classes=data.n_classes,
stats_size=nQ*nC, #+ 2*nC*nC,
stats_size=nQ*nC,
order_by=0 if data.binary else None,
**self.quanet_params
).to(self.device)
@ -124,8 +125,8 @@ class QuaNetTrainer(BaseQuantifier):
checkpoint = self.checkpoint
for epoch_i in range(1, self.n_epochs):
self.epoch(train_data, train_posteriors, self.tr_iter, epoch_i, early_stop, train=True)
self.epoch(valid_data, valid_posteriors, self.va_iter, epoch_i, early_stop, train=False)
self.epoch(train_data_embed, train_posteriors, self.tr_iter, epoch_i, early_stop, train=True)
self.epoch(valid_data_embed, valid_posteriors, self.va_iter, epoch_i, early_stop, train=False)
early_stop(self.status['va-loss'], epoch_i)
if early_stop.IMPROVED:

View File

@ -97,7 +97,7 @@ class GridSearchQ(BaseQuantifier):
return training, validation
elif isinstance(validation, float):
assert 0. < validation < 1., 'validation proportion should be in (0,1)'
training, validation = training.split_stratified(train_prop=1 - validation)
training, validation = training.train_test_split(train_prop=1 - validation)
return training, validation
else:
raise ValueError(f'"validation" must either be a LabelledCollection or a float in (0,1) indicating the'