From f285e936ad0f05bd576ab57013931c0a47347f06 Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Thu, 3 Mar 2022 18:33:27 +0100
Subject: [PATCH] first experiments

---
 Ordinal/evaluation.py | 43 +++++++++++++++++++++++++++++++++++++++++++
 Ordinal/main.py       | 35 +++++++++++++++++++++++++++++++++++
 Ordinal/utils.py      | 13 +++++++++++++
 3 files changed, 91 insertions(+)
 create mode 100644 Ordinal/evaluation.py
 create mode 100644 Ordinal/main.py
 create mode 100644 Ordinal/utils.py

diff --git a/Ordinal/evaluation.py b/Ordinal/evaluation.py
new file mode 100644
index 0000000..a23181c
--- /dev/null
+++ b/Ordinal/evaluation.py
@@ -0,0 +1,43 @@
+import numpy as np
+
+
+def _check_arrays(prevs):
+    prevs = np.asarray(prevs)
+    if prevs.ndim == 1:
+        prevs = prevs.reshape(1, -1)
+    return prevs
+
+
+def mnmd(prevs, prevs_hat):
+    prevs = _check_arrays(prevs)
+    prevs_hat = _check_arrays(prevs_hat)
+    assert prevs.shape == prevs_hat.shape, f'wrong shape; found {prevs.shape} and {prevs_hat.shape}'
+
+    nmds = [nmd(p, p_hat) for p, p_hat in zip(prevs, prevs_hat)]
+    return np.mean(nmds)
+
+
+def nmd(prev, prev_hat):
+    n = len(prev)
+    return (1. / (n - 1)) * mdpa(prev, prev_hat)
+
+
+"""
+Minimum Distance of Pair Assignments (MDPA) [cha2002measuring] for ordinal pdfs `a` and `b`.
+The MDPA is a special case of the Earth Mover's Distance [rubner1998metric] that can be
+computed efficiently.
+[Adapted from Mirko Bunse's Julia code]
+"""
+def mdpa(a, b):
+    assert len(a) == len(b), "histograms have to have the same length"
+    assert np.isclose(sum(a), sum(b)), f"histograms have to have the same mass (difference is {sum(a) - sum(b)})"
+
+    # algorithm 1 in [cha2002measuring]
+    prefixsum = 0.0
+    distance = 0.0
+    for i in range(len(a)):
+        prefixsum += a[i] - b[i]
+        distance += abs(prefixsum)
+
+    return distance / sum(a)  # the normalization is a fix to the original MDPA
+
diff --git a/Ordinal/main.py b/Ordinal/main.py
new file mode 100644
index 0000000..0f921b0
--- /dev/null
+++ b/Ordinal/main.py
@@ -0,0 +1,35 @@
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+import quapy as qp
+from method.aggregative import PACC, CC, EMQ
+from quapy.data import LabelledCollection
+from os.path import join
+from utils import load_samples
+from evaluation import nmd
+
+domain = 'Books'
+datapath = './data'
+protocol = 'app'
+
+tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, ngram_range=(1, 2))
+
+train = LabelledCollection.load(join(datapath, domain, 'training_data.txt'), loader_func=qp.data.reader.from_text)
+train.instances = tfidf.fit_transform(train.instances)
+
+
+def load_test_samples():
+    for sample in load_samples(join(datapath, domain, protocol, 'test_samples'), classes=train.classes_):
+        sample.instances = tfidf.transform(sample.instances)
+        yield sample.instances, sample.prevalence()
+
+
+q = EMQ(LogisticRegression())
+q.fit(train)
+report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd], eval_budget=100)
+mean_nmd = report['nmd'].mean()
+std_nmd = report['nmd'].std()
+
+print(f'{mean_nmd:.4f} +-{std_nmd:.4f}')
+
+
+
diff --git a/Ordinal/utils.py b/Ordinal/utils.py
new file mode 100644
index 0000000..d0fdf7e
--- /dev/null
+++ b/Ordinal/utils.py
@@ -0,0 +1,13 @@
+import quapy as qp
+from quapy.data import LabelledCollection
+from glob import glob
+import os
+from os.path import join
+
+
+def load_samples(path_dir, classes):
+    nsamples = len(glob(join(path_dir, f'*.txt')))
+    for id in range(nsamples):
+        yield LabelledCollection.load(join(path_dir, f'{id}.txt'), loader_func=qp.data.reader.from_text, classes=classes)
+
+
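A quick sanity check of the NMD metric introduced in Ordinal/evaluation.py (a minimal sketch: the prevalence vectors below are made-up examples, and it assumes the snippet runs from inside Ordinal/ so that evaluation.py is importable):

    from evaluation import nmd, mnmd

    # two hypothetical ordinal prevalence vectors over 3 ordered classes
    true_prev = [0.2, 0.3, 0.5]
    estim_prev = [0.5, 0.3, 0.2]

    # mdpa accumulates |prefix sum of (true - estimated)| over the ordered classes;
    # nmd divides by (n - 1), so the result lies in [0, 1]
    print(nmd(true_prev, estim_prev))                              # ~0.3

    # mnmd averages nmd over a batch of (true, estimated) pairs
    print(mnmd([true_prev, true_prev], [estim_prev, true_prev]))   # ~0.15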