1
0
Fork 0

first experiments

This commit is contained in:
Alejandro Moreo Fernandez 2022-03-03 18:33:27 +01:00
parent c63325e364
commit f285e936ad
3 changed files with 91 additions and 0 deletions

43
Ordinal/evaluation.py Normal file
View File

@ -0,0 +1,43 @@
import numpy as np
def _check_arrays(prevs):
    """Convert `prevs` to an ndarray, promoting a 1-D input to a single-row 2-D array.

    Inputs of any other dimensionality are returned as converted, with their
    shape untouched.
    """
    as_array = np.asarray(prevs)
    # a lone vector is treated as a batch containing one row
    return as_array.reshape(1, -1) if as_array.ndim == 1 else as_array
def mnmd(prevs, prevs_hat):
    """Return the mean of `nmd` computed over paired rows of `prevs` and `prevs_hat`.

    Both arguments are passed through `_check_arrays`, so a single 1-D vector is
    accepted as a batch of one row. The two resulting arrays must have the same
    shape; otherwise an AssertionError is raised.
    """
    prevs, prevs_hat = _check_arrays(prevs), _check_arrays(prevs_hat)
    assert prevs.shape == prevs_hat.shape, f'wrong shape; found {prevs.shape} and {prevs_hat.shape}'

    # one NMD score per (true row, estimated row) pair
    per_row_scores = [nmd(true_row, estimated_row) for true_row, estimated_row in zip(prevs, prevs_hat)]
    return np.mean(per_row_scores)
def nmd(prev, prev_hat):
    """Return `mdpa(prev, prev_hat)` scaled by 1/(n-1), where n is `len(prev)`."""
    num_classes = len(prev)
    # computed before calling mdpa, exactly as (1/(n-1)) * mdpa(...)
    scale = 1. / (num_classes - 1)
    return scale * mdpa(prev, prev_hat)
"""
Minimum Distance of Pair Assignments (MDPA) [cha2002measuring] for ordinal pdfs `a` and `b`.
The MDPA is a special case of the Earth Mover's Distance [rubner1998metric] that can be
computed efficiently.
[Mirko Bunse's code from Julia adapted]
"""
def mdpa(a, b):
    """Minimum Distance of Pair Assignments (MDPA) [cha2002measuring] between ordinal pdfs.

    The MDPA is a special case of the Earth Mover's Distance [rubner1998metric]
    that can be computed efficiently. Adapted from Mirko Bunse's Julia code.

    :param a: sequence of per-class masses, in class order
    :param b: sequence of per-class masses, with the same length as `a` and a
        total mass close to that of `a` (checked with `np.isclose`)
    :return: the sum of the absolute running differences between `a` and `b`,
        divided by the total mass of `a`
    :raises AssertionError: if the lengths differ or the total masses are not close
    """
    assert len(a) == len(b), "histograms have to have the same length"
    mass_a, mass_b = sum(a), sum(b)
    # the message is an f-string so the actual mass difference is reported
    assert np.isclose(mass_a, mass_b), \
        f"histograms have to have the same mass (difference is {mass_a - mass_b})"

    # algorithm 1 in [cha2002measuring]
    prefixsum = 0.0
    distance = 0.0
    for a_i, b_i in zip(a, b):
        prefixsum += a_i - b_i
        distance += abs(prefixsum)

    return distance / mass_a  # the normalization is a fix to the original MDPA

35
Ordinal/main.py Normal file
View File

@ -0,0 +1,35 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
from method.aggregative import PACC, CC, EMQ
from quapy.data import LabelledCollection
from os.path import join
from utils import load_samples
from evaluation import nmd
# Experiment configuration: these values select the folders read below.
domain = 'Books'
datapath = './data'
protocol = 'app'

# tf-idf over word uni- and bi-grams, sublinear tf, ignoring terms seen in fewer than 5 documents
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    ngram_range=(1, 2),
)

# Load the training collection and replace its instances by their tf-idf matrix;
# the vectorizer is fitted here and only applied (transform) to the test samples.
training_file = join(datapath, domain, 'training_data.txt')
train = LabelledCollection.load(training_file, loader_func=qp.data.reader.from_text)
train.instances = tfidf.fit_transform(train.instances)
def load_test_samples():
    """Yield an (instances, prevalence) pair for each test sample of the configured protocol.

    Samples are read via `load_samples` from `<datapath>/<domain>/<protocol>/test_samples`
    using the classes of the training collection. Each sample's instances are
    replaced by the output of the already-fitted `tfidf.transform` before yielding.
    """
    samples_dir = join(datapath, domain, protocol, 'test_samples')
    for test_sample in load_samples(samples_dir, classes=train.classes_):
        test_sample.instances = tfidf.transform(test_sample.instances)
        yield test_sample.instances, test_sample.prevalence()
# Train an EMQ quantifier, with logistic regression as the underlying classifier.
q = EMQ(LogisticRegression())
q.fit(train)

# Evaluate on the samples produced by load_test_samples, using nmd as the error metric.
report = qp.evaluation.gen_prevalence_report(
    q,
    gen_fn=load_test_samples,
    error_metrics=[nmd],
    eval_budget=100,
)

# Summarise the 'nmd' column of the report as mean and standard deviation.
nmd_scores = report['nmd']
mean_nmd, std_nmd = nmd_scores.mean(), nmd_scores.std()
print(f'{mean_nmd:.4f} +-{std_nmd:.4f}')

13
Ordinal/utils.py Normal file
View File

@ -0,0 +1,13 @@
import quapy as qp
from quapy.data import LabelledCollection
from glob import glob
import os
from os.path import join
def load_samples(path_dir, classes):
    """Lazily yield one LabelledCollection per sample file in `path_dir`.

    The number of samples is taken to be the count of `*.txt` files in the
    directory; the files are then read as `0.txt`, `1.txt`, ... up to that count.
    The directory is therefore expected to contain exactly those consecutively
    numbered files and no other `.txt` file.

    :param path_dir: directory holding the sample files
    :param classes: forwarded to `LabelledCollection.load` as its `classes` argument
    :return: generator over the loaded collections, in increasing file-number order
    """
    nsamples = len(glob(join(path_dir, '*.txt')))
    # `sample_id` rather than `id`, so the builtin is not shadowed
    for sample_id in range(nsamples):
        sample_path = join(path_dir, f'{sample_id}.txt')
        yield LabelledCollection.load(sample_path, loader_func=qp.data.reader.from_text, classes=classes)