forked from moreo/QuaPy
first experiments
This commit is contained in: parent c63325e364, commit f285e936ad
@@ -0,0 +1,43 @@
import numpy as np


def _check_arrays(prevs):
    # lift a single prevalence vector to a (1, n) matrix so that callers
    # can treat single vectors and batches of vectors uniformly
    prevs = np.asarray(prevs)
    if prevs.ndim == 1:
        prevs = prevs.reshape(1, -1)
    return prevs


def mnmd(prevs, prevs_hat):
    # mean NMD across corresponding pairs of true and estimated prevalence vectors
    prevs = _check_arrays(prevs)
    prevs_hat = _check_arrays(prevs_hat)
    assert prevs.shape == prevs_hat.shape, f'wrong shape; found {prevs.shape} and {prevs_hat.shape}'

    nmds = [nmd(p, p_hat) for p, p_hat in zip(prevs, prevs_hat)]
    return np.mean(nmds)


def nmd(prev, prev_hat):
    # normalized match distance: MDPA rescaled by 1/(n-1), its maximum value
    # for two distributions of total mass 1 over n ordered bins
    n = len(prev)
    return (1. / (n - 1)) * mdpa(prev, prev_hat)


def mdpa(a, b):
    """
    Minimum Distance of Pair Assignments (MDPA) [cha2002measuring] for ordinal pdfs `a` and `b`.
    The MDPA is a special case of the Earth Mover's Distance [rubner1998metric] that can be
    computed efficiently.
    [Adapted from Mirko Bunse's Julia code]
    """
    assert len(a) == len(b), "histograms have to have the same length"
    assert np.isclose(sum(a), sum(b)), f"histograms have to have the same mass (difference is {sum(a) - sum(b)})"

    # algorithm 1 in [cha2002measuring]: accumulate the running surplus of mass
    # and sum up how much has to be carried over at each bin boundary
    prefixsum = 0.0
    distance = 0.0
    for i in range(len(a)):
        prefixsum += a[i] - b[i]
        distance += abs(prefixsum)

    return distance / sum(a)  # the normalization is a fix to the original MDPA
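
A quick sanity check of the metric, added here for illustration only (not part of the commit; the values are worked out by hand from algorithm 1):

# for a = [0.5, 0.3, 0.2] and b = [0.2, 0.3, 0.5], the prefix sums are
# 0.3, 0.3, 0.0, so mdpa(a, b) = |0.3| + |0.3| + |0.0| = 0.6,
# and nmd(a, b) = 0.6 / (3 - 1) = 0.3
print(mdpa([0.5, 0.3, 0.2], [0.2, 0.3, 0.5]))  # 0.6
print(nmd([0.5, 0.3, 0.2], [0.2, 0.3, 0.5]))   # 0.3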
@@ -0,0 +1,35 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
from quapy.method.aggregative import PACC, CC, EMQ
from quapy.data import LabelledCollection
from os.path import join
from utils import load_samples
from evaluation import nmd


# experiment configuration ('app' refers to the artificial-prevalence protocol samples)
domain = 'Books'
datapath = './data'
protocol = 'app'

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, ngram_range=(1, 2))

# load and vectorize the training set
train = LabelledCollection.load(join(datapath, domain, 'training_data.txt'), loader_func=qp.data.reader.from_text)
train.instances = tfidf.fit_transform(train.instances)


def load_test_samples():
    # vectorize each test sample with the tfidf model fitted on the training set
    for sample in load_samples(join(datapath, domain, protocol, 'test_samples'), classes=train.classes_):
        sample.instances = tfidf.transform(sample.instances)
        yield sample.instances, sample.prevalence()


q = EMQ(LogisticRegression())
q.fit(train)
report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd], eval_budget=100)
mean_nmd = report['nmd'].mean()
std_nmd = report['nmd'].std()

print(f'{mean_nmd:.4f} +-{std_nmd:.4f}')
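
Since PACC and CC are imported alongside EMQ but not yet used, a natural follow-up would be to run all three quantifiers through the same report. A minimal sketch, reusing only the calls already present in this script:

# sketch (not part of the commit): compare the three imported quantifiers
for name, quantifier in [('CC', CC(LogisticRegression())),
                         ('PACC', PACC(LogisticRegression())),
                         ('EMQ', EMQ(LogisticRegression()))]:
    quantifier.fit(train)
    report = qp.evaluation.gen_prevalence_report(quantifier, gen_fn=load_test_samples, error_metrics=[nmd], eval_budget=100)
    print(f'{name}: {report["nmd"].mean():.4f} +-{report["nmd"].std():.4f}')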
@@ -0,0 +1,13 @@
import quapy as qp
from quapy.data import LabelledCollection
from glob import glob
from os.path import join


def load_samples(path_dir, classes):
    # samples are expected to be numbered 0.txt, 1.txt, ..., (nsamples-1).txt
    nsamples = len(glob(join(path_dir, '*.txt')))
    for i in range(nsamples):
        yield LabelledCollection.load(join(path_dir, f'{i}.txt'), loader_func=qp.data.reader.from_text, classes=classes)
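
For reference, a hypothetical usage of load_samples (the directory follows the datapath/domain/protocol layout of the experiment script; the class labels are an assumption for illustration, not taken from the data):

# hypothetical example: iterate the test samples generated for Books under 'app';
# the ordinal labels '1'..'5' are assumed here purely for illustration
for sample in load_samples('./data/Books/app/test_samples', classes=['1', '2', '3', '4', '5']):
    print(sample.prevalence())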