106 lines
3.2 KiB
Python
106 lines
3.2 KiB
Python
|
import os.path
|
||
|
import pickle
|
||
|
from collections import defaultdict
|
||
|
from itertools import zip_longest
|
||
|
from pathlib import Path
|
||
|
|
||
|
import numpy as np
|
||
|
import pandas as pd
|
||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||
|
from sklearn.linear_model import LogisticRegression
|
||
|
from sklearn.model_selection import GridSearchCV
|
||
|
from sklearn.svm import LinearSVC
|
||
|
|
||
|
import quapy as qp
|
||
|
import quapy.functional as F
|
||
|
from Retrieval.commons import RetrievedSamples, load_sample
|
||
|
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
|
||
|
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
|
||
|
from quapy.protocol import AbstractProtocol
|
||
|
from quapy.data.base import LabelledCollection
|
||
|
|
||
|
from glob import glob
|
||
|
from os.path import join
|
||
|
from tqdm import tqdm
|
||
|
|
||
|
from result_table.src.table import Table
|
||
|
import numpy as np
|
||
|
import matplotlib.pyplot as plt
|
||
|
|
||
|
"""
|
||
|
Plots the distribution of (predicted) relevance score for the test samples and for the training samples wrt:
|
||
|
- training pool size (100K, 500K, 1M, FULL)
|
||
|
- rank
|
||
|
"""
|
||
|
|
||
|
|
||
|
# Root directory holding the per-class / per-size retrieval samples.
data_home = 'data'

# Cut-off ranks typically used to evaluate retrieval metrics.
# NOTE(review): Ks is not referenced anywhere in this script's visible code;
# presumably kept for consistency with sibling experiment scripts — confirm.
Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
|
||
|
|
||
|
for class_name in ['num_sitelinks_category', 'relative_pageviews_category', 'years_category', 'continent', 'gender']:
    # For each training-pool size, collect the per-sample relevance-score
    # sequences of the retrieved training samples. The test samples are the
    # same regardless of the pool size, so their scores are stored only once.
    test_added = False
    Mtrs, Mtes, data_sizes = [], [], []
    for data_size in ['10K', '50K', '100K', '500K', '1M', 'FULL']:

        class_home = join(data_home, class_name, data_size)
        classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl')
        test_rankings_path = join(data_home, 'testRanking_Results.json')

        # Fix: the original leaked the file handle via pickle.load(open(...)).
        # NOTE(review): unpickling is only safe on trusted, locally-produced
        # artifacts — never point this at untrusted data.
        with open(classifier_path, 'rb') as fin:
            _, classifier = pickle.load(fin)

        experiment_prot = RetrievedSamples(
            class_home,
            test_rankings_path,
            vectorizer=None,
            class_name=class_name,
            classes=classifier.classes_
        )

        Mtr = []  # one relevance-score sequence per training sample
        Mte = []  # one relevance-score sequence per test sample
        pbar = tqdm(experiment_prot(), total=experiment_prot.total())
        for train, test in pbar:
            Xtr, ytr, score_tr = train
            Xte, yte, score_te = test
            Mtr.append(score_tr)
            Mte.append(score_te)

        Mtrs.append(Mtr)
        if not test_added:
            # test scores do not depend on the training pool size
            Mtes.append(Mte)
            test_added = True
        data_sizes.append(data_size)

    fig, ax = plt.subplots()
    train_labels = ['train-' + s for s in data_sizes]
    # Fix: the inner loop below used to rebind the outer list (named `source`),
    # shadowing it; renamed locals to keep the two roles distinct.
    Ms = list(zip(Mtrs, train_labels)) + list(zip(Mtes, ['test']))

    for M, label in Ms:
        # Samples may have different lengths: pad the ragged score sequences
        # with NaN so they stack into a (num_samples, max_rank) array.
        M = np.asarray(list(zip_longest(*M, fillvalue=np.nan))).T

        num_rep, num_docs = M.shape

        # NaN-aware mean and standard error per rank position; n_filled counts
        # how many samples actually contribute at each rank.
        mean_values = np.nanmean(M, axis=0)
        n_filled = np.count_nonzero(~np.isnan(M), axis=0)
        std_errors = np.nanstd(M, axis=0) / np.sqrt(n_filled)

        line = ax.plot(range(num_docs), mean_values, '-', label=label, color=None)
        color = line[-1].get_color()
        ax.fill_between(range(num_docs), mean_values - std_errors,
                        mean_values + std_errors, alpha=0.3, color=color)

    ax.set_xlabel('Doc. Rank')
    ax.set_ylabel('Rel. Score')
    ax.set_title(class_name)
    ax.legend()

    os.makedirs('plots', exist_ok=True)
    plotpath = f'plots/{class_name}.pdf'
    print(f'saving plot in {plotpath}')
    # Save this figure explicitly (rather than relying on pyplot's "current
    # figure" global) and close it — the original never closed figures, so
    # memory grew with each class_name iteration.
    fig.savefig(plotpath)
    plt.close(fig)
|
||
|
|
||
|
|
||
|
|