QuaPy/Retrieval/relscore_distribution.py

"""
Plots the distribution of the (predicted) relevance scores for the test samples and for the training
samples w.r.t.:
- the training pool size (10K, 50K, 100K, 500K, 1M, FULL)
- the document rank
"""
import os
import pickle
from itertools import zip_longest
from os.path import join

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

from Retrieval.commons import RetrievedSamples
data_home = 'data'

# rank cut-offs used in related experiments (kept for reference; unused in this script)
Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
for class_name in ['num_sitelinks_category', 'relative_pageviews_category', 'years_category', 'continent', 'gender']:
    test_added = False
    Mtrs, Mtes, source = [], [], []
    for data_size in ['10K', '50K', '100K', '500K', '1M', 'FULL']:
        class_home = join(data_home, class_name, data_size)
        classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl')
        test_rankings_path = join(data_home, 'testRanking_Results.json')

        # the pickle presumably stores a (vectorizer, classifier) pair; only the classifier is needed here
        with open(classifier_path, 'rb') as fin:
            _, classifier = pickle.load(fin)

        experiment_prot = RetrievedSamples(
            class_home,
            test_rankings_path,
            vectorizer=None,
            class_name=class_name,
            classes=classifier.classes_
        )
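
        # assumed protocol interface: experiment_prot() iterates over (train, test) pairs, each given
        # as an (X, y, relevance_score) triple, and experiment_prot.total() reports the number of
        # pairs (this follows from how the iterator is consumed below)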
        Mtr = []
        Mte = []
        pbar = tqdm(experiment_prot(), total=experiment_prot.total())
        for train, test in pbar:
            Xtr, ytr, score_tr = train
            Xte, yte, score_te = test
            Mtr.append(score_tr)
            Mte.append(score_te)

        Mtrs.append(Mtr)
        # the test samples do not depend on the training pool size, so they are stored only once
        if not test_added:
            Mtes.append(Mte)
            test_added = True
        source.append(data_size)
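
    # at this point, Mtrs holds one entry per training-pool size, each entry being a list of
    # per-sample score arrays (possibly of different lengths); Mtes holds a single such list for test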
    fig, ax = plt.subplots()
    train_source = ['train-' + s for s in source]
    Ms = list(zip(Mtrs, train_source)) + list(zip(Mtes, ['test']))
    for M, label in Ms:
        # pad the ragged score lists with NaNs so they form a (num_samples x max_rank) matrix
        M = np.asarray(list(zip_longest(*M, fillvalue=np.nan))).T
        num_rep, num_docs = M.shape
        mean_values = np.nanmean(M, axis=0)
        # standard error of the mean at each rank, computed over the non-NaN entries only
        n_filled = np.count_nonzero(~np.isnan(M), axis=0)
        std_errors = np.nanstd(M, axis=0) / np.sqrt(n_filled)
        line = ax.plot(range(num_docs), mean_values, '-', label=label)
        color = line[-1].get_color()
        ax.fill_between(range(num_docs), mean_values - std_errors, mean_values + std_errors, alpha=0.3, color=color)
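
    # worked example of the padding above: zip_longest(*[[1, 2, 3], [4, 5]], fillvalue=np.nan)
    # yields rows (1, 4), (2, 5), (3, nan); transposing gives [[1, 2, 3], [4, 5, nan]], i.e.,
    # one row per sample, NaN-padded to the length of the longest ranking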
    ax.set_xlabel('Doc. Rank')
    ax.set_ylabel('Rel. Score')
    ax.set_title(class_name)
    ax.legend()
    # plt.show()

    os.makedirs('plots', exist_ok=True)
    plotpath = f'plots/{class_name}.pdf'
    print(f'saving plot in {plotpath}')
    plt.savefig(plotpath)
    plt.close(fig)  # release the figure before moving on to the next class