QuaPy/Retrieval/relscore_distribution.py

"""
Plots the distribution of the (predicted) relevance scores for the test samples and for the training
samples w.r.t.:
- the training pool size (10K, 50K, 100K, 500K, 1M, FULL)
- the document rank
"""
import os
import pickle
from itertools import zip_longest
from os.path import join

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

from Retrieval.commons import RetrievedSamples
data_home = 'data'

# rank cut-offs used in related experiments (kept for reference; unused in this script)
Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
for class_name in ['num_sitelinks_category', 'relative_pageviews_category', 'years_category', 'continent', 'gender']:
    test_added = False
    Mtrs, Mtes, source = [], [], []
    for data_size in ['10K', '50K', '100K', '500K', '1M', 'FULL']:
        class_home = join(data_home, class_name, data_size)
        classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl')
        test_rankings_path = join(data_home, 'testRanking_Results.json')

        # the pickle presumably stores a (vectorizer, classifier) pair; only the classifier is needed here
        with open(classifier_path, 'rb') as fin:
            _, classifier = pickle.load(fin)

        experiment_prot = RetrievedSamples(
            class_home,
            test_rankings_path,
            vectorizer=None,
            class_name=class_name,
            classes=classifier.classes_
        )
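
        # assumed protocol interface: experiment_prot() iterates over (train, test) pairs, each given
        # as an (X, y, relevance_score) triple, and experiment_prot.total() reports the number of
        # pairs (this follows from how the iterator is consumed below)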
        Mtr = []
        Mte = []
        pbar = tqdm(experiment_prot(), total=experiment_prot.total())
        for train, test in pbar:
            Xtr, ytr, score_tr = train
            Xte, yte, score_te = test
            Mtr.append(score_tr)
            Mte.append(score_te)

        Mtrs.append(Mtr)
        # the test samples do not depend on the training pool size, so they are stored only once
        if not test_added:
            Mtes.append(Mte)
            test_added = True
        source.append(data_size)
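
    # at this point, Mtrs holds one entry per training-pool size, each entry being a list of
    # per-sample score arrays (possibly of different lengths); Mtes holds a single such list for test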
    fig, ax = plt.subplots()
    train_source = ['train-' + s for s in source]
    Ms = list(zip(Mtrs, train_source)) + list(zip(Mtes, ['test']))
    for M, label in Ms:
        # pad the ragged score lists with NaNs so they form a (num_samples x max_rank) matrix
        M = np.asarray(list(zip_longest(*M, fillvalue=np.nan))).T
        num_rep, num_docs = M.shape
        mean_values = np.nanmean(M, axis=0)
        # standard error of the mean at each rank, computed over the non-NaN entries only
        n_filled = np.count_nonzero(~np.isnan(M), axis=0)
        std_errors = np.nanstd(M, axis=0) / np.sqrt(n_filled)
        line = ax.plot(range(num_docs), mean_values, '-', label=label)
        color = line[-1].get_color()
        ax.fill_between(range(num_docs), mean_values - std_errors, mean_values + std_errors, alpha=0.3, color=color)
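
    # worked example of the padding above: zip_longest(*[[1, 2, 3], [4, 5]], fillvalue=np.nan)
    # yields rows (1, 4), (2, 5), (3, nan); transposing gives [[1, 2, 3], [4, 5, nan]], i.e.,
    # one row per sample, NaN-padded to the length of the longest ranking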
    ax.set_xlabel('Doc. Rank')
    ax.set_ylabel('Rel. Score')
    ax.set_title(class_name)
    ax.legend()
    # plt.show()

    os.makedirs('plots', exist_ok=True)
    plotpath = f'plots/{class_name}.pdf'
    print(f'saving plot in {plotpath}')
    plt.savefig(plotpath)
    plt.close(fig)  # release the figure before moving on to the next class