diff --git a/Retrieval/commons.py b/Retrieval/commons.py index b2007ea..289993c 100644 --- a/Retrieval/commons.py +++ b/Retrieval/commons.py @@ -6,6 +6,19 @@ from os.path import join import quapy.functional as F +Ks = [50, 100, 500, 1000] + +CLASS_NAMES = ['continent', 'gender', 'years_category'] # ['relative_pageviews_category', 'num_sitelinks_category']: + +DATA_SIZES = ['10K', '50K', '100K', '500K', '1M', 'FULL'] + +protected_group = { + 'gender': 'Female', + 'continent': 'Africa', + 'years_category': 'Pre-1900s', +} + + def load_sample(path, class_name): """ Loads a sample json as a dataframe and returns text and labels for @@ -48,7 +61,9 @@ class RetrievedSamples: self.positive_class = positive_class self.classes = classes - def get_text_label_score(self, df): + def get_text_label_score(self, df, filter_rank=1000): + df = df[df['rank'] data_size -> method_name -> k -> stat -> float # where stat is "mean", "std", "max" def load_all_results(): - for class_name in CLASS_NAME: + for class_name in CLASS_NAMES: all_results[class_name] = {} - for data_size in DATA_SIZE: + for data_size in DATA_SIZES: all_results[class_name][data_size] = {} @@ -75,8 +55,8 @@ results = load_all_results() # generates the class-independent, size-independent plots for y-axis=MRAE in which: # - the x-axis displays the Ks -for class_name in CLASS_NAME: - for data_size in DATA_SIZE: +for class_name in CLASS_NAMES: + for data_size in DATA_SIZES: log = True diff --git a/Retrieval/plot_mrae_xaxis_size.py b/Retrieval/plot_mrae_xaxis_size.py index fca7710..55797cf 100644 --- a/Retrieval/plot_mrae_xaxis_size.py +++ b/Retrieval/plot_mrae_xaxis_size.py @@ -1,39 +1,15 @@ import os.path -import pickle -from collections import defaultdict -from pathlib import Path - -import numpy as np -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.linear_model import LogisticRegression -from sklearn.model_selection import GridSearchCV -from sklearn.svm import LinearSVC - -import quapy as qp -from Retrieval.commons import RetrievedSamples, load_sample -from Retrieval.experiments import methods, benchmark_name -from Retrieval.plot_mrae_xaxis_k import load_all_results -from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive -from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML -from quapy.data.base import LabelledCollection - -from os.path import join -from tqdm import tqdm - -from result_table.src.table import Table +from Retrieval.experiments import methods +from Retrieval.commons import CLASS_NAMES, Ks, DATA_SIZES import matplotlib.pyplot as plt - +from Retrieval.plot_mrae_xaxis_k import load_all_results data_home = 'data' class_mode = 'multiclass' method_names = [name for name, *other in methods(None)] -# Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000] -Ks = [50, 100, 500, 1000] -DATA_SIZE = ['10K', '50K', '100K', '500K', '1M', 'FULL'] -CLASS_NAME = ['gender', 'continent', 'years_category'] all_results = {} @@ -44,7 +20,7 @@ results = load_all_results() # generates the class-independent, size-independent plots for y-axis=MRAE in which: # - the x-axis displays the Ks -for class_name in CLASS_NAME: +for class_name in CLASS_NAMES: for k in Ks: log = True @@ -55,10 +31,10 @@ for class_name in CLASS_NAME: for method_name in method_names: # class_name -> data_size -> method_name -> k -> stat -> float means = [ - results[class_name][data_size][method_name][k]['mean'] for data_size in DATA_SIZE + results[class_name][data_size][method_name][k]['mean'] for data_size in DATA_SIZES ] stds = [ - results[class_name][data_size][method_name][k]['std'] for data_size in DATA_SIZE + results[class_name][data_size][method_name][k]['std'] for data_size in DATA_SIZES ] # max_mean = np.max([ # results[class_name][data_size][method_name][k]['max'] for data_size in DATA_SIZE @@ -67,7 +43,7 @@ for class_name in CLASS_NAME: max_means.append(max(means)) style = 'o-' if method_name != 'CC' else '--' - line = ax.plot(DATA_SIZE, means, style, label=method_name, color=None) + line = ax.plot(DATA_SIZES, means, style, label=method_name, color=None) color = line[-1].get_color() if log: ax.set_yscale('log') diff --git a/Retrieval/relscore_distribution.py b/Retrieval/relscore_distribution.py index 1db4b38..aac52d5 100644 --- a/Retrieval/relscore_distribution.py +++ b/Retrieval/relscore_distribution.py @@ -1,29 +1,9 @@ import os.path import pickle -from collections import defaultdict from itertools import zip_longest -from pathlib import Path - -import numpy as np -import pandas as pd -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.linear_model import LogisticRegression -from sklearn.model_selection import GridSearchCV -from sklearn.svm import LinearSVC - -import quapy as qp -import quapy.functional as F -from Retrieval.commons import RetrievedSamples, load_sample -from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive -from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML -from quapy.protocol import AbstractProtocol -from quapy.data.base import LabelledCollection - -from glob import glob +from Retrieval.commons import RetrievedSamples, load_sample, DATA_SIZES from os.path import join from tqdm import tqdm - -from result_table.src.table import Table import numpy as np import matplotlib.pyplot as plt @@ -35,12 +15,11 @@ Plots the distribution of (predicted) relevance score for the test samples and f data_home = 'data' -Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000] for class_name in ['num_sitelinks_category', 'relative_pageviews_category', 'years_category', 'continent', 'gender']: test_added = False Mtrs, Mtes, source = [], [], [] - for data_size in ['10K', '50K', '100K', '500K', '1M', 'FULL']: + for data_size in DATA_SIZES: class_home = join(data_home, class_name, data_size) classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl')