setting a rank threshold to 1000, and finalizing plots

2024-05-10 15:46:13 +02:00 · 2024-05-10 15:46:13 +02:00 · 2ac48a9798
parent 67ed6e4c6c
commit 2ac48a9798
5 changed files with 41 additions and 107 deletions
--- a/Retrieval/commons.py
+++ b/Retrieval/commons.py
@ -6,6 +6,19 @@ from os.path import join
 import quapy.functional as F
 Ks = [50, 100, 500, 1000]
 CLASS_NAMES = ['continent', 'gender', 'years_category'] # ['relative_pageviews_category', 'num_sitelinks_category']:
 DATA_SIZES = ['10K', '50K', '100K', '500K', '1M', 'FULL']
 protected_group = {
    'gender': 'Female',
    'continent': 'Africa',
    'years_category': 'Pre-1900s',
 }
 def load_sample(path, class_name):
    """
    Loads a sample json as a dataframe and returns text and labels for
@ -48,7 +61,9 @@ class RetrievedSamples:
        self.positive_class = positive_class
        self.classes = classes
-    def get_text_label_score(self, df):
+    def get_text_label_score(self, df, filter_rank=1000):
        df = df[df['rank']<filter_rank]
        class_name = self.class_name
        vectorizer = self.vectorizer
        filter_classes = self.classes
--- a/Retrieval/experiments.py
+++ b/Retrieval/experiments.py
@ -1,25 +1,14 @@
 import os.path
 import pickle
 from collections import defaultdict
 from pathlib import Path
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import confusion_matrix
 from sklearn.model_selection import GridSearchCV, cross_val_predict
 from sklearn.base import clone
 from sklearn.svm import LinearSVC
 from scipy.special import rel_entr as KLD
 import quapy as qp
-import quapy.functional as F
+from Retrieval.commons import *
-from Retrieval.commons import RetrievedSamples, load_sample, binarize_labels
+from Retrieval.methods import *
 from Retrieval.methods import M3rND_ModelB, M3rND_ModelD, AbstractM3rND
 from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
 from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
 from quapy.data.base import LabelledCollection
 from scipy.sparse import vstack
 from os.path import join
 from tqdm import tqdm
@ -62,7 +51,8 @@ def methods(classifier, class_name=None, binarize=False):
        'years_category':0.03
    }
-    yield ('Naive', Naive())
+    # yield ('Naive', Naive())
    # yield ('NaiveHalf', Naive())
    yield ('NaiveQuery', Naive())
    yield ('CC', ClassifyAndCount(classifier))
    # yield ('PCC', PCC(classifier))
@ -159,10 +149,14 @@ def run_experiment():
        train_col = LabelledCollection(Xtr, ytr, classes=classifier.classes_)
-        if method_name not in ['Naive', 'NaiveQuery', 'M3b', 'M3b+', 'M3d', 'M3d+']:
+        if not method_name.startswith('Naive') and not method_name.startswith('M3'):
            method.fit(train_col, val_split=train_col, fit_classifier=False)
        elif method_name == 'Naive':
            method.fit(train_col)
        elif method_name == 'NaiveHalf':
            n = len(ytr)//2
            train_col = LabelledCollection(Xtr[:n], ytr[:n], classes=classifier.classes_)
            method.fit(train_col)
        test_col = LabelledCollection(Xte, yte, classes=classifier.classes_)
        rKL_estim, rKL_true = [], []
@ -231,17 +225,7 @@ def run_experiment():
    return results
 # Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
 Ks = [50, 100, 500, 1000]
 CLASS_NAMES = ['years_category', 'continent', 'gender'] # ['relative_pageviews_category', 'num_sitelinks_category']:
 DATA_SIZES = ['10K', '50K', '100K', '500K', '1M', 'FULL']
 data_home = 'data'
 protected_group = {
    'gender': 'Female',
    'continent': 'Africa',
    'years_category': 'Pre-1900s',
 }
 if __name__ == '__main__':
@ -249,7 +233,7 @@ if __name__ == '__main__':
    # the corresponding rND (for binary) or rKL (for multiclass) score
    tables_RND, tables_DKL = [], []
    tables_final = []
-    for class_mode in ['binary', 'multiclass']:
+    for class_mode in ['multiclass', 'binary']:
        BINARIZE = (class_mode=='binary')
        method_names = [name for name, *other in methods(None, binarize=BINARIZE)]
--- a/Retrieval/plot_mrae_xaxis_k.py
+++ b/Retrieval/plot_mrae_xaxis_k.py
@ -1,25 +1,9 @@
 import os.path
 import pickle
 from collections import defaultdict
 from pathlib import Path
 import numpy as np
-from sklearn.feature_extraction.text import TfidfVectorizer
+from Retrieval.experiments import methods
-from sklearn.linear_model import LogisticRegression
+from Retrieval.commons import CLASS_NAMES, Ks, DATA_SIZES
 from sklearn.model_selection import GridSearchCV
 from sklearn.svm import LinearSVC
 import quapy as qp
 from Retrieval.commons import RetrievedSamples, load_sample
 from Retrieval.experiments import methods, benchmark_name
 from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
 from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
 from quapy.data.base import LabelledCollection
 from os.path import join
 from tqdm import tqdm
 from result_table.src.table import Table
 import matplotlib.pyplot as plt
@ -29,10 +13,6 @@ class_mode = 'multiclass'
 method_names = [name for name, *other in methods(None, 'continent')]
 # Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
 Ks = [50, 100, 500, 1000]
 DATA_SIZE = ['10K', '50K', '100K', '500K', '1M', 'FULL']
 CLASS_NAME = ['gender', 'continent', 'years_category']
 all_results = {}
@ -40,11 +20,11 @@ all_results = {}
 # class_name -> data_size -> method_name -> k -> stat -> float
 # where stat is "mean", "std", "max"
 def load_all_results():
-    for class_name in CLASS_NAME:
+    for class_name in CLASS_NAMES:
        all_results[class_name] = {}
-        for data_size in DATA_SIZE:
+        for data_size in DATA_SIZES:
            all_results[class_name][data_size] = {}
@ -75,8 +55,8 @@ results = load_all_results()
 # generates the class-independent, size-independent plots for y-axis=MRAE in which:
 # - the x-axis displays the Ks
-for class_name in CLASS_NAME:
+for class_name in CLASS_NAMES:
-    for data_size in DATA_SIZE:
+    for data_size in DATA_SIZES:
        log = True
--- a/Retrieval/plot_mrae_xaxis_size.py
+++ b/Retrieval/plot_mrae_xaxis_size.py
@ -1,39 +1,15 @@
 import os.path
-import pickle
+from Retrieval.experiments import methods
-from collections import defaultdict
+from Retrieval.commons import CLASS_NAMES, Ks, DATA_SIZES
 from pathlib import Path
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import GridSearchCV
 from sklearn.svm import LinearSVC
 import quapy as qp
 from Retrieval.commons import RetrievedSamples, load_sample
 from Retrieval.experiments import methods, benchmark_name
 from Retrieval.plot_mrae_xaxis_k import load_all_results
 from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
 from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
 from quapy.data.base import LabelledCollection
 from os.path import join
 from tqdm import tqdm
 from result_table.src.table import Table
 import matplotlib.pyplot as plt
-
+from Retrieval.plot_mrae_xaxis_k import load_all_results
 data_home = 'data'
 class_mode = 'multiclass'
 method_names = [name for name, *other in methods(None)]
 # Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
 Ks = [50, 100, 500, 1000]
 DATA_SIZE = ['10K', '50K', '100K', '500K', '1M', 'FULL']
 CLASS_NAME = ['gender', 'continent', 'years_category']
 all_results = {}
@ -44,7 +20,7 @@ results = load_all_results()
 # generates the class-independent, size-independent plots for y-axis=MRAE in which:
 # - the x-axis displays the data sizes (one plot per value of k)
-for class_name in CLASS_NAME:
+for class_name in CLASS_NAMES:
    for k in Ks:
        log = True
@ -55,10 +31,10 @@ for class_name in CLASS_NAME:
        for method_name in method_names:
            # class_name -> data_size -> method_name -> k -> stat -> float
            means = [
-                results[class_name][data_size][method_name][k]['mean'] for data_size in DATA_SIZE
+                results[class_name][data_size][method_name][k]['mean'] for data_size in DATA_SIZES
            ]
            stds = [
-                results[class_name][data_size][method_name][k]['std'] for data_size in DATA_SIZE
+                results[class_name][data_size][method_name][k]['std'] for data_size in DATA_SIZES
            ]
            # max_mean = np.max([
            #         results[class_name][data_size][method_name][k]['max'] for data_size in DATA_SIZE
@ -67,7 +43,7 @@ for class_name in CLASS_NAME:
            max_means.append(max(means))
            style = 'o-' if method_name != 'CC' else '--'
-            line = ax.plot(DATA_SIZE, means, style, label=method_name, color=None)
+            line = ax.plot(DATA_SIZES, means, style, label=method_name, color=None)
            color = line[-1].get_color()
            if log:
                ax.set_yscale('log')
--- a/Retrieval/relscore_distribution.py
+++ b/Retrieval/relscore_distribution.py
@ -1,29 +1,9 @@
 import os.path
 import pickle
 from collections import defaultdict
 from itertools import zip_longest
-from pathlib import Path
+from Retrieval.commons import RetrievedSamples, load_sample, DATA_SIZES
 import numpy as np
 import pandas as pd
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import GridSearchCV
 from sklearn.svm import LinearSVC
 import quapy as qp
 import quapy.functional as F
 from Retrieval.commons import RetrievedSamples, load_sample
 from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
 from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
 from quapy.protocol import AbstractProtocol
 from quapy.data.base import LabelledCollection
 from glob import glob
 from os.path import join
 from tqdm import tqdm
 from result_table.src.table import Table
 import numpy as np
 import matplotlib.pyplot as plt
@ -35,12 +15,11 @@ Plots the distribution of (predicted) relevance score for the test samples and f
 data_home = 'data'
 Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
 for class_name in ['num_sitelinks_category', 'relative_pageviews_category', 'years_category', 'continent', 'gender']:
    test_added = False
    Mtrs, Mtes, source = [], [], []
-    for data_size in ['10K', '50K', '100K', '500K', '1M', 'FULL']:
+    for data_size in DATA_SIZES:
        class_home = join(data_home, class_name, data_size)
        classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl')