setting a rank threshold to 1000, and finalizing plots

2024-05-10 15:46:13 +02:00 · 2024-05-10 15:46:13 +02:00 · 2ac48a9798
parent 67ed6e4c6c
commit 2ac48a9798
5 changed files with 41 additions and 107 deletions
--- a/Retrieval/commons.py
+++ b/Retrieval/commons.py
@ -6,6 +6,19 @@ from os.path import join
 import quapy.functional as F


+Ks = [50, 100, 500, 1000]
+
+CLASS_NAMES = ['continent', 'gender', 'years_category'] # ['relative_pageviews_category', 'num_sitelinks_category']:
+
+DATA_SIZES = ['10K', '50K', '100K', '500K', '1M', 'FULL']
+
+protected_group = {
+    'gender': 'Female',
+    'continent': 'Africa',
+    'years_category': 'Pre-1900s',
+}
+
+
 def load_sample(path, class_name):
    """
    Loads a sample json as a dataframe and returns text and labels for
@ -48,7 +61,9 @@ class RetrievedSamples:
        self.positive_class = positive_class
        self.classes = classes

-    def get_text_label_score(self, df):
+    def get_text_label_score(self, df, filter_rank=1000):
+        df = df[df['rank']<filter_rank]
+
        class_name = self.class_name
        vectorizer = self.vectorizer
        filter_classes = self.classes
--- a/Retrieval/experiments.py
+++ b/Retrieval/experiments.py
@ -1,25 +1,14 @@
-import os.path
-import pickle
-from collections import defaultdict
-from pathlib import Path
-
-import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
-from sklearn.metrics import confusion_matrix
 from sklearn.model_selection import GridSearchCV, cross_val_predict
 from sklearn.base import clone
-from sklearn.svm import LinearSVC
-from scipy.special import rel_entr as KLD

 import quapy as qp
-import quapy.functional as F
-from Retrieval.commons import RetrievedSamples, load_sample, binarize_labels
-from Retrieval.methods import M3rND_ModelB, M3rND_ModelD, AbstractM3rND
+from Retrieval.commons import *
+from Retrieval.methods import *
 from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
 from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
 from quapy.data.base import LabelledCollection
-from scipy.sparse import vstack

 from os.path import join
 from tqdm import tqdm
@ -62,7 +51,8 @@ def methods(classifier, class_name=None, binarize=False):
        'years_category':0.03
    }

-    yield ('Naive', Naive())
+    # yield ('Naive', Naive())
+    # yield ('NaiveHalf', Naive())
    yield ('NaiveQuery', Naive())
    yield ('CC', ClassifyAndCount(classifier))
    # yield ('PCC', PCC(classifier))
@ -159,10 +149,14 @@ def run_experiment():

        train_col = LabelledCollection(Xtr, ytr, classes=classifier.classes_)

-        if method_name not in ['Naive', 'NaiveQuery', 'M3b', 'M3b+', 'M3d', 'M3d+']:
+        if not method_name.startswith('Naive') and not method_name.startswith('M3'):
            method.fit(train_col, val_split=train_col, fit_classifier=False)
        elif method_name == 'Naive':
            method.fit(train_col)
+        elif method_name == 'NaiveHalf':
+            n = len(ytr)//2
+            train_col = LabelledCollection(Xtr[:n], ytr[:n], classes=classifier.classes_)
+            method.fit(train_col)

        test_col = LabelledCollection(Xte, yte, classes=classifier.classes_)
        rKL_estim, rKL_true = [], []
@ -231,17 +225,7 @@ def run_experiment():
    return results


-
-# Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
-Ks = [50, 100, 500, 1000]
-CLASS_NAMES = ['years_category', 'continent', 'gender'] # ['relative_pageviews_category', 'num_sitelinks_category']:
-DATA_SIZES = ['10K', '50K', '100K', '500K', '1M', 'FULL']
 data_home = 'data'
-protected_group = {
-    'gender': 'Female',
-    'continent': 'Africa',
-    'years_category': 'Pre-1900s',
-}

 if __name__ == '__main__':

@ -249,7 +233,7 @@ if __name__ == '__main__':
    # the corresponding rND (for binary) or rKL (for multiclass) score
    tables_RND, tables_DKL = [], []
    tables_final = []
-    for class_mode in ['binary', 'multiclass']:
+    for class_mode in ['multiclass', 'binary']:
        BINARIZE = (class_mode=='binary')
        method_names = [name for name, *other in methods(None, binarize=BINARIZE)]

--- a/Retrieval/plot_mrae_xaxis_k.py
+++ b/Retrieval/plot_mrae_xaxis_k.py
@ -1,25 +1,9 @@
 import os.path
 import pickle
-from collections import defaultdict
-from pathlib import Path
-
 import numpy as np
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.linear_model import LogisticRegression
-from sklearn.model_selection import GridSearchCV
-from sklearn.svm import LinearSVC
-
-import quapy as qp
-from Retrieval.commons import RetrievedSamples, load_sample
-from Retrieval.experiments import methods, benchmark_name
-from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
-from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
-from quapy.data.base import LabelledCollection
-
+from Retrieval.experiments import methods
+from Retrieval.commons import CLASS_NAMES, Ks, DATA_SIZES
 from os.path import join
-from tqdm import tqdm
-
-from result_table.src.table import Table
 import matplotlib.pyplot as plt


@ -29,10 +13,6 @@ class_mode = 'multiclass'

 method_names = [name for name, *other in methods(None, 'continent')]

-# Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
-Ks = [50, 100, 500, 1000]
-DATA_SIZE = ['10K', '50K', '100K', '500K', '1M', 'FULL']
-CLASS_NAME = ['gender', 'continent', 'years_category']
 all_results = {}


@ -40,11 +20,11 @@ all_results = {}
 # class_name -> data_size -> method_name -> k -> stat -> float
 # where stat is "mean", "std", "max"
 def load_all_results():
-    for class_name in CLASS_NAME:
+    for class_name in CLASS_NAMES:

        all_results[class_name] = {}

-        for data_size in DATA_SIZE:
+        for data_size in DATA_SIZES:

            all_results[class_name][data_size] = {}

@ -75,8 +55,8 @@ results = load_all_results()
 # generates, for each class and each data size, the plots for y-axis=MRAE in which:
 # - the x-axis displays the Ks

-for class_name in CLASS_NAME:
-    for data_size in DATA_SIZE:
+for class_name in CLASS_NAMES:
+    for data_size in DATA_SIZES:

        log = True

--- a/Retrieval/plot_mrae_xaxis_size.py
+++ b/Retrieval/plot_mrae_xaxis_size.py
@ -1,39 +1,15 @@
 import os.path
-import pickle
-from collections import defaultdict
-from pathlib import Path
-
-import numpy as np
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.linear_model import LogisticRegression
-from sklearn.model_selection import GridSearchCV
-from sklearn.svm import LinearSVC
-
-import quapy as qp
-from Retrieval.commons import RetrievedSamples, load_sample
-from Retrieval.experiments import methods, benchmark_name
-from Retrieval.plot_mrae_xaxis_k import load_all_results
-from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
-from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
-from quapy.data.base import LabelledCollection
-
-from os.path import join
-from tqdm import tqdm
-
-from result_table.src.table import Table
+from Retrieval.experiments import methods
+from Retrieval.commons import CLASS_NAMES, Ks, DATA_SIZES
 import matplotlib.pyplot as plt

-
+from Retrieval.plot_mrae_xaxis_k import load_all_results

 data_home = 'data'
 class_mode = 'multiclass'

 method_names = [name for name, *other in methods(None)]

-# Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
-Ks = [50, 100, 500, 1000]
-DATA_SIZE = ['10K', '50K', '100K', '500K', '1M', 'FULL']
-CLASS_NAME = ['gender', 'continent', 'years_category']
 all_results = {}


@ -44,7 +20,7 @@ results = load_all_results()
 # generates, for each class and each k, the plots for y-axis=MRAE in which:
 # - the x-axis displays the data sizes

-for class_name in CLASS_NAME:
+for class_name in CLASS_NAMES:
    for k in Ks:

        log = True
@ -55,10 +31,10 @@ for class_name in CLASS_NAME:
        for method_name in method_names:
            # class_name -> data_size -> method_name -> k -> stat -> float
            means = [
-                results[class_name][data_size][method_name][k]['mean'] for data_size in DATA_SIZE
+                results[class_name][data_size][method_name][k]['mean'] for data_size in DATA_SIZES
            ]
            stds = [
-                results[class_name][data_size][method_name][k]['std'] for data_size in DATA_SIZE
+                results[class_name][data_size][method_name][k]['std'] for data_size in DATA_SIZES
            ]
            # max_mean = np.max([
            #         results[class_name][data_size][method_name][k]['max'] for data_size in DATA_SIZE
@ -67,7 +43,7 @@ for class_name in CLASS_NAME:
            max_means.append(max(means))

            style = 'o-' if method_name != 'CC' else '--'
-            line = ax.plot(DATA_SIZE, means, style, label=method_name, color=None)
+            line = ax.plot(DATA_SIZES, means, style, label=method_name, color=None)
            color = line[-1].get_color()
            if log:
                ax.set_yscale('log')
--- a/Retrieval/relscore_distribution.py
+++ b/Retrieval/relscore_distribution.py
@ -1,29 +1,9 @@
 import os.path
 import pickle
-from collections import defaultdict
 from itertools import zip_longest
-from pathlib import Path
-
-import numpy as np
-import pandas as pd
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.linear_model import LogisticRegression
-from sklearn.model_selection import GridSearchCV
-from sklearn.svm import LinearSVC
-
-import quapy as qp
-import quapy.functional as F
-from Retrieval.commons import RetrievedSamples, load_sample
-from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
-from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
-from quapy.protocol import AbstractProtocol
-from quapy.data.base import LabelledCollection
-
-from glob import glob
+from Retrieval.commons import RetrievedSamples, load_sample, DATA_SIZES
 from os.path import join
 from tqdm import tqdm
-
-from result_table.src.table import Table
 import numpy as np
 import matplotlib.pyplot as plt

@ -35,12 +15,11 @@ Plots the distribution of (predicted) relevance score for the test samples and f


 data_home = 'data'
-Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]

 for class_name in ['num_sitelinks_category', 'relative_pageviews_category', 'years_category', 'continent', 'gender']:
    test_added = False
    Mtrs, Mtes, source = [], [], []
-    for data_size in ['10K', '50K', '100K', '500K', '1M', 'FULL']:
+    for data_size in DATA_SIZES:

        class_home = join(data_home, class_name, data_size)
        classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl')