setting a rank threshold to 1000, and finalizing plots

This commit is contained in:
Alejandro Moreo Fernandez 2024-05-10 15:46:13 +02:00
parent 67ed6e4c6c
commit 2ac48a9798
5 changed files with 41 additions and 107 deletions

View File

@ -6,6 +6,19 @@ from os.path import join
import quapy.functional as F
Ks = [50, 100, 500, 1000]
CLASS_NAMES = ['continent', 'gender', 'years_category'] # ['relative_pageviews_category', 'num_sitelinks_category']:
DATA_SIZES = ['10K', '50K', '100K', '500K', '1M', 'FULL']
protected_group = {
'gender': 'Female',
'continent': 'Africa',
'years_category': 'Pre-1900s',
}
def load_sample(path, class_name):
"""
Loads a sample json as a dataframe and returns text and labels for
@ -48,7 +61,9 @@ class RetrievedSamples:
self.positive_class = positive_class
self.classes = classes
def get_text_label_score(self, df):
def get_text_label_score(self, df, filter_rank=1000):
df = df[df['rank']<filter_rank]
class_name = self.class_name
vectorizer = self.vectorizer
filter_classes = self.classes

View File

@ -1,25 +1,14 @@
import os.path
import pickle
from collections import defaultdict
from pathlib import Path
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.base import clone
from sklearn.svm import LinearSVC
from scipy.special import rel_entr as KLD
import quapy as qp
import quapy.functional as F
from Retrieval.commons import RetrievedSamples, load_sample, binarize_labels
from Retrieval.methods import M3rND_ModelB, M3rND_ModelD, AbstractM3rND
from Retrieval.commons import *
from Retrieval.methods import *
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.data.base import LabelledCollection
from scipy.sparse import vstack
from os.path import join
from tqdm import tqdm
@ -62,7 +51,8 @@ def methods(classifier, class_name=None, binarize=False):
'years_category':0.03
}
yield ('Naive', Naive())
# yield ('Naive', Naive())
# yield ('NaiveHalf', Naive())
yield ('NaiveQuery', Naive())
yield ('CC', ClassifyAndCount(classifier))
# yield ('PCC', PCC(classifier))
@ -159,10 +149,14 @@ def run_experiment():
train_col = LabelledCollection(Xtr, ytr, classes=classifier.classes_)
if method_name not in ['Naive', 'NaiveQuery', 'M3b', 'M3b+', 'M3d', 'M3d+']:
if not method_name.startswith('Naive') and not method_name.startswith('M3'):
method.fit(train_col, val_split=train_col, fit_classifier=False)
elif method_name == 'Naive':
method.fit(train_col)
elif method_name == 'NaiveHalf':
n = len(ytr)//2
train_col = LabelledCollection(Xtr[:n], ytr[:n], classes=classifier.classes_)
method.fit(train_col)
test_col = LabelledCollection(Xte, yte, classes=classifier.classes_)
rKL_estim, rKL_true = [], []
@ -231,17 +225,7 @@ def run_experiment():
return results
# Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
Ks = [50, 100, 500, 1000]
CLASS_NAMES = ['years_category', 'continent', 'gender'] # ['relative_pageviews_category', 'num_sitelinks_category']:
DATA_SIZES = ['10K', '50K', '100K', '500K', '1M', 'FULL']
data_home = 'data'
protected_group = {
'gender': 'Female',
'continent': 'Africa',
'years_category': 'Pre-1900s',
}
if __name__ == '__main__':
@ -249,7 +233,7 @@ if __name__ == '__main__':
# the corresponding rND (for binary) or rKL (for multiclass) score
tables_RND, tables_DKL = [], []
tables_final = []
for class_mode in ['binary', 'multiclass']:
for class_mode in ['multiclass', 'binary']:
BINARIZE = (class_mode=='binary')
method_names = [name for name, *other in methods(None, binarize=BINARIZE)]

View File

@ -1,25 +1,9 @@
import os.path
import pickle
from collections import defaultdict
from pathlib import Path
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import quapy as qp
from Retrieval.commons import RetrievedSamples, load_sample
from Retrieval.experiments import methods, benchmark_name
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.data.base import LabelledCollection
from Retrieval.experiments import methods
from Retrieval.commons import CLASS_NAMES, Ks, DATA_SIZES
from os.path import join
from tqdm import tqdm
from result_table.src.table import Table
import matplotlib.pyplot as plt
@ -29,10 +13,6 @@ class_mode = 'multiclass'
method_names = [name for name, *other in methods(None, 'continent')]
# Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
Ks = [50, 100, 500, 1000]
DATA_SIZE = ['10K', '50K', '100K', '500K', '1M', 'FULL']
CLASS_NAME = ['gender', 'continent', 'years_category']
all_results = {}
@ -40,11 +20,11 @@ all_results = {}
# class_name -> data_size -> method_name -> k -> stat -> float
# where stat is "mean", "std", "max"
def load_all_results():
for class_name in CLASS_NAME:
for class_name in CLASS_NAMES:
all_results[class_name] = {}
for data_size in DATA_SIZE:
for data_size in DATA_SIZES:
all_results[class_name][data_size] = {}
@ -75,8 +55,8 @@ results = load_all_results()
# generates the class-independent, size-independent plots for y-axis=MRAE in which:
# - the x-axis displays the Ks
for class_name in CLASS_NAME:
for data_size in DATA_SIZE:
for class_name in CLASS_NAMES:
for data_size in DATA_SIZES:
log = True

View File

@ -1,39 +1,15 @@
import os.path
import pickle
from collections import defaultdict
from pathlib import Path
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import quapy as qp
from Retrieval.commons import RetrievedSamples, load_sample
from Retrieval.experiments import methods, benchmark_name
from Retrieval.plot_mrae_xaxis_k import load_all_results
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.data.base import LabelledCollection
from os.path import join
from tqdm import tqdm
from result_table.src.table import Table
from Retrieval.experiments import methods
from Retrieval.commons import CLASS_NAMES, Ks, DATA_SIZES
import matplotlib.pyplot as plt
from Retrieval.plot_mrae_xaxis_k import load_all_results
data_home = 'data'
class_mode = 'multiclass'
method_names = [name for name, *other in methods(None)]
# Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
Ks = [50, 100, 500, 1000]
DATA_SIZE = ['10K', '50K', '100K', '500K', '1M', 'FULL']
CLASS_NAME = ['gender', 'continent', 'years_category']
all_results = {}
@ -44,7 +20,7 @@ results = load_all_results()
# generates the plots for y-axis=MRAE (looping over every class and every k) in which:
# - the x-axis displays the data sizes
for class_name in CLASS_NAME:
for class_name in CLASS_NAMES:
for k in Ks:
log = True
@ -55,10 +31,10 @@ for class_name in CLASS_NAME:
for method_name in method_names:
# class_name -> data_size -> method_name -> k -> stat -> float
means = [
results[class_name][data_size][method_name][k]['mean'] for data_size in DATA_SIZE
results[class_name][data_size][method_name][k]['mean'] for data_size in DATA_SIZES
]
stds = [
results[class_name][data_size][method_name][k]['std'] for data_size in DATA_SIZE
results[class_name][data_size][method_name][k]['std'] for data_size in DATA_SIZES
]
# max_mean = np.max([
# results[class_name][data_size][method_name][k]['max'] for data_size in DATA_SIZE
@ -67,7 +43,7 @@ for class_name in CLASS_NAME:
max_means.append(max(means))
style = 'o-' if method_name != 'CC' else '--'
line = ax.plot(DATA_SIZE, means, style, label=method_name, color=None)
line = ax.plot(DATA_SIZES, means, style, label=method_name, color=None)
color = line[-1].get_color()
if log:
ax.set_yscale('log')

View File

@ -1,29 +1,9 @@
import os.path
import pickle
from collections import defaultdict
from itertools import zip_longest
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import quapy as qp
import quapy.functional as F
from Retrieval.commons import RetrievedSamples, load_sample
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from Retrieval.commons import RetrievedSamples, load_sample, DATA_SIZES
from os.path import join
from tqdm import tqdm
from result_table.src.table import Table
import numpy as np
import matplotlib.pyplot as plt
@ -35,12 +15,11 @@ Plots the distribution of (predicted) relevance score for the test samples and f
data_home = 'data'
Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
for class_name in ['num_sitelinks_category', 'relative_pageviews_category', 'years_category', 'continent', 'gender']:
test_added = False
Mtrs, Mtes, source = [], [], []
for data_size in ['10K', '50K', '100K', '500K', '1M', 'FULL']:
for data_size in DATA_SIZES:
class_home = join(data_home, class_name, data_size)
classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl')