setting a rank threshold to 1000, and finalizing plots
This commit is contained in:
parent
67ed6e4c6c
commit
2ac48a9798
|
@ -6,6 +6,19 @@ from os.path import join
|
|||
import quapy.functional as F
|
||||
|
||||
|
||||
Ks = [50, 100, 500, 1000]
|
||||
|
||||
CLASS_NAMES = ['continent', 'gender', 'years_category'] # ['relative_pageviews_category', 'num_sitelinks_category']:
|
||||
|
||||
DATA_SIZES = ['10K', '50K', '100K', '500K', '1M', 'FULL']
|
||||
|
||||
protected_group = {
|
||||
'gender': 'Female',
|
||||
'continent': 'Africa',
|
||||
'years_category': 'Pre-1900s',
|
||||
}
|
||||
|
||||
|
||||
def load_sample(path, class_name):
|
||||
"""
|
||||
Loads a sample json as a dataframe and returns text and labels for
|
||||
|
@ -48,7 +61,9 @@ class RetrievedSamples:
|
|||
self.positive_class = positive_class
|
||||
self.classes = classes
|
||||
|
||||
def get_text_label_score(self, df):
|
||||
def get_text_label_score(self, df, filter_rank=1000):
|
||||
df = df[df['rank']<filter_rank]
|
||||
|
||||
class_name = self.class_name
|
||||
vectorizer = self.vectorizer
|
||||
filter_classes = self.classes
|
||||
|
|
|
@ -1,25 +1,14 @@
|
|||
import os.path
|
||||
import pickle
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.metrics import confusion_matrix
|
||||
from sklearn.model_selection import GridSearchCV, cross_val_predict
|
||||
from sklearn.base import clone
|
||||
from sklearn.svm import LinearSVC
|
||||
from scipy.special import rel_entr as KLD
|
||||
|
||||
import quapy as qp
|
||||
import quapy.functional as F
|
||||
from Retrieval.commons import RetrievedSamples, load_sample, binarize_labels
|
||||
from Retrieval.methods import M3rND_ModelB, M3rND_ModelD, AbstractM3rND
|
||||
from Retrieval.commons import *
|
||||
from Retrieval.methods import *
|
||||
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
|
||||
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
|
||||
from quapy.data.base import LabelledCollection
|
||||
from scipy.sparse import vstack
|
||||
|
||||
from os.path import join
|
||||
from tqdm import tqdm
|
||||
|
@ -62,7 +51,8 @@ def methods(classifier, class_name=None, binarize=False):
|
|||
'years_category':0.03
|
||||
}
|
||||
|
||||
yield ('Naive', Naive())
|
||||
# yield ('Naive', Naive())
|
||||
# yield ('NaiveHalf', Naive())
|
||||
yield ('NaiveQuery', Naive())
|
||||
yield ('CC', ClassifyAndCount(classifier))
|
||||
# yield ('PCC', PCC(classifier))
|
||||
|
@ -159,10 +149,14 @@ def run_experiment():
|
|||
|
||||
train_col = LabelledCollection(Xtr, ytr, classes=classifier.classes_)
|
||||
|
||||
if method_name not in ['Naive', 'NaiveQuery', 'M3b', 'M3b+', 'M3d', 'M3d+']:
|
||||
if not method_name.startswith('Naive') and not method_name.startswith('M3'):
|
||||
method.fit(train_col, val_split=train_col, fit_classifier=False)
|
||||
elif method_name == 'Naive':
|
||||
method.fit(train_col)
|
||||
elif method_name == 'NaiveHalf':
|
||||
n = len(ytr)//2
|
||||
train_col = LabelledCollection(Xtr[:n], ytr[:n], classes=classifier.classes_)
|
||||
method.fit(train_col)
|
||||
|
||||
test_col = LabelledCollection(Xte, yte, classes=classifier.classes_)
|
||||
rKL_estim, rKL_true = [], []
|
||||
|
@ -231,17 +225,7 @@ def run_experiment():
|
|||
return results
|
||||
|
||||
|
||||
|
||||
# Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
|
||||
Ks = [50, 100, 500, 1000]
|
||||
CLASS_NAMES = ['years_category', 'continent', 'gender'] # ['relative_pageviews_category', 'num_sitelinks_category']:
|
||||
DATA_SIZES = ['10K', '50K', '100K', '500K', '1M', 'FULL']
|
||||
data_home = 'data'
|
||||
protected_group = {
|
||||
'gender': 'Female',
|
||||
'continent': 'Africa',
|
||||
'years_category': 'Pre-1900s',
|
||||
}
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
|
@ -249,7 +233,7 @@ if __name__ == '__main__':
|
|||
# the corresponding rND (for binary) or rKL (for multiclass) score
|
||||
tables_RND, tables_DKL = [], []
|
||||
tables_final = []
|
||||
for class_mode in ['binary', 'multiclass']:
|
||||
for class_mode in ['multiclass', 'binary']:
|
||||
BINARIZE = (class_mode=='binary')
|
||||
method_names = [name for name, *other in methods(None, binarize=BINARIZE)]
|
||||
|
||||
|
|
|
@ -1,25 +1,9 @@
|
|||
import os.path
|
||||
import pickle
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.svm import LinearSVC
|
||||
|
||||
import quapy as qp
|
||||
from Retrieval.commons import RetrievedSamples, load_sample
|
||||
from Retrieval.experiments import methods, benchmark_name
|
||||
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
|
||||
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
|
||||
from quapy.data.base import LabelledCollection
|
||||
|
||||
from Retrieval.experiments import methods
|
||||
from Retrieval.commons import CLASS_NAMES, Ks, DATA_SIZES
|
||||
from os.path import join
|
||||
from tqdm import tqdm
|
||||
|
||||
from result_table.src.table import Table
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
|
@ -29,10 +13,6 @@ class_mode = 'multiclass'
|
|||
|
||||
method_names = [name for name, *other in methods(None, 'continent')]
|
||||
|
||||
# Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
|
||||
Ks = [50, 100, 500, 1000]
|
||||
DATA_SIZE = ['10K', '50K', '100K', '500K', '1M', 'FULL']
|
||||
CLASS_NAME = ['gender', 'continent', 'years_category']
|
||||
all_results = {}
|
||||
|
||||
|
||||
|
@ -40,11 +20,11 @@ all_results = {}
|
|||
# class_name -> data_size -> method_name -> k -> stat -> float
|
||||
# where stat is "mean", "std", "max"
|
||||
def load_all_results():
|
||||
for class_name in CLASS_NAME:
|
||||
for class_name in CLASS_NAMES:
|
||||
|
||||
all_results[class_name] = {}
|
||||
|
||||
for data_size in DATA_SIZE:
|
||||
for data_size in DATA_SIZES:
|
||||
|
||||
all_results[class_name][data_size] = {}
|
||||
|
||||
|
@ -75,8 +55,8 @@ results = load_all_results()
|
|||
# generates the class-independent, size-independent plots for y-axis=MRAE in which:
|
||||
# - the x-axis displays the Ks
|
||||
|
||||
for class_name in CLASS_NAME:
|
||||
for data_size in DATA_SIZE:
|
||||
for class_name in CLASS_NAMES:
|
||||
for data_size in DATA_SIZES:
|
||||
|
||||
log = True
|
||||
|
||||
|
|
|
@ -1,39 +1,15 @@
|
|||
import os.path
|
||||
import pickle
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.svm import LinearSVC
|
||||
|
||||
import quapy as qp
|
||||
from Retrieval.commons import RetrievedSamples, load_sample
|
||||
from Retrieval.experiments import methods, benchmark_name
|
||||
from Retrieval.plot_mrae_xaxis_k import load_all_results
|
||||
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
|
||||
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
|
||||
from quapy.data.base import LabelledCollection
|
||||
|
||||
from os.path import join
|
||||
from tqdm import tqdm
|
||||
|
||||
from result_table.src.table import Table
|
||||
from Retrieval.experiments import methods
|
||||
from Retrieval.commons import CLASS_NAMES, Ks, DATA_SIZES
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
from Retrieval.plot_mrae_xaxis_k import load_all_results
|
||||
|
||||
data_home = 'data'
|
||||
class_mode = 'multiclass'
|
||||
|
||||
method_names = [name for name, *other in methods(None)]
|
||||
|
||||
# Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
|
||||
Ks = [50, 100, 500, 1000]
|
||||
DATA_SIZE = ['10K', '50K', '100K', '500K', '1M', 'FULL']
|
||||
CLASS_NAME = ['gender', 'continent', 'years_category']
|
||||
all_results = {}
|
||||
|
||||
|
||||
|
@ -44,7 +20,7 @@ results = load_all_results()
|
|||
# generates the class-independent, k-independent plots for y-axis=MRAE in which:
|
||||
# - the x-axis displays the data sizes
|
||||
|
||||
for class_name in CLASS_NAME:
|
||||
for class_name in CLASS_NAMES:
|
||||
for k in Ks:
|
||||
|
||||
log = True
|
||||
|
@ -55,10 +31,10 @@ for class_name in CLASS_NAME:
|
|||
for method_name in method_names:
|
||||
# class_name -> data_size -> method_name -> k -> stat -> float
|
||||
means = [
|
||||
results[class_name][data_size][method_name][k]['mean'] for data_size in DATA_SIZE
|
||||
results[class_name][data_size][method_name][k]['mean'] for data_size in DATA_SIZES
|
||||
]
|
||||
stds = [
|
||||
results[class_name][data_size][method_name][k]['std'] for data_size in DATA_SIZE
|
||||
results[class_name][data_size][method_name][k]['std'] for data_size in DATA_SIZES
|
||||
]
|
||||
# max_mean = np.max([
|
||||
# results[class_name][data_size][method_name][k]['max'] for data_size in DATA_SIZE
|
||||
|
@ -67,7 +43,7 @@ for class_name in CLASS_NAME:
|
|||
max_means.append(max(means))
|
||||
|
||||
style = 'o-' if method_name != 'CC' else '--'
|
||||
line = ax.plot(DATA_SIZE, means, style, label=method_name, color=None)
|
||||
line = ax.plot(DATA_SIZES, means, style, label=method_name, color=None)
|
||||
color = line[-1].get_color()
|
||||
if log:
|
||||
ax.set_yscale('log')
|
||||
|
|
|
@ -1,29 +1,9 @@
|
|||
import os.path
|
||||
import pickle
|
||||
from collections import defaultdict
|
||||
from itertools import zip_longest
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.svm import LinearSVC
|
||||
|
||||
import quapy as qp
|
||||
import quapy.functional as F
|
||||
from Retrieval.commons import RetrievedSamples, load_sample
|
||||
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
|
||||
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
|
||||
from quapy.protocol import AbstractProtocol
|
||||
from quapy.data.base import LabelledCollection
|
||||
|
||||
from glob import glob
|
||||
from Retrieval.commons import RetrievedSamples, load_sample, DATA_SIZES
|
||||
from os.path import join
|
||||
from tqdm import tqdm
|
||||
|
||||
from result_table.src.table import Table
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
@ -35,12 +15,11 @@ Plots the distribution of (predicted) relevance score for the test samples and f
|
|||
|
||||
|
||||
data_home = 'data'
|
||||
Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
|
||||
|
||||
for class_name in ['num_sitelinks_category', 'relative_pageviews_category', 'years_category', 'continent', 'gender']:
|
||||
test_added = False
|
||||
Mtrs, Mtes, source = [], [], []
|
||||
for data_size in ['10K', '50K', '100K', '500K', '1M', 'FULL']:
|
||||
for data_size in DATA_SIZES:
|
||||
|
||||
class_home = join(data_home, class_name, data_size)
|
||||
classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl')
|
||||
|
|
Loading…
Reference in New Issue