Set a rank threshold of 1000 and finalize plots
This commit is contained in:
parent
67ed6e4c6c
commit
2ac48a9798
|
@ -6,6 +6,19 @@ from os.path import join
|
||||||
import quapy.functional as F
|
import quapy.functional as F
|
||||||
|
|
||||||
|
|
||||||
|
Ks = [50, 100, 500, 1000]
|
||||||
|
|
||||||
|
CLASS_NAMES = ['continent', 'gender', 'years_category'] # ['relative_pageviews_category', 'num_sitelinks_category']:
|
||||||
|
|
||||||
|
DATA_SIZES = ['10K', '50K', '100K', '500K', '1M', 'FULL']
|
||||||
|
|
||||||
|
protected_group = {
|
||||||
|
'gender': 'Female',
|
||||||
|
'continent': 'Africa',
|
||||||
|
'years_category': 'Pre-1900s',
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def load_sample(path, class_name):
|
def load_sample(path, class_name):
|
||||||
"""
|
"""
|
||||||
Loads a sample json as a dataframe and returns text and labels for
|
Loads a sample json as a dataframe and returns text and labels for
|
||||||
|
@ -48,7 +61,9 @@ class RetrievedSamples:
|
||||||
self.positive_class = positive_class
|
self.positive_class = positive_class
|
||||||
self.classes = classes
|
self.classes = classes
|
||||||
|
|
||||||
def get_text_label_score(self, df):
|
def get_text_label_score(self, df, filter_rank=1000):
|
||||||
|
df = df[df['rank']<filter_rank]
|
||||||
|
|
||||||
class_name = self.class_name
|
class_name = self.class_name
|
||||||
vectorizer = self.vectorizer
|
vectorizer = self.vectorizer
|
||||||
filter_classes = self.classes
|
filter_classes = self.classes
|
||||||
|
|
|
@ -1,25 +1,14 @@
|
||||||
import os.path
|
|
||||||
import pickle
|
|
||||||
from collections import defaultdict
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
from sklearn.linear_model import LogisticRegression
|
from sklearn.linear_model import LogisticRegression
|
||||||
from sklearn.metrics import confusion_matrix
|
|
||||||
from sklearn.model_selection import GridSearchCV, cross_val_predict
|
from sklearn.model_selection import GridSearchCV, cross_val_predict
|
||||||
from sklearn.base import clone
|
from sklearn.base import clone
|
||||||
from sklearn.svm import LinearSVC
|
|
||||||
from scipy.special import rel_entr as KLD
|
|
||||||
|
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
import quapy.functional as F
|
from Retrieval.commons import *
|
||||||
from Retrieval.commons import RetrievedSamples, load_sample, binarize_labels
|
from Retrieval.methods import *
|
||||||
from Retrieval.methods import M3rND_ModelB, M3rND_ModelD, AbstractM3rND
|
|
||||||
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
|
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
|
||||||
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
|
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
|
||||||
from quapy.data.base import LabelledCollection
|
from quapy.data.base import LabelledCollection
|
||||||
from scipy.sparse import vstack
|
|
||||||
|
|
||||||
from os.path import join
|
from os.path import join
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
@ -62,7 +51,8 @@ def methods(classifier, class_name=None, binarize=False):
|
||||||
'years_category':0.03
|
'years_category':0.03
|
||||||
}
|
}
|
||||||
|
|
||||||
yield ('Naive', Naive())
|
# yield ('Naive', Naive())
|
||||||
|
# yield ('NaiveHalf', Naive())
|
||||||
yield ('NaiveQuery', Naive())
|
yield ('NaiveQuery', Naive())
|
||||||
yield ('CC', ClassifyAndCount(classifier))
|
yield ('CC', ClassifyAndCount(classifier))
|
||||||
# yield ('PCC', PCC(classifier))
|
# yield ('PCC', PCC(classifier))
|
||||||
|
@ -159,10 +149,14 @@ def run_experiment():
|
||||||
|
|
||||||
train_col = LabelledCollection(Xtr, ytr, classes=classifier.classes_)
|
train_col = LabelledCollection(Xtr, ytr, classes=classifier.classes_)
|
||||||
|
|
||||||
if method_name not in ['Naive', 'NaiveQuery', 'M3b', 'M3b+', 'M3d', 'M3d+']:
|
if not method_name.startswith('Naive') and not method_name.startswith('M3'):
|
||||||
method.fit(train_col, val_split=train_col, fit_classifier=False)
|
method.fit(train_col, val_split=train_col, fit_classifier=False)
|
||||||
elif method_name == 'Naive':
|
elif method_name == 'Naive':
|
||||||
method.fit(train_col)
|
method.fit(train_col)
|
||||||
|
elif method_name == 'NaiveHalf':
|
||||||
|
n = len(ytr)//2
|
||||||
|
train_col = LabelledCollection(Xtr[:n], ytr[:n], classes=classifier.classes_)
|
||||||
|
method.fit(train_col)
|
||||||
|
|
||||||
test_col = LabelledCollection(Xte, yte, classes=classifier.classes_)
|
test_col = LabelledCollection(Xte, yte, classes=classifier.classes_)
|
||||||
rKL_estim, rKL_true = [], []
|
rKL_estim, rKL_true = [], []
|
||||||
|
@ -231,17 +225,7 @@ def run_experiment():
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
|
|
||||||
Ks = [50, 100, 500, 1000]
|
|
||||||
CLASS_NAMES = ['years_category', 'continent', 'gender'] # ['relative_pageviews_category', 'num_sitelinks_category']:
|
|
||||||
DATA_SIZES = ['10K', '50K', '100K', '500K', '1M', 'FULL']
|
|
||||||
data_home = 'data'
|
data_home = 'data'
|
||||||
protected_group = {
|
|
||||||
'gender': 'Female',
|
|
||||||
'continent': 'Africa',
|
|
||||||
'years_category': 'Pre-1900s',
|
|
||||||
}
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
@ -249,7 +233,7 @@ if __name__ == '__main__':
|
||||||
# the corresponding rND (for binary) or rKL (for multiclass) score
|
# the corresponding rND (for binary) or rKL (for multiclass) score
|
||||||
tables_RND, tables_DKL = [], []
|
tables_RND, tables_DKL = [], []
|
||||||
tables_final = []
|
tables_final = []
|
||||||
for class_mode in ['binary', 'multiclass']:
|
for class_mode in ['multiclass', 'binary']:
|
||||||
BINARIZE = (class_mode=='binary')
|
BINARIZE = (class_mode=='binary')
|
||||||
method_names = [name for name, *other in methods(None, binarize=BINARIZE)]
|
method_names = [name for name, *other in methods(None, binarize=BINARIZE)]
|
||||||
|
|
||||||
|
|
|
@ -1,25 +1,9 @@
|
||||||
import os.path
|
import os.path
|
||||||
import pickle
|
import pickle
|
||||||
from collections import defaultdict
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
from Retrieval.experiments import methods
|
||||||
from sklearn.linear_model import LogisticRegression
|
from Retrieval.commons import CLASS_NAMES, Ks, DATA_SIZES
|
||||||
from sklearn.model_selection import GridSearchCV
|
|
||||||
from sklearn.svm import LinearSVC
|
|
||||||
|
|
||||||
import quapy as qp
|
|
||||||
from Retrieval.commons import RetrievedSamples, load_sample
|
|
||||||
from Retrieval.experiments import methods, benchmark_name
|
|
||||||
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
|
|
||||||
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
|
|
||||||
from quapy.data.base import LabelledCollection
|
|
||||||
|
|
||||||
from os.path import join
|
from os.path import join
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
from result_table.src.table import Table
|
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
|
||||||
|
@ -29,10 +13,6 @@ class_mode = 'multiclass'
|
||||||
|
|
||||||
method_names = [name for name, *other in methods(None, 'continent')]
|
method_names = [name for name, *other in methods(None, 'continent')]
|
||||||
|
|
||||||
# Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
|
|
||||||
Ks = [50, 100, 500, 1000]
|
|
||||||
DATA_SIZE = ['10K', '50K', '100K', '500K', '1M', 'FULL']
|
|
||||||
CLASS_NAME = ['gender', 'continent', 'years_category']
|
|
||||||
all_results = {}
|
all_results = {}
|
||||||
|
|
||||||
|
|
||||||
|
@ -40,11 +20,11 @@ all_results = {}
|
||||||
# class_name -> data_size -> method_name -> k -> stat -> float
|
# class_name -> data_size -> method_name -> k -> stat -> float
|
||||||
# where stat is "mean", "std", "max"
|
# where stat is "mean", "std", "max"
|
||||||
def load_all_results():
|
def load_all_results():
|
||||||
for class_name in CLASS_NAME:
|
for class_name in CLASS_NAMES:
|
||||||
|
|
||||||
all_results[class_name] = {}
|
all_results[class_name] = {}
|
||||||
|
|
||||||
for data_size in DATA_SIZE:
|
for data_size in DATA_SIZES:
|
||||||
|
|
||||||
all_results[class_name][data_size] = {}
|
all_results[class_name][data_size] = {}
|
||||||
|
|
||||||
|
@ -75,8 +55,8 @@ results = load_all_results()
|
||||||
# generates the class-independent, size-independent plots for y-axis=MRAE in which:
|
# generates the class-independent, size-independent plots for y-axis=MRAE in which:
|
||||||
# - the x-axis displays the Ks
|
# - the x-axis displays the Ks
|
||||||
|
|
||||||
for class_name in CLASS_NAME:
|
for class_name in CLASS_NAMES:
|
||||||
for data_size in DATA_SIZE:
|
for data_size in DATA_SIZES:
|
||||||
|
|
||||||
log = True
|
log = True
|
||||||
|
|
||||||
|
|
|
@ -1,39 +1,15 @@
|
||||||
import os.path
|
import os.path
|
||||||
import pickle
|
from Retrieval.experiments import methods
|
||||||
from collections import defaultdict
|
from Retrieval.commons import CLASS_NAMES, Ks, DATA_SIZES
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
||||||
from sklearn.linear_model import LogisticRegression
|
|
||||||
from sklearn.model_selection import GridSearchCV
|
|
||||||
from sklearn.svm import LinearSVC
|
|
||||||
|
|
||||||
import quapy as qp
|
|
||||||
from Retrieval.commons import RetrievedSamples, load_sample
|
|
||||||
from Retrieval.experiments import methods, benchmark_name
|
|
||||||
from Retrieval.plot_mrae_xaxis_k import load_all_results
|
|
||||||
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
|
|
||||||
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
|
|
||||||
from quapy.data.base import LabelledCollection
|
|
||||||
|
|
||||||
from os.path import join
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
from result_table.src.table import Table
|
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
from Retrieval.plot_mrae_xaxis_k import load_all_results
|
||||||
|
|
||||||
data_home = 'data'
|
data_home = 'data'
|
||||||
class_mode = 'multiclass'
|
class_mode = 'multiclass'
|
||||||
|
|
||||||
method_names = [name for name, *other in methods(None)]
|
method_names = [name for name, *other in methods(None)]
|
||||||
|
|
||||||
# Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
|
|
||||||
Ks = [50, 100, 500, 1000]
|
|
||||||
DATA_SIZE = ['10K', '50K', '100K', '500K', '1M', 'FULL']
|
|
||||||
CLASS_NAME = ['gender', 'continent', 'years_category']
|
|
||||||
all_results = {}
|
all_results = {}
|
||||||
|
|
||||||
|
|
||||||
|
@ -44,7 +20,7 @@ results = load_all_results()
|
||||||
# generates the class-independent, size-independent plots for y-axis=MRAE in which:
|
# generates the class-independent, size-independent plots for y-axis=MRAE in which:
|
||||||
# - the x-axis displays the Ks
|
# - the x-axis displays the Ks
|
||||||
|
|
||||||
for class_name in CLASS_NAME:
|
for class_name in CLASS_NAMES:
|
||||||
for k in Ks:
|
for k in Ks:
|
||||||
|
|
||||||
log = True
|
log = True
|
||||||
|
@ -55,10 +31,10 @@ for class_name in CLASS_NAME:
|
||||||
for method_name in method_names:
|
for method_name in method_names:
|
||||||
# class_name -> data_size -> method_name -> k -> stat -> float
|
# class_name -> data_size -> method_name -> k -> stat -> float
|
||||||
means = [
|
means = [
|
||||||
results[class_name][data_size][method_name][k]['mean'] for data_size in DATA_SIZE
|
results[class_name][data_size][method_name][k]['mean'] for data_size in DATA_SIZES
|
||||||
]
|
]
|
||||||
stds = [
|
stds = [
|
||||||
results[class_name][data_size][method_name][k]['std'] for data_size in DATA_SIZE
|
results[class_name][data_size][method_name][k]['std'] for data_size in DATA_SIZES
|
||||||
]
|
]
|
||||||
# max_mean = np.max([
|
# max_mean = np.max([
|
||||||
# results[class_name][data_size][method_name][k]['max'] for data_size in DATA_SIZE
|
# results[class_name][data_size][method_name][k]['max'] for data_size in DATA_SIZE
|
||||||
|
@ -67,7 +43,7 @@ for class_name in CLASS_NAME:
|
||||||
max_means.append(max(means))
|
max_means.append(max(means))
|
||||||
|
|
||||||
style = 'o-' if method_name != 'CC' else '--'
|
style = 'o-' if method_name != 'CC' else '--'
|
||||||
line = ax.plot(DATA_SIZE, means, style, label=method_name, color=None)
|
line = ax.plot(DATA_SIZES, means, style, label=method_name, color=None)
|
||||||
color = line[-1].get_color()
|
color = line[-1].get_color()
|
||||||
if log:
|
if log:
|
||||||
ax.set_yscale('log')
|
ax.set_yscale('log')
|
||||||
|
|
|
@ -1,29 +1,9 @@
|
||||||
import os.path
|
import os.path
|
||||||
import pickle
|
import pickle
|
||||||
from collections import defaultdict
|
|
||||||
from itertools import zip_longest
|
from itertools import zip_longest
|
||||||
from pathlib import Path
|
from Retrieval.commons import RetrievedSamples, load_sample, DATA_SIZES
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
||||||
from sklearn.linear_model import LogisticRegression
|
|
||||||
from sklearn.model_selection import GridSearchCV
|
|
||||||
from sklearn.svm import LinearSVC
|
|
||||||
|
|
||||||
import quapy as qp
|
|
||||||
import quapy.functional as F
|
|
||||||
from Retrieval.commons import RetrievedSamples, load_sample
|
|
||||||
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
|
|
||||||
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
|
|
||||||
from quapy.protocol import AbstractProtocol
|
|
||||||
from quapy.data.base import LabelledCollection
|
|
||||||
|
|
||||||
from glob import glob
|
|
||||||
from os.path import join
|
from os.path import join
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from result_table.src.table import Table
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
@ -35,12 +15,11 @@ Plots the distribution of (predicted) relevance score for the test samples and f
|
||||||
|
|
||||||
|
|
||||||
data_home = 'data'
|
data_home = 'data'
|
||||||
Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
|
|
||||||
|
|
||||||
for class_name in ['num_sitelinks_category', 'relative_pageviews_category', 'years_category', 'continent', 'gender']:
|
for class_name in ['num_sitelinks_category', 'relative_pageviews_category', 'years_category', 'continent', 'gender']:
|
||||||
test_added = False
|
test_added = False
|
||||||
Mtrs, Mtes, source = [], [], []
|
Mtrs, Mtes, source = [], [], []
|
||||||
for data_size in ['10K', '50K', '100K', '500K', '1M', 'FULL']:
|
for data_size in DATA_SIZES:
|
||||||
|
|
||||||
class_home = join(data_home, class_name, data_size)
|
class_home = join(data_home, class_name, data_size)
|
||||||
classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl')
|
classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl')
|
||||||
|
|
Loading…
Reference in New Issue