Merge branch 'devel' of github.com:HLT-ISTI/QuaPy into devel
Commit 817aab1d99
@@ -69,6 +69,9 @@ instance/
 # Scrapy stuff:
 .scrapy
 
+# vscode config:
+.vscode/
+
 # Sphinx documentation
 docs/_build/
 
@@ -85,6 +88,11 @@ ipython_config.py
 # pyenv
 .python-version
 
+# poetry
+poetry.toml
+pyproject.toml
+poetry.lock
+
 # pipenv
 # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 # However, in case of collaboration, if having platform-specific dependencies or dependencies
@@ -29,12 +29,17 @@ def newLR():
 
 
 def calibratedLR():
-    return CalibratedClassifierCV(LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1))
+    return CalibratedClassifierCV(newLR())
 
 
 __C_range = np.logspace(-3, 3, 7)
-lr_params = {'classifier__C': __C_range, 'classifier__class_weight': [None, 'balanced']}
-svmperf_params = {'classifier__C': __C_range}
+lr_params = {
+    'classifier__C': __C_range,
+    'classifier__class_weight': [None, 'balanced']
+}
+svmperf_params = {
+    'classifier__C': __C_range
+}
 
 
 def quantification_models():
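The `classifier__` prefix in these grids follows sklearn's nested-parameter convention: each entry is routed to the classifier wrapped inside the quantifier rather than to the quantifier itself. A minimal sketch of what this naming buys (the PACC/LogisticRegression pairing below is illustrative, not part of the diff):

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from quapy.method.aggregative import PACC

    __C_range = np.logspace(-3, 3, 7)
    lr_params = {'classifier__C': __C_range,
                 'classifier__class_weight': [None, 'balanced']}

    quantifier = PACC(LogisticRegression(max_iter=1000))
    # model selection applies one grid point at a time; the prefix routes the
    # values to the embedded LogisticRegression via sklearn's set_params
    quantifier.set_params(classifier__C=__C_range[0], classifier__class_weight='balanced')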
@@ -0,0 +1,124 @@
+import pickle
+import os
+from time import time
+from collections import defaultdict
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+import quapy as qp
+from quapy.method.aggregative import PACC, EMQ, KDEyML
+from quapy.model_selection import GridSearchQ
+from quapy.protocol import UPP
+from pathlib import Path
+
+
+SEED = 1
+
+
+def newLR():
+    return LogisticRegression(max_iter=3000)
+
+# typical hyperparameters explored for Logistic Regression
+logreg_grid = {
+    'C': np.logspace(-3, 3, 7),
+    'class_weight': ['balanced', None]
+}
+
+def wrap_hyper(classifier_hyper_grid: dict):
+    return {'classifier__' + k: v for k, v in classifier_hyper_grid.items()}
+
+METHODS = [
+    ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)),
+    ('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)),
+    # ('KDEy-ML', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.linspace(0.01, 0.2, 20)}}),
+]
+
+
+def show_results(result_path):
+    import pandas as pd
+    df = pd.read_csv(result_path + '.csv', sep='\t')
+    pd.set_option('display.max_columns', None)
+    pd.set_option('display.max_rows', None)
+    pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE", "t_train"], margins=True)
+    print(pv)
+
+def load_timings(result_path):
+    import pandas as pd
+    timings = defaultdict(lambda: {})
+    if not Path(result_path + '.csv').exists():
+        return timings
+
+    df = pd.read_csv(result_path + '.csv', sep='\t')
+    return timings | df.pivot_table(index='Dataset', columns='Method', values='t_train').to_dict()
+
+if __name__ == '__main__':
+
+    qp.environ['SAMPLE_SIZE'] = 500
+    qp.environ['N_JOBS'] = -1
+    n_bags_val = 250
+    n_bags_test = 1000
+    result_dir = f'results/ucimulti'
+
+    os.makedirs(result_dir, exist_ok=True)
+
+    global_result_path = f'{result_dir}/allmethods'
+    timings = load_timings(global_result_path)
+    with open(global_result_path + '.csv', 'wt') as csv:
+        csv.write(f'Method\tDataset\tMAE\tMRAE\tt_train\n')
+
+    for method_name, quantifier, param_grid in METHODS:
+
+        print('Init method', method_name)
+
+        with open(global_result_path + '.csv', 'at') as csv:
+
+            for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
+
+                print('init', dataset)
+
+                local_result_path = os.path.join(Path(global_result_path).parent, method_name + '_' + dataset + '.dataframe')
+
+                if os.path.exists(local_result_path):
+                    print(f'result file {local_result_path} already exists; skipping')
+                    report = qp.util.load_report(local_result_path)
+
+                else:
+                    with qp.util.temp_seed(SEED):
+
+                        data = qp.datasets.fetch_UCIMulticlassDataset(dataset, verbose=True)
+
+                        # model selection
+                        train, test = data.train_test
+                        train, val = train.split_stratified(random_state=SEED)
+
+                        protocol = UPP(val, repeats=n_bags_val)
+                        modsel = GridSearchQ(
+                            quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error='mae'
+                        )
+
+                        t_init = time()
+                        try:
+                            modsel.fit(train)
+
+                            print(f'best params {modsel.best_params_}')
+                            print(f'best score {modsel.best_score_}')
+
+                            quantifier = modsel.best_model()
+                        except:
+                            print('something went wrong... trying to fit the default model')
+                            quantifier.fit(train)
+                        timings[method_name][dataset] = time() - t_init
+
+                        protocol = UPP(test, repeats=n_bags_test)
+                        report = qp.evaluation.evaluation_report(
+                            quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True
+                        )
+                        report.to_csv(local_result_path)
+
+                means = report.mean(numeric_only=True)
+                csv.write(f'{method_name}\t{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{timings[method_name][dataset]:.3f}\n')
+                csv.flush()
+
+    show_results(global_result_path)
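For reference, a small self-contained sketch (with made-up numbers) of the tab-separated layout this script appends per method/dataset pair, and of the pivot that show_results() prints from it:

    import io
    import pandas as pd

    # hypothetical contents of results/ucimulti/allmethods.csv
    tsv = (
        "Method\tDataset\tMAE\tMRAE\tt_train\n"
        "PACC\tdry-bean\t0.02134\t0.11852\t12.402\n"
        "EMQ\tdry-bean\t0.01987\t0.10311\t9.871\n"
    )
    df = pd.read_csv(io.StringIO(tsv), sep='\t')
    # one row per dataset, one column block per metric and method, plus margins
    print(df.pivot_table(index='Dataset', columns='Method',
                         values=['MAE', 'MRAE', 't_train'], margins=True))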
@@ -1,20 +1,17 @@
 import os
 import pandas as pd
 import math
 
 from quapy.data import LabelledCollection
 from quapy.protocol import AbstractProtocol
 from pathlib import Path
 
 
 def get_sample_list(path_dir):
-    """Gets a sample list finding the csv files in a directory
+    """
+    Gets a sample list finding the csv files in a directory
 
-    Args:
-        path_dir (_type_): directory to look for samples
-
-    Returns:
-        _type_: list of samples
+    :param path_dir: directory to look for samples
+    :return: list of samples
     """
     samples = []
     for filename in sorted(os.listdir(path_dir)):
@@ -23,18 +20,15 @@ def get_sample_list(path_dir):
     return samples
 
 
-def generate_modelselection_split(samples, split=0.3):
-    """This function generates a train/test split for model selection
+def generate_modelselection_split(samples, test_prop=0.3):
+    """This function generates a train/test partition for model selection
     without the use of random numbers so the split is always the same
 
-    Args:
-        samples (_type_): list of samples
-        split (float, optional): percentage saved for test. Defaults to 0.3.
-
-    Returns:
-        _type_: list of samples to use as train and list of samples to use as test
+    :param samples: list of samples
+    :param test_prop: float, percentage saved for test. Defaults to 0.3.
+    :return: list of samples to use as train and list of samples to use as test
     """
-    num_items_to_pick = math.ceil(len(samples) * split)
+    num_items_to_pick = math.ceil(len(samples) * test_prop)
     step_size = math.floor(len(samples) / num_items_to_pick)
     test_indices = [i * step_size for i in range(num_items_to_pick)]
     test = [samples[i] for i in test_indices]
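The renamed test_prop keeps the split fully deterministic: test items are taken at a fixed stride, with no RNG involved. A quick worked example with hypothetical sample names:

    import math

    # with 10 samples and test_prop=0.3: ceil(10*0.3)=3 picks, stride floor(10/3)=3
    samples = [f'sample_{i}.csv' for i in range(10)]  # hypothetical file names
    num_items_to_pick = math.ceil(len(samples) * 0.3)
    step_size = math.floor(len(samples) / num_items_to_pick)
    test_indices = [i * step_size for i in range(num_items_to_pick)]
    print(test_indices)  # [0, 3, 6] on every run; the rest form the train set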
@@ -14,12 +14,17 @@ from quapy.util import download_file_if_not_exists, download_file, get_quapy_home
 
 
 REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']
-TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders',
+TWITTER_SENTIMENT_DATASETS_TEST = [
+    'gasp', 'hcr', 'omd', 'sanders',
     'semeval13', 'semeval14', 'semeval15', 'semeval16',
-    'sst', 'wa', 'wb']
-TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders',
+    'sst', 'wa', 'wb',
+]
+TWITTER_SENTIMENT_DATASETS_TRAIN = [
+    'gasp', 'hcr', 'omd', 'sanders',
     'semeval', 'semeval16',
-    'sst', 'wa', 'wb']
+    'sst', 'wa', 'wb',
+]
 UCI_BINARY_DATASETS = [
     #'acute.a', 'acute.b',
     'balance.1',
@@ -44,14 +49,37 @@ UCI_BINARY_DATASETS = [
     'transfusion',
     'wdbc',
     'wine.1', 'wine.2', 'wine.3',
-    'wine-q-red', 'wine-q-white',
-    'yeast']
+    'wine-q-red',
+    'wine-q-white',
+    'yeast',
+]
 
-UCI_MULTICLASS_DATASETS = ['dry-bean',
+UCI_MULTICLASS_DATASETS = [
+    'dry-bean',
     'wine-quality',
     'academic-success',
     'digits',
-    'letter']
+    'letter',
+    'abalone',
+    'obesity',
+    'nursery',
+    'yeast',
+    'hand_digits',
+    'satellite',
+    'shuttle',
+    'cmc',
+    'isolet',
+    'waveform-v1',
+    'molecular',
+    'poker_hand',
+    'connect-4',
+    'mhr',
+    'chess',
+    'page_block',
+    'phishing',
+    'image_seg',
+    'hcv',
+]
 
 LEQUA2022_VECTOR_TASKS = ['T1A', 'T1B']
 LEQUA2022_TEXT_TASKS = ['T2A', 'T2B']
@@ -561,7 +589,13 @@ def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, verbose=False):
     return data
 
 
-def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
+def fetch_UCIMulticlassDataset(
+        dataset_name,
+        data_home=None,
+        min_test_split=0.3,
+        max_train_instances=25000,
+        min_class_support=100,
+        verbose=False) -> Dataset:
     """
     Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`.
 
@@ -583,15 +617,28 @@ def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
     :param dataset_name: a dataset name
     :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
         ~/quay_data/ directory)
-    :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
+    :param min_test_split: minimum proportion of instances to be included in the test set. This value is interpreted
+        as a minimum proportion, meaning that the real proportion could be higher in case the training proportion
+        (1-`min_test_split`, as a fraction of the instances) surpasses `max_train_instances`. In such a case, only
+        `max_train_instances` are taken for training, and the rest (irrespective of `min_test_split`) is taken for test.
+    :param max_train_instances: maximum number of instances to keep for training (defaults to 25000)
+    :param min_class_support: minimum number of instances per class. Classes with fewer instances
+        are discarded (default is 100)
     :param verbose: set to True (default is False) to get information (stats) about the dataset
     :return: a :class:`quapy.data.base.Dataset` instance
     """
-    data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose)
-    return Dataset(*data.split_stratified(1 - test_split, random_state=0), name=dataset_name)
+    data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, min_class_support, verbose=verbose)
+    n = len(data)
+    train_prop = (1. - min_test_split)
+    n_train = int(n * train_prop)
+    if n_train > max_train_instances:
+        train_prop = (max_train_instances / n)
+
+    return Dataset(*data.split_stratified(train_prop, random_state=0))
 
 
-def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
+def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_class_support=100, verbose=False) -> LabelledCollection:
     """
     Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`.
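A worked example of the capping logic above, with hypothetical sizes:

    # with n=100000 instances and min_test_split=0.3, the nominal training share
    # would be 70000 instances; since that exceeds max_train_instances=25000, the
    # training proportion drops to 25000/100000 = 0.25 and 75% goes to test
    n = 100_000
    min_test_split, max_train_instances = 0.3, 25_000
    train_prop = 1. - min_test_split
    if int(n * train_prop) > max_train_instances:
        train_prop = max_train_instances / n
    print(train_prop)  # 0.25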
@@ -613,7 +660,9 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
     :param dataset_name: a dataset name
     :param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use the default
         ~/quay_data/ directory)
-    :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
+    :param test_split: proportion of instances to be included in the test set. The rest makes up the training set
+    :param min_class_support: minimum number of instances per class. Classes with fewer instances
+        are discarded (default is 100)
     :param verbose: set to True (default is False) to get information (stats) about the dataset
     :return: a :class:`quapy.data.base.LabelledCollection` instance
     """
@@ -626,19 +675,57 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
         data_home = get_quapy_home()
 
     identifiers = {
-        "dry-bean": 602,
-        "wine-quality": 186,
-        "academic-success": 697,
-        "digits": 80,
-        "letter": 59
+        'dry-bean': 602,
+        'wine-quality': 186,
+        'academic-success': 697,
+        'digits': 80,
+        'letter': 59,
+        'abalone': 1,
+        'obesity': 544,
+        'nursery': 76,
+        'yeast': 110,
+        'hand_digits': 81,
+        'satellite': 146,
+        'shuttle': 148,
+        'cmc': 30,
+        'isolet': 54,
+        'waveform-v1': 107,
+        'molecular': 69,
+        'poker_hand': 158,
+        'connect-4': 26,
+        'mhr': 863,
+        'chess': 23,
+        'page_block': 78,
+        'phishing': 379,
+        'image_seg': 147,
+        'hcv': 503,
     }
 
     full_names = {
-        "dry-bean": "Dry Bean Dataset",
-        "wine-quality": "Wine Quality",
-        "academic-success": "Predict students' dropout and academic success",
-        "digits": "Optical Recognition of Handwritten Digits",
-        "letter": "Letter Recognition"
+        'dry-bean': 'Dry Bean Dataset',
+        'wine-quality': 'Wine Quality',
+        'academic-success': 'Predict students\' dropout and academic success',
+        'digits': 'Optical Recognition of Handwritten Digits',
+        'letter': 'Letter Recognition',
+        'abalone': 'Abalone',
+        'obesity': 'Estimation of Obesity Levels Based On Eating Habits and Physical Condition',
+        'nursery': 'Nursery',
+        'yeast': 'Yeast',
+        'hand_digits': 'Pen-Based Recognition of Handwritten Digits',
+        'satellite': 'Statlog Landsat Satellite',
+        'shuttle': 'Statlog Shuttle',
+        'cmc': 'Contraceptive Method Choice',
+        'isolet': 'ISOLET',
+        'waveform-v1': 'Waveform Database Generator (Version 1)',
+        'molecular': 'Molecular Biology (Splice-junction Gene Sequences)',
+        'poker_hand': 'Poker Hand',
+        'connect-4': 'Connect-4',
+        'mhr': 'Maternal Health Risk',
+        'chess': 'Chess (King-Rook vs. King)',
+        'page_block': 'Page Blocks Classification',
+        'phishing': 'Website Phishing',
+        'image_seg': 'Statlog (Image Segmentation)',
+        'hcv': 'Hepatitis C Virus (HCV) for Egyptian patients',
     }
 
     identifier = identifiers[dataset_name]
@@ -649,14 +736,36 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
 
     file = join(data_home, 'uci_multiclass', dataset_name+'.pkl')
 
-    def download(id):
-        data = fetch_ucirepo(id=id)
-        X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze()
+    def download(id, name):
+        df = fetch_ucirepo(id=id)
+
+        df.data.features = pd.get_dummies(df.data.features, drop_first=True)
+        X, y = df.data.features.to_numpy(), df.data.targets.to_numpy().squeeze()
+
+        assert y.ndim == 1, 'more than one y'
+
         classes = np.sort(np.unique(y))
         y = np.searchsorted(classes, y)
         return LabelledCollection(X, y)
 
-    data = pickled_resource(file, download, identifier)
+    def filter_classes(data: LabelledCollection, min_ipc):
+        classes = data.classes_
+        # restrict classes to only those with at least min_ipc instances
+        classes = classes[data.counts() >= min_ipc]
+        # filter X and y keeping only datapoints belonging to valid classes
+        filter_idx = np.in1d(data.y, classes)
+        X, y = data.X[filter_idx], data.y[filter_idx]
+        # map classes to range(len(classes))
+        y = np.searchsorted(classes, y)
+        return LabelledCollection(X, y)
+
+    data = pickled_resource(file, download, identifier, dataset_name)
+    data = filter_classes(data, min_class_support)
+    if data.n_classes <= 2:
+        raise ValueError(
+            f'After filtering out classes with less than {min_class_support=} instances, the dataset {dataset_name} '
+            f'is no longer multiclass. Try reducing this value.'
+        )
 
     if verbose:
         data.stats()
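Two small sketches of the preprocessing steps introduced here (toy data, not from any UCI dataset): pd.get_dummies one-hot encodes the categorical feature columns (drop_first=True drops one redundant indicator per column), and np.searchsorted remaps the surviving sorted class labels onto the contiguous range 0..k-1, as done both after download and after filtering:

    import numpy as np
    import pandas as pd

    # one-hot encoding: keeps 'weight', replaces 'size' with size_M, size_S
    features = pd.DataFrame({'size': ['S', 'M', 'L'], 'weight': [1.0, 2.0, 3.0]})
    print(pd.get_dummies(features, drop_first=True))

    # relabeling: arbitrary class labels become contiguous indices
    y = np.array([2, 7, 7, 9, 2, 9, 9])
    classes = np.sort(np.unique(y))     # [2, 7, 9]
    print(np.searchsorted(classes, y))  # [0 1 1 2 0 2 2]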
@@ -746,8 +855,8 @@ def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None):
     information on this dataset, please follow the zenodo link).
     This dataset is based on the data available publicly at
     `WHOI-Plankton repo <https://github.com/hsosik/WHOI-Plankton>`_.
-    The scripts for the processing are available at `P. González's repo <https://github.com/pglez82/IFCB_Zenodo>`_.
-    Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.
+    The dataset already comes with processed features.
+    The scripts used for the processing are available at `P. González's repo <https://github.com/pglez82/IFCB_Zenodo>`_.
 
     The datasets are downloaded only once, and stored for fast reuse.
@@ -803,7 +912,7 @@ def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None):
     if for_model_selection:
         # In this case, return 70% of training data as the training set and 30% as the test set
         samples = get_sample_list(train_samples_path)
-        train, test = generate_modelselection_split(samples, split=0.3)
+        train, test = generate_modelselection_split(samples, test_prop=0.3)
         train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes, samples=train)
 
         # Test prevalence is computed from class labels
@@ -21,7 +21,7 @@ class QuaNetTrainer(BaseQuantifier):
     Example:
 
     >>> import quapy as qp
-    >>> from quapy.method.meta import QuaNet
+    >>> from quapy.method_name.meta import QuaNet
     >>> from quapy.classification.neural import NeuralClassifierTrainer, CNNnet
     >>>
     >>> # use samples of 100 elements
@@ -593,6 +593,7 @@ class PACC(AggregativeSoftQuantifier):
         if self.norm not in ACC.NORMALIZATIONS:
             raise ValueError(f"unknown normalization; valid ones are {ACC.NORMALIZATIONS}")
 
+
     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
         """
         Estimates the misclassification rates
@@ -6,6 +6,9 @@ import pickle
 import urllib
 from pathlib import Path
 from contextlib import ExitStack
+
+import pandas as pd
+
 import quapy as qp
 
 import numpy as np
@@ -248,6 +251,28 @@ def _check_sample_size(sample_size):
     return sample_size
 
 
+def load_report(path, as_dict=False):
+
+    def str2prev_arr(strprev):
+        within = strprev.strip('[]').split()
+        float_list = [float(p) for p in within]
+        float_list[-1] = 1. - sum(float_list[:-1])
+        return np.asarray(float_list)
+
+    df = pd.read_csv(path, index_col=0)
+    df['true-prev'] = df['true-prev'].apply(str2prev_arr)
+    df['estim-prev'] = df['estim-prev'].apply(str2prev_arr)
+    if as_dict:
+        d = {}
+        for col in df.columns.values:
+            vals = df[col].values
+            if col in ['true-prev', 'estim-prev']:
+                vals = np.vstack(vals)
+            d[col] = vals
+        return d
+    else:
+        return df
+
+
 class EarlyStop:
     """
     A class implementing the early-stopping condition typically used for training neural networks.
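A quick sketch of the prevalence parsing performed by load_report: prevalence vectors are serialized as space-separated strings, and the last component is recomputed as one minus the sum of the others, so the parsed array sums to exactly 1 despite rounding in the stored digits (example string is hypothetical):

    import numpy as np

    strprev = '[0.2501 0.2499 0.5   ]'
    within = strprev.strip('[]').split()
    float_list = [float(p) for p in within]
    float_list[-1] = 1. - sum(float_list[:-1])  # enforce sum-to-one
    print(np.asarray(float_list))  # [0.2501 0.2499 0.5   ]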