Merge branch 'devel' of github.com:HLT-ISTI/QuaPy into devel

Alejandro Moreo Fernandez 2024-04-30 09:55:50 +02:00
commit 817aab1d99
8 changed files with 346 additions and 80 deletions

.gitignore vendored (8 changes)
View File

@@ -69,6 +69,9 @@ instance/
 # Scrapy stuff:
 .scrapy
 
+# vscode config:
+.vscode/
+
 # Sphinx documentation
 docs/_build/
@@ -85,6 +88,11 @@ ipython_config.py
 # pyenv
 .python-version
 
+# poetry
+poetry.toml
+pyproject.toml
+poetry.lock
+
 # pipenv
 # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 # However, in case of collaboration, if having platform-specific dependencies or dependencies

View File

@@ -29,12 +29,17 @@ def newLR():
 def calibratedLR():
-    return CalibratedClassifierCV(LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1))
+    return CalibratedClassifierCV(newLR())
 
 __C_range = np.logspace(-3, 3, 7)
-lr_params = {'classifier__C': __C_range, 'classifier__class_weight': [None, 'balanced']}
-svmperf_params = {'classifier__C': __C_range}
+lr_params = {
+    'classifier__C': __C_range,
+    'classifier__class_weight': [None, 'balanced']
+}
+svmperf_params = {
+    'classifier__C': __C_range
+}
 
 def quantification_models():
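
A brief illustrative sketch (PACC and LogisticRegression chosen arbitrarily, assuming QuaPy's sklearn-style parameter nesting) of how the 'classifier__' prefix in grids like lr_params reaches the classifier wrapped by a quantifier:

# sketch: 'classifier__<param>' routes a hyperparameter to the wrapped classifier
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import PACC

q = PACC(LogisticRegression(max_iter=1000))
q.set_params(classifier__C=10, classifier__class_weight='balanced')
print(q.classifier.C, q.classifier.class_weight)  # expected: 10 balanced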

View File

@@ -0,0 +1,124 @@
import pickle
import os
from time import time
from collections import defaultdict

import numpy as np
from sklearn.linear_model import LogisticRegression

import quapy as qp
from quapy.method.aggregative import PACC, EMQ, KDEyML
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP
from pathlib import Path

SEED = 1


def newLR():
    return LogisticRegression(max_iter=3000)


# typical hyperparameters explored for Logistic Regression
logreg_grid = {
    'C': np.logspace(-3, 3, 7),
    'class_weight': ['balanced', None]
}


def wrap_hyper(classifier_hyper_grid: dict):
    return {'classifier__' + k: v for k, v in classifier_hyper_grid.items()}


METHODS = [
    ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)),
    ('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)),
    # ('KDEy-ML', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.linspace(0.01, 0.2, 20)}}),
]


def show_results(result_path):
    import pandas as pd
    df = pd.read_csv(result_path + '.csv', sep='\t')
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE", "t_train"], margins=True)
    print(pv)


def load_timings(result_path):
    import pandas as pd
    timings = defaultdict(lambda: {})
    if not Path(result_path + '.csv').exists():
        return timings
    df = pd.read_csv(result_path + '.csv', sep='\t')
    return timings | df.pivot_table(index='Dataset', columns='Method', values='t_train').to_dict()


if __name__ == '__main__':

    qp.environ['SAMPLE_SIZE'] = 500
    qp.environ['N_JOBS'] = -1
    n_bags_val = 250
    n_bags_test = 1000
    result_dir = f'results/ucimulti'

    os.makedirs(result_dir, exist_ok=True)

    global_result_path = f'{result_dir}/allmethods'
    timings = load_timings(global_result_path)
    with open(global_result_path + '.csv', 'wt') as csv:
        csv.write(f'Method\tDataset\tMAE\tMRAE\tt_train\n')

    for method_name, quantifier, param_grid in METHODS:

        print('Init method', method_name)

        with open(global_result_path + '.csv', 'at') as csv:

            for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:

                print('init', dataset)

                local_result_path = os.path.join(Path(global_result_path).parent, method_name + '_' + dataset + '.dataframe')

                if os.path.exists(local_result_path):
                    print(f'result file {local_result_path} already exist; skipping')
                    report = qp.util.load_report(local_result_path)
                else:
                    with qp.util.temp_seed(SEED):
                        data = qp.datasets.fetch_UCIMulticlassDataset(dataset, verbose=True)

                        # model selection
                        train, test = data.train_test
                        train, val = train.split_stratified(random_state=SEED)

                        protocol = UPP(val, repeats=n_bags_val)
                        modsel = GridSearchQ(
                            quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error='mae'
                        )

                        t_init = time()
                        try:
                            modsel.fit(train)
                            print(f'best params {modsel.best_params_}')
                            print(f'best score {modsel.best_score_}')
                            quantifier = modsel.best_model()
                        except:
                            print('something went wrong... trying to fit the default model')
                            quantifier.fit(train)
                        timings[method_name][dataset] = time() - t_init

                        protocol = UPP(test, repeats=n_bags_test)
                        report = qp.evaluation.evaluation_report(
                            quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True
                        )
                        report.to_csv(local_result_path)

                means = report.mean(numeric_only=True)
                csv.write(f'{method_name}\t{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{timings[method_name][dataset]:.3f}\n')
                csv.flush()

    show_results(global_result_path)
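
Once the script has produced results, the aggregated tab-separated file can also be inspected on its own; a small sketch, assuming the results/ucimulti/allmethods.csv path written above:

# sketch: summarise the aggregated results file written by the script above
import pandas as pd

df = pd.read_csv('results/ucimulti/allmethods.csv', sep='\t')
pv = df.pivot_table(index='Dataset', columns='Method', values=['MAE', 'MRAE', 't_train'], margins=True)
print(pv)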

View File

@@ -1,20 +1,17 @@
 import os
 import pandas as pd
 import math
 
 from quapy.data import LabelledCollection
 from quapy.protocol import AbstractProtocol
 from pathlib import Path
 
 
 def get_sample_list(path_dir):
-    """Gets a sample list finding the csv files in a directory
+    """
+    Gets a sample list finding the csv files in a directory
 
-    Args:
-        path_dir (_type_): directory to look for samples
-
-    Returns:
-        _type_: list of samples
+    :param path_dir: directory to look for samples
+    :return: list of samples
     """
     samples = []
     for filename in sorted(os.listdir(path_dir)):
@@ -23,18 +20,15 @@ def get_sample_list(path_dir):
     return samples
 
 
-def generate_modelselection_split(samples, split=0.3):
-    """This function generates a train/test split for model selection
+def generate_modelselection_split(samples, test_prop=0.3):
+    """This function generates a train/test partition for model selection
     without the use of random numbers so the split is always the same
 
-    Args:
-        samples (_type_): list of samples
-        split (float, optional): percentage saved for test. Defaults to 0.3.
-
-    Returns:
-        _type_: list of samples to use as train and list of samples to use as test
+    :param samples: list of samples
+    :param test_prop: float, percentage saved for test. Defaults to 0.3.
+    :return: list of samples to use as train and list of samples to use as test
     """
-    num_items_to_pick = math.ceil(len(samples) * split)
+    num_items_to_pick = math.ceil(len(samples) * test_prop)
     step_size = math.floor(len(samples) / num_items_to_pick)
     test_indices = [i * step_size for i in range(num_items_to_pick)]
     test = [samples[i] for i in test_indices]
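
A small worked sketch (hypothetical sample names) of the deterministic split that generate_modelselection_split computes:

# sketch: deterministic model-selection split, no randomness involved
import math

samples = [f'sample_{i:02d}.csv' for i in range(10)]              # hypothetical sample names
test_prop = 0.3
num_items_to_pick = math.ceil(len(samples) * test_prop)           # 3
step_size = math.floor(len(samples) / num_items_to_pick)          # 3
test_indices = [i * step_size for i in range(num_items_to_pick)]  # [0, 3, 6]
test = [samples[i] for i in test_indices]
train = [s for s in samples if s not in test]
print(train, test)  # same output on every run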

View File

@@ -14,12 +14,17 @@ from quapy.util import download_file_if_not_exists, download_file, get_quapy_hom
 
 REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']
-TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders',
-                                   'semeval13', 'semeval14', 'semeval15', 'semeval16',
-                                   'sst', 'wa', 'wb']
-TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders',
-                                    'semeval', 'semeval16',
-                                    'sst', 'wa', 'wb']
+TWITTER_SENTIMENT_DATASETS_TEST = [
+    'gasp', 'hcr', 'omd', 'sanders',
+    'semeval13', 'semeval14', 'semeval15', 'semeval16',
+    'sst', 'wa', 'wb',
+]
+TWITTER_SENTIMENT_DATASETS_TRAIN = [
+    'gasp', 'hcr', 'omd', 'sanders',
+    'semeval', 'semeval16',
+    'sst', 'wa', 'wb',
+]
 
 UCI_BINARY_DATASETS = [
     #'acute.a', 'acute.b',
     'balance.1',
@@ -44,14 +49,37 @@ UCI_BINARY_DATASETS = [
     'transfusion',
     'wdbc',
     'wine.1', 'wine.2', 'wine.3',
-    'wine-q-red', 'wine-q-white',
-    'yeast']
+    'wine-q-red',
+    'wine-q-white',
+    'yeast',
+]
 
-UCI_MULTICLASS_DATASETS = ['dry-bean',
-                           'wine-quality',
-                           'academic-success',
-                           'digits',
-                           'letter']
+UCI_MULTICLASS_DATASETS = [
+    'dry-bean',
+    'wine-quality',
+    'academic-success',
+    'digits',
+    'letter',
+    'abalone',
+    'obesity',
+    'nursery',
+    'yeast',
+    'hand_digits',
+    'satellite',
+    'shuttle',
+    'cmc',
+    'isolet',
+    'waveform-v1',
+    'molecular',
+    'poker_hand',
+    'connect-4',
+    'mhr',
+    'chess',
+    'page_block',
+    'phishing',
+    'image_seg',
+    'hcv',
+]
 
 LEQUA2022_VECTOR_TASKS = ['T1A', 'T1B']
 LEQUA2022_TEXT_TASKS = ['T2A', 'T2B']
@@ -561,7 +589,13 @@ def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, verbose=Fals
     return data
 
 
-def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
+def fetch_UCIMulticlassDataset(
+        dataset_name,
+        data_home=None,
+        min_test_split=0.3,
+        max_train_instances=25000,
+        min_class_support=100,
+        verbose=False) -> Dataset:
     """
     Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`.
@@ -583,15 +617,28 @@ def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, ver
     :param dataset_name: a dataset name
     :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
         ~/quay_data/ directory)
-    :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
+    :param min_test_split: minimum proportion of instances to be included in the test set. This value is interpreted
+        as a minimum proportion, meaning that the real proportion could be higher in case the training proportion
+        (1-`min_test_split`% of the instances) surpasses `max_train_instances`. In such case, only `max_train_instances`
+        are taken for training, and the rest (irrespective of `min_test_split`) is taken for test.
+    :param max_train_instances: maximum number of instances to keep for training (defaults to 25000)
+    :param min_class_support: minimum number of instances per class. Classes with fewer instances
+        are discarded (default is 100)
     :param verbose: set to True (default is False) to get information (stats) about the dataset
     :return: a :class:`quapy.data.base.Dataset` instance
     """
-    data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose)
-    return Dataset(*data.split_stratified(1 - test_split, random_state=0), name=dataset_name)
+
+    data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, min_class_support, verbose=verbose)
+    n = len(data)
+    train_prop = (1.-min_test_split)
+    n_train = int(n*train_prop)
+    if n_train > max_train_instances:
+        train_prop = (max_train_instances / n)
+    return Dataset(*data.split_stratified(train_prop, random_state=0))
 
 
-def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
+def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_class_support=100, verbose=False) -> LabelledCollection:
     """
     Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`.
@@ -613,7 +660,9 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=
     :param dataset_name: a dataset name
     :param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use the default
         ~/quay_data/ directory)
-    :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
+    :param test_split: proportion of instances to be included in the test set. The rest conforms the training set
+    :param min_class_support: minimum number of instances per class. Classes with fewer instances
+        are discarded (default is 100)
     :param verbose: set to True (default is False) to get information (stats) about the dataset
     :return: a :class:`quapy.data.base.LabelledCollection` instance
     """
@@ -626,19 +675,57 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=
         data_home = get_quapy_home()
 
     identifiers = {
-        "dry-bean": 602,
-        "wine-quality": 186,
-        "academic-success": 697,
-        "digits": 80,
-        "letter": 59
+        'dry-bean': 602,
+        'wine-quality': 186,
+        'academic-success': 697,
+        'digits': 80,
+        'letter': 59,
+        'abalone': 1,
+        'obesity': 544,
+        'nursery': 76,
+        'yeast': 110,
+        'hand_digits': 81,
+        'satellite': 146,
+        'shuttle': 148,
+        'cmc': 30,
+        'isolet': 54,
+        'waveform-v1': 107,
+        'molecular': 69,
+        'poker_hand': 158,
+        'connect-4': 26,
+        'mhr': 863,
+        'chess': 23,
+        'page_block': 78,
+        'phishing': 379,
+        'image_seg': 147,
+        'hcv': 503,
     }
 
     full_names = {
-        "dry-bean": "Dry Bean Dataset",
-        "wine-quality": "Wine Quality",
-        "academic-success": "Predict students' dropout and academic success",
-        "digits": "Optical Recognition of Handwritten Digits",
-        "letter": "Letter Recognition"
+        'dry-bean': 'Dry Bean Dataset',
+        'wine-quality': 'Wine Quality',
+        'academic-success': 'Predict students\' dropout and academic success',
+        'digits': 'Optical Recognition of Handwritten Digits',
+        'letter': 'Letter Recognition',
+        'abalone': 'Abalone',
+        'obesity': 'Estimation of Obesity Levels Based On Eating Habits and Physical Condition',
+        'nursery': 'Nursery',
+        'yeast': 'Yeast',
+        'hand_digits': 'Pen-Based Recognition of Handwritten Digits',
+        'satellite': 'Statlog Landsat Satellite',
+        'shuttle': 'Statlog Shuttle',
+        'cmc': 'Contraceptive Method Choice',
+        'isolet': 'ISOLET',
+        'waveform-v1': 'Waveform Database Generator (Version 1)',
+        'molecular': 'Molecular Biology (Splice-junction Gene Sequences)',
+        'poker_hand': 'Poker Hand',
+        'connect-4': 'Connect-4',
+        'mhr': 'Maternal Health Risk',
+        'chess': 'Chess (King-Rook vs. King)',
+        'page_block': 'Page Blocks Classification',
+        'phishing': 'Website Phishing',
+        'image_seg': 'Statlog (Image Segmentation)',
+        'hcv': 'Hepatitis C Virus (HCV) for Egyptian patients',
     }
 
     identifier = identifiers[dataset_name]
@@ -649,14 +736,36 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=
     file = join(data_home, 'uci_multiclass', dataset_name+'.pkl')
 
-    def download(id):
-        data = fetch_ucirepo(id=id)
-        X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze()
+    def download(id, name):
+        df = fetch_ucirepo(id=id)
+        df.data.features = pd.get_dummies(df.data.features, drop_first=True)
+        X, y = df.data.features.to_numpy(), df.data.targets.to_numpy().squeeze()
+        assert y.ndim == 1, 'more than one y'
         classes = np.sort(np.unique(y))
         y = np.searchsorted(classes, y)
         return LabelledCollection(X, y)
 
-    data = pickled_resource(file, download, identifier)
+    def filter_classes(data: LabelledCollection, min_ipc):
+        classes = data.classes_
+        # restrict classes to only those with at least min_ipc instances
+        classes = classes[data.counts() >= min_ipc]
+        # filter X and y keeping only datapoints belonging to valid classes
+        filter_idx = np.in1d(data.y, classes)
+        X, y = data.X[filter_idx], data.y[filter_idx]
+        # map classes to range(len(classes))
+        y = np.searchsorted(classes, y)
+        return LabelledCollection(X, y)
+
+    data = pickled_resource(file, download, identifier, dataset_name)
+    data = filter_classes(data, min_class_support)
+    if data.n_classes <= 2:
+        raise ValueError(
+            f'After filtering out classes with less than {min_class_support=} instances, the dataset {dataset_name} '
+            f'is no longer multiclass. Try reducing this value.'
+        )
 
     if verbose:
         data.stats()
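
For clarity, a toy illustration (hypothetical labels, not from any of the datasets above) of the drop-and-remap step that filter_classes performs with np.in1d and np.searchsorted:

# toy sketch: keep classes with >= min_ipc instances, then remap surviving labels to 0..k-1
import numpy as np

y = np.array([0, 0, 0, 2, 2, 2, 5, 5, 5, 7])       # class 7 has only one instance
min_ipc = 2
classes, counts = np.unique(y, return_counts=True)
kept = classes[counts >= min_ipc]                   # [0, 2, 5]
mask = np.in1d(y, kept)
y_new = np.searchsorted(kept, y[mask])              # [0 0 0 1 1 1 2 2 2]
print(kept, y_new)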
@@ -746,8 +855,8 @@ def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=No
     information on this dataset, please follow the zenodo link).
     This dataset is based on the data available publicly at
     `WHOI-Plankton repo <https://github.com/hsosik/WHOI-Plankton>`_.
-    The scripts for the processing are available at `P. González's repo <https://github.com/pglez82/IFCB_Zenodo>`_.
-    Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.
+    The dataset already comes with processed features.
+    The scripts used for the processing are available at `P. González's repo <https://github.com/pglez82/IFCB_Zenodo>`_.
 
     The datasets are downloaded only once, and stored for fast reuse.
@@ -803,7 +912,7 @@ def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=No
     if for_model_selection:
         # In this case, return 70% of training data as the training set and 30% as the test set
         samples = get_sample_list(train_samples_path)
-        train, test = generate_modelselection_split(samples, split=0.3)
+        train, test = generate_modelselection_split(samples, test_prop=0.3)
         train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes, samples=train)
 
         # Test prevalence is computed from class labels

View File

@@ -21,7 +21,7 @@ class QuaNetTrainer(BaseQuantifier):
     Example:
     >>> import quapy as qp
-    >>> from quapy.method.meta import QuaNet
+    >>> from quapy.method_name.meta import QuaNet
     >>> from quapy.classification.neural import NeuralClassifierTrainer, CNNnet
     >>>
     >>> # use samples of 100 elements

View File

@@ -593,6 +593,7 @@ class PACC(AggregativeSoftQuantifier):
         if self.norm not in ACC.NORMALIZATIONS:
             raise ValueError(f"unknown normalization; valid ones are {ACC.NORMALIZATIONS}")
 
+
     def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
         """
         Estimates the misclassification rates

View File

@@ -6,6 +6,9 @@ import pickle
 import urllib
 from pathlib import Path
 from contextlib import ExitStack
+
+import pandas as pd
+
 import quapy as qp
 import numpy as np
@@ -248,6 +251,28 @@ def _check_sample_size(sample_size):
     return sample_size
 
 
+def load_report(path, as_dict=False):
+
+    def str2prev_arr(strprev):
+        within = strprev.strip('[]').split()
+        float_list = [float(p) for p in within]
+        float_list[-1] = 1. - sum(float_list[:-1])
+        return np.asarray(float_list)
+
+    df = pd.read_csv(path, index_col=0)
+    df['true-prev'] = df['true-prev'].apply(str2prev_arr)
+    df['estim-prev'] = df['estim-prev'].apply(str2prev_arr)
+    if as_dict:
+        d = {}
+        for col in df.columns.values:
+            vals = df[col].values
+            if col in ['true-prev', 'estim-prev']:
+                vals = np.vstack(vals)
+            d[col] = vals
+        return d
+    else:
+        return df
+
+
 class EarlyStop:
     """
     A class implementing the early-stopping condition typically used for training neural networks.
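
A brief usage sketch of the new load_report helper; the report path is hypothetical and the columns assume the format written by qp.evaluation.evaluation_report:

# sketch: reload a saved evaluation report; prevalence strings are parsed back into arrays
import quapy as qp

report = qp.util.load_report('results/ucimulti/PACC_dry-bean.dataframe')  # hypothetical path
print(report['mae'].mean())

d = qp.util.load_report('results/ucimulti/PACC_dry-bean.dataframe', as_dict=True)
print(d['true-prev'].shape)  # (n_rows, n_classes), stacked with np.vstack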