Compare commits
2 Commits
7593fde2e0
...
1c258a2000
Author | SHA1 | Date |
---|---|---|
Alejandro Moreo Fernandez | 1c258a2000 | |
Alejandro Moreo Fernandez | dca955819c |
|
@ -8,7 +8,7 @@ from distribution_matching.method_dirichlety import DIRy
|
||||||
from sklearn.linear_model import LogisticRegression
|
from sklearn.linear_model import LogisticRegression
|
||||||
from method_kdey_closed_efficient import KDEyclosed_efficient
|
from method_kdey_closed_efficient import KDEyclosed_efficient
|
||||||
|
|
||||||
METHODS = ['KDEy-closed++', 'KDEy-closed+', 'KDEy-closed', 'ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DMhd3', 'EMQ', 'KDEy-ML'] #, 'KDEy-DMhd2'] #, 'KDEy-DMhd2', 'DM-HD'] 'KDEy-DMjs', 'KDEy-DM', 'KDEy-ML+', 'KDEy-DMhd3+',
|
METHODS = ['ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DMhd3', 'KDEy-closed++', 'EMQ', 'KDEy-ML'] #, 'KDEy-DMhd2'] #, 'KDEy-DMhd2', 'DM-HD'] 'KDEy-DMjs', 'KDEy-DM', 'KDEy-ML+', 'KDEy-DMhd3+',
|
||||||
BIN_METHODS = [x.replace('-OvA', '') for x in METHODS]
|
BIN_METHODS = [x.replace('-OvA', '') for x in METHODS]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -61,6 +61,8 @@ class KDEyclosed_efficient_corr(AggregativeProbabilisticQuantifier):
|
||||||
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
|
data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
|
||||||
)
|
)
|
||||||
|
|
||||||
|
print('training over')
|
||||||
|
|
||||||
assert all(sorted(np.unique(y)) == np.arange(data.n_classes)), \
|
assert all(sorted(np.unique(y)) == np.arange(data.n_classes)), \
|
||||||
'label name gaps not allowed in current implementation'
|
'label name gaps not allowed in current implementation'
|
||||||
|
|
||||||
|
@ -94,11 +96,14 @@ class KDEyclosed_efficient_corr(AggregativeProbabilisticQuantifier):
|
||||||
self.tr_tr_sums = tr_tr_sums
|
self.tr_tr_sums = tr_tr_sums
|
||||||
self.counts_inv = counts_inv
|
self.counts_inv = counts_inv
|
||||||
|
|
||||||
|
print('fit over')
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
def aggregate(self, posteriors: np.ndarray):
|
def aggregate(self, posteriors: np.ndarray):
|
||||||
|
|
||||||
|
# print('aggregating')
|
||||||
Ptr = self.Ptr
|
Ptr = self.Ptr
|
||||||
Pte = posteriors
|
Pte = posteriors
|
||||||
|
|
||||||
|
@ -121,6 +126,8 @@ class KDEyclosed_efficient_corr(AggregativeProbabilisticQuantifier):
|
||||||
partB = 0.5 * np.log((alpha_l[:,np.newaxis] * tr_tr_sums * alpha_l).sum())
|
partB = 0.5 * np.log((alpha_l[:,np.newaxis] * tr_tr_sums * alpha_l).sum())
|
||||||
return partA + partB + partC
|
return partA + partB + partC
|
||||||
|
|
||||||
|
# print('starting search')
|
||||||
|
|
||||||
# the initial point is set as the uniform distribution
|
# the initial point is set as the uniform distribution
|
||||||
uniform_distribution = np.full(fill_value=1 / n, shape=(n,))
|
uniform_distribution = np.full(fill_value=1 / n, shape=(n,))
|
||||||
|
|
||||||
|
|
|
@ -7,7 +7,8 @@ import zipfile
|
||||||
from os.path import join
|
from os.path import join
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import scipy
|
import scipy
|
||||||
|
import pickle
|
||||||
|
from ucimlrepo import fetch_ucirepo
|
||||||
from quapy.data.base import Dataset, LabelledCollection
|
from quapy.data.base import Dataset, LabelledCollection
|
||||||
from quapy.data.preprocessing import text2tfidf, reduce_columns
|
from quapy.data.preprocessing import text2tfidf, reduce_columns
|
||||||
from quapy.data.reader import *
|
from quapy.data.reader import *
|
||||||
|
@ -45,6 +46,12 @@ UCI_DATASETS = ['acute.a', 'acute.b',
|
||||||
'wine-q-red', 'wine-q-white',
|
'wine-q-red', 'wine-q-white',
|
||||||
'yeast']
|
'yeast']
|
||||||
|
|
||||||
|
UCI_MULTICLASS_DATASETS = ['dry-bean',
|
||||||
|
'wine-quality',
|
||||||
|
'academic-success',
|
||||||
|
'digits',
|
||||||
|
'letter']
|
||||||
|
|
||||||
LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']
|
LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B']
|
||||||
|
|
||||||
_TXA_SAMPLE_SIZE = 250
|
_TXA_SAMPLE_SIZE = 250
|
||||||
|
@ -548,6 +555,103 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
|
||||||
data.stats()
|
data.stats()
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
|
||||||
|
df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
    """
    Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`, as used in
    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
    and
    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
    Dynamic ensemble selection for quantification tasks.
    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
    The datasets do not come with a predefined train-test split (see
    :meth:`fetch_UCIMulticlassLabelledCollection` for further information on how to use these collections),
    and so a stratified train-test split is generated at the desired proportion. The split uses a fixed
    seed (`random_state=0`), so repeated calls return the same partition.
    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`

    :param dataset_name: a dataset name
    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use
        the default directory)
    :param test_split: proportion of instances to be included in the test set. The rest conforms the training set
    :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
    :return: a :class:`quapy.data.base.Dataset` instance
    """
    # optional arguments are forwarded by keyword so they cannot be silently swapped
    collection = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=data_home, verbose=verbose)
    # split_stratified receives the training proportion, i.e., the complement of the test proportion
    return Dataset(*collection.split_stratified(1 - test_split, random_state=0))
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
    """
    Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
    `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
    Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
    Information Fusion, 34, 87-100. <https://www.sciencedirect.com/science/article/pii/S1566253516300628>`_
    and
    `Pérez-Gállego, P., Castano, A., Quevedo, J. R., & del Coz, J. J. (2019).
    Dynamic ensemble selection for quantification tasks.
    Information Fusion, 45, 1-15. <https://www.sciencedirect.com/science/article/pii/S1566253517303652>`_.
    The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation
    protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation.
    This can be reproduced by using :meth:`quapy.data.base.Dataset.kFCV`, e.g.:

    >>> import quapy as qp
    >>> collection = qp.datasets.fetch_UCIMulticlassLabelledCollection("dry-bean")
    >>> for data in qp.data.base.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
    >>>     ...

    The collection is downloaded through `fetch_ucirepo` the first time it is requested, and is then cached
    as a pickle file in `<data_home>/uci_multiclass/<dataset_name>.pkl`; later calls load the cached file.

    The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_MULTICLASS_DATASETS`

    :param dataset_name: a dataset name
    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use
        the default directory, as returned by `get_quapy_home()`)
    :param verbose: set to True (default is False) to get information about the dataset being loaded
        (a loading message and the statistics of the collection)
    :return: a :class:`quapy.data.base.LabelledCollection` instance
    """
    assert dataset_name in UCI_MULTICLASS_DATASETS, \
        f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository (multiclass). ' \
        f'Valid ones are {UCI_MULTICLASS_DATASETS}'

    if data_home is None:
        data_home = get_quapy_home()

    # numeric identifiers passed to fetch_ucirepo(id=...)
    identifiers = {
        "dry-bean": 602,
        "wine-quality": 186,
        "academic-success": 697,
        "digits": 80,
        "letter": 59,
    }

    # human-readable names, only used in the loading message
    full_names = {
        "dry-bean": "Dry Bean Dataset",
        "wine-quality": "Wine Quality",
        "academic-success": "Predict students' dropout and academic success",
        "digits": "Optical Recognition of Handwritten Digits",
        "letter": "Letter Recognition",
    }

    identifier = identifiers[dataset_name]
    fullname = full_names[dataset_name]

    if verbose:
        print(f'Loading UCI Multiclass {dataset_name} ({fullname})')

    cache_path = join(data_home, 'uci_multiclass', dataset_name + '.pkl')
    if os.path.exists(cache_path):
        # NOTE: the pickle is a local cache written by this very function; unpickling executes arbitrary code,
        # so data_home should never point to a location writable by untrusted parties
        with open(cache_path, 'rb') as fin:
            data = pickle.load(fin)
    else:
        uci_repo = fetch_ucirepo(id=identifier)
        X = uci_repo['data']['features'].to_numpy()
        # squeeze() drops singleton dimensions of the targets array
        y = uci_repo['data']['targets'].to_numpy().squeeze()
        data = LabelledCollection(X, y)
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        with open(cache_path, 'wb') as fout:
            pickle.dump(data, fout)

    if verbose:
        data.stats()

    return data
|
||||||
|
|
||||||
|
|
||||||
def _df_replace(df, col, repl={'yes': 1, 'no': 0}, astype=float):
|
def _df_replace(df, col, repl={'yes': 1, 'no': 0}, astype=float):
|
||||||
df[col] = df[col].apply(lambda x: repl[x]).astype(astype, copy=False)
|
df[col] = df[col].apply(lambda x: repl[x]).astype(astype, copy=False)
|
||||||
|
|
Loading…
Reference in New Issue