added model selection for quantification
This commit is contained in:
parent
7d6f523e4b
commit
7bed93dcbf
|
@ -4,11 +4,19 @@ from sklearn.model_selection import train_test_split
|
||||||
from quapy.functional import artificial_prevalence_sampling
|
from quapy.functional import artificial_prevalence_sampling
|
||||||
from scipy.sparse import vstack
|
from scipy.sparse import vstack
|
||||||
|
|
||||||
|
from util import temp_seed
|
||||||
|
|
||||||
|
|
||||||
class LabelledCollection:
|
class LabelledCollection:
|
||||||
|
|
||||||
def __init__(self, instances, labels, n_classes=None):
|
def __init__(self, instances, labels, n_classes=None):
|
||||||
self.instances = instances if issparse(instances) else np.asarray(instances)
|
if issparse(instances):
|
||||||
|
self.instances = instances
|
||||||
|
elif isinstance(instances, list) and len(instances)>0 and isinstance(instances[0], str):
|
||||||
|
# lists of strings occupy too much as ndarrays (although python-objects add a heavy overload)
|
||||||
|
self.instances = np.asarray(instances, dtype=object)
|
||||||
|
else:
|
||||||
|
self.instances = np.asarray(instances)
|
||||||
self.labels = np.asarray(labels, dtype=int)
|
self.labels = np.asarray(labels, dtype=int)
|
||||||
n_docs = len(self)
|
n_docs = len(self)
|
||||||
if n_classes is None:
|
if n_classes is None:
|
||||||
|
@ -87,6 +95,7 @@ class LabelledCollection:
|
||||||
return LabelledCollection(documents, labels, n_classes=self.n_classes)
|
return LabelledCollection(documents, labels, n_classes=self.n_classes)
|
||||||
|
|
||||||
def split_stratified(self, train_prop=0.6):
|
def split_stratified(self, train_prop=0.6):
|
||||||
|
# with temp_seed(42):
|
||||||
tr_docs, te_docs, tr_labels, te_labels = \
|
tr_docs, te_docs, tr_labels, te_labels = \
|
||||||
train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels)
|
train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels)
|
||||||
return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
|
return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels)
|
||||||
|
@ -110,14 +119,16 @@ class LabelledCollection:
|
||||||
yield self.uniform_sampling_index(sample_size)
|
yield self.uniform_sampling_index(sample_size)
|
||||||
|
|
||||||
def __add__(self, other):
|
def __add__(self, other):
|
||||||
if issparse(self.instances) and issparse(other.documents):
|
if issparse(self.instances) and issparse(other.instances):
|
||||||
docs = vstack([self.instances, other.documents])
|
join_instances = vstack([self.instances, other.instances])
|
||||||
elif isinstance(self.instances, list) and isinstance(other.documents, list):
|
elif isinstance(self.instances, list) and isinstance(other.instances, list):
|
||||||
docs = self.instances + other.documents
|
join_instances = self.instances + other.instances
|
||||||
|
elif isinstance(self.instances, np.ndarray) and isinstance(other.instances, np.ndarray):
|
||||||
|
join_instances = np.concatenate([self.instances, other.instances])
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError('unsupported operation for collection types')
|
raise NotImplementedError('unsupported operation for collection types')
|
||||||
labels = np.concatenate([self.labels, other.labels])
|
labels = np.concatenate([self.labels, other.labels])
|
||||||
return LabelledCollection(docs, labels)
|
return LabelledCollection(join_instances, labels)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
import zipfile
|
import zipfile
|
||||||
from util import download_file_if_not_exists, download_file, get_quapy_home
|
from util import download_file_if_not_exists, download_file, get_quapy_home, pickled_resource
|
||||||
import os
|
import os
|
||||||
from os.path import join
|
from os.path import join
|
||||||
from data.base import Dataset
|
from data.base import Dataset
|
||||||
|
@ -12,7 +12,21 @@ TWITTER_SENTIMENT_DATASETS = ['gasp', 'hcr', 'omd', 'sanders', 'semeval13', 'sem
|
||||||
'sst', 'wa', 'wb']
|
'sst', 'wa', 'wb']
|
||||||
|
|
||||||
|
|
||||||
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None):
|
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False):
|
||||||
|
"""
|
||||||
|
Load a Reviews dataset as a Dataset instance, as used in:
|
||||||
|
Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
|
||||||
|
Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018.
|
||||||
|
:param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb'
|
||||||
|
:param tfidf: set to True to transform the raw documents into tfidf weighted matrices
|
||||||
|
:param min_df: minimun number of documents that should contain a term in order for the term to be
|
||||||
|
kept (ignored if tfidf==False)
|
||||||
|
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||||
|
~/quay_data/ directory)
|
||||||
|
:param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
|
||||||
|
faster subsequent invokations
|
||||||
|
:return: a Dataset instance
|
||||||
|
"""
|
||||||
assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
|
assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
|
||||||
f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
|
f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
|
||||||
f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}'
|
f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}'
|
||||||
|
@ -27,18 +41,37 @@ def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None):
|
||||||
download_file_if_not_exists(URL_TRAIN, train_path)
|
download_file_if_not_exists(URL_TRAIN, train_path)
|
||||||
download_file_if_not_exists(URL_TEST, test_path)
|
download_file_if_not_exists(URL_TEST, test_path)
|
||||||
|
|
||||||
data = Dataset.load(train_path, test_path, from_text)
|
pickle_path = None
|
||||||
|
if pickle:
|
||||||
|
pickle_path = join(data_home, 'reviews', 'pickle', f'{dataset_name}.pkl')
|
||||||
|
data = pickled_resource(pickle_path, Dataset.load, train_path, test_path, from_text)
|
||||||
|
|
||||||
if tfidf:
|
if tfidf:
|
||||||
text2tfidf(data, inplace=True)
|
text2tfidf(data, inplace=True)
|
||||||
|
|
||||||
if min_df is not None:
|
if min_df is not None:
|
||||||
reduce_columns(data, min_df=min_df, inplace=True)
|
reduce_columns(data, min_df=min_df, inplace=True)
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
def fetch_twitter(dataset_name, model_selection=False, min_df=None, data_home=None):
|
def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False):
|
||||||
|
"""
|
||||||
|
Load a Twitter dataset as a Dataset instance, as used in:
|
||||||
|
Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
|
||||||
|
Social Network Analysis and Mining6(19), 1–22 (2016)
|
||||||
|
|
||||||
|
:param dataset_name: the name of the dataset: valid ones are 'gasp', 'hcr', 'omd', 'sanders', 'semeval13',
|
||||||
|
'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb'
|
||||||
|
:param for_model_selection: if True, then returns the train split as the training set and the devel split
|
||||||
|
as the test set; if False, then returns the train+devel split as the training set and the test set as the
|
||||||
|
test set
|
||||||
|
:param min_df: minimun number of documents that should contain a term in order for the term to be kept
|
||||||
|
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||||
|
~/quay_data/ directory)
|
||||||
|
:param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
|
||||||
|
faster subsequent invokations
|
||||||
|
:return: a Dataset instance
|
||||||
|
"""
|
||||||
assert dataset_name in TWITTER_SENTIMENT_DATASETS, \
|
assert dataset_name in TWITTER_SENTIMENT_DATASETS, \
|
||||||
f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
|
f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
|
||||||
f'Valid ones are {TWITTER_SENTIMENT_DATASETS}'
|
f'Valid ones are {TWITTER_SENTIMENT_DATASETS}'
|
||||||
|
@ -56,23 +89,27 @@ def fetch_twitter(dataset_name, model_selection=False, min_df=None, data_home=No
|
||||||
|
|
||||||
if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:
|
if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:
|
||||||
trainset_name = 'semeval'
|
trainset_name = 'semeval'
|
||||||
testset_name = 'semeval' if model_selection else dataset_name
|
testset_name = 'semeval' if for_model_selection else dataset_name
|
||||||
print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "
|
print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "
|
||||||
f"(called 'semeval'); returning trainin-set='{trainset_name}' and test-set={testset_name}")
|
f"(called 'semeval'); returning trainin-set='{trainset_name}' and test-set={testset_name}")
|
||||||
else:
|
else:
|
||||||
trainset_name = testset_name = dataset_name
|
trainset_name = testset_name = dataset_name
|
||||||
|
|
||||||
if model_selection:
|
if for_model_selection:
|
||||||
train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')
|
train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')
|
||||||
test = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')
|
test = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')
|
||||||
else:
|
else:
|
||||||
train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')
|
train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')
|
||||||
if dataset_name == 'semeval16':
|
if dataset_name == 'semeval16': # there is a different test name in the case of semeval16 only
|
||||||
test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')
|
test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')
|
||||||
else:
|
else:
|
||||||
test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')
|
test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')
|
||||||
|
|
||||||
data = Dataset.load(train, test, from_sparse)
|
pickle_path = None
|
||||||
|
if pickle:
|
||||||
|
mode = "train-dev" if for_model_selection else "train+dev-test"
|
||||||
|
pickle_path = join(unzipped_path, 'pickle', f'{testset_name}.{mode}.pkl')
|
||||||
|
data = pickled_resource(pickle_path, Dataset.load, train, test, from_sparse)
|
||||||
|
|
||||||
if min_df is not None:
|
if min_df is not None:
|
||||||
reduce_columns(data, min_df=min_df, inplace=True)
|
reduce_columns(data, min_df=min_df, inplace=True)
|
||||||
|
|
|
@ -14,7 +14,9 @@ def artificial_sampling_prediction(
|
||||||
n_prevpoints=210,
|
n_prevpoints=210,
|
||||||
n_repetitions=1,
|
n_repetitions=1,
|
||||||
n_jobs=-1,
|
n_jobs=-1,
|
||||||
random_seed=42):
|
random_seed=42,
|
||||||
|
verbose=True
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Performs the predictions for all samples generated according to the artificial sampling protocol.
|
Performs the predictions for all samples generated according to the artificial sampling protocol.
|
||||||
:param model: the model in charge of generating the class prevalence estimations
|
:param model: the model in charge of generating the class prevalence estimations
|
||||||
|
@ -25,6 +27,7 @@ def artificial_sampling_prediction(
|
||||||
:param n_jobs: number of jobs to be run in parallel
|
:param n_jobs: number of jobs to be run in parallel
|
||||||
:param random_seed: allows to replicate the samplings. The seed is local to the method and does not affect
|
:param random_seed: allows to replicate the samplings. The seed is local to the method and does not affect
|
||||||
any other random process.
|
any other random process.
|
||||||
|
:param verbose: if True, shows a progress bar
|
||||||
:return: two ndarrays of [m,n] with m the number of samples (n_prevpoints*n_repetitions) and n the
|
:return: two ndarrays of [m,n] with m the number of samples (n_prevpoints*n_repetitions) and n the
|
||||||
number of classes. The first one contains the true prevalences for the samples generated while the second one
|
number of classes. The first one contains the true prevalences for the samples generated while the second one
|
||||||
containing the the prevalences estimations
|
containing the the prevalences estimations
|
||||||
|
@ -36,15 +39,12 @@ def artificial_sampling_prediction(
|
||||||
if isinstance(model, AggregativeQuantifier):
|
if isinstance(model, AggregativeQuantifier):
|
||||||
quantification_func = model.aggregate
|
quantification_func = model.aggregate
|
||||||
if isinstance(model, AggregativeProbabilisticQuantifier):
|
if isinstance(model, AggregativeProbabilisticQuantifier):
|
||||||
print('\tpreclassifying with soft')
|
|
||||||
preclassified_instances = model.posterior_probabilities(test.instances)
|
preclassified_instances = model.posterior_probabilities(test.instances)
|
||||||
else:
|
else:
|
||||||
print('\tpreclassifying with hard')
|
|
||||||
preclassified_instances = model.classify(test.instances)
|
preclassified_instances = model.classify(test.instances)
|
||||||
test = LabelledCollection(preclassified_instances, test.labels)
|
test = LabelledCollection(preclassified_instances, test.labels)
|
||||||
else:
|
else:
|
||||||
quantification_func = model.quantify
|
quantification_func = model.quantify
|
||||||
print('not an aggregative')
|
|
||||||
|
|
||||||
def _predict_prevalences(index):
|
def _predict_prevalences(index):
|
||||||
sample = test.sampling_from_index(index)
|
sample = test.sampling_from_index(index)
|
||||||
|
@ -52,8 +52,9 @@ def artificial_sampling_prediction(
|
||||||
estim_prevalence = quantification_func(sample.instances)
|
estim_prevalence = quantification_func(sample.instances)
|
||||||
return true_prevalence, estim_prevalence
|
return true_prevalence, estim_prevalence
|
||||||
|
|
||||||
|
pbar = tqdm(indexes, desc='[artificial sampling protocol] predicting') if verbose else indexes
|
||||||
results = Parallel(n_jobs=n_jobs)(
|
results = Parallel(n_jobs=n_jobs)(
|
||||||
delayed(_predict_prevalences)(index) for index in tqdm(indexes, desc='[artificial sampling protocol] predicting')
|
delayed(_predict_prevalences)(index) for index in pbar
|
||||||
)
|
)
|
||||||
|
|
||||||
true_prevalences, estim_prevalences = zip(*results)
|
true_prevalences, estim_prevalences = zip(*results)
|
||||||
|
|
|
@ -3,12 +3,13 @@ from copy import deepcopy
|
||||||
import functional as F
|
import functional as F
|
||||||
import error
|
import error
|
||||||
from method.base import BaseQuantifier
|
from method.base import BaseQuantifier
|
||||||
from quapy.classification.svmperf import SVMperf
|
from classification.svmperf import SVMperf
|
||||||
from quapy.data import LabelledCollection
|
from data import LabelledCollection
|
||||||
from sklearn.metrics import confusion_matrix
|
from sklearn.metrics import confusion_matrix
|
||||||
from sklearn.calibration import CalibratedClassifierCV
|
from sklearn.calibration import CalibratedClassifierCV
|
||||||
from joblib import Parallel, delayed
|
from joblib import Parallel, delayed
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
|
||||||
# Abstract classes
|
# Abstract classes
|
||||||
|
@ -77,13 +78,19 @@ class AggregativeProbabilisticQuantifier(AggregativeQuantifier):
|
||||||
self.learner.set_params(**parameters)
|
self.learner.set_params(**parameters)
|
||||||
|
|
||||||
|
|
||||||
|
class BinaryQuantifier(BaseQuantifier):
|
||||||
|
def _check_binary(self, data : LabelledCollection, quantifier_name):
|
||||||
|
assert data.binary, f'{quantifier_name} works only on problems of binary classification. ' \
|
||||||
|
f'Use the class OneVsAll to enable {quantifier_name} work on single-label data.'
|
||||||
|
|
||||||
|
|
||||||
# Helper
|
# Helper
|
||||||
# ------------------------------------
|
# ------------------------------------
|
||||||
def training_helper(learner,
|
def training_helper(learner,
|
||||||
data: LabelledCollection,
|
data: LabelledCollection,
|
||||||
fit_learner: bool = True,
|
fit_learner: bool = True,
|
||||||
ensure_probabilistic=False,
|
ensure_probabilistic=False,
|
||||||
train_val_split=None):
|
val_split:Union[LabelledCollection, float]=None):
|
||||||
"""
|
"""
|
||||||
Training procedure common to all Aggregative Quantifiers.
|
Training procedure common to all Aggregative Quantifiers.
|
||||||
:param learner: the learner to be fit
|
:param learner: the learner to be fit
|
||||||
|
@ -91,7 +98,9 @@ def training_helper(learner,
|
||||||
:param fit_learner: whether or not to fit the learner (if False, then bypasses any action)
|
:param fit_learner: whether or not to fit the learner (if False, then bypasses any action)
|
||||||
:param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the
|
:param ensure_probabilistic: if True, guarantees that the resulting classifier implements predict_proba (if the
|
||||||
learner is not probabilistic, then a CalibratedCV instance of it is trained)
|
learner is not probabilistic, then a CalibratedCV instance of it is trained)
|
||||||
:param train_val_split: if specified, indicates the proportion of training instances on which to fit the learner
|
:param val_split: if specified as a float, indicates the proportion of training instances that will define the
|
||||||
|
validation split (e.g., 0.3 for using 30% of the training set as validation data); if specified as a
|
||||||
|
LabelledCollection, represents the validation split itself
|
||||||
:return: the learner trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0
|
:return: the learner trained on the training set, and the unused data (a _LabelledCollection_ if train_val_split>0
|
||||||
or None otherwise) to be used as a validation set for any subsequent parameter fitting
|
or None otherwise) to be used as a validation set for any subsequent parameter fitting
|
||||||
"""
|
"""
|
||||||
|
@ -101,10 +110,17 @@ def training_helper(learner,
|
||||||
print(f'The learner {learner.__class__.__name__} does not seem to be probabilistic. '
|
print(f'The learner {learner.__class__.__name__} does not seem to be probabilistic. '
|
||||||
f'The learner will be calibrated.')
|
f'The learner will be calibrated.')
|
||||||
learner = CalibratedClassifierCV(learner, cv=5)
|
learner = CalibratedClassifierCV(learner, cv=5)
|
||||||
if train_val_split is not None:
|
if val_split is not None:
|
||||||
if not (0 < train_val_split < 1):
|
if isinstance(val_split, float):
|
||||||
raise ValueError(f'train/val split {train_val_split} out of range, must be in (0,1)')
|
if not (0 < val_split < 1):
|
||||||
train, unused = data.split_stratified(train_prop=train_val_split)
|
raise ValueError(f'train/val split {val_split} out of range, must be in (0,1)')
|
||||||
|
train, unused = data.split_stratified(train_prop=1-val_split)
|
||||||
|
elif isinstance(val_split, LabelledCollection):
|
||||||
|
train = data
|
||||||
|
unused = val_split
|
||||||
|
else:
|
||||||
|
raise ValueError('train_val_split not understood; use either a float indicating the split proportion, '
|
||||||
|
'or a LabelledCollection indicating the validation split')
|
||||||
else:
|
else:
|
||||||
train, unused = data, None
|
train, unused = data, None
|
||||||
learner.fit(train.instances, train.labels)
|
learner.fit(train.instances, train.labels)
|
||||||
|
@ -148,8 +164,17 @@ class AdjustedClassifyAndCount(AggregativeQuantifier):
|
||||||
def __init__(self, learner):
|
def __init__(self, learner):
|
||||||
self.learner = learner
|
self.learner = learner
|
||||||
|
|
||||||
def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
|
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
|
||||||
self.learner, validation = training_helper(self.learner, data, fit_learner, train_val_split=train_val_split)
|
"""
|
||||||
|
Trains a ACC quantifier
|
||||||
|
:param data: the training set
|
||||||
|
:param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
|
||||||
|
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
|
||||||
|
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
|
||||||
|
indicating the validation set itself
|
||||||
|
:return: self
|
||||||
|
"""
|
||||||
|
self.learner, validation = training_helper(self.learner, data, fit_learner, val_split=val_split)
|
||||||
self.cc = ClassifyAndCount(self.learner)
|
self.cc = ClassifyAndCount(self.learner)
|
||||||
y_ = self.classify(validation.instances)
|
y_ = self.classify(validation.instances)
|
||||||
y = validation.labels
|
y = validation.labels
|
||||||
|
@ -196,9 +221,18 @@ class ProbabilisticAdjustedClassifyAndCount(AggregativeProbabilisticQuantifier):
|
||||||
def __init__(self, learner):
|
def __init__(self, learner):
|
||||||
self.learner = learner
|
self.learner = learner
|
||||||
|
|
||||||
def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
|
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
|
||||||
|
"""
|
||||||
|
Trains a PACC quantifier
|
||||||
|
:param data: the training set
|
||||||
|
:param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
|
||||||
|
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
|
||||||
|
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
|
||||||
|
indicating the validation set itself
|
||||||
|
:return: self
|
||||||
|
"""
|
||||||
self.learner, validation = training_helper(
|
self.learner, validation = training_helper(
|
||||||
self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split
|
self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split
|
||||||
)
|
)
|
||||||
self.pcc = ProbabilisticClassifyAndCount(self.learner)
|
self.pcc = ProbabilisticClassifyAndCount(self.learner)
|
||||||
y_ = self.classify(validation.instances)
|
y_ = self.classify(validation.instances)
|
||||||
|
@ -262,7 +296,7 @@ class ExpectationMaximizationQuantifier(AggregativeProbabilisticQuantifier):
|
||||||
return qs
|
return qs
|
||||||
|
|
||||||
|
|
||||||
class HellingerDistanceY(AggregativeProbabilisticQuantifier):
|
class HellingerDistanceY(AggregativeProbabilisticQuantifier, BinaryQuantifier):
|
||||||
"""
|
"""
|
||||||
Implementation of the method based on the Hellinger Distance y (HDy) proposed by
|
Implementation of the method based on the Hellinger Distance y (HDy) proposed by
|
||||||
González-Castro, V., Alaiz-Rodrı́guez, R., and Alegre, E. (2013). Class distribution
|
González-Castro, V., Alaiz-Rodrı́guez, R., and Alegre, E. (2013). Class distribution
|
||||||
|
@ -272,11 +306,19 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier):
|
||||||
def __init__(self, learner):
|
def __init__(self, learner):
|
||||||
self.learner = learner
|
self.learner = learner
|
||||||
|
|
||||||
def fit(self, data: LabelledCollection, fit_learner=True, train_val_split=0.6):
|
def fit(self, data: LabelledCollection, fit_learner=True, val_split:Union[float, LabelledCollection]=0.3):
|
||||||
assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification. ' \
|
"""
|
||||||
f'Use the class OneVsAll to enable {self.__class__.__name__} work on single-label data.'
|
Trains a HDy quantifier
|
||||||
|
:param data: the training set
|
||||||
|
:param fit_learner: set to False to bypass the training (the learner is assumed to be already fit)
|
||||||
|
:param val_split: either a float in (0,1) indicating the proportion of training instances to use for
|
||||||
|
validation (e.g., 0.3 for using 30% of the training set as validation data), or a LabelledCollection
|
||||||
|
indicating the validation set itself
|
||||||
|
:return: self
|
||||||
|
"""
|
||||||
|
self._check_binary(data, self.__class__.__name__)
|
||||||
self.learner, validation = training_helper(
|
self.learner, validation = training_helper(
|
||||||
self.learner, data, fit_learner, ensure_probabilistic=True, train_val_split=train_val_split)
|
self.learner, data, fit_learner, ensure_probabilistic=True, val_split=val_split)
|
||||||
Px = self.posterior_probabilities(validation.instances)
|
Px = self.posterior_probabilities(validation.instances)
|
||||||
self.Pxy1 = Px[validation.labels == 1]
|
self.Pxy1 = Px[validation.labels == 1]
|
||||||
self.Pxy0 = Px[validation.labels == 0]
|
self.Pxy0 = Px[validation.labels == 0]
|
||||||
|
@ -312,7 +354,7 @@ class HellingerDistanceY(AggregativeProbabilisticQuantifier):
|
||||||
return np.sqrt(np.sum((np.sqrt(P) - np.sqrt(Q))**2))
|
return np.sqrt(np.sum((np.sqrt(P) - np.sqrt(Q))**2))
|
||||||
|
|
||||||
|
|
||||||
class ExplicitLossMinimisation(AggregativeQuantifier):
|
class ExplicitLossMinimisation(AggregativeQuantifier, BinaryQuantifier):
|
||||||
|
|
||||||
def __init__(self, svmperf_base, loss, **kwargs):
|
def __init__(self, svmperf_base, loss, **kwargs):
|
||||||
self.svmperf_base = svmperf_base
|
self.svmperf_base = svmperf_base
|
||||||
|
@ -320,8 +362,7 @@ class ExplicitLossMinimisation(AggregativeQuantifier):
|
||||||
self.kwargs = kwargs
|
self.kwargs = kwargs
|
||||||
|
|
||||||
def fit(self, data: LabelledCollection, fit_learner=True, *args):
|
def fit(self, data: LabelledCollection, fit_learner=True, *args):
|
||||||
assert data.binary, f'{self.__class__.__name__} works only on problems of binary classification' \
|
self._check_binary(data, self.__class__.__name__)
|
||||||
f'Use the class OneVsAll to enable {self.__class__.__name__} work on single-label data.'
|
|
||||||
assert fit_learner, 'the method requires that fit_learner=True'
|
assert fit_learner, 'the method requires that fit_learner=True'
|
||||||
self.learner = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs).fit(data.instances, data.labels)
|
self.learner = SVMperf(self.svmperf_base, loss=self.loss, **self.kwargs).fit(data.instances, data.labels)
|
||||||
return self
|
return self
|
||||||
|
@ -386,6 +427,9 @@ class OneVsAll(AggregativeQuantifier):
|
||||||
f'{self.__class__.__name__} expect non-binary data'
|
f'{self.__class__.__name__} expect non-binary data'
|
||||||
assert isinstance(self.binary_quantifier, BaseQuantifier), \
|
assert isinstance(self.binary_quantifier, BaseQuantifier), \
|
||||||
f'{self.binary_quantifier} does not seem to be a Quantifier'
|
f'{self.binary_quantifier} does not seem to be a Quantifier'
|
||||||
|
if not isinstance(self.binary_quantifier, BinaryQuantifier):
|
||||||
|
raise ValueError(f'{self.binary_quantifier.__class__.__name__} does not seem to be an instance of '
|
||||||
|
f'{BinaryQuantifier.__class__.__name__}')
|
||||||
self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
|
self.dict_binary_quantifiers = {c: deepcopy(self.binary_quantifier) for c in data.classes_}
|
||||||
self.__parallel(self._delayed_binary_fit, data, **kwargs)
|
self.__parallel(self._delayed_binary_fit, data, **kwargs)
|
||||||
return self
|
return self
|
||||||
|
|
|
@ -6,6 +6,7 @@ import numpy as np
|
||||||
import urllib
|
import urllib
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
|
||||||
def get_parallel_slices(n_tasks, n_jobs=-1):
|
def get_parallel_slices(n_tasks, n_jobs=-1):
|
||||||
|
@ -58,4 +59,19 @@ def create_if_not_exist(path):
|
||||||
|
|
||||||
|
|
||||||
def get_quapy_home():
|
def get_quapy_home():
|
||||||
return os.path.join(str(Path.home()), 'quapy_data')
|
home = os.path.join(str(Path.home()), 'quapy_data')
|
||||||
|
os.makedirs(home, exist_ok=True)
|
||||||
|
return home
|
||||||
|
|
||||||
|
|
||||||
|
def pickled_resource(pickle_path:str, generation_func:callable, *args):
|
||||||
|
if pickle_path is None:
|
||||||
|
return generation_func(*args)
|
||||||
|
else:
|
||||||
|
if os.path.exists(pickle_path):
|
||||||
|
return pickle.load(open(pickle_path, 'rb'))
|
||||||
|
else:
|
||||||
|
instance = generation_func(*args)
|
||||||
|
os.makedirs(str(Path(pickle_path).parent), exist_ok=True)
|
||||||
|
pickle.dump(instance, open(pickle_path, 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||||
|
return instance
|
||||||
|
|
54
test.py
54
test.py
|
@ -3,11 +3,13 @@ from sklearn.svm import LinearSVC
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
import quapy.functional as F
|
import quapy.functional as F
|
||||||
import sys
|
import sys
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
#qp.datasets.fetch_reviews('hp')
|
#qp.datasets.fetch_reviews('hp')
|
||||||
#qp.datasets.fetch_twitter('sst')
|
#qp.datasets.fetch_twitter('sst')
|
||||||
|
|
||||||
#sys.exit()
|
#sys.exit()
|
||||||
|
from model_selection import GridSearchQ
|
||||||
|
|
||||||
SAMPLE_SIZE=500
|
SAMPLE_SIZE=500
|
||||||
binary = False
|
binary = False
|
||||||
|
@ -17,27 +19,31 @@ if binary:
|
||||||
dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
|
dataset = qp.datasets.fetch_reviews('kindle', tfidf=True, min_df=5)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
dataset = qp.datasets.fetch_twitter('semeval13', model_selection=False, min_df=10)
|
dataset = qp.datasets.fetch_twitter('hcr', for_model_selection=False, min_df=10, pickle=True)
|
||||||
dataset.training = dataset.training.sampling(SAMPLE_SIZE, 0.2, 0.5, 0.3)
|
# dataset.training = dataset.training.sampling(SAMPLE_SIZE, 0.2, 0.5, 0.3)
|
||||||
|
|
||||||
print('dataset loaded')
|
print('dataset loaded')
|
||||||
|
|
||||||
# training a quantifier
|
# training a quantifier
|
||||||
learner = LogisticRegression()
|
learner = LogisticRegression(max_iter=1000)
|
||||||
# model = qp.method.aggregative.ClassifyAndCount(learner)
|
# model = qp.method.aggregative.ClassifyAndCount(learner)
|
||||||
# model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
|
model = qp.method.aggregative.AdjustedClassifyAndCount(learner)
|
||||||
# model = qp.method.aggregative.ProbabilisticClassifyAndCount(learner)
|
# model = qp.method.aggregative.ProbabilisticClassifyAndCount(learner)
|
||||||
# model = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner)
|
# model = qp.method.aggregative.ProbabilisticAdjustedClassifyAndCount(learner)
|
||||||
# model = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
|
# model = qp.method.aggregative.ExpectationMaximizationQuantifier(learner)
|
||||||
# model = qp.method.aggregative.ExplicitLossMinimisationBinary(svmperf_home, loss='q', C=100)
|
# model = qp.method.aggregative.ExplicitLossMinimisationBinary(svmperf_home, loss='q', C=100)
|
||||||
model = qp.method.aggregative.SVMQ(svmperf_home, C=1)
|
# model = qp.method.aggregative.SVMQ(svmperf_home, C=1)
|
||||||
|
|
||||||
if not binary:
|
if not binary and isinstance(model, qp.method.aggregative.BinaryQuantifier):
|
||||||
model = qp.method.aggregative.OneVsAll(model)
|
model = qp.method.aggregative.OneVsAll(model)
|
||||||
|
|
||||||
print('fitting model')
|
|
||||||
model.fit(dataset.training)
|
|
||||||
|
|
||||||
|
# Model fit and Evaluation on the test data
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
print(f'fitting model {model.__class__.__name__}')
|
||||||
|
train, val = dataset.training.split_stratified(0.6)
|
||||||
|
model.fit(train, val_split=val)
|
||||||
|
|
||||||
# estimating class prevalences
|
# estimating class prevalences
|
||||||
print('quantifying')
|
print('quantifying')
|
||||||
|
@ -47,14 +53,15 @@ prevalences_true = dataset.test.prevalence()
|
||||||
# evaluation (one single prediction)
|
# evaluation (one single prediction)
|
||||||
error = qp.error.mae(prevalences_true, prevalences_estim)
|
error = qp.error.mae(prevalences_true, prevalences_estim)
|
||||||
|
|
||||||
print(f'method {model.__class__.__name__}')
|
|
||||||
|
|
||||||
print(f'Evaluation in test (1 eval)')
|
print(f'Evaluation in test (1 eval)')
|
||||||
print(f'true prevalence {F.strprev(prevalences_true)}')
|
print(f'true prevalence {F.strprev(prevalences_true)}')
|
||||||
print(f'estim prevalence {F.strprev(prevalences_estim)}')
|
print(f'estim prevalence {F.strprev(prevalences_estim)}')
|
||||||
print(f'mae={error:.3f}')
|
print(f'mae={error:.3f}')
|
||||||
|
|
||||||
|
|
||||||
|
# Model fit and Evaluation according to the artificial sampling protocol
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
|
||||||
max_evaluations = 5000
|
max_evaluations = 5000
|
||||||
n_prevpoints = F.get_nprevpoints_approximation(combinations_budget=max_evaluations, n_classes=dataset.n_classes)
|
n_prevpoints = F.get_nprevpoints_approximation(combinations_budget=max_evaluations, n_classes=dataset.n_classes)
|
||||||
n_evaluations = F.num_prevalence_combinations(n_prevpoints, dataset.n_classes)
|
n_evaluations = F.num_prevalence_combinations(n_prevpoints, dataset.n_classes)
|
||||||
|
@ -70,3 +77,30 @@ for error in qp.error.QUANTIFICATION_ERROR:
|
||||||
score = error(true_prev, estim_prev)
|
score = error(true_prev, estim_prev)
|
||||||
print(f'{error.__name__}={score:.5f}')
|
print(f'{error.__name__}={score:.5f}')
|
||||||
|
|
||||||
|
|
||||||
|
# Model selection and Evaluation according to the artificial sampling protocol
|
||||||
|
# ----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]}
|
||||||
|
|
||||||
|
model_selection = GridSearchQ(model,
|
||||||
|
param_grid=param_grid,
|
||||||
|
sample_size=SAMPLE_SIZE,
|
||||||
|
eval_budget=max_evaluations//10,
|
||||||
|
error='mae',
|
||||||
|
refit=True,
|
||||||
|
verbose=True)
|
||||||
|
|
||||||
|
# model = model_selection.fit(dataset.training, validation=0.3)
|
||||||
|
model = model_selection.fit(train, validation=val)
|
||||||
|
print(f'Model selection: best_params = {model_selection.best_params_}')
|
||||||
|
print(f'param scores:')
|
||||||
|
for params, score in model_selection.param_scores_.items():
|
||||||
|
print(f'\t{params}: {score:.5f}')
|
||||||
|
|
||||||
|
true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(model, dataset.test, SAMPLE_SIZE, n_prevpoints)
|
||||||
|
|
||||||
|
print(f'After model selection: Evaluation according to the artificial sampling protocol ({len(true_prev)} evals)')
|
||||||
|
for error in qp.error.QUANTIFICATION_ERROR:
|
||||||
|
score = error(true_prev, estim_prev)
|
||||||
|
print(f'{error.__name__}={score:.5f}')
|
Loading…
Reference in New Issue