diff --git a/quapy/data/base.py b/quapy/data/base.py
index ce7b6d9..45f9a76 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -40,6 +40,8 @@ class LabelledCollection:
         return self.n_classes == 2
 
     def sampling_index(self, size, *prevs, shuffle=True):
+        if len(prevs) == 0: # no prevalence was indicated; returns an index for uniform sampling
+            return np.random.choice(len(self), size, replace=False)
         if len(prevs) == self.n_classes-1:
             prevs = prevs + (1-sum(prevs),)
         assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
@@ -68,9 +70,16 @@ class LabelledCollection:
 
         return indexes_sample
 
+    # def uniform_sampling_index(self, size):
+    #     return np.random.choice(len(self), size, replace=False)
+
+    # def uniform_sampling(self, size):
+    #     unif_index = self.uniform_sampling_index(size)
+    #     return self.sampling_from_index(unif_index)
+
     def sampling(self, size, *prevs, shuffle=True):
-        index = self.sampling_index(size, *prevs, shuffle=shuffle)
-        return self.sampling_from_index(index)
+        prev_index = self.sampling_index(size, *prevs, shuffle=shuffle)
+        return self.sampling_from_index(prev_index)
 
     def sampling_from_index(self, index):
         documents = self.instances[index]
@@ -92,6 +101,14 @@ class LabelledCollection:
         for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
             yield self.sampling_index(sample_size, *prevs)
 
+    def natural_sampling_generator(self, sample_size, repeats=100):
+        for _ in range(repeats):
+            yield self.uniform_sampling(sample_size)
+
+    def natural_sampling_index_generator(self, sample_size, repeats=100):
+        for _ in range(repeats):
+            yield self.uniform_sampling_index(sample_size)
+
     def __add__(self, other):
         if issparse(self.instances) and issparse(other.documents):
             docs = vstack([self.instances, other.documents])
diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
index 2c25de9..9b486ed 100644
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -1,8 +1,8 @@
 import zipfile
-from utils.util import download_file_if_not_exists, download_file, get_quapy_home
+from util import download_file_if_not_exists, download_file, get_quapy_home
 import os
 from os.path import join
-from data.base import Dataset, LabelledCollection
+from data.base import Dataset
 from data.reader import from_text, from_sparse
 from data.preprocessing import text2tfidf, reduce_columns
 
diff --git a/quapy/data/preprocessing.py b/quapy/data/preprocessing.py
index b08bcab..3c1c6c1 100644
--- a/quapy/data/preprocessing.py
+++ b/quapy/data/preprocessing.py
@@ -2,7 +2,7 @@ import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from data.base import Dataset
 from scipy.sparse import spmatrix
-from utils.util import parallelize
+from util import parallelize
 from .base import LabelledCollection
 from tqdm import tqdm
diff --git a/quapy/evaluation.py b/quapy/evaluation.py
index 106eb11..8003c38 100644
--- a/quapy/evaluation.py
+++ b/quapy/evaluation.py
@@ -1,7 +1,7 @@
 from data import LabelledCollection
 from quapy.method.aggregative import AggregativeQuantifier, AggregativeProbabilisticQuantifier
 from method.base import BaseQuantifier
-from utils.util import temp_seed
+from util import temp_seed
 import numpy as np
 from joblib import Parallel, delayed
 from tqdm import tqdm
diff --git a/quapy/functional.py b/quapy/functional.py
index c351990..17966b5 100644
--- a/quapy/functional.py
+++ b/quapy/functional.py
@@ -94,11 +94,11 @@ def num_prevalence_combinations(n_prevpoints:int, n_classes:int, n_repeats:int=1
     """
     __cache={}
     def __f(nc,np):
-        if (nc,np) in __cache:
+        if (nc,np) in __cache: # cached result
            return __cache[(nc,np)]
-        if nc==1:
+        if nc==1: # stop condition
            return 1
-        else:
+        else: # recursive call
            x = sum([__f(nc-1, np-i) for i in range(np)])
            __cache[(nc,np)] = x
            return x
diff --git a/quapy/utils/util.py b/quapy/util.py
similarity index 96%
rename from quapy/utils/util.py
rename to quapy/util.py
index 921ab1b..d9430c8 100644
--- a/quapy/utils/util.py
+++ b/quapy/util.py
@@ -8,8 +8,6 @@ import os
 from pathlib import Path
 
 
-
-
 def get_parallel_slices(n_tasks, n_jobs=-1):
     if n_jobs == -1:
         n_jobs = multiprocessing.cpu_count()
diff --git a/quapy/utils/__init__.py b/quapy/utils/__init__.py
deleted file mode 100644
index 907cc97..0000000
--- a/quapy/utils/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from . import util
\ No newline at end of file
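
The base.py hunks above let sampling_index/sampling fall back to uniform sampling when no prevalence is passed, and add natural-prevalence sample generators. A minimal usage sketch follows; the import path, the (instances, labels) constructor call, and the toy arrays are assumptions made for illustration, and the natural_sampling_* generators are not exercised because they depend on the uniform_sampling helpers that remain commented out in this patch.

import numpy as np
from quapy.data.base import LabelledCollection  # assumed import path

# toy data, purely illustrative: 10 instances with binary labels
instances = np.random.rand(10, 5)
labels = np.array([0, 1, 0, 1, 1, 0, 0, 1, 1, 0])
data = LabelledCollection(instances, labels)  # assumed (instances, labels) constructor

# pre-existing behaviour: sample at an explicit prevalence (0.7 for class 0, remainder inferred)
biased_sample = data.sampling(6, 0.7)

# new behaviour in this patch: no prevalence given -> uniform sampling without replacement
natural_sample = data.sampling(6)

# index-only variant; the same index can be reused via sampling_from_index
idx = data.sampling_index(6)
same_sample = data.sampling_from_index(idx)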