uniform sampling added if *prevs is empty

This commit is contained in:
Alejandro Moreo Fernandez 2020-12-17 18:17:17 +01:00
parent bcb8432457
commit 7d6f523e4b
7 changed files with 26 additions and 12 deletions

View File

@ -40,6 +40,8 @@ class LabelledCollection:
return self.n_classes == 2 return self.n_classes == 2
def sampling_index(self, size, *prevs, shuffle=True): def sampling_index(self, size, *prevs, shuffle=True):
if len(prevs) == 0: # no prevalence was indicated; returns an index for uniform sampling
return np.random.choice(len(self), size, replace=False)
if len(prevs) == self.n_classes-1: if len(prevs) == self.n_classes-1:
prevs = prevs + (1-sum(prevs),) prevs = prevs + (1-sum(prevs),)
assert len(prevs) == self.n_classes, 'unexpected number of prevalences' assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
@ -68,9 +70,16 @@ class LabelledCollection:
return indexes_sample return indexes_sample
# def uniform_sampling_index(self, size):
# return np.random.choice(len(self), size, replace=False)
# def uniform_sampling(self, size):
# unif_index = self.uniform_sampling_index(size)
# return self.sampling_from_index(unif_index)
def sampling(self, size, *prevs, shuffle=True): def sampling(self, size, *prevs, shuffle=True):
index = self.sampling_index(size, *prevs, shuffle=shuffle) prev_index = self.sampling_index(size, *prevs, shuffle=shuffle)
return self.sampling_from_index(index) return self.sampling_from_index(prev_index)
def sampling_from_index(self, index): def sampling_from_index(self, index):
documents = self.instances[index] documents = self.instances[index]
@ -92,6 +101,14 @@ class LabelledCollection:
for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats): for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats):
yield self.sampling_index(sample_size, *prevs) yield self.sampling_index(sample_size, *prevs)
def natural_sampling_generator(self, sample_size, repeats=100):
    """Yield `repeats` samples of `sample_size` items drawn without imposing prevalences.

    :param sample_size: number of items requested for each sample
    :param repeats: how many samples to generate (default 100)
    :return: a generator yielding whatever `self.sampling` returns
    """
    for _ in range(repeats):
        # `uniform_sampling` is only present as commented-out code, so calling it
        # would raise AttributeError. `sampling` invoked with no prevalences
        # delegates to `sampling_index`, which falls back to a uniform
        # np.random.choice(..., replace=False) draw when *prevs is empty.
        yield self.sampling(sample_size)
def natural_sampling_index_generator(self, sample_size, repeats=100):
    """Yield `repeats` index arrays, each selecting `sample_size` items without imposed prevalences.

    :param sample_size: number of indexes requested for each sample
    :param repeats: how many index arrays to generate (default 100)
    :return: a generator yielding whatever `self.sampling_index` returns
    """
    for _ in range(repeats):
        # `uniform_sampling_index` is only present as commented-out code, so
        # calling it would raise AttributeError. `sampling_index` invoked with no
        # prevalences returns np.random.choice(len(self), size, replace=False),
        # i.e. the uniform index draw intended here.
        yield self.sampling_index(sample_size)
def __add__(self, other): def __add__(self, other):
if issparse(self.instances) and issparse(other.documents): if issparse(self.instances) and issparse(other.documents):
docs = vstack([self.instances, other.documents]) docs = vstack([self.instances, other.documents])

View File

@ -1,8 +1,8 @@
import zipfile import zipfile
from utils.util import download_file_if_not_exists, download_file, get_quapy_home from util import download_file_if_not_exists, download_file, get_quapy_home
import os import os
from os.path import join from os.path import join
from data.base import Dataset, LabelledCollection from data.base import Dataset
from data.reader import from_text, from_sparse from data.reader import from_text, from_sparse
from data.preprocessing import text2tfidf, reduce_columns from data.preprocessing import text2tfidf, reduce_columns

View File

@ -2,7 +2,7 @@ import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from data.base import Dataset from data.base import Dataset
from scipy.sparse import spmatrix from scipy.sparse import spmatrix
from utils.util import parallelize from util import parallelize
from .base import LabelledCollection from .base import LabelledCollection
from tqdm import tqdm from tqdm import tqdm

View File

@ -1,7 +1,7 @@
from data import LabelledCollection from data import LabelledCollection
from quapy.method.aggregative import AggregativeQuantifier, AggregativeProbabilisticQuantifier from quapy.method.aggregative import AggregativeQuantifier, AggregativeProbabilisticQuantifier
from method.base import BaseQuantifier from method.base import BaseQuantifier
from utils.util import temp_seed from util import temp_seed
import numpy as np import numpy as np
from joblib import Parallel, delayed from joblib import Parallel, delayed
from tqdm import tqdm from tqdm import tqdm

View File

@ -94,11 +94,11 @@ def num_prevalence_combinations(n_prevpoints:int, n_classes:int, n_repeats:int=1
""" """
__cache={} __cache={}
def __f(nc,np): def __f(nc,np):
if (nc,np) in __cache: if (nc,np) in __cache: # cached result
return __cache[(nc,np)] return __cache[(nc,np)]
if nc==1: if nc==1: # stop condition
return 1 return 1
else: else: # recursive call
x = sum([__f(nc-1, np-i) for i in range(np)]) x = sum([__f(nc-1, np-i) for i in range(np)])
__cache[(nc,np)] = x __cache[(nc,np)] = x
return x return x

View File

@ -8,8 +8,6 @@ import os
from pathlib import Path from pathlib import Path
def get_parallel_slices(n_tasks, n_jobs=-1): def get_parallel_slices(n_tasks, n_jobs=-1):
if n_jobs == -1: if n_jobs == -1:
n_jobs = multiprocessing.cpu_count() n_jobs = multiprocessing.cpu_count()

View File

@ -1 +0,0 @@
from . import util