force all samples be with replacement in base.LabelledCollection, irrespective of the sample size requested

sketching readme system by Lu and King, Hopkins and King
2024-02-28 08:46:54 +01:00 · 2024-02-16 17:34:10 +01:00
4 changed files with 61 additions and 11 deletions
--- a/CHANGE_LOG.txt
+++ b/CHANGE_LOG.txt
@ -1,3 +1,9 @@
+Change Log 0.1.9
+----------------
+
+<...>
+
+
 Change Log 0.1.8
 ----------------

--- a/quapy/init.py
+++ b/quapy/init.py
@ -11,7 +11,7 @@ from . import util
 from . import model_selection
 from . import classification

-__version__ = '0.1.8'
+__version__ = '0.1.9'

 environ = {
    'SAMPLE_SIZE': None,
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@ -108,8 +108,7 @@ class LabelledCollection:
        """
        Returns an index to be used to extract a random sample of desired size and desired prevalence values. If the
        prevalence values are not specified, then returns the index of a uniform sampling.
-        For each class, the sampling is drawn with replacement if the requested prevalence is larger than
-        the actual prevalence of the class, or without replacement otherwise.
+        For each class, the sampling is drawn with replacement.

        :param size: integer, the requested size
        :param prevs: the prevalence for each class; the prevalence value for the last class can be lead empty since
@ -153,7 +152,7 @@ class LabelledCollection:
            for class_, n_requested in n_requests.items():
                n_candidates = len(self.index[class_])
                index_sample = self.index[class_][
-                    np.random.choice(n_candidates, size=n_requested, replace=(n_requested > n_candidates))
+                    np.random.choice(n_candidates, size=n_requested, replace=True)
                ] if n_requested > 0 else []

                indexes_sample.append(index_sample)
@ -168,8 +167,7 @@ class LabelledCollection:
    def uniform_sampling_index(self, size, random_state=None):
        """
        Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn
-        with replacement if the requested size is greater than the number of instances, or without replacement
-        otherwise.
+        with replacement.

        :param size: integer, the size of the uniform sample
        :param random_state: if specified, guarantees reproducibility of the split.
@ -179,13 +177,12 @@ class LabelledCollection:
            ng = RandomState(seed=random_state)
        else:
            ng = np.random
-        return ng.choice(len(self), size, replace=size > len(self))
+        return ng.choice(len(self), size, replace=True)

    def sampling(self, size, *prevs, shuffle=True, random_state=None):
        """
        Return a random sample (an instance of :class:`LabelledCollection`) of desired size and desired prevalence
-        values. For each class, the sampling is drawn without replacement if the requested prevalence is larger than
-        the actual prevalence of the class, or with replacement otherwise.
+        values. For each class, the sampling is drawn with replacement.

        :param size: integer, the requested size
        :param prevs: the prevalence for each class; the prevalence value for the last class can be lead empty since
@ -202,8 +199,7 @@ class LabelledCollection:
    def uniform_sampling(self, size, random_state=None):
        """
        Returns a uniform sample (an instance of :class:`LabelledCollection`) of desired size. The sampling is drawn
-        with replacement if the requested size is greater than the number of instances, or without replacement
-        otherwise.
+        with replacement.

        :param size: integer, the requested size
        :param random_state: if specified, guarantees reproducibility of the split.
--- a/quapy/method/non_aggregative.py
+++ b/quapy/method/non_aggregative.py
@ -1,5 +1,6 @@
 from typing import Union, Callable
 import numpy as np
+from sklearn.feature_extraction.text import CountVectorizer

 from quapy.functional import get_divergence
 from quapy.data import LabelledCollection
@ -146,6 +147,53 @@ class DMx(BaseQuantifier):
        return F.argmin_prevalence(loss, n_classes, method=self.search)


+class ReadMe(BaseQuantifier):
+    """
+    Work-in-progress sketch of the ReadMe quantification method.
+
+    Documents are turned into binary term-presence vectors with a :class:`CountVectorizer`. At quantification
+    time, prevalence estimates are averaged over bootstrap trials (random subsets of documents), each of which
+    averages over bagging trials (random subsets of features).
+
+    NOTE: :meth:`std_constrained_linear_ls` is still a stub (it returns None), so :meth:`quantify` cannot yet
+    produce meaningful estimates.
+
+    :param bootstrap_trials: number of bootstrap trials
+    :param bootstrap_range: number of documents drawn (without replacement) in each bootstrap trial
+    :param bagging_trials: number of bagging trials run within each bootstrap trial
+    :param bagging_range: number of features drawn (without replacement) in each bagging trial
+    :param vectorizer_kwargs: additional keyword arguments forwarded to :class:`CountVectorizer`
+    """
+
+    def __init__(self, bootstrap_trials=100, bootstrap_range=100, bagging_trials=100, bagging_range=25, **vectorizer_kwargs):
+        self.bootstrap_trials = bootstrap_trials
+        self.bootstrap_range = bootstrap_range
+        self.bagging_trials = bagging_trials
+        self.bagging_range = bagging_range
+        self.vectorizer_kwargs = vectorizer_kwargs
+
+    def fit(self, data: LabelledCollection):
+        """
+        Fits the binary vectorizer on the training documents and stores, for each class, the rows of the
+        vectorized matrix that carry that class label.
+
+        :param data: the training collection; its `Xy` attribute is unpacked into documents and labels
+        """
+        X, y = data.Xy
+        self.vectorizer = CountVectorizer(binary=True, **self.vectorizer_kwargs)
+        X = self.vectorizer.fit_transform(X)
+        # NOTE(review): range() needs an integer; if `classes_` holds the array of class labels this raises
+        # TypeError -- confirm whether the number of classes was intended here
+        self.class_conditional_X = {i: X[y==i] for i in range(data.classes_)}
+
+    def quantify(self, instances):
+        """
+        Estimates class prevalence values for `instances` by averaging the solutions obtained over all
+        bootstrap trials, each of which is in turn the average over its bagging trials.
+
+        :param instances: the raw documents to quantify; they are vectorized with the fitted vectorizer
+        :return: the mean, across bootstrap trials, of the per-trial prevalence estimates
+        """
+        X = self.vectorizer.transform(instances)
+
+        # number of documents and number of features in the vectorized sample
+        num_docs, num_feats = X.shape
+
+        # bootstrap
+        p_boots = []
+        for _ in range(self.bootstrap_trials):
+            # fixed: the attribute is `bootstrap_range` (was misspelled `bootstra_range` -> AttributeError)
+            docs_idx = np.random.choice(num_docs, size=self.bootstrap_range, replace=False)
+            # NOTE(review): docs_idx is drawn over the rows of the test matrix but is also used to index every
+            # class-conditional training matrix -- confirm this is intended
+            class_conditional_X = {i: Xi[docs_idx] for i, Xi in self.class_conditional_X.items()}
+            Xboot = X[docs_idx]
+
+            # bagging
+            p_bags = []
+            for _ in range(self.bagging_trials):
+                feat_idx = np.random.choice(num_feats, size=self.bagging_range, replace=False)
+                class_conditional_Xbag = {i: Xi[:, feat_idx] for i, Xi in class_conditional_X.items()}
+                Xbag = Xboot[:,feat_idx]
+                p = self.std_constrained_linear_ls(Xbag, class_conditional_Xbag)
+                p_bags.append(p)
+            p_boots.append(np.mean(p_bags, axis=0))
+
+        p_mean = np.mean(p_boots, axis=0)
+        # fixed: spread is measured across bootstrap trials (p_boots), consistently with p_mean; it was computed
+        # over p_bags, i.e., only over the bags of the last bootstrap trial. Computed but not returned yet.
+        p_std  = np.std(p_boots, axis=0)
+
+        return p_mean
+
+
+    def std_constrained_linear_ls(self, X, class_cond_X: dict):
+        """
+        Placeholder for the solver invoked once per bagging trial; not implemented yet (returns None).
+
+        :param X: the bootstrapped test matrix restricted to the sampled features
+        :param class_cond_X: dictionary mapping each class to its matrix restricted to the same features
+        """
+        pass
+

 def _get_features_range(X):
    feat_ranges = []
Author	SHA1	Message	Date
Alejandro Moreo Fernandez	75af15ae4a	force all samples be with replacement in base.LabelledCollection, irrespective of the sample size requested	2024-02-28 08:46:54 +01:00
Alejandro Moreo Fernandez	d50a86daf4	sketching readme system by Lu and King, Hopkins and King	2024-02-16 17:34:10 +01:00