class ReadMe(BaseQuantifier):
    """
    Sketch of the ReadMe quantification method of Hopkins & King (2010), a non-aggregative
    quantifier for text. Training documents are vectorized with binary term presence; at
    quantification time, class prevalences are estimated by averaging, over bootstrap
    resamples of the documents and bagging resamples of the features, the solution of a
    standard-deviation-constrained linear least-squares problem.

    NOTE(review): :meth:`std_constrained_linear_ls` is not yet implemented, so
    :meth:`quantify` currently raises ``NotImplementedError`` — this class is a sketch.

    :param bootstrap_trials: number of bootstrap repetitions over documents (default 100)
    :param bootstrap_range: number of documents drawn (without replacement) per bootstrap
    :param bagging_trials: number of bagging repetitions over features per bootstrap (default 100)
    :param bagging_range: number of features drawn (without replacement) per bag
    :param vectorizer_kwargs: extra keyword arguments forwarded to
        :class:`sklearn.feature_extraction.text.CountVectorizer`
    """

    def __init__(self, bootstrap_trials=100, bootstrap_range=100, bagging_trials=100, bagging_range=25,
                 **vectorizer_kwargs):
        self.bootstrap_trials = bootstrap_trials
        self.bootstrap_range = bootstrap_range
        self.bagging_trials = bagging_trials
        self.bagging_range = bagging_range
        self.vectorizer_kwargs = vectorizer_kwargs

    def fit(self, data: LabelledCollection):
        """
        Learns the vocabulary from the training texts and stores, for each class, the
        binary document-term submatrix of the training documents labelled with that class.

        :param data: a :class:`quapy.data.LabelledCollection` of raw texts and labels
        :return: self
        """
        X, y = data.Xy
        self.vectorizer = CountVectorizer(binary=True, **self.vectorizer_kwargs)
        X = self.vectorizer.fit_transform(X)
        # fix: `classes_` is the array of class labels, not a count; the original called
        # range(data.classes_), which raises TypeError. Key the dict by the labels themselves.
        self.class_conditional_X = {c: X[y == c] for c in data.classes_}
        # fix: fit must return self (convention followed by the other quantifiers in the project)
        return self

    def quantify(self, instances):
        """
        Estimates class prevalences for a sample of raw texts via bootstrap (over documents)
        and bagging (over features) of the constrained least-squares estimator.

        :param instances: iterable of raw texts
        :return: the mean prevalence estimate across bootstrap trials
        """
        X = self.vectorizer.transform(instances)
        num_docs, num_feats = X.shape

        # bootstrap over documents
        p_boots = []
        for _ in range(self.bootstrap_trials):
            # fix: the original read `self.bootstra_range` (typo -> AttributeError);
            # also clip the sample size so choice(replace=False) cannot raise when the
            # sample is smaller than the requested range
            docs_idx = np.random.choice(num_docs, size=min(self.bootstrap_range, num_docs), replace=False)
            # NOTE(review): the same document indices are applied to every class-conditional
            # training matrix; this assumes each class has at least max(docs_idx)+1 training
            # documents — TODO confirm against the intended ReadMe procedure.
            # (renamed the loop variable so it no longer shadows the test matrix X)
            class_conditional_X = {c: Xc[docs_idx] for c, Xc in self.class_conditional_X.items()}
            Xboot = X[docs_idx]

            # bagging over features
            p_bags = []
            for _ in range(self.bagging_trials):
                feat_idx = np.random.choice(num_feats, size=min(self.bagging_range, num_feats), replace=False)
                class_conditional_Xbag = {c: Xc[:, feat_idx] for c, Xc in class_conditional_X.items()}
                Xbag = Xboot[:, feat_idx]
                p = self.std_constrained_linear_ls(Xbag, class_conditional_Xbag)
                p_bags.append(p)
            p_boots.append(np.mean(p_bags, axis=0))

        p_mean = np.mean(p_boots, axis=0)
        # fix: the dispersion must be taken across the bootstrap estimates; the original
        # computed np.std(p_bags, ...) over the *last* trial's bags only
        p_std = np.std(p_boots, axis=0)

        return p_mean

    def std_constrained_linear_ls(self, X, class_cond_X: dict):
        """
        Solves the standard-deviation-constrained linear least-squares problem of ReadMe
        for one bag of features. Not yet implemented.

        :raises NotImplementedError: always, until the solver is written
        """
        # fix: the original body was `pass`, silently returning None and making
        # np.mean(p_bags, axis=0) fail later with an obscure error; fail fast instead
        raise NotImplementedError('std_constrained_linear_ls is not yet implemented')