documented ReadMe

2025-10-20 18:33:45 +02:00 · 2025-10-20 18:33:45 +02:00 · 854b3ba3f9
parent eafe486893
commit 854b3ba3f9
2 changed files with 26 additions and 2 deletions
--- a/examples/18.ReadMe_for_text_analysis.py
+++ b/examples/18.ReadMe_for_text_analysis.py
@@ -16,8 +16,8 @@ for test_prev in [[0.25, 0.75], [0.5, 0.5], [0.75, 0.25]]:
    prev_estim, conf = readme.predict_conf(sample.X)
    err = qp.error.mae(sample.prevalence(), prev_estim)
    print(f'true-prevalence={F.strprev(sample.prevalence())},\n'
-          f'predicted-prevalence={F.strprev(prev_estim)},\n'
+          f'predicted-prevalence={F.strprev(prev_estim)}, with confidence intervals {conf},\n'
          f'MAE={err:.4f}')
-    print(conf)
+


--- a/quapy/method/non_aggregative.py
+++ b/quapy/method/non_aggregative.py
@@ -153,6 +153,30 @@ class DMx(BaseQuantifier):


 class ReadMe(BaseQuantifier, WithConfidenceABC):
+    """
+    ReadMe is a non-aggregative quantification system proposed by
+    `Daniel Hopkins and Gary King, 2010. A method of automated nonparametric content analysis for
+    social science. American Journal of Political Science, 54(1):229–247.
+    <https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1540-5907.2009.00428.x>`_.
+    The idea is to estimate `Q(Y=i)` directly from:
+
+    :math:`Q(X)=\\sum_{i=1} Q(X|Y=i) Q(Y=i)`
+
+    via least-squares regression, i.e., without incurring the cost of computing posterior probabilities.
+    However, this poses a very difficult problem, since the vector `Q(X)` and the matrix `Q(X|Y=i)`
+    can be of very high dimensions. In order to render the problem tractable, ReadMe performs bagging in
+    the feature space. ReadMe also combines bagging with bootstrap in order to derive confidence intervals
+    around point estimations.
+
+    :param bootstrap_trials: int, number of bootstrap trials (default 100)
+    :param bagging_trials: int, number of bagging trials (default 100)
+    :param bagging_range: int, number of features to keep for each bagging trial (default 250)
+    :param confidence_level: float, a value in (0,1) reflecting the desired confidence level (default 0.95)
+    :param region: str in 'intervals', 'ellipse', 'ellipse-clr'; indicates the preferred method for
+        defining the confidence region (see :class:`WithConfidenceABC`)
+    :param random_state: int or None, allows replicability (default None)
+    :param verbose: bool, whether to display information during the process (default False)
+    """

    def __init__(self,
                 bootstrap_trials=100,