preliminary experiment for post-hoc prediction

2023-11-09 10:28:40 +01:00 · 2023-11-09 10:28:40 +01:00 · 288181c9c7
parent 2df89c83e8
commit 288181c9c7
1 changed files with 89 additions and 0 deletions
--- a/Retrieval/preliminary.py
+++ b/Retrieval/preliminary.py
@ -0,0 +1,89 @@
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+
+import quapy as qp
+import quapy.functional as F
+from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
+from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
+from quapy.protocol import AbstractProtocol
+from quapy.data.base import LabelledCollection
+
+from glob import glob
+from os.path import join
+
+def methods():
+    """Yield ``(name, quantifier)`` pairs for every method under comparison.
+
+    MLPE comes first; each of the remaining quantifiers, all imported from
+    ``quapy.method.aggregative``, wraps its own ``LogisticRegression(n_jobs=-1)``
+    instance. Quantifiers are instantiated lazily, one per iteration step.
+
+    :return: generator of tuples ``(method_name, quantifier)``
+    """
+    yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
+
+    aggregative = (
+        ('CC', ClassifyAndCount),
+        ('ACC', ACC),
+        ('PCC', PCC),
+        ('PACC', PACC),
+        ('EMQ', EMQ),
+    )
+    for label, quantifier_cls in aggregative:
+        yield (label, quantifier_cls(LogisticRegression(n_jobs=-1)))
+
+
+def load_txt_sample(path, verbose=False):
+    """Load the texts and labels stored in a tab-separated file.
+
+    :param path: path of a tab-separated file whose header row includes the
+        columns ``text`` and ``first_letter_category``
+    :param verbose: if True, print a progress message around the read
+    :return: a tuple ``(X, y)`` of pandas Series: the ``text`` column and the
+        ``first_letter_category`` column
+    :raises KeyError: if either column is missing from the file
+    """
+    if verbose:
+        # flush: the message has no trailing newline, so a line-buffered stdout
+        # would otherwise hold it back until '[done]' is printed
+        print(f'loading {path}...', end='', flush=True)
+    df = pd.read_csv(path, sep='\t')
+    if verbose:
+        print('[done]')
+    X = df['text']
+    y = df['first_letter_category']
+
+    return X, y
+
+class RetrievedSamples(AbstractProtocol):
+    """Protocol that generates one test sample per ``test_data_*.txt`` file.
+
+    :param path_dir: directory searched for files named ``test_data_*.txt``
+    :param load_fn: callable mapping a file path to a pair ``(X, y)`` of raw
+        texts and labels
+    :param vectorizer: object exposing ``transform``; it is only applied to
+        the loaded texts here, never fitted
+    :param classes: classes passed on to every generated
+        :class:`LabelledCollection`
+    """
+
+    def __init__(self, path_dir: str, load_fn, vectorizer, classes):
+        self.path_dir = path_dir
+        self.load_fn = load_fn
+        self.vectorizer = vectorizer
+        self.classes = classes
+
+    def __call__(self):
+        """Generate the samples, one per matching file, in sorted path order.
+
+        :return: yields ``sample.Xp`` of the :class:`LabelledCollection` built
+            from each file
+        """
+        # glob returns paths in arbitrary order; sorting keeps the sequence of
+        # samples the same from run to run
+        for file in sorted(glob(join(self.path_dir, 'test_data_*.txt'))):
+            X, y = self.load_fn(file)
+            expected_size = qp.environ['SAMPLE_SIZE']
+            if len(X) != expected_size:
+                # a size mismatch is only reported; the sample is still used
+                print(f'[warning]: file {file} contains {len(X)} instances (expected: {expected_size})')
+            X = self.vectorizer.transform(X)
+            sample = LabelledCollection(X, y, classes=self.classes)
+            yield sample.Xp
+
+
+# Expected number of instances per test file; RetrievedSamples prints a
+# warning for every file whose size differs from this value.
+qp.environ['SAMPLE_SIZE']=100
+
+# The training file and the test_data_*.txt files live in the same directory.
+data_path = './data'
+train_path = join(data_path, 'train_data.txt')
+
+
+tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5)
+
+# Load the raw training texts and their labels.
+training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True)
+
+# NOTE(review): disabled line; looks like an optional subsampling of the
+# training set -- confirm before re-enabling.
+# training = training.sampling(1000)
+
+# Fit the vectorizer on the training texts only; the test protocol below
+# reuses the fitted vectorizer through transform().
+Xtr, ytr = training.Xy
+Xtr = tfidf.fit_transform(Xtr)
+print('Xtr shape = ', Xtr.shape)
+
+# Rebuild the training collection on top of the vectorized instances and take
+# its classes, which are handed to the test protocol.
+training = LabelledCollection(Xtr, ytr)
+classes = training.classes_
+
+test_prot = RetrievedSamples(data_path, load_fn=load_txt_sample, vectorizer=tfidf, classes=classes)
+
+# Print the training prevalence and the prevalence of every test sample.
+# This pass loads and vectorizes each test file only to print its prevalence.
+print('Training prevalence:', F.strprev(training.prevalence()))
+for X, p in test_prot():
+    print('Test prevalence:', F.strprev(p))
+
+# Fit each quantifier on the training collection, evaluate it on the test
+# protocol with the 'mae' and 'mrae' error metrics, and print report.mean().
+for method_name, quantifier in methods():
+    print('training ', method_name)
+    quantifier.fit(training)
+    print('[done]')
+
+    report = qp.evaluation.evaluation_report(quantifier, test_prot, error_metrics=['mae', 'mrae'], verbose=True)
+
+    print(report.mean())
+
+
+