# Python · 24 lines · 955 B
"""Quantify IMDB review sentiment with QuaPy's ReadMe method.

The script fits ReadMe on a reduced (1000-document) training split of the
IMDB reviews dataset, encoded as binary term-presence features. It then
draws three test samples at fixed target prevalences and, for each one,
prints the true prevalence, the estimated prevalence together with the
confidence information returned by ``predict_conf``, and the MAE.
"""
from sklearn.feature_extraction.text import CountVectorizer

import quapy as qp
from quapy.method.non_aggregative import ReadMe
import quapy.functional as F

# Shrink the training split to 1000 documents (fixed seed) to keep the run short.
reviews = qp.datasets.fetch_reviews('imdb').reduce(n_train=1000, random_state=0)

# binary=True records term presence/absence instead of counts;
# min_df=5 drops terms that occur in fewer than 5 documents.
encode_0_1 = CountVectorizer(min_df=5, binary=True)
train, test = qp.data.preprocessing.instance_transformation(reviews, encode_0_1, inplace=True).train_test

readme = ReadMe(bootstrap_trials=100, bagging_trials=100, bagging_range=100, random_state=0, verbose=True)
readme.fit(*train.Xy)

for test_prev in [[0.25, 0.75], [0.5, 0.5], [0.75, 0.25]]:
    # NOTE(review): the transformation above ran with inplace=True, so
    # `reviews.test` is presumably the same vectorized collection as `test`
    # -- confirm against the QuaPy docs before swapping one for the other.
    sample = reviews.test.sampling(500, *test_prev, random_state=0)

    # Compute the true prevalence once; it is needed for both the error and the report.
    true_prev = sample.prevalence()
    prev_estim, conf = readme.predict_conf(sample.X)
    err = qp.error.mae(true_prev, prev_estim)

    print(f'true-prevalence={F.strprev(true_prev)},\n'
          f'predicted-prevalence={F.strprev(prev_estim)}, with confidence intervals {conf},\n'
          f'MAE={err:.4f}')
|
|
|
|
|
|
|