From bf71aecf917f55e977b40f097a02d0d8efe1e12c Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Fri, 26 Sep 2025 12:19:45 +0200
Subject: [PATCH] added custom collection example and repr functions for
 labelled collection and dataset

---
 docs/source/manuals/datasets.md |   2 +-
 examples/3.custom_collection.py | 112 ++++++++++++++++++++++++++++++++
 quapy/data/base.py              |  19 +++-
 3 files changed, 131 insertions(+), 2 deletions(-)

diff --git a/docs/source/manuals/datasets.md b/docs/source/manuals/datasets.md
index 38d2bed..f818aa3 100644
--- a/docs/source/manuals/datasets.md
+++ b/docs/source/manuals/datasets.md
@@ -464,4 +464,4 @@ QuaPy implements a number of preprocessing functions in the package _qp.data.pre
 * _reduce_columns_: reducing the number of columns based on term frequency
 * _standardize_: transforms the column values into z-scores (i.e., subtract the mean and normalizes by the standard deviation, so that the column values have zero mean and unit variance).
-* _index_: transforms textual tokens into lists of numeric ids)
+* _index_: transforms textual tokens into lists of numeric ids
diff --git a/examples/3.custom_collection.py b/examples/3.custom_collection.py
index e69de29..13baeef 100644
--- a/examples/3.custom_collection.py
+++ b/examples/3.custom_collection.py
@@ -0,0 +1,112 @@
+import quapy as qp
+from quapy.method.aggregative import PACC
+from quapy.data import LabelledCollection, Dataset
+from quapy.protocol import ArtificialPrevalenceProtocol
+import quapy.functional as F
+import os
+from os.path import join
+
+# While quapy comes with ready-to-use datasets for experimental purposes, you may prefer to run experiments
+# using your own data. Most of QuaPy's functionality relies on an internal class called LabelledCollection,
+# which provides fast indexing and sampling. This example shows how to convert your own data into a
+# LabelledCollection so that all of that functionality becomes available, including procedures for tuning
+# the hyperparameters of your methods (sketched further below) and for evaluating performance using
+# high-level sampling protocols.
+
+# Let us assume that we have a binary sentiment dataset of opinions in natural language. We will use the "IMDb"
+# dataset of reviews, which can be downloaded as follows
+URL_TRAIN = 'https://zenodo.org/record/4117827/files/imdb_train.txt'
+URL_TEST = 'https://zenodo.org/record/4117827/files/imdb_test.txt'
+os.makedirs('./reviews', exist_ok=True)
+train_path = join('reviews', 'imdb_train.txt')
+test_path = join('reviews', 'imdb_test.txt')
+qp.util.download_file_if_not_exists(URL_TRAIN, train_path)
+qp.util.download_file_if_not_exists(URL_TEST, test_path)
+
+# these files contain two columns separated by a tab (\t):
+# the first one is a binary value (0=negative, 1=positive), and the second one is the text of the review.
+# All we need to do is implement a function returning the instances and the labels, as follows
+def my_data_loader(path):
+    with open(path, 'rt') as fin:
+        labels, texts = zip(*[line.split('\t', 1) for line in fin])  # maxsplit=1: tabs within the text do not break parsing
+        labels = list(map(int, labels))  # convert string numbers to int
+        return texts, labels
+
+# check that our function is working properly...
+train_texts, train_labels = my_data_loader(train_path)
+for i, (text, label) in enumerate(zip(train_texts, train_labels)):
+    print(f'#{i}: {label=}\t{text=}')
+    if i >= 5:
+        print('...')
+        break
+
+# We can now instantiate a LabelledCollection simply as
+train_lc = LabelledCollection(instances=train_texts, labels=train_labels)
+print('my training collection:', train_lc)
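+
+# note (illustrative): the __repr__ added in this patch prints a concise summary of the collection; for the
+# IMDb training set, the line above should print something along the lines of
+#   my training collection: <25000 instances (dtype=<class 'str'>), n_classes=2 [0 1], prevalence=[0.500, 0.500]>
+# (exact figures depend on the downloaded files); the new n_instances property exposes the size directly:
+print(f'the training collection contains {train_lc.n_instances} instances')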
+
+# Alternatively, we can instantiate a LabelledCollection directly from the data loader function,
+# without having to load the data ourselves:
+train_lc = LabelledCollection.load(train_path, loader_func=my_data_loader)
+print('my training collection:', train_lc)
+
+# We can do the same for the test set, or we can instead directly instantiate a Dataset object (which is,
+# by and large, simply a pair of training and test LabelledCollections) as follows:
+my_data = Dataset.load(train_path, test_path, loader_func=my_data_loader)
+print('my dataset:', my_data)
+
+# However, since this is a textual dataset, we must vectorize it prior to training any quantification algorithm.
+# We can do this in several ways in quapy. For example, manually...
+# from sklearn.feature_extraction.text import TfidfVectorizer
+# tfidf = TfidfVectorizer(min_df=5)
+# Xtr = tfidf.fit_transform(my_data.training.instances)
+# Xte = tfidf.transform(my_data.test.instances)
+# ... or using the preprocessing functionality of quapy (recommended):
+my_data_tfidf = qp.data.preprocessing.text2tfidf(my_data, min_df=5)
+
+training, test = my_data_tfidf.train_test
+
+# Once you have loaded your training and test data, you have access to a series of QuaPy's utilities, e.g.:
+print(f'the training prevalence is {F.strprev(training.prevalence())}')
+print(f'the test prevalence is {F.strprev(test.prevalence())}')
+print('let us generate a small balanced training sample:')
+desired_size = 200
+desired_prevalence = [0.5, 0.5]
+small_training_balanced = training.sampling(desired_size, *desired_prevalence, shuffle=True, random_state=0)
+print(small_training_balanced)
+print(f'or generating train/val splits such as: {training.split_stratified(train_prop=0.7)}')
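+
+# hyperparameter tuning can be carried out with quapy's model selection module; the following sketch assumes
+# that GridSearchQ accepts a protocol and an error metric with this interface, which may vary across QuaPy
+# versions (check the model selection documentation before using it):
+# train, val = training.split_stratified(train_prop=0.7)
+# tuned = qp.model_selection.GridSearchQ(
+#     model=PACC(),
+#     param_grid={'classifier__C': [0.1, 1.0, 10.0]},  # hyperparameters of PACC's underlying classifier
+#     protocol=ArtificialPrevalenceProtocol(data=val, sample_size=200, random_state=0),
+#     error='mae'
+# ).fit(*train.Xy)
+# print(f'best hyperparameters: {tuned.best_params_}')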
+
+# training
+print('let us train a simple quantifier...')
+Xtr, ytr = training.Xy
+quantifier = PACC()
+quantifier.fit(Xtr, ytr)  # or: quantifier.fit(*training.Xy)
+
+# test
+print("and use QuaPy's evaluation functions")
+evaluation_protocol = ArtificialPrevalenceProtocol(
+    data=test,
+    sample_size=200,
+    random_state=0
+)
+
+report = qp.evaluation.evaluation_report(quantifier, protocol=evaluation_protocol, error_metrics=['ae'])
+print(report)
+print(f'mean absolute error across {len(report)} experiments: {report.mean(numeric_only=True)}')
diff --git a/quapy/data/base.py b/quapy/data/base.py
index c22e895..b0fa779 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -95,6 +95,15 @@ class LabelledCollection:
         """
         return len(self.classes_)
 
+    @property
+    def n_instances(self):
+        """
+        The number of instances in the collection
+
+        :return: integer
+        """
+        return len(self.labels)
+
     @property
     def binary(self):
         """
@@ -423,6 +432,11 @@ class LabelledCollection:
         test = self.sampling_from_index(test_index)
         yield train, test
 
+    def __repr__(self):
+        descr = f'<{self.n_instances} instances (dtype={type(self.instances[0])}), '
+        descr += f'n_classes={self.n_classes} {self.classes_}, prevalence={F.strprev(self.prevalence())}>'
+        return descr
+
 
 class Dataset:
     """
@@ -576,4 +590,7 @@ class Dataset:
             *self.test.prevalence(),
             random_state = random_state
         )
-        return self
\ No newline at end of file
+        return self
+
+    def __repr__(self):
+        return f'training={self.training}; test={self.test}'
\ No newline at end of file