QuaPy/quapy/data/_ifcb.py

import os
import pandas as pd
import math

from quapy.data import LabelledCollection
from quapy.protocol import AbstractProtocol
from pathlib import Path


def get_sample_list(path_dir):
    """List the csv files found directly inside a directory.

    Args:
        path_dir (str): directory to look for samples

    Returns:
        list: names (not full paths) of the entries ending in '.csv',
            in sorted order
    """
    # sorting makes the result independent of the order os.listdir returns
    return [entry for entry in sorted(os.listdir(path_dir)) if entry.endswith('.csv')]


def generate_modelselection_split(samples, split=0.3):
    """Generate a deterministic train/test split for model selection.

    No random numbers are used, so the same input always yields the same
    split: test items are picked at evenly spaced positions, starting from
    the first element.

    Args:
        samples (list): list of samples
        split (float, optional): fraction of the samples saved for test.
            Defaults to 0.3. Values producing no test items (empty
            `samples`, `split` <= 0) give an empty test list; values
            above 1 put every sample in test.

    Returns:
        tuple: list of samples to use as train and list of samples to use as test
    """
    num_samples = len(samples)
    # never request more test items than there are samples; otherwise the
    # step below would be 0 and the first sample would be picked repeatedly
    num_items_to_pick = min(math.ceil(num_samples * split), num_samples)
    if num_items_to_pick <= 0:
        # nothing to hold out (also avoids a division by zero below)
        return list(samples), []

    step_size = num_samples // num_items_to_pick
    test_positions = [i * step_size for i in range(num_items_to_pick)]
    # set membership keeps the train filter linear in len(samples)
    held_out = set(test_positions)
    test = [samples[i] for i in test_positions]
    train = [item for i, item in enumerate(samples) if i not in held_out]
    return train, test


class IFCBTrainSamplesFromDir(AbstractProtocol):
    """Protocol that yields one labelled collection per csv file of a directory.

    :param path_dir: directory containing the sample csv files
    :param classes: class list passed to every generated `LabelledCollection`
    :param samples: file names (relative to `path_dir`) to read; when None,
        every csv file found in `path_dir` is used
    """

    def __init__(self, path_dir: str, classes: list, samples: list = None):
        self.path_dir = path_dir
        self.classes = classes
        self.samples = get_sample_list(path_dir) if samples is None else samples

    def __call__(self):
        """
        Reads the sample files one at a time and yields them.

        :return: a generator of `LabelledCollection` instances, one per sample file
        """
        for filename in self.samples:
            frame = pd.read_csv(os.path.join(self.path_dir, filename))
            # the first column holds the class; the remaining ones are the covariates
            labels = frame.iloc[:, 0].to_numpy()
            covariates = frame.iloc[:, 1:].to_numpy()
            yield LabelledCollection(covariates, labels, classes=self.classes)

    def total(self):
        """
        Returns the total number of samples that the protocol generates.

        :return: The number of sample files this protocol iterates over.
        """
        return len(self.samples)


class IFCBTestSamples(AbstractProtocol):
    """Protocol that yields (instances, prevalences) pairs, one per csv file.

    :param path_dir: directory containing the sample csv files
    :param test_prevalences: dataframe with a 'sample' column matched against
        the file name without extension; when None, prevalences are computed
        from the labels stored in the first column of each file
    :param samples: file names (relative to `path_dir`) to read; when None,
        every csv file found in `path_dir` is used
    :param classes: class list used to compute the prevalences when
        `test_prevalences` is None
    """

    def __init__(self, path_dir: str, test_prevalences: pd.DataFrame, samples: list = None, classes: list = None):
        self.path_dir = path_dir
        self.test_prevalences = test_prevalences
        self.classes = classes
        self.samples = get_sample_list(path_dir) if samples is None else samples

    def __call__(self):
        """
        Reads the sample files one at a time and yields them with their prevalences.

        :return: a generator of tuples `(X, prevalences)`; `X` is the whole
            dataframe when `test_prevalences` was given, and a numpy array of
            all columns but the first otherwise
        """
        for filename in self.samples:
            frame = pd.read_csv(os.path.join(self.path_dir, filename))
            if self.test_prevalences is None:
                # Labelled file: the first column is the class, so the sample
                # prevalences are computed from the labels
                X = frame.iloc[:, 1:].to_numpy()
                labels = frame.iloc[:, 0]
                known = labels[labels.isin(self.classes)]
                counts = known.value_counts().reindex(self.classes, fill_value=0)
                # the denominator is the full sample size, not only the rows
                # whose label belongs to self.classes
                prevalences = counts.to_numpy() / len(frame)
            else:
                # Unlabelled test file: the prevalences come from the dataframe,
                # looked up by file name without extension
                X = frame
                sample_id = Path(filename).stem
                row_mask = self.test_prevalences['sample'] == sample_id
                matched = self.test_prevalences.loc[row_mask].to_numpy()
                # drop the first column (presumably the sample id - TODO confirm)
                prevalences = matched[:, 1:].flatten().astype(float)
            yield X, prevalences

    def total(self):
        """
        Returns the total number of samples that the protocol generates.

        :return: The number of sample files this protocol iterates over.
        """
        return len(self.samples)
added dataset IFCB plankton 2023-11-08 11:07:47 +01:00			`import os`
			`import pandas as pd`
merged 2024-02-07 18:45:42 +01:00			`import math`
fixing ifcb and documenting 2024-02-12 12:39:18 +01:00
			`from quapy.data import LabelledCollection`
merged 2024-02-07 18:45:42 +01:00			`from quapy.protocol import AbstractProtocol`
			`from pathlib import Path`

testing IFCB dataset 2024-02-08 14:33:22 +01:00
merged 2024-02-07 18:45:42 +01:00			`def get_sample_list(path_dir):`
			`"""Gets a sample list finding the csv files in a directory`

			`Args:`
			`path_dir (_type_): directory to look for samples`

			`Returns:`
			`_type_: list of samples`
			`"""`
			`samples = []`
			`for filename in sorted(os.listdir(path_dir)):`
			`if filename.endswith('.csv'):`
			`samples.append(filename)`
			`return samples`

testing IFCB dataset 2024-02-08 14:33:22 +01:00
merged 2024-02-07 18:45:42 +01:00			`def generate_modelselection_split(samples, split=0.3):`
			`"""This function generates a train/test split for model selection`
			`without the use of random numbers so the split is always the same`

			`Args:`
			`samples (_type_): list of samples`
			`split (float, optional): percentage saved for test. Defaults to 0.3.`

			`Returns:`
			`_type_: list of samples to use as train and list of samples to use as test`
			`"""`
			`num_items_to_pick = math.ceil(len(samples) * split)`
			`step_size = math.floor(len(samples) / num_items_to_pick)`
			`test_indices = [i * step_size for i in range(num_items_to_pick)]`
			`test = [samples[i] for i in test_indices]`
			`train = [item for i, item in enumerate(samples) if i not in test_indices]`
			`return train, test`

testing IFCB dataset 2024-02-08 14:33:22 +01:00
merged 2024-02-07 18:45:42 +01:00			`class IFCBTrainSamplesFromDir(AbstractProtocol):`

			`def __init__(self, path_dir:str, classes: list, samples: list = None):`
			`self.path_dir = path_dir`
			`self.classes = classes`
			`self.samples = []`
			`if samples is not None:`
			`self.samples = samples`
			`else:`
			`self.samples = get_sample_list(path_dir)`
added dataset IFCB plankton 2023-11-08 11:07:47 +01:00
			`def __call__(self):`
			`for sample in self.samples:`
			`s = pd.read_csv(os.path.join(self.path_dir,sample))`
			`# all columns but the first where we get the class`
			`X = s.iloc[:, 1:].to_numpy()`
			`y = s.iloc[:, 0].to_numpy()`
fixing ifcb and documenting 2024-02-12 12:39:18 +01:00			`yield LabelledCollection(X, y, classes=self.classes)`
added dataset IFCB plankton 2023-11-08 11:07:47 +01:00
implementing the 'total' function of IFCB protocols 2023-11-08 11:31:33 +01:00			`def total(self):`
			`"""`
			`Returns the total number of samples that the protocol generates.`

			`:return: The number of training samples to generate.`
			`"""`
			`return len(self.samples)`

testing IFCB dataset 2024-02-08 14:33:22 +01:00
merged 2024-02-07 18:45:42 +01:00			`class IFCBTestSamples(AbstractProtocol):`

			`def __init__(self, path_dir:str, test_prevalences: pd.DataFrame, samples: list = None, classes: list=None):`
			`self.path_dir = path_dir`
			`self.test_prevalences = test_prevalences`
			`self.classes = classes`
			`if samples is not None:`
			`self.samples = samples`
			`else:`
			`self.samples = get_sample_list(path_dir)`

			`def __call__(self):`
			`for test_sample in self.samples:`
			`s = pd.read_csv(os.path.join(self.path_dir,test_sample))`
			`if self.test_prevalences is not None:`
			`X = s`
			`# If we are working with the test samples, we have a dataframe with the prevalences and no labels for the test`
			`prevalences = self.test_prevalences.loc[self.test_prevalences['sample']==Path(test_sample).stem].to_numpy()[:,1:].flatten().astype(float)`
			`else:`
			`X = s.iloc[:, 1:].to_numpy()`
			`y = s.iloc[:,0]`
			`# In this case we compute the sample prevalences from the labels`
			`prevalences = y[y.isin(self.classes)].value_counts().reindex(self.classes, fill_value=0).to_numpy()/len(s)`
implementing the 'total' function of IFCB protocols 2023-11-08 11:31:33 +01:00			`yield X, prevalences`

			`def total(self):`
			`"""`
			`Returns the total number of samples that the protocol generates.`

merged 2024-02-07 18:45:42 +01:00			`:return: The number of training samples to generate.`
			`"""`
			`return len(self.samples)`