QuaPy/quapy/data/_lequa2022.py

from typing import Tuple, Union
import pandas as pd
import numpy as np
import os

from quapy.protocol import AbstractProtocol

DEV_SAMPLES = 1000
TEST_SAMPLES = 5000

ERROR_TOL = 1E-3


def load_category_map(path):
    cat2code = {}
    with open(path, 'rt') as fin:
        for line in fin:
            category, code = line.split()
            cat2code[category] = int(code)
    code2cat = [cat for cat, code in sorted(cat2code.items(), key=lambda x: x[1])]
    return cat2code, code2cat


def load_raw_documents(path):
    df = pd.read_csv(path)
    documents = list(df["text"].values)
    labels = None
    if "label" in df.columns:
        labels = df["label"].values.astype(int)
    return documents, labels


def load_vector_documents(path):
    D = pd.read_csv(path).to_numpy(dtype=float)
    labelled = D.shape[1] == 301
    if labelled:
        X, y = D[:, 1:], D[:, 0].astype(int).flatten()
    else:
        X, y = D, None
    return X, y


class SamplesFromDir(AbstractProtocol):

    def __init__(self, path_dir:str, ground_truth_path:str, load_fn):
        self.path_dir = path_dir
        self.load_fn = load_fn
        self.true_prevs = ResultSubmission.load(ground_truth_path)

    def __call__(self):
        for id, prevalence in self.true_prevs.iterrows():
            sample, _ = self.load_fn(os.path.join(self.path_dir, f'{id}.txt'))
            yield sample, prevalence


class ResultSubmission:

    def __init__(self):
        self.df = None

    def __init_df(self, categories: int):
        if not isinstance(categories, int) or categories < 2:
            raise TypeError('wrong format for categories: an int (>=2) was expected')
        df = pd.DataFrame(columns=list(range(categories)))
        df.index.set_names('id', inplace=True)
        self.df = df

    @property
    def n_categories(self):
        return len(self.df.columns.values)

    def add(self, sample_id: int, prevalence_values: np.ndarray):
        if not isinstance(sample_id, int):
            raise TypeError(f'error: expected int for sample_sample, found {type(sample_id)}')
        if not isinstance(prevalence_values, np.ndarray):
            raise TypeError(f'error: expected np.ndarray for prevalence_values, found {type(prevalence_values)}')
        if self.df is None:
            self.__init_df(categories=len(prevalence_values))
        if sample_id in self.df.index.values:
            raise ValueError(f'error: prevalence values for "{sample_id}" already added')
        if prevalence_values.ndim != 1 and prevalence_values.size != self.n_categories:
            raise ValueError(f'error: wrong shape found for prevalence vector {prevalence_values}')
        if (prevalence_values < 0).any() or (prevalence_values > 1).any():
            raise ValueError(f'error: prevalence values out of range [0,1] for "{sample_id}"')
        if np.abs(prevalence_values.sum() - 1) > ERROR_TOL:
            raise ValueError(f'error: prevalence values do not sum up to one for "{sample_id}"'
                             f'(error tolerance {ERROR_TOL})')

        self.df.loc[sample_id] = prevalence_values

    def __len__(self):
        return len(self.df)

    @classmethod
    def load(cls, path: str) -> 'ResultSubmission':
        df = ResultSubmission.check_file_format(path)
        r = ResultSubmission()
        r.df = df
        return r

    def dump(self, path: str):
        ResultSubmission.check_dataframe_format(self.df)
        self.df.to_csv(path)

    def prevalence(self, sample_id: int):
        sel = self.df.loc[sample_id]
        if sel.empty:
            return None
        else:
            return sel.values.flatten()

    def iterrows(self):
        for index, row in self.df.iterrows():
            prevalence = row.values.flatten()
            yield index, prevalence

    @classmethod
    def check_file_format(cls, path) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:
        try:
            df = pd.read_csv(path, index_col=0)
        except Exception as e:
            print(f'the file {path} does not seem to be a valid csv file. ')
            print(e)
        return ResultSubmission.check_dataframe_format(df, path=path)

    @classmethod
    def check_dataframe_format(cls, df, path=None) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:
        hint_path = ''  # if given, show the data path in the error message
        if path is not None:
            hint_path = f' in {path}'

        if df.index.name != 'id' or len(df.columns) < 2:
            raise ValueError(f'wrong header{hint_path}, '
                             f'the format of the header should be "id,0,...,n-1", '
                             f'where n is the number of categories')
        if [int(ci) for ci in df.columns.values] != list(range(len(df.columns))):
            raise ValueError(f'wrong header{hint_path}, category ids should be 0,1,2,...,n-1, '
                             f'where n is the number of categories')
        if df.empty:
            raise ValueError(f'error{hint_path}: results file is empty')
        elif len(df) != DEV_SAMPLES and len(df) != TEST_SAMPLES:
            raise ValueError(f'wrong number of prevalence values found{hint_path}; '
                             f'expected {DEV_SAMPLES} for development sets and '
                             f'{TEST_SAMPLES} for test sets; found {len(df)}')

        ids = set(df.index.values)
        expected_ids = set(range(len(df)))
        if ids != expected_ids:
            missing = expected_ids - ids
            if missing:
                raise ValueError(f'there are {len(missing)} missing ids{hint_path}: {sorted(missing)}')
            unexpected = ids - expected_ids
            if unexpected:
                raise ValueError(f'there are {len(missing)} unexpected ids{hint_path}: {sorted(unexpected)}')

        for category_id in df.columns:
            if (df[category_id] < 0).any() or (df[category_id] > 1).any():
                raise ValueError(f'error{hint_path} column "{category_id}" contains values out of range [0,1]')

        prevs = df.values
        round_errors = np.abs(prevs.sum(axis=-1) - 1.) > ERROR_TOL
        if round_errors.any():
            raise ValueError(f'warning: prevalence values in rows with id {np.where(round_errors)[0].tolist()} '
                             f'do not sum up to 1 (error tolerance {ERROR_TOL}), '
                             f'probably due to some rounding errors.')

        return df
data reader for lequa 2022 competition 2022-11-28 12:02:08 +01:00			`from typing import Tuple, Union`
			`import pandas as pd`
			`import numpy as np`
			`import os`

			`from quapy.protocol import AbstractProtocol`

			`DEV_SAMPLES = 1000`
			`TEST_SAMPLES = 5000`

			`ERROR_TOL = 1E-3`


			`def load_category_map(path):`
			`cat2code = {}`
			`with open(path, 'rt') as fin:`
			`for line in fin:`
			`category, code = line.split()`
			`cat2code[category] = int(code)`
			`code2cat = [cat for cat, code in sorted(cat2code.items(), key=lambda x: x[1])]`
			`return cat2code, code2cat`


			`def load_raw_documents(path):`
			`df = pd.read_csv(path)`
			`documents = list(df["text"].values)`
			`labels = None`
			`if "label" in df.columns:`
some bugfixes, unittest and minor changes 2023-01-16 13:51:29 +01:00			`labels = df["label"].values.astype(int)`
data reader for lequa 2022 competition 2022-11-28 12:02:08 +01:00			`return documents, labels`


			`def load_vector_documents(path):`
some bugfixes, unittest and minor changes 2023-01-16 13:51:29 +01:00			`D = pd.read_csv(path).to_numpy(dtype=float)`
data reader for lequa 2022 competition 2022-11-28 12:02:08 +01:00			`labelled = D.shape[1] == 301`
			`if labelled:`
some bugfixes, unittest and minor changes 2023-01-16 13:51:29 +01:00			`X, y = D[:, 1:], D[:, 0].astype(int).flatten()`
data reader for lequa 2022 competition 2022-11-28 12:02:08 +01:00			`else:`
			`X, y = D, None`
			`return X, y`


			`class SamplesFromDir(AbstractProtocol):`

			`def __init__(self, path_dir:str, ground_truth_path:str, load_fn):`
			`self.path_dir = path_dir`
			`self.load_fn = load_fn`
			`self.true_prevs = ResultSubmission.load(ground_truth_path)`

			`def __call__(self):`
			`for id, prevalence in self.true_prevs.iterrows():`
			`sample, _ = self.load_fn(os.path.join(self.path_dir, f'{id}.txt'))`
			`yield sample, prevalence`


			`class ResultSubmission:`

			`def __init__(self):`
			`self.df = None`

			`def __init_df(self, categories: int):`
			`if not isinstance(categories, int) or categories < 2:`
			`raise TypeError('wrong format for categories: an int (>=2) was expected')`
			`df = pd.DataFrame(columns=list(range(categories)))`
			`df.index.set_names('id', inplace=True)`
			`self.df = df`

			`@property`
			`def n_categories(self):`
			`return len(self.df.columns.values)`

			`def add(self, sample_id: int, prevalence_values: np.ndarray):`
			`if not isinstance(sample_id, int):`
			`raise TypeError(f'error: expected int for sample_sample, found {type(sample_id)}')`
			`if not isinstance(prevalence_values, np.ndarray):`
			`raise TypeError(f'error: expected np.ndarray for prevalence_values, found {type(prevalence_values)}')`
			`if self.df is None:`
			`self.__init_df(categories=len(prevalence_values))`
			`if sample_id in self.df.index.values:`
			`raise ValueError(f'error: prevalence values for "{sample_id}" already added')`
			`if prevalence_values.ndim != 1 and prevalence_values.size != self.n_categories:`
			`raise ValueError(f'error: wrong shape found for prevalence vector {prevalence_values}')`
			`if (prevalence_values < 0).any() or (prevalence_values > 1).any():`
			`raise ValueError(f'error: prevalence values out of range [0,1] for "{sample_id}"')`
			`if np.abs(prevalence_values.sum() - 1) > ERROR_TOL:`
			`raise ValueError(f'error: prevalence values do not sum up to one for "{sample_id}"'`
			`f'(error tolerance {ERROR_TOL})')`

			`self.df.loc[sample_id] = prevalence_values`

			`def __len__(self):`
			`return len(self.df)`

			`@classmethod`
			`def load(cls, path: str) -> 'ResultSubmission':`
			`df = ResultSubmission.check_file_format(path)`
			`r = ResultSubmission()`
			`r.df = df`
			`return r`

			`def dump(self, path: str):`
			`ResultSubmission.check_dataframe_format(self.df)`
			`self.df.to_csv(path)`

			`def prevalence(self, sample_id: int):`
			`sel = self.df.loc[sample_id]`
			`if sel.empty:`
			`return None`
			`else:`
			`return sel.values.flatten()`

			`def iterrows(self):`
			`for index, row in self.df.iterrows():`
			`prevalence = row.values.flatten()`
			`yield index, prevalence`

			`@classmethod`
			`def check_file_format(cls, path) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:`
			`try:`
			`df = pd.read_csv(path, index_col=0)`
			`except Exception as e:`
			`print(f'the file {path} does not seem to be a valid csv file. ')`
			`print(e)`
			`return ResultSubmission.check_dataframe_format(df, path=path)`

			`@classmethod`
			`def check_dataframe_format(cls, df, path=None) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:`
			`hint_path = '' # if given, show the data path in the error message`
			`if path is not None:`
			`hint_path = f' in {path}'`

			`if df.index.name != 'id' or len(df.columns) < 2:`
			`raise ValueError(f'wrong header{hint_path}, '`
			`f'the format of the header should be "id,0,...,n-1", '`
			`f'where n is the number of categories')`
			`if [int(ci) for ci in df.columns.values] != list(range(len(df.columns))):`
			`raise ValueError(f'wrong header{hint_path}, category ids should be 0,1,2,...,n-1, '`
			`f'where n is the number of categories')`
			`if df.empty:`
			`raise ValueError(f'error{hint_path}: results file is empty')`
			`elif len(df) != DEV_SAMPLES and len(df) != TEST_SAMPLES:`
			`raise ValueError(f'wrong number of prevalence values found{hint_path}; '`
			`f'expected {DEV_SAMPLES} for development sets and '`
			`f'{TEST_SAMPLES} for test sets; found {len(df)}')`

			`ids = set(df.index.values)`
			`expected_ids = set(range(len(df)))`
			`if ids != expected_ids:`
			`missing = expected_ids - ids`
			`if missing:`
			`raise ValueError(f'there are {len(missing)} missing ids{hint_path}: {sorted(missing)}')`
			`unexpected = ids - expected_ids`
			`if unexpected:`
			`raise ValueError(f'there are {len(missing)} unexpected ids{hint_path}: {sorted(unexpected)}')`

			`for category_id in df.columns:`
			`if (df[category_id] < 0).any() or (df[category_id] > 1).any():`
			`raise ValueError(f'error{hint_path} column "{category_id}" contains values out of range [0,1]')`

			`prevs = df.values`
			`round_errors = np.abs(prevs.sum(axis=-1) - 1.) > ERROR_TOL`
			`if round_errors.any():`
			`raise ValueError(f'warning: prevalence values in rows with id {np.where(round_errors)[0].tolist()} '`
			`f'do not sum up to 1 (error tolerance {ERROR_TOL}), '`
			`f'probably due to some rounding errors.')`

			`return df`