QuaPy/LeQua2022/data.py

import os.path
from typing import List, Tuple, Union

import pandas as pd

import quapy as qp
import numpy as np
import sklearn
import re
from glob import glob

import constants


# def load_binary_raw_document(path):
#     documents, labels = qp.data.from_text(path, verbose=0, class2int=True)
#     labels = np.asarray(labels)
#     labels[np.logical_or(labels == 1, labels == 2)] = 0
#     labels[np.logical_or(labels == 4, labels == 5)] = 1
#     return documents, labels


# def load_multiclass_raw_document(path):
#     return qp.data.from_text(path, verbose=0, class2int=False)

def load_category_map(path):
    """Read a category-to-code mapping from a whitespace-separated text file.

    Every non-blank line must contain exactly two whitespace-separated tokens:
    the category name followed by its integer code. Blank lines are skipped.

    :param path: path to the text file containing the mapping
    :return: a tuple ``(cat2code, code2cat)`` where ``cat2code`` is a dict mapping each category name
        to its integer code, and ``code2cat`` is the list of category names sorted by ascending code
    :raises ValueError: if a non-blank line does not contain exactly two tokens, or if the code
        is not a valid integer
    """
    cat2code = {}
    with open(path, 'rt') as fin:
        for line in fin:
            if not line.strip():
                # tolerate empty lines (e.g., trailing blank lines at the end of the file)
                continue
            category, code = line.split()
            cat2code[category] = int(code)
    # sorted() is stable, so categories sharing a code keep their order of appearance in the file
    code2cat = sorted(cat2code, key=cat2code.get)
    return cat2code, code2cat


def load_binary_vectors(path, nF=None):
    """Load one sample stored in svmlight / libsvm format.

    :param path: path to the file to load
    :param nF: number of features, forwarded as ``n_features`` to the loader (None by default)
    :return: the value returned by ``sklearn.datasets.load_svmlight_file``; callers in this module
        unpack it as a pair and keep only the first element
    """
    # Import the function explicitly: a bare `import sklearn` does not guarantee that the
    # `sklearn.datasets` submodule has been loaded, in which case the attribute access would fail.
    from sklearn.datasets import load_svmlight_file
    return load_svmlight_file(path, n_features=nF)


def __gen_load_samples_with_groudtruth(path_dir:str, return_id:bool, ground_truth_path:str, load_fn, **load_kwargs):
    """Generate the samples listed in a ground-truth file together with their true prevalence.

    The ground-truth file is loaded with ``ResultSubmission.load``; for every id it contains, the
    file ``<path_dir>/<id>.txt`` is read with ``load_fn``.

    :param path_dir: directory containing the sample files
    :param return_id: whether to include the sample id as the first element of each yielded tuple
    :param ground_truth_path: path to the file with the true prevalence values
    :param load_fn: callable invoked as ``load_fn(file_path, **load_kwargs)``; it must return a pair
        whose first element is the sample (the second element is discarded)
    :param load_kwargs: extra keyword arguments forwarded to ``load_fn``
    :return: yields ``(id, sample, prevalence)`` if ``return_id`` is True, ``(sample, prevalence)`` otherwise
    """
    ground_truth = ResultSubmission.load(ground_truth_path)
    for sample_id, true_prevalence in ground_truth.iterrows():
        sample_path = os.path.join(path_dir, f'{sample_id}.txt')
        sample, _ = load_fn(sample_path, **load_kwargs)
        yield (sample_id, sample, true_prevalence) if return_id else (sample, true_prevalence)


def __gen_load_samples_without_groudtruth(path_dir:str, return_id:bool, load_fn, **load_kwargs):
    """Generate the samples stored in a directory when no ground truth is available.

    The number of samples is the number of ``*.txt`` files found in ``path_dir``; the files actually
    read are ``0.txt``, ``1.txt``, ... up to that count (exclusive).

    :param path_dir: directory containing the sample files
    :param return_id: whether to yield ``(id, sample)`` pairs instead of bare samples
    :param load_fn: callable invoked as ``load_fn(file_path, **load_kwargs)``; it must return a pair
        whose first element is the sample (the second element is discarded)
    :param load_kwargs: extra keyword arguments forwarded to ``load_fn``
    :return: yields ``(id, sample)`` if ``return_id`` is True, ``sample`` otherwise
    """
    txt_pattern = os.path.join(path_dir, '*.txt')
    total = len(glob(txt_pattern))
    sample_id = 0
    while sample_id < total:
        sample_path = os.path.join(path_dir, f'{sample_id}.txt')
        sample, _ = load_fn(sample_path, **load_kwargs)
        yield (sample_id, sample) if return_id else sample
        sample_id += 1


def gen_load_samples_T1(path_dir:str, nF:int, ground_truth_path:str = None, return_id=True):
    """Generate the samples of task T1, read with ``load_binary_vectors``.

    :param path_dir: directory containing the sample files
    :param nF: number of features, forwarded to ``load_binary_vectors``
    :param ground_truth_path: path to the file with the true prevalence values; if None, the samples
        are generated without prevalence
    :param return_id: whether the sample id is included as the first element of each yielded item
    :return: yields ``(id, sample, prevalence)`` when a ground truth is given, ``(id, sample)``
        otherwise; the id is omitted when ``return_id`` is False
    """
    if ground_truth_path is not None:
        # samples are paired with their true prevalence
        samples = __gen_load_samples_with_groudtruth(
            path_dir, return_id, ground_truth_path, load_binary_vectors, nF=nF
        )
    else:
        # no ground truth available: only the samples are generated
        samples = __gen_load_samples_without_groudtruth(path_dir, return_id, load_binary_vectors, nF=nF)
    yield from samples


def gen_load_samples_T2A(path_dir:str, ground_truth_path:str = None):
    """Placeholder for the sample loader of task T2A (not implemented yet).

    Both arguments are currently ignored and the call simply returns None.

    :param path_dir: directory containing the sample files (unused for now)
    :param ground_truth_path: path to the file with the true prevalence values (unused for now)
    :return: None
    """
    # TODO: iterate over the samples and yield them
    return None


def gen_load_samples_T2B(path_dir:str, ground_truth_path:str = None):
    """Placeholder for the sample loader of task T2B (not implemented yet).

    Both arguments are currently ignored and the call simply returns None.

    :param path_dir: directory containing the sample files (unused for now)
    :param ground_truth_path: path to the file with the true prevalence values (unused for now)
    :return: None
    """
    # TODO: iterate over the samples and yield them
    return None


class ResultSubmission:
    """Table of class-prevalence vectors, one row per sample.

    The values are kept in a pandas DataFrame (``self.df``) whose index is named ``id`` and holds the
    sample ids, and which has one column per category. The same layout is used on disk (csv with
    header ``id,<cat_1>,...,<cat_n>``).
    """

    def __init__(self, categories: List[str]):
        """Create an empty table.

        :param categories: list with at least two category names
        :raises TypeError: if ``categories`` is not a list or has fewer than two elements
        """
        if not isinstance(categories, list) or len(categories) < 2:
            raise TypeError('wrong format for categories; a list with at least two category names (str) was expected')
        self.categories = categories
        self.df = pd.DataFrame(columns=list(categories))
        self.df.index.rename('id', inplace=True)

    def add(self, sample_id: int, prevalence_values: np.ndarray):
        """Append the prevalence vector of one sample.

        :param sample_id: integer id of the sample (python or numpy integer)
        :param prevalence_values: array with exactly one value per category; it is flattened into a
            single row, so shapes such as ``(n,)`` or ``(1, n)`` are both accepted
        :raises TypeError: if ``sample_id`` is not an integer or ``prevalence_values`` is not an ndarray
        :raises ValueError: if the id was already added, the number of values does not match the
            number of categories, some value lies outside [0,1], or the values do not sum up to one
            (within ``constants.ERROR_TOL``)
        """
        if not isinstance(sample_id, (int, np.integer)):
            raise TypeError(f'error: expected int for sample_sample, found {type(sample_id)}')
        if not isinstance(prevalence_values, np.ndarray):
            raise TypeError(f'error: expected np.ndarray for prevalence_values, found {type(prevalence_values)}')
        if sample_id in self.df.index:
            raise ValueError(f'error: prevalence values for "{sample_id}" already added')
        # the vector must carry exactly one value per category, whatever its shape
        if prevalence_values.size != len(self.categories):
            raise ValueError(f'error: wrong shape found for prevalence vector {prevalence_values}')
        if (prevalence_values < 0).any() or (prevalence_values > 1).any():
            raise ValueError(f'error: prevalence values out of range [0,1] for "{sample_id}"')
        if np.abs(prevalence_values.sum() - 1) > constants.ERROR_TOL:
            raise ValueError(f'error: prevalence values do not sum up to one for "{sample_id}"'
                             f'(error tolerance {constants.ERROR_TOL})')

        # reshape(1, -1) works for any number of categories; the index name is carried over so that
        # the resulting table keeps the "id" header expected by check_dataframe_format
        new_entry = pd.DataFrame(
            prevalence_values.reshape(1, -1),
            index=pd.Index([sample_id], name=self.df.index.name),
            columns=self.df.columns,
        )
        # DataFrame.append was removed in pandas 2.0; the first row replaces the empty frame so that
        # the (object-typed) empty table does not influence the dtype of the result
        self.df = new_entry if self.df.empty else pd.concat([self.df, new_entry])

    def __len__(self):
        """Return the number of samples (rows) in the table."""
        return len(self.df)

    @classmethod
    def load(cls, path: str) -> 'ResultSubmission':
        """Read and validate a result file.

        :param path: path to the csv file
        :return: a new instance holding the content of the file
        :raises ValueError: if the file does not pass ``check_file_format``
        """
        df = cls.check_file_format(path)
        r = cls(categories=df.columns.values.tolist())
        r.df = df
        return r

    def dump(self, path: str):
        """Validate the table and write it as csv.

        :param path: path of the output file
        :raises ValueError: if the table does not pass ``check_dataframe_format``
        """
        ResultSubmission.check_dataframe_format(self.df)
        self.df.to_csv(path)

    def prevalence(self, sample_name: Union[int, str]):
        """Return the prevalence vector stored for a sample.

        :param sample_name: id of the sample, as found in the index of the table
        :return: a flat ndarray with one value per category (in column order), or None if the id
            is not present
        """
        if sample_name not in self.df.index:
            return None
        # samples are keyed by the index ("id"); every column is a category
        return self.df.loc[sample_name].values.flatten()

    def iterrows(self):
        """Iterate over the table, yielding ``(id, prevalence)`` pairs with ``prevalence`` a flat ndarray."""
        for index, row in self.df.iterrows():
            prevalence = row.values.flatten()
            yield index, prevalence

    @classmethod
    def check_file_format(cls, path) -> pd.DataFrame:
        """Read a csv result file (first column taken as index) and validate its content.

        :param path: path to the csv file
        :return: the validated DataFrame
        :raises ValueError: if the content does not pass ``check_dataframe_format``
        """
        df = pd.read_csv(path, index_col=0)
        return cls.check_dataframe_format(df, path=path)

    @classmethod
    def check_dataframe_format(cls, df, path=None) -> pd.DataFrame:
        """Check that a DataFrame complies with the result-file format.

        The checks are: the index is named ``id``; there are at least two columns and they are named
        0,1,...,n-1; the number of rows equals ``constants.DEV_SAMPLES`` or ``constants.TEST_SAMPLES``;
        the ids are exactly 0,...,len(df)-1; every value lies in [0,1]; and every row sums up to one
        within ``constants.ERROR_TOL``.

        :param df: the DataFrame to check
        :param path: if given, it is mentioned in the error messages
        :return: the same DataFrame, if all checks pass
        :raises ValueError: as soon as one check fails
        """
        # if given, show the data path in the error message
        hint_path = '' if path is None else f' in {path}'

        if df.index.name != 'id' or len(df.columns) < 2:
            raise ValueError(f'wrong header{hint_path}, '
                             f'the format of the header should be "id,<cat_1>,...,<cat_n>"')
        try:
            column_ids = [int(ci) for ci in df.columns.values]
        except (TypeError, ValueError):
            # non-numeric column names are reported with the header error below
            column_ids = None
        if column_ids != list(range(len(df.columns))):
            raise ValueError(f'wrong header{hint_path}, category ids should be 0,1,2,...,n')

        if df.empty:
            raise ValueError(f'error{hint_path}: results file is empty')
        elif len(df) != constants.DEV_SAMPLES and len(df) != constants.TEST_SAMPLES:
            raise ValueError(f'wrong number of prevalence values found{hint_path}; '
                             f'expected {constants.DEV_SAMPLES} for development sets and '
                             f'{constants.TEST_SAMPLES} for test sets; found {len(df)}')

        ids = set(df.index.values)
        expected_ids = set(range(len(df)))
        missing = expected_ids - ids
        unexpected = ids - expected_ids
        if missing or unexpected:
            # report both kinds of problems at once, each with its own count
            problems = []
            if missing:
                problems.append(f'there are {len(missing)} missing ids{hint_path}: {sorted(missing)}')
            if unexpected:
                problems.append(f'there are {len(unexpected)} unexpected ids{hint_path}: {sorted(unexpected)}')
            raise ValueError('; '.join(problems))

        for category_name in df.columns:
            if (df[category_name] < 0).any() or (df[category_name] > 1).any():
                raise ValueError(f'error{hint_path} column "{category_name}" contains values out of range [0,1]')

        prevs = df.values
        round_errors = np.abs(prevs.sum(axis=-1) - 1.) > constants.ERROR_TOL
        if round_errors.any():
            # report the actual ids (index labels) rather than the positions of the offending rows
            raise ValueError(f'warning: prevalence values in rows with id {df.index[round_errors].tolist()} '
                             f'do not sum up to 1 (error tolerance {constants.ERROR_TOL}), '
                             f'probably due to some rounding errors.')

        return df

    def sort_categories(self):
        """Sort the category columns, and ``self.categories``, in ascending order (in place)."""
        # every column is a category (ids live in the index), so all of them take part in the sort
        self.df = self.df.reindex(sorted(self.df.columns), axis=1)
        self.categories = sorted(self.categories)

    def filenames(self):
        """Return the sample ids, i.e., the values of the index of the table.

        The method name is kept for backward compatibility.
        """
        return self.df.index.values


def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSubmission, sample_size=None, average=True):
    """Compute the absolute error (AE) and relative absolute error (RAE) of a set of predictions.

    Note that ``sort_categories`` is invoked on both arguments, so their columns are reordered in place.

    :param true_prevs: the true prevalence values
    :param predicted_prevs: the predicted prevalence values
    :param sample_size: sample size used to derive the smoothing factor ``eps=1/(2*sample_size)``
        passed to ``qp.error.rae``; if None, ``qp.environ['SAMPLE_SIZE']`` is used instead
    :param average: if True, the means of AE and RAE across samples are returned; otherwise, the
        arrays with the per-sample errors are returned
    :return: a tuple ``(ae, rae)``
    :raises ValueError: if no sample size is available, if both arguments differ in length or in
        categories, or if a sample of ``true_prevs`` has no prediction in ``predicted_prevs``
    """
    if sample_size is None:
        sample_size = qp.environ['SAMPLE_SIZE']
        if sample_size is None:
            raise ValueError('Relative Absolute Error cannot be computed: '
                             'neither sample_size nor qp.environ["SAMPLE_SIZE"] have been specified')

    if len(true_prevs) != len(predicted_prevs):
        raise ValueError(f'size mismatch, ground truth file has {len(true_prevs)} entries '
                         f'while the file of predictions contain {len(predicted_prevs)} entries')
    true_prevs.sort_categories()
    predicted_prevs.sort_categories()
    if true_prevs.categories != predicted_prevs.categories:
        raise ValueError(f'these result files are not comparable since the categories are different: '
                         f'true={true_prevs.categories} vs. predictions={predicted_prevs.categories}')

    eps = 1. / (2 * sample_size)  # smoothing factor for RAE, invariant across samples
    ae, rae = [], []
    for sample_name, true_prevalence in true_prevs.iterrows():
        pred_prevalence = predicted_prevs.prevalence(sample_name)
        if pred_prevalence is None:
            # fail with a clear message instead of passing None on to the error functions
            raise ValueError(f'the file of predictions contains no entry for sample "{sample_name}"')
        ae.append(qp.error.ae(true_prevalence, pred_prevalence))
        rae.append(qp.error.rae(true_prevalence, pred_prevalence, eps=eps))
    ae = np.asarray(ae)
    rae = np.asarray(rae)
    if average:
        return ae.mean(), rae.mean()
    return ae, rae
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00			`import os.path`
			`from typing import List, Tuple, Union`

			`import pandas as pd`

branch for LeQua2022 - first commit 2021-10-13 20:36:53 +02:00			`import quapy as qp`
			`import numpy as np`
setting baseline experiments with data format 2021-10-21 17:14:40 +02:00			`import sklearn`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00			`import re`
evaluation script and format checker added 2021-10-25 13:37:22 +02:00			`from glob import glob`

			`import constants`
branch for LeQua2022 - first commit 2021-10-13 20:36:53 +02:00

setting baseline experiments with data format 2021-10-21 17:14:40 +02:00			`# def load_binary_raw_document(path):`
			`# documents, labels = qp.data.from_text(path, verbose=0, class2int=True)`
			`# labels = np.asarray(labels)`
			`# labels[np.logical_or(labels == 1, labels == 2)] = 0`
			`# labels[np.logical_or(labels == 4, labels == 5)] = 1`
			`# return documents, labels`
branch for LeQua2022 - first commit 2021-10-13 20:36:53 +02:00

some sketches for lequa2022 file reading 2021-10-21 19:54:18 +02:00			`# def load_multiclass_raw_document(path):`
			`# return qp.data.from_text(path, verbose=0, class2int=False)`
branch for LeQua2022 - first commit 2021-10-13 20:36:53 +02:00
evaluation script and format checker added 2021-10-25 13:37:22 +02:00			`def load_category_map(path):`
			`cat2code = {}`
			`with open(path, 'rt') as fin:`
GridSearchQ adapted to work with generator functions and integrated for the baselines of LeQua2022; some tests with SVD 2021-10-26 18:41:10 +02:00			`for line in fin:`
			`category, code = line.split()`
			`cat2code[category] = int(code)`
			`code2cat = [cat for cat, code in sorted(cat2code.items(), key=lambda x:x[1])]`
			`return cat2code, code2cat`
evaluation script and format checker added 2021-10-25 13:37:22 +02:00
branch for LeQua2022 - first commit 2021-10-13 20:36:53 +02:00
setting baseline experiments with data format 2021-10-21 17:14:40 +02:00			`def load_binary_vectors(path, nF=None):`
			`return sklearn.datasets.load_svmlight_file(path, n_features=nF)`


adapting to the new format 2021-11-04 19:15:16 +01:00			`def __gen_load_samples_with_groudtruth(path_dir:str, return_id:bool, ground_truth_path:str, load_fn, **load_kwargs):`
evaluation script and format checker added 2021-10-25 13:37:22 +02:00			`true_prevs = ResultSubmission.load(ground_truth_path)`
adapting to the new format 2021-11-04 19:15:16 +01:00			`for id, prevalence in true_prevs.iterrows():`
			`sample, _ = load_fn(os.path.join(path_dir, f'{id}.txt'), **load_kwargs)`
			`if return_id:`
			`yield id, sample, prevalence`
GridSearchQ adapted to work with generator functions and integrated for the baselines of LeQua2022; some tests with SVD 2021-10-26 18:41:10 +02:00			`else:`
			`yield sample, prevalence`
evaluation script and format checker added 2021-10-25 13:37:22 +02:00

adapting to the new format 2021-11-04 19:15:16 +01:00			`def __gen_load_samples_without_groudtruth(path_dir:str, return_id:bool, load_fn, **load_kwargs):`
			`nsamples = len(glob(os.path.join(path_dir, '*.txt')))`
			`for id in range(nsamples):`
			`sample, _ = load_fn(os.path.join(path_dir, f'{id}.txt'), **load_kwargs)`
			`if return_id:`
			`yield id, sample`
GridSearchQ adapted to work with generator functions and integrated for the baselines of LeQua2022; some tests with SVD 2021-10-26 18:41:10 +02:00			`else:`
			`yield sample`
evaluation script and format checker added 2021-10-25 13:37:22 +02:00

adapting to the new format 2021-11-04 19:15:16 +01:00			`def gen_load_samples_T1(path_dir:str, nF:int, ground_truth_path:str = None, return_id=True):`
evaluation script and format checker added 2021-10-25 13:37:22 +02:00			`if ground_truth_path is None:`
GridSearchQ adapted to work with generator functions and integrated for the baselines of LeQua2022; some tests with SVD 2021-10-26 18:41:10 +02:00			`# the generator function returns tuples (filename:str, sample:csr_matrix)`
adapting to the new format 2021-11-04 19:15:16 +01:00			`gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, load_binary_vectors, nF=nF)`
evaluation script and format checker added 2021-10-25 13:37:22 +02:00			`else:`
GridSearchQ adapted to work with generator functions and integrated for the baselines of LeQua2022; some tests with SVD 2021-10-26 18:41:10 +02:00			`# the generator function returns tuples (filename:str, sample:csr_matrix, prevalence:ndarray)`
adapting to the new format 2021-11-04 19:15:16 +01:00			`gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, load_binary_vectors, nF=nF)`
GridSearchQ adapted to work with generator functions and integrated for the baselines of LeQua2022; some tests with SVD 2021-10-26 18:41:10 +02:00			`for r in gen_fn:`
			`yield r`
some sketches for lequa2022 file reading 2021-10-21 19:54:18 +02:00

			`def gen_load_samples_T2A(path_dir:str, ground_truth_path:str = None):`
			`# for ... : yield`
			`pass`


			`def gen_load_samples_T2B(path_dir:str, ground_truth_path:str = None):`
			`# for ... : yield`
			`pass`


			`class ResultSubmission:`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00
			`def __init__(self, categories: List[str]):`
			`if not isinstance(categories, list) or len(categories) < 2:`
			`raise TypeError('wrong format for categories; a list with at least two category names (str) was expected')`
			`self.categories = categories`
adapting to the new format 2021-11-04 19:15:16 +01:00			`self.df = pd.DataFrame(columns=list(categories))`
			`self.df.index.rename('id', inplace=True)`
some sketches for lequa2022 file reading 2021-10-21 19:54:18 +02:00
adapting to the new format 2021-11-04 19:15:16 +01:00			`def add(self, sample_id:int, prevalence_values:np.ndarray):`
			`if not isinstance(sample_id, int):`
			`raise TypeError(f'error: expected int for sample_sample, found {type(sample_id)}')`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00			`if not isinstance(prevalence_values, np.ndarray):`
			`raise TypeError(f'error: expected np.ndarray for prevalence_values, found {type(prevalence_values)}')`
adapting to the new format 2021-11-04 19:15:16 +01:00			`if sample_id in self.df.index.values:`
			`raise ValueError(f'error: prevalence values for "{sample_id}" already added')`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00			`if prevalence_values.ndim!=1 and prevalence_values.size != len(self.categories):`
			`raise ValueError(f'error: wrong shape found for prevalence vector {prevalence_values}')`
			`if (prevalence_values<0).any() or (prevalence_values>1).any():`
adapting to the new format 2021-11-04 19:15:16 +01:00			`raise ValueError(f'error: prevalence values out of range [0,1] for "{sample_id}"')`
evaluation script and format checker added 2021-10-25 13:37:22 +02:00			`if np.abs(prevalence_values.sum()-1) > constants.ERROR_TOL:`
adapting to the new format 2021-11-04 19:15:16 +01:00			`raise ValueError(f'error: prevalence values do not sum up to one for "{sample_id}"'`
evaluation script and format checker added 2021-10-25 13:37:22 +02:00			`f'(error tolerance {constants.ERROR_TOL})')`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00
adapting to the new format 2021-11-04 19:15:16 +01:00			`# new_entry = dict([('id', sample_id)] + [(col_i, prev_i) for col_i, prev_i in enumerate(prevalence_values)])`
			`new_entry = pd.DataFrame(prevalence_values.reshape(1,2), index=[sample_id], columns=self.df.columns)`
			`self.df = self.df.append(new_entry, ignore_index=False)`
some sketches for lequa2022 file reading 2021-10-21 19:54:18 +02:00
			`def __len__(self):`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00			`return len(self.df)`
some sketches for lequa2022 file reading 2021-10-21 19:54:18 +02:00
			`@classmethod`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00			`def load(cls, path: str) -> 'ResultSubmission':`
adapting to the new format 2021-11-04 19:15:16 +01:00			`df = ResultSubmission.check_file_format(path)`
			`r = ResultSubmission(categories=df.columns.values.tolist())`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00			`r.df = df`
			`return r`
some sketches for lequa2022 file reading 2021-10-21 19:54:18 +02:00
			`def dump(self, path:str):`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00			`ResultSubmission.check_dataframe_format(self.df)`
			`self.df.to_csv(path)`
some sketches for lequa2022 file reading 2021-10-21 19:54:18 +02:00
evaluation script and format checker added 2021-10-25 13:37:22 +02:00			`def prevalence(self, sample_name:str):`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00			`sel = self.df.loc[self.df['filename'] == sample_name]`
			`if sel.empty:`
			`return None`
			`else:`
			`return sel.loc[:,self.df.columns[1]:].values.flatten()`
some sketches for lequa2022 file reading 2021-10-21 19:54:18 +02:00
evaluation script and format checker added 2021-10-25 13:37:22 +02:00			`def iterrows(self):`
			`for index, row in self.df.iterrows():`
adapting to the new format 2021-11-04 19:15:16 +01:00			`# filename = row.filename`
			`prevalence = row.values.flatten()`
			`yield index, prevalence`
evaluation script and format checker added 2021-10-25 13:37:22 +02:00
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00			`@classmethod`
adapting to the new format 2021-11-04 19:15:16 +01:00			`def check_file_format(cls, path) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00			`df = pd.read_csv(path, index_col=0)`
adapting to the new format 2021-11-04 19:15:16 +01:00			`return ResultSubmission.check_dataframe_format(df, path=path)`
some sketches for lequa2022 file reading 2021-10-21 19:54:18 +02:00
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00			`@classmethod`
adapting to the new format 2021-11-04 19:15:16 +01:00			`def check_dataframe_format(cls, df, path=None) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]:`
evaluation script and format checker added 2021-10-25 13:37:22 +02:00			`hint_path = '' # if given, show the data path in the error message`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00			`if path is not None:`
			`hint_path = f' in {path}'`

adapting to the new format 2021-11-04 19:15:16 +01:00			`if df.index.name != 'id' or len(df.columns) < 2:`
			`raise ValueError(f'wrong header{hint_path}, '`
			`f'the format of the header should be "id,<cat_1>,...,<cat_n>"')`
			`if [int(ci) for ci in df.columns.values] != list(range(len(df.columns))):`
			`raise ValueError(f'wrong header{hint_path}, category ids should be 0,1,2,...,n')`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00
			`if df.empty:`
			`raise ValueError(f'error{hint_path}: results file is empty')`
adapting to the new format 2021-11-04 19:15:16 +01:00			`elif len(df) != constants.DEV_SAMPLES and len(df) != constants.TEST_SAMPLES:`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00			`raise ValueError(f'wrong number of prevalence values found{hint_path}; '`
evaluation script and format checker added 2021-10-25 13:37:22 +02:00			`f'expected {constants.DEV_SAMPLES} for development sets and '`
			`f'{constants.TEST_SAMPLES} for test sets; found {len(df)}')`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00
adapting to the new format 2021-11-04 19:15:16 +01:00			`ids = set(df.index.values)`
			`expected_ids = set(range(len(df)))`
			`if ids != expected_ids:`
			`missing = expected_ids - ids`
			`if missing:`
			`raise ValueError(f'there are {len(missing)} missing ids{hint_path}: {sorted(missing)}')`
			`unexpected = ids - expected_ids`
			`if unexpected:`
			`raise ValueError(f'there are {len(missing)} unexpected ids{hint_path}: {sorted(unexpected)}')`

			`for category_name in df.columns:`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00			`if (df[category_name] < 0).any() or (df[category_name] > 1).any():`
evaluation script and format checker added 2021-10-25 13:37:22 +02:00			`raise ValueError(f'error{hint_path} column "{category_name}" contains values out of range [0,1]')`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00
adapting to the new format 2021-11-04 19:15:16 +01:00			`prevs = df.values`
evaluation script and format checker added 2021-10-25 13:37:22 +02:00			`round_errors = np.abs(prevs.sum(axis=-1) - 1.) > constants.ERROR_TOL`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00			`if round_errors.any():`
			`raise ValueError(f'warning: prevalence values in rows with id {np.where(round_errors)[0].tolist()} '`
evaluation script and format checker added 2021-10-25 13:37:22 +02:00			`f'do not sum up to 1 (error tolerance {constants.ERROR_TOL}), '`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00			`f'probably due to some rounding errors.')`

adapting to the new format 2021-11-04 19:15:16 +01:00			`return df`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00
			`def sort_categories(self):`
			`self.df = self.df.reindex([self.df.columns[0]] + sorted(self.df.columns[1:]), axis=1)`
			`self.categories = sorted(self.categories)`

evaluation script and format checker added 2021-10-25 13:37:22 +02:00			`def filenames(self):`
			`return self.df.filename.values`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00

evaluation script and format checker added 2021-10-25 13:37:22 +02:00			`def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSubmission, sample_size=None, average=True):`
			`if sample_size is None:`
			`if qp.environ['SAMPLE_SIZE'] is None:`
			`raise ValueError('Relative Absolute Error cannot be computed: '`
			`'neither sample_size nor qp.environ["SAMPLE_SIZE"] have been specified')`
			`else:`
			`sample_size = qp.environ['SAMPLE_SIZE']`

result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00			`if len(true_prevs) != len(predicted_prevs):`
evaluation script and format checker added 2021-10-25 13:37:22 +02:00			`raise ValueError(f'size mismatch, ground truth file has {len(true_prevs)} entries '`
			`f'while the file of predictions contain {len(predicted_prevs)} entries')`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00			`true_prevs.sort_categories()`
			`predicted_prevs.sort_categories()`
			`if true_prevs.categories != predicted_prevs.categories:`
evaluation script and format checker added 2021-10-25 13:37:22 +02:00			`raise ValueError(f'these result files are not comparable since the categories are different: '`
			`f'true={true_prevs.categories} vs. predictions={predicted_prevs.categories}')`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00			`ae, rae = [], []`
evaluation script and format checker added 2021-10-25 13:37:22 +02:00			`for sample_name, true_prevalence in true_prevs.iterrows():`
			`pred_prevalence = predicted_prevs.prevalence(sample_name)`
			`ae.append(qp.error.ae(true_prevalence, pred_prevalence))`
			`rae.append(qp.error.rae(true_prevalence, pred_prevalence, eps=1./(2*sample_size)))`
result file format check, read, load, and evaluation with pandas 2021-10-22 19:03:15 +02:00			`ae = np.asarray(ae)`
			`rae = np.asarray(rae)`
			`if average:`
			`return ae.mean(), rae.mean()`
			`else:`
			`return ae, rae`




some sketches for lequa2022 file reading 2021-10-21 19:54:18 +02:00


setting baseline experiments with data format 2021-10-21 17:14:40 +02:00