diff --git a/LeQua2022/baselines_T1Amodsel.py b/LeQua2022/baselines_T1Amodsel.py index c312135..f291714 100644 --- a/LeQua2022/baselines_T1Amodsel.py +++ b/LeQua2022/baselines_T1Amodsel.py @@ -44,7 +44,7 @@ param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]} def gen_samples(): - return gen_load_samples_T1(T1A_devvectors_path, nF, ground_truth_path=T1A_devprevalence_path, return_filename=False) + return gen_load_samples_T1(T1A_devvectors_path, nF, ground_truth_path=T1A_devprevalence_path, return_id=False) for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]: diff --git a/LeQua2022/data.py b/LeQua2022/data.py index 5068f47..596a113 100644 --- a/LeQua2022/data.py +++ b/LeQua2022/data.py @@ -37,32 +37,33 @@ def load_binary_vectors(path, nF=None): return sklearn.datasets.load_svmlight_file(path, n_features=nF) -def __gen_load_samples_with_groudtruth(path_dir:str, return_filename:bool, ground_truth_path:str, load_fn, **load_kwargs): +def __gen_load_samples_with_groudtruth(path_dir:str, return_id:bool, ground_truth_path:str, load_fn, **load_kwargs): true_prevs = ResultSubmission.load(ground_truth_path) - for filename, prevalence in true_prevs.iterrows(): - sample, _ = load_fn(os.path.join(path_dir, filename), **load_kwargs) - if return_filename: - yield filename, sample, prevalence + for id, prevalence in true_prevs.iterrows(): + sample, _ = load_fn(os.path.join(path_dir, f'{id}.txt'), **load_kwargs) + if return_id: + yield id, sample, prevalence else: yield sample, prevalence -def __gen_load_samples_without_groudtruth(path_dir:str, return_filename:bool, load_fn, **load_kwargs): - for filepath in glob(os.path.join(path_dir, '*_sample_*.txt')): - sample, _ = load_fn(filepath, **load_kwargs) - if return_filename: - yield os.path.basename(filepath), sample +def __gen_load_samples_without_groudtruth(path_dir:str, return_id:bool, load_fn, **load_kwargs): + nsamples = len(glob(os.path.join(path_dir, '*.txt'))) + for id in range(nsamples): + sample, _ = load_fn(os.path.join(path_dir, f'{id}.txt'), **load_kwargs) + if return_id: + yield id, sample else: yield sample -def gen_load_samples_T1(path_dir:str, nF:int, ground_truth_path:str = None, return_filename=True): +def gen_load_samples_T1(path_dir:str, nF:int, ground_truth_path:str = None, return_id=True): if ground_truth_path is None: # the generator function returns tuples (filename:str, sample:csr_matrix) - gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_filename, load_binary_vectors, nF=nF) + gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, load_binary_vectors, nF=nF) else: # the generator function returns tuples (filename:str, sample:csr_matrix, prevalence:ndarray) - gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_filename, ground_truth_path, load_binary_vectors, nF=nF) + gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, load_binary_vectors, nF=nF) for r in gen_fn: yield r @@ -83,47 +84,35 @@ class ResultSubmission: if not isinstance(categories, list) or len(categories) < 2: raise TypeError('wrong format for categories; a list with at least two category names (str) was expected') self.categories = categories - self.df = pd.DataFrame(columns=['filename'] + list(categories)) - self.inferred_type = None + self.df = pd.DataFrame(columns=list(categories)) + self.df.index.rename('id', inplace=True) - def add(self, sample_name:str, prevalence_values:np.ndarray): - if not isinstance(sample_name, str): - raise TypeError(f'error: expected str for sample_sample, found {type(sample_name)}') + def add(self, sample_id:int, prevalence_values:np.ndarray): + if not isinstance(sample_id, int): + raise TypeError(f'error: expected int for sample_sample, found {type(sample_id)}') if not isinstance(prevalence_values, np.ndarray): raise TypeError(f'error: expected np.ndarray for prevalence_values, found {type(prevalence_values)}') - - if self.inferred_type is None: - if sample_name.startswith('test'): - self.inferred_type = 'test' - elif sample_name.startswith('dev'): - self.inferred_type = 'dev' - else: - if not sample_name.startswith(self.inferred_type): - raise ValueError(f'error: sample "{sample_name}" is not a valid entry for type "{self.inferred_type}"') - - if not re.match("(test|dev)_sample_\d+\.txt", sample_name): - raise ValueError(f'error: wrong format "{sample_name}"; right format is (test|dev)_sample_.txt') - if sample_name in self.df.filename.values: - raise ValueError(f'error: prevalence values for "{sample_name}" already added') + if sample_id in self.df.index.values: + raise ValueError(f'error: prevalence values for "{sample_id}" already added') if prevalence_values.ndim!=1 and prevalence_values.size != len(self.categories): raise ValueError(f'error: wrong shape found for prevalence vector {prevalence_values}') if (prevalence_values<0).any() or (prevalence_values>1).any(): - raise ValueError(f'error: prevalence values out of range [0,1] for "{sample_name}"') + raise ValueError(f'error: prevalence values out of range [0,1] for "{sample_id}"') if np.abs(prevalence_values.sum()-1) > constants.ERROR_TOL: - raise ValueError(f'error: prevalence values do not sum up to one for "{sample_name}"' + raise ValueError(f'error: prevalence values do not sum up to one for "{sample_id}"' f'(error tolerance {constants.ERROR_TOL})') - new_entry = dict([('filename',sample_name)]+[(col_i,prev_i) for col_i, prev_i in zip(self.categories, prevalence_values)]) - self.df = self.df.append(new_entry, ignore_index=True) + # new_entry = dict([('id', sample_id)] + [(col_i, prev_i) for col_i, prev_i in enumerate(prevalence_values)]) + new_entry = pd.DataFrame(prevalence_values.reshape(1,2), index=[sample_id], columns=self.df.columns) + self.df = self.df.append(new_entry, ignore_index=False) def __len__(self): return len(self.df) @classmethod def load(cls, path: str) -> 'ResultSubmission': - df, inferred_type = ResultSubmission.check_file_format(path, return_inferred_type=True) - r = ResultSubmission(categories=df.columns.values[1:].tolist()) - r.inferred_type = inferred_type + df = ResultSubmission.check_file_format(path) + r = ResultSubmission(categories=df.columns.values.tolist()) r.df = df return r @@ -140,59 +129,56 @@ class ResultSubmission: def iterrows(self): for index, row in self.df.iterrows(): - filename = row.filename - prevalence = row[self.df.columns[1]:].values.flatten() - yield filename, prevalence + # filename = row.filename + prevalence = row.values.flatten() + yield index, prevalence @classmethod - def check_file_format(cls, path, return_inferred_type=False) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]: + def check_file_format(cls, path) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]: df = pd.read_csv(path, index_col=0) - return ResultSubmission.check_dataframe_format(df, path=path, return_inferred_type=return_inferred_type) + return ResultSubmission.check_dataframe_format(df, path=path) @classmethod - def check_dataframe_format(cls, df, path=None, return_inferred_type=False) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]: + def check_dataframe_format(cls, df, path=None) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]: hint_path = '' # if given, show the data path in the error message if path is not None: hint_path = f' in {path}' - if 'filename' not in df.columns or len(df.columns) < 3: - raise ValueError(f'wrong header{hint_path}, the format of the header should be ",filename,,...,"') + if df.index.name != 'id' or len(df.columns) < 2: + raise ValueError(f'wrong header{hint_path}, ' + f'the format of the header should be "id,,...,"') + if [int(ci) for ci in df.columns.values] != list(range(len(df.columns))): + raise ValueError(f'wrong header{hint_path}, category ids should be 0,1,2,...,n') if df.empty: raise ValueError(f'error{hint_path}: results file is empty') - elif len(df) == constants.DEV_SAMPLES: - inferred_type = 'dev' - expected_len = constants.DEV_SAMPLES - elif len(df) == constants.TEST_SAMPLES: - inferred_type = 'test' - expected_len = constants.TEST_SAMPLES - else: + elif len(df) != constants.DEV_SAMPLES and len(df) != constants.TEST_SAMPLES: raise ValueError(f'wrong number of prevalence values found{hint_path}; ' f'expected {constants.DEV_SAMPLES} for development sets and ' f'{constants.TEST_SAMPLES} for test sets; found {len(df)}') - set_names = frozenset(df.filename) - for i in range(expected_len): - if f'{inferred_type}_sample_{i}.txt' not in set_names: - raise ValueError(f'error{hint_path} a file with {len(df)} entries is assumed to be of type ' - f'"{inferred_type}" but entry {inferred_type}_sample_{i}.txt is missing ' - f'(among perhaps many others)') + ids = set(df.index.values) + expected_ids = set(range(len(df))) + if ids != expected_ids: + missing = expected_ids - ids + if missing: + raise ValueError(f'there are {len(missing)} missing ids{hint_path}: {sorted(missing)}') + unexpected = ids - expected_ids + if unexpected: + raise ValueError(f'there are {len(missing)} unexpected ids{hint_path}: {sorted(unexpected)}') - for category_name in df.columns[1:]: + for category_name in df.columns: if (df[category_name] < 0).any() or (df[category_name] > 1).any(): raise ValueError(f'error{hint_path} column "{category_name}" contains values out of range [0,1]') - prevs = df.loc[:, df.columns[1]:].values + prevs = df.values round_errors = np.abs(prevs.sum(axis=-1) - 1.) > constants.ERROR_TOL if round_errors.any(): raise ValueError(f'warning: prevalence values in rows with id {np.where(round_errors)[0].tolist()} ' f'do not sum up to 1 (error tolerance {constants.ERROR_TOL}), ' f'probably due to some rounding errors.') - if return_inferred_type: - return df, inferred_type - else: - return df + return df def sort_categories(self): self.df = self.df.reindex([self.df.columns[0]] + sorted(self.df.columns[1:]), axis=1) diff --git a/LeQua2022/format_checker.py b/LeQua2022/format_checker.py index 25f3e45..2bf5cd9 100644 --- a/LeQua2022/format_checker.py +++ b/LeQua2022/format_checker.py @@ -20,7 +20,7 @@ def main(args): if __name__=='__main__': parser = argparse.ArgumentParser(description='LeQua2022 official format-checker script') - parser.add_argument('prevalence_file', metavar='PREV-PATH', type=str, + parser.add_argument('prevalence_file', metavar='PREVALENCEFILE-PATH', type=str, help='Path of the file containing prevalence values to check') args = parser.parse_args() diff --git a/LeQua2022/predict.py b/LeQua2022/predict.py index c6285cb..31d1fad 100644 --- a/LeQua2022/predict.py +++ b/LeQua2022/predict.py @@ -28,10 +28,10 @@ def main(args): model = pickle.load(open(args.model, 'rb')) # predictions - predictions = ResultSubmission(categories=categories) - for samplename, sample in tqdm(gen_load_samples_T1(args.samples, args.nf), + predictions = ResultSubmission(categories=list(range(len(categories)))) + for sampleid, sample in tqdm(gen_load_samples_T1(args.samples, args.nf), desc='predicting', total=nsamples): - predictions.add(samplename, model.quantify(sample)) + predictions.add(sampleid, model.quantify(sample)) # saving basedir = os.path.basename(args.output)