diff --git a/LeQua2022/TODO.txt b/LeQua2022/TODO.txt index 61c56cd..e51cf0d 100644 --- a/LeQua2022/TODO.txt +++ b/LeQua2022/TODO.txt @@ -4,4 +4,5 @@ 4. model selection 5. plots 6. estoy leyendo los samples en orden, y no hace falta. Sería mejor una función genérica que lee todos los ejemplos y - que de todos modos genera un output con el mismo nombre del file \ No newline at end of file + que de todos modos genera un output con el mismo nombre del file +7. Make ResultSubmission class abstract, and create 4 instances thus forcing the field task_name to be set correctly \ No newline at end of file diff --git a/LeQua2022/data.py b/LeQua2022/data.py index f4be5a6..9a133c4 100644 --- a/LeQua2022/data.py +++ b/LeQua2022/data.py @@ -11,17 +11,71 @@ import sklearn # return documents, labels -def load_multiclass_raw_document(path): - return qp.data.from_text(path, verbose=0, class2int=False) +# def load_multiclass_raw_document(path): +# return qp.data.from_text(path, verbose=0, class2int=False) def load_binary_vectors(path, nF=None): return sklearn.datasets.load_svmlight_file(path, n_features=nF) -if __name__ == '__main__': - X, y = load_binary_vectors('./data/T1A/public/training_vectors.txt') - print(X.shape) - print(y) +def gen_load_samples_T1A(path_dir:str, ground_truth_path:str = None): + # for ... : yield + pass + + +def gen_load_samples_T1B(path_dir:str, ground_truth_path:str = None): + # for ... : yield + pass + + +def gen_load_samples_T2A(path_dir:str, ground_truth_path:str = None): + # for ... : yield + pass + + +def gen_load_samples_T2B(path_dir:str, ground_truth_path:str = None): + # for ... : yield + pass + + +class ResultSubmission: + def __init__(self, team_name, run_name, task_name): + assert isinstance(team_name, str) and team_name, \ + f'invalid value encountered for team_name' + assert isinstance(run_name, str) and run_name, \ + f'invalid value encountered for run_name' + assert isinstance(task_name, str) and task_name in {'T1A', 'T1B', 'T2A', 'T2B'}, \ + f'invalid value encountered for task_name; valid values are T1A, T1B, T2A, and T2B' + self.team_name = team_name + self.run_name = run_name + self.task_name = task_name + self.data = {} + + def add(self, sample_name:str, prevalence_values:np.ndarray): + # assert the result is a valid sample_name (not repeated) + pass + + def __len__(self): + return len(self.data) + + @classmethod + def load(cls, path:str)-> 'ResultSubmission': + pass + + def dump(self, path:str): + # assert all samples are covered (check for test and dev accordingly) + pass + + def get(self, sample_name:str): + pass + + +def evaluate_submission(ground_truth_prevs: ResultSubmission, submission_prevs: ResultSubmission): + + pass + + + diff --git a/LeQua2022/main_binary_vector.py b/LeQua2022/main_binary_vector.py index 5a60520..fab1bc2 100644 --- a/LeQua2022/main_binary_vector.py +++ b/LeQua2022/main_binary_vector.py @@ -1,7 +1,6 @@ import pickle import numpy as np -from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model import LogisticRegression from tqdm import tqdm import pandas as pd