some sketches for lequa2022 file reading

This commit is contained in:
Alejandro Moreo Fernandez 2021-10-21 19:54:18 +02:00
parent 65b2c2ce74
commit 646d21873f
3 changed files with 62 additions and 8 deletions

View File

@ -4,4 +4,5 @@
4. model selection
5. plots
6. estoy leyendo los samples en orden, y no hace falta. Sería mejor una función genérica que lee todos los ejemplos y
que de todos modos genera un output con el mismo nombre del file
que de todos modos genera un output con el mismo nombre del file
7. Make the ResultSubmission class abstract and create 4 concrete subclasses (one per task), thus forcing the field task_name to be set correctly

View File

@ -11,17 +11,71 @@ import sklearn
# return documents, labels
def load_multiclass_raw_document(path):
    """Read the raw-document file at ``path`` through ``qp.data.from_text``.

    The loader is invoked silently (``verbose=0``) and with ``class2int=False``.

    :param path: location of the file, forwarded unchanged to the loader
    :return: whatever ``qp.data.from_text`` returns for that file
    """
    loaded = qp.data.from_text(path, class2int=False, verbose=0)
    return loaded
# def load_multiclass_raw_document(path):
# return qp.data.from_text(path, verbose=0, class2int=False)
def load_binary_vectors(path, nF=None):
    """Load a file of vectors stored in svmlight / libsvm format.

    :param path: location of the file, forwarded to ``load_svmlight_file``
    :param nF: number of features, forwarded as ``n_features``; None (the
        default) leaves the choice to ``load_svmlight_file``
    :return: the value returned by ``load_svmlight_file``
    """
    # Import the function from its submodule explicitly: a bare ``import sklearn``
    # does not guarantee that ``sklearn.datasets`` has been loaded and bound as
    # an attribute of the package.
    from sklearn.datasets import load_svmlight_file

    return load_svmlight_file(path, n_features=nF)
if __name__ == '__main__':
    # Manual smoke check: load the T1A training vectors and show the shape of
    # the loaded matrix followed by the second returned value, one per line.
    X, y = load_binary_vectors(path='./data/T1A/public/training_vectors.txt')
    print(X.shape, y, sep='\n')
def gen_load_samples_T1A(path_dir:str, ground_truth_path:str = None):
    """Placeholder for the T1A sample loader (not implemented yet).

    The sketch is meant to become a ``for ... : yield`` loop; for now the
    function ignores its arguments and returns None.

    :param path_dir: currently unused
    :param ground_truth_path: currently unused
    :return: None
    """
    return None
def gen_load_samples_T1B(path_dir:str, ground_truth_path:str = None):
    """Placeholder for the T1B sample loader (not implemented yet).

    The sketch is meant to become a ``for ... : yield`` loop; for now the
    function ignores its arguments and returns None.

    :param path_dir: currently unused
    :param ground_truth_path: currently unused
    :return: None
    """
    return None
def gen_load_samples_T2A(path_dir:str, ground_truth_path:str = None):
    """Placeholder for the T2A sample loader (not implemented yet).

    The sketch is meant to become a ``for ... : yield`` loop; for now the
    function ignores its arguments and returns None.

    :param path_dir: currently unused
    :param ground_truth_path: currently unused
    :return: None
    """
    return None
def gen_load_samples_T2B(path_dir:str, ground_truth_path:str = None):
    """Placeholder for the T2B sample loader (not implemented yet).

    The sketch is meant to become a ``for ... : yield`` loop; for now the
    function ignores its arguments and returns None.

    :param path_dir: currently unused
    :param ground_truth_path: currently unused
    :return: None
    """
    return None
class ResultSubmission:
    """Results of one run submitted by a team to one of the tasks.

    This is a sketch: only construction and ``len()`` are functional; ``add``,
    ``load``, ``dump`` and ``get`` are placeholders that return None.
    """

    def __init__(self, team_name, run_name, task_name):
        """Validate and store the identifying fields; start with no data.

        :param team_name: non-empty string
        :param run_name: non-empty string
        :param task_name: one of 'T1A', 'T1B', 'T2A', 'T2B'
        :raises AssertionError: if any of the three values is invalid
        """
        # team_name and run_name share the same rule: a non-empty string
        for field, value in (('team_name', team_name), ('run_name', run_name)):
            assert isinstance(value, str) and value, \
                f'invalid value encountered for {field}'

        known_tasks = ('T1A', 'T1B', 'T2A', 'T2B')
        assert isinstance(task_name, str) and task_name in known_tasks, \
            'invalid value encountered for task_name; valid values are T1A, T1B, T2A, and T2B'

        self.team_name, self.run_name, self.task_name = team_name, run_name, task_name
        self.data = {}

    def add(self, sample_name:str, prevalence_values:np.ndarray):
        """Placeholder: not implemented yet, returns None.

        TODO: assert that sample_name is valid (i.e., not repeated).
        """
        return None

    def __len__(self):
        """Return the number of entries currently held in ``self.data``."""
        return len(self.data)

    @classmethod
    def load(cls, path:str)-> 'ResultSubmission':
        """Placeholder: not implemented yet, currently returns None."""
        return None

    def dump(self, path:str):
        """Placeholder: not implemented yet, returns None.

        TODO: assert all samples are covered (check for test and dev accordingly).
        """
        return None

    def get(self, sample_name:str):
        """Placeholder: not implemented yet, returns None."""
        return None
def evaluate_submission(ground_truth_prevs: ResultSubmission, submission_prevs: ResultSubmission):
    """Placeholder for the evaluation of a submission (not implemented yet).

    Both arguments are currently ignored and the function returns None.

    :param ground_truth_prevs: currently unused
    :param submission_prevs: currently unused
    :return: None
    """
    return None

View File

@ -1,7 +1,6 @@
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
import pandas as pd