diff --git a/LeQua2022/baselines_T2.py b/LeQua2022/_depr_baselines_T2.py similarity index 52% rename from LeQua2022/baselines_T2.py rename to LeQua2022/_depr_baselines_T2.py index e119aa4..2b02e7f 100644 --- a/LeQua2022/baselines_T2.py +++ b/LeQua2022/_depr_baselines_T2.py @@ -22,10 +22,10 @@ import constants def baselines(): yield CC(LR(n_jobs=-1)), "CC" - yield ACC(LR(n_jobs=-1)), "ACC" - yield PCC(LR(n_jobs=-1)), "PCC" - yield PACC(LR(n_jobs=-1)), "PACC" - yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD" + # yield ACC(LR(n_jobs=-1)), "ACC" + # yield PCC(LR(n_jobs=-1)), "PCC" + # yield PACC(LR(n_jobs=-1)), "PACC" + # yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD" # yield HDy(LR(n_jobs=-1)) if args.task == 'T2A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy" # yield MLPE(), "MLPE" @@ -34,75 +34,15 @@ def main(args): models_path = qp.util.create_if_not_exist(os.path.join(args.modeldir, args.task)) - path_dev_vectors = os.path.join(args.datadir, 'dev_documents') - path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.csv') - path_train = os.path.join(args.datadir, 'training_documents.txt') + path_dev_vectors = os.path.join(args.datadir, 'dev_samples') + path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.txt') + path_train = os.path.join(args.datadir, 'training_data.txt') qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task] train = LabelledCollection.load(path_train, load_raw_documents) - if args.mode == 'tfidf1': - tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True) - if args.mode == 'tfidf2': - tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1,2)) - if args.mode == 'tfidf3': - tfidf = Pipeline([ - ('tfidf', TfidfVectorizer(min_df=5, sublinear_tf=True)), - ('svd', TruncatedSVD(n_components=300)) - ]) - if args.mode == 'tfidf4': - tfidf = Pipeline([ - ('tfidf', TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1,2))), - ('svd', TruncatedSVD(n_components=300)) - ]) - if args.mode == 'glove1': - tfidf = Pipeline([ - ('glove-ave', WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')), - ('zscore', StandardScaler()) - ]) - if args.mode == 'glove2': - tfidf = WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe') - if args.mode == 'glove3': - vect = TfidfVectorizer(min_df=5, sublinear_tf=True) - tfidf = Pipeline([ - ('tfidf', vect), - ('embedding', TfidfWordEmbeddingTransformer( - wordset_name='glove', - features_call=vect.get_feature_names_out, - path='/mnt/1T/Datasets/GloVe')), - ('zscore', StandardScaler()) - ]) - if args.mode == 'glove4': - vect = TfidfVectorizer(min_df=5, sublinear_tf=True) - tfidf = Pipeline([ - ('tfidf', vect), - ('embedding', TfidfWordEmbeddingTransformer( - wordset_name='glove', - features_call=vect.get_feature_names_out, - path='/mnt/1T/Datasets/GloVe')) - ]) - if args.mode == 'wce1': - tfidf = WordClassEmbeddingsTransformer() - if args.mode == 'wce2': - glove = Pipeline([ - ('glove-ave', WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')), - ('zscore', StandardScaler()) - ]) - wce = WordClassEmbeddingsTransformer() - tfidf = ConcatenateEmbeddingsTransformer([glove, wce]) - if args.mode == 'wce3': - glove = Pipeline([ - ('glove-ave', WordEmbeddingAverageTransformer(wordset_name='glove', path='/mnt/1T/Datasets/GloVe')), - ('zscore', StandardScaler()) - ]) - wce = WordClassEmbeddingsTransformer() - tfidf = Pipeline([ - ('glove-wce', ConcatenateEmbeddingsTransformer([glove, wce])), - ('svd', TruncatedSVD(n_components=300)) - ]) - target_metric = qp.error.mrae - + tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1,2)) train.instances = tfidf.fit_transform(*train.Xy) print(f'number of classes: {len(train.classes_)}') @@ -110,18 +50,18 @@ def main(args): print(f'training prevalence: {F.strprev(train.prevalence())}') print(f'training matrix shape: {train.instances.shape}') - param_grid = { - 'C': np.logspace(-3, 3, 7), - 'class_weight': ['balanced', None] - } - # param_grid = { - # 'C': [1], - # 'class_weight': ['balanced'] + # 'C': np.logspace(-3, 3, 7), + # 'class_weight': ['balanced', None] # } + param_grid = { + 'C': [1], + 'class_weight': ['balanced'] + } + def gen_samples(): - return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, return_id=False, + return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, load_fn=load_raw_unlabelled_documents, vectorizer=tfidf) outs = [] @@ -132,7 +72,7 @@ def main(args): param_grid, sample_size=None, protocol='gen', - error=target_metric, #qp.error.mae, + error=qp.error.mrae, refit=False, verbose=True ).fit(train, gen_samples) @@ -144,8 +84,6 @@ def main(args): print(f'saving model in {model_path}') pickle.dump(quantifier.best_model(), open(model_path, 'wb'), protocol=pickle.HIGHEST_PROTOCOL) - print(tfidf) - print(args.mode) print(outs) with open(f'{args.mode}.{args.task}.txt', 'wt') as foo: for line in outs: @@ -157,26 +95,23 @@ if __name__ == '__main__': parser.add_argument('task', metavar='TASK', type=str, choices=['T2A', 'T2B'], help='Task name (T2A, T2B)') parser.add_argument('datadir', metavar='DATA-PATH', type=str, - help='Path of the directory containing "dev_prevalences.csv", "training_documents.txt", and ' + help='Path of the directory containing "dev_prevalences.txt", "training_data.txt", and ' 'the directory "dev_documents"') parser.add_argument('modeldir', metavar='MODEL-PATH', type=str, help='Path where to save the models. ' 'A subdirectory named will be automatically created.') - parser.add_argument('mode', metavar='PREPROCESSMODE', type=str, - help='modality of preprocessing') args = parser.parse_args() if not os.path.exists(args.datadir): raise FileNotFoundError(f'path {args.datadir} does not exist') if not os.path.isdir(args.datadir): raise ValueError(f'path {args.datadir} is not a valid directory') - if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.csv")): - raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.csv" file') - if not os.path.exists(os.path.join(args.datadir, "training_documents.txt")): - raise FileNotFoundError(f'path {args.datadir} does not contain "training_documents.txt" file') - if not os.path.exists(os.path.join(args.datadir, "dev_documents")): - raise FileNotFoundError(f'path {args.datadir} does not contain "dev_vectors" folder') + if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.txt")): + raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.txt" file') + if not os.path.exists(os.path.join(args.datadir, "training_data.txt")): + raise FileNotFoundError(f'path {args.datadir} does not contain "training_data.txt" file') + if not os.path.exists(os.path.join(args.datadir, "dev_samples")): + raise FileNotFoundError(f'path {args.datadir} does not contain "dev_samples" folder') main(args) - # print('WITHOUT MODEL SELECTION') diff --git a/LeQua2022/baselines_T1.py b/LeQua2022/baselines.py similarity index 62% rename from LeQua2022/baselines_T1.py rename to LeQua2022/baselines.py index dd548c2..42ead5e 100644 --- a/LeQua2022/baselines_T1.py +++ b/LeQua2022/baselines.py @@ -1,5 +1,7 @@ import argparse import pickle + +from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.linear_model import LogisticRegression as LR from quapy.method.aggregative import * from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE @@ -16,8 +18,8 @@ def baselines(): yield CC(LR(n_jobs=-1)), "CC" # yield ACC(LR(n_jobs=-1)), "ACC" # yield PCC(LR(n_jobs=-1)), "PCC" - # yield PACC(LR(n_jobs=-1)), "PACC" - # yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD" + yield PACC(LR(n_jobs=-1)), "PACC" + yield EMQ(CalibratedClassifierCV(LR(), n_jobs=-1)), "SLD" # yield HDy(LR(n_jobs=-1)) if args.task == 'T1A' else OneVsAll(HDy(LR()), n_jobs=-1), "HDy" # yield MLPE(), "MLPE" @@ -26,35 +28,40 @@ def main(args): models_path = qp.util.create_if_not_exist(os.path.join(args.modeldir, args.task)) - path_dev_vectors = os.path.join(args.datadir, 'dev_vectors') - path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.csv') - path_train = os.path.join(args.datadir, 'training_vectors.csv') + path_dev_vectors = os.path.join(args.datadir, 'dev_samples') + path_dev_prevs = os.path.join(args.datadir, 'dev_prevalences.txt') + path_train = os.path.join(args.datadir, 'training_data.txt') qp.environ['SAMPLE_SIZE'] = constants.SAMPLE_SIZE[args.task] - train = LabelledCollection.load(path_train, load_vector_documents) - nF = train.instances.shape[1] + if args.task in {'T1A', 'T1B'}: + train = LabelledCollection.load(path_train, load_vector_documents) + + def gen_samples(): + return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, load_fn=load_vector_documents) + else: + train = LabelledCollection.load(path_train, load_raw_documents) + tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True, ngram_range=(1, 2)) + train.instances = tfidf.fit_transform(*train.Xy) + + def gen_samples(): + return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, + load_fn=load_raw_unlabelled_documents, vectorizer=tfidf) print(f'number of classes: {len(train.classes_)}') print(f'number of training documents: {len(train)}') print(f'training prevalence: {F.strprev(train.prevalence())}') print(f'training matrix shape: {train.instances.shape}') - # param_grid = { - # 'C': np.logspace(-3, 3, 7), - # 'class_weight': ['balanced', None] - # } - param_grid = { - 'C': [0.01], - 'class_weight': ['balanced'] + 'C': np.logspace(-3, 3, 7), + 'class_weight': ['balanced', None] } - target_metric = qp.error.mrae - - def gen_samples(): - return gen_load_samples(path_dev_vectors, ground_truth_path=path_dev_prevs, return_id=False, - load_fn=load_vector_documents, ext='csv') + # param_grid = { + # 'C': [0.01, 0.1, 1], + # 'class_weight': ['balanced'] + # } for quantifier, q_name in baselines(): print(f'{q_name}: Model selection') @@ -63,7 +70,7 @@ def main(args): param_grid, sample_size=None, protocol='gen', - error=target_metric, #qp.error.mae, + error=qp.error.mrae, refit=False, verbose=True ).fit(train, gen_samples) @@ -76,12 +83,12 @@ def main(args): if __name__ == '__main__': - parser = argparse.ArgumentParser(description='LeQua2022 Task T1A/T1B baselines') - parser.add_argument('task', metavar='TASK', type=str, choices=['T1A', 'T1B'], - help='Task name (T1A, T1B)') + parser = argparse.ArgumentParser(description='LeQua2022 baselines') + parser.add_argument('task', metavar='TASK', type=str, choices=['T1A', 'T1B', 'T2A', 'T2B'], + help='Task name (T1A, T1B, T2A, T2B)') parser.add_argument('datadir', metavar='DATA-PATH', type=str, - help='Path of the directory containing "dev_prevalences.csv", "training_vectors.txt", and ' - 'the directory "dev_vectors"') + help='Path of the directory containing "dev_prevalences.txt", "training_data.txt", and ' + 'the directory "dev_samples"') parser.add_argument('modeldir', metavar='MODEL-PATH', type=str, help='Path where to save the models. ' 'A subdirectory named will be automatically created.') @@ -91,11 +98,11 @@ if __name__ == '__main__': raise FileNotFoundError(f'path {args.datadir} does not exist') if not os.path.isdir(args.datadir): raise ValueError(f'path {args.datadir} is not a valid directory') - if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.csv")): - raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.csv" file') - if not os.path.exists(os.path.join(args.datadir, "training_vectors.csv")): - raise FileNotFoundError(f'path {args.datadir} does not contain "training_vectors.csv" file') - if not os.path.exists(os.path.join(args.datadir, "dev_vectors")): - raise FileNotFoundError(f'path {args.datadir} does not contain "dev_vectors" folder') + if not os.path.exists(os.path.join(args.datadir, "dev_prevalences.txt")): + raise FileNotFoundError(f'path {args.datadir} does not contain "dev_prevalences.txt" file') + if not os.path.exists(os.path.join(args.datadir, "training_data.txt")): + raise FileNotFoundError(f'path {args.datadir} does not contain "training_data.txt" file') + if not os.path.exists(os.path.join(args.datadir, "dev_samples")): + raise FileNotFoundError(f'path {args.datadir} does not contain "dev_samples" folder') main(args) diff --git a/LeQua2022/constants.py b/LeQua2022/constants.py index 7a664a9..2c4bc77 100644 --- a/LeQua2022/constants.py +++ b/LeQua2022/constants.py @@ -2,18 +2,15 @@ DEV_SAMPLES = 1000 TEST_SAMPLES = 5000 TXA_SAMPLE_SIZE = 250 -TXB_SAMPLE_SIZE = 250 - -T1A_SAMPLE_SIZE = 250 -T1B_SAMPLE_SIZE = 1000 -T2A_SAMPLE_SIZE = 250 -T2B_SAMPLE_SIZE = 1000 +TXB_SAMPLE_SIZE = 1000 SAMPLE_SIZE={ - 'T1A': T1A_SAMPLE_SIZE, - 'T1B': T1B_SAMPLE_SIZE, - 'T2A': T2A_SAMPLE_SIZE, - 'T2B': T2B_SAMPLE_SIZE + 'TXA': TXA_SAMPLE_SIZE, + 'TXB': TXB_SAMPLE_SIZE, + 'T1A': TXA_SAMPLE_SIZE, + 'T1B': TXB_SAMPLE_SIZE, + 'T2A': TXA_SAMPLE_SIZE, + 'T2B': TXB_SAMPLE_SIZE } ERROR_TOL = 1E-3 diff --git a/LeQua2022/data.py b/LeQua2022/data.py index e581096..6d09db9 100644 --- a/LeQua2022/data.py +++ b/LeQua2022/data.py @@ -34,53 +34,37 @@ def load_raw_unlabelled_documents(path, vectorizer=None): return documents, None -# def load_vector_documents(path, nF=None): -# X, y = sklearn.datasets.load_svmlight_file(path, n_features=nF, zero_based=True) -# y = y.astype(int) -# return X, y - def load_vector_documents(path): D = pd.read_csv(path).to_numpy(dtype=np.float) labelled = D.shape[1] == 301 if labelled: - X, y = D[:,:300], D[:,-1].astype(np.int).flatten() + X, y = D[:,1:], D[:,0].astype(np.int).flatten() else: X, y = D, None return X, y -def __gen_load_samples_with_groudtruth(path_dir:str, return_id:bool, ground_truth_path:str, ext:str, load_fn, **load_kwargs): +def __gen_load_samples_with_groudtruth(path_dir:str, return_id:bool, ground_truth_path:str, load_fn, **load_kwargs): true_prevs = ResultSubmission.load(ground_truth_path) for id, prevalence in true_prevs.iterrows(): - sample, _ = load_fn(os.path.join(path_dir, f'{id}.{ext}'), **load_kwargs) + sample, _ = load_fn(os.path.join(path_dir, f'{id}.txt'), **load_kwargs) yield (id, sample, prevalence) if return_id else (sample, prevalence) -def __gen_load_samples_without_groudtruth(path_dir:str, return_id:bool, ext:str, load_fn, **load_kwargs): - nsamples = len(glob(os.path.join(path_dir, f'*.{ext}'))) +def __gen_load_samples_without_groudtruth(path_dir:str, return_id:bool, load_fn, **load_kwargs): + nsamples = len(glob(os.path.join(path_dir, f'*.txt'))) for id in range(nsamples): - sample, _ = load_fn(os.path.join(path_dir, f'{id}.{ext}'), **load_kwargs) + sample, _ = load_fn(os.path.join(path_dir, f'{id}.txt'), **load_kwargs) yield (id, sample) if return_id else sample -def gen_load_samples(path_dir:str, ground_truth_path:str = None, return_id=True, ext='txt', load_fn=load_vector_documents, **load_kwargs): +def gen_load_samples(path_dir:str, ground_truth_path:str = None, return_id=False, load_fn=load_vector_documents, **load_kwargs): if ground_truth_path is None: # the generator function returns tuples (docid:str, sample:csr_matrix or str) - gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, ext, load_fn, **load_kwargs) + gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, load_fn, **load_kwargs) else: # the generator function returns tuples (docid:str, sample:csr_matrix or str, prevalence:ndarray) - gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, ext, load_fn, **load_kwargs) - for r in gen_fn: - yield r - - -def genSVD_load_samples_T1(load_fn, path_dir:str, nF:int, ground_truth_path:str = None, return_id=True): - if ground_truth_path is None: - # the generator function returns tuples (filename:str, sample:csr_matrix) - gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_id, load_fn, nF=nF) - else: - # the generator function returns tuples (filename:str, sample:csr_matrix, prevalence:ndarray) - gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, load_fn, nF=nF) + gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_id, ground_truth_path, load_fn, **load_kwargs) for r in gen_fn: yield r @@ -214,19 +198,19 @@ def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSub raise ValueError(f'these result files are not comparable since the categories are different: ' f'true={true_prevs.n_categories} categories vs. ' f'predictions={predicted_prevs.n_categories} categories') - ae, rae = [], [] + rae, ae = [], [] for sample_id, true_prevalence in true_prevs.iterrows(): pred_prevalence = predicted_prevs.prevalence(sample_id) - ae.append(qp.error.ae(true_prevalence, pred_prevalence)) rae.append(qp.error.rae(true_prevalence, pred_prevalence, eps=1./(2*sample_size))) + ae.append(qp.error.ae(true_prevalence, pred_prevalence)) - ae = np.asarray(ae) rae = np.asarray(rae) + ae = np.asarray(ae) if average: - return ae.mean(), rae.mean() + return rae.mean(), ae.mean() else: - return ae, rae + return rae, ae diff --git a/LeQua2022/evaluate.py b/LeQua2022/evaluate.py index 6095fd9..c21e368 100644 --- a/LeQua2022/evaluate.py +++ b/LeQua2022/evaluate.py @@ -8,21 +8,20 @@ LeQua2022 Official evaluation script """ def main(args): - if args.task in {'T1A', 'T2A'}: - qp.environ['SAMPLE_SIZE'] = constants.TXA_SAMPLE_SIZE - if args.task in {'T1B', 'T2B'}: - qp.environ['SAMPLE_SIZE'] = constants.TXB_SAMPLE_SIZE + + sample_size = constants.SAMPLE_SIZE[args.task] + true_prev = ResultSubmission.load(args.true_prevalences) pred_prev = ResultSubmission.load(args.pred_prevalences) - mae, mrae = evaluate_submission(true_prev, pred_prev) - print(f'MAE: {mae:.4f}') + + mrae, mae = evaluate_submission(true_prev, pred_prev, sample_size) print(f'MRAE: {mrae:.4f}') + print(f'MAE: {mae:.4f}') if args.output is not None: - qp.util.create_parent_dir(args.output) with open(args.output, 'wt') as foo: - foo.write(f'MAE: {mae:.4f}\n') foo.write(f'MRAE: {mrae:.4f}\n') + foo.write(f'MAE: {mae:.4f}\n') if __name__=='__main__': @@ -37,4 +36,7 @@ if __name__=='__main__': help='Path where to store the evaluation scores') args = parser.parse_args() + if args.output is not None: + qp.util.create_parent_dir(args.output) + main(args) diff --git a/LeQua2022/format_checker.py b/LeQua2022/format_checker.py index 2bf5cd9..d9ae549 100644 --- a/LeQua2022/format_checker.py +++ b/LeQua2022/format_checker.py @@ -1,8 +1,6 @@ import argparse -import quapy as qp -from data import ResultSubmission, evaluate_submission -import constants -import os +from data import ResultSubmission + """ LeQua2022 Official format-checker script @@ -13,9 +11,9 @@ def main(args): ResultSubmission.check_file_format(args.prevalence_file) except Exception as e: print(e) - print('Format check: not passed') + print('Format check: [not passed]') else: - print('Format check: passed') + print('Format check: [passed]') if __name__=='__main__': diff --git a/LeQua2022/predict.py b/LeQua2022/predict.py index c02d94b..b014468 100644 --- a/LeQua2022/predict.py +++ b/LeQua2022/predict.py @@ -1,7 +1,6 @@ import argparse import quapy as qp from data import ResultSubmission -import constants import os import pickle from tqdm import tqdm @@ -27,7 +26,7 @@ def main(args): # predictions predictions = ResultSubmission() - for sampleid, sample in tqdm(gen_load_samples(args.samples, args.nf), desc='predicting', total=nsamples): + for sampleid, sample in tqdm(gen_load_samples(args.samples, return_id=True, load_fn=), desc='predicting', total=nsamples): predictions.add(sampleid, model.quantify(sample)) # saving diff --git a/quapy/data/base.py b/quapy/data/base.py index 7a8df5c..3cb2392 100644 --- a/quapy/data/base.py +++ b/quapy/data/base.py @@ -9,111 +9,6 @@ from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold from quapy.functional import artificial_prevalence_sampling, strprev -# class Sampling: -# -# @abstractmethod -# def load(cls, path: str, loader_func: callable, classes=None): ... -# -# @abstractmethod -# @property -# def __len__(self): ... -# -# @abstractmethod -# @property -# def prevalence(self): ... -# -# @abstractmethod -# @property -# def n_classes(self): -# -# @property -# def binary(self): -# return self.n_classes == 2 -# -# def uniform_sampling_index(self, size): -# return np.random.choice(len(self), size, replace=False) -# -# def uniform_sampling(self, size): -# unif_index = self.uniform_sampling_index(size) -# return self.sampling_from_index(unif_index) -# -# def sampling(self, size, *prevs, shuffle=True): -# prev_index = self.sampling_index(size, *prevs, shuffle=shuffle) -# return self.sampling_from_index(prev_index) -# -# def sampling_from_index(self, index): -# documents = self.instances[index] -# labels = self.labels[index] -# return LabelledCollection(documents, labels, classes_=self.classes_) -# -# def split_stratified(self, train_prop=0.6, random_state=None): -# # with temp_seed(42): -# tr_docs, te_docs, tr_labels, te_labels = \ -# train_test_split(self.instances, self.labels, train_size=train_prop, stratify=self.labels, -# random_state=random_state) -# return LabelledCollection(tr_docs, tr_labels), LabelledCollection(te_docs, te_labels) -# -# def artificial_sampling_generator(self, sample_size, n_prevalences=101, repeats=1): -# dimensions = self.n_classes -# for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats): -# yield self.sampling(sample_size, *prevs) -# -# def artificial_sampling_index_generator(self, sample_size, n_prevalences=101, repeats=1): -# dimensions = self.n_classes -# for prevs in artificial_prevalence_sampling(dimensions, n_prevalences, repeats): -# yield self.sampling_index(sample_size, *prevs) -# -# def natural_sampling_generator(self, sample_size, repeats=100): -# for _ in range(repeats): -# yield self.uniform_sampling(sample_size) -# -# def natural_sampling_index_generator(self, sample_size, repeats=100): -# for _ in range(repeats): -# yield self.uniform_sampling_index(sample_size) -# -# def __add__(self, other): -# if other is None: -# return self -# elif issparse(self.instances) and issparse(other.instances): -# join_instances = vstack([self.instances, other.instances]) -# elif isinstance(self.instances, list) and isinstance(other.instances, list): -# join_instances = self.instances + other.instances -# elif isinstance(self.instances, np.ndarray) and isinstance(other.instances, np.ndarray): -# join_instances = np.concatenate([self.instances, other.instances]) -# else: -# raise NotImplementedError('unsupported operation for collection types') -# labels = np.concatenate([self.labels, other.labels]) -# return LabelledCollection(join_instances, labels) -# -# @property -# def Xy(self): -# return self.instances, self.labels -# -# def stats(self, show=True): -# ninstances = len(self) -# instance_type = type(self.instances[0]) -# if instance_type == list: -# nfeats = len(self.instances[0]) -# elif instance_type == np.ndarray or issparse(self.instances): -# nfeats = self.instances.shape[1] -# else: -# nfeats = '?' -# stats_ = {'instances': ninstances, -# 'type': instance_type, -# 'features': nfeats, -# 'classes': self.classes_, -# 'prevs': strprev(self.prevalence())} -# if show: -# print(f'#instances={stats_["instances"]}, type={stats_["type"]}, #features={stats_["features"]}, ' -# f'#classes={stats_["classes"]}, prevs={stats_["prevs"]}') -# return stats_ -# -# def kFCV(self, nfolds=5, nrepeats=1, random_state=0): -# kf = RepeatedStratifiedKFold(n_splits=nfolds, n_repeats=nrepeats, random_state=random_state) -# for train_index, test_index in kf.split(*self.Xy): -# train = self.sampling_from_index(train_index) -# test = self.sampling_from_index(test_index) -# yield train, test class LabelledCollection: ''' @@ -146,8 +41,8 @@ class LabelledCollection: self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_} @classmethod - def load(cls, path: str, loader_func: callable, classes=None): - return LabelledCollection(*loader_func(path), classes) + def load(cls, path: str, loader_func: callable, classes=None, **loader_kwargs): + return LabelledCollection(*loader_func(path, **loader_kwargs), classes) def __len__(self): return self.instances.shape[0]