first commit, some ideas already explored

2024-02-23 16:42:31 +01:00 · 2024-02-23 16:42:31 +01:00 · 1c03dd651b
parent b3ccf71edb
commit 1c03dd651b
6 changed files with 685 additions and 0 deletions
--- a/Retrieval/commons.py
+++ b/Retrieval/commons.py
@ -0,0 +1,74 @@
+import pandas as pd
+import numpy as np
+from glob import glob
+from os.path import join
+
+from quapy.data import LabelledCollection
+from quapy.protocol import AbstractProtocol
+
+
+def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
+    """
+    Reads a tab-separated file and returns its texts and continent labels.
+
+    Rows whose 'continent' value is 'Antarctica' are discarded. If `parse_columns`
+    is true, the 'rank' and 'score' columns are read as well and the remaining rows
+    are reordered by ascending 'rank'. If `max_lines` is given, only that many
+    leading rows are kept (after filtering and reordering).
+
+    :param path: path to a tab-separated file with (at least) the columns 'text'
+        and 'continent', plus 'rank' and 'score' when `parse_columns` is true
+    :param parse_columns: whether to read the ranking columns and sort by rank
+    :param verbose: whether to print a progress message around the file read
+    :param max_lines: maximum number of rows to return, or None to return all
+    :return: a tuple (X, y) of numpy arrays holding the texts and the labels
+    """
+    if verbose:
+        print(f'loading {path}...', end='')
+    table = pd.read_csv(path, sep='\t')
+    if verbose:
+        print('[done]')
+
+    texts = table['text'].values
+    labels = table['continent'].values
+
+    # a single boolean mask serves every column that has to be filtered
+    keep = labels != 'Antarctica'
+    texts, labels = texts[keep], labels[keep]
+
+    if parse_columns:
+        rank = table['rank'].values[keep]
+        # 'score' is read (so a missing column still fails) but it is not returned
+        scores = table['score'].values[keep]
+        by_rank = np.argsort(rank)
+        texts, labels = texts[by_rank], labels[by_rank]
+
+    if max_lines is not None:
+        texts, labels = texts[:max_lines], labels[:max_lines]
+
+    return texts, labels
+
+
+class RetrievedSamples(AbstractProtocol):
+    """
+    Protocol that generates pairs (training sample, test sample) from ranking files.
+
+    Test files are searched under `<path_dir>/test_rankings/`; for every test file,
+    the path of the companion training file is obtained by replacing every
+    occurrence of 'test_' with 'training_' in the test file path.
+
+    :param path_dir: root directory of the collection
+    :param load_fn: function called as `load_fn(path, parse_columns=True, max_lines=...)`
+        that returns a tuple (X, y)
+    :param vectorizer: object whose `transform` method is applied to the loaded X
+    :param max_train_lines: value passed as `max_lines` when loading training files
+    :param max_test_lines: value passed as `max_lines` when loading test files
+    :param classes: value passed as `classes` when building the training collection
+    """
+
+    def __init__(self, path_dir: str, load_fn, vectorizer, max_train_lines=None, max_test_lines=None, classes=None):
+        self.path_dir = path_dir
+        self.load_fn = load_fn
+        self.vectorizer = vectorizer
+        self.max_train_lines = max_train_lines
+        self.max_test_lines = max_test_lines
+        self.classes=classes
+
+    def __call__(self):
+        """
+        Yields one pair (train_sample, test_sample) per test file found.
+
+        If building the test collection raises a ValueError, the error is printed
+        and the pair (None, None) is yielded for that file instead.
+        """
+        # NOTE(review): the 'test_rankingstraining_rankings_' file prefix looks like a
+        # search-and-replace artefact -- confirm against the file names on disk
+        for file in glob(join(self.path_dir, 'test_rankings', 'test_rankingstraining_rankings_*.txt')):
+
+            # the replacement acts on the whole path, i.e., on the directory name too
+            X, y = self.load_fn(file.replace('test_', 'training_'), parse_columns=True, max_lines=self.max_train_lines)
+            X = self.vectorizer.transform(X)
+            train_sample = LabelledCollection(X, y, classes=self.classes)
+
+            X, y = self.load_fn(file, parse_columns=True, max_lines=self.max_test_lines)
+            X = self.vectorizer.transform(X)
+            try:
+                test_sample = LabelledCollection(X, y, classes=train_sample.classes_)
+            except ValueError as e:
+                print(f'file {file} caused error {e}')
+                yield None, None
+                # move on to the next file: without this, the generator would fall through
+                # and yield an undefined (or stale, from a previous file) test sample
+                continue
+
+            yield train_sample, test_sample
--- a/Retrieval/fourth.py
+++ b/Retrieval/fourth.py
@ -0,0 +1,161 @@
+import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import GridSearchCV
+from sklearn.svm import LinearSVC
+
+import quapy as qp
+import quapy.functional as F
+from Retrieval.commons import RetrievedSamples, load_txt_sample
+from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
+from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
+from quapy.protocol import AbstractProtocol
+from quapy.data.base import LabelledCollection
+
+from glob import glob
+from os.path import join
+from tqdm import tqdm
+
+"""
+In this fourth experiment, we have pairs (Li,Ui), with Li a training set and Ui a test set as
+in the third experiment, and the fairness groups are defined based on geographic information, as in the third case.
+The difference here is that the data Li and Ui have been drawn by retrieving query-related documents from
+a pool of the same size.
+
+For now, 1000 documents in training and 100 in test.
+It seems that there is now very little shift.
+"""
+
+def cls(classifier_trained=None):
+    """
+    Returns the classifier to be plugged into a quantifier.
+
+    :param classifier_trained: an already available classifier, or None
+    :return: `classifier_trained` itself if it is not None; otherwise a new
+        LogisticRegression created with default arguments
+    """
+    if classifier_trained is not None:
+        return classifier_trained
+    return LogisticRegression()
+
+
+def methods(classifier_trained=None):
+    """
+    Generates the pairs (method name, quantifier) to be evaluated.
+
+    Every aggregative quantifier receives the classifier returned by
+    `cls(classifier_trained)`; quantifiers are only instantiated when reached.
+
+    :param classifier_trained: classifier handed over to `cls`, or None
+    :return: a generator of tuples (name, quantifier)
+    """
+    def learner():
+        return cls(classifier_trained)
+
+    # EMQ variants that apply a recalibration ('vs' is currently left out)
+    emq_recalibrations = (('EMQ-BCTS', 'bcts'), ('EMQ-TS', 'ts'), ('EMQ-NBVS', 'nbvs'))
+    # KDEy-ML variants, one per bandwidth
+    kde_bandwidths = (
+        ('KDE001', 0.001),
+        ('KDE005', 0.005),
+        ('KDE01', 0.01),
+        ('KDE02', 0.02),
+        ('KDE03', 0.03),
+        ('KDE05', 0.05),
+        ('KDE07', 0.07),
+        ('KDE10', 0.10),
+    )
+
+    yield ('CC', ClassifyAndCount(learner()))
+    yield ('PACC', PACC(learner(), val_split=5, n_jobs=-1))
+    yield ('EMQ', EMQ(learner(), exact_train_prev=True))
+    yield ('EMQh', EMQ(learner(), exact_train_prev=False))
+    for name, recalib in emq_recalibrations:
+        yield (name, EMQ(learner(), exact_train_prev=False, recalib=recalib))
+    yield ('PCC', PCC(learner()))
+    yield ('ACC', ACC(learner(), val_split=5, n_jobs=-1))
+    for name, bandwidth in kde_bandwidths:
+        yield (name, KDEyML(learner(), val_split=5, n_jobs=-1, bandwidth=bandwidth))
+    yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
+
+
+def train_classifier():
+    """
+    Fits a tf-idf vectorizer and a logistic regression on the training file.
+
+    Reads the module-level names `train_path` and `REDUCE_TR`. The training file
+    is loaded with `load_txt_sample`; if `REDUCE_TR` is positive, a sample of
+    `REDUCE_TR` documents is drawn at the prevalence of the loaded collection.
+    The classifier hyperparameters ('C' and 'class_weight') are chosen with a
+    5-fold grid search and the best estimator is kept.
+
+    :return: a tuple (fitted TfidfVectorizer, best estimator of the grid search)
+    """
+    vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=10)
+    collection = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
+
+    if REDUCE_TR > 0:
+        print('Reducing the number of documents in the training to', REDUCE_TR)
+        collection = collection.sampling(REDUCE_TR, *collection.prevalence())
+
+    raw_docs, labels = collection.Xy
+    features = vectorizer.fit_transform(raw_docs)
+    print('L orig shape = ', features.shape)
+
+    collection = LabelledCollection(features, labels)
+
+    print('training classifier')
+    search = GridSearchCV(
+        LogisticRegression(),
+        param_grid={'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]},
+        n_jobs=-1,
+        cv=5,
+    )
+    search.fit(features, labels)
+    best_classifier = search.best_estimator_
+    print('[Done!]')
+
+    print('training classes:', collection.classes_)
+    print('training prevalence:', collection.prevalence())
+
+    return vectorizer, best_classifier
+
+
+
+# experiment configuration
+RANK_AT_K = 1000
+REDUCE_TR = 50000
+qp.environ['SAMPLE_SIZE'] = RANK_AT_K
+
+data_path = './50_50_split_trec'
+train_path = join(data_path, 'train_50_50_continent.txt')
+
+# NOTE(review): presumably pickled_resource reuses 'classifier.pkl' when it exists and
+# only calls train_classifier otherwise -- confirm against the quapy documentation
+tfidf, classifier_trained = qp.util.pickled_resource('classifier.pkl', train_classifier)
+trained = True
+
+experiment_prot = RetrievedSamples(
+    data_path,
+    load_fn=load_txt_sample,
+    vectorizer=tfidf,
+    max_train_lines=None,
+    max_test_lines=RANK_AT_K,
+    classes=classifier_trained.classes_,
+)
+
+# mean errors per method name, filled in as methods finish
+result_mae_dict = {}
+result_mrae_dict = {}
+for method_name, quantifier in methods(classifier_trained):
+    mae_errors, mrae_errors = [], []
+    # the progress-bar total is hard-coded
+    pbar = tqdm(experiment_prot(), total=49)
+    for train, test in pbar:
+        if train is None:
+            # the protocol yields (None, None) for files it could not turn into a sample
+            print('skipping one!')
+        else:
+            try:
+                reuse_classifier = trained and method_name != 'MLPE'
+                if reuse_classifier:
+                    quantifier.fit(train, val_split=train, fit_classifier=False)
+                else:
+                    quantifier.fit(train)
+                estim_prev = quantifier.quantify(test.instances)
+
+                mae_errors.append(qp.error.mae(test.prevalence(), estim_prev))
+                mrae_errors.append(qp.error.mrae(test.prevalence(), estim_prev))
+            except Exception as e:
+                print(f'wow, something happened here! skipping; {e}')
+
+        pbar.set_description(f'{method_name}\tmae={np.mean(mae_errors):.4f}\tmrae={np.mean(mrae_errors):.4f}')
+    print()
+    result_mae_dict[method_name] = np.mean(mae_errors)
+    result_mrae_dict[method_name] = np.mean(mrae_errors)
+
+print('Results\n'+('-'*100))
+for method_name, MAE in result_mae_dict.items():
+    MRAE = result_mrae_dict[method_name]
+    print(f'{method_name}\t{MAE=:.5f}\t{MRAE=:.5f}')
+
+
+
+
+
+
+
--- a/Retrieval/previous/preliminary_.py
+++ b/Retrieval/previous/preliminary_.py
@ -0,0 +1,98 @@
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+
+import quapy as qp
+import quapy.functional as F
+from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
+from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
+from quapy.protocol import AbstractProtocol
+from quapy.data.base import LabelledCollection
+
+from glob import glob
+from os.path import join
+
+"""
+This was the very first experiment: one big training set and many test rankings produced according to some queries.
+The quantification methods did not seem to work. The more sophisticated the method, the worse it performed.
+This is a clear indication that the PPS (prior probability shift) assumptions do not hold.
+Actually, while the training set could be some iid sample from a distribution L and every test set
+is an iid sample from a distribution U, it is pretty clear that P(X|Y) is different, since the test sets
+are biased towards a query term whereas the training set is not.
+"""
+
+def methods():
+    """
+    Generates the pairs (method name, quantifier) to be evaluated.
+
+    MLPE comes first; every other quantifier wraps its own
+    `LogisticRegression(n_jobs=-1)` and is only instantiated when reached.
+
+    :return: a generator of tuples (name, quantifier)
+    """
+    yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
+    aggregative = (('CC', ClassifyAndCount), ('ACC', ACC), ('PCC', PCC), ('PACC', PACC), ('EMQ', EMQ))
+    for name, quantifier_class in aggregative:
+        yield (name, quantifier_class(LogisticRegression(n_jobs=-1)))
+
+
+def load_txt_sample(path, verbose=False):
+    """
+    Reads a tab-separated file and returns its 'text' and 'first_letter_category' columns.
+
+    :param path: path to a tab-separated file containing both columns
+    :param verbose: whether to print a progress message around the file read
+    :return: a tuple (X, y) with the two columns, as returned by pandas
+    """
+    if verbose:
+        print(f'loading {path}...', end='')
+    frame = pd.read_csv(path, sep='\t')
+    if verbose:
+        print('[done]')
+    return frame['text'], frame['first_letter_category']
+
+class RetrievedSamples(AbstractProtocol):
+    """
+    Protocol that generates one test sample per file matching
+    `<path_dir>/test_data_*.txt`.
+
+    :param path_dir: directory in which the test files are searched
+    :param load_fn: function called as `load_fn(path)` that returns a tuple (X, y)
+    :param vectorizer: object whose `transform` method is applied to the loaded X
+    :param classes: value passed as `classes` when building each collection
+    """
+
+    def __init__(self, path_dir: str, load_fn, vectorizer, classes):
+        self.classes = classes
+        self.vectorizer = vectorizer
+        self.load_fn = load_fn
+        self.path_dir = path_dir
+
+    def __call__(self):
+        """
+        Yields the `Xp` attribute of the LabelledCollection built from each test file.
+
+        A warning is printed for files whose number of instances differs from
+        `qp.environ['SAMPLE_SIZE']`; such files are still processed.
+        """
+        pattern = join(self.path_dir, 'test_data_*.txt')
+        for file in glob(pattern):
+            docs, labels = self.load_fn(file)
+            n_docs = len(docs)
+            if n_docs != qp.environ['SAMPLE_SIZE']:
+                print(f'[warning]: file {file} contains {n_docs} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
+            sample = LabelledCollection(self.vectorizer.transform(docs), labels, classes=self.classes)
+            yield sample.Xp
+
+
+# experiment configuration
+qp.environ['SAMPLE_SIZE'] = 100
+
+data_path = './data'
+train_path = join(data_path, 'train_data.txt')
+
+tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5)
+
+# load the raw training documents, then replace them with their tf-idf representation
+training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True)
+raw_docs, ytr = training.Xy
+Xtr = tfidf.fit_transform(raw_docs)
+print('Xtr shape = ', Xtr.shape)
+
+training = LabelledCollection(Xtr, ytr)
+classes = training.classes_
+
+test_prot = RetrievedSamples(
+    data_path,
+    load_fn=load_txt_sample,
+    vectorizer=tfidf,
+    classes=classes,
+)
+
+# show the training prevalence next to the prevalence of every test sample
+print('Training prevalence:', F.strprev(training.prevalence()))
+for _, prevalence in test_prot():
+    print('Test prevalence:', F.strprev(prevalence))
+
+# fit every method on the training set and report its mean errors over the test samples
+for method_name, quantifier in methods():
+    print('training ', method_name)
+    quantifier.fit(training)
+    print('[done]')
+
+    report = qp.evaluation.evaluation_report(
+        quantifier,
+        test_prot,
+        error_metrics=['mae', 'mrae'],
+        verbose=True,
+    )
+
+    print(report.mean())
+
+
+
--- a/Retrieval/previous/second.py
+++ b/Retrieval/previous/second.py
@ -0,0 +1,131 @@
+import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+
+import quapy as qp
+import quapy.functional as F
+from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
+from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
+from quapy.protocol import AbstractProtocol
+from quapy.data.base import LabelledCollection
+
+from glob import glob
+from os.path import join
+from tqdm import tqdm
+
+"""
+In this second experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set.
+Both elements in the pair are *retrieved according to the same query*. This is a way to impose
+the same type of bias that was present in the test, to the training set. Let's see...  
+"""
+
+def methods():
+    """
+    Generates the pairs (method name, quantifier) to be evaluated.
+
+    Every aggregative quantifier wraps its own `LogisticRegression()`;
+    quantifiers are only instantiated when reached.
+
+    :return: a generator of tuples (name, quantifier)
+    """
+    # arguments shared by the quantifiers that take a validation split
+    validation_args = dict(val_split=5, n_jobs=-1)
+
+    yield ('PACC', PACC(LogisticRegression(), **validation_args))
+    for name, quantifier_class in (('CC', ClassifyAndCount), ('EMQ', EMQ), ('PCC', PCC)):
+        yield (name, quantifier_class(LogisticRegression()))
+    yield ('ACC', ACC(LogisticRegression(), **validation_args))
+    yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
+
+
+def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
+    """
+    Reads a tab-separated file and returns its texts and 'first_letter_category' labels.
+
+    If `parse_columns` is true, the 'rank' and 'score' columns are read as well and
+    the rows are reordered by ascending 'rank'. If `max_lines` is given, only that
+    many leading rows are kept (after reordering).
+
+    :param path: path to a tab-separated file with (at least) the columns 'text' and
+        'first_letter_category', plus 'rank' and 'score' when `parse_columns` is true
+    :param parse_columns: whether to read the ranking columns and sort by rank
+    :param verbose: whether to print a progress message around the file read
+    :param max_lines: maximum number of rows to return, or None to return all
+    :return: a tuple (X, y) of numpy arrays holding the texts and the labels
+    """
+    if verbose:
+        print(f'loading {path}...', end='')
+    table = pd.read_csv(path, sep='\t')
+    if verbose:
+        print('[done]')
+
+    texts = table['text'].values
+    labels = table['first_letter_category'].values
+
+    if parse_columns:
+        rank = table['rank'].values
+        # 'score' is read (so a missing column still fails) but it is not returned
+        scores = table['score'].values
+        by_rank = np.argsort(rank)
+        texts, labels = texts[by_rank], labels[by_rank]
+
+    if max_lines is not None:
+        texts, labels = texts[:max_lines], labels[:max_lines]
+
+    return texts, labels
+
+
+class RetrievedSamples(AbstractProtocol):
+    """
+    Protocol that generates pairs (training sample, test sample) from ranking files.
+
+    Test files are those matching `<path_dir>/test_rankings_*.txt`; for every test
+    file, the path of the companion training file is obtained by replacing every
+    occurrence of 'test_' with 'training_' in the test file path.
+
+    :param path_dir: directory in which the test files are searched
+    :param load_fn: function called as `load_fn(path, parse_columns=True, max_lines=...)`
+        that returns a tuple (X, y)
+    :param vectorizer: object whose `transform` method is applied to the loaded X
+    :param classes: value passed as `classes` when building both collections
+    :param max_train_lines: value passed as `max_lines` when loading training files
+    :param max_test_lines: value passed as `max_lines` when loading test files
+    """
+
+    def __init__(self, path_dir: str, load_fn, vectorizer, classes, max_train_lines=None, max_test_lines=None):
+        self.path_dir = path_dir
+        self.load_fn = load_fn
+        self.vectorizer = vectorizer
+        self.classes = classes
+        self.max_train_lines, self.max_test_lines = max_train_lines, max_test_lines
+
+    def __call__(self):
+        """
+        Yields one pair (train_sample, test_sample) per test file found.
+
+        A warning is printed for test files whose number of loaded instances differs
+        from `qp.environ['SAMPLE_SIZE']`; such files are still processed.
+        """
+        for file in glob(join(self.path_dir, 'test_rankings_*.txt')):
+            # the replacement acts on the whole path, not only on the file name
+            train_file = file.replace('test_', 'training_')
+            docs, labels = self.load_fn(train_file, parse_columns=True, max_lines=self.max_train_lines)
+            train_sample = LabelledCollection(self.vectorizer.transform(docs), labels, classes=self.classes)
+
+            docs, labels = self.load_fn(file, parse_columns=True, max_lines=self.max_test_lines)
+            n_docs = len(docs)
+            if n_docs != qp.environ['SAMPLE_SIZE']:
+                print(f'[warning]: file {file} contains {n_docs} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
+            test_sample = LabelledCollection(self.vectorizer.transform(docs), labels, classes=self.classes)
+
+            yield train_sample, test_sample
+
+
+# experiment configuration
+RANK_AT_K = 500
+REDUCE_TR = 50000
+qp.environ['SAMPLE_SIZE'] = RANK_AT_K
+
+data_path = './newCollection'
+train_path = join(data_path, 'train_data.txt')
+
+tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
+
+training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
+if REDUCE_TR > 0:
+    print('Reducing the number of documents in the training to', REDUCE_TR)
+    training = training.sampling(REDUCE_TR)
+
+# the vectorizer is fitted on the (possibly reduced) big training set only
+raw_docs, ytr = training.Xy
+Xtr = tfidf.fit_transform(raw_docs)
+print('L orig shape = ', Xtr.shape)
+
+training = LabelledCollection(Xtr, ytr)
+classes = training.classes_
+
+experiment_prot = RetrievedSamples(
+    data_path,
+    load_fn=load_txt_sample,
+    vectorizer=tfidf,
+    classes=classes,
+    max_train_lines=RANK_AT_K,
+    max_test_lines=RANK_AT_K,
+)
+
+for method_name, quantifier in methods():
+    print('Starting with method=', method_name)
+
+    errors = []
+    # the progress-bar total is hard-coded
+    pbar = tqdm(experiment_prot(), total=49)
+    for train, test in pbar:
+        # each quantifier is refitted on the training sample paired with the test sample
+        quantifier.fit(train)
+        estim_prev = quantifier.quantify(test.instances)
+        errors.append(qp.error.mae(test.prevalence(), estim_prev))
+
+        pbar.set_description(f'mae={np.mean(errors):.4f}')
+    print()
+
+
+
+
--- a/Retrieval/previous/third.py
+++ b/Retrieval/previous/third.py
@ -0,0 +1,155 @@
+import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+
+import quapy as qp
+import quapy.functional as F
+from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
+from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
+from quapy.protocol import AbstractProtocol
+from quapy.data.base import LabelledCollection
+
+from glob import glob
+from os.path import join
+from tqdm import tqdm
+
+"""
+In this third experiment, we have pairs (Li,Ui), with Li a training set and Ui a test set as
+in the second experiment, but in this case the fairness groups are defined based on geographic information.
+"""
+
+def methods():
+    """
+    Generates the pairs (method name, quantifier) to be evaluated.
+
+    Every aggregative quantifier wraps its own `LogisticRegression()`;
+    quantifiers are only instantiated when reached.
+
+    :return: a generator of tuples (name, quantifier)
+    """
+    # arguments shared by the quantifiers that take a validation split
+    validation_args = dict(val_split=5, n_jobs=-1)
+
+    yield ('CC', ClassifyAndCount(LogisticRegression()))
+    yield ('PACC', PACC(LogisticRegression(), **validation_args))
+    for name, quantifier_class in (('EMQ', EMQ), ('PCC', PCC)):
+        yield (name, quantifier_class(LogisticRegression()))
+    yield ('ACC', ACC(LogisticRegression(), **validation_args))
+    yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
+
+
+def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
+    """
+    Reads a tab-separated file and returns its texts and continent labels.
+
+    Rows whose 'continent' value is 'Antarctica' are discarded. If `parse_columns`
+    is true, the 'rank' and 'score' columns are read as well and the remaining rows
+    are reordered by ascending 'rank'. If `max_lines` is given, only that many
+    leading rows are kept (after filtering and reordering).
+
+    :param path: path to a tab-separated file with (at least) the columns 'text'
+        and 'continent', plus 'rank' and 'score' when `parse_columns` is true
+    :param parse_columns: whether to read the ranking columns and sort by rank
+    :param verbose: whether to print a progress message around the file read
+    :param max_lines: maximum number of rows to return, or None to return all
+    :return: a tuple (X, y) of numpy arrays holding the texts and the labels
+    """
+    if verbose:
+        print(f'loading {path}...', end='')
+    frame = pd.read_csv(path, sep='\t')
+    if verbose:
+        print('[done]')
+
+    docs = frame['text'].values
+    continents = frame['continent'].values
+
+    # the same row selection applies to every column that is read
+    not_antarctica = continents != 'Antarctica'
+    docs = docs[not_antarctica]
+    continents = continents[not_antarctica]
+
+    if parse_columns:
+        rank = frame['rank'].values[not_antarctica]
+        # 'score' is read (so a missing column still fails) but it is not returned
+        scores = frame['score'].values[not_antarctica]
+        ascending = np.argsort(rank)
+        docs = docs[ascending]
+        continents = continents[ascending]
+
+    if max_lines is None:
+        return docs, continents
+    return docs[:max_lines], continents[:max_lines]
+
+
+class RetrievedSamples(AbstractProtocol):
+    """
+    Protocol that generates pairs (training sample, test sample) from ranking files.
+
+    Test files are those matching `<path_dir>/test_rankings_*.txt`; for every test
+    file, the path of the companion training file is obtained by replacing every
+    occurrence of 'test_' with 'training_' in the test file path.
+
+    :param path_dir: directory in which the test files are searched
+    :param load_fn: function called as `load_fn(path, parse_columns=True, max_lines=...)`
+        that returns a tuple (X, y)
+    :param vectorizer: object whose `transform` method is applied to the loaded X
+    :param max_train_lines: value passed as `max_lines` when loading training files
+    :param max_test_lines: value passed as `max_lines` when loading test files
+    """
+
+    def __init__(self, path_dir: str, load_fn, vectorizer, max_train_lines=None, max_test_lines=None):
+        self.path_dir = path_dir
+        self.load_fn = load_fn
+        self.vectorizer = vectorizer
+        self.max_train_lines = max_train_lines
+        self.max_test_lines = max_test_lines
+
+    def __call__(self):
+        """
+        Yields one pair (train_sample, test_sample) per test file found.
+
+        The test collection is built with the classes of its training sample. If
+        that raises a ValueError, the error is printed and the pair (None, None)
+        is yielded for that file instead. A warning is printed for test files
+        whose number of loaded instances differs from `qp.environ['SAMPLE_SIZE']`.
+        """
+        for file in glob(join(self.path_dir, 'test_rankings_*.txt')):
+
+            # the replacement acts on the whole path, not only on the file name
+            X, y = self.load_fn(file.replace('test_', 'training_'), parse_columns=True, max_lines=self.max_train_lines)
+            X = self.vectorizer.transform(X)
+            train_sample = LabelledCollection(X, y)
+
+            X, y = self.load_fn(file, parse_columns=True, max_lines=self.max_test_lines)
+            if len(X)!=qp.environ['SAMPLE_SIZE']:
+                print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
+            X = self.vectorizer.transform(X)
+            try:
+                test_sample = LabelledCollection(X, y, classes=train_sample.classes_)
+            except ValueError as e:
+                print(f'file {file} caused error {e}')
+                yield None, None
+                # move on to the next file: without this, the generator would fall through
+                # and yield an undefined (or stale, from a previous file) test sample
+                continue
+
+            yield train_sample, test_sample
+
+
+# experiment configuration
+RANK_AT_K = 100
+REDUCE_TR = 50000
+qp.environ['SAMPLE_SIZE'] = RANK_AT_K
+
+data_path = './newCollectionGeo'
+train_path = join(data_path, 'train_data_continent.txt')
+
+tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
+
+training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
+
+if REDUCE_TR > 0:
+    print('Reducing the number of documents in the training to', REDUCE_TR)
+    training = training.sampling(REDUCE_TR)
+
+# the vectorizer is fitted on the (possibly reduced) big training set only
+raw_docs, ytr = training.Xy
+Xtr = tfidf.fit_transform(raw_docs)
+print('L orig shape = ', Xtr.shape)
+
+training = LabelledCollection(Xtr, ytr)
+classes = training.classes_
+
+print('training classes:', classes)
+print('training prevalence:', training.prevalence())
+
+experiment_prot = RetrievedSamples(
+    data_path,
+    load_fn=load_txt_sample,
+    vectorizer=tfidf,
+    max_train_lines=None,
+    max_test_lines=RANK_AT_K,
+)
+
+for method_name, quantifier in methods():
+    print('Starting with method=', method_name)
+
+    errors = []
+    # the progress-bar total is hard-coded
+    pbar = tqdm(experiment_prot(), total=49)
+    for train, test in pbar:
+        if train is None:
+            # the protocol yields (None, None) for files it could not turn into a sample
+            print('skipping one!')
+        else:
+            try:
+                quantifier.fit(train)
+                estim_prev = quantifier.quantify(test.instances)
+                errors.append(qp.error.mae(test.prevalence(), estim_prev))
+            except Exception as e:
+                print(f'wow, something happened here! skipping; {e}')
+
+        pbar.set_description(f'mae={np.mean(errors):.4f}')
+    print()
+
+
+
+
--- a/Retrieval/understand_classif_scheme.py
+++ b/Retrieval/understand_classif_scheme.py
@ -0,0 +1,66 @@
+import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
+from sklearn.metrics import make_scorer, f1_score
+from sklearn.svm import LinearSVC
+
+from quapy.data.base import LabelledCollection
+from sklearn.model_selection import cross_val_score, GridSearchCV
+
+from os.path import join
+
+"""
+In this experiment, I simply try to understand whether the learning task can be learned or not.
+The problem is that we are quantifying the categories based on the alphabetical order (of what?).  
+"""
+
+def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
+    """
+    Reads a tab-separated file and returns its texts and continent labels.
+
+    If `parse_columns` is true, the 'rank' and 'score' columns are read as well and
+    the rows are reordered by ascending 'rank'. If `max_lines` is given, only that
+    many leading rows are kept (after reordering).
+
+    :param path: path to a tab-separated file with (at least) the columns 'text'
+        and 'continent', plus 'rank' and 'score' when `parse_columns` is true
+    :param parse_columns: whether to read the ranking columns and sort by rank
+    :param verbose: whether to print a progress message around the file read
+    :param max_lines: maximum number of rows to return, or None to return all
+    :return: a tuple (X, y) of numpy arrays holding the texts and the labels
+    """
+    if verbose:
+        print(f'loading {path}...', end='')
+    frame = pd.read_csv(path, sep='\t')
+    if verbose:
+        print('[done]')
+
+    docs = frame['text'].values
+    continents = frame['continent'].values
+
+    if parse_columns:
+        rank = frame['rank'].values
+        # 'score' is read (so a missing column still fails) but it is not returned
+        scores = frame['score'].values
+        ascending = np.argsort(rank)
+        docs = docs[ascending]
+        continents = continents[ascending]
+
+    if max_lines is None:
+        return docs, continents
+    return docs[:max_lines], continents[:max_lines]
+
+data_path = './50_50_split_trec'
+train_path = join(data_path, 'train_50_50_continent.txt')
+
+tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
+data = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
+data = data.sampling(20000)
+train, test = data.split_stratified()
+
+# the vectorizer is fitted on the training split and only applied to the test split
+train.instances = tfidf.fit_transform(train.instances)
+test.instances = tfidf.transform(test.instances)
+
+# plain logistic regression with default arguments, fitted on the training split
+cls = LogisticRegression()
+Xtr, ytr = train.Xy
+cls.fit(Xtr, ytr)
+
+# both F1 averages are computed first and printed afterwards
+y_pred = cls.predict(test.instances)
+f1_by_average = {
+    average: f1_score(y_true=test.labels, y_pred=y_pred, average=average)
+    for average in ('macro', 'micro')
+}
+
+for average, value in f1_by_average.items():
+    print(average, value)