QuaPy/Retrieval/previous/third.py

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import quapy as qp
import quapy.functional as F
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection

from glob import glob
from os.path import join
from tqdm import tqdm

"""
In this third experiment, we have pairs (Li, Ui), with Li a training set and Ui a test set, as
in the second experiment, but in this case the fairness groups are defined by geographic information.
"""

def methods():
    """Lazily generate the ``(name, quantifier)`` pairs compared in this experiment.

    Every classifier-based quantifier wraps its own, freshly created
    ``LogisticRegression`` with default hyperparameters; ``MLPE`` takes no
    classifier. Each quantifier is only instantiated when the generator
    reaches it, and the pairs are produced in a fixed order:
    CC, PACC, EMQ, PCC, ACC, MLPE.
    """
    builders = (
        ('CC', lambda: ClassifyAndCount(LogisticRegression())),
        ('PACC', lambda: PACC(LogisticRegression(), val_split=5, n_jobs=-1)),
        ('EMQ', lambda: EMQ(LogisticRegression())),
        ('PCC', lambda: PCC(LogisticRegression())),
        ('ACC', lambda: ACC(LogisticRegression(), val_split=5, n_jobs=-1)),
        ('MLPE', lambda: MaximumLikelihoodPrevalenceEstimation()),
    )
    for label, build in builders:
        # Instantiate on demand so that construction stays lazy, one per step.
        yield label, build()


def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
    """Load documents and their continent labels from a tab-separated file.

    The file is read with ``pandas.read_csv(sep='\\t')`` and must provide the
    columns ``text`` and ``continent``. Rows labelled ``'Antarctica'`` are
    always discarded.

    :param path: path to the tab-separated file
    :param parse_columns: if True, the file must also provide a ``rank``
        column and the returned rows are sorted by ascending rank; if False,
        the rows keep the order in which they appear in the file
    :param verbose: if True, prints a short progress message around the read
    :param max_lines: if not None, only the first ``max_lines`` rows (after
        filtering and, when requested, sorting) are returned
    :return: a tuple ``(X, y)`` with the texts and the continent labels
    """
    if verbose:
        print(f'loading {path}...', end='')
    df = pd.read_csv(path, sep='\t')
    if verbose:
        print('[done]')

    X = df['text'].values
    y = df['continent'].values

    # Compute the filter once and reuse it for every column that is needed.
    keep = y != 'Antarctica'
    X = X[keep]
    y = y[keep]

    if parse_columns:
        # Only the rank is needed (to order the rows); the retrieval score is
        # not part of the returned value and is therefore not loaded.
        order = np.argsort(df['rank'].values[keep])
        X = X[order]
        y = y[order]

    if max_lines is not None:
        X = X[:max_lines]
        y = y[:max_lines]

    return X, y


class RetrievedSamples(AbstractProtocol):
    """Protocol that yields one ``(train_sample, test_sample)`` pair per ranking file.

    For every file matching ``test_rankings_*.txt`` inside ``path_dir``, the
    companion training file is located by replacing ``'test_'`` with
    ``'training_'`` in the file path. Both files are read with ``load_fn``,
    vectorized with the (already fitted) ``vectorizer`` and wrapped as
    ``LabelledCollection`` instances.

    :param path_dir: directory containing the ranking files
    :param load_fn: callable invoked as
        ``load_fn(path, parse_columns=True, max_lines=...)`` and returning ``(X, y)``
    :param vectorizer: object exposing ``transform``, applied to the raw documents
    :param max_train_lines: value passed as ``max_lines`` when loading training files
    :param max_test_lines: value passed as ``max_lines`` when loading test files
    """

    def __init__(self, path_dir: str, load_fn, vectorizer, max_train_lines=None, max_test_lines=None):
        self.path_dir = path_dir
        self.load_fn = load_fn
        self.vectorizer = vectorizer
        self.max_train_lines = max_train_lines
        self.max_test_lines = max_test_lines

    def __call__(self):
        """Generate the ``(train_sample, test_sample)`` pairs.

        Yields ``(None, None)`` for a file whose test sample cannot be built
        (``LabelledCollection`` raised ``ValueError``), so that the consumer
        can detect and skip it.
        """
        for file in glob(join(self.path_dir, 'test_rankings_*.txt')):

            X, y = self.load_fn(file.replace('test_', 'training_'), parse_columns=True, max_lines=self.max_train_lines)
            X = self.vectorizer.transform(X)
            train_sample = LabelledCollection(X, y)

            X, y = self.load_fn(file, parse_columns=True, max_lines=self.max_test_lines)
            # A size mismatch is only reported, not treated as an error.
            if len(X) != qp.environ['SAMPLE_SIZE']:
                print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
            X = self.vectorizer.transform(X)

            try:
                # The test sample is forced to share the classes of its training sample.
                test_sample = LabelledCollection(X, y, classes=train_sample.classes_)
            except ValueError as e:
                print(f'file {file} caused error {e}')
                yield None, None
                # Move on to the next file: without this, execution would fall
                # through and yield an undefined (or stale) test_sample.
                continue

            yield train_sample, test_sample


# Experiment configuration and preparation of the shared TF-IDF vectorizer.

# Number of top-ranked documents kept from every test ranking; it is also
# registered as the expected sample size in the QuaPy environment.
RANK_AT_K = 100
# Number of documents drawn from the training pool via `sampling`
# (the reduction is skipped when this value is not positive).
REDUCE_TR = 50000
qp.environ['SAMPLE_SIZE'] = RANK_AT_K

data_path = './newCollectionGeo'
train_path = join(data_path, 'train_data_continent.txt')

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)

# NOTE(review): the extra keyword arguments (verbose, parse_columns) are
# presumably forwarded to `load_txt_sample` by LabelledCollection.load -- confirm.
training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)

if REDUCE_TR>0:
    print('Reducing the number of documents in the training to', REDUCE_TR)
    training = training.sampling(REDUCE_TR)

# Fit the vectorizer on the (possibly reduced) training documents; the same
# fitted vectorizer is later handed to the protocol to transform every sample.
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('L orig shape = ', Xtr.shape)

# Re-wrap the vectorized documents; below, this collection is only used to
# report the classes and the class prevalence.
training = LabelledCollection(Xtr, ytr)
classes = training.classes_

print('training classes:', classes)
print('training prevalence:', training.prevalence())

# Training files are loaded in full (max_train_lines=None), whereas test
# rankings are truncated to their first RANK_AT_K rows.
experiment_prot = RetrievedSamples(data_path,
                                   load_fn=load_txt_sample,
                                   vectorizer=tfidf,
                                   max_train_lines=None,
                                   max_test_lines=RANK_AT_K)

# Evaluate every quantifier on all (training, test) pairs generated by the
# protocol, showing the running mean absolute error in the progress bar.

# Size the progress bar from the ranking files actually present instead of
# relying on a hard-coded count.
n_test_files = len(glob(join(experiment_prot.path_dir, 'test_rankings_*.txt')))

for method_name, quantifier in methods():
    print('Starting with method=', method_name)

    errors = []
    pbar = tqdm(experiment_prot(), total=n_test_files)
    for train, test in pbar:
        if train is not None:
            try:
                quantifier.fit(train)
                estim_prev = quantifier.quantify(test.instances)
                mae = qp.error.mae(test.prevalence(), estim_prev)
                errors.append(mae)
            except Exception as e:
                # Deliberately best-effort: a failure on one pair is reported
                # and must not abort the whole experiment.
                print(f'wow, something happened here! skipping; {e}')
        else:
            # The protocol signals an unusable file with a (None, None) pair.
            print('skipping one!')

        # np.mean([]) emits a RuntimeWarning; while no error has been recorded
        # yet, report NaN explicitly (the displayed text is still 'mae=nan').
        running_mae = np.mean(errors) if errors else float('nan')
        pbar.set_description(f'mae={running_mae:.4f}')
    print()