"""
Cross-area evaluation of quantification methods on the survey data: each quantifier
is trained on the data of a single area and evaluated (absolute error of the
predicted prevalence) on the data of every other area.
"""
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

import quapy as qp
from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as MLPE
from quapy.method.aggregative import EMQ, PACC, CC, PCC, MS2, MS, ACC
from quapy.data import LabelledCollection

from commons import *

np.set_printoptions(linewidth=np.inf)

cens_y = './data/cens_y.csv'
survey_y = './data/survey_y.csv'

# Ate, Xte = load_csv(cens_y)
Atr, Xtr, ytr = load_csv(survey_y, use_yhat=True)

preprocessor = Preprocessor()
Xtr = preprocessor.fit_transform(Xtr)
# Xtr_proc = preprocessor.fit_transform(Xtr)
# big_train = LabelledCollection(Xtr_proc, ytr)
# q.fit(big_train)

# split the survey data by area: one (area, X, y) triplet per area
trains = get_dataset_by_area(Atr, Xtr, ytr)
# tests = get_dataset_by_area(Ate, Xte)

n_area = len(trains)

# classifier used by the aggregative quantifiers
# cls = LinearSVC()
cls = LogisticRegression()
# cls = LogisticRegressionCV(class_weight='balanced', Cs=10)

# q = CC(cls)
# q = PCC(cls)
# q = PACC(cls)
q = EMQ(cls)  # overridden by the loop below
# q = MS(cls)
# q = MaximumLikelihoodPrevalenceEstimation()

for q in [CC(cls), PCC(cls), ACC(cls), PACC(cls), EMQ(cls), MLPE()]:
    # results[i, j] = error obtained when training on area i and testing on area j
    results = np.zeros(shape=(n_area, n_area))
    for i, (Ai, Xi, yi) in tqdm(enumerate(trains), total=n_area):
        # Xi = preprocessor.fit_transform(Xi)
        tr = LabelledCollection(Xi, yi)
        q.fit(tr)
        len_tr = len(tr)
        # len_tr = len(big_train)
        for j, (Aj, Xj, yj) in enumerate(trains):
            if i == j:
                continue
            # Xj = preprocessor.transform(Xj)
            te = LabelledCollection(Xj, yj)
            pred_prev = q.quantify(te.X)
            true_prev = te.prevalence()
            # qp.environ["SAMPLE_SIZE"] = len(te)
            # err = qp.error.mrae(true_prev, pred_prev)
            err = qp.error.mae(true_prev, pred_prev)
            print(f'{i=} {j=} [#train={len_tr}] true_prev={true_prev[1]:.3f} pred_prev={pred_prev[1]:.3f} {err=:.4f}')
            results[i, j] = err
            # import sys; sys.exit()  # debug stop: if enabled, it aborts after the first (i, j) pair

    q_name = q.__class__.__name__
    # print(results)
    print(f'{q_name} mean results = {results.mean():.4f}')
    # add 1 to the diagonal so that the (untested) i==j cells do not win the per-column minimum
    results += np.eye(results.shape[0])
    print(results.min(axis=0).mean())
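
# ---------------------------------------------------------------------------
# Hedged sketch (not executed above): the commented-out lines suggest the eventual
# goal is to fit a quantifier on the full survey and estimate the prevalence of each
# census area. The helper below sketches that step under assumptions that this script
# does NOT confirm:
#   - load_csv(cens_y) returns (areas, features) for the unlabelled census data
#     (hinted at by the commented "Ate, Xte = load_csv(cens_y)");
#   - get_dataset_by_area(Ate, Xte) yields (area, X) pairs when called without labels;
#   - Preprocessor exposes a transform() method (hinted at by the commented
#     "Xj = preprocessor.transform(Xj)").
def quantify_census_areas(quantifier, preprocessor, Xtr, ytr, cens_path=cens_y):
    """Fits `quantifier` on the whole (preprocessed) survey and returns a dict
    mapping each census area to its predicted prevalence vector."""
    quantifier.fit(LabelledCollection(Xtr, ytr))  # train on the full survey
    Ate, Xte = load_csv(cens_path)                # assumed signature: unlabelled census data
    Xte = preprocessor.transform(Xte)             # reuse the preprocessing fitted on the survey
    return {area: quantifier.quantify(Xj) for area, Xj in get_dataset_by_area(Ate, Xte)}

# Example (assumption-laden) usage:
# prevalences = quantify_census_areas(EMQ(LogisticRegression()), preprocessor, Xtr, ytr)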