diff --git a/TweetSentQuant/experiments.py b/TweetSentQuant/experiments.py index 136ddea..5add09b 100644 --- a/TweetSentQuant/experiments.py +++ b/TweetSentQuant/experiments.py @@ -8,22 +8,28 @@ import pickle import itertools from joblib import Parallel, delayed import settings +import argparse + +parser = argparse.ArgumentParser(description='Run experiments for Twitter Sentiment Quantification') +parser.add_argument('results', metavar='RESULT_PATH', type=str, help='path to the directory where to store the results') +args = parser.parse_args() def quantification_models(): def newLR(): return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1) __C_range = np.logspace(-4, 5, 10) - lr_params = {'C': __C_range, 'class_weight': [None, 'balanced']} + #lr_params = {'C': __C_range, 'class_weight': [None, 'balanced']} svmperf_params = {'C': __C_range} + lr_params = {'C': [1,10]} yield 'cc', qp.method.aggregative.CC(newLR()), lr_params - yield 'acc', qp.method.aggregative.ACC(newLR()), lr_params - yield 'pcc', qp.method.aggregative.PCC(newLR()), lr_params - yield 'pacc', qp.method.aggregative.PACC(newLR()), lr_params - yield 'sld', qp.method.aggregative.EMQ(newLR()), lr_params - yield 'svmq', OneVsAll(qp.method.aggregative.SVMQ(settings.SVMPERF_HOME)), svmperf_params - yield 'svmkld', OneVsAll(qp.method.aggregative.SVMKLD(settings.SVMPERF_HOME)), svmperf_params - yield 'svmnkld', OneVsAll(qp.method.aggregative.SVMNKLD(settings.SVMPERF_HOME)), svmperf_params + #yield 'acc', qp.method.aggregative.ACC(newLR()), lr_params + #yield 'pcc', qp.method.aggregative.PCC(newLR()), lr_params + #yield 'pacc', qp.method.aggregative.PACC(newLR()), lr_params + #yield 'sld', qp.method.aggregative.EMQ(newLR()), lr_params + #yield 'svmq', OneVsAll(qp.method.aggregative.SVMQ(settings.SVMPERF_HOME)), svmperf_params + #yield 'svmkld', OneVsAll(qp.method.aggregative.SVMKLD(settings.SVMPERF_HOME)), svmperf_params + #yield 'svmnkld', 
OneVsAll(qp.method.aggregative.SVMNKLD(settings.SVMPERF_HOME)), svmperf_params # 'svmmae': lambda learner: OneVsAllELM(settings.SVM_PERF_HOME, loss='mae'), # 'svmmrae': lambda learner: OneVsAllELM(settings.SVM_PERF_HOME, loss='mrae'), @@ -47,7 +53,7 @@ def evaluate_method_point_test(true_prev, estim_prev): def result_path(dataset_name, model_name, optim_loss): - return f'./results/{dataset_name}-{model_name}-{optim_loss}.pkl' + return os.path.join(args.results, f'{dataset_name}-{model_name}-{optim_loss}.pkl') def is_already_computed(dataset_name, model_name, optim_loss): @@ -77,7 +83,6 @@ def run(experiment): return else: print(f'running dataset={dataset_name} model={model_name} loss={optim_loss}') - return benchmark_devel = qp.datasets.fetch_twitter(dataset_name, for_model_selection=True, min_df=5, pickle=True) benchmark_devel.stats() @@ -125,6 +130,7 @@ def run(experiment): if __name__ == '__main__': + print(f'Result folder: {args.results}') np.random.seed(0) optim_losses = ['mae', 'mrae'] diff --git a/quapy/__init__.py b/quapy/__init__.py index 011412e..20dc49b 100644 --- a/quapy/__init__.py +++ b/quapy/__init__.py @@ -1,13 +1,13 @@ from . import error -from .data import datasets +from . import data +from quapy.data import datasets from . import functional from . import method -from . import data from . import evaluation from . import plot from . import util from . 
import model_selection -from method.aggregative import isaggregative, isprobabilistic +from quapy.method.aggregative import isaggregative, isprobabilistic environ = { diff --git a/quapy/classification/neural.py b/quapy/classification/neural.py index 5f8038d..c14ca36 100644 --- a/quapy/classification/neural.py +++ b/quapy/classification/neural.py @@ -1,6 +1,7 @@ import os from abc import ABCMeta, abstractmethod from pathlib import Path + import numpy as np import torch import torch.nn as nn @@ -8,10 +9,10 @@ import torch.nn.functional as F from sklearn.metrics import accuracy_score, f1_score from torch.nn.utils.rnn import pad_sequence from tqdm import tqdm + +import quapy as qp from data import LabelledCollection from util import EarlyStop -import quapy as qp - class NeuralClassifierTrainer: diff --git a/quapy/classification/svmperf.py b/quapy/classification/svmperf.py index ceab225..38cf6c6 100644 --- a/quapy/classification/svmperf.py +++ b/quapy/classification/svmperf.py @@ -1,9 +1,10 @@ import random import subprocess import tempfile -from os.path import join, exists from os import remove +from os.path import join, exists from subprocess import PIPE, STDOUT + import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.datasets import dump_svmlight_file diff --git a/quapy/data/__init__.py b/quapy/data/__init__.py index 9c119ab..563cfcf 100644 --- a/quapy/data/__init__.py +++ b/quapy/data/__init__.py @@ -1,6 +1,6 @@ +from . import datasets +from . import preprocessing from .base import * from .reader import * -from . import preprocessing -from . 
import datasets diff --git a/quapy/data/base.py b/quapy/data/base.py index 322ba71..697bcf6 100644 --- a/quapy/data/base.py +++ b/quapy/data/base.py @@ -1,8 +1,8 @@ import numpy as np from scipy.sparse import issparse +from scipy.sparse import vstack from sklearn.model_selection import train_test_split from quapy.functional import artificial_prevalence_sampling, strprev -from scipy.sparse import vstack class LabelledCollection: diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index 2af3de3..dca22cd 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -1,12 +1,13 @@ -import zipfile -from util import download_file_if_not_exists, download_file, get_quapy_home, pickled_resource import os +import zipfile from os.path import join -from data.base import Dataset, LabelledCollection -from data.reader import * -from data.preprocessing import text2tfidf, reduce_columns + import pandas as pd +from quapy.data.base import Dataset, LabelledCollection +from quapy.data.preprocessing import text2tfidf, reduce_columns +from quapy.data.reader import * +from quapy.util import download_file_if_not_exists, download_file, get_quapy_home, pickled_resource REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb'] TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders', diff --git a/quapy/data/preprocessing.py b/quapy/data/preprocessing.py index 972a3db..4376482 100644 --- a/quapy/data/preprocessing.py +++ b/quapy/data/preprocessing.py @@ -1,11 +1,12 @@ import numpy as np -from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer -from data.base import Dataset from scipy.sparse import spmatrix -from util import parallelize -from .base import LabelledCollection +from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer from tqdm import tqdm + import quapy as qp +from quapy.data.base import Dataset +from quapy.util import parallelize +from .base import LabelledCollection def text2tfidf(dataset:Dataset, min_df=3, 
sublinear_tf=True, inplace=False, **kwargs): diff --git a/quapy/data/reader.py b/quapy/data/reader.py index 7597fd0..f7e45f4 100644 --- a/quapy/data/reader.py +++ b/quapy/data/reader.py @@ -1,7 +1,6 @@ import numpy as np from scipy.sparse import dok_matrix from tqdm import tqdm -import pandas as pd def from_text(path): diff --git a/quapy/error.py b/quapy/error.py index e39aa8a..b1e5f14 100644 --- a/quapy/error.py +++ b/quapy/error.py @@ -1,7 +1,7 @@ -from sklearn.metrics import f1_score import numpy as np -import quapy as qp +from sklearn.metrics import f1_score +import quapy as qp def f1e(y_true, y_pred): diff --git a/quapy/evaluation.py b/quapy/evaluation.py index 498f284..78917d1 100644 --- a/quapy/evaluation.py +++ b/quapy/evaluation.py @@ -1,12 +1,13 @@ -import quapy as qp from typing import Union, Callable, Iterable -from data import LabelledCollection -from method.base import BaseQuantifier -from util import temp_seed + import numpy as np from joblib import Parallel, delayed from tqdm import tqdm -import error + +import quapy as qp +from quapy.data import LabelledCollection +from quapy.method.base import BaseQuantifier +from quapy.util import temp_seed def artificial_sampling_prediction( @@ -72,8 +73,8 @@ def artificial_sampling_prediction( def evaluate(model: BaseQuantifier, test_samples:Iterable[LabelledCollection], err:Union[str, Callable], n_jobs:int=-1): if isinstance(err, str): - err = getattr(error, err) - assert err.__name__ in error.QUANTIFICATION_ERROR_NAMES, \ + err = getattr(qp.error, err) + assert err.__name__ in qp.error.QUANTIFICATION_ERROR_NAMES, \ f'error={err} does not seem to be a quantification error' scores = Parallel(n_jobs=n_jobs)( delayed(_delayed_eval)(model, Ti, err) for Ti in test_samples diff --git a/quapy/functional.py b/quapy/functional.py index 579e738..726b214 100644 --- a/quapy/functional.py +++ b/quapy/functional.py @@ -1,6 +1,7 @@ -from collections import defaultdict -import numpy as np import itertools +from collections 
import defaultdict + +import numpy as np def artificial_prevalence_sampling(dimensions, n_prevalences=21, repeat=1, return_constrained_dim=False): @@ -61,13 +62,6 @@ def HellingerDistance(P, Q): return np.sqrt(np.sum((np.sqrt(P) - np.sqrt(Q))**2)) -#def uniform_simplex_sampling(n_classes): - # from https://cs.stackexchange.com/questions/3227/uniform-sampling-from-a-simplex -# r = [0.] + sorted(np.random.rand(n_classes-1)) + [1.] -# return np.asarray([b-a for a,b in zip(r[:-1],r[1:])]) - - - def uniform_prevalence_sampling(n_classes, size=1): if n_classes == 2: u = np.random.rand(size) diff --git a/quapy/method/__init__.py b/quapy/method/__init__.py index 47d2b3a..6ef83f1 100644 --- a/quapy/method/__init__.py +++ b/quapy/method/__init__.py @@ -1,8 +1,7 @@ -from . import base from . import aggregative -from . import non_aggregative +from . import base from . import meta - +from . import non_aggregative AGGREGATIVE_METHODS = { aggregative.CC, diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index 77c585f..ef233e9 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -1,19 +1,20 @@ -import numpy as np -from copy import deepcopy -from sklearn.base import BaseEstimator, clone -import functional as F -import error -from method.base import BaseQuantifier, BinaryQuantifier -from classification.svmperf import SVMperf -from data import LabelledCollection -from sklearn.metrics import confusion_matrix -from sklearn.calibration import CalibratedClassifierCV -from joblib import Parallel, delayed from abc import abstractmethod +from copy import deepcopy from typing import Union + +import numpy as np +from joblib import Parallel, delayed +from sklearn.base import BaseEstimator +from sklearn.calibration import CalibratedClassifierCV +from sklearn.metrics import confusion_matrix from sklearn.model_selection import StratifiedKFold from tqdm import tqdm +import quapy.functional as F +from quapy.classification.svmperf import SVMperf +from 
quapy.data import LabelledCollection +from quapy.method.base import BaseQuantifier, BinaryQuantifier + # Abstract classes # ------------------------------------ diff --git a/quapy/method/base.py b/quapy/method/base.py index 2627b87..9dfd74c 100644 --- a/quapy/method/base.py +++ b/quapy/method/base.py @@ -1,5 +1,6 @@ from abc import ABCMeta, abstractmethod -from data import LabelledCollection + +from quapy.data import LabelledCollection # Base Quantifier abstract class diff --git a/quapy/method/meta.py b/quapy/method/meta.py index 08d4813..cc2a473 100644 --- a/quapy/method/meta.py +++ b/quapy/method/meta.py @@ -1,16 +1,17 @@ +from copy import deepcopy + import numpy as np -from sklearn.linear_model import LogisticRegressionCV, LogisticRegression +from joblib import Parallel, delayed +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import GridSearchCV, cross_val_predict import quapy as qp -from sklearn.model_selection import GridSearchCV, cross_val_predict -from model_selection import GridSearchQ -from .base import BaseQuantifier, BinaryQuantifier -from joblib import Parallel, delayed -from copy import deepcopy -from data import LabelledCollection from quapy import functional as F +from quapy.data import LabelledCollection +from quapy.evaluation import evaluate +from quapy.model_selection import GridSearchQ from . 
import neural -from evaluation import evaluate +from .base import BaseQuantifier QuaNet = neural.QuaNetTrainer diff --git a/quapy/method/neural.py b/quapy/method/neural.py index 9adf175..896ff43 100644 --- a/quapy/method/neural.py +++ b/quapy/method/neural.py @@ -1,11 +1,12 @@ import os from pathlib import Path + import torch from torch.nn import MSELoss from torch.nn.functional import relu -from tqdm import tqdm -from method.aggregative import * -from util import EarlyStop + +from quapy.method.aggregative import * +from quapy.util import EarlyStop class QuaNetTrainer(BaseQuantifier): diff --git a/quapy/method/non_aggregative.py b/quapy/method/non_aggregative.py index a23ffce..4defdeb 100644 --- a/quapy/method/non_aggregative.py +++ b/quapy/method/non_aggregative.py @@ -1,4 +1,4 @@ -from data import LabelledCollection +from quapy.data import LabelledCollection from .base import BaseQuantifier diff --git a/quapy/model_selection.py b/quapy/model_selection.py index 8f782c1..c3a2556 100644 --- a/quapy/model_selection.py +++ b/quapy/model_selection.py @@ -1,12 +1,13 @@ import itertools -import quapy as qp -from evaluation import artificial_sampling_prediction -from data.base import LabelledCollection -from method.aggregative import BaseQuantifier -from typing import Union, Callable -import functional as F -from copy import deepcopy import signal +from copy import deepcopy +from typing import Union, Callable + +import quapy as qp +import quapy.functional as F +from quapy.data.base import LabelledCollection +from quapy.evaluation import artificial_sampling_prediction +from quapy.method.aggregative import BaseQuantifier class GridSearchQ(BaseQuantifier): @@ -80,8 +81,8 @@ class GridSearchQ(BaseQuantifier): training, validation = training.split_stratified(train_prop=1-validation) return training, validation else: - raise ValueError('"validation" must either be a LabelledCollection or a float in (0,1) indicating the' - 'proportion of training documents to extract') + raise 
ValueError(f'"validation" must either be a LabelledCollection or a float in (0,1) indicating the ' + f'proportion of training documents to extract (found {type(validation)})') def __check_num_evals(self, n_prevpoints, eval_budget, n_repetitions, n_classes): if n_prevpoints is None and eval_budget is None: diff --git a/quapy/plot.py b/quapy/plot.py index 5164a59..7fbdae6 100644 --- a/quapy/plot.py +++ b/quapy/plot.py @@ -1,9 +1,10 @@ from collections import defaultdict -import matplotlib.pyplot as plt -from matplotlib import cm -import numpy as np -import quapy as qp +import matplotlib.pyplot as plt +import numpy as np +from matplotlib import cm + +import quapy as qp plt.rcParams['figure.figsize'] = [12, 8] plt.rcParams['figure.dpi'] = 200 diff --git a/quapy/util.py b/quapy/util.py index 3bb9446..4f5fc02 100644 --- a/quapy/util.py +++ b/quapy/util.py @@ -1,12 +1,13 @@ +import contextlib import itertools import multiprocessing -from joblib import Parallel, delayed -import contextlib -import numpy as np -import urllib import os -from pathlib import Path import pickle +import urllib +from pathlib import Path + +import numpy as np +from joblib import Parallel, delayed def get_parallel_slices(n_tasks, n_jobs=-1):