Merge pull request #2 from lorenzovolpi/logger

Logger
Lorenzo Volpi 2023-10-28 16:17:53 +02:00 committed by GitHub
commit 54d85df1fc
9 changed files with 231 additions and 67 deletions

.gitignore vendored
View File

@@ -12,4 +12,6 @@ elsahar19_rca/__pycache__/*
 *.coverage
 .coverage
 scp_sync.py
 out/*
+output/*
+*.log

View File

@@ -7,6 +7,7 @@ debug_conf: &debug_conf
   datasets:
     - DATASET_NAME: rcv1
       DATASET_TARGET: CCAT
+    - DATASET_NAME: imdb
   plot_confs:
     debug:

@@ -49,14 +50,14 @@ main_conf: &main_conf
   DATASET_N_PREVS: 9
   datasets:
-    - DATASET_NAME: rcv1
-      DATASET_TARGET: CCAT
+    - DATASET_NAME: imdb
   datasets_bck:
     - DATASET_NAME: rcv1
       DATASET_TARGET: GCAT
     - DATASET_NAME: rcv1
       DATASET_TARGET: MCAT
-    - DATASET_NAME: imdb
+    - DATASET_NAME: rcv1
+      DATASET_TARGET: CCAT
   plot_confs:
     gs_vs_atc:

View File

@@ -34,9 +34,17 @@ class DatasetSample:
 class Dataset:
-    def __init__(self, name, n_prevalences=9, target=None):
+    def __init__(self, name, n_prevalences=9, prevs=None, target=None):
         self._name = name
         self._target = target
+
+        self.prevs = None
+        if prevs is not None:
+            prevs = np.unique([p for p in prevs if p > 0.0 and p < 1.0])
+            if prevs.shape[0] > 0:
+                self.prevs = np.sort(prevs)
+                self.n_prevs = self.prevs.shape[0]
+
         self.n_prevs = n_prevalences

     def __spambase(self):

@@ -93,10 +101,14 @@ class Dataset:
         )

         # sample prevalences
-        prevalences = np.linspace(0.0, 1.0, num=self.n_prevs + 1, endpoint=False)[1:]
-        at_size = min(math.floor(len(all_train) * 0.5 / p) for p in prevalences)
+        if self.prevs is not None:
+            prevs = self.prevs
+        else:
+            prevs = np.linspace(0.0, 1.0, num=self.n_prevs + 1, endpoint=False)[1:]
+        at_size = min(math.floor(len(all_train) * 0.5 / p) for p in prevs)

         datasets = []
-        for p in prevalences:
+        for p in prevs:
             all_train_sampled = all_train.sampling(at_size, p, random_state=0)
             train, validation = all_train_sampled.split_stratified(
                 train_prop=TRAIN_VAL_PROP, random_state=0
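Not part of the commit: a minimal, self-contained sketch of the prevalence handling that __init__ now performs, with the fallback grid it bypasses. The helper name resolve_prevs is hypothetical; the filtering, deduplication and linspace logic mirror the diff above.

import numpy as np

def resolve_prevs(prevs=None, n_prevalences=9):
    # As in Dataset.__init__: keep only prevalences strictly inside (0, 1),
    # drop duplicates (np.unique also sorts), and use them if any survive.
    if prevs is not None:
        prevs = np.unique([p for p in prevs if 0.0 < p < 1.0])
        if prevs.shape[0] > 0:
            return np.sort(prevs)
    # fallback: evenly spaced interior points, e.g. 0.1 .. 0.9 for n_prevalences=9
    return np.linspace(0.0, 1.0, num=n_prevalences + 1, endpoint=False)[1:]

print(resolve_prevs([0.2, 0.2, 1.0, -0.1, 0.5]))  # -> [0.2 0.5]
print(resolve_prevs())                            # -> [0.1 0.2 ... 0.9]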

View File

@@ -8,6 +8,7 @@ defalut_env = {
     "PLOT_ESTIMATORS": [],
     "PLOT_STDEV": False,
     "DATASET_N_PREVS": 9,
+    "DATASET_PREVS": None,
     "OUT_DIR_NAME": "output",
     "OUT_DIR": None,
     "PLOT_DIR_NAME": "plot",

View File

@@ -1,22 +1,21 @@
-import logging as log
 import multiprocessing
 import time
-import traceback
 from typing import List

-import numpy as np
 import pandas as pd
 import quapy as qp
-from quapy.protocol import APP
-from sklearn.linear_model import LogisticRegression

 from quacc.dataset import Dataset
 from quacc.environment import env
 from quacc.evaluation import baseline, method
 from quacc.evaluation.report import CompReport, DatasetReport, EvaluationReport
+from quacc.evaluation.worker import estimate_worker
+from quacc.logging import Logger

-qp.environ["SAMPLE_SIZE"] = env.SAMPLE_SIZE
 pd.set_option("display.float_format", "{:.4f}".format)
+qp.environ["SAMPLE_SIZE"] = env.SAMPLE_SIZE
+
+log = Logger.logger()


 class CompEstimator:
@@ -41,38 +40,6 @@ class CompEstimator:

 CE = CompEstimator

-
-def fit_and_estimate(_estimate, train, validation, test, _env=None):
-    _env = env if _env is None else _env
-    model = LogisticRegression()
-    model.fit(*train.Xy)
-    protocol = APP(
-        test,
-        n_prevalences=_env.PROTOCOL_N_PREVS,
-        repeats=_env.PROTOCOL_REPEATS,
-        return_type="labelled_collection",
-    )
-
-    start = time.time()
-    try:
-        result = _estimate(model, validation, protocol)
-    except Exception as e:
-        log.error(f"Method {_estimate.__name__} failed. Exception: {e}")
-        return {
-            "name": _estimate.__name__,
-            "result": None,
-            "time": 0,
-        }
-    end = time.time()
-    log.info(f"{_estimate.__name__} finished [took {end-start:.4f}s]")
-
-    return {
-        "name": _estimate.__name__,
-        "result": result,
-        "time": end - start,
-    }
-

 def evaluate_comparison(
     dataset: Dataset, estimators=["OUR_BIN_SLD", "OUR_MUL_SLD"]
 ) -> EvaluationReport:
@@ -81,11 +48,14 @@ def evaluate_comparison(
     dr = DatasetReport(dataset.name)
     log.info(f"dataset {dataset.name}")
     for d in dataset():
-        log.info(f"train prev.: {np.around(d.train_prev, decimals=2)}")
+        log.info(
+            f"Dataset sample {d.train_prev[1]:.2f} of dataset {dataset.name} started"
+        )
         tstart = time.time()
         tasks = [(estim, d.train, d.validation, d.test) for estim in CE[estimators]]
         results = [
-            pool.apply_async(fit_and_estimate, t, {"_env": env}) for t in tasks
+            pool.apply_async(estimate_worker, t, {"_env": env, "q": Logger.queue()})
+            for t in tasks
         ]

         results_got = []
@@ -95,22 +65,29 @@ def evaluate_comparison(
                 if r["result"] is not None:
                     results_got.append(r)
             except Exception as e:
-                log.error(
-                    f"Dataset sample {d.train[1]:.2f} of dataset {dataset.name} failed. Exception: {e}"
+                log.warning(
+                    f"Dataset sample {d.train_prev[1]:.2f} of dataset {dataset.name} failed. Exception: {e}"
                 )

         tend = time.time()
         times = {r["name"]: r["time"] for r in results_got}
         times["tot"] = tend - tstart
         log.info(
-            f"Dataset sample {d.train[1]:.2f} of dataset {dataset.name} finished [took {times['tot']:.4f}s"
+            f"Dataset sample {d.train_prev[1]:.2f} of dataset {dataset.name} finished [took {times['tot']:.4f}s]"
         )
-        dr += CompReport(
-            [r["result"] for r in results_got],
-            name=dataset.name,
-            train_prev=d.train_prev,
-            valid_prev=d.validation_prev,
-            times=times,
-        )
+        try:
+            cr = CompReport(
+                [r["result"] for r in results_got],
+                name=dataset.name,
+                train_prev=d.train_prev,
+                valid_prev=d.validation_prev,
+                times=times,
+            )
+        except Exception as e:
+            log.warning(
+                f"Dataset sample {d.train_prev[1]:.2f} of dataset {dataset.name} failed. Exception: {e}"
+            )
+            traceback(e)
+            cr = None
+        dr += cr
     return dr
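For reference, a self-contained sketch of the dispatch pattern evaluate_comparison now uses: per-task kwargs carry shared state (here a placeholder for the logging queue) into pool workers, and failures surfaced by .get() are caught so one bad task cannot abort the batch. All names below are placeholders, not quacc API.

import multiprocessing

def worker(x, _env=None, q=None):
    # subprocesses receive shared state explicitly: an env object and a log queue
    return x * x

if __name__ == "__main__":
    with multiprocessing.Pool(processes=4) as pool:
        results = [
            pool.apply_async(worker, (t,), {"_env": None, "q": None})
            for t in range(5)
        ]
        got = []
        for r in results:
            try:
                # .get() re-raises any exception from the subprocess,
                # hence the per-result try/except, as in evaluate_comparison
                got.append(r.get())
            except Exception as e:
                print(f"task failed: {e}")
    print(got)  # [0, 1, 4, 9, 16]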

View File

@@ -280,6 +280,9 @@ class DatasetReport:
         return np.around([(1.0 - p, p) for p in self.prevs], decimals=2)

     def add(self, cr: CompReport):
+        if cr is None:
+            return
+
         self.crs.append(cr)

         if self._dict is None:
@@ -320,6 +323,11 @@ class DatasetReport:
                     [self.s_dict[col][sp], cr_s_dict[col][sp]]
                 )

+        for sp in self.s_prevs:
+            for col, vals in self.s_dict.items():
+                if sp not in vals:
+                    vals[sp] = []
+
         for k, score in cr.fit_scores.items():
             if k not in self.fit_scores:
                 self.fit_scores[k] = []
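A small illustration (made-up values, not from the repo) of the invariant the new backfill loop enforces: after merging a CompReport, every column of s_dict carries an entry, possibly empty, for every shifted prevalence seen so far, so later tabulation never hits a missing key.

s_prevs = [0.1, 0.5, 0.9]
s_dict = {"acc": {0.1: [0.8]}, "f1": {0.5: [0.7]}}

for sp in s_prevs:
    for col, vals in s_dict.items():
        if sp not in vals:
            vals[sp] = []

# Both columns now expose the full prevalence axis:
# {'acc': {0.1: [0.8], 0.5: [], 0.9: []}, 'f1': {0.5: [0.7], 0.1: [], 0.9: []}}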

View File

@@ -0,0 +1,42 @@
+import time
+
+import quapy as qp
+from quapy.protocol import APP
+from sklearn.linear_model import LogisticRegression
+
+from quacc.logging import SubLogger
+
+
+def estimate_worker(_estimate, train, validation, test, _env=None, q=None):
+    qp.environ["SAMPLE_SIZE"] = _env.SAMPLE_SIZE
+    SubLogger.setup(q)
+    log = SubLogger.logger()
+
+    model = LogisticRegression()
+    model.fit(*train.Xy)
+    protocol = APP(
+        test,
+        n_prevalences=_env.PROTOCOL_N_PREVS,
+        repeats=_env.PROTOCOL_REPEATS,
+        return_type="labelled_collection",
+    )
+
+    start = time.time()
+    try:
+        result = _estimate(model, validation, protocol)
+    except Exception as e:
+        log.warning(f"Method {_estimate.__name__} failed. Exception: {e}")
+        return {
+            "name": _estimate.__name__,
+            "result": None,
+            "time": 0,
+        }
+    end = time.time()
+    log.info(f"{_estimate.__name__} finished [took {end-start:.4f}s]")
+
+    return {
+        "name": _estimate.__name__,
+        "result": result,
+        "time": end - start,
+    }
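estimate_worker deliberately never raises: a failed method comes back as a dict with "result": None and "time": 0, so the parent's r.get() loop and the times aggregation stay uniform. A sketch of a caller honoring that three-key contract; consume is a hypothetical name, not quacc API.

def consume(outcome: dict):
    # the worker already logged the failure; the caller only skips it
    if outcome["result"] is None:
        return None
    print(f'{outcome["name"]} took {outcome["time"]:.4f}s')
    return outcome["result"]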

quacc/logging.py Normal file
View File

@@ -0,0 +1,118 @@
+import logging
+import logging.handlers
+import multiprocessing
+import threading
+
+
+class Logger:
+    __logger_file = "quacc.log"
+    __logger_name = "queue_logger"
+    __manager = None
+    __queue = None
+    __thread = None
+    __setup = False
+
+    @classmethod
+    def __logger_listener(cls, q):
+        while True:
+            record = q.get()
+            if record is None:
+                break
+            root = logging.getLogger("listener")
+            root.handle(record)
+
+    @classmethod
+    def setup(cls):
+        if cls.__setup:
+            return
+
+        # setup root
+        root = logging.getLogger("listener")
+        root.setLevel(logging.DEBUG)
+        rh = logging.FileHandler(cls.__logger_file, mode="a")
+        rh.setLevel(logging.DEBUG)
+        root.addHandler(rh)
+        root.info("-" * 100)
+
+        # setup logger
+        if cls.__manager is None:
+            cls.__manager = multiprocessing.Manager()
+        if cls.__queue is None:
+            cls.__queue = cls.__manager.Queue()
+
+        logger = logging.getLogger(cls.__logger_name)
+        logger.setLevel(logging.DEBUG)
+        qh = logging.handlers.QueueHandler(cls.__queue)
+        qh.setLevel(logging.DEBUG)
+        qh.setFormatter(
+            logging.Formatter(
+                fmt="%(asctime)s| %(levelname)s: %(message)s",
+                datefmt="%d/%m/%y %H:%M:%S",
+            )
+        )
+        logger.addHandler(qh)
+
+        # start listener
+        cls.__thread = threading.Thread(
+            target=cls.__logger_listener,
+            args=(cls.__queue,),
+        )
+        cls.__thread.start()
+
+        cls.__setup = True
+
+    @classmethod
+    def queue(cls):
+        if not cls.__setup:
+            cls.setup()
+        return cls.__queue
+
+    @classmethod
+    def logger(cls):
+        if not cls.__setup:
+            cls.setup()
+        return logging.getLogger(cls.__logger_name)
+
+    @classmethod
+    def close(cls):
+        if cls.__setup and cls.__thread is not None:
+            cls.__queue.put(None)
+            cls.__thread.join()
+            # cls.__manager.close()
+
+
+class SubLogger:
+    __queue = None
+    __setup = False
+
+    @classmethod
+    def setup(cls, q):
+        if cls.__setup:
+            return
+
+        cls.__queue = q
+
+        # setup root
+        root = logging.getLogger()
+        root.setLevel(logging.DEBUG)
+        rh = logging.handlers.QueueHandler(q)
+        rh.setLevel(logging.DEBUG)
+        rh.setFormatter(
+            logging.Formatter(
+                fmt="%(asctime)s| %(levelname)s: %(message)s",
+                datefmt="%d/%m/%y %H:%M:%S",
+            )
+        )
+        root.addHandler(rh)
+
+        cls.__setup = True
+
+    @classmethod
+    def logger(cls):
+        if not cls.__setup:
+            return None
+        return logging.getLogger()
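These two classes wrap the standard QueueHandler/listener pattern: workers enqueue LogRecords, a single thread in the main process drains the queue into one FileHandler, and a None sentinel shuts the listener down. A minimal stdlib sketch of the same pattern, independent of quacc; the file name demo.log and the function names are illustrative.

import logging
import logging.handlers
import multiprocessing
import threading

def listener(q):
    # drain records from all processes through a single file handler
    root = logging.getLogger("listener")
    root.setLevel(logging.DEBUG)
    root.addHandler(logging.FileHandler("demo.log", mode="a"))
    while True:
        record = q.get()
        if record is None:  # sentinel: shut down, as in Logger.close()
            break
        root.handle(record)

def work(q, i):
    # each process logs through a QueueHandler; no file contention
    logger = logging.getLogger("worker")
    logger.setLevel(logging.DEBUG)
    logger.addHandler(logging.handlers.QueueHandler(q))
    logger.info("hello from process %d", i)

if __name__ == "__main__":
    q = multiprocessing.Manager().Queue()  # picklable, usable across processes
    t = threading.Thread(target=listener, args=(q,))
    t.start()
    procs = [multiprocessing.Process(target=work, args=(q, i)) for i in range(3)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    q.put(None)
    t.join()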

View File

@@ -1,12 +1,14 @@
-import logging as log
 import traceback
 from sys import platform

 import quacc.evaluation.comp as comp
 from quacc.dataset import Dataset
 from quacc.environment import env
+from quacc.logging import Logger
 from quacc.utils import create_dataser_dir

+log = Logger.logger()
+

 def toast():
     if platform == "win32":
@@ -22,6 +24,7 @@ def estimate_comparison():
             env.DATASET_NAME,
             target=env.DATASET_TARGET,
             n_prevalences=env.DATASET_N_PREVS,
+            prevs=env.DATASET_PREVS,
         )
         try:
             dr = comp.evaluate_comparison(dataset, estimators=env.COMP_ESTIMATORS)
@@ -46,14 +49,14 @@ def estimate_comparison():

 def main():
-    log.basicConfig(
-        filename="quacc.log",
-        filemode="a",
-        format="%(asctime)s| %(levelname)s: %(message)s",
-        datefmt="%d/%m/%y %H:%M:%S",
-    )
-    estimate_comparison()
+    try:
+        estimate_comparison()
+    except Exception as e:
+        log.error(f"estimate comparison failed. Exceprion: {e}")
+        traceback(e)
+
     toast()
+    Logger.close()


 if __name__ == "__main__":
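Taken together, the lifecycle main() now follows: Logger.logger() lazily runs setup() on first use (manager, queue, listener thread), subprocesses attach via SubLogger.setup(q), and Logger.close() sends the shutdown sentinel. A condensed sketch of that flow, grounded in the logging.py shown above:

from quacc.logging import Logger

log = Logger.logger()   # first call starts the queue and listener thread
try:
    log.info("work routed through the queue into quacc.log")
finally:
    Logger.close()      # puts the None sentinel and joins the listener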