refactoring everything

Alejandro Moreo Fernandez 2024-04-18 09:32:30 +02:00
parent 8399552c8d
commit 985f430d52
8 changed files with 74 additions and 557 deletions

View File

@@ -8,50 +8,28 @@ from quapy.protocol import AbstractProtocol
-import json
-def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
-    # print('reading', path)
-    if verbose:
-        print(f'loading {path}...', end='')
-    df = pd.read_csv(path, sep='\t')
-    if verbose:
-        print('[done]')
-    X = df['text'].values
-    y = df['continent'].values
+def load_sample(path, class_name, max_lines=-1):
+    """
+    Loads a sample json as a dataframe and returns text and labels for
+    the given class_name
-    if parse_columns:
-        rank = df['rank'].values
-        scores = df['score'].values
-        rank = rank[y != 'Antarctica']
-        scores = scores[y != 'Antarctica']
-    X = X[y!='Antarctica']
-    y = y[y!='Antarctica']
-    if parse_columns:
-        order = np.argsort(rank)
-        X = X[order]
-        y = y[order]
-        rank = rank[order]
-        scores = scores[order]
-    if max_lines is not None:
-        X = X[:max_lines]
-        y = y[:max_lines]
-    return X, y
-def load_json_sample(path, class_name, max_lines=-1):
-    obj = json.load(open(path, 'rt'))
-    keys = [f'{id}' for id in range(len(obj['text'].keys()))]
-    text = [obj['text'][id] for id in keys]
-    #print(list(obj.keys()))
-    #import sys; sys.exit(0)
-    classes = [obj[class_name][id] for id in keys]
+    :param path: path to a json file
+    :param class_name: string representing the target class
+    :param max_lines: if provided and > 0 then returns only the
+        first requested number of instances
+    :return: texts and labels for class_name
+    """
+    df = pd.read_json(path)
+    text = df.text.values
+    try:
+        labels = df[class_name].values
+    except KeyError as e:
+        print(f'error in {path}; key {class_name} not found')
+        raise e
     if max_lines is not None and max_lines>0:
         text = text[:max_lines]
-        classes = classes[:max_lines]
-    return text, classes
+        labels = labels[:max_lines]
+    return text, labels

 class TextRankings:
@@ -75,49 +53,81 @@ class TextRankings:
         return texts, labels

-def get_query_id_from_path(path, prefix='training', posfix='200SPLIT'):
-    qid = path
-    qid = qid[:qid.index(posfix)]
-    qid = qid[qid.index(prefix)+len(prefix):]
-    return qid
+def filter_by_classes(X, y, classes):
+    idx = np.isin(y, classes)
+    return X[idx], y[idx]

 class RetrievedSamples(AbstractProtocol):
-    def __init__(self, path_dir: str, load_fn, vectorizer, max_train_lines=None, max_test_lines=None, classes=None, class_name=None):
-        self.path_dir = path_dir
+    def __init__(self,
+                 class_home: str,
+                 test_rankings_path: str,
+                 load_fn,
+                 vectorizer,
+                 class_name,
+                 max_train_lines=None,
+                 max_test_lines=None,
+                 classes=None
+                 ):
+        self.class_home = class_home
+        self.test_rankings_df = pd.read_json(test_rankings_path)
         self.load_fn = load_fn
         self.vectorizer = vectorizer
+        self.class_name = class_name
         self.max_train_lines = max_train_lines
         self.max_test_lines = max_test_lines
         self.classes=classes
-        assert class_name is not None, 'class name should be specified'
-        self.class_name = class_name
-        self.text_samples = TextRankings(join(self.path_dir, 'testRankingsRetrieval.json'), class_name=class_name)

     def __call__(self):
-        for file in glob(join(self.path_dir, 'training*SPLIT.json')):
+        for file in self._list_queries():
-            X, y = self.load_fn(file, class_name=self.class_name, max_lines=self.max_train_lines)
-            X = self.vectorizer.transform(X)
+            texts, y = self.load_fn(file, class_name=self.class_name, max_lines=self.max_train_lines)
+            texts, y = filter_by_classes(texts, y, self.classes)
+            X = self.vectorizer.transform(texts)
             train_sample = LabelledCollection(X, y, classes=self.classes)
-            query_id = get_query_id_from_path(file)
-            X, y = self.text_samples.get_sample_Xy(query_id, max_lines=self.max_test_lines)
+            query_id = self._get_query_id_from_path(file)
+            texts, y = self._get_test_sample(query_id, max_lines=self.max_test_lines)
+            texts, y = filter_by_classes(texts, y, self.classes)
+            X = self.vectorizer.transform(texts)
             # if len(X)!=qp.environ['SAMPLE_SIZE']:
             #     print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
             # assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
-            X = self.vectorizer.transform(X)
             try:
                 test_sample = LabelledCollection(X, y, classes=train_sample.classes_)
-                yield train_sample, test_sample
             except ValueError as e:
-                print(f'file {file} caused error {e}')
+                print(f'file {file} caused an exception: {e}')
                 yield None, None
             # print('train #classes:', train_sample.n_classes, train_sample.prevalence())
             # print('test #classes:', test_sample.n_classes, test_sample.prevalence())
+            yield train_sample, test_sample

+    def _list_queries(self):
+        return sorted(glob(join(self.class_home, 'training_Query*200SPLIT.json')))

+    def _get_test_sample(self, query_id, max_lines=-1):
+        df = self.test_rankings_df
+        sel_df = df[df.qid==int(query_id)]
+        texts = sel_df.text.values
+        try:
+            labels = sel_df[self.class_name].values
+        except KeyError as e:
+            print(f'error: key {self.class_name} not found in test rankings')
+            raise e
+        if max_lines > 0 and len(texts) > max_lines:
+            ranks = sel_df['rank'].values
+            idx = np.argsort(ranks)[:max_lines]
+            texts = np.asarray(texts)[idx]
+            labels = np.asarray(labels)[idx]
+        return texts, labels

+    def total(self):
+        return len(self._list_queries())

+    def _get_query_id_from_path(self, path):
+        prefix = 'training_Query-'
+        posfix = 'Sample-200SPLIT'
+        qid = path
+        qid = qid[:qid.index(posfix)]
+        qid = qid[qid.index(prefix) + len(prefix):]
+        return qid
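
For orientation, here is a minimal usage sketch of the refactored protocol. The names RetrievedSamples and load_sample come from the diff above; the paths, the label set, and the pre-fitted vectorizer are illustrative assumptions, not part of the commit:

from sklearn.feature_extraction.text import TfidfVectorizer

background_texts = ['...']  # hypothetical corpus used only to fit the vectorizer
tfidf = TfidfVectorizer(sublinear_tf=True)
tfidf.fit(background_texts)  # RetrievedSamples only ever calls .transform(...)

prot = RetrievedSamples(
    class_home='data/continent',                           # assumed folder with training_Query*200SPLIT.json files
    test_rankings_path='data/testRankingsRetrieval.json',  # assumed location of the test rankings
    load_fn=load_sample,
    vectorizer=tfidf,
    class_name='continent',
    classes=['Africa', 'Americas', 'Asia', 'Europe', 'Oceania'],  # illustrative label set
)
for train, test in prot():
    if train is None:  # a query that raised ValueError yields (None, None)
        continue
    print(train.prevalence(), test.prevalence())

Since __call__ is a generator, the protocol streams one (train, test) pair of LabelledCollection objects per query file instead of materializing all samples at once.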

View File

@@ -1,427 +0,0 @@
-import os.path
-import numpy as np
-import itertools
-from scipy.stats import ttest_ind_from_stats, wilcoxon
-from pathlib import Path
-from os.path import join
-
-
-class Table:
-    VALID_TESTS = [None, "wilcoxon", "ttest"]
-
-    def __init__(self, benchmarks, methods, lower_is_better=True, ttest='ttest', prec_mean=3,
-                 clean_zero=False, show_std=False, prec_std=3, average=True, missing=None, missing_str='--',
-                 color=True, color_mode='local', maxtone=50):
-        assert ttest in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'
-        self.benchmarks = np.asarray(benchmarks)
-        self.benchmark_index = {row:i for i, row in enumerate(benchmarks)}
-        self.methods = np.asarray(methods)
-        self.method_index = {col:j for j, col in enumerate(methods)}
-        self.map = {}
-        # keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
-        self._addmap('values', dtype=object)
-        self.lower_is_better = lower_is_better
-        self.ttest = ttest
-        self.prec_mean = prec_mean
-        self.clean_zero = clean_zero
-        self.show_std = show_std
-        self.prec_std = prec_std
-        self.add_average = average
-        self.missing = missing
-        self.missing_str = missing_str
-        self.color = color
-        self.color_mode = color_mode
-        self.maxtone = maxtone
-        self.touch()
-
-    @property
-    def nbenchmarks(self):
-        return len(self.benchmarks)
-
-    @property
-    def nmethods(self):
-        return len(self.methods)
-
-    def touch(self):
-        self._modif = True
-
-    def update(self):
-        if self._modif:
-            self.compute()
-
-    def _getfilled(self):
-        return np.argwhere(self.map['fill'])
-
-    @property
-    def values(self):
-        return self.map['values']
-
-    def _indexes(self):
-        return itertools.product(range(self.nbenchmarks), range(self.nmethods))
-
-    def _addmap(self, map, dtype, func=None):
-        self.map[map] = np.empty((self.nbenchmarks, self.nmethods), dtype=dtype)
-        if func is None:
-            return
-        m = self.map[map]
-        f = func
-        indexes = self._indexes() if map == 'fill' else self._getfilled()
-        for i, j in indexes:
-            m[i, j] = f(self.values[i, j])
-
-    def _addrank(self):
-        for i in range(self.nbenchmarks):
-            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
-            col_means = [self.map['mean'][i,j] for j in filled_cols_idx]
-            ranked_cols_idx = filled_cols_idx[np.argsort(col_means)]
-            if not self.lower_is_better:
-                ranked_cols_idx = ranked_cols_idx[::-1]
-            self.map['rank'][i, ranked_cols_idx] = np.arange(1, len(filled_cols_idx)+1)
-
-    def _addcolor(self):
-        minval = {}
-        maxval = {}
-        if self.color_mode == 'global':
-            filled_cols_idx = np.argwhere(self.map['fill'])
-            col_means = [self.map['mean'][i, j] for i, j in filled_cols_idx]
-            if len(filled_cols_idx) > 0:
-                global_minval = min(col_means)
-                global_maxval = max(col_means)
-                for i in range(self.nbenchmarks):
-                    minval[i] = global_minval
-                    maxval[i] = global_maxval
-        elif self.color_mode == 'local':
-            for i in range(self.nbenchmarks):
-                filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
-                if len(filled_cols_idx)>0:
-                    col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
-                    minval[i] = min(col_means)
-                    maxval[i] = max(col_means)
-        else:
-            print(f'color mode {self.color_mode} not understood, valid ones are "local" and "global"; skip')
-            return
-
-        for i in range(self.nbenchmarks):
-            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
-            for col_idx in filled_cols_idx:
-                val = self.map['mean'][i,col_idx]
-                if i not in maxval or i not in minval:
-                    continue
-                norm = (maxval[i] - minval[i])
-                if norm > 0:
-                    normval = (val - minval[i]) / norm
-                else:
-                    normval = 0.5
-                if self.lower_is_better:
-                    normval = 1 - normval
-                normval = np.clip(normval, 0, 1)
-                self.map['color'][i, col_idx] = color_red2green_01(normval, self.maxtone)
-
-    def _run_ttest(self, row, col1, col2):
-        mean1 = self.map['mean'][row, col1]
-        std1 = self.map['std'][row, col1]
-        nobs1 = self.map['nobs'][row, col1]
-        mean2 = self.map['mean'][row, col2]
-        std2 = self.map['std'][row, col2]
-        nobs2 = self.map['nobs'][row, col2]
-        _, p_val = ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2)
-        return p_val
-
-    def _run_wilcoxon(self, row, col1, col2):
-        values1 = self.map['values'][row, col1]
-        values2 = self.map['values'][row, col2]
-        try:
-            _, p_val = wilcoxon(values1, values2)
-        except ValueError:
-            p_val = 0
-        return p_val
-
-    def _add_statistical_test(self):
-        if self.ttest is None:
-            return
-        self.some_similar = [False]*self.nmethods
-        for i in range(self.nbenchmarks):
-            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
-            if len(filled_cols_idx) <= 1:
-                continue
-            col_means = [self.map['mean'][i,j] for j in filled_cols_idx]
-            best_pos = filled_cols_idx[np.argmin(col_means)]
-            for j in filled_cols_idx:
-                if j==best_pos:
-                    continue
-                if self.ttest == 'ttest':
-                    p_val = self._run_ttest(i, best_pos, j)
-                else:
-                    p_val = self._run_wilcoxon(i, best_pos, j)
-                pval_outcome = pval_interpretation(p_val)
-                self.map['ttest'][i, j] = pval_outcome
-                if pval_outcome != 'Diff':
-                    self.some_similar[j] = True
-
-    def compute(self):
-        self._addmap('fill', dtype=bool, func=lambda x: x is not None)
-        self._addmap('mean', dtype=float, func=np.mean)
-        self._addmap('std', dtype=float, func=np.std)
-        self._addmap('nobs', dtype=float, func=len)
-        self._addmap('rank', dtype=int, func=None)
-        self._addmap('color', dtype=object, func=None)
-        self._addmap('ttest', dtype=object, func=None)
-        self._addmap('latex', dtype=object, func=None)
-        self._addrank()
-        self._addcolor()
-        self._add_statistical_test()
-        if self.add_average:
-            self._addave()
-        self._modif = False
-
-    def _is_column_full(self, col):
-        return all(self.map['fill'][:, self.method_index[col]])
-
-    def _addave(self):
-        ave = Table(['ave'], self.methods,
-                    lower_is_better=self.lower_is_better,
-                    ttest=self.ttest,
-                    average=False,
-                    missing=self.missing,
-                    missing_str=self.missing_str,
-                    prec_mean=self.prec_mean,
-                    prec_std=self.prec_std,
-                    clean_zero=self.clean_zero,
-                    show_std=self.show_std,
-                    color=self.color,
-                    maxtone=self.maxtone)
-        for col in self.methods:
-            values = None
-            if self._is_column_full(col):
-                if self.ttest == 'ttest':
-                    # values = np.asarray(self.map['mean'][:, self.method_index[col]])
-                    values = np.concatenate(self.values[:, self.method_index[col]])
-                else:  # wilcoxon
-                    # values = np.asarray(self.map['mean'][:, self.method_index[col]])
-                    values = np.concatenate(self.values[:, self.method_index[col]])
-            ave.add('ave', col, values)
-        self.average = ave
-
-    def add(self, benchmark, method, values):
-        if values is not None:
-            values = np.asarray(values)
-            if values.ndim==0:
-                values = values.flatten()
-        rid, cid = self._coordinates(benchmark, method)
-        self.map['values'][rid, cid] = values
-        self.touch()
-
-    def get(self, benchmark, method, attr='mean'):
-        self.update()
-        assert attr in self.map, f'unknown attribute {attr}'
-        rid, cid = self._coordinates(benchmark, method)
-        if self.map['fill'][rid, cid]:
-            v = self.map[attr][rid, cid]
-            if v is None or (isinstance(v,float) and np.isnan(v)):
-                return self.missing
-            return v
-        else:
-            return self.missing
-
-    def _coordinates(self, benchmark, method):
-        assert benchmark in self.benchmark_index, f'benchmark {benchmark} out of range'
-        assert method in self.method_index, f'method {method} out of range'
-        rid = self.benchmark_index[benchmark]
-        cid = self.method_index[method]
-        return rid, cid
-
-    def get_average(self, method, attr='mean'):
-        self.update()
-        if self.add_average:
-            return self.average.get('ave', method, attr=attr)
-        return None
-
-    def get_color(self, benchmark, method):
-        color = self.get(benchmark, method, attr='color')
-        if color is None:
-            return ''
-        return color
-
-    def latex(self, benchmark, method):
-        self.update()
-        i,j = self._coordinates(benchmark, method)
-        if self.map['fill'][i,j] == False:
-            return self.missing_str
-        mean = self.map['mean'][i,j]
-        l = f" {mean:.{self.prec_mean}f}"
-        if self.clean_zero:
-            l = l.replace(' 0.', '.')
-        isbest = self.map['rank'][i,j] == 1
-        if isbest:
-            l = "\\textbf{"+l.strip()+"}"
-        stat = '' if self.ttest is None else '^{\phantom{\ddag}}'
-        if self.ttest is not None and self.some_similar[j]:
-            test_label = self.map['ttest'][i,j]
-            if test_label == 'Sim':
-                stat = '^{\dag}'
-            elif test_label == 'Same':
-                stat = '^{\ddag}'
-            elif isbest or test_label == 'Diff':
-                stat = '^{\phantom{\ddag}}'
-        std = ''
-        if self.show_std:
-            std = self.map['std'][i,j]
-            std = f" {std:.{self.prec_std}f}"
-            if self.clean_zero:
-                std = std.replace(' 0.', '.')
-            std = f"\pm {std:{self.prec_std}}"
-        if stat!='' or std!='':
-            l = f'{l}${stat}{std}$'
-        if self.color:
-            l += ' ' + self.map['color'][i,j]
-        return l
-
-    def latexPDF(self, path, name:str, *args, **kwargs):
-        if not name.endswith('.tex'):
-            name += '.tex'
-        self.latexSaveDocument(join(path, name), *args, **kwargs)
print("[Tables Done] runing latex")
-        os.chdir(path)
-        os.system('pdflatex '+name)
-        basename = name.replace('.tex', '')
-        os.system(f'rm {basename}.aux {basename}.bbl {basename}.blg {basename}.log {basename}.out {basename}.dvi')
-        os.chdir('..')
-
-    def latexSaveDocument(self, path, *args, **kwargs):
-        document = self.latexDocument(*args, **kwargs)
-        parent = Path(path).parent
-        os.makedirs(parent, exist_ok=True)
-        with open(path, 'wt') as foo:
-            foo.write(document)
-        print('tex file saved at', path)
-    def latexDocument(self, *args, **kwargs):
-        document = """
-\\documentclass[10pt,a4paper]{article}
-\\usepackage[utf8]{inputenc}
-\\usepackage{amsmath}
-\\usepackage{amsfonts}
-\\usepackage{amssymb}
-\\usepackage{graphicx}
-\\usepackage{xcolor}
-\\usepackage{colortbl}
-
-\\begin{document}
-"""
-        document += self.latexTable(*args, **kwargs)
-        document += "\n\end{document}\n"
-        return document
-
-    def latexTable(self, benchmark_replace={}, method_replace={}, aslines=False, endl='\\\\\hline', resizebox=True):
-        table = """
-\\begin{table}
-\center
-%%%\\resizebox{\\textwidth}{!}{% \n
-"""
-        table += "\n\\begin{tabular}{|c"+"|c" * self.nmethods + "|}\n"
-        table += self.latexTabular(benchmark_replace, method_replace, aslines, endl)
-        table += "\n\\end{tabular}\n"
-        table += """
-%%%}%
-\end{table}
-"""
-        if resizebox:
-            table = table.replace("%%%", "")
-        return table
-
-    def latexTabular(self, benchmark_replace={}, method_replace={}, aslines=False, endl='\\\\\hline'):
-        lines = []
-        l = '\multicolumn{1}{c|}{} & '
-        l += ' & '.join([method_replace.get(col, col) for col in self.methods])
-        l += ' \\\\\hline'
-        lines.append(l)
-        for row in self.benchmarks:
-            rowname = benchmark_replace.get(row, row)
-            l = rowname + ' & '
-            l += self.latexRow(row, endl=endl)
-            lines.append(l)
-        if self.add_average:
-            # l += '\hline\n'
-            l = '\hline \n \\textit{Average} & '
-            l += self.latexAverage(endl=endl)
-            lines.append(l)
-        if not aslines:
-            lines = '\n'.join(lines)
-        return lines
-
-    def latexRow(self, benchmark, endl='\\\\\hline\n'):
-        s = [self.latex(benchmark, col) for col in self.methods]
-        s = ' & '.join(s)
-        s += ' ' + endl
-        return s
-
-    def latexAverage(self, endl='\\\\\hline\n'):
-        if self.add_average:
-            return self.average.latexRow('ave', endl=endl)
-
-    def getRankTable(self, prec_mean=0):
-        t = Table(benchmarks=self.benchmarks, methods=self.methods, prec_mean=prec_mean, average=True, maxtone=self.maxtone, ttest=None)
-        for rid, cid in self._getfilled():
-            row = self.benchmarks[rid]
-            col = self.methods[cid]
-            t.add(row, col, self.get(row, col, 'rank'))
-        t.compute()
-        return t
-
-    def dropMethods(self, methods):
-        drop_index = [self.method_index[m] for m in methods]
-        new_methods = np.delete(self.methods, drop_index)
-        new_index = {col:j for j, col in enumerate(new_methods)}
-        self.map['values'] = self.values[:, np.asarray([self.method_index[m] for m in new_methods], dtype=int)]
-        self.methods = new_methods
-        self.method_index = new_index
-        self.touch()
-
-
-def pval_interpretation(p_val):
-    if 0.005 >= p_val:
-        return 'Diff'
-    elif 0.05 >= p_val > 0.005:
-        return 'Sim'
-    elif p_val > 0.05:
-        return 'Same'
-
-
-def color_red2green_01(val, maxtone=50):
-    if np.isnan(val): return None
-    assert 0 <= val <= 1, f'val {val} out of range [0,1]'
-
-    # rescale to [-1,1]
-    val = val * 2 - 1
-    if val < 0:
-        color = 'red'
-        tone = maxtone * (-val)
-    else:
-        color = 'green'
-        tone = maxtone * val
-    return '\cellcolor{' + color + f'!{int(tone)}' + '}'
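
The Table class deleted above was a self-contained helper for turning benchmark results into LaTeX tables, with per-row ranking, significance testing against the best method, and cell coloring. A minimal sketch of how it could be driven, assuming the class as defined here; the dataset names, method names, and scores are invented:

import numpy as np

rng = np.random.default_rng(0)
table = Table(benchmarks=['sst', 'imdb'], methods=['CC', 'PCC', 'EMQ'],
              lower_is_better=True, ttest='wilcoxon', show_std=True)
for dataset in ['sst', 'imdb']:
    for method in ['CC', 'PCC', 'EMQ']:
        table.add(dataset, method, values=rng.random(10))  # e.g., 10 error scores per cell

print(table.latexTabular())              # tabular body only
# table.latexPDF('./tables', 'results')  # would additionally compile a PDF via pdflatex

Each cell stores the raw score vector; means, ranks, colors, and the statistical tests are recomputed lazily (compute() runs on the next read after any add()).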

View File

@@ -1,66 +0,0 @@
-import numpy as np
-import pandas as pd
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
-from sklearn.metrics import make_scorer, f1_score
-from sklearn.svm import LinearSVC
-from quapy.data.base import LabelledCollection
-from sklearn.model_selection import cross_val_score, GridSearchCV
-from os.path import join
-
-"""
-In this experiment, I simply try to understand whether the learning task can be learned or not.
-The problem is that we are quantifying the categories based on the alphabetical order (of what?).
-"""
-
-
-def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
-    if verbose:
-        print(f'loading {path}...', end='')
-    df = pd.read_csv(path, sep='\t')
-    if verbose:
-        print('[done]')
-    X = df['text'].values
-    y = df['continent'].values
-
-    if parse_columns:
-        rank = df['rank'].values
-        scores = df['score'].values
-        order = np.argsort(rank)
-        X = X[order]
-        y = y[order]
-        rank = rank[order]
-        scores = scores[order]
-
-    if max_lines is not None:
-        X = X[:max_lines]
-        y = y[:max_lines]
-
-    return X, y
-
-
-data_path = './50_50_split_trec'
-train_path = join(data_path, 'train_50_50_continent.txt')
-
-tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
-
-data = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
-data = data.sampling(20000)
-train, test = data.split_stratified()
-train.instances = tfidf.fit_transform(train.instances)
-test.instances = tfidf.transform(test.instances)
-
-# svm = LinearSVC()
-# cls = GridSearchCV(svm, param_grid={'C':np.logspace(-3,3,7), 'class_weight':['balanced', None]})
-cls = LogisticRegression()
-cls.fit(*train.Xy)
-
-# score = cross_val_score(LogisticRegressionCV(), *data.Xy, scoring=make_scorer(f1_score, average='macro'), n_jobs=-1, cv=5)
-# print(score)
-# print(np.mean(score))
-
-y_pred = cls.predict(test.instances)
-
-macrof1 = f1_score(y_true=test.labels, y_pred=y_pred, average='macro')
-microf1 = f1_score(y_true=test.labels, y_pred=y_pred, average='micro')
-
-print('macro', macrof1)
-print('micro', microf1)
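
As a side note, the commented-out cross-validation lines in the deleted script would not run as written, since LogisticRegressionCV cannot consume raw text. A sketch of that variant with the missing vectorization step spelled out, reusing tfidf, data, and np from the script above:

from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import cross_val_score

X = tfidf.fit_transform(data.instances)  # vectorize all sampled instances at once
scores = cross_val_score(LogisticRegressionCV(), X, data.labels,
                         scoring=make_scorer(f1_score, average='macro'), n_jobs=-1, cv=5)
print(scores, np.mean(scores))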