Compare commits
16 Commits

Author SHA1 Message Date
Alejandro Moreo Fernandez 08c26c58f9 regenerating tfidf vectors 2024-03-15 14:30:51 +01:00
Alejandro Moreo Fernandez faa3af587c adding helper files, unorganized 2024-03-12 09:52:58 +01:00
Alejandro Moreo Fernandez b756871f21 regenerating dataset 2023-07-25 10:45:44 +02:00
Alejandro Moreo Fernandez 9ad4503153 switching 2023-07-16 18:43:40 +02:00
Alejandro Moreo Fernandez 72c63fff09 new dataset generated out of real prevalence values of books by products 2023-05-05 12:58:54 +02:00
Alejandro Moreo Fernandez 85abaf2ba2 scripts using QuaPy 2022-03-31 18:46:56 +02:00
Alejandro Moreo Fernandez b4c3e57343 preparing fine tuning experiments with roberta 2022-03-24 17:29:54 +01:00
Alejandro Moreo Fernandez 464bd60c7c generating features from RoBERTa, testing them on Amazon's data 2022-03-16 19:12:45 +01:00
Alejandro Moreo Fernandez d949c77317 generating BERT outputs for textual documents 2022-03-15 14:01:40 +01:00
Alejandro Moreo Fernandez ad64dfe2a0 adding sample_weight to ordinal-aware classifiers 2022-03-10 18:28:49 +01:00
Alejandro Moreo Fernandez b2e161480e table generation 2022-03-10 13:32:53 +01:00
Alejandro Moreo Fernandez 5df355a4e1 regression-based adjustment using the validation set; seems to be working 2022-03-08 18:24:30 +01:00
Alejandro Moreo Fernandez b982a51103 trying ordinal classification 2022-03-08 16:27:41 +01:00
Alejandro Moreo Fernandez f285e936ad first experiments 2022-03-03 18:33:27 +01:00
Alejandro Moreo Fernandez c63325e364 script for generating the datasets 2022-03-03 14:40:11 +01:00
Alejandro Moreo Fernandez 5e1d59687f first commit, let's put here some code for ordinal quantification 2022-01-27 12:41:32 +01:00
29 changed files with 2970 additions and 3 deletions

@@ -0,0 +1,52 @@
import gzip
import os
import sys
from collections import Counter
from Ordinal.utils import jaggedness
import pickle
import numpy as np
amazon = np.genfromtxt('prevalence_votes1_reviews100.csv', delimiter='\t')
telescope = np.genfromtxt('fact_real_prevalences.csv', delimiter=',')[1:]
nclasses_amazon = amazon.shape[1]
nclasses_telescope = telescope.shape[1]
jags_amazon = np.asarray([jaggedness(p) for p in amazon])
jags_telescope = np.asarray([jaggedness(p) for p in telescope])
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
sns.set_theme('paper')
sns.set_style('dark')
sns.set(font_scale=0.7)
# figure, axis = plt.subplots(1, 2, figsize=(8, 7))
ymax = 0.75
figure(figsize=(8, 4), dpi=300)
ax=plt.subplot(1, 2, 1)
classes = np.arange(1, nclasses_amazon+1)
plt.bar(classes, np.mean(amazon, axis=0), yerr=np.std(amazon, axis=0), width=1)
ax.set_ylim(0, ymax)
ax.set_xlabel("stars")
ax.set_xticks(classes)
ax.set_title(f'Amazon Books ({jags_amazon.mean():.4f})')
ax=plt.subplot(1, 2, 2)
# ax=plt.subplot(1, 1, 1)
classes = np.arange(1, nclasses_telescope+1)
plt.bar(classes, np.mean(telescope, axis=0), yerr=np.std(telescope, axis=0), width=1)
ax.set_ylim(0, ymax)
ax.set_xlabel("energy bin")
ax.set_xticks(classes)
ax.set_title(f'FACT Samples ({jags_telescope.mean():.4f})')
plt.subplots_adjust(wspace=0.1, hspace=0)
plt.savefig('prevalence_averages.pdf', bbox_inches='tight')

@@ -0,0 +1,43 @@
import gzip
import os
import sys
from collections import Counter
from Ordinal.utils import jaggedness
import pickle
import numpy as np
telescope = np.genfromtxt('fact_expectation.txt')
nclasses_telescope = len(telescope)
jag = jaggedness(telescope)
print(jag)
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
sns.set_theme('paper')
sns.set_style('dark')
sns.set(font_scale=0.7)
# figure, axis = plt.subplots(1, 2, figsize=(8, 7))
ymax = 0.4
figure(figsize=(8, 4), dpi=300)
ax=plt.subplot(1, 1, 1)
classes = np.arange(1, nclasses_telescope+1)
plt.bar(classes, telescope, width=1)
# ax.bar_label(telescope)
ax.set_ylim(0, ymax)
ax.set_xlabel("energy bin")
ax.set_xticks(classes)
ax.set_title(f'FACT data ({jag:.4f})')
for index, data in enumerate(telescope):
    plt.text(x=index+0.56, y=data+0.005, s=f"{data:.4f}")
plt.subplots_adjust(wspace=0.1, hspace=0)
plt.savefig('telescope_prevalence.pdf', bbox_inches='tight')

Ordinal/__init__.py Normal file (empty)

@@ -0,0 +1,136 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import defaultdict
# this script computes the distribution of smoothness/sharpness of the book reviews,
# both across all books and within groups of reviews of the same product
# filein = '/media/moreo/Volume/Datasets/Amazon/raw/Gift_Cards.json.gz'
# df = pd.read_json(filein, lines=True, compression='gzip')
read_meta = True
def prepare_vote_field(df):
    df['vote'] = df['vote'].fillna('0')
    df['vote'] = df['vote'].apply(lambda x: x.replace(',', ''))
    df['vote'] = pd.to_numeric(df['vote'])
    return df

def read_from_huge_json(filein):
    df = pd.read_json(filein, lines=True)
    df.drop(columns=[
        'verified', 'reviewTime', 'reviewerID', 'style', 'reviewerName', 'reviewText', 'summary', 'unixReviewTime',
        'image'
    ], inplace=True)
    df = prepare_vote_field(df)
    return df

def read_from_metadata(filein):
    df = pd.read_csv(filein)
    df['vote'] = pd.to_numeric(df['vote'])
    return df

def filter_by_vote(df, vote_threshold=1):
    df = df[df['vote'] >= vote_threshold]
    df.drop(columns=['vote'], inplace=True)
    return df

if read_meta:
    filein = '/media/moreo/Volume/Datasets/Amazon/meta/Books.csv'
    readfn = read_from_metadata
else:
    filein = '/media/moreo/Volume/Datasets/Amazon/raw/Books.json'
    readfn = read_from_huge_json
votes_support=9
df = readfn(filein)
num_entries = len(df)
# df = prepare_vote_field(df)
df = filter_by_vote(df, vote_threshold=votes_support)
num_entries_with_vote = len(df)
unique_product_ids = df['asin'].unique()
num_products = len(unique_product_ids)
print(df.columns)
print(f'num rows {len(df)} (before vote-thresholding {num_entries}, after thresholding {num_entries_with_vote})')
print(f'num unique products {num_products}')
# df = df.groupby(df['asin'])
def not_smoothness(p):
    return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))
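# illustrative check (added; values not from the dataset): a uniform distribution is maximally
# smooth while a point mass on a central class is maximally sharp, e.g.:
# not_smoothness(np.array([0.2, 0.2, 0.2, 0.2, 0.2]))  # -> 0.0
# not_smoothness(np.array([0.0, 0.0, 1.0, 0.0, 0.0]))  # -> 3.0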
# pass to dictionaries
df = df.reset_index() # make sure indexes pair with number of rows
ids = df['asin'].values
overalls = df['overall'].values
allbooks_prev = np.histogram(overalls, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0]
allbooks_sharpness = not_smoothness(allbooks_prev)
print(f'all books prev={allbooks_prev} has sharpness {allbooks_sharpness:.4f}')
import sys
sys.exit(0)  # early exit: the per-product exploration below is currently disabled
# Defining a dict
d = defaultdict(list)
for i, id in tqdm(enumerate(ids), total=len(ids), desc='passing to dictionary'):
    d[id].append(overalls[i])
by_review_support = []
by_review_support_label = []
for reviews_support in [50, 100, 1]:
    sharpness_all = []
    num_products_with_reviews = 0
    for product_id, ratings in tqdm(d.items(), total=len(d), desc='processing histograms'):
        # ratings = df[df["asin"] == product_id]["overall"].values
        n_ratings = len(ratings)
        if n_ratings >= reviews_support:
            # print(product_id, ratings)
            prev = np.histogram(ratings, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0]
            sharpness = not_smoothness(prev)
            # print(prev, sharpness)
            sharpness_all.append(sharpness)
            num_products_with_reviews += 1
    by_review_support.append(sharpness_all)
    by_review_support_label.append(f'>{reviews_support}')

    print(f'#votes-support (min number of votes): {votes_support}')
    print(f'#reviews with >#votes-support: {num_entries_with_vote}/{num_entries}={100*num_entries_with_vote/num_entries:.2f}%')
    print(f'#reviews-support (min number of reviews): {reviews_support}')
    print(f'#products with >#reviews-support: {num_products_with_reviews}/{num_products}={100*num_products_with_reviews/num_products:.2f}%')

    q05 = np.percentile(sharpness_all, 5)
    q25 = np.percentile(sharpness_all, 25)
    q50 = np.percentile(sharpness_all, 50)
    q75 = np.percentile(sharpness_all, 75)
    q95 = np.percentile(sharpness_all, 95)
    print(f'{q05:.5f}\t{q25:.5f}\t{q50:.5f}\t{q75:.5f}\t{q95:.5f}')
    print(f'ave={np.mean(sharpness_all):.5f}')
    print(f'min={np.min(sharpness_all):.5f}')
    print(f'max={np.max(sharpness_all):.5f}')

#fig, ax = plt.subplots()
#ax.boxplot(by_review_support)
#ax.set_xticklabels(by_review_support_label)
#ax.set_ylabel("Sharpness")
#ax.set_xlabel("Distributions by number of reviews")
#plt.show()

@@ -0,0 +1,209 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import defaultdict
from quapy.data import LabelledCollection
from quapy.protocol import UPP
# this script computes the distribution of smoothness/sharpness of the book reviews,
# both across all books and within groups of reviews of the same product.
# Mirko asked for some exploration of values (votes, num reviews), and for percentiles of dataset shift,
# measured in terms of NMD between training set prevalences and sample prevalences; this script does this.
# It also generates a csv containing all the prevalence values by product.
read_meta = True
def not_smoothness(p):
    return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))

def _check_arrays(prevs):
    prevs = np.asarray(prevs)
    if prevs.ndim == 1:
        prevs = prevs.reshape(1, -1)
    return prevs

# mean normalized match distance
def mnmd(prevs, prevs_hat):
    prevs = _check_arrays(prevs)
    prevs_hat = _check_arrays(prevs_hat)
    assert prevs.shape == prevs_hat.shape, f'wrong shape; found {prevs.shape} and {prevs_hat.shape}'
    nmds = [nmd(p, p_hat) for p, p_hat in zip(prevs, prevs_hat)]
    return np.mean(nmds)

# normalized match distance
def nmd(prev, prev_hat):
    n = len(prev)
    return (1./(n-1))*mdpa(prev, prev_hat)

"""
Minimum Distance of Pair Assignments (MDPA) [cha2002measuring] for ordinal pdfs `a` and `b`.
The MDPA is a special case of the Earth Mover's Distance [rubner1998metric] that can be
computed efficiently.
[adapted from Mirko Bunse's Julia code]
"""
def mdpa(a, b):
    assert len(a) == len(b), 'histograms have to have the same length'
    assert np.isclose(sum(a), sum(b)), f'histograms have to have the same mass (difference is {sum(a)-sum(b)})'

    # algorithm 1 in [cha2002measuring]
    prefixsum = 0.0
    distance = 0.0
    for i in range(len(a)):
        prefixsum += a[i] - b[i]
        distance += abs(prefixsum)

    return distance / sum(a)  # the normalization is a fix to the original MDPA
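# worked example (added for clarity; values are illustrative): for a=[0.8, 0.2, 0.0] and
# b=[0.2, 0.8, 0.0], the running prefix sums are 0.6, 0.0, 0.0, so mdpa(a, b)=0.6 and
# nmd(a, b) = (1/(3-1)) * 0.6 = 0.3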
def prepare_vote_field(df):
    df['vote'] = df['vote'].fillna('0')
    df['vote'] = df['vote'].apply(lambda x: x.replace(',', ''))
    df['vote'] = pd.to_numeric(df['vote'])
    return df

def read_from_huge_json(filein):
    df = pd.read_json(filein, lines=True)
    df.drop(columns=[
        'verified', 'reviewTime', 'reviewerID', 'style', 'reviewerName', 'reviewText', 'summary', 'unixReviewTime',
        'image'
    ], inplace=True)
    df = prepare_vote_field(df)
    return df

def read_from_metadata(filein):
    df = pd.read_csv(filein)
    df['vote'] = pd.to_numeric(df['vote'])
    return df

def filter_by_vote(df, vote_threshold=1):
    df = df[df['vote'] >= vote_threshold]
    df.drop(columns=['vote'], inplace=True)
    return df

if read_meta:
    filein = '/media/moreo/Volume/Datasets/Amazon/meta/Books.csv'
    readfn = read_from_metadata
else:
    filein = '/media/moreo/Volume/Datasets/Amazon/raw/Books.json'
    readfn = read_from_huge_json
def create_dictionary_bookid_ratings(df):
    # pass to dictionaries
    df = df.reset_index()  # make sure indexes pair with number of rows
    ids = df['asin'].values
    overalls = df['overall'].values

    # Defining a dict
    d = defaultdict(list)
    for i, id in tqdm(enumerate(ids), total=len(ids), desc='passing to dictionary'):
        d[id].append(overalls[i])
    return d

def get_stats(distribution, msg=''):
    # computes the mean, max, min, and percentiles 5, 25, 50, 75, 95 of the distribution
    vmean = np.mean(distribution)
    vmax = np.max(distribution)
    vmin = np.min(distribution)
    q05 = np.percentile(distribution, 5)
    q25 = np.percentile(distribution, 25)
    q50 = np.percentile(distribution, 50)
    q75 = np.percentile(distribution, 75)
    q95 = np.percentile(distribution, 95)
    print(f'{msg}: percentiles {q05:.5f}\t{q25:.5f}\t{q50:.5f}\t{q75:.5f}\t{q95:.5f}')
    print(f'{msg}: ave={vmean:.5f}')
    print(f'{msg}: max={vmax:.5f}')
    print(f'{msg}: min={vmin:.5f}')
    return vmean, vmax, vmin, q05, q25, q50, q75, q95
with open('book_stats.csv', 'wt') as foo:
    foo.write(f'minvotes\tminreviews\t#products\t#reviews'
              f'\tsharp-ave\tsharp-max\tsharp-min\t'
              f'sharp-P5\tsharp-P25\tsharp-P50\tsharp-P75\tsharp-P95'
              f'\tshift-ave\tshift-max\tshift-min\t'
              f'shift-P5\tshift-P25\tshift-P50\tshift-P75\tshift-P95'
              f'\n')
    for votes_support in [1]:
        df = readfn(filein)
        df = df[df['overall'] > 0]  # there are a couple of reviews with 0 stars (the min should be 1)
        num_entries = len(df)
        df = filter_by_vote(df, vote_threshold=votes_support)
        num_entries_with_vote = len(df)
        unique_product_ids = df['asin'].unique()
        num_products = len(unique_product_ids)
        print(df.columns)
        print(f'num rows {len(df)} (before vote-thresholding {num_entries}, after thresholding {num_entries_with_vote})')
        print(f'num unique products {num_products}')
        d = create_dictionary_bookid_ratings(df)
        for reviews_support in [100]:
            with open(f'./prevalence_votes{votes_support}_reviews{reviews_support}.csv', 'wt') as fprev:
                sharpness_all = []
                num_products_with_reviews = 0
                sel_ids, sel_overalls = [], []
                for product_id, ratings in tqdm(d.items(), total=len(d), desc='processing histograms'):
                    n_ratings = len(ratings)
                    if n_ratings >= reviews_support:
                        sel_ids.extend([product_id] * n_ratings)
                        sel_overalls.extend(ratings)
                        prev = np.histogram(ratings, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0]
                        for i, prev_i in enumerate(prev):
                            fprev.write(f'{prev_i:.5f}')
                            if i < len(prev)-1:
                                fprev.write('\t')
                            else:
                                fprev.write('\n')
                        sharpness = not_smoothness(prev)
                        sharpness_all.append(sharpness)
                        num_products_with_reviews += 1
                print(f'#votes-support (min number of votes): {votes_support}')
                print(f'#reviews with >#votes-support: {num_entries_with_vote}/{num_entries}={100*num_entries_with_vote/num_entries:.2f}%')
                print(f'#reviews-support (min number of reviews): {reviews_support}')
                print(f'#products with >#reviews-support: {num_products_with_reviews}/{num_products}={100*num_products_with_reviews/num_products:.2f}%')
                vmean, vmax, vmin, q05, q25, q50, q75, q95 = get_stats(sharpness_all, 'sharpness')
                allbooks_prev = np.histogram(sel_overalls, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0]
                allbooks_sharpness = not_smoothness(allbooks_prev)
                print(f'all books prev={allbooks_prev} has sharpness {allbooks_sharpness:.4f}')
                sel_collection = LabelledCollection(instances=sel_ids, labels=sel_overalls, classes=[1, 2, 3, 4, 5])
                prot = UPP(sel_collection, sample_size=1000, repeats=5000)
                prot_iterator = prot()
                shifts = []
                for _, test_prev in tqdm(prot_iterator, total=prot.total()):
                    shifts.append(nmd(allbooks_prev, prev_hat=test_prev))
                s_mean, s_max, s_min, s_q05, s_q25, s_q50, s_q75, s_q95 = get_stats(shifts, 'shift')
                foo.write(f'{votes_support}\t{reviews_support}\t{num_products_with_reviews}\t{len(sel_ids)}'
                          f'\t{vmean:.5f}\t{vmax:.5f}\t{vmin:.5f}\t'
                          f'{q05:.5f}\t{q25:.5f}\t{q50:.5f}\t{q75:.5f}\t{q95:.5f}'
                          f'\t{s_mean:.5f}\t{s_max:.5f}\t{s_min:.5f}\t'
                          f'{s_q05:.5f}\t{s_q25:.5f}\t{s_q50:.5f}\t{s_q75:.5f}\t{s_q95:.5f}\n')
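# note (added): UPP is QuaPy's uniform prevalence protocol; it draws `repeats` prevalence vectors
# uniformly at random from the probability simplex and samples `sample_size` instances at each such
# prevalence, so `shifts` collects the NMD between the overall training prevalence and 5000
# artificially sampled prevalences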

@@ -0,0 +1,27 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import optimize
# this script searches for the prevalence values that yield the minimum or maximum values of sharpness;
# the result indicates that any linear distribution (not only the uniform one) attains the minimum
def sharpness(p):
    return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))

def smoothness(p):
    return 1 - sharpness(p)

nclasses = 5
x0 = np.random.rand(nclasses)  # random starting point; np.full(fill_value=1/nclasses, shape=nclasses) would be the uniform
x0 /= x0.sum()

bounds = tuple((0, 1) for _ in range(nclasses))  # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1

r = optimize.minimize(sharpness, x0=x0, method='SLSQP', bounds=bounds, constraints=constraints)
print(f'minimum of sharpness function {r.x}')

r = optimize.minimize(smoothness, x0=x0, method='SLSQP', bounds=bounds, constraints=constraints)
print(f'maximum of sharpness function {r.x}')
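# expected outcome (added as a sanity check, not part of the original script): the minimizer should
# return a (near-)linear distribution such as the uniform [0.2, 0.2, 0.2, 0.2, 0.2], whose sharpness
# is 0, whereas the maximizer should concentrate all the mass on a single central class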

@@ -0,0 +1,105 @@
import gzip
import os
from collections import Counter
from Ordinal.utils import jaggedness
import quapy as qp
import pickle
import numpy as np
import pandas as pd
base_path = '/media/moreo/Volume/Datasets/Amazon/reviews'
categories_path = '/media/moreo/Volume/Datasets/Amazon/raw/amazon_categories.txt'
def get_prevalence_merchandise(category):
    input_file = os.path.join(base_path, category+'.txt.gz')
    labels = []
    print(f'{category} starts')
    with gzip.open(input_file, 'rt') as f:
        for line in f:
            try:
                stars, doc = line.split('\t')
                labels.append(stars)
            except ValueError:
                print('error in line: ', line)
    counts = Counter(labels)
    print(f'\t{category} done')
    return counts
target_file = './counters_Amazon_merchandise.pkl'
if not os.path.exists(target_file):
    categories = [c.strip().replace(' ', '_') for c in open(categories_path, 'rt').readlines()]
    # categories = ['Gift_Cards', 'Magazine_Subscriptions']
    counters = qp.util.parallel(get_prevalence_merchandise, categories, n_jobs=-1)
    print('saving pickle')
    pickle.dump((categories, counters), open(target_file, 'wb'), pickle.HIGHEST_PROTOCOL)
else:
    (categories, counters) = pickle.load(open(target_file, 'rb'))

index_gift_cards = categories.index('Gift_Cards')
del categories[index_gift_cards]
del counters[index_gift_cards]

class_smooth = []
for cat, counter in zip(categories, counters):
    total = sum(count for label, count in counter.items())
    counts = [counter[i] for i in map(str, [1, 2, 3, 4, 5])]
    p = np.asarray(counts)/total
    smooth = jaggedness(p)
    class_smooth.append([smooth, cat, p])
class_smooth = sorted(class_smooth)
# df = pd.DataFrame(class_smooth, columns=['smoothness', 'category', 'prevalence'])
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme('paper')
sns.set_style('dark')
sns.set(font_scale=0.5)
nrows = 7
ncols = 4
figure, axis = plt.subplots(nrows, ncols, figsize=(ncols*2, nrows))
with open('categories.txt', 'wt') as foo:
    foo.write(f'Category\tSmooth\tPrevalence\n')
    for i, (smooth, category, prevalence) in enumerate(class_smooth):
        row = i // 4
        col = i % 4
        # print(i, row, col)
        axis[row, col].bar([1, 2, 3, 4, 5], prevalence, width=1)
        axis[row, col].set_ylim(0, 0.75)
        axis[row, col].set_facecolor('white')
        for spine in axis[row, col].spines.values():
            spine.set_edgecolor('black')
            spine.set_linewidth(0.3)
        # axis[row, col].set_xticks(loc=0)
        if row == 6:
            axis[row, col].set_xlabel("stars")
            # axis[row, col].set_xticks([1,2,3,4,5])
        # else:
        #     axis[row, col].set_xticks([])
        if col == 0:
            axis[row, col].set_ylabel("")
            axis[row, col].set_yticks([])
        else:
            axis[row, col].set_ylabel("")
            axis[row, col].set_yticks([])
        category = category.replace('_', ' ').title()
        category = category.replace(' And ', ' & ')
        axis[row, col].set_title(f'{category} ({smooth:.4f})', x=0.5, y=0.75)
        # axis[row, col].set_title
        foo.write(f'{category}\t{smooth}\t{prevalence}\n')

# plt.show()
plt.subplots_adjust(wspace=0, hspace=0)
plt.savefig('Amazon_categories_plotgrid.pdf', bbox_inches='tight')

@@ -0,0 +1,147 @@
import gzip
import quapy as qp
from Ordinal.utils import load_simple_sample_raw
from quapy.data import LabelledCollection
import quapy.functional as F
import os
from os.path import join
from pathlib import Path
import numpy as np
datadir = '/media/moreo/Volume/Datasets/Amazon/reviews'
outdir = './data/'
real_prev_path = './data/Books-real-prevalence-by-product_votes1_reviews100.csv'
domain = 'Books'
seed = 7
tr_size = 20000
val_size = 1000
te_size = 1000
nval = 1000
nte = 5000
def from_text(path, encoding='utf-8', class2int=True):
    """
    Reads a labelled collection of documents.
    File format: <0-4>\t<document>\n
    :param path: path to the labelled collection
    :param encoding: the text encoding used to open the file
    :return: a list of sentences, and a list of labels
    """
    all_sentences, all_labels = [], []
    file = open(path, 'rt', encoding=encoding).readlines()
    for line in file:
        line = line.strip()
        if line:
            try:
                label, sentence = line.split('\t')
                sentence = sentence.strip()
                if class2int:
                    label = int(label)
                if label >= 0:
                    if sentence:
                        all_sentences.append(sentence)
                        all_labels.append(label)
            except ValueError:
                print(f'format error in {line}')
    return all_sentences, all_labels
def write_txt_sample(sample: LabelledCollection, path):
    os.makedirs(Path(path).parent, exist_ok=True)
    with open(path, 'wt') as foo:
        for document, label in zip(*sample.Xy):
            foo.write(f'{label}\t{document}\n')

def gen_samples_APP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath, 'wt') as prevfile:
        prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n')
        for i, prev in enumerate(F.uniform_simplex_sampling(n_classes=pool.n_classes, size=nsamples)):
            sample = pool.sampling(sample_size, *prev)
            write_txt_sample(sample, join(outdir, f'{i}.txt'))
            prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')

def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath, 'wt') as prevfile:
        prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n')
        for i, sample in enumerate(pool.natural_sampling_generator(sample_size, repeats=nsamples)):
            write_txt_sample(sample, join(outdir, f'{i}.txt'))
            prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')

def gen_samples_real_prevalences(real_prevalences, pool: LabelledCollection, sample_size, outdir, prevpath_out):
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath_out, 'wt') as prevfile:
        prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n')
        for i, prev in enumerate(real_prevalences):
            sample = pool.sampling(sample_size, *prev[:-1])
            write_txt_sample(sample, join(outdir, f'{i}.txt'))
            prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')
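# illustrative content of the generated prevalence files (added; one header plus one row per sample):
# id,1,2,3,4,5
# 0,0.100,0.250,0.300,0.250,0.100
# 1,0.050,0.150,0.400,0.300,0.100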
# fullpath = join(datadir,domain)+'.txt.gz' <- deprecated; there were duplicates
# data = LabelledCollection.load(fullpath, from_gz_text)
fullpath = './data/Books/Books.txt'
data = LabelledCollection.load(fullpath, from_text)
print(len(data))
print(data.classes_)
print(data.prevalence())
with qp.util.temp_seed(seed):
    train, rest = data.split_stratified(train_prop=tr_size)
    devel, test = rest.split_stratified(train_prop=0.5)
    print(len(train))
    print(len(devel))
    print(len(test))

    domaindir = join(outdir, domain)

    write_txt_sample(train, join(domaindir, 'training_data.txt'))
    write_txt_sample(devel, join(domaindir, 'development_data.txt'))
    write_txt_sample(test, join(domaindir, 'test_data.txt'))

    # this part is to be used when the partitions have already been created, in order to avoid re-generating them
    # train = load_simple_sample_raw(domaindir, 'training_data')
    # devel = load_simple_sample_raw(domaindir, 'development_data')
    # test = load_simple_sample_raw(domaindir, 'test_data')

    gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
                    prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
    gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
                    prevpath=join(domaindir, 'app', 'test_prevalences.txt'))

    # gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
    #                 prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
    # gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
    #                 prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))

    # this part generates samples based on real prevalence values (in this case, prevalences of sets of book reviews
    # grouped by product). It loads the real prevalences (computed elsewhere), randomly extracts 1000 for validation
    # and 5000 for test (disjoint), and then carries out the samplings
    assert os.path.exists(real_prev_path), 'real prevalence file does not seem to exist...'
    real_prevalences = np.genfromtxt(real_prev_path, delimiter='\t')
    nrows = real_prevalences.shape[0]
    rand_sel = np.random.permutation(nrows)
    real_prevalences_val = real_prevalences[rand_sel[:nval]]
    real_prevalences_te = real_prevalences[rand_sel[nval:nval+nte]]

    gen_samples_real_prevalences(real_prevalences_val, devel, sample_size=val_size, outdir=join(domaindir, 'real', 'dev_samples'),
                                 prevpath_out=join(domaindir, 'real', 'dev_prevalences.txt'))
    gen_samples_real_prevalences(real_prevalences_te, test, sample_size=te_size, outdir=join(domaindir, 'real', 'test_samples'),
                                 prevpath_out=join(domaindir, 'real', 'test_prevalences.txt'))

@@ -0,0 +1,116 @@
import gzip
import quapy as qp
import numpy as np
import pandas as pd
from quapy.data import LabelledCollection
import quapy.functional as F
import os
from os.path import join
from pathlib import Path
import pickle
datadir = '../OrdinalQuantification'
outdir = './data/'
domain = 'fact'
seed = 7
tr_size = 20000
val_size = 1000
te_size = 1000
nval = 1000
nte = 5000
def from_csv(path):
    df = pd.read_csv(path)

    # divide the continuous labels into ordered classes
    energy_boundaries = np.arange(start=2.4, stop=4.2, step=0.15)[1:-1]
    y = np.digitize(np.array(df['log10_energy'], dtype=np.float32), energy_boundaries)
    # note: omitting the dtype will result in a single instance having a different class

    # obtain a matrix of shape (n_samples, n_features)
    X = df.iloc[:, 1:].to_numpy().astype(np.float32)
    return X, y
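# illustrative note (added): np.digitize maps each value to the index of the interval it falls in;
# e.g., with boundaries [2.55, 2.70], a log10-energy of 2.50 gets class 0, 2.60 gets class 1,
# and 2.80 gets class 2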
def write_pkl(sample: LabelledCollection, path):
    os.makedirs(Path(path).parent, exist_ok=True)
    pickle.dump(sample, open(path, 'wb'), pickle.HIGHEST_PROTOCOL)

def gen_samples_APP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath, 'wt') as prevfile:
        prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n')
        for i, prev in enumerate(F.uniform_simplex_sampling(n_classes=pool.n_classes, size=nsamples)):
            sample = pool.sampling(sample_size, *prev)
            write_pkl(sample, join(outdir, f'{i}.pkl'))
            prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')

def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath, 'wt') as prevfile:
        prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n')
        for i, sample in enumerate(pool.natural_sampling_generator(sample_size, repeats=nsamples)):
            write_pkl(sample, join(outdir, f'{i}.pkl'))
            prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')
fullpath = join(datadir,domain, 'fact_wobble.csv')
data = LabelledCollection.load(fullpath, from_csv)
if np.isnan(data.instances).any():
    rows, cols = np.where(np.isnan(data.instances))
    data.instances = np.delete(data.instances, rows, axis=0)
    data.labels = np.delete(data.labels, rows, axis=0)
    print('deleted nan rows')

if np.isnan(data.instances).any():
    rows, cols = np.where(np.isnan(data.instances))
    data.instances = np.delete(data.instances, rows, axis=0)
    data.labels = np.delete(data.labels, rows, axis=0)
    print('deleted nan rows')

if np.isinf(data.instances).any():
    rows, cols = np.where(np.isinf(data.instances))
    data.instances = np.delete(data.instances, rows, axis=0)
    data.labels = np.delete(data.labels, rows, axis=0)
    print('deleted inf rows')
print(len(data))
print(data.classes_)
print(data.prevalence())
with qp.util.temp_seed(seed):
    train, rest = data.split_stratified(train_prop=tr_size)
    devel, test = rest.split_stratified(train_prop=0.5)
    print(len(train))
    print(len(devel))
    print(len(test))

    domaindir = join(outdir, domain)

    write_pkl(train, join(domaindir, 'training_data.pkl'))
    write_pkl(devel, join(domaindir, 'development_data.pkl'))
    write_pkl(test, join(domaindir, 'test_data.pkl'))

    gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
                    prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
    gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
                    prevpath=join(domaindir, 'app', 'test_prevalences.txt'))

    gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
                    prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
    gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
                    prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))

Ordinal/evaluation.py Normal file

@@ -0,0 +1,50 @@
import numpy as np
# smoothness approximation
# (note: this returns the same quantity defined as sharpness/not_smoothness elsewhere in this repo;
# the smoothness proper would be 1 minus this value)
def smoothness(p):
    return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))

def _check_arrays(prevs):
    prevs = np.asarray(prevs)
    if prevs.ndim == 1:
        prevs = prevs.reshape(1, -1)
    return prevs

# mean normalized match distance
def mnmd(prevs, prevs_hat):
    prevs = _check_arrays(prevs)
    prevs_hat = _check_arrays(prevs_hat)
    assert prevs.shape == prevs_hat.shape, f'wrong shape; found {prevs.shape} and {prevs_hat.shape}'
    nmds = [nmd(p, p_hat) for p, p_hat in zip(prevs, prevs_hat)]
    return np.mean(nmds)

# normalized match distance
def nmd(prev, prev_hat):
    n = len(prev)
    return (1./(n-1))*mdpa(prev, prev_hat)

"""
Minimum Distance of Pair Assignments (MDPA) [cha2002measuring] for ordinal pdfs `a` and `b`.
The MDPA is a special case of the Earth Mover's Distance [rubner1998metric] that can be
computed efficiently.
[adapted from Mirko Bunse's Julia code]
"""
def mdpa(a, b):
    assert len(a) == len(b), 'histograms have to have the same length'
    assert np.isclose(sum(a), sum(b)), f'histograms have to have the same mass (difference is {sum(a)-sum(b)})'

    # algorithm 1 in [cha2002measuring]
    prefixsum = 0.0
    distance = 0.0
    for i in range(len(a)):
        prefixsum += a[i] - b[i]
        distance += abs(prefixsum)

    return distance / sum(a)  # the normalization is a fix to the original MDPA
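# note (added): nmd takes values in [0, 1]; identical histograms yield 0, while
# nmd([1, 0, 0], [0, 0, 1]) = 1 corresponds to maximal shift between ordered histograms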

@@ -0,0 +1,151 @@
import numpy as np
import quapy as qp
import os
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from Ordinal.model import LogisticAT, LogisticSE, LogisticIT, LAD, OrdinalRidge #, RegressionQuantification
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC
from os.path import join
from utils import load_samples_folder, load_single_sample_pkl
from Ordinal.evaluation import nmd, mnmd
from tqdm import tqdm
"""
This script generates all results from Table 1 in the paper, i.e., all results comparing quantifiers equipped with
standard logistic regression against quantifiers equipped with order-aware classifiers
"""
def quantifiers():
    params_LR = {'C': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
    params_OLR = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
    params_SVR = {'C': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
    params_Ridge = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced'], 'normalize': [True, False]}

    # baselines
    yield 'CC(LR)', CC(LogisticRegression()), params_LR
    yield 'PCC(LR)', PCC(LogisticRegression()), params_LR
    yield 'ACC(LR)', ACC(LogisticRegression()), params_LR
    yield 'PACC(LR)', PACC(LogisticRegression()), params_LR
    yield 'SLD(LR)', EMQ(LogisticRegression()), params_LR

    # with order-aware classifiers
    # threshold-based ordinal regression (see https://pythonhosted.org/mord/)
    yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR
    yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR
    yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR
    yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR
    yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR
    # yield 'CC(OLR-SE)', CC(LogisticSE()), params_OLR
    # yield 'PCC(OLR-SE)', PCC(LogisticSE()), params_OLR
    # yield 'ACC(OLR-SE)', ACC(LogisticSE()), params_OLR
    # yield 'PACC(OLR-SE)', PACC(LogisticSE()), params_OLR
    # yield 'SLD(OLR-SE)', EMQ(LogisticSE()), params_OLR
    yield 'CC(OLR-IT)', CC(LogisticIT()), params_OLR
    yield 'PCC(OLR-IT)', PCC(LogisticIT()), params_OLR
    yield 'ACC(OLR-IT)', ACC(LogisticIT()), params_OLR
    yield 'PACC(OLR-IT)', PACC(LogisticIT()), params_OLR
    yield 'SLD(OLR-IT)', EMQ(LogisticIT()), params_OLR
    # other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.)

    # regression-based ordinal regression (see https://pythonhosted.org/mord/)
    yield 'CC(LAD)', CC(LAD()), params_SVR
    yield 'ACC(LAD)', ACC(LAD()), params_SVR
    yield 'CC(ORidge)', CC(OrdinalRidge()), params_Ridge
    yield 'ACC(ORidge)', ACC(OrdinalRidge()), params_Ridge
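# each yielded triple is (name, quantifier, hyperparameter grid); an illustrative way to inspect them (added):
# for qname, q, grid in quantifiers():
#     print(qname, type(q).__name__, sorted(grid.keys()))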
def run_experiment(params):
    qname, q, param_grid = params
    qname += posfix
    resultfile = join(resultpath, f'{qname}.all.APP-OQ.csv')
    if os.path.exists(resultfile):
        print(f'result file {resultfile} already exists: continue')
        return None

    print(f'fitting {qname} for all-drift')

    def load_test_samples():
        folderpath = join(datapath, domain, protocol, 'test_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=None, load_fn=load_sample_fn), total=5000):
            if posfix == '-std':
                sample.instances = zscore.transform(sample.instances)
            yield sample.instances, sample.prevalence()

    def load_dev_samples():
        folderpath = join(datapath, domain, protocol, 'dev_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=None, load_fn=load_sample_fn), total=1000):
            if posfix == '-std':
                sample.instances = zscore.transform(sample.instances)
            yield sample.instances, sample.prevalence()

    q = qp.model_selection.GridSearchQ(
        q,
        param_grid,
        sample_size=1000,
        protocol='gen',
        error=mnmd,
        val_split=load_dev_samples,
        n_jobs=-1,
        refit=False,
        timeout=60*60*2,
        verbose=True).fit(train)

    hyperparams = f'{qname}\tall\t{q.best_params_}\t{q.best_score_}'
    print('[done]')

    report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    mean_nmd = report['nmd'].mean()
    std_nmd = report['nmd'].std()
    print(f'{qname}: {mean_nmd:.4f} +-{std_nmd:.4f}')
    report.to_csv(resultfile, index=False)

    # print('[learning regressor-based adjustment]')
    # q = RegressionQuantification(q.best_model(), val_samples_generator=load_dev_samples)
    # q.fit(None)
    # report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    # mean_nmd = report['nmd'].mean()
    # std_nmd = report['nmd'].std()
    # print(f'[{qname} regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
    # resultfile = join(resultpath, f'{qname}.all.reg.csv')
    # report.to_csv(resultfile, index=False)

    return hyperparams
if __name__ == '__main__':
    domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
    # domain = 'Books-tfidf'
    posfix = ''

    # domain = 'fact'
    # posfix = '-std'  # set to '' to avoid standardization
    # posfix = ''

    load_sample_fn = load_single_sample_pkl
    datapath = './data'
    protocol = 'app'
    resultpath = join('./results', domain, protocol)
    os.makedirs(resultpath, exist_ok=True)

    train = load_sample_fn(join(datapath, domain), 'training_data')

    if posfix == '-std':
        zscore = StandardScaler()
        train.instances = zscore.fit_transform(train.instances)

    with open(join(resultpath, 'hyper.txt'), 'at') as foo:
        hypers = qp.util.parallel(run_experiment, quantifiers(), n_jobs=-3)
        for h in hypers:
            if h is not None:
                foo.write(h)
                foo.write('\n')

@@ -0,0 +1,137 @@
import numpy as np
from scipy.stats import wilcoxon
import quapy as qp
import os
from os.path import join
from Ordinal.tabular import Table
from utils import load_samples_folder, load_single_sample_pkl, jaggedness
from Ordinal.evaluation import nmd, mnmd
from tqdm import tqdm
import pandas as pd
from glob import glob
from pathlib import Path
"""
This script takes all results from the book domain, that correspond to the APP protocol, and filters by
smoothness so that only the 50% smoothest examples are considered, and recomputes the averages of the nmd
thus effectively reporting the results for the APP-OQ protocol
"""
def parse_str_prev(df_col):
    values = df_col.values
    array_list = [np.fromstring(array[1:-1], sep=' ') for array in values]
    return np.asarray(array_list)

def parse_result_file(path):
    df = pd.read_csv(path)
    true_prev = parse_str_prev(df['true-prev'])
    estim_prev = parse_str_prev(df['estim-prev'])
    nmd = df['nmd'].values
    return true_prev, estim_prev, nmd

def ave_jaggedness(prevs, less_percentile=1):
    jag = np.sort([jaggedness(p) for p in prevs])
    up_to = int(less_percentile * len(jag))
    return np.mean(jag[:up_to])

def retain_half_smoothest(true_prev, estim_prev, nmd):
    jag = [jaggedness(p) for p in true_prev]
    order = np.argsort(jag)
    up_to = len(order)//2
    order = order[:up_to]
    return true_prev[order], estim_prev[order], nmd[order]

def compute_half_smoothest_nmd(true_prev, estim_prev, nmd):
    _, _, nmd_smooth = retain_half_smoothest(true_prev, estim_prev, nmd)
    return nmd_smooth
if __name__ == '__main__':
    domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
    datapath = './data'
    in_protocol = 'app'
    out_protocol = 'app-oq'
    in_result_path = join('./results', domain, in_protocol)
    out_result_path = join('./results', domain, out_protocol)
    os.makedirs(out_result_path, exist_ok=True)

    # recompute the results in terms of APP-OQ
    result_dict = {}
    for filepath in glob(f'{in_result_path}/*).all.csv'):
        name = Path(filepath).name
        quantifier = name[:name.index('(')]
        classifier = name[name.index('(')+1:name.index(')')]

        true_prev, estim_prev, nmds = parse_result_file(filepath)
        nmds = compute_half_smoothest_nmd(true_prev, estim_prev, nmds)

        result_dict[classifier + '-' + quantifier] = nmds

    # search for the best (lowest mean NMD) variant within each quantifier family
    best_keys = {}
    best_nmds = {}
    for quantifier in ['CC', 'PCC', 'ACC', 'PACC', 'SLD']:
        best_ave, best_key, best_nmd = None, None, None
        for classifier in ['LR', 'OLR-AT', 'OLR-IT', 'ORidge', 'LAD']:
            key = classifier + '-' + quantifier
            if key in result_dict:
                nmds = result_dict[key]
                mean_val = np.mean(nmds)
                if best_ave is None or mean_val < best_ave:
                    best_ave = mean_val
                    best_key = key
                    best_nmd = nmds
        best_keys[quantifier] = best_key
        best_nmds[quantifier] = best_nmd
    # print(best_keys)

    # write a latex table
    for q in ['CC', 'PCC', 'ACC', 'PACC', 'SLD']:
        print('& \\multicolumn{2}{c}{'+q+'} ', end='')
    print('\\\\')
    print('\\midrule')
    for classifier in ['LR', 'OLR-AT', 'OLR-IT', 'ORidge', 'LAD']:
        print(classifier + '\t', end='')
        for quantifier in ['CC', 'PCC', 'ACC', 'PACC', 'SLD']:
            key = classifier + '-' + quantifier
            the_best_nmds = best_nmds[quantifier]
            if key in result_dict:
                nmds = result_dict[key]
                mean_val = np.mean(nmds)
                bold = False
                if best_keys[quantifier] == key:
                    bold = True
                else:
                    _, pval = wilcoxon(nmds, the_best_nmds)
                    if pval > 0.01:
                        bold = True
                str_mean = f'{mean_val:.4f}'
                if bold:
                    str_mean = '\\textbf{' + str_mean + '}'
                if classifier == 'LR':
                    std_val = np.std(nmds)
                    str_val = f'{str_mean} & $\\pm {std_val:.4f}$'
                else:
                    rel_increment = 100 * (mean_val-np.mean(the_best_nmds)) / np.mean(the_best_nmds)
                    sign = '+' if rel_increment > 0 else ''
                    str_val = f'{str_mean} & ({sign}{rel_increment:.1f}\\%)'
            else:
                str_val = '\\multicolumn{2}{c}{---}'
            str_val = ' & ' + str_val
            print(str_val, end='')
        print('\\\\')

Ordinal/finetune_bert.py Normal file

@@ -0,0 +1,105 @@
import csv
import sys
import datasets
import numpy as np
import pandas as pd
import torch.cuda
from datasets import Dataset, DatasetDict
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import Trainer
from transformers import TrainingArguments
"""
This script fine-tunes a pre-trained language model on a given textual training set.
The training goes for a maximum of 5 epochs, but stores the model parameters of the best performing epoch according
to the validation loss in a hold-out val split of 1000 documents (stratified).
We used it with RoBERTa in the training set of the Amazon-OQ-BK domain, i.e.:
$> python3 ./data/Books/training_data.txt roberta-base
"""
def tokenize_function(example):
tokens = tokenizer(example['review'], padding='max_length', truncation=True, max_length=64 if debug else 256)
return tokens
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)
    return {
        'macro-f1': f1_score(labels, preds, average='macro'),
        'micro-f1': f1_score(labels, preds, average='micro'),
    }
if __name__ == '__main__':
    debug = False
    assert torch.cuda.is_available(), 'cuda is not available'

    # datapath = './data/Books/training_data.txt'
    # checkpoint = 'roberta-base'
    n_args = len(sys.argv)
    assert n_args == 3, 'wrong arguments, expected: <training-path> <transformer-name>'

    datapath = sys.argv[1]  # './data/Books/training_data.txt'
    checkpoint = sys.argv[2]  # e.g., 'bert-base-uncased' or 'distilbert-base-uncased' or 'roberta-base'
    modelout = checkpoint+'-finetuned-new'

    # load the training set, and extract a held-out validation split of 1000 documents (stratified)
    df = pd.read_csv(datapath, sep='\t', names=['labels', 'review'], quoting=csv.QUOTE_NONE)
    labels = df['labels'].to_frame()
    X_train, X_val = train_test_split(df, stratify=labels, test_size=.25, random_state=1)
    num_labels = len(pd.unique(labels['labels']))

    features = datasets.Features({'labels': datasets.Value('int32'), 'review': datasets.Value('string')})
    train = Dataset.from_pandas(df=X_train, split='train', features=features)
    validation = Dataset.from_pandas(df=X_val, split='validation', features=features)

    dataset = DatasetDict({
        'train': train.select(range(500)) if debug else train,
        'validation': validation.select(range(500)) if debug else validation
    })

    # tokenize the dataset
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda()

    # fine-tuning
    training_args = TrainingArguments(
        modelout,
        learning_rate=2e-5,
        num_train_epochs=5,
        weight_decay=0.01,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        # eval_steps=10,
        save_total_limit=1,
        load_best_model_at_end=True
    )

    trainer = Trainer(
        model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
        data_collator=DataCollatorWithPadding(tokenizer),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()

@@ -0,0 +1,70 @@
import pandas as pd
from os.path import join
import os
from glob import glob
from pathlib import Path
from Ordinal.main import quantifiers
from Ordinal.tabular import Table
"""
This script generates some tables for Amazon-OQ-BK (for internal use only)
"""
domain = 'Books-tfidf'
domain_bert_last = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last'
domain_bert_ave = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
domain_bert_post = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-posteriors'
prot = 'app'
outpath = f'./tables/{domain}/{prot}/results.tex'
resultpath = join('./results', domain, prot)
resultpath_bertlast = join('./results', domain_bert_last, prot)
resultpath_bertave = join('./results', domain_bert_ave, prot)
resultpath_bertpost = join('./results', domain_bert_post, prot)
methods = [qname for qname, *_ in quantifiers()]
methods += ['SLD(LR)-agg']
methods_Rlast = [m+'-RoBERTa-last' for m in methods]
methods_Rave = [m+'-RoBERTa-average' for m in methods]
methods_Rpost = [m+'-RoBERTa-posteriors' for m in methods]
methods = methods + methods_Rlast + methods_Rave + methods_Rpost
# methods += [m+'-r' for m in methods]
table = Table(benchmarks=['low', 'mid', 'high', 'all'], methods=methods, prec_mean=4, show_std=True, prec_std=4)
resultfiles = list(glob(f'{resultpath}/*.csv')) \
+ list(glob(f'{resultpath_bertlast}/*.csv')) \
+ list(glob(f'{resultpath_bertave}/*.csv')) \
+ list(glob(f'{resultpath_bertpost}/*.csv'))
for resultfile in resultfiles:
    df = pd.read_csv(resultfile)
    nmd = df['nmd'].values
    resultname = Path(resultfile).name
    method, drift, *other = resultname.replace('.csv', '').split('.')
    if other:
        method += '-r'
    if method not in methods:
        continue
    table.add(drift, method, nmd)
os.makedirs(Path(outpath).parent, exist_ok=True)
tabular = """
\\resizebox{\\textwidth}{!}{%
\\begin{tabular}{|c||""" + ('c|' * (table.nbenchmarks)) + """} \hline
"""
tabular += table.latexTabularT(average=False)
tabular += """
\end{tabular}%
}"""
print('saving table in', outpath)
with open(outpath, 'wt') as foo:
foo.write(tabular)
foo.write('\n')
print('[done]')

@@ -0,0 +1,82 @@
import pandas as pd
from os.path import join
import os
from glob import glob
from pathlib import Path
from Ordinal.experiments_lr_vs_ordlr import quantifiers
from Ordinal.tabular import Table
"""
This script generates some tables for Fact-OQ (for internal use only)
"""
#domain = 'fact'
#domain = 'Books-tfidf'
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
prot = 'app'
outpath = f'./tables/{domain}/{prot}/results.tex'
resultpath = join('./results', domain, prot)
withstd = False

methods = [qname for qname, *_ in quantifiers()]
if withstd:
    methods = [m+'-std' for m in methods]
# methods = methods + methods_variant
# methods += [m+'-r' for m in methods]

quantifiers_families = ['CC', 'PCC', 'ACC', 'PACC', 'SLD']
# method_variants = ['LR', 'OLR-AT', 'OLR-SE', 'OLR-IT', 'ORidge', 'LAD']
method_variants = ['LR', 'OLR-AT', 'OLR-IT', 'ORidge', 'LAD']
if withstd:
    method_variants = [m+'-std' for m in method_variants]

print('families:', quantifiers_families)
print('variants', method_variants)
table = Table(benchmarks=quantifiers_families, methods=method_variants, prec_mean=4, show_std=True, prec_std=4,
color=False, show_rel_to=0, missing_str='\multicolumn{1}{c}{---}', clean_zero=True)
resultfiles = list(glob(f'{resultpath}/*).all.csv'))
for resultfile in resultfiles:
    df = pd.read_csv(resultfile)
    nmd = df['nmd'].values
    resultname = Path(resultfile).name
    method, drift, *other = resultname.replace('.csv', '').replace('-RoBERTa-average', '').split('.')
    if drift != 'all':
        continue
    if other:
        method += '-r'
    if method not in methods:
        continue
    family, variant = method.split('(')
    variant = variant.replace(')', '')
    if variant not in method_variants:
        continue
    table.add(family, variant, nmd)
os.makedirs(Path(outpath).parent, exist_ok=True)
tabular = """
\\resizebox{\\textwidth}{!}{%
\\begin{tabular}{c""" + ('l' * (table.nbenchmarks)) + """}
\\toprule
"""
tabular += table.latexTabularT(average=False)
tabular += """
\end{tabular}%
}"""
print('saving table in', outpath)
with open(outpath, 'wt') as foo:
foo.write(tabular)
foo.write('\n')
print('[done]')

@@ -0,0 +1,152 @@
import sys
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from os.path import join
import os
import shutil
from tqdm import tqdm
from Ordinal.utils import load_samples_folder, load_single_sample_as_csv
"""
This scripts takes a pre-trained model (a fine-tuned one) and generates numerical representations for all
samples in the dataset. The representations are saved in npy-txt plain format.
"""
def tokenize_function(example):
tokens = tokenizer(example['review'], padding='max_length', truncation=True, max_length=64 if debug else None, return_tensors='pt')
return {
'input_ids': tokens.input_ids.cuda(),
'attention_mask': tokens.attention_mask.cuda()
}
def save_samples_as_txt(tensors, labels, path):
    vectors = tensors
    labels = labels.values
    vec_lab = np.hstack([labels, vectors])
    n_cols = vectors.shape[1]
    np.savetxt(path, vec_lab, fmt=['%d']+['%f']*n_cols)
def transform_sample(instances, labels, outpath, batch_size=50):
    ndocs = len(labels)
    assert ndocs % batch_size == 0, 'fragmented last batch not supported'

    transformations = []
    for batch_id in range(0, ndocs, batch_size):
        batch_instances = instances[batch_id:batch_id + batch_size]
        tokenized_dataset = tokenize_function(batch_instances)
        out = model(**tokenized_dataset, output_hidden_states=True)

        if generation_mode == 'posteriors':
            logits = out.logits
            posteriors = torch.softmax(logits, dim=-1)
            transformed = posteriors
        elif generation_mode == 'last':
            hidden_states = out.hidden_states
            last_layer_cls = hidden_states[-1][:, 0, :]
            transformed = last_layer_cls
        elif generation_mode == 'average':
            hidden_states = out.hidden_states
            hidden_states = torch.stack(hidden_states)
            all_layer_cls = hidden_states[:, :, 0, :]
            average_cls = torch.mean(all_layer_cls, dim=0)
            transformed = average_cls
        else:
            raise NotImplementedError()

        transformations.append(transformed.cpu().numpy())

    transformations = np.vstack(transformations)
    save_samples_as_txt(transformations, labels, outpath)
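# shape note (added, following the HF transformers convention): out.hidden_states is a tuple of
# (num_layers+1) tensors of shape (batch, seq_len, hidden_dim); stacking yields
# (num_layers+1, batch, seq_len, hidden_dim), and [:, :, 0, :] keeps the first-token ([CLS]-like)
# embedding of every layer, which the 'average' mode then averages across layers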
def transform_folder_samples(protocol, splitname, skip=0):
    in_folder = join(datapath, domain, protocol, splitname)
    out_folder = join(datapath, outname, protocol, splitname)
    total = 1000 if splitname.startswith('dev') else 5000

    for i, (instances, labels) in tqdm(enumerate(
            load_samples_folder(in_folder, load_fn=load_single_sample_as_csv)), desc=f'{protocol} {splitname}', total=total):
        if i >= skip:
            transform_sample(instances, labels, outpath=join(out_folder, f'{i}.txt'))
def get_best_checkpoint(checkpointdir):
    from glob import glob
    steps = []
    for folder in glob(f'{checkpointdir}/checkpoint-*'):
        step = int(folder.split('checkpoint-')[1])
        steps.append(step)
    assert len(steps) <= 2, 'unexpected number of checkpoints; only two were expected (the best one and the last one)'
    chosen = f'{checkpointdir}/checkpoint-{min(steps)}'
    print(f'chosen checkpoint is {chosen}')
    return chosen
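# note (added): with save_total_limit=1 and load_best_model_at_end=True (see finetune_bert.py), the
# trainer typically leaves two checkpoint folders, the best and the last one; since the best epoch
# comes no later than the last, min(steps) selects the best checkpoint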
if __name__ == '__main__':
    debug = False
    assert torch.cuda.is_available(), 'cuda is not available'

    # checkpoint = 'roberta-base-val-finetuned'
    # generation_mode = 'average'  # ave seemed to work slightly better
    n_args = len(sys.argv)
    assert n_args == 3, 'wrong arguments, expected: <checkpoint> <generation-mode>\n' \
                        '\tgeneration-mode: last (last layer), ave (average pooling), or posteriors (posterior probabilities)'

    checkpoint = sys.argv[1]  # e.g., 'bert-base-uncased'
    generation_mode = sys.argv[2]  # e.g., 'average'; ave seemed to work slightly better

    assert 'finetuned' in checkpoint, 'looks like this model is not finetuned'
    checkpoint = get_best_checkpoint(checkpoint)

    num_labels = 5
    datapath = './data'
    domain = 'Books'
    protocols = ['real', 'app']  # ['app', 'npp']

    assert generation_mode in ['last', 'average', 'posteriors'], 'unknown generation_mode'

    outname = domain + f'-{checkpoint}-{generation_mode}'

    with torch.no_grad():
        print('loading', checkpoint)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda()

        os.makedirs(join(datapath, outname), exist_ok=True)

        print('transforming the training set')
        instances, labels = load_single_sample_as_csv(join(datapath, domain), 'training_data')
        transform_sample(instances, labels, join(datapath, outname, 'training_data.txt'))
        print('[done]')

        for protocol in protocols:
            in_path = join(datapath, domain, protocol)
            out_path = join(datapath, outname, protocol)
            os.makedirs(out_path, exist_ok=True)
            os.makedirs(join(out_path, 'dev_samples'), exist_ok=True)
            os.makedirs(join(out_path, 'test_samples'), exist_ok=True)
            shutil.copyfile(join(in_path, 'dev_prevalences.txt'), join(out_path, 'dev_prevalences.txt'))
            shutil.copyfile(join(in_path, 'test_prevalences.txt'), join(out_path, 'test_prevalences.txt'))

            print('processing', protocol)
            transform_folder_samples(protocol, 'dev_samples')
            transform_folder_samples(protocol, 'test_samples')

Ordinal/main.py Normal file

@@ -0,0 +1,156 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
import numpy as np
from Ordinal.model import OrderedLogisticRegression, LogisticAT
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC, SLD, HDy
from quapy.data import LabelledCollection
from os.path import join
import os
from utils import load_samples_folder, load_simple_sample_npytxt, load_single_sample_pkl
from evaluation import nmd, mnmd
from time import time
import pickle
from tqdm import tqdm
import mord
def quantifiers():
    params_LR = {'C': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
    params_OLR = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
    params_SVR = {'C': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
    # params_SVR = {'C': np.logspace(0, 1, 2)}

    # baselines
    yield 'CC(LR)', CC(LogisticRegression()), params_LR
    yield 'PCC(LR)', PCC(LogisticRegression()), params_LR
    yield 'ACC(LR)', ACC(LogisticRegression()), params_LR
    yield 'PACC(LR)', PACC(LogisticRegression()), params_LR
    # yield 'HDy(LR)', HDy(LogisticRegression()), params_LR
    yield 'SLD(LR)', EMQ(LogisticRegression()), params_LR

    # with order-aware classifiers
    # threshold-based ordinal regression (see https://pythonhosted.org/mord/)
    # yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR
    # yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR
    # yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR
    # yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR
    # yield 'HDy(OLR-AT)', HDy(mord.LogisticAT()), params_OLR
    # yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR
    # other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.)

    # regression-based ordinal regression (see https://pythonhosted.org/mord/)
    # I am using my own implementation, which caters for predict_proba (a linearly decaying score for the two
    # classes closest to the regressor's output, 0 for the rest); mord offers OrdinalRidge(alpha=1.0) and LAD(C=1.0),
    # here wrapped in classes exposing nclasses_, but those implement neither predict_proba nor decision_function
    # yield 'CC(SVR)', CC(RegressorClassifier()), params_SVR
    # yield 'PCC(SVR)', PCC(RegressorClassifier()), params_SVR
    # yield 'PCC-cal(SVR)', PCC(RegressorClassifier()), params_SVR
    # yield 'ACC(SVR)', ACC(RegressorClassifier()), params_SVR
    # yield 'PACC(SVR)', PACC(RegressorClassifier()), params_SVR
    # yield 'HDy(SVR)', HDy(RegressorClassifier()), params_SVR
    # yield 'SLD(SVR)', EMQ(RegressorClassifier()), params_SVR
def run_experiment(params):
    qname, q, param_grid, drift = params
    qname += posfix
    resultfile = join(resultpath, f'{qname}.{drift}.csv')
    if os.path.exists(resultfile):
        print(f'result file {resultfile} already exists: continue')
        return None

    print(f'fitting {qname} for {drift}-drift')

    def load_test_samples():
        ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy'))
        ids = set(ids)
        folderpath = join(datapath, domain, protocol, 'test_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)):
            yield sample.instances, sample.prevalence()

    def load_dev_samples():
        ids = np.load(join(datapath, domain, protocol, f'{drift}drift.dev.id.npy'))
        ids = set(ids)
        folderpath = join(datapath, domain, protocol, 'dev_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)):
            yield sample.instances, sample.prevalence()

    q = qp.model_selection.GridSearchQ(
        q,
        param_grid,
        sample_size=1000,
        protocol='gen',
        error=mnmd,
        val_split=load_dev_samples,
        n_jobs=-1,
        refit=False,
        verbose=True).fit(train)

    hyperparams = f'{qname}\t{drift}\t{q.best_params_}'
    print('[done]')

    report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    mean_nmd = report['nmd'].mean()
    std_nmd = report['nmd'].std()
    print(f'{qname}: {mean_nmd:.4f} +-{std_nmd:.4f}')
    report.to_csv(resultfile, index=False)

    print('[learning regressor-based adjustment]')
    # note: RegressionQuantification must be importable from Ordinal.model for this block to run
    q = RegressionQuantification(q.best_model(), val_samples_generator=load_dev_samples)
    q.fit(None)
    report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    mean_nmd = report['nmd'].mean()
    std_nmd = report['nmd'].std()
    print(f'[{qname} regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
    resultfile = join(resultpath, f'{qname}.{drift}.reg.csv')
    report.to_csv(resultfile, index=False)

    return hyperparams
if __name__ == '__main__':
    # preprocessing = 'roberta.last'
    preprocessing = 'roberta.average'
    # preprocessing = 'roberta.posteriors'
    # preprocessing = 'tfidf'

    if preprocessing == 'tfidf':
        domain = 'Books-tfidf'
        posfix = ''
    elif preprocessing == 'roberta.last':
        domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last'
        posfix = '-RoBERTa-last'
    elif preprocessing == 'roberta.average':
        domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
        posfix = '-RoBERTa-average'
    elif preprocessing == 'roberta.posteriors':
        domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-posteriors'
        posfix = '-RoBERTa-posteriors'

    load_sample_fn = load_single_sample_pkl
    datapath = './data'
    protocol = 'app'
    resultpath = join('./results', domain, protocol)
    os.makedirs(resultpath, exist_ok=True)

    train = load_sample_fn(join(datapath, domain), 'training_data')

    with open(join(resultpath, 'hyper.txt'), 'at') as foo:
        # for drift in [f'smooth{i}' for i in range(5)] + ['all']:
        params = [(*qs, drift) for qs in quantifiers() for drift in ['low', 'mid', 'high', 'all']]
        hypers = qp.util.parallel(run_experiment, params, n_jobs=-2)
        for h in hypers:
            if h is not None:
                foo.write(h)
                foo.write('\n')

195
Ordinal/model.py Normal file
View File

@ -0,0 +1,195 @@
import mord
import numpy as np
from scipy.sparse import issparse
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import Ridge
from sklearn.svm import LinearSVR
from sklearn.utils.class_weight import compute_class_weight
from statsmodels.miscmodels.ordinal_model import OrderedModel
class OrderedLogisticRegression:
def __init__(self, model='logit'):
assert model in ['logit', 'probit'], 'unknown ordered model, valid ones are logit or probit'
self.model = model
def fit(self, X, y):
if issparse(X):
self.svd = TruncatedSVD(500)
X = self.svd.fit_transform(X)
self.learner = OrderedModel(y, X, distr=self.model)
self.res_prob = self.learner.fit(method='bfgs', disp=False, skip_hessian=True)
def predict(self, X):
prob = self.predict_proba(X)
return np.argmax(prob, axis=1)
def predict_proba(self, X):
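        # returns an (n_instances, n_classes) matrix with the posterior probabilities of the
        # fitted cumulative-link (ordered logit/probit) model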
if issparse(X):
assert hasattr(self, 'svd'), \
'X matrix in predict is sparse, but the method has not been fit with sparse type'
X = self.svd.transform(X)
return self.res_prob.model.predict(self.res_prob.params, exog=X)
class LAD(BaseEstimator, ClassifierMixin):
def __init__(self, C=1.0, class_weight=None):
self.C = C
self.class_weight = class_weight
def fit(self, X, y, sample_weight=None):
self.regressor = LinearSVR(C=self.C)
# self.regressor = SVR()
# self.regressor = Ridge(normalize=True)
classes = sorted(np.unique(y))
self.nclasses = len(classes)
if self.class_weight == 'balanced':
class_weight = compute_class_weight('balanced', classes=classes, y=y)
sample_weight = class_weight[y]
self.regressor.fit(X, y, sample_weight=sample_weight)
return self
def predict(self, X):
r = self.regressor.predict(X)
c = np.round(r)
c[c<0]=0
c[c>(self.nclasses-1)]=self.nclasses-1
        return c.astype(int)  # np.int was removed in NumPy 1.24; use the built-in int
# def predict_proba(self, X):
# r = self.regressor.predict(X)
# nC = len(self.classes_)
# r = np.clip(r, 0, nC - 1)
# dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
# invdist = 1 - dists
# invdist[invdist < 0] = 0
# return invdist
def decision_function(self, X):
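        # score each class by 1 minus its distance to the regressed value; unlike the
        # commented predict_proba above, scores are not clipped and can be negative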
r = self.regressor.predict(X)
nC = len(self.classes_)
dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
invdist = 1 - dists
return invdist
@property
def classes_(self):
return np.arange(self.nclasses)
def get_params(self, deep=True):
return {'C':self.C, 'class_weight': self.class_weight}
def set_params(self, **params):
self.C = params['C']
self.class_weight = params['class_weight']
class OrdinalRidge(BaseEstimator, ClassifierMixin):
def __init__(self, alpha=1.0, class_weight=None, normalize=False):
self.alpha = alpha
self.class_weight = class_weight
self.normalize = normalize
def fit(self, X, y, sample_weight=None):
self.regressor = Ridge(alpha=self.alpha, normalize=self.normalize)
classes = sorted(np.unique(y))
self.nclasses = len(classes)
if self.class_weight == 'balanced':
class_weight = compute_class_weight('balanced', classes=classes, y=y)
sample_weight = class_weight[y]
self.regressor.fit(X, y, sample_weight=sample_weight)
return self
def predict(self, X):
r = self.regressor.predict(X)
c = np.round(r)
c[c<0]=0
c[c>(self.nclasses-1)]=self.nclasses-1
        return c.astype(int)  # np.int was removed in NumPy 1.24; use the built-in int
# def predict_proba(self, X):
# r = self.regressor.predict(X)
# nC = len(self.classes_)
# r = np.clip(r, 0, nC - 1)
# dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
# invdist = 1 - dists
# invdist[invdist < 0] = 0
# return invdist
def decision_function(self, X):
r = self.regressor.predict(X)
nC = len(self.classes_)
dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
invdist = 1 - dists
return invdist
@property
def classes_(self):
return np.arange(self.nclasses)
def get_params(self, deep=True):
return {'alpha':self.alpha, 'class_weight': self.class_weight, 'normalize': self.normalize}
def set_params(self, **params):
self.alpha = params['alpha']
self.class_weight = params['class_weight']
self.normalize = params['normalize']
# with order-aware classifiers
# threshold-based ordinal regression (see https://pythonhosted.org/mord/)
class LogisticAT(mord.LogisticAT):
def __init__(self, alpha=1.0, class_weight=None):
assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
self.class_weight = class_weight
super(LogisticAT, self).__init__(alpha=alpha)
def fit(self, X, y, sample_weight=None):
if self.class_weight == 'balanced':
classes = sorted(np.unique(y))
class_weight = compute_class_weight('balanced', classes=classes, y=y)
sample_weight = class_weight[y]
return super(LogisticAT, self).fit(X, y, sample_weight=sample_weight)
class LogisticSE(mord.LogisticSE):
def __init__(self, alpha=1.0, class_weight=None):
assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
self.class_weight = class_weight
super(LogisticSE, self).__init__(alpha=alpha)
def fit(self, X, y, sample_weight=None):
if self.class_weight == 'balanced':
classes = sorted(np.unique(y))
class_weight = compute_class_weight('balanced', classes=classes, y=y)
sample_weight = class_weight[y]
return super(LogisticSE, self).fit(X, y, sample_weight=sample_weight)
class LogisticIT(mord.LogisticIT):
def __init__(self, alpha=1.0, class_weight=None):
assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
self.class_weight = class_weight
super(LogisticIT, self).__init__(alpha=alpha)
def fit(self, X, y, sample_weight=None):
if self.class_weight == 'balanced':
classes = sorted(np.unique(y))
class_weight = compute_class_weight('balanced', classes=classes, y=y)
sample_weight = class_weight[y]
return super(LogisticIT, self).fit(X, y, sample_weight=sample_weight)
# regression-based ordinal regression (see https://pythonhosted.org/mord/)
# class LAD(mord.LAD):
# def fit(self, X, y):
# self.classes_ = sorted(np.unique(y))
# return super().fit(X, y)
# class OrdinalRidge(mord.OrdinalRidge):
# def fit(self, X, y):
# self.classes_ = sorted(np.unique(y))
# return super().fit(X, y)

View File

@ -0,0 +1,296 @@
from copy import deepcopy
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression, Ridge
from scipy.sparse import issparse
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR, SVR
from statsmodels.miscmodels.ordinal_model import OrderedModel
import mord
from sklearn.utils.class_weight import compute_class_weight
class OrderedLogisticRegression:
def __init__(self, model='logit'):
assert model in ['logit', 'probit'], 'unknown ordered model, valid ones are logit or probit'
self.model = model
def fit(self, X, y):
if issparse(X):
self.svd = TruncatedSVD(500)
X = self.svd.fit_transform(X)
self.learner = OrderedModel(y, X, distr=self.model)
self.res_prob = self.learner.fit(method='bfgs', disp=False, skip_hessian=True)
def predict(self, X):
prob = self.predict_proba(X)
return np.argmax(prob, axis=1)
def predict_proba(self, X):
if issparse(X):
assert hasattr(self, 'svd'), \
'X matrix in predict is sparse, but the method has not been fit with sparse type'
X = self.svd.transform(X)
        return self.res_prob.model.predict(self.res_prob.params, exog=X)
class StackedClassifier: # aka Funnelling Monolingual
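    # two-level stack: the base classifier's posterior probabilities, standardized, become
    # the input features of a meta classifier trained on the same labels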
def __init__(self, base_estimator=LogisticRegression()):
if not hasattr(base_estimator, 'predict_proba'):
print('the estimator does not seem to be probabilistic: calibrating')
base_estimator = CalibratedClassifierCV(base_estimator)
# self.base = deepcopy(OneVsRestClassifier(base_estimator))
# self.meta = deepcopy(OneVsRestClassifier(base_estimator))
self.base = deepcopy(base_estimator)
self.meta = deepcopy(base_estimator)
self.norm = StandardScaler()
def fit(self, X, y):
self.base.fit(X, y)
P = self.base.predict_proba(X)
P = self.norm.fit_transform(P)
self.meta.fit(P, y)
return self
def predict(self, X):
P = self.base.predict_proba(X)
P = self.norm.transform(P)
return self.meta.predict(P)
def predict_proba(self, X):
P = self.base.predict_proba(X)
P = self.norm.transform(P)
return self.meta.predict_proba(P)
class RegressionQuantification:
def __init__(self,
base_quantifier,
regression='svr',
val_samples_generator=None,
norm=True):
self.base_quantifier = base_quantifier
if isinstance(regression, str):
assert regression in ['ridge', 'svr'], 'unknown regression model'
if regression == 'ridge':
self.reg = Ridge(normalize=norm)
elif regression == 'svr':
self.reg = MultiOutputRegressor(LinearSVR())
else:
self.reg = regression
# self.reg = MultiTaskLassoCV(normalize=norm)
# self.reg = KernelRidge(kernel='rbf')
# self.reg = LassoLarsCV(normalize=norm)
        # self.reg = MultiTaskElasticNetCV(normalize=norm) <- good
        #self.reg = LinearRegression(normalize=norm) # <- good
        # self.reg = MultiOutputRegressor(ARDRegression(normalize=norm)) # <- quite good, even without norm
        # self.reg = MultiOutputRegressor(BayesianRidge(normalize=False)) # <- quite good, even without norm
        # self.reg = MultiOutputRegressor(SGDRegressor()) # slow, does not work
self.regression = regression
self.val_samples_generator = val_samples_generator
# self.norm = StandardScaler()
# self.covs = covs
def generate_validation_samples(self):
Xs, ys = [], []
for instances, prevalence in self.val_samples_generator():
ys.append(prevalence)
Xs.append(self.base_quantifier.quantify(instances))
Xs = np.asarray(Xs)
ys = np.asarray(ys)
return Xs, ys
def fit(self, data):
print('fitting quantifier')
if data is not None:
self.base_quantifier.fit(data)
print('generating val samples')
Xs, ys = self.generate_validation_samples()
# Xs = self.norm.fit_transform(Xs)
print('fitting regressor')
self.reg.fit(Xs, ys)
print('[done]')
return self
def quantify(self, instances):
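        # correct the base quantifier's estimate with the learned regressor, then clip and
        # renormalize so the result is a valid prevalence vector (non-negative, sums to 1)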
Xs = self.base_quantifier.quantify(instances).reshape(1, -1)
# Xs = self.norm.transform(Xs)
Xs = self.reg.predict(Xs).flatten()
# Xs = self.norm.inverse_transform(Xs)
Xs = np.clip(Xs, 0, 1)
adjusted = Xs / Xs.sum()
# adjusted = np.clip(Xs, 0, 1)
return adjusted
def get_params(self, deep=True):
return self.base_quantifier.get_params()
def set_params(self, **params):
self.base_quantifier.set_params(**params)
class LAD(BaseEstimator, ClassifierMixin):
def __init__(self, C=1.0, class_weight=None):
self.C = C
self.class_weight = class_weight
def fit(self, X, y, sample_weight=None):
self.regressor = LinearSVR(C=self.C)
# self.regressor = SVR()
# self.regressor = Ridge(normalize=True)
classes = sorted(np.unique(y))
self.nclasses = len(classes)
if self.class_weight == 'balanced':
class_weight = compute_class_weight('balanced', classes=classes, y=y)
sample_weight = class_weight[y]
self.regressor.fit(X, y, sample_weight=sample_weight)
return self
def predict(self, X):
r = self.regressor.predict(X)
c = np.round(r)
c[c<0]=0
c[c>(self.nclasses-1)]=self.nclasses-1
return c.astype(int)
# def predict_proba(self, X):
# r = self.regressor.predict(X)
# nC = len(self.classes_)
# r = np.clip(r, 0, nC - 1)
# dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
# invdist = 1 - dists
# invdist[invdist < 0] = 0
# return invdist
def decision_function(self, X):
r = self.regressor.predict(X)
nC = len(self.classes_)
dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
invdist = 1 - dists
return invdist
@property
def classes_(self):
return np.arange(self.nclasses)
def get_params(self, deep=True):
return {'C':self.C, 'class_weight': self.class_weight}
def set_params(self, **params):
self.C = params['C']
self.class_weight = params['class_weight']
class OrdinalRidge(BaseEstimator, ClassifierMixin):
def __init__(self, alpha=1.0, class_weight=None, normalize=False):
self.alpha = alpha
self.class_weight = class_weight
self.normalize = normalize
def fit(self, X, y, sample_weight=None):
self.regressor = Ridge(alpha=self.alpha, normalize=self.normalize)
classes = sorted(np.unique(y))
self.nclasses = len(classes)
if self.class_weight == 'balanced':
class_weight = compute_class_weight('balanced', classes=classes, y=y)
sample_weight = class_weight[y]
self.regressor.fit(X, y, sample_weight=sample_weight)
return self
def predict(self, X):
r = self.regressor.predict(X)
c = np.round(r)
c[c<0]=0
c[c>(self.nclasses-1)]=self.nclasses-1
return c.astype(int)
# def predict_proba(self, X):
# r = self.regressor.predict(X)
# nC = len(self.classes_)
# r = np.clip(r, 0, nC - 1)
# dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
# invdist = 1 - dists
# invdist[invdist < 0] = 0
# return invdist
def decision_function(self, X):
r = self.regressor.predict(X)
nC = len(self.classes_)
dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
invdist = 1 - dists
return invdist
@property
def classes_(self):
return np.arange(self.nclasses)
def get_params(self, deep=True):
return {'alpha':self.alpha, 'class_weight': self.class_weight, 'normalize': self.normalize}
def set_params(self, **params):
self.alpha = params['alpha']
self.class_weight = params['class_weight']
self.normalize = params['normalize']
# with order-aware classifiers
# threshold-based ordinal regression (see https://pythonhosted.org/mord/)
class LogisticAT(mord.LogisticAT):
def __init__(self, alpha=1.0, class_weight=None):
assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
self.class_weight = class_weight
super(LogisticAT, self).__init__(alpha=alpha)
def fit(self, X, y, sample_weight=None):
if self.class_weight == 'balanced':
classes = sorted(np.unique(y))
class_weight = compute_class_weight('balanced', classes=classes, y=y)
sample_weight = class_weight[y]
return super(LogisticAT, self).fit(X, y, sample_weight=sample_weight)
class LogisticSE(mord.LogisticSE):
def __init__(self, alpha=1.0, class_weight=None):
assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
self.class_weight = class_weight
super(LogisticSE, self).__init__(alpha=alpha)
def fit(self, X, y, sample_weight=None):
if self.class_weight == 'balanced':
classes = sorted(np.unique(y))
class_weight = compute_class_weight('balanced', classes=classes, y=y)
sample_weight = class_weight[y]
return super(LogisticSE, self).fit(X, y, sample_weight=sample_weight)
class LogisticIT(mord.LogisticIT):
def __init__(self, alpha=1.0, class_weight=None):
assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
self.class_weight = class_weight
super(LogisticIT, self).__init__(alpha=alpha)
def fit(self, X, y, sample_weight=None):
if self.class_weight == 'balanced':
classes = sorted(np.unique(y))
class_weight = compute_class_weight('balanced', classes=classes, y=y)
sample_weight = class_weight[y]
return super(LogisticIT, self).fit(X, y, sample_weight=sample_weight)
# regression-based ordinal regression (see https://pythonhosted.org/mord/)
# class LAD(mord.LAD):
# def fit(self, X, y):
# self.classes_ = sorted(np.unique(y))
# return super().fit(X, y)
# class OrdinalRidge(mord.OrdinalRidge):
# def fit(self, X, y):
# self.classes_ = sorted(np.unique(y))
# return super().fit(X, y)

View File

@ -0,0 +1,51 @@
import numpy as np
import quapy as qp
from evaluation import nmd
from Ordinal.utils import load_samples_folder, load_single_sample_pkl
from quapy.data import LabelledCollection
import pickle
import os
from os.path import join
from tqdm import tqdm
"""
This script generates a partition of a dataset in terms of "shift".
The partition is materialized only as index vectors (the samples themselves are not copied).
"""
def partition_by_drift(split, training_prevalence):
assert split in ['dev', 'test'], 'invalid split name'
total=1000 if split=='dev' else 5000
drifts = []
folderpath = join(datapath, domain, 'app', f'{split}_samples')
for sample in tqdm(load_samples_folder(folderpath, load_fn=load_single_sample_pkl), total=total):
drifts.append(nmd(training_prevalence, sample.prevalence()))
drifts = np.asarray(drifts)
order = np.argsort(drifts)
nD = len(order)
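    # split the sample indices, sorted by increasing drift, into equal-size low/mid/high terciles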
low_drift, mid_drift, high_drift = order[:nD // 3], order[nD // 3:2 * nD // 3], order[2 * nD // 3:]
all_drift = np.arange(nD)
np.save(join(datapath, domain, 'app', f'lowdrift.{split}.id.npy'), low_drift)
np.save(join(datapath, domain, 'app', f'middrift.{split}.id.npy'), mid_drift)
np.save(join(datapath, domain, 'app', f'highdrift.{split}.id.npy'), high_drift)
np.save(join(datapath, domain, 'app', f'alldrift.{split}.id.npy'), all_drift)
lows = drifts[low_drift]
mids = drifts[mid_drift]
highs = drifts[high_drift]
all = drifts[all_drift]
print(f'low drift: interval [{lows.min():.4f}, {lows.max():.4f}] mean: {lows.mean():.4f}')
print(f'mid drift: interval [{mids.min():.4f}, {mids.max():.4f}] mean: {mids.mean():.4f}')
print(f'high drift: interval [{highs.min():.4f}, {highs.max():.4f}] mean: {highs.mean():.4f}')
print(f'all drift: interval [{all.min():.4f}, {all.max():.4f}] mean: {all.mean():.4f}')
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-posteriors'
datapath = './data'
training = pickle.load(open(join(datapath,domain,'training_data.pkl'), 'rb'))
partition_by_drift('dev', training.prevalence())
partition_by_drift('test', training.prevalence())

View File

@ -0,0 +1,41 @@
import numpy as np
from Ordinal.evaluation import smoothness
from Ordinal.utils import load_samples_folder, load_single_sample_pkl
from os.path import join
from tqdm import tqdm
"""
This script generates a partition of a dataset in terms of "smoothness".
The partition is materialized only as index vectors (the samples themselves are not copied).
"""
def partition_by_smoothness(split):
assert split in ['dev', 'test'], 'invalid split name'
total=1000 if split=='dev' else 5000
smooths = []
folderpath = join(datapath, domain, 'app', f'{split}_samples')
for sample in tqdm(load_samples_folder(folderpath, load_fn=load_single_sample_pkl), total=total):
smooths.append(smoothness(sample.prevalence()))
smooths = np.asarray(smooths)
order = np.argsort(smooths)
nD = len(order)
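    # five equal-size blocks of sample indices, ordered from smoothest to most jagged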
low2high_smooth = np.array_split(order, 5)
all_drift = np.arange(nD)
for i, smooth_idx in enumerate(low2high_smooth):
block = smooths[smooth_idx]
print(f'smooth block {i}: shape={smooth_idx.shape}, interval=[{block.min()}, {block.max()}] mean={block.mean()}')
np.save(join(datapath, domain, 'app', f'smooth{i}.{split}.id.npy'), smooth_idx)
np.save(join(datapath, domain, 'app', f'all.{split}.id.npy'), all_drift)
#domain = 'Books-tfidf'
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
datapath = './data'
#training = pickle.load(open(join(datapath,domain,'training_data.pkl'), 'rb'))
partition_by_smoothness('dev')
partition_by_smoothness('test')

View File

@ -0,0 +1,51 @@
import quapy as qp
from quapy.data import LabelledCollection
from sklearn.feature_extraction.text import TfidfVectorizer
from os.path import join
import os
import pickle
from utils import *
from tqdm import tqdm
import shutil
"""
This script preprocesses the raw Amazon-OQ-BK dataset, converting it into dense vectors
extracted from a pretrained model (here, RoBERTa fine-tuned on the training set).
Three vector-generation modes are available: posteriors, last, average.
"""
vector_generation = 'posteriors'
datapath = './data'
domain = f'Books-roberta-base-finetuned/checkpoint-1188-{vector_generation}'
outname = domain.replace('-finetuned', '-finetuned-pkl')
protocol = 'app'
print('pickling npy txt files')
print('from:', join(datapath, domain))
print('to', join(datapath, outname))
print('for protocol:', protocol)
os.makedirs(join(datapath, outname), exist_ok=True)
os.makedirs(join(datapath, outname, protocol), exist_ok=True)
os.makedirs(join(datapath, outname, protocol, 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, protocol, 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, protocol, 'dev_prevalences.txt'), join(datapath, outname, protocol, 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, protocol, 'test_prevalences.txt'), join(datapath, outname, protocol, 'test_prevalences.txt'))
train = load_simple_sample_npytxt(join(datapath, domain), 'training_data', classes=np.arange(5))
pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
def transform_folder_samples(protocol, splitname):
folder_dir=join(datapath, domain, protocol, splitname)
for i, sample in tqdm(enumerate(load_samples_folder(folder_dir, filter=None, load_fn=load_simple_sample_npytxt, classes=train.classes_))):
pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
transform_folder_samples(protocol, 'dev_samples')
transform_folder_samples(protocol, 'test_samples')

View File

@ -0,0 +1,62 @@
import quapy as qp
from Ordinal.utils import load_simple_sample_raw, load_samples_raw
from quapy.data import LabelledCollection
from sklearn.feature_extraction.text import TfidfVectorizer
from os.path import join
import os
import pickle
from tqdm import tqdm
import shutil
"""
This script preprocesses the raw Amazon-OQ-BK dataset, converting it into tfidf vectors.
"""
datapath = './data'
domain = 'Books'
outname = domain + '-tfidf'
def save_preprocessing_info(transformer):
with open(join(datapath, outname, 'prep-info.txt'), 'wt') as foo:
foo.write(f'{str(transformer)}\n')
os.makedirs(join(datapath, outname), exist_ok=True)
os.makedirs(join(datapath, outname, 'app'), exist_ok=True)
os.makedirs(join(datapath, outname, 'app', 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, 'app', 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, 'app', 'dev_prevalences.txt'), join(datapath, outname, 'app', 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, 'app', 'test_prevalences.txt'), join(datapath, outname, 'app', 'test_prevalences.txt'))
os.makedirs(join(datapath, outname, 'real'), exist_ok=True)
os.makedirs(join(datapath, outname, 'real', 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, 'real', 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, 'real', 'dev_prevalences.txt'), join(datapath, outname, 'real', 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, 'real', 'test_prevalences.txt'), join(datapath, outname, 'real', 'test_prevalences.txt'))
tfidf = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), min_df=5)
train = LabelledCollection.load(join(datapath, domain, 'training_data.txt'), loader_func=qp.data.reader.from_text)
train.instances = tfidf.fit_transform(train.instances)
save_preprocessing_info(tfidf)
pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
def transform_folder_samples(protocol, splitname):
for i, sample in tqdm(enumerate(load_samples_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))):
sample.instances = tfidf.transform(sample.instances)
pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
transform_folder_samples('app', 'dev_samples')
transform_folder_samples('app', 'test_samples')
transform_folder_samples('real', 'dev_samples')
transform_folder_samples('real', 'test_samples')

374
Ordinal/tabular.py Normal file
View File

@ -0,0 +1,374 @@
import numpy as np
import itertools
from scipy.stats import ttest_ind_from_stats, wilcoxon
class Table:
VALID_TESTS = [None, "wilcoxon", "ttest"]
def __init__(self, benchmarks, methods, lower_is_better=True, significance_test='wilcoxon', prec_mean=3,
clean_zero=False, show_std=False, prec_std=3, average=True, missing=None, missing_str='--',
color=True, show_rel_to=-1):
assert significance_test in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'
self.benchmarks = np.asarray(benchmarks)
self.benchmark_index = {row: i for i, row in enumerate(benchmarks)}
self.methods = np.asarray(methods)
self.method_index = {col: j for j, col in enumerate(methods)}
self.map = {}
# keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
self._addmap('values', dtype=object)
self.lower_is_better = lower_is_better
self.ttest = significance_test
self.prec_mean = prec_mean
self.clean_zero = clean_zero
self.show_std = show_std
self.prec_std = prec_std
self.add_average = average
self.missing = missing
self.missing_str = missing_str
self.color = color
self.show_rel_to = show_rel_to
self.touch()
@property
def nbenchmarks(self):
return len(self.benchmarks)
@property
def nmethods(self):
return len(self.methods)
def touch(self):
self._modif = True
def update(self):
if self._modif:
self.compute()
def _getfilled(self):
return np.argwhere(self.map['fill'])
@property
def values(self):
return self.map['values']
def _indexes(self):
return itertools.product(range(self.nbenchmarks), range(self.nmethods))
def _addmap(self, map, dtype, func=None):
self.map[map] = np.empty((self.nbenchmarks, self.nmethods), dtype=dtype)
if func is None:
return
m = self.map[map]
f = func
indexes = self._indexes() if map == 'fill' else self._getfilled()
for i, j in indexes:
m[i, j] = f(self.values[i, j])
def _addrank(self):
for i in range(self.nbenchmarks):
filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
ranked_cols_idx = filled_cols_idx[np.argsort(col_means)]
if not self.lower_is_better:
ranked_cols_idx = ranked_cols_idx[::-1]
self.map['rank'][i, ranked_cols_idx] = np.arange(1, len(filled_cols_idx) + 1)
def _addcolor(self):
for i in range(self.nbenchmarks):
filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
if filled_cols_idx.size == 0:
continue
col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
minval = min(col_means)
maxval = max(col_means)
for col_idx in filled_cols_idx:
val = self.map['mean'][i, col_idx]
norm = (maxval - minval)
if norm > 0:
normval = (val - minval) / norm
else:
normval = 0.5
if self.lower_is_better:
normval = 1 - normval
self.map['color'][i, col_idx] = color_red2green_01(normval)
def _run_ttest(self, row, col1, col2):
mean1 = self.map['mean'][row, col1]
std1 = self.map['std'][row, col1]
nobs1 = self.map['nobs'][row, col1]
mean2 = self.map['mean'][row, col2]
std2 = self.map['std'][row, col2]
nobs2 = self.map['nobs'][row, col2]
_, p_val = ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2)
return p_val
def _run_wilcoxon(self, row, col1, col2):
values1 = self.map['values'][row, col1]
values2 = self.map['values'][row, col2]
_, p_val = wilcoxon(values1, values2)
return p_val
def _add_statistical_test(self):
if self.ttest is None:
return
self.some_similar = [False] * self.nmethods
for i in range(self.nbenchmarks):
filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
if len(filled_cols_idx) <= 1:
continue
col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
best_pos = filled_cols_idx[np.argmin(col_means)]
for j in filled_cols_idx:
if j == best_pos:
continue
if self.ttest == 'ttest':
p_val = self._run_ttest(i, best_pos, j)
else:
p_val = self._run_wilcoxon(i, best_pos, j)
pval_outcome = pval_interpretation(p_val)
self.map['ttest'][i, j] = pval_outcome
if pval_outcome != 'Diff':
self.some_similar[j] = True
def compute(self):
self._addmap('fill', dtype=bool, func=lambda x: x is not None)
self._addmap('mean', dtype=float, func=np.mean)
self._addmap('std', dtype=float, func=np.std)
self._addmap('nobs', dtype=float, func=len)
self._addmap('rank', dtype=int, func=None)
self._addmap('color', dtype=object, func=None)
self._addmap('ttest', dtype=object, func=None)
self._addmap('latex', dtype=object, func=None)
self._addrank()
self._addcolor()
self._add_statistical_test()
if self.add_average:
self._addave()
self._modif = False
def _is_column_full(self, col):
return all(self.map['fill'][:, self.method_index[col]])
def _addave(self):
ave = Table(['ave'], self.methods, lower_is_better=self.lower_is_better, significance_test=self.ttest, average=False,
missing=self.missing, missing_str=self.missing_str, prec_mean=self.prec_mean, prec_std=self.prec_std,
show_std=self.show_std)
for col in self.methods:
values = None
if self._is_column_full(col):
if self.ttest == 'ttest':
values = np.asarray(self.map['mean'][:, self.method_index[col]])
else: # wilcoxon
values = np.concatenate(self.values[:, self.method_index[col]])
ave.add('ave', col, values)
self.average = ave
def add(self, benchmark, method, values):
if values is not None:
values = np.asarray(values)
if values.ndim == 0:
values = values.flatten()
rid, cid = self._coordinates(benchmark, method)
if self.map['values'][rid, cid] is None:
self.map['values'][rid, cid] = values
elif values is not None:
self.map['values'][rid, cid] = np.concatenate([self.map['values'][rid, cid], values])
self.touch()
def get(self, benchmark, method, attr='mean'):
self.update()
        assert attr in self.map, f'unknown attribute {attr}'
rid, cid = self._coordinates(benchmark, method)
if self.map['fill'][rid, cid]:
v = self.map[attr][rid, cid]
if v is None or (isinstance(v, float) and np.isnan(v)):
return self.missing
return v
else:
return self.missing
def _coordinates(self, benchmark, method):
assert benchmark in self.benchmark_index, f'benchmark {benchmark} out of range'
assert method in self.method_index, f'method {method} out of range'
rid = self.benchmark_index[benchmark]
cid = self.method_index[method]
return rid, cid
def get_average(self, method, attr='mean'):
self.update()
if self.add_average:
return self.average.get('ave', method, attr=attr)
return None
def get_color(self, benchmark, method):
color = self.get(benchmark, method, attr='color')
if color is None:
return ''
return color
def latexCell(self, benchmark, method):
self.update()
i, j = self._coordinates(benchmark, method)
if self.map['fill'][i, j] == False:
return self.missing_str
mean = self.map['mean'][i, j]
l = f" {mean:.{self.prec_mean}f}"
if self.clean_zero:
l = l.replace(' 0.', '.')
isbest = self.map['rank'][i, j] == 1
if self.ttest is not None: # and self.some_similar[j]:
test_label = self.map['ttest'][i, j]
if test_label in ['Sim', 'Same']:
isbest = True
if isbest:
l = "\\textbf{" + l.strip() + "}\;"
else:
l += '\; '
stat = ''
# this is commented because we are putting in textbf all results that are similar to the best one
# if self.ttest is not None: # and self.some_similar[j]:
# test_label = self.map['ttest'][i, j]
# if test_label == 'Sim':
# stat = '^{\dag\phantom{\dag}}'
# elif test_label == 'Same':
# stat = '^{\ddag}'
# elif isbest or test_label == 'Diff':
# stat = '^{\phantom{\ddag}}'
std = ''
if self.show_std:
std = self.map['std'][i, j]
std = f" {std:.{self.prec_std}f}"
if self.clean_zero:
std = std.replace(' 0.', '.')
std = f" \pm {std:{self.prec_std}}"
relto = ''
if self.show_rel_to != -1:
if j != self.show_rel_to:
ref_ave = self.map['mean'][i, self.show_rel_to]
rel = 100*(mean-ref_ave)/ref_ave
if abs(rel) < 0.1:
relto=f'(\\approx)'
else:
plussign = '+' if rel>0 else '' # already plugs the '-' sign
relto=f'({plussign}{rel:.1f}\%)'
std = ''
if stat != '' or std != '' or relto != '':
l = f'{l}${stat}{std}{relto}$'
if self.color:
l += ' ' + self.map['color'][i, j]
return l
def latexTabular(self, benchmark_replace={}, method_replace={}, average=True):
tab = ' & '
tab += ' & '.join([method_replace.get(col, col) for col in self.methods])
tab += ' \\\\\hline\n'
for row in self.benchmarks:
rowname = benchmark_replace.get(row, row)
tab += rowname + ' & '
tab += self.latexRow(row)
if average:
tab += '\hline\n'
tab += 'Average & '
tab += self.latexAverage()
return tab
def latexTabularT(self, benchmark_replace={}, method_replace={}, average=True, side=False):
def withside(label):
return '\side{'+label+'}' if side else label
def center(label):
return '\multicolumn{1}{c}{'+label+'}'
tab = ' & '
tab += ' & '.join([center(withside(benchmark_replace.get(col, col))) for col in self.benchmarks])
if average:
tab += ' & ' + withside('Ave')
# tab += ' \\\\\hline\n'
tab += ' \\\\\midrule\n'
for row in self.methods:
rowname = method_replace.get(row, row)
tab += rowname + ' & '
tab += self.latexRowT(row, endl='')
if average:
tab += ' & '
tab += self.average.latexCell('ave', row)
# tab += '\\\\\hline\n'
tab += '\\\\\n'
tab += '\\bottomrule'
return tab
def latexRow(self, benchmark, endl='\\\\\hline\n'):
s = [self.latexCell(benchmark, col) for col in self.methods]
s = ' & '.join(s)
s += ' ' + endl
return s
def latexRowT(self, method, endl='\\\\\hline\n'):
s = [self.latexCell(benchmark, method) for benchmark in self.benchmarks]
s = ' & '.join(s)
s += ' ' + endl
return s
def latexAverage(self, endl='\\\\\hline\n'):
if self.add_average:
return self.average.latexRow('ave', endl=endl)
def getRankTable(self):
t = Table(benchmarks=self.benchmarks, methods=self.methods, prec_mean=0, average=True)
for rid, cid in self._getfilled():
row = self.benchmarks[rid]
col = self.methods[cid]
t.add(row, col, self.get(row, col, 'rank'))
t.compute()
return t
def dropMethods(self, methods):
drop_index = [self.method_index[m] for m in methods]
new_methods = np.delete(self.methods, drop_index)
new_index = {col: j for j, col in enumerate(new_methods)}
self.map['values'] = self.values[:, np.asarray([self.method_index[m] for m in new_methods], dtype=int)]
self.methods = new_methods
self.method_index = new_index
self.touch()
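# Minimal usage sketch (illustrative only; benchmark/method names and scores are made up):
# table = Table(benchmarks=['low', 'mid', 'high'], methods=['CC', 'SLD'])
# table.add('low', 'CC', [0.12, 0.10, 0.11])   # one array of per-sample scores per cell
# table.add('low', 'SLD', [0.08, 0.09, 0.07])
# print(table.latexTabular())                  # means, ranks, colors, significance marks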
def pval_interpretation(p_val):
if 0.005 >= p_val:
return 'Diff'
elif 0.05 >= p_val > 0.005:
return 'Sim'
elif p_val > 0.05:
return 'Same'
def color_red2green_01(val, maxtone=50):
if np.isnan(val): return None
assert 0 <= val <= 1, f'val {val} out of range [0,1]'
# rescale to [-1,1]
val = val * 2 - 1
if val < 0:
color = 'red'
tone = maxtone * (-val)
else:
color = 'green'
tone = maxtone * val
return '\cellcolor{' + color + f'!{int(tone)}' + '}'
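# e.g., color_red2green_01(0.9) returns '\cellcolor{green!40}'; rendering the cell color
# requires the LaTeX colortbl/xcolor packages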

View File

@ -0,0 +1,78 @@
import gzip
import os
from collections import Counter
from Ordinal.utils import jaggedness
import pickle
import numpy as np
import pandas as pd
nrows = 3
ncols = 4
prevalences = np.genfromtxt('fact_real_prevalences.csv', delimiter=',')[1:]
#prevalences = prevalences[:nrows*ncols]
print(prevalences)
n = prevalences.shape[1]
class_smooth = []
for i, sample in enumerate(prevalences):
p = sample
smooth = jaggedness(p)
class_smooth.append([smooth, f'Sample {i+1}', p])
# these two lines pick nrows*ncols examples ranging from the least jagged to the most jagged,
# in equal steps
class_smooth = sorted(class_smooth)
class_smooth = class_smooth[::len(class_smooth)//(nrows*ncols)]
class_smooth = class_smooth[:nrows*ncols]
# print(class_smooth)
# print(len(class_smooth))
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme('paper')
sns.set_style('dark')
sns.set(font_scale=0.5)
maxy = np.max(prevalences) + 0.1
class_labels = np.arange(1,n+1)
figure, axis = plt.subplots(nrows, ncols, figsize=(ncols*2, nrows))
for i, (smooth, category, prevalence) in enumerate(class_smooth):
row = i // ncols
col = i % ncols
# print(i, row, col)
#axis[row, col].bar(list(range(1,n+1)), prevalence, width=1)
axis[row, col].bar(class_labels, prevalence, width=1)
axis[row, col].set_ylim(0, maxy)
axis[row, col].set_facecolor('white')
for spine in axis[row, col].spines.values():
spine.set_edgecolor('black')
spine.set_linewidth(0.3)
if row==nrows-1:
axis[row, col].set_xlabel("energy bin")
axis[row, col].set_xticks(class_labels)
else:
axis[row, col].set_xlabel("")
axis[row, col].set_xticks([])
axis[row, col].set_ylabel("")
axis[row, col].set_yticks([])
category = category.replace('_', ' ').title()
category = category.replace(' And ', ' & ')
axis[row, col].set_title(f'{category} ({smooth:.4f})', x=0.5, y=0.75)
# axis[row, col].set_title
print(smooth, category, prevalence)
# plt.show()
plt.subplots_adjust(wspace=0, hspace=0)
plt.savefig('Telescope_sample_plotgrid.pdf', bbox_inches='tight')

View File

@ -0,0 +1,13 @@
import pickle
target_file = './counters_Amazon_merchandise.pkl'
(categories, counters) = pickle.load(open(target_file, 'rb'))
print(categories)
print(counters)
with open('categories.txt', 'wt') as foo:
for counter, category in zip(counters, categories):
foo.write(f'{category}\t{counter["1"]}\t{counter["2"]}\t{counter["3"]}\t{counter["4"]}\t{counter["5"]}\n')

67
Ordinal/utils.py Normal file
View File

@ -0,0 +1,67 @@
import numpy as np
from glob import glob
from json import load
import os
from os.path import join
import pickle
import pandas as pd
import csv
import datasets
from datasets import Dataset
import quapy as qp
from quapy.data import LabelledCollection
def jaggedness(p):
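    # sum of squared discrete second differences of p, normalized by min(6, len(p)+1);
    # 0 for a linear (perfectly smooth) profile, larger for more jagged ones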
return (1/min(6, len(p)+1)) * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))
def load_simple_sample_npytxt(parentdir, filename, classes=None):
samplepath = join(parentdir, filename+'.txt')
yX = np.loadtxt(samplepath)
X = yX[:,1:]
y = yX[:,0].astype(np.int32)
return LabelledCollection(instances=X, labels=y, classes_=classes)
def load_simple_sample_raw(parentdir, filename, classes=None):
samplepath = join(parentdir, filename+'.txt')
return LabelledCollection.load(samplepath, loader_func=qp.data.reader.from_text, classes=classes)
def load_single_sample_as_csv(parentdir, filename):
samplepath = join(parentdir, filename+'.txt')
df = pd.read_csv(samplepath, sep='\t', names=['labels', 'review'], quoting=csv.QUOTE_NONE)
labels = df.pop('labels').to_frame()
features = datasets.Features({'review': datasets.Value('string')})
sample = Dataset.from_pandas(df=df, features=features)
return sample, labels
def load_single_sample_pkl(parentdir, filename):
return pickle.load(open(join(parentdir, filename+'.pkl'), 'rb'))
# def load_samples_npytxt(path_dir, filter=None, classes=None):
# return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_npytxt)
def load_samples_raw(path_dir, filter=None, classes=None):
return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_raw, classes=classes)
# def load_samples_as_csv(path_dir, filter=None):
# return load_samples_folder(path_dir, filter, load_fn=load_single_sample_as_csv)
# def load_samples_pkl(path_dir, filter=None):
# return load_samples_folder(path_dir, filter, load_fn=load_single_sample_pkl)
def load_samples_folder(path_dir, filter=None, load_fn=None, **load_fn_kwargs):
nsamples = len(glob(join(path_dir, f'*')))
for id in range(nsamples):
if (filter is None) or id in filter:
yield load_fn(path_dir, f'{id}', **load_fn_kwargs)

View File

@ -1,9 +1,10 @@
import numpy as np
from scipy.sparse import dok_matrix
from tqdm import tqdm
from time import time
def from_text(path, encoding='utf-8', verbose=1, class2int=True):
def from_text(path, encoding='utf-8', verbose=0, class2int=True):
"""
    Reads a labelled collection of documents.
    File format: <0 or 1>\t<document>\n

View File

@ -183,7 +183,7 @@ def _training_helper(learner,
if not hasattr(learner, 'predict_proba'):
print(f'The learner {learner.__class__.__name__} does not seem to be probabilistic. '
f'The learner will be calibrated.')
learner = CalibratedClassifierCV(learner, cv=5)
learner = CalibratedClassifierCV(learner, cv=5, ensemble=True)
if val_split is not None:
if isinstance(val_split, float):
if not (0 < val_split < 1):
@ -470,7 +470,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
def fit(self, data: LabelledCollection, fit_learner=True):
self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
self.train_prevalence = F.prevalence_from_labels(data.labels, data.classes_)
return self
def aggregate(self, classif_posteriors, epsilon=EPSILON):