forked from moreo/QuaPy
Compare commits
16 Commits
|
@ -0,0 +1,52 @@
|
|||
import gzip
|
||||
import os
|
||||
import sys
|
||||
from collections import Counter
|
||||
from Ordinal.utils import jaggedness
|
||||
import pickle
|
||||
import numpy as np
|
||||
|
||||
# Plots, side by side, the per-class average prevalence (standard deviation as error bars) of the
# Amazon Books samples and of the FACT samples. The title of each panel reports the mean jaggedness
# of the corresponding samples. The figure is saved as 'prevalence_averages.pdf'.
amazon = np.genfromtxt('prevalence_votes1_reviews100.csv', delimiter='\t')
telescope = np.genfromtxt('fact_real_prevalences.csv', delimiter=',')[1:]  # [1:] discards the first row

nclasses_amazon = amazon.shape[1]
nclasses_telescope = telescope.shape[1]

jags_amazon = np.asarray([jaggedness(p) for p in amazon])
jags_telescope = np.asarray([jaggedness(p) for p in telescope])

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns

sns.set_theme('paper')
sns.set_style('dark')
sns.set(font_scale=0.7)

# figure, axis = plt.subplots(1, 2, figsize=(8, 7))
ymax = 0.75

figure(figsize=(8, 4), dpi=300)

# one entry per panel: (samples, number of classes, x-axis label, title)
panels = [
    (amazon, nclasses_amazon, "stars", f'Amazon Books ({jags_amazon.mean():.4f})'),
    (telescope, nclasses_telescope, "energy bin", f'FACT Samples ({jags_telescope.mean():.4f})'),
]
for position, (prevalences, nclasses, xlabel, title) in enumerate(panels, start=1):
    ax = plt.subplot(1, 2, position)
    classes = np.arange(1, nclasses+1)
    plt.bar(classes, np.mean(prevalences, axis=0), yerr=np.std(prevalences, axis=0), width=1)
    ax.set_ylim(0, ymax)  # same y-range in both panels
    ax.set_xlabel(xlabel)
    ax.set_xticks(classes)
    ax.set_title(title)

plt.subplots_adjust(wspace=0.1, hspace=0)
plt.savefig('prevalence_averages.pdf', bbox_inches='tight')
|
||||
|
||||
|
|
@ -0,0 +1,43 @@
|
|||
import gzip
|
||||
import os
|
||||
import sys
|
||||
from collections import Counter
|
||||
from Ordinal.utils import jaggedness
|
||||
import pickle
|
||||
import numpy as np
|
||||
|
||||
# Plots the values read from 'fact_expectation.txt' as a bar chart (one bar per energy bin), annotates
# every bar with its value, and reports the jaggedness of the vector in the title. The figure is saved
# as 'telescope_prevalence.pdf'.
telescope = np.genfromtxt('fact_expectation.txt')
nclasses_telescope = len(telescope)

jag = jaggedness(telescope)
print(jag)

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns

sns.set_theme('paper')
sns.set_style('dark')
sns.set(font_scale=0.7)

# figure, axis = plt.subplots(1, 2, figsize=(8, 7))
ymax = 0.4

figure(figsize=(8, 4), dpi=300)

ax = plt.subplot(1, 1, 1)
classes = np.arange(1, nclasses_telescope+1)
plt.bar(classes, telescope, width=1)
# ax.bar_label(telescope)
ax.set_ylim(0, ymax)
ax.set_xlabel("energy bin")
ax.set_xticks(classes)
ax.set_title(f'FACT data ({jag:.4f})')

# bars are centered at index+1 and have width 1, so index+0.56 falls just right of the bar's left edge;
# the label is placed slightly above the top of the bar
for bar_index, bar_height in enumerate(telescope):
    label = f"{bar_height:.4f}"
    plt.text(x=bar_index+0.56, y=bar_height+0.005, s=label)

plt.subplots_adjust(wspace=0.1, hspace=0)
plt.savefig('telescope_prevalence.pdf', bbox_inches='tight')
|
||||
|
||||
|
|
@ -0,0 +1,136 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
from tqdm import tqdm
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
# this script computes the distribution of smoothness/sharpness of the books;
|
||||
# both considering all books together and considering different groups of books' reviews (by product)
|
||||
|
||||
# filein='/media/moreo/Volume/Datasets/Amazon/raw/Gift_Cards.json.gz'
|
||||
# df = pd.read_json(filein, lines=True, compression='gzip')
|
||||
|
||||
read_meta = True
|
||||
|
||||
|
||||
def prepare_vote_field(df):
    """
    Converts the 'vote' column of a reviews dataframe into a numeric column.

    Missing votes are counted as 0, and thousands separators (e.g., '1,234') are removed before the
    conversion. The column is replaced in the dataframe received as argument.

    :param df: a pandas DataFrame with a 'vote' column
    :return: the same dataframe, with a numeric 'vote' column
    """
    votes = df['vote'].astype(object)
    votes = votes.where(votes.notna(), '0')
    # go through str(): depending on how the column was parsed, some values may be numbers rather than
    # strings, and those do not have a textual `replace`
    votes = votes.map(lambda v: str(v).replace(',', ''))
    df['vote'] = pd.to_numeric(votes)
    return df
|
||||
|
||||
|
||||
def read_from_huge_json(filein):
    """
    Loads a reviews file in JSON-lines format (one JSON object per line) as a dataframe, discards the
    columns that are not needed for the analysis, and converts the 'vote' column to numeric.

    :param filein: path to the JSON-lines file
    :return: a pandas DataFrame with the remaining columns and a numeric 'vote' column
    """
    unused_columns = [
        'verified', 'reviewTime', 'reviewerID', 'style', 'reviewerName', 'reviewText', 'summary', 'unixReviewTime',
        'image'
    ]
    df = pd.read_json(filein, lines=True)
    # errors='ignore': a column that never appears in the file is simply not there to be dropped, and
    # should not abort the loading of a huge file
    df.drop(columns=unused_columns, inplace=True, errors='ignore')

    df = prepare_vote_field(df)
    return df
|
||||
|
||||
|
||||
def read_from_metadata(filein):
    """
    Loads the reviews metadata from a csv file and makes sure the 'vote' column is numeric.

    :param filein: path (or file-like object) of the csv file; it must contain a 'vote' column
    :return: a pandas DataFrame with a numeric 'vote' column
    """
    table = pd.read_csv(filein)
    numeric_votes = pd.to_numeric(table['vote'])
    table['vote'] = numeric_votes
    return table
|
||||
|
||||
|
||||
def filter_by_vote(df, vote_threshold=1):
    """
    Keeps only the rows with at least `vote_threshold` votes, and removes the 'vote' column.

    The dataframe received as argument is not modified.

    :param df: a pandas DataFrame with a numeric 'vote' column
    :param vote_threshold: minimum number of votes (inclusive) a row needs in order to be kept
    :return: a new DataFrame with the selected rows and without the 'vote' column
    """
    selected = df[df['vote'] >= vote_threshold]
    # a non-inplace drop returns a new frame; dropping in place on the filtered selection would
    # operate on a slice of `df` (SettingWithCopyWarning)
    return selected.drop(columns=['vote'])
|
||||
|
||||
|
||||
if read_meta:
|
||||
filein = '/media/moreo/Volume/Datasets/Amazon/meta/Books.csv'
|
||||
readfn = read_from_metadata
|
||||
else:
|
||||
filein='/media/moreo/Volume/Datasets/Amazon/raw/Books.json'
|
||||
readfn = read_from_huge_json
|
||||
|
||||
votes_support=9
|
||||
|
||||
df = readfn(filein)
|
||||
|
||||
num_entries = len(df)
|
||||
# df = prepare_vote_field(df)
|
||||
df = filter_by_vote(df, vote_threshold=votes_support)
|
||||
num_entries_with_vote = len(df)
|
||||
|
||||
unique_product_ids = df['asin'].unique()
|
||||
num_products = len(unique_product_ids)
|
||||
|
||||
print(df.columns)
|
||||
print(f'num rows {len(df)} (before vote-thresholding {num_entries}, after thresholding {num_entries_with_vote})')
|
||||
print(f'num unique products {num_products}')
|
||||
|
||||
|
||||
# df = df.groupby(df['asin'])
|
||||
|
||||
def not_smoothness(p):
    """
    Sharpness (i.e., lack of smoothness) of a vector of prevalence values: half the sum of the squared
    second-order differences (-p[i-1] + 2*p[i] - p[i+1]) computed at every inner position i.

    :param p: sequence of prevalence values
    :return: the sharpness value; 0 for vectors with fewer than 3 elements
    """
    accum = 0
    for i in range(1, len(p) - 1):
        second_diff = -p[i - 1] + 2*p[i] - p[i + 1]
        accum += second_diff**2
    return 0.5 * accum
|
||||
|
||||
|
||||
# pass to dictionaries
|
||||
df = df.reset_index() # make sure indexes pair with number of rows
|
||||
|
||||
ids = df['asin'].values
|
||||
overalls = df['overall'].values
|
||||
|
||||
allbooks_prev = np.histogram(overalls, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0]
|
||||
allbooks_sharpness = not_smoothness(allbooks_prev)
|
||||
print(f'all books prev={allbooks_prev} has sharpness {allbooks_sharpness:.4f}')
|
||||
|
||||
import sys
|
||||
sys.exit(0)
|
||||
|
||||
# Defining a dict
|
||||
d = defaultdict(list)
|
||||
for i, id in tqdm(enumerate(ids), total=len(ids), desc='passing to dictionary'):
|
||||
d[id].append(overalls[i])
|
||||
|
||||
|
||||
by_review_support = []
|
||||
by_review_support_label = []
|
||||
for reviews_support in [50, 100, 1]:
|
||||
sharpness_all = []
|
||||
num_products_with_reviews = 0
|
||||
for product_id, ratings in tqdm(d.items(), total=len(d), desc='processing histograms'):
|
||||
# ratings = df[df["asin"] == product_id]["overall"].values
|
||||
n_ratings = len(ratings)
|
||||
if n_ratings >= reviews_support:
|
||||
# print(product_id, ratings)
|
||||
prev = np.histogram(ratings, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0]
|
||||
sharpness = not_smoothness(prev)
|
||||
# print(prev, sharpness)
|
||||
sharpness_all.append(sharpness)
|
||||
num_products_with_reviews+=1
|
||||
by_review_support.append(sharpness_all)
|
||||
by_review_support_label.append(f'>{reviews_support}')
|
||||
|
||||
print(f'#votes-support (min number of votes): {votes_support}')
|
||||
print(f'#reviews with >#votes-support: {num_entries_with_vote}/{num_entries}={100*num_entries_with_vote/num_entries:.2f}%')
|
||||
|
||||
print(f'#reviews-support (min number of reviews): {reviews_support}')
|
||||
print(f'#products with >#reviews-support: {num_products_with_reviews}/{num_products}={100*num_products_with_reviews/num_products:.2f}%')
|
||||
|
||||
q05 = np.percentile(sharpness_all, 5)
|
||||
q25 = np.percentile(sharpness_all, 25)
|
||||
q50 = np.percentile(sharpness_all, 50)
|
||||
q75 = np.percentile(sharpness_all, 75)
|
||||
q95 = np.percentile(sharpness_all, 95)
|
||||
print(f'{q05:.5f}\t{q25:.5f}\t{q50:.5f}\t{q75:.5f}\t{q95:.5f}')
|
||||
print(f'ave={np.mean(sharpness_all):.5f}')
|
||||
print(f'min={np.min(sharpness_all):.5f}')
|
||||
print(f'max={np.max(sharpness_all):.5f}')
|
||||
|
||||
#fig, ax = plt.subplots()
|
||||
#ax.boxplot(by_review_support)
|
||||
#ax.set_xticklabels(by_review_support_label)
|
||||
#ax.set_ylabel("Sharpness")
|
||||
#ax.set_xlabel("Distributions by number of reviews")
|
||||
#plt.show()
|
||||
|
|
@ -0,0 +1,209 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
from tqdm import tqdm
|
||||
from collections import defaultdict
|
||||
|
||||
from quapy.data import LabelledCollection
|
||||
from quapy.protocol import UPP
|
||||
|
||||
# this script computes the distribution of smoothness/sharpness of the books;
|
||||
# both considering all books together and considering different groups of books' reviews (by product)
|
||||
# Mirko asked for some exploration of values (votes, num reviews), and percentiles of dataset shift as measured in terms
|
||||
# of NMD between training set prevalences and sample prevalences; this script does this
|
||||
# It also generates a csv containing all the prevalence values by product
|
||||
|
||||
|
||||
read_meta = True
|
||||
|
||||
|
||||
def not_smoothness(p):
    """
    Sharpness (i.e., lack of smoothness) of a vector of prevalence values: half the sum of the squared
    second-order differences (-p[i-1] + 2*p[i] - p[i+1]) computed at every inner position i.

    :param p: sequence of prevalence values
    :return: the sharpness value; 0 for vectors with fewer than 3 elements
    """
    accum = 0
    for i in range(1, len(p) - 1):
        second_diff = -p[i - 1] + 2*p[i] - p[i + 1]
        accum += second_diff**2
    return 0.5 * accum
|
||||
|
||||
|
||||
def _check_arrays(prevs):
|
||||
prevs = np.asarray(prevs)
|
||||
if prevs.ndim==1:
|
||||
prevs = prevs.reshape(1,-1)
|
||||
return prevs
|
||||
|
||||
|
||||
# mean normalized match distance
|
||||
def mnmd(prevs, prevs_hat):
    """
    Mean Normalized Match Distance: the average of the NMD computed between every pair of
    corresponding rows of `prevs` and `prevs_hat`.

    :param prevs: array-like with one prevalence vector, or with one prevalence vector per row
    :param prevs_hat: array-like with the same shape as `prevs`
    :return: the mean of the NMD values
    """
    prevs, prevs_hat = _check_arrays(prevs), _check_arrays(prevs_hat)
    assert prevs.shape == prevs_hat.shape, f'wrong shape; found {prevs.shape} and {prevs_hat.shape}'

    per_pair = []
    for row, row_hat in zip(prevs, prevs_hat):
        per_pair.append(nmd(row, row_hat))
    return np.mean(per_pair)
|
||||
|
||||
|
||||
# normalized match distance
|
||||
def nmd(prev, prev_hat):
    """
    Normalized Match Distance: the MDPA between both vectors multiplied by 1/(n-1), where n is the
    number of elements of `prev`.

    :param prev: prevalence vector
    :param prev_hat: prevalence vector to compare against `prev`
    :return: the normalized match distance
    """
    n_classes = len(prev)
    scale = 1./(n_classes - 1)
    return scale*mdpa(prev, prev_hat)
|
||||
|
||||
|
def mdpa(a, b):
    """
    Minimum Distance of Pair Assignments (MDPA) [cha2002measuring] for ordinal pdfs `a` and `b`.
    The MDPA is a special case of the Earth Mover's Distance [rubner1998metric] that can be
    computed efficiently.
    [Mirko Bunse's code from Julia adapted]

    :param a: histogram (sequence of numbers)
    :param b: histogram with the same length and the same total mass as `a`
    :return: the sum of the absolute values of the cumulative differences between `a` and `b`,
        divided by the total mass of `a`
    :raises AssertionError: if the histograms differ in length or in total mass
    """
    assert len(a) == len(b), "histograms have to have the same length"
    mass_a, mass_b = sum(a), sum(b)
    assert np.isclose(mass_a, mass_b), \
        f"histograms have to have the same mass (difference is {mass_a - mass_b})"

    # algorithm 1 in [cha2002measuring]
    prefixsum = 0.0
    distance = 0.0
    for a_i, b_i in zip(a, b):
        prefixsum += a_i - b_i
        distance += abs(prefixsum)

    return distance / mass_a  # the normalization is a fix to the original MDPA
|
||||
|
||||
|
||||
def prepare_vote_field(df):
    """
    Converts the 'vote' column of a reviews dataframe into a numeric column.

    Missing votes are counted as 0, and thousands separators (e.g., '1,234') are removed before the
    conversion. The column is replaced in the dataframe received as argument.

    :param df: a pandas DataFrame with a 'vote' column
    :return: the same dataframe, with a numeric 'vote' column
    """
    votes = df['vote'].astype(object)
    votes = votes.where(votes.notna(), '0')
    # go through str(): depending on how the column was parsed, some values may be numbers rather than
    # strings, and those do not have a textual `replace`
    votes = votes.map(lambda v: str(v).replace(',', ''))
    df['vote'] = pd.to_numeric(votes)
    return df
|
||||
|
||||
|
||||
def read_from_huge_json(filein):
    """
    Loads a reviews file in JSON-lines format (one JSON object per line) as a dataframe, discards the
    columns that are not needed for the analysis, and converts the 'vote' column to numeric.

    :param filein: path to the JSON-lines file
    :return: a pandas DataFrame with the remaining columns and a numeric 'vote' column
    """
    unused_columns = [
        'verified', 'reviewTime', 'reviewerID', 'style', 'reviewerName', 'reviewText', 'summary', 'unixReviewTime',
        'image'
    ]
    df = pd.read_json(filein, lines=True)
    # errors='ignore': a column that never appears in the file is simply not there to be dropped, and
    # should not abort the loading of a huge file
    df.drop(columns=unused_columns, inplace=True, errors='ignore')

    df = prepare_vote_field(df)
    return df
|
||||
|
||||
|
||||
def read_from_metadata(filein):
    """
    Loads the reviews metadata from a csv file and makes sure the 'vote' column is numeric.

    :param filein: path (or file-like object) of the csv file; it must contain a 'vote' column
    :return: a pandas DataFrame with a numeric 'vote' column
    """
    table = pd.read_csv(filein)
    numeric_votes = pd.to_numeric(table['vote'])
    table['vote'] = numeric_votes
    return table
|
||||
|
||||
|
||||
def filter_by_vote(df, vote_threshold=1):
    """
    Keeps only the rows with at least `vote_threshold` votes, and removes the 'vote' column.

    The dataframe received as argument is not modified.

    :param df: a pandas DataFrame with a numeric 'vote' column
    :param vote_threshold: minimum number of votes (inclusive) a row needs in order to be kept
    :return: a new DataFrame with the selected rows and without the 'vote' column
    """
    selected = df[df['vote'] >= vote_threshold]
    # a non-inplace drop returns a new frame; dropping in place on the filtered selection would
    # operate on a slice of `df` (SettingWithCopyWarning)
    return selected.drop(columns=['vote'])
|
||||
|
||||
|
||||
if read_meta:
|
||||
filein = '/media/moreo/Volume/Datasets/Amazon/meta/Books.csv'
|
||||
readfn = read_from_metadata
|
||||
else:
|
||||
filein='/media/moreo/Volume/Datasets/Amazon/raw/Books.json'
|
||||
readfn = read_from_huge_json
|
||||
|
||||
|
||||
def create_dictionary_bookid_ratings(df):
    """
    Groups the ratings by product.

    :param df: a pandas DataFrame with the columns 'asin' (product id) and 'overall' (rating)
    :return: a defaultdict(list) mapping every product id to the list of its ratings, in the order in
        which they appear in the dataframe
    """
    # `.values` yields plain positional arrays, so both columns pair up row by row regardless of the
    # index of the dataframe
    product_ids = df['asin'].values
    overalls = df['overall'].values

    ratings_by_product = defaultdict(list)
    id_rating_pairs = zip(product_ids, overalls)
    for product_id, overall in tqdm(id_rating_pairs, total=len(product_ids), desc='passing to dictionary'):
        ratings_by_product[product_id].append(overall)

    return ratings_by_product
|
||||
|
||||
def get_stats(distribution, msg=''):
    """
    Computes, prints, and returns the mean, max, min, and the percentiles 5, 25, 50, 75, and 95 of a
    collection of values.

    :param distribution: array-like of numeric values
    :param msg: prefix of every printed line
    :return: the tuple (mean, max, min, P5, P25, P50, P75, P95)
    """
    vmean = np.mean(distribution)
    vmax = np.max(distribution)
    vmin = np.min(distribution)
    q05, q25, q50, q75, q95 = (np.percentile(distribution, q) for q in (5, 25, 50, 75, 95))

    report = [
        f'{msg}: percentiles {q05:.5f}\t{q25:.5f}\t{q50:.5f}\t{q75:.5f}\t{q95:.5f}',
        f'{msg}: ave={vmean:.5f}',
        f'{msg}: max={vmax:.5f}',
        f'{msg}: min={vmin:.5f}',
    ]
    for report_line in report:
        print(report_line)

    return vmean, vmax, vmin, q05, q25, q50, q75, q95
|
||||
|
||||
with open('book_stats.csv', 'wt') as foo:
|
||||
foo.write(f'minvotes\tminreviews\t#products\t#reviews'
|
||||
f'\tsharp-ave\tsharp-max\tsharp-min\t'
|
||||
f'sharp-P5\tsharp-P25\tsharp-P50\tsharp-P75\tsharp-P95'
|
||||
f'\tshift-ave\tshift-max\tshift-min\t'
|
||||
f'shift-P5\tshift-P25\tshift-P50\tshift-P75\tshift-P95'
|
||||
f'\n')
|
||||
|
||||
for votes_support in [1]:
|
||||
|
||||
df = readfn(filein)
|
||||
df = df[df['overall']>0] # there are a couple of reviews with 0 stars (the min should be 1)
|
||||
|
||||
num_entries = len(df)
|
||||
df = filter_by_vote(df, vote_threshold=votes_support)
|
||||
num_entries_with_vote = len(df)
|
||||
|
||||
unique_product_ids = df['asin'].unique()
|
||||
num_products = len(unique_product_ids)
|
||||
|
||||
print(df.columns)
|
||||
print(f'num rows {len(df)} (before vote-thresholding {num_entries}, after thresholding {num_entries_with_vote})')
|
||||
print(f'num unique products {num_products}')
|
||||
|
||||
d = create_dictionary_bookid_ratings(df)
|
||||
|
||||
for reviews_support in [100]:
|
||||
with open(f'./prevalence_votes{votes_support}_reviews{reviews_support}.csv', 'wt') as fprev:
|
||||
sharpness_all = []
|
||||
num_products_with_reviews = 0
|
||||
sel_ids, sel_overalls = [], []
|
||||
for product_id, ratings in tqdm(d.items(), total=len(d), desc='processing histograms'):
|
||||
n_ratings = len(ratings)
|
||||
if n_ratings >= reviews_support:
|
||||
sel_ids.extend([product_id] * n_ratings)
|
||||
sel_overalls.extend(ratings)
|
||||
|
||||
prev = np.histogram(ratings, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0]
|
||||
for i, prev_i in enumerate(prev):
|
||||
fprev.write(f'{prev_i:.5f}')
|
||||
if i < len(prev)-1:
|
||||
fprev.write('\t')
|
||||
else:
|
||||
fprev.write('\n')
|
||||
sharpness = not_smoothness(prev)
|
||||
sharpness_all.append(sharpness)
|
||||
num_products_with_reviews+=1
|
||||
|
||||
print(f'#votes-support (min number of votes): {votes_support}')
|
||||
print(f'#reviews with >#votes-support: {num_entries_with_vote}/{num_entries}={100*num_entries_with_vote/num_entries:.2f}%')
|
||||
|
||||
print(f'#reviews-support (min number of reviews): {reviews_support}')
|
||||
print(f'#products with >#reviews-support: {num_products_with_reviews}/{num_products}={100*num_products_with_reviews/num_products:.2f}%')
|
||||
|
||||
vmean, vmax, vmin, q05, q25, q50, q75, q95 = get_stats(sharpness_all, 'sharpness')
|
||||
|
||||
allbooks_prev = np.histogram(sel_overalls, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0]
|
||||
allbooks_sharpness = not_smoothness(allbooks_prev)
|
||||
print(f'all books prev={allbooks_prev} has sharpness {allbooks_sharpness:.4f}')
|
||||
|
||||
sel_collection = LabelledCollection(instances=sel_ids, labels=sel_overalls, classes=[1,2,3,4,5])
|
||||
prot = UPP(sel_collection, sample_size=1000, repeats=5000)
|
||||
prot_iterator = prot()
|
||||
shifts = []
|
||||
for _, test_prev in tqdm(prot_iterator, total=prot.total()):
|
||||
shifts.append(nmd(allbooks_prev, prev_hat=test_prev))
|
||||
s_mean, s_max, s_min, s_q05, s_q25, s_q50, s_q75, s_q95 = get_stats(shifts, 'shift')
|
||||
|
||||
foo.write(f'{votes_support}\t{reviews_support}\t{num_products_with_reviews}\t{len(sel_ids)}'
|
||||
f'\t{vmean:.5f}\t{vmax:.5f}\t{vmin:.5f}\t'
|
||||
f'{q05:.5f}\t{q25:.5f}\t{q50:.5f}\t{q75:.5f}\t{q95:.5f}'
|
||||
f'\t{s_mean:.5f}\t{s_max:.5f}\t{s_min:.5f}\t'
|
||||
f'{s_q05:.5f}\t{s_q25:.5f}\t{s_q50:.5f}\t{s_q75:.5f}\t{s_q95:.5f}\n')
|
||||
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
from scipy import optimize
|
||||
|
||||
|
||||
# this script checks for the prevalence values that yield the maximum or minimum values of smoothness;
|
||||
# the result indicates any linear distribution (not only the uniform) satisfies this requirement
|
||||
|
||||
def sharpness(p):
    """
    Sharpness of a vector of prevalence values: half the sum of the squared second-order differences
    (-p[i-1] + 2*p[i] - p[i+1]) computed at every inner position i.

    :param p: sequence of prevalence values
    :return: the sharpness value; 0 for vectors with fewer than 3 elements
    """
    accum = 0
    for i in range(1, len(p) - 1):
        second_diff = -p[i - 1] + 2*p[i] - p[i + 1]
        accum += second_diff**2
    return 0.5 * accum
|
||||
|
||||
def smoothness(p):
    """
    Smoothness of a vector of prevalence values, defined as the complement of its sharpness.

    :param p: sequence of prevalence values
    :return: 1 - sharpness(p)
    """
    sharp = sharpness(p)
    return 1 - sharp
|
||||
|
||||
nclasses = 5
|
||||
uniform_distribution = np.random.rand(nclasses) #np.full(fill_value=1/nclasses, shape=nclasses)
|
||||
uniform_distribution /= uniform_distribution.sum()
|
||||
|
||||
bounds = tuple((0, 1) for x in range(nclasses)) # values in [0,1]
|
||||
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
|
||||
r = optimize.minimize(sharpness, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
|
||||
|
||||
print(f'minimum of sharpness function {r.x}')
|
||||
|
||||
r = optimize.minimize(smoothness, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
|
||||
print(f'maximum of sharpness function {r.x}')
|
|
@ -0,0 +1,105 @@
|
|||
import gzip
|
||||
import os
|
||||
from collections import Counter
|
||||
from Ordinal.utils import jaggedness
|
||||
import quapy as qp
|
||||
import pickle
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
base_path = '/media/moreo/Volume/Datasets/Amazon/reviews'
|
||||
categories_path = '/media/moreo/Volume/Datasets/Amazon/raw/amazon_categories.txt'
|
||||
|
||||
|
||||
def get_prevalence_merchandise(category):
    """
    Counts how many reviews of a merchandise category carry each star label.

    Reads the gzipped text file `<category>.txt.gz` located in `base_path`; every line is expected to
    consist of exactly two tab-separated fields: the star label and the document. Lines that do not
    follow this format are reported and skipped.

    :param category: name of the category (i.e., the file name without the '.txt.gz' extension)
    :return: a Counter mapping every star label (a string) to its number of occurrences
    """
    input_file = os.path.join(base_path, category+'.txt.gz')
    # labels are counted on the fly, so that no list with one entry per review has to be kept in memory
    counts = Counter()
    print(f'{category} starts')
    with gzip.open(input_file, 'rt') as f:
        for line in f:
            try:
                stars, _ = line.split('\t')
            except ValueError:
                # the line does not have exactly two tab-separated fields
                print('error in line: ', line)
                continue
            counts[stars] += 1
    print(f'\t{category} done')
    return counts
|
||||
|
||||
# The label counters of all categories are computed once and cached in a pickle file; later runs
# simply load the cached file.
target_file = './counters_Amazon_merchandise.pkl'

if not os.path.exists(target_file):
    with open(categories_path, 'rt') as fin:
        categories = [c.strip().replace(' ', '_') for c in fin]

    # categories = ['Gift_Cards', 'Magazine_Subscriptions']
    counters = qp.util.parallel(get_prevalence_merchandise, categories, n_jobs=-1)

    print('saving pickle')
    with open(target_file, 'wb') as fout:
        pickle.dump((categories, counters), fout, pickle.HIGHEST_PROTOCOL)

else:
    # NOTE: unpickling executes arbitrary code; only load a cache file generated by this very script
    with open(target_file, 'rb') as fin:
        (categories, counters) = pickle.load(fin)

# the category 'Gift_Cards' is left out of the analysis
index_gift_cards = categories.index('Gift_Cards')
del categories[index_gift_cards]
del counters[index_gift_cards]
|
||||
|
||||
class_smooth = []
|
||||
for cat, counter in zip(categories, counters):
|
||||
total = sum(count for label, count in counter.items())
|
||||
counts = [counter[i] for i in map(str, [1,2,3,4,5])]
|
||||
p = np.asarray(counts)/total
|
||||
smooth = jaggedness(p)
|
||||
class_smooth.append([smooth, cat, p])
|
||||
|
||||
class_smooth = sorted(class_smooth)
|
||||
|
||||
# df = pd.DataFrame(class_smooth, columns=['smoothness', 'category', 'prevalence'])
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
|
||||
sns.set_theme('paper')
|
||||
sns.set_style('dark')
|
||||
sns.set(font_scale=0.5)
|
||||
|
||||
# Draws one bar chart per category (prevalence of each number of stars) in a grid of nrows x ncols
# panels, following the order of `class_smooth`, and writes the same information to 'categories.txt'.
nrows = 7
ncols = 4
figure, axis = plt.subplots(nrows, ncols, figsize=(ncols*2, nrows))
with open('categories.txt', 'wt') as foo:
    foo.write(f'Category\tSmooth\tPrevalence\n')
    for i, (smooth, category, prevalence) in enumerate(class_smooth):
        # the grid is filled row by row
        row, col = divmod(i, ncols)
        ax = axis[row, col]
        ax.bar([1,2,3,4,5], prevalence, width=1)
        ax.set_ylim(0, 0.75)
        ax.set_facecolor('white')
        for spine in ax.spines.values():
            spine.set_edgecolor('black')
            spine.set_linewidth(0.3)
        # only the panels of the last row show the label of the x axis
        if row == nrows-1:
            ax.set_xlabel("stars")
        # no panel shows a label nor ticks in the y axis
        ax.set_ylabel("")
        ax.set_yticks([])

        category = category.replace('_', ' ').title()
        category = category.replace(' And ', ' & ')
        # y=0.75 places the title inside the panel
        ax.set_title(f'{category} ({smooth:.4f})', x=0.5, y=0.75)

        foo.write(f'{category}\t{smooth}\t{prevalence}\n')

# plt.show()
plt.subplots_adjust(wspace=0, hspace=0)
plt.savefig('Amazon_categories_plotgrid.pdf', bbox_inches='tight')
|
||||
|
||||
|
|
@ -0,0 +1,147 @@
|
|||
import gzip
|
||||
import quapy as qp
|
||||
from Ordinal.utils import load_simple_sample_raw
|
||||
from quapy.data import LabelledCollection
|
||||
import quapy.functional as F
|
||||
import os
|
||||
from os.path import join
|
||||
from pathlib import Path
|
||||
import numpy as np
|
||||
|
||||
|
||||
datadir = '/media/moreo/Volume/Datasets/Amazon/reviews'
|
||||
outdir = './data/'
|
||||
real_prev_path = './data/Books-real-prevalence-by-product_votes1_reviews100.csv'
|
||||
domain = 'Books'
|
||||
seed = 7
|
||||
|
||||
tr_size = 20000
|
||||
val_size = 1000
|
||||
te_size = 1000
|
||||
nval = 1000
|
||||
nte = 5000
|
||||
|
||||
|
||||
def from_text(path, encoding='utf-8', class2int=True):
    """
    Reads a labelled collection of documents.
    File format: one document per line, as <label><TAB><document>

    Empty lines and documents with an empty text are skipped. Lines that do not consist of exactly two
    tab-separated fields, or whose label cannot be converted to int (when `class2int` is True), are
    reported and skipped.

    :param path: path to the labelled collection
    :param encoding: the text encoding used to open the file
    :param class2int: if True, labels are converted to int and the documents with a negative label
        are discarded; if False, labels are returned as strings
    :return: a list of sentences, and a list of labels
    """
    all_sentences, all_labels = [], []
    with open(path, 'rt', encoding=encoding) as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            try:
                label, sentence = line.split('\t')
                if class2int:
                    label = int(label)
            except ValueError:
                print(f'format error in {line}')
                continue
            # the sign can only be checked on integer labels (a str cannot be compared with 0)
            if class2int and label < 0:
                continue
            sentence = sentence.strip()
            if sentence:
                all_sentences.append(sentence)
                all_labels.append(label)
    return all_sentences, all_labels
|
||||
|
||||
|
||||
def write_txt_sample(sample: LabelledCollection, path):
    """
    Writes a sample to a text file, one document per line, as <label><TAB><document>. The parent
    directory is created if it does not exist.

    :param sample: the collection to write; its `Xy` attribute provides the documents and the labels
    :param path: path of the file to be written
    """
    os.makedirs(Path(path).parent, exist_ok=True)
    # written explicitly as utf-8, the default encoding with which `from_text` reads the collections
    with open(path, 'wt', encoding='utf-8') as foo:
        for document, label in zip(*sample.Xy):
            foo.write(f'{label}\t{document}\n')
|
||||
|
||||
|
||||
def gen_samples_APP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
    """
    Generates `nsamples` samples from `pool` at the prevalence values returned by
    `F.uniform_simplex_sampling`. The i-th sample is written to `<outdir>/<i>.txt`, and its prevalence
    values (3 decimals) are written as one csv row of the file `prevpath`.

    :param pool: the collection the samples are drawn from
    :param nsamples: number of samples to generate
    :param sample_size: number of instances of each sample
    :param outdir: directory where the samples are written (created if needed)
    :param prevpath: path of the csv file with the prevalence values of the samples
    """
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath, 'wt') as prevfile:
        class_names = [f'{c}' for c in pool.classes_]
        prevfile.write('id,' + ','.join(class_names) + '\n')
        requested_prevs = F.uniform_simplex_sampling(n_classes=pool.n_classes, size=nsamples)
        for i, prev in enumerate(requested_prevs):
            sample = pool.sampling(sample_size, *prev)
            sample_path = join(outdir, f'{i}.txt')
            write_txt_sample(sample, sample_path)
            # the prevalence reported is the one actually observed in the sample
            observed = [f'{p:.3f}' for p in sample.prevalence()]
            prevfile.write(f'{i},' + ','.join(observed) + '\n')
|
||||
|
||||
|
||||
def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
    """
    Generates `nsamples` samples from `pool` by means of its `natural_sampling_generator`. The i-th
    sample is written to `<outdir>/<i>.txt`, and its prevalence values (3 decimals) are written as one
    csv row of the file `prevpath`.

    :param pool: the collection the samples are drawn from
    :param nsamples: number of samples to generate
    :param sample_size: number of instances of each sample
    :param outdir: directory where the samples are written (created if needed)
    :param prevpath: path of the csv file with the prevalence values of the samples
    """
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath, 'wt') as prevfile:
        class_names = [f'{c}' for c in pool.classes_]
        prevfile.write('id,' + ','.join(class_names) + '\n')
        sample_generator = pool.natural_sampling_generator(sample_size, repeats=nsamples)
        for i, sample in enumerate(sample_generator):
            sample_path = join(outdir, f'{i}.txt')
            write_txt_sample(sample, sample_path)
            observed = [f'{p:.3f}' for p in sample.prevalence()]
            prevfile.write(f'{i},' + ','.join(observed) + '\n')
|
||||
|
||||
|
||||
def gen_samples_real_prevalences(real_prevalences, pool: LabelledCollection, sample_size, outdir, prevpath_out):
    """
    Generates one sample from `pool` for every prevalence vector in `real_prevalences`. The i-th
    sample is written to `<outdir>/<i>.txt`, and its prevalence values (3 decimals) are written as one
    csv row of the file `prevpath_out`.

    :param real_prevalences: iterable of prevalence vectors
    :param pool: the collection the samples are drawn from
    :param sample_size: number of instances of each sample
    :param outdir: directory where the samples are written (created if needed)
    :param prevpath_out: path of the csv file with the prevalence values of the samples
    """
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath_out, 'wt') as prevfile:
        class_names = [f'{c}' for c in pool.classes_]
        prevfile.write('id,' + ','.join(class_names) + '\n')
        for i, prev in enumerate(real_prevalences):
            # the last prevalence value is not passed to the sampler
            requested = prev[:-1]
            sample = pool.sampling(sample_size, *requested)
            sample_path = join(outdir, f'{i}.txt')
            write_txt_sample(sample, sample_path)
            # the prevalence reported is the one actually observed in the sample
            observed = [f'{p:.3f}' for p in sample.prevalence()]
            prevfile.write(f'{i},' + ','.join(observed) + '\n')
|
||||
|
||||
|
||||
# fullpath = join(datadir,domain)+'.txt.gz' <- deprecated; there were duplicates
|
||||
# data = LabelledCollection.load(fullpath, from_gz_text)
|
||||
|
||||
fullpath = './data/Books/Books.txt'
|
||||
data = LabelledCollection.load(fullpath, from_text)
|
||||
|
||||
print(len(data))
|
||||
print(data.classes_)
|
||||
print(data.prevalence())
|
||||
|
||||
with qp.util.temp_seed(seed):
|
||||
train, rest = data.split_stratified(train_prop=tr_size)
|
||||
|
||||
devel, test = rest.split_stratified(train_prop=0.5)
|
||||
print(len(train))
|
||||
print(len(devel))
|
||||
print(len(test))
|
||||
|
||||
domaindir = join(outdir, domain)
|
||||
|
||||
write_txt_sample(train, join(domaindir, 'training_data.txt'))
|
||||
write_txt_sample(devel, join(domaindir, 'development_data.txt'))
|
||||
write_txt_sample(test, join(domaindir, 'test_data.txt'))
|
||||
|
||||
# this part is to be used when the partitions have already been created, in order to avoid re-generating them
|
||||
#train = load_simple_sample_raw(domaindir, 'training_data')
|
||||
#devel = load_simple_sample_raw(domaindir, 'development_data')
|
||||
#test = load_simple_sample_raw(domaindir, 'test_data')
|
||||
|
||||
gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
|
||||
prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
|
||||
gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
|
||||
prevpath=join(domaindir, 'app', 'test_prevalences.txt'))
|
||||
|
||||
# gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
|
||||
# prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
|
||||
# gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
|
||||
# prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))
|
||||
|
||||
|
||||
# this part generates samples based on real prevalences (in this case, prevalences of sets of books reviews
|
||||
# grouped by product). It loads the real prevalences (computed elsewhere), and randomly extracts 5000 for test
|
||||
# and 1000 for val (disjoint). Then it carries out the samplings
|
||||
|
||||
assert os.path.exists(real_prev_path), f'real prevalence file does not seem to exist...'
|
||||
real_prevalences = np.genfromtxt(real_prev_path, delimiter='\t')
|
||||
|
||||
nrows = real_prevalences.shape[0]
|
||||
rand_sel = np.random.permutation(nrows)
|
||||
real_prevalences_val = real_prevalences[rand_sel[:nval]]
|
||||
real_prevalences_te = real_prevalences[rand_sel[nval:nval+nte]]
|
||||
|
||||
gen_samples_real_prevalences(real_prevalences_val, devel, sample_size=val_size, outdir=join(domaindir, 'real', 'dev_samples'),
|
||||
prevpath_out=join(domaindir, 'real', 'dev_prevalences.txt'))
|
||||
gen_samples_real_prevalences(real_prevalences_te, test, sample_size=te_size, outdir=join(domaindir, 'real', 'test_samples'),
|
||||
prevpath_out=join(domaindir, 'real', 'test_prevalences.txt'))
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,116 @@
|
|||
import gzip
|
||||
import quapy as qp
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from quapy.data import LabelledCollection
|
||||
import quapy.functional as F
|
||||
import os
|
||||
from os.path import join
|
||||
from pathlib import Path
|
||||
import pickle
|
||||
|
||||
|
||||
datadir = '../OrdinalQuantification'
|
||||
outdir = './data/'
|
||||
domain = 'fact'
|
||||
seed = 7
|
||||
|
||||
tr_size = 20000
|
||||
val_size = 1000
|
||||
te_size = 1000
|
||||
nval = 1000
|
||||
nte = 5000
|
||||
|
||||
|
||||
def from_csv(path):
    """
    Loads a csv file and returns the feature matrix along with ordinal labels obtained by binning the
    column 'log10_energy'.

    :param path: path (or file-like object) of the csv file; it must contain a 'log10_energy' column
    :return: a tuple (X, y), where X is a float32 matrix with all the columns of the file but the
        first one, and y contains the index of the energy bin of every row
    """
    table = pd.read_csv(path)

    # divide the continuous labels into ordered classes; the first and last values of the range are
    # not used as boundaries
    bin_edges = np.arange(start=2.4, stop=4.2, step=0.15)
    energy_boundaries = bin_edges[1:-1]
    # note: omitting the dtype will result in a single instance having a different class
    log_energies = np.array(table['log10_energy'], dtype=np.float32)
    y = np.digitize(log_energies, energy_boundaries)

    # obtain a matrix of shape (n_samples, n_features)
    features = table.iloc[:, 1:]
    X = features.to_numpy().astype(np.float32)
    return X, y
|
||||
|
||||
|
||||
def write_pkl(sample: LabelledCollection, path):
    """
    Pickles a sample to `path` using the highest pickle protocol. The parent directory is created if
    it does not exist.

    :param sample: the collection to be stored
    :param path: path of the pickle file to be written
    """
    os.makedirs(Path(path).parent, exist_ok=True)
    # the context manager guarantees the file is flushed and closed once the sample has been dumped
    with open(path, 'wb') as fout:
        pickle.dump(sample, fout, pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
|
||||
def gen_samples_APP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
    """
    Generates `nsamples` samples from `pool` at the prevalence values returned by
    `F.uniform_simplex_sampling`. The i-th sample is pickled to `<outdir>/<i>.pkl`, and its prevalence
    values (3 decimals) are written as one csv row of the file `prevpath`.

    :param pool: the collection the samples are drawn from
    :param nsamples: number of samples to generate
    :param sample_size: number of instances of each sample
    :param outdir: directory where the samples are written (created if needed)
    :param prevpath: path of the csv file with the prevalence values of the samples
    """
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath, 'wt') as prevfile:
        class_names = [f'{c}' for c in pool.classes_]
        prevfile.write('id,' + ','.join(class_names) + '\n')
        requested_prevs = F.uniform_simplex_sampling(n_classes=pool.n_classes, size=nsamples)
        for i, prev in enumerate(requested_prevs):
            sample = pool.sampling(sample_size, *prev)
            sample_path = join(outdir, f'{i}.pkl')
            write_pkl(sample, sample_path)
            # the prevalence reported is the one actually observed in the sample
            observed = [f'{p:.3f}' for p in sample.prevalence()]
            prevfile.write(f'{i},' + ','.join(observed) + '\n')
|
||||
|
||||
|
||||
def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
    """
    Generates `nsamples` samples from `pool` by means of its `natural_sampling_generator`. The i-th
    sample is pickled to `<outdir>/<i>.pkl`, and its prevalence values (3 decimals) are written as one
    csv row of the file `prevpath`.

    :param pool: the collection the samples are drawn from
    :param nsamples: number of samples to generate
    :param sample_size: number of instances of each sample
    :param outdir: directory where the samples are written (created if needed)
    :param prevpath: path of the csv file with the prevalence values of the samples
    """
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath, 'wt') as prevfile:
        class_names = [f'{c}' for c in pool.classes_]
        prevfile.write('id,' + ','.join(class_names) + '\n')
        sample_generator = pool.natural_sampling_generator(sample_size, repeats=nsamples)
        for i, sample in enumerate(sample_generator):
            sample_path = join(outdir, f'{i}.pkl')
            write_pkl(sample, sample_path)
            observed = [f'{p:.3f}' for p in sample.prevalence()]
            prevfile.write(f'{i},' + ','.join(observed) + '\n')
|
||||
|
||||
|
||||
|
||||
fullpath = join(datadir,domain, 'fact_wobble.csv')
|
||||
|
||||
data = LabelledCollection.load(fullpath, from_csv)
|
||||
|
||||
# discard the instances (rows) that contain some NaN feature value...
nan_rows = np.isnan(data.instances).any(axis=1)
if nan_rows.any():
    data.instances = data.instances[~nan_rows]
    data.labels = data.labels[~nan_rows]
    print('deleted nan rows')

# ...and those that contain some infinite feature value
inf_rows = np.isinf(data.instances).any(axis=1)
if inf_rows.any():
    data.instances = data.instances[~inf_rows]
    data.labels = data.labels[~inf_rows]
    print('deleted inf rows')
|
||||
|
||||
|
||||
print(len(data))
|
||||
print(data.classes_)
|
||||
print(data.prevalence())
|
||||
|
||||
with qp.util.temp_seed(seed):
|
||||
train, rest = data.split_stratified(train_prop=tr_size)
|
||||
|
||||
devel, test = rest.split_stratified(train_prop=0.5)
|
||||
print(len(train))
|
||||
print(len(devel))
|
||||
print(len(test))
|
||||
|
||||
domaindir = join(outdir, domain)
|
||||
|
||||
write_pkl(train, join(domaindir, 'training_data.pkl'))
|
||||
write_pkl(devel, join(domaindir, 'development_data.pkl'))
|
||||
write_pkl(test, join(domaindir, 'test_data.pkl'))
|
||||
|
||||
gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
|
||||
prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
|
||||
gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
|
||||
prevpath=join(domaindir, 'app', 'test_prevalences.txt'))
|
||||
|
||||
gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
|
||||
prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
|
||||
gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
|
||||
prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,50 @@
|
|||
import numpy as np
|
||||
|
||||
|
||||
# smoothing approximation
|
||||
def smoothness(p):
    """Return half the sum of squared second-order differences of `p`.

    For every interior position the term (-p[i-1] + 2*p[i] - p[i+1])**2 is
    accumulated; a sequence with fewer than three elements yields no terms.
    """
    total = 0
    for left, centre, right in zip(p[:-2], p[1:-1], p[2:]):
        total += (-left + 2*centre - right)**2
    return 0.5 * total
|
||||
|
||||
|
||||
def _check_arrays(prevs):
|
||||
prevs = np.asarray(prevs)
|
||||
if prevs.ndim==1:
|
||||
prevs = prevs.reshape(1,-1)
|
||||
return prevs
|
||||
|
||||
|
||||
# mean normalized match distance
|
||||
def mnmd(prevs, prevs_hat):
    """Mean normalized match distance between true and estimated prevalence vectors.

    Both arguments may be a single prevalence vector or a collection of
    vectors (one per row); the NMD is computed row by row and averaged.

    :param prevs: true prevalence vector(s)
    :param prevs_hat: estimated prevalence vector(s), same shape as `prevs`
    :return: the mean of the per-row NMD values
    """
    true_rows = _check_arrays(prevs)
    estim_rows = _check_arrays(prevs_hat)
    assert true_rows.shape == estim_rows.shape, f'wrong shape; found {true_rows.shape} and {estim_rows.shape}'

    return np.mean([nmd(p, p_hat) for p, p_hat in zip(true_rows, estim_rows)])
|
||||
|
||||
|
||||
# normalized match distance
|
||||
def nmd(prev, prev_hat):
    """Normalized match distance: the MDPA of the two vectors divided by (n - 1),
    where n is the number of entries in `prev`.
    """
    nclasses = len(prev)
    scale = 1. / (nclasses - 1)
    return scale * mdpa(prev, prev_hat)
|
||||
|
||||
|
||||
"""
|
||||
Minimum Distance of Pair Assignments (MDPA) [cha2002measuring] for ordinal pdfs `a` and `b`.
|
||||
The MDPA is a special case of the Earth Mover's Distance [rubner1998metric] that can be
|
||||
computed efficiently.
|
||||
[Mirko Bunse's code from Julia adapted]
|
||||
"""
|
||||
def mdpa(a, b):
    """Compute the (mass-normalized) Minimum Distance of Pair Assignments between `a` and `b`.

    The distance is the sum of the absolute values of the running (prefix)
    sums of `a[i] - b[i]`, divided by the total mass of `a`.

    :param a: first histogram (sequence of numbers)
    :param b: second histogram; must have the same length and total mass as `a`
    :return: the accumulated distance divided by `sum(a)`
    :raises AssertionError: if the lengths differ or the total masses are not close
    """
    assert len(a) == len(b), "histograms have to have the same length"
    mass_a, mass_b = sum(a), sum(b)
    # the message is an f-string so that the actual mass difference is reported
    assert np.isclose(mass_a, mass_b), \
        f"histograms have to have the same mass (difference is {mass_a - mass_b})"

    # algorithm 1 in [cha2002measuring]
    prefixsum = 0.0
    distance = 0.0
    for a_i, b_i in zip(a, b):
        prefixsum += a_i - b_i
        distance += abs(prefixsum)

    return distance / mass_a  # the normalization is a fix to the original MDPA
|
||||
|
|
@ -0,0 +1,151 @@
|
|||
import numpy as np
|
||||
import quapy as qp
|
||||
import os
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from Ordinal.model import LogisticAT, LogisticSE, LogisticIT, LAD, OrdinalRidge #, RegressionQuantification
|
||||
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC
|
||||
from os.path import join
|
||||
from utils import load_samples_folder, load_single_sample_pkl
|
||||
from Ordinal.evaluation import nmd, mnmd
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
"""
|
||||
This script generates all results from Table 1 in the paper, i.e., all results comparing quantifiers equipped with
|
||||
standard logistic regression against quantifiers equipped with order-aware classifiers
|
||||
"""
|
||||
|
||||
def quantifiers():
    """Yield `(name, quantifier, param_grid)` triples for every method under comparison.

    The order is: the LR baselines, then the OLR-AT and OLR-IT variants (each
    with CC, PCC, ACC, PACC and SLD), then LAD and ORidge (CC and ACC only).
    Quantifiers are instantiated lazily, one per yielded triple; triples of the
    same classifier family share the same hyper-parameter grid object.
    """
    params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
    params_OLR = {'alpha':np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
    params_SVR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
    params_Ridge = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced'], 'normalize':[True,False]}

    all_aggregators = (('CC', CC), ('PCC', PCC), ('ACC', ACC), ('PACC', PACC), ('SLD', EMQ))
    crisp_aggregators = (('CC', CC), ('ACC', ACC))

    # baselines
    for tag, aggregator in all_aggregators:
        yield f'{tag}(LR)', aggregator(LogisticRegression()), params_LR

    # with order-aware classifiers
    # threshold-based ordinal regression (see https://pythonhosted.org/mord/)
    for tag, aggregator in all_aggregators:
        yield f'{tag}(OLR-AT)', aggregator(LogisticAT()), params_OLR

    # the OLR-SE variants (LogisticSE with params_OLR) are currently disabled

    for tag, aggregator in all_aggregators:
        yield f'{tag}(OLR-IT)', aggregator(LogisticIT()), params_OLR
    # other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.)

    # regression-based ordinal regression (see https://pythonhosted.org/mord/)
    for tag, aggregator in crisp_aggregators:
        yield f'{tag}(LAD)', aggregator(LAD()), params_SVR

    for tag, aggregator in crisp_aggregators:
        yield f'{tag}(ORidge)', aggregator(OrdinalRidge()), params_Ridge
|
||||
|
||||
|
||||
def run_experiment(params):
    """Model-select, evaluate and store the results of one quantifier.

    Reads the module-level settings `posfix`, `resultpath`, `datapath`,
    `domain`, `protocol`, `load_sample_fn`, `train` and (when `posfix` is
    '-std') `zscore`.

    :param params: a `(name, quantifier, param_grid)` triple as yielded by `quantifiers()`
    :return: a tab-separated line with the method name, the best hyper-parameters
        and the best score, or None if the result file already exists
    """
    qname, q, param_grid = params
    qname += posfix
    resultfile = join(resultpath, f'{qname}.all.APP-OQ.csv')
    if os.path.exists(resultfile):
        print(f'result file {resultfile} already exists: continue')
        return None

    print(f'fitting {qname} for all-drift')

    def _iter_samples(split, expected_total):
        # shared body of the two loaders below: streams (instances, prevalence)
        # pairs from one split folder, standardizing the instances if requested
        folderpath = join(datapath, domain, protocol, split)
        standardize = posfix == '-std'
        for sample in tqdm(load_samples_folder(folderpath, filter=None, load_fn=load_sample_fn), total=expected_total):
            if standardize:
                sample.instances = zscore.transform(sample.instances)
            yield sample.instances, sample.prevalence()

    def load_test_samples():
        yield from _iter_samples('test_samples', 5000)

    def load_dev_samples():
        yield from _iter_samples('dev_samples', 1000)

    selector = qp.model_selection.GridSearchQ(
        q,
        param_grid,
        sample_size=1000,
        protocol='gen',
        error=mnmd,
        val_split=load_dev_samples,
        n_jobs=-1,
        refit=False,
        timeout=60*60*2,
        verbose=True)
    q = selector.fit(train)

    hyperparams = f'{qname}\tall\t{q.best_params_}\t{q.best_score_}'

    print('[done]')

    report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    nmd_scores = report['nmd']
    mean_nmd = nmd_scores.mean()
    std_nmd = nmd_scores.std()
    print(f'{qname}: {mean_nmd:.4f} +-{std_nmd:.4f}')
    report.to_csv(resultfile, index=False)

    # NOTE: the regressor-based adjustment step (RegressionQuantification) is disabled in this script

    return hyperparams
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Experiment configuration. Alternatives used in other runs:
    #   domain = 'Books-tfidf'
    #   domain = 'fact' with posfix = '-std' (set posfix to '' to avoid standardization)
    domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
    posfix = ''

    load_sample_fn = load_single_sample_pkl
    datapath = './data'
    protocol = 'app'
    resultpath = join('./results', domain, protocol)
    os.makedirs(resultpath, exist_ok=True)

    train = load_sample_fn(join(datapath, domain), 'training_data')

    # the scaler is fitted on the training set only and reused by run_experiment
    if posfix=='-std':
        zscore = StandardScaler()
        train.instances = zscore.fit_transform(train.instances)

    # the selected hyper-parameters of every newly run method are appended to hyper.txt
    with open(join(resultpath, 'hyper.txt'), 'at') as hyperfile:
        outcomes = qp.util.parallel(run_experiment, quantifiers(), n_jobs=-3)
        for line in outcomes:
            if line is None:
                continue
            hyperfile.write(line)
            hyperfile.write('\n')
|
||||
|
||||
|
|
@ -0,0 +1,137 @@
|
|||
import numpy as np
|
||||
from scipy.stats import wilcoxon
|
||||
|
||||
import quapy as qp
|
||||
import os
|
||||
from os.path import join
|
||||
|
||||
from Ordinal.tabular import Table
|
||||
from utils import load_samples_folder, load_single_sample_pkl, jaggedness
|
||||
from Ordinal.evaluation import nmd, mnmd
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
from glob import glob
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
"""
|
||||
This script takes all results from the book domain, that correspond to the APP protocol, and filters by
|
||||
smoothness so that only the 50% smoothest examples are considered, and recomputes the averages of the nmd
|
||||
thus effectively reporting the results for the APP-OQ protocol
|
||||
"""
|
||||
|
||||
def parse_str_prev(df_col):
    """Parse a column of stringified prevalence vectors into a numeric array.

    Each entry is expected to look like '[0.1 0.9]': the first and last
    characters are dropped and the rest is read as space-separated numbers.

    :param df_col: an object exposing `.values` with one string per row
    :return: an ndarray with one parsed vector per row
    """
    parsed = [np.fromstring(text[1:-1], sep=' ') for text in df_col.values]
    return np.asarray(parsed)
|
||||
|
||||
def parse_result_file(path):
    """Load a result CSV and return its true prevalences, estimated prevalences and NMD scores.

    :param path: path to a CSV file with columns 'true-prev', 'estim-prev' and 'nmd'
    :return: tuple `(true_prev, estim_prev, nmd)` of arrays
    """
    report = pd.read_csv(path)
    true_prev = parse_str_prev(report['true-prev'])
    estim_prev = parse_str_prev(report['estim-prev'])
    return true_prev, estim_prev, report['nmd'].values
|
||||
|
||||
def ave_jaggedness(prevs, less_percentile=1):
    """Average jaggedness over the least jagged fraction of `prevs`.

    :param prevs: iterable of prevalence vectors
    :param less_percentile: fraction (1 means all) of the vectors, taken in
        increasing order of jaggedness, that enter the average
    :return: the mean jaggedness of the retained vectors
    """
    sorted_jag = np.sort([jaggedness(p) for p in prevs])
    cutoff = int(less_percentile * len(sorted_jag))
    retained = sorted_jag[:cutoff]
    return np.mean(retained)
|
||||
|
||||
|
||||
def retain_half_smoothest(true_prev, estim_prev, nmd):
    """Keep only the half of the samples whose true prevalence is least jagged.

    The three arrays are indexed in parallel; the retained rows are returned
    in increasing order of jaggedness of `true_prev`.

    :return: tuple `(true_prev, estim_prev, nmd)` restricted to the retained rows
    """
    jag_scores = [jaggedness(p) for p in true_prev]
    half = len(jag_scores)//2
    keep = np.argsort(jag_scores)[:half]
    return true_prev[keep], estim_prev[keep], nmd[keep]
|
||||
|
||||
|
||||
def compute_half_smoothest_nmd(true_prev, estim_prev, nmd):
    """Return the NMD scores of the half of the samples retained by `retain_half_smoothest`."""
    retained = retain_half_smoothest(true_prev, estim_prev, nmd)
    return retained[2]
|
||||
|
||||
|
||||
if __name__ == '__main__':
    domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
    datapath = './data'
    in_protocol = 'app'
    out_protocol = 'app-oq'
    in_result_path = join('./results', domain, in_protocol)
    out_result_path = join('./results', domain, out_protocol)
    os.makedirs(out_result_path, exist_ok=True)

    # rows and column groups of the table
    quantifier_names = ['CC', 'PCC', 'ACC', 'PACC', 'SLD']
    classifier_names = ['LR', 'OLR-AT', 'OLR-IT', 'ORidge', 'LAD']

    # recompute the results in terms of APP-OQ
    result_dict = {}
    for filepath in glob(f'{in_result_path}/*).all.csv'):
        # file names look like '<quantifier>(<classifier>).all.csv'
        name = Path(filepath).name
        quantifier = name[:name.index('(')]
        classifier = name[name.index('(')+1:name.index(')')]

        true_prev, estim_prev, nmds = parse_result_file(filepath)
        nmds = compute_half_smoothest_nmd(true_prev, estim_prev, nmds)

        result_dict[classifier + '-' + quantifier] = nmds

    # convert to numbers and search for the best in each quantifier
    best_keys = {}
    best_nmds = {}
    for quantifier in quantifier_names:
        best_ave, best_key, best_nmd = None, None, None
        for classifier in classifier_names:
            key = classifier + '-' + quantifier
            if key in result_dict:
                nmds = result_dict[key]
                mean_val = np.mean(nmds)
                if best_ave is None or mean_val < best_ave:
                    best_ave = mean_val
                    best_key = key
                    best_nmd = nmds
        best_keys[quantifier] = best_key
        best_nmds[quantifier] = best_nmd

    # write a latex table
    # (backslashes are escaped explicitly: '\m' and '\p' are not valid escape sequences)
    for q in quantifier_names:
        print('& \\multicolumn{2}{c}{'+q+'} ', end='')
    print('\\\\')
    print('\\midrule')
    for classifier in classifier_names:
        print(classifier + '\t', end='')
        for quantifier in quantifier_names:
            key = classifier + '-' + quantifier
            the_best_nmds = best_nmds[quantifier]

            if key in result_dict:
                nmds = result_dict[key]
                mean_val = np.mean(nmds)

                # bold marks the best method of the column and every method whose
                # difference from it is not significant (Wilcoxon, p > 0.01)
                bold = False
                if best_keys[quantifier] == key:
                    bold = True
                else:
                    _, pval = wilcoxon(nmds, the_best_nmds)
                    if pval > 0.01:
                        bold = True

                str_mean = f'{mean_val:.4f}'
                if bold:
                    str_mean = '\\textbf{' + str_mean + '}'

                if classifier == 'LR':
                    # the LR row reports the standard deviation ...
                    std_val = np.std(nmds)
                    str_val = f'{str_mean} & $\\pm {std_val:.4f}$'
                else:
                    # ... the other rows report the relative difference w.r.t. the best of the column
                    rel_increment = 100 * (mean_val-np.mean(the_best_nmds)) / np.mean(the_best_nmds)
                    sign = '+' if rel_increment>0 else ''
                    str_val = f'{str_mean} & ({sign}{rel_increment:.1f}\\%)'
            else:
                str_val = '\\multicolumn{2}{c}{---}'

            str_val = ' & ' + str_val

            print(str_val, end='')
        print('\\\\')
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,105 @@
|
|||
import csv
|
||||
import sys
|
||||
import datasets
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import torch.cuda
|
||||
from datasets import Dataset, DatasetDict
|
||||
from sklearn.metrics import f1_score
|
||||
from sklearn.model_selection import train_test_split
|
||||
from transformers import AutoModelForSequenceClassification
|
||||
from transformers import AutoTokenizer, DataCollatorWithPadding
|
||||
from transformers import Trainer
|
||||
from transformers import TrainingArguments
|
||||
|
||||
|
||||
"""
|
||||
This script fine-tunes a pre-trained language model on a given textual training set.
|
||||
The training goes for a maximum of 5 epochs, but stores the model parameters of the best performing epoch according
|
||||
to the validation loss in a hold-out val split of 1000 documents (stratified).
|
||||
|
||||
We used it with RoBERTa in the training set of the Amazon-OQ-BK domain, i.e.:
|
||||
$> python3 <this_script>.py ./data/Books/training_data.txt roberta-base
|
||||
"""
|
||||
|
||||
|
||||
def tokenize_function(example):
    """Tokenize the 'review' field of `example` with the module-level `tokenizer`.

    Sequences are padded/truncated to a fixed length: 64 tokens when the
    module-level `debug` flag is set, 256 otherwise.
    """
    max_len = 64 if debug else 256
    return tokenizer(example['review'], padding='max_length', truncation=True, max_length=max_len)
|
||||
|
||||
|
||||
def compute_metrics(eval_preds):
    """Compute macro- and micro-averaged F1 from a `(logits, labels)` pair.

    Predictions are the argmax of the logits along the last axis.

    :return: dict with keys 'macro-f1' and 'micro-f1'
    """
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)
    scores = {}
    for averaging in ('macro', 'micro'):
        scores[f'{averaging}-f1'] = f1_score(labels, preds, average=averaging)
    return scores
|
||||
|
||||
|
||||
if __name__ == '__main__':
    debug = False
    assert torch.cuda.is_available(), 'cuda is not available'

    # command line: <training-path> <transformer-name>
    #   e.g. './data/Books/training_data.txt' and 'roberta-base'
    #   (other names: 'bert-base-uncased', 'distilbert-base-uncased')
    n_args = len(sys.argv)
    assert n_args==3, 'wrong arguments, expected: <training-path> <transformer-name>'

    datapath, checkpoint = sys.argv[1:3]

    modelout = checkpoint+'-finetuned-new'

    # load the training set and hold out a stratified 25% of it for validation
    df = pd.read_csv(datapath, sep='\t', names=['labels', 'review'], quoting=csv.QUOTE_NONE)
    labels = df['labels'].to_frame()
    X_train, X_val = train_test_split(df, stratify=labels, test_size=.25, random_state=1)
    num_labels = len(pd.unique(labels['labels']))

    features = datasets.Features({'labels': datasets.Value('int32'), 'review': datasets.Value('string')})
    train = Dataset.from_pandas(df=X_train, split='train', features=features)
    validation = Dataset.from_pandas(df=X_val, split='validation', features=features)

    # in debug mode only the first 500 documents of each split are used
    if debug:
        train_split, val_split = train.select(range(500)), validation.select(range(500))
    else:
        train_split, val_split = train, validation
    dataset = DatasetDict({'train': train_split, 'validation': val_split})

    # tokenize the dataset
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda()

    # fine-tuning: evaluate and checkpoint once per epoch, reload the best model at the end
    training_args = TrainingArguments(
        modelout,
        learning_rate=2e-5,
        num_train_epochs=5,
        weight_decay=0.01,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        save_total_limit=1,
        load_best_model_at_end=True
    )
    trainer = Trainer(
        model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
        data_collator=DataCollatorWithPadding(tokenizer),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
import pandas as pd
|
||||
from os.path import join
|
||||
import os
|
||||
from glob import glob
|
||||
from pathlib import Path
|
||||
|
||||
from Ordinal.main import quantifiers
|
||||
from Ordinal.tabular import Table
|
||||
|
||||
"""
|
||||
This script generates some tables for Amazon-OQ-BK (for internal use only)
|
||||
"""
|
||||
|
||||
# Result folders: the tf-idf representation plus the three RoBERTa-based ones
domain = 'Books-tfidf'
domain_bert_last = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last'
domain_bert_ave = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
domain_bert_post = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-posteriors'
prot = 'app'
outpath = f'./tables/{domain}/{prot}/results.tex'

resultpath = join('./results', domain, prot)
resultpath_bertlast = join('./results', domain_bert_last, prot)
resultpath_bertave = join('./results', domain_bert_ave, prot)
resultpath_bertpost = join('./results', domain_bert_post, prot)

# method names: the plain ones plus one suffixed copy per RoBERTa representation
methods = [qname for qname, *_ in quantifiers()]
methods += ['SLD(LR)-agg']
methods_Rlast = [m+'-RoBERTa-last' for m in methods]
methods_Rave = [m+'-RoBERTa-average' for m in methods]
methods_Rpost = [m+'-RoBERTa-posteriors' for m in methods]
methods = methods + methods_Rlast + methods_Rave + methods_Rpost
# methods += [m+'-r' for m in methods]

table = Table(benchmarks=['low', 'mid', 'high', 'all'], methods=methods, prec_mean=4, show_std=True, prec_std=4)

resultfiles = list(glob(f'{resultpath}/*.csv')) \
    + list(glob(f'{resultpath_bertlast}/*.csv')) \
    + list(glob(f'{resultpath_bertave}/*.csv')) \
    + list(glob(f'{resultpath_bertpost}/*.csv'))

for resultfile in resultfiles:
    df = pd.read_csv(resultfile)
    nmd = df['nmd'].values
    resultname = Path(resultfile).name
    # file names look like '<method>.<drift>[.<extra>].csv'; any extra component marks the '-r' variant
    method, drift, *other = resultname.replace('.csv', '').split('.')
    if other:
        method += '-r'
    if method not in methods:
        continue

    table.add(drift, method, nmd)

os.makedirs(Path(outpath).parent, exist_ok=True)

# backslashes are escaped explicitly: '\h' and '\e' are not valid escape sequences
tabular = """
\\resizebox{\\textwidth}{!}{%
\\begin{tabular}{|c||""" + ('c|' * (table.nbenchmarks)) + """} \\hline
"""
tabular += table.latexTabularT(average=False)
tabular += """
\\end{tabular}%
}"""

print('saving table in', outpath)
with open(outpath, 'wt') as foo:
    foo.write(tabular)
    foo.write('\n')

print('[done]')
|
||||
|
|
@ -0,0 +1,82 @@
|
|||
import pandas as pd
|
||||
from os.path import join
|
||||
import os
|
||||
from glob import glob
|
||||
from pathlib import Path
|
||||
|
||||
from Ordinal.experiments_lr_vs_ordlr import quantifiers
|
||||
from Ordinal.tabular import Table
|
||||
|
||||
"""
|
||||
This script generates some tables for Fact-OQ (for internal use only)
|
||||
"""
|
||||
|
||||
#domain = 'fact'
#domain = 'Books-tfidf'
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
prot = 'app'
outpath = f'./tables/{domain}/{prot}/results.tex'

resultpath = join('./results', domain, prot)

# set to True to tabulate the '-std' variants of the methods instead of the plain ones
withstd=False

methods = [qname for qname, *_ in quantifiers()]
if withstd:
    methods = [m+'-std' for m in methods]
#methods = methods + methods_variant
# methods += [m+'-r' for m in methods]

quantifiers_families = ['CC', 'PCC', 'ACC', 'PACC', 'SLD']
# method_variants = ['LR', 'OLR-AT', 'OLR-SE', 'OLR-IT', 'ORidge', 'LAD']
method_variants = ['LR', 'OLR-AT', 'OLR-IT', 'ORidge', 'LAD']
if withstd:
    method_variants = [m+'-std' for m in method_variants]

print('families:', quantifiers_families)
print('variants', method_variants)
# backslashes are escaped explicitly: '\m' is not a valid escape sequence
table = Table(benchmarks=quantifiers_families, methods=method_variants, prec_mean=4, show_std=True, prec_std=4,
              color=False, show_rel_to=0, missing_str='\\multicolumn{1}{c}{---}', clean_zero=True)

resultfiles = list(glob(f'{resultpath}/*).all.csv'))

for resultfile in resultfiles:
    df = pd.read_csv(resultfile)
    nmd = df['nmd'].values
    resultname = Path(resultfile).name

    # file names look like '<family>(<variant>).<drift>[.<extra>].csv'
    method, drift, *other = resultname.replace('.csv', '').replace('-RoBERTa-average','').split('.')
    if drift!='all':
        continue
    if other:
        method += '-r'
    if method not in methods:
        continue

    family, variant = method.split('(')
    variant = variant.replace(')', '')
    if variant not in method_variants:
        continue
    table.add(family, variant, nmd)

os.makedirs(Path(outpath).parent, exist_ok=True)

tabular = """
\\resizebox{\\textwidth}{!}{%

\\begin{tabular}{c""" + ('l' * (table.nbenchmarks)) + """}
\\toprule
"""

tabular += table.latexTabularT(average=False)
# '\e' is not a valid escape sequence either, hence the doubled backslash
tabular += """
\\end{tabular}%
}"""

print('saving table in', outpath)
with open(outpath, 'wt') as foo:
    foo.write(tabular)
    foo.write('\n')

print('[done]')
|
||||
|
|
@ -0,0 +1,152 @@
|
|||
import sys
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
from transformers import AutoTokenizer
|
||||
from transformers import AutoModelForSequenceClassification
|
||||
from os.path import join
|
||||
import os
|
||||
import shutil
|
||||
from tqdm import tqdm
|
||||
|
||||
from Ordinal.utils import load_samples_folder, load_single_sample_as_csv
|
||||
|
||||
|
||||
"""
|
||||
This script takes a pre-trained model (a fine-tuned one) and generates numerical representations for all
|
||||
samples in the dataset. The representations are saved in npy-txt plain format.
|
||||
"""
|
||||
|
||||
|
||||
def tokenize_function(example):
    """Tokenize the 'review' field of `example` and move the tensors to the GPU.

    Uses the module-level `tokenizer`; when the module-level `debug` flag is
    set the maximum length is capped at 64 tokens, otherwise no explicit
    maximum is passed.

    :return: dict with the 'input_ids' and 'attention_mask' CUDA tensors
    """
    encoded = tokenizer(
        example['review'],
        padding='max_length',
        truncation=True,
        max_length=64 if debug else None,
        return_tensors='pt',
    )
    input_ids = encoded.input_ids.cuda()
    attention_mask = encoded.attention_mask.cuda()
    return {'input_ids': input_ids, 'attention_mask': attention_mask}
|
||||
|
||||
|
||||
def save_samples_as_txt(tensors, labels, path):
    """Write labels and vectors to a plain-text file, one document per line.

    The label goes in the first column (integer format), followed by the
    vector components (float format).

    :param tensors: 2-D array with one vector per row
    :param labels: object exposing `.values`; it must stack horizontally with
        `tensors` (presumably a single-column DataFrame -- verify against the caller)
    :param path: destination file
    """
    label_column = labels.values
    table = np.hstack([label_column, tensors])
    column_formats = ['%d'] + ['%f'] * tensors.shape[1]
    np.savetxt(path, table, fmt=column_formats)
|
||||
|
||||
|
||||
def transform_sample(instances, labels, outpath, batch_size=50):
    """Run the model over one sample, batch by batch, and save the resulting vectors.

    Relies on the module-level `model`, `generation_mode` and `tokenize_function`.
    Depending on `generation_mode` each document is represented by:
      - 'posteriors': the softmax of the output logits;
      - 'last': position 0 of the last hidden layer;
      - 'average': position 0 averaged across all hidden layers.

    :param instances: the documents; sliced in chunks of `batch_size` and passed to `tokenize_function`
    :param labels: the labels of the documents (its length gives the number of documents)
    :param outpath: file in which labels and vectors are written (see `save_samples_as_txt`)
    :param batch_size: number of documents per forward pass; must divide the number of documents
    :raises AssertionError: if the number of documents is not a multiple of `batch_size`
    :raises NotImplementedError: if `generation_mode` is not one of the three modes above
    """
    ndocs = len(labels)
    # the check must be against the batch size: testing against the number of
    # batches lets fragmented batches through and divides by zero when ndocs < batch_size
    assert ndocs % batch_size == 0, 'fragmented last bach not supported'

    transformations = []
    for batch_start in range(0, ndocs, batch_size):

        batch_instances = instances[batch_start:batch_start + batch_size]

        tokenized_dataset = tokenize_function(batch_instances)
        out = model(**tokenized_dataset, output_hidden_states=True)

        if generation_mode == 'posteriors':
            logits = out.logits
            posteriors = torch.softmax(logits, dim=-1)
            transformed = posteriors
        elif generation_mode == 'last':
            hidden_states = out.hidden_states
            # position 0 of the sequence (named "cls" here), taken from the last layer
            last_layer_cls = hidden_states[-1][:, 0, :]
            transformed = last_layer_cls
        elif generation_mode == 'average':
            hidden_states = out.hidden_states
            # stack the per-layer states so that the layer becomes the leading axis
            hidden_states = torch.stack(hidden_states)
            all_layer_cls = hidden_states[:, :, 0, :]
            average_cls = torch.mean(all_layer_cls, dim=0)
            transformed = average_cls
        else:
            raise NotImplementedError()

        transformations.append(transformed.cpu().numpy())

    transformations = np.vstack(transformations)
    save_samples_as_txt(transformations, labels, outpath)
|
||||
|
||||
|
||||
def transform_folder_samples(protocol, splitname, skip=0):
    """Transform every sample of one split folder into vectors.

    Samples are read from `<datapath>/<domain>/<protocol>/<splitname>` and the
    i-th one is written to `<datapath>/<outname>/<protocol>/<splitname>/<i>.txt`.

    :param protocol: name of the protocol folder
    :param splitname: name of the split folder; names starting with 'dev' are
        expected to hold 1000 samples, any other 5000 (progress bar only)
    :param skip: number of leading samples to read but not transform
    """
    in_folder = join(datapath, domain, protocol, splitname)
    out_folder = join(datapath, outname, protocol, splitname)
    total = 1000 if splitname.startswith('dev') else 5000

    samples = load_samples_folder(in_folder, load_fn=load_single_sample_as_csv)
    progress = tqdm(enumerate(samples), desc=f'{protocol} {splitname}', total=total)
    for i, (instances, labels) in progress:
        if i < skip:
            continue
        transform_sample(instances, labels, outpath=join(out_folder, f'{i}.txt'))
|
||||
|
||||
|
||||
def get_best_checkpoint(checkpointdir):
    """Return the path of the checkpoint folder with the smallest step number.

    Looks for sub-folders named `checkpoint-<step>` inside `checkpointdir`; at
    most two are expected (the best one and the last one) and the one with the
    lowest step is chosen.

    :param checkpointdir: folder containing the `checkpoint-<step>` sub-folders
    :return: the string `<checkpointdir>/checkpoint-<lowest step>`
    :raises ValueError: if no checkpoint folder is found
    :raises AssertionError: if more than two checkpoint folders are found
    """
    from glob import glob
    # split on the LAST occurrence, so that a 'checkpoint-' inside the parent path is harmless
    steps = [int(folder.rsplit('checkpoint-', 1)[1]) for folder in glob(f'{checkpointdir}/checkpoint-*')]
    if not steps:
        raise ValueError(f'no checkpoint-* folder found in {checkpointdir}')
    assert len(steps) <= 2, 'unexpected number of steps, only two where expected (the best one and the last one)'
    choosen = f'{checkpointdir}/checkpoint-{min(steps)}'
    print(f'choosen checkpoint is {choosen}')
    return choosen
|
||||
|
||||
|
||||
if __name__ == '__main__':
    debug = False
    assert torch.cuda.is_available(), 'cuda is not available'

    #checkpoint='roberta-base-val-finetuned'
    #generation_mode = 'average' #ave seemed to work slightly better

    # the usage message names the modes exactly as they are validated below
    n_args = len(sys.argv)
    assert n_args==3, 'wrong arguments, expected: <checkpoint> <generation-mode>\n' \
                      '\tgeneration-mode: last (last layer), average (average pooling), or posteriors (posterior probabilities)'

    checkpoint = sys.argv[1]  #e.g., 'bert-base-uncased'
    generation_mode = sys.argv[2]  # e.g., 'average' # ave seemed to work slightly better

    assert 'finetuned' in checkpoint, 'looks like this model is not finetuned'

    # from here on `checkpoint` is the path of the selected checkpoint sub-folder
    checkpoint = get_best_checkpoint(checkpoint)

    num_labels = 5

    datapath = './data'
    domain = 'Books'
    protocols = ['real', 'app']  # ['app', 'npp']

    assert generation_mode in ['last', 'average', 'posteriors'], 'unknown generation_model'
    outname = domain + f'-{checkpoint}-{generation_mode}'

    # inference only: no gradients are needed for any of the forward passes below
    with torch.no_grad():
        print('loading', checkpoint)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda()

        os.makedirs(join(datapath, outname), exist_ok=True)

        print('transforming the training set')
        instances, labels = load_single_sample_as_csv(join(datapath, domain), 'training_data')
        transform_sample(instances, labels, join(datapath, outname, 'training_data.txt'))
        print('[done]')

        for protocol in protocols:
            in_path = join(datapath, domain, protocol)
            out_path = join(datapath, outname, protocol)
            os.makedirs(out_path, exist_ok=True)
            os.makedirs(join(out_path, 'dev_samples'), exist_ok=True)
            os.makedirs(join(out_path, 'test_samples'), exist_ok=True)
            # the prevalence files do not depend on the representation: copy them verbatim
            shutil.copyfile(join(in_path, 'dev_prevalences.txt'), join(out_path, 'dev_prevalences.txt'))
            shutil.copyfile(join(in_path, 'test_prevalences.txt'), join(out_path, 'test_prevalences.txt'))

            print('processing', protocol)
            transform_folder_samples(protocol, 'dev_samples')
            transform_folder_samples(protocol, 'test_samples')
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,156 @@
|
|||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
import quapy as qp
|
||||
import numpy as np
|
||||
|
||||
from Ordinal.model import OrderedLogisticRegression, LogisticAT
|
||||
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC, SLD, HDy
|
||||
from quapy.data import LabelledCollection
|
||||
from os.path import join
|
||||
import os
|
||||
from utils import load_samples_folder, load_simple_sample_npytxt, load_single_sample_pkl
|
||||
from evaluation import nmd, mnmd
|
||||
from time import time
|
||||
import pickle
|
||||
from tqdm import tqdm
|
||||
import mord
|
||||
|
||||
|
||||
|
||||
def quantifiers():
    """Yield `(name, quantifier, param_grid)` triples for the methods to run.

    Only the logistic-regression baselines (CC, PCC, ACC, PACC and SLD) are
    currently enabled; all of them share the same hyper-parameter grid object.
    """
    params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
    # grids for the variants that are currently disabled (see the notes below)
    params_OLR = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
    params_SVR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}

    # baselines (HDy(LR) is disabled)
    baselines = (('CC', CC), ('PCC', PCC), ('ACC', ACC), ('PACC', PACC), ('SLD', EMQ))
    for tag, aggregator in baselines:
        yield f'{tag}(LR)', aggregator(LogisticRegression()), params_LR

    # with order-aware classifiers
    # threshold-based ordinal regression (see https://pythonhosted.org/mord/):
    #   the OLR-AT variants (LogisticAT with params_OLR) are disabled;
    #   other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.)

    # regression-based ordinal regression (see https://pythonhosted.org/mord/):
    #   the SVR variants (RegressorClassifier with params_SVR) are disabled.
    #   I am using my implementation, which caters for predict_proba (linear distance to the two closest
    #   classes, 0 in the rest); the other implementation has OrdinalRidge(alpha=1.0) and LAD(C=1.0) with
    #   my wrapper classes for having the nclasses_; those do not implement predict_proba nor decision_score
|
||||
|
||||
|
||||
def run_experiment(params):
    """Model-select and evaluate one quantifier on one drift level, with and without regression adjustment.

    Reads the module-level settings `posfix`, `resultpath`, `datapath`,
    `domain`, `protocol`, `load_sample_fn` and `train`. Two reports are written:
    `<name>.<drift>.csv` for the selected model and `<name>.<drift>.reg.csv`
    for its regression-adjusted version.

    :param params: a `(name, quantifier, param_grid, drift)` tuple
    :return: a tab-separated line with the method name, the drift and the best
        hyper-parameters, or None if the result file already exists
    """
    qname, q, param_grid, drift = params
    qname += posfix
    resultfile = join(resultpath, f'{qname}.{drift}.csv')
    if os.path.exists(resultfile):
        print(f'result file {resultfile} already exists: continue')
        return None

    print(f'fitting {qname} for {drift}-drift')

    def load_test_samples():
        # only the test samples whose id is listed for this drift level are generated
        ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy'))
        ids = set(ids)
        folderpath = join(datapath, domain, protocol, 'test_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)):
            yield sample.instances, sample.prevalence()

    def load_dev_samples():
        # same as above, for the development samples
        ids = np.load(join(datapath, domain, protocol, f'{drift}drift.dev.id.npy'))
        ids = set(ids)
        folderpath = join(datapath, domain, protocol, 'dev_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)):
            yield sample.instances, sample.prevalence()

    q = qp.model_selection.GridSearchQ(
        q,
        param_grid,
        sample_size=1000,
        protocol='gen',
        error=mnmd,
        val_split=load_dev_samples,
        n_jobs=-1,
        refit=False,
        verbose=True).fit(train)

    hyperparams = f'{qname}\t{drift}\t{q.best_params_}'

    print('[done]')

    report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    mean_nmd = report['nmd'].mean()
    std_nmd = report['nmd'].std()
    print(f'{qname}: {mean_nmd:.4f} +-{std_nmd:.4f}')
    report.to_csv(resultfile, index=False)

    print('[learning regressor-based adjustment]')
    # RegressionQuantification is not imported at the top of this file; it is imported
    # here, right before its first use, so that the report above is written regardless.
    # NOTE(review): assumes the class lives in Ordinal.model -- confirm
    from Ordinal.model import RegressionQuantification
    q = RegressionQuantification(q.best_model(), val_samples_generator=load_dev_samples)
    q.fit(None)

    report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    mean_nmd = report['nmd'].mean()
    std_nmd = report['nmd'].std()
    print(f'[{qname} regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
    resultfile = join(resultpath, f'{qname}.{drift}.reg.csv')
    report.to_csv(resultfile, index=False)

    return hyperparams
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # vector representation to use; must be one of the keys of preprocessing_options
    preprocessing = 'roberta.average'

    # preprocessing mode -> (data folder, suffix appended to the quantifier names in the result files)
    preprocessing_options = {
        'tfidf': ('Books-tfidf', ''),
        'roberta.last': ('Books-roberta-base-finetuned-pkl/checkpoint-1188-last', '-RoBERTa-last'),
        'roberta.average': ('Books-roberta-base-finetuned-pkl/checkpoint-1188-average', '-RoBERTa-average'),
        'roberta.posteriors': ('Books-roberta-base-finetuned-pkl/checkpoint-1188-posteriors', '-RoBERTa-posteriors'),
    }
    if preprocessing not in preprocessing_options:
        # fail early with a clear message instead of a NameError on `domain` further below
        raise ValueError(f'unknown preprocessing {preprocessing!r}; valid ones are {list(preprocessing_options)}')
    domain, posfix = preprocessing_options[preprocessing]

    # these names are module-level globals that run_experiment reads
    load_sample_fn = load_single_sample_pkl
    datapath = './data'
    protocol = 'app'
    resultpath = join('./results', domain, protocol)
    os.makedirs(resultpath, exist_ok=True)

    train = load_sample_fn(join(datapath, domain), 'training_data')

    # drift partitions to evaluate (an earlier variant used [f'smooth{i}' for i in range(5)] + ['all'])
    drifts = ['low', 'mid', 'high', 'all']
    params = [(*qs, drift) for qs in quantifiers() for drift in drifts]
    hypers = qp.util.parallel(run_experiment, params, n_jobs=-2)

    # append the selected hyperparameters; experiments that were skipped return None
    with open(join(resultpath, 'hyper.txt'), 'at') as hyper_file:
        for h in hypers:
            if h is not None:
                hyper_file.write(h)
                hyper_file.write('\n')
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,195 @@
|
|||
import mord
|
||||
import numpy as np
|
||||
from scipy.sparse import issparse
|
||||
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||
from sklearn.decomposition import TruncatedSVD
|
||||
from sklearn.linear_model import Ridge
|
||||
from sklearn.svm import LinearSVR
|
||||
from sklearn.utils.class_weight import compute_class_weight
|
||||
from statsmodels.miscmodels.ordinal_model import OrderedModel
|
||||
|
||||
|
||||
class OrderedLogisticRegression:
    """Ordered logit/probit classifier wrapping the statsmodels ``OrderedModel``.

    Sparse inputs are first projected onto 500 dimensions with a TruncatedSVD, since the
    underlying model is fit on the (dense) output of that projection.
    """

    def __init__(self, model='logit'):
        """
        :param model: distribution of the ordered model, either 'logit' or 'probit'
        """
        assert model in ['logit', 'probit'], 'unknown ordered model, valid ones are logit or probit'
        self.model = model

    def fit(self, X, y):
        """Fit the ordered model on covariates X and labels y.

        :return: self, so that calls can be chained
        """
        if issparse(X):
            self.svd = TruncatedSVD(500)
            X = self.svd.fit_transform(X)
        self.learner = OrderedModel(y, X, distr=self.model)
        self.res_prob = self.learner.fit(method='bfgs', disp=False, skip_hessian=True)
        return self

    def predict(self, X):
        """Return, for each row of X, the index of the class with highest predicted probability."""
        prob = self.predict_proba(X)
        return np.argmax(prob, axis=1)

    def predict_proba(self, X):
        """Return the class probabilities predicted by the fitted model for each row of X."""
        if issparse(X):
            assert hasattr(self, 'svd'), \
                'X matrix in predict is sparse, but the method has not been fit with sparse type'
            X = self.svd.transform(X)
        return self.res_prob.model.predict(self.res_prob.params, exog=X)
|
||||
|
||||
|
||||
class LAD(BaseEstimator, ClassifierMixin):
    """Ordinal classifier that regresses the class index with a LinearSVR.

    The real-valued outputs of the regressor are rounded and clipped to [0, nclasses-1] to obtain
    class predictions; labels are thus expected to be the integers 0..nclasses-1 (they are also
    used to index the per-class weights when class_weight='balanced').
    """

    def __init__(self, C=1.0, class_weight=None):
        """
        :param C: regularization parameter passed to LinearSVR
        :param class_weight: None, or 'balanced' to weight the training examples by class frequency
        """
        self.C = C
        self.class_weight = class_weight

    def fit(self, X, y, sample_weight=None):
        """Fit the regressor; if class_weight is 'balanced', any given sample_weight is replaced."""
        self.regressor = LinearSVR(C=self.C)
        # np.unique returns a sorted ndarray, which is the type compute_class_weight documents
        classes = np.unique(y)
        self.nclasses = len(classes)
        if self.class_weight == 'balanced':
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        self.regressor.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        """Return the predicted class indexes (regression output rounded and clipped to valid classes)."""
        r = self.regressor.predict(X)
        # builtin int instead of the np.int alias, which recent NumPy versions no longer provide
        return np.clip(np.round(r), 0, self.nclasses - 1).astype(int)

    def decision_function(self, X):
        """Return an (n_samples, nclasses) matrix of scores 1 - |k - r| for class index k and regression output r."""
        r = self.regressor.predict(X)
        nC = len(self.classes_)
        dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1, 1))
        invdist = 1 - dists
        return invdist

    @property
    def classes_(self):
        return np.arange(self.nclasses)

    def get_params(self, deep=True):
        return {'C': self.C, 'class_weight': self.class_weight}

    def set_params(self, **params):
        """Update the given hyperparameters (any subset of C, class_weight) and return self."""
        for name in ('C', 'class_weight'):
            if name in params:
                setattr(self, name, params[name])
        return self
|
||||
|
||||
|
||||
class OrdinalRidge(BaseEstimator, ClassifierMixin):
    """Ordinal classifier that regresses the class index with a Ridge regressor.

    The real-valued outputs of the regressor are rounded and clipped to [0, nclasses-1] to obtain
    class predictions; labels are thus expected to be the integers 0..nclasses-1 (they are also
    used to index the per-class weights when class_weight='balanced').
    """

    def __init__(self, alpha=1.0, class_weight=None, normalize=False):
        """
        :param alpha: regularization strength passed to Ridge
        :param class_weight: None, or 'balanced' to weight the training examples by class frequency
        :param normalize: passed through to Ridge
        """
        self.alpha = alpha
        self.class_weight = class_weight
        self.normalize = normalize

    def fit(self, X, y, sample_weight=None):
        """Fit the regressor; if class_weight is 'balanced', any given sample_weight is replaced."""
        # NOTE(review): Ridge's `normalize` argument was removed in scikit-learn 1.2 -- confirm the pinned version
        self.regressor = Ridge(alpha=self.alpha, normalize=self.normalize)
        # np.unique returns a sorted ndarray, which is the type compute_class_weight documents
        classes = np.unique(y)
        self.nclasses = len(classes)
        if self.class_weight == 'balanced':
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        self.regressor.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        """Return the predicted class indexes (regression output rounded and clipped to valid classes)."""
        r = self.regressor.predict(X)
        # builtin int instead of the np.int alias, which recent NumPy versions no longer provide
        return np.clip(np.round(r), 0, self.nclasses - 1).astype(int)

    def decision_function(self, X):
        """Return an (n_samples, nclasses) matrix of scores 1 - |k - r| for class index k and regression output r."""
        r = self.regressor.predict(X)
        nC = len(self.classes_)
        dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1, 1))
        invdist = 1 - dists
        return invdist

    @property
    def classes_(self):
        return np.arange(self.nclasses)

    def get_params(self, deep=True):
        return {'alpha': self.alpha, 'class_weight': self.class_weight, 'normalize': self.normalize}

    def set_params(self, **params):
        """Update the given hyperparameters (any subset of alpha, class_weight, normalize) and return self."""
        for name in ('alpha', 'class_weight', 'normalize'):
            if name in params:
                setattr(self, name, params[name])
        return self
|
||||
|
||||
|
||||
# with order-aware classifiers
|
||||
# threshold-based ordinal regression (see https://pythonhosted.org/mord/)
|
||||
class LogisticAT(mord.LogisticAT):
    """mord.LogisticAT extended with a ``class_weight`` option (None or 'balanced')."""

    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super().__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        """Fit the model; with class_weight='balanced' the sample weights are derived from the class
        frequencies (replacing any given sample_weight). Labels are used as indexes into the
        per-class weight vector."""
        if self.class_weight == 'balanced':
            # np.unique returns a sorted ndarray, which is the type compute_class_weight documents
            classes = np.unique(y)
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        return super().fit(X, y, sample_weight=sample_weight)
|
||||
|
||||
|
||||
class LogisticSE(mord.LogisticSE):
    """mord.LogisticSE extended with a ``class_weight`` option (None or 'balanced')."""

    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super().__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        """Fit the model; with class_weight='balanced' the sample weights are derived from the class
        frequencies (replacing any given sample_weight). Labels are used as indexes into the
        per-class weight vector."""
        if self.class_weight == 'balanced':
            # np.unique returns a sorted ndarray, which is the type compute_class_weight documents
            classes = np.unique(y)
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        return super().fit(X, y, sample_weight=sample_weight)
|
||||
|
||||
|
||||
class LogisticIT(mord.LogisticIT):
    """mord.LogisticIT extended with a ``class_weight`` option (None or 'balanced')."""

    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super().__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        """Fit the model; with class_weight='balanced' the sample weights are derived from the class
        frequencies (replacing any given sample_weight). Labels are used as indexes into the
        per-class weight vector."""
        if self.class_weight == 'balanced':
            # np.unique returns a sorted ndarray, which is the type compute_class_weight documents
            classes = np.unique(y)
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        return super().fit(X, y, sample_weight=sample_weight)
|
||||
|
||||
|
||||
# regression-based ordinal regression (see https://pythonhosted.org/mord/)
|
||||
# class LAD(mord.LAD):
|
||||
# def fit(self, X, y):
|
||||
# self.classes_ = sorted(np.unique(y))
|
||||
# return super().fit(X, y)
|
||||
|
||||
|
||||
# class OrdinalRidge(mord.OrdinalRidge):
|
||||
# def fit(self, X, y):
|
||||
# self.classes_ = sorted(np.unique(y))
|
||||
# return super().fit(X, y)
|
||||
|
|
@ -0,0 +1,296 @@
|
|||
from copy import deepcopy
|
||||
import numpy as np
|
||||
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||
from sklearn.calibration import CalibratedClassifierCV
|
||||
from sklearn.decomposition import TruncatedSVD
|
||||
from sklearn.linear_model import LogisticRegression, Ridge
|
||||
from scipy.sparse import issparse
|
||||
from sklearn.multiclass import OneVsRestClassifier
|
||||
from sklearn.multioutput import MultiOutputRegressor
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.svm import LinearSVR, SVR
|
||||
from statsmodels.miscmodels.ordinal_model import OrderedModel
|
||||
import mord
|
||||
from sklearn.utils.class_weight import compute_class_weight
|
||||
|
||||
|
||||
class OrderedLogisticRegression:
    """Ordered logit/probit classifier wrapping the statsmodels ``OrderedModel``.

    Sparse inputs are first projected onto 500 dimensions with a TruncatedSVD, since the
    underlying model is fit on the (dense) output of that projection.
    """

    def __init__(self, model='logit'):
        """
        :param model: distribution of the ordered model, either 'logit' or 'probit'
        """
        assert model in ['logit', 'probit'], 'unknown ordered model, valid ones are logit or probit'
        self.model = model

    def fit(self, X, y):
        """Fit the ordered model on covariates X and labels y.

        :return: self, so that calls can be chained
        """
        if issparse(X):
            self.svd = TruncatedSVD(500)
            X = self.svd.fit_transform(X)
        self.learner = OrderedModel(y, X, distr=self.model)
        self.res_prob = self.learner.fit(method='bfgs', disp=False, skip_hessian=True)
        return self

    def predict(self, X):
        """Return, for each row of X, the index of the class with highest predicted probability."""
        prob = self.predict_proba(X)
        return np.argmax(prob, axis=1)

    def predict_proba(self, X):
        """Return the class probabilities predicted by the fitted model for each row of X."""
        if issparse(X):
            assert hasattr(self, 'svd'), \
                'X matrix in predict is sparse, but the method has not been fit with sparse type'
            X = self.svd.transform(X)
        # the fitted results object exposes the underlying model as `.model`
        return self.res_prob.model.predict(self.res_prob.params, exog=X)
|
||||
|
||||
|
||||
class StackedClassifier:  # aka Funnelling Monolingual
    """Two-level stacked classifier.

    A base classifier is trained on the original features; its standardized posterior
    probabilities are then used as the input features of a second (meta) classifier of the
    same type.
    """

    def __init__(self, base_estimator=None):
        """
        :param base_estimator: classifier used (as independent copies) for both levels; defaults to
            a new LogisticRegression. Estimators without predict_proba are wrapped in a
            CalibratedClassifierCV.
        """
        if base_estimator is None:
            # created per instance rather than once as a shared default argument
            base_estimator = LogisticRegression()
        if not hasattr(base_estimator, 'predict_proba'):
            print('the estimator does not seem to be probabilistic: calibrating')
            base_estimator = CalibratedClassifierCV(base_estimator)
        self.base = deepcopy(base_estimator)
        self.meta = deepcopy(base_estimator)
        self.norm = StandardScaler()

    def _base_posteriors(self, X):
        # standardized posterior probabilities of the (already fitted) base classifier
        return self.norm.transform(self.base.predict_proba(X))

    def fit(self, X, y):
        """Fit the base classifier on X, then the scaler and the meta classifier on the base posteriors for X."""
        self.base.fit(X, y)
        P = self.base.predict_proba(X)
        P = self.norm.fit_transform(P)
        self.meta.fit(P, y)
        return self

    def predict(self, X):
        """Return the predictions of the meta classifier."""
        return self.meta.predict(self._base_posteriors(X))

    def predict_proba(self, X):
        """Return the posterior probabilities of the meta classifier."""
        return self.meta.predict_proba(self._base_posteriors(X))
|
||||
|
||||
|
||||
class RegressionQuantification:
    """Quantifier that corrects the estimates of a base quantifier with a learned regression.

    The regressor is trained to map the prevalence estimates that the base quantifier produces
    for a set of validation samples onto the true prevalences of those samples.
    """

    def __init__(self,
                 base_quantifier,
                 regression='svr',
                 val_samples_generator=None,
                 norm=True):
        """
        :param base_quantifier: the quantifier whose estimates are corrected
        :param regression: 'ridge', 'svr', or an already instantiated regressor exposing fit/predict
        :param val_samples_generator: zero-argument callable yielding (instances, prevalence) pairs
        :param norm: passed as `normalize` to Ridge (only used when regression='ridge')
        """
        self.base_quantifier = base_quantifier
        if isinstance(regression, str):
            assert regression in ['ridge', 'svr'], 'unknown regression model'
            if regression == 'ridge':
                self.reg = Ridge(normalize=norm)
            elif regression == 'svr':
                self.reg = MultiOutputRegressor(LinearSVR())
        else:
            self.reg = regression
        # Alternatives annotated in earlier versions of this code: MultiTaskElasticNetCV and
        # LinearRegression worked well; ARDRegression and BayesianRidge (wrapped in a
        # MultiOutputRegressor) worked fairly well even without normalization; SGDRegressor was too slow.
        self.regression = regression
        self.val_samples_generator = val_samples_generator

    def generate_validation_samples(self):
        """Return (Xs, ys): the base quantifier's estimates and the true prevalences of the validation samples.

        :raises ValueError: if no validation samples generator was provided
        """
        if self.val_samples_generator is None:
            raise ValueError('a val_samples_generator is required to train the regression')
        Xs, ys = [], []
        for instances, prevalence in self.val_samples_generator():
            ys.append(prevalence)
            Xs.append(self.base_quantifier.quantify(instances))
        Xs = np.asarray(Xs)
        ys = np.asarray(ys)
        return Xs, ys

    def fit(self, data):
        """Train the base quantifier on data (skipped if data is None) and then the regressor.

        :return: self
        """
        print('fitting quantifier')
        if data is not None:
            self.base_quantifier.fit(data)
        print('generating val samples')
        Xs, ys = self.generate_validation_samples()
        print('fitting regressor')
        self.reg.fit(Xs, ys)
        print('[done]')
        return self

    def quantify(self, instances):
        """Return the corrected prevalence estimates for instances.

        The regressor's output is clipped to [0, 1] and normalized to sum to 1; if every value
        is clipped to 0, the uniform distribution is returned instead of dividing by zero.
        """
        Xs = self.base_quantifier.quantify(instances).reshape(1, -1)
        Xs = self.reg.predict(Xs).flatten()
        Xs = np.clip(Xs, 0, 1)
        total = Xs.sum()
        if total == 0:
            return np.full(Xs.shape, 1.0 / len(Xs))
        return Xs / total

    def get_params(self, deep=True):
        return self.base_quantifier.get_params()

    def set_params(self, **params):
        """Forward the hyperparameters to the base quantifier and return self."""
        self.base_quantifier.set_params(**params)
        return self
|
||||
|
||||
|
||||
class LAD(BaseEstimator, ClassifierMixin):
    """Ordinal classifier that regresses the class index with a LinearSVR.

    The real-valued outputs of the regressor are rounded and clipped to [0, nclasses-1] to obtain
    class predictions; labels are thus expected to be the integers 0..nclasses-1 (they are also
    used to index the per-class weights when class_weight='balanced').
    """

    def __init__(self, C=1.0, class_weight=None):
        """
        :param C: regularization parameter passed to LinearSVR
        :param class_weight: None, or 'balanced' to weight the training examples by class frequency
        """
        self.C = C
        self.class_weight = class_weight

    def fit(self, X, y, sample_weight=None):
        """Fit the regressor; if class_weight is 'balanced', any given sample_weight is replaced."""
        self.regressor = LinearSVR(C=self.C)
        # np.unique returns a sorted ndarray, which is the type compute_class_weight documents
        classes = np.unique(y)
        self.nclasses = len(classes)
        if self.class_weight == 'balanced':
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        self.regressor.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        """Return the predicted class indexes (regression output rounded and clipped to valid classes)."""
        r = self.regressor.predict(X)
        return np.clip(np.round(r), 0, self.nclasses - 1).astype(int)

    def decision_function(self, X):
        """Return an (n_samples, nclasses) matrix of scores 1 - |k - r| for class index k and regression output r."""
        r = self.regressor.predict(X)
        nC = len(self.classes_)
        dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1, 1))
        invdist = 1 - dists
        return invdist

    @property
    def classes_(self):
        return np.arange(self.nclasses)

    def get_params(self, deep=True):
        return {'C': self.C, 'class_weight': self.class_weight}

    def set_params(self, **params):
        """Update the given hyperparameters (any subset of C, class_weight) and return self."""
        for name in ('C', 'class_weight'):
            if name in params:
                setattr(self, name, params[name])
        return self
|
||||
|
||||
|
||||
class OrdinalRidge(BaseEstimator, ClassifierMixin):
    """Ordinal classifier that regresses the class index with a Ridge regressor.

    The real-valued outputs of the regressor are rounded and clipped to [0, nclasses-1] to obtain
    class predictions; labels are thus expected to be the integers 0..nclasses-1 (they are also
    used to index the per-class weights when class_weight='balanced').
    """

    def __init__(self, alpha=1.0, class_weight=None, normalize=False):
        """
        :param alpha: regularization strength passed to Ridge
        :param class_weight: None, or 'balanced' to weight the training examples by class frequency
        :param normalize: passed through to Ridge
        """
        self.alpha = alpha
        self.class_weight = class_weight
        self.normalize = normalize

    def fit(self, X, y, sample_weight=None):
        """Fit the regressor; if class_weight is 'balanced', any given sample_weight is replaced."""
        # NOTE(review): Ridge's `normalize` argument was removed in scikit-learn 1.2 -- confirm the pinned version
        self.regressor = Ridge(alpha=self.alpha, normalize=self.normalize)
        # np.unique returns a sorted ndarray, which is the type compute_class_weight documents
        classes = np.unique(y)
        self.nclasses = len(classes)
        if self.class_weight == 'balanced':
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        self.regressor.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        """Return the predicted class indexes (regression output rounded and clipped to valid classes)."""
        r = self.regressor.predict(X)
        return np.clip(np.round(r), 0, self.nclasses - 1).astype(int)

    def decision_function(self, X):
        """Return an (n_samples, nclasses) matrix of scores 1 - |k - r| for class index k and regression output r."""
        r = self.regressor.predict(X)
        nC = len(self.classes_)
        dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1, 1))
        invdist = 1 - dists
        return invdist

    @property
    def classes_(self):
        return np.arange(self.nclasses)

    def get_params(self, deep=True):
        return {'alpha': self.alpha, 'class_weight': self.class_weight, 'normalize': self.normalize}

    def set_params(self, **params):
        """Update the given hyperparameters (any subset of alpha, class_weight, normalize) and return self."""
        for name in ('alpha', 'class_weight', 'normalize'):
            if name in params:
                setattr(self, name, params[name])
        return self
|
||||
|
||||
# with order-aware classifiers
|
||||
# threshold-based ordinal regression (see https://pythonhosted.org/mord/)
|
||||
class LogisticAT(mord.LogisticAT):
    """mord.LogisticAT extended with a ``class_weight`` option (None or 'balanced')."""

    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super().__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        """Fit the model; with class_weight='balanced' the sample weights are derived from the class
        frequencies (replacing any given sample_weight). Labels are used as indexes into the
        per-class weight vector."""
        if self.class_weight == 'balanced':
            # np.unique returns a sorted ndarray, which is the type compute_class_weight documents
            classes = np.unique(y)
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        return super().fit(X, y, sample_weight=sample_weight)
|
||||
|
||||
|
||||
class LogisticSE(mord.LogisticSE):
    """mord.LogisticSE extended with a ``class_weight`` option (None or 'balanced')."""

    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super().__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        """Fit the model; with class_weight='balanced' the sample weights are derived from the class
        frequencies (replacing any given sample_weight). Labels are used as indexes into the
        per-class weight vector."""
        if self.class_weight == 'balanced':
            # np.unique returns a sorted ndarray, which is the type compute_class_weight documents
            classes = np.unique(y)
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        return super().fit(X, y, sample_weight=sample_weight)
|
||||
|
||||
|
||||
class LogisticIT(mord.LogisticIT):
    """mord.LogisticIT extended with a ``class_weight`` option (None or 'balanced')."""

    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super().__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        """Fit the model; with class_weight='balanced' the sample weights are derived from the class
        frequencies (replacing any given sample_weight). Labels are used as indexes into the
        per-class weight vector."""
        if self.class_weight == 'balanced':
            # np.unique returns a sorted ndarray, which is the type compute_class_weight documents
            classes = np.unique(y)
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        return super().fit(X, y, sample_weight=sample_weight)
|
||||
|
||||
|
||||
# regression-based ordinal regression (see https://pythonhosted.org/mord/)
|
||||
# class LAD(mord.LAD):
|
||||
# def fit(self, X, y):
|
||||
# self.classes_ = sorted(np.unique(y))
|
||||
# return super().fit(X, y)
|
||||
|
||||
|
||||
# class OrdinalRidge(mord.OrdinalRidge):
|
||||
# def fit(self, X, y):
|
||||
# self.classes_ = sorted(np.unique(y))
|
||||
# return super().fit(X, y)
|
||||
|
|
@ -0,0 +1,51 @@
|
|||
import numpy as np
|
||||
import quapy as qp
|
||||
from evaluation import nmd
|
||||
from Ordinal.utils import load_samples_folder, load_single_sample_pkl
|
||||
from quapy.data import LabelledCollection
|
||||
import pickle
|
||||
import os
|
||||
from os.path import join
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
"""
|
||||
This script generates a partition of a dataset in terms of "shift".
|
||||
The partition is only carried out by generating index vectors.
|
||||
"""
|
||||
|
||||
|
||||
def partition_by_drift(split, training_prevalence):
    """Partition the samples of a split into low/mid/high drift thirds and save the index vectors.

    The drift of a sample is the nmd between the training prevalence and the sample prevalence.
    Samples are sorted by drift and divided into three consecutive blocks; an additional 'all'
    index vector covers every sample. The index vectors are stored as
    ``<name>drift.<split>.id.npy`` in the 'app' folder of the (module-level) ``domain``.

    :param split: 'dev' or 'test'
    :param training_prevalence: prevalence vector of the training set
    """
    assert split in ['dev', 'test'], 'invalid split name'
    total = 1000 if split == 'dev' else 5000  # only used to size the progress bar
    folderpath = join(datapath, domain, 'app', f'{split}_samples')
    samples = load_samples_folder(folderpath, load_fn=load_single_sample_pkl)
    drifts = np.asarray([nmd(training_prevalence, sample.prevalence()) for sample in tqdm(samples, total=total)])
    order = np.argsort(drifts)
    nD = len(order)
    partitions = {
        'low': order[:nD // 3],
        'mid': order[nD // 3:2 * nD // 3],
        'high': order[2 * nD // 3:],
        'all': np.arange(nD),
    }
    for name, ids in partitions.items():
        np.save(join(datapath, domain, 'app', f'{name}drift.{split}.id.npy'), ids)
    for name, ids in partitions.items():
        block = drifts[ids]
        print(f'{name} drift: interval [{block.min():.4f}, {block.max():.4f}] mean: {block.mean():.4f}')
|
||||
|
||||
|
||||
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-posteriors'
datapath = './data'

# the context manager closes the pickle file once the training set is loaded
with open(join(datapath, domain, 'training_data.pkl'), 'rb') as training_file:
    training = pickle.load(training_file)

partition_by_drift('dev', training.prevalence())
partition_by_drift('test', training.prevalence())
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
import numpy as np
|
||||
from Ordinal.evaluation import smoothness
|
||||
from Ordinal.utils import load_samples_folder, load_single_sample_pkl
|
||||
from os.path import join
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
"""
|
||||
This script generates a partition of a dataset in terms of "smoothness".
|
||||
The partition is only carried out by generating index vectors.
|
||||
"""
|
||||
|
||||
|
||||
def partition_by_smoothness(split):
    """Split the samples of `split` ('dev' or 'test') into 5 blocks of increasing smoothness.

    The index vector of each block is saved as ``smooth<i>.<split>.id.npy`` and the indexes of
    all samples as ``all.<split>.id.npy``, in the 'app' folder of the (module-level) ``domain``.
    """
    assert split in ['dev', 'test'], 'invalid split name'
    expected = 5000 if split != 'dev' else 1000  # only used to size the progress bar
    outdir = join(datapath, domain, 'app')
    samples = load_samples_folder(join(outdir, f'{split}_samples'), load_fn=load_single_sample_pkl)
    smooths = []
    for sample in tqdm(samples, total=expected):
        smooths.append(smoothness(sample.prevalence()))
    smooths = np.asarray(smooths)
    order = np.argsort(smooths)
    every_index = np.arange(len(order))
    for i, smooth_idx in enumerate(np.array_split(order, 5)):
        block = smooths[smooth_idx]
        print(f'smooth block {i}: shape={smooth_idx.shape}, interval=[{block.min()}, {block.max()}] mean={block.mean()}')
        np.save(join(datapath, domain, 'app', f'smooth{i}.{split}.id.npy'), smooth_idx)
    np.save(join(datapath, domain, 'app', f'all.{split}.id.npy'), every_index)
|
||||
|
||||
|
||||
# alternative dataset: 'Books-tfidf'
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
datapath = './data'

# generate the partitions for the development samples first, then for the test samples
for _split in ('dev', 'test'):
    partition_by_smoothness(_split)
|
||||
|
|
@ -0,0 +1,51 @@
|
|||
import quapy as qp
|
||||
from quapy.data import LabelledCollection
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from os.path import join
|
||||
import os
|
||||
import pickle
|
||||
from utils import *
|
||||
from tqdm import tqdm
|
||||
import shutil
|
||||
|
||||
"""
|
||||
This script generates a preprocessing of the raw Amazon-OQ-BK dataset and converts it into dense vectors
|
||||
extracted from a pretrained model (here we use the RoBERTa fine-tuned on the training set)
|
||||
Three vector generation modes are available: posteriors, last, average
|
||||
"""
|
||||
|
||||
# vector generation mode; the module docstring lists: posteriors, last, average
vector_generation = 'posteriors'

datapath = './data'
domain = f'Books-roberta-base-finetuned/checkpoint-1188-{vector_generation}'
outname = domain.replace('-finetuned', '-finetuned-pkl')

protocol = 'app'

print('pickling npy txt files')
print('from:', join(datapath, domain))
print('to', join(datapath, outname))
print('for protocol:', protocol)

# makedirs also creates the intermediate folders, so requesting the leaf folders is enough
for _split in ('dev', 'test'):
    os.makedirs(join(datapath, outname, protocol, f'{_split}_samples'), exist_ok=True)
for _split in ('dev', 'test'):
    shutil.copyfile(join(datapath, domain, protocol, f'{_split}_prevalences.txt'),
                    join(datapath, outname, protocol, f'{_split}_prevalences.txt'))

train = load_simple_sample_npytxt(join(datapath, domain), 'training_data', classes=np.arange(5))
# the context manager guarantees the pickle is flushed and the file handle closed
with open(join(datapath, outname, 'training_data.pkl'), 'wb') as _fout:
    pickle.dump(train, _fout, pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
|
||||
def transform_folder_samples(protocol, splitname):
    """Load every sample in the folder of the given protocol and split, and pickle it as ``<i>.pkl``
    (i being the position in which the sample is loaded) in the corresponding output folder."""
    folder_dir = join(datapath, domain, protocol, splitname)
    samples = load_samples_folder(folder_dir, filter=None, load_fn=load_simple_sample_npytxt, classes=train.classes_)
    for i, sample in tqdm(enumerate(samples)):
        # one file per sample: close each handle right away instead of leaving it to the garbage collector
        with open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb') as fout:
            pickle.dump(sample, fout, pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
|
||||
# convert the development samples first, then the test samples
for _splitname in ('dev_samples', 'test_samples'):
    transform_folder_samples(protocol, _splitname)
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,62 @@
|
|||
import quapy as qp
|
||||
from Ordinal.utils import load_simple_sample_raw, load_samples_raw
|
||||
from quapy.data import LabelledCollection
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from os.path import join
|
||||
import os
|
||||
import pickle
|
||||
from tqdm import tqdm
|
||||
import shutil
|
||||
|
||||
|
||||
|
||||
"""
|
||||
This script generates a preprocessing of the raw Amazon-OQ-BK dataset and converts it into tfidf vectors.
|
||||
"""
|
||||
|
||||
|
||||
# root data folder, name of the raw dataset, and name of the folder receiving the tfidf version
datapath = './data'
domain = 'Books'
outname = f'{domain}-tfidf'
|
||||
|
||||
|
||||
def save_preprocessing_info(transformer):
    """Write the string representation of `transformer` to 'prep-info.txt' in the output folder."""
    info_path = join(datapath, outname, 'prep-info.txt')
    description = str(transformer)
    with open(info_path, 'wt') as info_file:
        info_file.write(description + '\n')
|
||||
|
||||
|
||||
# create the output folder structure and copy the prevalence files of each protocol;
# makedirs also creates the intermediate folders, so requesting the leaf folders is enough
for _protocol in ('app', 'real'):
    for _split in ('dev', 'test'):
        os.makedirs(join(datapath, outname, _protocol, f'{_split}_samples'), exist_ok=True)
    for _split in ('dev', 'test'):
        shutil.copyfile(join(datapath, domain, _protocol, f'{_split}_prevalences.txt'),
                        join(datapath, outname, _protocol, f'{_split}_prevalences.txt'))

tfidf = TfidfVectorizer(sublinear_tf=True, ngram_range=(1, 2), min_df=5)

# the vectorizer is fit on the training documents only
train = LabelledCollection.load(join(datapath, domain, 'training_data.txt'), loader_func=qp.data.reader.from_text)
train.instances = tfidf.fit_transform(train.instances)
save_preprocessing_info(tfidf)
# the context manager guarantees the pickle is flushed and the file handle closed
with open(join(datapath, outname, 'training_data.pkl'), 'wb') as _fout:
    pickle.dump(train, _fout, pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
|
||||
|
||||
def transform_folder_samples(protocol, splitname):
    """Vectorize every raw sample of the given protocol and split with the fitted tfidf transformer,
    and pickle it as ``<i>.pkl`` (i being the position in which the sample is loaded) in the
    corresponding output folder."""
    samples = load_samples_raw(join(datapath, domain, protocol, splitname), classes=train.classes_)
    for i, sample in tqdm(enumerate(samples)):
        sample.instances = tfidf.transform(sample.instances)
        # one file per sample: close each handle right away instead of leaving it to the garbage collector
        with open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb') as fout:
            pickle.dump(sample, fout, pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
|
||||
# convert the dev and test samples of the 'app' protocol, then those of the 'real' protocol
for _protocol in ('app', 'real'):
    for _splitname in ('dev_samples', 'test_samples'):
        transform_folder_samples(_protocol, _splitname)
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,374 @@
|
|||
import numpy as np
|
||||
import itertools
|
||||
from scipy.stats import ttest_ind_from_stats, wilcoxon
|
||||
|
||||
|
||||
class Table:
    """Accumulate per-(benchmark, method) scores and render them as LaTeX.

    Rows are benchmarks and columns are methods. Raw observations live in
    ``self.map['values']``; every other entry of ``self.map`` ('fill', 'mean',
    'std', 'nobs', 'rank', 'color', 'ttest', 'latex') is an array of shape
    (#benchmarks, #methods) that :meth:`compute` derives from those values.
    Derived maps are recomputed lazily: any modification calls :meth:`touch`
    and the public getters call :meth:`update`.
    """

    VALID_TESTS = [None, "wilcoxon", "ttest"]

    def __init__(self, benchmarks, methods, lower_is_better=True, significance_test='wilcoxon', prec_mean=3,
                 clean_zero=False, show_std=False, prec_std=3, average=True, missing=None, missing_str='--',
                 color=True, show_rel_to=-1):
        """
        :param benchmarks: row names
        :param methods: column names
        :param lower_is_better: whether smaller means rank first
        :param significance_test: one of ``VALID_TESTS``
        :param prec_mean: number of decimals used to print the mean
        :param clean_zero: if True, the leading ' 0.' of printed numbers becomes '.'
        :param show_std: if True, the standard deviation is printed next to the mean
        :param prec_std: number of decimals used to print the standard deviation
        :param average: if True, an additional one-row table of averages is kept
        :param missing: value returned by :meth:`get` for unavailable cells
        :param missing_str: text printed in LaTeX for empty cells
        :param color: if True, each LaTeX cell is followed by its cell-color command
        :param show_rel_to: column index used as reference for relative
            differences; -1 disables them
        """
        assert significance_test in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'

        self.benchmarks = np.asarray(benchmarks)
        self.benchmark_index = {row: i for i, row in enumerate(benchmarks)}

        self.methods = np.asarray(methods)
        self.method_index = {col: j for j, col in enumerate(methods)}

        self.map = {}
        # keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
        self._addmap('values', dtype=object)
        self.lower_is_better = lower_is_better
        self.ttest = significance_test
        self.prec_mean = prec_mean
        self.clean_zero = clean_zero
        self.show_std = show_std
        self.prec_std = prec_std
        self.add_average = average
        self.missing = missing
        self.missing_str = missing_str
        self.color = color
        self.show_rel_to = show_rel_to

        self.touch()

    @property
    def nbenchmarks(self):
        """Number of rows."""
        return len(self.benchmarks)

    @property
    def nmethods(self):
        """Number of columns."""
        return len(self.methods)

    def touch(self):
        """Mark the derived maps as stale."""
        self._modif = True

    def update(self):
        """Recompute the derived maps if something changed since the last computation."""
        if self._modif:
            self.compute()

    def _getfilled(self):
        """Return the (row, col) coordinates of the cells that hold values."""
        return np.argwhere(self.map['fill'])

    @property
    def values(self):
        """Object array with the raw observations of every cell (None if empty)."""
        return self.map['values']

    def _indexes(self):
        """Iterate over all (row, col) coordinates."""
        return itertools.product(range(self.nbenchmarks), range(self.nmethods))

    def _addmap(self, map, dtype, func=None):
        """Create the map called ``map`` and, if ``func`` is given, fill it.

        ``func`` is applied to the raw values of every filled cell (of every
        cell when building the 'fill' map itself).
        """
        shape = (self.nbenchmarks, self.nmethods)
        # deterministic placeholders for unfilled cells rather than whatever
        # np.empty happens to find in memory
        kind = np.dtype(dtype).kind
        if kind == 'O':
            self.map[map] = np.full(shape, None, dtype=object)
        elif kind == 'f':
            self.map[map] = np.full(shape, np.nan, dtype=dtype)
        else:
            self.map[map] = np.zeros(shape, dtype=dtype)
        if func is None:
            return
        m = self.map[map]
        indexes = self._indexes() if map == 'fill' else self._getfilled()
        for i, j in indexes:
            m[i, j] = func(self.values[i, j])

    def _addrank(self):
        """Rank (1 = best) the filled columns of every row according to their mean."""
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
            col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
            ranked_cols_idx = filled_cols_idx[np.argsort(col_means)]
            if not self.lower_is_better:
                ranked_cols_idx = ranked_cols_idx[::-1]
            self.map['rank'][i, ranked_cols_idx] = np.arange(1, len(filled_cols_idx) + 1)

    def _addcolor(self):
        """Assign to every filled cell a cell-color command reflecting its mean within the row."""
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
            if filled_cols_idx.size == 0:
                continue
            col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
            minval = min(col_means)
            maxval = max(col_means)
            for col_idx in filled_cols_idx:
                val = self.map['mean'][i, col_idx]
                norm = (maxval - minval)
                if norm > 0:
                    normval = (val - minval) / norm
                else:
                    # all means are equal: use the neutral midpoint
                    normval = 0.5
                if self.lower_is_better:
                    normval = 1 - normval
                self.map['color'][i, col_idx] = color_red2green_01(normval)

    def _run_ttest(self, row, col1, col2):
        """Return the p-value of a t-test computed from the summary statistics of two cells."""
        mean1 = self.map['mean'][row, col1]
        std1 = self.map['std'][row, col1]
        nobs1 = self.map['nobs'][row, col1]
        mean2 = self.map['mean'][row, col2]
        std2 = self.map['std'][row, col2]
        nobs2 = self.map['nobs'][row, col2]
        _, p_val = ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2)
        return p_val

    def _run_wilcoxon(self, row, col1, col2):
        """Return the p-value of a Wilcoxon test on the raw values of two cells."""
        values1 = self.map['values'][row, col1]
        values2 = self.map['values'][row, col2]
        _, p_val = wilcoxon(values1, values2)
        return p_val

    def _best_column(self, row, filled_cols_idx):
        """Return the filled column of ``row`` with the best mean.

        The best mean is the smallest one when ``lower_is_better`` and the
        largest one otherwise.
        """
        col_means = [self.map['mean'][row, j] for j in filled_cols_idx]
        if self.lower_is_better:
            return filled_cols_idx[np.argmin(col_means)]
        # ties resolve to the last maximal column, i.e., in the same direction
        # in which _addrank reverses the ascending order
        last_max = len(col_means) - 1 - np.argmax(col_means[::-1])
        return filled_cols_idx[last_max]

    def _add_statistical_test(self):
        """Compare every filled cell against the best cell of its row.

        The outcome of ``pval_interpretation`` is stored in ``map['ttest']``;
        the best cell itself is left untouched.
        """
        if self.ttest is None:
            return
        self.some_similar = [False] * self.nmethods
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
            if len(filled_cols_idx) <= 1:
                continue
            best_pos = self._best_column(i, filled_cols_idx)

            for j in filled_cols_idx:
                if j == best_pos:
                    continue
                if self.ttest == 'ttest':
                    p_val = self._run_ttest(i, best_pos, j)
                else:
                    p_val = self._run_wilcoxon(i, best_pos, j)

                pval_outcome = pval_interpretation(p_val)
                self.map['ttest'][i, j] = pval_outcome
                if pval_outcome != 'Diff':
                    self.some_similar[j] = True

    def compute(self):
        """Rebuild every derived map from the raw values."""
        self._addmap('fill', dtype=bool, func=lambda x: x is not None)
        self._addmap('mean', dtype=float, func=np.mean)
        self._addmap('std', dtype=float, func=np.std)
        self._addmap('nobs', dtype=float, func=len)
        self._addmap('rank', dtype=int, func=None)
        self._addmap('color', dtype=object, func=None)
        self._addmap('ttest', dtype=object, func=None)
        self._addmap('latex', dtype=object, func=None)
        self._addrank()
        self._addcolor()
        self._add_statistical_test()
        if self.add_average:
            self._addave()
        self._modif = False

    def _is_column_full(self, col):
        """Tell whether the method ``col`` has values for every benchmark."""
        return all(self.map['fill'][:, self.method_index[col]])

    def _addave(self):
        """Build the one-row table of averages and store it in ``self.average``."""
        # the average table inherits the formatting options of this table
        ave = Table(['ave'], self.methods, lower_is_better=self.lower_is_better, significance_test=self.ttest, average=False,
                    missing=self.missing, missing_str=self.missing_str, prec_mean=self.prec_mean, prec_std=self.prec_std,
                    show_std=self.show_std, clean_zero=self.clean_zero, color=self.color)
        for col in self.methods:
            values = None
            # a method gets an average only if it was run on every benchmark
            if self._is_column_full(col):
                if self.ttest == 'ttest':
                    values = np.asarray(self.map['mean'][:, self.method_index[col]])
                else:  # wilcoxon (or no test): pool all the raw values
                    values = np.concatenate(self.values[:, self.method_index[col]])
            ave.add('ave', col, values)
        self.average = ave

    def add(self, benchmark, method, values):
        """Add observations to a cell; repeated calls append to the previous ones.

        :param values: a scalar, a sequence of scalars, or None
        """
        if values is not None:
            values = np.asarray(values)
            if values.ndim == 0:
                # a single scalar becomes a 1-element array
                values = values.flatten()
        rid, cid = self._coordinates(benchmark, method)
        if self.map['values'][rid, cid] is None:
            self.map['values'][rid, cid] = values
        elif values is not None:
            self.map['values'][rid, cid] = np.concatenate([self.map['values'][rid, cid], values])
        self.touch()

    def get(self, benchmark, method, attr='mean'):
        """Return the attribute ``attr`` of a cell, or ``self.missing`` if unavailable."""
        self.update()
        assert attr in self.map, f'unknwon attribute {attr}'
        rid, cid = self._coordinates(benchmark, method)
        if self.map['fill'][rid, cid]:
            v = self.map[attr][rid, cid]
            if v is None or (isinstance(v, float) and np.isnan(v)):
                return self.missing
            return v
        else:
            return self.missing

    def _coordinates(self, benchmark, method):
        """Translate (benchmark, method) names into (row, col) indexes."""
        assert benchmark in self.benchmark_index, f'benchmark {benchmark} out of range'
        assert method in self.method_index, f'method {method} out of range'
        rid = self.benchmark_index[benchmark]
        cid = self.method_index[method]
        return rid, cid

    def get_average(self, method, attr='mean'):
        """Return the attribute ``attr`` of the average of ``method`` (None if averages are disabled)."""
        self.update()
        if self.add_average:
            return self.average.get('ave', method, attr=attr)
        return None

    def get_color(self, benchmark, method):
        """Return the cell-color command of a cell, or '' if it has none."""
        color = self.get(benchmark, method, attr='color')
        if color is None:
            return ''
        return color

    def latexCell(self, benchmark, method):
        """Return the LaTeX code of one cell (``self.missing_str`` for empty cells)."""
        self.update()
        i, j = self._coordinates(benchmark, method)
        if not self.map['fill'][i, j]:
            return self.missing_str

        mean = self.map['mean'][i, j]
        l = f" {mean:.{self.prec_mean}f}"
        if self.clean_zero:
            l = l.replace(' 0.', '.')

        # results that are not found to be different from the best one are
        # typeset in bold as well
        isbest = self.map['rank'][i, j] == 1
        if self.ttest is not None:
            test_label = self.map['ttest'][i, j]
            if test_label in ['Sim', 'Same']:
                isbest = True

        if isbest:
            l = "\\textbf{" + l.strip() + "}\\;"
        else:
            l += '\\; '

        # significance markers are disabled: all results similar to the best
        # one are already shown in bold
        stat = ''

        std = ''
        if self.show_std:
            std = self.map['std'][i, j]
            std = f" {std:.{self.prec_std}f}"
            if self.clean_zero:
                std = std.replace(' 0.', '.')
            std = f" \\pm {std:{self.prec_std}}"

        relto = ''
        if self.show_rel_to != -1:
            if j != self.show_rel_to:
                ref_ave = self.map['mean'][i, self.show_rel_to]
                rel = 100*(mean-ref_ave)/ref_ave
                if abs(rel) < 0.1:
                    relto = '(\\approx)'
                else:
                    plussign = '+' if rel > 0 else ''  # already plugs the '-' sign
                    relto = f'({plussign}{rel:.1f}\\%)'
                # NOTE(review): the relative difference replaces the std; the
                # nesting level of this statement should be confirmed
                std = ''

        if stat != '' or std != '' or relto != '':
            l = f'{l}${stat}{std}{relto}$'

        if self.color:
            cellcolor = self.map['color'][i, j]
            if cellcolor:  # the color is None when it could not be computed
                l += ' ' + cellcolor

        return l

    def latexTabular(self, benchmark_replace=None, method_replace=None, average=True):
        """Return the LaTeX rows of the table (benchmarks as rows, methods as columns).

        :param benchmark_replace: dict mapping benchmark names to the names to print
        :param method_replace: dict mapping method names to the names to print
        :param average: whether to append the row of averages (ignored if the
            table was created with ``average=False``)
        """
        benchmark_replace = {} if benchmark_replace is None else benchmark_replace
        method_replace = {} if method_replace is None else method_replace
        self.update()

        tab = ' & '
        tab += ' & '.join([method_replace.get(col, col) for col in self.methods])
        tab += ' \\\\\\hline\n'
        for row in self.benchmarks:
            rowname = benchmark_replace.get(row, row)
            tab += rowname + ' & '
            tab += self.latexRow(row)

        if average and self.add_average:
            tab += '\\hline\n'
            tab += 'Average & '
            tab += self.latexAverage()
        return tab

    def latexTabularT(self, benchmark_replace=None, method_replace=None, average=True, side=False):
        """Return the LaTeX rows of the transposed table (methods as rows, benchmarks as columns).

        :param benchmark_replace: dict mapping benchmark names to the names to print
        :param method_replace: dict mapping method names to the names to print
        :param average: whether to append the column of averages (ignored if
            the table was created with ``average=False``)
        :param side: whether to wrap the column headers in a ``\\side{}`` command
        """
        benchmark_replace = {} if benchmark_replace is None else benchmark_replace
        method_replace = {} if method_replace is None else method_replace
        self.update()
        show_average = average and self.add_average

        def withside(label):
            return '\\side{'+label+'}' if side else label

        def center(label):
            return '\\multicolumn{1}{c}{'+label+'}'

        tab = ' & '
        tab += ' & '.join([center(withside(benchmark_replace.get(col, col))) for col in self.benchmarks])
        if show_average:
            tab += ' & ' + withside('Ave')
        tab += ' \\\\\\midrule\n'
        for row in self.methods:
            rowname = method_replace.get(row, row)
            tab += rowname + ' & '
            tab += self.latexRowT(row, endl='')
            if show_average:
                tab += ' & '
                tab += self.average.latexCell('ave', row)
            tab += '\\\\\n'
        tab += '\\bottomrule'
        return tab

    def latexRow(self, benchmark, endl='\\\\\\hline\n'):
        """Return the LaTeX cells of one benchmark, joined by ' & ' and ended by ``endl``."""
        s = [self.latexCell(benchmark, col) for col in self.methods]
        s = ' & '.join(s)
        s += ' ' + endl
        return s

    def latexRowT(self, method, endl='\\\\\\hline\n'):
        """Return the LaTeX cells of one method, joined by ' & ' and ended by ``endl``."""
        s = [self.latexCell(benchmark, method) for benchmark in self.benchmarks]
        s = ' & '.join(s)
        s += ' ' + endl
        return s

    def latexAverage(self, endl='\\\\\\hline\n'):
        """Return the LaTeX row of averages (None if averages are disabled)."""
        self.update()
        if self.add_average:
            return self.average.latexRow('ave', endl=endl)
        return None

    def getRankTable(self):
        """Return a new Table whose cells contain the ranks of this table."""
        # the 'fill' and 'rank' maps only exist once the table has been computed
        self.update()
        t = Table(benchmarks=self.benchmarks, methods=self.methods, prec_mean=0, average=True)
        for rid, cid in self._getfilled():
            row = self.benchmarks[rid]
            col = self.methods[cid]
            t.add(row, col, self.get(row, col, 'rank'))
        t.compute()
        return t

    def dropMethods(self, methods):
        """Remove the columns of the given methods from the table."""
        drop_index = [self.method_index[m] for m in methods]
        new_methods = np.delete(self.methods, drop_index)
        new_index = {col: j for j, col in enumerate(new_methods)}

        # the old index is still needed here to select the surviving columns
        self.map['values'] = self.values[:, np.asarray([self.method_index[m] for m in new_methods], dtype=int)]
        self.methods = new_methods
        self.method_index = new_index
        self.touch()
|
||||
|
||||
|
||||
def pval_interpretation(p_val):
    """Map a p-value onto one of the labels 'Diff', 'Sim', or 'Same'.

    :param p_val: the p-value of a statistical test
    :return: 'Diff' if p_val <= 0.005, 'Sim' if 0.005 < p_val <= 0.05,
        'Same' if p_val > 0.05; None if no comparison holds (e.g., NaN)
    """
    if p_val <= 0.005:
        return 'Diff'
    if p_val <= 0.05:
        return 'Sim'
    if p_val > 0.05:
        return 'Same'
    return None
|
||||
|
||||
|
||||
def color_red2green_01(val, maxtone=50):
    """Return a LaTeX cell-color command going from red (0) to green (1).

    :param val: a value in [0,1]; values below 0.5 are shaded red, the rest green
    :param maxtone: the color intensity assigned to the extremes 0 and 1
    :return: a string of the form ``\\cellcolor{<color>!<tone>}``, or None if
        ``val`` is NaN
    """
    if np.isnan(val):
        return None
    assert 0 <= val <= 1, f'val {val} out of range [0,1]'

    # rescale to [-1,1]
    val = val * 2 - 1
    if val < 0:
        color = 'red'
        tone = maxtone * (-val)
    else:
        color = 'green'
        tone = maxtone * val
    # the backslash is escaped explicitly; '\c' is an invalid escape sequence
    return '\\cellcolor{' + color + f'!{int(tone)}' + '}'
|
|
@ -0,0 +1,78 @@
|
|||
import gzip
|
||||
import os
|
||||
from collections import Counter
|
||||
from Ordinal.utils import jaggedness
|
||||
import pickle
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
|
||||
|
||||
# Plot a nrows x ncols grid of sample prevalence histograms, ordered from the
# least jagged to the most jagged sample.
nrows = 3
ncols = 4

# the first row of the file is dropped (presumably a header -- TODO confirm)
prevalences = np.genfromtxt('fact_real_prevalences.csv', delimiter=',')[1:]
#prevalences = prevalences[:nrows*ncols]
print(prevalences)

n = prevalences.shape[1]

class_smooth = []
for i, sample in enumerate(prevalences):
    p = sample
    smooth = jaggedness(p)
    class_smooth.append([smooth, f'Sample {i+1}', p])

# these lines pick the nrows*ncols examples that go from the less jagged to the most jagged
# at equal steps; sorting only looks at (jaggedness, name) so that the prevalence arrays are
# never compared, and the step is kept >= 1 so that fewer than nrows*ncols samples do not
# produce a zero slice step
class_smooth = sorted(class_smooth, key=lambda entry: (entry[0], entry[1]))
step = max(1, len(class_smooth)//(nrows*ncols))
class_smooth = class_smooth[::step]
class_smooth = class_smooth[:nrows*ncols]

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme('paper')
sns.set_style('dark')
sns.set(font_scale=0.5)

# common y-range for all the panels
maxy = np.max(prevalences) + 0.1
class_labels = np.arange(1,n+1)

figure, axis = plt.subplots(nrows, ncols, figsize=(ncols*2, nrows))
for i, (smooth, category, prevalence) in enumerate(class_smooth):
    row = i // ncols
    col = i % ncols

    axis[row, col].bar(class_labels, prevalence, width=1)
    axis[row, col].set_ylim(0, maxy)
    axis[row, col].set_facecolor('white')
    for spine in axis[row, col].spines.values():
        spine.set_edgecolor('black')
        spine.set_linewidth(0.3)

    # only the bottom row shows the x-axis label and ticks
    if row==nrows-1:
        axis[row, col].set_xlabel("energy bin")
        axis[row, col].set_xticks(class_labels)
    else:
        axis[row, col].set_xlabel("")
        axis[row, col].set_xticks([])
    axis[row, col].set_ylabel("")
    axis[row, col].set_yticks([])

    category = category.replace('_', ' ').title()
    category = category.replace(' And ', ' & ')
    axis[row, col].set_title(f'{category} ({smooth:.4f})', x=0.5, y=0.75)

    print(smooth, category, prevalence)

# plt.show()
plt.subplots_adjust(wspace=0, hspace=0)
plt.savefig('Telescope_sample_plotgrid.pdf', bbox_inches='tight')
|
||||
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
import pickle
|
||||
|
||||
# Dump the per-category counters stored in a pickle as a tab-separated text
# file with one line per category: name, then the counts for labels "1".."5".
target_file = './counters_Amazon_merchandise.pkl'
# the context manager closes the pickle file once it has been read
with open(target_file, 'rb') as fin:
    (categories, counters) = pickle.load(fin)

print(categories)
print(counters)

with open('categories.txt', 'wt') as foo:
    for counter, category in zip(counters, categories):
        foo.write(f'{category}\t{counter["1"]}\t{counter["2"]}\t{counter["3"]}\t{counter["4"]}\t{counter["5"]}\n')
|
||||
|
||||
|
|
@ -0,0 +1,67 @@
|
|||
import numpy as np
|
||||
from glob import glob
|
||||
from json import load
|
||||
import os
|
||||
from os.path import join
|
||||
import pickle
|
||||
import pandas as pd
|
||||
import csv
|
||||
import datasets
|
||||
from datasets import Dataset
|
||||
import quapy as qp
|
||||
from quapy.data import LabelledCollection
|
||||
|
||||
|
||||
def jaggedness(p):
|
||||
return (1/min(6, len(p)+1)) * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))
|
||||
|
||||
|
||||
def load_simple_sample_npytxt(parentdir, filename, classes=None):
    """Load a sample stored as a numeric text matrix.

    The file ``<parentdir>/<filename>.txt`` is read with ``np.loadtxt``; the
    first column holds the labels (cast to int32) and the remaining columns
    hold the covariates.

    :param parentdir: directory containing the sample
    :param filename: name of the file, without the '.txt' extension
    :param classes: forwarded as ``classes_`` to ``LabelledCollection``
    :return: a ``LabelledCollection``
    """
    samplepath = join(parentdir, filename+'.txt')
    # ndmin=2 keeps the matrix 2-dimensional even if the file has a single row
    yX = np.loadtxt(samplepath, ndmin=2)
    X = yX[:,1:]
    y = yX[:,0].astype(np.int32)
    return LabelledCollection(instances=X, labels=y, classes_=classes)
|
||||
|
||||
|
||||
def load_simple_sample_raw(parentdir, filename, classes=None):
    """Load the raw-text sample ``<parentdir>/<filename>.txt``.

    :param parentdir: directory containing the sample
    :param filename: name of the file, without the '.txt' extension
    :param classes: forwarded to ``LabelledCollection.load``
    :return: whatever ``LabelledCollection.load`` returns for that file when
        read with ``qp.data.reader.from_text``
    """
    text_reader = qp.data.reader.from_text
    return LabelledCollection.load(
        join(parentdir, filename+'.txt'),
        loader_func=text_reader,
        classes=classes,
    )
|
||||
|
||||
|
||||
def load_single_sample_as_csv(parentdir, filename):
    """Load ``<parentdir>/<filename>.txt`` as a dataset of reviews plus its labels.

    The file is parsed as a tab-separated table with the columns 'labels' and
    'review' and without any quoting.

    :param parentdir: directory containing the sample
    :param filename: name of the file, without the '.txt' extension
    :return: a tuple (sample, labels) where sample is a ``datasets.Dataset``
        with the single string feature 'review', and labels is a one-column
        data frame
    """
    table = pd.read_csv(
        join(parentdir, filename+'.txt'),
        sep='\t',
        names=['labels', 'review'],
        quoting=csv.QUOTE_NONE,
    )
    # the labels column is removed from the table before building the dataset
    labels = table.pop('labels').to_frame()

    review_schema = datasets.Features({'review': datasets.Value('string')})
    return Dataset.from_pandas(df=table, features=review_schema), labels
|
||||
|
||||
|
||||
def load_single_sample_pkl(parentdir, filename):
    """Unpickle and return the object stored in ``<parentdir>/<filename>.pkl``.

    :param parentdir: directory containing the sample
    :param filename: name of the file, without the '.pkl' extension
    :return: the unpickled object
    """
    # NOTE: pickle must only be used on trusted files
    with open(join(parentdir, filename+'.pkl'), 'rb') as fin:
        return pickle.load(fin)
|
||||
|
||||
|
||||
# def load_samples_npytxt(path_dir, filter=None, classes=None):
|
||||
# return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_npytxt)
|
||||
|
||||
|
||||
def load_samples_raw(path_dir, filter=None, classes=None):
    """Iterate over the raw-text samples of the folder ``path_dir``.

    :param path_dir: folder containing the samples
    :param filter: collection of sample ids to load, or None to load them all
    :param classes: forwarded to ``load_simple_sample_raw``
    :return: the generator created by ``load_samples_folder``
    """
    samples = load_samples_folder(
        path_dir,
        filter,
        load_fn=load_simple_sample_raw,
        classes=classes,
    )
    return samples
|
||||
|
||||
|
||||
# def load_samples_as_csv(path_dir, filter=None):
|
||||
# return load_samples_folder(path_dir, filter, load_fn=load_single_sample_as_csv)
|
||||
|
||||
|
||||
# def load_samples_pkl(path_dir, filter=None):
|
||||
# return load_samples_folder(path_dir, filter, load_fn=load_single_sample_pkl)
|
||||
|
||||
|
||||
def load_samples_folder(path_dir, filter=None, load_fn=None, **load_fn_kwargs):
    """Yield, in increasing id order, the samples of the folder ``path_dir``.

    Sample ids range from 0 to the number of entries found in ``path_dir``
    (exclusive); each sample is obtained as
    ``load_fn(path_dir, '<id>', **load_fn_kwargs)``.

    :param path_dir: folder containing the samples
    :param filter: collection of sample ids to load, or None to load them all
    :param load_fn: function that loads one sample given the folder and the id
        (as a string)
    :param load_fn_kwargs: additional keyword arguments for ``load_fn``
    """
    n_entries = len(glob(join(path_dir, '*')))
    for sample_id in range(n_entries):
        if filter is not None and sample_id not in filter:
            continue
        yield load_fn(path_dir, str(sample_id), **load_fn_kwargs)
|
|
@ -1,9 +1,10 @@
|
|||
import numpy as np
|
||||
from scipy.sparse import dok_matrix
|
||||
from tqdm import tqdm
|
||||
from time import time
|
||||
|
||||
|
||||
def from_text(path, encoding='utf-8', verbose=1, class2int=True):
|
||||
def from_text(path, encoding='utf-8', verbose=0, class2int=True):
|
||||
"""
|
||||
Reads a labelled collection of documents.
|
||||
File format <0 or 1>\t<document>\n
|
||||
|
|
|
@ -183,7 +183,7 @@ def _training_helper(learner,
|
|||
if not hasattr(learner, 'predict_proba'):
|
||||
print(f'The learner {learner.__class__.__name__} does not seem to be probabilistic. '
|
||||
f'The learner will be calibrated.')
|
||||
learner = CalibratedClassifierCV(learner, cv=5)
|
||||
learner = CalibratedClassifierCV(learner, cv=5, ensemble=True)
|
||||
if val_split is not None:
|
||||
if isinstance(val_split, float):
|
||||
if not (0 < val_split < 1):
|
||||
|
@ -470,7 +470,7 @@ class EMQ(AggregativeProbabilisticQuantifier):
|
|||
|
||||
def fit(self, data: LabelledCollection, fit_learner=True):
|
||||
self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
|
||||
self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
|
||||
self.train_prevalence = F.prevalence_from_labels(data.labels, data.classes_)
|
||||
return self
|
||||
|
||||
def aggregate(self, classif_posteriors, epsilon=EPSILON):
|
||||
|
|
Loading…
Reference in New Issue