forked from moreo/QuaPy
branch for LeQua2022 - first commit
parent 537a95fa18
commit 4284f1daa3
@@ -0,0 +1,16 @@
import quapy as qp
import numpy as np


def load_binary_raw_document(path):
    documents, labels = qp.data.from_text(path, verbose=0, class2int=True)
    labels = np.asarray(labels)
    labels[np.logical_or(labels == 1, labels == 2)] = 0
    labels[np.logical_or(labels == 4, labels == 5)] = 1
    return documents, labels


def load_multiclass_raw_document(path):
    return qp.data.from_text(path, verbose=0, class2int=False)
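Note on the binary loader: the two np.logical_or remappings suggest the raw labels come from a 5-point scale, collapsed into two classes (1 and 2 become class 0, 4 and 5 become class 1); a score of 3, if one ever occurred, would pass through unchanged, so the binary files are presumably free of them. A minimal sketch of the remapping on hypothetical labels:

import numpy as np

labels = np.asarray([1, 2, 4, 5, 4, 1])   # hypothetical raw labels
labels[np.logical_or(labels == 1, labels == 2)] = 0
labels[np.logical_or(labels == 4, labels == 5)] = 1
print(labels)  # [0 0 1 1 1 0]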
@@ -0,0 +1,82 @@
import os
import pickle

import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

import quapy as qp
from quapy.data import LabelledCollection
from quapy.method.aggregative import *
from data import load_binary_raw_document

path_binary_raw = 'binary_raw'
result_path = os.path.join('results', 'binary_raw')
os.makedirs(result_path, exist_ok=True)

train_file = os.path.join(path_binary_raw, 'documents', 'training.txt')

train = LabelledCollection.load(train_file, load_binary_raw_document)

print(train.classes_)
print(len(train))
print(train.prevalence())

tfidf = TfidfVectorizer(min_df=5)
train.instances = tfidf.fit_transform(train.instances)

scores = {}
for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]:
    classifier = CalibratedClassifierCV(LogisticRegression())
    model = quantifier(classifier).fit(train)

    quantifier_name = model.__class__.__name__
    scores[quantifier_name] = {}
    for sample_set, sample_size in [('validation', 1000)]:  # , ('test', 5000)]:
        ae_errors, rae_errors = [], []
        for i in tqdm(range(sample_size), total=sample_size, desc=f'testing {quantifier_name} in {sample_set}'):
            test_file = os.path.join(path_binary_raw, 'documents', f'{sample_set}_{i}.txt')
            test = LabelledCollection.load(test_file, load_binary_raw_document, classes=train.classes_)
            test.instances = tfidf.transform(test.instances)
            qp.environ['SAMPLE_SIZE'] = len(test)
            prev_estim = model.quantify(test.instances)
            prev_true = test.prevalence()
            ae_errors.append(qp.error.mae(prev_true, prev_estim))
            rae_errors.append(qp.error.mrae(prev_true, prev_estim))

        ae_errors = np.asarray(ae_errors)
        rae_errors = np.asarray(rae_errors)

        mae = ae_errors.mean()
        mrae = rae_errors.mean()
        scores[quantifier_name][sample_set] = {'mae': mae, 'mrae': mrae}
        with open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.ae.pickle'), 'wb') as fout:
            pickle.dump(ae_errors, fout, pickle.HIGHEST_PROTOCOL)
        with open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.rae.pickle'), 'wb') as fout:
            pickle.dump(rae_errors, fout, pickle.HIGHEST_PROTOCOL)
        print(f'{quantifier_name} {sample_set} MAE={mae:.4f}')
        print(f'{quantifier_name} {sample_set} MRAE={mrae:.4f}')

for model in scores:
    for sample_set in ['validation']:  # , 'test']:
        print(f'{model}\t{scores[model][sample_set]["mae"]:.4f}\t{scores[model][sample_set]["mrae"]:.4f}')


"""
results (columns: MAE, MRAE)

test:
CC      0.1859  1.5406
ACC     0.0453  0.2840
PCC     0.1793  1.7187
PACC    0.0287  0.1494
EMQ     0.0225  0.1020
HDy     0.0631  0.2307

validation:
CC      0.1862  1.9587
ACC     0.0394  0.2669
PCC     0.1789  2.1383
PACC    0.0354  0.1587
EMQ     0.0224  0.0960
HDy     0.0467  0.2121
"""
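A note on qp.environ['SAMPLE_SIZE'], which the script resets before every evaluation: QuaPy's relative error measures smooth both prevalence vectors to avoid divisions by zero, and the default smoothing factor is derived from the sample size. A sketch of the computation as I understand it, assuming additive smoothing with eps = 1/(2*sample_size) (check quapy.error for the authoritative definition):

import numpy as np

def smoothed_rae(prev_true, prev_estim, sample_size):
    # additive smoothing with eps = 1/(2*sample_size), then mean relative absolute error
    eps = 1. / (2. * sample_size)
    prev_true = (prev_true + eps) / (eps * len(prev_true) + 1)
    prev_estim = (prev_estim + eps) / (eps * len(prev_estim) + 1)
    return np.mean(np.abs(prev_estim - prev_true) / prev_true)

# e.g., a binary sample of 250 documents
print(smoothed_rae(np.asarray([.8, .2]), np.asarray([.7, .3]), sample_size=250))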
@@ -0,0 +1,77 @@
import os
import pickle

import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

import quapy as qp
from quapy.data import LabelledCollection
from quapy.method.aggregative import *
from data import load_multiclass_raw_document

path_multiclass_raw = 'multiclass_raw'
result_path = os.path.join('results', 'multiclass_raw')
os.makedirs(result_path, exist_ok=True)

train_file = os.path.join(path_multiclass_raw, 'documents', 'training.txt')

train = LabelledCollection.load(train_file, load_multiclass_raw_document)

print('classes', train.classes_)
print('#classes', len(train.classes_))
print('#docs', len(train))
print('prevalence', train.prevalence())
print('counts', train.counts())

tfidf = TfidfVectorizer(min_df=5)
train.instances = tfidf.fit_transform(train.instances)
print('#features', train.instances.shape[1])

scores = {}
for quantifier in [CC, ACC, PCC, PACC, EMQ]:  # HDy excluded: it is a binary-only method
    classifier = CalibratedClassifierCV(LogisticRegression())
    # classifier = LogisticRegression()
    model = quantifier(classifier).fit(train)
    print('model trained')

    quantifier_name = model.__class__.__name__
    scores[quantifier_name] = {}
    for sample_set, sample_size in [('validation', 1000), ('test', 5000)]:
        ae_errors, rae_errors = [], []
        for i in tqdm(range(sample_size), total=sample_size, desc=f'testing {quantifier_name} in {sample_set}'):
            test_file = os.path.join(path_multiclass_raw, 'documents', f'{sample_set}_{i}.txt')
            test = LabelledCollection.load(test_file, load_multiclass_raw_document, classes=train.classes_)
            test.instances = tfidf.transform(test.instances)
            qp.environ['SAMPLE_SIZE'] = len(test)
            prev_estim = model.quantify(test.instances)
            prev_true = test.prevalence()
            ae_errors.append(qp.error.mae(prev_true, prev_estim))
            rae_errors.append(qp.error.mrae(prev_true, prev_estim))

        ae_errors = np.asarray(ae_errors)
        rae_errors = np.asarray(rae_errors)

        mae = ae_errors.mean()
        mrae = rae_errors.mean()
        scores[quantifier_name][sample_set] = {'mae': mae, 'mrae': mrae}
        with open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.ae.pickle'), 'wb') as fout:
            pickle.dump(ae_errors, fout, pickle.HIGHEST_PROTOCOL)
        with open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.rae.pickle'), 'wb') as fout:
            pickle.dump(rae_errors, fout, pickle.HIGHEST_PROTOCOL)
        print(f'{quantifier_name} {sample_set} MAE={mae:.4f}')
        print(f'{quantifier_name} {sample_set} MRAE={mrae:.4f}')

for model in scores:
    for sample_set in ['validation', 'test']:
        print(f'{model}\t{sample_set}\t{scores[model][sample_set]["mae"]:.4f}\t{scores[model][sample_set]["mrae"]:.4f}')


"""
test:


validation:
"""
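Since the per-sample error arrays are pickled rather than only averaged, they can be reloaded later for finer-grained analysis, e.g., a paired significance test between two methods over the same samples. A minimal sketch (the file names follow the pattern used above; the choice of the Wilcoxon signed-rank test is illustrative):

import os
import pickle
from scipy.stats import wilcoxon

result_path = os.path.join('results', 'multiclass_raw')

with open(os.path.join(result_path, 'PACC.validation.ae.pickle'), 'rb') as fin:
    pacc_ae = pickle.load(fin)
with open(os.path.join(result_path, 'EMQ.validation.ae.pickle'), 'rb') as fin:
    emq_ae = pickle.load(fin)

# paired test: both arrays score the same 1000 validation samples
statistic, pvalue = wilcoxon(pacc_ae, emq_ae)
print(f'PACC vs EMQ (AE): p={pvalue:.4f}')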
TODO.txt
@@ -1,3 +1,5 @@
+Looks like there is some "multilingual" stuff in the master branch? See, e.g., MultilingualLabelledCollection in data/base.py
+
 Packaging:
 ==========================================
 Documentation with sphinx
@@ -39,8 +39,8 @@ class LabelledCollection:
         self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_}

     @classmethod
-    def load(cls, path: str, loader_func: callable):
-        return LabelledCollection(*loader_func(path))
+    def load(cls, path: str, loader_func: callable, classes=None):
+        return LabelledCollection(*loader_func(path), classes)

     def __len__(self):
         return self.instances.shape[0]
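The motivation for the new classes parameter: a small test sample may, by chance, contain no documents of some class, and if LabelledCollection infers the classes only from the labels it sees, the resulting prevalence vector would not align with the training classes. Passing classes=train.classes_, as the scripts above do, pins the class set. A sketch of the difference, assuming the constructor accepts the class list as its third argument (as the call above suggests):

from quapy.data import LabelledCollection

docs, labels = ['bad', 'awful'], [0, 0]    # hypothetical sample with no positive documents
sample = LabelledCollection(docs, labels)
print(sample.prevalence())                 # expected: [1.] -- classes inferred as [0] only

sample = LabelledCollection(docs, labels, [0, 1])
print(sample.prevalence())                 # expected: [1. 0.] -- aligned with the training classes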
@@ -3,20 +3,25 @@ from scipy.sparse import dok_matrix
 from tqdm import tqdm


-def from_text(path, encoding='utf-8'):
+def from_text(path, encoding='utf-8', verbose=1, class2int=True):
     """
-    Reas a labelled colletion of documents.
+    Reads a labelled collection of documents.
     File format: <0 or 1>\t<document>\n
     :param path: path to the labelled collection
     :return: a list of sentences, and a list of labels
     """
     all_sentences, all_labels = [], []
-    for line in tqdm(open(path, 'rt', encoding=encoding).readlines(), f'loading {path}'):
+    if verbose > 0:
+        file = tqdm(open(path, 'rt', encoding=encoding).readlines(), f'loading {path}')
+    else:
+        file = open(path, 'rt', encoding=encoding).readlines()
+    for line in file:
         line = line.strip()
         if line:
             label, sentence = line.split('\t')
             sentence = sentence.strip()
-            label = int(label)
+            if class2int:
+                label = int(label)
             if sentence:
                 all_sentences.append(sentence)
                 all_labels.append(label)
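The two new flags serve the scripts above: verbose=0 silences the per-file progress bar when thousands of small sample files are read, and class2int=False keeps the labels as raw strings for the multiclass data. A usage sketch with the paths used earlier:

import quapy as qp

# quiet load, labels cast to int (binary task)
docs, labels = qp.data.from_text('binary_raw/documents/training.txt', verbose=0, class2int=True)

# quiet load, labels kept as strings (multiclass task)
docs, labels = qp.data.from_text('multiclass_raw/documents/training.txt', verbose=0, class2int=False)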