branch for LeQua2022 - first commit

This commit is contained in:
Alejandro Moreo Fernandez 2021-10-13 20:36:53 +02:00
parent 537a95fa18
commit 4284f1daa3
6 changed files with 188 additions and 6 deletions

16
LeQua2022/data.py Normal file
View File

@ -0,0 +1,16 @@
import quapy as qp
import numpy as np
def load_binary_raw_document(path):
    """
    Loads a labelled collection of raw documents and collapses its labels into two classes.

    The file is read with `qp.data.from_text` (silently, with labels cast to int). Labels 1 and 2
    are then rewritten as 0, and labels 4 and 5 are rewritten as 1. Any other label value is left
    untouched (NOTE(review): presumably the source data contains no other values -- confirm).

    :param path: path to the labelled collection
    :return: a tuple (documents, labels), where labels is a numpy array
    """
    texts, raw_labels = qp.data.from_text(path, verbose=0, class2int=True)
    labels = np.asarray(raw_labels)

    # first group of original labels -> class 0
    low_mask = (labels == 1) | (labels == 2)
    labels[low_mask] = 0

    # second group of original labels -> class 1 (mask computed after the first rewrite)
    high_mask = (labels == 4) | (labels == 5)
    labels[high_mask] = 1

    return texts, labels
def load_multiclass_raw_document(path):
    """
    Loads a labelled collection of raw documents, keeping the labels as they appear in the file.

    Delegates to `qp.data.from_text` with the progress bar disabled (verbose=0) and without
    casting the labels to int (class2int=False).

    :param path: path to the labelled collection
    :return: whatever `qp.data.from_text` returns for this file
    """
    reader_options = {'verbose': 0, 'class2int': False}
    loaded = qp.data.from_text(path, **reader_options)
    return loaded

82
LeQua2022/main_binary.py Normal file
View File

@ -0,0 +1,82 @@
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
import quapy as qp
from quapy.data import LabelledCollection
from quapy.method.aggregative import *
from data import load_binary_raw_document
import os
# Experiment script for the binary task: trains a set of aggregative quantifiers on the raw
# training documents, evaluates each of them on the numbered sample files, stores the
# per-sample errors as pickles, and prints a summary table.

# Input folder with the raw data and output folder for the per-method error arrays.
path_binary_raw = 'binary_raw'
result_path = os.path.join('results', 'binary_raw')
os.makedirs(result_path, exist_ok=True)

train_file = os.path.join(path_binary_raw, 'documents', 'training.txt')

train = LabelledCollection.load(train_file, load_binary_raw_document)

print(train.classes_)
print(len(train))
print(train.prevalence())

# The vectorizer is fitted on the training documents only; samples are later transformed with
# the same fitted vocabulary.
tfidf = TfidfVectorizer(min_df=5)
train.instances = tfidf.fit_transform(train.instances)

# scores[quantifier_name][sample_set] -> {'mae': float, 'mrae': float}
scores = {}
for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]:
    # a fresh classifier is created for every quantifier
    classifier = CalibratedClassifierCV(LogisticRegression())
    model = quantifier(classifier).fit(train)
    quantifier_name = model.__class__.__name__
    scores[quantifier_name]={}
    for sample_set, sample_size in [('validation', 1000)]:#, ('test', 5000)]:
        ae_errors, rae_errors = [], []
        for i in tqdm(range(sample_size), total=sample_size, desc=f'testing {quantifier_name} in {sample_set}'):
            test_file = os.path.join(path_binary_raw, 'documents', f'{sample_set}_{i}.txt')
            # the training classes are passed explicitly to the loaded sample
            test = LabelledCollection.load(test_file, load_binary_raw_document, classes=train.classes_)
            test.instances = tfidf.transform(test.instances)
            # NOTE(review): presumably read by the error functions below -- confirm in quapy
            qp.environ['SAMPLE_SIZE'] = len(test)
            prev_estim = model.quantify(test.instances)
            prev_true = test.prevalence()
            ae_errors.append(qp.error.mae(prev_true, prev_estim))
            rae_errors.append(qp.error.mrae(prev_true, prev_estim))

        ae_errors = np.asarray(ae_errors)
        rae_errors = np.asarray(rae_errors)

        mae = ae_errors.mean()
        mrae = rae_errors.mean()
        scores[quantifier_name][sample_set] = {'mae': mae, 'mrae': mrae}

        # Use context managers so that every result file is flushed and closed right away
        # instead of leaving the handle open until garbage collection.
        ae_file = os.path.join(result_path, f'{quantifier_name}.{sample_set}.ae.pickle')
        with open(ae_file, 'wb') as fout:
            pickle.dump(ae_errors, fout, pickle.HIGHEST_PROTOCOL)
        rae_file = os.path.join(result_path, f'{quantifier_name}.{sample_set}.rae.pickle')
        with open(rae_file, 'wb') as fout:
            pickle.dump(rae_errors, fout, pickle.HIGHEST_PROTOCOL)

        print(f'{quantifier_name} {sample_set} MAE={mae:.4f}')
        print(f'{quantifier_name} {sample_set} MRAE={mrae:.4f}')

# Summary table: one tab-separated row (method, MAE, MRAE) per quantifier and sample set.
for model in scores:
    for sample_set in ['validation']:#, 'test']:
        print(f'{model}\t{scores[model][sample_set]["mae"]:.4f}\t{scores[model][sample_set]["mrae"]:.4f}')
"""
test:
CC 0.1859 1.5406
ACC 0.0453 0.2840
PCC 0.1793 1.7187
PACC 0.0287 0.1494
EMQ 0.0225 0.1020
HDy 0.0631 0.2307
validation
CC 0.1862 1.9587
ACC 0.0394 0.2669
PCC 0.1789 2.1383
PACC 0.0354 0.1587
EMQ 0.0224 0.0960
HDy 0.0467 0.2121
"""

View File

@ -0,0 +1,77 @@
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
import quapy as qp
from quapy.data import LabelledCollection
from quapy.method.aggregative import *
from data import load_multiclass_raw_document
import os
# Experiment script for the multiclass task: trains a set of aggregative quantifiers on the raw
# training documents, evaluates each of them on the numbered validation and test sample files,
# stores the per-sample errors as pickles, and prints a summary table.

# Input folder with the raw data and output folder for the per-method error arrays.
path_multiclass_raw = 'multiclass_raw'
result_path = os.path.join('results', 'multiclass_raw')
os.makedirs(result_path, exist_ok=True)

train_file = os.path.join(path_multiclass_raw, 'documents', 'training.txt')

train = LabelledCollection.load(train_file, load_multiclass_raw_document)

print('classes', train.classes_)
print('#classes', len(train.classes_))
print('#docs', len(train))
print('prevalence', train.prevalence())
print('counts', train.counts())

# The vectorizer is fitted on the training documents only; samples are later transformed with
# the same fitted vocabulary.
tfidf = TfidfVectorizer(min_df=5)
train.instances = tfidf.fit_transform(train.instances)
print(train.instances.shape[1])  # number of columns of the tf-idf matrix

# scores[quantifier_name][sample_set] -> {'mae': float, 'mrae': float}
scores = {}
for quantifier in [CC, ACC, PCC, PACC, EMQ]:#, HDy]:
    # a fresh classifier is created for every quantifier
    classifier = CalibratedClassifierCV(LogisticRegression())
    # classifier = LogisticRegression()
    model = quantifier(classifier).fit(train)
    print('model trained')

    quantifier_name = model.__class__.__name__
    scores[quantifier_name]={}
    for sample_set, sample_size in [('validation', 1000), ('test', 5000)]:
        ae_errors, rae_errors = [], []
        for i in tqdm(range(sample_size), total=sample_size, desc=f'testing {quantifier_name} in {sample_set}'):
            test_file = os.path.join(path_multiclass_raw, 'documents', f'{sample_set}_{i}.txt')
            # the training classes are passed explicitly to the loaded sample
            test = LabelledCollection.load(test_file, load_multiclass_raw_document, classes=train.classes_)
            test.instances = tfidf.transform(test.instances)
            # NOTE(review): presumably read by the error functions below -- confirm in quapy
            qp.environ['SAMPLE_SIZE'] = len(test)
            prev_estim = model.quantify(test.instances)
            prev_true = test.prevalence()
            ae_errors.append(qp.error.mae(prev_true, prev_estim))
            rae_errors.append(qp.error.mrae(prev_true, prev_estim))

        ae_errors = np.asarray(ae_errors)
        rae_errors = np.asarray(rae_errors)

        mae = ae_errors.mean()
        mrae = rae_errors.mean()
        scores[quantifier_name][sample_set] = {'mae': mae, 'mrae': mrae}

        # Use context managers so that every result file is flushed and closed right away
        # instead of leaving the handle open until garbage collection.
        ae_file = os.path.join(result_path, f'{quantifier_name}.{sample_set}.ae.pickle')
        with open(ae_file, 'wb') as fout:
            pickle.dump(ae_errors, fout, pickle.HIGHEST_PROTOCOL)
        rae_file = os.path.join(result_path, f'{quantifier_name}.{sample_set}.rae.pickle')
        with open(rae_file, 'wb') as fout:
            pickle.dump(rae_errors, fout, pickle.HIGHEST_PROTOCOL)

        print(f'{quantifier_name} {sample_set} MAE={mae:.4f}')
        print(f'{quantifier_name} {sample_set} MRAE={mrae:.4f}')

# Summary table: one tab-separated row (method, sample set, MAE, MRAE) per combination.
for model in scores:
    for sample_set in ['validation', 'test']:
        print(f'{model}\t{sample_set}\t{scores[model][sample_set]["mae"]:.4f}\t{scores[model][sample_set]["mrae"]:.4f}')
"""
test:
validation
"""

View File

@ -1,3 +1,5 @@
Looks like there is some "multilingual" stuff in the master branch? See, e.g., MultilingualLabelledCollection in data/base.py
Packaging:
==========================================
Documentation with sphinx

View File

@ -39,8 +39,8 @@ class LabelledCollection:
self.index = {class_: np.arange(n_docs)[self.labels == class_] for class_ in self.classes_}
@classmethod
def load(cls, path: str, loader_func: callable, classes=None):
    """
    Creates a LabelledCollection from the data returned by a loader function.

    :param path: location of the data; passed as-is to `loader_func`
    :param loader_func: callable invoked as `loader_func(path)`; the values it returns are
        unpacked as the leading positional arguments of the LabelledCollection constructor
    :param classes: forwarded to the constructor as the positional argument that follows the
        loaded data (default None; NOTE(review): presumably None makes the constructor infer
        the classes from the labels -- confirm against `__init__`)
    :return: a new LabelledCollection instance
    """
    return LabelledCollection(*loader_func(path), classes)
def __len__(self):
    """
    Returns the number of instances in the collection, i.e., the size of the first dimension
    of `self.instances`.

    :return: integer
    """
    n_instances = self.instances.shape[0]
    return n_instances

View File

@ -3,20 +3,25 @@ from scipy.sparse import dok_matrix
from tqdm import tqdm
def from_text(path, encoding='utf-8'):
def from_text(path, encoding='utf-8', verbose=1, class2int=True):
"""
Reas a labelled colletion of documents.
Reads a labelled collection of documents.
File format <0 or 1>\t<document>\n
:param path: path to the labelled collection
:return: a list of sentences, and a list of labels
"""
all_sentences, all_labels = [], []
for line in tqdm(open(path, 'rt', encoding=encoding).readlines(), f'loading {path}'):
if verbose>0:
file = tqdm(open(path, 'rt', encoding=encoding).readlines(), f'loading {path}')
else:
file = open(path, 'rt', encoding=encoding).readlines()
for line in file:
line = line.strip()
if line:
label, sentence = line.split('\t')
sentence = sentence.strip()
label = int(label)
if class2int:
label = int(label)
if sentence:
all_sentences.append(sentence)
all_labels.append(label)