forked from moreo/QuaPy
regenerating tfidf vectors
This commit is contained in:
parent
faa3af587c
commit
08c26c58f9
|
@ -1,5 +1,5 @@
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
from Ordinal.utils import load_simple_sample_raw
|
from Ordinal.utils import load_simple_sample_raw, load_samples_raw
|
||||||
from quapy.data import LabelledCollection
|
from quapy.data import LabelledCollection
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
from os.path import join
|
from os.path import join
|
||||||
|
@ -19,6 +19,7 @@ datapath = './data'
|
||||||
domain = 'Books'
|
domain = 'Books'
|
||||||
outname = domain + '-tfidf'
|
outname = domain + '-tfidf'
|
||||||
|
|
||||||
|
|
||||||
def save_preprocessing_info(transformer):
|
def save_preprocessing_info(transformer):
|
||||||
with open(join(datapath, outname, 'prep-info.txt'), 'wt') as foo:
|
with open(join(datapath, outname, 'prep-info.txt'), 'wt') as foo:
|
||||||
foo.write(f'{str(transformer)}\n')
|
foo.write(f'{str(transformer)}\n')
|
||||||
|
@ -30,11 +31,11 @@ os.makedirs(join(datapath, outname, 'app', 'dev_samples'), exist_ok=True)
|
||||||
os.makedirs(join(datapath, outname, 'app', 'test_samples'), exist_ok=True)
|
os.makedirs(join(datapath, outname, 'app', 'test_samples'), exist_ok=True)
|
||||||
shutil.copyfile(join(datapath, domain, 'app', 'dev_prevalences.txt'), join(datapath, outname, 'app', 'dev_prevalences.txt'))
|
shutil.copyfile(join(datapath, domain, 'app', 'dev_prevalences.txt'), join(datapath, outname, 'app', 'dev_prevalences.txt'))
|
||||||
shutil.copyfile(join(datapath, domain, 'app', 'test_prevalences.txt'), join(datapath, outname, 'app', 'test_prevalences.txt'))
|
shutil.copyfile(join(datapath, domain, 'app', 'test_prevalences.txt'), join(datapath, outname, 'app', 'test_prevalences.txt'))
|
||||||
os.makedirs(join(datapath, outname, 'npp'), exist_ok=True)
|
os.makedirs(join(datapath, outname, 'real'), exist_ok=True)
|
||||||
os.makedirs(join(datapath, outname, 'npp', 'dev_samples'), exist_ok=True)
|
os.makedirs(join(datapath, outname, 'real', 'dev_samples'), exist_ok=True)
|
||||||
os.makedirs(join(datapath, outname, 'npp', 'test_samples'), exist_ok=True)
|
os.makedirs(join(datapath, outname, 'real', 'test_samples'), exist_ok=True)
|
||||||
shutil.copyfile(join(datapath, domain, 'npp', 'dev_prevalences.txt'), join(datapath, outname, 'npp', 'dev_prevalences.txt'))
|
shutil.copyfile(join(datapath, domain, 'real', 'dev_prevalences.txt'), join(datapath, outname, 'real', 'dev_prevalences.txt'))
|
||||||
shutil.copyfile(join(datapath, domain, 'npp', 'test_prevalences.txt'), join(datapath, outname, 'npp', 'test_prevalences.txt'))
|
shutil.copyfile(join(datapath, domain, 'real', 'test_prevalences.txt'), join(datapath, outname, 'real', 'test_prevalences.txt'))
|
||||||
|
|
||||||
|
|
||||||
tfidf = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), min_df=5)
|
tfidf = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), min_df=5)
|
||||||
|
@ -45,16 +46,17 @@ save_preprocessing_info(tfidf)
|
||||||
pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
|
pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def transform_folder_samples(protocol, splitname):
    """Vectorize the raw samples of one split and pickle each of them.

    Samples are read with ``load_samples_raw`` from
    ``<datapath>/<domain>/<protocol>/<splitname>``, their ``instances`` are
    replaced by ``tfidf.transform(instances)``, and every sample is written
    to ``<datapath>/<outname>/<protocol>/<splitname>/<i>.pkl``, where ``i``
    is the position of the sample in the order produced by the loader.

    :param protocol: name of the protocol sub-folder (e.g. ``'app'``)
    :param splitname: name of the split sub-folder (e.g. ``'dev_samples'``)
    """
    source_dir = join(datapath, domain, protocol, splitname)
    target_dir = join(datapath, outname, protocol, splitname)
    samples = load_samples_raw(source_dir, classes=train.classes_)
    for i, sample in tqdm(enumerate(samples)):
        # NOTE(review): assumes each sample exposes a writable `instances`
        # attribute holding raw documents -- confirm against the loader.
        sample.instances = tfidf.transform(sample.instances)
        # Open the output through a context manager so the file is flushed
        # and closed deterministically instead of being left to the
        # garbage collector.
        with open(join(target_dir, f'{i}.pkl'), 'wb') as fout:
            pickle.dump(sample, fout, pickle.HIGHEST_PROTOCOL)
|
||||||
|
|
||||||
|
|
||||||
transform_folder_samples('app', 'dev_samples')
|
transform_folder_samples('app', 'dev_samples')
|
||||||
transform_folder_samples('app', 'test_samples')
|
transform_folder_samples('app', 'test_samples')
|
||||||
transform_folder_samples('npp', 'dev_samples')
|
transform_folder_samples('real', 'dev_samples')
|
||||||
transform_folder_samples('npp', 'test_samples')
|
transform_folder_samples('real', 'test_samples')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -48,8 +48,8 @@ def load_single_sample_pkl(parentdir, filename):
|
||||||
# return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_npytxt)
|
# return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_npytxt)
|
||||||
|
|
||||||
|
|
||||||
def load_samples_raw(path_dir, filter=None, classes=None):
    """Load the samples of a folder using the raw-sample loader.

    Thin wrapper that delegates to ``load_samples_folder`` with
    ``load_simple_sample_raw`` as the per-sample loading function.

    :param path_dir: folder passed through to ``load_samples_folder``
    :param filter: forwarded unchanged as the second positional argument
    :param classes: forwarded unchanged as the ``classes`` keyword argument
    :return: whatever ``load_samples_folder`` returns
    """
    return load_samples_folder(
        path_dir,
        filter,
        load_fn=load_simple_sample_raw,
        classes=classes,
    )
|
||||||
|
|
||||||
|
|
||||||
# def load_samples_as_csv(path_dir, filter=None):
|
# def load_samples_as_csv(path_dir, filter=None):
|
||||||
|
|
Loading…
Reference in New Issue