1
0
Fork 0

generating BERT outputs for textual documents

This commit is contained in:
Alejandro Moreo Fernandez 2022-03-15 14:01:40 +01:00
parent ad64dfe2a0
commit d949c77317
1 changed file with 22 additions and 0 deletions

View File

@ -12,6 +12,28 @@ def load_samples(path_dir, classes):
yield LabelledCollection.load(join(path_dir, f'{id}.txt'), loader_func=qp.data.reader.from_text, classes=classes)
def load_samples_as_csv(path_dir, debug=False):
    """Yield each sample stored in ``path_dir`` as a ``(sample, labels)`` pair.

    The number of samples is the number of ``*.txt`` files found in
    ``path_dir``; the files actually read are ``0.txt``, ``1.txt``, ...
    up to that count. Each file is parsed as a header-less, tab-separated
    table with two columns, ``labels`` and ``review``; quote characters are
    kept literally (``csv.QUOTE_NONE``).

    :param path_dir: directory containing the numbered ``.txt`` sample files
    :param debug: if True, keep at most the first 50 documents of every
        sample (and the corresponding labels)
    :return: a generator of tuples ``(sample, labels)`` where ``sample`` is a
        ``datasets.Dataset`` with a single string column ``review`` and
        ``labels`` is a one-column ``pandas.DataFrame``
    """
    import pandas as pd
    import csv
    import datasets
    from datasets import Dataset

    debug_size = 50  # maximum number of documents kept per sample in debug mode

    nsamples = len(glob(join(path_dir, '*.txt')))
    for sample_id in range(nsamples):
        df = pd.read_csv(
            join(path_dir, f'{sample_id}.txt'),
            sep='\t',
            names=['labels', 'review'],
            quoting=csv.QUOTE_NONE,
        )

        # After popping the labels, df only holds the 'review' column.
        labels = df.pop('labels').to_frame()

        features = datasets.Features({'review': datasets.Value('string')})
        sample = Dataset.from_pandas(df=df, features=features)

        if debug:
            # Clamp to the sample size: selecting indices beyond the end of a
            # Dataset fails, whereas slicing the labels truncates silently.
            limit = min(debug_size, len(sample))
            sample = sample.select(range(limit))
            labels = labels[:limit]

        yield sample, labels
def load_samples_pkl(path_dir, filter=None):
nsamples = len(glob(join(path_dir, f'*.pkl')))
for id in range(nsamples):