dante-verification/src/data/dante_loader.py

86 lines
2.7 KiB
Python
Executable File

import os
from os.path import join
import re
import collections
# ------------------------------------------------------------------------
# document loading routine
# ------------------------------------------------------------------------
def remove_pattern(doc, start_symbol, end_symbol, counter):
    """Return `doc` with every `start_symbol ... end_symbol` span removed.

    Both delimiters are treated as single characters. The text is scanned
    once, left to right: each span runs from an occurrence of `start_symbol`
    to the first `end_symbol` that follows it, and the delimiters are removed
    together with the enclosed text. If no `end_symbol` follows a
    `start_symbol`, only that lone `start_symbol` is dropped.

    Args:
        doc: the text to clean.
        start_symbol: character that opens a span.
        end_symbol: character that closes a span (may equal `start_symbol`).
        counter: mapping from character to its number of occurrences, used
            only for the sanity check below.

    Returns:
        The cleaned text.

    Raises:
        AssertionError: if `counter` reports different counts for the two
            delimiters.
    """
    # Raised explicitly (rather than via `assert`) so that the check is not
    # stripped when Python runs with -O; exception type and message unchanged.
    if counter[start_symbol] != counter[end_symbol]:
        raise AssertionError('wrong number of {}{} found'.format(start_symbol, end_symbol))

    kept = []    # pieces of text lying outside any removed span
    cursor = 0   # index of the first character not yet processed
    while True:
        start = doc.find(start_symbol, cursor)
        if start == -1:
            break
        kept.append(doc[cursor:start])
        end = doc.find(end_symbol, start + 1)
        # no closing symbol: skip only the opening one; otherwise skip the span
        cursor = start + 1 if end == -1 else end + 1
    kept.append(doc[cursor:])
    return ''.join(kept)
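# removes citations written in either of these formats:
#   *latino*   (Latin citation, enclosed in asterisks)
#   {volgare}  (vernacular citation, enclosed in curly braces)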
def remove_citations(doc):
    """Return `doc` with its `*...*` and `{...}` citation spans removed.

    Character counts are taken once, on the untouched text, and the same
    tally is handed to `remove_pattern` for both delimiter pairs.
    """
    char_counts = collections.Counter(doc)
    delimiter_pairs = (('*', '*'), ('{', '}'))
    for opening, closing in delimiter_pairs:
        doc = remove_pattern(doc, start_symbol=opening, end_symbol=closing, counter=char_counts)
    return doc
def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_prefix='EpistolaXIII_'):
    """Load the texts found in `path`, split by authorship.

    Every file in `path` whose name does not start with `train_skip_prefix`
    is read as UTF-8 and stripped of citations. The author is the part of the
    file name (with '.txt' removed) that precedes the first underscore; texts
    whose author equals `positive_author` go to the first list, all other
    texts go to the second. Documents appear in the order returned by
    `os.listdir`.

    Args:
        path: directory containing the text files.
        positive_author: author name identifying the positive texts.
        unknown_target: optional file name, or iterable of file names,
            relative to `path`, of the documents to load as unknowns.
        train_skip_prefix: files whose name starts with this prefix are left
            out of the positive/negative lists.

    Returns:
        `(positive, negative)` when `unknown_target` is empty or None;
        otherwise `(positive, negative, unknowns)`, where `unknowns` is a
        single string if exactly one target was given, else a list of strings.
    """
    def _read_clean(filename):
        # the context manager guarantees that the file handle is closed
        with open(join(path, filename), encoding="utf8") as fin:
            return remove_citations(fin.read())

    # load the training data (all documents but those matching train_skip_prefix)
    positive, negative = [], []
    for file in os.listdir(path):
        if file.startswith(train_skip_prefix):
            continue
        author = file.replace('.txt', '').split('_')[0]
        text = _read_clean(file)
        if author == positive_author:
            positive.append(text)
        else:
            negative.append(text)

    if not unknown_target:
        return positive, negative

    # load the test data (the documents named in unknown_target)
    if isinstance(unknown_target, str):
        unknown_target = [unknown_target]
    unknowns = [_read_clean(unknown_text) for unknown_text in unknown_target]
    if len(unknowns) == 1:
        # a single target is returned bare rather than wrapped in a list
        unknowns = unknowns[0]
    return positive, negative, unknowns
def ___list_texts(path, train_skip_prefix='EpistolaXIII_'):
    """Print, one line per author, the text names found in `path`.

    File names are expected to look like `<author>_<textname>.txt`; files
    whose name starts with `train_skip_prefix` are ignored. Authors are
    printed in alphabetical order, each followed by a tab and the
    comma-separated names of their texts.

    Args:
        path: directory containing the text files.
        train_skip_prefix: files whose name starts with this prefix are
            skipped.

    Raises:
        IndexError: if a listed file name contains no underscore.
    """
    texts_by_author = {}
    for file in os.listdir(path):
        if file.startswith(train_skip_prefix):
            continue
        name_parts = file.replace('.txt', '').split('_')
        author, textname = name_parts[0], name_parts[1]
        texts_by_author.setdefault(author, []).append(textname)
    for author in sorted(texts_by_author):
        print('{}:\t{}'.format(author, ', '.join(texts_by_author[author])))