import os from os.path import join import re import collections # ------------------------------------------------------------------------ # document loading routine # ------------------------------------------------------------------------ def remove_pattern(doc, start_symbol, end_symbol, counter): assert counter[start_symbol] == counter[end_symbol], 'wrong number of {}{} found'.format(start_symbol,end_symbol) search = True while search: start = doc.find(start_symbol) if start > -1: end = doc[start + 1:].find(end_symbol) doc = doc[:start] + doc[start + 1 + end + 1:] else: search = False return doc # removes citations in format: # *latino* # {volgare} def remove_citations(doc): counter = collections.Counter(doc) doc = remove_pattern(doc, start_symbol='*', end_symbol='*', counter=counter) doc = remove_pattern(doc, start_symbol='{', end_symbol='}', counter=counter) return doc def load_texts(path, positive_author='Dante', unknown_target=None, train_skip_prefix='EpistolaXIII_'): # load the training data (all documents but Epistolas 1 and 2) positive,negative = [],[] authors = [] ndocs=0 for file in os.listdir(path): if file.startswith(train_skip_prefix): continue file_clean = file.replace('.txt','') author, textname = file_clean.split('_')[0],file_clean.split('_')[1] text = open(join(path,file), encoding= "utf8").read() text = remove_citations(text) if author == positive_author: positive.append(text) else: negative.append(text) authors.append(author) ndocs+=1 # load the test data (Epistolas 1 and 2) if unknown_target: if isinstance(unknown_target, str): unknown_target = [unknown_target] unknowns = [] for unknown_text in unknown_target: unknown = open(join(path, unknown_text), encoding="utf8").read() unknown = remove_citations(unknown) unknowns.append(unknown) if len(unknowns) == 1: unknowns = unknowns[0] return positive, negative, unknowns else: return positive, negative def ___list_texts(path): authors = {} for file in os.listdir(path): if file.startswith('EpistolaXIII_'): continue file_clean = file.replace('.txt','') author, textname = file_clean.split('_')[0],file_clean.split('_')[1] if author not in authors: authors[author] = [] authors[author].append(textname) author_order = sorted(authors.keys()) for author in author_order: print('{}:\t{}'.format(author,', '.join(authors[author])))