mask_numbers method

This commit is contained in:
andrea 2019-12-02 12:40:39 +01:00
parent fedc83f84e
commit e9404e2b8d
1 changed files with 15 additions and 2 deletions

View File

@ -11,6 +11,8 @@ import numpy as np
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from scipy.sparse import issparse from scipy.sparse import issparse
import itertools import itertools
from tqdm import tqdm
import re
class MultilingualDataset: class MultilingualDataset:
@ -73,10 +75,14 @@ class MultilingualDataset:
return self.lXte(), self.lYte() return self.lXte(), self.lYte()
def lXtr(self): def lXtr(self):
return {lang:Xtr for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()} return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if
lang in self.langs()}
# return {lang:self.mask_numbers(Xtr) for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
def lXte(self): def lXte(self):
return {lang:Xte for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()} return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if
lang in self.langs()}
# return {lang:self.mask_numbers(Xte) for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()}
def lYtr(self): def lYtr(self):
return {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()} return {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
@ -129,6 +135,13 @@ class MultilingualDataset:
def set_labels(self, labels): def set_labels(self, labels):
self.labels = labels self.labels = labels
def mask_numbers(self, data, number_mask='numbermask'):
    """Replace number-like tokens in every text of ``data`` with a placeholder.

    A token is masked when it starts with a digit at a word boundary, continues
    with any run of digits, dots, commas or hyphens, and ends at a word
    boundary (so ``2019-12-02`` or ``1,000.5`` collapse into a single mask).

    :param data: iterable of strings to process; it is not modified.
    :param number_mask: replacement string substituted for each matched token.
    :return: a new list with the masked texts, in the same order as ``data``.
    """
    number_pattern = re.compile(r'\b[0-9][0-9.,-]*\b')
    # tqdm only wraps the iterable to display progress; it does not alter the items.
    return [number_pattern.sub(number_mask, document)
            for document in tqdm(data, desc='masking numbers')]
# ---------------------------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------------------------
# Helpers # Helpers