mask_numbers method
This commit is contained in:
parent
fedc83f84e
commit
e9404e2b8d
|
|
@ -11,6 +11,8 @@ import numpy as np
|
||||||
from sklearn.model_selection import train_test_split
|
from sklearn.model_selection import train_test_split
|
||||||
from scipy.sparse import issparse
|
from scipy.sparse import issparse
|
||||||
import itertools
|
import itertools
|
||||||
|
from tqdm import tqdm
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
class MultilingualDataset:
|
class MultilingualDataset:
|
||||||
|
|
@ -73,10 +75,14 @@ class MultilingualDataset:
|
||||||
return self.lXte(), self.lYte()
|
return self.lXte(), self.lYte()
|
||||||
|
|
||||||
def lXtr(self):
|
def lXtr(self):
|
||||||
return {lang:Xtr for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
|
return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if
|
||||||
|
lang in self.langs()}
|
||||||
|
# return {lang:self.mask_numbers(Xtr) for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
|
||||||
|
|
||||||
def lXte(self):
|
def lXte(self):
|
||||||
return {lang:Xte for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()}
|
return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if
|
||||||
|
lang in self.langs()}
|
||||||
|
# return {lang:self.mask_numbers(Xte) for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()}
|
||||||
|
|
||||||
def lYtr(self):
|
def lYtr(self):
|
||||||
return {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
|
return {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
|
||||||
|
|
@ -129,6 +135,13 @@ class MultilingualDataset:
|
||||||
def set_labels(self, labels):
|
def set_labels(self, labels):
|
||||||
self.labels = labels
|
self.labels = labels
|
||||||
|
|
||||||
|
def mask_numbers(self, data, number_mask='numbermask'):
    """Replace every numeric token in each text of ``data`` with ``number_mask``.

    A numeric token starts with a digit, may continue with digits, dots,
    commas or hyphens, and must begin and end on a word boundary.

    :param data: iterable of strings to process
    :param number_mask: replacement handed to ``re.sub`` for each match
    :return: list holding one masked string per input text, in input order
    """
    number_pattern = re.compile(r'\b[0-9][0-9.,-]*\b')
    # tqdm wraps the iterable so progress is reported while the texts are processed
    progress = tqdm(data, desc='masking numbers')
    return [number_pattern.sub(number_mask, doc) for doc in progress]
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------------------------------------------------------------------------------------------------
|
# ----------------------------------------------------------------------------------------------------------------------
|
||||||
# Helpers
|
# Helpers
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue