"""
Two-letter language codes and the languages they denote:

bg = Bulgarian
cs = Czech
da = Danish
de = German
el = Greek
en = English
es = Spanish
et = Estonian
fi = Finnish
fr = French
hu = Hungarian
it = Italian
lt = Lithuanian
lv = Latvian
nl = Dutch
mt = Maltese
pl = Polish
pt = Portuguese
ro = Romanian
sk = Slovak
sl = Slovene
sv = Swedish
"""

# Maps a two-letter language code to the lowercase English name of that
# language.
# NOTE(review): judging by the constant's name, these values are presumably the
# language names NLTK's stemmers expect -- confirm against the callers.
NLTK_LANGMAP = {
    'da': 'danish',
    'nl': 'dutch',
    'en': 'english',
    'fi': 'finnish',
    'fr': 'french',
    'de': 'german',
    'hu': 'hungarian',
    'it': 'italian',
    'pt': 'portuguese',
    'ro': 'romanian',
    'es': 'spanish',
    'sv': 'swedish',
}

# Top 10 languages on Wikipedia, ordered by number of articles.
# LANGS_10_MOST_WIKI = ['en','fr','sv','de','es','it','pt','nl','pl','ro']

# All languages in JRC-Acquis v3, as two-letter codes.
JRC_LANGS = [
    'bg', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', 'fi', 'fr', 'hu',
    'it', 'lt', 'lv', 'mt', 'nl', 'pl', 'pt', 'ro', 'sk', 'sl', 'sv',
]
# JRC language codes kept for NLTK stemming. Romanian ('ro') was removed from
# this list because of incompatibility issues (per the original author's note).
JRC_LANGS_WITH_NLTK_STEMMING = [
    'da', 'nl', 'en', 'fi', 'fr', 'de', 'hu', 'it', 'pt', 'es', 'sv',
]

# Language labels of the RCV2 language set.
# NOTE(review): 'jp' and 'htw' are not ISO 639-1 codes -- presumably they are
# corpus-specific labels; confirm before normalising them.
RCV2_LANGS = [
    'ru', 'de', 'fr', 'sv', 'no', 'da', 'pt', 'it', 'es', 'jp', 'htw', 'nl',
]
# RCV2 language codes kept for NLTK stemming.
RCV2_LANGS_WITH_NLTK_STEMMING = [
    'de', 'fr', 'sv', 'da', 'pt', 'it', 'es', 'nl',
]

# Lookup from a language-set name to the module-level list of language codes
# it refers to. The values are the list objects themselves, not copies.
lang_set = {
    'JRC_NLTK': JRC_LANGS_WITH_NLTK_STEMMING,
    'JRC': JRC_LANGS,
    'RCV2_NLTK': RCV2_LANGS_WITH_NLTK_STEMMING,
    'RCV2': RCV2_LANGS,
}