first commit

andrea 2019-11-28 18:39:19 +01:00
commit 2a5d0243db
47 changed files with 4219 additions and 0 deletions

.idea/deployment.xml (new file, +20 lines)

@@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PublishConfigData" serverName="anna_isti">
<serverData>
<paths name="anna_isti">
<serverdata>
<mappings>
<mapping deploy="/home/andreapdr/funneling_pdr" local="$PROJECT_DIR$" web="/" />
<mapping deploy="/home/andreapdr/CLESA/embeddings" local="/storage/andrea/FUNNELING/embeddings" />
</mappings>
<excludedPaths>
<excludedPath local="true" path="$PROJECT_DIR$/src/venv" />
<excludedPath local="true" path="$PROJECT_DIR$/src/pickles/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle" />
<excludedPath local="true" path="$PROJECT_DIR$/src/results/results.csv" />
</excludedPaths>
</serverdata>
</paths>
</serverData>
</component>
</project>

.idea/encodings.xml (new file, +4 lines)

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding" addBOMForNewFiles="with NO BOM" />
</project>

.idea/misc.xml (new file, +10 lines)

@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="JavaScriptSettings">
<option name="languageLevel" value="ES6" />
</component>
<component name="NodePackageJsonFileManager">
<packageJsonPaths />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (word-class-embeddings)" project-jdk-type="Python SDK" />
</project>

.idea/modules.xml (new file, +8 lines)

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/tesi_funneling.iml" filepath="$PROJECT_DIR$/.idea/tesi_funneling.iml" />
</modules>
</component>
</project>

.idea/tesi_funneling.iml (new file, +14 lines)

@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
<excludeFolder url="file://$MODULE_DIR$/src/venv" />
</content>
<orderEntry type="jdk" jdkName="Python 3.7 (word-class-embeddings)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
</component>
</module>

.idea/vcs.xml (new file, +6 lines)

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

.idea/webServers.xml (new file, +15 lines)

@@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="WebServers">
<option name="servers">
<webServer id="8f0f329c-a17c-48ba-b459-18d8b1a104e5" name="anna_isti" url="http://anna.isti.cnr.it">
<fileTransfer host="anna.isti.cnr.it" port="22" accessType="SFTP">
<advancedOptions>
<advancedOptions dataProtectionLevel="Private" passiveMode="true" shareSSLContext="true" />
</advancedOptions>
<option name="port" value="22" />
</fileTransfer>
</webServer>
</option>
</component>
</project>

.idea/workspace.xml (new file, +655 lines)

@@ -0,0 +1,655 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ChangeListManager">
<list default="true" id="f02472ae-5920-4b7a-abc6-0eb6f03dd03f" name="Default Changelist" comment="" />
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="CoverageDataManager">
<SUITE FILE_PATH="coverage/tesi_funneling$funneling_poly.coverage" NAME="funneling_poly Coverage Results" MODIFIED="1574690332154" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/src" />
<SUITE FILE_PATH="coverage/tesi_funneling$last_test.coverage" NAME="last_test Coverage Results" MODIFIED="1574960066673" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/src" />
<SUITE FILE_PATH="coverage/tesi_funneling$scratch.coverage" NAME="scratch Coverage Results" MODIFIED="1574759452703" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/src" />
</component>
<component name="FileEditorManager">
<splitter split-orientation="horizontal" split-proportion="0.5">
<split-first>
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/torchtext/vocab.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="182">
<caret line="321" selection-start-line="321" selection-end-line="321" />
<folding>
<element signature="e#13891#17737#0" />
</folding>
</state>
</provider>
</entry>
</file>
</leaf>
</split-first>
<split-second>
<leaf>
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/src/data/embeddings.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="462">
<caret line="162" selection-start-line="162" selection-end-line="162" />
<folding>
<element signature="e#0#9#0" expanded="true" />
<element signature="e#222#778#0" />
</folding>
</state>
</provider>
</entry>
</file>
</leaf>
</split-second>
</splitter>
</component>
<component name="FileTemplateManagerImpl">
<option name="RECENT_TEMPLATES">
<list>
<option value="Python Script" />
</list>
</option>
</component>
<component name="FindInProjectRecents">
<findStrings>
<find>NaivePolylingualClassifier</find>
<find>tra</find>
<find>base_l</find>
<find>tfidf</find>
<find>proba</find>
<find>we</find>
<find>Wordembeddings</find>
<find>hstack</find>
<find>ha</find>
<find>timeit</find>
<find>ti</find>
<find>time</find>
<find>dot</find>
<find>vec</find>
<find>_fit_binary</find>
<find>oneVs</find>
<find>embed</find>
<find>no tf-</find>
<find>embedding_matrix</find>
<find>WordEm</find>
<find>WordEmbeddings</find>
<find># pretrai</find>
<find># [pre</find>
<find>joblib</find>
</findStrings>
</component>
<component name="HighlightingSettingsPerFile">
<setting file="file://$PROJECT_DIR$/src/learning/learners.py" root0="FORCE_HIGHLIGHTING" />
</component>
<component name="IdeDocumentHistory">
<option name="CHANGED_PATHS">
<list>
<option value="$PROJECT_DIR$/src/dataset_builder.py" />
<option value="$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/multiclass.py" />
<option value="$PROJECT_DIR$/src/data/embeddings_fasttext.py" />
<option value="$PROJECT_DIR$/src/scratch.py" />
<option value="$PROJECT_DIR$/src/data/tsr_function__.py" />
<option value="$PROJECT_DIR$/src/data/supervised.py" />
<option value="$PROJECT_DIR$/src/last_test.py" />
<option value="&lt;8f0f329c-a17c-48ba-b459-18d8b1a104e5&gt;/home/andreapdr/funneling_pdr/src/data/embeddings.py" />
<option value="$PROJECT_DIR$/src/util/results.py" />
<option value="$PROJECT_DIR$/src/transformers/clesa.py" />
<option value="$PROJECT_DIR$/src/transformers/dci.py" />
<option value="$PROJECT_DIR$/src/util/evaluation.py" />
<option value="$PROJECT_DIR$/src/util/disable_sklearn_warnings.py" />
<option value="$PROJECT_DIR$/src/funneling_poly.py" />
<option value="$PROJECT_DIR$/src/learning/learners.py" />
<option value="$PROJECT_DIR$/src/FPEC_andrea.py" />
<option value="$PROJECT_DIR$/src/data/embeddings.py" />
</list>
</option>
</component>
<component name="ProjectFrameBounds" extendedState="6">
<option name="x" value="5" />
<option name="y" value="28" />
<option name="width" value="960" />
<option name="height" value="1052" />
</component>
<component name="ProjectView">
<navigator proportions="" version="1">
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="Scope" />
<pane id="ProjectPane">
<subPane>
<expand>
<path>
<item name="tesi_funneling" type="b2602c69:ProjectViewProjectNode" />
<item name="tesi_funneling" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="tesi_funneling" type="b2602c69:ProjectViewProjectNode" />
<item name="tesi_funneling" type="462c0819:PsiDirectoryNode" />
<item name="src" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="tesi_funneling" type="b2602c69:ProjectViewProjectNode" />
<item name="tesi_funneling" type="462c0819:PsiDirectoryNode" />
<item name="src" type="462c0819:PsiDirectoryNode" />
<item name="learning" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="tesi_funneling" type="b2602c69:ProjectViewProjectNode" />
<item name="tesi_funneling" type="462c0819:PsiDirectoryNode" />
<item name="src" type="462c0819:PsiDirectoryNode" />
<item name="pickles" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="tesi_funneling" type="b2602c69:ProjectViewProjectNode" />
<item name="tesi_funneling" type="462c0819:PsiDirectoryNode" />
<item name="src" type="462c0819:PsiDirectoryNode" />
<item name="results" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="tesi_funneling" type="b2602c69:ProjectViewProjectNode" />
<item name="tesi_funneling" type="462c0819:PsiDirectoryNode" />
<item name="src" type="462c0819:PsiDirectoryNode" />
<item name="transformers" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="tesi_funneling" type="b2602c69:ProjectViewProjectNode" />
<item name="tesi_funneling" type="462c0819:PsiDirectoryNode" />
<item name="src" type="462c0819:PsiDirectoryNode" />
<item name="util" type="462c0819:PsiDirectoryNode" />
</path>
</expand>
<select />
</subPane>
</pane>
</panes>
</component>
<component name="PropertiesComponent">
<property name="WebServerToolWindowFactoryState" value="true" />
<property name="WebServerToolWindowPanel.toolwindow.highlight.mappings" value="true" />
<property name="WebServerToolWindowPanel.toolwindow.highlight.symlinks" value="true" />
<property name="WebServerToolWindowPanel.toolwindow.show.date" value="false" />
<property name="WebServerToolWindowPanel.toolwindow.show.permissions" value="false" />
<property name="WebServerToolWindowPanel.toolwindow.show.size" value="false" />
<property name="last_opened_file_path" value="$PROJECT_DIR$/src/results/results.csv" />
<property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
<property name="nodejs_npm_path_reset_for_default_project" value="true" />
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
</component>
<component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS">
<recent name="$PROJECT_DIR$/src/util" />
<recent name="$PROJECT_DIR$/src/data" />
<recent name="$PROJECT_DIR$/src" />
<recent name="$PROJECT_DIR$/src/learning" />
</key>
</component>
<component name="RunDashboard">
<option name="ruleStates">
<list>
<RuleState>
<option name="name" value="ConfigurationTypeDashboardGroupingRule" />
</RuleState>
<RuleState>
<option name="name" value="StatusDashboardGroupingRule" />
</RuleState>
</list>
</option>
</component>
<component name="RunManager" selected="Python.last_test">
<configuration name="funneling_poly" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="tesi_funneling" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/src" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/src/funneling_poly.py" />
<option name="PARAMETERS" value="-d &quot;pickles/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle&quot; -w /storage/andrea/FUNNELING/embeddings/" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="last_test" type="PythonConfigurationType" factoryName="Python">
<module name="tesi_funneling" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/src" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/src/FPEC_andrea.py" />
<option name="PARAMETERS" value="-d &quot;pickles/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle&quot; -w /storage/andrea/FUNNELING/embeddings/ -e unsupervised" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="scratch" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="tesi_funneling" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/src" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/src/scratch.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<list>
<item itemvalue="Python.last_test" />
<item itemvalue="Python.funneling_poly" />
<item itemvalue="Python.scratch" />
</list>
<recent_temporary>
<list>
<item itemvalue="Python.scratch" />
<item itemvalue="Python.funneling_poly" />
</list>
</recent_temporary>
</component>
<component name="SvnConfiguration">
<configuration />
</component>
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="f02472ae-5920-4b7a-abc6-0eb6f03dd03f" name="Default Changelist" comment="" />
<created>1574680487463</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1574680487463</updated>
<workItem from="1574680491429" duration="18756000" />
<workItem from="1574705313406" duration="1369000" />
<workItem from="1574758627235" duration="18313000" />
<workItem from="1574845439127" duration="15307000" />
<workItem from="1574870087360" duration="629000" />
<workItem from="1574871032651" duration="671000" />
<workItem from="1574873488200" duration="225000" />
<workItem from="1574876908618" duration="140000" />
<workItem from="1574877826026" duration="560000" />
<workItem from="1574938635317" duration="14980000" />
<workItem from="1574958501259" duration="1736000" />
</task>
<servers />
</component>
<component name="TimeTrackingManager">
<option name="totallyTimeSpent" value="72686000" />
</component>
<component name="ToolWindowManager">
<frame x="0" y="28" width="1920" height="1052" extended-state="6" />
<layout>
<window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.15544872" />
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
<window_info id="Favorites" order="2" side_tool="true" />
<window_info anchor="bottom" id="File Transfer" order="0" weight="0.3297414" />
<window_info anchor="bottom" id="Message" order="1" />
<window_info anchor="bottom" id="Find" order="2" weight="0.3297414" />
<window_info anchor="bottom" id="Run" order="3" weight="0.53556037" />
<window_info anchor="bottom" id="Debug" order="4" weight="0.5538793" />
<window_info anchor="bottom" id="Cvs" order="5" weight="0.25" />
<window_info anchor="bottom" id="Inspection" order="6" weight="0.4" />
<window_info anchor="bottom" id="TODO" order="7" />
<window_info anchor="bottom" id="Docker" order="8" show_stripe_button="false" />
<window_info anchor="bottom" id="Version Control" order="9" />
<window_info anchor="bottom" id="Database Changes" order="10" />
<window_info anchor="bottom" id="Event Log" order="11" side_tool="true" weight="0.3297414" />
<window_info anchor="bottom" id="Terminal" order="12" weight="0.42456895" />
<window_info anchor="bottom" id="Python Console" order="13" />
<window_info anchor="right" id="Remote Host" order="0" weight="0.32959402" />
<window_info anchor="right" id="Commander" order="1" weight="0.4" />
<window_info anchor="right" id="Ant Build" order="2" weight="0.25" />
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="3" weight="0.25" />
<window_info anchor="right" id="SciView" order="4" weight="0.5918803" />
<window_info anchor="right" id="Database" order="5" />
</layout>
</component>
<component name="TypeScriptGeneratedFilesManager">
<option name="version" value="1" />
</component>
<component name="XDebuggerManager">
<breakpoint-manager>
<breakpoints>
<line-breakpoint enabled="true" suspend="THREAD" type="python-line">
<url>file://$PROJECT_DIR$/src/data/embeddings.py</url>
<line>162</line>
<option name="timeStamp" value="1" />
</line-breakpoint>
<line-breakpoint enabled="true" suspend="THREAD" type="python-line">
<url>file://$PROJECT_DIR$/src/learning/learners.py</url>
<line>566</line>
<option name="timeStamp" value="2" />
</line-breakpoint>
</breakpoints>
</breakpoint-manager>
</component>
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/src/data/reader/__init__.py">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$USER_HOME$/.PyCharm2018.3/system/python_stubs/1539147038/time.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="152">
<caret line="308" column="4" selection-start-line="308" selection-start-column="4" selection-end-line="308" selection-end-column="4" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/utils/validation.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="440">
<caret line="950" selection-start-line="950" selection-end-line="950" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/scipy/sparse/base.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="746">
<caret line="1218" selection-start-line="1218" selection-end-line="1218" />
</state>
</provider>
</entry>
<entry file="file://$APPLICATION_HOME_DIR$/helpers/typeshed/stdlib/2and3/time.pyi">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$USER_HOME$/.PyCharm2018.3/system/python_stubs/1539147038/torch/_C/_TensorBase.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="325">
<caret line="887" column="8" selection-start-line="887" selection-start-column="8" selection-end-line="887" selection-end-column="8" />
</state>
</provider>
</entry>
<entry file="file:///usr/lib/python3.7/multiprocessing/connection.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="183">
<caret line="368" selection-start-line="368" selection-end-line="368" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/svm/base.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="201">
<caret line="292" selection-start-line="292" selection-end-line="292" />
</state>
</provider>
</entry>
<entry file="file:///usr/lib/python3.7/selectors.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="167">
<caret line="417" selection-start-line="417" selection-end-line="417" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/scratch.py" />
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/externals/joblib/externals/loky/process_executor.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="191">
<caret line="271" selection-start-line="271" selection-end-line="271" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="191">
<caret line="566" selection-start-line="566" selection-end-line="566" />
</state>
</provider>
</entry>
<entry file="file:///usr/lib/python3.7/concurrent/futures/_base.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="101">
<caret line="383" selection-start-line="383" selection-end-line="383" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="182">
<caret line="224" selection-start-line="224" selection-end-line="224" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/svm/classes.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="20">
<caret line="604" selection-start-line="604" selection-end-line="604" />
<folding>
<element signature="e#18103#25253#1" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/externals/joblib/externals/cloudpickle/cloudpickle.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="166">
<caret line="702" selection-start-line="702" selection-end-line="702" />
</state>
</provider>
</entry>
<entry file="file:///usr/lib/python3.7/pickle.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="166">
<caret line="503" selection-start-line="503" selection-end-line="503" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/torch/nn/modules/__init__.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="216">
<caret line="12" selection-start-line="12" selection-end-line="12" />
</state>
</provider>
</entry>
<entry file="file://$APPLICATION_HOME_DIR$/helpers/pydev/pydevd.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="139">
<caret line="1486" selection-start-line="1486" selection-end-line="1486" />
</state>
</provider>
</entry>
<entry file="file://$APPLICATION_HOME_DIR$/helpers/pydev/_pydev_bundle/pydev_monkey.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="180">
<caret line="12" column="38" lean-forward="true" selection-start-line="12" selection-end-line="13" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/data/tsr_function__.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="54">
<caret line="3" selection-start-line="3" selection-end-line="3" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/data/supervised.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="620">
<caret line="67" column="12" selection-start-line="67" selection-start-column="11" selection-end-line="67" selection-end-column="12" />
<folding>
<element signature="e#0#99#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/data/embeddings_fasttext.py" />
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/torch/__init__.pyi">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="299">
<caret line="491" column="8" selection-start-line="491" selection-start-column="8" selection-end-line="491" selection-end-column="8" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/multiclass.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="217">
<caret line="180" selection-start-line="180" selection-end-line="180" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/model_selection/_search.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="14">
<caret line="366" selection-start-line="366" selection-end-line="366" />
</state>
</provider>
</entry>
<entry file="file:///usr/lib/python3.7/codecs.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="174">
<caret line="309" selection-start-line="309" selection-end-line="309" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/pandas/core/series.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="160">
<caret line="303" selection-start-line="303" selection-end-line="303" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/transformers/clesa.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="72">
<caret line="4" selection-start-line="4" selection-end-line="4" />
<folding>
<element signature="e#0#18#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/transformers/riboc.py">
<provider selected="true" editor-type-id="text-editor">
<state>
<folding>
<element signature="e#0#11#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/util/evaluation.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="18">
<caret line="1" column="36" selection-start-line="1" selection-start-column="36" selection-end-line="1" selection-end-column="36" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/util/file.py">
<provider selected="true" editor-type-id="text-editor">
<state>
<folding>
<element signature="e#0#32#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/util/disable_sklearn_warnings.py" />
<entry file="file://$PROJECT_DIR$/src/transformers/dci.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="159">
<caret line="9" selection-start-line="9" selection-end-line="9" />
<folding>
<element signature="e#0#18#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/util/results.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="594">
<caret line="33" lean-forward="true" selection-start-line="33" selection-end-line="33" />
<folding>
<element signature="e#0#9#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/dataset_builder.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1314">
<caret line="73" selection-start-line="73" selection-end-line="73" />
<folding>
<element signature="e#0#32#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/util/metrics.py">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/src/funneling_poly.py" />
<entry file="file://$PROJECT_DIR$/src/learning/learners.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="2063">
<caret line="517" selection-start-line="517" selection-end-line="517" />
<folding>
<element signature="e#0#18#0" expanded="true" />
<element signature="e#23965#24743#0" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/data/embeddings.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="462">
<caret line="162" selection-start-line="162" selection-end-line="162" />
<folding>
<element signature="e#0#9#0" expanded="true" />
<element signature="e#222#778#0" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/FPEC_andrea.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="119">
<caret line="99" column="40" lean-forward="true" selection-start-line="74" selection-start-column="6" selection-end-line="99" selection-end-column="40" />
<folding>
<element signature="e#0#27#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/torchtext/vocab.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="182">
<caret line="321" selection-start-line="321" selection-end-line="321" />
<folding>
<element signature="e#13891#17737#0" />
</folding>
</state>
</provider>
</entry>
</component>
</project>
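
For reference, the run configurations recorded above resolve (reading their SCRIPT_NAME and PARAMETERS options, with WORKING_DIRECTORY set to $PROJECT_DIR$/src) to roughly the following invocations; this is only a restatement of the XML above, not part of the commit:

python FPEC_andrea.py -d "pickles/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle" -w /storage/andrea/FUNNELING/embeddings/ -e unsupervised
python funneling_poly.py -d "pickles/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle" -w /storage/andrea/FUNNELING/embeddings/
python scratch.py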

src/.gitignore (new file, vendored, +1 line)

@@ -0,0 +1 @@
*.idea

src/FPEC_andrea.py (new file, +151 lines)

@@ -0,0 +1,151 @@
from sklearn.svm import SVC
import os, sys
from dataset_builder import MultilingualDataset
from learning.learners import *
from util.evaluation import *
from optparse import OptionParser
from util.file import exists
from util.results import PolylingualClassificationResults
parser = OptionParser()
parser.add_option("-d", "--dataset", dest="dataset",
help="Path to the multilingual dataset processed and stored in .pickle format")
parser.add_option("-o", "--output", dest="output",
help="Result file", type=str, default='./results/results.csv')
parser.add_option("-e", "--mode-embed", dest="mode_embed",
help="Set the embedding to be used [none, pretrained, supervised, both]", type=str, default='none')
parser.add_option("-w", "--we-path", dest="we_path",
help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/embeddings')
parser.add_option("-s", "--set_c", dest="set_c",type=float,
help="Set the C parameter", default=1)
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
help="Optimices hyperparameters", default=False)
parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
help="Number of parallel jobs (default is -1, all)", default=-1)
def get_learner(calibrate=False, kernel='linear'):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1)
def get_params(dense=False): # TODO kernel function could be useful for meta-classifier
if not op.optimc:
return None
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
kernel = 'rbf' if dense else 'linear'
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
#######################################################################################################################
if __name__ == '__main__':
(op, args) = parser.parse_args()
assert exists(op.dataset), 'Unable to find file '+str(op.dataset)
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
dataset_file = os.path.basename(op.dataset)
results = PolylingualClassificationResults(op.output)
data = MultilingualDataset.load(op.dataset)
data.show_dimensions()
lXtr, lytr = data.training()
lXte, lyte = data.test()
print(lXtr.keys())
small_lXtr = dict()
small_lytr = dict()
small_lXte = dict()
small_lyte = dict()
small_lXtr['da'] = lXtr['da'][:50]
small_lytr['da'] = lytr['da'][:50]
# small_lXtr['en'] = lXtr['en'][:50]
# small_lytr['en'] = lytr['en'][:50]
# small_lXtr['fr'] = lXtr['fr'][:50]
# small_lytr['fr'] = lytr['fr'][:50]
# small_lXte['da'] = lXte['da'][:50]
# small_lyte['da'] = lyte['da'][:50]
# small_lXte['en'] = lXte['en'][:50]
# small_lyte['en'] = lyte['en'][:50]
# small_lXte['fr'] = lXte['fr'][:50]
# small_lyte['fr'] = lyte['fr'][:50]
# small_lXtr['it'] = lXtr['it'][:50]
# small_lytr['it'] = lytr['it'][:50]
# small_lXtr['es'] = lXtr['es'][:50]
# small_lytr['es'] = lytr['es'][:50]
# small_lXtr['de'] = lXtr['de'][:50]
# small_lytr['de'] = lytr['de'][:50]
# small_lXtr['pt'] = lXtr['pt'][:50]
# small_lytr['pt'] = lytr['pt'][:50]
# small_lXtr['nl'] = lXtr['de'][:50]
# small_lytr['nl'] = lytr['de'][:50]
# small_lXtr['fi'] = lXtr['fi'][:50]
# small_lytr['fi'] = lytr['fi'][:50]
# small_lXtr['hu'] = lXtr['hu'][:50]
# small_lytr['hu'] = lytr['hu'][:50]
# small_lXtr['sv'] = lXtr['sv'][:50]
# small_lytr['sv'] = lytr['sv'][:50]
if op.set_c != -1:
meta_parameters = None
else:
meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}]
# Embeddings and WCE config
_available_mode = ['none', 'unsupervised', 'supervised', 'both']
assert op.mode_embed in _available_mode , f'{op.mode_embed} not in {_available_mode}'
if op.mode_embed == 'none':
config = {'unsupervised': False,
'supervised': False}
_config_id = 'None'
elif op.mode_embed == 'unsupervised':
config = {'unsupervised': True,
'supervised': False}
_config_id = 'M'
elif op.mode_embed == 'supervised':
config = {'unsupervised': False,
'supervised': True}
_config_id = 'F'
elif op.mode_embed == 'both':
config = {'unsupervised': True,
'supervised': True}
_config_id = 'M_and_F'
result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
print(f'### PolyEmbedd_andrea_{_config_id}\n')
classifier = AndreaCLF(op.we_path,
config,
first_tier_learner=get_learner(calibrate=True),
meta_learner=get_learner(calibrate=False),
first_tier_parameters=get_params(dense=True),
meta_parameters=get_params(dense=True),
n_jobs=op.n_jobs)
print('# Fitting ...')
classifier.fit(small_lXtr, small_lytr)
print('# Evaluating ...')
l_eval = evaluate_method(classifier, lXte, lyte)
metrics = []
for lang in lXte.keys():
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, op.optimc, 'test_datasetname', 'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, 'nope')
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
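
A side note on the block of commented-out per-language slices above: it takes the first 50 training documents per language for a quick smoke test, and the 'nl' lines actually copy lXtr['de']/lytr['de'], which looks like a copy-paste slip. A hedged, equivalent sketch of the same idea written as a loop (not part of the commit):

langs = ['da', 'en', 'fr', 'it', 'es', 'de', 'pt', 'nl', 'fi', 'hu', 'sv']
small_lXtr = {lang: lXtr[lang][:50] for lang in langs if lang in lXtr}
small_lytr = {lang: lytr[lang][:50] for lang in langs if lang in lytr}
small_lXte = {lang: lXte[lang][:50] for lang in langs if lang in lXte}
small_lyte = {lang: lyte[lang][:50] for lang in langs if lang in lyte}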

Binary file not shown.

src/data/__init__.py (new file, empty)

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

src/data/embeddings.py (new file, +196 lines)

@@ -0,0 +1,196 @@
import os
import pickle
import numpy as np
from torchtext.vocab import Vectors
import torch
from abc import ABC, abstractmethod
from data.supervised import get_supervised_embeddings
class PretrainedEmbeddings(ABC):
def __init__(self):
super().__init__()
@abstractmethod
def vocabulary(self): pass
@abstractmethod
def dim(self): pass
@classmethod
def reindex(cls, words, word2index):
source_idx, target_idx = [], []
for i, word in enumerate(words):
if word not in word2index: continue
j = word2index[word]
source_idx.append(i)
target_idx.append(j)
source_idx = np.asarray(source_idx)
target_idx = np.asarray(target_idx)
return source_idx, target_idx
class WordEmbeddings:
def __init__(self, lang, we, worddim):
self.lang = lang
self.we = we
self.worddim = worddim
self.dimword = {v:k for k,v in self.worddim.items()}
@classmethod
def load(cls, basedir, lang, word_preprocessor=None, dopickle=True):
filename = 'wiki.multi.{}.vec'.format(lang)
we_path = os.path.join(basedir, filename)
if dopickle and os.path.exists(we_path + '.pkl'):
print('loading pkl in {}'.format(we_path + '.pkl'))
(worddim, we) = pickle.load(open(we_path + '.pkl', 'rb'))
else:
word_registry=set()
lines = open(we_path).readlines()
nwords, dims = [int(x) for x in lines[0].split()]
print('reading we of {} dimensions'.format(dims))
we = np.zeros((nwords, dims), dtype=float)
worddim = {}
index = 0
for i, line in enumerate(lines[1:]):
if (i + 1) % 100 == 0:
print('\r{}/{}'.format(i + 1, len(lines)), end='')
word, *vals = line.split()
wordp = word_preprocessor(word) if word_preprocessor is not None else word
if wordp:
wordp=wordp[0]
if wordp in word_registry:
print('warning: word <{}> generates a duplicate <{}> after preprocessing'.format(word,wordp))
elif len(vals) == dims:
worddim[wordp] = index
we[index, :] = np.array(vals).astype(float)
index+=1
# else:
# print('warning: word <{}> generates an empty string after preprocessing'.format(word))
we = we[:index]
print('load {} words'.format(index))
if dopickle:
print('saving...')
pickle.dump((worddim, we), open(we_path + '.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
return WordEmbeddings(lang, we, worddim)
def vocabulary(self):
return set(self.worddim.keys())
def __getitem__(self, key):
return self.we[self.worddim[key]]
def dim(self):
return self.we.shape[1]
def __contains__(self, key):
return key in self.worddim
def most_similar(self, word_vect, k):
if word_vect.ndim == 1:
word_vect = word_vect.reshape(1,-1)
assert word_vect.shape[1] == self.dim(), 'inconsistent dimensions'
sim = np.dot(word_vect,self.we.T)
order = np.argsort(-1*sim, axis=1)[:,:k]
similar_words = [[self.dimword[order[vi,ki]] for ki in range(k)] for vi in range(word_vect.shape[0])]
sim_scores = sim[:,order]
return similar_words, sim_scores
def get_vectors(self, wordlist):
indexes = np.array([self.worddim[w] for w in wordlist])
return self.we[indexes]
def restrict(self, vocabulary):
# vocabulary is a set of terms to be kept
active_vocabulary = sorted([w for w in vocabulary if w in self.worddim])
lost = len(vocabulary)-len(active_vocabulary)
if lost>0: # some terms are missing, so they will be replaced by UNK
print('warning: missing {} terms for lang {}'.format(lost, self.lang))
self.we = self.get_vectors(active_vocabulary)
assert self.we.shape[0]==len(active_vocabulary)
self.dimword={i:w for i,w in enumerate(active_vocabulary)}
self.worddim={w:i for i,w in enumerate(active_vocabulary)}
return self
@classmethod
def load_poly(cls, basedir, langs, lang_vocabularies=None, word_preprocessor=None):
if lang_vocabularies is None:
return cls.merge([cls.load(basedir,lang, word_preprocessor) for lang in langs])
else:
# assert all([l in lang_vocabularies for l in langs]), 'missing vocabulary for some languages'
return cls.merge([cls.load(basedir, lang, word_preprocessor).restrict(lang_vocabularies[lang]) for lang in langs])
@classmethod
def merge(cls, we_list):
assert all([isinstance(we, WordEmbeddings) for we in we_list]), \
'instances of {} expected'.format(WordEmbeddings.__name__)
polywe = []
worddim={}
offset=0
for we in we_list:
polywe.append(we.we)
worddim.update({'{}::{}'.format(we.lang, w):d+offset for w,d in we.worddim.items()})
offset=len(worddim)
polywe = np.vstack(polywe)
return WordEmbeddings(lang='poly', we=polywe, worddim=worddim)
class FastTextWikiNews(Vectors):
url_base = 'Cant auto-download MUSE embeddings'
path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec'
def __init__(self, cache, language="en", **kwargs):
url = self.url_base.format(language)
name = self.path.format(language)
super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
class FastTextMUSE(PretrainedEmbeddings):
def __init__(self, path, lang, limit=None):
super().__init__()
print(f'Loading fastText pretrained vectors from {path}')
assert os.path.exists(path), print(f'pre-trained vectors not found in {path}')
self.embed = FastTextWikiNews(path, lang, max_vectors=limit)
# print('Done')
def vocabulary(self):
return set(self.embed.stoi.keys())
def dim(self):
return self.embed.dim
def extract(self, words):
source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
extraction = torch.zeros((len(words), self.dim()))
extraction[source_idx] = self.embed.vectors[target_idx]
return extraction
def embedding_matrix(path, voc, lang):
vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x:x[1])))[0])
print('[embedding matrix]')
print(f'# [pretrained-matrix: FastTextMUSE {lang}]')
pretrained = FastTextMUSE(path, lang)
P = pretrained.extract(vocabulary).numpy()
del pretrained
print(f'[embedding matrix done] of shape={P.shape}\n')
return vocabulary, P
def WCE_matrix(Xtr, Ytr, lang):
print('\n# [supervised-matrix]')
S = get_supervised_embeddings(Xtr[lang], Ytr[lang])
print(f'[embedding matrix done] of shape={S.shape}\n')
return S
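
A minimal sketch of how the two matrix builders defined above could be combined for one language. The toy documents, the label matrix and the embeddings directory are assumptions for illustration; get_supervised_embeddings is only assumed to accept a document-term matrix and a document-label matrix, as suggested by WCE_matrix:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from data.embeddings import embedding_matrix, WCE_matrix

docs = ['the council adopts the directive', 'the commission proposes a regulation']  # toy data (assumed)
vec = TfidfVectorizer(sublinear_tf=True)
lXtr = {'en': vec.fit_transform(docs)}                      # doc-term matrix per language
lytr = {'en': np.array([[1, 0], [0, 1]])}                   # doc-label matrix per language (assumed format)

we_dir = '/storage/andrea/FUNNELING/embeddings'             # MUSE .vec directory (path from FastTextWikiNews.path)
vocab, P = embedding_matrix(we_dir, vec.vocabulary_, 'en')  # pretrained MUSE matrix, one row per vocabulary term
S = WCE_matrix(lXtr, lytr, 'en')                            # word-class (supervised) embedding matrix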

src/data/languages.py (new file, +42 lines)

@@ -0,0 +1,42 @@
"""
bg = Bulgarian
cs = Czech
da = Danish
de = German
el = Greek
en = English
es = Spanish
et = Estonian
fi = Finnish
fr = French
hu = Hungarian
it = Italian
lt = Lithuanian
lv = Latvian
nl = Dutch
mt = Maltese
pl = Polish
pt = Portuguese
ro = Romanian
sk = Slovak
sl = Slovene
sv = Swedish
"""
NLTK_LANGMAP = {'da': 'danish', 'nl': 'dutch', 'en': 'english', 'fi': 'finnish', 'fr': 'french', 'de': 'german',
'hu': 'hungarian', 'it': 'italian', 'pt': 'portuguese', 'ro': 'romanian', 'es': 'spanish', 'sv': 'swedish'}
#top 10 languages in wikipedia order by the number of articles
#LANGS_10_MOST_WIKI = ['en','fr','sv','de','es','it','pt','nl','pl','ro']
#all languages in JRC-acquis v3
JRC_LANGS = ['bg','cs','da','de','el','en','es','et','fi','fr','hu','it','lt','lv','mt','nl','pl','pt','ro','sk','sl','sv']
JRC_LANGS_WITH_NLTK_STEMMING = ['da', 'nl', 'en', 'fi', 'fr', 'de', 'hu', 'it', 'pt', 'es', 'sv'] # 'romanian deleted for incompatibility issues'
RCV2_LANGS = ['ru', 'de', 'fr', 'sv', 'no', 'da', 'pt', 'it', 'es', 'jp', 'htw', 'nl']
RCV2_LANGS_WITH_NLTK_STEMMING = ['de', 'fr', 'sv', 'da', 'pt', 'it', 'es', 'nl']
lang_set = {'JRC_NLTK':JRC_LANGS_WITH_NLTK_STEMMING, 'JRC':JRC_LANGS,
'RCV2_NLTK':RCV2_LANGS_WITH_NLTK_STEMMING, 'RCV2':RCV2_LANGS}
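
These constants are meant to be consumed with a single lookup, as the dataset readers below do; for example:

from data.languages import lang_set, NLTK_LANGMAP

langs = lang_set['JRC_NLTK']     # the 11 JRC-Acquis languages with NLTK stemming support
print(NLTK_LANGMAP['da'])        # -> 'danish'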


Binary file not shown.

Binary file not shown.


@@ -0,0 +1,321 @@
from __future__ import print_function
import os, sys
from os.path import join
import tarfile
import xml.etree.ElementTree as ET
from sklearn.datasets import get_data_home
import pickle
from util.file import download_file, list_dirs, list_files
import rdflib
from rdflib.namespace import RDF, SKOS
from rdflib import URIRef
import zipfile
from data.languages import JRC_LANGS
from collections import Counter
from random import shuffle
from data.languages import lang_set
"""
JRC Acquis' Nomenclature:
bg = Bulgarian
cs = Czech
da = Danish
de = German
el = Greek
en = English
es = Spanish
et = Estonian
fi = Finnish
fr = French
hu = Hungarian
it = Italian
lt = Lithuanian
lv = Latvian
nl = Dutch
mt = Maltese
pl = Polish
pt = Portuguese
ro = Romanian
sk = Slovak
sl = Slovene
sv = Swedish
"""
class JRCAcquis_Document:
def __init__(self, id, name, lang, year, head, body, categories):
self.id = id
self.parallel_id = name
self.lang = lang
self.year = year
self.text = body if not head else head + "\n" + body
self.categories = categories
# this is a workaround... for some reason, acutes are codified in a non-standard manner in titles
# however, it seems that the title is often appearing as the first paragraph in the text/body (with
# standard codification), so it might be preferable not to read the header after all (as here by default)
def _proc_acute(text):
for ch in ['a','e','i','o','u']:
text = text.replace('%'+ch+'acute%',ch)
return text
def parse_document(file, year, head=False):
root = ET.parse(file).getroot()
doc_name = root.attrib['n'] # e.g., '22006A0211(01)'
doc_lang = root.attrib['lang'] # e.g., 'es'
doc_id = root.attrib['id'] # e.g., 'jrc22006A0211_01-es'
doc_categories = [cat.text for cat in root.findall('.//teiHeader/profileDesc/textClass/classCode[@scheme="eurovoc"]')]
doc_head = _proc_acute(root.find('.//text/body/head').text) if head else ''
doc_body = '\n'.join([p.text for p in root.findall('.//text/body/div[@type="body"]/p')])
def raise_if_empty(field, from_file):
if isinstance(field, str):
if not field.strip():
raise ValueError("Empty field in file %s" % from_file)
raise_if_empty(doc_name, file)
raise_if_empty(doc_lang, file)
raise_if_empty(doc_id, file)
if head: raise_if_empty(doc_head, file)
raise_if_empty(doc_body, file)
return JRCAcquis_Document(id=doc_id, name=doc_name, lang=doc_lang, year=year, head=doc_head, body=doc_body, categories=doc_categories)
# removes documents without a counterpart in all other languages
def _force_parallel(doclist, langs):
n_langs = len(langs)
par_id_count = Counter([d.parallel_id for d in doclist])
parallel_doc_ids = set([id for id,count in par_id_count.items() if count==n_langs])
return [doc for doc in doclist if doc.parallel_id in parallel_doc_ids]
def random_sampling_avoiding_parallel(doclist):
random_order = list(range(len(doclist)))
shuffle(random_order)
sampled_request = []
parallel_ids = set()
for ind in random_order:
pid = doclist[ind].parallel_id
if pid not in parallel_ids:
sampled_request.append(doclist[ind])
parallel_ids.add(pid)
print('random_sampling_no_parallel:: from {} documents to {} documents'.format(len(doclist), len(sampled_request)))
return sampled_request
#filters out documents which do not contain any category in the cat_filter list, and filter all labels not in cat_filter
def _filter_by_category(doclist, cat_filter):
if not isinstance(cat_filter, frozenset):
cat_filter = frozenset(cat_filter)
filtered = []
for doc in doclist:
doc.categories = list(cat_filter & set(doc.categories))
if doc.categories:
doc.categories.sort()
filtered.append(doc)
print("filtered %d documents out without categories in the filter list" % (len(doclist) - len(filtered)))
return filtered
#filters out categories with less than cat_threshold documents (and filters documents containing those categories)
def _filter_by_frequency(doclist, cat_threshold):
cat_count = Counter()
for d in doclist:
cat_count.update(d.categories)
freq_categories = [cat for cat,count in cat_count.items() if count>cat_threshold]
freq_categories.sort()
return _filter_by_category(doclist, freq_categories), freq_categories
#select top most_frequent categories (and filters documents containing those categories)
def _most_common(doclist, most_frequent):
cat_count = Counter()
for d in doclist:
cat_count.update(d.categories)
freq_categories = [cat for cat,count in cat_count.most_common(most_frequent)]
freq_categories.sort()
return _filter_by_category(doclist, freq_categories), freq_categories
def _get_categories(request):
final_cats = set()
for d in request:
final_cats.update(d.categories)
return list(final_cats)
def fetch_jrcacquis(langs=None, data_path=None, years=None, ignore_unclassified=True, cat_filter=None, cat_threshold=0,
parallel=None, most_frequent=-1, DOWNLOAD_URL_BASE ='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'):
assert parallel in [None, 'force', 'avoid'], 'parallel mode not supported'
if not langs:
langs = JRC_LANGS
else:
if isinstance(langs, str): langs = [langs]
for l in langs:
if l not in JRC_LANGS:
raise ValueError('Language %s is not among the valid languages in JRC-Acquis v3' % l)
if not data_path:
data_path = get_data_home()
if not os.path.exists(data_path):
os.mkdir(data_path)
request = []
total_read = 0
for l in langs:
file_name = 'jrc-'+l+'.tgz'
archive_path = join(data_path, file_name)
if not os.path.exists(archive_path):
print("downloading language-specific dataset (once and for all) into %s" % data_path)
DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name)
download_file(DOWNLOAD_URL, archive_path)
print("untarring dataset...")
tarfile.open(archive_path, 'r:gz').extractall(data_path)
documents_dir = join(data_path, l)
print("Reading documents...")
read = 0
for dir in list_dirs(documents_dir):
year = int(dir)
if years==None or year in years:
year_dir = join(documents_dir,dir)
pickle_name = join(data_path, 'jrc_' + l + '_' + dir + '.pickle')
if os.path.exists(pickle_name):
print("loading from file %s" % pickle_name)
l_y_documents = pickle.load(open(pickle_name, "rb"))
read += len(l_y_documents)
else:
l_y_documents = []
all_documents = list_files(year_dir)
empty = 0
for i,doc_file in enumerate(all_documents):
try:
jrc_doc = parse_document(join(year_dir, doc_file), year)
except ValueError:
jrc_doc = None
if jrc_doc and (not ignore_unclassified or jrc_doc.categories):
l_y_documents.append(jrc_doc)
else: empty += 1
if len(all_documents)>50 and ((i+1) % (len(all_documents)/50) == 0):
print('\r\tfrom %s: completed %d%%' % (year_dir, int((i+1)*100.0/len(all_documents))), end='')
read+=1
print('\r\tfrom %s: completed 100%% read %d documents (discarded %d without categories or empty fields)\n' % (year_dir, i+1, empty), end='')
print("\t\t(Pickling object for future runs in %s)" % pickle_name)
pickle.dump(l_y_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL)
request += l_y_documents
print("Read %d documents for language %s\n" % (read, l))
total_read += read
print("Read %d documents in total" % (total_read))
if parallel=='force':
request = _force_parallel(request, langs)
elif parallel == 'avoid':
request = random_sampling_avoiding_parallel(request)
final_cats = _get_categories(request)
if cat_filter:
request = _filter_by_category(request, cat_filter)
final_cats = _get_categories(request)
if cat_threshold > 0:
request, final_cats = _filter_by_frequency(request, cat_threshold)
if most_frequent != -1 and len(final_cats) > most_frequent:
request, final_cats = _most_common(request, most_frequent)
return request, final_cats
def print_cat_analysis(request):
cat_count = Counter()
for d in request:
cat_count.update(d.categories)
print("Number of active categories: {}".format(len(cat_count)))
print(cat_count.most_common())
# inspects the Eurovoc thesaurus in order to select a subset of categories
# currently, the 'broadest' policy (i.e., take all categories with no parent category), 'all', and 'leaves' are implemented
def inspect_eurovoc(data_path, eurovoc_skos_core_concepts_filename='eurovoc_in_skos_core_concepts.rdf',
eurovoc_url="http://publications.europa.eu/mdr/resource/thesaurus/eurovoc-20160630-0/skos/eurovoc_in_skos_core_concepts.zip",
select="broadest"):
fullpath_pickle = join(data_path, select+'_concepts.pickle')
if os.path.exists(fullpath_pickle):
print("Pickled object found in %s. Loading it." % fullpath_pickle)
return pickle.load(open(fullpath_pickle,'rb'))
fullpath = join(data_path, eurovoc_skos_core_concepts_filename)
if not os.path.exists(fullpath):
print("Path %s does not exist. Trying to download the skos EuroVoc file from %s" % (data_path, eurovoc_url))
download_file(eurovoc_url, fullpath)
print("Unzipping file...")
zipped = zipfile.ZipFile(data_path + '.zip', 'r')
zipped.extract("eurovoc_in_skos_core_concepts.rdf", data_path)
zipped.close()
print("Parsing %s" %fullpath)
g = rdflib.Graph()
g.parse(location=fullpath, format="application/rdf+xml")
if select == "all":
print("Selecting all concepts")
all_concepts = list(g.subjects(RDF.type, SKOS.Concept))
all_concepts = [c.toPython().split('/')[-1] for c in all_concepts]
all_concepts.sort()
selected_concepts = all_concepts
elif select=="broadest":
print("Selecting broadest concepts (those without any other broader concept linked to it)")
all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
narrower_concepts = set(g.subjects(SKOS.broader, None))
broadest_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - narrower_concepts)]
broadest_concepts.sort()
selected_concepts = broadest_concepts
elif select=="leaves":
print("Selecting leaves concepts (those not linked as broader of any other concept)")
all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
broad_concepts = set(g.objects(None, SKOS.broader))
leave_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - broad_concepts)]
leave_concepts.sort()
selected_concepts = leave_concepts
else:
raise ValueError("Selection policy %s is not currently supported" % select)
print("%d %s concepts found" % (len(selected_concepts), leave_concepts))
print("Pickling concept list for faster further requests in %s" % fullpath_pickle)
pickle.dump(selected_concepts, open(fullpath_pickle, 'wb'), pickle.HIGHEST_PROTOCOL)
return selected_concepts
if __name__ == '__main__':
def single_label_fragment(doclist):
single = [d for d in doclist if len(d.categories) < 2]
final_categories = set([d.categories[0] if d.categories else [] for d in single])
print('{} single-label documents ({} categories) from the original {} documents'.format(len(single),
len(final_categories),
len(doclist)))
return single, list(final_categories)
train_years = list(range(1986, 2006))
test_years = [2006]
cat_policy = 'leaves'
most_common_cat = 300
# JRC_DATAPATH = "/media/moreo/1TB Volume/Datasets/JRC_Acquis_v3"
JRC_DATAPATH = "/storage/andrea/FUNNELING/data/JRC_Acquis_v3"
langs = lang_set['JRC_NLTK']
cat_list = inspect_eurovoc(JRC_DATAPATH, select=cat_policy)
sys.exit()
training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=train_years,cat_filter=cat_list, cat_threshold=1, parallel=None,most_frequent=most_common_cat)
test_docs, label_namestest = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=test_years, cat_filter=label_names,parallel='force')
print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names)))
print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest)))
training_docs, label_names = single_label_fragment(training_docs)
test_docs, label_namestest = single_label_fragment(test_docs)
print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names)))
print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest)))
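
A hedged usage sketch of the reader defined above, mirroring its __main__ block (the module path is an assumption; on first use the per-language archives are downloaded into data_path and pickled there):

# module path assumed: the file header is missing from this view,
# but workspace.xml points at src/data/reader/
from data.reader.jrcacquis_reader import fetch_jrcacquis, inspect_eurovoc

JRC_DATAPATH = '/storage/andrea/FUNNELING/data/JRC_Acquis_v3'   # path taken from __main__ above
cat_list = inspect_eurovoc(JRC_DATAPATH, select='leaves')
train_docs, labels = fetch_jrcacquis(langs=['en', 'it'], data_path=JRC_DATAPATH,
                                     years=list(range(1986, 2006)),
                                     cat_filter=cat_list, cat_threshold=1,
                                     most_frequent=300)
test_docs, _ = fetch_jrcacquis(langs=['en', 'it'], data_path=JRC_DATAPATH,
                               years=[2006], cat_filter=labels, parallel='force')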


@@ -0,0 +1,225 @@
from zipfile import ZipFile
import xml.etree.ElementTree as ET
from data.languages import RCV2_LANGS_WITH_NLTK_STEMMING, RCV2_LANGS
from util.file import list_files
from sklearn.datasets import get_data_home
import gzip
from os.path import join, exists
from util.file import download_file_if_not_exists
import re
from collections import Counter
import numpy as np
import sys
"""
RCV2's Nomenclature:
ru = Russian
da = Danish
de = German
es = Spanish
lat = Spanish Latin-American (actually is also 'es' in the collection)
fr = French
it = Italian
nl = Dutch
pt = Portuguese
sv = Swedish
ja = Japanese
htw = Chinese
no = Norwegian
"""
RCV1_TOPICHIER_URL = "http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a02-orig-topics-hierarchy/rcv1.topics.hier.orig"
RCV1PROC_BASE_URL= 'http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files'
RCV1_BASE_URL = "http://www.daviddlewis.com/resources/testcollections/rcv1/"
RCV2_BASE_URL = "http://trec.nist.gov/data/reuters/reuters.html"
rcv1_test_data_gz = ['lyrl2004_tokens_test_pt0.dat.gz',
'lyrl2004_tokens_test_pt1.dat.gz',
'lyrl2004_tokens_test_pt2.dat.gz',
'lyrl2004_tokens_test_pt3.dat.gz']
rcv1_train_data_gz = ['lyrl2004_tokens_train.dat.gz']
rcv1_doc_cats_data_gz = 'rcv1-v2.topics.qrels.gz'
RCV2_LANG_DIR = {'ru':'REUTE000',
'de':'REUTE00A',
'fr':'REUTE00B',
'sv':'REUTE001',
'no':'REUTE002',
'da':'REUTE003',
'pt':'REUTE004',
'it':'REUTE005',
'es':'REUTE006',
'lat':'REUTE007',
'jp':'REUTE008',
'htw':'REUTE009',
'nl':'REUTERS_'}
class RCV_Document:
def __init__(self, id, text, categories, date='', lang=None):
self.id = id
self.date = date
self.lang = lang
self.text = text
self.categories = categories
class ExpectedLanguageException(Exception): pass
class IDRangeException(Exception): pass
nwords = []
def parse_document(xml_content, assert_lang=None, valid_id_range=None):
root = ET.fromstring(xml_content)
if assert_lang:
if assert_lang not in root.attrib.values():
if assert_lang != 'jp' or 'ja' not in root.attrib.values(): # some documents are attributed to 'ja', others to 'jp'
raise ExpectedLanguageException('error: document of a different language')
doc_id = root.attrib['itemid']
if valid_id_range is not None:
if not valid_id_range[0] <= int(doc_id) <= valid_id_range[1]:
raise IDRangeException
doc_categories = [cat.attrib['code'] for cat in
root.findall('.//metadata/codes[@class="bip:topics:1.0"]/code')]
doc_date = root.attrib['date']
doc_title = root.find('.//title').text
doc_headline = root.find('.//headline').text
doc_body = '\n'.join([p.text for p in root.findall('.//text/p')])
if not doc_body:
raise ValueError('Empty document')
if doc_title is None: doc_title = ''
if doc_headline is None or doc_headline in doc_title: doc_headline = ''
text = '\n'.join([doc_title, doc_headline, doc_body]).strip()
text_length = len(text.split())
global nwords
nwords.append(text_length)
return RCV_Document(id=doc_id, text=text, categories=doc_categories, date=doc_date, lang=assert_lang)
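# A minimal usage sketch; the XML below is a made-up newsitem that only mirrors the fields
# parse_document actually reads (itemid/date attributes, title, headline, text/p, topic codes),
# while real RCV1/RCV2 items carry many more elements:
#
#   xml = ('<newsitem itemid="2286" date="1996-08-20" xml:lang="en">'
#          '<title>Example title</title><headline>Example headline</headline>'
#          '<text><p>Some body text.</p></text>'
#          '<metadata><codes class="bip:topics:1.0"><code code="C15"/></codes></metadata>'
#          '</newsitem>')
#   doc = parse_document(xml, assert_lang='en')
#   # doc.id == '2286', doc.categories == ['C15'], doc.text contains title, headline and body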
def fetch_RCV1(data_path, split='all'):
assert split in ['train', 'test', 'all'], 'split should be "train", "test", or "all"'
request = []
labels = set()
read_documents = 0
lang = 'en'
training_documents = 23149
test_documents = 781265
if split == 'all':
split_range = (2286, 810596)
expected = training_documents+test_documents
elif split == 'train':
split_range = (2286, 26150)
expected = training_documents
else:
split_range = (26151, 810596)
expected = test_documents
global nwords
nwords=[]
for part in list_files(data_path):
if not re.match('\d+\.zip', part): continue
target_file = join(data_path, part)
assert exists(target_file), \
"You don't seem to have the file "+part+" in " + data_path + ", and the RCV1 corpus can not be downloaded"+\
" w/o a formal permission. Please, refer to " + RCV1_BASE_URL + " for more information."
zipfile = ZipFile(target_file)
for xmlfile in zipfile.namelist():
xmlcontent = zipfile.open(xmlfile).read()
try:
doc = parse_document(xmlcontent, assert_lang=lang, valid_id_range=split_range)
labels.update(doc.categories)
request.append(doc)
read_documents += 1
except ValueError:
print('\n\tskipping empty or malformed document {} (expected language {})'.format(part+'/'+xmlfile, lang))
except (IDRangeException, ExpectedLanguageException) as e:
pass
print('\r[{}] read {} documents'.format(part, len(request)), end='')
if read_documents == expected: break
if read_documents == expected: break
print()
print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords)))
return request, list(labels)
def fetch_RCV2(data_path, languages=None):
if not languages:
languages = list(RCV2_LANG_DIR.keys())
else:
assert set(languages).issubset(set(RCV2_LANG_DIR.keys())), 'languages not in scope'
request = []
labels = set()
global nwords
nwords=[]
for lang in languages:
path = join(data_path, RCV2_LANG_DIR[lang])
lang_docs_read = 0
for part in list_files(path):
target_file = join(path, part)
assert exists(target_file), \
"You don't seem to have the file "+part+" in " + path + ", and the RCV2 corpus can not be downloaded"+\
" w/o a formal permission. Please, refer to " + RCV2_BASE_URL + " for more information."
zipfile = ZipFile(target_file)
for xmlfile in zipfile.namelist():
xmlcontent = zipfile.open(xmlfile).read()
try:
doc = parse_document(xmlcontent, assert_lang=lang)
labels.update(doc.categories)
request.append(doc)
lang_docs_read += 1
except ValueError:
print('\n\tskipping empty or malformed document {} (expected language {})'.format(RCV2_LANG_DIR[lang]+'/'+part+'/'+xmlfile, lang))
except (IDRangeException, ExpectedLanguageException) as e:
pass
print('\r[{}] read {} documents, {} for language {}'.format(RCV2_LANG_DIR[lang]+'/'+part, len(request), lang_docs_read, lang), end='')
print()
print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords)))
return request, list(labels)
def fetch_topic_hierarchy(path, topics='all'):
assert topics in ['all', 'leaves']
download_file_if_not_exists(RCV1_TOPICHIER_URL, path)
hierarchy = {}
for line in open(path, 'rt'):
parts = line.strip().split()
parent,child = parts[1],parts[3]
if parent not in hierarchy:
hierarchy[parent]=[]
hierarchy[parent].append(child)
del hierarchy['None']
del hierarchy['Root']
print(hierarchy)
if topics=='all':
topics = set(hierarchy.keys())
for parent in hierarchy.keys():
topics.update(hierarchy[parent])
return list(topics)
elif topics=='leaves':
parents = set(hierarchy.keys())
childs = set()
for parent in hierarchy.keys():
childs.update(hierarchy[parent])
return list(childs.difference(parents))
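# A usage sketch (assumptions: the path is illustrative, and the downloaded rcv1.topics.hier.orig
# has lines of the form "parent: Root child: CCAT child-description: CORPORATE/INDUSTRIAL",
# which is what the parsing above relies on):
#
#   leaf_codes = fetch_topic_hierarchy('../Datasets/RCV1-v2/rcv1.topics.hier.orig', topics='leaves')
#   # topics='leaves' keeps only codes that never appear as parents; topics='all' returns every code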

src/data/reader/wikipedia_tools.py

@ -0,0 +1,304 @@
from __future__ import print_function
import ijson
import os, sys
from os.path import join
from bz2 import BZ2File
from ijson.common import ObjectBuilder
import pickle
from util.file import list_dirs, list_files, makedirs_if_not_exist
from itertools import islice
import re
from xml.sax.saxutils import escape
import numpy as np
policies = ["IN_ALL_LANGS", "IN_ANY_LANG"]
"""
This file contains a set of tools for processing multilingual Wikipedia documents.
In what follows, it is assumed that you have already downloaded a Wikipedia dump (https://dumps.wikimedia.org/)
and have cleaned the text of each document with one of the following tools:
- https://github.com/aesuli/wikipediatools (Python 2)
- https://github.com/aesuli/wikipedia-extractor (Python 3)
It is also assumed you have downloaded the all-entities json file (e.g., https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2)
These tools help you to:
- Process the huge json file as a stream and create a multilingual map of corresponding titles for each language.
Setting policy = "IN_ALL_LANGS" extracts only titles that appear in all (AND) languages, whereas "IN_ANY_LANG"
extracts all titles appearing in at least one (OR) language (warning: this creates a huge dictionary).
Note: this version is quite slow. Although it only needs to run once, you might prefer to take a look at "Wikidata in BigQuery".
- Process the huge json file as a stream and create a simplified file that occupies much less space and is far faster to process.
- Use the multilingual map to extract, from the cleaned text versions, individual xml documents containing all
language-specific versions of each document.
- Fetch the multilingual documents to create, for each of the specified languages, a list of all documents,
such that the i-th element of any list refers to the same document in the respective language.
"""
def _doc_generator(text_path, langs):
dotspace = re.compile(r'\.(?!\s)')
for l,lang in enumerate(langs):
print("Processing language <%s> (%d/%d)" % (lang, l, len(langs)))
lang_dir = join(text_path, lang)
split_dirs = list_dirs(lang_dir)
for sd,split_dir in enumerate(split_dirs):
print("\tprocessing split_dir <%s> (%d/%d)" % (split_dir, sd, len(split_dirs)))
split_files = list_files(join(lang_dir, split_dir))
for sf,split_file in enumerate(split_files):
print("\t\tprocessing split_file <%s> (%d/%d)" % (split_file, sf, len(split_files)))
with BZ2File(join(lang_dir, split_dir, split_file), 'r', buffering=1024*1024) as fi:
while True:
doc_lines = list(islice(fi, 3))
if doc_lines:
# some sentences are not followed by a space after the dot
doc_lines[1] = dotspace.sub('. ', doc_lines[1])
# [workaround] the &nbsp; html entity was not handled by the extractor, and unescaping it at this point might not help; replace it with a plain space before escaping
doc_lines[1] = escape(doc_lines[1].replace("&nbsp;", " "))
yield doc_lines, lang
else: break
def _extract_title(doc_lines):
m = re.search('title="(.+?)"', doc_lines[0])
if m: return m.group(1).decode('utf-8')
else: raise ValueError("Error in xml format: document head is %s" % doc_lines[0])
def _create_doc(target_file, id, doc, lang):
doc[0] = doc[0][:-2] + (' lang="%s">\n'%lang)
with open(target_file, 'w') as fo:
fo.write('<multidoc id="%s">\n'%id)
[fo.write(line) for line in doc]
fo.write('</multidoc>')
def _append_doc(target_file, doc, lang):
doc[0] = doc[0][:-2] + (' lang="%s">\n' % lang)
with open(target_file, 'r', buffering=1024*1024) as fi:
lines = fi.readlines()
if doc[0] in lines[1::3]:
return
lines[-1:-1]=doc
with open(target_file, 'w', buffering=1024*1024) as fo:
[fo.write(line) for line in lines]
def extract_multilingual_documents(inv_dict, langs, text_path, out_path):
if not os.path.exists(out_path):
os.makedirs(out_path)
for lang in langs:
if lang not in inv_dict:
raise ValueError("Lang %s is not in the dictionary" % lang)
docs_created = len(list_files(out_path))
print("%d multilingual documents found." % docs_created)
for doc,lang in _doc_generator(text_path, langs):
title = _extract_title(doc)
if title in inv_dict[lang]:
#pass
ids = inv_dict[lang][title]
for id in ids:
target_file = join(out_path, id) + ".xml"
if os.path.exists(target_file):
_append_doc(target_file, doc, lang)
else:
_create_doc(target_file, id, doc, lang)
docs_created+=1
else:
if not re.match('[A-Za-z]+', title):
print("Title <%s> for lang <%s> not in dictionary" % (title, lang))
def extract_multilingual_titles_from_simplefile(data_dir, filename, langs, policy="IN_ALL_LANGS", return_both=True):
simplified_file = join(data_dir,filename)
if policy not in policies:
raise ValueError("Policy %s not supported." % policy)
print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs)))
lang_prefix = list(langs)
lang_prefix.sort()
pickle_prefix = "extraction_" + "_".join(lang_prefix) + "." + policy
pickle_dict = join(data_dir, pickle_prefix+".multi_dict.pickle")
pickle_invdict = join(data_dir, pickle_prefix+".multi_invdict.pickle")
if os.path.exists(pickle_invdict):
if return_both and os.path.exists(pickle_dict):
print("Pickled files found in %s. Loading both (direct and inverse dictionaries)." % data_dir)
return pickle.load(open(pickle_dict, 'rb')), pickle.load(open(pickle_invdict, 'rb'))
elif return_both==False:
print("Pickled file found in %s. Loading inverse dictionary only." % pickle_invdict)
return pickle.load(open(pickle_invdict, 'rb'))
multiling_titles = {}
inv_dict = {lang:{} for lang in langs}
def process_entry(line):
parts = line.strip().split('\t')
id = parts[0]
if id in multiling_titles:
raise ValueError("id <%s> already indexed" % id)
titles = dict(((lang_title[:lang_title.find(':')],lang_title[lang_title.find(':')+1:].decode('utf-8')) for lang_title in parts[1:]))
for lang in list(titles.keys()):  # copy the keys: deleting while iterating over a dict view raises in Python 3
if lang not in langs:
del titles[lang]
if (policy == "IN_ALL_LANGS" and len(titles) == len(langs))\
or (policy == "IN_ANY_LANG" and len(titles) > 0):
multiling_titles[id] = titles
for lang, title in titles.items():
if title in inv_dict[lang]:
inv_dict[lang][title].append(id)
else:
inv_dict[lang][title] = [id]
with BZ2File(simplified_file, 'r', buffering=1024*1024*16) as fi:
completed = 0
try:
for line in fi:
process_entry(line)
completed += 1
if completed % 10 == 0:
print("\rCompleted %d\ttitles %d" % (completed,len(multiling_titles)), end="")
print("\rCompleted %d\t\ttitles %d" % (completed, len(multiling_titles)), end="\n")
except EOFError:
print("\nUnexpected file ending... saving anyway")
print("Pickling dictionaries in %s" % data_dir)
pickle.dump(multiling_titles, open(pickle_dict,'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(inv_dict, open(pickle_invdict, 'wb'), pickle.HIGHEST_PROTOCOL)
print("Done")
return (multiling_titles, inv_dict) if return_both else inv_dict
# in https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2
def simplify_json_file(data_dir, langs, policy="IN_ALL_LANGS", json_file = "latest-all.json.bz2"):
latest_all_json_file = join(data_dir,json_file)
if policy not in policies:
raise ValueError("Policy %s not supported." % policy)
print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs)))
lang_prefix = list(langs)
lang_prefix.sort()
simple_titles_path = join(data_dir, "extraction_" + "_".join(lang_prefix) + "." + policy)
def process_entry(last, fo):
global written
id = last["id"]
titles = None
if policy == "IN_ALL_LANGS" and langs.issubset(last["labels"].keys()):
titles = {lang: last["labels"][lang]["value"] for lang in langs}
elif policy == "IN_ANY_LANG":
titles = {lang: last["labels"][lang]["value"] for lang in langs if lang in last["labels"]}
if titles:
fo.write((id+'\t'+'\t'.join([lang+':'+titles[lang] for lang in titles.keys()])+'\n').encode('utf-8'))
return True
else:
return False
written = 0
with BZ2File(latest_all_json_file, 'r', buffering=1024*1024*16) as fi, \
BZ2File(join(data_dir,simple_titles_path+".simple.bz2"),'w') as fo:
builder = ObjectBuilder()
completed = 0
for event, value in ijson.basic_parse(fi, buf_size=1024*1024*16):
builder.event(event, value)
if len(builder.value)>1:
if process_entry(builder.value.pop(0), fo): written += 1
completed += 1
print("\rCompleted %d\ttitles %d" % (completed,written), end="")
print("")
#process the last entry
process_entry(builder.value.pop(0), fo)
return simple_titles_path
"""
Reads all multilingual documents in a folder (see wikipedia_tools.py to generate them) and builds, for each of the
specified languages, a list containing all of its documents, so that the i-th element of any list refers to the
language-specific version of the same document. Documents are required to have a version in all specified languages
and to contain a minimum number of words; otherwise they are discarded.
"""
class MinWordsNotReached(Exception): pass
class WrongDocumentFormat(Exception): pass
def _load_multilang_doc(path, langs, min_words=100):
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element, ParseError
try:
root = ET.parse(path).getroot()
doc = {}
for lang in langs:
doc_body = root.find('.//doc[@lang="' + lang + '"]')
if isinstance(doc_body, Element):
n_words = len(doc_body.text.split(' '))
if n_words >= min_words:
doc[lang] = doc_body.text
else:
raise MinWordsNotReached
else:
raise WrongDocumentFormat
except ParseError:
raise WrongDocumentFormat
return doc
# returns the multilingual documents mapped by language ({lang: list of texts}); counts of valid and skipped documents are printed along the way
def fetch_wikipedia_multilingual(wiki_multi_path, langs, min_words=100, deletions=False, max_documents=-1, pickle_name=None):
if pickle_name and os.path.exists(pickle_name):
print("unpickling %s" % pickle_name)
return pickle.load(open(pickle_name, 'rb'))
multi_docs = list_files(wiki_multi_path)
mling_documents = {l:[] for l in langs}
valid_documents = 0
minwords_exception = 0
wrongdoc_exception = 0
for d,multi_doc in enumerate(multi_docs):
print("\rProcessed %d/%d documents, valid %d/%d, few_words=%d, few_langs=%d" %
(d, len(multi_docs), valid_documents, len(multi_docs), minwords_exception, wrongdoc_exception),end="")
doc_path = join(wiki_multi_path, multi_doc)
try:
m_doc = _load_multilang_doc(doc_path, langs, min_words)
valid_documents += 1
for l in langs:
mling_documents[l].append(m_doc[l])
except MinWordsNotReached:
minwords_exception += 1
if deletions: os.remove(doc_path)
except WrongDocumentFormat:
wrongdoc_exception += 1
if deletions: os.remove(doc_path)
if max_documents>0 and valid_documents>=max_documents:
break
if pickle_name:
print("Pickling wikipedia documents object in %s" % pickle_name)
pickle.dump(mling_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL)
return mling_documents
def random_wiki_sample(l_wiki, max_documents):
if max_documents == 0: return None
langs = list(l_wiki.keys())
assert len(np.unique([len(l_wiki[l]) for l in langs])) == 1, 'documents across languages do not seem to be aligned'
ndocs_per_lang = len(l_wiki[langs[0]])
if ndocs_per_lang > max_documents:
sel = set(np.random.choice(list(range(ndocs_per_lang)), max_documents, replace=False))
for lang in langs:
l_wiki[lang] = [d for i, d in enumerate(l_wiki[lang]) if i in sel]
return l_wiki
if __name__ == "__main__":
wikipedia_home = "../Datasets/Wikipedia"
from data.languages import JRC_LANGS_WITH_NLTK_STEMMING as langs
langs = frozenset(langs)
simple_titles_path = simplify_json_file(wikipedia_home, langs, policy="IN_ALL_LANGS", json_file="latest-all.json.bz2")
_, inv_dict = extract_multilingual_titles_from_simplefile(wikipedia_home, simple_titles_path, langs, policy='IN_ALL_LANGS')
extract_multilingual_documents(inv_dict, langs, join(wikipedia_home,'text'),
out_path=join(wikipedia_home, 'multilingual_docs_JRC_NLTK'))

75
src/data/supervised.py Executable file

@ -0,0 +1,75 @@
from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
# from util.common import *
from sklearn.decomposition import PCA
import numpy as np
def zscores(x, axis=0): # scipy.stats.zscore does not avoid division by 0, which can indeed occur
std = np.clip(np.std(x, ddof=1, axis=axis), 1e-5, None)
mean = np.mean(x, axis=axis)
return (x - mean) / std
def supervised_embeddings_tfidf(X,Y):
tfidf_norm = X.sum(axis=0)
F = (X.T).dot(Y) / tfidf_norm.T
return F
def supervised_embeddings_ppmi(X,Y):
Xbin = X>0
D = X.shape[0]
Pxy = (Xbin.T).dot(Y)/D
Px = Xbin.sum(axis=0)/D
Py = Y.sum(axis=0)/D
F = np.asarray(Pxy/(Px.T*Py))
F = np.maximum(F, 1.0)
F = np.log(F)
return F
def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents=25000):
D = X.shape[0]
if D>max_documents:
print(f'sampling {max_documents}')
random_sample = np.random.permutation(D)[:max_documents]
X = X[random_sample]
Y = Y[random_sample]
cell_matrix = get_supervised_matrix(X, Y)
F = get_tsr_matrix(cell_matrix, tsr_score_funtion=tsr_function).T
return F
def get_supervised_embeddings(X, Y, max_label_space=300, binary_structural_problems=-1, method='dotn', dozscore=True):
print('computing supervised embeddings...')
nC = Y.shape[1]
if nC==2 and binary_structural_problems > nC:
raise ValueError('not implemented in this branch')
if method=='ppmi':
F = supervised_embeddings_ppmi(X, Y)
elif method == 'dotn':
F = supervised_embeddings_tfidf(X, Y)
elif method == 'ig':
F = supervised_embeddings_tsr(X, Y, information_gain)
elif method == 'chi2':
F = supervised_embeddings_tsr(X, Y, chi_square)
if dozscore:
F = zscores(F, axis=0)
if nC > max_label_space:
print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
f'Applying PCA(n_components={max_label_space})')
pca = PCA(n_components=max_label_space)
F = pca.fit(F).transform(F)
return F
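# A small, self-contained sketch of the intended usage (random toy matrices; in the real
# pipeline X is the sparse tfidf document-by-term matrix and Y the binary document-by-label
# matrix produced by MultiLabelBinarizer):
#
#   import numpy as np
#   from scipy.sparse import csr_matrix
#   X = csr_matrix(np.random.rand(100, 500))
#   Y = (np.random.rand(100, 10) > 0.9).astype(int)
#   F = get_supervised_embeddings(X, Y)   # 500 x 10 matrix of z-scored word-class correlations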

src/data/text_preprocessor.py

@ -0,0 +1,33 @@
from nltk.corpus import stopwords
from data.languages import NLTK_LANGMAP
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
def preprocess_documents(documents, lang):
tokens = NLTKStemTokenizer(lang, verbose=True)
sw = stopwords.words(NLTK_LANGMAP[lang])
return [' '.join([w for w in tokens(doc) if w not in sw]) for doc in documents]
class NLTKStemTokenizer(object):
def __init__(self, lang, verbose=False):
if lang not in NLTK_LANGMAP:
raise ValueError('Language %s is not supported in NLTK' % lang)
self.verbose=verbose
self.called = 0
self.wnl = SnowballStemmer(NLTK_LANGMAP[lang])
self.cache = {}
def __call__(self, doc):
self.called += 1
if self.verbose:
print("\r\t\t[documents processed %d]" % (self.called), end="")
tokens = word_tokenize(doc)
stems = []
for t in tokens:
if t not in self.cache:
self.cache[t] = self.wnl.stem(t)
stems.append(self.cache[t])
return stems
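# A usage sketch (assumes the NLTK resources 'punkt' and 'stopwords' have been downloaded,
# e.g. via nltk.download(), and that 'en' is mapped in NLTK_LANGMAP; the output shown is indicative only):
#
#   docs = ['The Commission adopted the proposals.', 'New proposals were adopted quickly.']
#   processed = preprocess_documents(docs, lang='en')
#   # -> stemmed, stopword-free strings such as 'commiss adopt propos .'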

270
src/data/tsr_function__.py Executable file

@ -0,0 +1,270 @@
import math
import numpy as np
from scipy.stats import t
from joblib import Parallel, delayed
from scipy.sparse import csr_matrix, csc_matrix
def get_probs(tpr, fpr, pc):
# tpr = p(t|c) = p(tp)/p(c) = p(tp)/(p(tp)+p(fn))
# fpr = p(t|_c) = p(fp)/p(_c) = p(fp)/(p(fp)+p(tn))
pnc = 1.0 - pc
tp = tpr * pc
fn = pc - tp
fp = fpr * pnc
tn = pnc - fp
return ContTable(tp=tp, fn=fn, fp=fp, tn=tn)
def apply_tsr(tpr, fpr, pc, tsr):
cell = get_probs(tpr, fpr, pc)
return tsr(cell)
def positive_information_gain(cell):
if cell.tpr() < cell.fpr():
return 0.0
else:
return information_gain(cell)
def posneg_information_gain(cell):
ig = information_gain(cell)
if cell.tpr() < cell.fpr():
return -ig
else:
return ig
def __ig_factor(p_tc, p_t, p_c):
den = p_t * p_c
if den != 0.0 and p_tc != 0:
return p_tc * math.log(p_tc / den, 2)
else:
return 0.0
def information_gain(cell):
return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \
__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) +\
__ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + \
__ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())
def information_gain_mod(cell):
return (__ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())) \
- (__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) + __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()))
def pointwise_mutual_information(cell):
return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c())
def gain_ratio(cell):
pc = cell.p_c()
pnc = 1.0 - pc
norm = pc * math.log(pc, 2) + pnc * math.log(pnc, 2)
return information_gain(cell) / (-norm)
def chi_square(cell):
den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c()
if den==0.0: return 0.0
num = gss(cell)**2
return num / den
def relevance_frequency(cell):
a = cell.tp
c = cell.fp
if c == 0: c = 1
return math.log(2.0 + (a * 1.0 / c), 2)
def idf(cell):
if cell.p_f()>0:
return math.log(1.0 / cell.p_f())
return 0.0
def gss(cell):
return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn()
def conf_interval(xt, n):
if n>30:
z2 = 3.84145882069 # norm.ppf(0.5+0.95/2.0)**2
else:
z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2
p = (xt + 0.5 * z2) / (n + z2)
amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2))
return p, amplitude
def strength(minPosRelFreq, minPos, maxNeg):
if minPos > maxNeg:
return math.log(2.0 * minPosRelFreq, 2.0)
else:
return 0.0
# set cancel_features=True to allow some features to be weighted as 0 (as in the original article);
# however, on some extremely imbalanced datasets this caused all documents to end up with all-zero weights
def conf_weight(cell, cancel_features=False):
c = cell.get_c()
not_c = cell.get_not_c()
tp = cell.tp
fp = cell.fp
pos_p, pos_amp = conf_interval(tp, c)
neg_p, neg_amp = conf_interval(fp, not_c)
min_pos = pos_p-pos_amp
max_neg = neg_p+neg_amp
den = (min_pos + max_neg)
minpos_relfreq = min_pos / (den if den != 0 else 1)
str_tplus = strength(minpos_relfreq, min_pos, max_neg);
if str_tplus == 0 and not cancel_features:
return 1e-20
return str_tplus;
class ContTable:
def __init__(self, tp=0, tn=0, fp=0, fn=0):
self.tp=tp
self.tn=tn
self.fp=fp
self.fn=fn
def get_d(self): return self.tp + self.tn + self.fp + self.fn
def get_c(self): return self.tp + self.fn
def get_not_c(self): return self.tn + self.fp
def get_f(self): return self.tp + self.fp
def get_not_f(self): return self.tn + self.fn
def p_c(self): return (1.0*self.get_c())/self.get_d()
def p_not_c(self): return 1.0-self.p_c()
def p_f(self): return (1.0*self.get_f())/self.get_d()
def p_not_f(self): return 1.0-self.p_f()
def p_tp(self): return (1.0*self.tp) / self.get_d()
def p_tn(self): return (1.0*self.tn) / self.get_d()
def p_fp(self): return (1.0*self.fp) / self.get_d()
def p_fn(self): return (1.0*self.fn) / self.get_d()
def tpr(self):
c = 1.0*self.get_c()
return self.tp / c if c > 0.0 else 0.0
def fpr(self):
_c = 1.0*self.get_not_c()
return self.fp / _c if _c > 0.0 else 0.0
def round_robin_selection(X, Y, k, tsr_function=positive_information_gain):
print(f'[selecting {k} terms]')
nC = Y.shape[1]
FC = get_tsr_matrix(get_supervised_matrix(X, Y), tsr_function).T
best_features_idx = np.argsort(-FC, axis=0).flatten()
tsr_values = FC.flatten()
selected_indexes_set = set()
selected_indexes = list()
selected_value = list()
from_category = list()
round_robin = iter(best_features_idx)
values_iter = iter(tsr_values)
round=0
while len(selected_indexes) < k:
term_idx = next(round_robin)
term_val = next(values_iter)
if term_idx not in selected_indexes_set:
selected_indexes_set.add(term_idx)
selected_indexes.append(term_idx)
selected_value.append(term_val)
from_category.append(round)
round = (round + 1) % nC
return np.asarray(selected_indexes, dtype=int), np.asarray(selected_value, dtype=float), np.asarray(from_category)
def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD):
tp_ = len(positive_document_indexes & feature_document_indexes)
fp_ = len(feature_document_indexes - positive_document_indexes)
fn_ = len(positive_document_indexes - feature_document_indexes)
tn_ = nD - (tp_ + fp_ + fn_)
return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_)
def category_tables(feature_sets, category_sets, c, nD, nF):
return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)]
"""
Computes the nC x nF supervised matrix M where Mcf is the 4-cell contingency table for feature f and class c.
Efficiency O(nF x nC x log(S)) where S is the sparse factor
"""
def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1):
nD, nF = coocurrence_matrix.shape
nD2, nC = label_matrix.shape
if nD != nD2:
raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' %
(coocurrence_matrix.shape,label_matrix.shape))
def nonzero_set(matrix, col):
return set(matrix[:, col].nonzero()[0])
if isinstance(coocurrence_matrix, csr_matrix):
coocurrence_matrix = csc_matrix(coocurrence_matrix)
feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)]
category_sets = [nonzero_set(label_matrix, c) for c in range(nC)]
cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC))
return np.array(cell_matrix)
# obtains the matrix T where Tcf=tsr(f,c) is the tsr score for category c and feature f
def get_tsr_matrix(cell_matrix, tsr_score_funtion):
nC,nF = cell_matrix.shape
tsr_matrix = [[tsr_score_funtion(cell_matrix[c,f]) for f in range(nF)] for c in range(nC)]
return np.array(tsr_matrix)
""" The Fisher-score [1] is not computed on the 4-cell contingency table, but can
take as input any real-valued feature column (e.g., tf-idf weights).
feat is the feature vector, and c is a binary classification vector.
This implementation covers only the binary case, while the formula is defined for multiclass
single-label scenarios, for which the version [2] might be preferred.
[1] R.O. Duda, P.E. Hart, and D.G. Stork. Pattern classification. Wiley-interscience, 2012.
[2] Gu, Q., Li, Z., & Han, J. (2012). Generalized fisher score for feature selection. arXiv preprint arXiv:1202.3725.
"""
def fisher_score_binary(feat, c):
neg = np.ones_like(c) - c
npos = np.sum(c)
nneg = np.sum(neg)
mupos = np.mean(feat[c == 1])
muneg = np.mean(feat[neg == 1])
mu = np.mean(feat)
stdpos = np.std(feat[c == 1])
stdneg = np.std(feat[neg == 1])
num = npos * ((mupos - mu) ** 2) + nneg * ((muneg - mu) ** 2)
den = npos * (stdpos ** 2) + nneg * (stdneg ** 2)
if den>0:
return num / den
else:
return num
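# A compact usage sketch with toy data (in the pipeline, X is the sparse document-by-term
# matrix and Y the binary document-by-label matrix):
#
#   import numpy as np
#   from scipy.sparse import csr_matrix
#   X = csr_matrix((np.random.rand(50, 20) > 0.7).astype(float))
#   Y = (np.random.rand(50, 3) > 0.7).astype(int)
#   cells = get_supervised_matrix(X, Y)            # 3 x 20 array of ContTable objects
#   IG = get_tsr_matrix(cells, information_gain)   # 3 x 20 array of information-gain scores
#   ig = information_gain(ContTable(tp=10, fp=5, fn=2, tn=33))   # score for a single cell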

567
src/dataset_builder.py Normal file

@ -0,0 +1,567 @@
from os.path import join, exists
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from data.reader.jrcacquis_reader import *
from data.languages import lang_set, NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING
from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2, fetch_topic_hierarchy
from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample
from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import issparse
import itertools
class MultilingualDataset:
"""
A multilingual dataset is a dictionary of training and test documents indexed by language code.
Train and test sets are represented as tuples of the type (X,Y,ids), where X is a matrix representation of the
documents (e.g., a document-by-term sparse csr_matrix), Y is a document-by-label binary np.array indicating the
labels of each document, and ids is a list of document-identifiers from the original collection.
"""
def __init__(self):
self.dataset_name = ""
self.multiling_dataset = {}
def add(self, lang, Xtr, Ytr, Xte, Yte, tr_ids=None, te_ids=None):
self.multiling_dataset[lang] = ((Xtr, Ytr, tr_ids), (Xte, Yte, te_ids))
def save(self, file):
self.sort_indexes()
pickle.dump(self, open(file, 'wb'), pickle.HIGHEST_PROTOCOL)
return self
def __getitem__(self, item):
if item in self.langs():
return self.multiling_dataset[item]
return None
@classmethod
def load(cls, file):
data = pickle.load(open(file, 'rb'))
data.sort_indexes()
return data
@classmethod
def load_ids(cls, file):
data = pickle.load(open(file, 'rb'))
tr_ids = {lang:tr_ids for (lang,((_,_,tr_ids), (_,_,_))) in data.multiling_dataset.items()}
te_ids = {lang: te_ids for (lang, ((_, _, _), (_, _, te_ids))) in data.multiling_dataset.items()}
return tr_ids, te_ids
def sort_indexes(self):
for (lang, ((Xtr,_,_),(Xte,_,_))) in self.multiling_dataset.items():
if issparse(Xtr): Xtr.sort_indices()
if issparse(Xte): Xte.sort_indices()
def set_view(self, categories=None, languages=None):
if categories is not None:
if isinstance(categories, int):
categories = np.array([categories])
elif isinstance(categories, list):
categories = np.array(categories)
self.categories_view = categories
if languages is not None:
self.languages_view = languages
def training(self):
return self.lXtr(), self.lYtr()
def test(self):
return self.lXte(), self.lYte()
def lXtr(self):
return {lang:Xtr for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
def lXte(self):
return {lang:Xte for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()}
def lYtr(self):
return {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
def lYte(self):
return {lang:self.cat_view(Yte) for (lang, (_,(_,Yte,_))) in self.multiling_dataset.items() if lang in self.langs()}
def cat_view(self, Y):
if hasattr(self, 'categories_view'):
return Y[:,self.categories_view]
else:
return Y
def langs(self):
if hasattr(self, 'languages_view'):
langs = self.languages_view
else:
langs = sorted(self.multiling_dataset.keys())
return langs
def num_categories(self):
return self.lYtr()[self.langs()[0]].shape[1]
def show_dimensions(self):
for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items():
if lang not in self.langs(): continue
if hasattr(Xtr, 'shape') and hasattr(Xte, 'shape'):
print("Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(lang, Xtr.shape, self.cat_view(Ytr).shape, Xte.shape, self.cat_view(Yte).shape))
def show_category_prevalences(self):
#pass
nC = self.num_categories()
accum_tr = np.zeros(nC, dtype=np.int)
accum_te = np.zeros(nC, dtype=np.int)
in_langs = np.zeros(nC, dtype=np.int) #count languages with at least one positive example (per category)
for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items():
if lang not in self.langs(): continue
prev_train = np.sum(self.cat_view(Ytr), axis=0)
prev_test = np.sum(self.cat_view(Yte), axis=0)
accum_tr += prev_train
accum_te += prev_test
in_langs += (prev_train>0)*1
print(lang+'-train', prev_train)
print(lang+'-test', prev_test)
print('all-train', accum_tr)
print('all-test', accum_te)
return accum_tr, accum_te, in_langs
def set_labels(self, labels):
self.labels = labels
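# A minimal usage sketch of the container above (tiny random matrices; real splits are produced
# by the dataset generators further down in this file):
#
#   import numpy as np
#   from scipy.sparse import csr_matrix
#   data = MultilingualDataset()
#   data.dataset_name = 'toy'
#   for lang in ('en', 'it'):
#       Xtr, Xte = csr_matrix(np.random.rand(8, 5)), csr_matrix(np.random.rand(4, 5))
#       Ytr, Yte = np.random.randint(0, 2, (8, 3)), np.random.randint(0, 2, (4, 3))
#       data.add(lang, Xtr, Ytr, Xte, Yte)
#   lXtr, lYtr = data.training()      # {lang: X} and {lang: Y} dictionaries
#   data.set_view(languages=['en'])   # restrict subsequent accessors to English only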
# ----------------------------------------------------------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------------------------------------------------------
def get_active_labels(doclist):
cat_list = set()
for d in doclist:
cat_list.update(d.categories)
return list(cat_list)
def filter_by_categories(doclist, keep_categories):
catset = frozenset(keep_categories)
for d in doclist:
d.categories = list(set(d.categories).intersection(catset))
def __years_to_str(years):
if isinstance(years, list):
if len(years) > 1:
return str(years[0])+'-'+str(years[-1])
return str(years[0])
return str(years)
# ----------------------------------------------------------------------------------------------------------------------
# Matrix builders
# ----------------------------------------------------------------------------------------------------------------------
def build_independent_matrices(dataset_name, langs, training_docs, test_docs, label_names, wiki_docs=[], preprocess=True):
"""
Builds the document-by-term weighted matrices for each language. Representations are independent of each other,
i.e., each language-specific matrix lies in a dedicated feature space.
:param dataset_name: the name of the dataset (str)
:param langs: list of languages (str)
:param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
:param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
:param label_names: list of names of labels (str)
:param wiki_docs: doc-list (optional), if specified, project all wiki docs in the feature spaces built for the languages
:param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming)
:return: a MultilingualDataset. If wiki_docs has been specified, a dictionary lW is also returned, which indexes
by language the processed wikipedia documents in their respective language-specific feature spaces
"""
mlb = MultiLabelBinarizer()
mlb.fit([label_names])
lW = {}
multilingual_dataset = MultilingualDataset()
multilingual_dataset.dataset_name = dataset_name
multilingual_dataset.set_labels(mlb.classes_)
for lang in langs:
print("\nprocessing %d training, %d test, %d wiki for language <%s>" %
(len(training_docs[lang]), len(test_docs[lang]), len(wiki_docs[lang]) if wiki_docs else 0, lang))
tr_data, tr_labels, IDtr = zip(*training_docs[lang])
te_data, te_labels, IDte = zip(*test_docs[lang])
if preprocess:
tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True,
tokenizer=NLTKStemTokenizer(lang, verbose=True),
stop_words=stopwords.words(NLTK_LANGMAP[lang]))
else:
tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True)
Xtr = tfidf.fit_transform(tr_data)
Xte = tfidf.transform(te_data)
if wiki_docs:
lW[lang] = tfidf.transform(wiki_docs[lang])
Ytr = mlb.transform(tr_labels)
Yte = mlb.transform(te_labels)
multilingual_dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)
multilingual_dataset.show_dimensions()
multilingual_dataset.show_category_prevalences()
if wiki_docs:
return multilingual_dataset, lW
else:
return multilingual_dataset
# creates a MultilingualDataset whose matrices share a single juxtaposed feature space
def build_juxtaposed_matrices(dataset_name, langs, training_docs, test_docs, label_names, preprocess=True):
"""
Builds the document-by-term weighted matrices for each language. Representations are not independent of each other,
since all of them lie in the same juxtaposed feature space.
:param dataset_name: the name of the dataset (str)
:param langs: list of languages (str)
:param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
:param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
:param label_names: list of names of labels (str)
:param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming)
:return: a MultilingualDataset whose language-specific matrices share a single juxtaposed feature space
"""
multiling_dataset = MultilingualDataset()
multiling_dataset.dataset_name = dataset_name
mlb = MultiLabelBinarizer()
mlb.fit([label_names])
multiling_dataset.set_labels(mlb.classes_)
tr_data_stack = []
for lang in langs:
print("\nprocessing %d training and %d test for language <%s>" % (len(training_docs[lang]), len(test_docs[lang]), lang))
tr_data, tr_labels, tr_ID = zip(*training_docs[lang])
te_data, te_labels, te_ID = zip(*test_docs[lang])
if preprocess:
tr_data = preprocess_documents(tr_data, lang)
te_data = preprocess_documents(te_data, lang)
tr_data_stack.extend(tr_data)
multiling_dataset.add(lang, tr_data, tr_labels, te_data, te_labels, tr_ID, te_ID)
tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True)
tfidf.fit(tr_data_stack)
for lang in langs:
print("\nweighting documents for language <%s>" % (lang))
(tr_data, tr_labels, tr_ID), (te_data, te_labels, te_ID) = multiling_dataset[lang]
Xtr = tfidf.transform(tr_data)
Xte = tfidf.transform(te_data)
Ytr = mlb.transform(tr_labels)
Yte = mlb.transform(te_labels)
multiling_dataset.add(lang,Xtr,Ytr,Xte,Yte,tr_ID,te_ID)
multiling_dataset.show_dimensions()
return multiling_dataset
# ----------------------------------------------------------------------------------------------------------------------
# Methods to recover the original documents from the MultilingualDataset's ids
# ----------------------------------------------------------------------------------------------------------------------
"""
This method has been added a posteriori, to create document embeddings using the polylingual embeddings of the recent
article 'Word Translation without Parallel Data'; basically, it takes one of the splits, retrieves the RCV documents
from the doc ids, and then pickles the resulting dataset of raw training and test documents in outpath
"""
def retrieve_rcv_documents_from_dataset(datasetpath, rcv1_data_home, rcv2_data_home, outpath):
tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath)
assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te'
langs = list(tr_ids.keys())
print('fetching the datasets')
rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train')
rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en'])
filter_by_categories(rcv1_documents, labels_rcv2)
filter_by_categories(rcv2_documents, labels_rcv1)
label_names = get_active_labels(rcv1_documents + rcv2_documents)
print('Active labels in RCV1/2 {}'.format(len(label_names)))
print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names)))
print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents]))
all_docs = rcv1_documents + rcv2_documents
mlb = MultiLabelBinarizer()
mlb.fit([label_names])
dataset = MultilingualDataset()
for lang in langs:
analyzer = CountVectorizer(strip_accents='unicode', min_df=3,
stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer()
Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in tr_ids[lang]])
Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in te_ids[lang]])
Xtr = [' '.join(analyzer(d)) for d in Xtr]
Xte = [' '.join(analyzer(d)) for d in Xte]
Ytr = mlb.transform(Ytr)
Yte = mlb.transform(Yte)
dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)
dataset.save(outpath)
"""
Same thing but for JRC-Acquis
"""
def retrieve_jrc_documents_from_dataset(datasetpath, jrc_data_home, train_years, test_years, cat_policy, most_common_cat, outpath):
tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath)
assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te'
langs = list(tr_ids.keys())
print('fetching the datasets')
cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years,
cat_filter=cat_list, cat_threshold=1, parallel=None,
most_frequent=most_common_cat)
test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names,
parallel='force')
def filter_by_id(doclist, ids):
ids_set = frozenset(itertools.chain.from_iterable(ids.values()))
return [x for x in doclist if (x.parallel_id+'__'+x.id) in ids_set]
training_docs = filter_by_id(training_docs, tr_ids)
test_docs = filter_by_id(test_docs, te_ids)
print('jrc: {} train, {} test, {} categories'.format(len(training_docs), len(test_docs), len(label_names)))
mlb = MultiLabelBinarizer()
mlb.fit([label_names])
dataset = MultilingualDataset()
for lang in langs:
analyzer = CountVectorizer(strip_accents='unicode', min_df=3,
stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer()
Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in training_docs if d.lang == lang])
Xte,Yte,IDte = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in test_docs if d.lang == lang])
Xtr = [' '.join(analyzer(d)) for d in Xtr]
Xte = [' '.join(analyzer(d)) for d in Xte]
Ytr = mlb.transform(Ytr)
Yte = mlb.transform(Yte)
dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)
dataset.save(outpath)
# ----------------------------------------------------------------------------------------------------------------------
# Dataset Generators
# ----------------------------------------------------------------------------------------------------------------------
def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test_years, cat_policy, most_common_cat=-1, max_wiki=5000, run=0):
"""
Prepare all datasets for JRC-Acquis. The datasets include the "feature-independent" version, the
"feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices.
In all cases, training documents are strictly non-parallel, and test documents are strictly parallel
:param jrc_data_home: path to the raw JRC-Acquis documents (it will be downloaded if not found), and the path where
all splits will be generated
:param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py)
:param langs: the list of languages to consider (as defined in data/languages.py)
:param train_years: a list of ints containing the years to be considered as training documents
:param test_years: a list of ints containing the years to be considered as test documents
:param cat_policy: a string indicating which category selection policy to apply. Valid policies are, e.g., "all"
(select all categories), "broadest" (select only the broadest concepts in the taxonomy), or "leaves" (select the
leaves concepts in the taxonomy). See inspect_eurovoc from data/reader/jrcacquis_reader.py for more details
:param most_common_cat: the maximum number of most common categories to consider, or -1 to keep them all
:param max_wiki: the maximum number of wikipedia documents to consider (default 5000)
:param run: a numeric label naming the random split (useful to keep track of different runs)
:return: None
"""
name = 'JRCacquis'
run = '_run' + str(run)
config_name = 'jrc_nltk_' + __years_to_str(train_years) + \
'vs' + __years_to_str(test_years) + \
'_' + cat_policy + \
('_top' + str(most_common_cat) if most_common_cat!=-1 else '') + \
'_noparallel_processed'
indep_path = join(jrc_data_home, config_name + run + '.pickle')
upper_path = join(jrc_data_home, config_name + run + '_upper.pickle')
yuxta_path = join(jrc_data_home, config_name + run + '_yuxtaposed.pickle')
wiki_path = join(jrc_data_home, config_name + run + '.wiki.pickle')
wiki_docs_path = join(jrc_data_home, config_name + '.wiki.raw.pickle')
cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years,
cat_filter=cat_list, cat_threshold=1, parallel=None,
most_frequent=most_common_cat)
test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names,
parallel='force')
print('Generating feature-independent dataset...')
training_docs_no_parallel = random_sampling_avoiding_parallel(training_docs)
def _group_by_lang(doc_list, langs):
return {lang: [(d.text, d.categories, d.parallel_id + '__' + d.id) for d in doc_list if d.lang == lang]
for lang in langs}
training_docs = _group_by_lang(training_docs, langs)
training_docs_no_parallel = _group_by_lang(training_docs_no_parallel, langs)
test_docs = _group_by_lang(test_docs, langs)
if not exists(indep_path):
wiki_docs=None
if max_wiki>0:
if not exists(wiki_docs_path):
wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False)
wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL)
else:
wiki_docs = pickle.load(open(wiki_docs_path, 'rb'))
wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
if wiki_docs:
lang_data, wiki_docs = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names, wiki_docs)
pickle.dump(wiki_docs, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL)
else:
lang_data = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names)
lang_data.save(indep_path)
print('Generating upper-bound (English-only) dataset...')
if not exists(upper_path):
training_docs_eng_only = {'en':training_docs['en']}
test_docs_eng_only = {'en':test_docs['en']}
build_independent_matrices(name, ['en'], training_docs_eng_only, test_docs_eng_only, label_names).save(upper_path)
print('Generating juxtaposed dataset...')
if not exists(yuxta_path):
build_juxtaposed_matrices(name, langs, training_docs_no_parallel, test_docs, label_names).save(yuxta_path)
def prepare_rcv_datasets(outpath, rcv1_data_home, rcv2_data_home, wiki_data_home, langs,
train_for_lang=1000, test_for_lang=1000, max_wiki=5000, preprocess=True, run=0):
"""
Prepare all datasets for RCV1/RCV2. The datasets include the "feature-independent" version, the
"feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices.
:param outpath: path where all splits will be dumped
:param rcv1_data_home: path to the RCV1-v2 dataset (English only)
:param rcv2_data_home: path to the RCV2 dataset (all languages other than English)
:param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py)
:param langs: the list of languages to consider (as defined in data/languages.py)
:param train_for_lang: maximum number of training documents per language
:param test_for_lang: maximum number of test documents per language
:param max_wiki: the maximum number of wikipedia documents to consider (default 5000)
:param preprocess: whether or not to apply language-specific preprocessing (stopwords removal and stemming)
:param run: a numeric label naming the random split (useful to keep track of different runs)
:return: None
"""
assert 'en' in langs, 'English is not in requested languages, but is needed for some datasets'
assert len(langs)>1, 'the multilingual dataset cannot be built with only one language'
assert not preprocess or set(langs).issubset(set(RCV2_LANGS_WITH_NLTK_STEMMING+['en'])), \
"languages not in RCV1-v2/RCV2 scope or not valid for NLTK's processing"
name = 'RCV1/2'
run = '_run' + str(run)
config_name = 'rcv1-2_nltk_trByLang'+str(train_for_lang)+'_teByLang'+str(test_for_lang)+\
('_processed' if preprocess else '_raw')
indep_path = join(outpath, config_name + run + '.pickle')
upper_path = join(outpath, config_name + run +'_upper.pickle')
yuxta_path = join(outpath, config_name + run +'_yuxtaposed.pickle')
wiki_path = join(outpath, config_name + run + '.wiki.pickle')
wiki_docs_path = join(outpath, config_name + '.wiki.raw.pickle')
print('fetching the datasets')
rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train')
rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l!='en'])
filter_by_categories(rcv1_documents, labels_rcv2)
filter_by_categories(rcv2_documents, labels_rcv1)
label_names = get_active_labels(rcv1_documents+rcv2_documents)
print('Active labels in RCV1/2 {}'.format(len(label_names)))
print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names)))
print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents]))
lang_docs = {lang: [d for d in rcv1_documents + rcv2_documents if d.lang == lang] for lang in langs}
# for the upper bound there are no parallel versions, so for English we take as many training documents as there
# would be in the multilingual case; afterwards, only the first train_for_lang of them are kept for the multilingual splits
print('Generating upper-bound (English-only) dataset...')
train, test = train_test_split(lang_docs['en'], train_size=train_for_lang*len(langs), test_size=test_for_lang, shuffle=True)
train_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in train]}
test_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in test]}
build_independent_matrices(name, ['en'], train_lang_doc_map, test_lang_doc_map, label_names).save(upper_path)
train_lang_doc_map['en'] = train_lang_doc_map['en'][:train_for_lang]
for lang in langs:
if lang=='en': continue # already split
test_take = min(test_for_lang, len(lang_docs[lang])-train_for_lang)
train, test = train_test_split(lang_docs[lang], train_size=train_for_lang, test_size=test_take, shuffle=True)
train_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in train]
test_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in test]
print('Generating feature-independent dataset...')
wiki_docs=None
if max_wiki>0:
if not exists(wiki_docs_path):
wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False)
wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL)
else:
wiki_docs = pickle.load(open(wiki_docs_path, 'rb'))
wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
if wiki_docs:
lang_data, wiki_docs_matrix = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess)
pickle.dump(wiki_docs_matrix, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL)
else:
lang_data = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess)
lang_data.save(indep_path)
print('Generating juxtaposed dataset...')
build_juxtaposed_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, preprocess).save(yuxta_path)
#-----------------------------------------------------------------------------------------------------------------------
# MAIN BUILDER
#-----------------------------------------------------------------------------------------------------------------------
if __name__=='__main__':
import sys
assert len(sys.argv) == 5, "wrong number of arguments; required: " \
"<JRC_PATH> <RCV1_PATH> <RCV2_PATH> <WIKI_PATH> "
JRC_DATAPATH = sys.argv[1] # "../Datasets/JRC_Acquis_v3"
RCV1_PATH = sys.argv[2] #'../Datasets/RCV1-v2/unprocessed_corpus'
RCV2_PATH = sys.argv[3] #'../Datasets/RCV2'
WIKI_DATAPATH = sys.argv[4] #"../Datasets/Wikipedia/multilingual_docs_JRC_NLTK"
langs = lang_set['JRC_NLTK']
max_wiki = 5000
for run in range(0,10):
print('Building JRC-Acquis datasets run', run)
prepare_jrc_datasets(JRC_DATAPATH, WIKI_DATAPATH, langs,
train_years=list(range(1958, 2006)), test_years=[2006], max_wiki=max_wiki,
cat_policy='all', most_common_cat=300, run=run)
print('Building RCV1-v2/2 datasets run', run)
prepare_rcv_datasets(RCV2_PATH, RCV1_PATH, RCV2_PATH, WIKI_DATAPATH, RCV2_LANGS_WITH_NLTK_STEMMING + ['en'],
train_for_lang=1000, test_for_lang=1000, max_wiki=max_wiki, run=run)
# uncomment this code if you want to retrieve the original documents to generate the data splits for PLE
# (make sure you have not modified the above parameters, or adapt the following paths accordingly...)
# datasetpath = join(RCV2_PATH,'rcv1-2_nltk_trByLang1000_teByLang1000_processed_run{}.pickle'.format(run))
# outpath = datasetpath.replace('_nltk_','_doclist_')
# retrieve_rcv_documents_from_dataset(datasetpath, RCV1_PATH, RCV2_PATH, outpath)
# datasetpath = join(JRC_DATAPATH, 'jrc_nltk_1958-2005vs2006_all_top300_noparallel_processed_run{}.pickle'.format(run))
# outpath = datasetpath.replace('_nltk_', '_doclist_')
# retrieve_jrc_documents_from_dataset(datasetpath, JRC_DATAPATH, train_years=list(range(1958, 2006)), test_years=[2006], cat_policy='all', most_common_cat=300, outpath=outpath)
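# Expected invocation (the four paths are placeholders taken from the comments above):
#
#   python dataset_builder.py ../Datasets/JRC_Acquis_v3 ../Datasets/RCV1-v2/unprocessed_corpus \
#       ../Datasets/RCV2 ../Datasets/Wikipedia/multilingual_docs_JRC_NLTK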

Binary file not shown.

646
src/learning/learners.py Normal file

@ -0,0 +1,646 @@
import numpy as np
import time
from data.embeddings import WordEmbeddings, embedding_matrix, WCE_matrix
from scipy.sparse import issparse, csr_matrix
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
# from sklearn.externals.joblib import Parallel, delayed
from joblib import Parallel, delayed
from sklearn.feature_extraction.text import TfidfVectorizer
def _sort_if_sparse(X):
if issparse(X) and not X.has_sorted_indices:
X.sort_indices()
def _joblib_transform_multiling(transformer, lX, n_jobs=-1):
if n_jobs == 1:
return {lang:transformer(lX[lang]) for lang in lX.keys()}
else:
langs = list(lX.keys())
transformations = Parallel(n_jobs=n_jobs)(delayed(transformer)(lX[lang]) for lang in langs)
return {lang: transformations[i] for i, lang in enumerate(langs)}
class TrivialRejector:
def fit(self, X, y):
self.cats = y.shape[1]
return self
def decision_function(self, X): return np.zeros((X.shape[0],self.cats))
def predict(self, X): return np.zeros((X.shape[0],self.cats))
def predict_proba(self, X): return np.zeros((X.shape[0],self.cats))
def best_params(self): return {}
class FunnellingPolylingualClassifier:
"""
This classifier projects each document d into a language-independent feature space where each dimension fi is the
decision score phi_l(d,ci) of an auxiliary classifier phi_l trained on category ci for documents in language l;
then it trains a single classifier for all documents in this space, irrespective of their original language
"""
def __init__(self, first_tier_learner, meta_learner, first_tier_parameters=None, meta_parameters=None, folded_projections=1,
calmode='cal', n_jobs=-1):
"""
:param first_tier_learner: the learner used in the first-tier level
:param meta_learner: the learner used in the second-tier level
:param first_tier_parameters: parameters for the learner in the doc_projector
:param meta_parameters: parameters for the learner in the z-space
:param folded_projections: if 1, the model trains the auxiliary classifiers with all the training data and then
projects the data before training the final classifier; if greater than one, the training set is split into as
many folds as indicated, and the projected space is composed by concatenating the predictions for each fold, obtained
from models trained on the remaining folds. This should increase the generality of the space with respect to unseen data.
:param calmode: 'cal' to calibrate the base classifiers, 'nocal' to use the decision_function instead, or
'sigmoid' to use the sigmoid of the decision_function
:param n_jobs: number of parallel threads
"""
assert folded_projections>0, "positive number of folds expected"
assert calmode in ['cal','nocal','sigmoid'], 'unknown calmode'
assert calmode!='cal' or first_tier_learner.probability, 'calmode=cal requires the learner to have probability=True'
self.fist_tier_learner = first_tier_learner
self.meta_learner = meta_learner
self.fist_tier_parameters=first_tier_parameters
self.meta_parameters = meta_parameters
self.doc_projector = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
self.doc_projector_bu = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
self.folded_projections = folded_projections
self.n_jobs = n_jobs
self.calmode = calmode
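# A usage sketch (assumption: scikit-learn SVMs for both tiers; lXtr/lYtr and lXte are the
# {lang: matrix} / {lang: labels} dictionaries returned by MultilingualDataset.training()/.test()):
#
#   from sklearn.svm import SVC
#   clf = FunnellingPolylingualClassifier(first_tier_learner=SVC(kernel='linear', probability=True),
#                                         meta_learner=SVC(kernel='rbf'))
#   clf.fit(lXtr, lYtr)
#   ly_pred = clf.predict(lXte)   # {lang: predicted label matrix}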
def _projection(self, doc_projector, lX):
"""
Decides which projection function to apply: predict_proba if the base classifiers are calibrated,
decision_function otherwise
:param doc_projector: the document projector (a NaivePolylingualClassifier)
:param lX: {lang:matrix} dictionary of documents to project
:return: the projection, applied with predict_proba or decision_function
"""
if self.calmode=='cal':
return doc_projector.predict_proba(lX)
else:
l_decision_scores = doc_projector.decision_function(lX)
if self.calmode=='sigmoid':
def sigmoid(x): return 1 / (1 + np.exp(-x))
for lang in l_decision_scores.keys():
l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
return l_decision_scores
def _get_zspace(self, lXtr, lYtr, lXproj=None, lYproj=None):
"""
Produces the vector space of posterior probabilities (if the first tier is calibrated) or of
decision scores (otherwise). This space is referred to as the Z-space.
:param lXtr: {lang:matrix} to train
:param lYtr: {lang:labels} to train
:param lXproj: {lang:matrix} to project (if None, then projects the lXtr)
:param lYproj: {lang:labels} to stack in the same order (if None, then lYtr will be stacked)
:return: the projection of lXproj documents into the Z-space defined by the confidence scores of language-specific
models trained on lXtr, and the lYproj labels stacked consistently
"""
repair_empty_folds = True
if lXproj is None and lYproj is None:
lXproj, lYproj = lXtr, lYtr
repair_empty_folds = False
print('fitting the projectors... {}'.format(lXtr.keys()))
self.doc_projector.fit(lXtr, lYtr)
print('projecting the documents')
langs = list(lXtr.keys())
lZ = self._projection(self.doc_projector, lXproj)
if repair_empty_folds:
    # empty folds are replaced by the posterior probabilities generated by the non-folded version
    empty_categories = self.doc_projector.empty_categories
    lZ_bu = self._projection(self.doc_projector_bu, lXproj)
    for lang in langs:
        repair = empty_categories[lang]
        lZ[lang][:, repair] = lZ_bu[lang][:, repair]
Z = np.vstack([lZ[lang] for lang in langs]) # Z is the language independent space
zy = np.vstack([lYproj[lang] for lang in langs])
return Z, zy
def _get_zspace_folds(self, lX, ly):
self.doc_projector_bu.fit(lX, ly)
print('split of {} folds'.format(self.folded_projections))
skf = KFold(n_splits=self.folded_projections, shuffle=True)
Z, zy = [], []
lfold = {lang: list(skf.split(lX[lang], ly[lang])) for lang in lX.keys()}
for fold in range(self.folded_projections):
print('fitting the projectors ({}/{})...'.format(fold + 1, self.folded_projections))
lfoldXtr, lfoldYtr = {}, {}
lfoldXte, lfoldYte = {}, {}
for lang in lX.keys():
train, test = lfold[lang][fold]
lfoldXtr[lang] = lX[lang][train]
lfoldYtr[lang] = ly[lang][train]
lfoldXte[lang] = lX[lang][test]
lfoldYte[lang] = ly[lang][test]
Zfold, zYfold = self._get_zspace(lfoldXtr, lfoldYtr, lfoldXte, lfoldYte)
Z.append(Zfold)
zy.append(zYfold)
# compose the Z-space as the union of all folded predictions
Z = np.vstack(Z)
zy = np.vstack(zy)
# refit the document projector with all examples to have a more reliable projector for test data
self.doc_projector = self.doc_projector_bu
return Z, zy
def fit(self, lX, ly, lZ=None, lzy=None):
tinit = time.time()
Z, zy = self._get_zspace(lX, ly) if self.folded_projections == 1 else self._get_zspace_folds(lX, ly)
#experimental: adds the posterior probabilities (computed outside) to the meta-classifier
if lZ is not None and lzy is not None:
zlangs = list(lZ.keys())
Z = np.vstack((Z, *[lZ[l] for l in zlangs]))
zy = np.vstack((zy, *[lzy[l] for l in zlangs]))
print('fitting the Z-space of shape={}'.format(Z.shape))
self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters, n_jobs=self.n_jobs)
self.model.fit(Z, zy)
self.time = time.time() - tinit
return self
def predict(self, lX, lZ=None):
"""
:param lX: a dictionary {language_label: X csr-matrix}
:param lZ: a dictionary {language_label: Z matrix}; if specified, concats this representation
:return: a dictionary of predictions
"""
lZ_ = self._projection(self.doc_projector, lX)
if lZ is not None:
lZ_ = {**lZ_, **lZ}
return _joblib_transform_multiling(self.model.predict, lZ_, n_jobs=self.n_jobs)
def best_params(self):
params = self.doc_projector.best_params()
params['meta'] = self.model.best_params()
return params
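# Minimal usage sketch (toy data): a linear SVC acts both as first-tier and meta learner, with calmode='nocal'
# so that no probability calibration is needed. Shapes, languages and label values below are illustrative only.
def _demo_funnelling_toy():
    import numpy as np
    from scipy.sparse import csr_matrix
    from sklearn.svm import SVC
    rng = np.random.RandomState(0)
    # two "languages", 20 documents each, 10 tfidf-like features, 3 categories
    lX = {lang: csr_matrix(rng.rand(20, 10)) for lang in ('en', 'da')}
    y = np.zeros((20, 3), dtype=int)
    y[:10, 0] = 1
    y[5:15, 1] = 1
    y[10:, 2] = 1
    ly = {lang: y.copy() for lang in ('en', 'da')}
    clf = FunnellingPolylingualClassifier(first_tier_learner=SVC(kernel='linear'),
                                          meta_learner=SVC(kernel='linear'),
                                          calmode='nocal', n_jobs=1)
    clf.fit(lX, ly)
    return clf.predict(lX)  # {lang: (20, 3) binary predictions}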
class NaivePolylingualClassifier:
"""
A set of independent MonolingualClassifiers, one per language
"""
def __init__(self, base_learner, parameters=None, n_jobs=-1):
self.base_learner = base_learner
self.parameters = parameters
self.model = None
self.n_jobs = n_jobs
def fit(self, lX, ly):
"""
trains the independent monolingual classifiers
:param lX: a dictionary {language_label: X csr-matrix}
:param ly: a dictionary {language_label: y np.array}
:return: self
"""
tinit = time.time()
assert set(lX.keys()) == set(ly.keys()), 'inconsistent language mappings in fit'
langs = list(lX.keys())
for lang in langs:
_sort_if_sparse(lX[lang])
# models = Parallel(n_jobs=self.n_jobs)\
# (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]),ly[lang]) for lang in langs)
models = [MonolingualClassifier(self.base_learner, parameters=self.parameters) for lang in langs]
for model, lang in zip(models, langs):
model.fit(lX[lang], ly[lang])
self.model = {lang: models[i] for i, lang in enumerate(langs)}
self.empty_categories = {lang:self.model[lang].empty_categories for lang in langs}
self.time = time.time() - tinit
return self
def decision_function(self, lX):
"""
:param lX: a dictionary {language_label: X csr-matrix}
:return: a dictionary of classification scores for each class
"""
assert self.model is not None, 'predict called before fit'
assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function'
langs=list(lX.keys())
scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].decision_function)(lX[lang]) for lang in langs)
return {lang:scores[i] for i,lang in enumerate(langs)}
def predict_proba(self, lX):
"""
:param lX: a dictionary {language_label: X csr-matrix}
:return: a dictionary of probabilities that each document belongs to each class
"""
assert self.model is not None, 'predict called before fit'
assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in predict_proba'
langs=list(lX.keys())
scores = Parallel(n_jobs=self.n_jobs, max_nbytes=None)(delayed(self.model[lang].predict_proba)(lX[lang]) for lang in langs)
return {lang:scores[i] for i,lang in enumerate(langs)}
def predict(self, lX):
"""
:param lX: a dictionary {language_label: X csr-matrix}
:return: a dictionary of predictions
"""
assert self.model is not None, 'predict called before fit'
assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in predict'
if self.n_jobs == 1:
return {lang:self.model[lang].predict(lX[lang]) for lang in lX.keys()}
else:
langs = list(lX.keys())
scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].predict)(lX[lang]) for lang in langs)
return {lang: scores[i] for i, lang in enumerate(langs)}
def best_params(self):
return {l:model.best_params() for l,model in self.model.items()}
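# Minimal usage sketch (toy data): the naive baseline trains one independent classifier per language.
# The SVC learner and the random matrices are illustrative assumptions.
def _demo_naive_polylingual_toy():
    import numpy as np
    from scipy.sparse import csr_matrix
    from sklearn.svm import SVC
    rng = np.random.RandomState(0)
    lX = {lang: csr_matrix(rng.rand(15, 8)) for lang in ('en', 'fr')}
    y = np.zeros((15, 2), dtype=int)
    y[:8, 0] = 1
    y[5:, 1] = 1
    ly = {lang: y.copy() for lang in ('en', 'fr')}
    clf = NaivePolylingualClassifier(base_learner=SVC(kernel='linear'), n_jobs=1)
    clf.fit(lX, ly)
    return clf.predict(lX)  # {lang: (15, 2) binary predictions}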
class MonolingualClassifier:
def __init__(self, base_learner, parameters=None, n_jobs=-1):
self.learner = base_learner
self.parameters = parameters
self.model = None
self.n_jobs = n_jobs
self.best_params_ = None
def fit(self, X, y):
if X.shape[0] == 0:
print('Warning: X has 0 elements, a trivial rejector will be created')
self.model = TrivialRejector().fit(X,y)
self.empty_categories = np.arange(y.shape[1])
return self
tinit = time.time()
_sort_if_sparse(X)
self.empty_categories = np.argwhere(np.sum(y, axis=0)==0).flatten()
# multi-class format
if len(y.shape) == 2:
if self.parameters is not None:
self.parameters = [{'estimator__' + key: params[key] for key in params.keys()}
for params in self.parameters]
self.model = OneVsRestClassifier(self.learner, n_jobs=self.n_jobs)
else:
self.model = self.learner
raise NotImplementedError('not working as a base-classifier for funneling if there are gaps in the labels across languages')
# parameter optimization?
if self.parameters:
print('debug: optimizing parameters:', self.parameters)
self.model = GridSearchCV(self.model, param_grid=self.parameters, refit=True, cv=5, n_jobs=self.n_jobs,
error_score=0, verbose=10)
print('fitting:', self.model)
self.model.fit(X, y)
if isinstance(self.model, GridSearchCV):
self.best_params_ = self.model.best_params_
print('best parameters: ', self.best_params_)
self.time=time.time()-tinit
return self
def decision_function(self, X):
assert self.model is not None, 'predict called before fit'
_sort_if_sparse(X)
return self.model.decision_function(X)
def predict_proba(self, X):
assert self.model is not None, 'predict called before fit'
assert hasattr(self.model, 'predict_proba'), 'the probability predictions are not enabled in this model'
_sort_if_sparse(X)
return self.model.predict_proba(X)
def predict(self, X):
assert self.model is not None, 'predict called before fit'
_sort_if_sparse(X)
return self.model.predict(X)
def best_params(self):
return self.best_params_
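# Minimal usage sketch (toy multilabel problem, wrapped in one-vs-rest, no grid search); data is illustrative only.
def _demo_monolingual_toy():
    import numpy as np
    from scipy.sparse import csr_matrix
    from sklearn.svm import SVC
    rng = np.random.RandomState(0)
    X = csr_matrix(rng.rand(12, 6))
    y = np.array([[1, 0]] * 6 + [[0, 1]] * 6)
    clf = MonolingualClassifier(base_learner=SVC(kernel='linear'), n_jobs=1).fit(X, y)
    return clf.predict(X)  # (12, 2) binary predictions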
class PolylingualEmbeddingsClassifier:
"""
This classifier creates document embeddings as a tfidf-weighted average of the polylingual (MUSE) word embeddings from the article
@article{conneau2017word,
title={Word translation without parallel data},
author={Conneau, Alexis and Lample, Guillaume and Ranzato, Marc'Aurelio and Denoyer, Ludovic and J{\'e}gou, Herv{\'e}},
journal={arXiv preprint arXiv:1710.04087},
year={2017}
}
url: https://github.com/facebookresearch/MUSE
"""
def __init__(self, wordembeddings_path, learner, c_parameters=None, n_jobs=-1):
"""
:param wordembeddings_path: the path to the directory containing the polylingual embeddings
:param learner: the learner
:param c_parameters: parameters for learner
:param n_jobs: the number of concurrent threads
"""
self.wordembeddings_path = wordembeddings_path
self.learner = learner
self.c_parameters=c_parameters
self.n_jobs = n_jobs
self.lang_tfidf = {}
self.model = None
def fit_vectorizers(self, lX):
for lang in lX.keys():
if lang not in self.lang_tfidf:
tfidf = TfidfVectorizer(sublinear_tf=True, use_idf=True) # text is already processed
docs = lX[lang]
tfidf.fit(docs)
self.lang_tfidf[lang] = tfidf
def embed(self, docs, lang):
assert lang in self.lang_tfidf, 'unknown language'
tfidf_vectorizer = self.lang_tfidf[lang]
V = tfidf_vectorizer.vocabulary_
Xweights = tfidf_vectorizer.transform(docs)
print('loading word embeddings for ' + lang)
we = WordEmbeddings.load(self.wordembeddings_path, lang)
nD = len(docs)
doc_vecs = np.zeros((nD, we.dim()))
for i, doc in enumerate(docs):
print('\r\tcomplete {:.3f}%'.format(100 * (i + 1) / nD), end='')
# averaging with tfidf (summing each word only once, since the frequency is already controlled)
for w in set(doc.split()):
if w in we and w in V:
doc_vecs[i] += (we[w] * Xweights[i, V[w]])
# works much worse with idf; works much worse with document l2-normalization
print()
return doc_vecs
def fit(self, lX, ly):
"""
:param lX: a dictionary {language_label: [list of preprocessed documents]}
:param ly: a dictionary {language_label: ndarray of shape (ndocs, ncats) binary labels}
:return: self
"""
tinit = time.time()
langs = list(lX.keys())
WEtr, Ytr = [], []
self.fit_vectorizers(lX) # if already fit, does nothing
for lang in langs:
WEtr.append(self.embed(lX[lang], lang))
Ytr.append(ly[lang])
# TODO @Andrea --> here embeddings should be stacked horizontally!
WEtr = np.vstack(WEtr)
Ytr = np.vstack(Ytr)
self.embed_time = time.time() - tinit
print('fitting the WE-space of shape={}'.format(WEtr.shape))
self.model = MonolingualClassifier(base_learner=self.learner, parameters=self.c_parameters, n_jobs=self.n_jobs)
self.model.fit(WEtr, Ytr)
self.time = time.time() - tinit
return self
def predict(self, lX):
"""
:param lX: a dictionary {language_label: [list of preprocessed documents]}
"""
assert self.model is not None, 'predict called before fit'
langs = list(lX.keys())
lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
return _joblib_transform_multiling(self.model.predict, lWEte, n_jobs=self.n_jobs)
def predict_proba(self, lX):
"""
:param lX: a dictionary {language_label: [list of preprocessed documents]}
"""
assert self.model is not None, 'predict called before fit'
langs = list(lX.keys())
lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
return _joblib_transform_multiling(self.model.predict_proba, lWEte, n_jobs=self.n_jobs)
def best_params(self):
return self.model.best_params()
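# Usage sketch: requires the MUSE embedding files to be available locally; the directory path and the
# tiny document lists below are placeholders, not data shipped with this repository.
def _demo_polylingual_embeddings(muse_dir='/path/to/MUSE/embeddings'):
    import numpy as np
    from sklearn.svm import SVC
    lX = {'en': ['first document text', 'second document text'],
          'da': ['foerste dokument tekst', 'andet dokument tekst']}
    ly = {lang: np.array([[1, 0], [0, 1]]) for lang in lX}
    clf = PolylingualEmbeddingsClassifier(wordembeddings_path=muse_dir, learner=SVC(kernel='linear'), n_jobs=1)
    clf.fit(lX, ly)  # loads the per-language vectors via WordEmbeddings.load(muse_dir, lang)
    return clf.predict(lX)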
class FunnellingEmbeddingPolylingualClassifier:
""" Simulated: this setting is merely for testing purposes, and is not realistic. We here assume to have a tfidf
vectorizer for the out-of-scope languages (which is not fair)."""
def __init__(self, first_tier_learner, embed_learner, meta_learner, wordembeddings_path, training_languages,
first_tier_parameters = None, embed_parameters = None, meta_parameters = None, n_jobs=-1):
assert first_tier_learner.probability==True and embed_learner.probability==True, \
'both the first-tier classifier and the polyembedding classifier should allow calibration'
self.training_languages = training_languages
self.PLE = PolylingualEmbeddingsClassifier(wordembeddings_path, embed_learner,
c_parameters=embed_parameters, n_jobs=n_jobs)
self.Funnelling = FunnellingPolylingualClassifier(first_tier_learner, meta_learner,
first_tier_parameters=first_tier_parameters,
meta_parameters=meta_parameters, n_jobs=n_jobs)
self.n_jobs = n_jobs
def vectorize(self, lX):
return {l:self.PLE.lang_tfidf[l].transform(lX[l]) for l in lX.keys()}
def fit(self, lX, ly):
"""
:param lX: a dictionary {language_label: [list of preprocessed documents]}
:param ly: a dictionary {language_label: ndarray of shape (ndocs, ncats) binary labels}
:return:
"""
self.PLE.fit_vectorizers(lX)
tinit = time.time()
lX = {l: lX[l] for l in lX.keys() if l in self.training_languages}
ly = {l: ly[l] for l in lX.keys() if l in self.training_languages}
self.PLE.fit(lX, ly)
lZ = self.PLE.predict_proba(lX)
self.Funnelling.fit(self.vectorize(lX),ly,lZ,ly)
self.time = time.time() - tinit
return self
def predict(self, lX):
"""
:param lX: a dictionary {language_label: [list of preprocessed documents]}
"""
lXin = {l: lX[l] for l in lX.keys() if l in self.training_languages}
lXout = {l: lX[l] for l in lX.keys() if l not in self.training_languages}
lZ = self.PLE.predict_proba(lXout)
return self.Funnelling.predict(self.vectorize(lXin), lZ)
def best_params(self):
return {'PLE':self.PLE.best_params(), 'Funnelling':self.Funnelling.best_params()}
class AndreaCLF(FunnellingPolylingualClassifier):
def __init__(self,
we_path,
config,
first_tier_learner,
meta_learner,
first_tier_parameters=None,
meta_parameters=None,
folded_projections=1,
calmode='cal', n_jobs=-1):
super().__init__(first_tier_learner,
meta_learner,
first_tier_parameters,
meta_parameters,
folded_projections,
calmode,
n_jobs)
self.we_path = we_path
self.config = config
self.lang_word2idx = dict()
self.languages = []
self.lang_tfidf = {}
self.word_embeddings = {}
self.supervised_embeddings = {}
def vectorize(self, lX, prediction=False):
langs = list(lX.keys())
print(f'# tfidf-vectorizing docs')
if prediction:
for lang in langs:
assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
tfidf_vectorizer = self.lang_tfidf[lang]
lX[lang] = tfidf_vectorizer.transform(lX[lang])
return self
for lang in langs:
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
self.languages.append(lang)
tfidf_vectorizer.fit(lX[lang])
lX[lang] = tfidf_vectorizer.transform(lX[lang])
_sort_if_sparse(lX[lang])
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
self.lang_tfidf[lang] = tfidf_vectorizer  # needed at prediction time
return self
# @override std class method
def _get_zspace(self, lXtr, lYtr):
print('\nfitting the projectors... {}'.format(list(lXtr.keys())))
self.doc_projector.fit(lXtr, lYtr)
print('\nprojecting the documents')
lZ = self._projection(self.doc_projector, lXtr)
return lZ, lYtr
def embed(self, lX, ly, unsupervised=False, supervised=False, prediction=False):
"""
builds the embedding matrix for each language and returns the tf-idf weighted sum of word embeddings for each document
"""
_r = dict()
languages = list(lX.keys())
if prediction:
for lang in languages:
if unsupervised: # If unsupervised embeddings ...
M = self.word_embeddings[lang]
if supervised: # and also supervised --> get both (M) and (S) weighted sum matrices and hstack them
S = self.supervised_embeddings[lang]
_r[lang] = np.hstack((lX[lang].dot(M), lX[lang].dot(S)))
continue
_r[lang] = lX[lang].dot(M) # if not supervised --> just get weighted sum of unsupervised (M) embeddings
else: # If not unsupervised --> get (S) matrix and its weighted sum
S = self.supervised_embeddings[lang]
_r[lang] = lX[lang].dot(S)
return _r
if unsupervised:
for lang in languages:
# print('Test building embedding matrix FastTextMuse ...')
_, M = embedding_matrix(self.we_path, self.lang_word2idx[lang], lang)
self.word_embeddings[lang] = M
_r[lang] = lX[lang].dot(M)
if supervised:
for lang in languages:
S = WCE_matrix(lX, ly, lang)
S = np.squeeze(np.asarray(S)) # casting to ndarray to better visualize S while debugging
self.supervised_embeddings[lang] = S
if unsupervised:
_r[lang] = np.hstack((_r[lang], lX[lang].dot(S)))
else:
_r[lang] = lX[lang].dot(S)
return _r
# @override std class method
def fit(self, lX, ly):
tinit = time.time()
print('Vectorizing documents...')
self.vectorize(lX)
for lang in self.languages:
print(lX[lang].shape)
Z, zy = self._get_zspace(lX, ly)
# Z vectors are concatenated with each document's weighted embedding sum
Z_embedded = dict()
l_weighted_em = self.embed(lX, ly,
unsupervised=self.config['unsupervised'],
supervised=self.config['supervised'])
if self.config['supervised'] or self.config['unsupervised']:
for lang in list(lX.keys()):
Z_embedded[lang] = np.hstack((Z[lang], l_weighted_em[lang]))
Z = Z_embedded
del Z_embedded
# stacking Z_embedded space vertically
# _vertical_Z = np.vstack([Z_embedded[lang] for lang in self.languages])
# _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
_vertical_Z = np.vstack([Z[lang] for lang in self.languages])
_vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
# zlangs = list(Z_embedded.keys()) # build a list of the embedded Z matrices and then vstack the list
# for i, lang in enumerate(zlangs):
# if i == 0:
# _vertical_Z = Z_embedded[lang]
# _vertical_Zy = zy[lang]
# else:
# _vertical_Z = np.vstack((_vertical_Z, Z_embedded[lang]))
# _vertical_Zy = np.vstack((_vertical_Zy, zy[lang]))
print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
n_jobs=self.n_jobs)
self.model.fit(_vertical_Z, _vertical_Zy)
self.time = time.time() - tinit
print(f'\nTotal training time elapsed: {round((self.time/60), 2)} min')
def predict(self, lX, ly):
print('Vectorizing documents')
self.vectorize(lX, prediction=True)
lZ = self._projection(self.doc_projector, lX)
if self.config['supervised'] or self.config['unsupervised']:
l_weighted_em = self.embed(lX, ly,
unsupervised=self.config['unsupervised'],
supervised=self.config['supervised'],
prediction=True)
Z_embedded = dict()
for lang in lX.keys():
Z_embedded[lang] = np.hstack((lZ[lang], l_weighted_em[lang]))
print(Z_embedded[lang].shape)
return _joblib_transform_multiling(self.model.predict, Z_embedded, n_jobs=self.n_jobs)
for lang in lZ.keys():
print(lZ[lang].shape)
return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
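# Construction sketch for AndreaCLF: the config keys mirror those read in fit/predict; the embedding path
# and the learners below are placeholder assumptions (no fitting is performed here).
def _demo_andrea_clf_setup(we_path='/path/to/MUSE/embeddings'):
    from sklearn.svm import SVC
    config = {'unsupervised': True,  # concatenate MUSE-based document embeddings to the Z-space
              'supervised': True}    # concatenate WCE-based document embeddings as well
    return AndreaCLF(we_path=we_path,
                     config=config,
                     first_tier_learner=SVC(kernel='linear', probability=True),
                     meta_learner=SVC(kernel='linear', probability=True),
                     calmode='cal',
                     n_jobs=1)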

7
src/results/results.csv Normal file
View File

@ -0,0 +1,7 @@
id method learner embed optimp dataset binary languages time lang macrof1 microf1 macrok microk notes
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 da 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 en 0.7866666666666666 0.0 0.7927111111111111 -0.0003376325207643527 nope
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 fr 0.7866666666666666 0.0 0.7930666666666667 -0.0001350530083057411 nope
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 da 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 en 0.7933333333333333 0.0 0.7931111111111111 -0.00013505300830574107 nope
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 fr 0.7933333333333333 0.0 0.7933333333333333 0.0 nope

110
src/transformers/clesa.py Normal file
View File

@ -0,0 +1,110 @@
import numpy as np
import sklearn
# from sklearn.externals.joblib import Parallel, delayed
from joblib import Parallel, delayed
class ESA(object):
"""
Implementation of Explicit Semantic Analysis (ESA) in its monolingual version, as a transformer
"""
supported_similarity = ['dot', 'cosine']
def __init__(self, similarity='dot', centered=False, post=None):
"""
:param similarity: the similarity measure between documents to be used
:param centered: set to True to subtract the expected similarity due to randomness (experimental)
:param post: any valid sklearn normalization method to be applied to the resulting doc embeddings, or None (default)
"""
assert similarity in self.supported_similarity, ("Similarity method %s is not supported" % similarity)
self.similarity = similarity
self.centered = centered
self.post_processing = post
self.W = None
def fit(self, W):
"""
:param W: doc-by-term already processed matrix of wikipedia documents
:return: self
"""
self.W = W
return self
def transform(self, X):
"""
:param X: doc-by-term matrix that is to be transformed into the ESA space.
:return: the matrix X transformed into the ESA space in numpy format
"""
assert self.W is not None, 'transform method called before fit'
W = self.W
assert X.shape[1] == W.shape[1], ('the feature spaces for X=%s and W=%s do not agree' % (str(X.shape), str(W.shape)))
if self.similarity in ['dot', 'cosine']:
if self.similarity == 'cosine':
X = sklearn.preprocessing.normalize(X, norm='l2', axis=1, copy=True)
W = sklearn.preprocessing.normalize(W, norm='l2', axis=1, copy=True)
esa = (X.dot(W.T)).toarray()
if self.centered:
pX = (X > 0).sum(1) / float(X.shape[1])
pW = (W > 0).sum(1) / float(W.shape[1])
pXpW = np.sqrt(pX.dot(pW.transpose()))
esa = esa - pXpW
if self.post_processing:
esa = sklearn.preprocessing.normalize(esa, norm=self.post_processing, axis=1, copy=True)
return esa
def fit_transform(self, W, X, Y=None):
self.fit(W)
return self.transform(X)
def dimensionality(self):
return self.W.shape[0]
class CLESA(ESA):
"""
Implementation of Cross-Lingual Explicit Semantic Analysis (CL-ESA) as a transformer
"""
def __init__(self, similarity='dot', centered=False, post=False, n_jobs=-1):
super(CLESA, self).__init__(similarity, centered, post)
self.lESA = None
self.langs = None
self.n_jobs = n_jobs
def fit(self, lW):
"""
:param lW: a dictionary of {language: doc-by-term wiki matrix}
:return: self
"""
assert len(np.unique([W.shape[0] for W in lW.values()])) == 1, "inconsistent dimensions across languages"
self.dimensions = list(lW.values())[0].shape[0]
self.langs = list(lW.keys())
self.lESA = {lang:ESA(self.similarity, self.centered, self.post_processing).fit(lW[lang]) for lang in self.langs}
return self
def transform(self, lX):
"""
:param lX: dictionary of {language : doc-by-term matrix} that is to be transformed into the CL-ESA space
:return: a dictionary {language : doc-by-dim matrix} containing the matrix-transformed versions
"""
assert self.lESA is not None, 'transform method called before fit'
assert set(lX.keys()).issubset(set(self.langs)), 'languages in lX are not in scope'
langs = list(lX.keys())
trans = Parallel(n_jobs=self.n_jobs)(delayed(self.lESA[lang].transform)(lX[lang]) for lang in langs)
return {lang:trans[i] for i,lang in enumerate(langs)}
def fit_transform(self, lW, lX):
return self.fit(lW).transform(lX)
def languages(self):
return list(self.lESA.keys())
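# Minimal usage sketch (random sparse matrices standing in for the comparable wikipedia corpora and the
# target doc-by-term matrices; sizes are illustrative only).
def _demo_clesa_toy():
    import numpy as np
    from scipy.sparse import csr_matrix
    rng = np.random.RandomState(0)
    lW = {'en': csr_matrix(rng.rand(50, 30)), 'it': csr_matrix(rng.rand(50, 40))}  # 50 wiki docs per language
    lX = {'en': csr_matrix(rng.rand(10, 30)), 'it': csr_matrix(rng.rand(10, 40))}  # docs to map to the CL-ESA space
    clesa = CLESA(similarity='cosine', n_jobs=1)
    return clesa.fit_transform(lW, lX)  # {lang: (10, 50) ndarray}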

154
src/transformers/dci.py Normal file
View File

@ -0,0 +1,154 @@
import numpy as np
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix, issparse
from scipy.spatial.distance import cosine
import operator
import functools
import math, sys
# from sklearn.externals.joblib import Parallel, delayed
from joblib import Parallel, delayed
class DistributionalCorrespondenceIndexing:
prob_dcf = ['linear', 'pmi']
vect_dcf = ['cosine']
valid_dcf = prob_dcf + vect_dcf
valid_post = ['normal', 'l2', None]
def __init__(self, dcf='cosine', post='normal', n_jobs=-1):
"""
:param dcf: a distributional correspondence function name (e.g., 'cosine') or a callable f(u,v) which measures
the distributional correspondence between vectors u and v
:param post: post-processing function to apply to document embeddings. Default is to standardize it into a
normal distribution; other functions allowed are 'l2' or None
"""
if post not in self.valid_post:
raise ValueError("unknown post processing function; valid ones are [%s]" % ', '.join(self.valid_post))
if isinstance(dcf, str):
if dcf not in self.valid_dcf:
raise ValueError("unknown dcf; use any in [%s]" % ', '.join(self.valid_dcf))
self.dcf = getattr(DistributionalCorrespondenceIndexing, dcf)
elif hasattr(dcf, '__call__'):
self.dcf = dcf
else:
raise ValueError('param dcf should either be a valid dcf name in [%s] or a callable comparing two vectors' % ', '.join(self.valid_dcf))
#self.dcf = lambda u,v:dcf(u,v)
self.post = post
self.domains = None
self.dFP = None
self.n_jobs = n_jobs
def fit(self, dU, dP):
"""
:param dU: a dictionary of {domain:dsm_matrix}, where dsm is a document-by-term matrix representing the
distributional semantic model for a specific domain
:param dP: a dictionary {domain:pivot_matrix} where domain is a string representing each domain,
and pivot_matrix has shape (d,p) with d the dimensionality of the distributional space, and p the
number of pivots
:return: self
"""
self.domains = list(dP.keys())
assert len(np.unique([P.shape[1] for P in dP.values()]))==1, "inconsistent number of pivots across domains"
assert set(dU.keys())==set(self.domains), "inconsistent domains in dU and dP"
assert not [1 for d in self.domains if dU[d].shape[0]!=dP[d].shape[0]], \
"inconsistent dimensions between distributional and pivot spaces"
self.dimensions = list(dP.values())[0].shape[1]
# embed the feature space from each domain using the pivots of that domain
#self.dFP = {d:self.dcf_dist(dU[d].transpose(), dP[d].transpose()) for d in self.domains}
transformations = Parallel(n_jobs=self.n_jobs)(delayed(self.dcf_dist)(dU[d].transpose(),dP[d].transpose()) for d in self.domains)
self.dFP = {d: transformations[i] for i, d in enumerate(self.domains)}
def _dom_transform(self, X, FP):
_X = X.dot(FP)
if self.post == 'l2':
_X = normalize(_X, norm='l2', axis=1)
elif self.post == 'normal':
std = np.clip(np.std(_X, axis=0), 1e-5, None)
_X = (_X - np.mean(_X, axis=0)) / std
return _X
# dX is a dictionary of {domain:dsm}, where dsm (distributional semantic model) is, e.g., a document-by-term csr_matrix
def transform(self, dX):
assert self.dFP is not None, 'transform method called before fit'
assert set(dX.keys()).issubset(self.domains), 'domains in dX are not in scope'
domains = list(dX.keys())
transformations = Parallel(n_jobs=self.n_jobs)(delayed(self._dom_transform)(dX[d], self.dFP[d]) for d in domains)
return {d: transformations[i] for i, d in enumerate(domains)}
def fit_transform(self, dU, dP, dX):
return self.fit(dU, dP).transform(dX)
def _prevalence(self, v):
if issparse(v):
return float(v.nnz) / functools.reduce(operator.mul, v.shape, 1) #this works for arrays of any rank
elif isinstance(v, np.ndarray):
return float(v[v>0].size) / v.size
def linear(self, u, v, D):
tp, fp, fn, tn = self._get_4cellcounters(u, v, D)
den1=tp+fn
den2=tn+fp
tpr = (tp*1./den1) if den1!=0 else 0.
tnr = (tn*1./den2) if den2!=0 else 0.
return tpr + tnr - 1
def pmi(self, u, v, D):
tp, fp, fn, tn = self._get_4cellcounters(u, v, D)
Pxy = tp * 1. / D
Pxny = fp * 1. / D
Pnxy = fn * 1. / D
Px = Pxy + Pxny
Py = Pxy + Pnxy
if (Px == 0 or Py == 0 or Pxy == 0):
return 0.0
score = math.log2(Pxy / (Px * Py))
if np.isnan(score) or np.isinf(score):
print('NAN')
sys.exit()
return score
def cosine(self, u, v):
pu = self._prevalence(u)
pv = self._prevalence(v)
return cosine(u, v) - np.sqrt(pu * pv)
def _get_4cellcounters(self, u, v, D):
"""
:param u: a set of indexes with a non-zero value
:param v: a set of indexes with a non-zero value
:param D: the number of events (i.e., all possible indexes)
:return: the 4-cell contingency values (tp, fp, fn, tn)
"""
common=u.intersection(v)
tp = len(common)
fp = len(u) - len(common)
fn = len(v) - len(common)
tn = D - (tp + fp + fn)
return tp, fp, fn, tn
def dcf_dist(self, U, V):
nU,D = U.shape
nV = V.shape[0]
if issparse(U): U = U.toarray()
if issparse(V): V = V.toarray()
dists = np.zeros((nU, nV))
if self.dcf.__name__ in self.prob_dcf:
def hits_index(v):
return set(np.argwhere(v>0).reshape(-1).tolist())
Vhits = {i:hits_index(V[i]) for i in range(nV)}
for i in range(nU):
Ui_hits = hits_index(U[i])
for j in range(nV):
dists[i, j] = self.dcf(self, Ui_hits, Vhits[j], D)
else:
for i in range(nU):
for j in range(nV):
dists[i, j] = self.dcf(self, U[i], V[j])
return dists
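# Minimal usage sketch (random data; the pivots are simply taken as the first columns of each domain matrix,
# which is an illustrative shortcut rather than a proper pivot selection).
def _demo_dci_toy():
    import numpy as np
    from scipy.sparse import csr_matrix
    rng = np.random.RandomState(0)
    dU = {'source': csr_matrix(rng.rand(30, 15)), 'target': csr_matrix(rng.rand(30, 15))}
    dP = {d: U[:, :5] for d, U in dU.items()}           # 5 pivot columns per domain
    dX = {d: csr_matrix(rng.rand(8, 15)) for d in dU}   # documents to embed, same vocabularies
    dci = DistributionalCorrespondenceIndexing(dcf='cosine', post='l2', n_jobs=1)
    dci.fit(dU, dP)
    return dci.transform(dX)  # {domain: (8, 5) embeddings}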

53
src/transformers/riboc.py Normal file
View File

@ -0,0 +1,53 @@
import math
import numpy as np
from scipy.sparse import csr_matrix, issparse
class RandomIndexingBoC(object):
def __init__(self, latent_dimensions, non_zeros=2):
self.latent_dimensions = latent_dimensions
self.k = non_zeros
self.ri_dict = None
def fit_transform(self, X):
return self.fit(X).transform(X)
def fit(self, X):
nF = X.shape[1]
nL = self.latent_dimensions
format = 'csr' if issparse(X) else 'np'
self.ri_dict = _create_random_index_dictionary(shape=(nF, nL), k=self.k, normalized=True, format=format)
return self
def transform(self, X):
if self.ri_dict is None:
    raise ValueError("Error: transform method called before fit.")
assert X.shape[1] == self.ri_dict.shape[0], 'feature space is inconsistent with the RI dictionary'
P = X.dot(self.ri_dict)
if issparse(P):
P.sort_indices()
return P
def _create_random_index_dictionary(shape, k, normalized=False, format='csr', positive=False):
assert format in ['csr', 'np'], 'Format should be in "[csr, np]"'
nF, latent_dimensions = shape
print("Creating the random index dictionary for |V|={} with {} dimensions".format(nF,latent_dimensions))
val = 1.0 if not normalized else 1.0/math.sqrt(k)
#ri_dict = csr_matrix((nF, latent_dimensions)) if format == 'csr' else np.zeros((nF, latent_dimensions))
ri_dict = np.zeros((nF, latent_dimensions))
#TODO: optimize
for t in range(nF):
dims = np.zeros(k, dtype=np.int32)
dims[0] = t % latent_dimensions #the first dimension is chosen in a round-robin manner (prevents gaps)
dims[1:] = np.random.choice(latent_dimensions, size=k-1, replace=False)
values = (np.random.randint(0,2, size=k)*2.0-1.0) * val if not positive else np.array([+val]*k)
ri_dict[t,dims]=values
print("\rprogress [%.2f%% complete]" % (t * 100.0 / nF), end='')
print('\nDone')
if format=='csr':
ri_dict = csr_matrix(ri_dict)
return ri_dict
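# Minimal usage sketch (random sparse bag-of-words input; sizes are illustrative only).
def _demo_random_indexing_toy():
    import numpy as np
    from scipy.sparse import csr_matrix
    rng = np.random.RandomState(0)
    X = csr_matrix(rng.rand(20, 500))  # 20 documents over a 500-feature vocabulary
    ri = RandomIndexingBoC(latent_dimensions=50, non_zeros=2)
    return ri.fit_transform(X)  # sparse (20, 50) random-indexed projection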

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

95
src/util/evaluation.py Normal file
View File

@ -0,0 +1,95 @@
# from sklearn.externals.joblib import Parallel, delayed
from joblib import Parallel, delayed
from util.metrics import *
from sklearn.metrics import f1_score
import numpy as np
import time
def evaluation_metrics(y, y_):
if len(y.shape)==len(y_.shape)==1 and len(np.unique(y))>2: #single-label
raise NotImplementedError()#return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro')
else: # the metrics implemented here assume multilabel classification, evaluated as a set of binary problems
return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_)
def soft_evaluation_metrics(y, y_):
if len(y.shape)==len(y_.shape)==1 and len(np.unique(y))>2: #single-label
raise NotImplementedError()#return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro')
else: # the metrics implemented here assume multilabel classification, evaluated as a set of binary problems
return smoothmacroF1(y, y_), smoothmicroF1(y, y_), smoothmacroK(y, y_), smoothmicroK(y, y_)
def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1):
print('evaluation (n_jobs={})'.format(n_jobs))
if n_jobs == 1:
return {lang: metrics(ly_true[lang], ly_pred[lang]) for lang in ly_true.keys()}
else:
langs = list(ly_true.keys())
evals = Parallel(n_jobs=n_jobs)(delayed(metrics)(ly_true[lang], ly_pred[lang]) for lang in langs)
return {lang: evals[i] for i, lang in enumerate(langs)}
def average_results(l_eval, show=True):
metrics = []
for lang in l_eval.keys():
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
if show:
print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
ave = np.mean(np.array(metrics), axis=0)
if show:
print('Averages: MF1, mF1, MK, mK', ave)
return ave
def evaluate_method(polylingual_method, lX, ly, predictor=None, soft=False, return_time=False):
tinit=time.time()
print('prediction for test')
assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate'
n_jobs = polylingual_method.n_jobs
if predictor is None:
predictor = polylingual_method.predict
metrics = evaluation_metrics
if soft is True:
metrics = soft_evaluation_metrics
ly_ = predictor(lX, ly)
eval_ = evaluate(ly, ly_, metrics=metrics, n_jobs=n_jobs)
if return_time:
return eval_, time.time()-tinit
else:
return eval_
def evaluate_single_lang(polylingual_method, X, y, lang, predictor=None, soft=False):
print('prediction for test in a single language')
if predictor is None:
predictor = polylingual_method.predict
metrics = evaluation_metrics
if soft is True:
metrics = soft_evaluation_metrics
ly_ = predictor({lang:X})
return metrics(y, ly_[lang])
def get_binary_counters(polylingual_method, lX, ly, predictor=None):
print('prediction for test')
assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate'
n_jobs = polylingual_method.n_jobs
if predictor is None:
predictor = polylingual_method.predict
ly_ = predictor(lX)
print('evaluation (n_jobs={})'.format(n_jobs))
if n_jobs == 1:
return {lang: binary_counters(ly[lang], ly_[lang]) for lang in ly.keys()}
else:
langs = list(ly.keys())
evals = Parallel(n_jobs=n_jobs)(delayed(binary_counters)(ly[lang], ly_[lang]) for lang in langs)
return {lang: evals[i] for i, lang in enumerate(langs)}
def binary_counters(y, y_):
y = np.reshape(y, (-1))
assert y.shape==y_.shape and len(y.shape)==1, 'error, binary vector expected'
counters = hard_single_metric_statistics(y, y_)
return counters.tp, counters.tn, counters.fp, counters.fn
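# Minimal usage sketch of the per-language evaluation helpers (hard binary predictions on toy labels).
def _demo_evaluate_toy():
    import numpy as np
    ly_true = {'en': np.array([[1, 0], [0, 1], [1, 1]]),
               'it': np.array([[0, 1], [0, 1], [1, 0]])}
    ly_pred = {'en': np.array([[1, 0], [0, 1], [1, 0]]),
               'it': np.array([[0, 1], [1, 1], [1, 0]])}
    l_eval = evaluate(ly_true, ly_pred, n_jobs=1)  # {lang: (macroF1, microF1, macroK, microK)}
    return average_results(l_eval, show=True)      # averaged over languages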

36
src/util/file.py Normal file
View File

@ -0,0 +1,36 @@
from os import listdir, makedirs
from os.path import isdir, isfile, join, exists, dirname
#from sklearn.externals.six.moves import urllib
import urllib
def download_file(url, archive_filename):
def progress(blocknum, bs, size):
total_sz_mb = '%.2f MB' % (size / 1e6)
current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='')
print("Downloading %s" % url)
urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress)
print("")
def download_file_if_not_exists(url, archive_path):
if exists(archive_path): return
makedirs_if_not_exist(dirname(archive_path))
download_file(url,archive_path)
def ls(dir, typecheck):
el = [f for f in listdir(dir) if typecheck(join(dir, f))]
el.sort()
return el
def list_dirs(dir):
return ls(dir, typecheck=isdir)
def list_files(dir):
return ls(dir, typecheck=isfile)
def makedirs_if_not_exist(path):
if not exists(path): makedirs(path)

168
src/util/metrics.py Normal file
View File

@ -0,0 +1,168 @@
import numpy as np
"""
Scikit learn provides a full set of evaluation metrics, but they treat special cases differently.
I.e., when the number of true positives, false positives, and false negatives amount to 0, all
affected metrics (precision, recall, and thus F1) output 0 in scikit-learn.
We adhere to the common practice of outputting 1 in this case since the classifier has correctly
classified all examples as negatives.
"""
class ContTable:
def __init__(self, tp=0, tn=0, fp=0, fn=0):
self.tp=tp
self.tn=tn
self.fp=fp
self.fn=fn
def get_d(self): return self.tp + self.tn + self.fp + self.fn
def get_c(self): return self.tp + self.fn
def get_not_c(self): return self.tn + self.fp
def get_f(self): return self.tp + self.fp
def get_not_f(self): return self.tn + self.fn
def p_c(self): return (1.0*self.get_c())/self.get_d()
def p_not_c(self): return 1.0-self.p_c()
def p_f(self): return (1.0*self.get_f())/self.get_d()
def p_not_f(self): return 1.0-self.p_f()
def p_tp(self): return (1.0*self.tp) / self.get_d()
def p_tn(self): return (1.0*self.tn) / self.get_d()
def p_fp(self): return (1.0*self.fp) / self.get_d()
def p_fn(self): return (1.0*self.fn) / self.get_d()
def tpr(self):
c = 1.0*self.get_c()
return self.tp / c if c > 0.0 else 0.0
def fpr(self):
_c = 1.0*self.get_not_c()
return self.fp / _c if _c > 0.0 else 0.0
def __add__(self, other):
return ContTable(tp=self.tp + other.tp, tn=self.tn + other.tn, fp=self.fp + other.fp, fn=self.fn + other.fn)
def accuracy(cell):
return (cell.tp + cell.tn)*1.0 / (cell.tp + cell.fp + cell.fn + cell.tn)
def f1(cell):
num = 2.0 * cell.tp
den = 2.0 * cell.tp + cell.fp + cell.fn
if den>0: return num / den
#we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative
return 1.0
def K(cell):
specificity, recall = 0., 0.
AN = cell.tn + cell.fp
if AN != 0:
specificity = cell.tn*1. / AN
AP = cell.tp + cell.fn
if AP != 0:
recall = cell.tp*1. / AP
if AP == 0:
return 2. * specificity - 1.
elif AN == 0:
return 2. * recall - 1.
else:
return specificity + recall - 1.
#computes the (hard) counters tp, fp, fn, and tn from the true and predicted vectors of hard decisions
#true_labels and predicted_labels are two vectors of shape (number_documents,)
def hard_single_metric_statistics(true_labels, predicted_labels):
assert len(true_labels)==len(predicted_labels), "Format not consistent between true and predicted labels."
nd = len(true_labels)
tp = np.sum(predicted_labels[true_labels==1])
fp = np.sum(predicted_labels[true_labels == 0])
fn = np.sum(true_labels[predicted_labels == 0])
tn = nd - (tp+fp+fn)
return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)
#computes the (soft) contingency table where tp, fp, fn, and tn are the cumulative masses of the posterior
# probabilities with respect to the true binary labels
#true_labels and posterior_probabilities are two vectors of shape (number_documents,)
def soft_single_metric_statistics(true_labels, posterior_probabilities):
assert len(true_labels)==len(posterior_probabilities), "Format not consistent between true and predicted labels."
tp = np.sum(posterior_probabilities[true_labels == 1])
fn = np.sum(1. - posterior_probabilities[true_labels == 1])
fp = np.sum(posterior_probabilities[true_labels == 0])
tn = np.sum(1. - posterior_probabilities[true_labels == 0])
return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)
#if the classifier is single class, then the prediction is a vector of shape=(nD,) which causes issues when compared
#to the true labels (of shape=(nD,1)). This method increases the dimensions of the predictions.
def __check_consistency_and_adapt(true_labels, predictions):
if predictions.ndim == 1:
return __check_consistency_and_adapt(true_labels, np.expand_dims(predictions, axis=1))
if true_labels.ndim == 1:
return __check_consistency_and_adapt(np.expand_dims(true_labels, axis=1),predictions)
if true_labels.shape != predictions.shape:
raise ValueError("True and predicted label matrices shapes are inconsistent %s %s."
% (true_labels.shape, predictions.shape))
_,nC = true_labels.shape
return true_labels, predictions, nC
def macro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics):
true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels)
return np.mean([metric(metric_statistics(true_labels[:, c], predicted_labels[:, c])) for c in range(nC)])
def micro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics):
true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels)
accum = ContTable()
for c in range(nC):
other = metric_statistics(true_labels[:, c], predicted_labels[:, c])
accum = accum + other
return metric(accum)
#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def macroF1(true_labels, predicted_labels):
return macro_average(true_labels,predicted_labels, f1)
#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def microF1(true_labels, predicted_labels):
return micro_average(true_labels, predicted_labels, f1)
#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def macroK(true_labels, predicted_labels):
return macro_average(true_labels,predicted_labels, K)
#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def microK(true_labels, predicted_labels):
return micro_average(true_labels, predicted_labels, K)
#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
#of the same shape containing real values in [0,1]
def smoothmacroF1(true_labels, posterior_probabilities):
return macro_average(true_labels,posterior_probabilities, f1, metric_statistics=soft_single_metric_statistics)
#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
#of the same shape containing real values in [0,1]
def smoothmicroF1(true_labels, posterior_probabilities):
return micro_average(true_labels, posterior_probabilities, f1, metric_statistics=soft_single_metric_statistics)
#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
#of the same shape containing real values in [0,1]
def smoothmacroK(true_labels, posterior_probabilities):
return macro_average(true_labels,posterior_probabilities, K, metric_statistics=soft_single_metric_statistics)
#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
#of the same shape containing real values in [0,1]
def smoothmicroK(true_labels, posterior_probabilities):
return micro_average(true_labels, posterior_probabilities, K, metric_statistics=soft_single_metric_statistics)
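# Worked toy example: the second category has no positive examples and no positive predictions, so its F1
# is defined as 1 here (scikit-learn would score it 0), which lifts the macro average above the micro average.
def _demo_metrics_toy():
    import numpy as np
    y_true = np.array([[1, 0], [0, 0], [1, 0]])
    y_pred = np.array([[1, 0], [0, 0], [0, 0]])
    return macroF1(y_true, y_pred), microF1(y_true, y_pred)  # (~0.833, ~0.667)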

33
src/util/results.py Normal file
View File

@ -0,0 +1,33 @@
import os
import pandas as pd
import numpy as np
class PolylingualClassificationResults:
def __init__(self, file, autoflush=True, verbose=False):
self.file = file
self.columns = ['id', 'method', 'learner', 'embed', 'optimp', 'dataset', 'binary', 'languages', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
self.autoflush = autoflush
self.verbose = verbose
if os.path.exists(file):
self.tell('Loading existing file from {}'.format(file))
self.df = pd.read_csv(file, sep='\t')
else:
self.tell('File {} does not exist. Creating new frame.'.format(file))
dir = os.path.dirname(self.file)
if dir and not os.path.exists(dir): os.makedirs(dir)
self.df = pd.DataFrame(columns=self.columns)
def already_calculated(self, id):
return (self.df['id'] == id).any()
def add_row(self, id, method, learner, embed, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
s = pd.Series([id, method, learner, embed, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
self.df = self.df.append(s, ignore_index=True)
if self.autoflush: self.flush()
self.tell(s.to_string())
def flush(self):
self.df.to_csv(self.file, index=False, sep='\t')
def tell(self, msg):
if self.verbose: print(msg)
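# Usage sketch: append one row per (method, language) evaluation to a tab-separated results file. The id and
# metric values are placeholders. Note that DataFrame.append was removed in pandas 2.0; this class targets the
# older pandas API available at the time of writing.
def _demo_results_log(path='results_demo.csv'):
    results = PolylingualClassificationResults(path, autoflush=True, verbose=True)
    if not results.already_calculated(id='toy_run_0'):
        results.add_row(id='toy_run_0', method='Funnelling', learner='svm', embed='none', optimp=False,
                        dataset='toy', binary='not_binary', ablation_lang='not_ablation', time=1.0,
                        lang='en', macrof1=0.83, microf1=0.80, notes='demo entry')
    return results.df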