first commit

andrea 2019-11-28 18:39:19 +01:00
commit 2a5d0243db
47 changed files with 4219 additions and 0 deletions

.idea/deployment.xml (new file, +20 lines)

@@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PublishConfigData" serverName="anna_isti">
<serverData>
<paths name="anna_isti">
<serverdata>
<mappings>
<mapping deploy="/home/andreapdr/funneling_pdr" local="$PROJECT_DIR$" web="/" />
<mapping deploy="/home/andreapdr/CLESA/embeddings" local="/storage/andrea/FUNNELING/embeddings" />
</mappings>
<excludedPaths>
<excludedPath local="true" path="$PROJECT_DIR$/src/venv" />
<excludedPath local="true" path="$PROJECT_DIR$/src/pickles/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle" />
<excludedPath local="true" path="$PROJECT_DIR$/src/results/results.csv" />
</excludedPaths>
</serverdata>
</paths>
</serverData>
</component>
</project>

.idea/encodings.xml (new file, +4 lines)

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding" addBOMForNewFiles="with NO BOM" />
</project>

.idea/misc.xml (new file, +10 lines)

@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="JavaScriptSettings">
<option name="languageLevel" value="ES6" />
</component>
<component name="NodePackageJsonFileManager">
<packageJsonPaths />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (word-class-embeddings)" project-jdk-type="Python SDK" />
</project>

.idea/modules.xml (new file, +8 lines)

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/tesi_funneling.iml" filepath="$PROJECT_DIR$/.idea/tesi_funneling.iml" />
</modules>
</component>
</project>

.idea/tesi_funneling.iml (new file, +14 lines)

@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
<excludeFolder url="file://$MODULE_DIR$/src/venv" />
</content>
<orderEntry type="jdk" jdkName="Python 3.7 (word-class-embeddings)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
</component>
</module>

.idea/vcs.xml (new file, +6 lines)

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

.idea/webServers.xml (new file, +15 lines)

@@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="WebServers">
<option name="servers">
<webServer id="8f0f329c-a17c-48ba-b459-18d8b1a104e5" name="anna_isti" url="http://anna.isti.cnr.it">
<fileTransfer host="anna.isti.cnr.it" port="22" accessType="SFTP">
<advancedOptions>
<advancedOptions dataProtectionLevel="Private" passiveMode="true" shareSSLContext="true" />
</advancedOptions>
<option name="port" value="22" />
</fileTransfer>
</webServer>
</option>
</component>
</project>

.idea/workspace.xml (new file, +655 lines)

@@ -0,0 +1,655 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ChangeListManager">
<list default="true" id="f02472ae-5920-4b7a-abc6-0eb6f03dd03f" name="Default Changelist" comment="" />
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="CoverageDataManager">
<SUITE FILE_PATH="coverage/tesi_funneling$funneling_poly.coverage" NAME="funneling_poly Coverage Results" MODIFIED="1574690332154" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/src" />
<SUITE FILE_PATH="coverage/tesi_funneling$last_test.coverage" NAME="last_test Coverage Results" MODIFIED="1574960066673" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/src" />
<SUITE FILE_PATH="coverage/tesi_funneling$scratch.coverage" NAME="scratch Coverage Results" MODIFIED="1574759452703" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/src" />
</component>
<component name="FileEditorManager">
<splitter split-orientation="horizontal" split-proportion="0.5">
<split-first>
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/torchtext/vocab.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="182">
<caret line="321" selection-start-line="321" selection-end-line="321" />
<folding>
<element signature="e#13891#17737#0" />
</folding>
</state>
</provider>
</entry>
</file>
</leaf>
</split-first>
<split-second>
<leaf>
<file pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/src/data/embeddings.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="462">
<caret line="162" selection-start-line="162" selection-end-line="162" />
<folding>
<element signature="e#0#9#0" expanded="true" />
<element signature="e#222#778#0" />
</folding>
</state>
</provider>
</entry>
</file>
</leaf>
</split-second>
</splitter>
</component>
<component name="FileTemplateManagerImpl">
<option name="RECENT_TEMPLATES">
<list>
<option value="Python Script" />
</list>
</option>
</component>
<component name="FindInProjectRecents">
<findStrings>
<find>NaivePolylingualClassifier</find>
<find>tra</find>
<find>base_l</find>
<find>tfidf</find>
<find>proba</find>
<find>we</find>
<find>Wordembeddings</find>
<find>hstack</find>
<find>ha</find>
<find>timeit</find>
<find>ti</find>
<find>time</find>
<find>dot</find>
<find>vec</find>
<find>_fit_binary</find>
<find>oneVs</find>
<find>embed</find>
<find>no tf-</find>
<find>embedding_matrix</find>
<find>WordEm</find>
<find>WordEmbeddings</find>
<find># pretrai</find>
<find># [pre</find>
<find>joblib</find>
</findStrings>
</component>
<component name="HighlightingSettingsPerFile">
<setting file="file://$PROJECT_DIR$/src/learning/learners.py" root0="FORCE_HIGHLIGHTING" />
</component>
<component name="IdeDocumentHistory">
<option name="CHANGED_PATHS">
<list>
<option value="$PROJECT_DIR$/src/dataset_builder.py" />
<option value="$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/multiclass.py" />
<option value="$PROJECT_DIR$/src/data/embeddings_fasttext.py" />
<option value="$PROJECT_DIR$/src/scratch.py" />
<option value="$PROJECT_DIR$/src/data/tsr_function__.py" />
<option value="$PROJECT_DIR$/src/data/supervised.py" />
<option value="$PROJECT_DIR$/src/last_test.py" />
<option value="&lt;8f0f329c-a17c-48ba-b459-18d8b1a104e5&gt;/home/andreapdr/funneling_pdr/src/data/embeddings.py" />
<option value="$PROJECT_DIR$/src/util/results.py" />
<option value="$PROJECT_DIR$/src/transformers/clesa.py" />
<option value="$PROJECT_DIR$/src/transformers/dci.py" />
<option value="$PROJECT_DIR$/src/util/evaluation.py" />
<option value="$PROJECT_DIR$/src/util/disable_sklearn_warnings.py" />
<option value="$PROJECT_DIR$/src/funneling_poly.py" />
<option value="$PROJECT_DIR$/src/learning/learners.py" />
<option value="$PROJECT_DIR$/src/FPEC_andrea.py" />
<option value="$PROJECT_DIR$/src/data/embeddings.py" />
</list>
</option>
</component>
<component name="ProjectFrameBounds" extendedState="6">
<option name="x" value="5" />
<option name="y" value="28" />
<option name="width" value="960" />
<option name="height" value="1052" />
</component>
<component name="ProjectView">
<navigator proportions="" version="1">
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="Scope" />
<pane id="ProjectPane">
<subPane>
<expand>
<path>
<item name="tesi_funneling" type="b2602c69:ProjectViewProjectNode" />
<item name="tesi_funneling" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="tesi_funneling" type="b2602c69:ProjectViewProjectNode" />
<item name="tesi_funneling" type="462c0819:PsiDirectoryNode" />
<item name="src" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="tesi_funneling" type="b2602c69:ProjectViewProjectNode" />
<item name="tesi_funneling" type="462c0819:PsiDirectoryNode" />
<item name="src" type="462c0819:PsiDirectoryNode" />
<item name="learning" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="tesi_funneling" type="b2602c69:ProjectViewProjectNode" />
<item name="tesi_funneling" type="462c0819:PsiDirectoryNode" />
<item name="src" type="462c0819:PsiDirectoryNode" />
<item name="pickles" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="tesi_funneling" type="b2602c69:ProjectViewProjectNode" />
<item name="tesi_funneling" type="462c0819:PsiDirectoryNode" />
<item name="src" type="462c0819:PsiDirectoryNode" />
<item name="results" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="tesi_funneling" type="b2602c69:ProjectViewProjectNode" />
<item name="tesi_funneling" type="462c0819:PsiDirectoryNode" />
<item name="src" type="462c0819:PsiDirectoryNode" />
<item name="transformers" type="462c0819:PsiDirectoryNode" />
</path>
<path>
<item name="tesi_funneling" type="b2602c69:ProjectViewProjectNode" />
<item name="tesi_funneling" type="462c0819:PsiDirectoryNode" />
<item name="src" type="462c0819:PsiDirectoryNode" />
<item name="util" type="462c0819:PsiDirectoryNode" />
</path>
</expand>
<select />
</subPane>
</pane>
</panes>
</component>
<component name="PropertiesComponent">
<property name="WebServerToolWindowFactoryState" value="true" />
<property name="WebServerToolWindowPanel.toolwindow.highlight.mappings" value="true" />
<property name="WebServerToolWindowPanel.toolwindow.highlight.symlinks" value="true" />
<property name="WebServerToolWindowPanel.toolwindow.show.date" value="false" />
<property name="WebServerToolWindowPanel.toolwindow.show.permissions" value="false" />
<property name="WebServerToolWindowPanel.toolwindow.show.size" value="false" />
<property name="last_opened_file_path" value="$PROJECT_DIR$/src/results/results.csv" />
<property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
<property name="nodejs_npm_path_reset_for_default_project" value="true" />
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
</component>
<component name="RecentsManager">
<key name="CopyFile.RECENT_KEYS">
<recent name="$PROJECT_DIR$/src/util" />
<recent name="$PROJECT_DIR$/src/data" />
<recent name="$PROJECT_DIR$/src" />
<recent name="$PROJECT_DIR$/src/learning" />
</key>
</component>
<component name="RunDashboard">
<option name="ruleStates">
<list>
<RuleState>
<option name="name" value="ConfigurationTypeDashboardGroupingRule" />
</RuleState>
<RuleState>
<option name="name" value="StatusDashboardGroupingRule" />
</RuleState>
</list>
</option>
</component>
<component name="RunManager" selected="Python.last_test">
<configuration name="funneling_poly" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="tesi_funneling" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/src" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/src/funneling_poly.py" />
<option name="PARAMETERS" value="-d &quot;pickles/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle&quot; -w /storage/andrea/FUNNELING/embeddings/" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="last_test" type="PythonConfigurationType" factoryName="Python">
<module name="tesi_funneling" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/src" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/src/FPEC_andrea.py" />
<option name="PARAMETERS" value="-d &quot;pickles/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle&quot; -w /storage/andrea/FUNNELING/embeddings/ -e unsupervised" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="scratch" type="PythonConfigurationType" factoryName="Python" temporary="true">
<module name="tesi_funneling" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/src" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/src/scratch.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<list>
<item itemvalue="Python.last_test" />
<item itemvalue="Python.funneling_poly" />
<item itemvalue="Python.scratch" />
</list>
<recent_temporary>
<list>
<item itemvalue="Python.scratch" />
<item itemvalue="Python.funneling_poly" />
</list>
</recent_temporary>
</component>
<component name="SvnConfiguration">
<configuration />
</component>
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="f02472ae-5920-4b7a-abc6-0eb6f03dd03f" name="Default Changelist" comment="" />
<created>1574680487463</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1574680487463</updated>
<workItem from="1574680491429" duration="18756000" />
<workItem from="1574705313406" duration="1369000" />
<workItem from="1574758627235" duration="18313000" />
<workItem from="1574845439127" duration="15307000" />
<workItem from="1574870087360" duration="629000" />
<workItem from="1574871032651" duration="671000" />
<workItem from="1574873488200" duration="225000" />
<workItem from="1574876908618" duration="140000" />
<workItem from="1574877826026" duration="560000" />
<workItem from="1574938635317" duration="14980000" />
<workItem from="1574958501259" duration="1736000" />
</task>
<servers />
</component>
<component name="TimeTrackingManager">
<option name="totallyTimeSpent" value="72686000" />
</component>
<component name="ToolWindowManager">
<frame x="0" y="28" width="1920" height="1052" extended-state="6" />
<layout>
<window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.15544872" />
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
<window_info id="Favorites" order="2" side_tool="true" />
<window_info anchor="bottom" id="File Transfer" order="0" weight="0.3297414" />
<window_info anchor="bottom" id="Message" order="1" />
<window_info anchor="bottom" id="Find" order="2" weight="0.3297414" />
<window_info anchor="bottom" id="Run" order="3" weight="0.53556037" />
<window_info anchor="bottom" id="Debug" order="4" weight="0.5538793" />
<window_info anchor="bottom" id="Cvs" order="5" weight="0.25" />
<window_info anchor="bottom" id="Inspection" order="6" weight="0.4" />
<window_info anchor="bottom" id="TODO" order="7" />
<window_info anchor="bottom" id="Docker" order="8" show_stripe_button="false" />
<window_info anchor="bottom" id="Version Control" order="9" />
<window_info anchor="bottom" id="Database Changes" order="10" />
<window_info anchor="bottom" id="Event Log" order="11" side_tool="true" weight="0.3297414" />
<window_info anchor="bottom" id="Terminal" order="12" weight="0.42456895" />
<window_info anchor="bottom" id="Python Console" order="13" />
<window_info anchor="right" id="Remote Host" order="0" weight="0.32959402" />
<window_info anchor="right" id="Commander" order="1" weight="0.4" />
<window_info anchor="right" id="Ant Build" order="2" weight="0.25" />
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="3" weight="0.25" />
<window_info anchor="right" id="SciView" order="4" weight="0.5918803" />
<window_info anchor="right" id="Database" order="5" />
</layout>
</component>
<component name="TypeScriptGeneratedFilesManager">
<option name="version" value="1" />
</component>
<component name="XDebuggerManager">
<breakpoint-manager>
<breakpoints>
<line-breakpoint enabled="true" suspend="THREAD" type="python-line">
<url>file://$PROJECT_DIR$/src/data/embeddings.py</url>
<line>162</line>
<option name="timeStamp" value="1" />
</line-breakpoint>
<line-breakpoint enabled="true" suspend="THREAD" type="python-line">
<url>file://$PROJECT_DIR$/src/learning/learners.py</url>
<line>566</line>
<option name="timeStamp" value="2" />
</line-breakpoint>
</breakpoints>
</breakpoint-manager>
</component>
<component name="editorHistoryManager">
<entry file="file://$PROJECT_DIR$/src/data/reader/__init__.py">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$USER_HOME$/.PyCharm2018.3/system/python_stubs/1539147038/time.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="152">
<caret line="308" column="4" selection-start-line="308" selection-start-column="4" selection-end-line="308" selection-end-column="4" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/utils/validation.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="440">
<caret line="950" selection-start-line="950" selection-end-line="950" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/scipy/sparse/base.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="746">
<caret line="1218" selection-start-line="1218" selection-end-line="1218" />
</state>
</provider>
</entry>
<entry file="file://$APPLICATION_HOME_DIR$/helpers/typeshed/stdlib/2and3/time.pyi">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$USER_HOME$/.PyCharm2018.3/system/python_stubs/1539147038/torch/_C/_TensorBase.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="325">
<caret line="887" column="8" selection-start-line="887" selection-start-column="8" selection-end-line="887" selection-end-column="8" />
</state>
</provider>
</entry>
<entry file="file:///usr/lib/python3.7/multiprocessing/connection.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="183">
<caret line="368" selection-start-line="368" selection-end-line="368" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/svm/base.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="201">
<caret line="292" selection-start-line="292" selection-end-line="292" />
</state>
</provider>
</entry>
<entry file="file:///usr/lib/python3.7/selectors.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="167">
<caret line="417" selection-start-line="417" selection-end-line="417" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/scratch.py" />
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/externals/joblib/externals/loky/process_executor.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="191">
<caret line="271" selection-start-line="271" selection-end-line="271" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="191">
<caret line="566" selection-start-line="566" selection-end-line="566" />
</state>
</provider>
</entry>
<entry file="file:///usr/lib/python3.7/concurrent/futures/_base.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="101">
<caret line="383" selection-start-line="383" selection-end-line="383" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="182">
<caret line="224" selection-start-line="224" selection-end-line="224" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/svm/classes.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="20">
<caret line="604" selection-start-line="604" selection-end-line="604" />
<folding>
<element signature="e#18103#25253#1" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/externals/joblib/externals/cloudpickle/cloudpickle.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="166">
<caret line="702" selection-start-line="702" selection-end-line="702" />
</state>
</provider>
</entry>
<entry file="file:///usr/lib/python3.7/pickle.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="166">
<caret line="503" selection-start-line="503" selection-end-line="503" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/torch/nn/modules/__init__.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="216">
<caret line="12" selection-start-line="12" selection-end-line="12" />
</state>
</provider>
</entry>
<entry file="file://$APPLICATION_HOME_DIR$/helpers/pydev/pydevd.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="139">
<caret line="1486" selection-start-line="1486" selection-end-line="1486" />
</state>
</provider>
</entry>
<entry file="file://$APPLICATION_HOME_DIR$/helpers/pydev/_pydev_bundle/pydev_monkey.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="180">
<caret line="12" column="38" lean-forward="true" selection-start-line="12" selection-end-line="13" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/data/tsr_function__.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="54">
<caret line="3" selection-start-line="3" selection-end-line="3" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/data/supervised.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="620">
<caret line="67" column="12" selection-start-line="67" selection-start-column="11" selection-end-line="67" selection-end-column="12" />
<folding>
<element signature="e#0#99#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/data/embeddings_fasttext.py" />
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/torch/__init__.pyi">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="299">
<caret line="491" column="8" selection-start-line="491" selection-start-column="8" selection-end-line="491" selection-end-column="8" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/multiclass.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="217">
<caret line="180" selection-start-line="180" selection-end-line="180" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/model_selection/_search.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="14">
<caret line="366" selection-start-line="366" selection-end-line="366" />
</state>
</provider>
</entry>
<entry file="file:///usr/lib/python3.7/codecs.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="174">
<caret line="309" selection-start-line="309" selection-end-line="309" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/pandas/core/series.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="160">
<caret line="303" selection-start-line="303" selection-end-line="303" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/transformers/clesa.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="72">
<caret line="4" selection-start-line="4" selection-end-line="4" />
<folding>
<element signature="e#0#18#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/transformers/riboc.py">
<provider selected="true" editor-type-id="text-editor">
<state>
<folding>
<element signature="e#0#11#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/util/evaluation.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="18">
<caret line="1" column="36" selection-start-line="1" selection-start-column="36" selection-end-line="1" selection-end-column="36" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/util/file.py">
<provider selected="true" editor-type-id="text-editor">
<state>
<folding>
<element signature="e#0#32#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/util/disable_sklearn_warnings.py" />
<entry file="file://$PROJECT_DIR$/src/transformers/dci.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="159">
<caret line="9" selection-start-line="9" selection-end-line="9" />
<folding>
<element signature="e#0#18#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/util/results.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="594">
<caret line="33" lean-forward="true" selection-start-line="33" selection-end-line="33" />
<folding>
<element signature="e#0#9#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/dataset_builder.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="1314">
<caret line="73" selection-start-line="73" selection-end-line="73" />
<folding>
<element signature="e#0#32#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/util/metrics.py">
<provider selected="true" editor-type-id="text-editor" />
</entry>
<entry file="file://$PROJECT_DIR$/src/funneling_poly.py" />
<entry file="file://$PROJECT_DIR$/src/learning/learners.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="2063">
<caret line="517" selection-start-line="517" selection-end-line="517" />
<folding>
<element signature="e#0#18#0" expanded="true" />
<element signature="e#23965#24743#0" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/data/embeddings.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="462">
<caret line="162" selection-start-line="162" selection-end-line="162" />
<folding>
<element signature="e#0#9#0" expanded="true" />
<element signature="e#222#778#0" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/FPEC_andrea.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="119">
<caret line="99" column="40" lean-forward="true" selection-start-line="74" selection-start-column="6" selection-end-line="99" selection-end-column="40" />
<folding>
<element signature="e#0#27#0" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/torchtext/vocab.py">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="182">
<caret line="321" selection-start-line="321" selection-end-line="321" />
<folding>
<element signature="e#13891#17737#0" />
</folding>
</state>
</provider>
</entry>
</component>
</project>
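
For reference, the run configurations recorded above resolve (reading their SCRIPT_NAME and PARAMETERS options, with WORKING_DIRECTORY set to $PROJECT_DIR$/src) to roughly the following invocations; this is only a restatement of the XML above, not part of the commit:

python FPEC_andrea.py -d "pickles/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle" -w /storage/andrea/FUNNELING/embeddings/ -e unsupervised
python funneling_poly.py -d "pickles/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle" -w /storage/andrea/FUNNELING/embeddings/
python scratch.py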

src/.gitignore (new file, vendored, +1 line)

@@ -0,0 +1 @@
*.idea

src/FPEC_andrea.py (new file, +151 lines)

@@ -0,0 +1,151 @@
from sklearn.svm import SVC
import os, sys
from dataset_builder import MultilingualDataset
from learning.learners import *
from util.evaluation import *
from optparse import OptionParser
from util.file import exists
from util.results import PolylingualClassificationResults
parser = OptionParser()
parser.add_option("-d", "--dataset", dest="dataset",
help="Path to the multilingual dataset processed and stored in .pickle format")
parser.add_option("-o", "--output", dest="output",
help="Result file", type=str, default='./results/results.csv')
parser.add_option("-e", "--mode-embed", dest="mode_embed",
help="Set the embedding to be used [none, pretrained, supervised, both]", type=str, default='none')
parser.add_option("-w", "--we-path", dest="we_path",
help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/embeddings')
parser.add_option("-s", "--set_c", dest="set_c",type=float,
help="Set the C parameter", default=1)
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
help="Optimices hyperparameters", default=False)
parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
help="Number of parallel jobs (default is -1, all)", default=-1)
def get_learner(calibrate=False, kernel='linear'):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1)
def get_params(dense=False): # TODO kernel function could be useful for meta-classifier
if not op.optimc:
return None
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
kernel = 'rbf' if dense else 'linear'
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
#######################################################################################################################
if __name__ == '__main__':
(op, args) = parser.parse_args()
assert exists(op.dataset), 'Unable to find file '+str(op.dataset)
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
dataset_file = os.path.basename(op.dataset)
results = PolylingualClassificationResults(op.output)
data = MultilingualDataset.load(op.dataset)
data.show_dimensions()
lXtr, lytr = data.training()
lXte, lyte = data.test()
print(lXtr.keys())
small_lXtr = dict()
small_lytr = dict()
small_lXte = dict()
small_lyte = dict()
small_lXtr['da'] = lXtr['da'][:50]
small_lytr['da'] = lytr['da'][:50]
# small_lXtr['en'] = lXtr['en'][:50]
# small_lytr['en'] = lytr['en'][:50]
# small_lXtr['fr'] = lXtr['fr'][:50]
# small_lytr['fr'] = lytr['fr'][:50]
# small_lXte['da'] = lXte['da'][:50]
# small_lyte['da'] = lyte['da'][:50]
# small_lXte['en'] = lXte['en'][:50]
# small_lyte['en'] = lyte['en'][:50]
# small_lXte['fr'] = lXte['fr'][:50]
# small_lyte['fr'] = lyte['fr'][:50]
# small_lXtr['it'] = lXtr['it'][:50]
# small_lytr['it'] = lytr['it'][:50]
# small_lXtr['es'] = lXtr['es'][:50]
# small_lytr['es'] = lytr['es'][:50]
# small_lXtr['de'] = lXtr['de'][:50]
# small_lytr['de'] = lytr['de'][:50]
# small_lXtr['pt'] = lXtr['pt'][:50]
# small_lytr['pt'] = lytr['pt'][:50]
# small_lXtr['nl'] = lXtr['de'][:50]
# small_lytr['nl'] = lytr['de'][:50]
# small_lXtr['fi'] = lXtr['fi'][:50]
# small_lytr['fi'] = lytr['fi'][:50]
# small_lXtr['hu'] = lXtr['hu'][:50]
# small_lytr['hu'] = lytr['hu'][:50]
# small_lXtr['sv'] = lXtr['sv'][:50]
# small_lytr['sv'] = lytr['sv'][:50]
if op.set_c != -1:
meta_parameters = None
else:
meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}]
# Embeddings and WCE config
_available_mode = ['none', 'unsupervised', 'supervised', 'both']
assert op.mode_embed in _available_mode , f'{op.mode_embed} not in {_available_mode}'
if op.mode_embed == 'none':
config = {'unsupervised': False,
'supervised': False}
_config_id = 'None'
elif op.mode_embed == 'unsupervised':
config = {'unsupervised': True,
'supervised': False}
_config_id = 'M'
elif op.mode_embed == 'supervised':
config = {'unsupervised': False,
'supervised': True}
_config_id = 'F'
elif op.mode_embed == 'both':
config = {'unsupervised': True,
'supervised': True}
_config_id = 'M_and_F'
result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
print(f'### PolyEmbedd_andrea_{_config_id}\n')
classifier = AndreaCLF(op.we_path,
config,
first_tier_learner=get_learner(calibrate=True),
meta_learner=get_learner(calibrate=False),
first_tier_parameters=get_params(dense=True),
meta_parameters=get_params(dense=True),
n_jobs=op.n_jobs)
print('# Fitting ...')
classifier.fit(small_lXtr, small_lytr)
print('# Evaluating ...')
l_eval = evaluate_method(classifier, lXte, lyte)
metrics = []
for lang in lXte.keys():
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, op.optimc, 'test_datasetname', 'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, 'nope')
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
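
A side note on the block of commented-out per-language slices above: it takes the first 50 training documents per language for a quick smoke test, and the 'nl' lines actually copy lXtr['de']/lytr['de'], which looks like a copy-paste slip. A hedged, equivalent sketch of the same idea written as a loop (not part of the commit):

langs = ['da', 'en', 'fr', 'it', 'es', 'de', 'pt', 'nl', 'fi', 'hu', 'sv']
small_lXtr = {lang: lXtr[lang][:50] for lang in langs if lang in lXtr}
small_lytr = {lang: lytr[lang][:50] for lang in langs if lang in lytr}
small_lXte = {lang: lXte[lang][:50] for lang in langs if lang in lXte}
small_lyte = {lang: lyte[lang][:50] for lang in langs if lang in lyte}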

Binary file not shown.

src/data/__init__.py (new file, empty)

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

src/data/embeddings.py (new file, +196 lines)

@@ -0,0 +1,196 @@
import os
import pickle
import numpy as np
from torchtext.vocab import Vectors
import torch
from abc import ABC, abstractmethod
from data.supervised import get_supervised_embeddings
class PretrainedEmbeddings(ABC):
def __init__(self):
super().__init__()
@abstractmethod
def vocabulary(self): pass
@abstractmethod
def dim(self): pass
@classmethod
def reindex(cls, words, word2index):
source_idx, target_idx = [], []
for i, word in enumerate(words):
if word not in word2index: continue
j = word2index[word]
source_idx.append(i)
target_idx.append(j)
source_idx = np.asarray(source_idx)
target_idx = np.asarray(target_idx)
return source_idx, target_idx
class WordEmbeddings:
def __init__(self, lang, we, worddim):
self.lang = lang
self.we = we
self.worddim = worddim
self.dimword = {v:k for k,v in self.worddim.items()}
@classmethod
def load(cls, basedir, lang, word_preprocessor=None, dopickle=True):
filename = 'wiki.multi.{}.vec'.format(lang)
we_path = os.path.join(basedir, filename)
if dopickle and os.path.exists(we_path + '.pkl'):
print('loading pkl in {}'.format(we_path + '.pkl'))
(worddim, we) = pickle.load(open(we_path + '.pkl', 'rb'))
else:
word_registry=set()
lines = open(we_path).readlines()
nwords, dims = [int(x) for x in lines[0].split()]
print('reading we of {} dimensions'.format(dims))
we = np.zeros((nwords, dims), dtype=float)
worddim = {}
index = 0
for i, line in enumerate(lines[1:]):
if (i + 1) % 100 == 0:
print('\r{}/{}'.format(i + 1, len(lines)), end='')
word, *vals = line.split()
wordp = word_preprocessor(word) if word_preprocessor is not None else word
if wordp:
wordp=wordp[0]
if wordp in word_registry:
print('warning: word <{}> generates a duplicate <{}> after preprocessing'.format(word,wordp))
elif len(vals) == dims:
worddim[wordp] = index
we[index, :] = np.array(vals).astype(float)
index+=1
# else:
# print('warning: word <{}> generates an empty string after preprocessing'.format(word))
we = we[:index]
print('load {} words'.format(index))
if dopickle:
print('saving...')
pickle.dump((worddim, we), open(we_path + '.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
return WordEmbeddings(lang, we, worddim)
def vocabulary(self):
return set(self.worddim.keys())
def __getitem__(self, key):
return self.we[self.worddim[key]]
def dim(self):
return self.we.shape[1]
def __contains__(self, key):
return key in self.worddim
def most_similar(self, word_vect, k):
if word_vect.ndim == 1:
word_vect = word_vect.reshape(1,-1)
assert word_vect.shape[1] == self.dim(), 'inconsistent dimensions'
sim = np.dot(word_vect,self.we.T)
order = np.argsort(-1*sim, axis=1)[:,:k]
similar_words = [[self.dimword[order[vi,ki]] for ki in range(k)] for vi in range(word_vect.shape[0])]
sim_scores = sim[:,order]
return similar_words, sim_scores
def get_vectors(self, wordlist):
indexes = np.array([self.worddim[w] for w in wordlist])
return self.we[indexes]
def restrict(self, vocabulary):
# vocabulary is a set of terms to be kept
active_vocabulary = sorted([w for w in vocabulary if w in self.worddim])
lost = len(vocabulary)-len(active_vocabulary)
if lost>0: # some terms are missing, so they will be replaced by UNK
print('warning: missing {} terms for lang {}'.format(lost, self.lang))
self.we = self.get_vectors(active_vocabulary)
assert self.we.shape[0]==len(active_vocabulary)
self.dimword={i:w for i,w in enumerate(active_vocabulary)}
self.worddim={w:i for i,w in enumerate(active_vocabulary)}
return self
@classmethod
def load_poly(cls, basedir, langs, lang_vocabularies=None, word_preprocessor=None):
if lang_vocabularies is None:
return cls.merge([cls.load(basedir,lang, word_preprocessor) for lang in langs])
else:
# assert all([l in lang_vocabularies for l in langs]), 'missing vocabulary for some languages'
return cls.merge([cls.load(basedir, lang, word_preprocessor).restrict(lang_vocabularies[lang]) for lang in langs])
@classmethod
def merge(cls, we_list):
assert all([isinstance(we, WordEmbeddings) for we in we_list]), \
'instances of {} expected'.format(WordEmbeddings.__name__)
polywe = []
worddim={}
offset=0
for we in we_list:
polywe.append(we.we)
worddim.update({'{}::{}'.format(we.lang, w):d+offset for w,d in we.worddim.items()})
offset=len(worddim)
polywe = np.vstack(polywe)
return WordEmbeddings(lang='poly', we=polywe, worddim=worddim)
class FastTextWikiNews(Vectors):
url_base = 'Cant auto-download MUSE embeddings'
path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec'
def __init__(self, cache, language="en", **kwargs):
url = self.url_base.format(language)
name = self.path.format(language)
super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
class FastTextMUSE(PretrainedEmbeddings):
def __init__(self, path, lang, limit=None):
super().__init__()
print(f'Loading fastText pretrained vectors from {path}')
assert os.path.exists(path), print(f'pre-trained vectors not found in {path}')
self.embed = FastTextWikiNews(path, lang, max_vectors=limit)
# print('Done')
def vocabulary(self):
return set(self.embed.stoi.keys())
def dim(self):
return self.embed.dim
def extract(self, words):
source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
extraction = torch.zeros((len(words), self.dim()))
extraction[source_idx] = self.embed.vectors[target_idx]
return extraction
def embedding_matrix(path, voc, lang):
vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x:x[1])))[0])
print('[embedding matrix]')
print(f'# [pretrained-matrix: FastTextMUSE {lang}]')
pretrained = FastTextMUSE(path, lang)
P = pretrained.extract(vocabulary).numpy()
del pretrained
print(f'[embedding matrix done] of shape={P.shape}\n')
return vocabulary, P
def WCE_matrix(Xtr, Ytr, lang):
print('\n# [supervised-matrix]')
S = get_supervised_embeddings(Xtr[lang], Ytr[lang])
print(f'[embedding matrix done] of shape={S.shape}\n')
return S
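
A minimal sketch of how the two matrix builders defined above could be combined for one language. The toy documents, the label matrix and the embeddings directory are assumptions for illustration; get_supervised_embeddings is only assumed to accept a document-term matrix and a document-label matrix, as suggested by WCE_matrix:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from data.embeddings import embedding_matrix, WCE_matrix

docs = ['the council adopts the directive', 'the commission proposes a regulation']  # toy data (assumed)
vec = TfidfVectorizer(sublinear_tf=True)
lXtr = {'en': vec.fit_transform(docs)}                      # doc-term matrix per language
lytr = {'en': np.array([[1, 0], [0, 1]])}                   # doc-label matrix per language (assumed format)

we_dir = '/storage/andrea/FUNNELING/embeddings'             # MUSE .vec directory (path from FastTextWikiNews.path)
vocab, P = embedding_matrix(we_dir, vec.vocabulary_, 'en')  # pretrained MUSE matrix, one row per vocabulary term
S = WCE_matrix(lXtr, lytr, 'en')                            # word-class (supervised) embedding matrix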

src/data/languages.py (new file, +42 lines)

@@ -0,0 +1,42 @@
"""
bg = Bulgarian
cs = Czech
da = Danish
de = German
el = Greek
en = English
es = Spanish
et = Estonian
fi = Finnish
fr = French
hu = Hungarian
it = Italian
lt = Lithuanian
lv = Latvian
nl = Dutch
mt = Maltese
pl = Polish
pt = Portuguese
ro = Romanian
sk = Slovak
sl = Slovene
sv = Swedish
"""
NLTK_LANGMAP = {'da': 'danish', 'nl': 'dutch', 'en': 'english', 'fi': 'finnish', 'fr': 'french', 'de': 'german',
'hu': 'hungarian', 'it': 'italian', 'pt': 'portuguese', 'ro': 'romanian', 'es': 'spanish', 'sv': 'swedish'}
#top 10 languages in wikipedia order by the number of articles
#LANGS_10_MOST_WIKI = ['en','fr','sv','de','es','it','pt','nl','pl','ro']
#all languages in JRC-acquis v3
JRC_LANGS = ['bg','cs','da','de','el','en','es','et','fi','fr','hu','it','lt','lv','mt','nl','pl','pt','ro','sk','sl','sv']
JRC_LANGS_WITH_NLTK_STEMMING = ['da', 'nl', 'en', 'fi', 'fr', 'de', 'hu', 'it', 'pt', 'es', 'sv'] # 'romanian deleted for incompatibility issues'
RCV2_LANGS = ['ru', 'de', 'fr', 'sv', 'no', 'da', 'pt', 'it', 'es', 'jp', 'htw', 'nl']
RCV2_LANGS_WITH_NLTK_STEMMING = ['de', 'fr', 'sv', 'da', 'pt', 'it', 'es', 'nl']
lang_set = {'JRC_NLTK':JRC_LANGS_WITH_NLTK_STEMMING, 'JRC':JRC_LANGS,
'RCV2_NLTK':RCV2_LANGS_WITH_NLTK_STEMMING, 'RCV2':RCV2_LANGS}
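
These constants are meant to be consumed with a single lookup, as the dataset readers below do; for example:

from data.languages import lang_set, NLTK_LANGMAP

langs = lang_set['JRC_NLTK']     # the 11 JRC-Acquis languages with NLTK stemming support
print(NLTK_LANGMAP['da'])        # -> 'danish'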


Binary file not shown.

Binary file not shown.


@@ -0,0 +1,321 @@
from __future__ import print_function
import os, sys
from os.path import join
import tarfile
import xml.etree.ElementTree as ET
from sklearn.datasets import get_data_home
import pickle
from util.file import download_file, list_dirs, list_files
import rdflib
from rdflib.namespace import RDF, SKOS
from rdflib import URIRef
import zipfile
from data.languages import JRC_LANGS
from collections import Counter
from random import shuffle
from data.languages import lang_set
"""
JRC Acquis' Nomenclature:
bg = Bulgarian
cs = Czech
da = Danish
de = German
el = Greek
en = English
es = Spanish
et = Estonian
fi = Finnish
fr = French
hu = Hungarian
it = Italian
lt = Lithuanian
lv = Latvian
nl = Dutch
mt = Maltese
pl = Polish
pt = Portuguese
ro = Romanian
sk = Slovak
sl = Slovene
sv = Swedish
"""
class JRCAcquis_Document:
def __init__(self, id, name, lang, year, head, body, categories):
self.id = id
self.parallel_id = name
self.lang = lang
self.year = year
self.text = body if not head else head + "\n" + body
self.categories = categories
# this is a workaround... for some reason, acutes are codified in a non-standard manner in titles
# however, it seems that the title is often appearing as the first paragraph in the text/body (with
# standard codification), so it might be preferable not to read the header after all (as here by default)
def _proc_acute(text):
for ch in ['a','e','i','o','u']:
text = text.replace('%'+ch+'acute%',ch)
return text
def parse_document(file, year, head=False):
root = ET.parse(file).getroot()
doc_name = root.attrib['n'] # e.g., '22006A0211(01)'
doc_lang = root.attrib['lang'] # e.g., 'es'
doc_id = root.attrib['id'] # e.g., 'jrc22006A0211_01-es'
doc_categories = [cat.text for cat in root.findall('.//teiHeader/profileDesc/textClass/classCode[@scheme="eurovoc"]')]
doc_head = _proc_acute(root.find('.//text/body/head').text) if head else ''
doc_body = '\n'.join([p.text for p in root.findall('.//text/body/div[@type="body"]/p')])
def raise_if_empty(field, from_file):
if isinstance(field, str):
if not field.strip():
raise ValueError("Empty field in file %s" % from_file)
raise_if_empty(doc_name, file)
raise_if_empty(doc_lang, file)
raise_if_empty(doc_id, file)
if head: raise_if_empty(doc_head, file)
raise_if_empty(doc_body, file)
return JRCAcquis_Document(id=doc_id, name=doc_name, lang=doc_lang, year=year, head=doc_head, body=doc_body, categories=doc_categories)
# removes documents without a counterpart in all other languages
def _force_parallel(doclist, langs):
n_langs = len(langs)
par_id_count = Counter([d.parallel_id for d in doclist])
parallel_doc_ids = set([id for id,count in par_id_count.items() if count==n_langs])
return [doc for doc in doclist if doc.parallel_id in parallel_doc_ids]
def random_sampling_avoiding_parallel(doclist):
random_order = list(range(len(doclist)))
shuffle(random_order)
sampled_request = []
parallel_ids = set()
for ind in random_order:
pid = doclist[ind].parallel_id
if pid not in parallel_ids:
sampled_request.append(doclist[ind])
parallel_ids.add(pid)
print('random_sampling_no_parallel:: from {} documents to {} documents'.format(len(doclist), len(sampled_request)))
return sampled_request
#filters out documents which do not contain any category in the cat_filter list, and filter all labels not in cat_filter
def _filter_by_category(doclist, cat_filter):
if not isinstance(cat_filter, frozenset):
cat_filter = frozenset(cat_filter)
filtered = []
for doc in doclist:
doc.categories = list(cat_filter & set(doc.categories))
if doc.categories:
doc.categories.sort()
filtered.append(doc)
print("filtered %d documents out without categories in the filter list" % (len(doclist) - len(filtered)))
return filtered
#filters out categories with less than cat_threshold documents (and filters documents containing those categories)
def _filter_by_frequency(doclist, cat_threshold):
cat_count = Counter()
for d in doclist:
cat_count.update(d.categories)
freq_categories = [cat for cat,count in cat_count.items() if count>cat_threshold]
freq_categories.sort()
return _filter_by_category(doclist, freq_categories), freq_categories
#select top most_frequent categories (and filters documents containing those categories)
def _most_common(doclist, most_frequent):
cat_count = Counter()
for d in doclist:
cat_count.update(d.categories)
freq_categories = [cat for cat,count in cat_count.most_common(most_frequent)]
freq_categories.sort()
return _filter_by_category(doclist, freq_categories), freq_categories
def _get_categories(request):
final_cats = set()
for d in request:
final_cats.update(d.categories)
return list(final_cats)
def fetch_jrcacquis(langs=None, data_path=None, years=None, ignore_unclassified=True, cat_filter=None, cat_threshold=0,
parallel=None, most_frequent=-1, DOWNLOAD_URL_BASE ='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'):
assert parallel in [None, 'force', 'avoid'], 'parallel mode not supported'
if not langs:
langs = JRC_LANGS
else:
if isinstance(langs, str): langs = [langs]
for l in langs:
if l not in JRC_LANGS:
raise ValueError('Language %s is not among the valid languages in JRC-Acquis v3' % l)
if not data_path:
data_path = get_data_home()
if not os.path.exists(data_path):
os.mkdir(data_path)
request = []
total_read = 0
for l in langs:
file_name = 'jrc-'+l+'.tgz'
archive_path = join(data_path, file_name)
if not os.path.exists(archive_path):
print("downloading language-specific dataset (once and for all) into %s" % data_path)
DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name)
download_file(DOWNLOAD_URL, archive_path)
print("untarring dataset...")
tarfile.open(archive_path, 'r:gz').extractall(data_path)
documents_dir = join(data_path, l)
print("Reading documents...")
read = 0
for dir in list_dirs(documents_dir):
year = int(dir)
if years==None or year in years:
year_dir = join(documents_dir,dir)
pickle_name = join(data_path, 'jrc_' + l + '_' + dir + '.pickle')
if os.path.exists(pickle_name):
print("loading from file %s" % pickle_name)
l_y_documents = pickle.load(open(pickle_name, "rb"))
read += len(l_y_documents)
else:
l_y_documents = []
all_documents = list_files(year_dir)
empty = 0
for i,doc_file in enumerate(all_documents):
try:
jrc_doc = parse_document(join(year_dir, doc_file), year)
except ValueError:
jrc_doc = None
if jrc_doc and (not ignore_unclassified or jrc_doc.categories):
l_y_documents.append(jrc_doc)
else: empty += 1
if len(all_documents)>50 and ((i+1) % (len(all_documents)/50) == 0):
print('\r\tfrom %s: completed %d%%' % (year_dir, int((i+1)*100.0/len(all_documents))), end='')
read+=1
print('\r\tfrom %s: completed 100%% read %d documents (discarded %d without categories or empty fields)\n' % (year_dir, i+1, empty), end='')
print("\t\t(Pickling object for future runs in %s)" % pickle_name)
pickle.dump(l_y_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL)
request += l_y_documents
print("Read %d documents for language %s\n" % (read, l))
total_read += read
print("Read %d documents in total" % (total_read))
if parallel=='force':
request = _force_parallel(request, langs)
elif parallel == 'avoid':
request = random_sampling_avoiding_parallel(request)
final_cats = _get_categories(request)
if cat_filter:
request = _filter_by_category(request, cat_filter)
final_cats = _get_categories(request)
if cat_threshold > 0:
request, final_cats = _filter_by_frequency(request, cat_threshold)
if most_frequent != -1 and len(final_cats) > most_frequent:
request, final_cats = _most_common(request, most_frequent)
return request, final_cats
def print_cat_analysis(request):
cat_count = Counter()
for d in request:
cat_count.update(d.categories)
print("Number of active categories: {}".format(len(cat_count)))
print(cat_count.most_common())
# inspects the Eurovoc thesaurus in order to select a subset of categories
# currently, the 'broadest' policy (i.e., take all categories with no parent category), 'all', and 'leaves' are implemented
def inspect_eurovoc(data_path, eurovoc_skos_core_concepts_filename='eurovoc_in_skos_core_concepts.rdf',
eurovoc_url="http://publications.europa.eu/mdr/resource/thesaurus/eurovoc-20160630-0/skos/eurovoc_in_skos_core_concepts.zip",
select="broadest"):
fullpath_pickle = join(data_path, select+'_concepts.pickle')
if os.path.exists(fullpath_pickle):
print("Pickled object found in %s. Loading it." % fullpath_pickle)
return pickle.load(open(fullpath_pickle,'rb'))
fullpath = join(data_path, eurovoc_skos_core_concepts_filename)
if not os.path.exists(fullpath):
print("Path %s does not exist. Trying to download the skos EuroVoc file from %s" % (data_path, eurovoc_url))
download_file(eurovoc_url, fullpath)
print("Unzipping file...")
zipped = zipfile.ZipFile(data_path + '.zip', 'r')
zipped.extract("eurovoc_in_skos_core_concepts.rdf", data_path)
zipped.close()
print("Parsing %s" %fullpath)
g = rdflib.Graph()
g.parse(location=fullpath, format="application/rdf+xml")
if select == "all":
print("Selecting all concepts")
all_concepts = list(g.subjects(RDF.type, SKOS.Concept))
all_concepts = [c.toPython().split('/')[-1] for c in all_concepts]
all_concepts.sort()
selected_concepts = all_concepts
elif select=="broadest":
print("Selecting broadest concepts (those without any other broader concept linked to it)")
all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
narrower_concepts = set(g.subjects(SKOS.broader, None))
broadest_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - narrower_concepts)]
broadest_concepts.sort()
selected_concepts = broadest_concepts
elif select=="leaves":
print("Selecting leaves concepts (those not linked as broader of any other concept)")
all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
broad_concepts = set(g.objects(None, SKOS.broader))
leave_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - broad_concepts)]
leave_concepts.sort()
selected_concepts = leave_concepts
else:
raise ValueError("Selection policy %s is not currently supported" % select)
print("%d %s concepts found" % (len(selected_concepts), leave_concepts))
print("Pickling concept list for faster further requests in %s" % fullpath_pickle)
pickle.dump(selected_concepts, open(fullpath_pickle, 'wb'), pickle.HIGHEST_PROTOCOL)
return selected_concepts
if __name__ == '__main__':
def single_label_fragment(doclist):
single = [d for d in doclist if len(d.categories) < 2]
final_categories = set([d.categories[0] if d.categories else [] for d in single])
print('{} single-label documents ({} categories) from the original {} documents'.format(len(single),
len(final_categories),
len(doclist)))
return single, list(final_categories)
train_years = list(range(1986, 2006))
test_years = [2006]
cat_policy = 'leaves'
most_common_cat = 300
# JRC_DATAPATH = "/media/moreo/1TB Volume/Datasets/JRC_Acquis_v3"
JRC_DATAPATH = "/storage/andrea/FUNNELING/data/JRC_Acquis_v3"
langs = lang_set['JRC_NLTK']
cat_list = inspect_eurovoc(JRC_DATAPATH, select=cat_policy)
sys.exit()
training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=train_years,cat_filter=cat_list, cat_threshold=1, parallel=None,most_frequent=most_common_cat)
test_docs, label_namestest = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=test_years, cat_filter=label_names,parallel='force')
print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names)))
print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest)))
training_docs, label_names = single_label_fragment(training_docs)
test_docs, label_namestest = single_label_fragment(test_docs)
print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names)))
print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest)))
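
A hedged usage sketch of the reader defined above, mirroring its __main__ block (the module path is an assumption; on first use the per-language archives are downloaded into data_path and pickled there):

# module path assumed: the file header is missing from this view,
# but workspace.xml points at src/data/reader/
from data.reader.jrcacquis_reader import fetch_jrcacquis, inspect_eurovoc

JRC_DATAPATH = '/storage/andrea/FUNNELING/data/JRC_Acquis_v3'   # path taken from __main__ above
cat_list = inspect_eurovoc(JRC_DATAPATH, select='leaves')
train_docs, labels = fetch_jrcacquis(langs=['en', 'it'], data_path=JRC_DATAPATH,
                                     years=list(range(1986, 2006)),
                                     cat_filter=cat_list, cat_threshold=1,
                                     most_frequent=300)
test_docs, _ = fetch_jrcacquis(langs=['en', 'it'], data_path=JRC_DATAPATH,
                               years=[2006], cat_filter=labels, parallel='force')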


@@ -0,0 +1,225 @@
from zipfile import ZipFile
import xml.etree.ElementTree as ET
from data.languages import RCV2_LANGS_WITH_NLTK_STEMMING, RCV2_LANGS
from util.file import list_files
from sklearn.datasets import get_data_home
import gzip
from os.path import join, exists
from util.file import download_file_if_not_exists
import re
from collections import Counter
import numpy as np
import sys
"""
RCV2's Nomenclature:
ru = Russian
da = Danish
de = German
es = Spanish
lat = Spanish Latin-American (actually is also 'es' in the collection)
fr = French
it = Italian
nl = Dutch
pt = Portuguese
sv = Swedish
ja = Japanese
htw = Chinese
no = Norwegian
"""
RCV1_TOPICHIER_URL = "http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a02-orig-topics-hierarchy/rcv1.topics.hier.orig"
RCV1PROC_BASE_URL= 'http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files'
RCV1_BASE_URL = "http://www.daviddlewis.com/resources/testcollections/rcv1/"
RCV2_BASE_URL = "http://trec.nist.gov/data/reuters/reuters.html"
rcv1_test_data_gz = ['lyrl2004_tokens_test_pt0.dat.gz',
'lyrl2004_tokens_test_pt1.dat.gz',
'lyrl2004_tokens_test_pt2.dat.gz',
'lyrl2004_tokens_test_pt3.dat.gz']
rcv1_train_data_gz = ['lyrl2004_tokens_train.dat.gz']
rcv1_doc_cats_data_gz = 'rcv1-v2.topics.qrels.gz'
RCV2_LANG_DIR = {'ru':'REUTE000',
'de':'REUTE00A',
'fr':'REUTE00B',
'sv':'REUTE001',
'no':'REUTE002',
'da':'REUTE003',
'pt':'REUTE004',
'it':'REUTE005',
'es':'REUTE006',
'lat':'REUTE007',
'jp':'REUTE008',
'htw':'REUTE009',
'nl':'REUTERS_'}
class RCV_Document:
def __init__(self, id, text, categories, date='', lang=None):
self.id = id
self.date = date
self.lang = lang
self.text = text
self.categories = categories
class ExpectedLanguageException(Exception): pass
class IDRangeException(Exception): pass
nwords = []
def parse_document(xml_content, assert_lang=None, valid_id_range=None):
root = ET.fromstring(xml_content)
if assert_lang:
if assert_lang not in root.attrib.values():
if assert_lang != 'jp' or 'ja' not in root.attrib.values(): # some documents are attributed to 'ja', others to 'jp'
raise ExpectedLanguageException('error: document of a different language')
doc_id = root.attrib['itemid']
if valid_id_range is not None:
if not valid_id_range[0] <= int(doc_id) <= valid_id_range[1]:
raise IDRangeException
doc_categories = [cat.attrib['code'] for cat in
root.findall('.//metadata/codes[@class="bip:topics:1.0"]/code')]
doc_date = root.attrib['date']
doc_title = root.find('.//title').text
doc_headline = root.find('.//headline').text
doc_body = '\n'.join([p.text for p in root.findall('.//text/p')])
if not doc_body:
raise ValueError('Empty document')
if doc_title is None: doc_title = ''
if doc_headline is None or doc_headline in doc_title: doc_headline = ''
text = '\n'.join([doc_title, doc_headline, doc_body]).strip()
text_length = len(text.split())
global nwords
nwords.append(text_length)
return RCV_Document(id=doc_id, text=text, categories=doc_categories, date=doc_date, lang=assert_lang)
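# A minimal usage sketch; the XML below is a made-up newsitem that only mirrors the fields
# parse_document actually reads (itemid/date attributes, title, headline, text/p, topic codes),
# while real RCV1/RCV2 items carry many more elements:
#
#   xml = ('<newsitem itemid="2286" date="1996-08-20" xml:lang="en">'
#          '<title>Example title</title><headline>Example headline</headline>'
#          '<text><p>Some body text.</p></text>'
#          '<metadata><codes class="bip:topics:1.0"><code code="C15"/></codes></metadata>'
#          '</newsitem>')
#   doc = parse_document(xml, assert_lang='en')
#   # doc.id == '2286', doc.categories == ['C15'], doc.text contains title, headline and body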
def fetch_RCV1(data_path, split='all'):
assert split in ['train', 'test', 'all'], 'split should be "train", "test", or "all"'
request = []
labels = set()
read_documents = 0
lang = 'en'
training_documents = 23149
test_documents = 781265
if split == 'all':
split_range = (2286, 810596)
expected = training_documents+test_documents
elif split == 'train':
split_range = (2286, 26150)
expected = training_documents
else:
split_range = (26151, 810596)
expected = test_documents
global nwords
nwords=[]
for part in list_files(data_path):
if not re.match('\d+\.zip', part): continue
target_file = join(data_path, part)
assert exists(target_file), \
"You don't seem to have the file "+part+" in " + data_path + ", and the RCV1 corpus can not be downloaded"+\
" w/o a formal permission. Please, refer to " + RCV1_BASE_URL + " for more information."
zipfile = ZipFile(target_file)
for xmlfile in zipfile.namelist():
xmlcontent = zipfile.open(xmlfile).read()
try:
doc = parse_document(xmlcontent, assert_lang=lang, valid_id_range=split_range)
labels.update(doc.categories)
request.append(doc)
read_documents += 1
except ValueError:
print('\n\tskipping empty or malformed document {} (expected language {})'.format(part+'/'+xmlfile, lang))
except (IDRangeException, ExpectedLanguageException) as e:
pass
print('\r[{}] read {} documents'.format(part, len(request)), end='')
if read_documents == expected: break
if read_documents == expected: break
print()
print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords)))
return request, list(labels)
def fetch_RCV2(data_path, languages=None):
if not languages:
languages = list(RCV2_LANG_DIR.keys())
else:
assert set(languages).issubset(set(RCV2_LANG_DIR.keys())), 'languages not in scope'
request = []
labels = set()
global nwords
nwords=[]
for lang in languages:
path = join(data_path, RCV2_LANG_DIR[lang])
lang_docs_read = 0
for part in list_files(path):
target_file = join(path, part)
assert exists(target_file), \
"You don't seem to have the file "+part+" in " + path + ", and the RCV2 corpus can not be downloaded"+\
" w/o a formal permission. Please, refer to " + RCV2_BASE_URL + " for more information."
zipfile = ZipFile(target_file)
for xmlfile in zipfile.namelist():
xmlcontent = zipfile.open(xmlfile).read()
try:
doc = parse_document(xmlcontent, assert_lang=lang)
labels.update(doc.categories)
request.append(doc)
lang_docs_read += 1
except ValueError:
print('\n\tskipping empty or malformed document {} (expected language {})'.format(RCV2_LANG_DIR[lang]+'/'+part+'/'+xmlfile, lang))
except (IDRangeException, ExpectedLanguageException) as e:
pass
print('\r[{}] read {} documents, {} for language {}'.format(RCV2_LANG_DIR[lang]+'/'+part, len(request), lang_docs_read, lang), end='')
print()
print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords)))
return request, list(labels)
def fetch_topic_hierarchy(path, topics='all'):
assert topics in ['all', 'leaves']
download_file_if_not_exists(RCV1_TOPICHIER_URL, path)
hierarchy = {}
for line in open(path, 'rt'):
parts = line.strip().split()
parent,child = parts[1],parts[3]
if parent not in hierarchy:
hierarchy[parent]=[]
hierarchy[parent].append(child)
del hierarchy['None']
del hierarchy['Root']
print(hierarchy)
if topics=='all':
topics = set(hierarchy.keys())
for parent in hierarchy.keys():
topics.update(hierarchy[parent])
return list(topics)
elif topics=='leaves':
parents = set(hierarchy.keys())
childs = set()
for parent in hierarchy.keys():
childs.update(hierarchy[parent])
return list(childs.difference(parents))
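# A usage sketch (assumptions: the path is illustrative, and the downloaded rcv1.topics.hier.orig
# has lines of the form "parent: Root child: CCAT child-description: CORPORATE/INDUSTRIAL",
# which is what the parsing above relies on):
#
#   leaf_codes = fetch_topic_hierarchy('../Datasets/RCV1-v2/rcv1.topics.hier.orig', topics='leaves')
#   # topics='leaves' keeps only codes that never appear as parents; topics='all' returns every code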

src/data/reader/wikipedia_tools.py

@ -0,0 +1,304 @@
from __future__ import print_function
import ijson
import os, sys
from os.path import join
from bz2 import BZ2File
from ijson.common import ObjectBuilder
import pickle
from util.file import list_dirs, list_files, makedirs_if_not_exist
from itertools import islice
import re
from xml.sax.saxutils import escape
import numpy as np
policies = ["IN_ALL_LANGS", "IN_ANY_LANG"]
"""
This file contains a set of tools for processing multilingual Wikipedia documents.
In what follows, it is assumed that you have already downloaded a Wikipedia dump (https://dumps.wikimedia.org/)
and have cleaned the text of each document with one of the following tools:
- https://github.com/aesuli/wikipediatools (Python 2)
- https://github.com/aesuli/wikipedia-extractor (Python 3)
It is also assumed you have downloaded the all-entities json file (e.g., https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2)
These tools help you to:
- Process the huge json file as a stream and create a multilingual map of corresponding titles for each language.
Setting policy = "IN_ALL_LANGS" extracts only titles that appear in all (AND) languages, whereas "IN_ANY_LANG"
extracts all titles appearing in at least one (OR) language (warning: this creates a huge dictionary).
Note: this version is quite slow. Although it only needs to run once, you might prefer to take a look at "Wikidata in BigQuery".
- Process the huge json file as a stream and create a simplified file that occupies much less space and is far faster to process.
- Use the multilingual map to extract, from the cleaned text versions, individual xml documents containing all
language-specific versions of each document.
- Fetch the multilingual documents to create, for each of the specified languages, a list of all documents,
such that the i-th element of any list refers to the same document in the respective language.
"""
def _doc_generator(text_path, langs):
dotspace = re.compile(r'\.(?!\s)')
for l,lang in enumerate(langs):
print("Processing language <%s> (%d/%d)" % (lang, l, len(langs)))
lang_dir = join(text_path, lang)
split_dirs = list_dirs(lang_dir)
for sd,split_dir in enumerate(split_dirs):
print("\tprocessing split_dir <%s> (%d/%d)" % (split_dir, sd, len(split_dirs)))
split_files = list_files(join(lang_dir, split_dir))
for sf,split_file in enumerate(split_files):
print("\t\tprocessing split_file <%s> (%d/%d)" % (split_file, sf, len(split_files)))
with BZ2File(join(lang_dir, split_dir, split_file), 'r', buffering=1024*1024) as fi:
while True:
doc_lines = list(islice(fi, 3))
if doc_lines:
# some sentences are not followed by a space after the dot
doc_lines[1] = dotspace.sub('. ', doc_lines[1])
# [workaround] the &nbsp; html entity was not handled by the extractor, and unescaping it at this point might not help; replace it with a plain space before escaping
doc_lines[1] = escape(doc_lines[1].replace("&nbsp;", " "))
yield doc_lines, lang
else: break
def _extract_title(doc_lines):
m = re.search('title="(.+?)"', doc_lines[0])
if m: return m.group(1).decode('utf-8')
else: raise ValueError("Error in xml format: document head is %s" % doc_lines[0])
def _create_doc(target_file, id, doc, lang):
doc[0] = doc[0][:-2] + (' lang="%s">\n'%lang)
with open(target_file, 'w') as fo:
fo.write('<multidoc id="%s">\n'%id)
[fo.write(line) for line in doc]
fo.write('</multidoc>')
def _append_doc(target_file, doc, lang):
doc[0] = doc[0][:-2] + (' lang="%s">\n' % lang)
with open(target_file, 'r', buffering=1024*1024) as fi:
lines = fi.readlines()
if doc[0] in lines[1::3]:
return
lines[-1:-1]=doc
with open(target_file, 'w', buffering=1024*1024) as fo:
[fo.write(line) for line in lines]
def extract_multilingual_documents(inv_dict, langs, text_path, out_path):
if not os.path.exists(out_path):
os.makedirs(out_path)
for lang in langs:
if lang not in inv_dict:
raise ValueError("Lang %s is not in the dictionary" % lang)
docs_created = len(list_files(out_path))
print("%d multilingual documents found." % docs_created)
for doc,lang in _doc_generator(text_path, langs):
title = _extract_title(doc)
if title in inv_dict[lang]:
#pass
ids = inv_dict[lang][title]
for id in ids:
target_file = join(out_path, id) + ".xml"
if os.path.exists(target_file):
_append_doc(target_file, doc, lang)
else:
_create_doc(target_file, id, doc, lang)
docs_created+=1
else:
if not re.match('[A-Za-z]+', title):
print("Title <%s> for lang <%s> not in dictionary" % (title, lang))
def extract_multilingual_titles_from_simplefile(data_dir, filename, langs, policy="IN_ALL_LANGS", return_both=True):
simplified_file = join(data_dir,filename)
if policy not in policies:
raise ValueError("Policy %s not supported." % policy)
print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs)))
lang_prefix = list(langs)
lang_prefix.sort()
pickle_prefix = "extraction_" + "_".join(lang_prefix) + "." + policy
pickle_dict = join(data_dir, pickle_prefix+".multi_dict.pickle")
pickle_invdict = join(data_dir, pickle_prefix+".multi_invdict.pickle")
if os.path.exists(pickle_invdict):
if return_both and os.path.exists(pickle_dict):
print("Pickled files found in %s. Loading both (direct and inverse dictionaries)." % data_dir)
return pickle.load(open(pickle_dict, 'rb')), pickle.load(open(pickle_invdict, 'rb'))
elif return_both==False:
print("Pickled file found in %s. Loading inverse dictionary only." % pickle_invdict)
return pickle.load(open(pickle_invdict, 'rb'))
multiling_titles = {}
inv_dict = {lang:{} for lang in langs}
def process_entry(line):
parts = line.strip().split('\t')
id = parts[0]
if id in multiling_titles:
raise ValueError("id <%s> already indexed" % id)
titles = dict(((lang_title[:lang_title.find(':')],lang_title[lang_title.find(':')+1:].decode('utf-8')) for lang_title in parts[1:]))
for lang in list(titles.keys()):  # copy the keys: deleting while iterating over a dict view raises in Python 3
if lang not in langs:
del titles[lang]
if (policy == "IN_ALL_LANGS" and len(titles) == len(langs))\
or (policy == "IN_ANY_LANG" and len(titles) > 0):
multiling_titles[id] = titles
for lang, title in titles.items():
if title in inv_dict[lang]:
inv_dict[lang][title].append(id)
else:
inv_dict[lang][title] = [id]
with BZ2File(simplified_file, 'r', buffering=1024*1024*16) as fi:
completed = 0
try:
for line in fi:
process_entry(line)
completed += 1
if completed % 10 == 0:
print("\rCompleted %d\ttitles %d" % (completed,len(multiling_titles)), end="")
print("\rCompleted %d\t\ttitles %d" % (completed, len(multiling_titles)), end="\n")
except EOFError:
print("\nUnexpected file ending... saving anyway")
print("Pickling dictionaries in %s" % data_dir)
pickle.dump(multiling_titles, open(pickle_dict,'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(inv_dict, open(pickle_invdict, 'wb'), pickle.HIGHEST_PROTOCOL)
print("Done")
return (multiling_titles, inv_dict) if return_both else inv_dict
# in https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2
def simplify_json_file(data_dir, langs, policy="IN_ALL_LANGS", json_file = "latest-all.json.bz2"):
latest_all_json_file = join(data_dir,json_file)
if policy not in policies:
raise ValueError("Policy %s not supported." % policy)
print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs)))
lang_prefix = list(langs)
lang_prefix.sort()
simple_titles_path = join(data_dir, "extraction_" + "_".join(lang_prefix) + "." + policy)
def process_entry(last, fo):
global written
id = last["id"]
titles = None
if policy == "IN_ALL_LANGS" and langs.issubset(last["labels"].keys()):
titles = {lang: last["labels"][lang]["value"] for lang in langs}
elif policy == "IN_ANY_LANG":
titles = {lang: last["labels"][lang]["value"] for lang in langs if lang in last["labels"]}
if titles:
fo.write((id+'\t'+'\t'.join([lang+':'+titles[lang] for lang in titles.keys()])+'\n').encode('utf-8'))
return True
else:
return False
written = 0
with BZ2File(latest_all_json_file, 'r', buffering=1024*1024*16) as fi, \
BZ2File(join(data_dir,simple_titles_path+".simple.bz2"),'w') as fo:
builder = ObjectBuilder()
completed = 0
for event, value in ijson.basic_parse(fi, buf_size=1024*1024*16):
builder.event(event, value)
if len(builder.value)>1:
if process_entry(builder.value.pop(0), fo): written += 1
completed += 1
print("\rCompleted %d\ttitles %d" % (completed,written), end="")
print("")
#process the last entry
process_entry(builder.value.pop(0), fo)
return simple_titles_path
"""
Reads all multilingual documents in a folder (see wikipedia_tools.py to generate them) and builds, for each of the
specified languages, a list containing all of its documents, so that the i-th element of any list refers to the
language-specific version of the same document. Documents are required to have a version in all specified languages
and to contain a minimum number of words; otherwise they are discarded.
"""
class MinWordsNotReached(Exception): pass
class WrongDocumentFormat(Exception): pass
def _load_multilang_doc(path, langs, min_words=100):
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element, ParseError
try:
root = ET.parse(path).getroot()
doc = {}
for lang in langs:
doc_body = root.find('.//doc[@lang="' + lang + '"]')
if isinstance(doc_body, Element):
n_words = len(doc_body.text.split(' '))
if n_words >= min_words:
doc[lang] = doc_body.text
else:
raise MinWordsNotReached
else:
raise WrongDocumentFormat
except ParseError:
raise WrongDocumentFormat
return doc
# returns the multilingual documents mapped by language ({lang: list of texts}); counts of valid and skipped documents are printed along the way
def fetch_wikipedia_multilingual(wiki_multi_path, langs, min_words=100, deletions=False, max_documents=-1, pickle_name=None):
if pickle_name and os.path.exists(pickle_name):
print("unpickling %s" % pickle_name)
return pickle.load(open(pickle_name, 'rb'))
multi_docs = list_files(wiki_multi_path)
mling_documents = {l:[] for l in langs}
valid_documents = 0
minwords_exception = 0
wrongdoc_exception = 0
for d,multi_doc in enumerate(multi_docs):
print("\rProcessed %d/%d documents, valid %d/%d, few_words=%d, few_langs=%d" %
(d, len(multi_docs), valid_documents, len(multi_docs), minwords_exception, wrongdoc_exception),end="")
doc_path = join(wiki_multi_path, multi_doc)
try:
m_doc = _load_multilang_doc(doc_path, langs, min_words)
valid_documents += 1
for l in langs:
mling_documents[l].append(m_doc[l])
except MinWordsNotReached:
minwords_exception += 1
if deletions: os.remove(doc_path)
except WrongDocumentFormat:
wrongdoc_exception += 1
if deletions: os.remove(doc_path)
if max_documents>0 and valid_documents>=max_documents:
break
if pickle_name:
print("Pickling wikipedia documents object in %s" % pickle_name)
pickle.dump(mling_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL)
return mling_documents
def random_wiki_sample(l_wiki, max_documents):
if max_documents == 0: return None
langs = list(l_wiki.keys())
assert len(np.unique([len(l_wiki[l]) for l in langs])) == 1, 'documents across languages do not seem to be aligned'
ndocs_per_lang = len(l_wiki[langs[0]])
if ndocs_per_lang > max_documents:
sel = set(np.random.choice(list(range(ndocs_per_lang)), max_documents, replace=False))
for lang in langs:
l_wiki[lang] = [d for i, d in enumerate(l_wiki[lang]) if i in sel]
return l_wiki
if __name__ == "__main__":
wikipedia_home = "../Datasets/Wikipedia"
from data.languages import JRC_LANGS_WITH_NLTK_STEMMING as langs
langs = frozenset(langs)
simple_titles_path = simplify_json_file(wikipedia_home, langs, policy="IN_ALL_LANGS", json_file="latest-all.json.bz2")
_, inv_dict = extract_multilingual_titles_from_simplefile(wikipedia_home, simple_titles_path, langs, policy='IN_ALL_LANGS')
extract_multilingual_documents(inv_dict, langs, join(wikipedia_home,'text'),
out_path=join(wikipedia_home, 'multilingual_docs_JRC_NLTK'))

75
src/data/supervised.py Executable file

@ -0,0 +1,75 @@
from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
# from util.common import *
from sklearn.decomposition import PCA
import numpy as np
def zscores(x, axis=0): # scipy.stats.zscore does not avoid division by 0, which can indeed occur
std = np.clip(np.std(x, ddof=1, axis=axis), 1e-5, None)
mean = np.mean(x, axis=axis)
return (x - mean) / std
def supervised_embeddings_tfidf(X,Y):
tfidf_norm = X.sum(axis=0)
F = (X.T).dot(Y) / tfidf_norm.T
return F
def supervised_embeddings_ppmi(X,Y):
Xbin = X>0
D = X.shape[0]
Pxy = (Xbin.T).dot(Y)/D
Px = Xbin.sum(axis=0)/D
Py = Y.sum(axis=0)/D
F = np.asarray(Pxy/(Px.T*Py))
F = np.maximum(F, 1.0)
F = np.log(F)
return F
def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents=25000):
D = X.shape[0]
if D>max_documents:
print(f'sampling {max_documents}')
random_sample = np.random.permutation(D)[:max_documents]
X = X[random_sample]
Y = Y[random_sample]
cell_matrix = get_supervised_matrix(X, Y)
F = get_tsr_matrix(cell_matrix, tsr_score_funtion=tsr_function).T
return F
def get_supervised_embeddings(X, Y, max_label_space=300, binary_structural_problems=-1, method='dotn', dozscore=True):
print('computing supervised embeddings...')
nC = Y.shape[1]
if nC==2 and binary_structural_problems > nC:
raise ValueError('not implemented in this branch')
if method=='ppmi':
F = supervised_embeddings_ppmi(X, Y)
elif method == 'dotn':
F = supervised_embeddings_tfidf(X, Y)
elif method == 'ig':
F = supervised_embeddings_tsr(X, Y, information_gain)
elif method == 'chi2':
F = supervised_embeddings_tsr(X, Y, chi_square)
if dozscore:
F = zscores(F, axis=0)
if nC > max_label_space:
print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
f'Applying PCA(n_components={max_label_space})')
pca = PCA(n_components=max_label_space)
F = pca.fit(F).transform(F)
return F
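# A small, self-contained sketch of the intended usage (random toy matrices; in the real
# pipeline X is the sparse tfidf document-by-term matrix and Y the binary document-by-label
# matrix produced by MultiLabelBinarizer):
#
#   import numpy as np
#   from scipy.sparse import csr_matrix
#   X = csr_matrix(np.random.rand(100, 500))
#   Y = (np.random.rand(100, 10) > 0.9).astype(int)
#   F = get_supervised_embeddings(X, Y)   # 500 x 10 matrix of z-scored word-class correlations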

src/data/text_preprocessor.py

@ -0,0 +1,33 @@
from nltk.corpus import stopwords
from data.languages import NLTK_LANGMAP
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
def preprocess_documents(documents, lang):
tokens = NLTKStemTokenizer(lang, verbose=True)
sw = stopwords.words(NLTK_LANGMAP[lang])
return [' '.join([w for w in tokens(doc) if w not in sw]) for doc in documents]
class NLTKStemTokenizer(object):
def __init__(self, lang, verbose=False):
if lang not in NLTK_LANGMAP:
raise ValueError('Language %s is not supported in NLTK' % lang)
self.verbose=verbose
self.called = 0
self.wnl = SnowballStemmer(NLTK_LANGMAP[lang])
self.cache = {}
def __call__(self, doc):
self.called += 1
if self.verbose:
print("\r\t\t[documents processed %d]" % (self.called), end="")
tokens = word_tokenize(doc)
stems = []
for t in tokens:
if t not in self.cache:
self.cache[t] = self.wnl.stem(t)
stems.append(self.cache[t])
return stems
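# A usage sketch (assumes the NLTK resources 'punkt' and 'stopwords' have been downloaded,
# e.g. via nltk.download(), and that 'en' is mapped in NLTK_LANGMAP; the output shown is indicative only):
#
#   docs = ['The Commission adopted the proposals.', 'New proposals were adopted quickly.']
#   processed = preprocess_documents(docs, lang='en')
#   # -> stemmed, stopword-free strings such as 'commiss adopt propos .'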

270
src/data/tsr_function__.py Executable file

@ -0,0 +1,270 @@
import math
import numpy as np
from scipy.stats import t
from joblib import Parallel, delayed
from scipy.sparse import csr_matrix, csc_matrix
def get_probs(tpr, fpr, pc):
# tpr = p(t|c) = p(tp)/p(c) = p(tp)/(p(tp)+p(fn))
# fpr = p(t|_c) = p(fp)/p(_c) = p(fp)/(p(fp)+p(tn))
pnc = 1.0 - pc
tp = tpr * pc
fn = pc - tp
fp = fpr * pnc
tn = pnc - fp
return ContTable(tp=tp, fn=fn, fp=fp, tn=tn)
def apply_tsr(tpr, fpr, pc, tsr):
cell = get_probs(tpr, fpr, pc)
return tsr(cell)
def positive_information_gain(cell):
if cell.tpr() < cell.fpr():
return 0.0
else:
return information_gain(cell)
def posneg_information_gain(cell):
ig = information_gain(cell)
if cell.tpr() < cell.fpr():
return -ig
else:
return ig
def __ig_factor(p_tc, p_t, p_c):
den = p_t * p_c
if den != 0.0 and p_tc != 0:
return p_tc * math.log(p_tc / den, 2)
else:
return 0.0
def information_gain(cell):
return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \
__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) +\
__ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + \
__ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())
def information_gain_mod(cell):
return (__ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())) \
- (__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) + __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()))
def pointwise_mutual_information(cell):
return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c())
def gain_ratio(cell):
pc = cell.p_c()
pnc = 1.0 - pc
norm = pc * math.log(pc, 2) + pnc * math.log(pnc, 2)
return information_gain(cell) / (-norm)
def chi_square(cell):
den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c()
if den==0.0: return 0.0
num = gss(cell)**2
return num / den
def relevance_frequency(cell):
a = cell.tp
c = cell.fp
if c == 0: c = 1
return math.log(2.0 + (a * 1.0 / c), 2)
def idf(cell):
if cell.p_f()>0:
return math.log(1.0 / cell.p_f())
return 0.0
def gss(cell):
return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn()
def conf_interval(xt, n):
if n>30:
z2 = 3.84145882069 # norm.ppf(0.5+0.95/2.0)**2
else:
z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2
p = (xt + 0.5 * z2) / (n + z2)
amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2))
return p, amplitude
def strength(minPosRelFreq, minPos, maxNeg):
if minPos > maxNeg:
return math.log(2.0 * minPosRelFreq, 2.0)
else:
return 0.0
# set cancel_features=True to allow some features to be weighted as 0 (as in the original article);
# however, on some extremely imbalanced datasets this caused all documents to end up with all-zero weights
def conf_weight(cell, cancel_features=False):
c = cell.get_c()
not_c = cell.get_not_c()
tp = cell.tp
fp = cell.fp
pos_p, pos_amp = conf_interval(tp, c)
neg_p, neg_amp = conf_interval(fp, not_c)
min_pos = pos_p-pos_amp
max_neg = neg_p+neg_amp
den = (min_pos + max_neg)
minpos_relfreq = min_pos / (den if den != 0 else 1)
str_tplus = strength(minpos_relfreq, min_pos, max_neg);
if str_tplus == 0 and not cancel_features:
return 1e-20
return str_tplus;
class ContTable:
def __init__(self, tp=0, tn=0, fp=0, fn=0):
self.tp=tp
self.tn=tn
self.fp=fp
self.fn=fn
def get_d(self): return self.tp + self.tn + self.fp + self.fn
def get_c(self): return self.tp + self.fn
def get_not_c(self): return self.tn + self.fp
def get_f(self): return self.tp + self.fp
def get_not_f(self): return self.tn + self.fn
def p_c(self): return (1.0*self.get_c())/self.get_d()
def p_not_c(self): return 1.0-self.p_c()
def p_f(self): return (1.0*self.get_f())/self.get_d()
def p_not_f(self): return 1.0-self.p_f()
def p_tp(self): return (1.0*self.tp) / self.get_d()
def p_tn(self): return (1.0*self.tn) / self.get_d()
def p_fp(self): return (1.0*self.fp) / self.get_d()
def p_fn(self): return (1.0*self.fn) / self.get_d()
def tpr(self):
c = 1.0*self.get_c()
return self.tp / c if c > 0.0 else 0.0
def fpr(self):
_c = 1.0*self.get_not_c()
return self.fp / _c if _c > 0.0 else 0.0
def round_robin_selection(X, Y, k, tsr_function=positive_information_gain):
print(f'[selecting {k} terms]')
nC = Y.shape[1]
FC = get_tsr_matrix(get_supervised_matrix(X, Y), tsr_function).T
best_features_idx = np.argsort(-FC, axis=0).flatten()
tsr_values = FC.flatten()
selected_indexes_set = set()
selected_indexes = list()
selected_value = list()
from_category = list()
round_robin = iter(best_features_idx)
values_iter = iter(tsr_values)
round=0
while len(selected_indexes) < k:
term_idx = next(round_robin)
term_val = next(values_iter)
if term_idx not in selected_indexes_set:
selected_indexes_set.add(term_idx)
selected_indexes.append(term_idx)
selected_value.append(term_val)
from_category.append(round)
round = (round + 1) % nC
return np.asarray(selected_indexes, dtype=int), np.asarray(selected_value, dtype=float), np.asarray(from_category)
def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD):
tp_ = len(positive_document_indexes & feature_document_indexes)
fp_ = len(feature_document_indexes - positive_document_indexes)
fn_ = len(positive_document_indexes - feature_document_indexes)
tn_ = nD - (tp_ + fp_ + fn_)
return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_)
def category_tables(feature_sets, category_sets, c, nD, nF):
return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)]
"""
Computes the nC x nF supervised matrix M where Mcf is the 4-cell contingency table for feature f and class c.
Efficiency O(nF x nC x log(S)) where S is the sparse factor
"""
def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1):
nD, nF = coocurrence_matrix.shape
nD2, nC = label_matrix.shape
if nD != nD2:
raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' %
(coocurrence_matrix.shape,label_matrix.shape))
def nonzero_set(matrix, col):
return set(matrix[:, col].nonzero()[0])
if isinstance(coocurrence_matrix, csr_matrix):
coocurrence_matrix = csc_matrix(coocurrence_matrix)
feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)]
category_sets = [nonzero_set(label_matrix, c) for c in range(nC)]
cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC))
return np.array(cell_matrix)
# obtains the matrix T where Tcf=tsr(f,c) is the tsr score for category c and feature f
def get_tsr_matrix(cell_matrix, tsr_score_funtion):
nC,nF = cell_matrix.shape
tsr_matrix = [[tsr_score_funtion(cell_matrix[c,f]) for f in range(nF)] for c in range(nC)]
return np.array(tsr_matrix)
""" The Fisher-score [1] is not computed on the 4-cell contingency table, but can
take as input any real-valued feature column (e.g., tf-idf weights).
feat is the feature vector, and c is a binary classification vector.
This implementation covers only the binary case, while the formula is defined for multiclass
single-label scenarios, for which the version [2] might be preferred.
[1] R.O. Duda, P.E. Hart, and D.G. Stork. Pattern classification. Wiley-interscience, 2012.
[2] Gu, Q., Li, Z., & Han, J. (2012). Generalized fisher score for feature selection. arXiv preprint arXiv:1202.3725.
"""
def fisher_score_binary(feat, c):
neg = np.ones_like(c) - c
npos = np.sum(c)
nneg = np.sum(neg)
mupos = np.mean(feat[c == 1])
muneg = np.mean(feat[neg == 1])
mu = np.mean(feat)
stdpos = np.std(feat[c == 1])
stdneg = np.std(feat[neg == 1])
num = npos * ((mupos - mu) ** 2) + nneg * ((muneg - mu) ** 2)
den = npos * (stdpos ** 2) + nneg * (stdneg ** 2)
if den>0:
return num / den
else:
return num
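# A compact usage sketch with toy data (in the pipeline, X is the sparse document-by-term
# matrix and Y the binary document-by-label matrix):
#
#   import numpy as np
#   from scipy.sparse import csr_matrix
#   X = csr_matrix((np.random.rand(50, 20) > 0.7).astype(float))
#   Y = (np.random.rand(50, 3) > 0.7).astype(int)
#   cells = get_supervised_matrix(X, Y)            # 3 x 20 array of ContTable objects
#   IG = get_tsr_matrix(cells, information_gain)   # 3 x 20 array of information-gain scores
#   ig = information_gain(ContTable(tp=10, fp=5, fn=2, tn=33))   # score for a single cell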

567
src/dataset_builder.py Normal file

@ -0,0 +1,567 @@
from os.path import join, exists
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from data.reader.jrcacquis_reader import *
from data.languages import lang_set, NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING
from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2, fetch_topic_hierarchy
from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample
from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import issparse
import itertools
class MultilingualDataset:
"""
A multilingual dataset is a dictionary of training and test documents indexed by language code.
Train and test sets are represented as tuples of the type (X,Y,ids), where X is a matrix representation of the
documents (e.g., a document-by-term sparse csr_matrix), Y is a document-by-label binary np.array indicating the
labels of each document, and ids is a list of document-identifiers from the original collection.
"""
def __init__(self):
self.dataset_name = ""
self.multiling_dataset = {}
def add(self, lang, Xtr, Ytr, Xte, Yte, tr_ids=None, te_ids=None):
self.multiling_dataset[lang] = ((Xtr, Ytr, tr_ids), (Xte, Yte, te_ids))
def save(self, file):
self.sort_indexes()
pickle.dump(self, open(file, 'wb'), pickle.HIGHEST_PROTOCOL)
return self
def __getitem__(self, item):
if item in self.langs():
return self.multiling_dataset[item]
return None
@classmethod
def load(cls, file):
data = pickle.load(open(file, 'rb'))
data.sort_indexes()
return data
@classmethod
def load_ids(cls, file):
data = pickle.load(open(file, 'rb'))
tr_ids = {lang:tr_ids for (lang,((_,_,tr_ids), (_,_,_))) in data.multiling_dataset.items()}
te_ids = {lang: te_ids for (lang, ((_, _, _), (_, _, te_ids))) in data.multiling_dataset.items()}
return tr_ids, te_ids
def sort_indexes(self):
for (lang, ((Xtr,_,_),(Xte,_,_))) in self.multiling_dataset.items():
if issparse(Xtr): Xtr.sort_indices()
if issparse(Xte): Xte.sort_indices()
def set_view(self, categories=None, languages=None):
if categories is not None:
if isinstance(categories, int):
categories = np.array([categories])
elif isinstance(categories, list):
categories = np.array(categories)
self.categories_view = categories
if languages is not None:
self.languages_view = languages
def training(self):
return self.lXtr(), self.lYtr()
def test(self):
return self.lXte(), self.lYte()
def lXtr(self):
return {lang:Xtr for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
def lXte(self):
return {lang:Xte for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()}
def lYtr(self):
return {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
def lYte(self):
return {lang:self.cat_view(Yte) for (lang, (_,(_,Yte,_))) in self.multiling_dataset.items() if lang in self.langs()}
def cat_view(self, Y):
if hasattr(self, 'categories_view'):
return Y[:,self.categories_view]
else:
return Y
def langs(self):
if hasattr(self, 'languages_view'):
langs = self.languages_view
else:
langs = sorted(self.multiling_dataset.keys())
return langs
def num_categories(self):
return self.lYtr()[self.langs()[0]].shape[1]
def show_dimensions(self):
for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items():
if lang not in self.langs(): continue
if hasattr(Xtr, 'shape') and hasattr(Xte, 'shape'):
print("Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(lang, Xtr.shape, self.cat_view(Ytr).shape, Xte.shape, self.cat_view(Yte).shape))
def show_category_prevalences(self):
#pass
nC = self.num_categories()
accum_tr = np.zeros(nC, dtype=np.int)
accum_te = np.zeros(nC, dtype=np.int)
in_langs = np.zeros(nC, dtype=np.int) #count languages with at least one positive example (per category)
for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items():
if lang not in self.langs(): continue
prev_train = np.sum(self.cat_view(Ytr), axis=0)
prev_test = np.sum(self.cat_view(Yte), axis=0)
accum_tr += prev_train
accum_te += prev_test
in_langs += (prev_train>0)*1
print(lang+'-train', prev_train)
print(lang+'-test', prev_test)
print('all-train', accum_tr)
print('all-test', accum_te)
return accum_tr, accum_te, in_langs
def set_labels(self, labels):
self.labels = labels
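# A minimal usage sketch of the container above (tiny random matrices; real splits are produced
# by the dataset generators further down in this file):
#
#   import numpy as np
#   from scipy.sparse import csr_matrix
#   data = MultilingualDataset()
#   data.dataset_name = 'toy'
#   for lang in ('en', 'it'):
#       Xtr, Xte = csr_matrix(np.random.rand(8, 5)), csr_matrix(np.random.rand(4, 5))
#       Ytr, Yte = np.random.randint(0, 2, (8, 3)), np.random.randint(0, 2, (4, 3))
#       data.add(lang, Xtr, Ytr, Xte, Yte)
#   lXtr, lYtr = data.training()      # {lang: X} and {lang: Y} dictionaries
#   data.set_view(languages=['en'])   # restrict subsequent accessors to English only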
# ----------------------------------------------------------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------------------------------------------------------
def get_active_labels(doclist):
cat_list = set()
for d in doclist:
cat_list.update(d.categories)
return list(cat_list)
def filter_by_categories(doclist, keep_categories):
catset = frozenset(keep_categories)
for d in doclist:
d.categories = list(set(d.categories).intersection(catset))
def __years_to_str(years):
if isinstance(years, list):
if len(years) > 1:
return str(years[0])+'-'+str(years[-1])
return str(years[0])
return str(years)
# ----------------------------------------------------------------------------------------------------------------------
# Matrix builders
# ----------------------------------------------------------------------------------------------------------------------
def build_independent_matrices(dataset_name, langs, training_docs, test_docs, label_names, wiki_docs=[], preprocess=True):
"""
Builds the document-by-term weighted matrices for each language. Representations are independent of each other,
i.e., each language-specific matrix lies in a dedicated feature space.
:param dataset_name: the name of the dataset (str)
:param langs: list of languages (str)
:param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
:param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
:param label_names: list of names of labels (str)
:param wiki_docs: doc-list (optional), if specified, project all wiki docs in the feature spaces built for the languages
:param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming)
:return: a MultilingualDataset. If wiki_docs has been specified, a dictionary lW is also returned, which indexes
by language the processed wikipedia documents in their respective language-specific feature spaces
"""
mlb = MultiLabelBinarizer()
mlb.fit([label_names])
lW = {}
multilingual_dataset = MultilingualDataset()
multilingual_dataset.dataset_name = dataset_name
multilingual_dataset.set_labels(mlb.classes_)
for lang in langs:
print("\nprocessing %d training, %d test, %d wiki for language <%s>" %
(len(training_docs[lang]), len(test_docs[lang]), len(wiki_docs[lang]) if wiki_docs else 0, lang))
tr_data, tr_labels, IDtr = zip(*training_docs[lang])
te_data, te_labels, IDte = zip(*test_docs[lang])
if preprocess:
tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True,
tokenizer=NLTKStemTokenizer(lang, verbose=True),
stop_words=stopwords.words(NLTK_LANGMAP[lang]))
else:
tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True)
Xtr = tfidf.fit_transform(tr_data)
Xte = tfidf.transform(te_data)
if wiki_docs:
lW[lang] = tfidf.transform(wiki_docs[lang])
Ytr = mlb.transform(tr_labels)
Yte = mlb.transform(te_labels)
multilingual_dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)
multilingual_dataset.show_dimensions()
multilingual_dataset.show_category_prevalences()
if wiki_docs:
return multilingual_dataset, lW
else:
return multilingual_dataset
# creates a MultilingualDataset whose matrices share a single juxtaposed feature space
def build_juxtaposed_matrices(dataset_name, langs, training_docs, test_docs, label_names, preprocess=True):
"""
Builds the document-by-term weighted matrices for each language. Representations are not independent of each other,
since all of them lie in the same juxtaposed feature space.
:param dataset_name: the name of the dataset (str)
:param langs: list of languages (str)
:param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
:param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
:param label_names: list of names of labels (str)
:param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming)
:return: a MultilingualDataset whose language-specific matrices share a single juxtaposed feature space
"""
multiling_dataset = MultilingualDataset()
multiling_dataset.dataset_name = dataset_name
mlb = MultiLabelBinarizer()
mlb.fit([label_names])
multiling_dataset.set_labels(mlb.classes_)
tr_data_stack = []
for lang in langs:
print("\nprocessing %d training and %d test for language <%s>" % (len(training_docs[lang]), len(test_docs[lang]), lang))
tr_data, tr_labels, tr_ID = zip(*training_docs[lang])
te_data, te_labels, te_ID = zip(*test_docs[lang])
if preprocess:
tr_data = preprocess_documents(tr_data, lang)
te_data = preprocess_documents(te_data, lang)
tr_data_stack.extend(tr_data)
multiling_dataset.add(lang, tr_data, tr_labels, te_data, te_labels, tr_ID, te_ID)
tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True)
tfidf.fit(tr_data_stack)
for lang in langs:
print("\nweighting documents for language <%s>" % (lang))
(tr_data, tr_labels, tr_ID), (te_data, te_labels, te_ID) = multiling_dataset[lang]
Xtr = tfidf.transform(tr_data)
Xte = tfidf.transform(te_data)
Ytr = mlb.transform(tr_labels)
Yte = mlb.transform(te_labels)
multiling_dataset.add(lang,Xtr,Ytr,Xte,Yte,tr_ID,te_ID)
multiling_dataset.show_dimensions()
return multiling_dataset
# ----------------------------------------------------------------------------------------------------------------------
# Methods to recover the original documents from the MultilingualDataset's ids
# ----------------------------------------------------------------------------------------------------------------------
"""
This method has been added a posteriori, to create document embeddings using the polylingual embeddings of the recent
article 'Word Translation without Parallel Data'; basically, it takes one of the splits, retrieves the RCV documents
from the doc ids, and then pickles the resulting dataset of raw training and test documents in outpath
"""
def retrieve_rcv_documents_from_dataset(datasetpath, rcv1_data_home, rcv2_data_home, outpath):
tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath)
assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te'
langs = list(tr_ids.keys())
print('fetching the datasets')
rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train')
rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en'])
filter_by_categories(rcv1_documents, labels_rcv2)
filter_by_categories(rcv2_documents, labels_rcv1)
label_names = get_active_labels(rcv1_documents + rcv2_documents)
print('Active labels in RCV1/2 {}'.format(len(label_names)))
print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names)))
print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents]))
all_docs = rcv1_documents + rcv2_documents
mlb = MultiLabelBinarizer()
mlb.fit([label_names])
dataset = MultilingualDataset()
for lang in langs:
analyzer = CountVectorizer(strip_accents='unicode', min_df=3,
stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer()
Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in tr_ids[lang]])
Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in te_ids[lang]])
Xtr = [' '.join(analyzer(d)) for d in Xtr]
Xte = [' '.join(analyzer(d)) for d in Xte]
Ytr = mlb.transform(Ytr)
Yte = mlb.transform(Yte)
dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)
dataset.save(outpath)
"""
Same thing but for JRC-Acquis
"""
def retrieve_jrc_documents_from_dataset(datasetpath, jrc_data_home, train_years, test_years, cat_policy, most_common_cat, outpath):
tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath)
assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te'
langs = list(tr_ids.keys())
print('fetching the datasets')
cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years,
cat_filter=cat_list, cat_threshold=1, parallel=None,
most_frequent=most_common_cat)
test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names,
parallel='force')
def filter_by_id(doclist, ids):
ids_set = frozenset(itertools.chain.from_iterable(ids.values()))
return [x for x in doclist if (x.parallel_id+'__'+x.id) in ids_set]
training_docs = filter_by_id(training_docs, tr_ids)
test_docs = filter_by_id(test_docs, te_ids)
print('jrc: {} train, {} test, {} categories'.format(len(training_docs), len(test_docs), len(label_names)))
mlb = MultiLabelBinarizer()
mlb.fit([label_names])
dataset = MultilingualDataset()
for lang in langs:
analyzer = CountVectorizer(strip_accents='unicode', min_df=3,
stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer()
Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in training_docs if d.lang == lang])
Xte,Yte,IDte = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in test_docs if d.lang == lang])
Xtr = [' '.join(analyzer(d)) for d in Xtr]
Xte = [' '.join(analyzer(d)) for d in Xte]
Ytr = mlb.transform(Ytr)
Yte = mlb.transform(Yte)
dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)
dataset.save(outpath)
# ----------------------------------------------------------------------------------------------------------------------
# Dataset Generators
# ----------------------------------------------------------------------------------------------------------------------
def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test_years, cat_policy, most_common_cat=-1, max_wiki=5000, run=0):
"""
Prepare all datasets for JRC-Acquis. The datasets include the "feature-independent" version, the
"feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices.
In all cases, training documents are strictly non-parallel, and test documents are strictly parallel
:param jrc_data_home: path to the raw JRC-Acquis documents (it will be downloaded if not found), and the path where
all splits will be generated
:param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py)
:param langs: the list of languages to consider (as defined in data/languages.py)
:param train_years: a list of ints containing the years to be considered as training documents
:param test_years: a list of ints containing the years to be considered as test documents
:param cat_policy: a string indicating which category selection policy to apply. Valid policies are, e.g., "all"
(select all categories), "broadest" (select only the broadest concepts in the taxonomy), or "leaves" (select the
leaves concepts in the taxonomy). See inspect_eurovoc from data/reader/jrcacquis_reader.py for more details
:param most_common_cat: the maximum number of most common categories to consider, or -1 to keep them all
:param max_wiki: the maximum number of wikipedia documents to consider (default 5000)
:param run: a numeric label naming the random split (useful to keep track of different runs)
:return: None
"""
name = 'JRCacquis'
run = '_run' + str(run)
config_name = 'jrc_nltk_' + __years_to_str(train_years) + \
'vs' + __years_to_str(test_years) + \
'_' + cat_policy + \
('_top' + str(most_common_cat) if most_common_cat!=-1 else '') + \
'_noparallel_processed'
indep_path = join(jrc_data_home, config_name + run + '.pickle')
upper_path = join(jrc_data_home, config_name + run + '_upper.pickle')
yuxta_path = join(jrc_data_home, config_name + run + '_yuxtaposed.pickle')
wiki_path = join(jrc_data_home, config_name + run + '.wiki.pickle')
wiki_docs_path = join(jrc_data_home, config_name + '.wiki.raw.pickle')
cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years,
cat_filter=cat_list, cat_threshold=1, parallel=None,
most_frequent=most_common_cat)
test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names,
parallel='force')
print('Generating feature-independent dataset...')
training_docs_no_parallel = random_sampling_avoiding_parallel(training_docs)
def _group_by_lang(doc_list, langs):
return {lang: [(d.text, d.categories, d.parallel_id + '__' + d.id) for d in doc_list if d.lang == lang]
for lang in langs}
training_docs = _group_by_lang(training_docs, langs)
training_docs_no_parallel = _group_by_lang(training_docs_no_parallel, langs)
test_docs = _group_by_lang(test_docs, langs)
if not exists(indep_path):
wiki_docs=None
if max_wiki>0:
if not exists(wiki_docs_path):
wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False)
wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL)
else:
wiki_docs = pickle.load(open(wiki_docs_path, 'rb'))
wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
if wiki_docs:
lang_data, wiki_docs = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names, wiki_docs)
pickle.dump(wiki_docs, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL)
else:
lang_data = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names)
lang_data.save(indep_path)
print('Generating upper-bound (English-only) dataset...')
if not exists(upper_path):
training_docs_eng_only = {'en':training_docs['en']}
test_docs_eng_only = {'en':test_docs['en']}
build_independent_matrices(name, ['en'], training_docs_eng_only, test_docs_eng_only, label_names).save(upper_path)
print('Generating juxtaposed dataset...')
if not exists(yuxta_path):
build_juxtaposed_matrices(name, langs, training_docs_no_parallel, test_docs, label_names).save(yuxta_path)
def prepare_rcv_datasets(outpath, rcv1_data_home, rcv2_data_home, wiki_data_home, langs,
train_for_lang=1000, test_for_lang=1000, max_wiki=5000, preprocess=True, run=0):
"""
Prepare all datasets for RCV1/RCV2. The datasets include the "feature-independent" version, the
"feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices.
:param outpath: path where all splits will be dumped
:param rcv1_data_home: path to the RCV1-v2 dataset (English only)
:param rcv2_data_home: path to the RCV2 dataset (all languages other than English)
:param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py)
:param langs: the list of languages to consider (as defined in data/languages.py)
:param train_for_lang: maximum number of training documents per language
:param test_for_lang: maximum number of test documents per language
:param max_wiki: the maximum number of wikipedia documents to consider (default 5000)
:param preprocess: whether or not to apply language-specific preprocessing (stopwords removal and stemming)
:param run: a numeric label naming the random split (useful to keep track of different runs)
:return: None
"""
assert 'en' in langs, 'English is not in requested languages, but is needed for some datasets'
assert len(langs)>1, 'the multilingual dataset cannot be built with only one language'
assert not preprocess or set(langs).issubset(set(RCV2_LANGS_WITH_NLTK_STEMMING+['en'])), \
"languages not in RCV1-v2/RCV2 scope or not valid for NLTK's processing"
name = 'RCV1/2'
run = '_run' + str(run)
config_name = 'rcv1-2_nltk_trByLang'+str(train_for_lang)+'_teByLang'+str(test_for_lang)+\
('_processed' if preprocess else '_raw')
indep_path = join(outpath, config_name + run + '.pickle')
upper_path = join(outpath, config_name + run +'_upper.pickle')
yuxta_path = join(outpath, config_name + run +'_yuxtaposed.pickle')
wiki_path = join(outpath, config_name + run + '.wiki.pickle')
wiki_docs_path = join(outpath, config_name + '.wiki.raw.pickle')
print('fetching the datasets')
rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train')
rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l!='en'])
filter_by_categories(rcv1_documents, labels_rcv2)
filter_by_categories(rcv2_documents, labels_rcv1)
label_names = get_active_labels(rcv1_documents+rcv2_documents)
print('Active labels in RCV1/2 {}'.format(len(label_names)))
print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names)))
print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents]))
lang_docs = {lang: [d for d in rcv1_documents + rcv2_documents if d.lang == lang] for lang in langs}
# for the upper bound there are no parallel versions, so for English we take as many training documents as there
# would be in the multilingual case; afterwards, only the first train_for_lang of them are kept for the multilingual splits
print('Generating upper-bound (English-only) dataset...')
train, test = train_test_split(lang_docs['en'], train_size=train_for_lang*len(langs), test_size=test_for_lang, shuffle=True)
train_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in train]}
test_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in test]}
build_independent_matrices(name, ['en'], train_lang_doc_map, test_lang_doc_map, label_names).save(upper_path)
train_lang_doc_map['en'] = train_lang_doc_map['en'][:train_for_lang]
for lang in langs:
if lang=='en': continue # already split
test_take = min(test_for_lang, len(lang_docs[lang])-train_for_lang)
train, test = train_test_split(lang_docs[lang], train_size=train_for_lang, test_size=test_take, shuffle=True)
train_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in train]
test_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in test]
print('Generating feature-independent dataset...')
wiki_docs=None
if max_wiki>0:
if not exists(wiki_docs_path):
wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False)
wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL)
else:
wiki_docs = pickle.load(open(wiki_docs_path, 'rb'))
wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
if wiki_docs:
lang_data, wiki_docs_matrix = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess)
pickle.dump(wiki_docs_matrix, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL)
else:
lang_data = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess)
lang_data.save(indep_path)
print('Generating juxtaposed dataset...')
build_juxtaposed_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, preprocess).save(yuxta_path)
#-----------------------------------------------------------------------------------------------------------------------
# MAIN BUILDER
#-----------------------------------------------------------------------------------------------------------------------
if __name__=='__main__':
import sys
assert len(sys.argv) == 5, "wrong number of arguments; required: " \
"<JRC_PATH> <RCV1_PATH> <RCV2_PATH> <WIKI_PATH> "
JRC_DATAPATH = sys.argv[1] # "../Datasets/JRC_Acquis_v3"
RCV1_PATH = sys.argv[2] #'../Datasets/RCV1-v2/unprocessed_corpus'
RCV2_PATH = sys.argv[3] #'../Datasets/RCV2'
WIKI_DATAPATH = sys.argv[4] #"../Datasets/Wikipedia/multilingual_docs_JRC_NLTK"
langs = lang_set['JRC_NLTK']
max_wiki = 5000
for run in range(0,10):
print('Building JRC-Acquis datasets run', run)
prepare_jrc_datasets(JRC_DATAPATH, WIKI_DATAPATH, langs,
train_years=list(range(1958, 2006)), test_years=[2006], max_wiki=max_wiki,
cat_policy='all', most_common_cat=300, run=run)
print('Building RCV1-v2/2 datasets run', run)
prepare_rcv_datasets(RCV2_PATH, RCV1_PATH, RCV2_PATH, WIKI_DATAPATH, RCV2_LANGS_WITH_NLTK_STEMMING + ['en'],
train_for_lang=1000, test_for_lang=1000, max_wiki=max_wiki, run=run)
# uncomment this code if you want to retrieve the original documents to generate the data splits for PLE
# (make sure you have not modified the above parameters, or adapt the following paths accordingly...)
# datasetpath = join(RCV2_PATH,'rcv1-2_nltk_trByLang1000_teByLang1000_processed_run{}.pickle'.format(run))
# outpath = datasetpath.replace('_nltk_','_doclist_')
# retrieve_rcv_documents_from_dataset(datasetpath, RCV1_PATH, RCV2_PATH, outpath)
# datasetpath = join(JRC_DATAPATH, 'jrc_nltk_1958-2005vs2006_all_top300_noparallel_processed_run{}.pickle'.format(run))
# outpath = datasetpath.replace('_nltk_', '_doclist_')
# retrieve_jrc_documents_from_dataset(datasetpath, JRC_DATAPATH, train_years=list(range(1958, 2006)), test_years=[2006], cat_policy='all', most_common_cat=300, outpath=outpath)
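# Expected invocation (the four paths are placeholders taken from the comments above):
#
#   python dataset_builder.py ../Datasets/JRC_Acquis_v3 ../Datasets/RCV1-v2/unprocessed_corpus \
#       ../Datasets/RCV2 ../Datasets/Wikipedia/multilingual_docs_JRC_NLTK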

Binary file not shown.

646
src/learning/learners.py Normal file

@ -0,0 +1,646 @@
import numpy as np
import time
from data.embeddings import WordEmbeddings, embedding_matrix, WCE_matrix
from scipy.sparse import issparse, csr_matrix
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
# from sklearn.externals.joblib import Parallel, delayed
from joblib import Parallel, delayed
from sklearn.feature_extraction.text import TfidfVectorizer
def _sort_if_sparse(X):
if issparse(X) and not X.has_sorted_indices:
X.sort_indices()
def _joblib_transform_multiling(transformer, lX, n_jobs=-1):
if n_jobs == 1:
return {lang:transformer(lX[lang]) for lang in lX.keys()}
else:
langs = list(lX.keys())
transformations = Parallel(n_jobs=n_jobs)(delayed(transformer)(lX[lang]) for lang in langs)
return {lang: transformations[i] for i, lang in enumerate(langs)}
class TrivialRejector:
def fit(self, X, y):
self.cats = y.shape[1]
return self
def decision_function(self, X): return np.zeros((X.shape[0],self.cats))
def predict(self, X): return np.zeros((X.shape[0],self.cats))
def predict_proba(self, X): return np.zeros((X.shape[0],self.cats))
def best_params(self): return {}
class FunnellingPolylingualClassifier:
"""
This classifier projects each document d into a language-independent feature space where each dimension fi is the
decision score phi_l(d,ci) of an auxiliary classifier phi_l trained on category ci for documents in language l;
then it trains a single classifier for all documents in this space, irrespective of their original language
"""
def __init__(self, first_tier_learner, meta_learner, first_tier_parameters=None, meta_parameters=None, folded_projections=1,
calmode='cal', n_jobs=-1):
"""
:param first_tier_learner: the learner used in the first-tier level
:param meta_learner: the learner used in the second-tier level
:param first_tier_parameters: parameters for the learner in the doc_projector
:param meta_parameters: parameters for the learner in the z-space
:param folded_projections: if 1, the model trains the auxiliary classifiers with all the training data and then
projects the data before training the final classifier; if greater than one, the training set is split into as
many folds as indicated, and the projected space is composed by concatenating the predictions for each fold, obtained
from models trained on the remaining folds. This should increase the generality of the space with respect to unseen data.
:param calmode: 'cal' to calibrate the base classifiers, 'nocal' to use the decision_function instead, or
'sigmoid' to use the sigmoid of the decision_function
:param n_jobs: number of parallel threads
"""
assert folded_projections>0, "positive number of folds expected"
assert calmode in ['cal','nocal','sigmoid'], 'unknown calmode'
assert calmode!='cal' or first_tier_learner.probability, 'calmode=cal requires the learner to have probability=True'
self.fist_tier_learner = first_tier_learner
self.meta_learner = meta_learner
self.fist_tier_parameters=first_tier_parameters
self.meta_parameters = meta_parameters
self.doc_projector = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
self.doc_projector_bu = NaivePolylingualClassifier(self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs)
self.folded_projections = folded_projections
self.n_jobs = n_jobs
self.calmode = calmode
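# A usage sketch (assumption: scikit-learn SVMs for both tiers; lXtr/lYtr and lXte are the
# {lang: matrix} / {lang: labels} dictionaries returned by MultilingualDataset.training()/.test()):
#
#   from sklearn.svm import SVC
#   clf = FunnellingPolylingualClassifier(first_tier_learner=SVC(kernel='linear', probability=True),
#                                         meta_learner=SVC(kernel='rbf'))
#   clf.fit(lXtr, lYtr)
#   ly_pred = clf.predict(lXte)   # {lang: predicted label matrix}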
def _projection(self, doc_projector, lX):
"""
Decides which projection function to apply: predict_proba if the base classifiers are calibrated,
decision_function otherwise
:param doc_projector: the document projector (a NaivePolylingualClassifier)
:param lX: {lang:matrix} dictionary of documents to project
:return: the projection, applied with predict_proba or decision_function
"""
if self.calmode=='cal':
return doc_projector.predict_proba(lX)
else:
l_decision_scores = doc_projector.decision_function(lX)
if self.calmode=='sigmoid':
def sigmoid(x): return 1 / (1 + np.exp(-x))
for lang in l_decision_scores.keys():
l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
return l_decision_scores
def _get_zspace(self, lXtr, lYtr, lXproj=None, lYproj=None):
"""
Produces the vector space of posterior probabilities (if the first tier is calibrated) or of
decision scores (otherwise). This space is referred to as the Z-space.
:param lXtr: {lang:matrix} to train
:param lYtr: {lang:labels} to train
:param lXproj: {lang:matrix} to project (if None, then projects the lXtr)
:param lYproj: {lang:labels} to stack in the same order (if None, then lYtr will be stacked)
:return: the projection of lXproj documents into the Z-space defined by the confidence scores of language-specific
models trained on lXtr, and the lYproj labels stacked consistently
"""
repair_empty_folds = True
if lXproj is None and lYproj is None:
lXproj, lYproj = lXtr, lYtr
repair_empty_folds = False
print('fitting the projectors... {}'.format(lXtr.keys()))
self.doc_projector.fit(lXtr, lYtr)
print('projecting the documents')
langs = list(lXtr.keys())
lZ = self._projection(self.doc_projector, lXproj)
if repair_empty_folds:
    # empty folds are replaced by the posterior probabilities generated by the non-folded version
    empty_categories = self.doc_projector.empty_categories
    lZ_bu = self._projection(self.doc_projector_bu, lXproj)
    for lang in langs:
        repair = empty_categories[lang]
        lZ[lang][:, repair] = lZ_bu[lang][:, repair]
Z = np.vstack([lZ[lang] for lang in langs]) # Z is the language independent space
zy = np.vstack([lYproj[lang] for lang in langs])
return Z, zy
def _get_zspace_folds(self, lX, ly):
self.doc_projector_bu.fit(lX, ly)
print('split of {} folds'.format(self.folded_projections))
skf = KFold(n_splits=self.folded_projections, shuffle=True)
Z, zy = [], []
lfold = {lang: list(skf.split(lX[lang], ly[lang])) for lang in lX.keys()}
for fold in range(self.folded_projections):
print('fitting the projectors ({}/{})...'.format(fold + 1, self.folded_projections))
lfoldXtr, lfoldYtr = {}, {}
lfoldXte, lfoldYte = {}, {}
for lang in lX.keys():
train, test = lfold[lang][fold]
lfoldXtr[lang] = lX[lang][train]
lfoldYtr[lang] = ly[lang][train]
lfoldXte[lang] = lX[lang][test]
lfoldYte[lang] = ly[lang][test]
Zfold, zYfold = self._get_zspace(lfoldXtr, lfoldYtr, lfoldXte, lfoldYte)
Z.append(Zfold)
zy.append(zYfold)
# compose the Z-space as the union of all folded predictions
Z = np.vstack(Z)
zy = np.vstack(zy)
# refit the document projector with all examples to have a more reliable projector for test data
self.doc_projector = self.doc_projector_bu
return Z, zy
def fit(self, lX, ly, lZ=None, lzy=None):
tinit = time.time()
Z, zy = self._get_zspace(lX, ly) if self.folded_projections == 1 else self._get_zspace_folds(lX, ly)
#experimental: adds the posterior probabilities (computed outside) to the meta-classifier
if lZ is not None and lzy is not None:
zlangs = list(lZ.keys())
Z = np.vstack((Z, *[lZ[l] for l in zlangs]))
zy = np.vstack((zy, *[lzy[l] for l in zlangs]))
print('fitting the Z-space of shape={}'.format(Z.shape))
self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters, n_jobs=self.n_jobs)
self.model.fit(Z, zy)
self.time = time.time() - tinit
return self
def predict(self, lX, lZ=None):
"""
:param lX: a dictionary {language_label: X csr-matrix}
:param lZ: a dictionary {language_label: Z matrix}; if specified, concats this representation
:return: a dictionary of predictions
"""
lZ_ = self._projection(self.doc_projector, lX)
if lZ is not None:
lZ_ = {**lZ_, **lZ}
return _joblib_transform_multiling(self.model.predict, lZ_, n_jobs=self.n_jobs)
def best_params(self):
params = self.doc_projector.best_params()
params['meta'] = self.model.best_params()
return params
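# Minimal usage sketch (toy data): a linear SVC acts both as first-tier and meta learner, with calmode='nocal'
# so that no probability calibration is needed. Shapes, languages and label values below are illustrative only.
def _demo_funnelling_toy():
    import numpy as np
    from scipy.sparse import csr_matrix
    from sklearn.svm import SVC
    rng = np.random.RandomState(0)
    # two "languages", 20 documents each, 10 tfidf-like features, 3 categories
    lX = {lang: csr_matrix(rng.rand(20, 10)) for lang in ('en', 'da')}
    y = np.zeros((20, 3), dtype=int)
    y[:10, 0] = 1
    y[5:15, 1] = 1
    y[10:, 2] = 1
    ly = {lang: y.copy() for lang in ('en', 'da')}
    clf = FunnellingPolylingualClassifier(first_tier_learner=SVC(kernel='linear'),
                                          meta_learner=SVC(kernel='linear'),
                                          calmode='nocal', n_jobs=1)
    clf.fit(lX, ly)
    return clf.predict(lX)  # {lang: (20, 3) binary predictions}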
class NaivePolylingualClassifier:
"""
A set of independent MonolingualClassifiers, one per language
"""
def __init__(self, base_learner, parameters=None, n_jobs=-1):
self.base_learner = base_learner
self.parameters = parameters
self.model = None
self.n_jobs = n_jobs
def fit(self, lX, ly):
"""
trains the independent monolingual classifiers
:param lX: a dictionary {language_label: X csr-matrix}
:param ly: a dictionary {language_label: y np.array}
:return: self
"""
tinit = time.time()
assert set(lX.keys()) == set(ly.keys()), 'inconsistent language mappings in fit'
langs = list(lX.keys())
for lang in langs:
_sort_if_sparse(lX[lang])
# models = Parallel(n_jobs=self.n_jobs)\
# (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]),ly[lang]) for lang in langs)
models = [MonolingualClassifier(self.base_learner, parameters=self.parameters) for lang in langs]
for model, lang in zip(models, langs):
model.fit(lX[lang], ly[lang])
self.model = {lang: models[i] for i, lang in enumerate(langs)}
self.empty_categories = {lang:self.model[lang].empty_categories for lang in langs}
self.time = time.time() - tinit
return self
def decision_function(self, lX):
"""
:param lX: a dictionary {language_label: X csr-matrix}
:return: a dictionary of classification scores for each class
"""
assert self.model is not None, 'predict called before fit'
assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function'
langs=list(lX.keys())
scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].decision_function)(lX[lang]) for lang in langs)
return {lang:scores[i] for i,lang in enumerate(langs)}
def predict_proba(self, lX):
"""
:param lX: a dictionary {language_label: X csr-matrix}
:return: a dictionary of probabilities that each document belongs to each class
"""
assert self.model is not None, 'predict called before fit'
assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in predict_proba'
langs=list(lX.keys())
scores = Parallel(n_jobs=self.n_jobs, max_nbytes=None)(delayed(self.model[lang].predict_proba)(lX[lang]) for lang in langs)
return {lang:scores[i] for i,lang in enumerate(langs)}
def predict(self, lX):
"""
:param lX: a dictionary {language_label: X csr-matrix}
:return: a dictionary of predictions
"""
assert self.model is not None, 'predict called before fit'
assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in predict'
if self.n_jobs == 1:
return {lang:self.model[lang].predict(lX[lang]) for lang in lX.keys()}
else:
langs = list(lX.keys())
scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].predict)(lX[lang]) for lang in langs)
return {lang: scores[i] for i, lang in enumerate(langs)}
def best_params(self):
return {l:model.best_params() for l,model in self.model.items()}
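# Minimal usage sketch (toy data): the naive baseline trains one independent classifier per language.
# The SVC learner and the random matrices are illustrative assumptions.
def _demo_naive_polylingual_toy():
    import numpy as np
    from scipy.sparse import csr_matrix
    from sklearn.svm import SVC
    rng = np.random.RandomState(0)
    lX = {lang: csr_matrix(rng.rand(15, 8)) for lang in ('en', 'fr')}
    y = np.zeros((15, 2), dtype=int)
    y[:8, 0] = 1
    y[5:, 1] = 1
    ly = {lang: y.copy() for lang in ('en', 'fr')}
    clf = NaivePolylingualClassifier(base_learner=SVC(kernel='linear'), n_jobs=1)
    clf.fit(lX, ly)
    return clf.predict(lX)  # {lang: (15, 2) binary predictions}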
class MonolingualClassifier:
def __init__(self, base_learner, parameters=None, n_jobs=-1):
self.learner = base_learner
self.parameters = parameters
self.model = None
self.n_jobs = n_jobs
self.best_params_ = None
def fit(self, X, y):
if X.shape[0] == 0:
print('Warning: X has 0 elements, a trivial rejector will be created')
self.model = TrivialRejector().fit(X,y)
self.empty_categories = np.arange(y.shape[1])
return self
tinit = time.time()
_sort_if_sparse(X)
self.empty_categories = np.argwhere(np.sum(y, axis=0)==0).flatten()
# multi-class format
if len(y.shape) == 2:
if self.parameters is not None:
self.parameters = [{'estimator__' + key: params[key] for key in params.keys()}
for params in self.parameters]
self.model = OneVsRestClassifier(self.learner, n_jobs=self.n_jobs)
else:
self.model = self.learner
raise NotImplementedError('not working as a base-classifier for funneling if there are gaps in the labels across languages')
# parameter optimization?
if self.parameters:
print('debug: optimizing parameters:', self.parameters)
self.model = GridSearchCV(self.model, param_grid=self.parameters, refit=True, cv=5, n_jobs=self.n_jobs,
error_score=0, verbose=10)
print('fitting:', self.model)
self.model.fit(X, y)
if isinstance(self.model, GridSearchCV):
self.best_params_ = self.model.best_params_
print('best parameters: ', self.best_params_)
self.time=time.time()-tinit
return self
def decision_function(self, X):
assert self.model is not None, 'predict called before fit'
_sort_if_sparse(X)
return self.model.decision_function(X)
def predict_proba(self, X):
assert self.model is not None, 'predict called before fit'
assert hasattr(self.model, 'predict_proba'), 'the probability predictions are not enabled in this model'
_sort_if_sparse(X)
return self.model.predict_proba(X)
def predict(self, X):
assert self.model is not None, 'predict called before fit'
_sort_if_sparse(X)
return self.model.predict(X)
def best_params(self):
return self.best_params_
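# Minimal usage sketch (toy multilabel problem, wrapped in one-vs-rest, no grid search); data is illustrative only.
def _demo_monolingual_toy():
    import numpy as np
    from scipy.sparse import csr_matrix
    from sklearn.svm import SVC
    rng = np.random.RandomState(0)
    X = csr_matrix(rng.rand(12, 6))
    y = np.array([[1, 0]] * 6 + [[0, 1]] * 6)
    clf = MonolingualClassifier(base_learner=SVC(kernel='linear'), n_jobs=1).fit(X, y)
    return clf.predict(X)  # (12, 2) binary predictions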
class PolylingualEmbeddingsClassifier:
"""
This classifier creates document embeddings as a tfidf-weighted average of the polylingual (MUSE) word embeddings from the article
@article{conneau2017word,
title={Word translation without parallel data},
author={Conneau, Alexis and Lample, Guillaume and Ranzato, Marc'Aurelio and Denoyer, Ludovic and J{\'e}gou, Herv{\'e}},
journal={arXiv preprint arXiv:1710.04087},
year={2017}
}
url: https://github.com/facebookresearch/MUSE
"""
def __init__(self, wordembeddings_path, learner, c_parameters=None, n_jobs=-1):
"""
:param wordembeddings_path: the path to the directory containing the polylingual embeddings
:param learner: the learner
:param c_parameters: parameters for learner
:param n_jobs: the number of concurrent threads
"""
self.wordembeddings_path = wordembeddings_path
self.learner = learner
self.c_parameters=c_parameters
self.n_jobs = n_jobs
self.lang_tfidf = {}
self.model = None
def fit_vectorizers(self, lX):
for lang in lX.keys():
if lang not in self.lang_tfidf:
tfidf = TfidfVectorizer(sublinear_tf=True, use_idf=True) # text is already processed
docs = lX[lang]
tfidf.fit(docs)
self.lang_tfidf[lang] = tfidf
def embed(self, docs, lang):
assert lang in self.lang_tfidf, 'unknown language'
tfidf_vectorizer = self.lang_tfidf[lang]
V = tfidf_vectorizer.vocabulary_
Xweights = tfidf_vectorizer.transform(docs)
print('loading word embeddings for ' + lang)
we = WordEmbeddings.load(self.wordembeddings_path, lang)
nD = len(docs)
doc_vecs = np.zeros((nD, we.dim()))
for i, doc in enumerate(docs):
print('\r\tcomplete {:.3f}%'.format(100 * (i + 1) / nD), end='')
# averaging with tfidf (summing each word only once, since the frequency is already controlled)
for w in set(doc.split()):
if w in we and w in V:
doc_vecs[i] += (we[w] * Xweights[i, V[w]])
# works much worse with idf; works much worse with document l2-normalization
print()
return doc_vecs
def fit(self, lX, ly):
"""
:param lX: a dictionary {language_label: [list of preprocessed documents]}
:param ly: a dictionary {language_label: ndarray of shape (ndocs, ncats) binary labels}
:return: self
"""
tinit = time.time()
langs = list(lX.keys())
WEtr, Ytr = [], []
self.fit_vectorizers(lX) # if already fit, does nothing
for lang in langs:
WEtr.append(self.embed(lX[lang], lang))
Ytr.append(ly[lang])
# TODO @Andrea --> here embeddings should be stacked horizontally!
WEtr = np.vstack(WEtr)
Ytr = np.vstack(Ytr)
self.embed_time = time.time() - tinit
print('fitting the WE-space of shape={}'.format(WEtr.shape))
self.model = MonolingualClassifier(base_learner=self.learner, parameters=self.c_parameters, n_jobs=self.n_jobs)
self.model.fit(WEtr, Ytr)
self.time = time.time() - tinit
return self
def predict(self, lX):
"""
:param lX: a dictionary {language_label: [list of preprocessed documents]}
"""
assert self.model is not None, 'predict called before fit'
langs = list(lX.keys())
lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
return _joblib_transform_multiling(self.model.predict, lWEte, n_jobs=self.n_jobs)
def predict_proba(self, lX):
"""
:param lX: a dictionary {language_label: [list of preprocessed documents]}
"""
assert self.model is not None, 'predict called before fit'
langs = list(lX.keys())
lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
return _joblib_transform_multiling(self.model.predict_proba, lWEte, n_jobs=self.n_jobs)
def best_params(self):
return self.model.best_params()
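# Usage sketch: requires the MUSE embedding files to be available locally; the directory path and the
# tiny document lists below are placeholders, not data shipped with this repository.
def _demo_polylingual_embeddings(muse_dir='/path/to/MUSE/embeddings'):
    import numpy as np
    from sklearn.svm import SVC
    lX = {'en': ['first document text', 'second document text'],
          'da': ['foerste dokument tekst', 'andet dokument tekst']}
    ly = {lang: np.array([[1, 0], [0, 1]]) for lang in lX}
    clf = PolylingualEmbeddingsClassifier(wordembeddings_path=muse_dir, learner=SVC(kernel='linear'), n_jobs=1)
    clf.fit(lX, ly)  # loads the per-language vectors via WordEmbeddings.load(muse_dir, lang)
    return clf.predict(lX)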
class FunnellingEmbeddingPolylingualClassifier:
""" Simulated: this setting is merely for testing purposes, and is not realistic. We here assume to have a tfidf
vectorizer for the out-of-scope languages (which is not fair)."""
def __init__(self, first_tier_learner, embed_learner, meta_learner, wordembeddings_path, training_languages,
first_tier_parameters = None, embed_parameters = None, meta_parameters = None, n_jobs=-1):
assert first_tier_learner.probability==True and embed_learner.probability==True, \
'both the first-tier classifier and the polyembedding classifier should allow calibration'
self.training_languages = training_languages
self.PLE = PolylingualEmbeddingsClassifier(wordembeddings_path, embed_learner,
c_parameters=embed_parameters, n_jobs=n_jobs)
self.Funnelling = FunnellingPolylingualClassifier(first_tier_learner, meta_learner,
first_tier_parameters=first_tier_parameters,
meta_parameters=meta_parameters, n_jobs=n_jobs)
self.n_jobs = n_jobs
def vectorize(self, lX):
return {l:self.PLE.lang_tfidf[l].transform(lX[l]) for l in lX.keys()}
def fit(self, lX, ly):
"""
:param lX: a dictionary {language_label: [list of preprocessed documents]}
:param ly: a dictionary {language_label: ndarray of shape (ndocs, ncats) binary labels}
:return:
"""
self.PLE.fit_vectorizers(lX)
tinit = time.time()
lX = {l: lX[l] for l in lX.keys() if l in self.training_languages}
ly = {l: ly[l] for l in lX.keys() if l in self.training_languages}
self.PLE.fit(lX, ly)
lZ = self.PLE.predict_proba(lX)
self.Funnelling.fit(self.vectorize(lX),ly,lZ,ly)
self.time = time.time() - tinit
return self
def predict(self, lX):
"""
:param lX: a dictionary {language_label: [list of preprocessed documents]}
"""
lXin = {l: lX[l] for l in lX.keys() if l in self.training_languages}
lXout = {l: lX[l] for l in lX.keys() if l not in self.training_languages}
lZ = self.PLE.predict_proba(lXout)
return self.Funnelling.predict(self.vectorize(lXin), lZ)
def best_params(self):
return {'PLE':self.PLE.best_params(), 'Funnelling':self.Funnelling.best_params()}
class AndreaCLF(FunnellingPolylingualClassifier):
def __init__(self,
we_path,
config,
first_tier_learner,
meta_learner,
first_tier_parameters=None,
meta_parameters=None,
folded_projections=1,
calmode='cal', n_jobs=-1):
super().__init__(first_tier_learner,
meta_learner,
first_tier_parameters,
meta_parameters,
folded_projections,
calmode,
n_jobs)
self.we_path = we_path
self.config = config
self.lang_word2idx = dict()
self.languages = []
self.lang_tfidf = {}
self.word_embeddings = {}
self.supervised_embeddings = {}
def vectorize(self, lX, prediction=False):
langs = list(lX.keys())
print(f'# tfidf-vectorizing docs')
if prediction:
for lang in langs:
assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
tfidf_vectorizer = self.lang_tfidf[lang]
lX[lang] = tfidf_vectorizer.transform(lX[lang])
return self
for lang in langs:
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
self.languages.append(lang)
tfidf_vectorizer.fit(lX[lang])
lX[lang] = tfidf_vectorizer.transform(lX[lang])
_sort_if_sparse(lX[lang])
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
self.lang_tfidf[lang] = tfidf_vectorizer  # needed at prediction time
return self
# @override std class method
def _get_zspace(self, lXtr, lYtr):
print('\nfitting the projectors... {}'.format(list(lXtr.keys())))
self.doc_projector.fit(lXtr, lYtr)
print('\nprojecting the documents')
lZ = self._projection(self.doc_projector, lXtr)
return lZ, lYtr
def embed(self, lX, ly, unsupervised=False, supervised=False, prediction=False):
"""
builds the embedding matrix for each language and returns the tf-idf weighted sum of word embeddings for each document
"""
_r = dict()
languages = list(lX.keys())
if prediction:
for lang in languages:
if unsupervised: # If unsupervised embeddings ...
M = self.word_embeddings[lang]
if supervised: # and also supervised --> get both (M) and (S) weighted sum matrices and hstack them
S = self.supervised_embeddings[lang]
_r[lang] = np.hstack((lX[lang].dot(M), lX[lang].dot(S)))
continue
_r[lang] = lX[lang].dot(M) # if not supervised --> just get weighted sum of unsupervised (M) embeddings
else: # If not unsupervised --> get (S) matrix and its weighted sum
S = self.supervised_embeddings[lang]
_r[lang] = lX[lang].dot(S)
return _r
if unsupervised:
for lang in languages:
# print('Test building embedding matrix FastTextMuse ...')
_, M = embedding_matrix(self.we_path, self.lang_word2idx[lang], lang)
self.word_embeddings[lang] = M
_r[lang] = lX[lang].dot(M)
if supervised:
for lang in languages:
S = WCE_matrix(lX, ly, lang)
S = np.squeeze(np.asarray(S)) # casting to ndarray to better visualize S while debugging
self.supervised_embeddings[lang] = S
if unsupervised:
_r[lang] = np.hstack((_r[lang], lX[lang].dot(S)))
else:
_r[lang] = lX[lang].dot(S)
return _r
# @override std class method
def fit(self, lX, ly):
tinit = time.time()
print('Vectorizing documents...')
self.vectorize(lX)
for lang in self.languages:
print(lX[lang].shape)
Z, zy = self._get_zspace(lX, ly)
# Z vectors are concatenated with each document's weighted embedding sum
Z_embedded = dict()
l_weighted_em = self.embed(lX, ly,
unsupervised=self.config['unsupervised'],
supervised=self.config['supervised'])
if self.config['supervised'] or self.config['unsupervised']:
for lang in list(lX.keys()):
Z_embedded[lang] = np.hstack((Z[lang], l_weighted_em[lang]))
Z = Z_embedded
del Z_embedded
# stacking Z_embedded space vertically
# _vertical_Z = np.vstack([Z_embedded[lang] for lang in self.languages])
# _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
_vertical_Z = np.vstack([Z[lang] for lang in self.languages])
_vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
# zlangs = list(Z_embedded.keys()) # build a list of the embedded Z matrices and then vstack the list
# for i, lang in enumerate(zlangs):
# if i == 0:
# _vertical_Z = Z_embedded[lang]
# _vertical_Zy = zy[lang]
# else:
# _vertical_Z = np.vstack((_vertical_Z, Z_embedded[lang]))
# _vertical_Zy = np.vstack((_vertical_Zy, zy[lang]))
print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
n_jobs=self.n_jobs)
self.model.fit(_vertical_Z, _vertical_Zy)
self.time = time.time() - tinit
print(f'\nTotal training time elapsed: {round((self.time/60), 2)} min')
def predict(self, lX, ly):
print('Vectorizing documents')
self.vectorize(lX, prediction=True)
lZ = self._projection(self.doc_projector, lX)
if self.config['supervised'] or self.config['unsupervised']:
l_weighted_em = self.embed(lX, ly,
unsupervised=self.config['unsupervised'],
supervised=self.config['supervised'],
prediction=True)
Z_embedded = dict()
for lang in lX.keys():
Z_embedded[lang] = np.hstack((lZ[lang], l_weighted_em[lang]))
print(Z_embedded[lang].shape)
return _joblib_transform_multiling(self.model.predict, Z_embedded, n_jobs=self.n_jobs)
for lang in lZ.keys():
print(lZ[lang].shape)
return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
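# Construction sketch for AndreaCLF: the config keys mirror those read in fit/predict; the embedding path
# and the learners below are placeholder assumptions (no fitting is performed here).
def _demo_andrea_clf_setup(we_path='/path/to/MUSE/embeddings'):
    from sklearn.svm import SVC
    config = {'unsupervised': True,  # concatenate MUSE-based document embeddings to the Z-space
              'supervised': True}    # concatenate WCE-based document embeddings as well
    return AndreaCLF(we_path=we_path,
                     config=config,
                     first_tier_learner=SVC(kernel='linear', probability=True),
                     meta_learner=SVC(kernel='linear', probability=True),
                     calmode='cal',
                     n_jobs=1)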

7
src/results/results.csv Normal file
View File

@ -0,0 +1,7 @@
id method learner embed optimp dataset binary languages time lang macrof1 microf1 macrok microk notes
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 da 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 en 0.7866666666666666 0.0 0.7927111111111111 -0.0003376325207643527 nope
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 fr 0.7866666666666666 0.0 0.7930666666666667 -0.0001350530083057411 nope
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 da 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 en 0.7933333333333333 0.0 0.7931111111111111 -0.00013505300830574107 nope
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 fr 0.7933333333333333 0.0 0.7933333333333333 0.0 nope

110
src/transformers/clesa.py Normal file
View File

@ -0,0 +1,110 @@
import numpy as np
import sklearn
# from sklearn.externals.joblib import Parallel, delayed
from joblib import Parallel, delayed
class ESA(object):
"""
Implementation of Explicit Semantic Analysis (ESA) in its monolingual version, as a transformer
"""
supported_similarity = ['dot', 'cosine']
def __init__(self, similarity='dot', centered=False, post=None):
"""
:param similarity: the similarity measure between documents to be used
:param centered: set to True to subtract the expected similarity due to randomness (experimental)
:param post: any valid sklearn normalization method to be applied to the resulting doc embeddings, or None (default)
"""
assert similarity in self.supported_similarity, ("Similarity method %s is not supported" % similarity)
self.similarity = similarity
self.centered = centered
self.post_processing = post
self.W = None
def fit(self, W):
"""
:param W: doc-by-term already processed matrix of wikipedia documents
:return: self
"""
self.W = W
return self
def transform(self, X):
"""
:param X: doc-by-term matrix that is to be transformed into the ESA space.
:return: the matrix X transformed into the ESA space in numpy format
"""
assert self.W is not None, 'transform method called before fit'
W = self.W
assert X.shape[1] == W.shape[1], ('the feature spaces for X=%s and W=%s do not agree' % (str(X.shape), str(W.shape)))
if self.similarity in ['dot', 'cosine']:
if self.similarity == 'cosine':
X = sklearn.preprocessing.normalize(X, norm='l2', axis=1, copy=True)
W = sklearn.preprocessing.normalize(W, norm='l2', axis=1, copy=True)
esa = (X.dot(W.T)).toarray()
if self.centered:
pX = (X > 0).sum(1) / float(X.shape[1])
pW = (W > 0).sum(1) / float(W.shape[1])
pXpW = np.sqrt(pX.dot(pW.transpose()))
esa = esa - pXpW
if self.post_processing:
esa = sklearn.preprocessing.normalize(esa, norm=self.post_processing, axis=1, copy=True)
return esa
def fit_transform(self, W, X, Y=None):
self.fit(W)
return self.transform(X)
def dimensionality(self):
return self.W.shape[0]
class CLESA(ESA):
"""
Implementation of Cross-Lingual Explicit Semantic Analysis (CL-ESA) as a transformer
"""
def __init__(self, similarity='dot', centered=False, post=False, n_jobs=-1):
super(CLESA, self).__init__(similarity, centered, post)
self.lESA = None
self.langs = None
self.n_jobs = n_jobs
def fit(self, lW):
"""
:param lW: a dictionary of {language: doc-by-term wiki matrix}
:return: self
"""
assert len(np.unique([W.shape[0] for W in lW.values()])) == 1, "inconsistent dimensions across languages"
self.dimensions = list(lW.values())[0].shape[0]
self.langs = list(lW.keys())
self.lESA = {lang:ESA(self.similarity, self.centered, self.post_processing).fit(lW[lang]) for lang in self.langs}
return self
def transform(self, lX):
"""
:param lX: dictionary of {language : doc-by-term matrix} that is to be transformed into the CL-ESA space
:return: a dictionary {language : doc-by-dim matrix} containing the matrix-transformed versions
"""
assert self.lESA is not None, 'transform method called before fit'
assert set(lX.keys()).issubset(set(self.langs)), 'languages in lX are not in scope'
langs = list(lX.keys())
trans = Parallel(n_jobs=self.n_jobs)(delayed(self.lESA[lang].transform)(lX[lang]) for lang in langs)
return {lang:trans[i] for i,lang in enumerate(langs)}
def fit_transform(self, lW, lX):
return self.fit(lW).transform(lX)
def languages(self):
return list(self.lESA.keys())
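# Minimal usage sketch (random sparse matrices standing in for the comparable wikipedia corpora and the
# target doc-by-term matrices; sizes are illustrative only).
def _demo_clesa_toy():
    import numpy as np
    from scipy.sparse import csr_matrix
    rng = np.random.RandomState(0)
    lW = {'en': csr_matrix(rng.rand(50, 30)), 'it': csr_matrix(rng.rand(50, 40))}  # 50 wiki docs per language
    lX = {'en': csr_matrix(rng.rand(10, 30)), 'it': csr_matrix(rng.rand(10, 40))}  # docs to map to the CL-ESA space
    clesa = CLESA(similarity='cosine', n_jobs=1)
    return clesa.fit_transform(lW, lX)  # {lang: (10, 50) ndarray}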

154
src/transformers/dci.py Normal file
View File

@ -0,0 +1,154 @@
import numpy as np
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix, issparse
from scipy.spatial.distance import cosine
import operator
import functools
import math, sys
# from sklearn.externals.joblib import Parallel, delayed
from joblib import Parallel, delayed
class DistributionalCorrespondenceIndexing:
prob_dcf = ['linear', 'pmi']
vect_dcf = ['cosine']
valid_dcf = prob_dcf + vect_dcf
valid_post = ['normal', 'l2', None]
def __init__(self, dcf='cosine', post='normal', n_jobs=-1):
"""
:param dcf: a distributional correspondence function name (e.g., 'cosine') or a callable f(u,v) which measures
the distributional correspondence between vectors u and v
:param post: post-processing function to apply to document embeddings. Default is to standardize it into a
normal distribution; other functions allowed are 'l2' or None
"""
if post not in self.valid_post:
raise ValueError("unknown post processing function; valid ones are [%s]" % ', '.join(self.valid_post))
if isinstance(dcf, str):
if dcf not in self.valid_dcf:
raise ValueError("unknown dcf; use any in [%s]" % ', '.join(self.valid_dcf))
self.dcf = getattr(DistributionalCorrespondenceIndexing, dcf)
elif hasattr(dcf, '__call__'):
self.dcf = dcf
else:
raise ValueError('param dcf should either be a valid dcf name in [%s] or a callable comparing two vectors' % ', '.join(self.valid_dcf))
#self.dcf = lambda u,v:dcf(u,v)
self.post = post
self.domains = None
self.dFP = None
self.n_jobs = n_jobs
def fit(self, dU, dP):
"""
:param dU: a dictionary of {domain:dsm_matrix}, where dsm is a document-by-term matrix representing the
distributional semantic model for a specific domain
:param dP: a dictionary {domain:pivot_matrix} where domain is a string representing each domain,
and pivot_matrix has shape (d,p) with d the dimensionality of the distributional space, and p the
number of pivots
:return: self
"""
self.domains = list(dP.keys())
assert len(np.unique([P.shape[1] for P in dP.values()]))==1, "inconsistent number of pivots across domains"
assert set(dU.keys())==set(self.domains), "inconsistent domains in dU and dP"
assert not [1 for d in self.domains if dU[d].shape[0]!=dP[d].shape[0]], \
"inconsistent dimensions between distributional and pivot spaces"
self.dimensions = list(dP.values())[0].shape[1]
# embed the feature space from each domain using the pivots of that domain
#self.dFP = {d:self.dcf_dist(dU[d].transpose(), dP[d].transpose()) for d in self.domains}
transformations = Parallel(n_jobs=self.n_jobs)(delayed(self.dcf_dist)(dU[d].transpose(),dP[d].transpose()) for d in self.domains)
self.dFP = {d: transformations[i] for i, d in enumerate(self.domains)}
def _dom_transform(self, X, FP):
_X = X.dot(FP)
if self.post == 'l2':
_X = normalize(_X, norm='l2', axis=1)
elif self.post == 'normal':
std = np.clip(np.std(_X, axis=0), 1e-5, None)
_X = (_X - np.mean(_X, axis=0)) / std
return _X
# dX is a dictionary of {domain:dsm}, where dsm (distributional semantic model) is, e.g., a document-by-term csr_matrix
def transform(self, dX):
assert self.dFP is not None, 'transform method called before fit'
assert set(dX.keys()).issubset(self.domains), 'domains in dX are not in scope'
domains = list(dX.keys())
transformations = Parallel(n_jobs=self.n_jobs)(delayed(self._dom_transform)(dX[d], self.dFP[d]) for d in domains)
return {d: transformations[i] for i, d in enumerate(domains)}
def fit_transform(self, dU, dP, dX):
return self.fit(dU, dP).transform(dX)
def _prevalence(self, v):
if issparse(v):
return float(v.nnz) / functools.reduce(operator.mul, v.shape, 1) #this works for arrays of any rank
elif isinstance(v, np.ndarray):
return float(v[v>0].size) / v.size
def linear(self, u, v, D):
tp, fp, fn, tn = self._get_4cellcounters(u, v, D)
den1=tp+fn
den2=tn+fp
tpr = (tp*1./den1) if den1!=0 else 0.
tnr = (tn*1./den2) if den2!=0 else 0.
return tpr + tnr - 1
def pmi(self, u, v, D):
tp, fp, fn, tn = self._get_4cellcounters(u, v, D)
Pxy = tp * 1. / D
Pxny = fp * 1. / D
Pnxy = fn * 1. / D
Px = Pxy + Pxny
Py = Pxy + Pnxy
if (Px == 0 or Py == 0 or Pxy == 0):
return 0.0
score = math.log2(Pxy / (Px * Py))
if np.isnan(score) or np.isinf(score):
print('NAN')
sys.exit()
return score
def cosine(self, u, v):
pu = self._prevalence(u)
pv = self._prevalence(v)
return cosine(u, v) - np.sqrt(pu * pv)
def _get_4cellcounters(self, u, v, D):
"""
:param u: a set of indexes with a non-zero value
:param v: a set of indexes with a non-zero value
:param D: the number of events (i.e., all possible indexes)
:return: the 4-cell contingency values (tp, fp, fn, tn)
"""
common=u.intersection(v)
tp = len(common)
fp = len(u) - len(common)
fn = len(v) - len(common)
tn = D - (tp + fp + fn)
return tp, fp, fn, tn
def dcf_dist(self, U, V):
nU,D = U.shape
nV = V.shape[0]
if issparse(U): U = U.toarray()
if issparse(V): V = V.toarray()
dists = np.zeros((nU, nV))
if self.dcf.__name__ in self.prob_dcf:
def hits_index(v):
return set(np.argwhere(v>0).reshape(-1).tolist())
Vhits = {i:hits_index(V[i]) for i in range(nV)}
for i in range(nU):
Ui_hits = hits_index(U[i])
for j in range(nV):
dists[i, j] = self.dcf(self, Ui_hits, Vhits[j], D)
else:
for i in range(nU):
for j in range(nV):
dists[i, j] = self.dcf(self, U[i], V[j])
return dists
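# Minimal usage sketch (random data; the pivots are simply taken as the first columns of each domain matrix,
# which is an illustrative shortcut rather than a proper pivot selection).
def _demo_dci_toy():
    import numpy as np
    from scipy.sparse import csr_matrix
    rng = np.random.RandomState(0)
    dU = {'source': csr_matrix(rng.rand(30, 15)), 'target': csr_matrix(rng.rand(30, 15))}
    dP = {d: U[:, :5] for d, U in dU.items()}           # 5 pivot columns per domain
    dX = {d: csr_matrix(rng.rand(8, 15)) for d in dU}   # documents to embed, same vocabularies
    dci = DistributionalCorrespondenceIndexing(dcf='cosine', post='l2', n_jobs=1)
    dci.fit(dU, dP)
    return dci.transform(dX)  # {domain: (8, 5) embeddings}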

53
src/transformers/riboc.py Normal file
View File

@ -0,0 +1,53 @@
import math
import numpy as np
from scipy.sparse import csr_matrix, issparse
class RandomIndexingBoC(object):
def __init__(self, latent_dimensions, non_zeros=2):
self.latent_dimensions = latent_dimensions
self.k = non_zeros
self.ri_dict = None
def fit_transform(self, X):
return self.fit(X).transform(X)
def fit(self, X):
nF = X.shape[1]
nL = self.latent_dimensions
format = 'csr' if issparse(X) else 'np'
self.ri_dict = _create_random_index_dictionary(shape=(nF, nL), k=self.k, normalized=True, format=format)
return self
def transform(self, X):
if self.ri_dict is None:
    raise ValueError("Error: transform method called before fit.")
assert X.shape[1] == self.ri_dict.shape[0], 'feature space is inconsistent with the RI dictionary'
P = X.dot(self.ri_dict)
if issparse(P):
P.sort_indices()
return P
def _create_random_index_dictionary(shape, k, normalized=False, format='csr', positive=False):
assert format in ['csr', 'np'], 'Format should be in "[csr, np]"'
nF, latent_dimensions = shape
print("Creating the random index dictionary for |V|={} with {} dimensions".format(nF,latent_dimensions))
val = 1.0 if not normalized else 1.0/math.sqrt(k)
#ri_dict = csr_matrix((nF, latent_dimensions)) if format == 'csr' else np.zeros((nF, latent_dimensions))
ri_dict = np.zeros((nF, latent_dimensions))
#TODO: optimize
for t in range(nF):
dims = np.zeros(k, dtype=np.int32)
dims[0] = t % latent_dimensions #the first dimension is chosen in a round-robin manner (prevents gaps)
dims[1:] = np.random.choice(latent_dimensions, size=k-1, replace=False)
values = (np.random.randint(0,2, size=k)*2.0-1.0) * val if not positive else np.array([+val]*k)
ri_dict[t,dims]=values
print("\rprogress [%.2f%% complete]" % (t * 100.0 / nF), end='')
print('\nDone')
if format=='csr':
ri_dict = csr_matrix(ri_dict)
return ri_dict
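# Minimal usage sketch (random sparse bag-of-words input; sizes are illustrative only).
def _demo_random_indexing_toy():
    import numpy as np
    from scipy.sparse import csr_matrix
    rng = np.random.RandomState(0)
    X = csr_matrix(rng.rand(20, 500))  # 20 documents over a 500-feature vocabulary
    ri = RandomIndexingBoC(latent_dimensions=50, non_zeros=2)
    return ri.fit_transform(X)  # sparse (20, 50) random-indexed projection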

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

95
src/util/evaluation.py Normal file
View File

@ -0,0 +1,95 @@
# from sklearn.externals.joblib import Parallel, delayed
from joblib import Parallel, delayed
from util.metrics import *
from sklearn.metrics import f1_score
import numpy as np
import time
def evaluation_metrics(y, y_):
if len(y.shape)==len(y_.shape)==1 and len(np.unique(y))>2: #single-label
raise NotImplementedError()#return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro')
else: # the metrics implemented here assume multilabel classification, evaluated as a set of binary problems
return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_)
def soft_evaluation_metrics(y, y_):
if len(y.shape)==len(y_.shape)==1 and len(np.unique(y))>2: #single-label
raise NotImplementedError()#return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro')
else: # the metrics implemented here assume multilabel classification, evaluated as a set of binary problems
return smoothmacroF1(y, y_), smoothmicroF1(y, y_), smoothmacroK(y, y_), smoothmicroK(y, y_)
def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1):
print('evaluation (n_jobs={})'.format(n_jobs))
if n_jobs == 1:
return {lang: metrics(ly_true[lang], ly_pred[lang]) for lang in ly_true.keys()}
else:
langs = list(ly_true.keys())
evals = Parallel(n_jobs=n_jobs)(delayed(metrics)(ly_true[lang], ly_pred[lang]) for lang in langs)
return {lang: evals[i] for i, lang in enumerate(langs)}
def average_results(l_eval, show=True):
metrics = []
for lang in l_eval.keys():
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
if show:
print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
ave = np.mean(np.array(metrics), axis=0)
if show:
print('Averages: MF1, mF1, MK, mK', ave)
return ave
def evaluate_method(polylingual_method, lX, ly, predictor=None, soft=False, return_time=False):
tinit=time.time()
print('prediction for test')
assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate'
n_jobs = polylingual_method.n_jobs
if predictor is None:
predictor = polylingual_method.predict
metrics = evaluation_metrics
if soft is True:
metrics = soft_evaluation_metrics
ly_ = predictor(lX, ly)
eval_ = evaluate(ly, ly_, metrics=metrics, n_jobs=n_jobs)
if return_time:
return eval_, time.time()-tinit
else:
return eval_
def evaluate_single_lang(polylingual_method, X, y, lang, predictor=None, soft=False):
print('prediction for test in a single language')
if predictor is None:
predictor = polylingual_method.predict
metrics = evaluation_metrics
if soft is True:
metrics = soft_evaluation_metrics
ly_ = predictor({lang:X})
return metrics(y, ly_[lang])
def get_binary_counters(polylingual_method, lX, ly, predictor=None):
print('prediction for test')
assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate'
n_jobs = polylingual_method.n_jobs
if predictor is None:
predictor = polylingual_method.predict
ly_ = predictor(lX)
print('evaluation (n_jobs={})'.format(n_jobs))
if n_jobs == 1:
return {lang: binary_counters(ly[lang], ly_[lang]) for lang in ly.keys()}
else:
langs = list(ly.keys())
evals = Parallel(n_jobs=n_jobs)(delayed(binary_counters)(ly[lang], ly_[lang]) for lang in langs)
return {lang: evals[i] for i, lang in enumerate(langs)}
def binary_counters(y, y_):
y = np.reshape(y, (-1))
assert y.shape==y_.shape and len(y.shape)==1, 'error, binary vector expected'
counters = hard_single_metric_statistics(y, y_)
return counters.tp, counters.tn, counters.fp, counters.fn
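# Minimal usage sketch of the per-language evaluation helpers (hard binary predictions on toy labels).
def _demo_evaluate_toy():
    import numpy as np
    ly_true = {'en': np.array([[1, 0], [0, 1], [1, 1]]),
               'it': np.array([[0, 1], [0, 1], [1, 0]])}
    ly_pred = {'en': np.array([[1, 0], [0, 1], [1, 0]]),
               'it': np.array([[0, 1], [1, 1], [1, 0]])}
    l_eval = evaluate(ly_true, ly_pred, n_jobs=1)  # {lang: (macroF1, microF1, macroK, microK)}
    return average_results(l_eval, show=True)      # averaged over languages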

36
src/util/file.py Normal file
View File

@ -0,0 +1,36 @@
from os import listdir, makedirs
from os.path import isdir, isfile, join, exists, dirname
#from sklearn.externals.six.moves import urllib
import urllib
def download_file(url, archive_filename):
def progress(blocknum, bs, size):
total_sz_mb = '%.2f MB' % (size / 1e6)
current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='')
print("Downloading %s" % url)
urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress)
print("")
def download_file_if_not_exists(url, archive_path):
if exists(archive_path): return
makedirs_if_not_exist(dirname(archive_path))
download_file(url,archive_path)
def ls(dir, typecheck):
el = [f for f in listdir(dir) if typecheck(join(dir, f))]
el.sort()
return el
def list_dirs(dir):
return ls(dir, typecheck=isdir)
def list_files(dir):
return ls(dir, typecheck=isfile)
def makedirs_if_not_exist(path):
if not exists(path): makedirs(path)

168
src/util/metrics.py Normal file
View File

@ -0,0 +1,168 @@
import numpy as np
"""
Scikit learn provides a full set of evaluation metrics, but they treat special cases differently.
I.e., when the number of true positives, false positives, and false negatives amount to 0, all
affected metrics (precision, recall, and thus F1) output 0 in scikit-learn.
We adhere to the common practice of outputting 1 in this case since the classifier has correctly
classified all examples as negatives.
"""
class ContTable:
def __init__(self, tp=0, tn=0, fp=0, fn=0):
self.tp=tp
self.tn=tn
self.fp=fp
self.fn=fn
def get_d(self): return self.tp + self.tn + self.fp + self.fn
def get_c(self): return self.tp + self.fn
def get_not_c(self): return self.tn + self.fp
def get_f(self): return self.tp + self.fp
def get_not_f(self): return self.tn + self.fn
def p_c(self): return (1.0*self.get_c())/self.get_d()
def p_not_c(self): return 1.0-self.p_c()
def p_f(self): return (1.0*self.get_f())/self.get_d()
def p_not_f(self): return 1.0-self.p_f()
def p_tp(self): return (1.0*self.tp) / self.get_d()
def p_tn(self): return (1.0*self.tn) / self.get_d()
def p_fp(self): return (1.0*self.fp) / self.get_d()
def p_fn(self): return (1.0*self.fn) / self.get_d()
def tpr(self):
c = 1.0*self.get_c()
return self.tp / c if c > 0.0 else 0.0
def fpr(self):
_c = 1.0*self.get_not_c()
return self.fp / _c if _c > 0.0 else 0.0
def __add__(self, other):
return ContTable(tp=self.tp + other.tp, tn=self.tn + other.tn, fp=self.fp + other.fp, fn=self.fn + other.fn)
def accuracy(cell):
return (cell.tp + cell.tn)*1.0 / (cell.tp + cell.fp + cell.fn + cell.tn)
def f1(cell):
num = 2.0 * cell.tp
den = 2.0 * cell.tp + cell.fp + cell.fn
if den>0: return num / den
#we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative
return 1.0
def K(cell):
specificity, recall = 0., 0.
AN = cell.tn + cell.fp
if AN != 0:
specificity = cell.tn*1. / AN
AP = cell.tp + cell.fn
if AP != 0:
recall = cell.tp*1. / AP
if AP == 0:
return 2. * specificity - 1.
elif AN == 0:
return 2. * recall - 1.
else:
return specificity + recall - 1.
#computes the (hard) counters tp, fp, fn, and tn from the true and predicted vectors of hard decisions
#true_labels and predicted_labels are two vectors of shape (number_documents,)
def hard_single_metric_statistics(true_labels, predicted_labels):
assert len(true_labels)==len(predicted_labels), "Format not consistent between true and predicted labels."
nd = len(true_labels)
tp = np.sum(predicted_labels[true_labels==1])
fp = np.sum(predicted_labels[true_labels == 0])
fn = np.sum(true_labels[predicted_labels == 0])
tn = nd - (tp+fp+fn)
return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)
#computes the (soft) contingency table where tp, fp, fn, and tn are the cumulative masses of the posterior
# probabilities with respect to the true binary labels
#true_labels and posterior_probabilities are two vectors of shape (number_documents,)
def soft_single_metric_statistics(true_labels, posterior_probabilities):
assert len(true_labels)==len(posterior_probabilities), "Format not consistent between true and predicted labels."
tp = np.sum(posterior_probabilities[true_labels == 1])
fn = np.sum(1. - posterior_probabilities[true_labels == 1])
fp = np.sum(posterior_probabilities[true_labels == 0])
tn = np.sum(1. - posterior_probabilities[true_labels == 0])
return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)
#if the classifier is single class, then the prediction is a vector of shape=(nD,) which causes issues when compared
#to the true labels (of shape=(nD,1)). This method increases the dimensions of the predictions.
def __check_consistency_and_adapt(true_labels, predictions):
if predictions.ndim == 1:
return __check_consistency_and_adapt(true_labels, np.expand_dims(predictions, axis=1))
if true_labels.ndim == 1:
return __check_consistency_and_adapt(np.expand_dims(true_labels, axis=1),predictions)
if true_labels.shape != predictions.shape:
raise ValueError("True and predicted label matrices shapes are inconsistent %s %s."
% (true_labels.shape, predictions.shape))
_,nC = true_labels.shape
return true_labels, predictions, nC
def macro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics):
true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels)
return np.mean([metric(metric_statistics(true_labels[:, c], predicted_labels[:, c])) for c in range(nC)])
def micro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics):
true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels)
accum = ContTable()
for c in range(nC):
other = metric_statistics(true_labels[:, c], predicted_labels[:, c])
accum = accum + other
return metric(accum)
#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def macroF1(true_labels, predicted_labels):
return macro_average(true_labels,predicted_labels, f1)
#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def microF1(true_labels, predicted_labels):
return micro_average(true_labels, predicted_labels, f1)
#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def macroK(true_labels, predicted_labels):
return macro_average(true_labels,predicted_labels, K)
#true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def microK(true_labels, predicted_labels):
return micro_average(true_labels, predicted_labels, K)
#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
#of the same shape containing real values in [0,1]
def smoothmacroF1(true_labels, posterior_probabilities):
return macro_average(true_labels,posterior_probabilities, f1, metric_statistics=soft_single_metric_statistics)
#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
#of the same shape containing real values in [0,1]
def smoothmicroF1(true_labels, posterior_probabilities):
return micro_average(true_labels, posterior_probabilities, f1, metric_statistics=soft_single_metric_statistics)
#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
#of the same shape containing real values in [0,1]
def smoothmacroK(true_labels, posterior_probabilities):
return macro_average(true_labels,posterior_probabilities, K, metric_statistics=soft_single_metric_statistics)
#true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
#of the same shape containing real values in [0,1]
def smoothmicroK(true_labels, posterior_probabilities):
return micro_average(true_labels, posterior_probabilities, K, metric_statistics=soft_single_metric_statistics)
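# Worked toy example: the second category has no positive examples and no positive predictions, so its F1
# is defined as 1 here (scikit-learn would score it 0), which lifts the macro average above the micro average.
def _demo_metrics_toy():
    import numpy as np
    y_true = np.array([[1, 0], [0, 0], [1, 0]])
    y_pred = np.array([[1, 0], [0, 0], [0, 0]])
    return macroF1(y_true, y_pred), microF1(y_true, y_pred)  # (~0.833, ~0.667)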

33
src/util/results.py Normal file
View File

@ -0,0 +1,33 @@
import os
import pandas as pd
import numpy as np
class PolylingualClassificationResults:
def __init__(self, file, autoflush=True, verbose=False):
self.file = file
self.columns = ['id', 'method', 'learner', 'embed', 'optimp', 'dataset', 'binary', 'languages', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
self.autoflush = autoflush
self.verbose = verbose
if os.path.exists(file):
self.tell('Loading existing file from {}'.format(file))
self.df = pd.read_csv(file, sep='\t')
else:
self.tell('File {} does not exist. Creating new frame.'.format(file))
dir = os.path.dirname(self.file)
if dir and not os.path.exists(dir): os.makedirs(dir)
self.df = pd.DataFrame(columns=self.columns)
def already_calculated(self, id):
return (self.df['id'] == id).any()
def add_row(self, id, method, learner, embed, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
s = pd.Series([id, method, learner, embed, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
self.df = self.df.append(s, ignore_index=True)
if self.autoflush: self.flush()
self.tell(s.to_string())
def flush(self):
self.df.to_csv(self.file, index=False, sep='\t')
def tell(self, msg):
if self.verbose: print(msg)
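# Usage sketch: append one row per (method, language) evaluation to a tab-separated results file. The id and
# metric values are placeholders. Note that DataFrame.append was removed in pandas 2.0; this class targets the
# older pandas API available at the time of writing.
def _demo_results_log(path='results_demo.csv'):
    results = PolylingualClassificationResults(path, autoflush=True, verbose=True)
    if not results.already_calculated(id='toy_run_0'):
        results.add_row(id='toy_run_0', method='Funnelling', learner='svm', embed='none', optimp=False,
                        dataset='toy', binary='not_binary', ablation_lang='not_ablation', time=1.0,
                        lang='en', macrof1=0.83, microf1=0.80, notes='demo entry')
    return results.df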