first commit
commit 2a5d0243db
@@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="PublishConfigData" serverName="anna_isti">
    <serverData>
      <paths name="anna_isti">
        <serverdata>
          <mappings>
            <mapping deploy="/home/andreapdr/funneling_pdr" local="$PROJECT_DIR$" web="/" />
            <mapping deploy="/home/andreapdr/CLESA/embeddings" local="/storage/andrea/FUNNELING/embeddings" />
          </mappings>
          <excludedPaths>
            <excludedPath local="true" path="$PROJECT_DIR$/src/venv" />
            <excludedPath local="true" path="$PROJECT_DIR$/src/pickles/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle" />
            <excludedPath local="true" path="$PROJECT_DIR$/src/results/results.csv" />
          </excludedPaths>
        </serverdata>
      </paths>
    </serverData>
  </component>
</project>

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Encoding" addBOMForNewFiles="with NO BOM" />
</project>

@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="JavaScriptSettings">
    <option name="languageLevel" value="ES6" />
  </component>
  <component name="NodePackageJsonFileManager">
    <packageJsonPaths />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (word-class-embeddings)" project-jdk-type="Python SDK" />
</project>

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/tesi_funneling.iml" filepath="$PROJECT_DIR$/.idea/tesi_funneling.iml" />
    </modules>
  </component>
</project>

@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
      <excludeFolder url="file://$MODULE_DIR$/src/venv" />
    </content>
    <orderEntry type="jdk" jdkName="Python 3.7 (word-class-embeddings)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="TestRunnerService">
    <option name="PROJECT_TEST_RUNNER" value="Unittests" />
  </component>
</module>

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>

@@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="WebServers">
    <option name="servers">
      <webServer id="8f0f329c-a17c-48ba-b459-18d8b1a104e5" name="anna_isti" url="http://anna.isti.cnr.it">
        <fileTransfer host="anna.isti.cnr.it" port="22" accessType="SFTP">
          <advancedOptions>
            <advancedOptions dataProtectionLevel="Private" passiveMode="true" shareSSLContext="true" />
          </advancedOptions>
          <option name="port" value="22" />
        </fileTransfer>
      </webServer>
    </option>
  </component>
</project>

@ -0,0 +1,655 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ChangeListManager">
|
||||
<list default="true" id="f02472ae-5920-4b7a-abc6-0eb6f03dd03f" name="Default Changelist" comment="" />
|
||||
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
|
||||
<option name="SHOW_DIALOG" value="false" />
|
||||
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
||||
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
|
||||
<option name="LAST_RESOLUTION" value="IGNORE" />
|
||||
</component>
|
||||
<component name="CoverageDataManager">
|
||||
<SUITE FILE_PATH="coverage/tesi_funneling$funneling_poly.coverage" NAME="funneling_poly Coverage Results" MODIFIED="1574690332154" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/src" />
|
||||
<SUITE FILE_PATH="coverage/tesi_funneling$last_test.coverage" NAME="last_test Coverage Results" MODIFIED="1574960066673" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/src" />
|
||||
<SUITE FILE_PATH="coverage/tesi_funneling$scratch.coverage" NAME="scratch Coverage Results" MODIFIED="1574759452703" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="true" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/src" />
|
||||
</component>
|
||||
<component name="FileEditorManager">
|
||||
<splitter split-orientation="horizontal" split-proportion="0.5">
|
||||
<split-first>
|
||||
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
|
||||
<file pinned="false" current-in-tab="true">
|
||||
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/torchtext/vocab.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="182">
|
||||
<caret line="321" selection-start-line="321" selection-end-line="321" />
|
||||
<folding>
|
||||
<element signature="e#13891#17737#0" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
</leaf>
|
||||
</split-first>
|
||||
<split-second>
|
||||
<leaf>
|
||||
<file pinned="false" current-in-tab="true">
|
||||
<entry file="file://$PROJECT_DIR$/src/data/embeddings.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="462">
|
||||
<caret line="162" selection-start-line="162" selection-end-line="162" />
|
||||
<folding>
|
||||
<element signature="e#0#9#0" expanded="true" />
|
||||
<element signature="e#222#778#0" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</file>
|
||||
</leaf>
|
||||
</split-second>
|
||||
</splitter>
|
||||
</component>
|
||||
<component name="FileTemplateManagerImpl">
|
||||
<option name="RECENT_TEMPLATES">
|
||||
<list>
|
||||
<option value="Python Script" />
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
<component name="FindInProjectRecents">
|
||||
<findStrings>
|
||||
<find>NaivePolylingualClassifier</find>
|
||||
<find>tra</find>
|
||||
<find>base_l</find>
|
||||
<find>tfidf</find>
|
||||
<find>proba</find>
|
||||
<find>we</find>
|
||||
<find>Wordembeddings</find>
|
||||
<find>hstack</find>
|
||||
<find>ha</find>
|
||||
<find>timeit</find>
|
||||
<find>ti</find>
|
||||
<find>time</find>
|
||||
<find>dot</find>
|
||||
<find>vec</find>
|
||||
<find>_fit_binary</find>
|
||||
<find>oneVs</find>
|
||||
<find>embed</find>
|
||||
<find>no tf-</find>
|
||||
<find>embedding_matrix</find>
|
||||
<find>WordEm</find>
|
||||
<find>WordEmbeddings</find>
|
||||
<find># pretrai</find>
|
||||
<find># [pre</find>
|
||||
<find>joblib</find>
|
||||
</findStrings>
|
||||
</component>
|
||||
<component name="HighlightingSettingsPerFile">
|
||||
<setting file="file://$PROJECT_DIR$/src/learning/learners.py" root0="FORCE_HIGHLIGHTING" />
|
||||
</component>
|
||||
<component name="IdeDocumentHistory">
|
||||
<option name="CHANGED_PATHS">
|
||||
<list>
|
||||
<option value="$PROJECT_DIR$/src/dataset_builder.py" />
|
||||
<option value="$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/multiclass.py" />
|
||||
<option value="$PROJECT_DIR$/src/data/embeddings_fasttext.py" />
|
||||
<option value="$PROJECT_DIR$/src/scratch.py" />
|
||||
<option value="$PROJECT_DIR$/src/data/tsr_function__.py" />
|
||||
<option value="$PROJECT_DIR$/src/data/supervised.py" />
|
||||
<option value="$PROJECT_DIR$/src/last_test.py" />
|
||||
<option value="<8f0f329c-a17c-48ba-b459-18d8b1a104e5>/home/andreapdr/funneling_pdr/src/data/embeddings.py" />
|
||||
<option value="$PROJECT_DIR$/src/util/results.py" />
|
||||
<option value="$PROJECT_DIR$/src/transformers/clesa.py" />
|
||||
<option value="$PROJECT_DIR$/src/transformers/dci.py" />
|
||||
<option value="$PROJECT_DIR$/src/util/evaluation.py" />
|
||||
<option value="$PROJECT_DIR$/src/util/disable_sklearn_warnings.py" />
|
||||
<option value="$PROJECT_DIR$/src/funneling_poly.py" />
|
||||
<option value="$PROJECT_DIR$/src/learning/learners.py" />
|
||||
<option value="$PROJECT_DIR$/src/FPEC_andrea.py" />
|
||||
<option value="$PROJECT_DIR$/src/data/embeddings.py" />
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
<component name="ProjectFrameBounds" extendedState="6">
|
||||
<option name="x" value="5" />
|
||||
<option name="y" value="28" />
|
||||
<option name="width" value="960" />
|
||||
<option name="height" value="1052" />
|
||||
</component>
|
||||
<component name="ProjectView">
|
||||
<navigator proportions="" version="1">
|
||||
<foldersAlwaysOnTop value="true" />
|
||||
</navigator>
|
||||
<panes>
|
||||
<pane id="Scope" />
|
||||
<pane id="ProjectPane">
|
||||
<subPane>
|
||||
<expand>
|
||||
<path>
|
||||
<item name="tesi_funneling" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="tesi_funneling" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="tesi_funneling" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="tesi_funneling" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="src" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="tesi_funneling" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="tesi_funneling" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="src" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="learning" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="tesi_funneling" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="tesi_funneling" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="src" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="pickles" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="tesi_funneling" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="tesi_funneling" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="src" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="results" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="tesi_funneling" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="tesi_funneling" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="src" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="transformers" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
<path>
|
||||
<item name="tesi_funneling" type="b2602c69:ProjectViewProjectNode" />
|
||||
<item name="tesi_funneling" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="src" type="462c0819:PsiDirectoryNode" />
|
||||
<item name="util" type="462c0819:PsiDirectoryNode" />
|
||||
</path>
|
||||
</expand>
|
||||
<select />
|
||||
</subPane>
|
||||
</pane>
|
||||
</panes>
|
||||
</component>
|
||||
<component name="PropertiesComponent">
|
||||
<property name="WebServerToolWindowFactoryState" value="true" />
|
||||
<property name="WebServerToolWindowPanel.toolwindow.highlight.mappings" value="true" />
|
||||
<property name="WebServerToolWindowPanel.toolwindow.highlight.symlinks" value="true" />
|
||||
<property name="WebServerToolWindowPanel.toolwindow.show.date" value="false" />
|
||||
<property name="WebServerToolWindowPanel.toolwindow.show.permissions" value="false" />
|
||||
<property name="WebServerToolWindowPanel.toolwindow.show.size" value="false" />
|
||||
<property name="last_opened_file_path" value="$PROJECT_DIR$/src/results/results.csv" />
|
||||
<property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
|
||||
<property name="nodejs_npm_path_reset_for_default_project" value="true" />
|
||||
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
|
||||
</component>
|
||||
<component name="RecentsManager">
|
||||
<key name="CopyFile.RECENT_KEYS">
|
||||
<recent name="$PROJECT_DIR$/src/util" />
|
||||
<recent name="$PROJECT_DIR$/src/data" />
|
||||
<recent name="$PROJECT_DIR$/src" />
|
||||
<recent name="$PROJECT_DIR$/src/learning" />
|
||||
</key>
|
||||
</component>
|
||||
<component name="RunDashboard">
|
||||
<option name="ruleStates">
|
||||
<list>
|
||||
<RuleState>
|
||||
<option name="name" value="ConfigurationTypeDashboardGroupingRule" />
|
||||
</RuleState>
|
||||
<RuleState>
|
||||
<option name="name" value="StatusDashboardGroupingRule" />
|
||||
</RuleState>
|
||||
</list>
|
||||
</option>
|
||||
</component>
|
||||
<component name="RunManager" selected="Python.last_test">
|
||||
<configuration name="funneling_poly" type="PythonConfigurationType" factoryName="Python" temporary="true">
|
||||
<module name="tesi_funneling" />
|
||||
<option name="INTERPRETER_OPTIONS" value="" />
|
||||
<option name="PARENT_ENVS" value="true" />
|
||||
<envs>
|
||||
<env name="PYTHONUNBUFFERED" value="1" />
|
||||
</envs>
|
||||
<option name="SDK_HOME" value="" />
|
||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/src" />
|
||||
<option name="IS_MODULE_SDK" value="true" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/src/funneling_poly.py" />
|
||||
<option name="PARAMETERS" value="-d "pickles/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle" -w /storage/andrea/FUNNELING/embeddings/" />
|
||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||
<option name="EMULATE_TERMINAL" value="false" />
|
||||
<option name="MODULE_MODE" value="false" />
|
||||
<option name="REDIRECT_INPUT" value="false" />
|
||||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration name="last_test" type="PythonConfigurationType" factoryName="Python">
|
||||
<module name="tesi_funneling" />
|
||||
<option name="INTERPRETER_OPTIONS" value="" />
|
||||
<option name="PARENT_ENVS" value="true" />
|
||||
<envs>
|
||||
<env name="PYTHONUNBUFFERED" value="1" />
|
||||
</envs>
|
||||
<option name="SDK_HOME" value="" />
|
||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/src" />
|
||||
<option name="IS_MODULE_SDK" value="true" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/src/FPEC_andrea.py" />
|
||||
<option name="PARAMETERS" value="-d "pickles/jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle" -w /storage/andrea/FUNNELING/embeddings/ -e unsupervised" />
|
||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||
<option name="EMULATE_TERMINAL" value="false" />
|
||||
<option name="MODULE_MODE" value="false" />
|
||||
<option name="REDIRECT_INPUT" value="false" />
|
||||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<configuration name="scratch" type="PythonConfigurationType" factoryName="Python" temporary="true">
|
||||
<module name="tesi_funneling" />
|
||||
<option name="INTERPRETER_OPTIONS" value="" />
|
||||
<option name="PARENT_ENVS" value="true" />
|
||||
<envs>
|
||||
<env name="PYTHONUNBUFFERED" value="1" />
|
||||
</envs>
|
||||
<option name="SDK_HOME" value="" />
|
||||
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/src" />
|
||||
<option name="IS_MODULE_SDK" value="true" />
|
||||
<option name="ADD_CONTENT_ROOTS" value="true" />
|
||||
<option name="ADD_SOURCE_ROOTS" value="true" />
|
||||
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
|
||||
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/src/scratch.py" />
|
||||
<option name="PARAMETERS" value="" />
|
||||
<option name="SHOW_COMMAND_LINE" value="false" />
|
||||
<option name="EMULATE_TERMINAL" value="false" />
|
||||
<option name="MODULE_MODE" value="false" />
|
||||
<option name="REDIRECT_INPUT" value="false" />
|
||||
<option name="INPUT_FILE" value="" />
|
||||
<method v="2" />
|
||||
</configuration>
|
||||
<list>
|
||||
<item itemvalue="Python.last_test" />
|
||||
<item itemvalue="Python.funneling_poly" />
|
||||
<item itemvalue="Python.scratch" />
|
||||
</list>
|
||||
<recent_temporary>
|
||||
<list>
|
||||
<item itemvalue="Python.scratch" />
|
||||
<item itemvalue="Python.funneling_poly" />
|
||||
</list>
|
||||
</recent_temporary>
|
||||
</component>
|
||||
<component name="SvnConfiguration">
|
||||
<configuration />
|
||||
</component>
|
||||
<component name="TaskManager">
|
||||
<task active="true" id="Default" summary="Default task">
|
||||
<changelist id="f02472ae-5920-4b7a-abc6-0eb6f03dd03f" name="Default Changelist" comment="" />
|
||||
<created>1574680487463</created>
|
||||
<option name="number" value="Default" />
|
||||
<option name="presentableId" value="Default" />
|
||||
<updated>1574680487463</updated>
|
||||
<workItem from="1574680491429" duration="18756000" />
|
||||
<workItem from="1574705313406" duration="1369000" />
|
||||
<workItem from="1574758627235" duration="18313000" />
|
||||
<workItem from="1574845439127" duration="15307000" />
|
||||
<workItem from="1574870087360" duration="629000" />
|
||||
<workItem from="1574871032651" duration="671000" />
|
||||
<workItem from="1574873488200" duration="225000" />
|
||||
<workItem from="1574876908618" duration="140000" />
|
||||
<workItem from="1574877826026" duration="560000" />
|
||||
<workItem from="1574938635317" duration="14980000" />
|
||||
<workItem from="1574958501259" duration="1736000" />
|
||||
</task>
|
||||
<servers />
|
||||
</component>
|
||||
<component name="TimeTrackingManager">
|
||||
<option name="totallyTimeSpent" value="72686000" />
|
||||
</component>
|
||||
<component name="ToolWindowManager">
|
||||
<frame x="0" y="28" width="1920" height="1052" extended-state="6" />
|
||||
<layout>
|
||||
<window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.15544872" />
|
||||
<window_info id="Structure" order="1" side_tool="true" weight="0.25" />
|
||||
<window_info id="Favorites" order="2" side_tool="true" />
|
||||
<window_info anchor="bottom" id="File Transfer" order="0" weight="0.3297414" />
|
||||
<window_info anchor="bottom" id="Message" order="1" />
|
||||
<window_info anchor="bottom" id="Find" order="2" weight="0.3297414" />
|
||||
<window_info anchor="bottom" id="Run" order="3" weight="0.53556037" />
|
||||
<window_info anchor="bottom" id="Debug" order="4" weight="0.5538793" />
|
||||
<window_info anchor="bottom" id="Cvs" order="5" weight="0.25" />
|
||||
<window_info anchor="bottom" id="Inspection" order="6" weight="0.4" />
|
||||
<window_info anchor="bottom" id="TODO" order="7" />
|
||||
<window_info anchor="bottom" id="Docker" order="8" show_stripe_button="false" />
|
||||
<window_info anchor="bottom" id="Version Control" order="9" />
|
||||
<window_info anchor="bottom" id="Database Changes" order="10" />
|
||||
<window_info anchor="bottom" id="Event Log" order="11" side_tool="true" weight="0.3297414" />
|
||||
<window_info anchor="bottom" id="Terminal" order="12" weight="0.42456895" />
|
||||
<window_info anchor="bottom" id="Python Console" order="13" />
|
||||
<window_info anchor="right" id="Remote Host" order="0" weight="0.32959402" />
|
||||
<window_info anchor="right" id="Commander" order="1" weight="0.4" />
|
||||
<window_info anchor="right" id="Ant Build" order="2" weight="0.25" />
|
||||
<window_info anchor="right" content_ui="combo" id="Hierarchy" order="3" weight="0.25" />
|
||||
<window_info anchor="right" id="SciView" order="4" weight="0.5918803" />
|
||||
<window_info anchor="right" id="Database" order="5" />
|
||||
</layout>
|
||||
</component>
|
||||
<component name="TypeScriptGeneratedFilesManager">
|
||||
<option name="version" value="1" />
|
||||
</component>
|
||||
<component name="XDebuggerManager">
|
||||
<breakpoint-manager>
|
||||
<breakpoints>
|
||||
<line-breakpoint enabled="true" suspend="THREAD" type="python-line">
|
||||
<url>file://$PROJECT_DIR$/src/data/embeddings.py</url>
|
||||
<line>162</line>
|
||||
<option name="timeStamp" value="1" />
|
||||
</line-breakpoint>
|
||||
<line-breakpoint enabled="true" suspend="THREAD" type="python-line">
|
||||
<url>file://$PROJECT_DIR$/src/learning/learners.py</url>
|
||||
<line>566</line>
|
||||
<option name="timeStamp" value="2" />
|
||||
</line-breakpoint>
|
||||
</breakpoints>
|
||||
</breakpoint-manager>
|
||||
</component>
|
||||
<component name="editorHistoryManager">
|
||||
<entry file="file://$PROJECT_DIR$/src/data/reader/__init__.py">
|
||||
<provider selected="true" editor-type-id="text-editor" />
|
||||
</entry>
|
||||
<entry file="file://$USER_HOME$/.PyCharm2018.3/system/python_stubs/1539147038/time.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="152">
|
||||
<caret line="308" column="4" selection-start-line="308" selection-start-column="4" selection-end-line="308" selection-end-column="4" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/utils/validation.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="440">
|
||||
<caret line="950" selection-start-line="950" selection-end-line="950" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/scipy/sparse/base.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="746">
|
||||
<caret line="1218" selection-start-line="1218" selection-end-line="1218" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$APPLICATION_HOME_DIR$/helpers/typeshed/stdlib/2and3/time.pyi">
|
||||
<provider selected="true" editor-type-id="text-editor" />
|
||||
</entry>
|
||||
<entry file="file://$USER_HOME$/.PyCharm2018.3/system/python_stubs/1539147038/torch/_C/_TensorBase.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="325">
|
||||
<caret line="887" column="8" selection-start-line="887" selection-start-column="8" selection-end-line="887" selection-end-column="8" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file:///usr/lib/python3.7/multiprocessing/connection.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="183">
|
||||
<caret line="368" selection-start-line="368" selection-end-line="368" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/svm/base.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="201">
|
||||
<caret line="292" selection-start-line="292" selection-end-line="292" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file:///usr/lib/python3.7/selectors.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="167">
|
||||
<caret line="417" selection-start-line="417" selection-end-line="417" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/src/scratch.py" />
|
||||
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/externals/joblib/externals/loky/process_executor.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="191">
|
||||
<caret line="271" selection-start-line="271" selection-end-line="271" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="191">
|
||||
<caret line="566" selection-start-line="566" selection-end-line="566" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file:///usr/lib/python3.7/concurrent/futures/_base.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="101">
|
||||
<caret line="383" selection-start-line="383" selection-end-line="383" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="182">
|
||||
<caret line="224" selection-start-line="224" selection-end-line="224" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/svm/classes.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="20">
|
||||
<caret line="604" selection-start-line="604" selection-end-line="604" />
|
||||
<folding>
|
||||
<element signature="e#18103#25253#1" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/externals/joblib/externals/cloudpickle/cloudpickle.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="166">
|
||||
<caret line="702" selection-start-line="702" selection-end-line="702" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file:///usr/lib/python3.7/pickle.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="166">
|
||||
<caret line="503" selection-start-line="503" selection-end-line="503" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/torch/nn/modules/__init__.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="216">
|
||||
<caret line="12" selection-start-line="12" selection-end-line="12" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$APPLICATION_HOME_DIR$/helpers/pydev/pydevd.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="139">
|
||||
<caret line="1486" selection-start-line="1486" selection-end-line="1486" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$APPLICATION_HOME_DIR$/helpers/pydev/_pydev_bundle/pydev_monkey.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="180">
|
||||
<caret line="12" column="38" lean-forward="true" selection-start-line="12" selection-end-line="13" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/src/data/tsr_function__.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="54">
|
||||
<caret line="3" selection-start-line="3" selection-end-line="3" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/src/data/supervised.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="620">
|
||||
<caret line="67" column="12" selection-start-line="67" selection-start-column="11" selection-end-line="67" selection-end-column="12" />
|
||||
<folding>
|
||||
<element signature="e#0#99#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/src/data/embeddings_fasttext.py" />
|
||||
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/torch/__init__.pyi">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="299">
|
||||
<caret line="491" column="8" selection-start-line="491" selection-start-column="8" selection-end-line="491" selection-end-column="8" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/multiclass.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="217">
|
||||
<caret line="180" selection-start-line="180" selection-end-line="180" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/sklearn/model_selection/_search.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="14">
|
||||
<caret line="366" selection-start-line="366" selection-end-line="366" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file:///usr/lib/python3.7/codecs.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="174">
|
||||
<caret line="309" selection-start-line="309" selection-end-line="309" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/pandas/core/series.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="160">
|
||||
<caret line="303" selection-start-line="303" selection-end-line="303" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/src/transformers/clesa.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="72">
|
||||
<caret line="4" selection-start-line="4" selection-end-line="4" />
|
||||
<folding>
|
||||
<element signature="e#0#18#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/src/transformers/riboc.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state>
|
||||
<folding>
|
||||
<element signature="e#0#11#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/src/util/evaluation.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="18">
|
||||
<caret line="1" column="36" selection-start-line="1" selection-start-column="36" selection-end-line="1" selection-end-column="36" />
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/src/util/file.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state>
|
||||
<folding>
|
||||
<element signature="e#0#32#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/src/util/disable_sklearn_warnings.py" />
|
||||
<entry file="file://$PROJECT_DIR$/src/transformers/dci.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="159">
|
||||
<caret line="9" selection-start-line="9" selection-end-line="9" />
|
||||
<folding>
|
||||
<element signature="e#0#18#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/src/util/results.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="594">
|
||||
<caret line="33" lean-forward="true" selection-start-line="33" selection-end-line="33" />
|
||||
<folding>
|
||||
<element signature="e#0#9#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/src/dataset_builder.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="1314">
|
||||
<caret line="73" selection-start-line="73" selection-end-line="73" />
|
||||
<folding>
|
||||
<element signature="e#0#32#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/src/util/metrics.py">
|
||||
<provider selected="true" editor-type-id="text-editor" />
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/src/funneling_poly.py" />
|
||||
<entry file="file://$PROJECT_DIR$/src/learning/learners.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="2063">
|
||||
<caret line="517" selection-start-line="517" selection-end-line="517" />
|
||||
<folding>
|
||||
<element signature="e#0#18#0" expanded="true" />
|
||||
<element signature="e#23965#24743#0" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/src/data/embeddings.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="462">
|
||||
<caret line="162" selection-start-line="162" selection-end-line="162" />
|
||||
<folding>
|
||||
<element signature="e#0#9#0" expanded="true" />
|
||||
<element signature="e#222#778#0" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/src/FPEC_andrea.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="119">
|
||||
<caret line="99" column="40" lean-forward="true" selection-start-line="74" selection-start-column="6" selection-end-line="99" selection-end-column="40" />
|
||||
<folding>
|
||||
<element signature="e#0#27#0" expanded="true" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
<entry file="file://$PROJECT_DIR$/../word-class-embeddings/src/venv/lib/python3.7/site-packages/torchtext/vocab.py">
|
||||
<provider selected="true" editor-type-id="text-editor">
|
||||
<state relative-caret-position="182">
|
||||
<caret line="321" selection-start-line="321" selection-end-line="321" />
|
||||
<folding>
|
||||
<element signature="e#13891#17737#0" />
|
||||
</folding>
|
||||
</state>
|
||||
</provider>
|
||||
</entry>
|
||||
</component>
|
||||
</project>
|
||||
|
|
@@ -0,0 +1 @@
*.idea

@@ -0,0 +1,151 @@
from sklearn.svm import SVC
import os, sys
from dataset_builder import MultilingualDataset
from learning.learners import *
from util.evaluation import *
from optparse import OptionParser
from util.file import exists
from util.results import PolylingualClassificationResults


parser = OptionParser()

parser.add_option("-d", "--dataset", dest="dataset",
                  help="Path to the multilingual dataset processed and stored in .pickle format")

parser.add_option("-o", "--output", dest="output",
                  help="Result file", type=str, default='./results/results.csv')

parser.add_option("-e", "--mode-embed", dest="mode_embed",
                  help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none')

parser.add_option("-w", "--we-path", dest="we_path",
                  help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/embeddings')

parser.add_option("-s", "--set_c", dest="set_c", type=float,
                  help="Set the C parameter", default=1)

parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
                  help="Optimize hyperparameters", default=False)

parser.add_option("-j", "--n_jobs", dest="n_jobs", type=int,
                  help="Number of parallel jobs (default is -1, all)", default=-1)


def get_learner(calibrate=False, kernel='linear'):
    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1)


def get_params(dense=False):  # TODO: the kernel function could be useful for the meta-classifier
    if not op.optimc:
        return None
    c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
    kernel = 'rbf' if dense else 'linear'
    return [{'kernel': [kernel], 'C': c_range, 'gamma': ['auto']}]

#######################################################################################################################


if __name__ == '__main__':

    (op, args) = parser.parse_args()

    assert exists(op.dataset), 'Unable to find file '+str(op.dataset)
    assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'

    dataset_file = os.path.basename(op.dataset)

    results = PolylingualClassificationResults(op.output)

    data = MultilingualDataset.load(op.dataset)
    data.show_dimensions()

    lXtr, lytr = data.training()
    lXte, lyte = data.test()

    print(lXtr.keys())

    small_lXtr = dict()
    small_lytr = dict()
    small_lXte = dict()
    small_lyte = dict()

    small_lXtr['da'] = lXtr['da'][:50]
    small_lytr['da'] = lytr['da'][:50]
    # small_lXtr['en'] = lXtr['en'][:50]
    # small_lytr['en'] = lytr['en'][:50]
    # small_lXtr['fr'] = lXtr['fr'][:50]
    # small_lytr['fr'] = lytr['fr'][:50]
    # small_lXte['da'] = lXte['da'][:50]
    # small_lyte['da'] = lyte['da'][:50]
    # small_lXte['en'] = lXte['en'][:50]
    # small_lyte['en'] = lyte['en'][:50]
    # small_lXte['fr'] = lXte['fr'][:50]
    # small_lyte['fr'] = lyte['fr'][:50]
    # small_lXtr['it'] = lXtr['it'][:50]
    # small_lytr['it'] = lytr['it'][:50]
    # small_lXtr['es'] = lXtr['es'][:50]
    # small_lytr['es'] = lytr['es'][:50]
    # small_lXtr['de'] = lXtr['de'][:50]
    # small_lytr['de'] = lytr['de'][:50]
    # small_lXtr['pt'] = lXtr['pt'][:50]
    # small_lytr['pt'] = lytr['pt'][:50]
    # small_lXtr['nl'] = lXtr['de'][:50]
    # small_lytr['nl'] = lytr['de'][:50]
    # small_lXtr['fi'] = lXtr['fi'][:50]
    # small_lytr['fi'] = lytr['fi'][:50]
    # small_lXtr['hu'] = lXtr['hu'][:50]
    # small_lytr['hu'] = lytr['hu'][:50]
    # small_lXtr['sv'] = lXtr['sv'][:50]
    # small_lytr['sv'] = lytr['sv'][:50]

    if op.set_c != -1:
        meta_parameters = None
    else:
        meta_parameters = [{'C': [1e3, 1e2, 1e1, 1, 1e-1]}]

    # Embeddings and WCE config
    _available_mode = ['none', 'unsupervised', 'supervised', 'both']
    assert op.mode_embed in _available_mode, f'{op.mode_embed} not in {_available_mode}'

    if op.mode_embed == 'none':
        config = {'unsupervised': False,
                  'supervised': False}
        _config_id = 'None'
    elif op.mode_embed == 'unsupervised':
        config = {'unsupervised': True,
                  'supervised': False}
        _config_id = 'M'
    elif op.mode_embed == 'supervised':
        config = {'unsupervised': False,
                  'supervised': True}
        _config_id = 'F'
    elif op.mode_embed == 'both':
        config = {'unsupervised': True,
                  'supervised': True}
        _config_id = 'M_and_F'

    result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')

    print(f'### PolyEmbedd_andrea_{_config_id}\n')
    classifier = AndreaCLF(op.we_path,
                           config,
                           first_tier_learner=get_learner(calibrate=True),
                           meta_learner=get_learner(calibrate=False),
                           first_tier_parameters=get_params(dense=True),
                           meta_parameters=get_params(dense=True),
                           n_jobs=op.n_jobs)

    print('# Fitting ...')
    classifier.fit(small_lXtr, small_lytr)

    print('# Evaluating ...')
    l_eval = evaluate_method(classifier, lXte, lyte)

    metrics = []
    for lang in lXte.keys():
        macrof1, microf1, macrok, microk = l_eval[lang]
        metrics.append([macrof1, microf1, macrok, microk])
        print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
        results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, op.optimc, 'test_datasetname', 'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, 'nope')
    print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
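
The per-language debugging subsets above are built with one hard-coded pair of lines per language, most of them commented out. A minimal sketch of an equivalent loop, using a hypothetical helper that is not part of the committed file and assuming the same language-indexed lXtr/lytr dictionaries returned by MultilingualDataset:

def subsample(lX, ly, langs, n=50):
    # Keep only the first n documents and labels for each requested language.
    small_lX, small_ly = {}, {}
    for lang in langs:
        small_lX[lang] = lX[lang][:n]
        small_ly[lang] = ly[lang][:n]
    return small_lX, small_ly

# e.g. small_lXtr, small_lytr = subsample(lXtr, lytr, langs=['da'])
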
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,196 @@
import os
import pickle
import numpy as np
from torchtext.vocab import Vectors
import torch
from abc import ABC, abstractmethod
from data.supervised import get_supervised_embeddings


class PretrainedEmbeddings(ABC):

    def __init__(self):
        super().__init__()

    @abstractmethod
    def vocabulary(self): pass

    @abstractmethod
    def dim(self): pass

    @classmethod
    def reindex(cls, words, word2index):
        source_idx, target_idx = [], []
        for i, word in enumerate(words):
            if word not in word2index: continue
            j = word2index[word]
            source_idx.append(i)
            target_idx.append(j)
        source_idx = np.asarray(source_idx)
        target_idx = np.asarray(target_idx)
        return source_idx, target_idx


class WordEmbeddings:

    def __init__(self, lang, we, worddim):
        self.lang = lang
        self.we = we
        self.worddim = worddim
        self.dimword = {v:k for k,v in self.worddim.items()}

    @classmethod
    def load(cls, basedir, lang, word_preprocessor=None, dopickle=True):
        filename = 'wiki.multi.{}.vec'.format(lang)
        we_path = os.path.join(basedir, filename)

        if dopickle and os.path.exists(we_path + '.pkl'):
            print('loading pkl in {}'.format(we_path + '.pkl'))
            (worddim, we) = pickle.load(open(we_path + '.pkl', 'rb'))
        else:
            word_registry = set()
            lines = open(we_path).readlines()
            nwords, dims = [int(x) for x in lines[0].split()]
            print('reading we of {} dimensions'.format(dims))
            we = np.zeros((nwords, dims), dtype=float)
            worddim = {}
            index = 0
            for i, line in enumerate(lines[1:]):
                if (i + 1) % 100 == 0:
                    print('\r{}/{}'.format(i + 1, len(lines)), end='')
                word, *vals = line.split()
                wordp = word_preprocessor(word) if word_preprocessor is not None else word
                if wordp:
                    wordp = wordp[0]
                    if wordp in word_registry:
                        print('warning: word <{}> generates a duplicate <{}> after preprocessing'.format(word, wordp))
                    elif len(vals) == dims:
                        worddim[wordp] = index
                        we[index, :] = np.array(vals).astype(float)
                        index += 1
                # else:
                #     print('warning: word <{}> generates an empty string after preprocessing'.format(word))
            we = we[:index]
            print('load {} words'.format(index))
            if dopickle:
                print('saving...')
                pickle.dump((worddim, we), open(we_path + '.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)

        return WordEmbeddings(lang, we, worddim)

    def vocabulary(self):
        return set(self.worddim.keys())

    def __getitem__(self, key):
        return self.we[self.worddim[key]]

    def dim(self):
        return self.we.shape[1]

    def __contains__(self, key):
        return key in self.worddim

    def most_similar(self, word_vect, k):
        if word_vect.ndim == 1:
            word_vect = word_vect.reshape(1, -1)
        assert word_vect.shape[1] == self.dim(), 'inconsistent dimensions'

        sim = np.dot(word_vect, self.we.T)
        order = np.argsort(-1*sim, axis=1)[:, :k]

        similar_words = [[self.dimword[order[vi, ki]] for ki in range(k)] for vi in range(word_vect.shape[0])]
        sim_scores = sim[:, order]
        return similar_words, sim_scores

    def get_vectors(self, wordlist):
        indexes = np.array([self.worddim[w] for w in wordlist])
        return self.we[indexes]

    def restrict(self, vocabulary):
        # vocabulary is a set of terms to be kept
        active_vocabulary = sorted([w for w in vocabulary if w in self.worddim])
        lost = len(vocabulary)-len(active_vocabulary)
        if lost > 0:  # some terms are missing, so they will be replaced by UNK
            print('warning: missing {} terms for lang {}'.format(lost, self.lang))
        self.we = self.get_vectors(active_vocabulary)
        assert self.we.shape[0] == len(active_vocabulary)
        self.dimword = {i:w for i,w in enumerate(active_vocabulary)}
        self.worddim = {w:i for i,w in enumerate(active_vocabulary)}
        return self

    @classmethod
    def load_poly(cls, basedir, langs, lang_vocabularies=None, word_preprocessor=None):
        if lang_vocabularies is None:
            return cls.merge([cls.load(basedir, lang, word_preprocessor) for lang in langs])
        else:
            # assert all([l in lang_vocabularies for l in langs]), 'missing vocabulary for some languages'
            return cls.merge([cls.load(basedir, lang, word_preprocessor).restrict(lang_vocabularies[lang]) for lang in langs])

    @classmethod
    def merge(cls, we_list):
        assert all([isinstance(we, WordEmbeddings) for we in we_list]), \
            'instances of {} expected'.format(WordEmbeddings.__name__)

        polywe = []
        worddim = {}
        offset = 0
        for we in we_list:
            polywe.append(we.we)
            worddim.update({'{}::{}'.format(we.lang, w): d+offset for w, d in we.worddim.items()})
            offset = len(worddim)
        polywe = np.vstack(polywe)

        return WordEmbeddings(lang='poly', we=polywe, worddim=worddim)


class FastTextWikiNews(Vectors):

    url_base = 'Cant auto-download MUSE embeddings'
    path = '/storage/andrea/FUNNELING/embeddings/wiki.multi.{}.vec'

    def __init__(self, cache, language="en", **kwargs):
        url = self.url_base.format(language)
        name = self.path.format(language)
        super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)


class FastTextMUSE(PretrainedEmbeddings):

    def __init__(self, path, lang, limit=None):
        super().__init__()
        print(f'Loading fastText pretrained vectors from {path}')
        assert os.path.exists(path), print(f'pre-trained vectors not found in {path}')
        self.embed = FastTextWikiNews(path, lang, max_vectors=limit)
        # print('Done')

    def vocabulary(self):
        return set(self.embed.stoi.keys())

    def dim(self):
        return self.embed.dim

    def extract(self, words):
        source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
        extraction = torch.zeros((len(words), self.dim()))
        extraction[source_idx] = self.embed.vectors[target_idx]
        return extraction


def embedding_matrix(path, voc, lang):
    vocabulary = np.asarray(list(zip(*sorted(voc.items(), key=lambda x: x[1])))[0])

    print('[embedding matrix]')
    print(f'# [pretrained-matrix: FastTextMUSE {lang}]')
    pretrained = FastTextMUSE(path, lang)
    P = pretrained.extract(vocabulary).numpy()
    del pretrained
    print(f'[embedding matrix done] of shape={P.shape}\n')

    return vocabulary, P


def WCE_matrix(Xtr, Ytr, lang):
    print('\n# [supervised-matrix]')
    S = get_supervised_embeddings(Xtr[lang], Ytr[lang])
    print(f'[embedding matrix done] of shape={S.shape}\n')
    return S
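
FastTextMUSE.extract builds a row-aligned embedding matrix for a target vocabulary by way of PretrainedEmbeddings.reindex, leaving all-zero rows for out-of-vocabulary words. A minimal self-contained sketch of that alignment step, with a toy vocabulary standing in for the MUSE wiki.multi.<lang>.vec files:

import torch

# Toy stand-ins for self.embed.stoi and self.embed.vectors of FastTextWikiNews.
stoi = {'dog': 0, 'cat': 1, 'house': 2}
vectors = torch.eye(3)                      # three known words, three-dimensional vectors

words = ['cat', 'unknownword', 'dog']       # target vocabulary, in matrix row order
source_idx, target_idx = [], []
for i, w in enumerate(words):               # same alignment logic as PretrainedEmbeddings.reindex
    if w not in stoi:
        continue
    source_idx.append(i)
    target_idx.append(stoi[w])

extraction = torch.zeros((len(words), vectors.shape[1]))
extraction[source_idx] = vectors[target_idx]
print(extraction)                           # the 'unknownword' row stays all zeros
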
@@ -0,0 +1,42 @@
"""
bg = Bulgarian
cs = Czech
da = Danish
de = German
el = Greek
en = English
es = Spanish
et = Estonian
fi = Finnish
fr = French
hu = Hungarian
it = Italian
lt = Lithuanian
lv = Latvian
nl = Dutch
mt = Maltese
pl = Polish
pt = Portuguese
ro = Romanian
sk = Slovak
sl = Slovene
sv = Swedish
"""

NLTK_LANGMAP = {'da': 'danish', 'nl': 'dutch', 'en': 'english', 'fi': 'finnish', 'fr': 'french', 'de': 'german',
                'hu': 'hungarian', 'it': 'italian', 'pt': 'portuguese', 'ro': 'romanian', 'es': 'spanish', 'sv': 'swedish'}


# top 10 languages in Wikipedia, ordered by number of articles
# LANGS_10_MOST_WIKI = ['en','fr','sv','de','es','it','pt','nl','pl','ro']

# all languages in JRC-Acquis v3
JRC_LANGS = ['bg','cs','da','de','el','en','es','et','fi','fr','hu','it','lt','lv','mt','nl','pl','pt','ro','sk','sl','sv']
JRC_LANGS_WITH_NLTK_STEMMING = ['da', 'nl', 'en', 'fi', 'fr', 'de', 'hu', 'it', 'pt', 'es', 'sv']  # Romanian removed for incompatibility issues

RCV2_LANGS = ['ru', 'de', 'fr', 'sv', 'no', 'da', 'pt', 'it', 'es', 'jp', 'htw', 'nl']
RCV2_LANGS_WITH_NLTK_STEMMING = ['de', 'fr', 'sv', 'da', 'pt', 'it', 'es', 'nl']

lang_set = {'JRC_NLTK': JRC_LANGS_WITH_NLTK_STEMMING, 'JRC': JRC_LANGS,
            'RCV2_NLTK': RCV2_LANGS_WITH_NLTK_STEMMING, 'RCV2': RCV2_LANGS}
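
NLTK_LANGMAP is what lets the two-letter JRC/RCV2 language codes be handed to NLTK's Snowball stemmers. A minimal sketch, assuming NLTK is installed and the project's src/ directory is on the path so that the module is importable as data.languages:

from nltk.stem.snowball import SnowballStemmer
from data.languages import NLTK_LANGMAP, lang_set

# One Snowball stemmer per JRC-Acquis language that NLTK supports.
stemmers = {lang: SnowballStemmer(NLTK_LANGMAP[lang]) for lang in lang_set['JRC_NLTK']}
print(stemmers['da'].stem('bøgerne'))
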
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,321 @@
|
|||
from __future__ import print_function
|
||||
import os, sys
|
||||
from os.path import join
|
||||
import tarfile
|
||||
import xml.etree.ElementTree as ET
|
||||
from sklearn.datasets import get_data_home
|
||||
import pickle
|
||||
from util.file import download_file, list_dirs, list_files
|
||||
import rdflib
|
||||
from rdflib.namespace import RDF, SKOS
|
||||
from rdflib import URIRef
|
||||
import zipfile
|
||||
from data.languages import JRC_LANGS
|
||||
from collections import Counter
|
||||
from random import shuffle
|
||||
from data.languages import lang_set
|
||||
|
||||
"""
|
||||
JRC Acquis' Nomenclature:
|
||||
bg = Bulgarian
|
||||
cs = Czech
|
||||
da = Danish
|
||||
de = German
|
||||
el = Greek
|
||||
en = English
|
||||
es = Spanish
|
||||
et = Estonian
|
||||
fi = Finnish
|
||||
fr = French
|
||||
hu = Hungarian
|
||||
it = Italian
|
||||
lt = Lithuanian
|
||||
lv = Latvian
|
||||
nl = Dutch
|
||||
mt = Maltese
|
||||
pl = Polish
|
||||
pt = Portuguese
|
||||
ro = Romanian
|
||||
sk = Slovak
|
||||
sl = Slovene
|
||||
sv = Swedish
|
||||
"""
|
||||
|
||||
class JRCAcquis_Document:
|
||||
def __init__(self, id, name, lang, year, head, body, categories):
|
||||
self.id = id
|
||||
self.parallel_id = name
|
||||
self.lang = lang
|
||||
self.year = year
|
||||
self.text = body if not head else head + "\n" + body
|
||||
self.categories = categories
|
||||
|
||||
# this is a workaround... for some reason, acutes are codified in a non-standard manner in titles
|
||||
# however, it seems that the title is often appearing as the first paragraph in the text/body (with
|
||||
# standard codification), so it might be preferable not to read the header after all (as here by default)
|
||||
def _proc_acute(text):
|
||||
for ch in ['a','e','i','o','u']:
|
||||
text = text.replace('%'+ch+'acute%',ch)
|
||||
return text
|
||||
|
||||
def parse_document(file, year, head=False):
|
||||
root = ET.parse(file).getroot()
|
||||
|
||||
doc_name = root.attrib['n'] # e.g., '22006A0211(01)'
|
||||
doc_lang = root.attrib['lang'] # e.g., 'es'
|
||||
doc_id = root.attrib['id'] # e.g., 'jrc22006A0211_01-es'
|
||||
doc_categories = [cat.text for cat in root.findall('.//teiHeader/profileDesc/textClass/classCode[@scheme="eurovoc"]')]
|
||||
doc_head = _proc_acute(root.find('.//text/body/head').text) if head else ''
|
||||
doc_body = '\n'.join([p.text for p in root.findall('.//text/body/div[@type="body"]/p')])
|
||||
|
||||
def raise_if_empty(field, from_file):
|
||||
if isinstance(field, str):
|
||||
if not field.strip():
|
||||
raise ValueError("Empty field in file %s" % from_file)
|
||||
|
||||
raise_if_empty(doc_name, file)
|
||||
raise_if_empty(doc_lang, file)
|
||||
raise_if_empty(doc_id, file)
|
||||
if head: raise_if_empty(doc_head, file)
|
||||
raise_if_empty(doc_body, file)
|
||||
|
||||
return JRCAcquis_Document(id=doc_id, name=doc_name, lang=doc_lang, year=year, head=doc_head, body=doc_body, categories=doc_categories)
|
||||
|
||||
# removes documents without a counterpart in all other languages
|
||||
def _force_parallel(doclist, langs):
|
||||
n_langs = len(langs)
|
||||
par_id_count = Counter([d.parallel_id for d in doclist])
|
||||
parallel_doc_ids = set([id for id,count in par_id_count.items() if count==n_langs])
|
||||
return [doc for doc in doclist if doc.parallel_id in parallel_doc_ids]
|
||||
|
||||
def random_sampling_avoiding_parallel(doclist):
|
||||
random_order = list(range(len(doclist)))
|
||||
shuffle(random_order)
|
||||
sampled_request = []
|
||||
parallel_ids = set()
|
||||
for ind in random_order:
|
||||
pid = doclist[ind].parallel_id
|
||||
if pid not in parallel_ids:
|
||||
sampled_request.append(doclist[ind])
|
||||
parallel_ids.add(pid)
|
||||
print('random_sampling_no_parallel:: from {} documents to {} documents'.format(len(doclist), len(sampled_request)))
|
||||
return sampled_request
|
||||
|
||||
|
||||
#filters out documents which do not contain any category in the cat_filter list, and filter all labels not in cat_filter
|
||||
def _filter_by_category(doclist, cat_filter):
|
||||
if not isinstance(cat_filter, frozenset):
|
||||
cat_filter = frozenset(cat_filter)
|
||||
filtered = []
|
||||
for doc in doclist:
|
||||
doc.categories = list(cat_filter & set(doc.categories))
|
||||
if doc.categories:
|
||||
doc.categories.sort()
|
||||
filtered.append(doc)
|
||||
print("filtered %d documents out without categories in the filter list" % (len(doclist) - len(filtered)))
|
||||
return filtered
|
||||
|
||||
#filters out categories with less than cat_threshold documents (and filters documents containing those categories)
|
||||
def _filter_by_frequency(doclist, cat_threshold):
|
||||
cat_count = Counter()
|
||||
for d in doclist:
|
||||
cat_count.update(d.categories)
|
||||
|
||||
freq_categories = [cat for cat,count in cat_count.items() if count>cat_threshold]
|
||||
freq_categories.sort()
|
||||
return _filter_by_category(doclist, freq_categories), freq_categories
|
||||
|
||||
#select top most_frequent categories (and filters documents containing those categories)
|
||||
def _most_common(doclist, most_frequent):
|
||||
cat_count = Counter()
|
||||
for d in doclist:
|
||||
cat_count.update(d.categories)
|
||||
|
||||
freq_categories = [cat for cat,count in cat_count.most_common(most_frequent)]
|
||||
freq_categories.sort()
|
||||
return _filter_by_category(doclist, freq_categories), freq_categories
|
||||
|
||||
def _get_categories(request):
|
||||
final_cats = set()
|
||||
for d in request:
|
||||
final_cats.update(d.categories)
|
||||
return list(final_cats)
|
||||
|
||||
def fetch_jrcacquis(langs=None, data_path=None, years=None, ignore_unclassified=True, cat_filter=None, cat_threshold=0,
|
||||
parallel=None, most_frequent=-1, DOWNLOAD_URL_BASE ='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'):
|
||||
|
||||
assert parallel in [None, 'force', 'avoid'], 'parallel mode not supported'
|
||||
if not langs:
|
||||
langs = JRC_LANGS
|
||||
else:
|
||||
if isinstance(langs, str): langs = [langs]
|
||||
for l in langs:
|
||||
if l not in JRC_LANGS:
|
||||
raise ValueError('Language %s is not among the valid languages in JRC-Acquis v3' % l)
|
||||
|
||||
if not data_path:
|
||||
data_path = get_data_home()
|
||||
|
||||
if not os.path.exists(data_path):
|
||||
os.mkdir(data_path)
|
||||
|
||||
request = []
|
||||
total_read = 0
|
||||
for l in langs:
|
||||
file_name = 'jrc-'+l+'.tgz'
|
||||
archive_path = join(data_path, file_name)
|
||||
|
||||
if not os.path.exists(archive_path):
|
||||
print("downloading language-specific dataset (once and for all) into %s" % data_path)
|
||||
DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name)
|
||||
download_file(DOWNLOAD_URL, archive_path)
|
||||
print("untarring dataset...")
|
||||
tarfile.open(archive_path, 'r:gz').extractall(data_path)
|
||||
|
||||
documents_dir = join(data_path, l)
|
||||
|
||||
print("Reading documents...")
|
||||
read = 0
|
||||
for dir in list_dirs(documents_dir):
|
||||
year = int(dir)
|
||||
if years is None or year in years:
|
||||
year_dir = join(documents_dir,dir)
|
||||
pickle_name = join(data_path, 'jrc_' + l + '_' + dir + '.pickle')
|
||||
if os.path.exists(pickle_name):
|
||||
print("loading from file %s" % pickle_name)
|
||||
l_y_documents = pickle.load(open(pickle_name, "rb"))
|
||||
read += len(l_y_documents)
|
||||
else:
|
||||
l_y_documents = []
|
||||
all_documents = list_files(year_dir)
|
||||
empty = 0
|
||||
for i,doc_file in enumerate(all_documents):
|
||||
try:
|
||||
jrc_doc = parse_document(join(year_dir, doc_file), year)
|
||||
except ValueError:
|
||||
jrc_doc = None
|
||||
|
||||
if jrc_doc and (not ignore_unclassified or jrc_doc.categories):
|
||||
l_y_documents.append(jrc_doc)
|
||||
else: empty += 1
|
||||
if len(all_documents)>50 and ((i+1) % (len(all_documents)//50) == 0):
|
||||
print('\r\tfrom %s: completed %d%%' % (year_dir, int((i+1)*100.0/len(all_documents))), end='')
|
||||
read+=1
|
||||
print('\r\tfrom %s: completed 100%%, read %d documents (discarded %d without categories or empty fields)\n' % (year_dir, i+1, empty), end='')
|
||||
print("\t\t(Pickling object for future runs in %s)" % pickle_name)
|
||||
pickle.dump(l_y_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
request += l_y_documents
|
||||
print("Read %d documents for language %s\n" % (read, l))
|
||||
total_read += read
|
||||
print("Read %d documents in total" % (total_read))
|
||||
|
||||
if parallel=='force':
|
||||
request = _force_parallel(request, langs)
|
||||
elif parallel == 'avoid':
|
||||
request = random_sampling_avoiding_parallel(request)
|
||||
|
||||
final_cats = _get_categories(request)
|
||||
|
||||
if cat_filter:
|
||||
request = _filter_by_category(request, cat_filter)
|
||||
final_cats = _get_categories(request)
|
||||
if cat_threshold > 0:
|
||||
request, final_cats = _filter_by_frequency(request, cat_threshold)
|
||||
if most_frequent != -1 and len(final_cats) > most_frequent:
|
||||
request, final_cats = _most_common(request, most_frequent)
|
||||
|
||||
return request, final_cats
|
||||
|
||||
def print_cat_analysis(request):
|
||||
cat_count = Counter()
|
||||
for d in request:
|
||||
cat_count.update(d.categories)
|
||||
print("Number of active categories: {}".format(len(cat_count)))
|
||||
print(cat_count.most_common())
|
||||
|
||||
# inspects the Eurovoc thesaurus in order to select a subset of categories
|
||||
# currently, the 'broadest' policy (i.e., all categories with no parent category), 'leaves' (those with no narrower concept), and 'all' are implemented
|
||||
def inspect_eurovoc(data_path, eurovoc_skos_core_concepts_filename='eurovoc_in_skos_core_concepts.rdf',
|
||||
eurovoc_url="http://publications.europa.eu/mdr/resource/thesaurus/eurovoc-20160630-0/skos/eurovoc_in_skos_core_concepts.zip",
|
||||
select="broadest"):
|
||||
|
||||
fullpath_pickle = join(data_path, select+'_concepts.pickle')
|
||||
if os.path.exists(fullpath_pickle):
|
||||
print("Pickled object found in %s. Loading it." % fullpath_pickle)
|
||||
return pickle.load(open(fullpath_pickle,'rb'))
|
||||
|
||||
fullpath = join(data_path, eurovoc_skos_core_concepts_filename)
|
||||
if not os.path.exists(fullpath):
|
||||
print("Path %s does not exist. Trying to download the skos EuroVoc file from %s" % (data_path, eurovoc_url))
|
||||
download_file(eurovoc_url, fullpath + '.zip')
|
||||
print("Unzipping file...")
|
||||
zipped = zipfile.ZipFile(fullpath + '.zip', 'r')
|
||||
zipped.extract("eurovoc_in_skos_core_concepts.rdf", data_path)
|
||||
zipped.close()
|
||||
|
||||
print("Parsing %s" %fullpath)
|
||||
g = rdflib.Graph()
|
||||
g.parse(location=fullpath, format="application/rdf+xml")
|
||||
|
||||
if select == "all":
|
||||
print("Selecting all concepts")
|
||||
all_concepts = list(g.subjects(RDF.type, SKOS.Concept))
|
||||
all_concepts = [c.toPython().split('/')[-1] for c in all_concepts]
|
||||
all_concepts.sort()
|
||||
selected_concepts = all_concepts
|
||||
elif select=="broadest":
|
||||
print("Selecting broadest concepts (those without any other broader concept linked to it)")
|
||||
all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
|
||||
narrower_concepts = set(g.subjects(SKOS.broader, None))
|
||||
broadest_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - narrower_concepts)]
|
||||
broadest_concepts.sort()
|
||||
selected_concepts = broadest_concepts
|
||||
elif select=="leaves":
|
||||
print("Selecting leaves concepts (those not linked as broader of any other concept)")
|
||||
all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
|
||||
broad_concepts = set(g.objects(None, SKOS.broader))
|
||||
leave_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - broad_concepts)]
|
||||
leave_concepts.sort()
|
||||
selected_concepts = leave_concepts
|
||||
else:
|
||||
raise ValueError("Selection policy %s is not currently supported" % select)
|
||||
|
||||
print("%d %s concepts found" % (len(selected_concepts), leave_concepts))
|
||||
print("Pickling concept list for faster further requests in %s" % fullpath_pickle)
|
||||
pickle.dump(selected_concepts, open(fullpath_pickle, 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
return selected_concepts
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
def single_label_fragment(doclist):
|
||||
single = [d for d in doclist if len(d.categories) < 2]
|
||||
final_categories = set([d.categories[0] for d in single if d.categories])
|
||||
print('{} single-label documents ({} categories) from the original {} documents'.format(len(single),
|
||||
len(final_categories),
|
||||
len(doclist)))
|
||||
return single, list(final_categories)
|
||||
|
||||
train_years = list(range(1986, 2006))
|
||||
test_years = [2006]
|
||||
cat_policy = 'leaves'
|
||||
most_common_cat = 300
|
||||
# JRC_DATAPATH = "/media/moreo/1TB Volume/Datasets/JRC_Acquis_v3"
|
||||
JRC_DATAPATH = "/storage/andrea/FUNNELING/data/JRC_Acquis_v3"
|
||||
langs = lang_set['JRC_NLTK']
|
||||
cat_list = inspect_eurovoc(JRC_DATAPATH, select=cat_policy)
|
||||
sys.exit()
|
||||
|
||||
training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=train_years,cat_filter=cat_list, cat_threshold=1, parallel=None,most_frequent=most_common_cat)
|
||||
test_docs, label_namestest = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=test_years, cat_filter=label_names,parallel='force')
|
||||
|
||||
print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names)))
|
||||
print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest)))
|
||||
|
||||
training_docs, label_names = single_label_fragment(training_docs)
|
||||
test_docs, label_namestest = single_label_fragment(test_docs)
|
||||
|
||||
print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names)))
|
||||
print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest)))
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,225 @@
|
|||
from zipfile import ZipFile
|
||||
import xml.etree.ElementTree as ET
|
||||
from data.languages import RCV2_LANGS_WITH_NLTK_STEMMING, RCV2_LANGS
|
||||
from util.file import list_files
|
||||
from sklearn.datasets import get_data_home
|
||||
import gzip
|
||||
from os.path import join, exists
|
||||
from util.file import download_file_if_not_exists
|
||||
import re
|
||||
from collections import Counter
|
||||
import numpy as np
|
||||
import sys
|
||||
|
||||
"""
|
||||
RCV2's Nomenclature:
|
||||
ru = Russian
|
||||
da = Danish
|
||||
de = German
|
||||
es = Spanish
|
||||
lat = Latin-American Spanish (actually also marked as 'es' in the collection)
|
||||
fr = French
|
||||
it = Italian
|
||||
nl = Dutch
|
||||
pt = Portuguese
|
||||
sv = Swedish
|
||||
ja = Japanese
|
||||
htw = Chinese
|
||||
no = Norwegian
|
||||
"""
|
||||
|
||||
RCV1_TOPICHIER_URL = "http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a02-orig-topics-hierarchy/rcv1.topics.hier.orig"
|
||||
RCV1PROC_BASE_URL= 'http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files'
|
||||
RCV1_BASE_URL = "http://www.daviddlewis.com/resources/testcollections/rcv1/"
|
||||
RCV2_BASE_URL = "http://trec.nist.gov/data/reuters/reuters.html"
|
||||
|
||||
rcv1_test_data_gz = ['lyrl2004_tokens_test_pt0.dat.gz',
|
||||
'lyrl2004_tokens_test_pt1.dat.gz',
|
||||
'lyrl2004_tokens_test_pt2.dat.gz',
|
||||
'lyrl2004_tokens_test_pt3.dat.gz']
|
||||
|
||||
rcv1_train_data_gz = ['lyrl2004_tokens_train.dat.gz']
|
||||
|
||||
rcv1_doc_cats_data_gz = 'rcv1-v2.topics.qrels.gz'
|
||||
|
||||
RCV2_LANG_DIR = {'ru':'REUTE000',
|
||||
'de':'REUTE00A',
|
||||
'fr':'REUTE00B',
|
||||
'sv':'REUTE001',
|
||||
'no':'REUTE002',
|
||||
'da':'REUTE003',
|
||||
'pt':'REUTE004',
|
||||
'it':'REUTE005',
|
||||
'es':'REUTE006',
|
||||
'lat':'REUTE007',
|
||||
'jp':'REUTE008',
|
||||
'htw':'REUTE009',
|
||||
'nl':'REUTERS_'}
|
||||
|
||||
|
||||
class RCV_Document:
|
||||
|
||||
def __init__(self, id, text, categories, date='', lang=None):
|
||||
self.id = id
|
||||
self.date = date
|
||||
self.lang = lang
|
||||
self.text = text
|
||||
self.categories = categories
|
||||
|
||||
|
||||
class ExpectedLanguageException(Exception): pass
|
||||
class IDRangeException(Exception): pass
|
||||
|
||||
|
||||
nwords = []
|
||||
|
||||
def parse_document(xml_content, assert_lang=None, valid_id_range=None):
|
||||
root = ET.fromstring(xml_content)
|
||||
if assert_lang:
|
||||
if assert_lang not in root.attrib.values():
|
||||
if assert_lang != 'jp' or 'ja' not in root.attrib.values(): # some documents are attributed to 'ja', others to 'jp'
|
||||
raise ExpectedLanguageException('error: document of a different language')
|
||||
|
||||
doc_id = root.attrib['itemid']
|
||||
if valid_id_range is not None:
|
||||
if not valid_id_range[0] <= int(doc_id) <= valid_id_range[1]:
|
||||
raise IDRangeException
|
||||
|
||||
doc_categories = [cat.attrib['code'] for cat in
|
||||
root.findall('.//metadata/codes[@class="bip:topics:1.0"]/code')]
|
||||
|
||||
doc_date = root.attrib['date']
|
||||
doc_title = root.find('.//title').text
|
||||
doc_headline = root.find('.//headline').text
|
||||
doc_body = '\n'.join([p.text for p in root.findall('.//text/p')])
|
||||
|
||||
if not doc_body:
|
||||
raise ValueError('Empty document')
|
||||
|
||||
if doc_title is None: doc_title = ''
|
||||
if doc_headline is None or doc_headline in doc_title: doc_headline = ''
|
||||
text = '\n'.join([doc_title, doc_headline, doc_body]).strip()
|
||||
|
||||
text_length = len(text.split())
|
||||
global nwords
|
||||
nwords.append(text_length)
|
||||
|
||||
return RCV_Document(id=doc_id, text=text, categories=doc_categories, date=doc_date, lang=assert_lang)
|
||||
|
||||
|
||||
def fetch_RCV1(data_path, split='all'):
|
||||
|
||||
assert split in ['train', 'test', 'all'], 'split should be "train", "test", or "all"'
|
||||
|
||||
request = []
|
||||
labels = set()
|
||||
read_documents = 0
|
||||
lang = 'en'
|
||||
|
||||
training_documents = 23149
|
||||
test_documents = 781265
|
||||
|
||||
if split == 'all':
|
||||
split_range = (2286, 810596)
|
||||
expected = training_documents+test_documents
|
||||
elif split == 'train':
|
||||
split_range = (2286, 26150)
|
||||
expected = training_documents
|
||||
else:
|
||||
split_range = (26151, 810596)
|
||||
expected = test_documents
|
||||
|
||||
global nwords
|
||||
nwords=[]
|
||||
for part in list_files(data_path):
|
||||
if not re.match(r'\d+\.zip', part): continue
|
||||
target_file = join(data_path, part)
|
||||
assert exists(target_file), \
|
||||
"You don't seem to have the file "+part+" in " + data_path + ", and the RCV1 corpus can not be downloaded"+\
|
||||
" w/o a formal permission. Please, refer to " + RCV1_BASE_URL + " for more information."
|
||||
zipfile = ZipFile(target_file)
|
||||
for xmlfile in zipfile.namelist():
|
||||
xmlcontent = zipfile.open(xmlfile).read()
|
||||
try:
|
||||
doc = parse_document(xmlcontent, assert_lang=lang, valid_id_range=split_range)
|
||||
labels.update(doc.categories)
|
||||
request.append(doc)
|
||||
read_documents += 1
|
||||
except ValueError:
|
||||
print('\n\tskipping document {} with inconsistent language label: expected language {}'.format(part+'/'+xmlfile, lang))
|
||||
except (IDRangeException, ExpectedLanguageException) as e:
|
||||
pass
|
||||
print('\r[{}] read {} documents'.format(part, len(request)), end='')
|
||||
if read_documents == expected: break
|
||||
if read_documents == expected: break
|
||||
print()
|
||||
print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords)))
|
||||
return request, list(labels)
|
||||
|
||||
|
||||
def fetch_RCV2(data_path, languages=None):
|
||||
|
||||
if not languages:
|
||||
languages = list(RCV2_LANG_DIR.keys())
|
||||
else:
|
||||
assert set(languages).issubset(set(RCV2_LANG_DIR.keys())), 'languages not in scope'
|
||||
|
||||
request = []
|
||||
labels = set()
|
||||
global nwords
|
||||
nwords=[]
|
||||
for lang in languages:
|
||||
path = join(data_path, RCV2_LANG_DIR[lang])
|
||||
lang_docs_read = 0
|
||||
for part in list_files(path):
|
||||
target_file = join(path, part)
|
||||
assert exists(target_file), \
|
||||
"You don't seem to have the file "+part+" in " + path + ", and the RCV2 corpus can not be downloaded"+\
|
||||
" w/o a formal permission. Please, refer to " + RCV2_BASE_URL + " for more information."
|
||||
zipfile = ZipFile(target_file)
|
||||
for xmlfile in zipfile.namelist():
|
||||
xmlcontent = zipfile.open(xmlfile).read()
|
||||
try:
|
||||
doc = parse_document(xmlcontent, assert_lang=lang)
|
||||
labels.update(doc.categories)
|
||||
request.append(doc)
|
||||
lang_docs_read += 1
|
||||
except ValueError:
|
||||
print('\n\tskipping document {} with inconsistent language label: expected language {}'.format(RCV2_LANG_DIR[lang]+'/'+part+'/'+xmlfile, lang))
|
||||
except (IDRangeException, ExpectedLanguageException) as e:
|
||||
pass
|
||||
print('\r[{}] read {} documents, {} for language {}'.format(RCV2_LANG_DIR[lang]+'/'+part, len(request), lang_docs_read, lang), end='')
|
||||
print()
|
||||
print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords)))
|
||||
return request, list(labels)
|
||||
|
||||
|
||||
def fetch_topic_hierarchy(path, topics='all'):
|
||||
assert topics in ['all', 'leaves']
|
||||
|
||||
download_file_if_not_exists(RCV1_TOPICHIER_URL, path)
|
||||
hierarchy = {}
|
||||
for line in open(path, 'rt'):
|
||||
parts = line.strip().split()
|
||||
parent,child = parts[1],parts[3]
|
||||
if parent not in hierarchy:
|
||||
hierarchy[parent]=[]
|
||||
hierarchy[parent].append(child)
|
||||
|
||||
del hierarchy['None']
|
||||
del hierarchy['Root']
|
||||
print(hierarchy)
|
||||
|
||||
if topics=='all':
|
||||
topics = set(hierarchy.keys())
|
||||
for parent in hierarchy.keys():
|
||||
topics.update(hierarchy[parent])
|
||||
return list(topics)
|
||||
elif topics=='leaves':
|
||||
parents = set(hierarchy.keys())
|
||||
childs = set()
|
||||
for parent in hierarchy.keys():
|
||||
childs.update(hierarchy[parent])
|
||||
return list(childs.difference(parents))
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,304 @@
|
|||
from __future__ import print_function
|
||||
import ijson
|
||||
import os, sys
|
||||
from os.path import join
|
||||
from bz2 import BZ2File
|
||||
from ijson.common import ObjectBuilder
|
||||
import pickle
|
||||
from util.file import list_dirs, list_files, makedirs_if_not_exist
|
||||
from itertools import islice
|
||||
import re
|
||||
from xml.sax.saxutils import escape
|
||||
import numpy as np
|
||||
|
||||
policies = ["IN_ALL_LANGS", "IN_ANY_LANG"]
|
||||
|
||||
"""
|
||||
This file contains a set of tools for processing the Wikipedia multilingual documents.
|
||||
In what follows, it is assumed that you have already downloaded a Wikipedia dump (https://dumps.wikimedia.org/)
|
||||
and have processed each document to clean their texts with one of the tools:
|
||||
- https://github.com/aesuli/wikipediatools (Python 2)
|
||||
- https://github.com/aesuli/wikipedia-extractor (Python 3)
|
||||
It is also assumed you have downloaded the all-entities json file (e.g., https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2)
|
||||
|
||||
These tools help you to:
|
||||
- Process the huge json file as a stream, and create a multilingual map of corresponding titles for each language.
|
||||
Setting policy = "IN_ALL_LANGS" extracts only titles which appear in all (AND) languages, whereas "IN_ANY_LANG"
|
||||
extracts all titles appearing in at least one (OR) language (warning: this will create a huge dictionary).
|
||||
Note: this version is quite slow. Although it only has to be run once, you might prefer to take a look at "Wikidata in BigQuery".
|
||||
- Process the huge json file as a stream and create a simplified file which occupies much less space and is far faster to process.
|
||||
- Use the multilingual map to extract, from the clean text versions, individual xml documents containing all
|
||||
language-specific versions of the document.
|
||||
- Fetch the multilingual documents to create, for each of the specified languages, a list containing all documents,
|
||||
in a way that the i-th element from any list refers to the same element in the respective language.
|
||||
"""
|
||||
|
||||
def _doc_generator(text_path, langs):
|
||||
dotspace = re.compile(r'\.(?!\s)')
|
||||
for l,lang in enumerate(langs):
|
||||
print("Processing language <%s> (%d/%d)" % (lang, l, len(langs)))
|
||||
lang_dir = join(text_path, lang)
|
||||
split_dirs = list_dirs(lang_dir)
|
||||
for sd,split_dir in enumerate(split_dirs):
|
||||
print("\tprocessing split_dir <%s> (%d/%d)" % (split_dir, sd, len(split_dirs)))
|
||||
split_files = list_files(join(lang_dir, split_dir))
|
||||
for sf,split_file in enumerate(split_files):
|
||||
print("\t\tprocessing split_file <%s> (%d/%d)" % (split_file, sf, len(split_files)))
|
||||
with BZ2File(join(lang_dir, split_dir, split_file), 'r', buffering=1024*1024) as fi:
|
||||
while True:
|
||||
doc_lines = list(islice(fi, 3))
|
||||
if doc_lines:
|
||||
# some sentences are not followed by a space after the dot
|
||||
doc_lines[1] = dotspace.sub('. ', doc_lines[1])
|
||||
# [workaround] I found html symbol was not treated, and unescaping it now might not help...
|
||||
doc_lines[1] = escape(doc_lines[1].replace(" ", " "))
|
||||
yield doc_lines, lang
|
||||
else: break
|
||||
|
||||
def _extract_title(doc_lines):
|
||||
m = re.search('title="(.+?)"', doc_lines[0])
|
||||
if m: return m.group(1).decode('utf-8')
|
||||
else: raise ValueError("Error in xml format: document head is %s" % doc_lines[0])
|
||||
|
||||
def _create_doc(target_file, id, doc, lang):
|
||||
doc[0] = doc[0][:-2] + (' lang="%s">\n'%lang)
|
||||
with open(target_file, 'w') as fo:
|
||||
fo.write('<multidoc id="%s">\n'%id)
|
||||
[fo.write(line) for line in doc]
|
||||
fo.write('</multidoc>')
|
||||
|
||||
def _append_doc(target_file, doc, lang):
|
||||
doc[0] = doc[0][:-2] + (' lang="%s">\n' % lang)
|
||||
with open(target_file, 'r', buffering=1024*1024) as fi:
|
||||
lines = fi.readlines()
|
||||
if doc[0] in lines[1::3]:
|
||||
return
|
||||
lines[-1:-1]=doc
|
||||
with open(target_file, 'w', buffering=1024*1024) as fo:
|
||||
[fo.write(line) for line in lines]
|
||||
|
||||
def extract_multilingual_documents(inv_dict, langs, text_path, out_path):
|
||||
if not os.path.exists(out_path):
|
||||
os.makedirs(out_path)
|
||||
for lang in langs:
|
||||
if lang not in inv_dict:
|
||||
raise ValueError("Lang %s is not in the dictionary" % lang)
|
||||
|
||||
docs_created = len(list_files(out_path))
|
||||
print("%d multilingual documents found." % docs_created)
|
||||
for doc,lang in _doc_generator(text_path, langs):
|
||||
title = _extract_title(doc)
|
||||
|
||||
if title in inv_dict[lang]:
|
||||
#pass
|
||||
ids = inv_dict[lang][title]
|
||||
for id in ids:
|
||||
target_file = join(out_path, id) + ".xml"
|
||||
if os.path.exists(target_file):
|
||||
_append_doc(target_file, doc, lang)
|
||||
else:
|
||||
_create_doc(target_file, id, doc, lang)
|
||||
docs_created+=1
|
||||
else:
|
||||
if not re.match('[A-Za-z]+', title):
|
||||
print("Title <%s> for lang <%s> not in dictionary" % (title, lang))
|
||||
|
||||
|
||||
|
||||
def extract_multilingual_titles_from_simplefile(data_dir, filename, langs, policy="IN_ALL_LANGS", return_both=True):
|
||||
simplified_file = join(data_dir,filename)
|
||||
|
||||
if policy not in policies:
|
||||
raise ValueError("Policy %s not supported." % policy)
|
||||
print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs)))
|
||||
|
||||
lang_prefix = list(langs)
|
||||
lang_prefix.sort()
|
||||
pickle_prefix = "extraction_" + "_".join(lang_prefix) + "." + policy
|
||||
pickle_dict = join(data_dir, pickle_prefix+".multi_dict.pickle")
|
||||
pickle_invdict = join(data_dir, pickle_prefix+".multi_invdict.pickle")
|
||||
if os.path.exists(pickle_invdict):
|
||||
if return_both and os.path.exists(pickle_dict):
|
||||
print("Pickled files found in %s. Loading both (direct and inverse dictionaries)." % data_dir)
|
||||
return pickle.load(open(pickle_dict, 'rb')), pickle.load(open(pickle_invdict, 'rb'))
|
||||
elif return_both==False:
|
||||
print("Pickled file found in %s. Loading inverse dictionary only." % pickle_invdict)
|
||||
return pickle.load(open(pickle_invdict, 'rb'))
|
||||
|
||||
multiling_titles = {}
|
||||
inv_dict = {lang:{} for lang in langs}
|
||||
|
||||
def process_entry(line):
|
||||
parts = line.strip().split('\t')
|
||||
id = parts[0]
|
||||
if id in multiling_titles:
|
||||
raise ValueError("id <%s> already indexed" % id)
|
||||
|
||||
titles = dict(((lang_title[:lang_title.find(':')],lang_title[lang_title.find(':')+1:].decode('utf-8')) for lang_title in parts[1:]))
|
||||
for lang in list(titles.keys()):
|
||||
if lang not in langs:
|
||||
del titles[lang]
|
||||
|
||||
if (policy == "IN_ALL_LANGS" and len(titles) == len(langs))\
|
||||
or (policy == "IN_ANY_LANG" and len(titles) > 0):
|
||||
multiling_titles[id] = titles
|
||||
for lang, title in titles.items():
|
||||
if title in inv_dict[lang]:
|
||||
inv_dict[lang][title].append(id)
|
||||
else: inv_dict[lang][title] = [id]
|
||||
|
||||
with BZ2File(simplified_file, 'r', buffering=1024*1024*16) as fi:
|
||||
completed = 0
|
||||
try:
|
||||
for line in fi:
|
||||
process_entry(line)
|
||||
completed += 1
|
||||
if completed % 10 == 0:
|
||||
print("\rCompleted %d\ttitles %d" % (completed,len(multiling_titles)), end="")
|
||||
print("\rCompleted %d\t\ttitles %d" % (completed, len(multiling_titles)), end="\n")
|
||||
except EOFError:
|
||||
print("\nUnexpected file ending... saving anyway")
|
||||
|
||||
print("Pickling dictionaries in %s" % data_dir)
|
||||
pickle.dump(multiling_titles, open(pickle_dict,'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
pickle.dump(inv_dict, open(pickle_invdict, 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
print("Done")
|
||||
|
||||
return (multiling_titles, inv_dict) if return_both else inv_dict
|
||||
|
||||
|
||||
# in https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2
|
||||
def simplify_json_file(data_dir, langs, policy="IN_ALL_LANGS", json_file = "latest-all.json.bz2"):
|
||||
latest_all_json_file = join(data_dir,json_file)
|
||||
|
||||
if policy not in policies:
|
||||
raise ValueError("Policy %s not supported." % policy)
|
||||
|
||||
print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs)))
|
||||
|
||||
lang_prefix = list(langs)
|
||||
lang_prefix.sort()
|
||||
simple_titles_path = join(data_dir, "extraction_" + "_".join(lang_prefix) + "." + policy)
|
||||
|
||||
def process_entry(last, fo):
|
||||
global written
|
||||
id = last["id"]
|
||||
titles = None
|
||||
if policy == "IN_ALL_LANGS" and langs.issubset(last["labels"].keys()):
|
||||
titles = {lang: last["labels"][lang]["value"] for lang in langs}
|
||||
elif policy == "IN_ANY_LANG":
|
||||
titles = {lang: last["labels"][lang]["value"] for lang in langs if lang in last["labels"]}
|
||||
|
||||
if titles:
|
||||
fo.write((id+'\t'+'\t'.join([lang+':'+titles[lang] for lang in titles.keys()])+'\n').encode('utf-8'))
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
written = 0
|
||||
with BZ2File(latest_all_json_file, 'r', buffering=1024*1024*16) as fi, \
|
||||
BZ2File(join(data_dir,simple_titles_path+".simple.bz2"),'w') as fo:
|
||||
builder = ObjectBuilder()
|
||||
completed = 0
|
||||
for event, value in ijson.basic_parse(fi, buf_size=1024*1024*16):
|
||||
builder.event(event, value)
|
||||
if len(builder.value)>1:
|
||||
if process_entry(builder.value.pop(0), fo): written += 1
|
||||
completed += 1
|
||||
print("\rCompleted %d\ttitles %d" % (completed,written), end="")
|
||||
print("")
|
||||
|
||||
#process the last entry
|
||||
process_entry(builder.value.pop(0), fo)
|
||||
|
||||
return simple_titles_path
|
||||
|
||||
"""
|
||||
Reads all multi-lingual documents in a folder (see wikipedia_tools.py to generate them) and generates, for each of the
|
||||
specified languages, a list containing all its documents, so that the i-th element of any list refers to the language-
|
||||
specific version of the same document. Documents are required to have a version in all specified languages and to contain
|
||||
a minimum number of words; otherwise they are discarded.
|
||||
"""
|
||||
class MinWordsNotReached(Exception): pass
|
||||
class WrongDocumentFormat(Exception): pass
|
||||
|
||||
def _load_multilang_doc(path, langs, min_words=100):
|
||||
import xml.etree.ElementTree as ET
|
||||
from xml.etree.ElementTree import Element, ParseError
|
||||
try:
|
||||
root = ET.parse(path).getroot()
|
||||
doc = {}
|
||||
for lang in langs:
|
||||
doc_body = root.find('.//doc[@lang="' + lang + '"]')
|
||||
if isinstance(doc_body, Element):
|
||||
n_words = len(doc_body.text.split(' '))
|
||||
if n_words >= min_words:
|
||||
doc[lang] = doc_body.text
|
||||
else:
|
||||
raise MinWordsNotReached
|
||||
else:
|
||||
raise WrongDocumentFormat
|
||||
except ParseError:
|
||||
raise WrongDocumentFormat
|
||||
return doc
|
||||
|
||||
# returns the multilingual documents mapped by language; counts of read and discarded documents are printed along the way
|
||||
def fetch_wikipedia_multilingual(wiki_multi_path, langs, min_words=100, deletions=False, max_documents=-1, pickle_name=None):
|
||||
if pickle_name and os.path.exists(pickle_name):
|
||||
print("unpickling %s" % pickle_name)
|
||||
return pickle.load(open(pickle_name, 'rb'))
|
||||
|
||||
multi_docs = list_files(wiki_multi_path)
|
||||
mling_documents = {l:[] for l in langs}
|
||||
valid_documents = 0
|
||||
minwords_exception = 0
|
||||
wrongdoc_exception = 0
|
||||
for d,multi_doc in enumerate(multi_docs):
|
||||
print("\rProcessed %d/%d documents, valid %d/%d, few_words=%d, few_langs=%d" %
|
||||
(d, len(multi_docs), valid_documents, len(multi_docs), minwords_exception, wrongdoc_exception),end="")
|
||||
doc_path = join(wiki_multi_path, multi_doc)
|
||||
try:
|
||||
m_doc = _load_multilang_doc(doc_path, langs, min_words)
|
||||
valid_documents += 1
|
||||
for l in langs:
|
||||
mling_documents[l].append(m_doc[l])
|
||||
except MinWordsNotReached:
|
||||
minwords_exception += 1
|
||||
if deletions: os.remove(doc_path)
|
||||
except WrongDocumentFormat:
|
||||
wrongdoc_exception += 1
|
||||
if deletions: os.remove(doc_path)
|
||||
if max_documents>0 and valid_documents>=max_documents:
|
||||
break
|
||||
|
||||
if pickle_name:
|
||||
print("Pickling wikipedia documents object in %s" % pickle_name)
|
||||
pickle.dump(mling_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
return mling_documents
|
||||
|
||||
def random_wiki_sample(l_wiki, max_documents):
|
||||
if max_documents == 0: return None
|
||||
langs = list(l_wiki.keys())
|
||||
assert len(np.unique([len(l_wiki[l]) for l in langs])) == 1, 'documents across languages do not seem to be aligned'
|
||||
ndocs_per_lang = len(l_wiki[langs[0]])
|
||||
if ndocs_per_lang > max_documents:
|
||||
sel = set(np.random.choice(list(range(ndocs_per_lang)), max_documents, replace=False))
|
||||
for lang in langs:
|
||||
l_wiki[lang] = [d for i, d in enumerate(l_wiki[lang]) if i in sel]
|
||||
return l_wiki
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
wikipedia_home = "../Datasets/Wikipedia"
|
||||
|
||||
from data.languages import JRC_LANGS_WITH_NLTK_STEMMING as langs
|
||||
langs = frozenset(langs)
|
||||
|
||||
simple_titles_path = simplify_json_file(wikipedia_home, langs, policy="IN_ALL_LANGS", json_file="latest-all.json.bz2")
|
||||
_, inv_dict = extract_multilingual_titles_from_simplefile(wikipedia_home, simple_titles_path, langs, policy='IN_ALL_LANGS')
|
||||
extract_multilingual_documents(inv_dict, langs, join(wikipedia_home,'text'),
|
||||
out_path=join(wikipedia_home, 'multilingual_docs_JRC_NLTK'))
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,75 @@
|
|||
from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
|
||||
# from util.common import *
|
||||
from sklearn.decomposition import PCA
|
||||
import numpy as np
|
||||
|
||||
|
||||
def zscores(x, axis=0):  # scipy.stats.zscore does not avoid division by 0, which can indeed occur
|
||||
std = np.clip(np.std(x, ddof=1, axis=axis), 1e-5, None)
|
||||
mean = np.mean(x, axis=axis)
|
||||
return (x - mean) / std
|
||||
|
||||
|
||||
def supervised_embeddings_tfidf(X,Y):
|
||||
tfidf_norm = X.sum(axis=0)
|
||||
F = (X.T).dot(Y) / tfidf_norm.T
|
||||
return F
|
||||
|
||||
|
||||
def supervised_embeddings_ppmi(X,Y):
|
||||
Xbin = X>0
|
||||
D = X.shape[0]
|
||||
Pxy = (Xbin.T).dot(Y)/D
|
||||
Px = Xbin.sum(axis=0)/D
|
||||
Py = Y.sum(axis=0)/D
|
||||
F = np.asarray(Pxy/(Px.T*Py))
|
||||
F = np.maximum(F, 1.0)
|
||||
F = np.log(F)
|
||||
return F
|
||||
|
||||
|
||||
def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents=25000):
|
||||
D = X.shape[0]
|
||||
if D>max_documents:
|
||||
print(f'sampling {max_documents}')
|
||||
random_sample = np.random.permutation(D)[:max_documents]
|
||||
X = X[random_sample]
|
||||
Y = Y[random_sample]
|
||||
cell_matrix = get_supervised_matrix(X, Y)
|
||||
F = get_tsr_matrix(cell_matrix, tsr_score_funtion=tsr_function).T
|
||||
return F
|
||||
|
||||
|
||||
def get_supervised_embeddings(X, Y, max_label_space=300, binary_structural_problems=-1, method='dotn', dozscore=True):
|
||||
print('computing supervised embeddings...')
|
||||
|
||||
nC = Y.shape[1]
|
||||
if nC==2 and binary_structural_problems > nC:
|
||||
raise ValueError('not implemented in this branch')
|
||||
|
||||
if method=='ppmi':
|
||||
F = supervised_embeddings_ppmi(X, Y)
|
||||
elif method == 'dotn':
|
||||
F = supervised_embeddings_tfidf(X, Y)
|
||||
elif method == 'ig':
|
||||
F = supervised_embeddings_tsr(X, Y, information_gain)
|
||||
elif method == 'chi2':
|
||||
F = supervised_embeddings_tsr(X, Y, chi_square)
|
||||
|
||||
if dozscore:
|
||||
F = zscores(F, axis=0)
|
||||
|
||||
if nC > max_label_space:
|
||||
print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
|
||||
f'Applying PCA(n_components={max_label_space})')
|
||||
pca = PCA(n_components=max_label_space)
|
||||
F = pca.fit(F).transform(F)
|
||||
|
||||
return F
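# Illustrative usage (X and Y are placeholders: X a docs-by-terms tf-idf csr_matrix, Y a
# docs-by-labels binary matrix, e.g., produced by TfidfVectorizer and MultiLabelBinarizer):
#
#   F = get_supervised_embeddings(X, Y, max_label_space=300, method='dotn')
#   # F has one row per term and at most max_label_space label-derived columns;
#   # documents can then be projected into this space, e.g., X.dot(F)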
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,33 @@
|
|||
from nltk.corpus import stopwords
|
||||
from data.languages import NLTK_LANGMAP
|
||||
from nltk import word_tokenize
|
||||
from nltk.stem import SnowballStemmer
|
||||
|
||||
|
||||
def preprocess_documents(documents, lang):
|
||||
tokens = NLTKStemTokenizer(lang, verbose=True)
|
||||
sw = stopwords.words(NLTK_LANGMAP[lang])
|
||||
return [' '.join([w for w in tokens(doc) if w not in sw]) for doc in documents]
|
||||
|
||||
|
||||
class NLTKStemTokenizer(object):
|
||||
|
||||
def __init__(self, lang, verbose=False):
|
||||
if lang not in NLTK_LANGMAP:
|
||||
raise ValueError('Language %s is not supported in NLTK' % lang)
|
||||
self.verbose=verbose
|
||||
self.called = 0
|
||||
self.wnl = SnowballStemmer(NLTK_LANGMAP[lang])
|
||||
self.cache = {}
|
||||
|
||||
def __call__(self, doc):
|
||||
self.called += 1
|
||||
if self.verbose:
|
||||
print("\r\t\t[documents processed %d]" % (self.called), end="")
|
||||
tokens = word_tokenize(doc)
|
||||
stems = []
|
||||
for t in tokens:
|
||||
if t not in self.cache:
|
||||
self.cache[t] = self.wnl.stem(t)
|
||||
stems.append(self.cache[t])
|
||||
return stems
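# Example (assumes the NLTK 'punkt' and 'stopwords' resources are installed; the exact output
# depends on the NLTK version):
#
#   preprocess_documents(["The cats are running fast."], 'en')
#   # -> roughly ['cat run fast .'] after stopword removal and Snowball stemming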
|
||||
|
|
@ -0,0 +1,270 @@
|
|||
import math
|
||||
import numpy as np
|
||||
from scipy.stats import t
|
||||
from joblib import Parallel, delayed
|
||||
from scipy.sparse import csr_matrix, csc_matrix
|
||||
|
||||
|
||||
def get_probs(tpr, fpr, pc):
|
||||
# tpr = p(t|c) = p(tp)/p(c) = p(tp)/(p(tp)+p(fn))
|
||||
# fpr = p(t|_c) = p(fp)/p(_c) = p(fp)/(p(fp)+p(tn))
|
||||
pnc = 1.0 - pc
|
||||
tp = tpr * pc
|
||||
fn = pc - tp
|
||||
fp = fpr * pnc
|
||||
tn = pnc - fp
|
||||
return ContTable(tp=tp, fn=fn, fp=fp, tn=tn)
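# Worked example (illustrative numbers): with pc=0.25, tpr=0.8, fpr=0.1 the probability-normalized
# cells are tp=0.8*0.25=0.2, fn=0.25-0.2=0.05, fp=0.1*0.75=0.075, tn=0.75-0.075=0.675,
# and they sum to 1 as expected.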
|
||||
|
||||
|
||||
def apply_tsr(tpr, fpr, pc, tsr):
|
||||
cell = get_probs(tpr, fpr, pc)
|
||||
return tsr(cell)
|
||||
|
||||
|
||||
def positive_information_gain(cell):
|
||||
if cell.tpr() < cell.fpr():
|
||||
return 0.0
|
||||
else:
|
||||
return information_gain(cell)
|
||||
|
||||
|
||||
def posneg_information_gain(cell):
|
||||
ig = information_gain(cell)
|
||||
if cell.tpr() < cell.fpr():
|
||||
return -ig
|
||||
else:
|
||||
return ig
|
||||
|
||||
|
||||
def __ig_factor(p_tc, p_t, p_c):
|
||||
den = p_t * p_c
|
||||
if den != 0.0 and p_tc != 0:
|
||||
return p_tc * math.log(p_tc / den, 2)
|
||||
else:
|
||||
return 0.0
|
||||
|
||||
|
||||
def information_gain(cell):
|
||||
return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \
|
||||
__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) +\
|
||||
__ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + \
|
||||
__ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())
|
||||
|
||||
|
||||
def information_gain_mod(cell):
|
||||
return (__ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())) \
|
||||
- (__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) + __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()))
|
||||
|
||||
|
||||
def pointwise_mutual_information(cell):
|
||||
return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c())
|
||||
|
||||
|
||||
def gain_ratio(cell):
|
||||
pc = cell.p_c()
|
||||
pnc = 1.0 - pc
|
||||
norm = pc * math.log(pc, 2) + pnc * math.log(pnc, 2)
|
||||
return information_gain(cell) / (-norm)
|
||||
|
||||
|
||||
def chi_square(cell):
|
||||
den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c()
|
||||
if den==0.0: return 0.0
|
||||
num = gss(cell)**2
|
||||
return num / den
|
||||
|
||||
|
||||
def relevance_frequency(cell):
|
||||
a = cell.tp
|
||||
c = cell.fp
|
||||
if c == 0: c = 1
|
||||
return math.log(2.0 + (a * 1.0 / c), 2)
|
||||
|
||||
|
||||
def idf(cell):
|
||||
if cell.p_f()>0:
|
||||
return math.log(1.0 / cell.p_f())
|
||||
return 0.0
|
||||
|
||||
|
||||
def gss(cell):
|
||||
return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn()
|
||||
|
||||
|
||||
def conf_interval(xt, n):
|
||||
if n>30:
|
||||
z2 = 3.84145882069 # norm.ppf(0.5+0.95/2.0)**2
|
||||
else:
|
||||
z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2
|
||||
p = (xt + 0.5 * z2) / (n + z2)
|
||||
amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2))
|
||||
return p, amplitude
|
||||
|
||||
def strength(minPosRelFreq, minPos, maxNeg):
|
||||
if minPos > maxNeg:
|
||||
return math.log(2.0 * minPosRelFreq, 2.0)
|
||||
else:
|
||||
return 0.0
|
||||
|
||||
|
||||
#set cancel_features=True to allow some features to be weighted as 0 (as in the original article)
|
||||
# however, for some extremely imbalanced datasets this caused all documents to be weighted as 0
|
||||
def conf_weight(cell, cancel_features=False):
|
||||
c = cell.get_c()
|
||||
not_c = cell.get_not_c()
|
||||
tp = cell.tp
|
||||
fp = cell.fp
|
||||
|
||||
pos_p, pos_amp = conf_interval(tp, c)
|
||||
neg_p, neg_amp = conf_interval(fp, not_c)
|
||||
|
||||
min_pos = pos_p-pos_amp
|
||||
max_neg = neg_p+neg_amp
|
||||
den = (min_pos + max_neg)
|
||||
minpos_relfreq = min_pos / (den if den != 0 else 1)
|
||||
|
||||
str_tplus = strength(minpos_relfreq, min_pos, max_neg)
|
||||
|
||||
if str_tplus == 0 and not cancel_features:
|
||||
return 1e-20
|
||||
|
||||
return str_tplus
|
||||
|
||||
|
||||
class ContTable:
|
||||
|
||||
def __init__(self, tp=0, tn=0, fp=0, fn=0):
|
||||
self.tp=tp
|
||||
self.tn=tn
|
||||
self.fp=fp
|
||||
self.fn=fn
|
||||
|
||||
def get_d(self): return self.tp + self.tn + self.fp + self.fn
|
||||
|
||||
def get_c(self): return self.tp + self.fn
|
||||
|
||||
def get_not_c(self): return self.tn + self.fp
|
||||
|
||||
def get_f(self): return self.tp + self.fp
|
||||
|
||||
def get_not_f(self): return self.tn + self.fn
|
||||
|
||||
def p_c(self): return (1.0*self.get_c())/self.get_d()
|
||||
|
||||
def p_not_c(self): return 1.0-self.p_c()
|
||||
|
||||
def p_f(self): return (1.0*self.get_f())/self.get_d()
|
||||
|
||||
def p_not_f(self): return 1.0-self.p_f()
|
||||
|
||||
def p_tp(self): return (1.0*self.tp) / self.get_d()
|
||||
|
||||
def p_tn(self): return (1.0*self.tn) / self.get_d()
|
||||
|
||||
def p_fp(self): return (1.0*self.fp) / self.get_d()
|
||||
|
||||
def p_fn(self): return (1.0*self.fn) / self.get_d()
|
||||
|
||||
def tpr(self):
|
||||
c = 1.0*self.get_c()
|
||||
return self.tp / c if c > 0.0 else 0.0
|
||||
|
||||
def fpr(self):
|
||||
_c = 1.0*self.get_not_c()
|
||||
return self.fp / _c if _c > 0.0 else 0.0
|
||||
|
||||
|
||||
def round_robin_selection(X, Y, k, tsr_function=positive_information_gain):
|
||||
print(f'[selecting {k} terms]')
|
||||
nC = Y.shape[1]
|
||||
FC = get_tsr_matrix(get_supervised_matrix(X, Y), tsr_function).T
|
||||
best_features_idx = np.argsort(-FC, axis=0).flatten()
|
||||
tsr_values = FC.flatten()
|
||||
selected_indexes_set = set()
|
||||
selected_indexes = list()
|
||||
selected_value = list()
|
||||
from_category = list()
|
||||
round_robin = iter(best_features_idx)
|
||||
values_iter = iter(tsr_values)
|
||||
round=0
|
||||
while len(selected_indexes) < k:
|
||||
term_idx = next(round_robin)
|
||||
term_val = next(values_iter)
|
||||
if term_idx not in selected_indexes_set:
|
||||
selected_indexes_set.add(term_idx)
|
||||
selected_indexes.append(term_idx)
|
||||
selected_value.append(term_val)
|
||||
from_category.append(round)
|
||||
round = (round + 1) % nC
|
||||
return np.asarray(selected_indexes, dtype=int), np.asarray(selected_value, dtype=float), np.asarray(from_category)
|
||||
|
||||
|
||||
def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD):
|
||||
tp_ = len(positive_document_indexes & feature_document_indexes)
|
||||
fp_ = len(feature_document_indexes - positive_document_indexes)
|
||||
fn_ = len(positive_document_indexes - feature_document_indexes)
|
||||
tn_ = nD - (tp_ + fp_ + fn_)
|
||||
return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_)
|
||||
|
||||
|
||||
def category_tables(feature_sets, category_sets, c, nD, nF):
|
||||
return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)]
|
||||
|
||||
|
||||
"""
|
||||
Computes the nC x nF supervised matrix M where Mcf is the 4-cell contingency table for feature f and class c.
|
||||
Efficiency O(nF x nC x log(S)) where S is the sparse factor
|
||||
"""
|
||||
def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1):
|
||||
nD, nF = coocurrence_matrix.shape
|
||||
nD2, nC = label_matrix.shape
|
||||
|
||||
if nD != nD2:
|
||||
raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' %
|
||||
(coocurrence_matrix.shape,label_matrix.shape))
|
||||
|
||||
def nonzero_set(matrix, col):
|
||||
return set(matrix[:, col].nonzero()[0])
|
||||
|
||||
if isinstance(coocurrence_matrix, csr_matrix):
|
||||
coocurrence_matrix = csc_matrix(coocurrence_matrix)
|
||||
feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)]
|
||||
category_sets = [nonzero_set(label_matrix, c) for c in range(nC)]
|
||||
cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC))
|
||||
return np.array(cell_matrix)
|
||||
|
||||
# obtains the matrix T where Tcf=tsr(f,c) is the tsr score for category c and feature f
|
||||
def get_tsr_matrix(cell_matrix, tsr_score_funtion):
|
||||
nC,nF = cell_matrix.shape
|
||||
tsr_matrix = [[tsr_score_funtion(cell_matrix[c,f]) for f in range(nF)] for c in range(nC)]
|
||||
return np.array(tsr_matrix)
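# Illustrative usage (X: a docs-by-terms csr_matrix, Y: a docs-by-classes binary matrix; both placeholders):
#
#   cells = get_supervised_matrix(X, Y)              # (nC, nF) array of ContTable objects
#   tsr   = get_tsr_matrix(cells, information_gain)  # (nC, nF) array of real-valued scores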
|
||||
|
||||
|
||||
""" The Fisher-score [1] is not computed on the 4-cell contingency table, but can
|
||||
take as input any real-valued feature column (e.g., tf-idf weights).
|
||||
feat is the feature vector, and c is a binary classification vector.
|
||||
This implementation covers only the binary case, while the formula is defined for multiclass
|
||||
single-label scenarios, for which the version [2] might be preferred.
|
||||
[1] R.O. Duda, P.E. Hart, and D.G. Stork. Pattern classification. Wiley-interscience, 2012.
|
||||
[2] Gu, Q., Li, Z., & Han, J. (2012). Generalized fisher score for feature selection. arXiv preprint arXiv:1202.3725.
|
||||
"""
|
||||
def fisher_score_binary(feat, c):
|
||||
neg = np.ones_like(c) - c
|
||||
|
||||
npos = np.sum(c)
|
||||
nneg = np.sum(neg)
|
||||
|
||||
mupos = np.mean(feat[c == 1])
|
||||
muneg = np.mean(feat[neg == 1])
|
||||
mu = np.mean(feat)
|
||||
|
||||
stdpos = np.std(feat[c == 1])
|
||||
stdneg = np.std(feat[neg == 1])
|
||||
|
||||
num = npos * ((mupos - mu) ** 2) + nneg * ((muneg - mu) ** 2)
|
||||
den = npos * (stdpos ** 2) + nneg * (stdneg ** 2)
|
||||
|
||||
if den>0:
|
||||
return num / den
|
||||
else:
|
||||
return num
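# In formula form, the score computed above is
#   F = [ n_pos*(mu_pos - mu)^2 + n_neg*(mu_neg - mu)^2 ] / [ n_pos*std_pos^2 + n_neg*std_neg^2 ]
# and, when the denominator is zero, only the numerator is returned.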
|
||||
|
|
@ -0,0 +1,567 @@
|
|||
from os.path import join, exists
|
||||
from nltk.corpus import stopwords
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
|
||||
from sklearn.preprocessing import MultiLabelBinarizer
|
||||
from data.reader.jrcacquis_reader import *
|
||||
from data.languages import lang_set, NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING
|
||||
from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2, fetch_topic_hierarchy
|
||||
from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample
|
||||
from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents
|
||||
import pickle
|
||||
import numpy as np
|
||||
from sklearn.model_selection import train_test_split
|
||||
from scipy.sparse import issparse
|
||||
import itertools
|
||||
|
||||
|
||||
class MultilingualDataset:
|
||||
"""
|
||||
A multilingual dataset is a dictionary of training and test documents indexed by language code.
|
||||
Train and test sets are represented as tuples of the type (X,Y,ids), where X is a matrix representation of the
|
||||
documents (e.g., a document-by-term sparse csr_matrix), Y is a document-by-label binary np.array indicating the
|
||||
labels of each document, and ids is a list of document-identifiers from the original collection.
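A minimal usage sketch (the file name and language codes are purely illustrative):
    data = MultilingualDataset.load('some_dataset.pickle')
    data.set_view(languages=['en', 'it'])
    lXtr, lYtr = data.training()   # {lang: X matrix}, {lang: Y matrix}
    lXte, lYte = data.test()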
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.dataset_name = ""
|
||||
self.multiling_dataset = {}
|
||||
|
||||
def add(self, lang, Xtr, Ytr, Xte, Yte, tr_ids=None, te_ids=None):
|
||||
self.multiling_dataset[lang] = ((Xtr, Ytr, tr_ids), (Xte, Yte, te_ids))
|
||||
|
||||
def save(self, file):
|
||||
self.sort_indexes()
|
||||
pickle.dump(self, open(file, 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
return self
|
||||
|
||||
def __getitem__(self, item):
|
||||
if item in self.langs():
|
||||
return self.multiling_dataset[item]
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def load(cls, file):
|
||||
data = pickle.load(open(file, 'rb'))
|
||||
data.sort_indexes()
|
||||
return data
|
||||
|
||||
@classmethod
|
||||
def load_ids(cls, file):
|
||||
data = pickle.load(open(file, 'rb'))
|
||||
tr_ids = {lang:tr_ids for (lang,((_,_,tr_ids), (_,_,_))) in data.multiling_dataset.items()}
|
||||
te_ids = {lang: te_ids for (lang, ((_, _, _), (_, _, te_ids))) in data.multiling_dataset.items()}
|
||||
return tr_ids, te_ids
|
||||
|
||||
def sort_indexes(self):
|
||||
for (lang, ((Xtr,_,_),(Xte,_,_))) in self.multiling_dataset.items():
|
||||
if issparse(Xtr): Xtr.sort_indices()
|
||||
if issparse(Xte): Xte.sort_indices()
|
||||
|
||||
def set_view(self, categories=None, languages=None):
|
||||
if categories is not None:
|
||||
if isinstance(categories, int):
|
||||
categories = np.array([categories])
|
||||
elif isinstance(categories, list):
|
||||
categories = np.array(categories)
|
||||
self.categories_view = categories
|
||||
if languages is not None:
|
||||
self.languages_view = languages
|
||||
|
||||
def training(self):
|
||||
return self.lXtr(), self.lYtr()
|
||||
|
||||
def test(self):
|
||||
return self.lXte(), self.lYte()
|
||||
|
||||
def lXtr(self):
|
||||
return {lang:Xtr for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
|
||||
|
||||
def lXte(self):
|
||||
return {lang:Xte for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()}
|
||||
|
||||
def lYtr(self):
|
||||
return {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
|
||||
|
||||
def lYte(self):
|
||||
return {lang:self.cat_view(Yte) for (lang, (_,(_,Yte,_))) in self.multiling_dataset.items() if lang in self.langs()}
|
||||
|
||||
def cat_view(self, Y):
|
||||
if hasattr(self, 'categories_view'):
|
||||
return Y[:,self.categories_view]
|
||||
else:
|
||||
return Y
|
||||
|
||||
def langs(self):
|
||||
if hasattr(self, 'languages_view'):
|
||||
langs = self.languages_view
|
||||
else:
|
||||
langs = sorted(self.multiling_dataset.keys())
|
||||
return langs
|
||||
|
||||
def num_categories(self):
|
||||
return self.lYtr()[self.langs()[0]].shape[1]
|
||||
|
||||
def show_dimensions(self):
|
||||
for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items():
|
||||
if lang not in self.langs(): continue
|
||||
if hasattr(Xtr, 'shape') and hasattr(Xte, 'shape'):
|
||||
print("Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(lang, Xtr.shape, self.cat_view(Ytr).shape, Xte.shape, self.cat_view(Yte).shape))
|
||||
|
||||
def show_category_prevalences(self):
|
||||
#pass
|
||||
nC = self.num_categories()
|
||||
accum_tr = np.zeros(nC, dtype=int)
|
||||
accum_te = np.zeros(nC, dtype=int)
|
||||
in_langs = np.zeros(nC, dtype=int)  # count languages with at least one positive example (per category)
|
||||
for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items():
|
||||
if lang not in self.langs(): continue
|
||||
prev_train = np.sum(self.cat_view(Ytr), axis=0)
|
||||
prev_test = np.sum(self.cat_view(Yte), axis=0)
|
||||
accum_tr += prev_train
|
||||
accum_te += prev_test
|
||||
in_langs += (prev_train>0)*1
|
||||
print(lang+'-train', prev_train)
|
||||
print(lang+'-test', prev_test)
|
||||
print('all-train', accum_tr)
|
||||
print('all-test', accum_te)
|
||||
|
||||
return accum_tr, accum_te, in_langs
|
||||
|
||||
def set_labels(self, labels):
|
||||
self.labels = labels
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ----------------------------------------------------------------------------------------------------------------------
|
||||
def get_active_labels(doclist):
|
||||
cat_list = set()
|
||||
for d in doclist:
|
||||
cat_list.update(d.categories)
|
||||
return list(cat_list)
|
||||
|
||||
def filter_by_categories(doclist, keep_categories):
|
||||
catset = frozenset(keep_categories)
|
||||
for d in doclist:
|
||||
d.categories = list(set(d.categories).intersection(catset))
|
||||
|
||||
def __years_to_str(years):
|
||||
if isinstance(years, list):
|
||||
if len(years) > 1:
|
||||
return str(years[0])+'-'+str(years[-1])
|
||||
return str(years[0])
|
||||
return str(years)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------------------------
|
||||
# Matrix builders
|
||||
# ----------------------------------------------------------------------------------------------------------------------
|
||||
def build_independent_matrices(dataset_name, langs, training_docs, test_docs, label_names, wiki_docs=[], preprocess=True):
|
||||
"""
|
||||
Builds the document-by-term weighted matrices for each language. Representations are independent of each other,
|
||||
i.e., each language-specific matrix lies in a dedicated feature space.
|
||||
:param dataset_name: the name of the dataset (str)
|
||||
:param langs: list of languages (str)
|
||||
:param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
|
||||
:param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
|
||||
:param label_names: list of names of labels (str)
|
||||
:param wiki_docs: map {lang:doc-list} (optional); if specified, all wiki docs are projected into the feature spaces built for the languages
|
||||
:param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming)
|
||||
:return: a MultilingualDataset. If wiki_docs has been specified, a dictionary lW is also returned, which indexes
|
||||
by language the processed wikipedia documents in their respective language-specific feature spaces
|
||||
"""
|
||||
|
||||
mlb = MultiLabelBinarizer()
|
||||
mlb.fit([label_names])
|
||||
|
||||
lW = {}
|
||||
|
||||
multilingual_dataset = MultilingualDataset()
|
||||
multilingual_dataset.dataset_name = dataset_name
|
||||
multilingual_dataset.set_labels(mlb.classes_)
|
||||
for lang in langs:
|
||||
print("\nprocessing %d training, %d test, %d wiki for language <%s>" %
|
||||
(len(training_docs[lang]), len(test_docs[lang]), len(wiki_docs[lang]) if wiki_docs else 0, lang))
|
||||
|
||||
tr_data, tr_labels, IDtr = zip(*training_docs[lang])
|
||||
te_data, te_labels, IDte = zip(*test_docs[lang])
|
||||
|
||||
if preprocess:
|
||||
tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True,
|
||||
tokenizer=NLTKStemTokenizer(lang, verbose=True),
|
||||
stop_words=stopwords.words(NLTK_LANGMAP[lang]))
|
||||
else:
|
||||
tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True)
|
||||
|
||||
Xtr = tfidf.fit_transform(tr_data)
|
||||
Xte = tfidf.transform(te_data)
|
||||
if wiki_docs:
|
||||
lW[lang] = tfidf.transform(wiki_docs[lang])
|
||||
|
||||
Ytr = mlb.transform(tr_labels)
|
||||
Yte = mlb.transform(te_labels)
|
||||
|
||||
multilingual_dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)
|
||||
|
||||
multilingual_dataset.show_dimensions()
|
||||
multilingual_dataset.show_category_prevalences()
|
||||
|
||||
if wiki_docs:
|
||||
return multilingual_dataset, lW
|
||||
else:
|
||||
return multilingual_dataset
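# Illustrative call (names and path are placeholders; training_docs/test_docs follow the
# {lang: [(text, categories, id), ...]} format documented above):
#
#   data = build_independent_matrices('jrc_example', ['en', 'it'], training_docs, test_docs, label_names)
#   data.save('/path/to/jrc_example.pickle')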
|
||||
|
||||
|
||||
# creates a MultilingualDataset in which all matrices share a single juxtaposed feature space
|
||||
def build_juxtaposed_matrices(dataset_name, langs, training_docs, test_docs, label_names, preprocess=True):
|
||||
"""
|
||||
Builds the document-by-term weighted matrices for each language. Representations are not independent of each other,
|
||||
since all of them lie in the same juxtaposed feature space.
|
||||
:param dataset_name: the name of the dataset (str)
|
||||
:param langs: list of languages (str)
|
||||
:param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
|
||||
:param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
|
||||
:param label_names: list of names of labels (str)
|
||||
:param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming)
|
||||
:return: a MultilingualDataset
|
||||
"""
|
||||
|
||||
multiling_dataset = MultilingualDataset()
|
||||
multiling_dataset.dataset_name = dataset_name
|
||||
|
||||
mlb = MultiLabelBinarizer()
|
||||
mlb.fit([label_names])
|
||||
|
||||
multiling_dataset.set_labels(mlb.classes_)
|
||||
|
||||
tr_data_stack = []
|
||||
for lang in langs:
|
||||
print("\nprocessing %d training and %d test for language <%s>" % (len(training_docs[lang]), len(test_docs[lang]), lang))
|
||||
tr_data, tr_labels, tr_ID = zip(*training_docs[lang])
|
||||
te_data, te_labels, te_ID = zip(*test_docs[lang])
|
||||
if preprocess:
|
||||
tr_data = preprocess_documents(tr_data, lang)
|
||||
te_data = preprocess_documents(te_data, lang)
|
||||
tr_data_stack.extend(tr_data)
|
||||
multiling_dataset.add(lang, tr_data, tr_labels, te_data, te_labels, tr_ID, te_ID)
|
||||
|
||||
tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True)
|
||||
tfidf.fit(tr_data_stack)
|
||||
|
||||
for lang in langs:
|
||||
print("\nweighting documents for language <%s>" % (lang))
|
||||
(tr_data, tr_labels, tr_ID), (te_data, te_labels, te_ID) = multiling_dataset[lang]
|
||||
Xtr = tfidf.transform(tr_data)
|
||||
Xte = tfidf.transform(te_data)
|
||||
Ytr = mlb.transform(tr_labels)
|
||||
Yte = mlb.transform(te_labels)
|
||||
multiling_dataset.add(lang,Xtr,Ytr,Xte,Yte,tr_ID,te_ID)
|
||||
|
||||
multiling_dataset.show_dimensions()
|
||||
return multiling_dataset
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------------------------
|
||||
# Methods to recover the original documents from the MultilingualDataset's ids
|
||||
# ----------------------------------------------------------------------------------------------------------------------
|
||||
"""
|
||||
This method has been added a posteriori, to create document embeddings using the polylingual embeddings of the recent
|
||||
article 'Word Translation without Parallel Data'; basically, it takes one of the splits and retrieves the RCV documents
|
||||
from the doc ids and then pickles an object (tr_docs, te_docs, label_names) in the outpath
|
||||
"""
|
||||
def retrieve_rcv_documents_from_dataset(datasetpath, rcv1_data_home, rcv2_data_home, outpath):
|
||||
|
||||
tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath)
|
||||
assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te'
|
||||
langs = list(tr_ids.keys())
|
||||
|
||||
print('fetching the datasets')
|
||||
rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train')
|
||||
rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en'])
|
||||
|
||||
filter_by_categories(rcv1_documents, labels_rcv2)
|
||||
filter_by_categories(rcv2_documents, labels_rcv1)
|
||||
|
||||
label_names = get_active_labels(rcv1_documents + rcv2_documents)
|
||||
print('Active labels in RCV1/2 {}'.format(len(label_names)))
|
||||
|
||||
print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names)))
|
||||
print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents]))
|
||||
|
||||
all_docs = rcv1_documents + rcv2_documents
|
||||
mlb = MultiLabelBinarizer()
|
||||
mlb.fit([label_names])
|
||||
|
||||
dataset = MultilingualDataset()
|
||||
for lang in langs:
|
||||
analyzer = CountVectorizer(strip_accents='unicode', min_df=3,
|
||||
stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer()
|
||||
|
||||
Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in tr_ids[lang]])
|
||||
Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in te_ids[lang]])
|
||||
Xtr = [' '.join(analyzer(d)) for d in Xtr]
|
||||
Xte = [' '.join(analyzer(d)) for d in Xte]
|
||||
Ytr = mlb.transform(Ytr)
|
||||
Yte = mlb.transform(Yte)
|
||||
dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)
|
||||
|
||||
dataset.save(outpath)
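# Illustrative call (all paths are placeholders):
#
#   retrieve_rcv_documents_from_dataset('/path/to/rcv_dataset.pickle',
#                                       '/path/to/RCV1_home', '/path/to/RCV2_home',
#                                       '/path/to/rcv_doclist.pickle')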
|
||||
|
||||
"""
|
||||
Same thing but for JRC-Acquis
|
||||
"""
|
||||
def retrieve_jrc_documents_from_dataset(datasetpath, jrc_data_home, train_years, test_years, cat_policy, most_common_cat, outpath):
|
||||
|
||||
tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath)
|
||||
assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te'
|
||||
langs = list(tr_ids.keys())
|
||||
|
||||
print('fetching the datasets')
|
||||
|
||||
cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
|
||||
training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years,
|
||||
cat_filter=cat_list, cat_threshold=1, parallel=None,
|
||||
most_frequent=most_common_cat)
|
||||
test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names,
|
||||
parallel='force')
|
||||
|
||||
def filter_by_id(doclist, ids):
|
||||
ids_set = frozenset(itertools.chain.from_iterable(ids.values()))
|
||||
return [x for x in doclist if (x.parallel_id+'__'+x.id) in ids_set]
|
||||
|
||||
training_docs = filter_by_id(training_docs, tr_ids)
|
||||
test_docs = filter_by_id(test_docs, te_ids)
|
||||
|
||||
print('jrc: {} train, {} test, {} categories'.format(len(training_docs), len(test_docs), len(label_names)))
|
||||
|
||||
mlb = MultiLabelBinarizer()
|
||||
mlb.fit([label_names])
|
||||
|
||||
dataset = MultilingualDataset()
|
||||
for lang in langs:
|
||||
analyzer = CountVectorizer(strip_accents='unicode', min_df=3,
|
||||
stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer()
|
||||
|
||||
Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in training_docs if d.lang == lang])
|
||||
Xte,Yte,IDte = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in test_docs if d.lang == lang])
|
||||
Xtr = [' '.join(analyzer(d)) for d in Xtr]
|
||||
Xte = [' '.join(analyzer(d)) for d in Xte]
|
||||
Ytr = mlb.transform(Ytr)
|
||||
Yte = mlb.transform(Yte)
|
||||
dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)
|
||||
|
||||
dataset.save(outpath)
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------------------------
|
||||
# Dataset Generators
|
||||
# ----------------------------------------------------------------------------------------------------------------------
|
||||
def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test_years, cat_policy, most_common_cat=-1, max_wiki=5000, run=0):
|
||||
"""
|
||||
Prepare all datasets for JRC-Acquis. The datasets include the "feature-independent" version, the
|
||||
"feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices.
|
||||
In all cases, training documents are strictly non-parallel, and test documents are strictly parallel
|
||||
:param jrc_data_home: path to the raw JRC-Acquis documents (it will be downloaded if not found), and the path where
|
||||
all splits will be generated
|
||||
:param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py)
|
||||
:param langs: the list of languages to consider (as defined in data/languages.py)
|
||||
:param train_years: a list of ints containing the years to be considered as training documents
|
||||
:param test_years: a list of ints containing the years to be considered as test documents
|
||||
:param cat_policy: a string indicating which category selection policy to apply. Valid policies are, e.g., "all"
|
||||
(select all categories), "broadest" (select only the broadest concepts in the taxonomy), or "leaves" (select the
|
||||
leaf concepts in the taxonomy). See inspect_eurovoc in data/reader/jrcacquis_reader.py for more details
|
||||
:param most_common_cat: the maximum number of most common categories to consider, or -1 to keep them all
|
||||
:param max_wiki: the maximum number of wikipedia documents to consider (default 5000)
|
||||
:param run: a numeric label naming the random split (useful to keep track of different runs)
|
||||
:return: None
|
||||
"""
|
||||
|
||||
name = 'JRCacquis'
|
||||
run = '_run' + str(run)
|
||||
config_name = 'jrc_nltk_' + __years_to_str(train_years) + \
|
||||
'vs' + __years_to_str(test_years) + \
|
||||
'_' + cat_policy + \
|
||||
('_top' + str(most_common_cat) if most_common_cat!=-1 else '') + \
|
||||
'_noparallel_processed'
|
||||
|
||||
indep_path = join(jrc_data_home, config_name + run + '.pickle')
|
||||
upper_path = join(jrc_data_home, config_name + run + '_upper.pickle')
|
||||
yuxta_path = join(jrc_data_home, config_name + run + '_yuxtaposed.pickle')
|
||||
wiki_path = join(jrc_data_home, config_name + run + '.wiki.pickle')
|
||||
wiki_docs_path = join(jrc_data_home, config_name + '.wiki.raw.pickle')
|
||||
|
||||
cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
|
||||
training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years,
|
||||
cat_filter=cat_list, cat_threshold=1, parallel=None,
|
||||
most_frequent=most_common_cat)
|
||||
test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names,
|
||||
parallel='force')
|
||||
|
||||
print('Generating feature-independent dataset...')
|
||||
training_docs_no_parallel = random_sampling_avoiding_parallel(training_docs)
|
||||
|
||||
def _group_by_lang(doc_list, langs):
|
||||
return {lang: [(d.text, d.categories, d.parallel_id + '__' + d.id) for d in doc_list if d.lang == lang]
|
||||
for lang in langs}
|
||||
|
||||
training_docs = _group_by_lang(training_docs, langs)
|
||||
training_docs_no_parallel = _group_by_lang(training_docs_no_parallel, langs)
|
||||
test_docs = _group_by_lang(test_docs, langs)
|
||||
if not exists(indep_path):
|
||||
wiki_docs=None
|
||||
if max_wiki>0:
|
||||
if not exists(wiki_docs_path):
|
||||
wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False)
|
||||
wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
|
||||
pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
else:
|
||||
wiki_docs = pickle.load(open(wiki_docs_path, 'rb'))
|
||||
wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
|
||||
|
||||
if wiki_docs:
|
||||
lang_data, wiki_docs = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names, wiki_docs)
|
||||
pickle.dump(wiki_docs, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
else:
|
||||
lang_data = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names)
|
||||
|
||||
lang_data.save(indep_path)
|
||||
|
||||
print('Generating upper-bound (English-only) dataset...')
|
||||
if not exists(upper_path):
|
||||
training_docs_eng_only = {'en':training_docs['en']}
|
||||
test_docs_eng_only = {'en':test_docs['en']}
|
||||
build_independent_matrices(name, ['en'], training_docs_eng_only, test_docs_eng_only, label_names).save(upper_path)
|
||||
|
||||
print('Generating yuxtaposed dataset...')
|
||||
if not exists(yuxta_path):
|
||||
build_juxtaposed_matrices(name, langs, training_docs_no_parallel, test_docs, label_names).save(yuxta_path)
|
||||
|
||||
|
||||
def prepare_rcv_datasets(outpath, rcv1_data_home, rcv2_data_home, wiki_data_home, langs,
|
||||
train_for_lang=1000, test_for_lang=1000, max_wiki=5000, preprocess=True, run=0):
|
||||
"""
|
||||
Prepare all datasets for RCV1/RCV2. The datasets include the "feature-independent" version, the
|
||||
"feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices.
|
||||
|
||||
:param outpath: path where all splits will be dumped
|
||||
:param rcv1_data_home: path to the RCV1-v2 dataset (English only)
|
||||
:param rcv2_data_home: path to the RCV2 dataset (all languages other than English)
|
||||
:param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py)
|
||||
:param langs: the list of languages to consider (as defined in data/languages.py)
|
||||
:param train_for_lang: maximum number of training documents per language
|
||||
:param test_for_lang: maximum number of test documents per language
|
||||
:param max_wiki: the maximum number of wikipedia documents to consider (default 5000)
|
||||
:param preprocess: whether or not to apply language-specific preprocessing (stopwords removal and stemming)
|
||||
:param run: a numeric label naming the random split (useful to keep track of different runs)
|
||||
:return: None
|
||||
"""
|
||||
|
||||
assert 'en' in langs, 'English is not in requested languages, but is needed for some datasets'
|
||||
assert len(langs)>1, 'the multilingual dataset cannot be built with only one language'
|
||||
assert not preprocess or set(langs).issubset(set(RCV2_LANGS_WITH_NLTK_STEMMING+['en'])), \
|
||||
"languages not in RCV1-v2/RCV2 scope or not in valid for NLTK's processing"
|
||||
|
||||
name = 'RCV1/2'
|
||||
run = '_run' + str(run)
|
||||
config_name = 'rcv1-2_nltk_trByLang'+str(train_for_lang)+'_teByLang'+str(test_for_lang)+\
|
||||
('_processed' if preprocess else '_raw')
|
||||
|
||||
indep_path = join(outpath, config_name + run + '.pickle')
|
||||
upper_path = join(outpath, config_name + run +'_upper.pickle')
|
||||
yuxta_path = join(outpath, config_name + run +'_yuxtaposed.pickle')
|
||||
wiki_path = join(outpath, config_name + run + '.wiki.pickle')
|
||||
wiki_docs_path = join(outpath, config_name + '.wiki.raw.pickle')
|
||||
|
||||
print('fetching the datasets')
|
||||
rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train')
|
||||
rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l!='en'])
|
||||
filter_by_categories(rcv1_documents, labels_rcv2)
|
||||
filter_by_categories(rcv2_documents, labels_rcv1)
|
||||
|
||||
label_names = get_active_labels(rcv1_documents+rcv2_documents)
|
||||
print('Active labels in RCV1/2 {}'.format(len(label_names)))
|
||||
|
||||
print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names)))
|
||||
print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents]))
|
||||
|
||||
lang_docs = {lang: [d for d in rcv1_documents + rcv2_documents if d.lang == lang] for lang in langs}
|
||||
|
||||
# for the upper bound there are no parallel versions, so for the English case, we take as many documents as there
|
||||
# would be in the multilingual case -- then we will extract from them only train_for_lang for the other cases
|
||||
print('Generating upper-bound (English-only) dataset...')
|
||||
train, test = train_test_split(lang_docs['en'], train_size=train_for_lang*len(langs), test_size=test_for_lang, shuffle=True)
|
||||
train_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in train]}
|
||||
test_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in test]}
|
||||
build_independent_matrices(name, ['en'], train_lang_doc_map, test_lang_doc_map, label_names).save(upper_path)
|
||||
|
||||
train_lang_doc_map['en'] = train_lang_doc_map['en'][:train_for_lang]
|
||||
for lang in langs:
|
||||
if lang=='en': continue # already split
|
||||
test_take = min(test_for_lang, len(lang_docs[lang])-train_for_lang)
|
||||
train, test = train_test_split(lang_docs[lang], train_size=train_for_lang, test_size=test_take, shuffle=True)
|
||||
train_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in train]
|
||||
test_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in test]
|
||||
|
||||
print('Generating feature-independent dataset...')
|
||||
wiki_docs=None
|
||||
if max_wiki>0:
|
||||
if not exists(wiki_docs_path):
|
||||
wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False)
|
||||
wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
|
||||
pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
else:
|
||||
wiki_docs = pickle.load(open(wiki_docs_path, 'rb'))
|
||||
wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
|
||||
|
||||
if wiki_docs:
|
||||
lang_data, wiki_docs_matrix = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess)
|
||||
pickle.dump(wiki_docs_matrix, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
else:
|
||||
lang_data = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess)
|
||||
|
||||
lang_data.save(indep_path)
|
||||
|
||||
print('Generating yuxtaposed dataset...')
|
||||
build_juxtaposed_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, preprocess).save(yuxta_path)
|
||||
|
||||
|
||||
#-----------------------------------------------------------------------------------------------------------------------
|
||||
# MAIN BUILDER
|
||||
#-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
if __name__=='__main__':
|
||||
import sys
|
||||
|
||||
assert len(sys.argv) == 5, "wrong number of arguments; required: " \
|
||||
"<JRC_PATH> <RCV1_PATH> <RCV2_PATH> <WIKI_PATH> "
|
||||
|
||||
JRC_DATAPATH = sys.argv[1] # "../Datasets/JRC_Acquis_v3"
|
||||
RCV1_PATH = sys.argv[2] #'../Datasets/RCV1-v2/unprocessed_corpus'
|
||||
RCV2_PATH = sys.argv[3] #'../Datasets/RCV2'
|
||||
WIKI_DATAPATH = sys.argv[4] #"../Datasets/Wikipedia/multilingual_docs_JRC_NLTK"
|
||||
|
||||
langs = lang_set['JRC_NLTK']
|
||||
max_wiki = 5000
|
||||
|
||||
for run in range(0,10):
|
||||
print('Building JRC-Acquis datasets run', run)
|
||||
prepare_jrc_datasets(JRC_DATAPATH, WIKI_DATAPATH, langs,
|
||||
train_years=list(range(1958, 2006)), test_years=[2006], max_wiki=max_wiki,
|
||||
cat_policy='all', most_common_cat=300, run=run)
|
||||
|
||||
print('Building RCV1-v2/2 datasets run', run)
|
||||
prepare_rcv_datasets(RCV2_PATH, RCV1_PATH, RCV2_PATH, WIKI_DATAPATH, RCV2_LANGS_WITH_NLTK_STEMMING + ['en'],
|
||||
train_for_lang=1000, test_for_lang=1000, max_wiki=max_wiki, run=run)
|
||||
|
||||
# uncomment this code if you want to retrieve the original documents to generate the data splits for PLE
|
||||
# (make sure you have not modified the above parameters, or adapt the following paths accordingly...)
|
||||
# datasetpath = join(RCV2_PATH,'rcv1-2_nltk_trByLang1000_teByLang1000_processed_run{}.pickle'.format(run))
|
||||
# outpath = datasetpath.replace('_nltk_','_doclist_')
|
||||
# retrieve_rcv_documents_from_dataset(datasetpath, RCV1_PATH, RCV2_PATH, outpath)
|
||||
|
||||
# datasetpath = join(JRC_DATAPATH, 'jrc_nltk_1958-2005vs2006_all_top300_noparallel_processed_run{}.pickle'.format(run))
|
||||
# outpath = datasetpath.replace('_nltk_', '_doclist_')
|
||||
# retrieve_jrc_documents_from_dataset(datasetpath, JRC_DATAPATH, train_years=list(range(1958, 2006)), test_years=[2006], cat_policy='all', most_common_cat=300, outpath=outpath)
|
||||
|
||||
|
||||
|
||||
Binary file not shown.
|
|
@ -0,0 +1,646 @@
|
|||
import numpy as np
|
||||
import time
|
||||
from data.embeddings import WordEmbeddings, embedding_matrix, WCE_matrix
|
||||
from scipy.sparse import issparse, csr_matrix
|
||||
from sklearn.multiclass import OneVsRestClassifier
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.model_selection import KFold
|
||||
# from sklearn.externals.joblib import Parallel, delayed
|
||||
from joblib import Parallel, delayed
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
|
||||
|
||||
def _sort_if_sparse(X):
|
||||
if issparse(X) and not X.has_sorted_indices:
|
||||
X.sort_indices()
|
||||
|
||||
|
||||
def _joblib_transform_multiling(transformer, lX, n_jobs=-1):
|
||||
if n_jobs == 1:
|
||||
return {lang:transformer(lX[lang]) for lang in lX.keys()}
|
||||
else:
|
||||
langs = list(lX.keys())
|
||||
transformations = Parallel(n_jobs=n_jobs)(delayed(transformer)(lX[lang]) for lang in langs)
|
||||
return {lang: transformations[i] for i, lang in enumerate(langs)}
|
||||
|
||||
|
||||
class TrivialRejector:
|
||||
def fit(self, X, y):
|
||||
self.cats = y.shape[1]
|
||||
return self
|
||||
def decision_function(self, X): return np.zeros((X.shape[0],self.cats))
|
||||
def predict(self, X): return np.zeros((X.shape[0],self.cats))
|
||||
def predict_proba(self, X): return np.zeros((X.shape[0],self.cats))
|
||||
def best_params(self): return {}
|
||||
|
||||
|
||||
class FunnellingPolylingualClassifier:
|
||||
"""
|
||||
This classifier projects each document d into a language-independent feature space where each dimension fi is the
|
||||
decision score phi_l(d,ci) of an auxiliary classifier phi_l trained on category ci for documents in language l;
|
||||
it then trains a single classifier for all documents in this space, irrespective of their original language
|
||||
"""
|
||||
def __init__(self, first_tier_learner, meta_learner, first_tier_parameters=None, meta_parameters=None, folded_projections=1,
|
||||
calmode='cal', n_jobs=-1):
|
||||
"""
|
||||
:param first_tier_learner: the learner used in the first-tier level
|
||||
:param meta_learner: the learner used in the second-tier level
|
||||
:param first_tier_parameters: parameters for the learner in the doc_projector
|
||||
:param meta_parameters: parameters for the learner in the z-space
|
||||
:param folded_projections: if 1 then the model trains the auxiliary classifiers with all training data and
|
||||
projects the data before training the final classifier; if greater than one, the training set is split in as
|
||||
many folds as indicated, and the projected space is composed by concatenating each fold prediction based on
|
||||
models trained on the remaining folds. This should increase the generality of the space to unseen data.
|
||||
:param calmode: 'cal' to calibrate the base classifiers, 'nocal' to use the decision_function instead, or
|
||||
'sigmoid' to use the sigmoid of the decision_function
|
||||
:param n_jobs: number of parallel threads
|
||||
"""
|
||||
assert folded_projections>0, "positive number of folds expected"
|
||||
assert calmode in ['cal','nocal','sigmoid'], 'unknown calmode'
|
||||
assert calmode!='cal' or first_tier_learner.probability, 'calmode=cal requires the learner to have probability=True'
|
||||
|
||||
self.first_tier_learner = first_tier_learner
|
||||
self.meta_learner = meta_learner
|
||||
self.first_tier_parameters = first_tier_parameters
|
||||
self.meta_parameters = meta_parameters
|
||||
self.doc_projector = NaivePolylingualClassifier(self.first_tier_learner, self.first_tier_parameters, n_jobs=n_jobs)
|
||||
self.doc_projector_bu = NaivePolylingualClassifier(self.first_tier_learner, self.first_tier_parameters, n_jobs=n_jobs)
|
||||
self.folded_projections = folded_projections
|
||||
self.n_jobs = n_jobs
|
||||
self.calmode = calmode
|
||||
|
||||
def _projection(self, doc_projector, lX):
|
||||
"""
|
||||
Decides the projection function to be applied; predict_proba if the base classifiers are calibrated or
|
||||
decision_function otherwise
|
||||
:param doc_projector: the document projector (a NaivePolylingualClassifier)
|
||||
:param lX: {lang:matrix} to be projected
|
||||
:return: the projection, applied with predict_proba or decision_function
|
||||
"""
|
||||
if self.calmode=='cal':
|
||||
return doc_projector.predict_proba(lX)
|
||||
else:
|
||||
l_decision_scores = doc_projector.decision_function(lX)
|
||||
if self.calmode=='sigmoid':
|
||||
def sigmoid(x): return 1 / (1 + np.exp(-x))
|
||||
for lang in l_decision_scores.keys():
|
||||
l_decision_scores[lang] = sigmoid(l_decision_scores[lang])
|
||||
return l_decision_scores
|
||||
|
||||
def _get_zspace(self, lXtr, lYtr, lXproj=None, lYproj=None):
|
||||
"""
|
||||
Produces the vector space of posterior probabilities (in case the first-tier is calibrated) or of
|
||||
decision scores (otherwise). This space is here referred to as the z-space.
|
||||
:param lXtr: {lang:matrix} to train
|
||||
:param lYtr: {lang:labels} to train
|
||||
:param lXproj: {lang:matrix} to project (if None, then projects the lXtr)
|
||||
:param lYproj: {lang:labels} to stack in the same order (if None, then lYtr will be stacked)
|
||||
:return: the projection of lXproj documents into the Z-space defined by the confidence scores of language-specific
|
||||
models trained on lXtr, and the lYproj labels stacked consistently
|
||||
"""
|
||||
repair_empty_folds = True
|
||||
if lXproj is None and lYproj is None:
|
||||
lXproj, lYproj = lXtr, lYtr
|
||||
repair_empty_folds = False
|
||||
|
||||
print('fitting the projectors... {}'.format(lXtr.keys()))
|
||||
self.doc_projector.fit(lXtr, lYtr)
|
||||
|
||||
print('projecting the documents')
|
||||
langs = list(lXtr.keys())
|
||||
lZ = self._projection(self.doc_projector, lXproj)
|
||||
|
||||
# if repair_empty_folds: #empty folds are replaced by the posterior probabilities generated by the non-folded version
|
||||
empty_categories = self.doc_projector.empty_categories
|
||||
lZ_bu = self._projection(self.doc_projector_bu, lXproj)
|
||||
|
||||
for lang in langs:
|
||||
repair = empty_categories[lang]
|
||||
lZ[lang][:,repair] = lZ_bu[lang][:,repair]
|
||||
|
||||
Z = np.vstack([lZ[lang] for lang in langs]) # Z is the language independent space
|
||||
zy = np.vstack([lYproj[lang] for lang in langs])
|
||||
return Z, zy
|
||||
|
||||
def _get_zspace_folds(self, lX, ly):
|
||||
self.doc_projector_bu.fit(lX, ly)
|
||||
|
||||
print('split of {} folds'.format(self.folded_projections))
|
||||
skf = KFold(n_splits=self.folded_projections, shuffle=True)
|
||||
|
||||
Z, zy = [], []
|
||||
lfold = {lang: list(skf.split(lX[lang], ly[lang])) for lang in lX.keys()}
|
||||
for fold in range(self.folded_projections):
|
||||
print('fitting the projectors ({}/{})...'.format(fold + 1, self.folded_projections))
|
||||
lfoldXtr, lfoldYtr = {}, {}
|
||||
lfoldXte, lfoldYte = {}, {}
|
||||
for lang in lX.keys():
|
||||
train, test = lfold[lang][fold]
|
||||
lfoldXtr[lang] = lX[lang][train]
|
||||
lfoldYtr[lang] = ly[lang][train]
|
||||
lfoldXte[lang] = lX[lang][test]
|
||||
lfoldYte[lang] = ly[lang][test]
|
||||
Zfold, zYfold = self._get_zspace(lfoldXtr, lfoldYtr, lfoldXte, lfoldYte)
|
||||
Z.append(Zfold)
|
||||
zy.append(zYfold)
|
||||
# compose the Z-space as the union of all folded predictions
|
||||
Z = np.vstack(Z)
|
||||
zy = np.vstack(zy)
|
||||
# refit the document projector with all examples to have a more reliable projector for test data
|
||||
self.doc_projector = self.doc_projector_bu
|
||||
return Z, zy
|
||||
|
||||
def fit(self, lX, ly, lZ=None, lzy=None):
|
||||
tinit = time.time()
|
||||
Z, zy = self._get_zspace(lX, ly) if self.folded_projections == 1 else self._get_zspace_folds(lX, ly)
|
||||
|
||||
#experimental: adds the posterior probabilities (computed outside) to the meta-classifier
|
||||
if lZ is not None and lzy is not None:
|
||||
zlangs = list(lZ.keys())
|
||||
Z = np.vstack((Z, *[lZ[l] for l in zlangs]))
|
||||
zy = np.vstack((zy, *[lzy[l] for l in zlangs]))
|
||||
|
||||
print('fitting the Z-space of shape={}'.format(Z.shape))
|
||||
self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters, n_jobs=self.n_jobs)
|
||||
self.model.fit(Z, zy)
|
||||
self.time = time.time() - tinit
|
||||
|
||||
return self
|
||||
|
||||
def predict(self, lX, lZ=None):
|
||||
"""
|
||||
:param lX: a dictionary {language_label: X csr-matrix}
|
||||
:param lZ: a dictionary {language_label: Z matrix}; if specified, concats this representation
|
||||
:return: a dictionary of predictions
|
||||
"""
|
||||
lZ_ = self._projection(self.doc_projector, lX)
|
||||
if lZ is not None:
|
||||
lZ_ = {**lZ_, **lZ}
|
||||
return _joblib_transform_multiling(self.model.predict, lZ_, n_jobs=self.n_jobs)
|
||||
|
||||
def best_params(self):
|
||||
params = self.doc_projector.best_params()
|
||||
params['meta'] = self.model.best_params()
|
||||
return params
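# Illustrative usage sketch (assumption-laden, never invoked here): funnelling with calibrated SVMs as first-tier
# learners and an SVM meta-learner. lXtr/lYtr are assumed to be {lang: tfidf csr-matrix} and {lang: binary label
# matrix} dictionaries, e.g. as produced by the dataset builders, and lXte the corresponding test matrices.
def _example_funnelling_usage(lXtr, lYtr, lXte):
    from sklearn.svm import SVC

    classifier = FunnellingPolylingualClassifier(
        first_tier_learner=SVC(kernel='linear', probability=True),  # probability=True is required by calmode='cal'
        meta_learner=SVC(kernel='rbf'),
        folded_projections=2,  # >1: the z-space is built from cross-fold projections
        calmode='cal',
        n_jobs=1)
    classifier.fit(lXtr, lYtr)
    return classifier.predict(lXte)  # {lang: predicted label matrix}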
|
||||
|
||||
|
||||
class NaivePolylingualClassifier:
|
||||
"""
|
||||
A mere set of independent MonolingualClassifiers
|
||||
"""
|
||||
def __init__(self, base_learner, parameters=None, n_jobs=-1):
|
||||
self.base_learner = base_learner
|
||||
self.parameters = parameters
|
||||
self.model = None
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
def fit(self, lX, ly):
|
||||
"""
|
||||
trains the independent monolingual classifiers
|
||||
:param lX: a dictionary {language_label: X csr-matrix}
|
||||
:param ly: a dictionary {language_label: y np.array}
|
||||
:return: self
|
||||
"""
|
||||
tinit = time.time()
|
||||
assert set(lX.keys()) == set(ly.keys()), 'inconsistent language mappings in fit'
|
||||
langs = list(lX.keys())
|
||||
for lang in langs:
|
||||
_sort_if_sparse(lX[lang])
|
||||
|
||||
# models = Parallel(n_jobs=self.n_jobs)\
|
||||
# (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]),ly[lang]) for lang in langs)
|
||||
|
||||
models = [MonolingualClassifier(self.base_learner, parameters=self.parameters) for lang in langs]
|
||||
|
||||
for model, lang in zip(models, langs):
|
||||
model.fit(lX[lang], ly[lang])
|
||||
|
||||
self.model = {lang: models[i] for i, lang in enumerate(langs)}
|
||||
self.empty_categories = {lang:self.model[lang].empty_categories for lang in langs}
|
||||
self.time = time.time() - tinit
|
||||
return self
|
||||
|
||||
def decision_function(self, lX):
|
||||
"""
|
||||
:param lX: a dictionary {language_label: X csr-matrix}
|
||||
:return: a dictionary of classification scores for each class
|
||||
"""
|
||||
assert self.model is not None, 'predict called before fit'
|
||||
assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function'
|
||||
langs=list(lX.keys())
|
||||
scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].decision_function)(lX[lang]) for lang in langs)
|
||||
return {lang:scores[i] for i,lang in enumerate(langs)}
|
||||
|
||||
def predict_proba(self, lX):
|
||||
"""
|
||||
:param lX: a dictionary {language_label: X csr-matrix}
|
||||
:return: a dictionary of probabilities that each document belongs to each class
|
||||
"""
|
||||
assert self.model is not None, 'predict called before fit'
|
||||
assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in predict_proba'
|
||||
langs=list(lX.keys())
|
||||
scores = Parallel(n_jobs=self.n_jobs, max_nbytes=None)(delayed(self.model[lang].predict_proba)(lX[lang]) for lang in langs)
|
||||
return {lang:scores[i] for i,lang in enumerate(langs)}
|
||||
|
||||
def predict(self, lX):
|
||||
"""
|
||||
:param lX: a dictionary {language_label: X csr-matrix}
|
||||
:return: a dictionary of predictions
|
||||
"""
|
||||
assert self.model is not None, 'predict called before fit'
|
||||
assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in predict'
|
||||
if self.n_jobs == 1:
|
||||
return {lang:self.model[lang].predict(lX[lang]) for lang in lX.keys()}
|
||||
else:
|
||||
langs = list(lX.keys())
|
||||
scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].predict)(lX[lang]) for lang in langs)
|
||||
return {lang: scores[i] for i, lang in enumerate(langs)}
|
||||
|
||||
def best_params(self):
|
||||
return {l:model.best_params() for l,model in self.model.items()}
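# Illustrative sketch (hypothetical toy data): NaivePolylingualClassifier simply trains one MonolingualClassifier
# per language on {lang: X} / {lang: y} dictionaries, and returns per-language predictions under the same keys.
def _example_naive_polylingual_usage():
    from sklearn.svm import SVC

    lX = {'en': np.array([[1., 0.], [0., 1.], [1., 1.], [0., 0.]]),
          'it': np.array([[2., 0.], [0., 2.], [2., 2.], [0., 0.]])}
    ly = {'en': np.array([[1, 0], [0, 1], [1, 1], [0, 1]]),
          'it': np.array([[1, 0], [0, 1], [1, 1], [0, 1]])}

    naive = NaivePolylingualClassifier(base_learner=SVC(kernel='linear'), n_jobs=1)
    naive.fit(lX, ly)
    return naive.predict(lX)  # {'en': ..., 'it': ...}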
|
||||
|
||||
|
||||
class MonolingualClassifier:
|
||||
|
||||
def __init__(self, base_learner, parameters=None, n_jobs=-1):
|
||||
self.learner = base_learner
|
||||
self.parameters = parameters
|
||||
self.model = None
|
||||
self.n_jobs = n_jobs
|
||||
self.best_params_ = None
|
||||
|
||||
def fit(self, X, y):
|
||||
if X.shape[0] == 0:
|
||||
print('Warning: X has 0 elements, a trivial rejector will be created')
|
||||
self.model = TrivialRejector().fit(X,y)
|
||||
self.empty_categories = np.arange(y.shape[1])
|
||||
return self
|
||||
|
||||
tinit = time.time()
|
||||
_sort_if_sparse(X)
|
||||
self.empty_categories = np.argwhere(np.sum(y, axis=0)==0).flatten()
|
||||
|
||||
# multi-class format
|
||||
if len(y.shape) == 2:
|
||||
if self.parameters is not None:
|
||||
self.parameters = [{'estimator__' + key: params[key] for key in params.keys()}
|
||||
for params in self.parameters]
|
||||
self.model = OneVsRestClassifier(self.learner, n_jobs=self.n_jobs)
|
||||
else:
|
||||
self.model = self.learner
|
||||
raise NotImplementedError('not working as a base-classifier for funneling if there are gaps in the labels across languages')
|
||||
|
||||
# parameter optimization?
|
||||
if self.parameters:
|
||||
print('debug: optimizing parameters:', self.parameters)
|
||||
self.model = GridSearchCV(self.model, param_grid=self.parameters, refit=True, cv=5, n_jobs=self.n_jobs,
|
||||
error_score=0, verbose=10)
|
||||
|
||||
print('fitting:', self.model)
|
||||
self.model.fit(X, y)
|
||||
if isinstance(self.model, GridSearchCV):
|
||||
self.best_params_ = self.model.best_params_
|
||||
print('best parameters: ', self.best_params_)
|
||||
self.time=time.time()-tinit
|
||||
return self
|
||||
|
||||
def decision_function(self, X):
|
||||
assert self.model is not None, 'predict called before fit'
|
||||
_sort_if_sparse(X)
|
||||
return self.model.decision_function(X)
|
||||
|
||||
def predict_proba(self, X):
|
||||
assert self.model is not None, 'predict called before fit'
|
||||
assert hasattr(self.model, 'predict_proba'), 'the probability predictions are not enabled in this model'
|
||||
_sort_if_sparse(X)
|
||||
return self.model.predict_proba(X)
|
||||
|
||||
def predict(self, X):
|
||||
assert self.model is not None, 'predict called before fit'
|
||||
_sort_if_sparse(X)
|
||||
return self.model.predict(X)
|
||||
|
||||
def best_params(self):
|
||||
return self.best_params_
|
||||
|
||||
|
||||
class PolylingualEmbeddingsClassifier:
|
||||
"""
|
||||
This classifier creates document embeddings by a tfidf weighted average of polylingual embeddings from the article
|
||||
@article{conneau2017word,
|
||||
title={Word translation without parallel data},
|
||||
author={Conneau, Alexis and Lample, Guillaume and Ranzato, Marc'Aurelio and Denoyer, Ludovic and J{\'e}gou, Herv{\'e}},
|
||||
journal={arXiv preprint arXiv:1710.04087},
|
||||
year={2017}
|
||||
}
|
||||
url: https://github.com/facebookresearch/MUSE
|
||||
"""
|
||||
def __init__(self, wordembeddings_path, learner, c_parameters=None, n_jobs=-1):
|
||||
"""
|
||||
:param wordembeddings_path: the path to the directory containing the polylingual embeddings
|
||||
:param learner: the learner
|
||||
:param c_parameters: parameters for learner
|
||||
:param n_jobs: the number of concurrent threads
|
||||
"""
|
||||
self.wordembeddings_path = wordembeddings_path
|
||||
self.learner = learner
|
||||
self.c_parameters=c_parameters
|
||||
self.n_jobs = n_jobs
|
||||
self.lang_tfidf = {}
|
||||
self.model = None
|
||||
|
||||
def fit_vectorizers(self, lX):
|
||||
for lang in lX.keys():
|
||||
if lang not in self.lang_tfidf:
|
||||
tfidf = TfidfVectorizer(sublinear_tf=True, use_idf=True) # text is already processed
|
||||
docs = lX[lang]
|
||||
tfidf.fit(docs)
|
||||
self.lang_tfidf[lang] = tfidf
|
||||
|
||||
def embed(self, docs, lang):
|
||||
assert lang in self.lang_tfidf, 'unknown language'
|
||||
tfidf_vectorizer = self.lang_tfidf[lang]
|
||||
V = tfidf_vectorizer.vocabulary_
|
||||
Xweights = tfidf_vectorizer.transform(docs)
|
||||
|
||||
print('loading word embeddings for ' + lang)
|
||||
we = WordEmbeddings.load(self.wordembeddings_path, lang)
|
||||
|
||||
nD = len(docs)
|
||||
doc_vecs = np.zeros((nD, we.dim()))
|
||||
|
||||
for i, doc in enumerate(docs):
|
||||
print('\r\tcomplete {:.3f}%'.format(100 * (i + 1) / nD), end='')
|
||||
# averaging with tfidf (summing each word only once, since the frequency is already controlled)
|
||||
for w in set(doc.split()):
|
||||
if w in we and w in V:
|
||||
doc_vecs[i] += (we[w] * Xweights[i, V[w]])
|
||||
# works much worse with idf; works much worse with document l2-normalization
|
||||
print()
|
||||
|
||||
return doc_vecs
|
||||
|
||||
def fit(self, lX, ly):
|
||||
"""
|
||||
:param lX: a dictionary {language_label: [list of preprocessed documents]}
|
||||
:param ly: a dictionary {language_label: ndarray of shape (ndocs, ncats) binary labels}
|
||||
:return: self
|
||||
"""
|
||||
tinit = time.time()
|
||||
langs = list(lX.keys())
|
||||
WEtr, Ytr = [], []
|
||||
self.fit_vectorizers(lX) # if already fit, does nothing
|
||||
_lX = dict()
|
||||
for lang in langs:
|
||||
_lX[lang] = self.lang_tfidf[lang].transform(lX[lang])
|
||||
WEtr.append(self.embed(lX[lang], lang))
|
||||
Ytr.append(ly[lang])
|
||||
|
||||
# TODO @Andrea --> here embeddings should be stacked horizontally!
|
||||
WEtr = np.vstack(WEtr)
|
||||
Ytr = np.vstack(Ytr)
|
||||
self.embed_time = time.time() - tinit
|
||||
|
||||
print('fitting the WE-space of shape={}'.format(WEtr.shape))
|
||||
self.model = MonolingualClassifier(base_learner=self.learner, parameters=self.c_parameters, n_jobs=self.n_jobs)
|
||||
self.model.fit(WEtr, Ytr) # fit on the stacked document embeddings, consistently with predict()
|
||||
self.time = time.time() - tinit
|
||||
return self
|
||||
|
||||
def predict(self, lX):
|
||||
"""
|
||||
:param lX: a dictionary {language_label: [list of preprocessed documents]}
|
||||
"""
|
||||
assert self.model is not None, 'predict called before fit'
|
||||
langs = list(lX.keys())
|
||||
lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
|
||||
return _joblib_transform_multiling(self.model.predict, lWEte, n_jobs=self.n_jobs)
|
||||
|
||||
def predict_proba(self, lX):
|
||||
"""
|
||||
:param lX: a dictionary {language_label: [list of preprocessed documents]}
|
||||
"""
|
||||
assert self.model is not None, 'predict called before fit'
|
||||
langs = list(lX.keys())
|
||||
lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory
|
||||
return _joblib_transform_multiling(self.model.predict_proba, lWEte, n_jobs=self.n_jobs)
|
||||
|
||||
def best_params(self):
|
||||
return self.model.best_params()
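# Illustrative sketch of the weighted-average document embedding computed by PolylingualEmbeddingsClassifier.embed,
# using a toy 3-word vocabulary and 2-dimensional word vectors instead of the MUSE embeddings (all values hypothetical).
def _example_tfidf_weighted_document_embedding():
    vocabulary = {'court': 0, 'decision': 1, 'appeal': 2}
    word_vectors = np.array([[0.1, 0.3],    # 'court'
                             [0.2, -0.1],   # 'decision'
                             [0.4, 0.0]])   # 'appeal'
    tfidf_weights = np.array([0.7, 0.0, 0.5])  # tf-idf scores of the three words in one document

    # each document vector is the sum of its word vectors, each scaled by the word's tf-idf weight
    doc_vector = sum(tfidf_weights[idx] * word_vectors[idx] for idx in vocabulary.values())
    return doc_vector  # a single dense vector with the same dimensionality as the word embeddings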
|
||||
|
||||
|
||||
class FunnellingEmbeddingPolylingualClassifier:
|
||||
""" Simulated: this setting is merely for testing purposes, and is not realistic. We here assume to have a tfidf
|
||||
vectorizer for the out-of-scope languages (which is not fair)."""
|
||||
def __init__(self, first_tier_learner, embed_learner, meta_learner, wordembeddings_path, training_languages,
|
||||
first_tier_parameters = None, embed_parameters = None, meta_parameters = None, n_jobs=-1):
|
||||
|
||||
assert first_tier_learner.probability==True and embed_learner.probability==True, \
|
||||
'both the first-tier classifier and the polyembedding classifier should allow calibration'
|
||||
|
||||
self.training_languages = training_languages
|
||||
|
||||
self.PLE = PolylingualEmbeddingsClassifier(wordembeddings_path, embed_learner,
|
||||
c_parameters=embed_parameters, n_jobs=n_jobs)
|
||||
|
||||
self.Funnelling = FunnellingPolylingualClassifier(first_tier_learner, meta_learner,
|
||||
first_tier_parameters=first_tier_parameters,
|
||||
meta_parameters=meta_parameters, n_jobs=n_jobs)
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
def vectorize(self, lX):
|
||||
return {l:self.PLE.lang_tfidf[l].transform(lX[l]) for l in lX.keys()}
|
||||
|
||||
def fit(self, lX, ly):
|
||||
"""
|
||||
:param lX: a dictionary {language_label: [list of preprocessed documents]}
|
||||
:param ly: a dictionary {language_label: ndarray of shape (ndocs, ncats) binary labels}
|
||||
:return:
|
||||
"""
|
||||
self.PLE.fit_vectorizers(lX)
|
||||
tinit = time.time()
|
||||
lX = {l: lX[l] for l in lX.keys() if l in self.training_languages}
|
||||
ly = {l: ly[l] for l in lX.keys() if l in self.training_languages}
|
||||
self.PLE.fit(lX, ly)
|
||||
lZ = self.PLE.predict_proba(lX)
|
||||
self.Funnelling.fit(self.vectorize(lX),ly,lZ,ly)
|
||||
self.time = time.time() - tinit
|
||||
return self
|
||||
|
||||
def predict(self, lX):
|
||||
"""
|
||||
:param lX: a dictionary {language_label: [list of preprocessed documents]}
|
||||
"""
|
||||
lXin = {l: lX[l] for l in lX.keys() if l in self.training_languages}
|
||||
lXout = {l: lX[l] for l in lX.keys() if l not in self.training_languages}
|
||||
|
||||
lZ = self.PLE.predict_proba(lXout)
|
||||
|
||||
return self.Funnelling.predict(self.vectorize(lXin), lZ)
|
||||
|
||||
|
||||
def best_params(self):
|
||||
return {'PLE':self.PLE.best_params(), 'Funnelling':self.Funnelling.best_params()}
|
||||
|
||||
|
||||
class AndreaCLF(FunnellingPolylingualClassifier):
|
||||
def __init__(self,
|
||||
we_path,
|
||||
config,
|
||||
first_tier_learner,
|
||||
meta_learner,
|
||||
first_tier_parameters=None,
|
||||
meta_parameters=None,
|
||||
folded_projections=1,
|
||||
calmode='cal', n_jobs=-1):
|
||||
|
||||
super().__init__(first_tier_learner,
|
||||
meta_learner,
|
||||
first_tier_parameters,
|
||||
meta_parameters,
|
||||
folded_projections,
|
||||
calmode,
|
||||
n_jobs)
|
||||
|
||||
self.we_path = we_path
|
||||
self.config = config
|
||||
self.lang_word2idx = dict()
|
||||
self.languages = []
|
||||
self.lang_tfidf = {}
|
||||
self.word_embeddings = {}
|
||||
self.supervised_embeddings = {}
|
||||
|
||||
def vectorize(self, lX, prediction=False):
|
||||
langs = list(lX.keys())
|
||||
print(f'# tfidf-vectorizing docs')
|
||||
if prediction:
|
||||
for lang in langs:
|
||||
assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
|
||||
tfidf_vectorizer = self.lang_tfidf[lang]
|
||||
lX[lang] = tfidf_vectorizer.transform(lX[lang])
|
||||
return self
|
||||
|
||||
for lang in langs:
|
||||
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
|
||||
self.languages.append(lang)
|
||||
tfidf_vectorizer.fit(lX[lang])
|
||||
lX[lang] = tfidf_vectorizer.transform(lX[lang])
|
||||
_sort_if_sparse(lX[lang])
|
||||
self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
|
||||
self.lang_tfidf[lang] = tfidf_vectorizer # needed again at prediction time
|
||||
return self
|
||||
|
||||
# @override std class method
|
||||
def _get_zspace(self, lXtr, lYtr):
|
||||
print('\nfitting the projectors... {}'.format(list(lXtr.keys())))
|
||||
self.doc_projector.fit(lXtr, lYtr)
|
||||
|
||||
print('\nprojecting the documents')
|
||||
lZ = self._projection(self.doc_projector, lXtr)
|
||||
|
||||
return lZ, lYtr
|
||||
|
||||
def embed(self, lX, ly, unsupervised=False, supervised=False, prediction=False):
|
||||
"""
|
||||
builds the embedding matrix for each language and returns the tf-idf-weighted sum of its word embeddings (one dense vector per document)
|
||||
"""
|
||||
_r = dict()
|
||||
languages = list(lX.keys())
|
||||
|
||||
if prediction:
|
||||
for lang in languages:
|
||||
if unsupervised: # If unsupervised embeddings ...
|
||||
M = self.word_embeddings[lang]
|
||||
if supervised: # ...and also supervised --> get both (M) and (S) weighted sum matrices and hstack them
|
||||
S = self.supervised_embeddings[lang]
|
||||
_r[lang] = np.hstack((lX[lang].dot(M), lX[lang].dot(S)))
|
||||
continue
|
||||
_r[lang] = lX[lang].dot(M) # if not supervised --> just get weighted sum of unsupervised (M) embeddings
|
||||
else: # If not unsupervised --> get (S) matrix and its weighted sum
|
||||
S = self.supervised_embeddings[lang]
|
||||
_r[lang] = lX[lang].dot(S)
|
||||
return _r
|
||||
|
||||
if unsupervised:
|
||||
for lang in languages:
|
||||
# print('Test building embedding matrix FastTextMuse ...')
|
||||
_, M = embedding_matrix(self.we_path, self.lang_word2idx[lang], lang)
|
||||
self.word_embeddings[lang] = M
|
||||
_r[lang] = lX[lang].dot(M)
|
||||
|
||||
if supervised:
|
||||
for lang in languages:
|
||||
S = WCE_matrix(lX, ly, lang)
|
||||
S = np.squeeze(np.asarray(S)) # casting to ndarray to better visualize S while debugging
|
||||
self.supervised_embeddings[lang] = S
|
||||
if unsupervised:
|
||||
_r[lang] = np.hstack((_r[lang], lX[lang].dot(S)))
|
||||
else:
|
||||
_r[lang] = lX[lang].dot(S)
|
||||
return _r
|
||||
|
||||
# @override std class method
|
||||
def fit(self, lX, ly):
|
||||
tinit = time.time()
|
||||
print('Vectorizing documents...')
|
||||
self.vectorize(lX)
|
||||
|
||||
for lang in self.languages:
|
||||
print(lX[lang].shape)
|
||||
|
||||
Z, zy = self._get_zspace(lX, ly)
|
||||
|
||||
# the Z vectors are concatenated with each document's weighted embedding sum
|
||||
Z_embedded = dict()
|
||||
l_weighted_em = self.embed(lX, ly,
|
||||
unsupervised=self.config['unsupervised'],
|
||||
supervised=self.config['supervised'])
|
||||
|
||||
if self.config['supervised'] or self.config['unsupervised']:
|
||||
for lang in list(lX.keys()):
|
||||
Z_embedded[lang] = np.hstack((Z[lang], l_weighted_em[lang]))
|
||||
Z = Z_embedded
|
||||
del Z_embedded
|
||||
|
||||
# stacking Z_embedded space vertically
|
||||
# _vertical_Z = np.vstack([Z_embedded[lang] for lang in self.languages])
|
||||
# _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
|
||||
_vertical_Z = np.vstack([Z[lang] for lang in self.languages])
|
||||
_vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
|
||||
|
||||
# zlangs = list(Z_embedded.keys()) # build a list of the embedded matrices and then vstack the list
|
||||
# for i, lang in enumerate(zlangs):
|
||||
# if i == 0:
|
||||
# _vertical_Z = Z_embedded[lang]
|
||||
# _vertical_Zy = zy[lang]
|
||||
# else:
|
||||
# _vertical_Z = np.vstack((_vertical_Z, Z_embedded[lang]))
|
||||
# _vertical_Zy = np.vstack((_vertical_Zy, zy[lang]))
|
||||
|
||||
print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
|
||||
self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
|
||||
n_jobs=self.n_jobs)
|
||||
self.model.fit(_vertical_Z, _vertical_Zy)
|
||||
self.time = time.time() - tinit
|
||||
print(f'\nTotal training time elapsed: {round((self.time/60), 2)} min')
|
||||
|
||||
def predict(self, lX, ly):
|
||||
print('Vectorizing documents')
|
||||
self.vectorize(lX, prediction=True)
|
||||
lZ = self._projection(self.doc_projector, lX)
|
||||
|
||||
if self.config['supervised'] or self.config['unsupervised']:
|
||||
l_weighted_em = self.embed(lX, ly,
|
||||
unsupervised=self.config['unsupervised'],
|
||||
supervised=self.config['supervised'],
|
||||
prediction=True)
|
||||
Z_embedded = dict()
|
||||
for lang in lX.keys():
|
||||
Z_embedded[lang] = np.hstack((lZ[lang], l_weighted_em[lang]))
|
||||
print(Z_embedded[lang].shape)
|
||||
|
||||
return _joblib_transform_multiling(self.model.predict, Z_embedded, n_jobs=self.n_jobs)
|
||||
|
||||
for lang in lZ.keys():
|
||||
print(lZ[lang].shape)
|
||||
return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
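# Illustrative usage sketch (hypothetical paths and data): AndreaCLF augments the funnelling z-space with
# tf-idf-weighted (un)supervised embeddings, selected through the `config` dictionary.
def _example_andrea_clf_usage(lXtr, lYtr, lXte, lYte):
    from sklearn.svm import SVC

    config = {'unsupervised': True,   # concatenate MUSE-based weighted embeddings to the z-space
              'supervised': True}     # concatenate word-class (WCE) weighted embeddings as well
    clf = AndreaCLF(we_path='../embeddings',  # hypothetical directory holding the polylingual embeddings
                    config=config,
                    first_tier_learner=SVC(kernel='linear', probability=True),
                    meta_learner=SVC(kernel='rbf'),
                    n_jobs=1)
    clf.fit(lXtr, lYtr)               # lXtr/lYtr: {lang: [raw documents]} / {lang: label matrix}
    return clf.predict(lXte, lYte)    # {lang: predicted label matrix}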
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
id method learner embed optimp dataset binary languages time lang macrof1 microf1 macrok microk notes
|
||||
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 da 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
|
||||
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 en 0.7866666666666666 0.0 0.7927111111111111 -0.0003376325207643527 nope
|
||||
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_M PolyEmbed_andrea svm M False test_datasetname not_binary not_ablation 55.56810355186462 fr 0.7866666666666666 0.0 0.7930666666666667 -0.0001350530083057411 nope
|
||||
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 da 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
|
||||
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 en 0.7933333333333333 0.0 0.7931111111111111 -0.00013505300830574107 nope
|
||||
jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.picklePolyEmbedd_andrea_None PolyEmbed_andrea svm None False test_datasetname not_binary not_ablation 24.031760931015015 fr 0.7933333333333333 0.0 0.7933333333333333 0.0 nope
|
||||
|
|
|
@ -0,0 +1,110 @@
|
|||
import numpy as np
|
||||
import sklearn.preprocessing
|
||||
# from sklearn.externals.joblib import Parallel, delayed
|
||||
from joblib import Parallel, delayed
|
||||
|
||||
class ESA(object):
|
||||
"""
|
||||
Implementation of Explicit Semantic Analysis (ESA) in its mono-lingual version, as a transformer
|
||||
"""
|
||||
supported_similarity = ['dot', 'cosine']
|
||||
|
||||
def __init__(self, similarity='dot', centered=False, post=None):
|
||||
"""
|
||||
:param similarity: the similarity measure between documents to be used
|
||||
:param centered: set to True to subtract the expected similarity due to randomness (experimental)
|
||||
:param post: any valid sklearn normalization method to be applied to the resulting doc embeddings, or None (default)
|
||||
"""
|
||||
assert similarity in self.supported_similarity, ("Similarity method %s is not supported" % similarity)
|
||||
self.similarity = similarity
|
||||
self.centered = centered
|
||||
self.post_processing = post
|
||||
self.W = None
|
||||
|
||||
def fit(self, W):
|
||||
"""
|
||||
:param W: doc-by-term already processed matrix of wikipedia documents
|
||||
:return: self
|
||||
"""
|
||||
self.W = W
|
||||
return self
|
||||
|
||||
def transform(self, X):
|
||||
"""
|
||||
:param X: doc-by-term matrix that is to be transformed into the ESA space.
|
||||
:return: the matrix X transformed into the ESA space in numpy format
|
||||
"""
|
||||
assert self.W is not None, 'transform method called before fit'
|
||||
|
||||
W = self.W
|
||||
assert X.shape[1] == W.shape[1], ('the feature spaces for X=%s and W=%s do not agree' % (str(X.shape), str(W.shape)))
|
||||
|
||||
if self.similarity in ['dot', 'cosine']:
|
||||
if self.similarity == 'cosine':
|
||||
X = sklearn.preprocessing.normalize(X, norm='l2', axis=1, copy=True)
|
||||
W = sklearn.preprocessing.normalize(W, norm='l2', axis=1, copy=True)
|
||||
|
||||
esa = (X.dot(W.T)).toarray()
|
||||
if self.centered:
|
||||
pX = (X > 0).sum(1) / float(X.shape[1])
|
||||
pW = (W > 0).sum(1) / float(W.shape[1])
|
||||
pXpW = np.sqrt(pX.dot(pW.transpose()))
|
||||
esa = esa - pXpW
|
||||
|
||||
if self.post_processing:
|
||||
esa = sklearn.preprocessing.normalize(esa, norm=self.post_processing, axis=1, copy=True)
|
||||
|
||||
return esa
|
||||
|
||||
def fit_transform(self, W, X, Y=None):
|
||||
self.fit(W)
|
||||
return self.transform(X)
|
||||
|
||||
def dimensionality(self):
|
||||
return self.W.shape[0]
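# Illustrative sketch (toy matrices): ESA represents each document by its similarity to a set of (Wikipedia)
# concept documents; here W plays the role of a tiny doc-by-term Wikipedia matrix over the same 3-term space as X.
def _example_esa_usage():
    from scipy.sparse import csr_matrix

    W = csr_matrix(np.array([[1., 0., 2.],     # concept 1
                             [0., 3., 1.]]))   # concept 2
    X = csr_matrix(np.array([[2., 1., 0.]]))   # one document over the same 3-term space

    esa = ESA(similarity='cosine', post='l2')
    esa.fit(W)
    return esa.transform(X)  # shape (1, 2): one similarity score per concept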
|
||||
|
||||
|
||||
|
||||
class CLESA(ESA):
|
||||
"""
|
||||
Implementation of Cross-Lingual Explicit Semantic Analysis (CL-ESA) as a transformer
|
||||
"""
|
||||
|
||||
def __init__(self, similarity='dot', centered=False, post=False, n_jobs=-1):
|
||||
super(CLESA, self).__init__(similarity, centered, post)
|
||||
self.lESA = None
|
||||
self.langs = None
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
def fit(self, lW):
|
||||
"""
|
||||
:param lW: a dictionary of {language: doc-by-term wiki matrix}
|
||||
:return: self
|
||||
"""
|
||||
assert len(np.unique([W.shape[0] for W in lW.values()])) == 1, "inconsistent dimensions across languages"
|
||||
|
||||
self.dimensions = list(lW.values())[0].shape[0]
|
||||
self.langs = list(lW.keys())
|
||||
self.lESA = {lang:ESA(self.similarity, self.centered, self.post_processing).fit(lW[lang]) for lang in self.langs}
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
"""
|
||||
:param lX: dictionary of {language : doc-by-term matrix} that is to be transformed into the CL-ESA space
|
||||
:return: a dictionary {language : doc-by-dim matrix} containing the matrix-transformed versions
|
||||
"""
|
||||
assert self.lESA is not None, 'transform method called before fit'
|
||||
assert set(lX.keys()).issubset(set(self.langs)), 'languages in lX are not in scope'
|
||||
langs = list(lX.keys())
|
||||
trans = Parallel(n_jobs=self.n_jobs)(delayed(self.lESA[lang].transform)(lX[lang]) for lang in langs)
|
||||
return {lang:trans[i] for i,lang in enumerate(langs)}
|
||||
|
||||
def fit_transform(self, lW, lX):
|
||||
return self.fit(lW).transform(lX)
|
||||
|
||||
def languages(self):
|
||||
return list(self.lESA.keys())
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,154 @@
|
|||
import numpy as np
|
||||
from sklearn.preprocessing import normalize
|
||||
from scipy.sparse import csr_matrix, issparse
|
||||
from scipy.spatial.distance import cosine
|
||||
import operator
|
||||
import functools
|
||||
import math, sys
|
||||
# from sklearn.externals.joblib import Parallel, delayed
|
||||
from joblib import Parallel, delayed
|
||||
|
||||
|
||||
class DistributionalCorrespondenceIndexing:
|
||||
|
||||
prob_dcf = ['linear', 'pmi']
|
||||
vect_dcf = ['cosine']
|
||||
valid_dcf = prob_dcf + vect_dcf
|
||||
valid_post = ['normal', 'l2', None]
|
||||
|
||||
def __init__(self, dcf='cosine', post='normal', n_jobs=-1):
|
||||
"""
|
||||
:param dcf: a distributional correspondence function name (e.g., 'cosine') or a callable f(u,v) which measures
|
||||
the distributional correspondence between vectors u and v
|
||||
:param post: post-processing function to apply to document embeddings. Default is to standardize it into a
|
||||
normal distribution; other functions allowed are 'l2' or None
|
||||
"""
|
||||
if post not in self.valid_post:
|
||||
raise ValueError("unknown post processing function; valid ones are [%s]" % ', '.join(self.valid_post))
|
||||
|
||||
if isinstance(dcf, str):
|
||||
if dcf not in self.valid_dcf:
|
||||
raise ValueError("unknown dcf; use any in [%s]" % ', '.join(self.valid_dcf))
|
||||
self.dcf = getattr(DistributionalCorrespondenceIndexing, dcf)
|
||||
elif hasattr(dcf, '__call__'):
|
||||
self.dcf = dcf
|
||||
else:
|
||||
raise ValueError("param dcf should either be a valid dcf name in [%s] or a callable comparing two vectors" % ', '.join(self.valid_dcf))
|
||||
#self.dcf = lambda u,v:dcf(u,v)
|
||||
self.post = post
|
||||
self.domains = None
|
||||
self.dFP = None
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
def fit(self, dU, dP):
|
||||
"""
|
||||
:param dU: a dictionary of {domain:dsm_matrix}, where dsm is a document-by-term matrix representing the
|
||||
distributional semantic model for a specific domain
|
||||
:param dP: a dictionary {domain:pivot_matrix} where domain is a string representing each domain,
|
||||
and pivot_matrix has shape (d,p) with d the dimensionality of the distributional space, and p the
|
||||
number of pivots
|
||||
:return: self
|
||||
"""
|
||||
self.domains = list(dP.keys())
|
||||
assert len(np.unique([P.shape[1] for P in dP.values()]))==1, "inconsistent number of pivots across domains"
|
||||
assert set(dU.keys())==set(self.domains), "inconsistent domains in dU and dP"
|
||||
assert not [1 for d in self.domains if dU[d].shape[0]!=dP[d].shape[0]], \
|
||||
"inconsistent dimensions between distributional and pivot spaces"
|
||||
self.dimensions = list(dP.values())[0].shape[1]
|
||||
# embed the feature space from each domain using the pivots of that domain
|
||||
#self.dFP = {d:self.dcf_dist(dU[d].transpose(), dP[d].transpose()) for d in self.domains}
|
||||
transformations = Parallel(n_jobs=self.n_jobs)(delayed(self.dcf_dist)(dU[d].transpose(),dP[d].transpose()) for d in self.domains)
|
||||
self.dFP = {d: transformations[i] for i, d in enumerate(self.domains)}
return self
|
||||
|
||||
def _dom_transform(self, X, FP):
|
||||
_X = X.dot(FP)
|
||||
if self.post == 'l2':
|
||||
_X = normalize(_X, norm='l2', axis=1)
|
||||
elif self.post == 'normal':
|
||||
std = np.clip(np.std(_X, axis=0), 1e-5, None)
|
||||
_X = (_X - np.mean(_X, axis=0)) / std
|
||||
return _X
|
||||
|
||||
# dX is a dictionary of {domain:dsm}, where dsm (distributional semantic model) is, e.g., a document-by-term csr_matrix
|
||||
def transform(self, dX):
|
||||
assert self.dFP is not None, 'transform method called before fit'
|
||||
assert set(dX.keys()).issubset(self.domains), 'domains in dX are not in scope'
|
||||
domains = list(dX.keys())
|
||||
transformations = Parallel(n_jobs=self.n_jobs)(delayed(self._dom_transform)(dX[d], self.dFP[d]) for d in domains)
|
||||
return {d: transformations[i] for i, d in enumerate(domains)}
|
||||
|
||||
def fit_transform(self, dU, dP, dX):
|
||||
return self.fit(dU, dP).transform(dX)
|
||||
|
||||
def _prevalence(self, v):
|
||||
if issparse(v):
|
||||
return float(v.nnz) / functools.reduce(operator.mul, v.shape, 1) #this works for arrays of any rank
|
||||
elif isinstance(v, np.ndarray):
|
||||
return float(v[v>0].size) / v.size
|
||||
|
||||
def linear(self, u, v, D):
|
||||
tp, fp, fn, tn = self._get_4cellcounters(u, v, D)
|
||||
den1=tp+fn
|
||||
den2=tn+fp
|
||||
tpr = (tp*1./den1) if den1!=0 else 0.
|
||||
tnr = (tn*1./den2) if den2!=0 else 0.
|
||||
return tpr + tnr - 1
|
||||
|
||||
def pmi(self, u, v, D):
|
||||
tp, fp, fn, tn = self._get_4cellcounters(u, v, D)
|
||||
|
||||
Pxy = tp * 1. / D
|
||||
Pxny = fp * 1. / D
|
||||
Pnxy = fn * 1. / D
|
||||
Px = Pxy + Pxny
|
||||
Py = Pxy + Pnxy
|
||||
|
||||
if (Px == 0 or Py == 0 or Pxy == 0):
|
||||
return 0.0
|
||||
|
||||
score = math.log2(Pxy / (Px * Py))
|
||||
if np.isnan(score) or np.isinf(score):
|
||||
print('NAN')
|
||||
sys.exit()
|
||||
return score
|
||||
|
||||
def cosine(self, u, v):
|
||||
pu = self._prevalence(u)
|
||||
pv = self._prevalence(v)
|
||||
return cosine(u, v) - np.sqrt(pu * pv)
|
||||
|
||||
def _get_4cellcounters(self, u, v, D):
|
||||
"""
|
||||
:param u: a set of indexes with a non-zero value
|
||||
:param v: a set of indexes with a non-zero value
|
||||
:param D: the number of events (i.e., all possible indexes)
|
||||
:return: the 4-cell contingency values (tp, fp, fn, tn)
|
||||
"""
|
||||
common=u.intersection(v)
|
||||
tp = len(common)
|
||||
fp = len(u) - len(common)
|
||||
fn = len(v) - len(common)
|
||||
tn = D - (tp + fp + fn)
|
||||
return tp, fp, fn, tn
|
||||
|
||||
def dcf_dist(self, U, V):
|
||||
nU,D = U.shape
|
||||
nV = V.shape[0]
|
||||
if issparse(U): U = U.toarray()
|
||||
if issparse(V): V = V.toarray()
|
||||
|
||||
dists = np.zeros((nU, nV))
|
||||
if self.dcf.__name__ in self.prob_dcf:
|
||||
def hits_index(v):
|
||||
return set(np.argwhere(v>0).reshape(-1).tolist())
|
||||
Vhits = {i:hits_index(V[i]) for i in range(nV)}
|
||||
for i in range(nU):
|
||||
Ui_hits = hits_index(U[i])
|
||||
for j in range(nV):
|
||||
dists[i, j] = self.dcf(self, Ui_hits, Vhits[j], D)
|
||||
else:
|
||||
for i in range(nU):
|
||||
for j in range(nV):
|
||||
dists[i, j] = self.dcf(self, U[i], V[j])
|
||||
return dists
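# Illustrative sketch (random toy data): DistributionalCorrespondenceIndexing embeds the feature space of each
# domain against that domain's pivot terms, and then projects documents into the resulting shared pivot space.
def _example_dci_usage():
    # two "domains" sharing 4 features and 2 pivots; all values are hypothetical
    dU = {'books': csr_matrix(np.random.rand(5, 4)),   # 5 docs x 4 features (distributional model)
          'music': csr_matrix(np.random.rand(5, 4))}
    dP = {'books': csr_matrix(np.random.rand(5, 2)),   # 5 docs x 2 pivots
          'music': csr_matrix(np.random.rand(5, 2))}
    dX = {'books': csr_matrix(np.random.rand(3, 4)),   # documents to project
          'music': csr_matrix(np.random.rand(3, 4))}

    dci = DistributionalCorrespondenceIndexing(dcf='cosine', post='l2', n_jobs=1)
    dci.fit(dU, dP)
    return dci.transform(dX)  # {'books': (3, 2) matrix, 'music': (3, 2) matrix}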
|
||||
|
||||
|
|
@ -0,0 +1,53 @@
|
|||
import math
|
||||
import numpy as np
|
||||
from scipy.sparse import csr_matrix, issparse
|
||||
|
||||
class RandomIndexingBoC(object):
|
||||
|
||||
def __init__(self, latent_dimensions, non_zeros=2):
|
||||
self.latent_dimensions = latent_dimensions
|
||||
self.k = non_zeros
|
||||
self.ri_dict = None
|
||||
|
||||
def fit_transform(self, X):
|
||||
return self.fit(X).transform(X)
|
||||
|
||||
def fit(self, X):
|
||||
nF = X.shape[1]
|
||||
nL = self.latent_dimensions
|
||||
format = 'csr' if issparse(X) else 'np'
|
||||
self.ri_dict = _create_random_index_dictionary(shape=(nF, nL), k=self.k, normalized=True, format=format)
|
||||
return self
|
||||
|
||||
def transform(self, X):
|
||||
if self.ri_dict is None:
|
||||
raise ValueError("Error: transform method called before fit.")
|
||||
assert X.shape[1] == self.ri_dict.shape[0], 'feature space is inconsistent with the RI dictionary'
|
||||
P = X.dot(self.ri_dict)
|
||||
if issparse(P):
|
||||
P.sort_indices()
|
||||
return P
|
||||
|
||||
|
||||
def _create_random_index_dictionary(shape, k, normalized=False, format='csr', positive=False):
|
||||
assert format in ['csr', 'np'], 'Format should be in "[csr, np]"'
|
||||
nF, latent_dimensions = shape
|
||||
print("Creating the random index dictionary for |V|={} with {} dimensions".format(nF,latent_dimensions))
|
||||
val = 1.0 if not normalized else 1.0/math.sqrt(k)
|
||||
#ri_dict = csr_matrix((nF, latent_dimensions)) if format == 'csr' else np.zeros((nF, latent_dimensions))
|
||||
ri_dict = np.zeros((nF, latent_dimensions))
|
||||
|
||||
#TODO: optimize
|
||||
for t in range(nF):
|
||||
dims = np.zeros(k, dtype=np.int32)
|
||||
dims[0] = t % latent_dimensions # the first dimension is chosen in a round-robin manner (prevents gaps)
|
||||
dims[1:] = np.random.choice(latent_dimensions, size=k-1, replace=False)
|
||||
values = (np.random.randint(0,2, size=k)*2.0-1.0) * val if not positive else np.array([+val]*k)
|
||||
ri_dict[t,dims]=values
|
||||
print("\rprogress [%.2f%% complete]" % (t * 100.0 / nF), end='')
|
||||
print('\nDone')
|
||||
|
||||
if format=='csr':
|
||||
ri_dict = csr_matrix(ri_dict)
|
||||
return ri_dict
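# Illustrative sketch (toy matrix): RandomIndexingBoC projects a bag-of-concepts/bag-of-words matrix onto a
# lower-dimensional space through a sparse random dictionary with k non-zero entries per feature.
def _example_random_indexing_usage():
    X = csr_matrix(np.random.rand(10, 50))             # 10 documents over 50 features (hypothetical counts)
    boc = RandomIndexingBoC(latent_dimensions=8, non_zeros=2)
    P = boc.fit_transform(X)                           # 10 documents over 8 latent dimensions
    return P.shape                                     # (10, 8)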
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
@ -0,0 +1,95 @@
# from sklearn.externals.joblib import Parallel, delayed
from joblib import Parallel, delayed
from util.metrics import *
from sklearn.metrics import f1_score
import numpy as np
import time


def evaluation_metrics(y, y_):
    if len(y.shape) == len(y_.shape) == 1 and len(np.unique(y)) > 2:  # single-label
        raise NotImplementedError()  # return f1_score(y, y_, average='macro'), f1_score(y, y_, average='micro')
    else:  # the implemented metrics assume multilabel classification, treated as a set of binary problems
        return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_)


def soft_evaluation_metrics(y, y_):
    if len(y.shape) == len(y_.shape) == 1 and len(np.unique(y)) > 2:  # single-label
        raise NotImplementedError()  # return f1_score(y, y_, average='macro'), f1_score(y, y_, average='micro')
    else:  # the implemented metrics assume multilabel classification, treated as a set of binary problems
        return smoothmacroF1(y, y_), smoothmicroF1(y, y_), smoothmacroK(y, y_), smoothmicroK(y, y_)


def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1):
    print('evaluation (n_jobs={})'.format(n_jobs))
    if n_jobs == 1:
        return {lang: metrics(ly_true[lang], ly_pred[lang]) for lang in ly_true.keys()}
    else:
        langs = list(ly_true.keys())
        evals = Parallel(n_jobs=n_jobs)(delayed(metrics)(ly_true[lang], ly_pred[lang]) for lang in langs)
        return {lang: evals[i] for i, lang in enumerate(langs)}


def average_results(l_eval, show=True):
    metrics = []
    for lang in l_eval.keys():
        macrof1, microf1, macrok, microk = l_eval[lang]
        metrics.append([macrof1, microf1, macrok, microk])
        if show:
            print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))

    ave = np.mean(np.array(metrics), axis=0)
    if show:
        print('Averages: MF1, mF1, MK, mK', ave)
    return ave


def evaluate_method(polylingual_method, lX, ly, predictor=None, soft=False, return_time=False):
    tinit = time.time()
    print('prediction for test')
    assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate'
    n_jobs = polylingual_method.n_jobs

    if predictor is None:
        predictor = polylingual_method.predict

    metrics = evaluation_metrics
    if soft:
        metrics = soft_evaluation_metrics
    ly_ = predictor(lX, ly)

    eval_ = evaluate(ly, ly_, metrics=metrics, n_jobs=n_jobs)
    if return_time:
        return eval_, time.time() - tinit
    else:
        return eval_


def evaluate_single_lang(polylingual_method, X, y, lang, predictor=None, soft=False):
    print('prediction for test in a single language')
    if predictor is None:
        predictor = polylingual_method.predict

    metrics = evaluation_metrics
    if soft:
        metrics = soft_evaluation_metrics
    ly_ = predictor({lang: X})
    return metrics(y, ly_[lang])


def get_binary_counters(polylingual_method, lX, ly, predictor=None):
    print('prediction for test')
    assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate'
    n_jobs = polylingual_method.n_jobs
    if predictor is None:
        predictor = polylingual_method.predict
    ly_ = predictor(lX)
    print('evaluation (n_jobs={})'.format(n_jobs))
    if n_jobs == 1:
        return {lang: binary_counters(ly[lang], ly_[lang]) for lang in ly.keys()}
    else:
        langs = list(ly.keys())
        evals = Parallel(n_jobs=n_jobs)(delayed(binary_counters)(ly[lang], ly_[lang]) for lang in langs)
        return {lang: evals[i] for i, lang in enumerate(langs)}


def binary_counters(y, y_):
    y = np.reshape(y, (-1))
    assert y.shape == y_.shape and len(y.shape) == 1, 'error, binary vector expected'
    counters = hard_single_metric_statistics(y, y_)
    return counters.tp, counters.tn, counters.fp, counters.fn

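A toy sketch of the per-language dictionary interface these helpers expect (the label matrices are made up, in MultiLabelBinarizer format):

# Sketch: evaluate made-up predictions for two languages and average the scores.
import numpy as np

ly_true = {'en': np.array([[1, 0], [0, 1], [1, 1]]),
           'it': np.array([[0, 1], [1, 0], [0, 0]])}
ly_pred = {'en': np.array([[1, 0], [0, 0], [1, 1]]),
           'it': np.array([[0, 1], [1, 1], [0, 0]])}

l_eval = evaluate(ly_true, ly_pred, n_jobs=1)  # {lang: (macroF1, microF1, macroK, microK)}
average_results(l_eval)                        # prints per-language figures and their mean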
@ -0,0 +1,36 @@
from os import listdir, makedirs
from os.path import isdir, isfile, join, exists, dirname
# from sklearn.externals.six.moves import urllib
import urllib.request


def download_file(url, archive_filename):
    def progress(blocknum, bs, size):
        total_sz_mb = '%.2f MB' % (size / 1e6)
        current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
        print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='')
    print("Downloading %s" % url)
    urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress)
    print("")


def download_file_if_not_exists(url, archive_path):
    if exists(archive_path): return
    makedirs_if_not_exist(dirname(archive_path))
    download_file(url, archive_path)


def ls(dir, typecheck):
    el = [f for f in listdir(dir) if typecheck(join(dir, f))]
    el.sort()
    return el


def list_dirs(dir):
    return ls(dir, typecheck=isdir)


def list_files(dir):
    return ls(dir, typecheck=isfile)


def makedirs_if_not_exist(path):
    if not exists(path): makedirs(path)

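A quick usage sketch (the URL and paths are hypothetical placeholders):

# Sketch: fetch an archive only if it is not already on disk, then list the target directory.
url = 'https://example.org/data/corpus.tar.gz'   # hypothetical URL
archive = '../datasets/corpus.tar.gz'            # hypothetical local path
download_file_if_not_exists(url, archive)        # no-op when the file already exists
print(list_files(dirname(archive)))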
@ -0,0 +1,168 @@
import numpy as np


"""
Scikit-learn provides a full set of evaluation metrics, but it treats special cases differently.
I.e., when the number of true positives, false positives, and false negatives amounts to 0, all
affected metrics (precision, recall, and thus F1) output 0 in scikit-learn.
We adhere to the common practice of outputting 1 in this case, since the classifier has correctly
classified all examples as negatives.
"""


class ContTable:
    def __init__(self, tp=0, tn=0, fp=0, fn=0):
        self.tp = tp
        self.tn = tn
        self.fp = fp
        self.fn = fn

    def get_d(self): return self.tp + self.tn + self.fp + self.fn

    def get_c(self): return self.tp + self.fn

    def get_not_c(self): return self.tn + self.fp

    def get_f(self): return self.tp + self.fp

    def get_not_f(self): return self.tn + self.fn

    def p_c(self): return (1.0*self.get_c())/self.get_d()

    def p_not_c(self): return 1.0-self.p_c()

    def p_f(self): return (1.0*self.get_f())/self.get_d()

    def p_not_f(self): return 1.0-self.p_f()

    def p_tp(self): return (1.0*self.tp) / self.get_d()

    def p_tn(self): return (1.0*self.tn) / self.get_d()

    def p_fp(self): return (1.0*self.fp) / self.get_d()

    def p_fn(self): return (1.0*self.fn) / self.get_d()

    def tpr(self):
        c = 1.0*self.get_c()
        return self.tp / c if c > 0.0 else 0.0

    def fpr(self):
        _c = 1.0*self.get_not_c()
        return self.fp / _c if _c > 0.0 else 0.0

    def __add__(self, other):
        return ContTable(tp=self.tp + other.tp, tn=self.tn + other.tn, fp=self.fp + other.fp, fn=self.fn + other.fn)


def accuracy(cell):
    return (cell.tp + cell.tn)*1.0 / (cell.tp + cell.fp + cell.fn + cell.tn)


def f1(cell):
    num = 2.0 * cell.tp
    den = 2.0 * cell.tp + cell.fp + cell.fn
    if den > 0: return num / den
    # we define F1 to be 1 if den==0, since the classifier has correctly classified all instances as negative
    return 1.0


def K(cell):
    specificity, recall = 0., 0.

    AN = cell.tn + cell.fp
    if AN != 0:
        specificity = cell.tn*1. / AN

    AP = cell.tp + cell.fn
    if AP != 0:
        recall = cell.tp*1. / AP

    if AP == 0:
        return 2. * specificity - 1.
    elif AN == 0:
        return 2. * recall - 1.
    else:
        return specificity + recall - 1.


# computes the (hard) counters tp, fp, fn, and tn from the true and predicted vectors of hard decisions
# true_labels and predicted_labels are two vectors of shape (number_documents,)
def hard_single_metric_statistics(true_labels, predicted_labels):
    assert len(true_labels) == len(predicted_labels), "Format not consistent between true and predicted labels."
    nd = len(true_labels)
    tp = np.sum(predicted_labels[true_labels == 1])
    fp = np.sum(predicted_labels[true_labels == 0])
    fn = np.sum(true_labels[predicted_labels == 0])
    tn = nd - (tp + fp + fn)
    return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)


# computes the (soft) contingency table where tp, fp, fn, and tn are the cumulative masses of the posterior
# probabilities with respect to the true binary labels
# true_labels and posterior_probabilities are two vectors of shape (number_documents,)
def soft_single_metric_statistics(true_labels, posterior_probabilities):
    assert len(true_labels) == len(posterior_probabilities), "Format not consistent between true and predicted labels."
    tp = np.sum(posterior_probabilities[true_labels == 1])
    fn = np.sum(1. - posterior_probabilities[true_labels == 1])
    fp = np.sum(posterior_probabilities[true_labels == 0])
    tn = np.sum(1. - posterior_probabilities[true_labels == 0])
    return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)


# if the classifier is single-class, then the prediction is a vector of shape=(nD,), which causes issues when compared
# to the true labels (of shape=(nD,1)). This method increases the dimensions of the predictions.
def __check_consistency_and_adapt(true_labels, predictions):
    if predictions.ndim == 1:
        return __check_consistency_and_adapt(true_labels, np.expand_dims(predictions, axis=1))
    if true_labels.ndim == 1:
        return __check_consistency_and_adapt(np.expand_dims(true_labels, axis=1), predictions)
    if true_labels.shape != predictions.shape:
        raise ValueError("True and predicted label matrices shapes are inconsistent %s %s."
                         % (true_labels.shape, predictions.shape))
    _, nC = true_labels.shape
    return true_labels, predictions, nC


def macro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics):
    true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels)
    return np.mean([metric(metric_statistics(true_labels[:, c], predicted_labels[:, c])) for c in range(nC)])


def micro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics):
    true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels)

    accum = ContTable()
    for c in range(nC):
        other = metric_statistics(true_labels[:, c], predicted_labels[:, c])
        accum = accum + other

    return metric(accum)


# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def macroF1(true_labels, predicted_labels):
    return macro_average(true_labels, predicted_labels, f1)


# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def microF1(true_labels, predicted_labels):
    return micro_average(true_labels, predicted_labels, f1)


# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def macroK(true_labels, predicted_labels):
    return macro_average(true_labels, predicted_labels, K)


# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def microK(true_labels, predicted_labels):
    return micro_average(true_labels, predicted_labels, K)


# true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
# of the same shape containing real values in [0,1]
def smoothmacroF1(true_labels, posterior_probabilities):
    return macro_average(true_labels, posterior_probabilities, f1, metric_statistics=soft_single_metric_statistics)


# true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
# of the same shape containing real values in [0,1]
def smoothmicroF1(true_labels, posterior_probabilities):
    return micro_average(true_labels, posterior_probabilities, f1, metric_statistics=soft_single_metric_statistics)


# true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
# of the same shape containing real values in [0,1]
def smoothmacroK(true_labels, posterior_probabilities):
    return macro_average(true_labels, posterior_probabilities, K, metric_statistics=soft_single_metric_statistics)


# true_labels is a matrix in sklearn.preprocessing.MultiLabelBinarizer format and posterior_probabilities is a matrix
# of the same shape containing real values in [0,1]
def smoothmicroK(true_labels, posterior_probabilities):
    return micro_average(true_labels, posterior_probabilities, K, metric_statistics=soft_single_metric_statistics)

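A concrete illustration of the all-negative convention stated in the module docstring (a toy, made-up example):

# Toy check: the second class never occurs and is never predicted, so tp = fp = fn = 0 for it.
import numpy as np
from sklearn.metrics import f1_score

y_true = np.array([[1, 0], [1, 0]])
y_pred = np.array([[1, 0], [1, 0]])

print(macroF1(y_true, y_pred))                    # 1.0 (the empty class counts as perfectly classified)
print(f1_score(y_true, y_pred, average='macro'))  # 0.5 (scikit-learn scores the empty class as 0)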
@ -0,0 +1,33 @@
import os
import pandas as pd
import numpy as np


class PolylingualClassificationResults:
    def __init__(self, file, autoflush=True, verbose=False):
        self.file = file
        self.columns = ['id', 'method', 'learner', 'embed', 'optimp', 'dataset', 'binary', 'languages', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
        self.autoflush = autoflush
        self.verbose = verbose
        if os.path.exists(file):
            self.tell('Loading existing file from {}'.format(file))
            self.df = pd.read_csv(file, sep='\t')
        else:
            self.tell('File {} does not exist. Creating new frame.'.format(file))
            dir = os.path.dirname(self.file)
            if dir and not os.path.exists(dir): os.makedirs(dir)
            self.df = pd.DataFrame(columns=self.columns)

    def already_calculated(self, id):
        return (self.df['id'] == id).any()

    def add_row(self, id, method, learner, embed, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
        s = pd.Series([id, method, learner, embed, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
        self.df = self.df.append(s, ignore_index=True)
        if self.autoflush: self.flush()
        self.tell(s.to_string())

    def flush(self):
        self.df.to_csv(self.file, index=False, sep='\t')

    def tell(self, msg):
        if self.verbose: print(msg)
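A short usage sketch for the results log (the file path and all values are hypothetical):

# Sketch: log one made-up result row; the row is appended and flushed to a TSV file.
results = PolylingualClassificationResults('results/results.csv', verbose=True)
if not results.already_calculated(id='fun-jrc-run0'):
    results.add_row(id='fun-jrc-run0', method='Funnelling', learner='SVM', embed='none', optimp=False,
                    dataset='jrc', binary=False, ablation_lang='none', time=123.4,
                    lang='en', macrof1=0.57, microf1=0.62, notes='hypothetical values')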