diff --git a/.coverage b/.coverage index fc13d34..9f20b3c 100644 Binary files a/.coverage and b/.coverage differ diff --git a/quacc/data.py b/quacc/data.py index b05105b..df03086 100644 --- a/quacc/data.py +++ b/quacc/data.py @@ -2,7 +2,6 @@ from typing import List, Optional, Self import numpy as np import math -import quapy as qp import scipy.sparse as sp from quapy.data import LabelledCollection @@ -147,17 +146,3 @@ class ExtendedCollection(LabelledCollection): return ExtendedCollection(n_x, n_y, classes=[*range(0, n_classes * n_classes)]) - -def get_dataset(name): - datasets = { - "spambase": lambda: qp.datasets.fetch_UCIDataset( - "spambase", verbose=False - ).train_test, - "hp": lambda: qp.datasets.fetch_reviews("hp", tfidf=True).train_test, - "imdb": lambda: qp.datasets.fetch_reviews("imdb", tfidf=True).train_test, - } - - try: - return datasets[name]() - except KeyError: - raise KeyError(f"{name} is not available as a dataset") diff --git a/quacc/dataset.py b/quacc/dataset.py new file mode 100644 index 0000000..eed7384 --- /dev/null +++ b/quacc/dataset.py @@ -0,0 +1,4 @@ +import quapy as qp + +def getImdbTrainTest(): + return qp.datasets.fetch_reviews("imdb", tfidf=True).train_test \ No newline at end of file diff --git a/quacc/evaluation.py b/quacc/evaluation.py index 48502d4..3d1cc20 100644 --- a/quacc/evaluation.py +++ b/quacc/evaluation.py @@ -36,7 +36,7 @@ def estimate( _bprev_col_0 = ["base"] _bprev_col_1 = ["0", "1"] _prev_col_0 = ["true", "estim"] -_prev_col_1 = ["T0", "F1", "F0", "T1"] +_prev_col_1 = ["TN", "FP", "FN", "TP"] _err_col_0 = ["errors"] diff --git a/rcv1_hierarchy b/rcv1_hierarchy new file mode 100644 index 0000000..1cea90e --- /dev/null +++ b/rcv1_hierarchy @@ -0,0 +1,104 @@ +parent: None child: Root child-description: No Description +parent: CCAT child: C11 child-description: STRATEGY/PLANS +parent: CCAT child: C12 child-description: LEGAL/JUDICIAL +parent: CCAT child: C13 child-description: REGULATION/POLICY +parent: CCAT child: C14 child-description: SHARE LISTINGS +parent: CCAT child: C15 child-description: PERFORMANCE +parent: C15 child: C151 child-description: ACCOUNTS/EARNINGS +parent: C151 child: C1511 child-description: ANNUAL RESULTS +parent: C15 child: C152 child-description: COMMENT/FORECASTS +parent: CCAT child: C16 child-description: INSOLVENCY/LIQUIDITY +parent: CCAT child: C17 child-description: FUNDING/CAPITAL +parent: C17 child: C171 child-description: SHARE CAPITAL +parent: C17 child: C172 child-description: BONDS/DEBT ISSUES +parent: C17 child: C173 child-description: LOANS/CREDITS +parent: C17 child: C174 child-description: CREDIT RATINGS +parent: CCAT child: C18 child-description: OWNERSHIP CHANGES +parent: C18 child: C181 child-description: MERGERS/ACQUISITIONS +parent: C18 child: C182 child-description: ASSET TRANSFERS +parent: C18 child: C183 child-description: PRIVATISATIONS +parent: CCAT child: C21 child-description: PRODUCTION/SERVICES +parent: CCAT child: C22 child-description: NEW PRODUCTS/SERVICES +parent: CCAT child: C23 child-description: RESEARCH/DEVELOPMENT +parent: CCAT child: C24 child-description: CAPACITY/FACILITIES +parent: CCAT child: C31 child-description: MARKETS/MARKETING +parent: C31 child: C311 child-description: DOMESTIC MARKETS +parent: C31 child: C312 child-description: EXTERNAL MARKETS +parent: C31 child: C313 child-description: MARKET SHARE +parent: CCAT child: C32 child-description: ADVERTISING/PROMOTION +parent: CCAT child: C33 child-description: CONTRACTS/ORDERS +parent: C33 child: C331 child-description: DEFENCE CONTRACTS +parent: CCAT child: C34 child-description: MONOPOLIES/COMPETITION +parent: CCAT child: C41 child-description: MANAGEMENT +parent: C41 child: C411 child-description: MANAGEMENT MOVES +parent: CCAT child: C42 child-description: LABOUR +parent: Root child: CCAT child-description: CORPORATE/INDUSTRIAL +parent: ECAT child: E11 child-description: ECONOMIC PERFORMANCE +parent: ECAT child: E12 child-description: MONETARY/ECONOMIC +parent: E12 child: E121 child-description: MONEY SUPPLY +parent: ECAT child: E13 child-description: INFLATION/PRICES +parent: E13 child: E131 child-description: CONSUMER PRICES +parent: E13 child: E132 child-description: WHOLESALE PRICES +parent: ECAT child: E14 child-description: CONSUMER FINANCE +parent: E14 child: E141 child-description: PERSONAL INCOME +parent: E14 child: E142 child-description: CONSUMER CREDIT +parent: E14 child: E143 child-description: RETAIL SALES +parent: ECAT child: E21 child-description: GOVERNMENT FINANCE +parent: E21 child: E211 child-description: EXPENDITURE/REVENUE +parent: E21 child: E212 child-description: GOVERNMENT BORROWING +parent: ECAT child: E31 child-description: OUTPUT/CAPACITY +parent: E31 child: E311 child-description: INDUSTRIAL PRODUCTION +parent: E31 child: E312 child-description: CAPACITY UTILIZATION +parent: E31 child: E313 child-description: INVENTORIES +parent: ECAT child: E41 child-description: EMPLOYMENT/LABOUR +parent: E41 child: E411 child-description: UNEMPLOYMENT +parent: ECAT child: E51 child-description: TRADE/RESERVES +parent: E51 child: E511 child-description: BALANCE OF PAYMENTS +parent: E51 child: E512 child-description: MERCHANDISE TRADE +parent: E51 child: E513 child-description: RESERVES +parent: ECAT child: E61 child-description: HOUSING STARTS +parent: ECAT child: E71 child-description: LEADING INDICATORS +parent: Root child: ECAT child-description: ECONOMICS +parent: GCAT child: G15 child-description: EUROPEAN COMMUNITY +parent: G15 child: G151 child-description: EC INTERNAL MARKET +parent: G15 child: G152 child-description: EC CORPORATE POLICY +parent: G15 child: G153 child-description: EC AGRICULTURE POLICY +parent: G15 child: G154 child-description: EC MONETARY/ECONOMIC +parent: G15 child: G155 child-description: EC INSTITUTIONS +parent: G15 child: G156 child-description: EC ENVIRONMENT ISSUES +parent: G15 child: G157 child-description: EC COMPETITION/SUBSIDY +parent: G15 child: G158 child-description: EC EXTERNAL RELATIONS +parent: G15 child: G159 child-description: EC GENERAL +parent: Root child: GCAT child-description: GOVERNMENT/SOCIAL +parent: GCAT child: GCRIM child-description: CRIME, LAW ENFORCEMENT +parent: GCAT child: GDEF child-description: DEFENCE +parent: GCAT child: GDIP child-description: INTERNATIONAL RELATIONS +parent: GCAT child: GDIS child-description: DISASTERS AND ACCIDENTS +parent: GCAT child: GENT child-description: ARTS, CULTURE, ENTERTAINMENT +parent: GCAT child: GENV child-description: ENVIRONMENT AND NATURAL WORLD +parent: GCAT child: GFAS child-description: FASHION +parent: GCAT child: GHEA child-description: HEALTH +parent: GCAT child: GJOB child-description: LABOUR ISSUES +parent: GCAT child: GMIL child-description: MILLENNIUM ISSUES +parent: GCAT child: GOBIT child-description: OBITUARIES +parent: GCAT child: GODD child-description: HUMAN INTEREST +parent: GCAT child: GPOL child-description: DOMESTIC POLITICS +parent: GCAT child: GPRO child-description: BIOGRAPHIES, PERSONALITIES, PEOPLE +parent: GCAT child: GREL child-description: RELIGION +parent: GCAT child: GSCI child-description: SCIENCE AND TECHNOLOGY +parent: GCAT child: GSPO child-description: SPORTS +parent: GCAT child: GTOUR child-description: TRAVEL AND TOURISM +parent: GCAT child: GVIO child-description: WAR, CIVIL WAR +parent: GCAT child: GVOTE child-description: ELECTIONS +parent: GCAT child: GWEA child-description: WEATHER +parent: GCAT child: GWELF child-description: WELFARE, SOCIAL SERVICES +parent: MCAT child: M11 child-description: EQUITY MARKETS +parent: MCAT child: M12 child-description: BOND MARKETS +parent: MCAT child: M13 child-description: MONEY MARKETS +parent: M13 child: M131 child-description: INTERBANK MARKETS +parent: M13 child: M132 child-description: FOREX MARKETS +parent: MCAT child: M14 child-description: COMMODITY MARKETS +parent: M14 child: M141 child-description: SOFT COMMODITIES +parent: M14 child: M142 child-description: METALS TRADING +parent: M14 child: M143 child-description: ENERGY MARKETS +parent: Root child: MCAT child-description: MARKETS \ No newline at end of file diff --git a/tests/test_dataset.py b/tests/test_dataset.py new file mode 100644 index 0000000..4a77368 --- /dev/null +++ b/tests/test_dataset.py @@ -0,0 +1,32 @@ +import pytest +from quacc.dataset import Rcv1Helper + + +@pytest.fixture +def rcv1_helper() -> Rcv1Helper: + return Rcv1Helper() + + +class TestDataset: + def test_rcv1_binary_datasets(self, rcv1_helper): + count = 0 + for X, Y, name in rcv1_helper.rcv1_binary_datasets(): + count += 1 + print(X.shape) + assert X.shape == (517978, 47236) + assert Y.shape == (517978,) + + assert count == 37 + + @pytest.mark.parametrize("label", ["CCAT", "GCAT", "M11"]) + def test_rcv1_binary_dataset_by_label(self, rcv1_helper, label): + train, test = rcv1_helper.rcv1_binary_dataset_by_label(label) + assert train.X.shape == (23149, 47236) + assert train.y.shape == (23149,) + assert test.X.shape == (781265, 47236) + assert test.y.shape == (781265,) + + assert ( + dict(rcv1_helper.documents_per_class_rcv1())[label] + == train.y.sum() + test.y.sum() + )