Dataset updated, evaluation updated, tests updated

This commit is contained in:
Lorenzo Volpi 2023-09-12 17:38:49 +02:00
parent ad98a515c4
commit c01ac0915c
6 changed files with 141 additions and 16 deletions

BIN
.coverage

Binary file not shown.

View File

@ -2,7 +2,6 @@ from typing import List, Optional, Self
import numpy as np import numpy as np
import math import math
import quapy as qp
import scipy.sparse as sp import scipy.sparse as sp
from quapy.data import LabelledCollection from quapy.data import LabelledCollection
@ -147,17 +146,3 @@ class ExtendedCollection(LabelledCollection):
return ExtendedCollection(n_x, n_y, classes=[*range(0, n_classes * n_classes)]) return ExtendedCollection(n_x, n_y, classes=[*range(0, n_classes * n_classes)])
def get_dataset(name):
    """Return the train/test split of the dataset registered under ``name``.

    Each registry entry is a zero-argument callable, so only the requested
    dataset is actually fetched.

    Args:
        name: Registry key; one of ``"spambase"``, ``"hp"`` or ``"imdb"``.

    Returns:
        The ``train_test`` attribute of the dataset fetched through quapy.

    Raises:
        KeyError: If ``name`` is not a registered dataset.
    """
    datasets = {
        "spambase": lambda: qp.datasets.fetch_UCIDataset(
            "spambase", verbose=False
        ).train_test,
        "hp": lambda: qp.datasets.fetch_reviews("hp", tfidf=True).train_test,
        "imdb": lambda: qp.datasets.fetch_reviews("imdb", tfidf=True).train_test,
    }

    # Guard only the lookup: a KeyError raised while fetching a dataset must
    # not be misreported as "dataset not available".
    try:
        loader = datasets[name]
    except KeyError:
        # The bare lookup error adds nothing to the traceback; suppress it.
        raise KeyError(f"{name} is not available as a dataset") from None
    return loader()

4
quacc/dataset.py Normal file
View File

@ -0,0 +1,4 @@
import quapy as qp
def getImdbTrainTest():
    """Fetch the "imdb" reviews dataset via quapy (tfidf=True) and return its ``train_test``."""
    imdb_reviews = qp.datasets.fetch_reviews("imdb", tfidf=True)
    return imdb_reviews.train_test

View File

@ -36,7 +36,7 @@ def estimate(
_bprev_col_0 = ["base"] _bprev_col_0 = ["base"]
_bprev_col_1 = ["0", "1"] _bprev_col_1 = ["0", "1"]
_prev_col_0 = ["true", "estim"] _prev_col_0 = ["true", "estim"]
_prev_col_1 = ["T0", "F1", "F0", "T1"] _prev_col_1 = ["TN", "FP", "FN", "TP"]
_err_col_0 = ["errors"] _err_col_0 = ["errors"]

104
rcv1_hierarchy Normal file
View File

@ -0,0 +1,104 @@
parent: None child: Root child-description: No Description
parent: CCAT child: C11 child-description: STRATEGY/PLANS
parent: CCAT child: C12 child-description: LEGAL/JUDICIAL
parent: CCAT child: C13 child-description: REGULATION/POLICY
parent: CCAT child: C14 child-description: SHARE LISTINGS
parent: CCAT child: C15 child-description: PERFORMANCE
parent: C15 child: C151 child-description: ACCOUNTS/EARNINGS
parent: C151 child: C1511 child-description: ANNUAL RESULTS
parent: C15 child: C152 child-description: COMMENT/FORECASTS
parent: CCAT child: C16 child-description: INSOLVENCY/LIQUIDITY
parent: CCAT child: C17 child-description: FUNDING/CAPITAL
parent: C17 child: C171 child-description: SHARE CAPITAL
parent: C17 child: C172 child-description: BONDS/DEBT ISSUES
parent: C17 child: C173 child-description: LOANS/CREDITS
parent: C17 child: C174 child-description: CREDIT RATINGS
parent: CCAT child: C18 child-description: OWNERSHIP CHANGES
parent: C18 child: C181 child-description: MERGERS/ACQUISITIONS
parent: C18 child: C182 child-description: ASSET TRANSFERS
parent: C18 child: C183 child-description: PRIVATISATIONS
parent: CCAT child: C21 child-description: PRODUCTION/SERVICES
parent: CCAT child: C22 child-description: NEW PRODUCTS/SERVICES
parent: CCAT child: C23 child-description: RESEARCH/DEVELOPMENT
parent: CCAT child: C24 child-description: CAPACITY/FACILITIES
parent: CCAT child: C31 child-description: MARKETS/MARKETING
parent: C31 child: C311 child-description: DOMESTIC MARKETS
parent: C31 child: C312 child-description: EXTERNAL MARKETS
parent: C31 child: C313 child-description: MARKET SHARE
parent: CCAT child: C32 child-description: ADVERTISING/PROMOTION
parent: CCAT child: C33 child-description: CONTRACTS/ORDERS
parent: C33 child: C331 child-description: DEFENCE CONTRACTS
parent: CCAT child: C34 child-description: MONOPOLIES/COMPETITION
parent: CCAT child: C41 child-description: MANAGEMENT
parent: C41 child: C411 child-description: MANAGEMENT MOVES
parent: CCAT child: C42 child-description: LABOUR
parent: Root child: CCAT child-description: CORPORATE/INDUSTRIAL
parent: ECAT child: E11 child-description: ECONOMIC PERFORMANCE
parent: ECAT child: E12 child-description: MONETARY/ECONOMIC
parent: E12 child: E121 child-description: MONEY SUPPLY
parent: ECAT child: E13 child-description: INFLATION/PRICES
parent: E13 child: E131 child-description: CONSUMER PRICES
parent: E13 child: E132 child-description: WHOLESALE PRICES
parent: ECAT child: E14 child-description: CONSUMER FINANCE
parent: E14 child: E141 child-description: PERSONAL INCOME
parent: E14 child: E142 child-description: CONSUMER CREDIT
parent: E14 child: E143 child-description: RETAIL SALES
parent: ECAT child: E21 child-description: GOVERNMENT FINANCE
parent: E21 child: E211 child-description: EXPENDITURE/REVENUE
parent: E21 child: E212 child-description: GOVERNMENT BORROWING
parent: ECAT child: E31 child-description: OUTPUT/CAPACITY
parent: E31 child: E311 child-description: INDUSTRIAL PRODUCTION
parent: E31 child: E312 child-description: CAPACITY UTILIZATION
parent: E31 child: E313 child-description: INVENTORIES
parent: ECAT child: E41 child-description: EMPLOYMENT/LABOUR
parent: E41 child: E411 child-description: UNEMPLOYMENT
parent: ECAT child: E51 child-description: TRADE/RESERVES
parent: E51 child: E511 child-description: BALANCE OF PAYMENTS
parent: E51 child: E512 child-description: MERCHANDISE TRADE
parent: E51 child: E513 child-description: RESERVES
parent: ECAT child: E61 child-description: HOUSING STARTS
parent: ECAT child: E71 child-description: LEADING INDICATORS
parent: Root child: ECAT child-description: ECONOMICS
parent: GCAT child: G15 child-description: EUROPEAN COMMUNITY
parent: G15 child: G151 child-description: EC INTERNAL MARKET
parent: G15 child: G152 child-description: EC CORPORATE POLICY
parent: G15 child: G153 child-description: EC AGRICULTURE POLICY
parent: G15 child: G154 child-description: EC MONETARY/ECONOMIC
parent: G15 child: G155 child-description: EC INSTITUTIONS
parent: G15 child: G156 child-description: EC ENVIRONMENT ISSUES
parent: G15 child: G157 child-description: EC COMPETITION/SUBSIDY
parent: G15 child: G158 child-description: EC EXTERNAL RELATIONS
parent: G15 child: G159 child-description: EC GENERAL
parent: Root child: GCAT child-description: GOVERNMENT/SOCIAL
parent: GCAT child: GCRIM child-description: CRIME, LAW ENFORCEMENT
parent: GCAT child: GDEF child-description: DEFENCE
parent: GCAT child: GDIP child-description: INTERNATIONAL RELATIONS
parent: GCAT child: GDIS child-description: DISASTERS AND ACCIDENTS
parent: GCAT child: GENT child-description: ARTS, CULTURE, ENTERTAINMENT
parent: GCAT child: GENV child-description: ENVIRONMENT AND NATURAL WORLD
parent: GCAT child: GFAS child-description: FASHION
parent: GCAT child: GHEA child-description: HEALTH
parent: GCAT child: GJOB child-description: LABOUR ISSUES
parent: GCAT child: GMIL child-description: MILLENNIUM ISSUES
parent: GCAT child: GOBIT child-description: OBITUARIES
parent: GCAT child: GODD child-description: HUMAN INTEREST
parent: GCAT child: GPOL child-description: DOMESTIC POLITICS
parent: GCAT child: GPRO child-description: BIOGRAPHIES, PERSONALITIES, PEOPLE
parent: GCAT child: GREL child-description: RELIGION
parent: GCAT child: GSCI child-description: SCIENCE AND TECHNOLOGY
parent: GCAT child: GSPO child-description: SPORTS
parent: GCAT child: GTOUR child-description: TRAVEL AND TOURISM
parent: GCAT child: GVIO child-description: WAR, CIVIL WAR
parent: GCAT child: GVOTE child-description: ELECTIONS
parent: GCAT child: GWEA child-description: WEATHER
parent: GCAT child: GWELF child-description: WELFARE, SOCIAL SERVICES
parent: MCAT child: M11 child-description: EQUITY MARKETS
parent: MCAT child: M12 child-description: BOND MARKETS
parent: MCAT child: M13 child-description: MONEY MARKETS
parent: M13 child: M131 child-description: INTERBANK MARKETS
parent: M13 child: M132 child-description: FOREX MARKETS
parent: MCAT child: M14 child-description: COMMODITY MARKETS
parent: M14 child: M141 child-description: SOFT COMMODITIES
parent: M14 child: M142 child-description: METALS TRADING
parent: M14 child: M143 child-description: ENERGY MARKETS
parent: Root child: MCAT child-description: MARKETS

32
tests/test_dataset.py Normal file
View File

@ -0,0 +1,32 @@
import pytest
from quacc.dataset import Rcv1Helper
@pytest.fixture
def rcv1_helper() -> Rcv1Helper:
    """Provide a freshly constructed ``Rcv1Helper`` to each test that requests it."""
    helper = Rcv1Helper()
    return helper
class TestDataset:
    """Shape and count checks for the RCV1 accessors of ``Rcv1Helper``."""

    def test_rcv1_binary_datasets(self, rcv1_helper):
        """Each yielded (X, Y, name) triple has the expected shapes; 37 are yielded."""
        n_docs, n_features = 517978, 47236
        seen = 0
        for seen, (X, Y, name) in enumerate(
            rcv1_helper.rcv1_binary_datasets(), start=1
        ):
            print(X.shape)
            assert X.shape == (n_docs, n_features)
            assert Y.shape == (n_docs,)

        assert seen == 37

    @pytest.mark.parametrize("label", ["CCAT", "GCAT", "M11"])
    def test_rcv1_binary_dataset_by_label(self, rcv1_helper, label):
        """The per-label train/test split has the expected shapes and label total."""
        train, test = rcv1_helper.rcv1_binary_dataset_by_label(label)

        n_features = 47236
        n_train, n_test = 23149, 781265
        assert train.X.shape == (n_train, n_features)
        assert train.y.shape == (n_train,)
        assert test.X.shape == (n_test, n_features)
        assert test.y.shape == (n_test,)

        # The summed labels of both splits must equal the reported
        # per-class document count for this label.
        expected_docs = dict(rcv1_helper.documents_per_class_rcv1())[label]
        assert expected_docs == train.y.sum() + test.y.sum()