adding UCI ML datasets

This commit is contained in:
Alejandro Moreo Fernandez 2021-01-25 18:38:56 +01:00
parent 8a53cfe677
commit 301e8b9088
2 changed files with 114 additions and 22 deletions

View File

@@ -140,7 +140,15 @@ UCI_DATASETS = ['acute.a', 'acute.b',
'cmc.1', 'cmc.2', 'cmc.3',
'ctg.1', 'ctg.2', 'ctg.3',
#'diabetes', # <-- I haven't found this one...
'german'] # ongoing...
'german',
'haberman',
'ionosphere',
'iris.1', 'iris.2', 'iris.3',
'mammographic',
'pageblocks.5',
#'phoneme', # <-- I haven't found this one...
'semeion',
'sonar'] # ongoing...
def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3):
@@ -164,6 +172,16 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
'ctg.2': 'Cardiotocography Data Set (suspect)',
'ctg.3': 'Cardiotocography Data Set (pathologic)',
'german': 'Statlog German Credit Data',
'haberman': "Haberman's Survival Data",
'ionosphere': 'Johns Hopkins University Ionosphere DB',
'iris.1': 'Iris Plants Database(setosa)',
'iris.2': 'Iris Plants Database(versicolour)',
'iris.3': 'Iris Plants Database(virginica)',
'mammographic': 'Mammographic Mass',
'pageblocks.5': 'Page Blocks Classification (5)',
'semeion': 'Semeion Handwritten Digit (8)',
'sonar': 'Sonar, Mines vs. Rocks'
}
# the identifier is an alias for the dataset group: it is part of the URL data-folder and the name we use for the local data directory
@@ -181,44 +199,59 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
'ctg.1': '00193',
'ctg.2': '00193',
'ctg.3': '00193',
'german': 'statlog/german'
'german': 'statlog/german',
'haberman': 'haberman',
'ionosphere': 'ionosphere',
'iris.1': 'iris',
'iris.2': 'iris',
'iris.3': 'iris',
'mammographic': 'mammographic-masses',
'pageblocks.5': 'page-blocks',
'semeion': 'semeion',
'sonar': 'undocumented/connectionist-bench/sonar'
}
# the filename is the name of the file within the data_folder indexed by the identifier
file_name = {
'acute': 'diagnosis.data',
'balance-scale': 'balance-scale.data',
'breast-cancer-wisconsin': 'breast-cancer-wisconsin.data',
'cmc': 'cmc.data',
'00193': 'CTG.xls',
'statlog/german': 'german.data-numeric'
'statlog/german': 'german.data-numeric',
'mammographic-masses': 'mammographic_masses.data',
'page-blocks': 'page-blocks.data.Z',
'undocumented/connectionist-bench/sonar': 'sonar.all-data'
}
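# (illustrative examples, not part of this change) a dataset name resolves to an identifier via
# identifier_map, and to a file within that UCI data-folder via file_name (defaulting to
# f'{identifier}.data' when absent); e.g., for some of the newly added entries:
#   'haberman'     -> 'haberman'    -> .../machine-learning-databases/haberman/haberman.data
#   'pageblocks.5' -> 'page-blocks' -> .../machine-learning-databases/page-blocks/page-blocks.data.Z
#   'sonar'        -> 'undocumented/connectionist-bench/sonar' -> .../sonar/sonar.all-data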
# the filename containing the dataset description (if any)
desc_name = {
'acute': 'diagnosis.names',
'balance-scale': 'balance-scale.names',
'breast-cancer-wisconsin': 'breast-cancer-wisconsin.names',
'cmc': 'cmc.names',
'00193': None,
'statlog/german': 'german.doc'
'statlog/german': 'german.doc',
'mammographic-masses': 'mammographic_masses.names',
'undocumented/connectionist-bench/sonar': 'sonar.names'
}
identifier = identifier_map[dataset_name]
filename = file_name.get(identifier, f'{identifier}.data')
descfile = desc_name.get(identifier, f'{identifier}.names')
fullname = dataset_fullname[dataset_name]
URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}'
data_dir = join(data_home, 'uci_datasets', identifier)
data_path = join(data_dir, file_name[identifier])
download_file_if_not_exists(f'{URL}/{file_name[identifier]}', data_path)
data_path = join(data_dir, filename)
download_file_if_not_exists(f'{URL}/{filename}', data_path)
descfile = desc_name[identifier]
if descfile:
download_file_if_not_exists(f'{URL}/{descfile}', f'{data_dir}/{descfile}')
if verbose:
print(open(f'{data_dir}/{descfile}', 'rt').read())
try:
download_file_if_not_exists(f'{URL}/{descfile}', f'{data_dir}/{descfile}')
if verbose:
print(open(f'{data_dir}/{descfile}', 'rt').read())
except Exception:
print('could not read the description file')
elif verbose:
print('no file description available')
print(f'Loading {dataset_name} ({dataset_fullname[dataset_name]})')
print(f'Loading {dataset_name} ({fullname})')
if identifier == 'acute':
df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')
if dataset_name == 'acute.a':
@@ -270,12 +303,12 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
df.columns = new_header # set the header row as the df header
X = df.iloc[:, 0:22].astype(float).values
y = df['NSP'].astype(int).values
if dataset_name == 'ctg.1': # 1==Normal
y = binarize(y, pos_class=1)
if dataset_name == 'ctg.1':
y = binarize(y, pos_class=1) # 1==Normal
elif dataset_name == 'ctg.2':
y = binarize(y, pos_class=2) # 1==Suspect
y = binarize(y, pos_class=2) # 2==Suspect
elif dataset_name == 'ctg.3':
y = binarize(y, pos_class=3) # 1==Pathologic
y = binarize(y, pos_class=3) # 3==Pathologic
if identifier == 'statlog/german':
df = pd.read_csv(data_path, header=None, delim_whitespace=True)
@@ -283,6 +316,64 @@ def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3
y = df[24].astype(int).values
y = binarize(y, pos_class=1)
if identifier == 'haberman':
df = pd.read_csv(data_path, header=None)
X = df.iloc[:, 0:3].astype(float).values
y = df[3].astype(int).values
y = binarize(y, pos_class=2)
if identifier == 'ionosphere':
df = pd.read_csv(data_path, header=None)
X = df.iloc[:, 0:34].astype(float).values
y = df[34].values
y = binarize(y, pos_class='b')
if identifier == 'iris':
df = pd.read_csv(data_path, header=None)
X = df.iloc[:, 0:4].astype(float).values
y = df[4].values
if dataset_name == 'iris.1':
y = binarize(y, pos_class='Iris-setosa') # 1==Setosa
elif dataset_name == 'iris.2':
y = binarize(y, pos_class='Iris-versicolor') # 2==Versicolor
elif dataset_name == 'iris.3':
y = binarize(y, pos_class='Iris-virginica') # 3==Virginica
if identifier == 'mammographic-masses':
df = pd.read_csv(data_path, header=None, sep=',')
df[df == '?'] = np.nan
Xy = df.dropna(axis=0)
X = Xy.iloc[:, 0:5]
X = X.astype(float).values
y = binarize(Xy.iloc[:,5], pos_class=1)
if identifier == 'page-blocks':
data_path_ = data_path.replace('.Z', '')
if not os.path.exists(data_path_):
raise FileNotFoundError(f'file {data_path_} does not exist. If this is the first time you '
f'attempt to load this dataset, you have to manually uncompress {data_path} '
f'and name the extracted file {data_path_} (unfortunately, neither zipfile nor '
f'gzip can handle unix-compressed .Z files automatically; the GitHub repo '
f'https://github.com/umeat/unlzw seems to solve this problem).')
df = pd.read_csv(data_path_, header=None, delim_whitespace=True)
X = df.iloc[:, 0:10].astype(float).values
y = df[10].values
y = binarize(y, pos_class=5) # 5==block "graphic"
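# (illustrative sketch, not part of this change) one way to produce the uncompressed file this loader
# expects, assuming the unlzw package from https://github.com/umeat/unlzw is installed and exposes a
# single unlzw(data: bytes) -> bytes function as its README describes:
#   from unlzw import unlzw
#   with open(data_path, 'rb') as fin:
#       raw = unlzw(fin.read())                        # LZW-decompress the unix .Z payload
#   with open(data_path.replace('.Z', ''), 'wb') as fout:
#       fout.write(raw)                                # writes page-blocks.data next to the .Z file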
if identifier == 'semeion':
df = pd.read_csv(data_path, header=None, delim_whitespace=True)
X = df.iloc[:, 0:256].astype(float).values
y = df[263].values # 263 stands for digit 8 (labels are one-hot vectors from col 256-266)
y = binarize(y, pos_class=1)
if identifier == 'undocumented/connectionist-bench/sonar':
df = pd.read_csv(data_path, header=None, sep=',')
X = df.iloc[:, 0:60].astype(float).values
y = df[60].values
y = binarize(y, pos_class='R')
data = LabelledCollection(X, y)
data.stats()
return Dataset(*data.split_stratified(1-test_split, random_state=0))
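
A minimal usage sketch for the newly added loaders (illustration only; it assumes the returned Dataset exposes the training/test LabelledCollections as elsewhere in QuaPy):

    import quapy as qp

    data = qp.datasets.fetch_UCIDataset('haberman', verbose=True, test_split=0.3)
    print(data.training.prevalence())  # class prevalence of the 70% stratified training split
    print(data.test.prevalence())      # class prevalence of the 30% stratified test split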

View File

@@ -11,7 +11,8 @@ from classification.methods import PCALR
from classification.neural import NeuralClassifierTrainer, CNNnet
from quapy.model_selection import GridSearchQ
dataset = qp.datasets.fetch_UCIDataset('sonar', verbose=True)
sys.exit(0)
qp.environ['SAMPLE_SIZE'] = 500
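
A sketch of how the fetched dataset could feed a simple quantification experiment (illustration only; it assumes QuaPy's usual aggregative API, i.e., ACC.fit/quantify and qp.error.ae, with the `dataset` variable fetched above):

    from sklearn.linear_model import LogisticRegression

    model = qp.method.aggregative.ACC(LogisticRegression())
    model.fit(dataset.training)                          # train the quantifier on the training split
    estim_prev = model.quantify(dataset.test.instances)  # estimate class prevalence on the test instances
    true_prev = dataset.test.prevalence()
    print('AE =', qp.error.ae(true_prev, estim_prev))    # absolute error of the estimated prevalence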