QuaPy/quapy/data/reader.py

import numpy as np
from scipy.sparse import dok_matrix
from tqdm import tqdm


def from_text(path, encoding='utf-8', verbose=1, class2int=True):
    r"""
    Reads a labelled collection of documents.
    File format <0 or 1>\t<document>\n

    Blank lines are skipped. Lines that cannot be split into exactly two tab-separated fields, or whose label
    cannot be converted to int when `class2int` is True, are reported in standard output and skipped.

    :param path: path to the labelled collection
    :param encoding: the text encoding used to open the file
    :param verbose: if >0 (default) shows some progress information in standard output
    :param class2int: if True (default) the labels are converted to int; otherwise they are returned as the
        strings read from the file
    :return: a list of sentences, and a list of labels
    """
    # read everything up-front inside a context manager so that the file handle is always released
    with open(path, 'rt', encoding=encoding) as fin:
        lines = fin.readlines()
    if verbose > 0:
        lines = tqdm(lines, f'loading {path}')

    all_sentences, all_labels = [], []
    for line in lines:
        line = line.strip()
        if not line:
            continue
        try:
            # unpacking raises ValueError unless there are exactly two fields; int() raises it for bad labels
            label, sentence = line.split('\t')
            sentence = sentence.strip()
            if class2int:
                label = int(label)
        except ValueError:
            print(f'format error in {line}')
            continue
        if sentence:
            all_sentences.append(sentence)
            all_labels.append(label)
    return all_sentences, all_labels


def from_sparse(path):
    r"""
    Reads a labelled collection of real-valued instances expressed in sparse format
    File format <-1 or 0 or 1>[\s col(int):val(float)]\n

    Column indexes in the file are shifted by -1 when stored in the matrix, and labels are shifted by +1 in the
    returned array. Blank lines are skipped; a line carrying only a label yields an all-zero row.

    :param path: path to the labelled collection
    :return: a `csr_matrix` containing the instances (rows), and a ndarray containing the labels
    """

    def split_col_val(col_val):
        # parses one "col:val" token; the column index is decremented by one
        col, val = col_val.split(':')
        col, val = int(col) - 1, float(val)
        return col, val

    # read everything up-front inside a context manager so that the file handle is always released
    with open(path, 'rt') as fin:
        lines = fin.readlines()

    all_documents, all_labels = [], []
    max_col = 0
    for line in tqdm(lines, f'loading {path}'):
        parts = line.strip().split()
        if parts:
            all_labels.append(int(parts[0]))
            pairs = [split_col_val(col_val) for col_val in parts[1:]]
            cols = np.asarray([col for col, _ in pairs], dtype=int)
            vals = np.asarray([val for _, val in pairs], dtype=float)
            # the feature list is optional: an instance without features must not break the max computation
            if cols.size:
                max_col = max(max_col, cols.max())
            all_documents.append((cols, vals))
    n_docs = len(all_labels)
    X = dok_matrix((n_docs, max_col + 1), dtype=float)
    for i, (cols, vals) in tqdm(enumerate(all_documents), total=len(all_documents),
                                desc=f'\\-- filling matrix of shape {X.shape}'):
        if cols.size:
            X[i, cols] = vals
    X = X.tocsr()
    # labels are shifted by one (e.g., -1, 0, 1 become 0, 1, 2)
    y = np.asarray(all_labels) + 1
    return X, y


def from_csv(path, encoding='utf-8'):
    r"""
    Reads a csv file in which columns are separated by ','.
    File format <label>,<feat1>,<feat2>,...,<featn>\n

    Blank lines are skipped. Labels are not converted: they are returned as the strings read from the file.

    :param path: path to the csv file
    :param encoding: the text encoding used to open the file
    :return: a ndarray (float) for the covariates and a np.ndarray for the labels, in this order
    """
    # read everything up-front inside a context manager so that the file handle is always released
    with open(path, 'rt', encoding=encoding) as fin:
        lines = fin.readlines()

    X, y = [], []
    for instance in tqdm(lines, desc=f'reading {path}'):
        instance = instance.strip()
        if not instance:
            # e.g., a trailing empty line; it would otherwise add an empty feature row and make X ragged
            continue
        yi, *xi = instance.split(',')
        X.append(list(map(float, xi)))
        y.append(yi)
    X = np.asarray(X)
    y = np.asarray(y)
    return X, y


def reindex_labels(y):
    """
    Re-indexes a list of labels as a list of indexes, and returns the classnames corresponding to the indexes.
    E.g.:

    >>> reindex_labels(['B', 'B', 'A', 'C'])
    (array([1, 1, 0, 2]), array(['A', 'B', 'C'], dtype='<U1'))

    :param y: the list or array of original labels
    :return: a ndarray (int) of class indexes with the same shape as `y`, and a ndarray containing the sorted
        classnames, so that `classnames[indexes]` gives back the original labels.
    """
    y = np.asarray(y)
    # np.unique returns the sorted distinct labels plus, for every element of the input, the position of its
    # label within that sorted array, i.e., exactly the re-indexing
    classnames, indexed = np.unique(y, return_inverse=True)
    # force the shape of the input (np.unique works on the flattened array) and the default int dtype
    indexed = indexed.reshape(y.shape).astype(int)
    return indexed, classnames


def binarize(y, pos_class):
    """
    Turns a collection of categorical labels into binary labels, taking `pos_class` as the positive class. E.g.:

    >>> binarize([1, 2, 3, 1, 1, 0], pos_class=2)
    array([0, 1, 0, 0, 0, 0])

    :param y: array-like of labels
    :param pos_class: the label to be regarded as the positive class
    :return: a np.ndarray of integers with the same shape as `y`, holding 1 in the positions in which `y`
        is equal to `pos_class`, and 0 everywhere else
    """
    labels = np.asarray(y)
    is_positive = labels == pos_class
    # start from an all-negative array and switch on the positions of the positive class
    binary_labels = np.zeros(labels.shape, dtype=int)
    binary_labels[is_positive] = 1
    return binary_labels
data loading 2020-12-03 16:24:21 +01:00			`import numpy as np`
			`from scipy.sparse import dok_matrix`
			`from tqdm import tqdm`


branch for LeQua2022 - first commit 2021-10-13 20:36:53 +02:00			`def from_text(path, encoding='utf-8', verbose=1, class2int=True):`
data loading 2020-12-03 16:24:21 +01:00			`"""`
branch for LeQua2022 - first commit 2021-10-13 20:36:53 +02:00			`Reads a labelled collection of documents.`
data loading 2020-12-03 16:24:21 +01:00			`File format <0 or 1>\t<document>\n`
updating the documentation 2021-12-06 18:25:47 +01:00
data loading 2020-12-03 16:24:21 +01:00			`:param path: path to the labelled collection`
updating the documentation 2021-12-06 18:25:47 +01:00			`:param encoding: the text encoding used to open the file`
			`:param verbose: if >0 (default) shows some progress information in standard output`
data loading 2020-12-03 16:24:21 +01:00			`:return: a list of sentences, and a list of labels`
			`"""`
			`all_sentences, all_labels = [], []`
branch for LeQua2022 - first commit 2021-10-13 20:36:53 +02:00			`if verbose>0:`
			`file = tqdm(open(path, 'rt', encoding=encoding).readlines(), f'loading {path}')`
			`else:`
			`file = open(path, 'rt', encoding=encoding).readlines()`
			`for line in file:`
data loading 2020-12-03 16:24:21 +01:00			`line = line.strip()`
			`if line:`
testing baselines for lequa 2021-11-24 11:20:42 +01:00			`try:`
			`label, sentence = line.split('\t')`
			`sentence = sentence.strip()`
			`if class2int:`
			`label = int(label)`
			`if sentence:`
			`all_sentences.append(sentence)`
			`all_labels.append(label)`
			`except ValueError:`
			`print(f'format error in {line}')`
data loading 2020-12-03 16:24:21 +01:00			`return all_sentences, all_labels`


			`def from_sparse(path):`
			`"""`
Added encoding option with default to utf-8. 2021-04-30 17:00:46 +02:00			`Reads a labelled collection of real-valued instances expressed in sparse format`
			`File format <-1 or 0 or 1>[\s col(int):val(float)]\n`
updating the documentation 2021-12-06 18:25:47 +01:00
data loading 2020-12-03 16:24:21 +01:00			`:param path: path to the labelled collection`
updating the documentation 2021-12-06 18:25:47 +01:00			:return: a `csr_matrix` containing the instances (rows), and a ndarray containing the labels
data loading 2020-12-03 16:24:21 +01:00			`"""`

			`def split_col_val(col_val):`
			`col, val = col_val.split(':')`
			`col, val = int(col) - 1, float(val)`
			`return col, val`

			`all_documents, all_labels = [], []`
			`max_col = 0`
			`for line in tqdm(open(path, 'rt').readlines(), f'loading {path}'):`
			`parts = line.strip().split()`
			`if parts:`
			`all_labels.append(int(parts[0]))`
			`cols, vals = zip(*[split_col_val(col_val) for col_val in parts[1:]])`
			`cols, vals = np.asarray(cols), np.asarray(vals)`
			`max_col = max(max_col, cols.max())`
			`all_documents.append((cols, vals))`
			`n_docs = len(all_labels)`
			`X = dok_matrix((n_docs, max_col + 1), dtype=float)`
			`for i, (cols, vals) in tqdm(enumerate(all_documents), total=len(all_documents),`
			`desc=f'\-- filling matrix of shape {X.shape}'):`
			`X[i, cols] = vals`
			`X = X.tocsr()`
			`y = np.asarray(all_labels) + 1`
			`return X, y`
dataset fetch for polarity reviews (hp, kindle, imdb) and twitter sentiment (11 datasets) added 2020-12-14 18:36:19 +01:00
added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used) 2021-01-06 14:58:29 +01:00
Added encoding option with default to utf-8. 2021-04-30 17:00:46 +02:00			`def from_csv(path, encoding='utf-8'):`
added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used) 2021-01-06 14:58:29 +01:00			`"""`
Added encoding option with default to utf-8. 2021-04-30 17:00:46 +02:00			`Reads a csv file in which columns are separated by ','.`
			`File format <label>,<feat1>,<feat2>,...,<featn>\n`
updating the documentation 2021-12-06 18:25:47 +01:00
added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used) 2021-01-06 14:58:29 +01:00			`:param path: path to the csv file`
updating the documentation 2021-12-06 18:25:47 +01:00			`:param encoding: the text encoding used to open the file`
			`:return: a np.ndarray for the labels and a ndarray (float) for the covariates`
added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used) 2021-01-06 14:58:29 +01:00			`"""`

			`X, y = [], []`
Added encoding option with default to utf-8. 2021-04-30 17:00:46 +02:00			`for instance in tqdm(open(path, 'rt', encoding=encoding).readlines(), desc=f'reading {path}'):`
added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used) 2021-01-06 14:58:29 +01:00			`yi, *xi = instance.strip().split(',')`
			`X.append(list(map(float,xi)))`
			`y.append(yi)`
			`X = np.asarray(X)`
			`y = np.asarray(y)`
			`return X, y`


			`def reindex_labels(y):`
			`"""`
			`Re-indexes a list of labels as a list of indexes, and returns the classnames corresponding to the indexes.`
updating the documentation 2021-12-06 18:25:47 +01:00			`E.g.:`

			`>>> reindex_labels(['B', 'B', 'A', 'C'])`
			`>>> (array([1, 1, 0, 2]), array(['A', 'B', 'C'], dtype='<U1'))`

added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used) 2021-01-06 14:58:29 +01:00			`:param y: the list or array of original labels`
			`:return: a ndarray (int) of class indexes, and a ndarray of classnames corresponding to the indexes.`
			`"""`
updating the documentation 2021-12-06 18:25:47 +01:00			`y = np.asarray(y)`
			`classnames = np.asarray(sorted(np.unique(y)))`
added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used) 2021-01-06 14:58:29 +01:00			`label2index = {label: index for index, label in enumerate(classnames)}`
some bugfixes, unittest and minor changes 2023-01-16 13:51:29 +01:00			`indexed = np.empty(y.shape, dtype=int)`
added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used) 2021-01-06 14:58:29 +01:00			`for label in classnames:`
			`indexed[y==label] = label2index[label]`
			`return indexed, classnames`


			`def binarize(y, pos_class):`
updating the documentation 2021-12-06 18:25:47 +01:00			`"""`
			Binarizes a categorical array-like collection of labels towards the positive class `pos_class`. E.g.,:

			`>>> binarize([1, 2, 3, 1, 1, 0], pos_class=2)`
			`>>> array([0, 1, 0, 0, 0, 0])`

			`:param y: array-like of labels`
			`:param pos_class: integer, the positive class`
			:return: a binary np.ndarray, in which value 1 corresponds to positions in which `y` had `pos_class` labels, and
			`0 otherwise`
			`"""`
added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used) 2021-01-06 14:58:29 +01:00			`y = np.asarray(y)`
some bugfixes, unittest and minor changes 2023-01-16 13:51:29 +01:00			`ybin = np.zeros(y.shape, dtype=int)`
added Ensemble methods (methods ALL, ACC, Ptr, DS from Pérez-Gallego et al 2017 and 2019) and some UCI ML datasets used in those articles (only 5 datasets out of 32 they used) 2021-01-06 14:58:29 +01:00			`ybin[y == pos_class] = 1`
all uci datasets from Pérez-Gállego added, quantification report added 2021-01-28 18:22:43 +01:00			`return ybin`