
fixing dataset loading

Alejandro Moreo Fernandez 2020-12-03 16:36:54 +01:00
parent b6820e8dba
commit e81009e665
3 changed files with 10 additions and 10 deletions

View File

@@ -1,4 +1,5 @@
 from .base import *
-from . import base
+from .reader import *
+from . import reader
 from . import preprocessing

View File

@@ -1,6 +1,5 @@
 import numpy as np
-from scipy.sparse import issparse, dok_matrix
-from sklearn.feature_extraction.text import TfidfVectorizer
+from scipy.sparse import issparse
 from sklearn.model_selection import train_test_split
 from quapy.functional import artificial_prevalence_sampling
 from scipy.sparse import vstack

View File

@@ -1,7 +1,7 @@
+import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from dataset.base import Dataset
 from scipy.sparse import spmatrix
-import numpy as np
 from utils.util import parallelize
 from .base import LabelledCollection
@@ -17,8 +17,8 @@ def text2tfidf(dataset:Dataset, min_df=3, sublinear_tf=True, inplace=False, **kwargs):
     :return: a new Dataset in csr_matrix format (if inplace=False) or a reference to the current Dataset (inplace=True)
         where the instances are stored in a csr_matrix of real-valued tfidf scores
     """
-    __check_type(dataset.training.instances, list, str)
-    __check_type(dataset.test.instances, list, str)
+    __check_type(dataset.training.instances, np.ndarray, str)
+    __check_type(dataset.test.instances, np.ndarray, str)
     vectorizer = TfidfVectorizer(min_df=min_df, sublinear_tf=sublinear_tf, **kwargs)
     training_documents = vectorizer.fit_transform(dataset.training.instances)
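With this change, text2tfidf expects the raw documents to arrive as a numpy array of strings rather than a plain Python list, presumably matching what the fixed dataset loaders now produce. A minimal sketch of the vectorization step this hunk guards, using only standard numpy/sklearn calls (the Dataset/LabelledCollection wiring is omitted and the toy corpus is hypothetical):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# documents held as an np.ndarray of str, as the updated __check_type now requires
docs = np.asarray(['the cat sat on the mat',
                   'dogs and cats',
                   'quantification is not classification'])
assert isinstance(docs, np.ndarray) and isinstance(docs[0], str)

# same vectorizer settings as text2tfidf; min_df lowered so the toy corpus is not filtered out
vectorizer = TfidfVectorizer(min_df=1, sublinear_tf=True)
tfidf = vectorizer.fit_transform(docs)   # csr_matrix of real-valued tfidf scores
print(tfidf.shape)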
@@ -45,8 +45,8 @@ def reduce_columns(dataset:Dataset, min_df=5, inplace=False):
     :return: a new Dataset (if inplace=False) or a reference to the current Dataset (inplace=True)
         where the dimensions corresponding to infrequent instances have been removed
     """
-    __check_type(dataset.training, spmatrix)
-    __check_type(dataset.test, spmatrix)
+    __check_type(dataset.training.instances, spmatrix)
+    __check_type(dataset.test.instances, spmatrix)
     assert dataset.training.instances.shape[1] == dataset.test.instances.shape[1], 'unaligned vector spaces'
     def filter_by_occurrences(X, W):
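Here the check is pointed at dataset.training.instances (the sparse document-by-term matrix) instead of dataset.training (the labelled-collection wrapper), so the spmatrix test actually sees a scipy sparse matrix. A small sketch of what the corrected check amounts to; the LabelledCollection below is a simplified stand-in, not QuaPy's class:

import numpy as np
from scipy.sparse import csr_matrix, spmatrix

X_train = csr_matrix(np.eye(3))          # stand-in for dataset.training.instances
assert isinstance(X_train, spmatrix)     # passes: csr_matrix is a scipy sparse matrix

class LabelledCollection:                # simplified stand-in for the wrapper object
    def __init__(self, instances):
        self.instances = instances

training = LabelledCollection(X_train)
# the old check received the wrapper itself and therefore could never pass
assert not isinstance(training, spmatrix)
assert isinstance(training.instances, spmatrix)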
@@ -101,7 +101,7 @@ def __check_type(container, container_type=None, element_type=None):
     assert isinstance(container, container_type), \
         f'unexpected type of container (expected {container_type}, found {type(container)})'
     if element_type:
-        assert isinstance(next(container), element_type), \
+        assert isinstance(container[0], element_type), \
             f'unexpected type of element (expected {container_type}, found {type(container)})'
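The last hunk fixes the element-type probe itself: next(container) requires an iterator, so it raises TypeError for both a list and an np.ndarray (they are iterables, not iterators), whereas container[0] inspects the first element of any indexable sequence. A quick illustration in plain Python/numpy, outside QuaPy:

import numpy as np

docs = np.asarray(['some text', 'more text'])

# next() needs an iterator; an ndarray (or a list) only supports iteration via iter()
try:
    next(docs)
except TypeError as err:
    print('next() fails:', err)

# indexing works on any sequence, so the element type can be checked directly
print(isinstance(docs[0], str))   # True (numpy str_ is a subclass of str)

One nit that survives the commit: the element-level error message still interpolates container_type and type(container), so a failing element check reports the container's type rather than the element's.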