# Source code for driftai.data.dataset

import hashlib
import string
from datetime import datetime
import dateutil.parser
from pathlib import Path

import numpy as np
from sklearn.model_selection import train_test_split

from .datasource import Datasource, FileDatasource, ImageDatasource
from driftai.utils import uri_to_filepath, maybe_make_dir, str_to_date, import_from
from driftai.db import Persistent, Collections

from driftai.exceptions import OptAppInvalidStructureException, \
                              OptAppInstanceExistsException, \
                              OptAppMethodNotImplementedYetException, \
                              OptAppInvalidStructureException


class Dataset(Persistent):
    """
    Indexed dataset over a datasource
    """

    def __init__(self, datasource, infolist=None, problem_type=None, creation_date=None, id=None):
        """
        Parameters
        ----------
        datasource: Datasource
            Datasource of the dataset
        infolist: list, optional
            Index entries of the dataset. When omitted (or empty) it is fetched
            from ``datasource.get_infolist()``. The label of every entry is read
            from position 1 (``get_labels``) / the last position
            (``_get_problem_type``).
        problem_type: str, optional
            Objective of the algorithm.
            If `problem_type` is not set manually, driftai will infer it automatically.
            Possible values are: binary_clf, clf or regression
        creation_date: str, datetime, optional
            Creation date of the dataset. Should not be set manually
        id: str, optional
            Unique identifier for Dataset

        Raises
        ------
        OptAppInstanceExistsException
            If no creation date is given (i.e. a new dataset is being created)
            and a dataset with the same id is already stored
        """
        self.datasource = datasource
        self.infolist = infolist or self.datasource.get_infolist()
        self.problem_type = problem_type or self._get_problem_type()
        self.creation_date = str_to_date(creation_date) or datetime.now()
        self._id = id or self._get_id()

        # A missing creation date means the instance is brand new (not being
        # re-created from stored data), so its id must not be taken already.
        is_new = creation_date is None
        if is_new and Dataset.collection().exists(self.id):
            raise OptAppInstanceExistsException("Dataset")

    @property
    def id(self):
        """Unique identifier of the dataset."""
        return self._id

    @staticmethod
    def collection():
        """
        Get table containing datasets

        Returns
        -------
        TinyDB instance
        """
        return Collections.datasets()

    def get_labels(self):
        """
        Get all the labels

        Returns
        -------
        list
            List with all labels, one per infolist entry
        """
        # NOTE(review): labels are read from position 1 here but from the last
        # position in _get_problem_type; both agree only if entries are pairs.
        return [entry[1] for entry in self.infolist]

    def _get_problem_type(self):
        """
        Infer the problem type from the labels found in the infolist

        Returns
        -------
        str
            ``binary_clf`` when there are exactly two distinct labels, ``clf``
            when every label is a string or there are fewer than 10 distinct
            labels, ``regression`` otherwise
        """
        # TODO: Is it really necessary?
        distinct_labels = {entry[-1] for entry in self.infolist}
        if len(distinct_labels) == 2:
            return "binary_clf"

        # TODO: Think a better solution
        # isinstance (rather than an exact type comparison) also accepts
        # str subclasses as textual labels.
        only_strings = all(isinstance(label, str) for label in distinct_labels)
        if only_strings or len(distinct_labels) < 10:
            return "clf"
        return "regression"

    @staticmethod
    def from_dir(path, path_pattern=None, datatype="img"):
        """
        Create a Dataset from dir

        Parameters
        ----------
        path: str
            DataSource location path
        path_pattern: str, optional
            Pattern to generate metadata. If path_pattern is left to None the
            datasource's default pattern is used
        datatype: str, optional
            Directory datatype. Either one of the built-in names (``img``) or
            the dotted path of a custom datasource class
            (``package.module.ClassName``)

        Returns
        -------
        Dataset
            New dataset whose id is the last component of `path` without suffix
        """
        builtin_datasources = {
            "img": ImageDatasource,
        }

        if datatype in builtin_datasources:
            ds_class = builtin_datasources[datatype]
        else:
            # Anything else is treated as "<module path>.<class name>"
            module_name, _, class_name = datatype.rpartition('.')
            ds_class = import_from(module_name, class_name)

        datasource_parameters = {"path": path}
        if path_pattern:
            datasource_parameters["parsing_pattern"] = path_pattern

        return Dataset(
            datasource=ds_class(**datasource_parameters),
            id=Path(path).stem,
        )

    @staticmethod
    def read_file(path, label=None, first_line_heading=True):
        """
        Create a Dataset from a file

        Parameters
        ----------
        path: str
            DataSource location path
        label: str, optional
            Name of the label. If label is left to None the default label is
            assumed to be the last column
        first_line_heading: bool, optional
            If True considers that first line is the header

        Returns
        -------
        Dataset
            New dataset whose id is the file name without suffix
        """
        return Dataset(
            datasource=FileDatasource(path, label, first_line_heading),
            infolist=None,
            id=Path(path).stem,
        )

    @classmethod
    def load_from_data(cls, data):
        """
        Creates a Dataset object from serialized JSON data coming from TinyDB

        Parameters
        ----------
        data: dict
            JSON data from TinyDB. It is not modified.

        Raises
        ------
        OptAppInvalidStructureException
            In case `data` is not a dict or any required key is missing

        Returns
        -------
        driftai.Dataset
            New Dataset instance
        """
        required_keys = {"datasource", "creation_date", "id", "infolist", "problem_type"}
        if not isinstance(data, dict) or not required_keys.issubset(data):
            raise OptAppInvalidStructureException()

        # Work on a shallow copy so the caller's document is left untouched
        params = dict(data)
        params["datasource"] = Datasource.load_from_data(params["datasource"])
        return cls(**params)

    def get_info(self):
        """
        Get info to serialize a Dataset instance

        Returns
        -------
        dict
            Dictionary containing a Dataset object summary::

            {
                "datasource": dict, copy of the datasource's own get_info(),
                "infolist": index entries of the dataset,
                "problem_type": <clf, regression, binary_clf>,
                "creation_date": <creation date of the dataset, as str>,
                "id": <unique identifier>
            }
        """
        return {
            "datasource": dict(self.datasource.get_info()),
            "infolist": self.infolist,
            "problem_type": self.problem_type,
            "creation_date": str(self.creation_date),
            "id": self.id,
        }

    def generate_subdataset(self, method, by):
        """
        Creates a subdataset of the current Dataset

        Parameters
        ----------
        method: str
            Evaluation sets split approach.
            Can be: ``train_test`` ``k_fold``

        by: float, int
            If train_test method is specified, by represents the training set size. For example: .85
            If k_fold method is specified, `by` is the number of folds

        Returns
        -------
        SubDataset
            New subdataset built on top of this dataset
        """
        return SubDataset(dataset=self, method=method, by=by)

    def get_data(self):
        """
        Get datasource data
        """
        return self.datasource.get_data()

    def __getitem__(self, indices):
        # The infolist is converted to an array so that it can be indexed with
        # a list of positions; the selected entries are resolved by the datasource.
        selected_entries = np.array(self.infolist)[indices]
        return self.datasource[selected_entries]

    def _get_id(self):
        """Default id: MD5 hex digest of the creation date's string form."""
        date_bytes = str(self.creation_date).encode('utf-8')
        return hashlib.md5(date_bytes).hexdigest()


class SubDataset(Persistent):
    """
    Split of a Dataset into train/test evaluation sets
    """

    def __init__(self, dataset, method, by=None, indices=None, id=None, creation_date=None):
        """
        Parameters
        ----------
        dataset: Dataset
            DriftAI dataset which the current subdataset inherits from
        method: str
            Evaluation sets split approach.
            Can be: train_test, k_fold
        by: float, int, optional
            If train_test method is specified, by represents the training set size. For example: .85
            If k_fold method is specified, `by` is the number of folds
        indices: dict, optional
            Contains the split method and the indices of each set::

            {
                "method": str,
                "sets": {
                    <set identifier>: {
                        "train": list of int,
                        "test": list of int
                    }
                }
            }

            Should not be set by the developer
        id: str, optional
            Unique identifier
        creation_date: str, datetime, optional
            Creation date of the subdataset. Should not be set manually

        Raises
        ------
        TypeError
            If neither `indices` nor `by` is given
        OptAppMethodNotImplementedYetException
            If indices must be generated and `method` is not supported
        OptAppInstanceExistsException
            If no creation date is given (i.e. a new subdataset is being
            created) and a subdataset with the same id is already stored
        """
        self.dataset = dataset

        # if indices are not passed as parameter, `by` is required to generate them
        if indices is None and by is None:
            raise TypeError(
                "missing one of the two arguments: 'indices' or 'by'")

        self.indices = indices or self._generate_indices(method=method, by=by)
        self.method = method
        self.by = by
        self.creation_date = str_to_date(creation_date) or datetime.now()
        self._id = id or self._get_id()

        # A missing creation date means the instance is brand new (not being
        # re-created from stored data), so its id must not be taken already.
        is_new = creation_date is None
        if is_new and SubDataset.collection().exists(self.id):
            raise OptAppInstanceExistsException("SubDataset")

    @property
    def id(self):
        """Unique identifier of the subdataset."""
        return self._id

    @staticmethod
    def collection():
        """
        Get table containing subdatasets

        Returns
        -------
        TinyDB instance
        """
        return Collections.subdatasets()

    @classmethod
    def load_from_data(cls, data):
        """
        Loads a subdataset from data coming from TinyDB

        Parameters
        ----------
        data: dict
            JSON data. It is not modified.

        Raises
        ------
        OptAppInvalidStructureException
            If `data` is not a dict or any required key is missing

        Returns
        -------
        driftai.SubDataset
            New SubDataset instance
        """
        required_keys = {"dataset", "creation_date", "method", "by", "indices", "id"}
        if not isinstance(data, dict) or not required_keys.issubset(data):
            raise OptAppInvalidStructureException()

        # Work on a shallow copy so the caller's document is left untouched.
        # The serialized form stores only the parent dataset id (see get_info).
        params = dict(data)
        params["dataset"] = Dataset.load(params["dataset"])
        return cls(**params)

    def _get_id(self):
        """Default id: ``<dataset id>_<method>_<by>``."""
        return self.dataset.id + "_" + self.method + "_" + str(self.by)

    @staticmethod
    def _fold_label(position):
        """
        Identifier of the fold at a zero-based position

        Positions 0..25 map to ``A``..``Z``; later positions continue
        spreadsheet-style (``AA``, ``AB``, ...), so any number of folds
        can be labelled.
        """
        alphabet = string.ascii_uppercase
        label = ""
        remaining = position + 1
        while remaining > 0:
            remaining, offset = divmod(remaining - 1, len(alphabet))
            label = alphabet[offset] + label
        return label

    def _generate_indices(self, method="train_test", by=None):
        """
        Generate the train/test indices of every evaluation set

        Parameters
        ----------
        method: str
            ``train_test`` or ``k_fold``
        by: float, int
            Training set size (train_test) or number of folds (k_fold)

        Raises
        ------
        OptAppMethodNotImplementedYetException
            If `method` is not one of the supported methods

        Returns
        -------
        dict
            ``{"method": method, "sets": {<set id>: {"train": [...], "test": [...]}}}``.
            The only set of a train_test split is named ``"0"``; k_fold sets
            are named ``A``, ``B``, ...
        """
        infolist = self.dataset.infolist

        if method == "train_test":
            train, test = self._train_test_split(
                infolist=infolist, split=by, seed=None)
            sets = {"0": {"train": train, "test": test}}

        elif method == "k_fold":
            folds = self._k_fold_cv_split(
                infolist=infolist, split=by, seed=None)
            sets = {
                self._fold_label(position): {"train": fold["train"],
                                             "test": fold["test"]}
                for position, fold in enumerate(folds)
            }
        # elif method == "stratified_train_test":
        # elif method == "bootstrap":

        else:
            raise OptAppMethodNotImplementedYetException()

        return {"method": method, "sets": sets}

    def _train_test_split(self, infolist, split, seed=None):
        """
        Split the positions of `infolist` into a train and a test list

        Parameters
        ----------
        infolist: list
            Entries to split; only its length is used
        split: float, int
            Training set size, forwarded as scikit-learn's ``train_size``
        seed: int, optional
            Random state of the shuffle. None gives a different split each call

        Returns
        -------
        list
            ``[train positions, test positions]``
        """
        positions = list(range(len(infolist)))
        # Only train_size is passed: scikit-learn uses the complement as test
        # set. Passing test_size=1-split explicitly is fragile, since float
        # rounding (e.g. 1 - 0.85 == 0.15000000000000002) can make both sizes
        # add up to more samples than available.
        return train_test_split(positions, train_size=split, random_state=seed)

    def _k_fold_cv_split(self, infolist, split, seed=None):
        """
        Split the positions of `infolist` into shuffled k-fold train/test sets

        Parameters
        ----------
        infolist: list
            Entries to split; only its length is used
        split: int
            Number of folds
        seed: int, optional
            Random state of the shuffle. None gives different folds each call

        Returns
        -------
        list of dict
            One ``{"train": list of int, "test": list of int}`` per fold
        """
        from sklearn.model_selection import KFold

        splitter = KFold(n_splits=split, shuffle=True, random_state=seed)
        positions = list(range(len(infolist)))

        # tolist() turns the numpy index arrays into plain lists of int
        return [
            {"train": train_positions.tolist(), "test": test_positions.tolist()}
            for train_positions, test_positions in splitter.split(X=positions)
        ]

    def get_info(self):
        """
        Get info to serialize a SubDataset instance

        Returns
        -------
        dict
            Contains subdataset essential information::

            {
                "dataset": str, parent dataset id,
                "creation_date": str, Subdataset creation date,
                "id": str,
                "indices": dict, structure specified at the constructor parameters documentation,
                "by": value of `by` given at construction,
                "method": str, split method
            }

        """
        return {
            "dataset": self.dataset.id,
            "creation_date": str(self.creation_date),
            "id": self.id,
            "indices": self.indices,
            "by": self.by,
            "method": self.method
        }

    def _get_data(self, subset, train_test):
        """Data of the parent dataset at the ``train``/``test`` indices of `subset`."""
        # NOTE: argument order is (subset, train_test), the opposite of _get_labels
        positions = self.indices["sets"][subset][train_test]
        return self.dataset[positions]

    def get_train_data(self, subset):
        """
        Get the training data of a subset

        Parameters
        ----------
        subset: str
            subset identifier

        Returns
        -------
        dict
            Training instances as returned by the parent dataset; expected
            to contain each instance with its label (TODO confirm against
            the datasource)::

            {
                "X": list,
                "y": list
            }

        """
        return self._get_data(subset, "train")

    def get_test_data(self, subset):
        """
        Get the test data of a subset

        Parameters
        ----------
        subset: str
            subset identifier

        Returns
        -------
        dict
            Test instances as returned by the parent dataset; expected to
            contain each instance with its label (TODO confirm against the
            datasource)::

                {
                    "X": list,
                    "y": list
                }

        """
        return self._get_data(subset, "test")

    def _get_labels(self, train_test, subset):
        """Labels of the parent dataset at the ``train``/``test`` indices of `subset`."""
        positions = self.indices["sets"][subset][train_test]
        all_labels = np.array(self.dataset.get_labels())
        return all_labels[positions].tolist()

    def get_train_labels(self, subset):
        """
        Get the labels of training set of a specific subset

        Parameters
        ----------
        subset: str
            subset identifier

        Returns
        -------
        list
            Ground truths of subset's training data
        """
        return self._get_labels('train', subset)

    def get_test_labels(self, subset):
        """
        Get the labels of test set of a specific subset

        Parameters
        ----------
        subset: str
            subset identifier

        Returns
        -------
        list
            Ground truths of subset's test data
        """
        return self._get_labels('test', subset)