Source code for driftai.data.dataset

import hashlib
import string
from datetime import datetime
import dateutil.parser
from pathlib import Path

import numpy as np
from sklearn.model_selection import train_test_split

from .datasource import Datasource, FileDatasource, ImageDatasource
from driftai.utils import uri_to_filepath, maybe_make_dir, str_to_date, import_from
from driftai.db import Persistent, Collections

from driftai.exceptions import OptAppInvalidStructureException, \
                              OptAppInstanceExistsException, \
                              OptAppMethodNotImplementedYetException, \
                              OptAppInvalidStructureException


class Dataset(Persistent):
    """
    Indexed dataset over a datasource
    """

    def __init__(self, datasource, infolist=None, problem_type=None,
                 creation_date=None, id=None):
        """
        Parameters
        ----------
        datasource: Datasource
            Datasource of the dataset
        infolist: list, optional
            Index entries of the dataset. When omitted (or empty) it is
            requested from ``datasource.get_infolist()``
        problem_type: str, optional
            Objective of the algorithm. If `problem_type` is not set manually,
            driftai will infer it automatically.
            Possible values are: binary_clf, clf or regression
        creation_date: datetime
            Creation date of the dataset. Should not be set manually
        id: str
            Unique identifier for Dataset

        Raises
        ------
        OptAppInstanceExistsException
            If no `creation_date` is given and the datasets collection
            already contains the resulting id
        """
        self.datasource = datasource
        self.infolist = infolist or self.datasource.get_infolist()
        self.problem_type = problem_type or self._get_problem_type()
        self.creation_date = str_to_date(creation_date) or datetime.now()
        self._id = id or self._get_id()

        # The duplicate check only runs when no creation_date was supplied
        # (presumably: a brand new instance rather than one being reloaded).
        if creation_date is None and Dataset.collection().exists(self.id):
            raise OptAppInstanceExistsException("Dataset")

    @property
    def id(self):
        """Unique identifier of the dataset (read-only)."""
        return self._id

    @staticmethod
    def collection():
        """
        Get table containing datasets

        Returns
        -------
        TinyDB instance
        """
        return Collections.datasets()

    def get_labels(self):
        """
        Get all the labels

        Returns
        -------
        list
            List with all labels
        """
        # NOTE(review): this reads element 1 of every infolist entry while
        # _get_problem_type reads the last element; both agree only for
        # two-element entries -- confirm the infolist layout.
        return [entry[1] for entry in self.infolist]

    def _get_problem_type(self):
        """
        Infer the problem type from the distinct labels of the infolist.

        Returns
        -------
        str
            ``"binary_clf"`` for exactly two distinct labels, ``"clf"`` when
            every label is a string or there are fewer than 10 distinct
            labels, ``"regression"`` otherwise
        """
        # TODO: Is it really necessary?
        distinct_labels = {entry[-1] for entry in self.infolist}

        if len(distinct_labels) == 2:
            return "binary_clf"
        # TODO: Think a better solution
        # isinstance (rather than an exact type comparison) also accepts str
        # subclasses such as numpy string scalars.
        if (all(isinstance(label, str) for label in distinct_labels)
                or len(distinct_labels) < 10):
            return "clf"
        return "regression"

    @staticmethod
    def from_dir(path, path_pattern=None, datatype="img"):
        """
        Create a Dataset from dir

        Parameters
        ----------
        path: str
            DataSource location path
        path_pattern: str, optional
            Pattern to generate metadata. If path_pattern is left to None
            the default path_pattern is taken
        datatype: str, optional
            Directory datatype. Either a built-in key (``"img"``) or the
            dotted path ``package.module.ClassName`` of a custom datasource
            class

        Returns
        -------
        Dataset
            Dataset built over the directory datasource; its id is the stem
            of `path`
        """
        def _custom_datasource_class(dtype):
            # Split "package.module.ClassName" into module path and class name
            module_path, _, class_name = dtype.rpartition('.')
            return import_from(module_path, class_name)

        datasource_classes = {
            "img": ImageDatasource,
        }

        ds_class = datasource_classes.get(datatype)
        if ds_class is None:
            ds_class = _custom_datasource_class(datatype)

        datasource_parameters = dict(path=path)
        if path_pattern:
            datasource_parameters["parsing_pattern"] = path_pattern

        params = {
            "datasource": ds_class(**datasource_parameters),
            "id": Path(path).stem,
        }
        return Dataset(**params)

    @staticmethod
    def read_file(path, label=None, first_line_heading=True):
        """
        Create a Dataset from a file

        Parameters
        ----------
        path: str
            DataSource location path
        label: str, optional
            Name of the label. If label is left to None the default label
            is assumed to be the last column
        first_line_heading: bool, optional
            If True considers that first line is the header

        Returns
        -------
        Dataset
            Dataset built over a FileDatasource; its id is the stem of `path`
        """
        params = {
            "datasource": FileDatasource(path, label, first_line_heading),
            "infolist": None,
            "id": Path(path).stem,
        }
        return Dataset(**params)

    @classmethod
    def load_from_data(cls, data):
        """
        Creates a Dataset object from serialized JSON data coming from TinyDB

        Parameters
        ----------
        data: dict
            JSON data from TinyDB. It is not modified

        Raises
        ------
        OptAppInvalidStructureException
            In case `data` is not a dict or lacks one of the required keys

        Returns
        -------
        driftai.Dataset
            New Dataset instance
        """
        required_keys = {"datasource", "creation_date", "id", "infolist",
                         "problem_type"}

        if not (isinstance(data, dict) and required_keys.issubset(data)):
            raise OptAppInvalidStructureException()

        # Work on a shallow copy so the caller's dict keeps its serialized
        # datasource entry.
        params = dict(data)
        params["datasource"] = Datasource.load_from_data(data["datasource"])
        return cls(**params)

    def get_info(self):
        """
        Get info to serialize a Dataset instance

        Returns
        -------
        dict
            Dictionary containing a Dataset object summary::

                {
                    "datasource": dict returned by the datasource's get_info,
                    "infolist": the dataset infolist,
                    "problem_type": <multiclass clf, regression, binary clf>,
                    "creation_date": <creation date of the dataset, as str>,
                    "id": <unique identifier>
                }
        """
        info = {
            "datasource": {
                **self.datasource.get_info()
            },
            "infolist": self.infolist,
            "problem_type": self.problem_type,
            "creation_date": str(self.creation_date),
            "id": self.id,
        }
        return info

    def generate_subdataset(self, method, by):
        """
        Creates a subdataset of the current Dataset

        Parameters
        ----------
        method: str
            Evaluation sets split approach. Can be:
            ``train_test``
            ``k_fold``
        by: float, int
            If train_test method is specified, by represents the training
            set size. For example: .85
            If k_fold method is specified, `by` is the number of folds

        Returns
        -------
        SubDataset
            New subdataset bound to this dataset
        """
        return SubDataset(dataset=self, method=method, by=by)

    def get_data(self):
        """
        Get datasource data
        """
        return self.datasource.get_data()

    def __getitem__(self, indices):
        # Select the infolist entries first, then let the datasource resolve
        # them into data.
        return self.datasource[np.array(self.infolist)[indices]]

    def _get_id(self):
        """Build the default id: MD5 hex digest of the creation date string."""
        return hashlib.md5(str(self.creation_date).encode('utf-8')).hexdigest()
class SubDataset(Persistent):
    """
    Train/test partition (one or several sets) of a parent Dataset
    """

    def __init__(self, dataset, method, by=None, indices=None, id=None,
                 creation_date=None):
        """
        Parameters
        ----------
        dataset: Dataset
            DriftAI dataset which the current subdataset inherits from
        method: str
            Evaluation sets split approach. Can be: train_test, k_fold
        by: float, int, optional
            If train_test method is specified, by represents the training
            set size. For example: .85
            If k_fold method is specified, `by` is the number of folds
        indices: dict
            Contains the split method and the indices of each set::

                {
                    "method": str
                    "sets": {
                        <set name>: {
                            "train": list of int
                            "test": list of int
                        }
                    }
                }

            Should not be set by the developer
        id: str, optional
            Unique identifier
        creation_date: str, datetime, optional
            Creation date of the subdataset. Should not be set manually

        Raises
        ------
        TypeError
            If neither `indices` nor `by` is given
        OptAppInstanceExistsException
            If no `creation_date` is given and the subdatasets collection
            already contains the resulting id
        """
        self.dataset = dataset

        # if indices are not passed as parameter, is required to generate indices
        if indices is None and by is None:
            raise TypeError(
                "missing one of the two arguments: 'indices' or 'by'")

        self.indices = indices or self._generate_indices(method=method, by=by)
        self.method = method
        self.by = by
        self.creation_date = str_to_date(creation_date) or datetime.now()
        self._id = id or self._get_id()

        # The duplicate check only runs when no creation_date was supplied
        # (presumably: a brand new instance rather than one being reloaded).
        if creation_date is None and SubDataset.collection().exists(self.id):
            raise OptAppInstanceExistsException("SubDataset")

    @property
    def id(self):
        """Unique identifier of the subdataset (read-only)."""
        return self._id

    @staticmethod
    def collection():
        """
        Get table containing subdatasets

        Returns
        -------
        TinyDB instance
        """
        return Collections.subdatasets()

    @classmethod
    def load_from_data(cls, data):
        """
        Loads a subdataset from data coming from TinyDB

        Parameters
        ----------
        data: dict
            JSON data. It is not modified

        Raises
        ------
        OptAppInvalidStructureException
            If data is not a dict or has wrong keys

        Returns
        -------
        driftai.SubDataset
            New SubDataset instance
        """
        required_keys = {"dataset", "creation_date", "method", "by",
                         "indices", "id"}

        if not (isinstance(data, dict) and required_keys.issubset(data)):
            raise OptAppInvalidStructureException()

        # Work on a shallow copy so the caller's dict keeps the dataset id.
        params = dict(data)
        params["dataset"] = Dataset.load(data["dataset"])
        return cls(**params)

    def _get_id(self):
        """Build the default id: ``<dataset id>_<method>_<by>``."""
        return self.dataset.id + "_" + self.method + "_" + str(self.by)

    def _generate_indices(self, method="train_test", by=None):
        """
        Generate the train/test indices of every set for the given method.

        Parameters
        ----------
        method: str
            ``train_test`` or ``k_fold``
        by: float, int
            Training set size (train_test) or number of folds (k_fold)

        Raises
        ------
        OptAppMethodNotImplementedYetException
            If `method` is not one of the supported values

        Returns
        -------
        dict
            ``{"method": method, "sets": {<set name>: {"train": ..., "test": ...}}}``
        """
        infolist = self.dataset.infolist

        if method == "train_test":
            train, test = self._train_test_split(
                infolist=infolist, split=by, seed=None)
            sets = {"0": {"train": train, "test": test}}
        elif method == "k_fold":
            folds = self._k_fold_cv_split(
                infolist=infolist, split=by, seed=None)
            # Folds are named "A", "B", ...; the uppercase alphabet only has
            # 26 letters, so a larger number of folds raises IndexError.
            sets = {
                string.ascii_uppercase[k]: {"train": fold["train"],
                                            "test": fold["test"]}
                for k, fold in enumerate(folds)
            }
        # elif method == "stratified_train_test":
        # elif method == "bootstrap":
        else:
            raise OptAppMethodNotImplementedYetException()

        return {"method": method, "sets": sets}

    def _train_test_split(self, infolist, split, seed=None):
        """
        Split the positions of `infolist` into a train and a test list.

        Parameters
        ----------
        infolist: list
            Index entries; only its length is used
        split: float, int
            Training set size, forwarded as ``train_size``
        seed: int, optional
            Forwarded as ``random_state`` to make the split reproducible

        Returns
        -------
        list
            ``[train_indices, test_indices]``
        """
        indices = list(range(len(infolist)))
        # Only train_size is given so the test set is its exact complement:
        # an explicit ``1 - split`` is subject to float rounding (which can
        # make the rounded train and test counts add up to more than the
        # number of samples) and is meaningless for an integer `split`.
        return train_test_split(indices, train_size=split, random_state=seed)

    def _k_fold_cv_split(self, infolist, split, seed=None):
        """
        Split the positions of `infolist` into shuffled K-fold sets.

        Parameters
        ----------
        infolist: list
            Index entries; only its length is used
        split: int
            Number of folds
        seed: int, optional
            Forwarded as ``random_state`` to make the folds reproducible

        Returns
        -------
        list of dict
            One ``{"train": list of int, "test": list of int}`` per fold
        """
        from sklearn.model_selection import KFold

        kf = KFold(n_splits=split, shuffle=True, random_state=seed)
        indices = list(range(len(infolist)))
        return [
            {"train": train_indices.tolist(), "test": test_indices.tolist()}
            for train_indices, test_indices in kf.split(X=indices)
        ]

    def get_info(self):
        """
        Get info to serialize a SubDataset instance

        Returns
        -------
        dict
            Contains subdataset essential information::

                {
                    "dataset": str, parent dataset id,
                    "creation_date": str, Subdataset creation date,
                    "id": str,
                    "indices": dict, structure specified at the constructor
                               parameters documentation,
                    "by": training set size or number of folds,
                    "method": str, split approach
                }
        """
        return {
            "dataset": self.dataset.id,
            "creation_date": str(self.creation_date),
            "id": self.id,
            "indices": self.indices,
            "by": self.by,
            "method": self.method
        }

    def _get_data(self, subset, train_test):
        """Index the parent dataset with the `train_test` indices of `subset`."""
        index = self.indices["sets"][subset][train_test]
        return self.dataset[index]

    def get_train_data(self, subset):
        """
        Get the training data of a subset

        Parameters
        ----------
        subset: str
            subset identifier

        Returns
        -------
        dict
            Containing each training set instance with its label::

                {
                    "X": list,
                    "y": list
                }
        """
        return self._get_data(subset, "train")

    def get_test_data(self, subset):
        """
        Get the test data of a subset

        Parameters
        ----------
        subset: str
            subset identifier

        Returns
        -------
        dict
            Containing all instances which belong to test set with its label::

                {
                    "X": list,
                    "y": list
                }
        """
        return self._get_data(subset, "test")

    def _get_labels(self, train_test, subset):
        """Return the parent dataset labels at the `train_test` indices of `subset`."""
        index = self.indices["sets"][subset][train_test]
        labels = np.array(self.dataset.get_labels())
        return labels[index].tolist()

    def get_train_labels(self, subset):
        """
        Get the labels of training set of a specific subset

        Parameters
        ----------
        subset: str
            subset identifier

        Returns
        -------
        list
            Ground truths of subset's training data
        """
        return self._get_labels('train', subset)

    def get_test_labels(self, subset):
        """
        Get the labels of test set of a specific subset

        Parameters
        ----------
        subset: str
            subset identifier

        Returns
        -------
        list
            Ground truths of subset's test data
        """
        return self._get_labels('test', subset)