import os
import re
import inspect
import hashlib
from pathlib import Path
from abc import ABC, abstractmethod, abstractproperty
import pandas as pd
import numpy as np
from PIL import Image
from driftai.exceptions import OptAppFileDatasourceNotCompatibeException, OptAppMethodNotImplementedYetException
from driftai.utils import filepath_to_uri, uri_to_filepath, check_uri, get_file_extension, compile_path_pattern, import_from
class Datasource(ABC):
    """
    Abstract datasource

    Common interface of every datasource. It stores the location of the
    dataset and knows how to serialize itself (``get_info``) and how to
    rebuild a concrete datasource from that serialization
    (``load_from_data``).
    """

    def __init__(self, data_uri):
        """
        Parameters
        ----------
        data_uri: str
            Location of the dataset. It is stored as given and returned
            untouched by ``get_uri``
        """
        self.datasource = data_uri
        # Slot for loaded data; stays None until a subclass fills it in
        self.data = None

    @abstractmethod
    def get_infolist(self):
        """
        Get list of labeled indices

        Returns
        -------
        list of tuples
            First element of the tuple is the index and the second element is the label
        """
        pass

    @abstractmethod
    def get_data(self):
        """
        Get all datasource data
        """
        pass

    @abstractmethod
    def __getitem__(self, indices):
        """
        Get data by infolist

        Parameters
        ----------
        indices: list of items of infolist
            List of data indices

        Returns
        -------
        dict
            Dict with X and y keys, containing the data and the labels
        """
        pass

    def get_uri(self):
        """
        Get the datasource location exactly as it was stored at construction

        Returns
        -------
        str
            Datasource location
        """
        return self.datasource

    def get_path(self):
        """
        Get the location of datasource

        Returns
        -------
        str
            Resolved file system path obtained by converting the stored
            location with ``uri_to_filepath``
        """
        return str(Path(uri_to_filepath(self.datasource)).resolve())

    @abstractmethod
    def __len__(self):
        """
        Get the size of the datasource
        """
        pass

    @staticmethod
    def load_from_data(data):
        """
        Create datasource from serialized data

        The dictionary passed in is NOT modified, so the same serialized
        record can be loaded more than once.

        Parameters
        ----------
        data: dict
            Dictionary containing serialized datasource data. Must contain
            the keys ``module`` and ``class_name``; every other entry is
            forwarded as a keyword argument to the datasource constructor

        Returns
        -------
        Datasource

        Raises
        ------
        KeyError
            If ``module`` or ``class_name`` is missing from ``data``
        """
        # Work on a shallow copy: removing the bookkeeping keys from the
        # caller's dict would corrupt it for any later use.
        params = dict(data)
        module_name = params.pop("module")
        class_name = params.pop("class_name")
        # NOTE(review): the class is looked up from values found in `data`;
        # only feed this method serialized data coming from a trusted source.
        datasource_class_ = import_from(module_name, class_name)
        return datasource_class_(**params)

    def get_info(self):
        """
        Datasource summary

        Returns
        -------
        dict
            Dictionary used to serialize DriftAI Datasource instance, with
            the keys ``module``, ``class_name`` and ``path``
        """
        return {
            "module": self.__module__,
            "class_name": self.__class__.__name__,
            "path": self.get_uri()
        }
class FileDatasource(Datasource):
    """
    Datasource subclass

    Responsible for handling datasets coming from a local file, like csv files
    """

    def __init__(self, path, label=None, first_line_heading=True):
        """
        Parameters
        ----------
        path: str
            Location of the dataset. Accepted formats are:
            - Filesystem path
            - File URI
        label: str, optional
            Name of the label. If label is left to None the default label is assumed to be the last column
        first_line_heading: bool, optional
            If True considers that first line is the header
        """
        # The base class stores a URI, so plain paths are converted first
        if not check_uri(path):
            path = filepath_to_uri(path)
        super().__init__(path)
        self._label = label
        self.first_line_heading = first_line_heading

    def __len__(self):
        """
        Get the number of entries of the data held by the datasource

        The data is loaded with ``get_data`` the first time it is needed.
        """
        # `self.data` may hold a DataFrame, whose truth value is ambiguous
        # (pandas raises ValueError), so compare against None explicitly.
        if self.data is None:
            self.data = self.get_data()
        return len(self.data)

    @property
    def label(self):
        """
        Name (or position) of the label column

        When no label was given, the last column of the file is used and
        remembered for later calls.
        """
        # `is None` rather than truthiness: 0 is a legitimate label when the
        # file has no header and columns are identified by position.
        if self._label is None:
            last_column = self.get_data().columns[-1]
            # If column hasn't got a name,
            # cast the index (originally numpy.int64) to python's int
            if not isinstance(last_column, str):
                last_column = int(last_column)
            self._label = last_column
        return self._label

    def get_infolist(self):
        """
        Get list of labeled indices

        The result is also stored in ``self.data``.

        Returns
        -------
        list of lists
            First element of each item is the row index and the second element is the label

        Raises
        ------
        OptAppFileDatasourceNotCompatibeException
            If file extension is not compatible with DriftAI
        """
        compatible_extensions = ("csv",)
        file_ext = get_file_extension(self.datasource)
        if file_ext is None:
            raise OptAppFileDatasourceNotCompatibeException(self.datasource)
        if file_ext not in compatible_extensions:
            raise OptAppFileDatasourceNotCompatibeException(file_ext)
        self.data = self._load_csv()
        return self.data

    def _load_csv(self, has_label=True):
        """
        Loads a csv file, reading from datasource, and pairs every row
        position with the value found in the label column

        Parameters
        ----------
        has_label: bool
            Currently unused; kept so existing callers keep working

        Returns
        -------
        list of lists
            One ``[index, label]`` item per row of the file
        """
        df = self.get_data()
        labels = df[self.label].values.tolist()
        return [[position, value] for position, value in enumerate(labels)]

    def get_data(self):
        """
        Get the content of csv file

        The file is read from disk on every call.

        Returns
        -------
        pandas.DataFrame
            DataFrame wrapping the csv content
        """
        params = dict()
        if not self.first_line_heading:
            # Without this pandas would consume the first row as header
            params["header"] = None
        return pd.read_csv(self.get_path(), **params)

    def __getitem__(self, indices):
        """
        Get data by indices

        Parameters
        ----------
        indices: list of items in infolist
            Only the first element (the row position) of each item is used

        Returns
        -------
        dict
            Dict with X (values of every column but the label) and
            y (values of the label column) keys
        """
        # TODO: Lazy loading dataset (No load all file in memory)
        rows = [item[0] for item in indices]
        df = self.get_data().iloc[rows]
        X = df.drop(self.label, axis=1).values
        y = df[self.label].values
        return dict(X=X, y=y)

    def get_info(self):
        """
        File datasource summary

        Returns
        -------
        dict
            The base summary plus ``first_line_heading`` and ``label``
        """
        return {
            **super().get_info(),
            "first_line_heading": self.first_line_heading,
            "label": self.label
        }
class DirectoryDatasource(Datasource):
    """
    Datasource subclass

    Handles datasets made of individual files stored under a directory.
    The label of every file is extracted from its path by means of a
    parsing pattern; how a file is turned into data is left to subclasses
    through ``loader``.
    """

    def __init__(self, path, parsing_pattern):
        """
        Parameters
        ----------
        path: str
            Location of the dataset. Accepted formats are:
            - Filesystem path
            - File URI
        parsing_pattern: str
            Pattern to get the label and data from file. Example: {testset}/{class}/{filename}.[txt|tsv]
        """
        # The base class stores a URI, so plain paths are converted first
        if not check_uri(path):
            path = filepath_to_uri(path)
        super().__init__(path)
        # Patterns are written with '/', files are matched with the OS separator
        self.parsing_pattern = parsing_pattern.replace('/', os.path.sep)
        # NOTE(review): 'file_idx' presumably names the group holding the
        # file identifier read back in get_infolist - confirm in compile_path_pattern
        self._compiled_pattern = compile_path_pattern(self.parsing_pattern,
                                                      self.get_path(),
                                                      'file_idx')

    def _parse_file(self, path_pattern, file_path):
        """
        Extract the attributes encoded in a file path

        Parameters
        ----------
        path_pattern: str or compiled pattern
            Regular expression the path is matched against
        file_path: str
            Path of the file to parse

        Returns
        -------
        dict or None
            ``file`` and ``extension`` plus every named group of the
            pattern, or None when the path does not match
        """
        match = re.match(path_pattern, file_path)
        if match is None:
            return None
        return {
            "file": file_path,
            # Take the suffix of the final path component: splitting the whole
            # path on "." breaks on dotted directories or multi-dot file names
            "extension": Path(file_path).suffix.lstrip("."),
            **match.groupdict()
        }

    def get_infolist(self):
        """
        Get list of labeled indices

        Walks the datasource directory and keeps every file whose resolved
        path matches the parsing pattern.

        Returns
        -------
        list of tuples
            First element of the tuple is the index and the second element is the label
        """
        info_list = []
        for root, _, files in os.walk(self.get_path()):
            for file_name in files:
                full_path = str(Path(root, file_name).resolve())
                parsed = self._parse_file(self._compiled_pattern, full_path)
                if parsed:
                    info_list.append((parsed["file_idx"], parsed["class"]))
        return info_list

    def get_info(self):
        """
        Directory datasource summary

        Returns
        -------
        dict
            Dictionary used to serialize an DriftAI DirectoryDatasource instance
        """
        return {
            **super().get_info(),
            "parsing_pattern": self.parsing_pattern,
        }

    def get_data(self):
        """
        Get all data under the datasource path

        Returns
        -------
        list of dicts
            One dict per item of the infolist, with X (the loaded file)
            and y (its label) keys
        """
        base_path = self.get_path()
        return [
            dict(X=self.loader(str(Path(base_path, idx))), y=label)
            for idx, label in self.get_infolist()
        ]

    def __getitem__(self, info_list):
        """
        Get data by infolist

        Parameters
        ----------
        info_list: list of items of infolist
            Pairs of (index, label) of the files to load

        Returns
        -------
        dict
            Dict with X and y keys, containing the loaded files and the labels
        """
        base_path = self.get_path()
        data = dict(X=[], y=[])
        for idx, label in info_list:
            data["X"].append(self.loader(str(Path(base_path, idx))))
            data["y"].append(label)
        return data

    # Declared as an abstract *method*: it is invoked as self.loader(path)
    # and subclasses override it with a regular method, not a property.
    @abstractmethod
    def loader(self, idx):
        """
        Load the content of a single file

        Parameters
        ----------
        idx: str
            Path of the file to load
        """
        pass

    def __len__(self):
        """
        Get the number of elements of the datasource

        Returns
        -------
        int
            Length of ``self.data`` when it has been set, otherwise the
            number of files listed by ``get_infolist``
        """
        # Nothing in this class assigns self.data, so len(None) must be avoided
        if self.data is not None:
            return len(self.data)
        return len(self.get_infolist())
class ImageDatasource(DirectoryDatasource):
    """
    Directory datasource whose files are images
    """

    def __init__(self, path, parsing_pattern="{testset}/{class}_{}.[png|jpg|jpeg]"):
        """
        Parameters
        ----------
        path: str
            Location of the dataset, forwarded to DirectoryDatasource
        parsing_pattern: str, optional
            Pattern to get the label and data from file
        """
        super().__init__(path=path, parsing_pattern=parsing_pattern)

    def loader(self, idx):
        """
        Load an image as a flat array

        Parameters
        ----------
        idx: str
            Path of the image file

        Returns
        -------
        numpy.ndarray
            Image content reshaped to one dimension
        """
        # The context manager closes the underlying file as soon as the
        # pixels have been copied into the array.
        with Image.open(idx) as image:
            return np.asarray(image).reshape(-1)