"""
This module contains scripts to download/process
all datasets available in dbcollection.
These scripts are self contained, meaning they can be imported
and used to manually setup a dataset.
"""
from __future__ import print_function
import os
import h5py
from dbcollection.utils.hdf5 import HDF5Manager
from dbcollection.utils.url import download_extract_urls


class BaseDataset(object):
""" Base class for download/processing a dataset.
Parameters
----------
data_path : str
Path to the data directory.
cache_path : str
Path to the cache file
extract_data : bool, optional
Extracts the downloaded files if they are compacted.
verbose : bool
Be verbose
Attributes
----------
data_path : str
Path to the data directory.
cache_path : str
Path to the cache file
extract_data : bool, optional
Extracts the downloaded files if they are compacted.
verbose : bool
Be verbose
urls : list
List of URL links to download.
keywords : list
List of keywords.
tasks : dict
Dataset's tasks.
default_task : str
Default task name.
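
    Examples
    --------
    A minimal sketch of a subclass (the class, URL, and task names below
    are illustrative):

    >>> class SomeDataset(BaseDataset):
    ...     urls = ('http://url.to/some_dataset.zip',)  # hypothetical URL
    ...     keywords = ('image_classification',)
    ...     tasks = {'classification': Classification}  # a BaseTask subclass
    ...     default_task = 'classification'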
"""
    # URLs of the data files to download
    urls = ()
    # keywords used to classify/categorize datasets in the cache file
    keywords = ()
    # dictionary of available tasks to process,
    # e.g. tasks = {'classification': Classification}
    tasks = {}
    # name of the default task (subclasses should define one),
    # e.g. default_task = 'classification'
    default_task = ''

    def __init__(self, data_path, cache_path, extract_data=True, verbose=True):
        """Initialize class."""
        assert data_path, "Must insert a valid data path"
        assert cache_path, "Must insert a valid cache path"
self.data_path = data_path
self.cache_path = cache_path
self.extract_data = extract_data
self.verbose = verbose

    def download(self):
"""
Download and extract files to disk.
Returns
-------
tuple
A list of keywords.
"""
# download + extract data and remove temporary files
download_extract_urls(
urls=self.urls,
save_dir=self.data_path,
extract_data=self.extract_data,
verbose=self.verbose
)
return self.keywords

    def parse_task_name(self, task):
"""Parses the task string to look for key suffixes.
Parameters
----------
task : str
Task name.
Returns
-------
str
Returns a task name without the '_s' suffix.
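
        Examples
        --------
        Illustrative calls (the task names are hypothetical):

        >>> dataset.parse_task_name('classification_s')
        ('classification', '_s')
        >>> dataset.parse_task_name('classification')
        ('classification', None)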
"""
if task.endswith('_s'):
return task[:-2], '_s'
else:
return task, None

    def get_task_constructor(self, task):
"""Returns the class constructor for the input task.
Parameters
----------
task : str
Task name.
Returns
-------
str
Task name.
str
Task's ending suffix (if any).
BaseTask
Constructor to process the metadata of a task.
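
        Examples
        --------
        Illustrative call, assuming the subclass defines
        ``tasks = {'classification': Classification}``:

        >>> dataset.get_task_constructor('classification_s')
        ('classification', '_s', <class 'Classification'>)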
"""
        if task == '' or task == 'default':
            task_, suffix = self.default_task, None
        else:
            task_, suffix = self.parse_task_name(task)
        return task_, suffix, self.tasks[task_]

    def process(self, task='default'):
"""Processes the metadata of a task.
Parameters
----------
task : str, optional
Task name.
Returns
-------
dict
Returns a dictionary with the task name as key and the filename as value.
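
        Examples
        --------
        The task name keys the returned dict; a '_s' suffix, if given, is
        preserved in the key (paths below are illustrative):

        >>> dataset.process('classification_s')
        {'classification_s': {'filename': '/path/to/cache/classification.h5',
                              'categories': ('image_classification',)}}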
"""
task_, suffix, task_constructor = self.get_task_constructor(task)
if self.verbose:
            print("\nProcessing '{}' task:".format(task_))
task_loader = task_constructor(self.data_path, self.cache_path, suffix, self.verbose)
task_filename = task_loader.run()
if suffix:
task_ = task_ + suffix
return {task_: {"filename": task_filename, "categories": self.keywords}}


class BaseDatasetNew(object):
"""Base class for download/processing a dataset.
Parameters
----------
data_path : str
Path to the data directory.
cache_path : str
Path to the cache file
extract_data : bool, optional
Extracts the downloaded files if they are compacted.
verbose : bool, optional
Displays text information to the screen (if true).
Attributes
----------
data_path : str
Path to the data directory.
cache_path : str
Path to the cache file
extract_data : bool
Extracts the downloaded files if they are compacted.
verbose : bool
Displays text information to the screen (if true).
urls : list
List of URL paths to download.
keywords : list
List of keywords to classify datasets.
tasks : dict
Dataset's tasks for processing.
default_task : str
Default task name.
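
    Examples
    --------
    Typical usage through a subclass (class names and paths below are
    illustrative):

    >>> dataset = SomeDataset(data_path='/path/to/data',
    ...                       cache_path='/path/to/cache')
    >>> dataset.download()
    >>> dataset.process('classification')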
"""
    urls = ()  # list of URLs to download
    keywords = ()  # list of keywords to classify/categorize datasets in the cache
    tasks = {}  # dictionary of available tasks to process
    default_task = ''  # name of the default task

    def __init__(self, data_path, cache_path, extract_data=True, verbose=True):
        """Initialize class."""
        assert isinstance(data_path, str), "Must insert a valid data path"
        assert isinstance(cache_path, str), "Must insert a valid cache path"
        self.data_path = data_path
        self.cache_path = cache_path
        self.extract_data = extract_data
        self.verbose = verbose

    def download(self):
        """Downloads and extracts files to disk."""
download_extract_urls(
urls=self.urls,
save_dir=self.data_path,
extract_data=self.extract_data,
verbose=self.verbose
)

    def process(self, task='default'):
        """Processes the metadata of a task.

        Parameters
        ----------
        task : str, optional
            Name of the task.

        Returns
        -------
        dict
            A dictionary with the task name as key and, as value, a dict with
            the resulting metadata filename and the dataset's keywords.
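
        Examples
        --------
        Passing 'default' (or '') resolves to ``default_task`` (paths below
        are illustrative):

        >>> dataset.process('default')
        {'classification': {'filename': '/path/to/cache/classification.h5',
                            'categories': ('image_classification',)}}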
"""
task_ = self.parse_task_name(task)
if self.verbose:
print("\nProcessing '{}' task:".format(task_))
hdf5_filename = self.process_metadata(task_)
return {task_: {"filename": hdf5_filename, "categories": self.keywords}}

    def parse_task_name(self, task):
"""Parses the task name to a valid name."""
if task == '' or task == 'default':
return self.default_task
else:
return task

    def process_metadata(self, task):
        """Processes the metadata for a task.

        Parameters
        ----------
        task : str
            Name of the task.

        Returns
        -------
        str
            File name + path of the resulting HDF5 metadata file of the task.
"""
        constructor = self.get_task_constructor(task)
        processor = constructor(data_path=self.data_path,
                                cache_path=self.cache_path,
                                verbose=self.verbose)
        return processor.run()

    def get_task_constructor(self, task):
        """Returns the class constructor for the input task.

        Parameters
        ----------
        task : str
            Name of the task.

        Returns
        -------
        BaseTask
            Constructor to process the metadata of a task.
"""
        assert task, "Must insert a valid task name"
return self.tasks[task]


class BaseTask(object):
"""Base class for processing a task of a dataset.
Parameters
----------
data_path : str
Path to the data directory.
cache_path : str
Path to the cache file
suffix : str, optional
Suffix to select optional properties for a task.
verbose : bool, optional
Be verbose.
Attributes
----------
data_path : str
Path to the data directory.
cache_path : str
Path to the cache file
suffix : str, optional
Suffix to select optional properties for a task.
verbose : bool, optional
Be verbose.
filename_h5 : str
hdf5 metadata file name.
"""
    # name of the task file
    filename_h5 = 'task'

    def __init__(self, data_path, cache_path, suffix=None, verbose=True):
        """Initialize class."""
        assert data_path, "Must insert a valid data path"
        assert cache_path, "Must insert a valid cache path"
self.cache_path = cache_path
self.data_path = data_path
self.suffix = suffix
self.verbose = verbose

    def load_data(self):
"""
Load data of the dataset (create a generator).
Load data from annnotations and split it to corresponding
sets (train, val, test, etc.)
"""
pass # stub

    def add_data_to_source(self, hdf5_handler, data, set_name=None):
"""
Store data annotations in a nested tree fashion.
It closely follows the tree structure of the data.
Parameters
----------
hdf5_handler : h5py._hl.group.Group
hdf5 group object handler.
data : list/dict
List or dict containing the data annotations of a particular set or sets.
set_name : str
Set name.
"""
pass # stub

    def add_data_to_default(self, hdf5_handler, data, set_name=None):
"""
Add data of a set to the default group.
For each field, the data is organized into a single big matrix.
Parameters
----------
hdf5_handler : h5py._hl.group.Group
hdf5 group object handler.
data : list/dict
List or dict containing the data annotations of a particular set or sets.
set_name : str
Set name.
"""
pass # stub

    def run(self):
"""Run task processing."""
filename = self.process_metadata()
return filename


class BaseTaskNew(object):
"""Base class for processing the metadata of a task of a dataset.
Parameters
----------
data_path : str
Path to the data directory.
cache_path : str
Path to the cache file
verbose : bool, optional
Displays text information to the screen (if true).
Attributes
----------
data_path : str
Path to the data directory of the dataset.
cache_path : str
Path to store the HDF5 metadata file of a dataset in the cache directory.
verbose : bool
Displays text information to the screen (if true).
filename_h5 : str
Name of the HDF5 file.
hdf5_filepath : str
File name + path of the HDF5 metadata file in disk.
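
    Examples
    --------
    A minimal sketch of a subclass (all names below are illustrative, and
    ``np`` is assumed to be ``numpy``):

    >>> class SomeTask(BaseTaskNew):
    ...     filename_h5 = 'sometask'
    ...
    ...     def load_data(self):
    ...         # yield one dict per set split
    ...         yield {'train': {'labels': np.array([0, 1, 2])}}
    ...
    ...     def process_set_metadata(self, data, set_name):
    ...         # store each field as a single big matrix
    ...         self.hdf5_manager.add_field_to_group(
    ...             group=set_name, field='labels', data=data['labels'])
    >>> task = SomeTask(data_path='/path/to/data', cache_path='/path/to/cache')
    >>> hdf5_file = task.run()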
"""
    filename_h5 = ''  # name of the task file

    def __init__(self, data_path, cache_path, verbose=True):
        """Initialize class."""
        assert data_path, "Must insert a valid data path"
        assert cache_path, "Must insert a valid cache path"
self.cache_path = cache_path
self.data_path = data_path
self.verbose = verbose
self.hdf5_filepath = self.get_hdf5_save_filename()
self.hdf5_manager = None

    def get_hdf5_save_filename(self):
"""Builds the HDF5 file name + path on disk."""
return os.path.join(self.cache_path, self.filename_h5 + '.h5')

    def run(self):
        """Main method. Runs the task metadata processing.

        It creates an HDF5 file on disk to store the resulting
        subgroups of the dataset's set partitions (e.g., train/val/test).
        Then, it loads the dataset's raw metadata from disk into memory as a
        generator, retrieves the data fields obtained in the processing stage,
        and saves them into an HDF5 file on disk.

        Returns
        -------
        str
            File name + path of the task's HDF5 metadata file.
"""
self.setup_hdf5_manager()
data_generator = self.load_data()
self.process_metadata(data_generator)
self.teardown_hdf5_manager()
return self.hdf5_filepath

    def setup_hdf5_manager(self):
"""Sets up the metadata manager to store the processed data to disk."""
if self.verbose:
print('\n==> Storing metadata to file: {}'.format(self.hdf5_filepath))
self.hdf5_manager = HDF5Manager(filename=self.hdf5_filepath)

    def load_data(self):
        """Loads the dataset's (meta)data from disk (creates a generator).

        Loads data from annotations and splits it into the corresponding
        sets (train, val, test, etc.).

        Returns
        -------
        generator
            A sequence of dictionary objects with a key-value pair
            with the name of the set split and the data.
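
        Examples
        --------
        A subclass implementation typically yields one dictionary per set
        split (the field names and variables below are illustrative):

        >>> def load_data(self):
        ...     yield {'train': {'images': train_images}}
        ...     yield {'test': {'images': test_images}}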
"""
pass # stub

    def process_metadata(self, data_generator):
        """Processes the dataset's (meta)data and stores it into an HDF5 file."""
for data in data_generator:
for set_name in data:
if self.verbose:
print('\nSaving set metadata: {}'.format(set_name))
self.process_set_metadata(data[set_name], set_name)

    def process_set_metadata(self, data, set_name):
        """Sets up the set's data fields to be stored in the HDF5 metadata file.

        All fields set in this method are organized as a single big matrix.
        This results in much faster data retrieval than by traversing nested
        groups + datasets in an HDF5 file.

        Parameters
        ----------
        data : dict
            Dictionary containing the data annotations of a set split.
        set_name : str
            Name of the set split.
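
        Examples
        --------
        A sketch of a subclass override (the ``labels`` field is illustrative):

        >>> def process_set_metadata(self, data, set_name):
        ...     self.hdf5_manager.add_field_to_group(
        ...         group=set_name, field='labels', data=data['labels'])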
"""
pass

    def teardown_hdf5_manager(self):
        """Closes the HDF5 manager, flushing the metadata file to disk."""
self.hdf5_manager.close()


class BaseField(object):
    """Base class for the dataset's data field processors."""

    def __init__(self, **kwargs):
        """Initialize class by storing the keyword arguments as attributes."""
        for key, value in kwargs.items():
            setattr(self, key, value)

    def save_field_to_hdf5(self, set_name, field, data, **kwargs):
        """Saves the data of a field to the HDF5 metadata file.

        Parameters
        ----------
        set_name : str
            Name of the set split.
        field : str
            Name of the data field.
        data : np.ndarray
            Numpy ndarray of the field's data.
        **kwargs : optional
            Additional keyword arguments passed to the HDF5 manager.
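
        Examples
        --------
        Illustrative call, assuming ``numpy`` is imported as ``np`` and the
        field object was constructed with an ``hdf5_manager`` keyword
        argument:

        >>> field.save_field_to_hdf5('train', 'labels', np.array([1, 2, 3]))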
"""
self.hdf5_manager.add_field_to_group(
group=set_name,
field=field,
data=data,
**kwargs
)