Source code for dbcollection.datasets

"""
This module contains scripts to download/process
all datasets available in dbcollection.

These scripts are self contained, meaning they can be imported
and used to manually setup a dataset.
"""


from __future__ import print_function
import os
import h5py

from dbcollection.utils.hdf5 import HDF5Manager
from dbcollection.utils.url import download_extract_urls


class BaseDataset(object):
    """Base class for downloading/processing a dataset.

    Parameters
    ----------
    data_path : str
        Path to the data directory.
    cache_path : str
        Path to the cache file.
    extract_data : bool, optional
        Extracts the downloaded files if they are compressed.
    verbose : bool, optional
        Be verbose.

    Attributes
    ----------
    data_path : str
        Path to the data directory.
    cache_path : str
        Path to the cache file.
    extract_data : bool
        Extracts the downloaded files if they are compressed.
    verbose : bool
        Be verbose.
    urls : list
        List of URL links to download.
    keywords : list
        List of keywords.
    tasks : dict
        Dataset's tasks.
    default_task : str
        Default task name.

    """

    # list of urls to download
    urls = ()

    # keywords used to classify datasets for easier
    # categorization in the cache file
    keywords = ()

    # dictionary of available tasks to process
    # Example: tasks = {'classification': Classification}
    tasks = {}

    # name of the default task. Should point to a key in `tasks`!
    # Example: default_task = 'classification'
    default_task = ''

    def __init__(self, data_path, cache_path, extract_data=True, verbose=True):
        """Initialize class."""
        assert data_path
        assert cache_path
        self.data_path = data_path
        self.cache_path = cache_path
        self.extract_data = extract_data
        self.verbose = verbose

    def download(self):
        """Download and extract files to disk.

        Returns
        -------
        tuple
            A tuple of keywords.
        """
        # download + extract data and remove temporary files
        download_extract_urls(
            urls=self.urls,
            save_dir=self.data_path,
            extract_data=self.extract_data,
            verbose=self.verbose
        )

        return self.keywords

    def parse_task_name(self, task):
        """Parses the task string to look for known suffixes.

        Parameters
        ----------
        task : str
            Task name.

        Returns
        -------
        str
            Task name without the '_s' suffix.
        str or None
            The '_s' suffix, if present.
        """
        if task.endswith('_s'):
            return task[:-2], '_s'
        else:
            return task, None

    def get_task_constructor(self, task):
        """Returns the class constructor for the input task.

        Parameters
        ----------
        task : str
            Task name.

        Returns
        -------
        str
            Task name.
        str
            Task's ending suffix (if any).
        BaseTask
            Constructor to process the metadata of a task.
        """
        if task == '':
            task_, suffix = self.default_task, None
        elif task == 'default':
            task_, suffix = self.default_task, None
        else:
            task_, suffix = self.parse_task_name(task)
        return task_, suffix, self.tasks[task_]

    def process(self, task='default'):
        """Processes the metadata of a task.

        Parameters
        ----------
        task : str, optional
            Task name.

        Returns
        -------
        dict
            Dictionary with the task name as key and a dictionary with the
            metadata filename and categories as value.
        """
        task_, suffix, task_constructor = self.get_task_constructor(task)

        if self.verbose:
            print('\nProcessing \'{}\' task:'.format(task_))

        task_loader = task_constructor(self.data_path, self.cache_path, suffix, self.verbose)
        task_filename = task_loader.run()

        if suffix:
            task_ = task_ + suffix

        return {task_: {"filename": task_filename, "categories": self.keywords}}
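

# Usage sketch (illustrative only, not part of the module): a hypothetical
# dataset built on BaseDataset. The names `MNISTLike`, `Classification` and
# the URL are assumptions made for the example; only the BaseDataset API is
# taken from this module.
#
#   class Classification(BaseTask):
#       filename_h5 = 'classification'
#       ...
#
#   class MNISTLike(BaseDataset):
#       urls = ('http://example.com/mnist_like.tar.gz',)
#       keywords = ('image_processing', 'classification')
#       tasks = {'classification': Classification}
#       default_task = 'classification'
#
#   dataset = MNISTLike(data_path='/data/mnist_like', cache_path='/cache/mnist_like')
#   dataset.download()                             # fetch + extract the urls into data_path
#   info = dataset.process()                       # process the default task
#   info_s = dataset.process('classification_s')   # '_s' suffix also stores the source group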


class BaseDatasetNew(object):
    """Base class for downloading/processing a dataset.

    Parameters
    ----------
    data_path : str
        Path to the data directory.
    cache_path : str
        Path to the cache file.
    extract_data : bool, optional
        Extracts the downloaded files if they are compressed.
    verbose : bool, optional
        Displays text information to the screen (if True).

    Attributes
    ----------
    data_path : str
        Path to the data directory.
    cache_path : str
        Path to the cache file.
    extract_data : bool
        Extracts the downloaded files if they are compressed.
    verbose : bool
        Displays text information to the screen (if True).
    urls : list
        List of URL paths to download.
    keywords : list
        List of keywords to classify datasets.
    tasks : dict
        Dataset's tasks for processing.
    default_task : str
        Default task name.

    """

    urls = ()  # list of urls to download
    keywords = ()  # list of keywords to classify/categorize datasets in the cache
    tasks = {}  # dictionary of available tasks to process
    default_task = ''  # name of the default task

    def __init__(self, data_path, cache_path, extract_data=True, verbose=True):
        """Initialize class."""
        assert isinstance(data_path, str), "Must insert a valid data path"
        assert isinstance(cache_path, str), "Must insert a valid cache path"
        self.data_path = data_path
        self.cache_path = cache_path
        self.extract_data = extract_data
        self.verbose = verbose

    def download(self):
        """Downloads and extracts files to disk.

        Returns
        -------
        tuple
            A tuple of keywords.
        """
        download_extract_urls(
            urls=self.urls,
            save_dir=self.data_path,
            extract_data=self.extract_data,
            verbose=self.verbose
        )
        return self.keywords

    def process(self, task='default'):
        """Processes the metadata of a task.

        Parameters
        ----------
        task : str, optional
            Name of the task.

        Returns
        -------
        dict
            Dictionary with the task name as key and a dictionary with the
            metadata filename and categories as value.
        """
        task_ = self.parse_task_name(task)
        if self.verbose:
            print("\nProcessing '{}' task:".format(task_))
        hdf5_filename = self.process_metadata(task_)
        return {task_: {"filename": hdf5_filename, "categories": self.keywords}}

    def parse_task_name(self, task):
        """Parses the task name to a valid name."""
        if task == '' or task == 'default':
            return self.default_task
        else:
            return task

    def process_metadata(self, task):
        """Processes the metadata for a task.

        Parameters
        ----------
        task : str
            Name of the task.

        Returns
        -------
        str
            File name + path of the resulting HDF5 metadata file of the task.
        """
        constructor = self.get_task_constructor(task)
        processor = constructor(data_path=self.data_path,
                                cache_path=self.cache_path,
                                verbose=self.verbose)
        return processor.run()

    def get_task_constructor(self, task):
        """Returns the class constructor for the input task.

        Parameters
        ----------
        task : str
            Name of the task.

        Returns
        -------
        BaseTask
            Constructor to process the metadata of a task.
        """
        assert task
        return self.tasks[task]
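

# Usage sketch for the newer API (illustrative only): same pattern as above,
# but tasks map to BaseTaskNew subclasses, which are constructed with keyword
# arguments and receive no suffix. `CocoLike` and `DetectionTask` are assumed
# names for the example.
#
#   class CocoLike(BaseDatasetNew):
#       urls = ('http://example.com/coco_like.zip',)
#       keywords = ('object_detection',)
#       tasks = {'detection': DetectionTask}   # DetectionTask subclasses BaseTaskNew
#       default_task = 'detection'
#
#   dataset = CocoLike(data_path='/data/coco_like', cache_path='/cache/coco_like')
#   dataset.download()
#   info = dataset.process('detection')   # {'detection': {'filename': ..., 'categories': ...}}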


class BaseTask(object):
    """Base class for processing a task of a dataset.

    Parameters
    ----------
    data_path : str
        Path to the data directory.
    cache_path : str
        Path to the cache file.
    suffix : str, optional
        Suffix to select optional properties for a task.
    verbose : bool, optional
        Be verbose.

    Attributes
    ----------
    data_path : str
        Path to the data directory.
    cache_path : str
        Path to the cache file.
    suffix : str
        Suffix to select optional properties for a task.
    verbose : bool
        Be verbose.
    filename_h5 : str
        HDF5 metadata file name.

    """

    # name of the task file
    filename_h5 = 'task'

    def __init__(self, data_path, cache_path, suffix=None, verbose=True):
        """Initialize class."""
        assert data_path
        assert cache_path
        self.cache_path = cache_path
        self.data_path = data_path
        self.suffix = suffix
        self.verbose = verbose

    def load_data(self):
        """Load data of the dataset (create a generator).

        Load data from annotations and split it into the corresponding
        sets (train, val, test, etc.).
        """
        pass  # stub

    def add_data_to_source(self, hdf5_handler, data, set_name=None):
        """Store data annotations in a nested tree fashion.

        It closely follows the tree structure of the data.

        Parameters
        ----------
        hdf5_handler : h5py._hl.group.Group
            hdf5 group object handler.
        data : list/dict
            List or dict containing the data annotations of a particular set or sets.
        set_name : str
            Set name.
        """
        pass  # stub

    def add_data_to_default(self, handler, data, set_name=None):
        """Add data of a set to the default group.

        For each field, the data is organized into a single big matrix.

        Parameters
        ----------
        handler : h5py._hl.group.Group
            hdf5 group object handler.
        data : list/dict
            List or dict containing the data annotations of a particular set or sets.
        set_name : str
            Set name.
        """
        pass  # stub

    def process_metadata(self):
        """Process metadata and store it in an HDF5 file."""
        # create/open hdf5 file with subgroups for train/val/test
        if self.suffix:
            file_name = os.path.join(self.cache_path, self.filename_h5 + self.suffix + '.h5')
        else:
            file_name = os.path.join(self.cache_path, self.filename_h5 + '.h5')
        fileh5 = h5py.File(file_name, 'w', libver='latest')

        if self.verbose:
            print('\n==> Storing metadata to file: {}'.format(file_name))

        # setup data generator
        data_gen = self.load_data()

        for data in data_gen:
            for set_name in data:
                if self.verbose:
                    print('\nSaving set metadata: {}'.format(set_name))

                # add data to the **source** group
                if self.suffix == '_s':
                    sourceg = fileh5.create_group(set_name + '/source')
                    self.add_data_to_source(sourceg, data[set_name], set_name)

                # add data to the **default** group
                defaultg = fileh5.create_group(set_name)
                self.add_data_to_default(defaultg, data[set_name], set_name)

        # close file
        fileh5.close()

        # return information of the task + cache file
        return file_name

    def run(self):
        """Run task processing."""
        filename = self.process_metadata()
        return filename
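

# Sketch of a concrete task built on BaseTask (illustrative only): `load_data`
# yields one dict per split and `add_data_to_default` writes each field as a
# single dataset. `ToyClassification` and the field names are assumptions;
# `numpy` and the h5py `create_dataset` call are the only external APIs used.
#
#   import numpy as np
#
#   class ToyClassification(BaseTask):
#       filename_h5 = 'classification'
#
#       def load_data(self):
#           yield {'train': {'images': np.zeros((10, 28, 28), dtype=np.uint8),
#                            'labels': np.arange(10, dtype=np.int32)}}
#
#       def add_data_to_default(self, handler, data, set_name=None):
#           for field in data:
#               handler.create_dataset(field, data=data[field])
#
#   task = ToyClassification(data_path='/data/toy', cache_path='/cache/toy')
#   hdf5_file = task.run()   # writes /cache/toy/classification.h5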


class BaseTaskNew(object):
    """Base class for processing the metadata of a task of a dataset.

    Parameters
    ----------
    data_path : str
        Path to the data directory.
    cache_path : str
        Path to the cache file.
    verbose : bool, optional
        Displays text information to the screen (if True).

    Attributes
    ----------
    data_path : str
        Path to the data directory of the dataset.
    cache_path : str
        Path to store the HDF5 metadata file of a dataset in the cache directory.
    verbose : bool
        Displays text information to the screen (if True).
    filename_h5 : str
        Name of the HDF5 file.
    hdf5_filepath : str
        File name + path of the HDF5 metadata file on disk.

    """

    filename_h5 = ''  # name of the task file

    def __init__(self, data_path, cache_path, verbose=True):
        """Initialize class."""
        assert data_path, "Must insert a valid data path"
        assert cache_path, "Must insert a valid cache path"
        self.cache_path = cache_path
        self.data_path = data_path
        self.verbose = verbose
        self.hdf5_filepath = self.get_hdf5_save_filename()
        self.hdf5_manager = None

    def get_hdf5_save_filename(self):
        """Builds the HDF5 file name + path on disk."""
        return os.path.join(self.cache_path, self.filename_h5 + '.h5')

    def run(self):
        """Main method. Runs the task metadata processing.

        It creates an HDF5 file on disk to store the resulting subgroups of
        the dataset's set partitions (e.g., train/val/test). Then, it loads
        the dataset's raw metadata from disk into memory as a generator,
        retrieves the data fields obtained in the processing stage and saves
        them into the HDF5 file on disk.

        Returns
        -------
        str
            File name + path of the task's HDF5 metadata file.
        """
        self.setup_hdf5_manager()
        data_generator = self.load_data()
        self.process_metadata(data_generator)
        self.teardown_hdf5_manager()
        return self.hdf5_filepath

    def setup_hdf5_manager(self):
        """Sets up the metadata manager to store the processed data to disk."""
        if self.verbose:
            print('\n==> Storing metadata to file: {}'.format(self.hdf5_filepath))
        self.hdf5_manager = HDF5Manager(filename=self.hdf5_filepath)

    def load_data(self):
        """Loads the dataset's (meta)data from disk (create a generator).

        Load data from annotations and split it into the corresponding
        sets (train, val, test, etc.).

        Returns
        -------
        generator
            A sequence of dictionary objects with a key-value pair with the
            name of the set split and the data.
        """
        pass  # stub

    def process_metadata(self, data_generator):
        """Processes the dataset's (meta)data and stores it into an HDF5 file."""
        for data in data_generator:
            for set_name in data:
                if self.verbose:
                    print('\nSaving set metadata: {}'.format(set_name))
                self.process_set_metadata(data[set_name], set_name)

    def process_set_metadata(self, data, set_name):
        """Sets up the set's data fields to be stored in the HDF5 metadata file.

        All fields set in this method are organized as a single big matrix.
        This results in much faster data retrieval than by traversing nested
        groups + datasets in an HDF5 file.

        Parameters
        ----------
        data : dict
            Dictionary containing the data annotations of a set split.
        set_name : str
            Name of the set split.
        """
        pass  # stub

    def teardown_hdf5_manager(self):
        """Closes the HDF5Manager object after the metadata has been saved to disk."""
        self.hdf5_manager.close()


class BaseField(object):
    """Base class for the dataset's data fields processor."""

    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            setattr(self, key, value)

    def save_field_to_hdf5(self, set_name, field, data, **kwargs):
        """Saves data of a field into the HDF5 metadata file.

        Parameters
        ----------
        set_name : str
            Name of the set split.
        field : str
            Name of the data field.
        data : np.ndarray
            Numpy ndarray of the field's data.
        """
        self.hdf5_manager.add_field_to_group(
            group=set_name,
            field=field,
            data=data,
            **kwargs
        )
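

# Sketch of a concrete task built on BaseTaskNew and BaseField (illustrative
# only): `process_set_metadata` stores each field through the HDF5Manager set
# up by `run()`. `ToyDetection`, `LabelField` and the field names are assumed
# for the example; `add_field_to_group` is the call used by BaseField above.
#
#   import numpy as np
#
#   class LabelField(BaseField):
#       def process(self):
#           labels = np.array(self.data['labels'], dtype=np.int32)
#           self.save_field_to_hdf5(self.set_name, 'labels', labels)
#
#   class ToyDetection(BaseTaskNew):
#       filename_h5 = 'detection'
#
#       def load_data(self):
#           yield {'train': {'labels': [0, 1, 1, 0]}}
#
#       def process_set_metadata(self, data, set_name):
#           LabelField(data=data, set_name=set_name,
#                      hdf5_manager=self.hdf5_manager).process()
#
#   task = ToyDetection(data_path='/data/toy', cache_path='/cache/toy')
#   hdf5_file = task.run()   # writes /cache/toy/detection.h5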