Source code for dbcollection.core.api.add

"""
Add API class.
"""


from __future__ import print_function

from dbcollection.core.manager import CacheManager


def add(name, task, data_dir, hdf5_filename, categories=(), verbose=True, force_overwrite=False):
    """Add a dataset/task to the list of available datasets for loading.

    Parameters
    ----------
    name : str
        Name of the dataset.
    task : str
        Name of the task to load.
    data_dir : str
        Path of the stored data in disk.
    hdf5_filename : str
        Path to the metadata HDF5 file.
    categories : str or list or tuple, optional
        Keyword string(s) to categorize the dataset. A single string is
        treated as one keyword and ``None`` as "no keywords"; any other
        iterable is converted to a tuple.
    verbose : bool, optional
        Displays text information (if true).
    force_overwrite : bool, optional
        Forces the overwrite of data in cache

    Raises
    ------
    AssertionError
        If ``name``, ``task``, ``data_dir`` or ``hdf5_filename`` is empty.

    Examples
    --------
    Add a dataset manually to dbcollection.

    >>> import dbcollection as dbc
    >>> dbc.add('new_db', 'new_task', 'new/path/db', 'newdb.h5', ['new_category'])
    >>> dbc.query('new_db')
    {'new_db': {'tasks': {'new_task': 'newdb.h5'}, 'data_dir': 'new/path/db',
    'keywords': ['new_category']}}

    """
    # Explicit checks instead of ``assert`` statements: asserts are removed
    # when Python runs with -O, which would let empty values reach the cache.
    # AssertionError is kept so existing callers see the same exception type.
    required = (
        (name, "Must input a valid name."),
        (task, "Must input a valid task."),
        (data_dir, "Must input a valid data_dir."),
        (hdf5_filename, "Must input a valid file_path."),
    )
    for value, message in required:
        if not value:
            raise AssertionError(message)

    # Normalize the keywords to a tuple, which is what AddAPI requires.
    if categories is None:
        categories = ()
    elif isinstance(categories, str):
        # A bare string is a single keyword, not an iterable of characters.
        categories = (categories,)
    else:
        categories = tuple(categories)

    db_adder = AddAPI(name=name,
                      task=task,
                      data_dir=data_dir,
                      hdf5_filename=hdf5_filename,
                      categories=categories,
                      verbose=verbose,
                      force_overwrite=force_overwrite)

    db_adder.run()


class AddAPI(object):
    """Add dataset API class.

    This class contains methods to correctly register
    a dataset in the cache.

    Parameters
    ----------
    name : str
        Name of the dataset.
    task : str
        Name of the task to load.
    data_dir : str
        Path of the stored data in disk.
    hdf5_filename : str
        Path to the metadata HDF5 file.
    categories : tuple
        List of keyword strings to categorize the dataset.
    verbose : bool
        Displays text information.
    force_overwrite : bool
        Forces the overwrite of data in cache

    Attributes
    ----------
    name : str
        Name of the dataset.
    task : str
        Name of the task to load.
    data_dir : str
        Path of the stored data in disk.
    hdf5_filename : str
        Path to the metadata HDF5 file.
    categories : tuple
        Tuple of keyword strings to categorize the dataset.
    verbose : bool
        Displays text information.
    force_overwrite : bool
        Forces the overwrite of data in cache
    cache_manager : CacheManager
        Cache manager object.

    Raises
    ------
    AssertionError
        If any constructor argument has the wrong type.

    """

    def __init__(self, name, task, data_dir, hdf5_filename, categories, verbose, force_overwrite):
        """Initialize class."""
        # Explicit checks instead of ``assert`` statements: asserts are
        # removed under ``python -O``. AssertionError and the messages are
        # kept so existing callers observe the same failures, in the same order.
        expected_types = (
            (name, str, "Must input a valid name."),
            (task, str, "Must input a valid task."),
            (data_dir, str, "Must input a valid data_dir."),
            (hdf5_filename, str, "Must input a valid file_path."),
            (categories, tuple, "Must input a valid list(tuple) of categories."),
            (verbose, bool, "Must input a valid boolean for verbose."),
            (force_overwrite, bool, "Must input a valid boolean for force_overwrite."),
        )
        for value, expected_type, message in expected_types:
            if not isinstance(value, expected_type):
                raise AssertionError(message)

        self.name = name
        self.task = task
        self.data_dir = data_dir
        self.hdf5_filename = hdf5_filename
        self.categories = categories
        self.verbose = verbose
        self.force_overwrite = force_overwrite
        self.cache_manager = self.get_cache_manager()

    def get_cache_manager(self):
        """Return a new CacheManager instance."""
        return CacheManager()

    def run(self):
        """Main method."""
        self.add_dataset_to_cache()

        if self.verbose:
            print('==> Dataset successfully registered.')

    def add_dataset_to_cache(self):
        """Register the dataset/task, updating the entry if the dataset is already cached."""
        if self.dataset_exists_in_cache(self.name):
            self.update_dataset_cache_data()
            self.add_task_to_cache()
        else:
            self.add_new_data_to_cache()

    def dataset_exists_in_cache(self, name):
        """Return the cache manager's answer to whether dataset ``name`` exists."""
        return self.cache_manager.dataset.exists(name)

    def update_dataset_cache_data(self):
        """Update the cached ``data_dir`` of the dataset, unless ``data_dir`` is empty."""
        # Plain truthiness test; for a string this matches the previous
        # ``any(self.data_dir)`` without iterating over its characters.
        if self.data_dir:
            self.cache_manager.dataset.update(
                name=self.name,
                data_dir=self.data_dir
            )

    def add_task_to_cache(self):
        """Add the task to the dataset's cache entry.

        An existing task entry is only replaced when ``force_overwrite``
        is set.

        Raises
        ------
        Exception
            If the task already exists in cache and ``force_overwrite``
            is not set.

        """
        if not self.check_if_task_exists_in_cache():
            self.add_task_entry_to_cache()
            return

        if not self.force_overwrite:
            msg = "'{}' already exists in cache for '{}'. ".format(self.task, self.name) + \
                  "To overwrite it, you must set 'force_overwrite=True'."
            raise Exception(msg)

        self.update_task_entry_in_cache()

    def check_if_task_exists_in_cache(self):
        """Return the cache manager's answer to whether the task exists for this dataset."""
        return self.cache_manager.task.exists(self.name, self.task)

    def update_task_entry_in_cache(self):
        """Update the existing task entry with the HDF5 filename and categories."""
        self.cache_manager.task.update(
            name=self.name,
            task=self.task,
            filename=self.hdf5_filename,
            categories=self.categories
        )

    def add_task_entry_to_cache(self):
        """Add a new task entry with the HDF5 filename and categories."""
        self.cache_manager.task.add(
            name=self.name,
            task=self.task,
            filename=self.hdf5_filename,
            categories=self.categories
        )

    def add_new_data_to_cache(self):
        """Add a new dataset entry containing this single task."""
        self.cache_manager.dataset.add(
            name=self.name,
            data_dir=self.data_dir,
            tasks={
                self.task: {
                    "filename": self.hdf5_filename,
                    "categories": self.categories
                }
            }
        )