Source code for solaris.nets.datagen

from tensorflow import keras
import numpy as np
import rasterio
from torch.utils.data import Dataset, DataLoader
from .transform import _check_augs, process_aug_dict
from ..utils.core import _check_df_load
from ..utils.geo import split_geom
from ..utils.io import imread, _check_channel_order


def make_data_generator(framework, config, df, stage='train'):
    """Create an appropriate data generator based on the framework used.

    A wrapper for the high-end ``solaris`` API to create data generators.
    Using the ``config`` dictionary, this function creates an instance of
    either :class:`KerasSegmentationSequence` or :class:`TorchDataset`
    (depending on the framework used for the pipeline). If using Torch, this
    instance is then wrapped in a :class:`torch.utils.data.DataLoader` and
    returned; if Keras, the sequence object is directly returned.

    Arguments
    ---------
    framework : str
        One of ``'keras'``, ``'pytorch'`` or ``'torch'`` (case-insensitive),
        the deep learning framework used for the model.
    config : dict
        The config dictionary for the entire pipeline. The keys read here are
        ``'training_augmentation'`` / ``'validation_augmentation'``,
        ``'batch_size'`` and ``'data_specs'``.
    df : :class:`pandas.DataFrame` or :class:`str`
        A :class:`pandas.DataFrame` containing two columns: ``'image'``, with
        the path to images for training, and ``'label'``, with the path to the
        label file corresponding to each image.
    stage : str, optional
        Either ``'train'`` or ``'validate'``, indicates whether the object
        created is being used for training or validation. This determines
        which augmentations from the config file are applied within the
        returned object, and whether samples are shuffled (validation data is
        never shuffled).

    Returns
    -------
    data_gen : :class:`KerasSegmentationSequence` or :class:`torch.utils.data.DataLoader`
        An object to pass data into the :class:`solaris.nets.train.Trainer`
        instance during model training.

    Raises
    ------
    ValueError
        If `framework` or `stage` is not one of the accepted values.

    See Also
    --------
    :class:`KerasSegmentationSequence`
    :class:`TorchDataset`
    :class:`InferenceTiler`
    """
    # Normalise once so validation and dispatch agree on the same spelling.
    framework_key = framework.lower()
    if framework_key not in ('keras', 'pytorch', 'torch'):
        raise ValueError('{} is not an accepted value for `framework`'.format(
            framework))
    # Fail early with a clear message instead of an UnboundLocalError later.
    if stage not in ('train', 'validate'):
        raise ValueError('{} is not an accepted value for `stage`'.format(
            stage))

    # make sure the df is loaded
    df = _check_df_load(df)

    if stage == 'train':
        augs = config['training_augmentation']
        shuffle = augs['shuffle']
    else:  # stage == 'validate'
        augs = config['validation_augmentation']
        shuffle = False

    data_specs = config['data_specs']
    num_classes = data_specs.get('num_classes', 1)

    if framework_key == 'keras':
        return KerasSegmentationSequence(
            df,
            height=data_specs['height'],
            width=data_specs['width'],
            input_channels=data_specs['channels'],
            output_channels=data_specs['mask_channels'],
            augs=augs,
            batch_size=config['batch_size'],
            label_type=data_specs['label_type'],
            is_categorical=data_specs['is_categorical'],
            num_classes=num_classes,
            shuffle=shuffle)

    # framework_key is 'torch' or 'pytorch'
    dataset = TorchDataset(
        df,
        augs=augs,
        batch_size=config['batch_size'],
        label_type=data_specs['label_type'],
        is_categorical=data_specs['is_categorical'],
        num_classes=num_classes,
        dtype=data_specs['dtype'])
    # set up workers for DataLoader for pytorch
    data_workers = data_specs.get('data_workers')
    if data_workers == 1 or data_workers is None:
        data_workers = 0  # for DataLoader to run in main process
    # Use the stage-dependent ``shuffle`` so validation order stays fixed,
    # matching the Keras branch.
    return DataLoader(
        dataset,
        batch_size=config['batch_size'],
        shuffle=shuffle,
        num_workers=data_workers)
class KerasSegmentationSequence(keras.utils.Sequence):
    """Stream image/mask batches from files into a Keras model in solaris.

    Attributes
    ----------
    df : :class:`pandas.DataFrame`
        The :class:`pandas.DataFrame` specifying where inputs are stored.
    height : int
        The height of generated images.
    width : int
        The width of generated images.
    input_channels : int
        The number of channels in generated inputs.
    output_channels : int
        The number of channels in target masks created.
    aug : :class:`albumentations.core.composition.Compose`
        An albumentations Compose object that imagery is passed through
        before being handed to the neural net. If an augmentation config
        subdict was provided during initialization, this is created by
        parsing the dict with :func:`solaris.nets.transform.process_aug_dict`.
    batch_size : int
        The batch size generated.
    n_batches : int
        The number of batches per epoch. Inferred from the number of input
        files in `df` and `batch_size`.
    label_type : str
        Type of labels. Currently always ``"mask"``.
    is_categorical : bool
        Indicates whether masks output are boolean or categorical labels.
    num_classes: int
        Indicates the number of classes in the dataset
    shuffle : bool
        Indicates whether or not input order is shuffled for each epoch.
    """

    def __init__(self, df, height, width, input_channels, output_channels,
                 augs, batch_size, label_type='mask', is_categorical=False,
                 num_classes=1, shuffle=True):
        """Create an instance of KerasSegmentationSequence.

        Arguments
        ---------
        df : :class:`pandas.DataFrame`
            A pandas DataFrame specifying images and label files to read into
            the model. See `the reference file creation tutorial`_ for more.
        height : int
            The height of model inputs in pixels.
        width : int
            The width of model inputs in pixels.
        input_channels : int
            The number of channels in model input imagery.
        output_channels : int
            The number of channels in the model output.
        augs : :class:`dict` or :class:`albumentations.core.composition.Compose`
            Either the config subdict specifying augmentations to apply, or a
            pre-created :class:`albumentations.core.composition.Compose`
            object containing all of the augmentations to apply.
        batch_size : int
            The number of samples in a training batch.
        label_type : str, optional
            The type of labels to be used. At present, only ``"mask"`` is
            supported.
        is_categorical : bool, optional
            Is the data categorical or boolean (default)?
        num_classes: int
            Indicates the number of classes in the dataset
        shuffle : bool, optional
            Should image order be shuffled in each epoch?

        .. _the reference file creation tutorial: https://solaris.readthedocs.io/en/latest/tutorials/notebooks/creating_im_reference_csvs.html
        """
        # TODO: IMPLEMENT GETTING INPUT FILE LISTS HERE!
        self.df = df
        self.height, self.width = height, width
        self.input_channels = input_channels
        self.output_channels = output_channels
        # _check_augs accepts either form of `augs` and returns the loaded one
        self.aug = _check_augs(augs)
        self.batch_size = batch_size
        # only whole batches are served; leftover samples are dropped
        self.n_batches = int(np.floor(len(self.df)/self.batch_size))
        self.label_type = label_type
        self.is_categorical = is_categorical
        self.num_classes = num_classes
        self.shuffle = shuffle
        # build the initial sample ordering
        self.on_epoch_end()

    def on_epoch_end(self):
        """Rebuild (and optionally reshuffle) the sample order for an epoch."""
        order = np.arange(len(self.df))
        if self.shuffle:
            np.random.shuffle(order)
        self.image_indexes = order

    @staticmethod
    def _with_channel_axis(arr):
        """Return `arr` with a trailing channel axis if it is only 2D."""
        if arr.ndim == 2:
            return arr[..., np.newaxis]
        return arr

    def _data_generation(self, image_idxs):
        """Load, augment and stack the samples at `image_idxs` into a batch."""
        uses_masks = self.label_type == 'mask'
        batch_dims = (self.batch_size, self.height, self.width)
        # initialize the output arrays
        X = np.empty(batch_dims + (self.input_channels,))
        if uses_masks:
            y = np.empty(batch_dims + (self.output_channels,))
        # TODO: IMPLEMENT BBOX LABEL SETUP HERE!

        for slot in range(self.batch_size):
            row = image_idxs[slot]
            im = _check_channel_order(imread(self.df['image'].iloc[row]),
                                      'keras')
            if not uses_masks:
                raise NotImplementedError(
                    'Usage of non-mask labels is not implemented yet.')
            label = imread(self.df['label'].iloc[row])
            if not self.is_categorical:
                # collapse every non-background value to a single class
                label[label != 0] = 1
            augmented = self.aug(image=im, mask=label)
            X[slot] = self._with_channel_axis(augmented['image'])
            y[slot] = self._with_channel_axis(augmented['mask'])

        return X, y

    def __len__(self):
        """Return the number of batches per epoch.

        This is a required method for Keras Sequence objects.
        """
        return self.n_batches

    def __getitem__(self, index):
        """Generate the batch of data at position `index`."""
        start = index * self.batch_size
        stop = start + self.batch_size
        return self._data_generation(
            image_idxs=self.image_indexes[start:stop])
class TorchDataset(Dataset):
    """A PyTorch dataset object for solaris.

    Note that this object is wrapped in a
    :class:`torch.utils.data.DataLoader` before being passed to the
    :class:`solaris.nets.train.Trainer` instance.

    Attributes
    ----------
    df : :class:`pandas.DataFrame`
        The :class:`pandas.DataFrame` specifying where inputs are stored.
    aug : :class:`albumentations.core.composition.Compose`
        An albumentations Compose object to pass imagery through before
        passing it into the neural net. If an augmentation config subdict was
        provided during initialization, this is created by parsing the dict
        with :func:`solaris.nets.transform.process_aug_dict`.
    batch_size : int
        The batch size generated.
    n_batches : int
        The number of batches per epoch. Inferred based on the number of
        input files in `df` and `batch_size`.
    label_type : str
        Type of labels. Currently always ``"mask"``.
    is_categorical : bool
        Indicates whether masks output are boolean or categorical labels.
    num_classes: int
        Indicates the number of classes in the dataset
    dtype : :class:`numpy.dtype`
        The data type images should be converted to before being passed to
        neural nets.
    """

    def __init__(self, df, augs, batch_size, label_type='mask',
                 is_categorical=False, num_classes=1, dtype=None):
        """Create an instance of TorchDataset for use in model training.

        Arguments
        ---------
        df : :class:`pandas.DataFrame`
            A pandas DataFrame specifying images and label files to read into
            the model. See `the reference file creation tutorial`_ for more.
        augs : :class:`dict` or :class:`albumentations.core.composition.Compose`
            Either the config subdict specifying augmentations to apply, or a
            pre-created :class:`albumentations.core.composition.Compose`
            object containing all of the augmentations to apply.
        batch_size : int
            The number of samples in a training batch.
        label_type : str, optional
            The type of labels to be used. At present, only ``"mask"`` is
            supported.
        is_categorical : bool, optional
            Is the data categorical or boolean (default)?
        num_classes: int
            Indicates the number of classes in the dataset
        dtype : str, numpy scalar type or :class:`numpy.dtype`, optional
            The dtype that image arrays should be converted to before being
            passed to the neural net. If not provided, defaults to
            ``"float32"``. Strings must name one of the
            `numpy dtype options`_.

        Raises
        ------
        ValueError
            If `dtype` is a string that does not name a numpy attribute.
        TypeError
            If `dtype` is not ``None``, a string, a numpy numeric scalar type
            or a :class:`numpy.dtype` instance.

        .. _the reference file creation tutorial: https://solaris.readthedocs.io/en/latest/tutorials/notebooks/creating_im_reference_csvs.html
        .. _numpy dtype options: https://docs.scipy.org/doc/numpy/user/basics.types.html
        """
        super().__init__()
        self.df = df
        self.batch_size = batch_size
        self.n_batches = int(np.floor(len(self.df)/self.batch_size))
        self.aug = _check_augs(augs)
        # stored for parity with KerasSegmentationSequence
        self.label_type = label_type
        self.is_categorical = is_categorical
        self.num_classes = num_classes
        self.dtype = self._resolve_dtype(dtype)

    @staticmethod
    def _resolve_dtype(dtype):
        """Translate the user-supplied `dtype` into something ``astype`` accepts."""
        if dtype is None:
            return np.float32  # default
        # if it's a string, get the appropriate object
        if isinstance(dtype, str):
            try:
                return getattr(np, dtype)
            except AttributeError as err:
                raise ValueError(
                    'The data type {} is not supported'.format(dtype)) from err
        # np.dtype instances must be tested before issubclass(), which raises
        # TypeError when handed anything that is not a class.
        if isinstance(dtype, np.dtype):
            return dtype
        if isinstance(dtype, type) and issubclass(dtype, np.number):
            return dtype
        # Previously this case left ``self.dtype`` unset and only failed later
        # inside __getitem__; fail here where the cause is obvious.
        raise TypeError(
            'dtype must be None, a str, a numpy numeric type or a numpy.dtype '
            'instance, got {!r}'.format(dtype))

    def __len__(self):
        """Return the number of samples (not batches) in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Get one image, mask pair as a dict with channels-first arrays."""
        image = imread(self.df['image'].iloc[idx])
        mask = imread(self.df['label'].iloc[idx])
        if not self.is_categorical:
            # collapse every non-background value to a single class
            mask[mask != 0] = 1
        # make sure both arrays carry an explicit channel axis
        if len(mask.shape) == 2:
            mask = mask[:, :, np.newaxis]
        if len(image.shape) == 2:
            image = image[:, :, np.newaxis]
        sample = {'image': image, 'mask': mask}

        if self.aug:
            sample = self.aug(**sample)
        # TODO: add in additional inputs (if applicable), e.g. extra columns
        # of ``self.df`` listed under config['data_specs']['additional_inputs']

        sample['image'] = _check_channel_order(
            sample['image'], 'torch').astype(self.dtype)
        # masks are always handed over as float32, independent of self.dtype
        sample['mask'] = _check_channel_order(
            sample['mask'], 'torch').astype(np.float32)
        return sample
class InferenceTiler(object):
    """An object to tile fragments of images for inference.

    This object allows you to pass images of arbitrary size into Solaris for
    inference, similar to the pre-existing CosmiQ Works tool, BASISS_. The
    object will step across an input image creating tiles of size
    ``[height, width]``, taking steps of size ``[y_step, x_step]`` as it goes.
    When it reaches an edge, it will take tiles from ``-height`` or ``-width``
    to the edge. Clearly, these can overlap with one another; the intention is
    that overlaps will be resolved using
    :func:`solaris.raster.image.stitch_images` when re-creating the output.

    .. _BASISS: https://github.com/cosmiq/basiss

    Attributes
    ----------
    framework : str
        The deep learning framework used. Can be one of ``"torch"``,
        ``"pytorch"``, or ``"keras"``.
    width : int
        The width of images to load into the neural net.
    height : int
        The height of images to load into the neural net.
    x_step : int, optional
        The step size taken in the x direction when sampling for new images.
    y_step : int, optional
        The step size taken in the y direction when sampling for new images.
    aug : :class:`albumentations.core.composition.Compose`
        Augmentations to apply before passing to a neural net. Generally used
        for pre-processing.

    See Also
    --------
    :func:`solaris.raster.image.stitch_images`
    :func:`make_data_generator`
    """

    def __init__(self, framework, width, height, x_step=None, y_step=None,
                 augmentations=None):
        """Create the tiler instance.

        Arguments
        ---------
        framework : str
            The deep learning framework used. Can be one of ``"torch"``,
            ``"pytorch"``, or ``"keras"``.
        width : int
            The width of images to load into the neural net.
        height : int
            The height of images to load into the neural net.
        x_step : int, optional
            The step size taken in the x direction when sampling for new
            images. If not provided, defaults to `width`.
        y_step : int, optional
            The step size taken in the y direction when sampling for new
            images. If not provided, defaults to `height`.
        augmentations : :class:`dict` or :class:`albumentations.core.composition.Compose`, optional
            Augmentations to apply before passing to a neural net. Generally
            used for pre-processing. Passed through ``_check_augs`` and
            stored as ``self.aug``.
        """
        self.framework = framework
        self.width = width
        self.height = height
        # default to non-overlapping tiles
        self.x_step = self.width if x_step is None else x_step
        self.y_step = self.height if y_step is None else y_step
        self.aug = _check_augs(augmentations)

    @staticmethod
    def _window_starts(src_len, window, step):
        """List the start offsets of sliding windows along one axis.

        Windows advance by `step`; any window that would run past the end is
        pulled back so it finishes exactly at `src_len`.
        """
        n_steps = int(1 + np.ceil((src_len - window) / step))
        last_start = src_len - window
        return [min(step * i, last_start) for i in range(n_steps)]

    def __call__(self, im):
        """Create an inference array along with an indexing reference list.

        Arguments
        ---------
        im : :class:`str` or :class:`numpy.array`
            An image to perform inference on.

        Returns
        -------
        output_arr, top_left_corner_idxs, src_im_shape

        output_arr : :class:`numpy.array`
            A ``float32`` :class:`numpy.array` for use in model inferencing.
            Each item along the first axis corresponds to a single sample for
            the model. Tiles are stacked as ``[N, Y, X, C]``; for the
            ``"torch"``/``"pytorch"`` frameworks the channel axis is moved
            forward, giving ``[N, C, Y, X]``.
        top_left_corner_idxs : :class:`list` of :class:`tuple` s of :class:`int` s
            A :class:`list` of ``(top, left)`` tuples corresponding to the top
            left corner indices of each sample along the first axis of
            ``output_arr``. These values can be used to stitch the
            inferencing result back together.
        src_im_shape : :class:`tuple` of :class:`int` s
            The ``(height, width)`` of the source image.

        Raises
        ------
        ValueError
            If the image is smaller than the tile size along either axis.
        """
        # read in the image if it's a path
        if isinstance(im, str):
            im = imread(im)
        src_im_height = im.shape[0]
        src_im_width = im.shape[1]
        # A tile larger than the image would yield a negative start offset
        # and silently produce wrong-sized tiles; refuse it explicitly.
        if src_im_height < self.height or src_im_width < self.width:
            raise ValueError(
                'Image of size ({}, {}) is smaller than the tile size '
                '({}, {}).'.format(src_im_height, src_im_width,
                                   self.height, self.width))
        if len(im.shape) == 2:  # if there's no channel axis
            im = im[:, :, np.newaxis]  # create one - will be needed for model

        # determine where each sliding-window sample starts
        y_starts = self._window_starts(src_im_height, self.height, self.y_step)
        x_starts = self._window_starts(src_im_width, self.width, self.x_step)

        top_left_corner_idxs = []
        output_arr = []
        for y_min in y_starts:
            for x_min in x_starts:
                subarr = im[y_min:y_min + self.height,
                            x_min:x_min + self.width, :]
                if self.aug is not None:
                    subarr = self.aug(image=subarr)['image']
                output_arr.append(subarr)
                top_left_corner_idxs.append((y_min, x_min))
        output_arr = np.stack(output_arr).astype(np.float32)
        # torch models expect channels-first input; compare case-insensitively
        # to match make_data_generator's handling of `framework`
        if self.framework.lower() in ('torch', 'pytorch'):
            output_arr = np.moveaxis(output_arr, 3, 1)
        return output_arr, top_left_corner_idxs, (src_im_height, src_im_width)