data_processor module

ACES Data Processor Module:

This module provides functions for data input/output and preprocessing for the ACES project.

DataProcessor

ACES Data Processor Class:

This class provides functions for data input/output and preprocessing for the ACES project.

Source code in aces/data_processor.py
class DataProcessor:
    """
    ACES Data Processor Class:

    This class provides functions for data input/output and preprocessing for the ACES project.
    """

    @staticmethod
    @tf.autograph.experimental.do_not_convert
    def create_tfrecord_from_file(filename: str) -> tf.data.TFRecordDataset:
        """
        Create a TensorFlow Dataset from a TFRecord file.

        Parameters:

        * filename (str): The filename of the TFRecord file.

        Returns:

        * tf.data.TFRecordDataset: The TensorFlow Dataset created from the TFRecord file.
        """
        return tf.data.TFRecordDataset(filename, compression_type="GZIP")

    @staticmethod
    @tf.autograph.experimental.do_not_convert
    def get_sum_tensor(records):
        """
        Gets the total number of tensor record by mapping through them.

        Parameters:

        * records: The input tensor records.

        Returns:

        * tf.Tensor: The total number of tensor records.
        """
        dataset = records.map(lambda x, y: tf.constant(1, dtype=tf.int64), num_parallel_calls=tf.data.AUTOTUNE)
        # Ignore any errors encountered while reading the records (TF 2.x API).
        dataset = dataset.apply(tf.data.experimental.ignore_errors())
        n_tensors = dataset.reduce(np.int64(0), lambda x, y: x + y).numpy()
        return n_tensors

    @staticmethod
    def calculate_n_samples(**config):
        """
        Calculate the number of samples in the training, testing, and validation datasets.

        Parameters:

        **config: The configuration settings.

        Returns:

        * int: The number of training samples.

        * int: The number of testing samples.

        * int: The number of validation samples.

        """
        parser_tupler = partial(DataProcessor.parse_tfrecord,
                         patch_size=config.get("PATCH_SHAPE_SINGLE"),
                         features=config.get("FEATURES"),
                         labels=config.get("LABELS"),
                         depth=config.get("OUT_CLASS_NUM"))

        tf_training_records = tf.data.Dataset.list_files(f"{str(config.get('TRAINING_DIR'))}/*")\
                                             .interleave(DataProcessor.create_tfrecord_from_file, num_parallel_calls=tf.data.AUTOTUNE)
        tf_training_records = tf_training_records.map(parser_tupler, num_parallel_calls=tf.data.AUTOTUNE)


        if config.get("PRINT_DATASET", False):
            DataProcessor.print_dataset_info(tf_training_records, "Training")

        n_training_records = DataProcessor.get_sum_tensor(tf_training_records)

        tf_testing_records = tf.data.Dataset.list_files(f"{str(config.get('TESTING_DIR'))}/*")\
                                            .interleave(DataProcessor.create_tfrecord_from_file, num_parallel_calls=tf.data.AUTOTUNE)
        tf_testing_records = tf_testing_records.map(parser_tupler, num_parallel_calls=tf.data.AUTOTUNE)
        n_testing_records = DataProcessor.get_sum_tensor(tf_testing_records)

        tf_validation_records = tf.data.Dataset.list_files(f"{str(config.get('VALIDATION_DIR'))}/*")\
                                               .interleave(DataProcessor.create_tfrecord_from_file, num_parallel_calls=tf.data.AUTOTUNE)
        tf_validation_records = tf_validation_records.map(parser_tupler, num_parallel_calls=tf.data.AUTOTUNE)
        n_validation_records = DataProcessor.get_sum_tensor(tf_validation_records)

        return n_training_records, n_testing_records, n_validation_records

    @staticmethod
    def print_dataset_info(dataset: tf.data.Dataset, dataset_name: str) -> None:
        """
        Print information about a dataset.

        Parameters:

        * dataset (tf.data.Dataset): The dataset to print information about.

        * dataset_name (str): The name of the dataset.
        """
        print(dataset_name)
        for inputs, outputs in dataset.take(1):
            try:
                print(f"inputs: {inputs.dtype.name} {inputs.shape}")
                print(inputs)
                print(f"outputs: {outputs.dtype.name} {outputs.shape}")
                print(outputs)
            except AttributeError:
                # inputs is a dict of named features rather than a single tensor
                print(" > inputs:")
                for name, values in inputs.items():
                    print(f"    {name}: {values.dtype.name} {values.shape}")
                print(f" > outputs: {outputs.dtype.name} {outputs.shape}")

    @staticmethod
    @tf.function
    def random_transform(dataset: tf.Tensor, label: tf.Tensor) -> tuple:
        """
        Apply the same random flip or rotation to a dataset patch and its label.

        Parameters:

        * dataset (tf.Tensor): The input dataset.

        * label (tf.Tensor): The corresponding label.

        Returns:

        * tuple: The transformed dataset and label.
        """
        x = tf.random.uniform(())
        if x < 0.10:
            dataset = tf.image.flip_left_right(dataset)
            label = tf.image.flip_left_right(label)
        elif tf.math.logical_and(x >= 0.10, x < 0.20):
            dataset = tf.image.flip_up_down(dataset)
            label = tf.image.flip_up_down(label)
        elif tf.math.logical_and(x >= 0.20, x < 0.30):
            dataset = tf.image.flip_left_right(tf.image.flip_up_down(dataset))
            label = tf.image.flip_left_right(tf.image.flip_up_down(label))
        elif tf.math.logical_and(x >= 0.30, x < 0.40):
            dataset = tf.image.rot90(dataset, k=1)
            label = tf.image.rot90(label, k=1)
        elif tf.math.logical_and(x >= 0.40, x < 0.50):
            dataset = tf.image.rot90(dataset, k=2)
            label = tf.image.rot90(label, k=2)
        elif tf.math.logical_and(x >= 0.50, x < 0.60):
            dataset = tf.image.rot90(dataset, k=3)
            label = tf.image.rot90(label, k=3)
        elif tf.math.logical_and(x >= 0.60, x < 0.70):
            dataset = tf.image.flip_left_right(tf.image.rot90(dataset, k=2))
            label = tf.image.flip_left_right(tf.image.rot90(label, k=2))
        else:
            pass  # leave the patch and label unchanged

        return dataset, label

    @staticmethod
    @tf.function
    def parse_tfrecord(example_proto: tf.Tensor, patch_size: int, features: list = None, labels: list = None, depth: int = 1) -> tuple:
        """
        Parse a TFRecord example.

        Parameters:
        * example_proto (tf.Tensor): The example to parse.

        * patch_size (int): The size of the patch.

        * features (list, optional): The list of feature names to include. Default is None.

        * labels (list, optional): The list of label names to include. Default is None.

        * depth (int, optional): The depth of one-hot encoding for the label. Default is 1.

        Returns:

        * tuple: The parsed features and one-hot encoded labels.
        """
        keys = features + labels
        columns = [
            tf.io.FixedLenFeature(shape=[patch_size, patch_size], dtype=tf.float32) for _ in keys
        ]
        proto_struct = dict(zip(keys, columns))
        inputs = tf.io.parse_single_example(example_proto, proto_struct)
        inputs_list = [inputs.get(key) for key in keys]
        stacked = tf.stack(inputs_list, axis=0)
        stacked = tf.transpose(stacked, [1, 2, 0])
        label = stacked[:, :, len(features):]
        y = tf.one_hot(tf.cast(label[:, :, -1], tf.uint8), depth)
        return stacked[:, :, :len(features)], y

    @staticmethod
    @tf.function
    def to_tuple(dataset: tf.Tensor, n_features: int = None, inverse_labels: bool = False) -> tuple:
        """
        Convert a dataset to a tuple of features and labels.

        Parameters:

        * dataset (tf.Tensor): The input dataset.

        * n_features (int, optional): The number of features. Default is None.

        * inverse_labels (bool, optional): Whether to inverse the labels. Default is False.

        Returns:

        * tuple: A tuple containing the features and labels.
        """
        features = dataset[:, :, :, :n_features]
        labels = dataset[:, :, :, n_features:]
        if inverse_labels:
            labels_inverse = tf.math.abs(labels - 1)
            labels = tf.concat([labels_inverse, labels], axis=-1)
        return features, labels

    @staticmethod
    @tf.function
    def parse_tfrecord_with_name(example_proto: tf.Tensor, patch_size: int, features: list = None, labels: list = None) -> dict:
        """
        Parse a TFRecord example with named features.

        Parameters:

        * example_proto (tf.Tensor): The example to parse.

        * patch_size (int): The size of the patch.

        * features (list, optional): The list of feature names to include. Default is None.

        * labels (list, optional): The list of label names to include. Default is None.

        Returns:

        * dict: The parsed features and labels, keyed by name.
        """
        keys = features + labels
        columns = [
            tf.io.FixedLenFeature(shape=[patch_size, patch_size], dtype=tf.float32) for _ in keys
        ]
        proto_struct = dict(zip(keys, columns))
        return tf.io.parse_single_example(example_proto, proto_struct)

    @staticmethod
    @tf.function
    def to_tuple_with_name(inputs: dict, features: list = None, labels: list = None, n_classes: int = 1) -> tuple:
        """
        Convert inputs with named features to a tuple of features and one-hot encoded labels.

        Parameters:

        * inputs (dict): The input dataset of named features.

        * features (list, optional): The list of feature names. Default is None.

        * labels (list, optional): The list of label names. Default is None.

        * n_classes (int, optional): The number of classes for one-hot encoding. Default is 1.

        Returns:

        * tuple: A tuple containing the features and one-hot encoded labels.
        """
        return (
            {name: inputs[name] for name in features},
            tf.one_hot(tf.cast(inputs[labels[0]], tf.uint8), n_classes)
        )

    @staticmethod
    @tf.function
    def parse_tfrecord_dnn(example_proto: tf.Tensor, features: list = None, labels: list = None) -> tuple:
        """
        Parse a TFRecord example for DNN models.

        Parameters:

        * example_proto (tf.Tensor): The example to parse.

        * features (list, optional): The list of feature names to include. Default is None.

        * labels (list, optional): The list of label names to include. Default is None.

        Returns:

        * tuple: A tuple containing the parsed features and labels.
        """
        keys = features + labels
        columns = [
            tf.io.FixedLenFeature(shape=[1], dtype=tf.float32) for _ in keys
        ]
        proto_struct = dict(zip(keys, columns))
        parsed_features = tf.io.parse_single_example(example_proto, proto_struct)
        label = parsed_features.pop(labels[0])
        label = tf.cast(label, tf.int32)
        return parsed_features, label

    @staticmethod
    @tf.function
    def to_tuple_dnn(dataset: dict, label: tf.Tensor, depth: int = 1) -> tuple:
        """
        Convert a dataset for DNN models to a tuple of features and one-hot encoded labels.

        Parameters:

        * dataset (dict): The input dataset.

        * label (tf.Tensor): The label.

        * depth (int, optional): The depth of one-hot encoding. Default is 1.

        Returns:

        * tuple: A tuple containing the features and one-hot encoded labels.
        """
        return tf.transpose(list(dataset.values())), tf.one_hot(indices=label, depth=depth)

    @staticmethod
    def to_tuple_dnn_ai_platform(dataset: dict, label: tf.Tensor, depth: int = 1) -> tuple:
        """
        Convert a dataset for DNN models to a tuple of features and one-hot encoded labels.

        Parameters:

        * dataset (dict): The input dataset.

        * label (tf.Tensor): The label.

        * depth (int, optional): The depth of one-hot encoding. Default is 1.

        Returns:

        * tuple: A tuple containing the features and one-hot encoded labels.
        """
        # wrap each (1,) feature as [[v]] so it has shape (1, 1, 1), and give the one-hot label a batch axis
        return ({k: [[v]] for k, v in dataset.items()}, tf.expand_dims(tf.one_hot(label, depth), axis=0))


    @staticmethod
    @tf.function
    def parse_tfrecord_multi_label(example_proto: tf.Tensor, patch_size: int, features: list = None, labels: list = None) -> tuple:
        """
        Parse a TFRecord example with multiple labels.

        Parameters:

        * example_proto (tf.Tensor): The example to parse.

        * patch_size (int): The size of the patch.

        * features (list, optional): The list of feature names to include. Default is None.

        * labels (list, optional): The list of label names to include. Default is None.

        Returns:

        * tuple: A tuple containing the parsed features and labels.
        """
        keys = features + labels
        columns = [
            tf.io.FixedLenFeature(shape=[patch_size, patch_size], dtype=tf.float32) for _ in keys
        ]
        proto_struct = dict(zip(keys, columns))
        parsed_features = tf.io.parse_single_example(example_proto, proto_struct)
        label = parsed_features.pop(labels[0])
        return parsed_features, label

    @staticmethod
    @tf.function
    def to_tuple_multi_label(dataset: dict, label: tf.Tensor, depth: int = 1, x_only: bool = False) -> tuple:
        """
        Convert a dataset with multiple labels to a tuple of features and multi-hot encoded labels.

        Parameters:

        * dataset (tuple): The input dataset.

        * n_labels (int, optional): The number of labels. Default is 1.

        Returns:

        * tuple: A tuple containing the features and multi-hot encoded labels.
        """
        label = tf.cast(label, tf.uint8)
        label = tf.one_hot(indices=label, depth=depth)
        parsed_dataset = {k: tf.expand_dims(v, axis=2) for k, v in dataset.items()}
        if x_only:
            return parsed_dataset
        return parsed_dataset, label


    @staticmethod
    @tf.function
    def to_tuple_multi_label_ai_platform(dataset: dict, label: tf.Tensor, depth: int = 1) -> tuple:
        """
        Convert a dataset with multiple labels to a tuple of features and multi-hot encoded labels.

        Parameters:

        * dataset (tuple): The input dataset.

        * n_labels (int, optional): The number of labels. Default is 1.

        Returns:

        * tuple: A tuple containing the features and multi-hot encoded labels.
        """
        label = tf.cast(label, tf.uint8)
        label = tf.one_hot(indices=label, depth=depth)
        parsed_dataset = {k: tf.expand_dims(v, axis=2) for k, v in dataset.items()}
        return parsed_dataset, label


    @staticmethod
    def _get_dataset(files: list, features: list, labels: list, patch_shape: int, batch_size: int, buffer_size: int = 1000, training: bool = False, **kwargs) -> tf.data.Dataset:
        """
        Build a TFRecord dataset from a list of files.

        Parameters:
        files (list): The list of TFRecord file names.
        features (list): The list of feature names to include.
        labels (list): The list of label names to include.
        patch_shape (int): The size of the patch.
        batch_size (int): The batch size.
        buffer_size (int, optional): The shuffle buffer size. Default is 1000.
        training (bool, optional): Whether to shuffle and randomly transform the dataset. Default is False.
        **kwargs: Optional flags: dnn (bool), inverse_labels (bool), depth (int), multi_label_unet (bool).

        Returns:

        tf.data.Dataset: The TFRecord dataset.
        """
        dnn = kwargs.get('dnn', False)
        inverse_labels = kwargs.get('inverse_labels', False)
        depth = kwargs.get('depth', len(labels))
        multi_label_unet = kwargs.get('multi_label_unet', False)
        dataset = tf.data.TFRecordDataset(files, compression_type='GZIP')

        if dnn:
            parser = partial(DataProcessor.parse_tfrecord_dnn, features=features, labels=labels)
            split_data = partial(DataProcessor.to_tuple_dnn, depth=depth)
            dataset = dataset.map(parser, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            dataset = dataset.map(split_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            dataset = dataset.batch(batch_size)
            return dataset

        if multi_label_unet:
            parser = partial(DataProcessor.parse_tfrecord_multi_label, features=features, labels=labels, patch_size=patch_shape)
            split_data = partial(DataProcessor.to_tuple_multi_label, depth=depth)
            dataset = dataset.map(parser, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            dataset = dataset.map(split_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            if training:
                # RandomTransform handles the dict-of-bands structure produced by to_tuple_multi_label
                dataset = dataset.shuffle(buffer_size, reshuffle_each_iteration=True) \
                                 .map(RandomTransform(), num_parallel_calls=tf.data.experimental.AUTOTUNE) \
                                 .batch(batch_size)
            else:
                dataset = dataset.batch(batch_size)

            return dataset

        # parse_tfrecord already yields (features, one-hot labels) tuples, so no separate to_tuple step is needed
        parser = partial(DataProcessor.parse_tfrecord, patch_size=patch_shape, features=features, labels=labels, depth=depth)
        dataset = dataset.map(parser, num_parallel_calls=tf.data.experimental.AUTOTUNE)

        if inverse_labels:
            dataset = dataset.map(lambda x, y: (x, tf.concat([tf.math.abs(y - 1), y], axis=-1)),
                                  num_parallel_calls=tf.data.experimental.AUTOTUNE)

        if training:
            dataset = dataset.shuffle(buffer_size, reshuffle_each_iteration=True) \
                             .batch(batch_size) \
                             .map(DataProcessor.random_transform, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        else:
            dataset = dataset.batch(batch_size)

        return dataset

    @staticmethod
    def get_dataset(pattern: str, features: list, labels: list, patch_size: int, batch_size: int, n_classes: int = 1, **kwargs) -> tf.data.Dataset:
        """
        Get a TFRecord dataset.

        Parameters:
        * filenames (list): The list of file names.

        * patch_size (int): The size of the patch.

        * (list, optional): The list of feature names to include. Default is None.

        * labels (list, optional): The list of label names to include. Default is None.

        * batch_size (int, optional): The batch size. Default is 1.

        * shuffle(bool, optional): Whether to shuffle the dataset. Default is False.

        * n_labels (int, optional): The number of labels. Default is 1.

        * num_parallel_calls (int, optional): The number of parallel calls. Default is tf.data.experimental.AUTOTUNE.

        * drop_remainder (bool, optional): Whether to drop the remainder of batches. Default is False.

        * cache (bool, optional): Whether to cache the dataset. Default is False.

        Returns:
        * tf.data.Dataset: The TFRecord dataset.
        """
        print(f"Loading dataset from {pattern}")

        dataset = tf.data.Dataset.list_files(pattern).interleave(DataProcessor.create_tfrecord_from_file)

        if kwargs.get("IS_DNN", False):
            if kwargs.get("USE_AI_PLATFORM", False):
                parser = partial(DataProcessor.parse_tfrecord_dnn, features=features, labels=labels)
                tupler = partial(DataProcessor.to_tuple_dnn_ai_platform, depth=n_classes)
            else:
                parser = partial(DataProcessor.parse_tfrecord_dnn, features=features, labels=labels)
                tupler = partial(DataProcessor.to_tuple_dnn, depth=n_classes)

            dataset = dataset.map(parser, num_parallel_calls=tf.data.AUTOTUNE)
            dataset = dataset.map(tupler, num_parallel_calls=tf.data.AUTOTUNE)
            dataset = dataset.shuffle(512)
            dataset = dataset.batch(batch_size)
            dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
            return dataset

        if kwargs.get("USE_AI_PLATFORM", False):
            parser = partial(DataProcessor.parse_tfrecord_multi_label, patch_size=patch_size, features=features, labels=labels)
            tupler = partial(DataProcessor.to_tuple_multi_label_ai_platform, depth=n_classes)
            parser_tupler = None
        else:
            parser_tupler = partial(DataProcessor.parse_tfrecord, patch_size=patch_size, features=features, labels=labels, depth=n_classes)

        if parser_tupler is not None:
            dataset = dataset.map(parser_tupler, num_parallel_calls=tf.data.AUTOTUNE)
        else:
            dataset = dataset.map(parser, num_parallel_calls=tf.data.AUTOTUNE)
            dataset = dataset.map(tupler, num_parallel_calls=tf.data.AUTOTUNE)

        dataset = dataset.shuffle(512)
        dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
        if kwargs.get("training", False) and kwargs.get("TRANSFORM_DATA", True):
            print("randomly transforming data")
            if kwargs.get("USE_AI_PLATFORM", False):
                dataset = dataset.map(RandomTransform(), num_parallel_calls=tf.data.AUTOTUNE)
            else:
                dataset = dataset.map(DataProcessor.random_transform, num_parallel_calls=tf.data.AUTOTUNE)

        dataset = dataset.batch(batch_size)
        return dataset

calculate_n_samples(**config) staticmethod

Calculate the number of samples in the training, testing, and validation datasets.

**config: The configuration settings.

  • int: The number of training samples.

  • int: The number of testing samples.

  • int: The number of validation samples.

Source code in aces/data_processor.py
@staticmethod
def calculate_n_samples(**config):
    """
    Calculate the number of samples in the training, testing, and validation datasets.

    Parameters:

    **config: The configuration settings.

    Returns:

    * int: The number of training samples.

    * int: The number of testing samples.

    * int: The number of validation samples.

    """
    parser_tupler = partial(DataProcessor.parse_tfrecord,
                     patch_size=config.get("PATCH_SHAPE_SINGLE"),
                     features=config.get("FEATURES"),
                     labels=config.get("LABELS"),
                     depth=config.get("OUT_CLASS_NUM"))

    tf_training_records = tf.data.Dataset.list_files(f"{str(config.get('TRAINING_DIR'))}/*")\
                                         .interleave(DataProcessor.create_tfrecord_from_file, num_parallel_calls=tf.data.AUTOTUNE)
    tf_training_records = tf_training_records.map(parser_tupler, num_parallel_calls=tf.data.AUTOTUNE)


    if config.get("PRINT_DATASET", False):
        DataProcessor.print_dataset_info(tf_training_records, "Training")

    n_training_records = DataProcessor.get_sum_tensor(tf_training_records)

    tf_testing_records = tf.data.Dataset.list_files(f"{str(config.get('TESTING_DIR'))}/*")\
                                        .interleave(DataProcessor.create_tfrecord_from_file, num_parallel_calls=tf.data.AUTOTUNE)
    tf_testing_records = tf_testing_records.map(parser_tupler, num_parallel_calls=tf.data.AUTOTUNE)
    n_testing_records = DataProcessor.get_sum_tensor(tf_testing_records)

    tf_validation_records = tf.data.Dataset.list_files(f"{str(config.get('VALIDATION_DIR'))}/*")\
                                           .interleave(DataProcessor.create_tfrecord_from_file, num_parallel_calls=tf.data.AUTOTUNE)
    tf_validation_records = tf_validation_records.map(parser_tupler, num_parallel_calls=tf.data.AUTOTUNE)
    n_validation_records = DataProcessor.get_sum_tensor(tf_validation_records)

    return n_training_records, n_testing_records, n_validation_records
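
A minimal usage sketch, assuming the configuration keys the method reads (TRAINING_DIR, TESTING_DIR, VALIDATION_DIR, PATCH_SHAPE_SINGLE, FEATURES, LABELS, OUT_CLASS_NUM); the bucket paths and band names below are placeholders:

from aces.data_processor import DataProcessor

config = {
    "TRAINING_DIR": "gs://my-bucket/training",       # placeholder paths
    "TESTING_DIR": "gs://my-bucket/testing",
    "VALIDATION_DIR": "gs://my-bucket/validation",
    "PATCH_SHAPE_SINGLE": 256,
    "FEATURES": ["red", "green", "blue", "nir"],     # placeholder band names
    "LABELS": ["class"],
    "OUT_CLASS_NUM": 2,
}

n_train, n_test, n_val = DataProcessor.calculate_n_samples(**config)
print(n_train, n_test, n_val)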

create_tfrecord_from_file(filename) staticmethod

Create a TensorFlow Dataset from a TFRecord file.

  • filename (str): The filename of the TFRecord file.
  • tf.data.TFRecordDataset: The TensorFlow Dataset created from the TFRecord file.
Source code in aces/data_processor.py
@staticmethod
@tf.autograph.experimental.do_not_convert
def create_tfrecord_from_file(filename: str) -> tf.data.TFRecordDataset:
    """
    Create a TensorFlow Dataset from a TFRecord file.

    Parameters:

    * filename (str): The filename of the TFRecord file.

    Returns:

    * tf.data.TFRecordDataset: The TensorFlow Dataset created from the TFRecord file.
    """
    return tf.data.TFRecordDataset(filename, compression_type="GZIP")
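
A quick sketch of reading one raw record; "training.tfrecord.gz" is a placeholder file name, and the file must be GZIP-compressed because the reader is constructed with compression_type="GZIP":

import tensorflow as tf
from aces.data_processor import DataProcessor

dataset = DataProcessor.create_tfrecord_from_file("training.tfrecord.gz")
for raw_record in dataset.take(1):
    # each element is a serialized tf.train.Example, ready for tf.io.parse_single_example
    print(raw_record.numpy()[:20])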

get_dataset(pattern, features, labels, patch_size, batch_size, n_classes=1, **kwargs) staticmethod

Get a TFRecord dataset.

  • pattern (str): The file pattern (glob) matching the TFRecord files.

  • features (list): The list of feature names to include.

  • labels (list): The list of label names to include.

  • patch_size (int): The size of the patch.

  • batch_size (int): The batch size.

  • n_classes (int, optional): The number of classes for one-hot encoding. Default is 1.

  • **kwargs: Optional flags: IS_DNN (bool), USE_AI_PLATFORM (bool), training (bool), TRANSFORM_DATA (bool).

  • tf.data.Dataset: The TFRecord dataset.
Source code in aces/data_processor.py
@staticmethod
def get_dataset(pattern: str, features: list, labels: list, patch_size: int, batch_size: int, n_classes: int = 1, **kwargs) -> tf.data.Dataset:
    """
    Get a TFRecord dataset.

    Parameters:
    * filenames (list): The list of file names.

    * patch_size (int): The size of the patch.

    * (list, optional): The list of feature names to include. Default is None.

    * labels (list, optional): The list of label names to include. Default is None.

    * batch_size (int, optional): The batch size. Default is 1.

    * shuffle(bool, optional): Whether to shuffle the dataset. Default is False.

    * n_labels (int, optional): The number of labels. Default is 1.

    * num_parallel_calls (int, optional): The number of parallel calls. Default is tf.data.experimental.AUTOTUNE.

    * drop_remainder (bool, optional): Whether to drop the remainder of batches. Default is False.

    * cache (bool, optional): Whether to cache the dataset. Default is False.

    Returns:
    * tf.data.Dataset: The TFRecord dataset.
    """
    print(f"Loading dataset from {pattern}")

    dataset = tf.data.Dataset.list_files(pattern).interleave(DataProcessor.create_tfrecord_from_file)

    if kwargs.get("IS_DNN", False):
        if kwargs.get("USE_AI_PLATFORM", False):
            parser = partial(DataProcessor.parse_tfrecord_dnn, features=features, labels=labels)
            tupler = partial(DataProcessor.to_tuple_dnn_ai_platform, depth=n_classes)
        else:
            parser = partial(DataProcessor.parse_tfrecord_dnn, features=features, labels=labels)
            tupler = partial(DataProcessor.to_tuple_dnn, depth=n_classes)

        dataset = dataset.map(parser, num_parallel_calls=tf.data.AUTOTUNE)
        dataset = dataset.map(tupler, num_parallel_calls=tf.data.AUTOTUNE)
        dataset = dataset.shuffle(512)
        dataset = dataset.batch(batch_size)
        dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
        return dataset

    if kwargs.get("USE_AI_PLATFORM", False):
        parser = partial(DataProcessor.parse_tfrecord_multi_label, patch_size=patch_size, features=features, labels=labels)
        tupler = partial(DataProcessor.to_tuple_multi_label_ai_platform, depth=n_classes)
        parser_tupler = None
    else:
        parser_tupler = partial(DataProcessor.parse_tfrecord, patch_size=patch_size, features=features, labels=labels, depth=n_classes)

    if parser_tupler is not None:
        dataset = dataset.map(parser_tupler, num_parallel_calls=tf.data.AUTOTUNE)
    else:
        dataset = dataset.map(parser, num_parallel_calls=tf.data.AUTOTUNE)
        dataset = dataset.map(tupler, num_parallel_calls=tf.data.AUTOTUNE)

    dataset = dataset.shuffle(512)
    dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
    if kwargs.get("training", False) and kwargs.get("TRANSFORM_DATA", True):
        print("randomly transforming data")
        if kwargs.get("USE_AI_PLATFORM", False):
            dataset = dataset.map(RandomTransform(), num_parallel_calls=tf.data.AUTOTUNE)
        else:
            dataset = dataset.map(DataProcessor.random_transform, num_parallel_calls=tf.data.AUTOTUNE)

    dataset = dataset.batch(batch_size)
    return dataset
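
A hypothetical call building a U-Net-style training dataset; the glob pattern, band names, and sizes are placeholders:

from aces.data_processor import DataProcessor

train_ds = DataProcessor.get_dataset(
    pattern="gs://my-bucket/training/*",
    features=["red", "green", "blue", "nir"],
    labels=["class"],
    patch_size=256,
    batch_size=32,
    n_classes=2,
    training=True,  # with TRANSFORM_DATA left at its default, applies random transforms
)

for x, y in train_ds.take(1):
    print(x.shape, y.shape)  # (32, 256, 256, 4) (32, 256, 256, 2)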

get_sum_tensor(records) staticmethod

Count the total number of records in a dataset by mapping each record to a constant and summing.

  • records: The input dataset of (features, label) records.
  • np.int64: The total number of records.
Source code in aces/data_processor.py
@staticmethod
@tf.autograph.experimental.do_not_convert
def get_sum_tensor(records):
    """
    Gets the total number of tensor record by mapping through them.

    Parameters:

    * records: The input tensor records.

    Returns:

    * tf.Tensor: The total number of tensor records.
    """
    dataset = records.map(lambda x, y: tf.constant(1, dtype=tf.int64), num_parallel_calls=tf.data.AUTOTUNE)
    # Ignore any errors encountered while reading the records (TF 2.x API).
    dataset = dataset.apply(tf.data.experimental.ignore_errors())
    n_tensors = dataset.reduce(np.int64(0), lambda x, y: x + y).numpy()
    return n_tensors
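
A short sketch: the counting lambda takes two arguments, so the input should be a dataset of (features, label) pairs, e.g. the output of parse_tfrecord; the file name and band names are placeholders:

from functools import partial
import tensorflow as tf
from aces.data_processor import DataProcessor

parser = partial(DataProcessor.parse_tfrecord,
                 patch_size=256, features=["red", "nir"], labels=["class"], depth=2)
records = DataProcessor.create_tfrecord_from_file("training.tfrecord.gz") \
                       .map(parser, num_parallel_calls=tf.data.AUTOTUNE)
print(DataProcessor.get_sum_tensor(records))  # total record count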

parse_tfrecord(example_proto, patch_size, features=None, labels=None, depth=1) staticmethod

Parse a TFRecord example.

  • example_proto (tf.Tensor): The example to parse.

  • patch_size (int): The size of the patch.

  • features (list, optional): The list of feature names to include. Default is None.

  • labels (list, optional): The list of label names to include. Default is None.

  • depth (int, optional): The depth of one-hot encoding for the label. Default is 1.

  • tuple: The parsed features and one-hot encoded labels.
Source code in aces/data_processor.py
@staticmethod
@tf.function
def parse_tfrecord(example_proto: tf.Tensor, patch_size: int, features: list = None, labels: list = None, depth: int = 1) -> tuple:
    """
    Parse a TFRecord example.

    Parameters:
    * example_proto (tf.Tensor): The example to parse.

    * patch_size (int): The size of the patch.

    * features (list, optional): The list of feature names to include. Default is None.

    * labels (list, optional): The list of label names to include. Default is None.

    * depth (int, optional): The depth of one-hot encoding for the label. Default is 1.

    Returns:

    * tuple: The parsed features and one-hot encoded labels.
    """
    keys = features + labels
    columns = [
        tf.io.FixedLenFeature(shape=[patch_size, patch_size], dtype=tf.float32) for _ in keys
    ]
    proto_struct = dict(zip(keys, columns))
    inputs = tf.io.parse_single_example(example_proto, proto_struct)
    inputs_list = [inputs.get(key) for key in keys]
    stacked = tf.stack(inputs_list, axis=0)
    stacked = tf.transpose(stacked, [1, 2, 0])
    label = stacked[:, :, len(features):]
    y = tf.one_hot(tf.cast(label[:, :, -1], tf.uint8), depth)
    return stacked[:, :, :len(features)], y
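
Because tf.data map functions receive only the dataset element, the remaining arguments are usually bound with functools.partial, as calculate_n_samples does internally; the file name and band names below are placeholders:

from functools import partial
import tensorflow as tf
from aces.data_processor import DataProcessor

parser = partial(DataProcessor.parse_tfrecord,
                 patch_size=256,
                 features=["red", "green", "blue", "nir"],
                 labels=["class"],
                 depth=2)
ds = DataProcessor.create_tfrecord_from_file("training.tfrecord.gz") \
                  .map(parser, num_parallel_calls=tf.data.AUTOTUNE)
# each element: (features of shape [256, 256, 4], one-hot labels of shape [256, 256, 2])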

parse_tfrecord_dnn(example_proto, features=None, labels=None) staticmethod

Parse a TFRecord example for DNN models.

  • example_proto (tf.Tensor): The example to parse.

  • features (list, optional): The list of feature names to include. Default is None.

  • labels (list, optional): The list of label names to include. Default is None.

  • tuple: A tuple containing the parsed features and labels.
Source code in aces/data_processor.py
@staticmethod
@tf.function
def parse_tfrecord_dnn(example_proto: tf.Tensor, features: list = None, labels: list = None) -> tuple:
    """
    Parse a TFRecord example for DNN models.

    Parameters:

    * example_proto (tf.Tensor): The example to parse.

    * features (list, optional): The list of feature names to include. Default is None.

    * labels (list, optional): The list of label names to include. Default is None.

    Returns:

    * tuple: A tuple containing the parsed features and labels.
    """
    keys = features + labels
    columns = [
        tf.io.FixedLenFeature(shape=[1], dtype=tf.float32) for _ in keys
    ]
    proto_struct = dict(zip(keys, columns))
    parsed_features = tf.io.parse_single_example(example_proto, proto_struct)
    label = parsed_features.pop(labels[0])
    label = tf.cast(label, tf.int32)
    return parsed_features, label
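
A sketch of the tabular (DNN) pipeline, pairing this parser with to_tuple_dnn as get_dataset does on its IS_DNN path; the file name and feature names are placeholders:

from functools import partial
import tensorflow as tf
from aces.data_processor import DataProcessor

parser = partial(DataProcessor.parse_tfrecord_dnn, features=["red", "nir"], labels=["class"])
tupler = partial(DataProcessor.to_tuple_dnn, depth=2)
ds = DataProcessor.create_tfrecord_from_file("points.tfrecord.gz") \
                  .map(parser, num_parallel_calls=tf.data.AUTOTUNE) \
                  .map(tupler, num_parallel_calls=tf.data.AUTOTUNE) \
                  .batch(64)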

parse_tfrecord_multi_label(example_proto, patch_size, features=None, labels=None) staticmethod

Parse a TFRecord example with multiple labels.

  • example_proto (tf.Tensor): The example to parse.

  • patch_size (int): The size of the patch.

  • features (list, optional): The list of feature names to include. Default is None.

  • labels (list, optional): The list of label names to include. Default is None.

  • tuple: A tuple containing the parsed features and labels.
Source code in aces/data_processor.py
@staticmethod
@tf.function
def parse_tfrecord_multi_label(example_proto: tf.Tensor, patch_size: int, features: list = None, labels: list = None) -> tuple:
    """
    Parse a TFRecord example with multiple labels.

    Parameters:

    * example_proto (tf.Tensor): The example to parse.

    * patch_size (int): The size of the patch.

    * features (list, optional): The list of feature names to include. Default is None.

    * labels (list, optional): The list of label names to include. Default is None.

    Returns:

    * tuple: A tuple containing the parsed features and labels.
    """
    keys = features + labels
    columns = [
        tf.io.FixedLenFeature(shape=[patch_size, patch_size], dtype=tf.float32) for _ in keys
    ]
    proto_struct = dict(zip(keys, columns))
    parsed_features = tf.io.parse_single_example(example_proto, proto_struct)
    label = parsed_features.pop(labels[0])
    return parsed_features, label

parse_tfrecord_with_name(example_proto, patch_size, features=None, labels=None) staticmethod

Parse a TFRecord example with named features.

  • example_proto (tf.Tensor): The example to parse.

  • patch_size (int): The size of the patch.

  • features (list, optional): The list of feature names to include. Default is None.

  • labels (list, optional): The list of label names to include. Default is None.

  • dict: The parsed features and labels, keyed by name.
Source code in aces/data_processor.py
@staticmethod
@tf.function
def parse_tfrecord_with_name(example_proto: tf.Tensor, patch_size: int, features: list = None, labels: list = None) -> dict:
    """
    Parse a TFRecord example with named features.

    Parameters:

    * example_proto (tf.Tensor): The example to parse.

    * patch_size (int): The size of the patch.

    * features (list, optional): The list of feature names to include. Default is None.

    * labels (list, optional): The list of label names to include. Default is None.

    Returns:

    * dict: The parsed features and labels, keyed by name.
    """
    keys = features + labels
    columns = [
        tf.io.FixedLenFeature(shape=[patch_size, patch_size], dtype=tf.float32) for _ in keys
    ]
    proto_struct = dict(zip(keys, columns))
    return tf.io.parse_single_example(example_proto, proto_struct)
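
A sketch pairing this parser with to_tuple_with_name to keep the features as a named dict (useful for models with one named input per band); the file name and band names are placeholders:

from functools import partial
import tensorflow as tf
from aces.data_processor import DataProcessor

FEATURES, LABELS = ["red", "nir"], ["class"]
parser = partial(DataProcessor.parse_tfrecord_with_name,
                 patch_size=256, features=FEATURES, labels=LABELS)
tupler = partial(DataProcessor.to_tuple_with_name,
                 features=FEATURES, labels=LABELS, n_classes=2)
ds = DataProcessor.create_tfrecord_from_file("training.tfrecord.gz") \
                  .map(parser, num_parallel_calls=tf.data.AUTOTUNE) \
                  .map(tupler, num_parallel_calls=tf.data.AUTOTUNE)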

print_dataset_info(dataset, dataset_name) staticmethod

Print information about a dataset.

  • dataset (tf.data.Dataset): The dataset to print information about.

  • dataset_name (str): The name of the dataset.

Source code in aces/data_processor.py
@staticmethod
def print_dataset_info(dataset: tf.data.Dataset, dataset_name: str) -> None:
    """
    Print information about a dataset.

    Parameters:

    * dataset (tf.data.Dataset): The dataset to print information about.

    * dataset_name (str): The name of the dataset.
    """
    print(dataset_name)
    for inputs, outputs in dataset.take(1):
        try:
            print(f"inputs: {inputs.dtype.name} {inputs.shape}")
            print(inputs)
            print(f"outputs: {outputs.dtype.name} {outputs.shape}")
            print(outputs)
        except AttributeError:
            # inputs is a dict of named features rather than a single tensor
            print(" > inputs:")
            for name, values in inputs.items():
                print(f"    {name}: {values.dtype.name} {values.shape}")
            print(f" > outputs: {outputs.dtype.name} {outputs.shape}")

random_transform(dataset, label) staticmethod

Apply the same random flip or rotation to a dataset patch and its label.

  • dataset (tf.Tensor): The input dataset.
  • label (tf.Tensor): The corresponding label.
  • tuple: The transformed dataset and label.
Source code in aces/data_processor.py
@staticmethod
@tf.function
def random_transform(dataset: tf.Tensor, label: tf.Tensor) -> tuple:
    """
    Apply the same random flip or rotation to a dataset patch and its label.

    Parameters:

    * dataset (tf.Tensor): The input dataset.

    * label (tf.Tensor): The corresponding label.

    Returns:

    * tuple: The transformed dataset and label.
    """
    x = tf.random.uniform(())
    if x < 0.10:
        dataset = tf.image.flip_left_right(dataset)
        label = tf.image.flip_left_right(label)
    elif tf.math.logical_and(x >= 0.10, x < 0.20):
        dataset = tf.image.flip_up_down(dataset)
        label = tf.image.flip_up_down(label)
    elif tf.math.logical_and(x >= 0.20, x < 0.30):
        dataset = tf.image.flip_left_right(tf.image.flip_up_down(dataset))
        label = tf.image.flip_left_right(tf.image.flip_up_down(label))
    elif tf.math.logical_and(x >= 0.30, x < 0.40):
        dataset = tf.image.rot90(dataset, k=1)
        label = tf.image.rot90(label, k=1)
    elif tf.math.logical_and(x >= 0.40, x < 0.50):
        dataset = tf.image.rot90(dataset, k=2)
        label = tf.image.rot90(label, k=2)
    elif tf.math.logical_and(x >= 0.50, x < 0.60):
        dataset = tf.image.rot90(dataset, k=3)
        label = tf.image.rot90(label, k=3)
    elif tf.math.logical_and(x >= 0.60, x < 0.70):
        dataset = tf.image.flip_left_right(tf.image.rot90(dataset, k=2))
        label = tf.image.flip_left_right(tf.image.rot90(label, k=2))
    else:
        pass  # leave the patch and label unchanged

    return dataset, label
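
Since both arguments are plain tensors, this can be mapped directly over a parsed (features, labels) dataset; the synthetic tensors below merely stand in for parsed patches:

import tensorflow as tf
from aces.data_processor import DataProcessor

x = tf.random.uniform((256, 256, 4))                     # stand-in feature patch
y = tf.one_hot(tf.zeros((256, 256), dtype=tf.uint8), 2)  # stand-in one-hot label patch
ds = tf.data.Dataset.from_tensors((x, y)).repeat(8)
ds = ds.map(DataProcessor.random_transform, num_parallel_calls=tf.data.AUTOTUNE)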

to_tuple(dataset, n_features=None, inverse_labels=False) staticmethod

Convert a dataset to a tuple of features and labels.

  • dataset (tf.Tensor): The input dataset.

  • n_features (int, optional): The number of features. Default is None.

  • inverse_labels (bool, optional): Whether to inverse the labels. Default is False.

  • tuple: A tuple containing the features and labels.
Source code in aces/data_processor.py
@staticmethod
@tf.function
def to_tuple(dataset: tf.Tensor, n_features: int = None, inverse_labels: bool = False) -> tuple:
    """
    Convert a dataset to a tuple of features and labels.

    Parameters:

    * dataset (tf.Tensor): The input dataset.

    * n_features (int, optional): The number of features. Default is None.

    * inverse_labels (bool, optional): Whether to inverse the labels. Default is False.

    Returns:

    * tuple: A tuple containing the features and labels.
    """
    features = dataset[:, :, :, :n_features]
    labels = dataset[:, :, :, n_features:]
    if inverse_labels:
        labels_inverse = tf.math.abs(labels - 1)
        labels = tf.concat([labels_inverse, labels], axis=-1)
    return features, labels
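
Note that the slicing assumes a 4-D (batched) stacked tensor whose last axis holds the feature bands followed by the label bands; a synthetic sketch:

import tensorflow as tf
from aces.data_processor import DataProcessor

# stand-in batch of 8 patches: 4 feature bands + 1 label band on the last axis
stacked = tf.random.uniform((8, 256, 256, 5))
x, y = DataProcessor.to_tuple(stacked, n_features=4)
print(x.shape, y.shape)  # (8, 256, 256, 4) (8, 256, 256, 1)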

to_tuple_dnn(dataset, label, depth=1) staticmethod

Convert a dataset for DNN models to a tuple of features and one-hot encoded labels.

  • dataset (dict): The input dataset.

  • label (tf.Tensor): The label.

  • depth (int, optional): The depth of one-hot encoding. Default is 1.

  • tuple: A tuple containing the features and one-hot encoded labels.
Source code in aces/data_processor.py
@staticmethod
@tf.function
def to_tuple_dnn(dataset: dict, label: tf.Tensor, depth: int = 1) -> tuple:
    """
    Convert a dataset for DNN models to a tuple of features and one-hot encoded labels.

    Parameters:

    * dataset (dict): The input dataset.

    * label (tf.Tensor): The label.

    * depth (int, optional): The depth of one-hot encoding. Default is 1.

    Returns:

    * tuple: A tuple containing the features and one-hot encoded labels.
    """
    return tf.transpose(list(dataset.values())), tf.one_hot(indices=label, depth=depth)

to_tuple_dnn_ai_platform(dataset, label, depth=1) staticmethod

Convert a dataset for DNN models to a tuple of features and one-hot encoded labels.

  • dataset (dict): The input dataset.

  • label (tf.Tensor): The label.

  • depth (int, optional): The depth of one-hot encoding. Default is 1.

  • tuple: A tuple containing the features and one-hot encoded labels.
Source code in aces/data_processor.py
@staticmethod
def to_tuple_dnn_ai_platform(dataset: dict, label: tf.Tensor, depth: int = 1) -> tuple:
    """
    Convert a dataset for DNN models to a tuple of features and one-hot encoded labels.

    Parameters:

    * dataset (dict): The input dataset.

    * label (tf.Tensor): The label.

    * depth (int, optional): The depth of one-hot encoding. Default is 1.

    Returns:

    * tuple: A tuple containing the features and one-hot encoded labels.
    """
    # wrap each (1,) feature as [[v]] so it has shape (1, 1, 1), and give the one-hot label a batch axis
    return ({k: [[v]] for k, v in dataset.items()}, tf.expand_dims(tf.one_hot(label, depth), axis=0))

to_tuple_multi_label(dataset, label, depth=1, x_only=False) staticmethod

Convert a dataset with multiple labels to a tuple of features and one-hot encoded labels.

  • dataset (dict): The input dataset of named feature patches.

  • label (tf.Tensor): The label patch.

  • depth (int, optional): The depth of one-hot encoding. Default is 1.

  • x_only (bool, optional): Whether to return only the features. Default is False.

  • tuple: The features and one-hot encoded labels (or only the features when x_only is True).
Source code in aces/data_processor.py
@staticmethod
@tf.function
def to_tuple_multi_label(dataset: dict, label: tf.Tensor, depth: int = 1, x_only: bool = False) -> tuple:
    """
    Convert a dataset with multiple labels to a tuple of features and multi-hot encoded labels.

    Parameters:

    * dataset (tuple): The input dataset.

    * n_labels (int, optional): The number of labels. Default is 1.

    Returns:

    * tuple: A tuple containing the features and multi-hot encoded labels.
    """
    label = tf.cast(label, tf.uint8)
    label = tf.one_hot(indices=label, depth=depth)
    parsed_dataset = {k: tf.expand_dims(v, axis=2) for k, v in dataset.items()}
    if x_only:
        return parsed_dataset
    return parsed_dataset, label
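
A synthetic sketch: each named feature gains a trailing channel axis and the label patch becomes one-hot:

import tensorflow as tf
from aces.data_processor import DataProcessor

features = {"red": tf.random.uniform((256, 256)),  # stand-in band patches
            "nir": tf.random.uniform((256, 256))}
label = tf.zeros((256, 256))
x, y = DataProcessor.to_tuple_multi_label(features, label, depth=2)
print(x["red"].shape, y.shape)  # (256, 256, 1) (256, 256, 2)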

to_tuple_multi_label_ai_platform(dataset, label, depth=1) staticmethod

Convert a dataset with multiple labels to a tuple of features and one-hot encoded labels.

  • dataset (dict): The input dataset of named feature patches.

  • label (tf.Tensor): The label patch.

  • depth (int, optional): The depth of one-hot encoding. Default is 1.

  • tuple: The features and one-hot encoded labels.
Source code in aces/data_processor.py
@staticmethod
@tf.function
def to_tuple_multi_label_ai_platform(dataset: dict, label: tf.Tensor, depth: int = 1) -> tuple:
    """
    Convert a dataset with multiple labels to a tuple of features and multi-hot encoded labels.

    Parameters:

    * dataset (tuple): The input dataset.

    * n_labels (int, optional): The number of labels. Default is 1.

    Returns:

    * tuple: A tuple containing the features and multi-hot encoded labels.
    """
    label = tf.cast(label, tf.uint8)
    label = tf.one_hot(indices=label, depth=depth)
    parsed_dataset = {k: tf.expand_dims(v, axis=2) for k, v in dataset.items()}
    return parsed_dataset, label

to_tuple_with_name(inputs, features=None, labels=None, n_classes=1) staticmethod

Convert inputs with named features to a tuple of features and one-hot encoded labels.

  • inputs (dict): The input dataset of named features.

  • features (list, optional): The list of feature names. Default is None.

  • labels (list, optional): The list of label names. Default is None.

  • n_classes (int, optional): The number of classes for one-hot encoding. Default is 1.

  • tuple: A tuple containing the features and one-hot encoded labels.
Source code in aces/data_processor.py
@staticmethod
@tf.function
def to_tuple_with_name(inputs: dict, features: list = None, labels: list = None, n_classes: int = 1) -> tuple:
    """
    Convert inputs with named features to a tuple of features and one-hot encoded labels.

    Parameters:

    * inputs (dict): The input dataset of named features.

    * features (list, optional): The list of feature names. Default is None.

    * labels (list, optional): The list of label names. Default is None.

    * n_classes (int, optional): The number of classes for one-hot encoding. Default is 1.

    Returns:

    * tuple: A tuple containing the features and one-hot encoded labels.
    """
    return (
        {name: inputs[name] for name in features},
        tf.one_hot(tf.cast(inputs[labels[0]], tf.uint8), n_classes)
    )

RandomTransform (Layer)

Source code in aces/data_processor.py
class RandomTransform(tf.keras.layers.Layer):
    def __init__(self, seed=42, unit_range=True):
        super().__init__()
        self.seed = seed
        self.flip_horizontal = tf.keras.layers.RandomFlip("horizontal", seed=self.seed)
        self.flip_vertical = tf.keras.layers.RandomFlip("vertical", seed=self.seed)
        self.flip_both = tf.keras.layers.RandomFlip("horizontal_and_vertical", seed=self.seed)
        self.random_brightness = tf.keras.layers.RandomBrightness(0.2, value_range=(0, 1) if unit_range else (0, 255), seed=self.seed)
        self.random_contrast = tf.keras.layers.RandomContrast(0.2, seed=self.seed)

    @tf.function
    def call(self, dataset, label):
        """
        Apply random transformations to a dataset.

        Parameters:

        * dataset (tf.Tensor): The input dataset.

        * label (tf.Tensor): The corresponding label.

        Returns:

        * tuple: The transformed dataset and label as a tuple.
        """
        x = tf.random.uniform((), seed=self.seed)
        transformed_features = {}

        # Apply the same random transformation across all bands or features
        for key, feature in dataset.items():
            transformed_feature = feature  # Default to no change
            if x < 0.10:
                transformed_feature = self.flip_horizontal(feature)
            elif tf.math.logical_and(x >= 0.10, x < 0.20):
                transformed_feature = self.flip_vertical(feature)
            elif tf.math.logical_and(x >= 0.20, x < 0.30):
                transformed_feature = self.flip_both(feature)
            elif tf.math.logical_and(x >= 0.30, x < 0.40):
                transformed_feature = tf.image.rot90(feature, k=1)
            elif tf.math.logical_and(x >= 0.40, x < 0.50):
                transformed_feature = tf.image.rot90(feature, k=2)
            elif tf.math.logical_and(x >= 0.50, x < 0.60):
                transformed_feature = tf.image.rot90(feature, k=3)
            elif tf.math.logical_and(x >= 0.60, x < 0.70):
                transformed_feature = self.random_brightness(feature)
            elif tf.math.logical_and(x >= 0.70, x < 0.80):
                transformed_feature = self.random_contrast(feature)

            transformed_features[key] = transformed_feature

        # Apply corresponding transformations to the label
        transformed_label = label  # Default to no change
        if x < 0.10:
            transformed_label = self.flip_horizontal(label)
        elif tf.math.logical_and(x >= 0.10, x < 0.20):
            transformed_label = self.flip_vertical(label)
        elif tf.math.logical_and(x >= 0.20, x < 0.30):
            transformed_label = self.flip_both(label)
        elif tf.math.logical_and(x >= 0.30, x < 0.40):
            transformed_label = tf.image.rot90(label, k=1)
        elif tf.math.logical_and(x >= 0.40, x < 0.50):
            transformed_label = tf.image.rot90(label, k=2)
        elif tf.math.logical_and(x >= 0.50, x < 0.60):
            transformed_label = tf.image.rot90(label, k=3)

        return transformed_features, transformed_label

call(self, dataset, label)

Apply random transformations to a dataset.

  • dataset (tf.Tensor): The input dataset.

  • label (tf.Tensor): The corresponding label.

  • tuple: The transformed dataset and label as a tuple.
Source code in aces/data_processor.py
@tf.function
def call(self, dataset, label):
    """
    Apply random transformations to a dataset.

    Parameters:

    * dataset (tf.Tensor): The input dataset.

    * label (tf.Tensor): The corresponding label.

    Returns:

    * tuple: The transformed dataset and label as a tuple.
    """
    x = tf.random.uniform((), seed=self.seed)
    transformed_features = {}

    # Apply the same random transformation across all bands or features
    for key, feature in dataset.items():
        transformed_feature = feature  # Default to no change
        if x < 0.10:
            transformed_feature = self.flip_horizontal(feature)
        elif tf.math.logical_and(x >= 0.10, x < 0.20):
            transformed_feature = self.flip_vertical(feature)
        elif tf.math.logical_and(x >= 0.20, x < 0.30):
            transformed_feature = self.flip_both(feature)
        elif tf.math.logical_and(x >= 0.30, x < 0.40):
            transformed_feature = tf.image.rot90(feature, k=1)
        elif tf.math.logical_and(x >= 0.40, x < 0.50):
            transformed_feature = tf.image.rot90(feature, k=2)
        elif tf.math.logical_and(x >= 0.50, x < 0.60):
            transformed_feature = tf.image.rot90(feature, k=3)
        elif tf.math.logical_and(x >= 0.60, x < 0.70):
            transformed_feature = self.random_brightness(feature)
        elif tf.math.logical_and(x >= 0.70, x < 0.80):
            transformed_feature = self.random_contrast(feature)

        transformed_features[key] = transformed_feature

    # Apply corresponding transformations to the label
    transformed_label = label  # Default to no change
    if x < 0.10:
        transformed_label = self.flip_horizontal(label)
    elif tf.math.logical_and(x >= 0.10, x < 0.20):
        transformed_label = self.flip_vertical(label)
    elif tf.math.logical_and(x >= 0.20, x < 0.30):
        transformed_label = self.flip_both(label)
    elif tf.math.logical_and(x >= 0.30, x < 0.40):
        transformed_label = tf.image.rot90(label, k=1)
    elif tf.math.logical_and(x >= 0.40, x < 0.50):
        transformed_label = tf.image.rot90(label, k=2)
    elif tf.math.logical_and(x >= 0.50, x < 0.60):
        transformed_label = tf.image.rot90(label, k=3)

    return transformed_features, transformed_label
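
A synthetic sketch of mapping the layer over a ({name: tensor}, label) dataset, matching how get_dataset applies it on the USE_AI_PLATFORM path; the band name and shapes are placeholders:

import tensorflow as tf
from aces.data_processor import RandomTransform

features = {"red": tf.random.uniform((256, 256, 1))}  # stand-in named band patch
label = tf.zeros((256, 256, 2))                       # stand-in one-hot label patch
ds = tf.data.Dataset.from_tensors((features, label)).repeat(8)
ds = ds.map(RandomTransform(), num_parallel_calls=tf.data.AUTOTUNE)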