sub_string_delim_at_index

SubStringDelimAtIndexLayer ¤

SubStringDelimAtIndexLayer(
    name=None,
    input_dtype=None,
    output_dtype=None,
    delimiter="_",
    index=0,
    default_value="",
    **kwargs
)

Bases: BaseLayer

Layer which splits a string tensor by a delimiter and returns the substring at the specified index. If the delimiter is the empty string, the string is split into bytes/characters. If the index is negative, start counting from the end of the string. If the index is out of bounds, the default value is returned.

Initialise the SubStringDelimAtIndexLayer layer.

Parameters:

Name	Type	Description	Default
`name`	`Optional[str]`	The name of the layer. Defaults to `None`.	`None`
`input_dtype`	`Optional[str]`	The dtype to cast the input to. Defaults to `None`.	`None`
`output_dtype`	`Optional[str]`	The dtype to cast the output to. Defaults to `None`.	`None`
`delimiter`	`str`	String to split on. Defaults to `"_"`.	`'_'`
`index`	`int`	Index of the substring to return. Defaults to `0`. If the index is negative, start counting from the end of the string.	`0`
`default_value`	`str`	Value to return if index is out of bounds. Defaults to `""`. Defaults to `""`.	`''`

Source code in src/kamae/tensorflow/layers/sub_string_delim_at_index.py

def __init__(
    self,
    name: Optional[str] = None,
    input_dtype: Optional[str] = None,
    output_dtype: Optional[str] = None,
    delimiter: str = "_",
    index: int = 0,
    default_value: str = "",
    **kwargs: Any,
) -> None:
    """
    Initialise the SubStringDelimAtIndexLayer layer.

    :param name: The name of the layer. Defaults to `None`.
    :param input_dtype: The dtype to cast the input to. Defaults to `None`.
    :param output_dtype: The dtype to cast the output to. Defaults to `None`.
    :param delimiter: String to split on. Defaults to `"_"`.
    :param index: Index of the substring to return. Defaults to `0`.
    If the index is negative, start counting from the end of the string.
    :param default_value: Value to return if index is out of bounds.
    Defaults to `""`.
    Defaults to `""`.
    """
    super().__init__(
        name=name, input_dtype=input_dtype, output_dtype=output_dtype, **kwargs
    )
    self.delimiter = delimiter
    self.index = index
    self.default_value = default_value

compatible_dtypes `property` ¤

compatible_dtypes

Returns the compatible dtypes of the layer.

Returns:

Type	Description
`Optional[List[DType]]`	The compatible dtypes of the layer.

_call ¤

_call(inputs, **kwargs)

Splits the input string tensor by the delimiter and returns the substring at the specified index. If the index is out of bounds, the default value is returned.

Decorated with @enforce_single_tensor_input to ensure that the input is a single tensor. Raises an error if an iterable of tensors is passed in.

Parameters:

Name	Type	Description	Default
`inputs`	`Tensor`	Input tensor.	required

Returns:

Type	Description
`Tensor`	Tensor with the substring at the specified index.

Source code in src/kamae/tensorflow/layers/sub_string_delim_at_index.py

@enforce_single_tensor_input
def _call(self, inputs: Tensor, **kwargs: Any) -> Tensor:
    """
    Splits the input string tensor by the delimiter and returns the substring
    at the specified index. If the index is out of bounds, the default value
    is returned.

    Decorated with `@enforce_single_tensor_input` to ensure that the input
    is a single tensor. Raises an error if an iterable of tensors is passed
    in.

    :param inputs: Input tensor.
    :returns: Tensor with the substring at the specified index.
    """
    input_shape = tf.shape(inputs)
    # If the delimiter is empty, we split on bytes/characters.
    # Otherwise, we use the standard string split.
    ragged_strings_split = (
        tf.strings.split(inputs, sep=self.delimiter)
        if self.delimiter != ""
        else tf.strings.bytes_split(inputs)
    )

    if self.index >= 0:
        # The index is fully qualified, therefore, add the index + 1 to the shape
        # and then pad the ragged tensor to that shape. If the index is
        # out of bounds, it returns the default value
        index_shape = tf.constant([self.index + 1])
        input_shape = tf.concat([input_shape, index_shape], axis=0)
        return ragged_strings_split.to_tensor(
            default_value=self.default_value, shape=input_shape
        )[..., self.index]
    else:
        # The index is negative, so we need to resolve the positive index from it.
        resolved_index_tensor = self.resolve_negative_indices(
            ragged_tensor=ragged_strings_split, index=self.index
        )
        if isinstance(resolved_index_tensor, tf.RaggedTensor):
            # The resolved indices can be ragged or a regular tensor, however
            # are always rectangular since we only have a single ragged dimension,
            # and we have found the required index within this.
            resolved_index_tensor = resolved_index_tensor.to_tensor(
                shape=tf.shape(inputs)
            )

        # Pad the ragged tensor to the maximum row_length of the ragged tensor
        # This could be different for each batch, however we return a single index
        # from it, and thus we will have consistent output shapes per batch.
        max_ragged_dim = tf.cast(
            tf.reduce_max(ragged_strings_split.row_lengths(axis=-1)), dtype=tf.int32
        )
        input_shape = tf.concat(
            [input_shape, tf.expand_dims(max_ragged_dim, axis=0)], axis=0
        )
        padded_tensor = ragged_strings_split.to_tensor(
            default_value=self.default_value, shape=input_shape
        )
        # Expand the indices to match the shape of the input
        expanded_indices = tf.expand_dims(resolved_index_tensor, axis=-1)
        # Replace negative indices with zeros temporarily, we will send these to the
        # default value as they are out of bounds
        non_negative_expanded_indices = tf.where(
            expanded_indices < 0,
            tf.constant(0, dtype=expanded_indices.dtype),
            expanded_indices,
        )
        # Gather the resolved indices from the padded tensor, send any negative
        # indices to the default value
        gathered_tensor = tf.where(
            expanded_indices >= 0,
            tf.gather(padded_tensor, non_negative_expanded_indices, batch_dims=-1),
            tf.constant(self.default_value),
        )
        # Squeeze out the extra dimension
        return tf.squeeze(gathered_tensor, axis=-1)

get_config ¤

get_config()

Returns the config of the SubStringDelimAtIndex layer. Used for saving and loading from a model.

Specifically adds the delimiter, index and default_value to the config.

Returns:

Type	Description
`Dict[str, Any]`	Dictionary of the config of the layer.

Source code in src/kamae/tensorflow/layers/sub_string_delim_at_index.py

def get_config(self) -> Dict[str, Any]:
    """
    Returns the config of the SubStringDelimAtIndex layer.
    Used for saving and loading from a model.

    Specifically adds the `delimiter`, `index` and `default_value` to the config.

    :returns: Dictionary of the config of the layer.
    """
    config = super().get_config()
    config.update(
        {
            "delimiter": self.delimiter,
            "index": self.index,
            "default_value": self.default_value,
        }
    )
    return config

resolve_negative_indices `staticmethod` ¤

resolve_negative_indices(ragged_tensor, index)

Resolves negative indices to positive indices.

Parameters:

Name	Type	Description	Default
`ragged_tensor`	`RaggedTensor`	Ragged tensor	required
`index`	`int`	The index to resolve.	required

Returns:

Type	Description
`Tensor`	The resolved index.

Source code in src/kamae/tensorflow/layers/sub_string_delim_at_index.py

@staticmethod
def resolve_negative_indices(
    ragged_tensor: tf.RaggedTensor, index: int
) -> tf.Tensor:
    """
    Resolves negative indices to positive indices.

    :param ragged_tensor: Ragged tensor
    :param index: The index to resolve.
    :returns: The resolved index.
    """
    if index >= 0:
        raise ValueError("Index should be negative to resolve. Got positive index.")
    ragged_row_lengths = ragged_tensor.row_lengths(axis=-1)
    # Positive index is the length of the row + index. So that index = -1
    # resolves to the last dimension
    return tf.math.add(ragged_row_lengths, index)

sub_string_delim_at_index