
array_crop

ArrayCropParams

Bases: PadValueParams

Mixin class containing the array length and pad value parameters needed for array crop transformers.

getArrayLength

getArrayLength()

Gets the array length parameter.

Returns:

Type Description
int

array length.

Source code in src/kamae/spark/transformers/array_crop.py
def getArrayLength(self) -> int:
    """
    Gets the array length parameter.
    :returns: array length.
    """
    return self.getOrDefault(self.arrayLength)

setArrayLength

setArrayLength(value)

Sets the parameter array length to the given value.

Parameters:

Name Type Description Default
value int

array length.

required

Returns:

Type Description
ArrayCropParams

Instance of the class this mixin is mixed into.

Source code in src/kamae/spark/transformers/array_crop.py
def setArrayLength(self, value: int) -> "ArrayCropParams":
    """
    Sets the parameter array length to the given value.
    :param value: array length.
    :returns: Instance of the class this mixin is mixed into.
    """
    if value < 1:
        raise ValueError("Array length must be greater than 0.")
    return self._set(arrayLength=value)
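
For illustration, a minimal sketch of the setter's validation behaviour, assuming the parameters are accessed through the ArrayCropTransformer documented below (the import path is inferred from the source file location and may differ):

from kamae.spark.transformers.array_crop import ArrayCropTransformer  # assumed import path

transformer = ArrayCropTransformer(inputCol="input_col", outputCol="output_col")
transformer.setArrayLength(3)
assert transformer.getArrayLength() == 3

# Values below 1 are rejected by setArrayLength.
try:
    transformer.setArrayLength(0)
except ValueError as err:
    print(err)  # Array length must be greater than 0.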

ArrayCropTransformer

ArrayCropTransformer(
    inputCol=None,
    outputCol=None,
    inputDtype=None,
    outputDtype=None,
    layerName=None,
    arrayLength=128,
    padValue=None,
)

Bases: BaseTransformer, SingleInputSingleOutputParams, ArrayCropParams

Transformer that reshapes arrays into a consistent shape by either cropping or padding.

If the array is longer than the specified length, it is cropped; if it is shorter, it is padded with the specified pad value.

Initialises the ArrayCropTransformer.

Parameters:

Name Type Description Default
inputCol Optional[str]

Input column name.

None
outputCol Optional[str]

Output column name.

None
inputDtype Optional[Union[str, int, float]]

Input data type to cast input column(s) to before transforming.

None
outputDtype Optional[Union[str, int, float]]

Output data type to cast the output column to after transforming.

None
layerName Optional[str]

Name of the layer. Used as the name of the tensorflow layer.

None
arrayLength Optional[int]

The length to crop or pad the arrays to. Defaults to 128.

128
padValue Optional[Union[str, int, float]]

The value to pad the arrays with. Defaults to None.

None

Returns:

Type Description
None

None

Source code in src/kamae/spark/transformers/array_crop.py
@keyword_only
def __init__(
    self,
    inputCol: Optional[str] = None,
    outputCol: Optional[str] = None,
    inputDtype: Optional[Union[str, int, float]] = None,
    outputDtype: Optional[Union[str, int, float]] = None,
    layerName: Optional[str] = None,
    arrayLength: Optional[int] = 128,
    padValue: Optional[Union[str, int, float]] = None,
) -> None:
    """
    Initialises the ArrayCropTransformer
    :param inputCol: Input column name.
    :param outputCol: Output column name.
    :param inputDtype: Input data type to cast input column(s) to before
    transforming.
    :param outputDtype: Output data type to cast the output column to after
    transforming.
    :param layerName: Name of the layer. Used as the name of the tensorflow layer.
    :param arrayLength: The length to crop or pad the arrays to. Defaults to 128.
    :param padValue: The value to pad the arrays with. Defaults to `None`.
    :returns: None
    """
    super().__init__()
    kwargs = self._input_kwargs
    self.setParams(**kwargs)
    self._pad_type_to_valid_element_types = {
        "int": ["int", "bigint", "smallint"],
        "float": ["float", "double", "decimal(10,0)"],
        "string": ["string"],
        "boolean": ["boolean"],
    }
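
A hedged end-to-end usage sketch, assuming the import path inferred from the source file location and a running Spark session; the expected output follows the crop/pad semantics described above:

from pyspark.sql import SparkSession

from kamae.spark.transformers.array_crop import ArrayCropTransformer  # assumed import path

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(["a", "b", "c", "d"],), (["x"],), ([],)],
    ["input_col"],
)

cropper = ArrayCropTransformer(
    inputCol="input_col",
    outputCol="output_col",
    arrayLength=3,
    padValue="-1",
)
cropper.transform(df).show(truncate=False)
# ["a", "b", "c", "d"] -> ["a", "b", "c"]      (cropped)
# ["x"]                -> ["x", "-1", "-1"]    (padded)
# []                   -> ["-1", "-1", "-1"]   (fully padded)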

compatible_dtypes property

compatible_dtypes

List of compatible data types for the layer. If the computation can be performed on any data type, return None.

Returns:

Type Description
Optional[List[DataType]]

List of compatible data types for the layer.

_transform

_transform(dataset)

Performs the cropping and/or padding on the input dataset. For example, cropping to length 3 with pad value '-1':

dataset = spark.Dataframe(
    [
        ['a', 'a', 'a', 'b', 'c'],
        ['x', 'z', 'y'],
        ['a', 'b'],
        ['a', 'x', 'a', 'b'],
        []
    ],
    'input_col'
)

Output: spark.Dataframe(
    [
        ['a', 'a', 'a', 'b', 'c'],
        ['x', 'z', 'y'],
        ['a', 'b'],
        ['a', 'x', 'a', 'b'],
        []
    ],
    [
        ['a', 'a', 'a'],
        ['x', 'z', 'y'],
        ['a', 'b', '-1'],
        ['a', 'x', 'a'],
        ['-1', '-1', '-1']
    ],
    'input_col', 'output_col'
)

Parameters:

Name Type Description Default
dataset DataFrame

The input dataframe.

required

Returns:

Type Description
DataFrame

Transformed pyspark dataframe.

Source code in src/kamae/spark/transformers/array_crop.py
def _transform(self, dataset: DataFrame) -> DataFrame:
    """
    Performs the cropping and/or padding on the input dataset.
    Example, crop to length 3, with value '-1':

     dataset = spark.Dataframe(
        [
            ['a', 'a', 'a', 'b', 'c'],
            ['x', 'z', 'y'],
            ['a', 'b',],
            ['a', 'x', 'a', 'b',],
            []
        ],
        'input_col'
     )
     Output: spark.Dataframe(
        [
            ['a', 'a', 'a', 'b', 'c'],
            ['x', 'z', 'y'],
            ['a', 'b',],
            ['a', 'x', 'a', 'b',],
            []
        ],
        [
            ['a', 'a', 'a'],
            ['x', 'z', 'y'],
            ['a', 'b', '-1'],
            ['a', 'x', 'a'],
            ['-1', '-1', '-1']
        ],
        'input_col', 'output_col'
    )
    :param dataset: The input dataframe.
    :returns: Transformed pyspark dataframe.
    """
    pad_value_spark_type = self._get_pad_value_type(self.getPadValue())
    input_col_type = self.get_column_datatype(
        dataset=dataset, column_name=self.getInputCol()
    )
    input_col_element_type = get_array_nesting_level_and_element_dtype(
        input_col_type
    )[1]

    if (
        input_col_element_type.simpleString()
        not in self._pad_type_to_valid_element_types[
            pad_value_spark_type.simpleString()
        ]
    ):
        raise ValueError(
            f"""
        The pad value type '{pad_value_spark_type.simpleString()}' does
        not match the element type of the input
        column '{input_col_element_type.simpleString()}'.
        """
        )

    output_col = single_input_single_output_array_transform(
        input_col=F.col(self.getInputCol()),
        input_col_datatype=input_col_type,
        func=lambda x: F.concat(
            F.slice(x, 1, self.getArrayLength()),
            F.array_repeat(
                F.lit(self.getPadValue()),
                self.getArrayLength() - F.size(x),
            ),
        ),
    )
    return dataset.withColumn(self.getOutputCol(), output_col)
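
The core crop-and-pad expression can also be reproduced standalone; this sketch assumes a DataFrame df with a string array column "input_col", and relies on the same Spark behaviour the transformer uses, where array_repeat with a non-positive count yields an empty array:

import pyspark.sql.functions as F

array_length, pad_value = 3, "-1"
cropped_and_padded = F.concat(
    # Keep at most the first array_length elements (slice is 1-indexed).
    F.slice(F.col("input_col"), 1, array_length),
    # Append pad_value for any shortfall; a negative count yields an empty array.
    F.array_repeat(F.lit(pad_value), array_length - F.size(F.col("input_col"))),
)
df = df.withColumn("output_col", cropped_and_padded)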

get_tf_layer

get_tf_layer()

Gets the tensorflow layer that performs the array cropping and padding.

Returns:

Type Description
Layer

Tensorflow keras layer with name equal to the layerName parameter that performs the array cropping and padding operation.

Source code in src/kamae/spark/transformers/array_crop.py
def get_tf_layer(self) -> tf.keras.layers.Layer:
    """
    Gets the tensorflow layer that performs the array cropping and padding.

    :returns: Tensorflow keras layer with name equal to the layerName parameter
    that performs the array cropping and padding operation.
    """
    return ArrayCropLayer(
        name=self.getLayerName(),
        input_dtype=self.getInputTFDtype(),
        output_dtype=self.getOutputTFDtype(),
        array_length=self.getArrayLength(),
        pad_value=self.getPadValue(),
    )
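
A hedged sketch of consuming the exported layer, assuming the Keras ArrayCropLayer accepts a dense string tensor and applies the same crop/pad semantics as the Spark transformer:

import tensorflow as tf

keras_layer = cropper.get_tf_layer()  # "cropper" from the usage sketch above
inputs = tf.constant([["a", "b", "c", "d", "e"]])  # shape (1, 5)
outputs = keras_layer(inputs)  # expected shape (1, 3) after cropping to arrayLength=3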