
array_crop

ArrayCropParams

Bases: PadValueParams

Mixin class containing the array length and pad value parameters needed for array crop transformers.

getArrayLength

getArrayLength()

Gets the array length parameter.

Returns:

Type Description
int

array length.

Source code in src/kamae/spark/transformers/array_crop.py
def getArrayLength(self) -> int:
    """
    Gets the array length parameter.
    :returns: array length.
    """
    return self.getOrDefault(self.arrayLength)

setArrayLength

setArrayLength(value)

Sets the parameter array length to the given value.

Parameters:

Name Type Description Default
value int

array length.

required

Returns:

Type Description
ArrayCropParams

Instance of the class this mixin is mixed into.

Source code in src/kamae/spark/transformers/array_crop.py
def setArrayLength(self, value: int) -> "ArrayCropParams":
    """
    Sets the parameter array length to the given value.
    :param value: array length.
    :returns: Instance of the class this mixin is mixed into.
    """
    if value < 1:
        raise ValueError("Array length must be greater than 0.")
    return self._set(arrayLength=value)
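
For illustration, a minimal sketch of the setter's validation behaviour, assuming the parameters are accessed through the ArrayCropTransformer documented below (the import path is inferred from the source file location and may differ):

from kamae.spark.transformers.array_crop import ArrayCropTransformer  # assumed import path

transformer = ArrayCropTransformer(inputCol="input_col", outputCol="output_col")
transformer.setArrayLength(3)
assert transformer.getArrayLength() == 3

# Values below 1 are rejected by setArrayLength.
try:
    transformer.setArrayLength(0)
except ValueError as err:
    print(err)  # Array length must be greater than 0.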

ArrayCropTransformer

ArrayCropTransformer(
    inputCol=None,
    outputCol=None,
    inputDtype=None,
    outputDtype=None,
    layerName=None,
    arrayLength=128,
    padValue=None,
)

Bases: BaseTransformer, SingleInputSingleOutputParams, ArrayCropParams

Transformer that reshapes arrays into a consistent shape by either cropping or padding.

If the array is longer than the specified length, it is cropped; if it is shorter, it is padded with the specified pad value.

Initialises the ArrayCropTransformer.

Parameters:

Name Type Description Default
inputCol Optional[str]

Input column name.

None
outputCol Optional[str]

Output column name.

None
inputDtype Optional[Union[str, int, float]]

Input data type to cast input column(s) to before transforming.

None
outputDtype Optional[Union[str, int, float]]

Output data type to cast the output column to after transforming.

None
layerName Optional[str]

Name of the layer. Used as the name of the tensorflow layer.

None
arrayLength Optional[int]

The length to crop or pad the arrays to. Defaults to 128.

128
padValue Optional[Union[str, int, float]]

The value to pad the arrays with. Defaults to None.

None

Returns:

Type Description
None

None

Source code in src/kamae/spark/transformers/array_crop.py
@keyword_only
def __init__(
    self,
    inputCol: Optional[str] = None,
    outputCol: Optional[str] = None,
    inputDtype: Optional[Union[str, int, float]] = None,
    outputDtype: Optional[Union[str, int, float]] = None,
    layerName: Optional[str] = None,
    arrayLength: Optional[int] = 128,
    padValue: Optional[Union[str, int, float]] = None,
) -> None:
    """
    Initialises the ArrayCropTransformer
    :param inputCol: Input column name.
    :param outputCol: Output column name.
    :param inputDtype: Input data type to cast input column(s) to before
    transforming.
    :param outputDtype: Output data type to cast the output column to after
    transforming.
    :param layerName: Name of the layer. Used as the name of the tensorflow layer.
    :param arrayLength: The length to crop or pad the arrays to. Defaults to 128.
    :param padValue: The value to pad the arrays with. Defaults to `None`.
    :returns: None
    """
    super().__init__()
    kwargs = self._input_kwargs
    self.setParams(**kwargs)
    self._pad_type_to_valid_element_types = {
        "int": ["int", "bigint", "smallint"],
        "float": ["float", "double", "decimal(10,0)"],
        "string": ["string"],
        "boolean": ["boolean"],
    }
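
A hedged end-to-end usage sketch, assuming the import path inferred from the source file location and a running Spark session; the expected output follows the crop/pad semantics described above:

from pyspark.sql import SparkSession

from kamae.spark.transformers.array_crop import ArrayCropTransformer  # assumed import path

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(["a", "b", "c", "d"],), (["x"],), ([],)],
    ["input_col"],
)

cropper = ArrayCropTransformer(
    inputCol="input_col",
    outputCol="output_col",
    arrayLength=3,
    padValue="-1",
)
cropper.transform(df).show(truncate=False)
# ["a", "b", "c", "d"] -> ["a", "b", "c"]      (cropped)
# ["x"]                -> ["x", "-1", "-1"]    (padded)
# []                   -> ["-1", "-1", "-1"]   (fully padded)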

compatible_dtypes property

compatible_dtypes

List of compatible data types for the layer. If the computation can be performed on any data type, return None.

Returns:

Type Description
Optional[List[DataType]]

List of compatible data types for the layer.

_transform

_transform(dataset)

Performs the cropping and/or padding on the input dataset. For example, cropping to length 3 with pad value '-1':

dataset = spark.Dataframe(
    [
        ['a', 'a', 'a', 'b', 'c'],
        ['x', 'z', 'y'],
        ['a', 'b'],
        ['a', 'x', 'a', 'b'],
        []
    ],
    'input_col'
)

Output: spark.Dataframe(
    [
        ['a', 'a', 'a', 'b', 'c'],
        ['x', 'z', 'y'],
        ['a', 'b'],
        ['a', 'x', 'a', 'b'],
        []
    ],
    [
        ['a', 'a', 'a'],
        ['x', 'z', 'y'],
        ['a', 'b', '-1'],
        ['a', 'x', 'a'],
        ['-1', '-1', '-1']
    ],
    'input_col', 'output_col'
)

Parameters:

Name Type Description Default
dataset DataFrame

The input dataframe.

required

Returns:

Type Description
DataFrame

Transformed pyspark dataframe.

Source code in src/kamae/spark/transformers/array_crop.py
def _transform(self, dataset: DataFrame) -> DataFrame:
    """
    Performs the cropping and/or padding on the input dataset.
    Example, crop to length 3, with value '-1':

     dataset = spark.Dataframe(
        [
            ['a', 'a', 'a', 'b', 'c'],
            ['x', 'z', 'y'],
            ['a', 'b',],
            ['a', 'x', 'a', 'b',],
            []
        ],
        'input_col'
     )
     Output: spark.Dataframe(
        [
            ['a', 'a', 'a', 'b', 'c'],
            ['x', 'z', 'y'],
            ['a', 'b',],
            ['a', 'x', 'a', 'b',],
            []
        ],
        [
            ['a', 'a', 'a'],
            ['x', 'z', 'y'],
            ['a', 'b', '-1'],
            ['a', 'x', 'a'],
            ['-1', '-1', '-1']
        ],
        'input_col', 'output_col'
    )
    :param dataset: The input dataframe.
    :returns: Transformed pyspark dataframe.
    """
    pad_value_spark_type = self._get_pad_value_type(self.getPadValue())
    input_col_type = self.get_column_datatype(
        dataset=dataset, column_name=self.getInputCol()
    )
    input_col_element_type = get_array_nesting_level_and_element_dtype(
        input_col_type
    )[1]

    if (
        input_col_element_type.simpleString()
        not in self._pad_type_to_valid_element_types[
            pad_value_spark_type.simpleString()
        ]
    ):
        raise ValueError(
            f"""
        The pad value type '{pad_value_spark_type.simpleString()}' does
        not match the element type of the input
        column '{input_col_element_type.simpleString()}'.
        """
        )

    output_col = single_input_single_output_array_transform(
        input_col=F.col(self.getInputCol()),
        input_col_datatype=input_col_type,
        func=lambda x: F.concat(
            F.slice(x, 1, self.getArrayLength()),
            F.array_repeat(
                F.lit(self.getPadValue()),
                self.getArrayLength() - F.size(x),
            ),
        ),
    )
    return dataset.withColumn(self.getOutputCol(), output_col)
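
The core crop-and-pad expression can also be reproduced standalone; this sketch assumes a DataFrame df with a string array column "input_col", and relies on the same Spark behaviour the transformer uses, where array_repeat with a non-positive count yields an empty array:

import pyspark.sql.functions as F

array_length, pad_value = 3, "-1"
cropped_and_padded = F.concat(
    # Keep at most the first array_length elements (slice is 1-indexed).
    F.slice(F.col("input_col"), 1, array_length),
    # Append pad_value for any shortfall; a negative count yields an empty array.
    F.array_repeat(F.lit(pad_value), array_length - F.size(F.col("input_col"))),
)
df = df.withColumn("output_col", cropped_and_padded)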

get_tf_layer

get_tf_layer()

Gets the tensorflow layer that performs the array cropping and padding.

Returns:

Type Description
Layer

Tensorflow keras layer with name equal to the layerName parameter that performs the array cropping and padding operation.

Source code in src/kamae/spark/transformers/array_crop.py
def get_tf_layer(self) -> tf.keras.layers.Layer:
    """
    Gets the tensorflow layer that performs the array cropping and padding.

    :returns: Tensorflow keras layer with name equal to the layerName parameter
    that performs the array cropping and padding operation.
    """
    return ArrayCropLayer(
        name=self.getLayerName(),
        input_dtype=self.getInputTFDtype(),
        output_dtype=self.getOutputTFDtype(),
        array_length=self.getArrayLength(),
        pad_value=self.getPadValue(),
    )
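
A hedged sketch of consuming the exported layer, assuming the Keras ArrayCropLayer accepts a dense string tensor and applies the same crop/pad semantics as the Spark transformer:

import tensorflow as tf

keras_layer = cropper.get_tf_layer()  # "cropper" from the usage sketch above
inputs = tf.constant([["a", "b", "c", "d", "e"]])  # shape (1, 5)
outputs = keras_layer(inputs)  # expected shape (1, 3) after cropping to arrayLength=3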