Source code for hamilton.plugins.h_pandera

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import typing

import pandera
from pandera import typing as pa_typing

try:
    import pandera.typing.polars as pa_typing_polars
    from pandera.api.polars.model import DataFrameModel as _PolarsDataFrameModel

    _POLARS_TYPING_AVAILABLE = True
except ImportError:
    _POLARS_TYPING_AVAILABLE = False

from hamilton import node
from hamilton.data_quality import base as dq_base
from hamilton.function_modifiers import InvalidDecoratorException
from hamilton.function_modifiers import base as fm_base
from hamilton.function_modifiers import check_output as base_check_output
from hamilton.function_modifiers.validation import BaseDataValidationDecorator
from hamilton.htypes import custom_subclass_check



[docs]
class check_output(BaseDataValidationDecorator):

[docs]
    def __init__(
        self,
        importance: str = dq_base.DataValidationLevel.WARN.value,
        target: fm_base.TargetType = None,
    ):
        """Specific output-checker for pandera schemas. This decorator utilizes the output type of the function, which has
        to be of type pandera.typing.pandas.DataFrame or pandera.typing.pandas.Series, with an annotation argument.

        :param schema: The schema to use for validation. If this is not provided, then the output type of the function is used.
        :param importance: Importance level (either "warn" or "fail") -- see documentation for check_output for more details.
        :param target: The target of the decorator -- see documentation for check_output for more details.

        Let's look at equivalent examples to demonstrate:

        .. code-block:: python
            :name: "@check_output using output type"

            import pandera as pa
            import pandas as pd
            from hamilton.plugins import h_pandera
            from pandera.typing.pandas import DataFrame


            class MySchema(pa.DataFrameModel):
                a: int
                b: float
                c: str = pa.Field(nullable=True)  # For example, allow None values
                d: float  # US dollars


            @h_pandera.check_output()
            def foo() -> DataFrame[MySchema]:
                return pd.DataFrame()  # will fail

        .. code-block:: python
            :name: "@check_output with passed in type"

            from hamilton import function_modifiers

            schema = pa.DataFrameSchema({
                "a": pa.Column(pa.Int),
                "b": pa.Column(pa.Float),
                "c": pa.Column(pa.String, nullable=True),
                "d": pa.Column(pa.Float),
            })


            @function_modifiers.check_output(schema=schema)
            def foo() -> pd.DataFrame:
                return pd.DataFrame()  # will fail

        These two are functionally equivalent. Note that we do not (yet) support modification of the output.
        """
        super(check_output, self).__init__(target)
        self.importance = importance
        self.target = target


    def get_validators(self, node_to_validate: node.Node) -> list[dq_base.DataValidator]:
        """Gets validators for the node. Delegates to the standard check_output(schema=...) decorator.

        :param node_to_validate: Node to validate
        :return: List of validators
        """
        output_type = node_to_validate.type
        schema = None
        origin = typing.get_origin(output_type)
        args = typing.get_args(output_type)
        if custom_subclass_check(origin, pa_typing.DataFrame) and len(args) == 1:
            schema = output_type.__args__[0]  # TODO -- determine if it can ever have multiple...
            if not issubclass(schema, pandera.DataFrameModel):
                schema = None

        if schema is None and _POLARS_TYPING_AVAILABLE:
            if (
                origin is not None
                and len(args) == 1
                and issubclass(origin, (pa_typing_polars.DataFrame, pa_typing_polars.LazyFrame))
            ):
                schema = output_type.__args__[0]
                if not issubclass(schema, _PolarsDataFrameModel):
                    schema = None

        if schema is None:
            raise InvalidDecoratorException(
                f"Output type {output_type} is not a valid pandera schema. "
                f"Note that we currently only support pandera dataframes annotated with "
                f"subclasses of pandera.DataFrameModel (pandas) or "
                f"pandera.api.polars.model.DataFrameModel (polars). "
                f"Please reach out/open an issue if you want more complete integration."
            )

        # We can just delegate to teh standard check_output, which has pandera associated with schema...
        # This is a clever way of reusing as much code as possible
        return base_check_output(
            importance=self.importance, schema=schema, target_=self.target
        ).get_validators(node_to_validate)