Skip to content

openscm_calibration.cost.scmdata#

Cost calculations for models that return `scmdata.run.BaseScmRun` objects

Classes:

Name Description
AlignmentError

Raised when our data's metadata does not align as expected

OptCostCalculatorSSE

Cost calculator based on sum of squared errors

AlignmentError #

Bases: ValueError

Raised when our data's metadata does not align as expected

Methods:

Name Description
__init__

Initialise the error

Source code in src/openscm_calibration/cost/scmdata.py
class AlignmentError(ValueError):
    """
    Raised when our data's metadata does not align as expected
    """

    def __init__(
        self,
        name_left: str,
        val_left: pd.DataFrame | pd.Series[Any],
        name_right: str,
        val_right: pd.DataFrame | pd.Series[Any],
        extra_context: str | None = None,
    ) -> None:
        """
        Initialise the error

        Parameters
        ----------
        name_left
            The name of the first thing being referenced
            (variable, attribute etc.)

        val_left
            The values referred to by ``name_left``

        name_right
            The name of the other thing being referenced
            (variable, attribute etc.)

        val_right
            The values referred to by ``name_right``

        extra_context
            Any extra context to include in the message.
            If truthy, it is placed at the start of the message,
            followed by ``". "``.
        """
        # Each name is followed by its values on the next line
        # so that multi-line reprs (e.g. of pandas objects) stay readable.
        error_msg = (
            "Please check.\n"
            f"{name_left}:\n{val_left}\n"
            f"{name_right}:\n{val_right}\n"
        )

        if extra_context:
            error_msg = f"{extra_context}. {error_msg}"

        super().__init__(error_msg)

__init__ #

__init__(
    name_left: str,
    val_left: DataFrame | Series[Any],
    name_right: str,
    val_right: DataFrame | Series[Any],
    extra_context: str | None = None,
) -> None

Initialise the error

Parameters:

Name Type Description Default
name_left str

The name of the first thing being referenced (variable, attribute etc.)

required
val_left DataFrame | Series[Any]

The values referred to by name_left

required
name_right str

The name of the other thing being referenced (variable, attribute etc.)

required
val_right DataFrame | Series[Any]

The values referred to by name_right

required
extra_context str | None

Any extra context to include in the message

None
Source code in src/openscm_calibration/cost/scmdata.py
def __init__(
    self,
    name_left: str,
    val_left: pd.DataFrame | pd.Series[Any],
    name_right: str,
    val_right: pd.DataFrame | pd.Series[Any],
    extra_context: str | None = None,
) -> None:
    """
    Initialise the error

    Parameters
    ----------
    name_left
        The name of the first thing being referenced
        (variable, attribute etc.)

    val_left
        The values referred to by ``name_left``

    name_right
        The name of the other thing being referenced
        (variable, attribute etc.)

    val_right
        The values referred to by ``name_right``

    extra_context
        Any extra context to include in the message.
        If truthy, it is placed at the start of the message,
        followed by ``". "``.
    """
    # Each name is followed by its values on the next line
    # so that multi-line reprs (e.g. of pandas objects) stay readable.
    error_msg = (
        "Please check.\n"
        f"{name_left}:\n{val_left}\n"
        f"{name_right}:\n{val_right}\n"
    )

    if extra_context:
        error_msg = f"{extra_context}. {error_msg}"

    super().__init__(error_msg)

OptCostCalculatorSSE #

Cost calculator based on sum of squared errors

This is a convenience class. We may want to refactor it in future to provide greater flexibility for other cost calculations.

Methods:

Name Description
calculate_cost

Calculate cost function based on model results

calculate_negative_log_likelihood

Calculate the negative log likelihood of a given set of results

from_series_normalisation

Initialise from a series that defines normalisation for each timeseries.

from_unit_normalisation

Initialise assuming unit normalisation for each timeseries.

Attributes:

Name Type Description
model_col str

Column which contains the name of the model.

normalisation BaseScmRun

Normalisation values

target BaseScmRun

Target timeseries

Source code in src/openscm_calibration/cost/scmdata.py
@define
class OptCostCalculatorSSE:
    """
    Cost calculator based on sum of squared errors

    The cost is the sum, over all values, of
    ``((model results - target) / normalisation) ** 2``.

    This is a convenience class.
    We may want to refactor it in future
    to provide greater flexibility for other cost calculations.
    """

    target: scmdata.run.BaseScmRun
    """Target timeseries"""

    model_col: str = field(validator=[_is_meta_in_target])
    """
    Column which contains the name of the model.

    This is used when subtracting the model results from the target
    """

    normalisation: scmdata.run.BaseScmRun = field(validator=[_works_with_self_target])
    """
    Normalisation values

    Should have same timeseries as target. See the class methods for helpers.
    """

    @classmethod
    def from_unit_normalisation(
        cls, target: scmdata.run.BaseScmRun, model_col: str
    ) -> OptCostCalculatorSSE:
        """
        Initialise with a normalisation of 1 for every value in ``target``.

        This is a convenience method. It is not recommended for any serious
        work, as unit normalisation is unlikely to be a good choice for most
        problems.

        Parameters
        ----------
        target
            Target timeseries

        model_col
            Column which contains the name of the model in ``target``

        Returns
        -------
        :
            :obj:`OptCostCalculatorSSE` such that the normalisation is 1 for
            all timepoints (with the units defined by whatever the units of
            each timeseries are in ``target``)
        """
        # Start from the target's own timeseries so the metadata and
        # time axis of the normalisation match the target.
        unit_norm = target.timeseries()
        unit_norm.loc[:, :] = 1

        return cls(
            target=target,
            normalisation=type(target)(unit_norm),
            model_col=model_col,
        )

    @classmethod
    def from_series_normalisation(
        cls,
        target: scmdata.run.BaseScmRun,
        model_col: str,
        normalisation_series: pd.Series[float],
    ) -> OptCostCalculatorSSE:
        """
        Initialise from a series that holds one normalisation value per timeseries.

        The series is aligned with the timeseries in ``target`` and then
        broadcast, so that each timeseries uses the same normalisation value
        at every timepoint.

        Parameters
        ----------
        target
            Target timeseries

        model_col
            Column which contains the name of the model in ``target``

        normalisation_series
            Series to broadcast to create the desired normalisation.
            Its index must include the levels ``"variable"`` and ``"unit"``.

        Returns
        -------
        :
            Initialised :obj:`OptCostCalculatorSSE`

        Raises
        ------
        MissingValueError
            ``normalisation_series``'s index lacks ``"variable"`` or ``"unit"``

        AlignmentError
            After aligning, the normalisation contains nan values or its
            number of rows differs from the number of rows in the target
        """
        index_names = set(normalisation_series.index.names)
        absent = {"variable", "unit"} - index_names
        if absent:
            raise MissingValueError(
                "normalisation_series.index.names",
                vals=sorted(index_names),
                missing_vals=sorted(absent),
            )

        target_ts_no_unit = target.timeseries().reset_index("unit", drop=True)

        # Mirror what pandas does internally for binary operations:
        # align first, broadcast afterwards.
        norm_series_aligned = normalisation_series.align(target_ts_no_unit)[0]

        def _alignment_error(context: str) -> AlignmentError:
            # Both failure modes report the same pair of objects
            return AlignmentError(
                name_left="target_ts_no_unit",
                val_left=target_ts_no_unit,
                name_right="norm_series_aligned",
                val_right=norm_series_aligned,
                extra_context=context,
            )

        if norm_series_aligned.isna().any():
            raise _alignment_error("Even after aligning, there are still nan values")

        if norm_series_aligned.size != target_ts_no_unit.shape[0]:
            raise _alignment_error(
                "After aligning, there are more rows in the normalisation "
                "than in the target"
            )

        # Repeat the per-timeseries value across every time column.
        # Broadcasting happens against the transposed shape, hence the ``.T``.
        broadcast_values = np.broadcast_to(  # type: ignore # mypy/np confused
            norm_series_aligned.values, target_ts_no_unit.T.shape
        ).T
        norm_frame = type(target_ts_no_unit)(
            broadcast_values,
            index=norm_series_aligned.index,
            columns=target_ts_no_unit.columns,
        )

        return cls(
            target=target,
            normalisation=type(target)(norm_frame),
            model_col=model_col,
        )

    def calculate_cost(self, model_results: scmdata.run.BaseScmRun) -> float:
        """
        Calculate the sum of squared, normalised errors for model results

        Parameters
        ----------
        model_results
            Model results of which to calculate the cost

        Returns
        -------
        :
            Cost
        """
        residuals = model_results.subtract(  # type: ignore # error in scmdata types
            self.target, op_cols={self.model_col: "res - target"}
        )
        normalised = residuals.divide(
            self.normalisation,
            op_cols={self.model_col: "(res - target) / normalisation"},
        )

        # Converting to unit "1" happens before squaring
        squared = normalised.convert_unit("1") ** 2

        return float(squared.values.sum().sum())

    def calculate_negative_log_likelihood(
        self,
        model_results: scmdata.run.BaseScmRun,
    ) -> float:
        """
        Calculate the negative log likelihood of a given set of results

        The value returned is the cost from :meth:`calculate_cost`,
        negated and divided by two.

        Parameters
        ----------
        model_results
            Model results for which to calculate the negative log likelihood

        Returns
        -------
        :
            Negative log likelihood (up to an additive constant)
        """
        # TODO: find the proof of this
        # NOTE(review): ``-SSE / 2`` is never positive, which looks like a
        # log likelihood rather than its negative - confirm the intended
        # sign convention against the callers before changing anything.
        return -self.calculate_cost(model_results) / 2

model_col class-attribute instance-attribute #

model_col: str = field(validator=[_is_meta_in_target])

Column which contains the name of the model.

This is used when subtracting the model results from the target

normalisation class-attribute instance-attribute #

normalisation: BaseScmRun = field(
    validator=[_works_with_self_target]
)

Normalisation values

Should have same timeseries as target. See the class methods for helpers.

target instance-attribute #

target: BaseScmRun

Target timeseries

calculate_cost #

calculate_cost(model_results: BaseScmRun) -> float

Calculate cost function based on model results

Parameters:

Name Type Description Default
model_results BaseScmRun

Model results of which to calculate the cost

required

Returns:

Type Description
Cost
Source code in src/openscm_calibration/cost/scmdata.py
def calculate_cost(self, model_results: scmdata.run.BaseScmRun) -> float:
    """
    Calculate the sum of squared, normalised errors for model results

    The cost is the sum, over all values, of
    ``((model_results - target) / normalisation) ** 2``.

    Parameters
    ----------
    model_results
        Model results of which to calculate the cost

    Returns
    -------
    :
        Cost
    """
    residuals = model_results.subtract(  # type: ignore # error in scmdata types
        self.target, op_cols={self.model_col: "res - target"}
    )
    normalised = residuals.divide(
        self.normalisation,
        op_cols={self.model_col: "(res - target) / normalisation"},
    )

    # Converting to unit "1" happens before squaring
    squared = normalised.convert_unit("1") ** 2

    return float(squared.values.sum().sum())

calculate_negative_log_likelihood #

calculate_negative_log_likelihood(
    model_results: BaseScmRun,
) -> float

Calculate the negative log likelihood of a given set of results

Parameters:

Name Type Description Default
model_results BaseScmRun

Model results for which to calculate the negative log likelihood

required

Returns:

Type Description
float

Negative log likelihood (up to an additive constant)

Source code in src/openscm_calibration/cost/scmdata.py
def calculate_negative_log_likelihood(
    self,
    model_results: scmdata.run.BaseScmRun,
) -> float:
    """
    Calculate the negative log likelihood of a given set of results

    The value returned is the cost from ``self.calculate_cost``,
    negated and divided by two.

    Parameters
    ----------
    model_results
        Model results for which to calculate the negative log likelihood

    Returns
    -------
    :
        Negative log likelihood (up to an additive constant)
    """
    # TODO: find the proof of this
    # NOTE(review): ``-SSE / 2`` is never positive, which looks like a
    # log likelihood rather than its negative - confirm the intended
    # sign convention against the callers before changing anything.
    return -self.calculate_cost(model_results) / 2

from_series_normalisation classmethod #

from_series_normalisation(
    target: BaseScmRun,
    model_col: str,
    normalisation_series: Series[float],
) -> OptCostCalculatorSSE

Initialise from a series that defines normalisation for each timeseries.

The series is broadcast to match the timeseries in target, using the same value for all timepoints in each timeseries.

Parameters:

Name Type Description Default
target BaseScmRun

Target timeseries

required
model_col str

Column which contains the name of the model in target

required
normalisation_series Series[float]

Series to broadcast to create the desired normalisation

required

Returns:

Type Description
OptCostCalculatorSSE

Initialised instance
Source code in src/openscm_calibration/cost/scmdata.py
@classmethod
def from_series_normalisation(
    cls,
    target: scmdata.run.BaseScmRun,
    model_col: str,
    normalisation_series: pd.Series[float],
) -> OptCostCalculatorSSE:
    """
    Initialise from a series that holds one normalisation value per timeseries.

    The series is aligned with the timeseries in ``target`` and then
    broadcast, so that each timeseries uses the same normalisation value
    at every timepoint.

    Parameters
    ----------
    target
        Target timeseries

    model_col
        Column which contains the name of the model in ``target``

    normalisation_series
        Series to broadcast to create the desired normalisation.
        Its index must include the levels ``"variable"`` and ``"unit"``.

    Returns
    -------
    :
        Initialised :obj:`OptCostCalculatorSSE`

    Raises
    ------
    MissingValueError
        ``normalisation_series``'s index lacks ``"variable"`` or ``"unit"``

    AlignmentError
        After aligning, the normalisation contains nan values or its
        number of rows differs from the number of rows in the target
    """
    index_names = set(normalisation_series.index.names)
    absent = {"variable", "unit"} - index_names
    if absent:
        raise MissingValueError(
            "normalisation_series.index.names",
            vals=sorted(index_names),
            missing_vals=sorted(absent),
        )

    target_ts_no_unit = target.timeseries().reset_index("unit", drop=True)

    # Mirror what pandas does internally for binary operations:
    # align first, broadcast afterwards.
    norm_series_aligned = normalisation_series.align(target_ts_no_unit)[0]

    def _alignment_error(context: str) -> AlignmentError:
        # Both failure modes report the same pair of objects
        return AlignmentError(
            name_left="target_ts_no_unit",
            val_left=target_ts_no_unit,
            name_right="norm_series_aligned",
            val_right=norm_series_aligned,
            extra_context=context,
        )

    if norm_series_aligned.isna().any():
        raise _alignment_error("Even after aligning, there are still nan values")

    if norm_series_aligned.size != target_ts_no_unit.shape[0]:
        raise _alignment_error(
            "After aligning, there are more rows in the normalisation "
            "than in the target"
        )

    # Repeat the per-timeseries value across every time column.
    # Broadcasting happens against the transposed shape, hence the ``.T``.
    broadcast_values = np.broadcast_to(  # type: ignore # mypy/np confused
        norm_series_aligned.values, target_ts_no_unit.T.shape
    ).T
    norm_frame = type(target_ts_no_unit)(
        broadcast_values,
        index=norm_series_aligned.index,
        columns=target_ts_no_unit.columns,
    )

    return cls(
        target=target,
        normalisation=type(target)(norm_frame),
        model_col=model_col,
    )

from_unit_normalisation classmethod #

from_unit_normalisation(
    target: BaseScmRun, model_col: str
) -> OptCostCalculatorSSE

Initialise assuming unit normalisation for each timeseries.

This is a convenience method, but is not recommended for any serious work as unit normalisation is unlikely to be a good choice for most problems.

Parameters:

Name Type Description Default
target BaseScmRun

Target timeseries

required
model_col str

Column which contains the name of the model in target

required

Returns:

Type Description
OptCostCalculatorSSE

Instance whose normalisation is 1 for all timepoints (with the units defined by whatever the units of each timeseries are in target)

Source code in src/openscm_calibration/cost/scmdata.py
@classmethod
def from_unit_normalisation(
    cls, target: scmdata.run.BaseScmRun, model_col: str
) -> OptCostCalculatorSSE:
    """
    Initialise with a normalisation of 1 for every value in ``target``.

    This is a convenience method. It is not recommended for any serious
    work, as unit normalisation is unlikely to be a good choice for most
    problems.

    Parameters
    ----------
    target
        Target timeseries

    model_col
        Column which contains the name of the model in ``target``

    Returns
    -------
    :
        :obj:`OptCostCalculatorSSE` such that the normalisation is 1 for
        all timepoints (with the units defined by whatever the units of
        each timeseries are in ``target``)
    """
    # Start from the target's own timeseries so the metadata and
    # time axis of the normalisation match the target.
    unit_norm = target.timeseries()
    unit_norm.loc[:, :] = 1

    return cls(
        target=target,
        normalisation=type(target)(unit_norm),
        model_col=model_col,
    )