Source code for tlo.lm

import ast
import builtins
import numbers
from enum import Enum, auto
from math import prod
from typing import Any, Callable, Dict, Optional, Tuple, Union

import numpy as np
import pandas as pd
from pandas.core.computation.parsing import clean_column_name

from tlo import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)



[docs]
class Predictor(object):

    def __init__(
        self,
        property_name: str = None,
        external: bool = False,
        conditions_are_mutually_exclusive: Optional[bool] = None,
        conditions_are_exhaustive: Optional[bool] = False,
    ):
        """A Predictor variable for the regression model.

        :param property_name: A property of the population dataframe e.g. age, sex, etc.
            or if ``external=True`` the name of the external property that will be
            passed as a keyword argument to the ``LinearModel.predict`` method.
        :param external: Whether the named property is external (``True``) and so will
            be passed as a keyword argument to the ``LinearModel.predict`` method) or is
            a property of the population dataframe (``False``).
        :param conditions_are_mutually_exclusive: Whether the set of conditions that
            are declared for this predictor are all mutually exclusive, that is, for any
            pair of conditions, one condition evaluating to ``True`` implies the other
            must evaluate to ``False``. If this is declared to be the case a more
            efficient method of evaluation will be used in ``LinearModel.predict``. Note
            however that the validity of this declaration will not be checked so if this
            is set to ``True`` for predictors with non-mutually exclusive conditions,
            the model output will be erroneous.
        :param conditions_are_exhaustive: Whether the set of conditions that are
            declared for this predictor are all exhaustive, that is at least one
            condition will always be ``True`` irrespective of the value of the property.
            If this is declared to be the case, a more efficient method of evaluation
            maye be used in ``LinearModel.predict`, though if a catch-all ``otherwise``
            condition is included this flag will provide no benefit. Note that the
            validity of this declaration will not be checked so if this is set to
            ``True`` for predictors with non-exhaustive conditions, the model output
            will be erroneous.
        """
        self.property_name = property_name

        # If this is a property that is not part of the population dataframe
        if external:
            assert property_name is not None, "Can't have an unnamed external predictor"
            self.property_name = f'__{self.property_name}__'

        self.conditions = list()
        self.callback = None
        self.has_otherwise = False
        self.conditions_are_mutually_exclusive = conditions_are_mutually_exclusive
        self.conditions_are_exhaustive = conditions_are_exhaustive


[docs]
    def when(self, condition: Union[str, float, bool], value: float) -> 'Predictor':
        assert self.callback is None, "Can't use `when` on Predictor with function"
        return self._coeff(condition=condition, coefficient=value)



[docs]
    def otherwise(self, value: float) -> 'Predictor':
        assert self.property_name is not None, "Can't use `otherwise` condition on unnamed Predictor"
        assert self.callback is None, "Can't use `otherwise` on Predictor with function"
        return self._coeff(coefficient=value)



[docs]
    def apply(self, callback: Callable[[Any], float]) -> 'Predictor':
        assert self.property_name is not None, "Can't use `apply` on unnamed Predictor"
        assert len(self.conditions) == 0, "Can't specify `apply` on Predictor with when/otherwise conditions"
        assert self.callback is None, "Can't specify more than one callback for a Predictor"
        self.callback = callback
        return self


    def _coeff(self, *, coefficient, condition=None) -> 'Predictor':
        """Adds the coefficient for the Predictor. The arguments can be two:
                `coeff(condition, value)` where the condition evaluates the property value to true/false
                `coeff(value)` where the value is given to all unconditioned values of the property
        The second style (unconditioned value) only makes sense after one or more conditioned values
        """
        # If there isn't a property name
        if self.property_name is None:
            # We use the supplied condition literally
            self.conditions.append((condition, coefficient))
            return self

        # Otherwise, the condition is applied on a specific property
        if isinstance(condition, str):
            # Handle either a complex condition (begins with an operator) or implicit equality
            if condition[0] in ['!', '=', '<', '>', '~', '(', '.']:
                parsed_condition = f'({self.property_name}{condition})'
            else:
                # numeric values don't need to be quoted
                if condition.isnumeric():
                    parsed_condition = f'({self.property_name} == {condition})'
                else:
                    parsed_condition = f'({self.property_name} == "{condition}")'
        elif isinstance(condition, bool):
            if condition:
                parsed_condition = f'({self.property_name} == True)'
            else:
                parsed_condition = f'({self.property_name} == False)'
        elif isinstance(condition, numbers.Number):
            parsed_condition = f'({self.property_name} == {condition})'
        elif condition is None:
            assert not self.has_otherwise, "You can only give one unconditioned value to predictor"
            self.has_otherwise = True
            parsed_condition = None
        else:
            raise RuntimeError(f"Unhandled condition: {condition}")

        self.conditions.append((parsed_condition, coefficient))
        return self

    def __str__(self):
        if self.property_name and self.property_name.startswith('__'):
            name = f'{self.property_name.strip("__")} (external)'
        else:
            name = self.property_name
        if self.callback:
            return f"{name} -> callback({self.callback})"
        out = []
        previous_condition = None
        for condition, value in self.conditions:
            if condition is None:
                out.append(f'{" " * len(previous_condition)} -> {value} (otherwise)')
            else:
                out.append(f"{condition} -> {value}")
                previous_condition = condition
        return "\n  ".join(out)




[docs]
class LinearModelType(Enum):
    """
    The type of model specifies how the results from the predictor are combined:
    'additive' -> adds the effect_sizes from the predictors
    'logisitic' -> multiples the effect_sizes from the predictors and applies the transform x/(1+x)
    [Thus, the intercept can be taken to be an Odds and effect_sizes Odds Ratios,
    and the prediction is a probability.]
    'multiplicative' -> multiplies the effect_sizes from the predictors
    """
    ADDITIVE = auto()
    LOGISTIC = auto()
    MULTIPLICATIVE = auto()
    # the 'custom' is used internally by the custom() method
    CUSTOM = auto()




[docs]
class LinearModel(object):

    def __init__(
        self,
        lm_type: LinearModelType,
        intercept: Union[float, int],
        *predictors: Predictor
    ):
        """A linear model has an intercept and zero or more ``Predictor`` variables.

        :param lm_type: Model type to use.
        :param intercept: Intercept term for the model.
        :param *predictors: Any ``Predictor`` instances to use in computing output.
        """
        assert lm_type in LinearModelType, (
            "Model should be one of the prescribed LinearModelTypes"
        )
        self._lm_type = lm_type

        assert isinstance(intercept, (float, int)), (
            "Intercept is not specified or wrong type."
        )
        assert np.isfinite(intercept), "Intercept must not be NaN or infinite"
        self._intercept = intercept

        # Store predictors as tuple and expose via read-only property to prevent
        # updates after model initialisation
        self._predictors = tuple(predictors)
        non_predictors = [p for p in self._predictors if not isinstance(p, Predictor)]
        assert len(non_predictors) == 0, (
            f"One or more predictors are of invalid type: {non_predictors}"
        )

        self._parse_predictors()

    @property
    def lm_type(self) -> LinearModelType:
        """The model type."""
        return self._lm_type

    @property
    def intercept(self) -> Union[float, int]:
        """The intercept value for the model."""
        return self._intercept

    @property
    def predictors(self) -> Tuple[Predictor]:
        """The predictors used in calculating the model output."""
        return self._predictors


[docs]
    @staticmethod
    def multiplicative(*predictors: Predictor):
        """Returns a multplicative LinearModel with intercept=1.0

        :param predictors: One or more Predictor objects defining the model
        """
        return LinearModel(LinearModelType.MULTIPLICATIVE, 1.0, *predictors)



[docs]
    @staticmethod
    def custom(predict_function, **kwargs):
        """Define a linear model using the supplied function

        The function acts as a drop-in replacement to the predict function and must
        implement the interface:

            (
                self: LinearModel,
                df: Union[pd.DataFrame, pd.Series],
                rng: Optional[np.random.RandomState] = None,
                **kwargs
            ) -> pd.Series

        It is the responsibility of the caller of predict to ensure they pass either
        a dataframe or an individual record as expected by the custom function.

        See test_custom() in test_lm.py for a couple of examples.
        """
        # create an instance of a custom linear model
        custom_model = LinearModel(LinearModelType.CUSTOM, 0)
        # replace this instance's predict method
        # see https://stackoverflow.com/a/28127947
        custom_model.predict = predict_function.__get__(custom_model, LinearModel)
        # save value to any keyword arguments inside of this linear model
        for k, v in kwargs.items():
            # check the name doesn't already exist
            assert not hasattr(custom_model, k), (
                f"Cannot store argument '{k}' as name already exists; change name.")
            setattr(custom_model, k, v)
        return custom_model


    def _parse_predictors(self):
        """Set model string, callback predictors and predictor names from predictors.

        Sets `self._model_string` to an expression string (to be evaluated by
        ``pandas.DataFrame.eval``) corresponding to the evaluation of the model output
        for the subset of the predictors which do not define a custom callback function
        and the model intercept, or an empty string if no non-callback predictors are
        present.

        Additionally sets `self._callback_predictors` to a tuple of the omitted
        predictors with custom callback functions and `self._predictor_names` to a set
        of strings corresponding to names specified in the predictors.
        """
        # For additive models a zero coefficient corresponds to no effect while for
        # multiplicative and logistic models the relevant value is one
        null_coeff_value = 0 if self.lm_type == LinearModelType.ADDITIVE else 1
        predictor_strings = []
        callback_predictors = []
        self._predictor_names = set()
        for predictor in self.predictors:
            if predictor.callback is None:
                if predictor.property_name is not None:
                    self._predictor_names.add(predictor.property_name)
                else:
                    # If no property_name specified, predictor conditions will
                    # contain one or more column names therefore parse condition
                    # strings and filter for all name nodes. This will also
                    # add non-column names such as builtin functions so need to
                    # check if names are actually columns before using
                    for condition, _ in predictor.conditions:
                        self._predictor_names.update(
                            node.id for node in ast.walk(ast.parse(condition))
                            if isinstance(node, ast.Name)
                        )
                has_catch_all_condition = False
                for i, (condition, value) in enumerate(predictor.conditions):
                    if i == 0:
                        if condition is None:
                            # 'otherwise' fallback condition - always True. If used as
                            # first condition any other conditions will be ignored as
                            # this condition matches all
                            predictor_str = f"{value}"
                            any_prev_conds = "True"
                            has_catch_all_condition = True
                            break
                        else:
                            predictor_str = f"({condition}) * {value}"
                            any_prev_conds = f"{condition}"
                    else:
                        if condition is None:
                            # 'otherwise' fallback condition - matches all not
                            # so far matched therefore can ignore any remaining
                            # conditions
                            predictor_str += f" + (~({any_prev_conds})) * {value}"
                            has_catch_all_condition = True
                            break
                        elif predictor.conditions_are_mutually_exclusive:
                            # conditions have been declared to be mutually exclusive
                            # therefore we can just multiply conditions by coefficient
                            # values as condition == ~any_prev_conds & condition
                            predictor_str += f" + ({condition}) * {value}"
                            any_prev_conds += f" | {condition}"

                        else:
                            # conditions are potentially non-mutually exclusive and
                            # are applied sequentially in order specified on subset
                            # not matching any previous conditions
                            predictor_str += (
                                f" + (~({any_prev_conds}) & {condition}) * {value}")
                            any_prev_conds += f" | {condition}"
                # If the predictor neither declares that the conditions are exhaustive
                # (i.e. all cases are covered an any_prev_conds is guaranteed to be
                # True) nor an 'otherwise' catch-all condition has been used (in which
                # case any_prev_conds is also guaranteed to be True) then add term
                # corresponding to no effect when no previous conditions matched
                if not (predictor.conditions_are_exhaustive or has_catch_all_condition):
                    predictor_str += f" + ~({any_prev_conds}) * {null_coeff_value}"
                predictor_strings.append(f"({predictor_str})")
            else:
                self._predictor_names.add(predictor.property_name)
                callback_predictors.append(predictor)

        self._callback_predictors = tuple(callback_predictors)

        if len(predictor_strings) > 0:

            if self.intercept != null_coeff_value:
                # Only need to include intercept if its non-zero in additive models
                # or non-unity in multiplicative/logistic models
                predictor_strings.append(f"{self.intercept}")

        if self.lm_type == LinearModelType.ADDITIVE:
            self._model_string = " + ".join(predictor_strings)
        else:
            self._model_string = " * ".join(predictor_strings)

    def _get_column_resolvers(
        self,
        df: pd.DataFrame,
        **external_variables
    ) -> Dict[str, pd.Series]:
        """Construct mapping from predictor column names to column values.

        For use in ``resolvers`` argument to ``pandas.eval`` call.

        Compared to ``pandas.DataFrame._get_cleaned_column_resolvers()`` here only the
        column names present in the model predictors are included when constructing the
        returned dictionary. For dataframes with a large number of columns this is more
        performant than iterating over all columns, of which typically only a small
        subset are used in each linear model. Any external variables specified in
        predictors are also included with dunder-wrapped keys (e.g '__ext_var__').
        """
        column_resolvers = {}
        for name in self._predictor_names:
            # predictor_names may contain built-in names that are not columns
            # therefore we need to check if name is column in dataframe
            col = df.get(name)
            if col is not None:
                cleaned_name = clean_column_name(name)
                if (
                    isinstance(col.dtype, pd.CategoricalDtype)
                    and np.issubdtype(col.dtype.categories.dtype, np.integer)
                ):
                    # `pandas.eval` raises an error when using boolean operations
                    # on series with a categorical dtype with integer categories
                    # therefore if any such columns are present we convert to
                    # double-precision floats - this should be safe providing only
                    # integer categories which have exact floating point representations
                    # are used (which is likely to be the case)
                    column_resolvers[cleaned_name] = col.astype(np.float64)
                else:
                    column_resolvers[cleaned_name] = col
        for name, value in external_variables.items():
            column_resolvers[f"__{name}__"] = pd.Series(value, index=df.index)
        return column_resolvers


[docs]
    def predict(
        self,
        df: pd.DataFrame,
        rng: Optional[np.random.RandomState] = None,
        squeeze_single_row_output=True,
        **kwargs
    ) -> pd.Series:
        """Evaluate linear model output for a given set of input data.

        :param df: The input ``DataFrame`` containing the input data to evaluate the
          model with.
        :param rng: If set to a NumPy ``RandomState`` instance, returned output will
          be boolean ``Series`` corresponding to Bernoulli random variables sampled
          according to probabilities specified by model output. Otherwise model
          output directly returned.
        :param squeeze_single_row_output: If ``rng`` argument is not ``None`` and this
          argument is set to ``True``, the output for a ``df`` input with a single-row
          will be a scalar boolean value rather than a boolean ``Series``.
        :param **kwargs: Values for any external variables included in model
          predictors.
        """
        # Check that all names specified in predictors are either a column name, an
        # external variable in kwargs (with __ prefix/suffix removed) or a built-in
        for name in self._predictor_names:
            assert (
                name in df
                or (
                    name.startswith("__")
                    and name.endswith("__")
                    and name.strip("__") in kwargs
                )
                or name in builtins.__dict__
            ), (
                f"Predictors include unknown name {name}"
            )

        column_resolvers = self._get_column_resolvers(df, **kwargs)

        if self._model_string != "":
            result = pd.eval(
                self._model_string,
                resolvers=(column_resolvers,),
                engine="python"
            )
        else:
            result = pd.Series(data=self.intercept, index=df.index)

        if len(self._callback_predictors) > 0:
            callback_results = [
                column_resolvers[p.property_name].apply(p.callback)
                for p in self._callback_predictors
            ]
            if self.lm_type == LinearModelType.ADDITIVE:
                result += sum(callback_results)
            else:
                result *= prod(callback_results)

        # Ensure result of floating point type even if all predictor coefficients
        # are integer but intercept is floating point
        if isinstance(self.intercept, float) and result.dtype == int:
            result = result.astype(float)
        # Result series sometimes picks up name from one of predictors - set to
        # None so comparisons with unnamed series in tests pass
        result.name = None

        if self.lm_type == LinearModelType.LOGISTIC:
            # Below is equivalent to result = result / (1 + result) but will give correct
            # output where any elements in result are inf (--> 1.0) or 0.0 (--> 0.0).
            result = (1 / (1 + 1 / result))

        # If the user supplied a random number generator then they want outcomes,
        # not probabilities
        if rng:
            outcome = rng.random_sample(len(result)) < result
            # pop the boolean out of the series if we have a single row,
            # otherwise return the series
            if len(outcome) == 1 and squeeze_single_row_output:
                return outcome.iloc[0]
            else:
                return outcome
        else:
            return result


    def __str__(self):
        out = "LinearModel(\n"\
              f"  {self.lm_type},\n"\
              f"  intercept = {self.intercept},\n"
        for predictor in self.predictors:
            out += f'  {predictor}\n'
        out += ")"
        return out