Source code for emdp.analytic

r"""
Tools to get analytic solutions from MDPs.

we can compute :math:`v_\pi(s)` recursively by solving the system of Bellman equations below [Bellman1957]_:

.. math::

    \begin{align}
        v_\pi(s) &= \sum_{a} \left[
            \pi(a|s) \left( r(s,a) + \gamma \sum_{s'} p(s'|s,a) v_\pi(s') \right)
        \right] \\
        &=\sum_a \pi(a|s)r(s,a) + \gamma \sum_{s'} \left[ \left(\sum_a \pi(a|s)p(s'|s,a)\right) v_\pi(s') \right] \\
        &=r_\pi(s) + \gamma \sum_{s'} p_\pi(s'|s) v_\pi(s')
    \end{align}

These equations can also be written in matrix form with
:math:`\mathbf{v}_\pi, \mathbf{r}_\pi \in \mathbb{R}^{|\mathcal{S}|}` and 
:math:`\mathbf{p}_\pi \in \mathbb{R}^{|S|\times|S|}`:

.. math::

    \begin{align}
        \mathbf{v}_\pi &= \mathbf{r}_\pi + \gamma \mathbf{p}_\pi \mathbf{v}_\pi \\
        &= (\mathbf{I} - \gamma \mathbf{p}_\pi)^{-1} \mathbf{r}_\pi \\
        &= \Phi \mathbf{r}_\pi    
    \end{align}
    
.. [Bellman1957] Bellman, Richard. 1957. “A Markovian Decision Process.” Journal of mathematics and mechanics: 679--684.
"""
import numpy as np


[docs]def calculate_P_pi(P, pi):
    """
    Calculates the transition matrix :math:`P` under policy :math:`pi`. 
    :math:`p_\pi:=Pr(s'|s,a\sim\pi))`, which is represented as a matrix of shape :math:`|\mathcal{S}|\\times|\mathcal{S}|`.

    .. math::

        p_\pi(s,s') = \sum_a \pi(a|s) p(s'|s, a)

    where :math:`s` and :math:`s'` are the states before and after taking action :math:`a`.

    Args:
        P(np.ndarray): transition matrix of size :math:`|\mathcal{S}|\\times|\mathcal{A}|\\times|\mathcal{S}|`
        pi(np.ndarray): matrix of size :math:`|\mathcal{S}|\\times|\mathcal{A}|` indicating the policy

    Returns:
        np.ndarray: a matrix of size :math:`|\mathcal{S}|\\times|\mathcal{S}|`
    """
    return np.einsum('sat,sa->st', P, pi)

[docs]def calculate_R_pi(R, pi):
    r"""
    Calculates the expected reward :math:`r_\pi` under policy :math:`\pi`, 
    which is represented as a matrix of shape :math:`|\mathcal{S}|`.
    
    .. math::

        r_\pi(s) = \sum_a \pi(a|s) r(s,a)

    Args:
        R(np.ndarray): reward matrix of size :math:`|\mathcal{S}|\times|\mathcal{A}|`
        pi(np.ndarray): matrix of size :math:`|\mathcal{S}|\times|\mathcal{A}|` indicating the policy
    
    Returns:
        np.ndarray: a matrix of size :math:`|\mathcal{S}|`
    """
    return np.einsum('sa,sa->s', R, pi)

[docs]def calculate_successor_representation(P_pi, gamma):
    r"""
    Calculates the successor representation :math:`\Phi`
    
    .. math::
    
        \Phi := (\mathbf{I} - \gamma \mathbf{p}_\pi)^{-1}

    see also: :func:`emdp.analytic.calculate_V_pi`

    :param P_pi:
    :param gamma:

    Returns:
        np.ndarray: successor representation
    """
    return np.linalg.inv(np.eye(P_pi.shape[0]) - gamma * P_pi)


[docs]def calculate_V_pi_from_successor_representation(Phi, R_pi):
    r"""
    Calculates the state-value vector :math:`\mathbf{v}_\pi` from the successor representation :math:`\Phi` and the expected reward :math:`\mathbf{r}_\pi`.

    see also: :func:`emdp.analytic.calculate_V_pi`

    Args:
        Phi(np.ndarray): successor representation of size :math:`|\mathcal{S}|\times|\mathcal{S}|`
        R_pi(np.ndarray): expected reward of size :math:`|\mathcal{S}|`
    
    Returns:
        np.ndarray: value function of size :math:`|\mathcal{S}|`
    """

    return np.einsum('st,t->s', Phi, R_pi)

[docs]def calculate_V_pi(P, R, pi, gamma):
    r"""
    Calculates the state-value :math:`v_\pi` from the successor representation using the analytic form:
    
    .. math::
        
        (\mathbf{I} - \gamma \mathbf{p}_\pi)^{-1} \mathbf{r}_\pi

    where :math:`p_\pi(s,t) = \sum_a \pi(a|s) p(t|s, a)`
    and :math:`r_\pi(s) = \sum_a \pi(a|s) r(s,a)`

    see also :func:`emdp.analytic.calculate_P_pi` and :func:`emdp.analytic.calculate_R_pi`.

    .. note::

        we can compute :math:`v_\pi(s)` recursively by solving the system of Bellman equations below [Bellman1957]_:

        .. math::

            \begin{align}
                v_\pi(s) &= \sum_{a} \left[
                    \pi(a|s) \left( r(s,a) + \gamma \sum_{s'} p(s'|s,a) v_\pi(s') \right)
                \right] \\
                &=\sum_a \pi(a|s)r(s,a) + \gamma \sum_{s'} \left[ \left(\sum_a \pi(a|s)p(s'|s,a)\right) v_\pi(s') \right] \\
                &=r_\pi(s) + \gamma \sum_{s'} p_\pi(s'|s) v_\pi(s')
            \end{align}

        These equations can also be written in matrix form with
        :math:`\mathbf{v}_\pi, \mathbf{r}_\pi \in \mathbb{R}^{|\mathcal{S}|}` and 
        :math:`\mathbf{p}_\pi \in \mathbb{R}^{|S|\times|S|}`:

        .. math::

            \begin{align}
                \mathbf{v}_\pi &= \mathbf{r}_\pi + \gamma \mathbf{p}_\pi \mathbf{v}_\pi \\
                &= (\mathbf{I} - \gamma \mathbf{p}_\pi)^{-1} \mathbf{r}_\pi \\
                &= \Phi \mathbf{r}_\pi    
            \end{align}

    Args:
        P(np.ndarray): Transition matrix
        R(np.ndarray): Reward matrix
        pi(np.ndarray): policy matrix
        gamma(float): discount factor

    Returns:
        np.ndarray: state-value vector under policy :math:`\pi`.
    """
    P_pi = calculate_P_pi(P, pi)
    R_pi = calculate_R_pi(R, pi)
    Phi = calculate_successor_representation(P_pi, gamma)
    return calculate_V_pi_from_successor_representation(Phi, R_pi)