Source code for emdp.analytic

r"""
Tools to get analytic solutions from MDPs.

we can compute :math:`v_\pi(s)` recursively by solving the system of Bellman equations below [Bellman1957]_:

.. math::

    \begin{align}
        v_\pi(s) &= \sum_{a} \left[
            \pi(a|s) \left( r(s,a) + \gamma \sum_{s'} p(s'|s,a) v_\pi(s') \right)
        \right] \\
        &=\sum_a \pi(a|s)r(s,a) + \gamma \sum_{s'} \left[ \left(\sum_a \pi(a|s)p(s'|s,a)\right) v_\pi(s') \right] \\
        &=r_\pi(s) + \gamma \sum_{s'} p_\pi(s'|s) v_\pi(s')
    \end{align}

These equations can also be written in matrix form with
:math:`\mathbf{v}_\pi, \mathbf{r}_\pi \in \mathbb{R}^{|\mathcal{S}|}` and 
:math:`\mathbf{p}_\pi \in \mathbb{R}^{|S|\times|S|}`:

.. math::

    \begin{align}
        \mathbf{v}_\pi &= \mathbf{r}_\pi + \gamma \mathbf{p}_\pi \mathbf{v}_\pi \\
        &= (\mathbf{I} - \gamma \mathbf{p}_\pi)^{-1} \mathbf{r}_\pi \\
        &= \Phi \mathbf{r}_\pi    
    \end{align}
    
.. [Bellman1957] Bellman, Richard. 1957. “A Markovian Decision Process.” Journal of mathematics and mechanics: 679--684.
"""
import numpy as np


[docs]def calculate_P_pi(P, pi): """ Calculates the transition matrix :math:`P` under policy :math:`pi`. :math:`p_\pi:=Pr(s'|s,a\sim\pi))`, which is represented as a matrix of shape :math:`|\mathcal{S}|\\times|\mathcal{S}|`. .. math:: p_\pi(s,s') = \sum_a \pi(a|s) p(s'|s, a) where :math:`s` and :math:`s'` are the states before and after taking action :math:`a`. Args: P(np.ndarray): transition matrix of size :math:`|\mathcal{S}|\\times|\mathcal{A}|\\times|\mathcal{S}|` pi(np.ndarray): matrix of size :math:`|\mathcal{S}|\\times|\mathcal{A}|` indicating the policy Returns: np.ndarray: a matrix of size :math:`|\mathcal{S}|\\times|\mathcal{S}|` """ return np.einsum('sat,sa->st', P, pi)
[docs]def calculate_R_pi(R, pi): r""" Calculates the expected reward :math:`r_\pi` under policy :math:`\pi`, which is represented as a matrix of shape :math:`|\mathcal{S}|`. .. math:: r_\pi(s) = \sum_a \pi(a|s) r(s,a) Args: R(np.ndarray): reward matrix of size :math:`|\mathcal{S}|\times|\mathcal{A}|` pi(np.ndarray): matrix of size :math:`|\mathcal{S}|\times|\mathcal{A}|` indicating the policy Returns: np.ndarray: a matrix of size :math:`|\mathcal{S}|` """ return np.einsum('sa,sa->s', R, pi)
[docs]def calculate_successor_representation(P_pi, gamma): r""" Calculates the successor representation :math:`\Phi` .. math:: \Phi := (\mathbf{I} - \gamma \mathbf{p}_\pi)^{-1} see also: :func:`emdp.analytic.calculate_V_pi` :param P_pi: :param gamma: Returns: np.ndarray: successor representation """ return np.linalg.inv(np.eye(P_pi.shape[0]) - gamma * P_pi)
[docs]def calculate_V_pi_from_successor_representation(Phi, R_pi): r""" Calculates the state-value vector :math:`\mathbf{v}_\pi` from the successor representation :math:`\Phi` and the expected reward :math:`\mathbf{r}_\pi`. see also: :func:`emdp.analytic.calculate_V_pi` Args: Phi(np.ndarray): successor representation of size :math:`|\mathcal{S}|\times|\mathcal{S}|` R_pi(np.ndarray): expected reward of size :math:`|\mathcal{S}|` Returns: np.ndarray: value function of size :math:`|\mathcal{S}|` """ return np.einsum('st,t->s', Phi, R_pi)
[docs]def calculate_V_pi(P, R, pi, gamma): r""" Calculates the state-value :math:`v_\pi` from the successor representation using the analytic form: .. math:: (\mathbf{I} - \gamma \mathbf{p}_\pi)^{-1} \mathbf{r}_\pi where :math:`p_\pi(s,t) = \sum_a \pi(a|s) p(t|s, a)` and :math:`r_\pi(s) = \sum_a \pi(a|s) r(s,a)` see also :func:`emdp.analytic.calculate_P_pi` and :func:`emdp.analytic.calculate_R_pi`. .. note:: we can compute :math:`v_\pi(s)` recursively by solving the system of Bellman equations below [Bellman1957]_: .. math:: \begin{align} v_\pi(s) &= \sum_{a} \left[ \pi(a|s) \left( r(s,a) + \gamma \sum_{s'} p(s'|s,a) v_\pi(s') \right) \right] \\ &=\sum_a \pi(a|s)r(s,a) + \gamma \sum_{s'} \left[ \left(\sum_a \pi(a|s)p(s'|s,a)\right) v_\pi(s') \right] \\ &=r_\pi(s) + \gamma \sum_{s'} p_\pi(s'|s) v_\pi(s') \end{align} These equations can also be written in matrix form with :math:`\mathbf{v}_\pi, \mathbf{r}_\pi \in \mathbb{R}^{|\mathcal{S}|}` and :math:`\mathbf{p}_\pi \in \mathbb{R}^{|S|\times|S|}`: .. math:: \begin{align} \mathbf{v}_\pi &= \mathbf{r}_\pi + \gamma \mathbf{p}_\pi \mathbf{v}_\pi \\ &= (\mathbf{I} - \gamma \mathbf{p}_\pi)^{-1} \mathbf{r}_\pi \\ &= \Phi \mathbf{r}_\pi \end{align} Args: P(np.ndarray): Transition matrix R(np.ndarray): Reward matrix pi(np.ndarray): policy matrix gamma(float): discount factor Returns: np.ndarray: state-value vector under policy :math:`\pi`. """ P_pi = calculate_P_pi(P, pi) R_pi = calculate_R_pi(R, pi) Phi = calculate_successor_representation(P_pi, gamma) return calculate_V_pi_from_successor_representation(Phi, R_pi)