Source code for emdp.examples.action_gap

import numpy as np
from emdp.common import MDP


def build_cake_world_mdp(epsilon, discount, cake_reward=1.0):
    r"""Build the two-state "cake world" MDP used to illustrate the action gap.

    Based on Fig. 1 of Bellemare et al. (2016), "Increasing the Action Gap:
    New Operators for Reinforcement Learning",
    https://arxiv.org/pdf/1512.04860.pdf

    Layout (indices used below: ``x1 = 0``, ``x2 = 1``, ``a1 = 0``, ``a2 = 1``):

    * ``x1, a1`` (eat cake): reward ``cake_reward``; moves to ``x1`` or ``x2``
      with probability 0.5 each.
    * ``x1, a2`` (abstain): reward 0; stays in ``x1``.
    * ``x2, *``: reward ``-2 (1 + epsilon) (1 - discount) / discount``;
      always stays in ``x2``. ``x2`` is passed to ``MDP`` as a terminal state.

    The per-step reward in ``x2`` is chosen so that its infinite discounted
    sum is ``-2 (1 + epsilon) / discount``. With that value for ``x2`` and
    ``V(x1) = 0`` (always abstaining)::

        Q(x1, a1) = cake_reward + discount * 0.5 * V(x2)
                  = cake_reward - (1 + epsilon)
        Q(x1, a2) = 0

    so for the default ``cake_reward = 1.0`` the action gap is
    ``Q(x1, a2) - Q(x1, a1) = epsilon``.
    NOTE(review): this assumes ``emdp.common.MDP`` values ``x2`` as the
    infinite discounted sum of its reward; confirm how ``terminal_states``
    is handled there.

    :param epsilon: Float epsilon for the action gap.
    :param discount: Float discount factor. It is used as a divisor when
        computing the reward in ``x2``, so it must be non-zero.
    :param cake_reward: Float reward for eating cake.
    :returns: An emdp.common.MDP object built from the transition tensor,
        reward matrix, discount, initial distribution concentrated on ``x1``
        and ``terminal_states=[x2]``.
    """
    STATES = 2
    ACTIONS = 2

    # Short hand to make following the paper easy.
    x1, x2 = 0, 1
    a1, a2 = 0, 1

    # Transition tensor indexed as P[state, action, next_state].
    P = np.zeros((STATES, ACTIONS, STATES))
    # Taking action a1 in state x1 takes you to x1 or x2 with equal likelihood.
    P[x1, a1, :] = .5
    # Taking the abstain action leads you back to x1.
    P[x1, a2, x1] = 1.
    # All actions from x2 should lead to x2 (terminal state).
    P[x2, :, x2] = 1.

    # Found by solving for `r` in `V(x2) = r + discount * V(x2)`
    # with the target value V(x2) = -2(1+e)/gamma:
    #     -2(1+e)/gamma = r + gamma * -2(1+e)/gamma.
    # Let r = rhat * 1/gamma.
    #  => -2 (1+e) = rhat + -2 * gamma * (1+e).
    #  => -2 [(1+e) - gamma * (1+e)] = rhat
    #  => -2 [(1+e)(1-gamma)] = rhat.
    # ==> r = -2 (1+e)(1-gamma)/gamma.
    forever_reward = -2.0 * (1 + epsilon) * (1 - discount) / discount

    # Reward matrix indexed as R[state, action].
    R = np.zeros((STATES, ACTIONS))
    R[x2, :] = forever_reward  # Small negative forever reward.
    R[x1, a1] = cake_reward  # Cake!
    R[x1, a2] = 0.  # Abstain cake!

    # Always start in x1.
    p0 = np.array([1.0, 0.0])

    return MDP(P, R, discount, p0=p0, terminal_states=[x2])