Source code for emdp.examples.off_policy

import numpy as np
from emdp.common import MDP

def build_two_circle_MDP(discount=0.6, good_reward=10., distractor_reward=5.):
    """Build the two-circle MDP counterexample from Fig. 1a of Zhang et al.

    See "Generalized Off-Policy Actor-Critic"
    https://arxiv.org/pdf/1903.11329.pdf

    The MDP has 11 states and 2 actions. From the start state ``A`` (0),
    action 0 moves to ``C`` (1) and action 1 moves to ``B`` (5). Every other
    state has a single successor regardless of the action taken:

    * first chain:      1 -> 2 -> 3 -> 4
    * second chain:     5 -> 6 -> 7 -> 4
    * connection chain: 4 -> 8 -> 9 -> 10 -> 0

    Taking any action in state 3 yields ``good_reward``; taking any action
    in state 5 (``B``) yields ``distractor_reward``. All other rewards are 0.

    :param discount: The discount factor.
    :param good_reward: The good reward that the agent must find.
    :param distractor_reward: The distraction reward.
    :returns: An emdp.common.MDP object.
    """
    ACTIONS = 2
    STATES = 11

    # Reference names for the MDP states, as in the paper.
    A = 0
    C = 1
    B = 5
    ACTUAL_REWARD_STATE = 3
    JOINER_STATE = 4

    # State 0 (A) is the starting state.
    # States 1 - 3 are states in the first chain.
    FIRST_CHAIN = [C, 2, ACTUAL_REWARD_STATE, JOINER_STATE]
    # States 5 - 7 are states in the second chain.
    SECOND_CHAIN = [B, 6, 7, JOINER_STATE]
    # State 4 joins the two chains.
    # States 8 - 10 are states that lead back to A.
    CONNECTION_CHAIN = [JOINER_STATE, 8, 9, 10, A]

    # DEFINING TRANSITION MATRIX.
    # P is indexed as [state, action, next_state].
    P = np.zeros((STATES, ACTIONS, STATES))

    # From the first state, the two actions lead to different chains.
    P[A, 0, C] = 1.
    P[A, 1, B] = 1.

    # Within the chains, any action leads to the next state in the chain.
    for chain in (FIRST_CHAIN, SECOND_CHAIN, CONNECTION_CHAIN):
        for state_t, state_tp1 in zip(chain[:-1], chain[1:]):
            P[state_t, :, state_tp1] = 1.

    # DEFINING DISCOUNT FACTOR.
    gamma = discount

    # DEFINING REWARDS.
    # R is indexed as [state, action].
    R = np.zeros((STATES, ACTIONS))
    # Both actions taken in the reward state give the good reward.
    R[ACTUAL_REWARD_STATE, :] = good_reward
    # Both actions taken in B give the distractor reward.
    R[B, :] = distractor_reward

    # DEFINING START STATES.
    # The agent always starts in A.
    p0 = np.zeros(STATES)
    p0[A] = 1.

    return MDP(P, R, gamma, p0=p0, terminal_states=[])