# Source code for emdp.chainworld.env

import numpy as np
from ..common import MDP
from ..actions import LEFT, RIGHT

# Size of the action axis of the transition and reward arrays built below;
# the chain world has exactly two actions, LEFT and RIGHT.
N_ACTIONS = 2

def build_chain_MDP(n_states=3, p_success=1, reward_spec=[(1, RIGHT, +5)],
                    starting_distribution=np.array([0, 0, 1]),
                    terminal_states=[0], gamma=0.9, seed=1337,
                    return_MDP=True):
    """
    Build a simple chain world with ``n_states`` states and 2 actions.

    States are laid out on a line and indexed ``0 .. n_states - 1``. The
    actions are LEFT and RIGHT. An action succeeds with probability
    ``p_success``; otherwise (probability ``1 - p_success``) the agent stays
    where it is. Moving off either end of the chain is a no-op, and every
    terminal state is absorbing under both actions.

    .. note::
        you probably want your terminal state to be separate from the
        state where the reward is obtained.

    Example of how to use::

        # a 7 state MDP where the agent starts in the middle
        # at the two ends are absorbing states (given by terminal states)
        # if the agent reaches the state before the terminal state it gets
        # a reward:
        # if the agent is at the left of the world and it takes an action
        # LEFT it gets a -1, otherwise it gets nothing
        # if the agent is at the right of the world and it takes an action
        # RIGHT it gets a +1, otherwise it gets nothing
        build_chain_MDP(n_states=7,
                        p_success=0.9,
                        reward_spec=[(5, RIGHT, +1), (1, LEFT, -1)],
                        starting_distribution=np.array([0, 0, 0, 1, 0, 0, 0]),
                        terminal_states=[0, 6],
                        gamma=0.9)

    :param n_states: the number of states in the chain world.
    :param p_success: the probability of successfully executing an action;
        must lie in ``[0, 1]``.
    :param reward_spec: a list of tuples
        ``(location_of_reward, action, magnitude_of_reward)``; the reward is
        given only for taking ``action`` in state ``location_of_reward``.
    :param starting_distribution: a distribution over starting states.
    :param terminal_states: a list of integers representing the terminal
        states.
    :param gamma: the discount factor; passed through unchanged.
    :param seed: the seed forwarded to the ``MDP`` constructor. Unused when
        ``return_MDP`` is false.
    :param return_MDP: returns an MDP object, else will return the
        components to create one.
    :return: an ``MDP`` instance if ``return_MDP`` is true, otherwise the
        tuple ``(P, R, gamma, starting_distribution, terminal_states)`` where
        ``P`` has shape ``(n_states, N_ACTIONS, n_states)`` and ``R`` has
        shape ``(n_states, N_ACTIONS)``.
    """
    assert 0 <= p_success <= 1, "p_success must be a probability in [0, 1]"
    p_fail = 1 - p_success

    # Building the transition matrix: P[s, a, s'] = Pr(s' | s, a).
    P = np.zeros((n_states, N_ACTIONS, n_states))
    last_state = n_states - 1

    for s in range(n_states):
        if s in terminal_states:
            # Whatever action we take from a terminal state ends up in that
            # same state again.
            P[s, :, s] = 1
            continue

        if s == 0:
            # Left edge of the chain: LEFT is a no-op. (Do not touch index
            # s - 1 here; it would wrap around to the last state.)
            P[s, LEFT, s] = 1
        else:
            P[s, LEFT, s - 1] = p_success  # successfully move left
            P[s, LEFT, s] = p_fail         # action failed, stay put

        if s == last_state:
            # Right edge of the chain: RIGHT is a no-op.
            P[s, RIGHT, s] = 1
        else:
            P[s, RIGHT, s + 1] = p_success  # successfully move right
            P[s, RIGHT, s] = p_fail         # action failed, stay put

    # Building the reward matrix: R[s, a] is the reward for taking a in s.
    R = np.zeros((n_states, N_ACTIONS))
    for reward_loc, action, reward_mag in reward_spec:
        # Only the specified action at this location yields the reward.
        R[reward_loc, action] = reward_mag

    if return_MDP:
        # Forward the caller's seed instead of a hard-coded constant.
        return MDP(P, R, gamma, starting_distribution, terminal_states,
                   seed=seed)
    return P, R, gamma, starting_distribution, terminal_states