Source code for emdp.examples.counter

import numpy as np
from emdp.common import MDP

def build_imani_counterexample():
    """Build the counterexample MDP from Fig. 1a of Imani et al.

    Reference: Imani, et al. "An Off-policy Policy Gradient Theorem Using
    Emphatic Weightings." NeurIPS 2018.

    The MDP has 4 states and 2 actions, with deterministic transitions:

    * state 0: action 0 leads to state 1, action 1 leads to state 2;
    * states 1 and 2: both actions lead to state 3;
    * state 3: both actions loop back to state 3 (it is also passed to
      ``MDP`` as the only terminal state).

    Rewards, indexed as ``R[state, action]``::

        R[0] = [0, 1]
        R[1] = [2, 0]
        R[2] = [0, 1]
        R[3] = [0, 0]

    :return: an ``MDP`` built from the transition tensor ``P`` of shape
        (4, 2, 4), the reward matrix ``R`` of shape (4, 2), discount
        ``gamma = 0.99999``, an initial distribution concentrated on
        state 0, and ``terminal_states=[3]``.
    """
    n_states = 4
    n_actions = 2

    # P[s, a, s'] is the probability of landing in s' after taking a in s.
    P = np.zeros((n_states, n_actions, n_states))
    P[0, 0, 1] = 1  # state 0, action 0 -> state 1
    P[0, 1, 2] = 1  # state 0, action 1 -> state 2
    # Every action taken in states 1, 2 and 3 ends up in state 3.
    P[1:, :, 3] = 1

    # R[s, a]; entries that are not set below stay at zero.
    R = np.zeros((n_states, n_actions))
    R[0, 1] = 1
    R[1, 0] = 2
    R[2, 1] = 1

    gamma = 0.99999  # discount factor just below 1

    return MDP(P, R, gamma, p0=np.array([1, 0, 0, 0]), terminal_states=[3])