# this code is from https://becomesentient.com/mdp-dynamic-programming/

import numpy as np
import gym.spaces
from reinforcement_learning.lib.envs.gridworld import GridworldEnv

env = GridworldEnv()


def policy_eval(policy, env, discount_factor=1.0, epsilon=0.00001):
    """
    Evaluate a policy given an environment and a full description of the
    environment's dynamics.

    Args:
        policy: [S, A] shaped matrix representing the policy.
        env: OpenAI env. env.P represents the transition probabilities of the
            environment. env.P[s][a] is a list of transition tuples
            (prob, next_state, reward, done). env.nS is the number of states in
            the environment. env.nA is the number of actions in the environment.
        discount_factor: Gamma discount factor.
        epsilon: We stop evaluation once our value function change is less than
            epsilon for all states.

    Returns:
        Vector of length env.nS representing the value function.
    """
    # Start with an all-zero value function
    V_old = np.zeros(env.nS)

    while True:
        # new value function
        V_new = np.zeros(env.nS)
        # stopping condition
        delta = 0

        # loop over the state space
        for s in range(env.nS):
            # accumulate the Bellman expectation equation
            v_fn = 0
            # get the probability distribution over actions
            action_probs = policy[s]

            # loop over possible actions
            for a in range(env.nA):
                # get transitions [(prob, next_state, reward, done)]
                # (GridworldEnv is deterministic, so each action has exactly one transition)
                [(prob, next_state, reward, done)] = env.P[s][a]
                # apply the Bellman expectation equation
                v_fn += action_probs[a] * prob * (reward + discount_factor * V_old[next_state])

            # track the biggest change over the state space
            delta = max(delta, abs(v_fn - V_old[s]))
            # update the state value
            V_new[s] = v_fn

        # the new value function
        V_old = V_new

        # converged to the true value function
        if delta < epsilon:
            break

    return np.array(V_old)


random_policy = np.ones([env.nS, env.nA]) / env.nA
v = policy_eval(random_policy, env)

expected_v = np.array([0, -14, -20, -22, -14, -18, -20, -20, -20, -20, -18, -14, -22, -20, -14, 0])
np.testing.assert_array_almost_equal(v, expected_v, decimal=2)

print(v)
print(expected_v)
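
# A quick sanity check (a minimal sketch, not part of the original post): the converged
# value function should be a fixed point of the Bellman expectation equation,
#   V(s) = sum_a pi(a|s) * sum_{s'} p(s'|s,a) * (reward + gamma * V(s')).
# This sketch recomputes the right-hand side once from env.P (using the default
# discount factor of 1.0, as above) and compares it to v; bellman_rhs is a new name
# introduced here for illustration.
bellman_rhs = np.zeros(env.nS)
for s in range(env.nS):
    for a in range(env.nA):
        for prob, next_state, reward, done in env.P[s][a]:
            bellman_rhs[s] += random_policy[s][a] * prob * (reward + 1.0 * v[next_state])
np.testing.assert_array_almost_equal(v, bellman_rhs, decimal=2)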
import numpy as np
import gym.spaces
from reinforcement_learning.lib.envs.gridworld import GridworldEnv

env = GridworldEnv()


def policy_improvement(env, policy_eval_fn=policy_eval, discount_factor=1.0):
    """
    Policy Improvement Algorithm. Iteratively evaluates and improves a policy
    until an optimal policy is found.

    Args:
        env: The OpenAI environment.
        policy_eval_fn: Policy Evaluation function that takes 3 arguments:
            policy, env, discount_factor.
        discount_factor: Gamma discount factor.

    Returns:
        A tuple (policy, V).
        policy is the optimal policy, a matrix of shape [S, A] where each state s
        contains a valid probability distribution over actions.
        V is the value function for the optimal policy.
    """

    def one_step_lookahead(s, value_fn):
        actions = np.zeros(env.nA)
        for a in range(env.nA):
            # (GridworldEnv is deterministic, so each action has exactly one transition)
            [(prob, next_state, reward, done)] = env.P[s][a]
            actions[a] = prob * (reward + discount_factor * value_fn[next_state])
        return actions

    # Start with a random policy
    policy = np.ones([env.nS, env.nA]) / env.nA
    actions_values = np.zeros(env.nA)

    while True:
        # evaluate the current policy
        value_fn = policy_eval_fn(policy, env)
        policy_stable = True

        # loop over the state space
        for s in range(env.nS):
            # perform a one-step lookahead
            actions_values = one_step_lookahead(s, value_fn)
            # maximize over possible actions
            best_action = np.argmax(actions_values)
            # best action under the current policy
            chosen_action = np.argmax(policy[s])

            # if the Bellman optimality equation is not satisfied
            if best_action != chosen_action:
                policy_stable = False
            # the new policy after acting greedily w.r.t. the value function
            policy[s] = np.eye(env.nA)[best_action]

        # if the Bellman optimality equation is satisfied
        if policy_stable:
            return policy, value_fn


policy, v = policy_improvement(env)

print("Policy Probability Distribution:")
print(policy)
print("")

print("Reshaped Grid Policy (0=up, 1=right, 2=down, 3=left):")
print(np.reshape(np.argmax(policy, axis=1), env.shape))
print("")

print("Value Function:")
print(v)
print("")

print("Reshaped Grid Value Function:")
print(v.reshape(env.shape))
print("")
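
# A quick optimality check (a minimal sketch, not part of the original post): when the
# policy is stable, its value function should satisfy the Bellman optimality equation,
#   V*(s) = max_a sum_{s'} p(s'|s,a) * (reward + gamma * V*(s')).
# This sketch recomputes the one-step action values from env.P (with the default
# discount factor of 1.0) and checks that the maximum over actions reproduces v in
# every state; q is a new name introduced here for illustration.
q = np.zeros([env.nS, env.nA])
for s in range(env.nS):
    for a in range(env.nA):
        for prob, next_state, reward, done in env.P[s][a]:
            q[s][a] += prob * (reward + 1.0 * v[next_state])
np.testing.assert_array_almost_equal(v, q.max(axis=1), decimal=2)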