# Look at the backup tree above and the vector-form Bellman equation to understand this code.
# Keep them side by side while you are reading.
# The update implemented below is v(s) <- R(s) + gamma * max_a sum_{s'} P(s'|s, a) * v(s').
# P is the transition "matrix": it has 11 entries, one per state. Each entry P[s] maps each of
# the 4 allowed actions to a list of (probability, s') tuples - one for each of the (at most 3)
# next states the agent may end up in despite its intention to go to the desired state.
# The states are numbered sequentially from top left to bottom right. A small sanity check of
# this structure is included right after the definition of P below.
import numpy as np
from rich import print
P = {
    0: {
        0: [(0.9, 0), (0.1, 1), (0, 4)],
        1: [(0.8, 1), (0.1, 4), (0.1, 0)],
        2: [(0.8, 4), (0.1, 1), (0.1, 0)],
        3: [(0.9, 0), (0.1, 4)],
    },
    1: {
        0: [(0.8, 1), (0.1, 2), (0.1, 0)],
        1: [(0.8, 2), (0.2, 1)],
        2: [(0.8, 1), (0.1, 0), (0.1, 2)],
        3: [(0.8, 0), (0.2, 1)],
    },
    2: {
        0: [(0.8, 2), (0.1, 3), (0.1, 1)],
        1: [(0.8, 3), (0.1, 5), (0.1, 2)],
        2: [(0.8, 5), (0.1, 1), (0.1, 3)],
        3: [(0.8, 1), (0.1, 2), (0.1, 5)],
    },
    3: {
        0: [(0.9, 3), (0.1, 2)],
        1: [(0.9, 3), (0.1, 6)],
        2: [(0.8, 6), (0.1, 2), (0.1, 3)],
        3: [(0.8, 2), (0.1, 3), (0.1, 6)],
    },
    4: {
        0: [(0.8, 0), (0.2, 4)],
        1: [(0.8, 4), (0.1, 7), (0.1, 0)],
        2: [(0.8, 7), (0.2, 4)],
        3: [(0.8, 4), (0.1, 0), (0.1, 7)],
    },
    5: {
        0: [(0.8, 2), (0.1, 6), (0.1, 5)],
        1: [(0.8, 6), (0.1, 9), (0.1, 2)],
        2: [(0.8, 9), (0.1, 5), (0.1, 6)],
        3: [(0.8, 5), (0.1, 2), (0.1, 9)],
    },
    6: {
        0: [(0.8, 3), (0.1, 6), (0.1, 5)],
        1: [(0.8, 6), (0.1, 10), (0.1, 3)],
        2: [(0.8, 10), (0.1, 5), (0.1, 6)],
        3: [(0.8, 5), (0.1, 3), (0.1, 10)],
    },
    7: {
        0: [(0.8, 4), (0.1, 8), (0.1, 7)],
        1: [(0.8, 8), (0.1, 7), (0.1, 4)],
        2: [(0.9, 7), (0.1, 8)],
        3: [(0.9, 7), (0.1, 4)],
    },
    8: {
        0: [(0.8, 8), (0.1, 9), (0.1, 7)],
        1: [(0.8, 9), (0.2, 8)],
        2: [(0.8, 8), (0.1, 7), (0.1, 9)],
        3: [(0.8, 7), (0.2, 8)],
    },
    9: {
        0: [(0.8, 5), (0.1, 10), (0.1, 8)],
        1: [(0.8, 9), (0.1, 9), (0.1, 5)],
        2: [(0.8, 9), (0.1, 8), (0.1, 10)],
        3: [(0.8, 8), (0.1, 5), (0.1, 9)],
    },
    10: {
        0: [(0.8, 6), (0.1, 10), (0.1, 9)],
        1: [(0.9, 10), (0.1, 6)],
        2: [(0.9, 10), (0.1, 9)],
        3: [(0.8, 9), (0.1, 6), (0.1, 10)],
    },
}
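# A small sanity check of the structure just described (added here for illustration, not part
# of the original code): for every state s and action a, the probabilities in P[s][a] must sum to 1.
for s in P:
    for a in P[s]:
        total = sum(prob for prob, _ in P[s][a])
        assert abs(total - 1.0) < 1e-9, f"probabilities for state {s}, action {a} sum to {total}"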
R = [0, 0, 0, 1, 0, 0, -1, 0, 0, 0, 0]  # state rewards: +1 at state 3, -1 at state 6
gamma = 0.9
States = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Actions = [0, 1, 2, 3]  # [north, east, south, west]
v = [0] * 11
# once v is computed, we can calculate the optimal policy
optPolicy = [0] * 11
# value iteration
for i in range(100):
    for s in States:
        # trans[0] = P_ss' (transition probability), trans[1] = s' (next state)
        q_0 = sum(trans[0] * v[trans[1]] for trans in P[s][0])
        q_1 = sum(trans[0] * v[trans[1]] for trans in P[s][1])
        q_2 = sum(trans[0] * v[trans[1]] for trans in P[s][2])
        q_3 = sum(trans[0] * v[trans[1]] for trans in P[s][3])
        v[s] = R[s] + gamma * max(q_0, q_1, q_2, q_3)

# greedy policy: for each state, pick the action with the highest expected next-state value
for s in States:
    optPolicy[s] = int(
        np.argmax([sum(trans[0] * v[trans[1]] for trans in P[s][a]) for a in Actions])
    )
print(v)
print(optPolicy)
[ 5.882322961884418, 6.788994341526544, 7.731922481317687, 8.925605847317208, 5.164964536161228, 6.773577272302599, 6.633030088993117, 4.595878221066634, 5.149735082549222, 5.864985900325012, 5.828164501942005 ]
[1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0]
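# To read the policy above at a glance, here is an illustrative rendering of optPolicy as arrows
# on the assumed 3x4 layout (states 0-10 numbered top-left to bottom-right, with a wall in the
# middle of the second row). The arrow glyphs and the layout list are my own convention,
# not part of the original code.
arrow = {0: "^", 1: ">", 2: "v", 3: "<"}  # north, east, south, west
layout = [[0, 1, 2, 3], [4, None, 5, 6], [7, 8, 9, 10]]  # None marks the wall cell
for row in layout:
    print(" ".join("#" if s is None else arrow[optPolicy[s]] for s in row))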
import numpy as np
import pandas as pd
# Gridworld parameters
rows, cols = 3, 4
step_reward = -0.04
gamma = 0.9
# Terminal states and their rewards, (row, col) using 0-indexed rows from the top
terminal_states = {
    (0, 3): 1.0,   # top-right
    (1, 3): -1.0,  # just below top-right
}
walls = {(1, 1)}  # wall at position (row=1, col=1)
# Initialize value function
V = np.zeros((rows, cols))
for (i, j), r in terminal_states.items():
    V[i, j] = r
# Define actions
actions = {"up": (-1, 0), "down": (1, 0), "left": (0, -1), "right": (0, 1)}
action_list = list(actions.values())

def next_state_reward(state, action):
    i, j = state
    di, dj = action
    ni, nj = i + di, j + dj
    # If next state is off-grid or a wall, stay in place
    if (ni < 0 or ni >= rows or nj < 0 or nj >= cols) or ((ni, nj) in walls):
        return state, step_reward
    # If next state is terminal, get its terminal reward
    if (ni, nj) in terminal_states:
        return (ni, nj), terminal_states[(ni, nj)]
    # Otherwise, standard step reward
    return (ni, nj), step_reward
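# Two illustrative checks of next_state_reward (added for clarity; the particular cells are
# arbitrary): moving down from (0, 1) bumps into the wall at (1, 1) and stays put with the step
# reward, while moving right from (0, 2) enters the +1 terminal at (0, 3).
print(next_state_reward((0, 1), actions["down"]))   # -> ((0, 1), -0.04)
print(next_state_reward((0, 2), actions["right"]))  # -> ((0, 3), 1.0)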
def transitions(state, action):
    # Stochastic model: intended 0.8, perpendicular slips 0.1 each
    if action == actions["up"]:
        slips = [actions["left"], actions["right"]]
    elif action == actions["down"]:
        slips = [actions["right"], actions["left"]]
    elif action == actions["left"]:
        slips = [actions["down"], actions["up"]]
    else:  # right
        slips = [actions["up"], actions["down"]]
    # Gather transitions
    results = []
    # intended
    s1, r1 = next_state_reward(state, action)
    results.append((0.8, s1, r1))
    # slips / sideways
    for slip in slips:
        s2, r2 = next_state_reward(state, slip)
        results.append((0.1, s2, r2))
    return results
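# For illustration (not in the original code): attempting "up" from (2, 0) yields the intended
# move to (1, 0) with probability 0.8 plus two 0.1 perpendicular slips, and the probabilities
# of every transition list sum to 1.
example = transitions((2, 0), actions["up"])
print(example)
assert abs(sum(p for p, _, _ in example) - 1.0) < 1e-9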
# Value Iteration
theta = 1e-4
while True:
    delta = 0
    newV = V.copy()
    for i in range(rows):
        for j in range(cols):
            if (i, j) in terminal_states or (i, j) in walls:
                continue
            q_values = []
            for action in action_list:
                q = 0
                for prob, s_prime, reward in transitions((i, j), action):
                    q += prob * (reward + gamma * V[s_prime])
                q_values.append(q)
            newV[i, j] = max(q_values)
            delta = max(delta, abs(newV[i, j] - V[i, j]))
    V = newV
    if delta < theta:
        break
# Display the resulting value function
df = pd.DataFrame(
    V, index=[f"row{r}" for r in range(rows)], columns=[f"col{c}" for c in range(cols)]
)
print(df)
          col0      col1      col2      col3
row0  1.255424  1.510587  1.775947  1.000000
row1  1.053525  0.000000  1.156793 -1.000000
row2  0.863027  0.742828  0.901585  0.464972
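# As with the first gridworld, we can read off a greedy policy from the converged V.
# This block is an illustrative addition (not part of the original listing): it reuses
# transitions() and simply labels wall and terminal cells instead of giving them an action.
policy = np.full((rows, cols), "", dtype=object)
for i in range(rows):
    for j in range(cols):
        if (i, j) in walls:
            policy[i, j] = "WALL"
        elif (i, j) in terminal_states:
            policy[i, j] = "TERM"
        else:
            policy[i, j] = max(
                actions,
                key=lambda name: sum(
                    p * (r + gamma * V[s])
                    for p, s, r in transitions((i, j), actions[name])
                ),
            )
print(pd.DataFrame(policy, index=df.index, columns=df.columns))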