Recycling Robot: Value Iteration to Estimate \(q^*\)

The Exercise 3.15 solution provides the Bellman optimality equations for \(q^*\) in closed form; here we solve them numerically with value iteration.
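
For reference, value iteration repeatedly applies the Bellman optimality backup for action values,

\[
q^*(s, a) = \sum_{s', r} p(s', r \mid s, a)\,\Bigl[\, r + \gamma \max_{a'} q^*(s', a') \,\Bigr],
\]

sweeping over every available state-action pair until the largest change in a sweep falls below the threshold \(\theta\). The recycling-robot-specific equations follow by plugging in the transition model defined in the code below.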

import matplotlib.pyplot as plt
import numpy as np

# Parameters
states = ['high', 'low']
actions = ['search', 'wait', 'recharge']
gamma = 0.9   # discount factor
theta = 1e-4  # convergence threshold for value iteration

# Probabilities and rewards
alpha = 0.9   # prob of staying high when searching from high
beta = 0.6    # prob of staying low when searching from low
r_search = 1
r_wait = 0

# Available actions per state
available_actions = {
    'high': ['search', 'wait'],
    'low': ['search', 'wait', 'recharge']
}

# Initialize q(s,a) arbitrarily
Q = {s: {a: 0.0 for a in actions} for s in states}

# Transition model
def transitions(s, a):
    if s == 'high':
        if a == 'search':
            return [(alpha, 'high', r_search), (1 - alpha, 'low', r_search)]
        elif a == 'wait':
            return [(1.0, 'high', r_wait)]
    elif s == 'low':
        if a == 'search':
            # With probability 1 - beta the battery depletes; that outcome is
            # modeled here as terminal (next state None, no future value).
            return [(beta, 'low', r_search), (1 - beta, None, r_search)]
        elif a == 'wait':
            return [(1.0, 'low', r_wait)]
        elif a == 'recharge':
            return [(1.0, 'high', 0)]
    return []
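
# Quick sanity check (an added illustration, not part of the original solution):
# transition probabilities for every available (state, action) pair sum to 1.
for s in states:
    for a in available_actions[s]:
        assert abs(sum(p for p, _, _ in transitions(s, a)) - 1.0) < 1e-12, (s, a)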

# Value iteration on Q
q_deltas = []

def value_iteration_q(Q):
    # Repeatedly sweep Bellman optimality backups over all (state, action)
    # pairs until the largest change in a sweep drops below theta.
    while True:
        delta = 0.0
        for s in states:
            for a in available_actions[s]:
                q_old = Q[s][a]
                # Expected reward plus discounted value of the best action in
                # the successor state; a terminal outcome (s1 is None)
                # contributes only its immediate reward.
                Q[s][a] = sum(
                    p * ((r + gamma * max(Q[s1][a1] for a1 in available_actions[s1]))
                         if s1 is not None else r)
                    for p, s1, r in transitions(s, a)
                )
                delta = max(delta, abs(q_old - Q[s][a]))
        q_deltas.append(delta)
        if delta < theta:
            break
    return Q

# Run value iteration
Q_star = value_iteration_q(Q)

# Extract the optimal (greedy) policy from Q*
pi_star = {s: max(available_actions[s], key=lambda a: Q_star[s][a]) for s in states}

# Plot convergence
plt.plot(q_deltas)
plt.title("Convergence of Value Iteration (Q*)")
plt.xlabel("Iteration")
plt.ylabel("Max Delta")
plt.grid(True)
plt.show()

# Show results
for s in states:
    for a in available_actions[s]:
        print(f"Q*({s}, {a}) = {Q_star[s][a]:.4f}")
print("\nOptimal Policy:")
for s in states:
    print(f"π*({s}) = {pi_star[s]}")

Q*(high, search) = 9.1735
Q*(high, wait) = 8.2562
Q*(low, search) = 5.4583
Q*(low, wait) = 7.4305
Q*(low, recharge) = 8.2562

Optimal Policy:
π*(high) = search
π*(low) = recharge
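
As a quick consistency check, the wait and recharge entries are just \(\gamma\) times the value of the best action in the successor state: for example, \(q^*(\text{low}, \text{recharge}) = \gamma \max_a q^*(\text{high}, a) \approx 0.9 \times 9.1735 \approx 8.256\), matching the printed 8.2562 up to the stopping threshold \(\theta = 10^{-4}\).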