MC vs. TD(0)
Markov Reward Process Example - Sutton & Barto Example 6.2

It is instructive to see the difference between MC and TD approaches in the following example.

import numpy as np
import matplotlib.pyplot as plt
states = ['A', 'B', 'C', 'D', 'E']
state_to_index = {s: i for i, s in enumerate(states)}
n_states = len(states)
def generate_episode(start='C'):
    # Each entry is (state, reward), where the reward is the one received on
    # the transition out of that state.
    state = state_to_index[start]
    episode = []
    while True:
        action = np.random.choice([-1, 1])  # left or right
        next_state = state + action
        if next_state < 0:
            return episode + [(state, 0)]  # Left terminal, reward 0
        elif next_state >= n_states:
            return episode + [(state, 1)]  # Right terminal, reward 1
        else:
            episode.append((state, 0))
            state = next_state
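To make the episode format concrete, here is a small illustrative check (not part of the original script) that prints one sampled trajectory; the exact output varies from run to run since no random seed is set.

# Quick sanity check (illustrative only): one sampled trajectory.
# Expect a list of (state_index, reward) pairs in which every reward is 0
# except possibly the last one (1 if the walk exited on the right).
sample_episode = generate_episode()
print(sample_episode)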
def mc_prediction(episodes=1000, alpha=0.1):
    V = np.full(n_states, 0.5)
    # V = np.zeros(n_states)
    for _ in range(episodes):
        episode = generate_episode()
        G = episode[-1][1]  # Only final reward matters
        for (s, _) in episode:
            V[s] += alpha * (G - V[s])
    return V
def td0_prediction(episodes=10000, alpha=0.01):
    # V = np.zeros(n_states)
    V = np.full(n_states, 0.5)

    for _ in range(episodes):
        episode = generate_episode()
        for i in range(len(episode) - 1):
            s, r = episode[i]           # reward for the transition out of s ...
            s_next, _ = episode[i + 1]  # ... into s_next
            V[s] += alpha * (r + V[s_next] - V[s])
        # Final transition into a terminal state (terminal value is 0)
        s, r = episode[-1]
        V[s] += alpha * (r - V[s])
    return V
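Before running the two methods, it may help to see the update targets side by side; the comment block below is a summary of what the loops above implement (constant-alpha MC and plain TD(0), both with gamma = 1), added here for reference.

# Update rules implemented above (gamma = 1):
#   constant-alpha MC:  V[s] <- V[s] + alpha * (G - V[s]),             G = return observed after visiting s
#   TD(0):              V[s] <- V[s] + alpha * (r + V[s_next] - V[s]), bootstrapping from the current V[s_next]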
# Run both methods
V_mc = mc_prediction()
V_td = td0_prediction()

# Ground truth (analytical solution): [1/6, 2/6, 3/6, 4/6, 5/6]
true_V = np.array([1, 2, 3, 4, 5]) / 6
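As a sanity check on the quoted values (this block is an addition, not part of the original script), the true values can also be obtained by solving the Bellman system V = r + P V for the five non-terminal states; P and r are helper names introduced here for the transition matrix restricted to non-terminal states and the expected one-step reward.

# Optional check: solve (I - P) V = r directly.
P = np.zeros((n_states, n_states))
r = np.zeros(n_states)
for i in range(n_states):
    if i - 1 >= 0:
        P[i, i - 1] = 0.5      # step left stays inside the chain
    if i + 1 < n_states:
        P[i, i + 1] = 0.5      # step right stays inside the chain
    else:
        r[i] = 0.5             # stepping right from 'E' terminates with reward 1
print(np.linalg.solve(np.eye(n_states) - P, r))  # approximately [1/6, 2/6, 3/6, 4/6, 5/6]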
# Plot results
plt.plot(true_V, label='True V', linestyle='--', marker='o')
plt.plot(V_mc, label='MC Estimate', marker='x')
plt.plot(V_td, label='TD(0) Estimate', marker='s')
plt.xticks(ticks=range(n_states), labels=states)
plt.ylabel("Estimated Value")
plt.title("Monte Carlo vs TD(0) on Random Walk")
plt.legend()
plt.grid(True)
plt.show()
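A small numeric summary (again an addition to the original script) can complement the plot by reporting how far each final estimate ends up from the analytical values.

# Root-mean-square error of each final estimate against the analytical values.
rmse_mc = np.sqrt(np.mean((V_mc - true_V) ** 2))
rmse_td = np.sqrt(np.mean((V_td - true_V) ** 2))
print(f"RMSE  MC: {rmse_mc:.4f}   TD(0): {rmse_td:.4f}")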
