import matplotlib.pyplot as plt

# Value iteration on action values Q(s, a) for a two-state battery MDP
# (a recycling-robot-style problem: the battery is either 'high' or 'low').

# Parameters
states = ['high', 'low']
actions = ['search', 'wait', 'recharge']
gamma = 0.9   # discount factor
theta = 1e-4  # convergence threshold

# Probabilities and rewards
alpha = 0.9   # prob of staying high when searching from high
beta = 0.6    # prob of staying low when searching from low
r_search = 1
r_wait = 0

# Available actions per state
available_actions = {
    'high': ['search', 'wait'],
    'low': ['search', 'wait', 'recharge']
}

# Initialize Q(s, a) arbitrarily
Q = {s: {a: 0.0 for a in actions} for s in states}

# Transition model: returns a list of (probability, next state, reward) tuples.
# A next state of None marks a terminal outcome (battery depleted), so no
# bootstrapping is done from it.
def transitions(s, a):
    if s == 'high':
        if a == 'search':
            return [(alpha, 'high', r_search), (1 - alpha, 'low', r_search)]
        elif a == 'wait':
            return [(1.0, 'high', r_wait)]
    elif s == 'low':
        if a == 'search':
            return [(beta, 'low', r_search), (1 - beta, None, r_search)]
        elif a == 'wait':
            return [(1.0, 'low', r_wait)]
        elif a == 'recharge':
            return [(1.0, 'high', 0)]
    return []

# Value iteration on Q
q_deltas = []

def value_iteration_q(Q):
    while True:
        delta = 0
        for s in states:
            for a in available_actions[s]:
                q_old = Q[s][a]
                # Bellman optimality backup; the max is taken only over
                # actions actually available in the successor state.
                Q[s][a] = sum(
                    p * ((r + gamma * max(Q[s1][a1] for a1 in available_actions[s1]))
                         if s1 is not None else r)
                    for p, s1, r in transitions(s, a)
                )
                delta = max(delta, abs(q_old - Q[s][a]))
        q_deltas.append(delta)
        if delta < theta:
            break
    return Q

# Run value iteration
Q_star = value_iteration_q(Q)

# Derive the optimal (greedy) policy from Q*
pi_star = {s: max(available_actions[s], key=lambda a: Q_star[s][a]) for s in states}

# Plot convergence
plt.plot(q_deltas)
plt.title("Convergence of Value Iteration (Q*)")
plt.xlabel("Iteration")
plt.ylabel("Max Delta")
plt.grid(True)
plt.show()

# Show results
for s in states:
    for a in available_actions[s]:
        print(f"Q*({s}, {a}) = {Q_star[s][a]:.4f}")

print("\nOptimal Policy:")
for s in states:
    print(f"π*({s}) = {pi_star[s]}")
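
# Optional sanity check -- a minimal sketch, not part of the original script:
# after convergence, Q* should approximately satisfy the Bellman optimality
# equation Q*(s,a) = sum_{s'} p(s'|s,a) * [r + gamma * max_{a'} Q*(s',a')],
# so the residual of one more backup should be on the order of theta.
for s in states:
    for a in available_actions[s]:
        backup = sum(
            p * ((r + gamma * max(Q_star[s1][a1] for a1 in available_actions[s1]))
                 if s1 is not None else r)
            for p, s1, r in transitions(s, a)
        )
        print(f"Bellman residual for ({s}, {a}): {abs(backup - Q_star[s][a]):.2e}")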