import numpy as np
def stochastic_robot_cleaning_v1():
    # Initialization
    state = [0, 1, 2, 3, 4, 5]    # Set of states
    action = [-1, 1]              # Set of actions

    # Make all (state, action) pairs
    A, B = np.meshgrid(state, action)
    C = np.column_stack((A.flatten(), B.flatten()))
    D1 = C    # D1 includes all possible combinations of (state, action) pairs

    # Make all (state, action) pair indices
    A, B = np.meshgrid(np.arange(len(state)), np.arange(len(action)))
    C = np.column_stack((A.flatten(), B.flatten()))
    D2 = C    # D2 includes all possible combinations of (state, action) pair indices

    # Pair matrix D: columns are [state, action, state index, action index]
    D = np.column_stack((D1, D2))
    Q = np.zeros((len(state), len(action)))    # Initial Q can be chosen arbitrarily
    Qold = Q.copy()    # Save a backup to compare later
    L = 15             # Maximum number of iterations
    gamma = 0.5        # Discounting factor
    epsilon = 0.001    # Final error to stop the algorithm
    # Stochastic Q-iteration algorithm
    for l in range(1, L + 1):
        print(f'iteration: {l}')
        for ii in range(D.shape[0]):
            XP, P, R = model(D[ii, 0], D[ii, 1])    # Next states with their probabilities and rewards
            I1 = np.where(D[:, 0] == XP[0])[0][0]   # Find each next state in the pair matrix D
            I2 = np.where(D[:, 0] == XP[1])[0][0]
            I3 = np.where(D[:, 0] == XP[2])[0][0]
            # Expected one-step return plus discounted value of the greedy action in each next state
            Q[D[ii, 2], D[ii, 3]] = P[0] * (R[0] + gamma * np.max(Q[D[I1, 2], :])) + \
                                    P[1] * (R[1] + gamma * np.max(Q[D[I2, 2], :])) + \
                                    P[2] * (R[2] + gamma * np.max(Q[D[I3, 2], :]))
        if np.sum(np.abs(Q - Qold)) < epsilon:
            print('Epsilon criteria satisfied!')
            break
        else:
            Qold = Q.copy()
    # Show the results
    print('Q matrix (optimal):')
    print(Q)
    C = np.max(Q, axis=1)       # Optimal value of each state
    I = np.argmax(Q, axis=1)    # Index of the greedy (maximizing) action in each state
    print('Q(optimal):')
    print(C)
    print('Optimal Policy:')
    print('*')
    # States 0 and 5 are absorbing, so the policy is reported for states 1-4 only
    print([action[I[1]], action[I[2]], action[I[3]], action[I[4]]])
    print('*')
Cleaning Robot - Stochastic MDP
The code above estimates the optimal Q-function (Q*) and the corresponding optimal policy for the cleaning robot problem in the stochastic case. The helper functions below define the transition model, the transition probabilities, and the reward function.
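In update form, each inner-loop pass above computes Q(x, u) <- sum over x' of prob(x, u, x') * (reward(x, u, x') + gamma * max_a Q(x', a)), i.e. the expected one-step return plus the discounted value of acting greedily in the next state. Below is a minimal sketch of that single-pair backup, written against the model() helper defined further down; the name q_backup is introduced here purely for illustration and assumes, as in this problem, that the state values 0-5 coincide with their row indices in Q.

# Illustration only: one stochastic Q-iteration backup for a single (state, action)
# pair, using model() defined below. Not part of the original program.
def q_backup(Q, x, u, gamma=0.5):
    xp, p, r = model(x, u)
    return sum(pi * (ri + gamma * np.max(Q[xi, :])) for xi, pi, ri in zip(xp, p, r))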
# Find the possible next states together with their probabilities and rewards
def model(x, u):
    if x == 0 or x == 5:
        xp = [x, x, x]            # Terminal states: the robot stays where it is
    else:
        xp = [x - u, x, x + u]    # Candidate next states around the current one
    p = [prob(x, u, xp[0]), prob(x, u, xp[1]), prob(x, u, xp[2])]
    r = [reward(x, u, xp[0]), reward(x, u, xp[1]), reward(x, u, xp[2])]
    return xp, p, r
# This function is the transition model of the robot.
# Inputs: the current state, the chosen action, and the desired next state.
# Output: the probability of moving to that next state, given the stochasticity.
# In the deterministic case the model would return the next state directly; in the
# stochastic case it returns the probability of reaching each candidate next state.
# (A small sanity check of these probabilities follows the function below.)
def prob(x, u, xp):
    if x == 0 or x == 5:
        # Terminal states are absorbing: the robot stays where it is
        if xp == x:
            return 1
        else:
            return 0
    elif 1 <= x <= 4:
        if xp == x:
            return 0.15    # Robot stays in place
        elif xp == x + u:
            return 0.8     # Robot moves in the intended direction
        elif xp == x - u:
            return 0.05    # Robot moves in the opposite direction
        else:
            return 0
    else:
        return 0
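As a quick sanity check of the transition model (a hypothetical snippet, not part of the original program): from every non-terminal state and either action, the three candidate next states x - u, x, and x + u carry probabilities 0.05, 0.15, and 0.8, which should sum to 1.

# Hypothetical sanity check: probabilities over the three candidate next states
# from each non-terminal state should sum to 1.
for x_chk in range(1, 5):
    for u_chk in (-1, 1):
        p_chk = [prob(x_chk, u_chk, x_chk - u_chk),
                 prob(x_chk, u_chk, x_chk),
                 prob(x_chk, u_chk, x_chk + u_chk)]
        assert abs(sum(p_chk) - 1.0) < 1e-12, (x_chk, u_chk, p_chk)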
# This function is the reward function for the task (stochastic case).
# Inputs: the current state, the chosen action, and the next state.
# Output: the reward received for arriving in the next state.
# In this case the reward does not depend on the chosen action.
def reward(x, u, xp):
    if x != 5 and xp == 5:
        return 5    # Reaching state 5 yields a reward of 5
    elif x != 0 and xp == 0:
        return 1    # Reaching state 0 yields a reward of 1
    else:
        return 0
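Since the reward is attached to the next state, the expected immediate reward of a (state, action) pair is the probability-weighted sum of the per-transition rewards returned by model(). A small illustration follows; the helper name expected_reward is hypothetical and not part of the original program.

# Hypothetical helper: expected one-step reward of a (state, action) pair.
def expected_reward(x, u):
    _, p, r = model(x, u)
    return sum(pi * ri for pi, ri in zip(p, r))

# Example: expected_reward(4, 1) evaluates to 0.8 * 5 = 4.0, since moving right
# from state 4 reaches state 5 (reward 5) with probability 0.8.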
# Call the main function
stochastic_robot_cleaning_v1()
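As a small usage sketch (hypothetical, not part of the original program), the same model() helper can also be used to simulate the robot: the snippet below rolls out one trajectory under the fixed policy "always move right" (u = 1), sampling each next state from the probabilities returned by model().

# Hypothetical usage sketch: sample one trajectory under the fixed action u = 1.
rng = np.random.default_rng(0)
x_sim = 2                          # arbitrary non-terminal start state
trajectory = [x_sim]
while x_sim not in (0, 5):         # states 0 and 5 are absorbing
    xp_sim, p_sim, _ = model(x_sim, 1)
    x_sim = int(rng.choice(xp_sim, p=p_sim))
    trajectory.append(x_sim)
print('Sample trajectory under "always right":', trajectory)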