Question

What is the learned Q-table for the following code? Please run the code and show the output.

import numpy as np
import matplotlib.pyplot as plt

# Grid world size
WORLD_SIZE = 10
# Percentage of cells occupied by obstacles
OBSTACLE_DENSITY = 0.15
# Learning parameters
ALPHA = 0.5
GAMMA = 0.9
EPSILON = 0.1

def initialize_world():
    # Create empty grid
    world = np.zeros((WORLD_SIZE, WORLD_SIZE))
    # Start cell
    world[0, 0] = 2
    # Goal cell
    world[-1, -1] = 3
    # Add random obstacles, never on the start or goal cell
    num_obstacles = int(OBSTACLE_DENSITY * WORLD_SIZE**2)
    free_cells = list(range(1, WORLD_SIZE**2 - 1))
    obstacle_indices = np.random.choice(free_cells, size=num_obstacles, replace=False)
    for i in obstacle_indices:
        x = i // WORLD_SIZE
        y = i % WORLD_SIZE
        world[x, y] = 1
    return world

def initialize_q_values():
    # Q(s, a) initialized to 0 for all s, a
    q_values = {}
    for x in range(WORLD_SIZE):
        for y in range(WORLD_SIZE):
            for a in range(4):  # up, down, left, right
                q_values[(x, y, a)] = 0.0
    return q_values

def epsilon_greedy(state, q_values, epsilon):
    # With probability epsilon take a random action,
    # otherwise take the greedy action under the current Q values
    if np.random.rand() < epsilon:
        action = np.random.randint(4)
    else:
        values = [q_values[(state[0], state[1], a)] for a in range(4)]
        action = np.argmax(values)
    return action

def update_q_value(state, action, reward, next_state, q_values, alpha, gamma):
    # Q-learning update rule:
    # Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
    max_q_next = max([q_values[(next_state[0], next_state[1], a)] for a in range(4)])
    q_values[(state[0], state[1], action)] += alpha * (
        reward + gamma * max_q_next - q_values[(state[0], state[1], action)])
    return q_values

def check_goal(state):
    return state == (WORLD_SIZE - 1, WORLD_SIZE - 1)

if __name__ == "__main__":
    # Create world
    world = initialize_world()
    # Initialize Q values
    q_values = initialize_q_values()
    # Track metrics
    steps_per_episode = []
    sse = []
    for episode in range(1000):
        # Reset agent to start position
        state = (0, 0)
        step = 0
        episode_sse = 0
        while not check_goal(state):
            # Choose action using epsilon-greedy
            action = epsilon_greedy(state, q_values, EPSILON)
            # Take action and get reward/next state
            if action == 0:    # up
                next_state = (state[0] - 1, state[1])
            elif action == 1:  # down
                next_state = (state[0] + 1, state[1])
            elif action == 2:  # left
                next_state = (state[0], state[1] - 1)
            else:              # right
                next_state = (state[0], state[1] + 1)
            reward = -0.1
            # Keep the agent on the grid: a move off the edge leaves it in place
            if not (0 <= next_state[0] < WORLD_SIZE and 0 <= next_state[1] < WORLD_SIZE):
                next_state = state
            elif world[next_state] == 1:  # Hit obstacle
                reward = -1
                next_state = state  # Stay in current state
            if check_goal(next_state):
                reward = 10
            # Update Q value
            q_values = update_q_value(state, action, reward, next_state, q_values, ALPHA, GAMMA)
            # Accumulate squared TD error for this step
            episode_sse += (reward
                            + GAMMA * max([q_values[(next_state[0], next_state[1], a)] for a in range(4)])
                            - q_values[(state[0], state[1], action)])**2
            # Update state
            state = next_state
            step += 1
        steps_per_episode.append(step)
        sse.append(episode_sse)

    # Plot results
    plt.figure()
    plt.plot(steps_per_episode)
    plt.xlabel('Episode')
    plt.ylabel('Steps per episode')
    plt.savefig('steps.png')
    # Use a separate figure so the SSE curve is not drawn over the steps plot
    plt.figure()
    plt.plot(sse)
    plt.xlabel('Episode')
    plt.ylabel('Sum squared error')
    plt.savefig('sse.png')

    # Print learned policy
    policy = {}
    for x in range(WORLD_SIZE):
        for y in range(WORLD_SIZE):
            values = [q_values[(x, y, a)] for a in range(4)]
            policy[(x, y)] = np.argmax(values)
    print("Learned Optimal Policy:")
    print(policy)

    # Print the learned Q-table
    print("Learned Q-table:")
    print(q_values)
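
Note that the learned Q-table is not a single fixed answer: the obstacle layout and the epsilon-greedy exploration are random, so the final Q values differ on every run unless the random seed is fixed. As a rough check on the magnitudes, the very first update from the start cell into a free neighbour gives Q = 0 + 0.5 * (-0.1 + 0.9 * 0 - 0) = -0.05, and the value of the action that steps into the goal approaches the +10 reward as training progresses. Below is a minimal sketch of how one might make a run reproducible and print the Q-table in a more compact form; the seed value and the per-cell "best action value" grid are illustrative choices, not part of the original script, and the snippet assumes it is appended to the end of the __main__ block above so that WORLD_SIZE and q_values are in scope.

    # Fix the seed (place np.random.seed(0) before initialize_world() for it
    # to affect the obstacle layout and exploration of the run)

    # Collapse the Q-table to one number per cell: the value of the best action
    best_q = np.zeros((WORLD_SIZE, WORLD_SIZE))
    for x in range(WORLD_SIZE):
        for y in range(WORLD_SIZE):
            best_q[x, y] = max(q_values[(x, y, a)] for a in range(4))

    np.set_printoptions(precision=2, suppress=True)
    print("Max Q value per cell:")
    print(best_q)

With a fixed seed, re-running the script prints the same 400-entry q_values dictionary (100 states x 4 actions) and the same 10x10 grid each time, which makes the "show the output" part of the question well defined.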
