
Question


PYTHON PROGRAM

UNDERSTAND THE DEMO POLICY ITERATION CODE AND APPLY POLICY ITERATION ON THE GIVEN GRID (ATTACHED)

NOTE: PROVIDE THE CODE AND A SCREENSHOT OF THE RUNNING PROGRAM.

DEMO CODE POLICY ITERATION 1 (ELEMENTS):

import sys
import random

class MDP(object):
    '''Generic MDP solver. Expects a list of states, a list of actions, a
    transition function transition(s, a) -> [(next_state, probability), ...],
    a reward function reward(s, a, sp), and a discount factor.'''

    def __init__(self, states, actions, transition, reward, discount=0.5):
        self._states = states
        self._actions = actions
        self._transition = transition
        self._reward = reward
        self._discount = discount
        self._initial_v = [0 for _ in states]
        self._initial_q = [[0 for _ in actions] for _ in states]

    def valueIteration(self, iterations=0, threshold=0.000000001):
        previousMatrix = self._initial_v
        returnQMatrix = [[0 for _ in self._actions] for _ in self._states]
        for _ in range(iterations):
            delta = 0.0  # reset each sweep so convergence can be detected
            returnMatrix = [0 for _ in self._states]
            for s in range(len(self._states)):
                maxValue = -sys.maxsize - 1
                for a in range(len(self._actions)):
                    actionValue = 0
                    possibleOutcomes = self._transition(s, a)
                    if len(possibleOutcomes) == 0:
                        # terminal state: its value is just its reward
                        maxValue = self._reward(s, a, None)
                        continue
                    for sp, prob in possibleOutcomes:
                        actionValue += prob * (self._reward(s, a, sp) + self._discount * previousMatrix[sp])
                    returnQMatrix[s][a] = actionValue
                    maxValue = max(maxValue, actionValue)
                returnMatrix[s] = maxValue
                delta = max(delta, abs(previousMatrix[s] - returnMatrix[s]))
            previousMatrix = returnMatrix
            if delta < threshold:
                break
        return previousMatrix, returnQMatrix

    def policyEvaluation(self, policy, start_v, threshold):
        '''Evaluate V for the fixed policy, updating start_v in place until
        the largest change in a sweep drops below the threshold.'''
        while True:
            delta = 0.0
            for s in range(len(self._states)):
                v = start_v[s]
                actionValue = 0
                possibleOutcomes = self._transition(s, policy[s])
                if len(possibleOutcomes) == 0:
                    actionValue = self._reward(s, policy[s], None)
                for sp, prob in possibleOutcomes:
                    actionValue += prob * (self._reward(s, policy[s], sp) + self._discount * start_v[sp])
                start_v[s] = actionValue
                delta = max(delta, abs(v - start_v[s]))
            if delta < threshold:
                break

    def policyIteration(self, threshold=0.000000001):
        '''Initialize random values and a random policy, then alternate policy
        evaluation and greedy policy improvement until the policy is stable.'''
        start_v = [random.random() * 100 for _ in self._states]
        start_policy = [random.randint(0, len(self._actions) - 1) for _ in self._states]
        while True:
            policy_stable = True
            self.policyEvaluation(start_policy, start_v, threshold)
            for s in range(len(self._states)):
                old_action = start_policy[s]
                maxValue = -sys.maxsize - 1
                for a in range(len(self._actions)):
                    actionValue = 0
                    possibleOutcomes = self._transition(s, a)
                    for sp, prob in possibleOutcomes:
                        actionValue += prob * (self._reward(s, a, sp) + self._discount * start_v[sp])
                    if maxValue < actionValue:
                        maxValue = actionValue
                        start_policy[s] = a
                if old_action != start_policy[s]:
                    policy_stable = False
            if policy_stable:
                break
        return start_policy, start_v
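Before wiring the class to a grid, it can be sanity-checked on a tiny hand-made MDP. The two-state problem below is purely hypothetical (the states, transition, and reward functions are made up for illustration and are not the attached grid): action 1 moves from state 0 to the rewarding terminal state 1, so policy iteration should settle on action 1 in state 0.

    # Hypothetical sanity check, not part of the demo code.
    states = [0, 1]
    actions = [0, 1]

    def transition(s, a):
        if s == 1:                      # state 1 is terminal: no outcomes
            return []
        return [(s, 1.0)] if a == 0 else [(1, 1.0)]   # 0 = stay, 1 = move on

    def reward(s, a, sp):
        return 1.0 if s == 1 else 0.0   # only the terminal state pays off

    mdp = MDP(states, actions, transition, reward, discount=0.9)
    policy, values = mdp.policyIteration()
    print(policy)   # expected: action 1 in state 0 (head for the terminal)
    print(values)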

DEMO CODE POLICY ITERATION 2 (GRID): SAMPLE

from com.ai.mdp.element.mdp import MDP   # the MDP class from demo code 1

class GridMDP(object):
    '''Grid-world wrapper: reads a grid layout from a file and exposes the
    states, actions, transition model, and reward function expected by MDP.
    Cell codes: 0 = open cell, 2 = wall, 3 = +1 terminal, 4 = -1 terminal.'''

    def __init__(self, file, noise=0.2, livingReward=0.0):
        self._livingReward = livingReward
        self._noise = noise
        self._states = []
        # action deltas as (row, column): up, down, left, right
        self._actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]
        self.readFile(file)

    def readFile(self, file):
        # file format: first line = number of rows, second line = number of
        # columns, remaining lines = space-separated cell codes
        f = open(file, "r")
        lines = f.readlines()
        self._rows = int(lines[0])
        self._columns = int(lines[1])
        for i in range(2, len(lines)):
            self._states += [int(x) for x in lines[i].split(" ")]
        f.close()
        # print(self._states)

    def transition(self, state, action):
        '''Return [(next_state, probability), ...]. Walls and terminal cells
        have no outgoing transitions.'''
        returnStates = []
        if self._states[state] == 2:
            return returnStates
        if self._states[state] == 3 or self._states[state] == 4:
            return returnStates
        stateRow = int(state / self._columns)
        stateColumn = state % self._columns
        # intended move with probability 1 - noise, plus the two perpendicular
        # moves with probability noise / 2 each
        possibleActions = [
            (self._actions[action][0], self._actions[action][1], 1 - self._noise),
            ((self._actions[action][0] ** 2 + 1) % 2,
             (self._actions[action][1] ** 2 + 1) % 2, self._noise / 2.0),
            (((self._actions[action][0] ** 2 + 1) % 2) * -1,
             ((self._actions[action][1] ** 2 + 1) % 2) * -1, self._noise / 2.0)]
        for pa in possibleActions:
            # stay in place when the move would leave the grid or hit a wall
            if stateRow + pa[0] >= 0 and \
               stateRow + pa[0] < self._rows and \
               stateColumn + pa[1] >= 0 and \
               stateColumn + pa[1] < self._columns and \
               self._states[int((stateRow + pa[0]) * self._columns + (stateColumn + pa[1]))] != 2:
                returnStates.append(((stateRow + pa[0]) * self._columns + (stateColumn + pa[1]), pa[2]))
            else:
                returnStates.append((state, pa[2]))
        return returnStates

    def reward(self, s, a, sp):
        if self._states[s] == 3:
            return 1.0
        if self._states[s] == 4:
            return -1.0
        return self._livingReward

if __name__ == "__main__":
    grid = GridMDP("grid.txt", livingReward=-2.0)
    mdp = MDP(grid._states, grid._actions, grid.transition, grid.reward, .9)
    v = mdp.policyIteration()
    print(v[0])   # optimal policy: one action index per state
    print(v[1])   # state values under that policy
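The attached grid file is not reproduced in the question, so its exact contents are unknown. Based on readFile above, grid.txt holds the row count on the first line, the column count on the second, and then one line per row of space-separated cell codes (0 = open, 2 = wall, 3 = +1 terminal, 4 = -1 terminal). A hypothetical 3x4 layout in that format (the classic gridworld arrangement, used here only as a placeholder for the attached grid) would be:

    3
    4
    0 0 0 3
    0 2 0 4
    0 0 0 0

To make the screenshot easier to read, a small helper (not part of the demo code; the names are illustrative) can be appended to the __main__ block to print the computed policy as arrows, using the action order defined in GridMDP:

    ARROWS = {0: "^", 1: "v", 2: "<", 3: ">"}   # indices into GridMDP._actions

    def printPolicy(grid, policy):
        for r in range(grid._rows):
            row = []
            for c in range(grid._columns):
                s = r * grid._columns + c
                cell = grid._states[s]
                # walls and terminal cells keep their code instead of an arrow
                row.append(str(cell) if cell in (2, 3, 4) else ARROWS[policy[s]])
            print(" ".join(row))

    printPolicy(grid, v[0])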


Figure (transcribed from the attached image): iterative evaluation of the equiprobable random policy on a 4x4 grid. At k = 0 every state value is 0.0; at k = 1 every non-terminal state is -1.0; at k = 2 the states next to a terminal corner are about -1.7 and all other non-terminal states are -2.0, while the two terminal corners stay at 0.0. A second panel gives the Policy Iteration pseudocode: given the environment (T, R, A, S), a discount, a threshold, and iteration limits, repeatedly evaluate the current policy by sweeping V(s) <- sum over s' of T(s, pi(s), s') * (R(s, pi(s), s') + gamma * V(s')) until max |V - V'| falls below the threshold, then improve the policy greedily with respect to V, stopping when the policy no longer changes.
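For reference, the figure's k = 1 and k = 2 rows can be reproduced with a short standalone sketch, separate from the demo classes above. It assumes the standard textbook setup implied by the numbers: a 4x4 grid whose two shaded corners are terminal, a reward of -1 on every move, no discounting, and the equiprobable random policy. All names below (ROWS, TERMINALS, step, ...) are illustrative and not part of the assignment code.

    ROWS, COLS = 4, 4
    TERMINALS = {0, 15}                           # the two shaded corner cells
    ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # up, down, left, right

    def step(s, a):
        r, c = divmod(s, COLS)
        nr, nc = r + a[0], c + a[1]
        if 0 <= nr < ROWS and 0 <= nc < COLS:
            return nr * COLS + nc
        return s                                  # bumping the edge leaves the state unchanged

    v = [0.0] * (ROWS * COLS)
    for k in range(1, 3):
        new_v = list(v)
        for s in range(ROWS * COLS):
            if s in TERMINALS:
                continue
            # Bellman expectation backup for the equiprobable random policy
            new_v[s] = sum(0.25 * (-1.0 + v[step(s, a)]) for a in ACTIONS)
        v = new_v
        print("k =", k, [round(x, 2) for x in v])

Running it prints -1.0 for every non-terminal state after the first sweep and -1.75 next to the corners after the second, which the figure displays to one decimal place as -1.7.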
