
Question



Python Programming
Note: Please provide Python code only.
DEMO CODE POLICY ITERATION 1 (ELEMENTS):

import sys
import random

class MDP(object):
    def __init__(self, states, actions, transition, reward, discount=0.5):
        self._states = states
        self._actions = actions
        self._transition = transition
        self._reward = reward
        self._discount = discount
        self._initial_v = [0 for _ in states]
        self._initial_q = [[0 for _ in actions] for _ in states]

    def valueIteration(self, iterations=0, threshold=0.000000001):
        previousMatrix = self._initial_v
        returnQMatrix = [[0 for _ in self._actions] for _ in self._states]
        for _ in range(iterations):
            delta = 0.0  # reset the change tracker at the start of every sweep
            returnMatrix = [0 for _ in self._states]
            for s in range(len(self._states)):
                maxValue = -sys.maxsize - 1
                for a in range(len(self._actions)):
                    actionValue = 0
                    possibleOutcomes = self._transition(s, a)
                    if len(possibleOutcomes) == 0:
                        # terminal or blocked state: its value is just the reward
                        maxValue = self._reward(s, a, None)
                        continue
                    for sp, prob in possibleOutcomes:
                        actionValue += prob * (self._reward(s, a, sp) + self._discount * previousMatrix[sp])
                    returnQMatrix[s][a] = actionValue
                    maxValue = max(maxValue, actionValue)
                returnMatrix[s] = maxValue
                delta = max(delta, abs(previousMatrix[s] - returnMatrix[s]))
            previousMatrix = returnMatrix
            if delta < threshold:
                break
        return previousMatrix, returnQMatrix

    def policyEvaluation(self, policy, start_v, threshold):
        # iterative, in-place evaluation of a fixed policy until the values stop changing
        while True:
            delta = 0.0
            for s in range(len(self._states)):
                v = start_v[s]
                actionValue = 0
                possibleOutcomes = self._transition(s, policy[s])
                if len(possibleOutcomes) == 0:
                    actionValue = self._reward(s, policy[s], None)
                for sp, prob in possibleOutcomes:
                    actionValue += prob * (self._reward(s, policy[s], sp) + self._discount * start_v[sp])
                start_v[s] = actionValue
                delta = max(delta, abs(v - start_v[s]))
            if delta < threshold:
                break

    def policyIteration(self, threshold=0.000000001):
        '''initialize the values and policy randomly, then alternate
        policy evaluation and greedy policy improvement until stable'''
        start_v = [random.random() * 100 for _ in self._states]
        start_policy = [random.randint(0, len(self._actions) - 1) for _ in self._states]
        while True:
            policy_stable = True
            self.policyEvaluation(start_policy, start_v, threshold)
            for s in range(len(self._states)):
                old_action = start_policy[s]
                maxValue = -sys.maxsize - 1
                for a in range(len(self._actions)):
                    actionValue = 0
                    possibleOutcomes = self._transition(s, a)
                    for sp, prob in possibleOutcomes:
                        actionValue += prob * (self._reward(s, a, sp) + self._discount * start_v[sp])
                    if maxValue < actionValue:
                        maxValue = actionValue
                        start_policy[s] = a
                if old_action != start_policy[s]:
                    policy_stable = False
            if policy_stable:
                break
        return start_policy, start_v
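For reference, here is a minimal usage sketch (not part of the original assignment code) showing the shape of the transition and reward callbacks the MDP class above expects. The two-state toy problem, the toy_transition and toy_reward names, and the expected output are illustrative assumptions only; the sketch assumes it is appended to the same module as the MDP class.

def toy_transition(s, a):
    # action 0 keeps the current state, action 1 moves to the other state,
    # both deterministically (probability 1.0)
    return [(s, 1.0)] if a == 0 else [(1 - s, 1.0)]

def toy_reward(s, a, sp):
    # staying in state 1 pays +1, everything else pays 0
    return 1.0 if (s == 1 and a == 0) else 0.0

toy = MDP(states=[0, 1], actions=[0, 1],
          transition=toy_transition, reward=toy_reward, discount=0.9)
policy, values = toy.policyIteration()
print(policy)   # expected: move to state 1 (action 1) from state 0, then stay (action 0)
print(values)   # roughly [9.0, 10.0] with discount 0.9

With discount 0.9 the optimal values converge to about V(1) = 1 / (1 - 0.9) = 10 and V(0) = 0.9 * V(1) = 9, which is a quick sanity check that the policy iteration loop works.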
DEMO CODE POLICY ITERATION 2 (GRID): SAMPLE

from com.ai.mdp.element.mdp import MDP

class GridMDP(object):
    '''
    Grid-world wrapper that supplies the states, actions, transition and
    reward functions consumed by the MDP class above.
    '''
    def __init__(self, file, noise=0.2, livingReward=0.0):
        self._livingReward = livingReward
        self._noise = noise
        self._states = []
        self._actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # up, down, left, right
        self.readFile(file)

    def readFile(self, file):
        # first line: number of rows, second line: number of columns,
        # remaining lines: space-separated cell codes, one row per line
        f = open(file, "r")
        lines = f.readlines()
        self._rows = int(lines[0])
        self._columns = int(lines[1])
        for i in range(2, len(lines)):
            self._states += [int(x) for x in lines[i].split(" ")]
        # print(self._states)

    def transition(self, state, action):
        returnStates = []
        if self._states[state] == 2:
            return returnStates  # wall: no transitions out
        if self._states[state] == 3 or self._states[state] == 4:
            return returnStates  # terminal cells: no transitions out
        stateRow = state // self._columns
        stateColumn = state % self._columns
        # intended move with probability 1 - noise, plus the two perpendicular
        # moves with probability noise / 2 each
        possibleActions = [(self._actions[action][0], self._actions[action][1], 1 - self._noise),
                           ((self._actions[action][0] ** 2 + 1) % 2, (self._actions[action][1] ** 2 + 1) % 2, self._noise / 2.0),
                           (((self._actions[action][0] ** 2 + 1) % 2) * -1, ((self._actions[action][1] ** 2 + 1) % 2) * -1, self._noise / 2.0)]
        for pa in possibleActions:
            if stateRow + pa[0] >= 0 and \
               stateRow + pa[0] < self._rows and \
               stateColumn + pa[1] >= 0 and \
               stateColumn + pa[1] < self._columns and \
               self._states[(stateRow + pa[0]) * self._columns + (stateColumn + pa[1])] != 2:
                returnStates.append(((stateRow + pa[0]) * self._columns + (stateColumn + pa[1]), pa[2]))
            else:
                # bumping into a wall or the grid edge keeps the agent in place
                returnStates.append((state, pa[2]))
        return returnStates

    def reward(self, s, a, sp):
        if self._states[s] == 3:
            return 1.0   # goal cell
        if self._states[s] == 4:
            return -1.0  # penalty cell
        return self._livingReward

if __name__ == "__main__":
    grid = GridMDP("grid.txt", livingReward=-2.0)
    mdp = MDP(grid._states, grid._actions, grid.transition, grid.reward, .9)
    v = mdp.policyIteration()
    print(v[0])  # optimal policy (action index per state)
    print(v[1])  # state values under that policy
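The grid.txt read by the main block is not included in the question. Based on how readFile, transition and reward interpret the cell codes (0 = ordinary cell, 2 = wall, 3 = +1 terminal, 4 = -1 terminal), a hypothetical input file could look like the following: first line is the number of rows, second line the number of columns, then one row of codes per line.

3
4
0 0 0 3
0 2 0 4
0 0 0 0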
Understand the source code above and apply policy iteration (MDP) on the given grid.

[Figure: V_k for the random policy and the greedy policy w.r.t. V_k on a 4x4 gridworld; at k = 0 all values are 0.0, at k = 1 all non-terminal values are -1.0, and at k = 2 the values are about -1.7 in cells next to the terminal corners and -2.0 elsewhere.]
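The figure corresponds to iterative evaluation of the equiprobable random policy on the standard 4x4 gridworld (reward -1 per move, undiscounted, terminal cells in two opposite corners). Below is a minimal, self-contained sketch that reproduces those V_k values; the state numbering, the terminal set and the helper names are assumptions made for illustration and are not part of the assignment code.

ROWS, COLS = 4, 4
TERMINALS = {0, 15}                            # assumed terminal corner cells
ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)]   # up, down, left, right

def step(s, a):
    # deterministic move; falling off the grid leaves the state unchanged
    r, c = divmod(s, COLS)
    nr, nc = r + a[0], c + a[1]
    if 0 <= nr < ROWS and 0 <= nc < COLS:
        return nr * COLS + nc
    return s

def evaluate_random_policy(sweeps):
    v = [0.0] * (ROWS * COLS)
    for _ in range(sweeps):
        new_v = [0.0] * (ROWS * COLS)
        for s in range(ROWS * COLS):
            if s in TERMINALS:
                continue
            # equiprobable random policy: average the four action values,
            # each worth -1 plus the (undiscounted) value of the next state
            new_v[s] = sum(0.25 * (-1.0 + v[step(s, a)]) for a in ACTIONS)
        v = new_v
    return v

if __name__ == "__main__":
    for k in (0, 1, 2):
        print("k =", k, [round(x, 2) for x in evaluate_random_policy(k)])

For example, a cell next to a terminal corner gets V_2 = 0.25 * (-1 + 0) + 3 * 0.25 * (-1 - 1) = -1.75, which matches the -1.7 shown (truncated) in the figure.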
