Question
Python Programming
Note: Please provide Python code only.

Understand the source code below and apply policy iteration (MDP) to the given grid. Show U_k for the random policy and the greedy policy w.r.t. U_k:

k = 0 (random policy):
 0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0

k = 1:
 0.0 -1.0 -1.0 -1.0
-1.0 -1.0 -1.0 -1.0
-1.0 -1.0 -1.0 -1.0
-1.0 -1.0 -1.0  0.0

k = 2:
 0.0 -1.7 -2.0 -2.0
-1.7 -2.0 -2.0 -2.0
-2.0 -2.0 -2.0 -1.7
-2.0 -2.0 -1.7  0.0
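The k = 1 and k = 2 tables can be reproduced independently of the demo code. The following is a minimal sketch, assuming the setup implied by the tables: a 4x4 grid with terminal states in the top-left and bottom-right corners, deterministic moves that bounce off the edges, reward -1 per step, discount 1.0, and the equiprobable random policy. Each call to sweep performs one synchronous sweep of iterative policy evaluation, U_{k+1}(s) = sum over actions of 0.25 * (-1 + U_k(s')).

ROWS, COLS = 4, 4
TERMINALS = {0, 15}                              # assumed terminal corners
ACTIONS = [(-1, 0), (1, 0), (0, -1), (0, 1)]

def move(s, a):
    r, c = divmod(s, COLS)
    nr, nc = r + a[0], c + a[1]
    # bumping into the edge leaves the agent where it is
    return nr * COLS + nc if 0 <= nr < ROWS and 0 <= nc < COLS else s

def sweep(u):
    # one synchronous sweep of iterative policy evaluation for the random policy
    return [u[s] if s in TERMINALS
            else sum(0.25 * (-1.0 + u[move(s, a)]) for a in ACTIONS)
            for s in range(ROWS * COLS)]

u = [0.0] * (ROWS * COLS)
for k in (1, 2):
    u = sweep(u)
    print("k =", k)
    for r in range(ROWS):
        print(" ".join("%6.2f" % u[r * COLS + c] for c in range(COLS)))
# k = 2 prints -1.75 where the table above shows -1.7 (one-decimal rounding)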
DEMO CODE POLICY ITERATION 1 (ELEMENTS):
import sys
import random
class MDP(object):
    def __init__(self, states, actions, transition, reward, discount=0.5):
        self._states = states
        self._actions = actions
        self._transition = transition
        self._reward = reward
        self._discount = discount
        self._initial_v = [0 for _ in states]
        self._initial_q = [[0 for _ in actions] for _ in states]
    def valueIteration(self, iterations=0, threshold=0.000000001):
        previousMatrix = self._initial_v
        returnQMatrix = [[0 for _ in self._actions] for _ in self._states]
        for _ in range(iterations):
            # reset delta each sweep so the convergence test below is meaningful
            delta = 0.0
            returnMatrix = [0 for _ in self._states]
            for s in range(len(self._states)):
                maxValue = -sys.maxsize - 1
                for a in range(len(self._actions)):
                    actionValue = 0
                    possibleOutcomes = self._transition(s, a)
                    if len(possibleOutcomes) == 0:
                        # terminal/blocked state: its value is just the reward
                        maxValue = self._reward(s, a, None)
                        continue
                    for sp, prob in possibleOutcomes:
                        actionValue += prob * (self._reward(s, a, sp) + self._discount * previousMatrix[sp])
                    returnQMatrix[s][a] = actionValue
                    maxValue = max(maxValue, actionValue)
                returnMatrix[s] = maxValue
                delta = max(delta, abs(previousMatrix[s] - returnMatrix[s]))
            previousMatrix = returnMatrix
            if delta < threshold:
                break
        return previousMatrix, returnQMatrix
    def policyEvaluation(self, policy, start_v, threshold):
        while True:
            delta = 0.0
            for s in range(len(self._states)):
                v = start_v[s]
                actionValue = 0
                possibleOutcomes = self._transition(s, policy[s])
                if len(possibleOutcomes) == 0:
                    # terminal/blocked state: its value is just the reward
                    actionValue = self._reward(s, policy[s], None)
                for sp, prob in possibleOutcomes:
                    actionValue += prob * (self._reward(s, policy[s], sp) + self._discount * start_v[sp])
                start_v[s] = actionValue
                delta = max(delta, abs(v - start_v[s]))
            if delta < threshold:
                break
    def policyIteration(self, threshold=0.000000001):
        '''initialize the policy and values, then alternate evaluation and improvement'''
        start_v = [random.random() * 100 for _ in self._states]
        start_policy = [random.randint(0, len(self._actions) - 1) for _ in self._states]
        while True:
            policy_stable = True
            self.policyEvaluation(start_policy, start_v, threshold)
            for s in range(len(self._states)):
                old_action = start_policy[s]
                maxValue = -sys.maxsize - 1
                for a in range(len(self._actions)):
                    actionValue = 0
                    possibleOutcomes = self._transition(s, a)
                    for sp, prob in possibleOutcomes:
                        actionValue += prob * (self._reward(s, a, sp) + self._discount * start_v[sp])
                    if maxValue < actionValue:
                        maxValue = actionValue
                        start_policy[s] = a
                if old_action != start_policy[s]:
                    policy_stable = False
            if policy_stable:
                break
        return start_policy, start_v
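Before wiring up the grid, here is a minimal usage sketch of the MDP class above on a hypothetical two-state problem. The names toy_transition and toy_reward and the two-state dynamics are illustrative assumptions, not part of the assignment; the sketch assumes the MDP class above is in scope.

def toy_transition(s, a):
    # state 1 is terminal/absorbing: no outcomes
    if s == 1:
        return []
    # from state 0: action 0 stays in 0, action 1 moves to terminal state 1
    return [(0, 1.0)] if a == 0 else [(1, 1.0)]

def toy_reward(s, a, sp):
    # +1 only for entering the terminal state
    return 1.0 if sp == 1 else 0.0

toy = MDP([0, 1], [0, 1], toy_transition, toy_reward, discount=0.5)
values, qvalues = toy.valueIteration(iterations=100)
policy, v = toy.policyIteration()
print(values)   # expected: roughly [1.0, 0.0]
print(policy)   # expected: action 1 in state 0 (head for the terminal state)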
DEMO CODE POLICY ITERATION 2 (GRID): SAMPLE
from com.ai.mdp.element.mdp import MDP

class GridMDP(object):
    '''
    Grid world wrapper that builds states, actions, transition and reward for the MDP class.
    '''
    def __init__(self, file, noise=0.2, livingReward=0.0):
        self._livingReward = livingReward
        self._noise = noise
        self._states = []
        self._actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]
        self.readFile(file)

    def readFile(self, file):
        with open(file, "r") as f:
            lines = f.readlines()
        self._rows = int(lines[0])
        self._columns = int(lines[1])
        for i in range(2, len(lines)):
            self._states += [int(x) for x in lines[i].split(" ")]
        # print(self._states)
    def transition(self, state, action):
        returnStates = []
        # blocked cells (2) and exit cells (3 / 4) have no outgoing transitions
        if self._states[state] == 2:
            return returnStates
        if self._states[state] == 3 or self._states[state] == 4:
            return returnStates
        stateRow = int(state / self._columns)
        stateColumn = state % self._columns
        # with probability 1 - noise the intended move happens; with noise/2 each,
        # the agent slips to one of the two perpendicular directions
        # ((d**2 + 1) % 2 flips the nonzero coordinate of the action vector)
        possibleActions = [(self._actions[action][0], self._actions[action][1], 1 - self._noise),
                           ((self._actions[action][0]**2 + 1) % 2, (self._actions[action][1]**2 + 1) % 2, self._noise / 2.0),
                           (((self._actions[action][0]**2 + 1) % 2) * -1, ((self._actions[action][1]**2 + 1) % 2) * -1, self._noise / 2.0)]
        for pa in possibleActions:
            # stay on the grid and out of blocked cells; otherwise bounce back to the current state
            if stateRow + pa[0] >= 0 and \
               stateRow + pa[0] < self._rows and \
               stateColumn + pa[1] >= 0 and \
               stateColumn + pa[1] < self._columns and \
               self._states[int((stateRow + pa[0]) * self._columns + (stateColumn + pa[1]))] != 2:
                returnStates.append(((stateRow + pa[0]) * self._columns + (stateColumn + pa[1]), pa[2]))
            else:
                returnStates.append((state, pa[2]))
        return returnStates
    def reward(self, s, a, sp):
        if self._states[s] == 3:
            return 1.0
        if self._states[s] == 4:
            return -1.0
        return self._livingReward
if __name__ == "__main__":
    grid = GridMDP("grid.txt", livingReward=-2.0)
    mdp = MDP(grid._states, grid._actions, grid.transition, grid.reward, .9)
    v = mdp.policyIteration()
    print(v[0])
    print(v[1])
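The assignment's grid.txt is not included in the question. Judging from readFile, transition and reward above, the file holds the number of rows, the number of columns, and then one line of space-separated cell codes per row, where (inferred from the code) 0 marks a free cell, 2 a blocked cell, 3 a +1 exit and 4 a -1 exit. A purely illustrative 3x4 file in that format might look like:

3
4
0 0 0 3
0 2 0 4
0 0 0 0

With such a file next to the script, policyIteration() returns the greedy action index for each cell (an index into [(-1,0),(1,0),(0,-1),(0,1)]) and the corresponding state values, which is what the two print statements show.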