ValueIteration coding project, reinforcement learning

The detailed instructions are in the PDF file (reproduced below under "Additional Instructions").


Code given:


# rewardTable.py (provided)

def createRewardTable(transitionTable, normalCost, trapDict, bonusDict):

  # Start every (s, a, s') entry at the normal step cost; entries whose next
  # state is a trap or a bonus are overwritten below.
  rewardTable={s: {action: {sPrime: normalCost for sPrime in transitionTable[s][action].keys()}
                   for action in transitionTable[s].keys()}
               for s in transitionTable.keys()}

  for s in rewardTable.keys():

    for a in rewardTable[s].keys():

      for sPrime in trapDict.keys():

        if sPrime in rewardTable[s][a]:

          rewardTable[s][a][sPrime]=trapDict[sPrime]

      for sPrime in bonusDict.keys():

        if sPrime in rewardTable[s][a]:

          rewardTable[s][a][sPrime]=bonusDict[sPrime]

  return rewardTable
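
A quick illustrative check of createRewardTable (the toy transition table below is made up for this example, not part of the assignment): with a single state whose only action can land on a normal cell or a trap cell, the normal cell keeps normalCost and the trap cell gets the trap reward.

exampleTransitionTable = {(0, 0): {(1, 0): {(1, 0): 0.8, (0, 1): 0.2}}}
exampleRewardTable = createRewardTable(exampleTransitionTable, -0.04, {(0, 1): -1}, {})
assert exampleRewardTable == {(0, 0): {(1, 0): {(1, 0): -0.04, (0, 1): -1}}}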


# Main file (to be completed and submitted as valueIteration_YourLastName_YourFirstName.py)

import numpy as np

import drawHeatMap as hm

import rewardTable as rt

import transitionTable as tt


##################################################

# Your code here

################################################## 
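
A minimal sketch of functions that could fill this section, assuming the {s: {a: {s': p}}} transition format and {s: {a: {s': r}}} reward format described under "Additional Instructions" below. The names getExpectedUtility, runValueIteration, and getPolicy are illustrative, not prescribed by the assignment.

def getExpectedUtility(s, action, transitionTable, rewardTable, V, gamma):
  # Q(s, a) = sum over s' of P(s' | s, a) * (R(s, a, s') + gamma * V(s'))
  return sum(probability * (rewardTable[s][action][sPrime] + gamma * V[sPrime])
             for sPrime, probability in transitionTable[s][action].items())


def runValueIteration(transitionTable, rewardTable, V, gamma, convergenceTolerance):
  # Repeat Bellman backups until the largest change in any state's value is below
  # convergenceTolerance. Only the states present in transitionTable (the normal
  # states) are updated, so the wall and terminal states keep their initial value.
  delta = float('inf')
  while delta > convergenceTolerance:
    delta = 0
    for s in transitionTable.keys():
      newValue = max(getExpectedUtility(s, action, transitionTable, rewardTable, V, gamma)
                     for action in transitionTable[s].keys())
      delta = max(delta, abs(newValue - V[s]))
      V[s] = newValue
  return V


def getPolicy(transitionTable, rewardTable, V, gamma, roundingTolerance):
  # Keep every action whose expected utility is within roundingTolerance of the
  # best one, and split the probability evenly among the tied actions.
  policy = {}
  for s in transitionTable.keys():
    expectedUtilities = {action: getExpectedUtility(s, action, transitionTable, rewardTable, V, gamma)
                         for action in transitionTable[s].keys()}
    maxUtility = max(expectedUtilities.values())
    bestActions = [action for action, utility in expectedUtilities.items()
                   if abs(utility - maxUtility) < roundingTolerance]
    policy[s] = {action: 1 / len(bestActions) for action in bestActions}
  return policy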



def main():

   

  minX, maxX, minY, maxY=(0, 3, 0, 2)

  convergenceTolerance = 1e-7

  roundingTolerance = 1e-7

  gamma = 0.8

   

  possibleAction=[(0,1), (0,-1), (1,0), (-1,0)]

  possibleState=[(i,j) for i in range(maxX+1) for j in range(maxY+1)]

  V={s:0 for s in possibleState}

   

  normalCost=-0.04

  trapDict={(3,1):-1}

  bonusDict={(3,0):1}

  blockList=[(1,1)]

   

  p=0.8

  transitionProbability={'forward':p, 'left':(1-p)/2, 'right':(1-p)/2, 'back':0}

  # Drop moves with zero probability (here, 'back').
  transitionProbability={move: prob for move, prob in transitionProbability.items() if prob != 0}

   

  transitionTable=tt.createTransitionTable(minX, minY, maxX, maxY, trapDict, bonusDict, blockList, possibleAction, transitionProbability)

  rewardTable=rt.createRewardTable(transitionTable, normalCost, trapDict, bonusDict)

   

  """

  levelsReward = ["state", "action", "next state", "reward"]

  levelsTransition = ["state", "action", "next state", "probability"]

   

  viewDictionaryStructure(transitionTable, levelsTransition)

  viewDictionaryStructure(rewardTable, levelsReward)

  """



##################################################

# Your code here

################################################## 
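
  # One possible wiring for the sketched helpers defined above; runValueIteration
  # and getPolicy are assumed names from that sketch, not part of the starter code.
  V = runValueIteration(transitionTable, rewardTable, V, gamma, convergenceTolerance)
  policy = getPolicy(transitionTable, rewardTable, V, gamma, roundingTolerance)
  print(V)
  print(policy)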



  hm.drawFinalMap(V, policy, trapDict, bonusDict, blockList, normalCost)


   

   

   

if __name__=='__main__': 

  main()
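
The commented-out block in main calls viewDictionaryStructure, which the assignment mentions but which is not included in this code dump. A minimal sketch of such a helper, assuming it simply pretty-prints the nested tables using the supplied level names:

def viewDictionaryStructure(d, levels, indent=0):
  # Recursively print a nested dictionary, labelling each depth with the matching
  # entry from `levels`, e.g. ["state", "action", "next state", "probability"].
  for key, value in d.items():
    if isinstance(value, dict):
      print("\t" * indent + f"{levels[indent]}: {key}")
      viewDictionaryStructure(value, levels, indent + 1)
    else:
      print("\t" * indent + f"{levels[indent]}: {key}, {levels[indent + 1]}: {value}")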



# transitionTable.py (provided)

def transitionFull(s, move, minX, minY, maxX, maxY, blockList):

  x, y=s

  dx, dy=move

  def boundary(x, minX, maxX):

    return max(minX, min(x, maxX))

  sPrimeConsideringBoundary=(boundary(x+dx, minX, maxX), boundary(y+dy, minY, maxY))

  def blocking(sPrime, blockList):

    if sPrime in blockList:

      return s

    else:

      return sPrime

  sPrime=blocking(sPrimeConsideringBoundary, blockList)

  return sPrime
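
An illustrative check of transitionFull on the 4x3 grid used in this project (these asserts are an example, not part of the starter code): bumping into the grid boundary or the wall cell at (1, 1) leaves the agent where it started.

assert transitionFull((0, 0), (-1, 0), 0, 0, 3, 2, [(1, 1)]) == (0, 0)  # off the grid: no movement
assert transitionFull((1, 0), (0, 1), 0, 0, 3, 2, [(1, 1)]) == (1, 0)   # into the wall: no movement
assert transitionFull((0, 0), (1, 0), 0, 0, 3, 2, [(1, 1)]) == (1, 0)   # ordinary move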


def createTransitionTable(minX, minY, maxX, maxY, trapDict, bonusDict, blockList, possibleAction, transitionProbability):

     

  possibleState=[(i,j) for i in range(minX, maxX+1) for j in range(minY, maxY+1)]

   

  for trap in trapDict.keys():

    possibleState.remove(trap)

  for bonus in bonusDict.keys():

    possibleState.remove(bonus)

  for block in blockList:

    possibleState.remove(block)

     

  # For each relative move (forward / slip left / slip right / back), map the
  # intended action to the displacement actually taken.
  moves={'forward':{(1,0):(1,0),  (0,-1):(0,-1), (-1,0):(-1,0), (0,1):(0,1)},
         'left':   {(1,0):(0,-1), (0,-1):(-1,0), (-1,0):(0,1),  (0,1):(1,0)},
         'right':  {(1,0):(0,1),  (0,-1):(1,0),  (-1,0):(0,-1), (0,1):(-1,0)},
         'back':   {(1,0):(-1,0), (0,-1):(0,1),  (-1,0):(1,0),  (0,1):(0,-1)}}

   

  def transition(s, move):

    return transitionFull(s, move, minX, minY, maxX, maxY, blockList)

   

  def transitionFunction(s, action, sPrime, transitionProbability, moves):

    moveDictionary={moves[move][action]:transitionProbability[move] for move in transitionProbability.keys()}

    sPrimeProbability=sum([p for move, p in moveDictionary.items() if transition(s, move)==sPrime])

    return sPrimeProbability

   

  # First enumerate, for each (s, action), the reachable next states s' together
  # with the probability of the move that reaches them.
  emptyTransitionTable={s: {action: {transition(s, moves[move][action]): transitionProbability[move]
                                     for move in transitionProbability.keys()}
                            for action in possibleAction}
                        for s in possibleState}

  # Then, for each reachable s', sum the probabilities of all moves that land there
  # (several moves can map to the same s' because of walls and boundaries).
  transitionTable={s: {action: {sPrime: transitionFunction(s, action, sPrime, transitionProbability, moves)
                                for sPrime in emptyTransitionTable[s][action].keys()}
                       for action in possibleAction}
                   for s in possibleState}

  return transitionTable


# drawHeatMap.py (provided)

import numpy as np

import seaborn as sb 

import matplotlib.pyplot as plt

import matplotlib.animation as animation


def drawHeatMap(V, policy, vmin, vmax, trapDict, bonusDict, blockList):

  VPlot=V.copy()

  for bonus, bonusReward in bonusDict.items():

    VPlot[bonus]=bonusReward

  for trap, trapCost in trapDict.items():

    VPlot[trap]=trapCost

  x, y, v=([x for (x, y), v in VPlot.items()], [y for (x, y), v in VPlot.items()], [v for (x, y), v in VPlot.items()])

  maxX, maxY=(max(x)+1, max(y)+1)

  label=[str(round(value,3)) for key,value in VPlot.items()]

  label, v=(np.array(label).reshape(maxX,maxY).transpose(), np.array(v).reshape(maxX,maxY).transpose())

  mask=np.array([(vi in blockList) for vi in V.keys()]).reshape(maxX,maxY).transpose()

  heatMap=sb.heatmap(v, annot=label, fmt="", cmap='RdYlGn', linewidths=0.30, vmin=vmin, vmax=vmax, center=0, mask=mask)

  for trap in trapDict.keys():

    xTrap, yTrap=trap

    plt.arrow(xTrap, yTrap, 1, 0, fc="r", ec="r", head_width=0.001, head_length=0.001)

    plt.arrow(xTrap, yTrap, 0, 1, fc="r", ec="r", head_width=0.001, head_length=0.001)

    plt.arrow(xTrap+1, yTrap, 0, 1, fc="r", ec="r", head_width=0.001, head_length=0.001)

    plt.arrow(xTrap, yTrap+1, 1, 0, fc="r", ec="r", head_width=0.001, head_length=0.001)

  for bonus in bonusDict.keys():

    xBonus, yBonus=bonus

    plt.arrow(xBonus, yBonus, 1, 0, fc="y", ec="y", head_width=0.001, head_length=0.001)

    plt.arrow(xBonus, yBonus, 0, 1, fc="y", ec="y", head_width=0.001, head_length=0.001)

    plt.arrow(xBonus+1, yBonus, 0, 1, fc="y", ec="y", head_width=0.001, head_length=0.001)

    plt.arrow(xBonus, yBonus+1, 1, 0, fc="y", ec="y", head_width=0.001, head_length=0.001)

  for s in [s for s in V.keys() if s not in list(trapDict.keys())+list(bonusDict.keys())+blockList]:

    x, y=s

    actions=policy[s].keys()

    for action in actions:

      plt.arrow(x+0.8, y+0.8, action[0]/10, action[1]/10, fc="k", ec="k", head_width=0.06, head_length=0.06)

  return heatMap



def drawFinalMap(V, policy, trapDict, bonusDict, blockList, normalCost):

  vmin=min([min(V.values())]+list(trapDict.values())+list(bonusDict.values()))

  vmax=max([max(V.values())]+list(trapDict.values())+list(bonusDict.values()))

  fig, ax=plt.subplots(figsize=(12,7))

  title=f"Value Map: R={normalCost}"

  plt.title(title, fontsize=18)

  ttl=ax.title

  ttl.set_position([0.5, 1.05])

  drawHeatMap(V, policy, vmin, vmax, trapDict, bonusDict, blockList)

  plt.savefig(f'valueIterationHeatMap_R={normalCost}.jpg')


def createAnimation(VRecord, policyRecord, trapDict, bonusDict, blockList, normalCost):

  vmin=min([min(V.values()) for V in VRecord]+list(trapDict.values())+list(bonusDict.values()))

  vmax=max([max(V.values()) for V in VRecord]+list(trapDict.values())+list(bonusDict.values()))

  def animate(i):

    fig.clear()

    title=f"Value Map: Round {i}, R={normalCost}"

    plt.title(title, fontsize=18)

    ttl=ax.title

    ttl.set_position([0.5, 1.05])

    heatmap=drawHeatMap(VRecord[i], policyRecord[i], vmin, vmax, trapDict, bonusDict, blockList)

    return heatmap

  fig, ax=plt.subplots(figsize=(12,7))

  ani = animation.FuncAnimation(fig, animate, len(VRecord))

  ani.save('valueIteration.gif', writer='pillow')



Additional Instructions:

Stats 115 20W Final Project, March 6, 2020

In this final project you are going to implement the value iteration algorithm on page 67 of the book Reinforcement Learning: An Introduction (Second Edition).

Settings

We are using a setting similar to (but not the same as!) the setting for Figure 17.1 on page 646 of Artificial Intelligence: A Modern Approach (Third Edition): "A simple 4×3 environment that presents the agent with a sequential decision problem. The 'intended' outcome occurs with probability 0.8, but with probability 0.2 the agent moves at right angles to the intended direction. A collision with a wall results in no movement. The two terminal states have reward +1 and -1, respectively, and all other states have a reward of -0.04." (Artificial Intelligence: A Modern Approach (Third Edition), p. 646)

Input information

The information we provide includes:

1) transitionTable. Place the transitionTable.py file in the same folder as your main file and run the transitionTable line in the main function. You can use the viewDictionaryStructure function to look at the transitionTable, which is in the format {s : {a : {s' : P(Result(s, a) = s' | a)}}}. Some special states: (1, 1) is a wall; you cannot start from or move to the wall state. (3, 0) and (3, 1) are terminals; you can move to a terminal state, but you cannot start from one.

2) rewardTable. Place the rewardTable.py file in the same folder as your main file and run the rewardTable line in the main function. You can use the viewDictionaryStructure function to look at the rewardTable, which is in the format {s : {a : {s' : R(s, a, s')}}}. Rewards are determined by the state you end up in: moving to a normal state (not the wall, not a terminal) gives a reward of -0.04, moving to the positive terminal (3, 0) gives +1, and moving to the negative terminal (3, 1) gives -1. These rewards differ from the rewards in Figure 17.1, so your result will not match the example in the book (e.g. Figure 17.3).

3) V. V is initialized to 0 for all states. When you update V in each iteration step, only change the values of the normal states; do not change the value of the wall state or the terminal states.

4) convergenceTolerance = 1×10^-7. It represents the θ in the algorithm.

5) roundingTolerance = 1×10^-7. It is used when you determine the policy from the expected utilities: if the difference in expected utility between two actions is smaller than roundingTolerance, the two expected utilities are considered equal.

6) gamma = 0.8. It represents the γ in the algorithm.

Expected output

The output you provide should include:

1) A dictionary V giving the value of each state, in the format {s : V(s)}.

2) A dictionary policy. Each key is a state, and the corresponding value is a dictionary whose keys are the actions that maximize the expected utility and whose values are the corresponding probabilities. It is in the format {s : {a : π(a|s)}}, where π(a|s) = 1 / (number of maximizing actions), a = argmax_a Q(a|s), and Q(a|s) is the expected utility of taking action a from that state. For example, if the expected utilities at state (0,1) are {(0,1): 1, (1,0): 2, (0,-1): -1, (-1,0): 2}, the corresponding part of the policy dictionary should be {(0,1): {(1,0): 0.5, (-1,0): 0.5}, ...}.

3) A heatmap (as in homework 4). You should save a .jpg file based on your V and policy.

Submission

Please submit a completed valueIteration_YourLastName_YourFirstName.py file and a testValueIteration_YourLastName_YourFirstName.py file on CCLE before the deadline. Please submit two separate files; do not zip them! The final project is due Friday, 03/20/2020 at 11:59pm.

Grading rubric

1. Coding style (40 points). You should show good coding style in the following four aspects.
(1) Clear naming (10 points). Names of the functions and variables you define should be clear and meaningful so that other readers can easily understand them.
(2) Proper function length (10 points). None of the functions you define should exceed 15 lines.
(3) Clear function input and output (10 points). Avoid using global variables in your functions; every variable a function uses should be passed in through its input arguments.
(4) Avoid magic numbers (10 points).

2. Unit tests (30 points). You should write a unit test for each function you define outside the main function. If you write 0 < n ≤ 6 functions, each function and unit test pair is worth 30/n points. If you write n > 6 functions, each function and unit test pair you get wrong costs a 5-point deduction.

3. The final output (30 points). You should write the iteration part of value iteration and (1) print the final value of each state, V (10 points), (2) print the final policy (10 points), and (3) save the heatmap of the final value and policy (10 points).
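
To make the tie-breaking rule in item 2 of "Expected output" concrete, here is a small self-contained snippet that reproduces the worked example above (the helper name extractPolicyForState is illustrative, not required by the assignment):

roundingTolerance = 1e-7

def extractPolicyForState(expectedUtilities, roundingTolerance):
  # Keep every action within roundingTolerance of the best expected utility and
  # split the probability evenly among them.
  maxUtility = max(expectedUtilities.values())
  bestActions = [a for a, q in expectedUtilities.items() if abs(q - maxUtility) < roundingTolerance]
  return {a: 1 / len(bestActions) for a in bestActions}

expectedUtilitiesAtState = {(0, 1): 1, (1, 0): 2, (0, -1): -1, (-1, 0): 2}
print(extractPolicyForState(expectedUtilitiesAtState, roundingTolerance))
# {(1, 0): 0.5, (-1, 0): 0.5}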
