Reinforcement Learning example

No preview image
1 collaborator

Russ Abbott (Author)
Comments and Questions

Please start the discussion about this model! (You'll first need to log in.)
Click to Run Model
;; Adapted from a model by Joe Roop: http://ccl.northwestern.edu/netlogo/models/community/Reinforcement Learning Maze
;; Copyright Russ Abbott (Russ.Abbott@gmail.com)
;; This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License.
;; To view a copy of the license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode.

;; Per-patch state used by the Q-learning walker.
patches-own [
  Qlist    ;; four Q-values for this patch, index-aligned with Hlist (west north east south); initialised to [0 0 0 0] in setup-maze
  reward   ;; reward a walker receives when it steps toward this patch (read via patch-ahead in episode-path); assigned in set-rewards
  qa-elts  ;; list of the four size-0 turtles whose labels display this patch's Q-values, in the same order as Qlist
]

;; The learning agent(s); setup creates exactly one walker.
breed [walkers walker]

;; NOTE(review): no procedure visible in this file creates qa-labels; the Q-value
;; and reward label turtles are sprouted as plain turtles. Confirm whether this breed is still needed.
breed [qa-labels qa-label]

;; Model-wide state. All of these are assigned in setup / set-maze-elements.
globals [
  episode      ;; number of the episode about to run; set to 1 in setup, incremented by one-trip
  goal-color   ;; pcolor that marks the goal patch (orange + 3)
  goal-patch   ;; the patch the walker is trying to reach (patch 4 4)
  start-patch  ;; the patch every episode starts from (patch -4 -4)
  Hlist        ;; the four headings as a list: (west north east south); positions match Qlist
  north        ;; heading 0
  east         ;; heading 90
  south        ;; heading 180
  west         ;; heading 270
]

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Reset the model: define the compass headings, build the maze,
;; place a single walker on the start patch and select the reward plot.
to setup
  clear-all

  ;; Compass headings, and the action list that Qlist is indexed against.
  set west 270
  set north 0
  set east 90
  set south 180
  set Hlist (list west north east south)

  ;; setup-maze assigns start-patch, so it must run before the walker is created.
  setup-maze

  create-walkers 1 [
    set shape "bug"
    set color red + 1
    set size 0.8
    move-to start-patch
    set heading 45
  ]

  set episode 1
  set-current-plot "Ave Reward Per Episode"
end 

;; Give every patch an all-zero Qlist and four invisible (size 0) turtles whose
;; labels show the Q-values near the west, north, east and south edges of the
;; patch, then lay out the maze itself.
to setup-maze
  ask patches [
    set Qlist [0 0 0 0]
    set qa-elts []
    ;; One (heading distance) pair per label, in Hlist order. Each sprouted
    ;; turtle takes the next pair off the front, so qa-elts ends up in the
    ;; same order as Qlist regardless of which turtle runs first.
    let pending-offsets (list (list west 0.25) (list north 0.35) (list east 0.3) (list south 0.40))
    sprout 4 [
      set size 0
      ;; Common nudge: 0.1 east and 0.1 south of the patch centre.
      set heading east
      fd 0.1
      set heading south
      fd 0.1
      ;; Claim the next offset and move out toward that edge.
      let offset first pending-offsets
      set pending-offsets but-first pending-offsets
      set heading item 0 offset
      fd item 1 offset
      set label 0
      set qa-elts lput self qa-elts
    ]
  ]
  set-maze-elements
end 

;; Paint the maze: checkerboard floor, start and goal patches, blue walls with
;; one passage through the middle wall, and per-patch rewards. Finally hide the
;; Q-value labels of patches a walker can never act from (walls and the goal).
to set-maze-elements
  clear-drawing
  ask patches [
    set pcolor default-color self
  ]

  set start-patch patch -4 -4
  ask start-patch [ set pcolor black ]

  set goal-patch patch 4 4
  set goal-color orange + 3
  ask goal-patch [ set pcolor goal-color ]

  setup-blockades
  make-passage
  set-rewards

  ask patches [
    ;; Decide visibility from the OWNING patch's colour. The label turtles are
    ;; positioned near (the south one exactly on) the patch edge, so reading
    ;; pcolor from inside the turtle could pick up a neighbouring patch.
    let hide-labels? ((pcolor = blue) or (pcolor = goal-color))
    foreach qa-elts [
      qa-elt -> ask qa-elt [
        set hidden? hide-labels?
      ]
    ]
  ]
end 

;; Build the blue walls: the outer border, the horizontal wall at pycor = -1,
;; and three fixed obstacle patches, each extended by randomly chosen neighbours
;; (one neighbour for patch -2 -3, two for the others).
to setup-blockades
  ask patches with [
    pycor = -1 or abs pxcor = max-pxcor or abs pycor = max-pycor
  ] [
    set pcolor blue
  ]
  foreach (list patch -1 2 patch 1 1 patch -2 -3) [ seed ->
    let extra-count 2
    if seed = patch -2 -3 [ set extra-count 1 ]
    ask seed [
      set pcolor blue
      ask n-of extra-count neighbors [ set pcolor blue ]
    ]
  ]
end 

;; Open a gap in the horizontal wall (pycor = -1): pick a wall patch at least
;; three columns in from either side border, clear it together with the patches
;; directly above and below it, and clear one of the two patches diagonally
;; above it.
to make-passage
  let candidates patches with [pycor = -1 and pxcor > (- max-pxcor + 3) and pxcor < max-pxcor - 3]
  ask one-of candidates [
    let gap-x pxcor
    let gap-y pycor
    set pcolor default-color self
    ;; Same column, one row above and one row below the gap.
    ask patches with [pxcor = gap-x and abs (pycor - gap-y) = 1] [
      set pcolor default-color self
    ]
    ;; One of the two patches in the row above, one column to either side.
    ask one-of patches with [pycor = gap-y + 1 and abs (pxcor - gap-x) = 1] [
      set pcolor default-color self
    ]
  ]
end 

;; Assign each patch its reward (boundary-reward for blue patches, goal-reward
;; for the goal, base-reward otherwise), sprout a size-0 turtle on every patch
;; whose label shows that reward, and shift the west and east Q-value labels a
;; further 0.1 south.
to set-rewards
  ask patches [
    ifelse pcolor = blue
      [ set reward boundary-reward ]
      [ set reward base-reward ]
  ]
  ask goal-patch [ set reward goal-reward ]
  ask patches [
    sprout 1 [
      set size 0
      ;; Black text on the light goal patch, light yellow everywhere else.
      ifelse myself = goal-patch
        [ set label-color black ]
        [ set label-color yellow + 2 ]
      set label [reward] of myself
      set heading east
      fd 0.2
    ]
    ;; Items 0 and 2 of qa-elts are the west and east labels.
    ask item 0 qa-elts [ set heading south fd 0.1 ]
    ask item 2 qa-elts [ set heading south fd 0.1 ]
  ]
end 

;; Report the checkerboard floor colour for a-patch: green - 2, lightened by 0.2
;; when pxcor + pycor is odd and darkened by 0.2 when it is even.
to-report default-color [a-patch]
  let base-shade green - 2
  let coord-sum [pxcor + pycor] of a-patch
  ifelse is-odd? coord-sum
    [ report base-shade + 0.2 ]
    [ report base-shade - 0.2 ]
end 

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Run one learning episode, unless num-episodes episodes have already run.
;; Path tracing is switched off for learning runs.
to go
  if not (episode <= num-episodes) [ stop ]
  set trace? false
  one-trip
end 

;; Run a single episode from the start patch, then plot and print the average
;; reward per step and advance the episode counter.
to one-trip
  clear-drawing
  ;; Exploration is disabled while tracing, so the drawn path is the greedy one.
  let explore-fraction exploration-% / 100
  if trace? [ set explore-fraction 0 ]
  let rewards-seen episode-path explore-fraction
  let step-count length rewards-seen
  let mean-reward (sum rewards-seen) / step-count
  plot mean-reward
  output-print (word " " episode "; path-length: " step-count "; avg-reward: " precision mean-reward 2)
  set episode episode + 1
end 

;; Walk every walker from the start patch until it leaves the floor (i.e. steps
;; onto a wall or the goal), updating Q-values as it goes.
;;   explore-% : probability (0..1) of taking a random heading instead of a
;;               greedy one at each step.
;; Reports the list of rewards collected, one entry per step taken.
to-report episode-path [explore-%]
  let rewards-seen []
  ask walkers [
    pen-up
    move-to start-patch
    if trace? [
      pen-down
      set pen-size 3
    ]
    ;; Keep going while standing on ordinary floor or on the start patch.
    while [pcolor = default-color patch-here or patch-here = start-patch] [
      let action-index 0
      ifelse random-float 1 < explore-% [
        ;; Explore: random heading, then look up its index in Hlist.
        set heading one-of Hlist
        set action-index position heading Hlist
      ] [
        ;; Exploit: the best Q-value may occur more than once; break ties at random.
        let best-q max Qlist
        set action-index one-of all-positions best-q Qlist
        set heading item action-index Hlist
      ]

      ;; Reward and best future value both come from the patch about to be entered.
      let next-patch patch-ahead 1
      let step-reward [reward] of next-patch
      set rewards-seen lput step-reward rewards-seen
      let best-next-q max [Qlist] of next-patch

      ;; Q-learning update, with weight as the learning rate and gamma as the
      ;; discount factor; the stored value is rounded to 3 decimals.
      let old-q item action-index Qlist
      let new-q precision ((1 - weight) * old-q + weight * (step-reward + gamma * best-next-q)) 3
      set Qlist replace-item action-index Qlist new-q

      ;; Refresh the four on-screen Q labels of the current patch.
      (foreach qa-elts Qlist [ [label-turtle q-value] ->
        ask label-turtle [ set label precision q-value 1 ]
      ])
      fd 1
    ]
  ]
  report rewards-seen
end 

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Report the zero-based indices, in ascending order, of every occurrence of
;; elt in a-list (an empty list when there is none).
to-report all-positions [elt a-list]
  report filter [ idx -> (item idx a-list) = elt ] range (length a-list)
end 

;; Same as all-positions, but with every reported index offset by n
;; (the index assigned to the first element of a-list).
to-report all-positions' [elt a-list n]
  report map [ idx -> idx + n ] (all-positions elt a-list)
end 

;; Report true when n leaves remainder 1 on division by 2.
to-report is-odd? [n]
  let parity n mod 2
  report parity = 1
end 

;; Join the elements of a-list into a single string, placing sep between
;; consecutive elements.
;;   a-list : list of values; each is converted with word
;;   sep    : separator inserted between elements
;; Reports "" for an empty list (reduce would raise a runtime error on it) and
;; always reports a string, including for a one-element list.
to-report list-to-string [a-list sep]
  if empty? a-list [ report "" ]
  ;; Convert the first element up front so the result is a string even when
  ;; there is nothing to fold.
  let seeded fput (word first a-list) but-first a-list
  report reduce [ [so-far next] -> (word so-far sep next) ] seeded
end 

;; Report the element at index 1 (the second element) of a-list.
to-report second [a-list]
  report item 1 a-list
end
There is only one version of this model, created over 7 years ago by Russ Abbott.
Attached files

File	Type	Description	Last updated
2018-04-01_21-30-12.png	png	Model image	over 7 years ago, by Russ Abbott	Download
This model does not have any ancestors.
This model does not have any descendants.
NetLogo

Reinforcement Learning example

1 collaborator

Close

Tags

Close

Comments and Questions

Attached files