# Evolve a control/reward estimation network for the OpenAI Gym
# LunarLander-v2 environment (https://gym.openai.com/envs/LunarLander-v2).
# This is a work in progress, and currently takes ~100 generations to
# find a network that can land with a score >= 200 at least a couple of
# times. It has yet to solve the environment, which could have something
# to do with me being totally clueless in regard to reinforcement learning. :)

from __future__ import print_function

import gym
import gym.wrappers

import matplotlib.pyplot as plt

import neat
import numpy as np
import os
import pickle
import random

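# visualize.py is the plotting helper bundled with the neat-python examples;
# it is assumed to be available alongside this script.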
import visualize

env = gym.make('LunarLander-v2')

print("action space: {0!r}".format(env.action_space))
print("observation space: {0!r}".format(env.observation_space))

# Limit episodes to 400 time steps to cut down on training time.
# 400 steps is more than enough time to land with a winning score.
print(env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps'))
env.spec.tags['wrapper_config.TimeLimit.max_episode_steps'] = 400
print(env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps'))

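# The Monitor wrapper writes episode results (and, by default, periodic
# videos) to the 'results' directory.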
env = gym.wrappers.Monitor(env, 'results', force=True)

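# Reward-shaping constants: 'discounted_reward' is the per-step discount
# factor, and 'min_reward'/'max_reward' are the bounds used below to rescale
# discounted rewards into [-1, 1].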
discounted_reward = 0.9
min_reward = -200
max_reward = 200

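# Per-generation (min, mean, max) raw episode scores, collected for plotting.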
score_range = []

def eval_fitness_shared(genomes, config):
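    # Each genome is scored by the total reward collected in one episode, minus
    # a penalty (applied at the end of this function) for how poorly its
    # outputs predict the normalized discounted rewards over a sample of
    # episodes.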
    nets = []
    for gid, g in genomes:
        nets.append((g, neat.nn.FeedForwardNetwork.create(g, config)))
        g.fitness = []

    episodes = []
    scores = []
    for genome, net in nets:
        observation = env.reset()
        episode_data = []
        j = 0
        total_score = 0.0
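        # The network emits one value per action; the greedy (argmax) action is
        # taken at every step. If no network is available, a random action is
        # sampled instead.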
        while 1:
            if net is not None:
                output = net.activate(observation)
                action = np.argmax(output)
            else:
                action = env.action_space.sample()

            observation, reward, done, info = env.step(action)
            total_score += reward
            episode_data.append((j, observation, action, reward))

            if done:
                break

            j += 1

        episodes.append(episode_data)
        scores.append(total_score)
        genome.fitness = total_score

    if scores:
        score_range.append((min(scores), np.mean(scores), max(scores)))

    # Compute discounted rewards.
    discounted_rewards = []
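    # For each episode, build an upper-triangular matrix D with
    # D[j, j + i] = discounted_reward ** i, so that np.dot(D, rewards)[j] is
    # the discounted return from step j to the end of the episode.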
    for episode in episodes:
        rewards = np.array([reward for j, observation, action, reward in episode])
        N = len(episode)
        # Built-in sum is used here; np.sum over a generator is deprecated.
        D = sum(np.eye(N, k=i) * discounted_reward ** i for i in range(N))
        discounted_rewards.append(np.dot(D, rewards))

    print(min(map(np.min, discounted_rewards)), max(map(np.max, discounted_rewards)))

    # Normalize rewards
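    # Map each discounted reward linearly from [min_reward, max_reward] onto
    # [-1, 1] so it can be compared against the network outputs below.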
    for i in range(len(discounted_rewards)):
        discounted_rewards[i] = 2 * (discounted_rewards[i] - min_reward) / (max_reward - min_reward) - 1.0

    print(min(map(np.min, discounted_rewards)), max(map(np.max, discounted_rewards)))

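    # Sample 10 episodes at random and penalize each genome by 100 times the
    # mean squared error between its output for the action taken and the
    # normalized discounted reward observed at that step.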
    episode_filter = [random.randint(0, len(episodes) - 1) for _ in range(10)]
    for genome, net in nets:
        reward_error = []
        for i in episode_filter:
            episode = episodes[i]
            discount_reward = discounted_rewards[i]
            for (j, observation, action, reward), dr in zip(episode, discount_reward):
                #test_set.append((observation, action, reward, dr))
                output = net.activate(observation)
                reward_error.append((output[action] - dr) ** 2)

        print(genome.fitness, np.mean(reward_error))
        genome.fitness -= 100 * np.mean(reward_error)


def run():
    # Load the config file, which is assumed to live in
    # the same directory as this script.
    local_dir = os.path.dirname(__file__)
    config_path = os.path.join(local_dir, 'config')
    config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                         neat.DefaultSpeciesSet, neat.DefaultStagnation,
                         config_path)

    pop = neat.Population(config)
    stats = neat.StatisticsReporter()
    pop.add_reporter(stats)
    pop.add_reporter(neat.StdOutReporter())
    # Checkpoint every 10 generations or 900 seconds.
    pop.add_reporter(neat.Checkpointer(10, 900))

    # Run until the winner from a generation is able to solve the environment.
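    # Each pass below evolves the population for one generation, plots progress,
    # and then test-drives the generation winner for up to 100 episodes; the run
    # stops once all 100 test episodes score at least 200.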
    while 1:
        winner = pop.run(eval_fitness_shared, 1)

        visualize.plot_stats(stats, ylog=False, view=False, filename="fitness.svg")

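        # Plot the per-generation minimum (red), mean (blue) and maximum (green)
        # raw episode scores collected so far.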
        if score_range:
            S = np.array(score_range).T
            plt.plot(S[0], 'r-')
            plt.plot(S[1], 'b-')
            plt.plot(S[2], 'g-')
            plt.grid()
            plt.savefig("score-ranges.svg")
            plt.close()

        mfs = sum(stats.get_fitness_mean()[-5:]) / 5.0
        print("Average mean fitness over last 5 generations: {0}".format(mfs))

        mfs = sum(stats.get_fitness_stat(min)[-5:]) / 5.0
        print("Average min fitness over last 5 generations: {0}".format(mfs))

        winner_net = neat.nn.FeedForwardNetwork.create(winner, config)

        for k in range(100):
            observation = env.reset()
            score = 0
            while 1:
                output = winner_net.activate(observation)
                observation, reward, done, info = env.step(np.argmax(output))
                score += reward
                env.render()
                if done:
                    break
            print(k, score)
            if score < 200:
                break
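        # This 'else' belongs to the for-loop above: it runs only when all 100
        # test episodes finish without hitting the 'break' (i.e. every episode
        # scored at least 200), in which case the environment is considered
        # solved and the outer loop ends.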
        else:
            print("Solved.")
            break

    winner = stats.best_genome()
    print(winner)

    # Save the winner.
    with open('winner.pickle', 'wb') as f:
        pickle.dump(winner, f)

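    # Plot the training statistics and render the winning network's topology
    # at several levels of detail.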
    visualize.plot_stats(stats, ylog=False, view=True, filename="fitness.svg")
    visualize.plot_species(stats, view=True, filename="speciation.svg")

    visualize.draw_net(config, winner, True)

    visualize.draw_net(config, winner, view=True, filename="winner-net.gv")
    visualize.draw_net(config, winner, view=True, filename="winner-net-enabled.gv",
                       show_disabled=False)
    visualize.draw_net(config, winner, view=True, filename="winner-net-enabled-pruned.gv",
                       show_disabled=False, prune_unused=True)


if __name__ == '__main__':
    run()