
Commit a53f53f

Added an example that uses OpenAI Gym's LunarLander-v2 environment.
1 parent ee83edb commit a53f53f

3 files changed: +449 -0 lines changed

examples/openai-lander/config

Lines changed: 62 additions & 0 deletions
# neat-python configuration for the LunarLander-v2 environment on OpenAI Gym

[NEAT]
pop_size = 200
max_fitness_threshold = 1000.0
reset_on_extinction = 0

[DefaultGenome]
num_inputs = 8
num_hidden = 0
num_outputs = 4
initial_connection = full
feed_forward = True
compatibility_disjoint_coefficient = 1.0
compatibility_weight_coefficient = 1.0
conn_add_prob = 0.15
conn_delete_prob = 0.1
node_add_prob = 0.15
node_delete_prob = 0.1
activation_default = tanh
activation_options = tanh clamped gauss hat sin
activation_mutate_rate = 0.05
aggregation_default = sum
aggregation_options = sum
aggregation_mutate_rate = 0.0
bias_init_mean = 0.0
bias_init_stdev = 1.0
bias_replace_rate = 0.02
bias_mutate_rate = 0.8
bias_mutate_power = 0.4
bias_max_value = 30.0
bias_min_value = -30.0
response_init_mean = 1.0
response_init_stdev = 0.0
response_replace_rate = 0.0
response_mutate_rate = 0.1
response_mutate_power = 0.01
response_max_value = 30.0
response_min_value = -30.0

weight_max_value = 30
weight_min_value = -30
weight_init_mean = 0.0
weight_init_stdev = 1.0
weight_mutate_rate = 0.8
weight_replace_rate = 0.02
weight_mutate_power = 0.4
enabled_default = True
enabled_mutate_rate = 0.01

[DefaultSpeciesSet]
compatibility_threshold = 3.0

[DefaultStagnation]
species_fitness_func = mean
max_stagnation = 15
species_elitism = 5

[DefaultReproduction]
elitism = 2
survival_threshold = 0.2

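The config above fixes the network shape NEAT evolves: num_inputs = 8 matches LunarLander-v2's eight-dimensional observation (position, velocity, angle, angular velocity, and two leg-contact flags) and num_outputs = 4 matches its four discrete actions (do nothing, fire left, main, or right engine). The snippet below is a quick sanity-check sketch, not part of this commit; it parses the file exactly as evolve.py does and assumes the parsed genome settings are exposed as attributes of config.genome_config.

# Sanity-check sketch (not part of this commit): parse the config the same
# way evolve.py below does and confirm the evolved networks will have the
# LunarLander-v2 shape of 8 inputs and 4 outputs.
import neat

config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                     neat.DefaultSpeciesSet, neat.DefaultStagnation,
                     'config')
print(config.genome_config.num_inputs)   # expected: 8
print(config.genome_config.num_outputs)  # expected: 4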
examples/openai-lander/evolve.py

Lines changed: 185 additions & 0 deletions
# Evolve a control/reward estimation network for the OpenAI Gym
# LunarLander-v2 environment (https://gym.openai.com/envs/LunarLander-v2).
# This is a work in progress, and currently takes ~100 generations to
# find a network that can land with a score >= 200 at least a couple of
# times. It has yet to solve the environment, which could have something
# to do with me being totally clueless in regard to reinforcement learning. :)

from __future__ import print_function

import gym
import gym.wrappers

import matplotlib.pyplot as plt

import neat
import numpy as np
import os
import pickle
import random

import visualize

env = gym.make('LunarLander-v2')

print("action space: {0!r}".format(env.action_space))
print("observation space: {0!r}".format(env.observation_space))

# Limit episodes to 400 time steps to cut down on training time.
# 400 steps is more than enough time to land with a winning score.
print(env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps'))
env.spec.tags['wrapper_config.TimeLimit.max_episode_steps'] = 400
print(env.spec.tags.get('wrapper_config.TimeLimit.max_episode_steps'))

env = gym.wrappers.Monitor(env, 'results', force=True)

discounted_reward = 0.9  # per-step discount factor applied to future rewards
min_reward = -200        # bounds used to rescale discounted rewards to [-1, 1]
max_reward = 200

score_range = []


def eval_fitness_shared(genomes, config):
    nets = []
    for gid, g in genomes:
        nets.append((g, neat.nn.FeedForwardNetwork.create(g, config)))
        g.fitness = []

    episodes = []
    scores = []
    for genome, net in nets:
        observation = env.reset()
        episode_data = []
        j = 0
        total_score = 0.0
        while 1:
            if net is not None:
                output = net.activate(observation)
                action = np.argmax(output)
            else:
                action = env.action_space.sample()

            observation, reward, done, info = env.step(action)
            total_score += reward
            episode_data.append((j, observation, action, reward))

            if done:
                break

            j += 1

        episodes.append(episode_data)
        scores.append(total_score)
        genome.fitness = total_score

    if scores:
        score_range.append((min(scores), np.mean(scores), max(scores)))

    # Compute discounted rewards.  D is an upper-triangular matrix whose row j
    # holds the discount factors for steps j, j+1, ..., so D.dot(rewards) is
    # the discounted return from each time step onward.  The builtin sum is
    # used because np.sum does not reliably accept generator expressions.
    discounted_rewards = []
    for episode in episodes:
        rewards = np.array([reward for j, observation, action, reward in episode])
        N = len(episode)
        D = sum(np.eye(N, k=i) * discounted_reward ** i for i in range(N))
        discounted_rewards.append(np.dot(D, rewards))

    print(min(map(np.min, discounted_rewards)), max(map(np.max, discounted_rewards)))

    # Normalize rewards to [-1, 1].
    for i in range(len(discounted_rewards)):
        discounted_rewards[i] = 2 * (discounted_rewards[i] - min_reward) / (max_reward - min_reward) - 1.0

    print(min(map(np.min, discounted_rewards)), max(map(np.max, discounted_rewards)))

    # Penalize each genome by how badly its outputs estimate the normalized
    # discounted rewards on a random sample of ten episodes.
    episode_filter = [random.randint(0, len(episodes)-1) for _ in range(10)]
    for genome, net in nets:
        reward_error = []
        for i in episode_filter:
            episode = episodes[i]
            discount_reward = discounted_rewards[i]
            for (j, observation, action, reward), dr in zip(episode, discount_reward):
                #test_set.append((observation, action, reward, dr))
                output = net.activate(observation)
                reward_error.append((output[action] - dr)**2)

        print(genome.fitness, np.mean(reward_error))
        genome.fitness -= 100 * np.mean(reward_error)


def run():
    # Load the config file, which is assumed to live in
    # the same directory as this script.
    local_dir = os.path.dirname(__file__)
    config_path = os.path.join(local_dir, 'config')
    config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                         neat.DefaultSpeciesSet, neat.DefaultStagnation,
                         config_path)

    pop = neat.Population(config)
    stats = neat.StatisticsReporter()
    pop.add_reporter(stats)
    pop.add_reporter(neat.StdOutReporter())
    # Checkpoint every 10 generations or 900 seconds.
    pop.add_reporter(neat.Checkpointer(10, 900))

    # Run until the winner from a generation is able to solve the environment.
    while 1:
        winner = pop.run(eval_fitness_shared, 1)

        visualize.plot_stats(stats, ylog=False, view=False, filename="fitness.svg")

        if score_range:
            S = np.array(score_range).T
            plt.plot(S[0], 'r-')
            plt.plot(S[1], 'b-')
            plt.plot(S[2], 'g-')
            plt.grid()
            plt.savefig("score-ranges.svg")
            plt.close()

        mfs = sum(stats.get_fitness_mean()[-5:]) / 5.0
        print("Average mean fitness over last 5 generations: {0}".format(mfs))

        mfs = sum(stats.get_fitness_stat(min)[-5:]) / 5.0
        print("Average min fitness over last 5 generations: {0}".format(mfs))

        # Check the current winner against 100 full episodes; stop only if
        # every one of them scores at least 200.
        winner_net = neat.nn.FeedForwardNetwork.create(winner, config)

        for k in range(100):
            observation = env.reset()
            score = 0
            while 1:
                output = winner_net.activate(observation)
                observation, reward, done, info = env.step(np.argmax(output))
                score += reward
                env.render()
                if done:
                    break
            print(k, score)
            if score < 200:
                break
        else:
            print("Solved.")
            break

    winner = stats.best_genome()
    print(winner)

    # Save the winner.
    with open('winner.pickle', 'wb') as f:
        pickle.dump(winner, f)

    visualize.plot_stats(stats, ylog=False, view=True, filename="fitness.svg")
    visualize.plot_species(stats, view=True, filename="speciation.svg")

    visualize.draw_net(config, winner, True)

    visualize.draw_net(config, winner, view=True, filename="winner-net.gv")
    visualize.draw_net(config, winner, view=True, filename="winner-net-enabled.gv",
                       show_disabled=False)
    visualize.draw_net(config, winner, view=True, filename="winner-net-enabled-pruned.gv",
                       show_disabled=False, prune_unused=True)


if __name__ == '__main__':
    run()

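The least obvious step in eval_fitness_shared is the discount matrix D: summing np.eye(N, k=i) * discounted_reward ** i over i produces an upper-triangular matrix whose row j holds 1, gamma, gamma^2, ... starting at column j, so D.dot(rewards) is the discounted return from each time step onward. A small standalone illustration of that reading (not part of the commit, with made-up rewards):

# Standalone illustration of the discount matrix used in eval_fitness_shared.
import numpy as np

gamma = 0.9                                # 'discounted_reward' in evolve.py
rewards = np.array([1.0, 0.0, 2.0, -1.0])  # made-up per-step rewards
N = len(rewards)
D = sum(np.eye(N, k=i) * gamma ** i for i in range(N))
print(D)
# [[1.    0.9   0.81  0.729]
#  [0.    1.    0.9   0.81 ]
#  [0.    0.    1.    0.9  ]
#  [0.    0.    0.    1.   ]]
print(np.dot(D, rewards))
# approximately [1.891  0.99  1.1  -1.0]: the discounted return-to-go
# from each time step.

These per-step returns are what the genome's output for the chosen action is compared against; the penalty 100 * np.mean(reward_error) therefore favors networks whose outputs also track the normalized return, not just networks that happen to score well in their own episode.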