
Commit ba9dc72

Implemented Passive ADP Agent
1 parent c541d31 commit ba9dc72

1 file changed

rl.py

Lines changed: 54 additions & 3 deletions
@@ -3,16 +3,67 @@
 
 from collections import defaultdict
 from utils import argmax
+from mdp import MDP, policy_evaluation
 
-import agents
 import random
 
 
-class PassiveADPAgent(agents.Agent):
+class PassiveADPAgent:
 
     """Passive (non-learning) agent that uses adaptive dynamic programming
     on a given MDP and policy. [Figure 21.2]"""
-    NotImplemented
+
+    class ModelMDP(MDP):
+        """Class for implementing a modified version of the input MDP with
+        an editable transition model P and a custom function T."""
+        def __init__(self, init, actlist, terminals, gamma, states):
+            super().__init__(init, actlist, terminals, gamma)
+            nested_dict = lambda: defaultdict(nested_dict)
+            # StackOverflow: whats-the-best-way-to-initialize-a-dict-of-dicts-in-python
+            self.P = nested_dict()
+
+        def T(self, s, a):
+            """Return a list of (probability, state) tuples
+            based on the learnt model P."""
+            return [(prob, res) for (res, prob) in self.P[(s, a)].items()]
+
+    def __init__(self, pi, mdp):
+        self.pi = pi
+        self.mdp = PassiveADPAgent.ModelMDP(mdp.init, mdp.actlist,
+                                            mdp.terminals, mdp.gamma, mdp.states)
+        self.U = {}
+        self.Nsa = defaultdict(int)
+        self.Ns1_sa = defaultdict(int)
+        self.s = None
+        self.a = None
+
+    def __call__(self, percept):
+        s1, r1 = percept
+        self.mdp.states.add(s1)  # Model keeps track of visited states.
+        R, P, mdp, pi = self.mdp.reward, self.mdp.P, self.mdp, self.pi
+        s, a, Nsa, Ns1_sa, U = self.s, self.a, self.Nsa, self.Ns1_sa, self.U
+
+        if s1 not in R:  # Reward is only available for a visited state.
+            U[s1] = R[s1] = r1
+        if s is not None:
+            Nsa[(s, a)] += 1
+            Ns1_sa[(s1, s, a)] += 1
+            # For each t such that Ns'|sa[t, s, a] is nonzero:
+            for t in [res for (res, state, act), freq in Ns1_sa.items()
+                      if (state, act) == (s, a) and freq != 0]:
+                P[(s, a)][t] = Ns1_sa[(t, s, a)] / Nsa[(s, a)]
+
+        U = policy_evaluation(pi, U, mdp)
+        if s1 in mdp.terminals:
+            self.s = self.a = None
+        else:
+            self.s, self.a = s1, self.pi[s1]
+        return self.a
+
+    def update_state(self, percept):
+        """To be overridden in most cases. The default case
+        assumes the percept to be of type (state, reward)."""
+        return percept
 
 
 class PassiveTDAgent:
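The learnt transition model P above is the maximum-likelihood estimate P(t | s, a) = Ns'|sa(t, s, a) / Nsa(s, a), stored in a self-referential defaultdict so counts can be written without first initializing the inner dicts. A minimal illustration of that nested-dict trick (not part of the commit):

    from collections import defaultdict

    # A missing key at any depth creates a fresh nested level on first access.
    nested_dict = lambda: defaultdict(nested_dict)

    P = nested_dict()
    P[('A', 'right')]['B'] = 0.8   # the inner dict springs into existence
    print(P[('A', 'right')]['B'])  # 0.8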

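A minimal usage sketch, assuming aima-python's mdp module (whose sequential_decision_environment is the 4x3 grid world); the fixed policy dict and the take_single_action sampler below are illustrative assumptions, not part of this commit:

    import random

    from mdp import sequential_decision_environment
    from rl import PassiveADPAgent

    def take_single_action(mdp, s, a):
        """Sample a successor of s under a from the environment's true model."""
        x = random.uniform(0, 1)
        cumulative = 0.0
        for probability, state in mdp.T(s, a):
            cumulative += probability
            if x < cumulative:
                return state
        return state

    # An illustrative fixed policy for the 4x3 grid ((0, 0) is bottom-left);
    # actions are (dx, dy) moves, None at the two terminal states.
    north, south, east, west = (0, 1), (0, -1), (1, 0), (-1, 0)
    policy = {
        (0, 0): north, (0, 1): north, (0, 2): east,
        (1, 0): west,  (1, 2): east,
        (2, 0): west,  (2, 1): north, (2, 2): east,
        (3, 0): west,  (3, 1): None,  (3, 2): None,
    }

    agent = PassiveADPAgent(policy, sequential_decision_environment)
    for _ in range(100):                       # 100 training trials
        s = sequential_decision_environment.init
        while True:
            reward = sequential_decision_environment.R(s)
            a = agent((s, reward))             # percept = (state, reward)
            if a is None:                      # a terminal state was reached
                break
            s = take_single_action(sequential_decision_environment, s, a)

    print(agent.U)                             # the learnt utilities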