diff --git a/.gitignore b/.gitignore index e69de29b..5829c86c 100644 --- a/.gitignore +++ b/.gitignore @@ -0,0 +1,2 @@ +*.pyc +data/ \ No newline at end of file diff --git a/hw1/README.md b/hw1/README.md index 345030ac..15612f44 100644 --- a/hw1/README.md +++ b/hw1/README.md @@ -5,7 +5,7 @@ You can run this code on your own machine or on Google Colab. 1. **Local option:** If you choose to run locally, you will need to install MuJoCo and some Python packages; see [installation.md](installation.md) for instructions. 2. **Colab:** The first few sections of the notebook will install all required dependencies. You can try out the Colab option by clicking the badge below: -[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/pytorch/hw1/cs285/scripts/run_hw1.ipynb) +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw1/cs285/scripts/run_hw1.ipynb) ## Complete the code @@ -14,7 +14,7 @@ Fill in sections marked with `TODO`. In particular, see - [policies/MLP_policy.py](cs285/policies/MLP_policy.py) - [infrastructure/replay_buffer.py](cs285/infrastructure/replay_buffer.py) - [infrastructure/utils.py](cs285/infrastructure/utils.py) - - [infrastructure/pytorch_utils.py](cs285/infrastructure/pytorch_utils.py) + - [infrastructure/pytorch_util.py](cs285/infrastructure/pytorch_util.py) Look for sections maked with `HW1` to see how the edits you make will be used. Some other files that you may find relevant @@ -25,37 +25,53 @@ See the homework pdf for more details. ## Run the code -Tip: While debugging, you probably want to pass the flag `--video_log_freq -1` which will disable video logging and speed up the experiment. +Tip: While debugging, you probably want to keep the flag `--video_log_freq -1` which will disable video logging and speed up the experiment. However, feel free to remove it to save videos of your awesome policy! -Run the following command for Section 1 (Behavior Cloning): +If running on Colab, adjust the `#@params` in the `Args` class according to the commmand line arguments above. + +### Section 1 (Behavior Cloning) +Command for problem 1: ``` -python cs285/scripts/run_hw1_behavior_cloning.py \ +python cs285/scripts/run_hw1.py \ --expert_policy_file cs285/policies/experts/Ant.pkl \ - --env_name Ant-v2 --exp_name test_bc_ant --n_iter 1 \ + --env_name Ant-v2 --exp_name bc_ant --n_iter 1 \ --expert_data cs285/expert_data/expert_data_Ant-v2.pkl + --video_log_freq -1 ``` -Run the following command for Section 2 (DAgger): +Make sure to also try another environment. +See the homework PDF for more details on what else you need to run. +To generate videos of the policy, remove the `--video_log_freq -1` flag. 
+ +### Section 2 (DAgger) +Command for section 1: (Note the `--do_dagger` flag, and the higher value for `n_iter`) ``` -python cs285/scripts/run_hw1_behavior_cloning.py \ - --expert_policy_file cs285/policies/experts/Ant.pkl \ - --env_name Ant-v2 --exp_name test_dagger_ant --n_iter 10 \ - --do_dagger --expert_data cs285/expert_data/expert_data_Ant-v2.pkl +python cs285/scripts/run_hw1.py \ + --expert_policy_file cs285/policies/experts/Ant.pkl \ + --env_name Ant-v2 --exp_name dagger_ant --n_iter 10 \ + --do_dagger --expert_data cs285/expert_data/expert_data_Ant-v2.pkl \ + --video_log_freq -1 ``` -If running on Colab, adjust the `#@params` in the `Args` class according to the commmand line arguments above. +Make sure to also try another environment. +See the homework PDF for more details on what else you need to run. ## Visualization the saved tensorboard event file: You can visualize your runs using tensorboard: ``` -tensorboard --logdir cs285/data +tensorboard --logdir data ``` You will see scalar summaries as well as videos of your trained policies (in the 'images' tab). +You can choose to visualize specific runs with a comma-separated list: +``` +tensorboard --logdir data/run1,data/run2,data/run3... +``` + If running on Colab, you will be using the `%tensorboard` [line magic](https://ipython.readthedocs.io/en/stable/interactive/magics.html) to do the same thing; see the [notebook](cs285/scripts/run_hw1.ipynb) for more details. diff --git a/hw1/cs285/agents/base_agent.py b/hw1/cs285/agents/base_agent.py index 4ccbbc6c..d7712a05 100644 --- a/hw1/cs285/agents/base_agent.py +++ b/hw1/cs285/agents/base_agent.py @@ -3,7 +3,8 @@ class BaseAgent(object): def __init__(self, **kwargs): super(BaseAgent, self).__init__(**kwargs) - def train(self): + def train(self) -> dict: + """Return a dictionary of logging information.""" raise NotImplementedError def add_to_replay_buffer(self, paths): diff --git a/hw1/cs285/agents/bc_agent.py b/hw1/cs285/agents/bc_agent.py index 9b8fc97c..b7ad366e 100644 --- a/hw1/cs285/agents/bc_agent.py +++ b/hw1/cs285/agents/bc_agent.py @@ -1,5 +1,5 @@ from cs285.infrastructure.replay_buffer import ReplayBuffer -from cs285.policies.MLP_policy import * +from cs285.policies.MLP_policy import MLPPolicySL from .base_agent import BaseAgent @@ -27,8 +27,8 @@ def __init__(self, env, agent_params): def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): # training a BC agent refers to updating its actor using # the given observations and corresponding action labels - loss = self.actor.update(ob_no, ac_na) # HW1: you will modify this - return loss + log = self.actor.update(ob_no, ac_na) # HW1: you will modify this + return log def add_to_replay_buffer(self, paths): self.replay_buffer.add_rollouts(paths) diff --git a/hw1/cs285/infrastructure/pytorch_util.py b/hw1/cs285/infrastructure/pytorch_util.py index 2975b8e5..bc7a4081 100644 --- a/hw1/cs285/infrastructure/pytorch_util.py +++ b/hw1/cs285/infrastructure/pytorch_util.py @@ -1,6 +1,6 @@ from typing import Union + import torch -import numpy as np from torch import nn Activation = Union[str, nn.Module] @@ -24,14 +24,11 @@ def build_mlp( size: int, activation: Activation = 'tanh', output_activation: Activation = 'identity', -): +) -> nn.Module: """ Builds a feedforward neural network arguments: - input_placeholder: placeholder variable for the state (batch_size, input_size) - scope: variable scope of the network - n_layers: number of hidden layers size: dimension of each hidden layer activation: activation of each hidden layer @@ 
-41,16 +38,38 @@ def build_mlp( output_activation: activation of the output layer returns: - output_placeholder: the result of a forward pass through the hidden layers + the output layer + MLP (nn.Module) """ if isinstance(activation, str): activation = _str_to_activation[activation] if isinstance(output_activation, str): output_activation = _str_to_activation[output_activation] + # TODO: return a MLP. This should be an instance of nn.Module # Note: nn.Sequential is an instance of nn.Module. raise NotImplementedError -def from_numpy(array): - return torch.from_numpy(array.astype(np.float32)) +device = None + + +def init_gpu(use_gpu=True, gpu_id=0): + global device + if torch.cuda.is_available() and use_gpu: + device = torch.device("cuda:" + str(gpu_id)) + print("Using GPU id {}".format(gpu_id)) + else: + device = torch.device("cpu") + print("GPU not detected. Defaulting to CPU.") + + +def set_device(gpu_id): + torch.cuda.set_device(gpu_id) + + +def from_numpy(*args, **kwargs): + return torch.from_numpy(*args, **kwargs).float().to(device) + + +def to_numpy(tensor): + return tensor.to('cpu').detach().numpy() diff --git a/hw1/cs285/infrastructure/replay_buffer.py b/hw1/cs285/infrastructure/replay_buffer.py index 51072c5e..60148e79 100644 --- a/hw1/cs285/infrastructure/replay_buffer.py +++ b/hw1/cs285/infrastructure/replay_buffer.py @@ -72,12 +72,11 @@ def sample_random_data(self, batch_size): == self.terminals.shape[0] ) - # TODO return batch_size number of random entries from each of the 5 - # component arrays above - # HINT 1: use np.random.permutation to sample random indices - # HINT 2: return corresponding data points from each array (i.e., not - # different indices from each array) - # HINT 3: look at the sample_recent_data function below + ## TODO return batch_size number of random entries from each of the 5 component arrays above + ## HINT 1: use np.random.permutation to sample random indices + ## HINT 2: return corresponding data points from each array (i.e., not different indices from each array) + ## HINT 3: look at the sample_recent_data function below + return TODO, TODO, TODO, TODO, TODO def sample_recent_data(self, batch_size=1): diff --git a/hw1/cs285/infrastructure/rl_trainer.py b/hw1/cs285/infrastructure/rl_trainer.py index dabaeabe..bb27972e 100644 --- a/hw1/cs285/infrastructure/rl_trainer.py +++ b/hw1/cs285/infrastructure/rl_trainer.py @@ -5,6 +5,7 @@ import gym import torch +from cs285.infrastructure import pytorch_util as ptu from cs285.infrastructure.logger import Logger from cs285.infrastructure import utils @@ -12,6 +13,7 @@ MAX_NVIDEO = 2 MAX_VIDEO_LEN = 40 # we overwrite this in the code below + class RL_Trainer(object): def __init__(self, params): @@ -28,6 +30,10 @@ def __init__(self, params): seed = self.params['seed'] np.random.seed(seed) torch.manual_seed(seed) + ptu.init_gpu( + use_gpu=not self.params['no_gpu'], + gpu_id=self.params['which_gpu'] + ) ############# ## ENV @@ -114,14 +120,15 @@ def run_training_loop(self, n_iter, collect_policy, eval_policy, self.agent.add_to_replay_buffer(paths) # train agent (using sampled data from replay buffer) - self.train_agent() # HW1: implement this function below + training_logs = self.train_agent() # HW1: implement this function below # log/save if self.log_video or self.log_metrics: # perform logging print('\nBeginning logging procedure...') - self.perform_logging(itr, paths, eval_policy, train_video_paths) + self.perform_logging( + itr, paths, eval_policy, train_video_paths, training_logs) if 
self.params['save_params']: print('\nSaving agent params') @@ -148,16 +155,14 @@ def collect_training_trajectories( train_video_paths: paths which also contain videos for visualization purposes """ - # TODO decide whether to load training data or use - # HINT: depending on if it's the first iteration or not, - # decide whether to either - # load the data. In this case you can directly return as follows + # TODO decide whether to load training data or use the current policy to collect more data + # HINT: depending on if it's the first iteration or not, decide whether to either + # (1) load the data. In this case you can directly return as follows # ``` return loaded_paths, 0, None ``` - # if it's the first iteration and you aren't loading data, then - # `self.params['batch_size_initial']` is the number of transitions you want to collect + # (2) collect `self.params['batch_size']` transitions - # TODO collect `batch_size` to be used for training + # TODO collect `batch_size` samples to be used for training # HINT1: use sample_trajectories from utils # HINT2: you want each of these collected rollouts to be of length self.params['ep_len'] print("\nCollecting data to be used for training...") @@ -176,6 +181,7 @@ def collect_training_trajectories( def train_agent(self): print('\nTraining agent using sampled data from replay buffer...') + all_logs = [] for train_step in range(self.params['num_agent_train_steps_per_iter']): # TODO sample some data from the data buffer @@ -183,9 +189,12 @@ def train_agent(self): # HINT2: how much data = self.params['train_batch_size'] ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = TODO - # TODO use the sampled data for training + # TODO use the sampled data to train an agent # HINT: use the agent's train function - # HINT: print or plot the loss for debugging! 
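# (Illustrative sketch only, not necessarily the intended solution: one possible way to
# fill in the TODOs in this loop, assuming the agent exposes the sample() method
# declared in BaseAgent and the train() signature shown in bc_agent.py:
#
#     ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = \
#         self.agent.sample(self.params['train_batch_size'])
#     train_log = self.agent.train(
#         ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch)
# )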
+ # HINT: keep the agent's training log for debugging + train_log = TODO + all_logs.append(train_log) + return all_logs def do_relabel_with_expert(self, expert_policy, paths): print("\nRelabelling collected observations with labels from an expert policy...") @@ -199,7 +208,7 @@ def do_relabel_with_expert(self, expert_policy, paths): #################################### #################################### - def perform_logging(self, itr, paths, eval_policy, train_video_paths): + def perform_logging(self, itr, paths, eval_policy, train_video_paths, training_logs): # collect eval trajectories, for logging print("\nCollecting data for eval...") @@ -243,6 +252,8 @@ def perform_logging(self, itr, paths, eval_policy, train_video_paths): logs["Train_EnvstepsSoFar"] = self.total_envsteps logs["TimeSinceStart"] = time.time() - self.start_time + last_log = training_logs[-1] # Only use the last log for now + logs.update(last_log) if itr == 0: @@ -255,4 +266,4 @@ def perform_logging(self, itr, paths, eval_policy, train_video_paths): self.logger.log_scalar(value, key, itr) print('Done logging...\n\n') - self.logger.flush() \ No newline at end of file + self.logger.flush() diff --git a/hw1/cs285/policies/MLP_policy.py b/hw1/cs285/policies/MLP_policy.py index 8c6b7129..c8e1fd7d 100644 --- a/hw1/cs285/policies/MLP_policy.py +++ b/hw1/cs285/policies/MLP_policy.py @@ -39,21 +39,29 @@ def __init__(self, self.nn_baseline = nn_baseline if self.discrete: - self.logits_na = ptu.build_mlp(input_size=self.ob_dim, - output_size=self.ac_dim, - n_layers=self.n_layers, - size=self.size) + self.logits_na = ptu.build_mlp( + input_size=self.ob_dim, + output_size=self.ac_dim, + n_layers=self.n_layers, + size=self.size, + ) + self.logits_na.to(ptu.device) self.mean_net = None self.logstd = None self.optimizer = optim.Adam(self.logits_na.parameters(), self.learning_rate) else: self.logits_na = None - self.mean_net = ptu.build_mlp(input_size=self.ob_dim, - output_size=self.ac_dim, - n_layers=self.n_layers, size=self.size) + self.mean_net = ptu.build_mlp( + input_size=self.ob_dim, + output_size=self.ac_dim, + n_layers=self.n_layers, size=self.size, + ) + self.mean_net.to(ptu.device) self.logstd = nn.Parameter( - torch.zeros(self.ac_dim, dtype=torch.float32)) + torch.zeros(self.ac_dim, dtype=torch.float32, device=ptu.device) + ) + self.logstd.to(ptu.device) self.optimizer = optim.Adam( itertools.chain([self.logstd], self.mean_net.parameters()), self.learning_rate @@ -102,5 +110,7 @@ def update( ): # TODO: update the policy and return the loss loss = TODO - # if loss is a torch.Tensor, convert to numpy - return loss.detach().numpy() + return { + # You can add extra logging information here, but keep this line + 'Training Loss': ptu.to_numpy(loss), + } diff --git a/hw1/cs285/policies/base_policy.py b/hw1/cs285/policies/base_policy.py index f997572c..e089540a 100644 --- a/hw1/cs285/policies/base_policy.py +++ b/hw1/cs285/policies/base_policy.py @@ -6,7 +6,8 @@ class BasePolicy(object, metaclass=abc.ABCMeta): def get_action(self, obs: np.ndarray) -> np.ndarray: raise NotImplementedError - def update(self, obs: np.ndarray, acs: np.ndarray, **kwargs): + def update(self, obs: np.ndarray, acs: np.ndarray, **kwargs) -> dict: + """Return a dictionary of logging information.""" raise NotImplementedError def save(self, filepath: str): diff --git a/hw1/cs285/policies/loaded_gaussian_policy.py b/hw1/cs285/policies/loaded_gaussian_policy.py index ea82dbf2..720ec472 100644 --- a/hw1/cs285/policies/loaded_gaussian_policy.py +++ 
b/hw1/cs285/policies/loaded_gaussian_policy.py @@ -8,7 +8,6 @@ def create_linear_layer(W, b) -> nn.Linear: - # in_features, out_features = W.shape out_features, in_features = W.shape linear_layer = nn.Linear( in_features, @@ -98,9 +97,9 @@ def get_action(self, obs): observation = obs else: observation = obs[None, :] - observation = torch.from_numpy(observation.astype(np.float32)) + observation = ptu.from_numpy(observation.astype(np.float32)) action = self(observation) - return action.detach().numpy() + return ptu.to_numpy(action) def save(self, filepath): torch.save(self.state_dict(), filepath) diff --git a/hw1/cs285/scripts/run_hw1.ipynb b/hw1/cs285/scripts/run_hw1.ipynb index 746d100c..476b6326 100644 --- a/hw1/cs285/scripts/run_hw1.ipynb +++ b/hw1/cs285/scripts/run_hw1.ipynb @@ -22,7 +22,6 @@ "colab": { "name": "run_hw1.ipynb", "provenance": [], - "collapsed_sections": [], "toc_visible": true } }, @@ -98,6 +97,7 @@ "\n", "#@markdown Double-click on section headers to show code.\n", "\n", + "!apt update \n", "!apt install -y --no-install-recommends \\\n", " build-essential \\\n", " curl \\\n", @@ -139,11 +139,7 @@ "id": "QeDMsMOXUAkN", "colab_type": "code", "cellView": "form", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 36 - }, - "outputId": "a846c29a-eb11-4e1c-94cc-cea21f746a4b" + "colab": {} }, "source": [ "#@title download mujoco\n", @@ -157,15 +153,7 @@ "%rm mujoco200_linux.zip" ], "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "/content/gdrive/My Drive/cs285_f2020/mujoco\n" - ], - "name": "stdout" - } - ] + "outputs": [] }, { "cell_type": "code", @@ -235,7 +223,7 @@ "#@title clone homework repo\n", "\n", "%cd $SYM_PATH\n", - "!git clone --single-branch --branch pytorch https://github.com/berkeleydeeprlcourse/homework_fall2020.git\n", + "!git clone https://github.com/berkeleydeeprlcourse/homework_fall2020.git\n", "%cd homework_fall2020/hw1\n", "%pip install -r requirements_colab.txt\n", "%pip install -e ." @@ -249,11 +237,7 @@ "id": "8y_M1tGxmGhT", "colab_type": "code", "cellView": "form", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 36 - }, - "outputId": "98953644-114c-4456-dddc-b03f9484ad31" + "colab": {} }, "source": [ "#@title set up virtual display\n", @@ -264,20 +248,7 @@ "display.start()" ], "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "" - ] - }, - "metadata": { - "tags": [] - }, - "execution_count": 9 - } - ] + "outputs": [] }, { "cell_type": "code", @@ -287,9 +258,9 @@ "cellView": "form", "colab": { "base_uri": "https://localhost:8080/", - "height": 439 + "height": 438 }, - "outputId": "0a777fc2-7acb-4591-96e3-857c3faadaeb" + "outputId": "c91293e2-0424-4427-b57e-0e12653c991a" }, "source": [ "#@title test virtual display\n", @@ -330,7 +301,7 @@ "text/html": [ "" ], "text/plain": [ @@ -343,6 +314,18 @@ } ] }, + { + "cell_type": "markdown", + "metadata": { + "id": "eQx7oDGeeKWj", + "colab_type": "text" + }, + "source": [ + "## Editing Code\n", + "\n", + "To edit code, click the folder icon on the left menu. Navigate to the corresponding file (`cs285_f2020/...`). Double click a file to open an editor. There is a timeout of about ~12 hours with Colab while it is active (and less if you close your browser window). 
We sync your edits to Google Drive so that you won't lose your work in the event of an instance timeout, but you will need to re-mount your Google Drive and re-install packages with every new instance." + ] + }, { "cell_type": "markdown", "metadata": { @@ -350,7 +333,7 @@ "colab_type": "text" }, "source": [ - "## Run behavior cloning" + "## Run Behavior Cloning (Problem 1)" ] }, { @@ -359,11 +342,7 @@ "id": "enh5ZMHftEO7", "colab_type": "code", "cellView": "form", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 54 - }, - "outputId": "609e2dd1-9e01-432a-895a-2922660666fd" + "colab": {} }, "source": [ "#@title imports\n", @@ -380,22 +359,14 @@ "%autoreload 2" ], "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "text": [ - "The autoreload extension is already loaded. To reload it, use:\n", - " %reload_ext autoreload\n" - ], - "name": "stdout" - } - ] + "outputs": [] }, { "cell_type": "code", "metadata": { "id": "imnAkQ6jryL7", "colab_type": "code", + "cellView": "form", "colab": {} }, "source": [ @@ -416,13 +387,14 @@ " exp_name = 'test_bc_ant' #@param\n", " do_dagger = False #@param {type: \"boolean\"}\n", " ep_len = 1000 #@param {type: \"integer\"}\n", + " save_params = False #@param {type: \"boolean\"}\n", "\n", " num_agent_train_steps_per_iter = 1000 #@param {type: \"integer\"})\n", " n_iter = 1 #@param {type: \"integer\"})\n", "\n", " #@markdown batches & buffers\n", " batch_size = 1000 #@param {type: \"integer\"})\n", - " eval_batch_size = 200 #@param {type: \"integer\"}\n", + " eval_batch_size = 1000 #@param {type: \"integer\"}\n", " train_batch_size = 100 #@param {type: \"integer\"}\n", " max_replay_buffer_size = 1000000 #@param {type: \"integer\"}\n", "\n", @@ -435,8 +407,8 @@ " video_log_freq = 5 #@param {type: \"integer\"}\n", " scalar_log_freq = 1 #@param {type: \"integer\"}\n", "\n", - " #@markdown gpu\n", - " use_gpu = False #@param {type: \"boolean\"}\n", + " #@markdown gpu & run-time settings\n", + " no_gpu = False #@param {type: \"boolean\"}\n", " which_gpu = 0 #@param {type: \"integer\"}\n", " seed = 1 #@param {type: \"integer\"}\n", "\n", @@ -450,13 +422,14 @@ "metadata": { "id": "fLnU1evmss4I", "colab_type": "code", + "cellView": "form", "colab": {} }, "source": [ + "#@title define `BC_Trainer`\n", "class BC_Trainer(object):\n", "\n", " def __init__(self, params):\n", - "\n", " #######################\n", " ## AGENT PARAMS\n", " #######################\n", @@ -483,7 +456,7 @@ " #######################\n", "\n", " print('Loading expert policy from...', self.params['expert_policy_file'])\n", - " self.loaded_expert_policy = Loaded_Gaussian_Policy(self.rl_trainer.sess, self.params['expert_policy_file'])\n", + " self.loaded_expert_policy = LoadedGaussianPolicy(self.params['expert_policy_file'])\n", " print('Done restoring expert policy...')\n", "\n", " def run_training_loop(self):\n", @@ -505,16 +478,17 @@ "metadata": { "id": "7UkzHBfxsxH8", "colab_type": "code", + "cellView": "form", "colab": {} }, "source": [ - "## create directory for logging\n", + "#@title create directory for logging\n", "\n", - "logdir_prefix = 'bc_'\n", "if args.do_dagger:\n", - " logdir_prefix = 'dagger_'\n", + " logdir_prefix = 'q2_' # The autograder uses the prefix `q2_`\n", " assert args.n_iter>1, ('DAgger needs more than 1 iteration (n_iter>1) of training, to iteratively query the expert and train (after 1st warmstarting from behavior cloning).')\n", "else:\n", + " logdir_prefix = 'q1_' # The autograder uses the prefix `q1_`\n", " assert 
args.n_iter==1, ('Vanilla behavior cloning collects expert data just once (n_iter=1)')\n", "\n", "data_path ='/content/cs285_f2020/data'\n", @@ -561,6 +535,20 @@ ], "execution_count": null, "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ff9onuUPfPEa", + "colab_type": "text" + }, + "source": [ + "## Running DAgger (Problem 2)\n", + "Modify the settings above:\n", + "1. check the `do_dagger` box\n", + "2. set `n_iters` to `10`\n", + "and then rerun the code." + ] } ] -} \ No newline at end of file +} diff --git a/hw1/cs285/scripts/run_hw1.py b/hw1/cs285/scripts/run_hw1.py index ecd6f824..2a4a73de 100644 --- a/hw1/cs285/scripts/run_hw1.py +++ b/hw1/cs285/scripts/run_hw1.py @@ -75,7 +75,7 @@ def main(): parser.add_argument('--video_log_freq', type=int, default=5) parser.add_argument('--scalar_log_freq', type=int, default=1) - parser.add_argument('--use_gpu', action='http://github.com/store_true') + parser.add_argument('--no_gpu', '-ngpu', action='http://github.com/store_true') parser.add_argument('--which_gpu', type=int, default=0) parser.add_argument('--max_replay_buffer_size', type=int, default=1000000) parser.add_argument('--save_params', action='http://github.com/store_true') @@ -89,15 +89,17 @@ def main(): ### CREATE DIRECTORY FOR LOGGING ################################## - logdir_prefix = 'bc_' if args.do_dagger: - logdir_prefix = 'dagger_' + # Use this prefix when submitting. The auto-grader uses this prefix. + logdir_prefix = 'q2_' assert args.n_iter>1, ('DAGGER needs more than 1 iteration (n_iter>1) of training, to iteratively query the expert and train (after 1st warmstarting from behavior cloning).') else: + # Use this prefix when submitting. The auto-grader uses this prefix. + logdir_prefix = 'q1_' assert args.n_iter==1, ('Vanilla behavior cloning collects expert data just once (n_iter=1)') ## directory for logging - data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data') + data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../data') if not (os.path.exists(data_path)): os.makedirs(data_path) logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") diff --git a/hw1/cs285_hw1.pdf b/hw1/cs285_hw1.pdf index cff45414..d02188d5 100644 Binary files a/hw1/cs285_hw1.pdf and b/hw1/cs285_hw1.pdf differ diff --git a/hw2/README.md b/hw2/README.md new file mode 100644 index 00000000..0e481c04 --- /dev/null +++ b/hw2/README.md @@ -0,0 +1,28 @@ +## Setup + +You can run this code on your own machine or on Google Colab. + +1. **Local option:** If you choose to run locally, you will need to install MuJoCo and some Python packages; see [installation.md](../hw1/installation.md) from homework 1 for instructions. If you completed this installation for homework 1, you do not need to repeat it. +2. **Colab:** The first few sections of the notebook will install all required dependencies. You can try out the Colab option by clicking the badge below: + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw2/cs285/scripts/run_hw2.ipynb) + +## Complete the code + +The following files have blanks to be filled with your solutions from homework 1. The relevant sections are marked with "TODO: get this from hw1". 
+ +- [infrastructure/rl_trainer.py](cs285/infrastructure/rl_trainer.py) +- [infrastructure/utils.py](cs285/infrastructure/utils.py) +- [policies/MLP_policy.py](cs285/policies/MLP_policy.py) + +You will then need to complete the following new files for homework 2. The relevant sections are marked with "TODO". +- [agents/pg_agent.py](cs285/agents/pg_agent.py) +- [policies/MLP_policy.py](cs285/policies/MLP_policy.py) + +You will also want to look through [scripts/run_hw2.py](cs285/scripts/run_hw2.py) (if running locally) or [scripts/run_hw2.ipynb](cs285/scripts/run_hw1.2pynb) (if running on Colab), though you will not need to edit this files beyond changing runtime arguments in the Colab notebook. + +You will be running your policy gradients implementation in four experiments total, investigating the effects of design decisions like reward-to-go estimators, neural network baselines for variance reduction, and advantage normalization. See the [assignment PDF](cs285_hw2.pdf) for more details. + +## Plotting your results + +We have provided a snippet that may be used for reading your Tensorboard eventfiles in [scripts/read_results.py](cs285/scripts/read_results.py). Reading these eventfiles and plotting them with [matplotlib](https://matplotlib.org/) or [seaborn](https://seaborn.pydata.org/) will produce the cleanest results for your submission. For debugging purposes, we recommend visualizing the Tensorboard logs using `tensorboard --logdir data`. diff --git a/hw2/cs285/agents/__init__.py b/hw2/cs285/agents/__init__.py new file mode 100644 index 00000000..9e5b9cf7 --- /dev/null +++ b/hw2/cs285/agents/__init__.py @@ -0,0 +1,3 @@ +from .base_agent import BaseAgent +from .pg_agent import PGAgent + diff --git a/hw2/cs285/agents/base_agent.py b/hw2/cs285/agents/base_agent.py new file mode 100644 index 00000000..a32224b5 --- /dev/null +++ b/hw2/cs285/agents/base_agent.py @@ -0,0 +1,16 @@ +class BaseAgent(object): + def __init__(self, **kwargs): + super(BaseAgent, self).__init__(**kwargs) + + def train(self) -> dict: + """Return a dictionary of logging information.""" + raise NotImplementedError + + def add_to_replay_buffer(self, paths): + raise NotImplementedError + + def sample(self, batch_size): + raise NotImplementedError + + def save(self, path): + raise NotImplementedError \ No newline at end of file diff --git a/hw2/cs285/agents/pg_agent.py b/hw2/cs285/agents/pg_agent.py new file mode 100644 index 00000000..699d1e67 --- /dev/null +++ b/hw2/cs285/agents/pg_agent.py @@ -0,0 +1,151 @@ +import numpy as np + +from .base_agent import BaseAgent +from cs285.policies.MLP_policy import MLPPolicyPG +from cs285.infrastructure.replay_buffer import ReplayBuffer + + +class PGAgent(BaseAgent): + def __init__(self, env, agent_params): + super(PGAgent, self).__init__() + + # init vars + self.env = env + self.agent_params = agent_params + self.gamma = self.agent_params['gamma'] + self.standardize_advantages = self.agent_params['standardize_advantages'] + self.nn_baseline = self.agent_params['nn_baseline'] + self.reward_to_go = self.agent_params['reward_to_go'] + + # actor/policy + self.actor = MLPPolicyPG( + self.agent_params['ac_dim'], + self.agent_params['ob_dim'], + self.agent_params['n_layers'], + self.agent_params['size'], + discrete=self.agent_params['discrete'], + learning_rate=self.agent_params['learning_rate'], + nn_baseline=self.agent_params['nn_baseline'] + ) + + # replay buffer + self.replay_buffer = ReplayBuffer(1000000) + + def train(self, observations, actions, rewards_list, next_observations, 
terminals): + + """ + Training a PG agent refers to updating its actor using the given observations/actions + and the calculated qvals/advantages that come from the seen rewards. + """ + + # step 1: calculate q values of each (s_t, a_t) point, using rewards (r_0, ..., r_t, ..., r_T) + q_values = self.calculate_q_vals(rewards_list) + + # step 2: calculate advantages that correspond to each (s_t, a_t) point + advantages = self.estimate_advantage(observations, q_values) + + # TODO: step 3: use all datapoints (s_t, a_t, q_t, adv_t) to update the PG actor/policy + ## HINT: `train_log` should be returned by your actor update method + train_log = TODO + + return train_log + + def calculate_q_vals(self, rewards_list): + + """ + Monte Carlo estimation of the Q function. + """ + + # Case 1: trajectory-based PG + # Estimate Q^{pi}(s_t, a_t) by the total discounted reward summed over entire trajectory + if not self.reward_to_go: + + # For each point (s_t, a_t), associate its value as being the discounted sum of rewards over the full trajectory + # In other words: value of (s_t, a_t) = sum_{t'=0}^T gamma^t' r_{t'} + q_values = np.concatenate([self._discounted_return(r) for r in rewards_list]) + + # Case 2: reward-to-go PG + # Estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting from t + else: + + # For each point (s_t, a_t), associate its value as being the discounted sum of rewards over the full trajectory + # In other words: value of (s_t, a_t) = sum_{t'=t}^T gamma^(t'-t) * r_{t'} + q_values = np.concatenate([self._discounted_cumsum(r) for r in rewards_list]) + + return q_values + + def estimate_advantage(self, obs, q_values): + + """ + Computes advantages by (possibly) subtracting a baseline from the estimated Q values + """ + + # Estimate the advantage when nn_baseline is True, + # by querying the neural network that you're using to learn the baseline + if self.nn_baseline: + baselines_unnormalized = self.actor.run_baseline_prediction(obs) + ## ensure that the baseline and q_values have the same dimensionality + ## to prevent silent broadcasting errors + assert baselines_unnormalized.ndim == q_values.ndim + ## baseline was trained with standardized q_values, so ensure that the predictions + ## have the same mean and standard deviation as the current batch of q_values + baselines = baselines_unnormalized * np.std(q_values) + np.mean(q_values) + ## TODO: compute advantage estimates using q_values and baselines + advantages = TODO + + # Else, just set the advantage to [Q] + else: + advantages = q_values.copy() + + # Normalize the resulting advantages + if self.standardize_advantages: + ## TODO: standardize the advantages to have a mean of zero + ## and a standard deviation of one + ## HINT: there is a `normalize` function in `infrastructure.utils` + advantages = TODO + + return advantages + + ##################################################### + ##################################################### + + def add_to_replay_buffer(self, paths): + self.replay_buffer.add_rollouts(paths) + + def sample(self, batch_size): + return self.replay_buffer.sample_recent_data(batch_size, concat_rew=False) + + ##################################################### + ################## HELPER FUNCTIONS ################# + ##################################################### + + def _discounted_return(self, rewards): + """ + Helper function + + Input: list of rewards {r_0, r_1, ..., r_t', ... 
r_T} from a single rollout of length T + + Output: list where each index t contains sum_{t'=0}^T gamma^t' r_{t'} + """ + + # TODO: create list_of_discounted_returns + # Hint: note that all entries of this output are equivalent + # because each sum is from 0 to T (and doesnt involve t) + + return list_of_discounted_returns + + def _discounted_cumsum(self, rewards): + """ + Helper function which + -takes a list of rewards {r_0, r_1, ..., r_t', ... r_T}, + -and returns a list where the entry in each index t' is sum_{t'=t}^T gamma^(t'-t) * r_{t'} + """ + + # TODO: create `list_of_discounted_returns` + # HINT1: note that each entry of the output should now be unique, + # because the summation happens over [t, T] instead of [0, T] + # HINT2: it is possible to write a vectorized solution, but a solution + # using a for loop is also fine + + return list_of_discounted_cumsums + diff --git a/hw2/cs285/infrastructure/__init__.py b/hw2/cs285/infrastructure/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hw2/cs285/infrastructure/colab_utils.py b/hw2/cs285/infrastructure/colab_utils.py new file mode 100644 index 00000000..31ab6d9e --- /dev/null +++ b/hw2/cs285/infrastructure/colab_utils.py @@ -0,0 +1,26 @@ +from gym.wrappers import Monitor +import glob +import io +import base64 +from IPython.display import HTML +from IPython import display as ipythondisplay + +## modified from https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t#scrollTo=TCelFzWY9MBI + +def show_video(): + mp4list = glob.glob('/content/video/*.mp4') + if len(mp4list) > 0: + mp4 = mp4list[0] + video = io.open(mp4, 'r+b').read() + encoded = base64.b64encode(video) + ipythondisplay.display(HTML(data=''''''.format(encoded.decode('ascii')))) + else: + print("Could not find video") + + +def wrap_env(env): + env = Monitor(env, '/content/video', force=True) + return env \ No newline at end of file diff --git a/hw2/cs285/infrastructure/logger.py b/hw2/cs285/infrastructure/logger.py new file mode 100644 index 00000000..a64931c0 --- /dev/null +++ b/hw2/cs285/infrastructure/logger.py @@ -0,0 +1,74 @@ +import os +from tensorboardX import SummaryWriter +import numpy as np + +class Logger: + def __init__(self, log_dir, n_logged_samples=10, summary_writer=None): + self._log_dir = log_dir + print('########################') + print('logging outputs to ', log_dir) + print('########################') + self._n_logged_samples = n_logged_samples + self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1) + + def log_scalar(self, scalar, name, step_): + self._summ_writer.add_scalar('{}'.format(name), scalar, step_) + + def log_scalars(self, scalar_dict, group_name, step, phase): + """Will log all scalars in the same plot.""" + self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step) + + def log_image(self, image, name, step): + assert(len(image.shape) == 3) # [C, H, W] + self._summ_writer.add_image('{}'.format(name), image, step) + + def log_video(self, video_frames, name, step, fps=10): + assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!" 
+ self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps) + + def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'): + + # reshape the rollouts + videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] + + # max rollout length + max_videos_to_save = np.min([max_videos_to_save, len(videos)]) + max_length = videos[0].shape[0] + for i in range(max_videos_to_save): + if videos[i].shape[0]>max_length: + max_length = videos[i].shape[0] + + # pad rollouts to all be same length + for i in range(max_videos_to_save): + if videos[i].shape[0] 0, "Figure logging requires input shape [batch x figures]!" + self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) + + def log_figure(self, figure, name, step, phase): + """figure: matplotlib.pyplot figure handle""" + self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) + + def log_graph(self, array, name, step, phase): + """figure: matplotlib.pyplot figure handle""" + im = plot_graph(array) + self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) + + def dump_scalars(self, log_path=None): + log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path + self._summ_writer.export_scalars_to_json(log_path) + + def flush(self): + self._summ_writer.flush() + + + + diff --git a/hw2/cs285/infrastructure/pytorch_util.py b/hw2/cs285/infrastructure/pytorch_util.py new file mode 100644 index 00000000..155e2aeb --- /dev/null +++ b/hw2/cs285/infrastructure/pytorch_util.py @@ -0,0 +1,83 @@ +from typing import Union + +import torch +from torch import nn + +Activation = Union[str, nn.Module] + + +_str_to_activation = { + 'relu': nn.ReLU(), + 'tanh': nn.Tanh(), + 'leaky_relu': nn.LeakyReLU(), + 'sigmoid': nn.Sigmoid(), + 'selu': nn.SELU(), + 'softplus': nn.Softplus(), + 'identity': nn.Identity(), +} + + +def build_mlp( + input_size: int, + output_size: int, + n_layers: int, + size: int, + activation: Activation = 'tanh', + output_activation: Activation = 'identity', +): + """ + Builds a feedforward neural network + + arguments: + input_placeholder: placeholder variable for the state (batch_size, input_size) + scope: variable scope of the network + + n_layers: number of hidden layers + size: dimension of each hidden layer + activation: activation of each hidden layer + + input_size: size of the input layer + output_size: size of the output layer + output_activation: activation of the output layer + + returns: + output_placeholder: the result of a forward pass through the hidden layers + the output layer + """ + if isinstance(activation, str): + activation = _str_to_activation[activation] + if isinstance(output_activation, str): + output_activation = _str_to_activation[output_activation] + layers = [] + in_size = input_size + for _ in range(n_layers): + layers.append(nn.Linear(in_size, size)) + layers.append(activation) + in_size = size + layers.append(nn.Linear(in_size, output_size)) + layers.append(output_activation) + return nn.Sequential(*layers) + + +device = None + + +def init_gpu(use_gpu=True, gpu_id=0): + global device + if torch.cuda.is_available() and use_gpu: + device = torch.device("cuda:" + str(gpu_id)) + print("Using GPU id {}".format(gpu_id)) + else: + device = torch.device("cpu") + print("GPU not detected. 
Defaulting to CPU.") + + +def set_device(gpu_id): + torch.cuda.set_device(gpu_id) + + +def from_numpy(*args, **kwargs): + return torch.from_numpy(*args, **kwargs).float().to(device) + + +def to_numpy(tensor): + return tensor.to('cpu').detach().numpy() diff --git a/hw2/cs285/infrastructure/replay_buffer.py b/hw2/cs285/infrastructure/replay_buffer.py new file mode 100644 index 00000000..ca92794a --- /dev/null +++ b/hw2/cs285/infrastructure/replay_buffer.py @@ -0,0 +1,88 @@ +from cs285.infrastructure.utils import * + + +class ReplayBuffer(object): + + def __init__(self, max_size=1000000): + + self.max_size = max_size + self.paths = [] + self.obs = None + self.acs = None + self.concatenated_rews = None + self.unconcatenated_rews = None + self.next_obs = None + self.terminals = None + + def add_rollouts(self, paths, noised=False): + + # add new rollouts into our list of rollouts + for path in paths: + self.paths.append(path) + + # convert new rollouts into their component arrays, and append them onto our arrays + observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(paths) + + if noised: + observations = add_noise(observations) + next_observations = add_noise(next_observations) + + if self.obs is None: + self.obs = observations[-self.max_size:] + self.acs = actions[-self.max_size:] + self.next_obs = next_observations[-self.max_size:] + self.terminals = terminals[-self.max_size:] + self.concatenated_rews = concatenated_rews[-self.max_size:] + self.unconcatenated_rews = unconcatenated_rews[-self.max_size:] + else: + self.obs = np.concatenate([self.obs, observations])[-self.max_size:] + self.acs = np.concatenate([self.acs, actions])[-self.max_size:] + self.next_obs = np.concatenate( + [self.next_obs, next_observations] + )[-self.max_size:] + self.terminals = np.concatenate( + [self.terminals, terminals] + )[-self.max_size:] + self.concatenated_rews = np.concatenate( + [self.concatenated_rews, concatenated_rews] + )[-self.max_size:] + if isinstance(unconcatenated_rews, list): + self.unconcatenated_rews += unconcatenated_rews # TODO keep only latest max_size around + else: + self.unconcatenated_rews.append(unconcatenated_rews) # TODO keep only latest max_size around + + ######################################## + ######################################## + + def sample_random_rollouts(self, num_rollouts): + rand_indices = np.random.permutation(len(self.paths))[:num_rollouts] + return self.paths[rand_indices] + + def sample_recent_rollouts(self, num_rollouts=1): + return self.paths[-num_rollouts:] + + ######################################## + ######################################## + + def sample_random_data(self, batch_size): + + assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0] + rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size] + return self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], self.terminals[rand_indices] + + def sample_recent_data(self, batch_size=1, concat_rew=True): + + if concat_rew: + return self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:] + else: + num_recent_rollouts_to_return = 0 + num_datapoints_so_far = 0 + index = -1 + while num_datapoints_so_far < batch_size: + recent_rollout = self.paths[index] + index -=1 + num_recent_rollouts_to_return +=1 + 
num_datapoints_so_far += get_pathlength(recent_rollout) + rollouts_to_return = self.paths[-num_recent_rollouts_to_return:] + observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return) + return observations, actions, unconcatenated_rews, next_observations, terminals \ No newline at end of file diff --git a/hw2/cs285/infrastructure/rl_trainer.py b/hw2/cs285/infrastructure/rl_trainer.py new file mode 100644 index 00000000..7d76f7f9 --- /dev/null +++ b/hw2/cs285/infrastructure/rl_trainer.py @@ -0,0 +1,227 @@ +from collections import OrderedDict +import pickle +import os +import sys +import time + +import gym +from gym import wrappers +import numpy as np +import torch +from cs285.infrastructure import pytorch_util as ptu + +from cs285.infrastructure import utils +from cs285.infrastructure.logger import Logger + +# how many rollouts to save as videos to tensorboard +MAX_NVIDEO = 2 +MAX_VIDEO_LEN = 40 # we overwrite this in the code below + + +class RL_Trainer(object): + + def __init__(self, params): + + ############# + ## INIT + ############# + + # Get params, create logger + self.params = params + self.logger = Logger(self.params['logdir']) + + # Set random seeds + seed = self.params['seed'] + np.random.seed(seed) + torch.manual_seed(seed) + ptu.init_gpu( + use_gpu=not self.params['no_gpu'], + gpu_id=self.params['which_gpu'] + ) + + ############# + ## ENV + ############# + + # Make the gym environment + self.env = gym.make(self.params['env_name']) + self.env.seed(seed) + + # import plotting (locally if 'obstacles' env) + if not(self.params['env_name']=='obstacles-cs285-v0'): + import matplotlib + matplotlib.use('Agg') + + # Maximum length for episodes + self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps + global MAX_VIDEO_LEN + MAX_VIDEO_LEN = self.params['ep_len'] + + # Is this env continuous, or self.discrete? + discrete = isinstance(self.env.action_space, gym.spaces.Discrete) + # Are the observations images? 
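        # (i.e. the observation space has more than two dimensions,
        # such as height x width x channels)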
+ img = len(self.env.observation_space.shape) > 2 + + self.params['agent_params']['discrete'] = discrete + + # Observation and action sizes + + ob_dim = self.env.observation_space.shape if img else self.env.observation_space.shape[0] + ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0] + self.params['agent_params']['ac_dim'] = ac_dim + self.params['agent_params']['ob_dim'] = ob_dim + + # simulation timestep, will be used for video saving + if 'model' in dir(self.env): + self.fps = 1/self.env.model.opt.timestep + elif 'env_wrappers' in self.params: + self.fps = 30 # This is not actually used when using the Monitor wrapper + elif 'video.frames_per_second' in self.env.env.metadata.keys(): + self.fps = self.env.env.metadata['video.frames_per_second'] + else: + self.fps = 10 + + + ############# + ## AGENT + ############# + + agent_class = self.params['agent_class'] + self.agent = agent_class(self.env, self.params['agent_params']) + + def run_training_loop(self, n_iter, collect_policy, eval_policy, + initial_expertdata=None, relabel_with_expert=False, + start_relabel_with_expert=1, expert_policy=None): + """ + :param n_iter: number of (dagger) iterations + :param collect_policy: + :param eval_policy: + :param initial_expertdata: + :param relabel_with_expert: whether to perform dagger + :param start_relabel_with_expert: iteration at which to start relabel with expert + :param expert_policy: + """ + + # init vars at beginning of training + self.total_envsteps = 0 + self.start_time = time.time() + + for itr in range(n_iter): + print("\n\n********** Iteration %i ************"%itr) + + # decide if videos should be rendered/logged at this iteration + if itr % self.params['video_log_freq'] == 0 and self.params['video_log_freq'] != -1: + self.logvideo = True + else: + self.logvideo = False + self.log_video = self.logvideo + + # decide if metrics should be logged + if self.params['scalar_log_freq'] == -1: + self.logmetrics = False + elif itr % self.params['scalar_log_freq'] == 0: + self.logmetrics = True + else: + self.logmetrics = False + + # collect trajectories, to be used for training + training_returns = self.collect_training_trajectories(itr, + initial_expertdata, collect_policy, + self.params['batch_size']) + paths, envsteps_this_batch, train_video_paths = training_returns + self.total_envsteps += envsteps_this_batch + + # add collected data to replay buffer + self.agent.add_to_replay_buffer(paths) + + # train agent (using sampled data from replay buffer) + train_logs = self.train_agent() + + # log/save + if self.logvideo or self.logmetrics: + # perform logging + print('\nBeginning logging procedure...') + self.perform_logging(itr, paths, eval_policy, train_video_paths, train_logs) + + if self.params['save_params']: + self.agent.save('{}/agent_itr_{}.pt'.format(self.params['logdir'], itr)) + + #################################### + #################################### + + def collect_training_trajectories(self, itr, load_initial_expertdata, collect_policy, batch_size): + # TODO: get this from hw1 + # if your load_initial_expertdata is None, then you need to collect new trajectories at *every* iteration + return paths, envsteps_this_batch, train_video_paths + + def train_agent(self): + # TODO: get this from hw1 + return train_logs + + #################################### + #################################### + + def perform_logging(self, itr, paths, eval_policy, train_video_paths, all_logs): + + last_log = all_logs[-1] + + ####################### + + # collect eval 
trajectories, for logging + print("\nCollecting data for eval...") + eval_paths, eval_envsteps_this_batch = utils.sample_trajectories(self.env, eval_policy, self.params['eval_batch_size'], self.params['ep_len']) + + # save eval rollouts as videos in tensorboard event file + if self.logvideo and train_video_paths != None: + print('\nCollecting video rollouts eval') + eval_video_paths = utils.sample_n_trajectories(self.env, eval_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True) + + #save train/eval videos + print('\nSaving train rollouts as videos...') + self.logger.log_paths_as_videos(train_video_paths, itr, fps=self.fps, max_videos_to_save=MAX_NVIDEO, + video_title='train_rollouts') + self.logger.log_paths_as_videos(eval_video_paths, itr, fps=self.fps,max_videos_to_save=MAX_NVIDEO, + video_title='eval_rollouts') + + ####################### + + # save eval metrics + if self.logmetrics: + # returns, for logging + train_returns = [path["reward"].sum() for path in paths] + eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths] + + # episode lengths, for logging + train_ep_lens = [len(path["reward"]) for path in paths] + eval_ep_lens = [len(eval_path["reward"]) for eval_path in eval_paths] + + # decide what to log + logs = OrderedDict() + logs["Eval_AverageReturn"] = np.mean(eval_returns) + logs["Eval_StdReturn"] = np.std(eval_returns) + logs["Eval_MaxReturn"] = np.max(eval_returns) + logs["Eval_MinReturn"] = np.min(eval_returns) + logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens) + + logs["Train_AverageReturn"] = np.mean(train_returns) + logs["Train_StdReturn"] = np.std(train_returns) + logs["Train_MaxReturn"] = np.max(train_returns) + logs["Train_MinReturn"] = np.min(train_returns) + logs["Train_AverageEpLen"] = np.mean(train_ep_lens) + + logs["Train_EnvstepsSoFar"] = self.total_envsteps + logs["TimeSinceStart"] = time.time() - self.start_time + logs.update(last_log) + + if itr == 0: + self.initial_return = np.mean(train_returns) + logs["Initial_DataCollection_AverageReturn"] = self.initial_return + + # perform the logging + for key, value in logs.items(): + print('{} : {}'.format(key, value)) + self.logger.log_scalar(value, key, itr) + print('Done logging...\n\n') + + self.logger.flush() + diff --git a/hw2/cs285/infrastructure/utils.py b/hw2/cs285/infrastructure/utils.py new file mode 100644 index 00000000..b4cea2a0 --- /dev/null +++ b/hw2/cs285/infrastructure/utils.py @@ -0,0 +1,131 @@ +import numpy as np +import time +import copy + +############################################ +############################################ + +def calculate_mean_prediction_error(env, action_sequence, models, data_statistics): + + model = models[0] + + # true + true_states = perform_actions(env, action_sequence)['observation'] + + # predicted + ob = np.expand_dims(true_states[0],0) + pred_states = [] + for ac in action_sequence: + pred_states.append(ob) + action = np.expand_dims(ac,0) + ob = model.get_prediction(ob, action, data_statistics) + pred_states = np.squeeze(pred_states) + + # mpe + mpe = mean_squared_error(pred_states, true_states) + + return mpe, true_states, pred_states + +def perform_actions(env, actions): + ob = env.reset() + obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], [] + steps = 0 + for ac in actions: + obs.append(ob) + acs.append(ac) + ob, rew, done, _ = env.step(ac) + # add the observation after taking a step to next_obs + next_obs.append(ob) + rewards.append(rew) + steps += 1 + # If the episode ended, the corresponding terminal value is 1 + # otherwise, 
it is 0 + if done: + terminals.append(1) + break + else: + terminals.append(0) + + return Path(obs, image_obs, acs, rewards, next_obs, terminals) + +def mean_squared_error(a, b): + return np.mean((a-b)**2) + +############################################ +############################################ + +def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')): + # TODO: get this from hw1 + return Path(obs, image_obs, acs, rewards, next_obs, terminals) + +def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')): + # TODO: get this from hw1 + return paths, timesteps_this_batch + +def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')): + # TODO: get this from hw1 + return paths + +############################################ +############################################ + +def Path(obs, image_obs, acs, rewards, next_obs, terminals): + """ + Take info (separate arrays) from a single rollout + and return it in a single dictionary + """ + if image_obs != []: + image_obs = np.stack(image_obs, axis=0) + return {"observation" : np.array(obs, dtype=np.float32), + "image_obs" : np.array(image_obs, dtype=np.uint8), + "reward" : np.array(rewards, dtype=np.float32), + "action" : np.array(acs, dtype=np.float32), + "next_observation": np.array(next_obs, dtype=np.float32), + "terminal": np.array(terminals, dtype=np.float32)} + + +def convert_listofrollouts(paths): + """ + Take a list of rollout dictionaries + and return separate arrays, + where each array is a concatenation of that array from across the rollouts + """ + observations = np.concatenate([path["observation"] for path in paths]) + actions = np.concatenate([path["action"] for path in paths]) + next_observations = np.concatenate([path["next_observation"] for path in paths]) + terminals = np.concatenate([path["terminal"] for path in paths]) + concatenated_rewards = np.concatenate([path["reward"] for path in paths]) + unconcatenated_rewards = [path["reward"] for path in paths] + return observations, actions, next_observations, terminals, concatenated_rewards, unconcatenated_rewards + +############################################ +############################################ + +def get_pathlength(path): + return len(path["reward"]) + +def normalize(data, mean, std, eps=1e-8): + return (data-mean)/(std+eps) + +def unnormalize(data, mean, std): + return data*std+mean + +def add_noise(data_inp, noiseToSignal=0.01): + + data = copy.deepcopy(data_inp) #(num data points, dim) + + #mean of data + mean_data = np.mean(data, axis=0) + + #if mean is 0, + #make it 0.001 to avoid 0 issues later for dividing by std + mean_data[mean_data == 0] = 0.000001 + + #width of normal distribution to sample noise from + #larger magnitude number = could have larger magnitude noise + std_of_noise = mean_data * noiseToSignal + for j in range(mean_data.shape[0]): + data[:, j] = np.copy(data[:, j] + np.random.normal( + 0, np.absolute(std_of_noise[j]), (data.shape[0],))) + + return data \ No newline at end of file diff --git a/hw2/cs285/policies/MLP_policy.py b/hw2/cs285/policies/MLP_policy.py new file mode 100644 index 00000000..432210b7 --- /dev/null +++ b/hw2/cs285/policies/MLP_policy.py @@ -0,0 +1,174 @@ +import abc +import itertools +from torch import nn +from torch.nn import functional as F +from torch import optim + +import numpy as np +import torch +from torch import distributions + +from cs285.infrastructure import pytorch_util 
as ptu +from cs285.policies.base_policy import BasePolicy + + +class MLPPolicy(BasePolicy, nn.Module, metaclass=abc.ABCMeta): + + def __init__(self, + ac_dim, + ob_dim, + n_layers, + size, + discrete=False, + learning_rate=1e-4, + training=True, + nn_baseline=False, + **kwargs + ): + super().__init__(**kwargs) + + # init vars + self.ac_dim = ac_dim + self.ob_dim = ob_dim + self.n_layers = n_layers + self.discrete = discrete + self.size = size + self.learning_rate = learning_rate + self.training = training + self.nn_baseline = nn_baseline + + if self.discrete: + self.logits_na = ptu.build_mlp(input_size=self.ob_dim, + output_size=self.ac_dim, + n_layers=self.n_layers, + size=self.size) + self.logits_na.to(ptu.device) + self.mean_net = None + self.logstd = None + self.optimizer = optim.Adam(self.logits_na.parameters(), + self.learning_rate) + else: + self.logits_na = None + self.mean_net = ptu.build_mlp(input_size=self.ob_dim, + output_size=self.ac_dim, + n_layers=self.n_layers, size=self.size) + self.logstd = nn.Parameter( + torch.zeros(self.ac_dim, dtype=torch.float32, device=ptu.device) + ) + self.mean_net.to(ptu.device) + self.logstd.to(ptu.device) + self.optimizer = optim.Adam( + itertools.chain([self.logstd], self.mean_net.parameters()), + self.learning_rate + ) + + if nn_baseline: + self.baseline = ptu.build_mlp( + input_size=self.ob_dim, + output_size=1, + n_layers=self.n_layers, + size=self.size, + ) + self.baseline.to(ptu.device) + self.baseline_optimizer = optim.Adam( + self.baseline.parameters(), + self.learning_rate, + ) + else: + self.baseline = None + + ################################## + + def save(self, filepath): + torch.save(self.state_dict(), filepath) + + ################################## + + # query the policy with observation(s) to get selected action(s) + def get_action(self, obs: np.ndarray) -> np.ndarray: + # TODO: get this from hw1 + return action + + # update/train this policy + def update(self, observations, actions, **kwargs): + raise NotImplementedError + + # This function defines the forward pass of the network. + # You can return anything you want, but you should be able to differentiate + # through it. For example, you can return a torch.FloatTensor. You can also + # return more flexible objects, such as a + # `torch.distributions.Distribution` object. It's up to you! 
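    # (Illustrative sketch, not the required solution: with the networks built in
    # __init__ above, a discrete policy could return
    # `distributions.Categorical(logits=self.logits_na(observation))`, while a
    # continuous policy could return
    # `distributions.Normal(self.mean_net(observation), torch.exp(self.logstd))`.)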
+ def forward(self, observation: torch.FloatTensor): + # TODO: get this from hw1 + return action_distribution + + +##################################################### +##################################################### + +class MLPPolicyPG(MLPPolicy): + def __init__(self, ac_dim, ob_dim, n_layers, size, **kwargs): + + super().__init__(ac_dim, ob_dim, n_layers, size, **kwargs) + self.baseline_loss = nn.MSELoss() + + def update(self, observations, actions, advantages, q_values=None): + observations = ptu.from_numpy(observations) + actions = ptu.from_numpy(actions) + advantages = ptu.from_numpy(advantages) + + # TODO: compute the loss that should be optimized when training with policy gradient + # HINT1: Recall that the expression that we want to MAXIMIZE + # is the expectation over collected trajectories of: + # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]] + # HINT2: you will want to use the `log_prob` method on the distribution returned + # by the `forward` method + # HINT3: don't forget that `optimizer.step()` MINIMIZES a loss + + loss = TODO + + # TODO: optimize `loss` using `self.optimizer` + # HINT: remember to `zero_grad` first + TODO + + if self.nn_baseline: + ## TODO: normalize the q_values to have a mean of zero and a standard deviation of one + ## HINT: there is a `normalize` function in `infrastructure.utils` + targets = TODO + targets = ptu.from_numpy(targets) + + ## TODO: use the `forward` method of `self.baseline` to get baseline predictions + baseline_predictions = TODO + + ## avoid any subtle broadcasting bugs that can arise when dealing with arrays of shape + ## [ N ] versus shape [ N x 1 ] + ## HINT: you can use `squeeze` on torch tensors to remove dimensions of size 1 + assert baseline_predictions.shape == targets.shape + + # TODO: compute the loss that should be optimized for training the baseline MLP (`self.baseline`) + # HINT: use `F.mse_loss` + baseline_loss = TODO + + # TODO: optimize `baseline_loss` using `self.baseline_optimizer` + # HINT: remember to `zero_grad` first + TODO + + train_log = { + 'Training Loss': ptu.to_numpy(loss), + } + return train_log + + def run_baseline_prediction(self, obs): + """ + Helper function that converts `obs` to a tensor, + calls the forward method of the baseline MLP, + and returns a np array + + Input: `obs`: np.ndarray of size [N, 1] + Output: np.ndarray of size [N] + + """ + obs = ptu.from_numpy(obs) + predictions = self.baseline(obs) + return ptu.to_numpy(predictions)[:, 0] + diff --git a/hw2/cs285/policies/__init__.py b/hw2/cs285/policies/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hw2/cs285/policies/base_policy.py b/hw2/cs285/policies/base_policy.py new file mode 100644 index 00000000..e089540a --- /dev/null +++ b/hw2/cs285/policies/base_policy.py @@ -0,0 +1,14 @@ +import abc +import numpy as np + + +class BasePolicy(object, metaclass=abc.ABCMeta): + def get_action(self, obs: np.ndarray) -> np.ndarray: + raise NotImplementedError + + def update(self, obs: np.ndarray, acs: np.ndarray, **kwargs) -> dict: + """Return a dictionary of logging information.""" + raise NotImplementedError + + def save(self, filepath: str): + raise NotImplementedError diff --git a/hw2/cs285/scripts/read_results.py b/hw2/cs285/scripts/read_results.py new file mode 100644 index 00000000..3a5bc50f --- /dev/null +++ b/hw2/cs285/scripts/read_results.py @@ -0,0 +1,26 @@ +import glob +import tensorflow as tf + +def get_section_results(file): + """ + requires tensorflow==1.12.0 + """ + X = [] + Y = [] + for e in 
tf.train.summary_iterator(file): + for v in e.summary.value: + if v.tag == 'Train_EnvstepsSoFar': + X.append(v.simple_value) + elif v.tag == 'Eval_AverageReturn': + Y.append(v.simple_value) + return X, Y + +if __name__ == '__main__': + import glob + + logdir = 'data/q1_lb_rtg_na_CartPole-v0_13-09-2020_23-32-10/events*' + eventfile = glob.glob(logdir)[0] + + X, Y = get_section_results(eventfile) + for i, (x, y) in enumerate(zip(X, Y)): + print('Iteration {:d} | Train steps: {:d} | Return: {}'.format(i, int(x), y)) \ No newline at end of file diff --git a/hw2/cs285/scripts/run_hw2.ipynb b/hw2/cs285/scripts/run_hw2.ipynb new file mode 100644 index 00000000..b61f0413 --- /dev/null +++ b/hw2/cs285/scripts/run_hw2.ipynb @@ -0,0 +1,560 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "run_hw2.ipynb", + "provenance": [], + "collapsed_sections": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Gwo9bpaVgxXF", + "colab_type": "text" + }, + "source": [ + "##Setup\n", + "\n", + "You will need to make a copy of this notebook in your Google Drive before you can edit the homework files. You can do so with **File → Save a copy in Drive**." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "6CAdiyTKi4Se", + "colab_type": "code", + "cellView": "form", + "colab": {} + }, + "source": [ + "#@title mount your Google Drive\n", + "#@markdown Your work will be stored in a folder called `cs285_f2020` by default to prevent Colab instance timeouts from deleting your edits.\n", + "\n", + "import os\n", + "from google.colab import drive\n", + "drive.mount('/content/gdrive')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "BKE5nA1Fgwwy", + "colab_type": "code", + "cellView": "form", + "colab": {} + }, + "source": [ + "#@title set up mount symlink\n", + "\n", + "DRIVE_PATH = '/content/gdrive/My\\ Drive/cs285_f2020'\n", + "DRIVE_PYTHON_PATH = DRIVE_PATH.replace('\\\\', '')\n", + "if not os.path.exists(DRIVE_PYTHON_PATH):\n", + " %mkdir $DRIVE_PATH\n", + "\n", + "## the space in `My Drive` causes some issues,\n", + "## make a symlink to avoid this\n", + "SYM_PATH = '/content/cs285_f2020'\n", + "if not os.path.exists(SYM_PATH):\n", + " !ln -s $DRIVE_PATH $SYM_PATH" + ], + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "9FGK4kbpg3iP", + "colab_type": "code", + "cellView": "form", + "colab": {} + }, + "source": [ + "#@title apt install requirements\n", + "\n", + "#@markdown Run each section with Shift+Enter\n", + "\n", + "#@markdown Double-click on section headers to show code.\n", + "\n", + "!apt update \n", + "!apt install -y --no-install-recommends \\\n", + " build-essential \\\n", + " curl \\\n", + " git \\\n", + " gnupg2 \\\n", + " make \\\n", + " cmake \\\n", + " ffmpeg \\\n", + " swig \\\n", + " libz-dev \\\n", + " unzip \\\n", + " zlib1g-dev \\\n", + " libglfw3 \\\n", + " libglfw3-dev \\\n", + " libxrandr2 \\\n", + " libxinerama-dev \\\n", + " libxi6 \\\n", + " libxcursor-dev \\\n", + " libgl1-mesa-dev \\\n", + " libgl1-mesa-glx \\\n", + " libglew-dev \\\n", + " libosmesa6-dev \\\n", + " lsb-release \\\n", + " ack-grep \\\n", + " patchelf \\\n", + " wget \\\n", + " xpra \\\n", + " xserver-xorg-dev \\\n", + " xvfb \\\n", + " python-opengl \\\n", + " ffmpeg > /dev/null 2>&1" + ], + "execution_count": null, + "outputs": [] + 
}, + { + "cell_type": "code", + "metadata": { + "id": "YNGuuABeg99q", + "colab_type": "code", + "cellView": "form", + "colab": {} + }, + "source": [ + "#@title download mujoco\n", + "\n", + "MJC_PATH = '{}/mujoco'.format(SYM_PATH)\n", + "if not os.path.exists(MJC_PATH):\n", + " %mkdir $MJC_PATH\n", + "%cd $MJC_PATH\n", + "if not os.path.exists(os.path.join(MJC_PATH, 'mujoco200')):\n", + " !wget -q https://www.roboti.us/download/mujoco200_linux.zip\n", + " !unzip -q mujoco200_linux.zip\n", + " %mv mujoco200_linux mujoco200\n", + " %rm mujoco200_linux.zip" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "y0MiuTJ4hT5z", + "colab_type": "code", + "cellView": "form", + "colab": {} + }, + "source": [ + "#@title update mujoco paths\n", + "\n", + "import os\n", + "\n", + "os.environ['LD_LIBRARY_PATH'] += ':{}/mujoco200/bin'.format(MJC_PATH)\n", + "os.environ['MUJOCO_PY_MUJOCO_PATH'] = '{}/mujoco200'.format(MJC_PATH)\n", + "os.environ['MUJOCO_PY_MJKEY_PATH'] = '{}/mjkey.txt'.format(MJC_PATH)\n", + "\n", + "## installation on colab does not find *.so files\n", + "## in LD_LIBRARY_PATH, copy over manually instead\n", + "!cp $MJC_PATH/mujoco200/bin/*.so /usr/lib/x86_64-linux-gnu/" + ], + "execution_count": 6, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Xd-g5Z7xhWVt", + "colab_type": "text" + }, + "source": [ + "Ensure your `mjkey.txt` is in /content/cs285_f2020/mujoco before this step" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "-p6i5TqAhW4a", + "colab_type": "code", + "cellView": "form", + "colab": {} + }, + "source": [ + "#@title clone and install mujoco-py\n", + "\n", + "%cd $MJC_PATH\n", + "if not os.path.exists('mujoco-py'):\n", + " !git clone https://github.com/openai/mujoco-py.git\n", + "%cd mujoco-py\n", + "%pip install -e .\n", + "\n", + "## cythonize at the first import\n", + "import mujoco_py" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "GQvbeuV1hi5I", + "colab_type": "code", + "cellView": "form", + "colab": {} + }, + "source": [ + "#@title clone homework repo\n", + "#@markdown Note that this is the same codebase from homework 1,\n", + "#@markdown so you may need to move your old `homework_fall2020`\n", + "#@markdown folder in order to clone the repo again.\n", + "\n", + "#@markdown **Don't delete your old work though!**\n", + "#@markdown You will need it for this assignment.\n", + "\n", + "%cd $SYM_PATH\n", + "!git clone https://github.com/berkeleydeeprlcourse/homework_fall2020.git\n", + "\n", + "%cd homework_fall2020/hw2\n", + "%pip install -r requirements_colab.txt -f https://download.pytorch.org/whl/torch_stable.html\n", + "%pip install -e ." 
+ ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "noinfUbHiHW2", + "colab_type": "code", + "cellView": "form", + "colab": {} + }, + "source": [ + "#@title set up virtual display\n", + "\n", + "from pyvirtualdisplay import Display\n", + "\n", + "display = Display(visible=0, size=(1400, 900))\n", + "display.start()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "COqsZLeliU9Y", + "colab_type": "code", + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 438 + }, + "outputId": "55f7feb0-e730-4789-c73d-3ec695b48757" + }, + "source": [ + "#@title test virtual display\n", + "\n", + "#@markdown If you see a video of a four-legged ant fumbling about, setup is complete!\n", + "\n", + "import gym\n", + "import matplotlib\n", + "matplotlib.use('Agg')\n", + "from cs285.infrastructure.colab_utils import (\n", + " wrap_env,\n", + " show_video\n", + ")\n", + "\n", + "env = wrap_env(gym.make(\"Ant-v2\"))\n", + "\n", + "observation = env.reset()\n", + "for i in range(100):\n", + " env.render(mode='rgb_array')\n", + " obs, rew, term, _ = env.step(env.action_space.sample() ) \n", + " if term:\n", + " break;\n", + " \n", + "env.close()\n", + "print('Loading video...')\n", + "show_video()" + ], + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Loading video...\n" + ], + "name": "stdout" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ygs968BbiYHr", + "colab_type": "text" + }, + "source": [ + "## Editing Code\n", + "\n", + "To edit code, click the folder icon on the left menu. Navigate to the corresponding file (`cs285_f2020/...`). Double click a file to open an editor. There is a timeout of about ~12 hours with Colab while it is active (and less if you close your browser window). We sync your edits to Google Drive so that you won't lose your work in the event of an instance timeout, but you will need to re-mount your Google Drive and re-install packages with every new instance." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9qUmV93fif6S", + "colab_type": "text" + }, + "source": [ + "## Run Policy Gradients" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "lN-gZkqiijnR", + "colab_type": "code", + "cellView": "form", + "colab": {} + }, + "source": [ + "#@title imports\n", + "\n", + "import os\n", + "import time\n", + "\n", + "from cs285.infrastructure.rl_trainer import RL_Trainer\n", + "from cs285.agents.pg_agent import PGAgent\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 2" + ], + "execution_count": 12, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Q6NaOWhOinnU", + "colab_type": "code", + "cellView": "both", + "colab": {} + }, + "source": [ + "#@title runtime arguments\n", + "\n", + "class Args:\n", + "\n", + " def __getitem__(self, key):\n", + " return getattr(self, key)\n", + "\n", + " def __setitem__(self, key, val):\n", + " setattr(self, key, val)\n", + "\n", + " def __contains__(self, key):\n", + " return hasattr(self, key)\n", + "\n", + " env_name = 'CartPole-v0' #@param\n", + " exp_name = 'q1_sb_rtg_na' #@param\n", + "\n", + " #@markdown main parameters of interest\n", + " n_iter = 100 #@param {type: \"integer\"}\n", + "\n", + " ## PDF will tell you how to set ep_len\n", + " ## and discount for each environment\n", + " ep_len = 200 #@param {type: \"integer\"}\n", + " discount = 0.95 #@param {type: \"number\"}\n", + "\n", + " reward_to_go = True #@param {type: \"boolean\"}\n", + " nn_baseline = False #@param {type: \"boolean\"}\n", + " dont_standardize_advantages = False #@param {type: \"boolean\"}\n", + "\n", + " #@markdown batches and steps\n", + " batch_size = 1000 #@param {type: \"integer\"}\n", + " eval_batch_size = 400 #@param {type: \"integer\"}\n", + "\n", + " num_agent_train_steps_per_iter = 1 #@param {type: \"integer\"}\n", + " learning_rate = 5e-3 #@param {type: \"number\"}\n", + "\n", + " #@markdown MLP parameters\n", + " n_layers = 2 #@param {type: \"integer\"}\n", + " size = 64 #@param {type: \"integer\"}\n", + "\n", + " #@markdown system\n", + " save_params = False #@param {type: \"boolean\"}\n", + " no_gpu = False #@param {type: \"boolean\"}\n", + " which_gpu = 0 #@param {type: \"integer\"}\n", + " seed = 1 #@param {type: \"integer\"}\n", + "\n", + " #@markdown logging\n", + " ## default is to not log video so\n", + " ## that logs are small enough to be\n", + " ## uploaded to gradscope\n", + " video_log_freq = -1#@param {type: \"integer\"}\n", + " scalar_log_freq = 1#@param {type: \"integer\"}\n", + "\n", + "\n", + "args = Args()\n", + "\n", + "## ensure compatibility with hw1 code\n", + "args['train_batch_size'] = args['batch_size']\n", + "\n", + "if args['video_log_freq'] > 0:\n", + " import warnings\n", + " warnings.warn(\n", + " '''\\nLogging videos will make eventfiles too'''\n", + " '''\\nlarge for the autograder. 
Set video_log_freq = -1'''\n", + " '''\\nfor the runs you intend to submit.''')" + ], + "execution_count": 22, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "eScWwHhnsYkd", + "colab_type": "code", + "cellView": "form", + "colab": {} + }, + "source": [ + "#@title create directory for logging\n", + "\n", + "data_path = '''/content/cs285_f2020/''' \\\n", + " '''homework_fall2020/hw2/data'''\n", + "\n", + "if not (os.path.exists(data_path)):\n", + " os.makedirs(data_path)\n", + "\n", + "logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime(\"%d-%m-%Y_%H-%M-%S\")\n", + "logdir = os.path.join(data_path, logdir)\n", + "args['logdir'] = logdir\n", + "if not(os.path.exists(logdir)):\n", + " os.makedirs(logdir)" + ], + "execution_count": 23, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "aljzrLdAsvNu", + "colab_type": "code", + "colab": {} + }, + "source": [ + "## define policy gradient trainer\n", + "\n", + "class PG_Trainer(object):\n", + "\n", + " def __init__(self, params):\n", + "\n", + " #####################\n", + " ## SET AGENT PARAMS\n", + " #####################\n", + "\n", + " computation_graph_args = {\n", + " 'n_layers': params['n_layers'],\n", + " 'size': params['size'],\n", + " 'learning_rate': params['learning_rate'],\n", + " }\n", + "\n", + " estimate_advantage_args = {\n", + " 'gamma': params['discount'],\n", + " 'standardize_advantages': not(params['dont_standardize_advantages']),\n", + " 'reward_to_go': params['reward_to_go'],\n", + " 'nn_baseline': params['nn_baseline'],\n", + " }\n", + "\n", + " train_args = {\n", + " 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],\n", + " }\n", + "\n", + " agent_params = {**computation_graph_args, **estimate_advantage_args, **train_args}\n", + "\n", + " self.params = params\n", + " self.params['agent_class'] = PGAgent\n", + " self.params['agent_params'] = agent_params\n", + " self.params['batch_size_initial'] = self.params['batch_size']\n", + "\n", + " ################\n", + " ## RL TRAINER\n", + " ################\n", + "\n", + " self.rl_trainer = RL_Trainer(self.params)\n", + "\n", + " def run_training_loop(self):\n", + "\n", + " self.rl_trainer.run_training_loop(\n", + " self.params['n_iter'],\n", + " collect_policy = self.rl_trainer.agent.actor,\n", + " eval_policy = self.rl_trainer.agent.actor,\n", + " )" + ], + "execution_count": 24, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "j2rCuQsRsd3N", + "colab_type": "code", + "colab": {} + }, + "source": [ + "## run training\n", + "\n", + "print(args.logdir)\n", + "trainer = PG_Trainer(args)\n", + "trainer.run_training_loop()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "km7LlYvhqKTl", + "colab_type": "code", + "colab": {} + }, + "source": [ + "#@markdown You can visualize your runs with tensorboard from within the notebook\n", + "\n", + "## requires tensorflow==2.3.0\n", + "%load_ext tensorboard\n", + "%tensorboard --logdir /content/cs285_f2020/homework_fall2020/hw2/data" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/hw2/cs285/scripts/run_hw2.py b/hw2/cs285/scripts/run_hw2.py new file mode 100644 index 00000000..527eadd8 --- /dev/null +++ b/hw2/cs285/scripts/run_hw2.py @@ -0,0 +1,115 @@ +import os +import time + +from cs285.infrastructure.rl_trainer import RL_Trainer +from cs285.agents.pg_agent import PGAgent + +class PG_Trainer(object): + + def __init__(self, 
params): + + ##################### + ## SET AGENT PARAMS + ##################### + + computation_graph_args = { + 'n_layers': params['n_layers'], + 'size': params['size'], + 'learning_rate': params['learning_rate'], + } + + estimate_advantage_args = { + 'gamma': params['discount'], + 'standardize_advantages': not(params['dont_standardize_advantages']), + 'reward_to_go': params['reward_to_go'], + 'nn_baseline': params['nn_baseline'], + } + + train_args = { + 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'], + } + + agent_params = {**computation_graph_args, **estimate_advantage_args, **train_args} + + self.params = params + self.params['agent_class'] = PGAgent + self.params['agent_params'] = agent_params + self.params['batch_size_initial'] = self.params['batch_size'] + + ################ + ## RL TRAINER + ################ + + self.rl_trainer = RL_Trainer(self.params) + + def run_training_loop(self): + + self.rl_trainer.run_training_loop( + self.params['n_iter'], + collect_policy = self.rl_trainer.agent.actor, + eval_policy = self.rl_trainer.agent.actor, + ) + + +def main(): + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--env_name', type=str) + parser.add_argument('--exp_name', type=str, default='todo') + parser.add_argument('--n_iter', '-n', type=int, default=200) + + parser.add_argument('--reward_to_go', '-rtg', action='http://github.com/store_true') + parser.add_argument('--nn_baseline', action='http://github.com/store_true') + parser.add_argument('--dont_standardize_advantages', '-dsa', action='http://github.com/store_true') + parser.add_argument('--batch_size', '-b', type=int, default=1000) #steps collected per train iteration + parser.add_argument('--eval_batch_size', '-eb', type=int, default=400) #steps collected per eval iteration + + parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1) + parser.add_argument('--discount', type=float, default=1.0) + parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3) + parser.add_argument('--n_layers', '-l', type=int, default=2) + parser.add_argument('--size', '-s', type=int, default=64) + + parser.add_argument('--ep_len', type=int) #students shouldn't change this away from env's default + parser.add_argument('--seed', type=int, default=1) + parser.add_argument('--no_gpu', '-ngpu', action='http://github.com/store_true') + parser.add_argument('--which_gpu', '-gpu_id', default=0) + parser.add_argument('--video_log_freq', type=int, default=-1) + parser.add_argument('--scalar_log_freq', type=int, default=1) + + parser.add_argument('--save_params', action='http://github.com/store_true') + + args = parser.parse_args() + + # convert to dictionary + params = vars(args) + + ## ensure compatibility with hw1 code + params['train_batch_size'] = params['batch_size'] + + ################################## + ### CREATE DIRECTORY FOR LOGGING + ################################## + + data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../data') + + if not (os.path.exists(data_path)): + os.makedirs(data_path) + + logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") + logdir = os.path.join(data_path, logdir) + params['logdir'] = logdir + if not(os.path.exists(logdir)): + os.makedirs(logdir) + + ################### + ### RUN TRAINING + ################### + + trainer = PG_Trainer(params) + trainer.run_training_loop() + + +if __name__ == "__main__": + main() diff --git 
a/hw2/cs285_hw2.pdf b/hw2/cs285_hw2.pdf new file mode 100644 index 00000000..082d9d9f Binary files /dev/null and b/hw2/cs285_hw2.pdf differ diff --git a/hw2/requirements.txt b/hw2/requirements.txt new file mode 100644 index 00000000..024936df --- /dev/null +++ b/hw2/requirements.txt @@ -0,0 +1,12 @@ +torch==1.5.1 +gym==0.17.2 +mujoco-py==2.0.2.2 +tensorboard==2.3.0 +tensorboardX==1.8 +matplotlib==2.2.2 +ipython==6.4.0 +moviepy==1.0.0 +pyvirtualdisplay==1.3.2 +ipdb==0.13.3 +box2d-py +tensorflow==1.12.0 \ No newline at end of file diff --git a/hw2/requirements_colab.txt b/hw2/requirements_colab.txt new file mode 100644 index 00000000..0315e55d --- /dev/null +++ b/hw2/requirements_colab.txt @@ -0,0 +1,11 @@ +torch==1.5.1+cu101 +gym==0.17.2 +tensorboard==2.3.0 +tensorboardX==1.8 +matplotlib==2.2.2 +ipython==6.4.0 +moviepy==1.0.0 +pyvirtualdisplay==1.3.2 +ipdb==0.13.3 +box2d-py +tensorflow==2.3.0 \ No newline at end of file diff --git a/hw2/setup.py b/hw2/setup.py new file mode 100644 index 00000000..3cc1886e --- /dev/null +++ b/hw2/setup.py @@ -0,0 +1,8 @@ +# setup.py +from setuptools import setup + +setup( + name='cs285', + version='0.1.0', + packages=['cs285'], +) \ No newline at end of file diff --git a/hw3/README.md b/hw3/README.md new file mode 100644 index 00000000..b6a7c9ce --- /dev/null +++ b/hw3/README.md @@ -0,0 +1,36 @@ +## Setup + +You can run this code on your own machine or on Google Colab. + +1. **Local option:** If you choose to run locally, you will need to install MuJoCo and some Python packages; see [installation.md](../hw1/installation.md) from homework 1 for instructions. There are two new package requirements (`opencv-python` and `gym[atari]`) beyond what was used in the previous assignments; make sure to install these with `pip install -r requirements.txt` if you are running the assignment locally. + +2. **Colab:** The first few sections of the notebook will install all required dependencies. You can try out the Colab option by clicking the badges below: + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw3/cs285/scripts/run_hw3_dqn.ipynb) **Part I (Q-learning)** + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw3/cs285/scripts/run_hw3_actor_critic.ipynb) **Part II (Actor-critic)** + +## Complete the code + +The following files have blanks to be filled with your solutions from homework 1. The relevant sections are marked with `TODO: get this from hw1 or hw2`. + +- [infrastructure/rl_trainer.py](cs285/infrastructure/rl_trainer.py) +- [infrastructure/utils.py](cs285/infrastructure/utils.py) +- [policies/MLP_policy.py](cs285/policies/MLP_policy.py) + +You will then need to implement new routines in the following files for homework 3 part 1 (Q-learning): +- [agents/dqn_agent.py](cs285/agents/dqn_agent.py) +- [critics/dqn_critic.py](cs285/critics/dqn_critic.py) +- [policies/argmax_policy.py](cs285/policies/argmax_policy.py) + +and in the following files for part 2 (actor-critic): +- [agents/ac_agent.py](cs285/agents/ac_agent.py) +- [critics/bootstrapped_continuous_critic.py](cs285/critics/bootstrapped_continuous_critic.py) +- [policies/MLP_policy.py](cs285/policies/MLP_policy.py) + +The relevant sections are marked with `TODO`. 
+ +You may also want to look through [run_hw3_dqn.py](cs285/scripts/run_hw3_dqn.py) and [run_hw3_actor_critic.py](cs285/scripts/run_hw3_actor_critic.py) (if running locally) or [run_hw3_dqn.ipynb](cs285/scripts/run_hw3_dqn.ipynb) and [run_hw3_actor_critic.ipynb](cs285/scripts/run_hw3_actor_critic.ipynb) (if running on Colab), though you will not need to edit this files beyond changing runtime arguments in the Colab notebook. + +See the [assignment PDF](cs285_hw3.pdf) for more details on what files to edit. + diff --git a/hw3/cs285/agents/ac_agent.py b/hw3/cs285/agents/ac_agent.py new file mode 100644 index 00000000..8aa4ff98 --- /dev/null +++ b/hw3/cs285/agents/ac_agent.py @@ -0,0 +1,66 @@ +from collections import OrderedDict + +from cs285.critics.bootstrapped_continuous_critic import \ + BootstrappedContinuousCritic +from cs285.infrastructure.replay_buffer import ReplayBuffer +from cs285.infrastructure.utils import * +from cs285.policies.MLP_policy import MLPPolicyAC +from .base_agent import BaseAgent + + +class ACAgent(BaseAgent): + def __init__(self, env, agent_params): + super(ACAgent, self).__init__() + + self.env = env + self.agent_params = agent_params + + self.gamma = self.agent_params['gamma'] + self.standardize_advantages = self.agent_params['standardize_advantages'] + + self.actor = MLPPolicyAC( + self.agent_params['ac_dim'], + self.agent_params['ob_dim'], + self.agent_params['n_layers'], + self.agent_params['size'], + self.agent_params['discrete'], + self.agent_params['learning_rate'], + ) + self.critic = BootstrappedContinuousCritic(self.agent_params) + + self.replay_buffer = ReplayBuffer() + + def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): + # TODO Implement the following pseudocode: + # for agent_params['num_critic_updates_per_agent_update'] steps, + # update the critic + + # advantage = estimate_advantage(...) 
+ + # for agent_params['num_actor_updates_per_agent_update'] steps, + # update the actor + + loss = OrderedDict() + loss['Critic_Loss'] = TODO + loss['Actor_Loss'] = TODO + + return loss + + def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): + # TODO Implement the following pseudocode: + # 1) query the critic with ob_no, to get V(s) + # 2) query the critic with next_ob_no, to get V(s') + # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s') + # HINT: Remember to cut off the V(s') term (ie set it to 0) at terminal states (ie terminal_n=1) + # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s) + adv_n = TODO + + if self.standardize_advantages: + adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) + return adv_n + + def add_to_replay_buffer(self, paths): + self.replay_buffer.add_rollouts(paths) + + def sample(self, batch_size): + return self.replay_buffer.sample_recent_data(batch_size) diff --git a/hw3/cs285/agents/base_agent.py b/hw3/cs285/agents/base_agent.py new file mode 100644 index 00000000..a32224b5 --- /dev/null +++ b/hw3/cs285/agents/base_agent.py @@ -0,0 +1,16 @@ +class BaseAgent(object): + def __init__(self, **kwargs): + super(BaseAgent, self).__init__(**kwargs) + + def train(self) -> dict: + """Return a dictionary of logging information.""" + raise NotImplementedError + + def add_to_replay_buffer(self, paths): + raise NotImplementedError + + def sample(self, batch_size): + raise NotImplementedError + + def save(self, path): + raise NotImplementedError \ No newline at end of file diff --git a/hw3/cs285/agents/dqn_agent.py b/hw3/cs285/agents/dqn_agent.py new file mode 100644 index 00000000..8b071ff8 --- /dev/null +++ b/hw3/cs285/agents/dqn_agent.py @@ -0,0 +1,107 @@ +import numpy as np + +from cs285.infrastructure.dqn_utils import MemoryOptimizedReplayBuffer, PiecewiseSchedule +from cs285.policies.argmax_policy import ArgMaxPolicy +from cs285.critics.dqn_critic import DQNCritic + + +class DQNAgent(object): + def __init__(self, env, agent_params): + + self.env = env + self.agent_params = agent_params + self.batch_size = agent_params['batch_size'] + # import ipdb; ipdb.set_trace() + self.last_obs = self.env.reset() + + self.num_actions = agent_params['ac_dim'] + self.learning_starts = agent_params['learning_starts'] + self.learning_freq = agent_params['learning_freq'] + self.target_update_freq = agent_params['target_update_freq'] + + self.replay_buffer_idx = None + self.exploration = agent_params['exploration_schedule'] + self.optimizer_spec = agent_params['optimizer_spec'] + + self.critic = DQNCritic(agent_params, self.optimizer_spec) + self.actor = ArgMaxPolicy(self.critic) + + lander = agent_params['env_name'].startswith('LunarLander') + self.replay_buffer = MemoryOptimizedReplayBuffer( + agent_params['replay_buffer_size'], agent_params['frame_history_len'], lander=lander) + self.t = 0 + self.num_param_updates = 0 + + def add_to_replay_buffer(self, paths): + pass + + def step_env(self): + """ + Step the env and store the transition + At the end of this block of code, the simulator should have been + advanced one step, and the replay buffer should contain one more transition. + Note that self.last_obs must always point to the new latest observation. 
+ """ + + # TODO store the latest observation ("frame") into the replay buffer + # HINT: the replay buffer used here is `MemoryOptimizedReplayBuffer` + # in dqn_utils.py + self.replay_buffer_idx = TODO + + eps = self.exploration.value(self.t) + + # TODO use epsilon greedy exploration when selecting action + perform_random_action = TODO + if perform_random_action: + # HINT: take random action + # with probability eps (see np.random.random()) + # OR if your current step number (see self.t) is less that self.learning_starts + action = TODO + else: + # HINT: Your actor will take in multiple previous observations ("frames") in order + # to deal with the partial observability of the environment. Get the most recent + # `frame_history_len` observations using functionality from the replay buffer, + # and then use those observations as input to your actor. + action = TODO + + # TODO take a step in the environment using the action from the policy + # HINT1: remember that self.last_obs must always point to the newest/latest observation + # HINT2: remember the following useful function that you've seen before: + #obs, reward, done, info = env.step(action) + TODO + + # TODO store the result of taking this action into the replay buffer + # HINT1: see your replay buffer's `store_effect` function + # HINT2: one of the arguments you'll need to pass in is self.replay_buffer_idx from above + TODO + + # TODO if taking this step resulted in done, reset the env (and the latest observation) + TODO + + def sample(self, batch_size): + if self.replay_buffer.can_sample(self.batch_size): + return self.replay_buffer.sample(batch_size) + else: + return [],[],[],[],[] + + def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): + log = {} + if (self.t > self.learning_starts + and self.t % self.learning_freq == 0 + and self.replay_buffer.can_sample(self.batch_size) + ): + + # TODO fill in the call to the update function using the appropriate tensors + log = self.critic.update( + TODO + ) + + # TODO update the target network periodically + # HINT: your critic already has this functionality implemented + if self.num_param_updates % self.target_update_freq == 0: + TODO + + self.num_param_updates += 1 + + self.t += 1 + return log diff --git a/hw3/cs285/critics/__init__.py b/hw3/cs285/critics/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/hw3/cs285/critics/__init__.py @@ -0,0 +1 @@ + diff --git a/hw3/cs285/critics/base_critic.py b/hw3/cs285/critics/base_critic.py new file mode 100644 index 00000000..36308dba --- /dev/null +++ b/hw3/cs285/critics/base_critic.py @@ -0,0 +1,3 @@ +class BaseCritic(object): + def update(self, ob_no, ac_na, next_ob_no, re_n, terminal_n): + raise NotImplementedError diff --git a/hw3/cs285/critics/bootstrapped_continuous_critic.py b/hw3/cs285/critics/bootstrapped_continuous_critic.py new file mode 100644 index 00000000..b410eff6 --- /dev/null +++ b/hw3/cs285/critics/bootstrapped_continuous_critic.py @@ -0,0 +1,89 @@ +from .base_critic import BaseCritic +from torch import nn +from torch import optim + +from cs285.infrastructure import pytorch_util as ptu + + +class BootstrappedContinuousCritic(nn.Module, BaseCritic): + """ + Notes on notation: + + Prefixes and suffixes: + ob - observation + ac - action + _no - this tensor should have shape (batch self.size /n/, observation dim) + _na - this tensor should have shape (batch self.size /n/, action dim) + _n - this tensor should have shape (batch self.size /n/) + + Note: batch self.size /n/ is defined at runtime. 
+ is None + """ + def __init__(self, hparams): + super().__init__() + self.ob_dim = hparams['ob_dim'] + self.ac_dim = hparams['ac_dim'] + self.discrete = hparams['discrete'] + self.size = hparams['size'] + self.n_layers = hparams['n_layers'] + self.learning_rate = hparams['learning_rate'] + + # critic parameters + self.num_target_updates = hparams['num_target_updates'] + self.num_grad_steps_per_target_update = hparams['num_grad_steps_per_target_update'] + self.gamma = hparams['gamma'] + self.critic_network = ptu.build_mlp( + self.ob_dim, + 1, + n_layers=self.n_layers, + size=self.size, + ) + self.critic_network.to(ptu.device) + self.loss = nn.MSELoss() + self.optimizer = optim.Adam( + self.critic_network.parameters(), + self.learning_rate, + ) + + def forward(self, obs): + return self.critic_network(obs).squeeze(1) + + def forward_np(self, obs): + obs = ptu.from_numpy(obs) + predictions = self(obs) + return ptu.to_numpy(predictions) + + def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n): + """ + Update the parameters of the critic. + + let sum_of_path_lengths be the sum of the lengths of the paths sampled from + Agent.sample_trajectories + let num_paths be the number of paths sampled from Agent.sample_trajectories + + arguments: + ob_no: shape: (sum_of_path_lengths, ob_dim) + next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward + reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing + the reward for each timestep + terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended + at that timestep of 0 if the episode did not end + + returns: + training loss + """ + # TODO: Implement the pseudocode below: do the following ( + # self.num_grad_steps_per_target_update * self.num_target_updates) + # times: + # every self.num_grad_steps_per_target_update steps (which includes the + # first step), recompute the target values by + # a) calculating V(s') by querying the critic with next_ob_no + # b) and computing the target values as r(s, a) + gamma * V(s') + # every time, update this critic using the observations and targets + # + # HINT: don't forget to use terminal_n to cut off the V(s') (ie set it + # to 0) when a terminal state is reached + # HINT: make sure to squeeze the output of the critic_network to ensure + # that its dimensions match the reward + + return loss.item() diff --git a/hw3/cs285/critics/dqn_critic.py b/hw3/cs285/critics/dqn_critic.py new file mode 100644 index 00000000..5ff5e9df --- /dev/null +++ b/hw3/cs285/critics/dqn_critic.py @@ -0,0 +1,107 @@ +from .base_critic import BaseCritic +import torch +import torch.optim as optim +from torch.nn import utils +from torch import nn + +from cs285.infrastructure import pytorch_util as ptu + + +class DQNCritic(BaseCritic): + + def __init__(self, hparams, optimizer_spec, **kwargs): + super().__init__(**kwargs) + self.env_name = hparams['env_name'] + self.ob_dim = hparams['ob_dim'] + + if isinstance(self.ob_dim, int): + self.input_shape = (self.ob_dim,) + else: + self.input_shape = hparams['input_shape'] + + self.ac_dim = hparams['ac_dim'] + self.double_q = hparams['double_q'] + self.grad_norm_clipping = hparams['grad_norm_clipping'] + self.gamma = hparams['gamma'] + + self.optimizer_spec = optimizer_spec + network_initializer = hparams['q_func'] + self.q_net = network_initializer(self.ob_dim, self.ac_dim) + self.q_net_target = network_initializer(self.ob_dim, self.ac_dim) + self.optimizer = 
self.optimizer_spec.constructor( + self.q_net.parameters(), + **self.optimizer_spec.optim_kwargs + ) + self.learning_rate_scheduler = optim.lr_scheduler.LambdaLR( + self.optimizer, + self.optimizer_spec.learning_rate_schedule, + ) + self.loss = nn.SmoothL1Loss() # AKA Huber loss + self.q_net.to(ptu.device) + self.q_net_target.to(ptu.device) + + def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n): + """ + Update the parameters of the critic. + let sum_of_path_lengths be the sum of the lengths of the paths sampled from + Agent.sample_trajectories + let num_paths be the number of paths sampled from Agent.sample_trajectories + arguments: + ob_no: shape: (sum_of_path_lengths, ob_dim) + next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward + reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing + the reward for each timestep + terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended + at that timestep of 0 if the episode did not end + returns: + nothing + """ + ob_no = ptu.from_numpy(ob_no) + ac_na = ptu.from_numpy(ac_na).to(torch.long) + next_ob_no = ptu.from_numpy(next_ob_no) + reward_n = ptu.from_numpy(reward_n) + terminal_n = ptu.from_numpy(terminal_n) + + qa_t_values = self.q_net(ob_no) + q_t_values = torch.gather(qa_t_values, 1, ac_na.unsqueeze(1)).squeeze(1) + + # TODO compute the Q-values from the target network + qa_tp1_values = TODO + + if self.double_q: + # You must fill this part for Q2 of the Q-learning portion of the homework. + # In double Q-learning, the best action is selected using the Q-network that + # is being updated, but the Q-value for this action is obtained from the + # target Q-network. See page 5 of https://arxiv.org/pdf/1509.06461.pdf for more details. + TODO + else: + q_tp1, _ = qa_tp1_values.max(dim=1) + + # TODO compute targets for minimizing Bellman error + # HINT: as you saw in lecture, this would be: + #currentReward + self.gamma * qValuesOfNextTimestep * (not terminal) + target = TODO + target = target.detach() + + assert q_t_values.shape == target.shape + loss = self.loss(q_t_values, target) + + self.optimizer.zero_grad() + loss.backward() + utils.clip_grad_value_(self.q_net.parameters(), self.grad_norm_clipping) + self.optimizer.step() + + return { + 'Training Loss': ptu.to_numpy(loss), + } + + def update_target_network(self): + for target_param, param in zip( + self.q_net_target.parameters(), self.q_net.parameters() + ): + target_param.data.copy_(param.data) + + def qa_values(self, obs): + obs = ptu.from_numpy(obs) + qa_values = self.q_net(obs) + return ptu.to_numpy(qa_values) diff --git a/hw3/cs285/envs/__init__.py b/hw3/cs285/envs/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hw3/cs285/envs/box2d/__init__.py b/hw3/cs285/envs/box2d/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hw3/cs285/envs/box2d/lunar_lander.py b/hw3/cs285/envs/box2d/lunar_lander.py new file mode 100644 index 00000000..0ef4ab3b --- /dev/null +++ b/hw3/cs285/envs/box2d/lunar_lander.py @@ -0,0 +1,468 @@ +import sys, math +import numpy as np + +import Box2D +from Box2D.b2 import (edgeShape, circleShape, fixtureDef, polygonShape, revoluteJointDef, contactListener) + +import gym +from gym import spaces +from gym.utils import seeding + +import pyglet + +from copy import copy + +# Rocket trajectory optimization is a classic topic in Optimal Control. 
+# +# According to Pontryagin's maximum principle it's optimal to fire engine full throttle or +# turn it off. That's the reason this environment is OK to have discreet actions (engine on or off). +# +# Landing pad is always at coordinates (0,0). Coordinates are the first two numbers in state vector. +# Reward for moving from the top of the screen to landing pad and zero speed is about 100..140 points. +# If lander moves away from landing pad it loses reward back. Episode finishes if the lander crashes or +# comes to rest, receiving additional -100 or +100 points. Each leg ground contact is +10. Firing main +# engine is -0.3 points each frame. Solved is 200 points. +# +# Landing outside landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land +# on its first attempt. Please see source code for details. +# +# Too see heuristic landing, run: +# +# python gym/envs/box2d/lunar_lander.py +# +# To play yourself, run: +# +# python examples/agents/keyboard_agent.py LunarLander-v0 +# +# Created by Oleg Klimov. Licensed on the same terms as the rest of OpenAI Gym. + +# Modified by Sid Reddy (sgr@berkeley.edu) on 8/14/18 +# +# Changelog: +# - different discretization scheme for actions +# - different terminal rewards +# - different observations +# - randomized landing site +# +# A good agent should be able to achieve >150 reward. + +MAX_NUM_STEPS = 1000 + +N_OBS_DIM = 9 +N_ACT_DIM = 6 # num discrete actions + +FPS = 50 +SCALE = 30.0 # affects how fast-paced the game is, forces should be adjusted as well + +MAIN_ENGINE_POWER = 13.0 +SIDE_ENGINE_POWER = 0.6 + +INITIAL_RANDOM = 1000.0 # Set 1500 to make game harder + +LANDER_POLY =[ + (-14,+17), (-17,0), (-17,-10), + (+17,-10), (+17,0), (+14,+17) + ] +LEG_AWAY = 20 +LEG_DOWN = 18 +LEG_W, LEG_H = 2, 8 +LEG_SPRING_TORQUE = 40 # 40 is too difficult for human players, 400 a bit easier + +SIDE_ENGINE_HEIGHT = 14.0 +SIDE_ENGINE_AWAY = 12.0 + +VIEWPORT_W = 600 +VIEWPORT_H = 400 + +THROTTLE_MAG = 0.75 # discretized 'on' value for thrusters +NOOP = 1 # don't fire main engine, don't steer +def disc_to_cont(action): # discrete action -> continuous action + if type(action) == np.ndarray and action.size > 1: + return action + # main engine + if action < 3: + m = -THROTTLE_MAG + elif action < 6: + m = THROTTLE_MAG + else: + raise ValueError + # steering + if action % 3 == 0: + s = -THROTTLE_MAG + elif action % 3 == 1: + s = 0 + else: + s = THROTTLE_MAG + return np.array([m, s]) + +class ContactDetector(contactListener): + def __init__(self, env): + contactListener.__init__(self) + self.env = env + def BeginContact(self, contact): + if self.env.lander==contact.fixtureA.body or self.env.lander==contact.fixtureB.body: + self.env.game_over = True + for i in range(2): + if self.env.legs[i] in [contact.fixtureA.body, contact.fixtureB.body]: + self.env.legs[i].ground_contact = True + def EndContact(self, contact): + for i in range(2): + if self.env.legs[i] in [contact.fixtureA.body, contact.fixtureB.body]: + self.env.legs[i].ground_contact = False + +class LunarLander(gym.Env): + metadata = { + 'render.modes': ['human', 'rgb_array'], + 'video.frames_per_second' : FPS + } + + continuous = False + + def __init__(self): + self._seed() + self.viewer = None + + self.world = Box2D.b2World() + self.moon = None + self.lander = None + self.particles = [] + + self.prev_reward = None + + high = np.array([np.inf]*N_OBS_DIM) # useful range is -1 .. 
+1, but spikes can be higher + self.observation_space = spaces.Box(-high, high) + + self.action_space = spaces.Discrete(N_ACT_DIM) + + self.curr_step = None + + self._reset() + + def _seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def _destroy(self): + if not self.moon: return + self.world.contactListener = None + self._clean_particles(True) + self.world.DestroyBody(self.moon) + self.moon = None + self.world.DestroyBody(self.lander) + self.lander = None + self.world.DestroyBody(self.legs[0]) + self.world.DestroyBody(self.legs[1]) + + def _reset(self): + self.curr_step = 0 + + self._destroy() + self.world.contactListener_keepref = ContactDetector(self) + self.world.contactListener = self.world.contactListener_keepref + self.game_over = False + self.prev_shaping = None + + W = VIEWPORT_W/SCALE + H = VIEWPORT_H/SCALE + + # terrain + CHUNKS = 11 + height = self.np_random.uniform(0, H/2, size=(CHUNKS+1,) ) + chunk_x = [W/(CHUNKS-1)*i for i in range(CHUNKS)] + + # randomize helipad x-coord + helipad_chunk = np.random.choice(range(1, CHUNKS-1)) + + self.helipad_x1 = chunk_x[helipad_chunk-1] + self.helipad_x2 = chunk_x[helipad_chunk+1] + self.helipad_y = H/4 + height[helipad_chunk-2] = self.helipad_y + height[helipad_chunk-1] = self.helipad_y + height[helipad_chunk+0] = self.helipad_y + height[helipad_chunk+1] = self.helipad_y + height[helipad_chunk+2] = self.helipad_y + smooth_y = [0.33*(height[i-1] + height[i+0] + height[i+1]) for i in range(CHUNKS)] + + self.moon = self.world.CreateStaticBody( shapes=edgeShape(vertices=[(0, 0), (W, 0)]) ) + self.sky_polys = [] + for i in range(CHUNKS-1): + p1 = (chunk_x[i], smooth_y[i]) + p2 = (chunk_x[i+1], smooth_y[i+1]) + self.moon.CreateEdgeFixture( + vertices=[p1,p2], + density=0, + friction=0.1) + self.sky_polys.append( [p1, p2, (p2[0],H), (p1[0],H)] ) + + self.moon.color1 = (0.0,0.0,0.0) + self.moon.color2 = (0.0,0.0,0.0) + + initial_y = VIEWPORT_H/SCALE#*0.75 + self.lander = self.world.CreateDynamicBody( + position = (VIEWPORT_W/SCALE/2, initial_y), + angle=0.0, + fixtures = fixtureDef( + shape=polygonShape(vertices=[ (x/SCALE,y/SCALE) for x,y in LANDER_POLY ]), + density=5.0, + friction=0.1, + categoryBits=0x0010, + maskBits=0x001, # collide only with ground + restitution=0.0) # 0.99 bouncy + ) + self.lander.color1 = (0.5,0.4,0.9) + self.lander.color2 = (0.3,0.3,0.5) + self.lander.ApplyForceToCenter( ( + self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM), + self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM) + ), True) + + self.legs = [] + for i in [-1,+1]: + leg = self.world.CreateDynamicBody( + position = (VIEWPORT_W/SCALE/2 - i*LEG_AWAY/SCALE, initial_y), + angle = (i*0.05), + fixtures = fixtureDef( + shape=polygonShape(box=(LEG_W/SCALE, LEG_H/SCALE)), + density=1.0, + restitution=0.0, + categoryBits=0x0020, + maskBits=0x001) + ) + leg.ground_contact = False + leg.color1 = (0.5,0.4,0.9) + leg.color2 = (0.3,0.3,0.5) + rjd = revoluteJointDef( + bodyA=self.lander, + bodyB=leg, + localAnchorA=(0, 0), + localAnchorB=(i*LEG_AWAY/SCALE, LEG_DOWN/SCALE), + enableMotor=True, + enableLimit=True, + maxMotorTorque=LEG_SPRING_TORQUE, + motorSpeed=+0.3*i # low enough not to jump back into the sky + ) + if i==-1: + rjd.lowerAngle = +0.9 - 0.5 # Yes, the most esoteric numbers here, angles legs have freedom to travel within + rjd.upperAngle = +0.9 + else: + rjd.lowerAngle = -0.9 + rjd.upperAngle = -0.9 + 0.5 + leg.joint = self.world.CreateJoint(rjd) + self.legs.append(leg) + + self.drawlist = [self.lander] + 
self.legs + + return self._step(NOOP)[0] + + def _create_particle(self, mass, x, y, ttl): + p = self.world.CreateDynamicBody( + position = (x,y), + angle=0.0, + fixtures = fixtureDef( + shape=circleShape(radius=2/SCALE, pos=(0,0)), + density=mass, + friction=0.1, + categoryBits=0x0100, + maskBits=0x001, # collide only with ground + restitution=0.3) + ) + p.ttl = ttl + self.particles.append(p) + self._clean_particles(False) + return p + + def _clean_particles(self, all): + while self.particles and (all or self.particles[0].ttl<0): + self.world.DestroyBody(self.particles.pop(0)) + + def _step(self, action): + assert self.action_space.contains(action), "%r (%s) invalid " % (action,type(action)) + action = disc_to_cont(action) + + # Engines + tip = (math.sin(self.lander.angle), math.cos(self.lander.angle)) + side = (-tip[1], tip[0]); + dispersion = [self.np_random.uniform(-1.0, +1.0) / SCALE for _ in range(2)] + + m_power = 0.0 + if action[0] > 0.0: + # Main engine + m_power = (np.clip(action[0], 0.0,1.0) + 1.0)*0.5 # 0.5..1.0 + assert m_power>=0.5 and m_power <= 1.0 + ox = tip[0]*(4/SCALE + 2*dispersion[0]) + side[0]*dispersion[1] # 4 is move a bit downwards, +-2 for randomness + oy = -tip[1]*(4/SCALE + 2*dispersion[0]) - side[1]*dispersion[1] + impulse_pos = (self.lander.position[0] + ox, self.lander.position[1] + oy) + p = self._create_particle(3.5, impulse_pos[0], impulse_pos[1], m_power) # particles are just a decoration, 3.5 is here to make particle speed adequate + p.ApplyLinearImpulse( ( ox*MAIN_ENGINE_POWER*m_power, oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True) + self.lander.ApplyLinearImpulse( (-ox*MAIN_ENGINE_POWER*m_power, -oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True) + + s_power = 0.0 + if np.abs(action[1]) > 0.5: + # Orientation engines + direction = np.sign(action[1]) + s_power = np.clip(np.abs(action[1]), 0.5,1.0) + assert s_power>=0.5 and s_power <= 1.0 + ox = tip[0]*dispersion[0] + side[0]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE) + oy = -tip[1]*dispersion[0] - side[1]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE) + impulse_pos = (self.lander.position[0] + ox - tip[0]*17/SCALE, self.lander.position[1] + oy + tip[1]*SIDE_ENGINE_HEIGHT/SCALE) + p = self._create_particle(0.7, impulse_pos[0], impulse_pos[1], s_power) + p.ApplyLinearImpulse( ( ox*SIDE_ENGINE_POWER*s_power, oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True) + self.lander.ApplyLinearImpulse( (-ox*SIDE_ENGINE_POWER*s_power, -oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True) + + # perform normal update + self.world.Step(1.0/FPS, 6*30, 2*30) + + pos = self.lander.position + vel = self.lander.linearVelocity + helipad_x = (self.helipad_x1 + self.helipad_x2) / 2 + state = [ + (pos.x - VIEWPORT_W/SCALE/2) / (VIEWPORT_W/SCALE/2), + (pos.y - (self.helipad_y+LEG_DOWN/SCALE)) / (VIEWPORT_W/SCALE/2), + vel.x*(VIEWPORT_W/SCALE/2)/FPS, + vel.y*(VIEWPORT_H/SCALE/2)/FPS, + self.lander.angle, + 20.0*self.lander.angularVelocity/FPS, + 1.0 if self.legs[0].ground_contact else 0.0, + 1.0 if self.legs[1].ground_contact else 0.0, + (helipad_x - VIEWPORT_W/SCALE/2) / (VIEWPORT_W/SCALE/2) + ] + assert len(state)==N_OBS_DIM + + self.curr_step += 1 + + reward = 0 + shaping = 0 + dx = (pos.x - helipad_x) / (VIEWPORT_W/SCALE/2) + shaping += -100*np.sqrt(state[2]*state[2] + state[3]*state[3]) - 100*abs(state[4]) + shaping += -100*np.sqrt(dx*dx + state[1]*state[1]) + 10*state[6] + 10*state[7] + if self.prev_shaping is not None: + reward = shaping - self.prev_shaping + self.prev_shaping = shaping + + reward -= m_power*0.30 # 
less fuel spent is better, about -30 for heurisic landing + reward -= s_power*0.03 + + oob = abs(state[0]) >= 1.0 + timeout = self.curr_step >= MAX_NUM_STEPS + not_awake = not self.lander.awake + + at_site = pos.x >= self.helipad_x1 and pos.x <= self.helipad_x2 and state[1] <= 0 + grounded = self.legs[0].ground_contact and self.legs[1].ground_contact + landed = at_site and grounded + + done = self.game_over or oob or not_awake or timeout or landed + if done: + if self.game_over or oob: + reward = -100 + self.lander.color1 = (255,0,0) + elif at_site: + reward = +100 + self.lander.color1 = (0,255,0) + elif timeout: + self.lander.color1 = (255,0,0) + info = {} + + return np.array(state), reward, done, info + + def _render(self, mode='rgb_array', close=False): + if close: + if self.viewer is not None: + self.viewer.close() + self.viewer = None + return + + try: + from gym.envs.classic_control import rendering + except: + print('[ cs285/envs/box2d/lunar_lander ] No display found; rendering is disabled') + return np.zeros((10,10, 3)).astype(np.uint) + + if self.viewer is None: + self.viewer = rendering.Viewer(VIEWPORT_W, VIEWPORT_H) + self.viewer.set_bounds(0, VIEWPORT_W/SCALE, 0, VIEWPORT_H/SCALE) + + for obj in self.particles: + obj.ttl -= 0.15 + obj.color1 = (max(0.2,0.2+obj.ttl), max(0.2,0.5*obj.ttl), max(0.2,0.5*obj.ttl)) + obj.color2 = (max(0.2,0.2+obj.ttl), max(0.2,0.5*obj.ttl), max(0.2,0.5*obj.ttl)) + + self._clean_particles(False) + + for p in self.sky_polys: + self.viewer.draw_polygon(p, color=(0,0,0)) + + for obj in self.particles + self.drawlist: + for f in obj.fixtures: + trans = f.body.transform + if type(f.shape) is circleShape: + t = rendering.Transform(translation=trans*f.shape.pos) + self.viewer.draw_circle(f.shape.radius, 20, color=obj.color1).add_attr(t) + self.viewer.draw_circle(f.shape.radius, 20, color=obj.color2, filled=False, linewidth=2).add_attr(t) + else: + path = [trans*v for v in f.shape.vertices] + self.viewer.draw_polygon(path, color=obj.color1) + path.append(path[0]) + self.viewer.draw_polyline(path, color=obj.color2, linewidth=2) + + for x in [self.helipad_x1, self.helipad_x2]: + flagy1 = self.helipad_y + flagy2 = flagy1 + 50/SCALE + self.viewer.draw_polyline( [(x, flagy1), (x, flagy2)], color=(1,1,1) ) + self.viewer.draw_polygon( [(x, flagy2), (x, flagy2-10/SCALE), (x+25/SCALE, flagy2-5/SCALE)], color=(0.8,0.8,0) ) + + clock_prog = self.curr_step / MAX_NUM_STEPS + self.viewer.draw_polyline( [(0, 0.05*VIEWPORT_H/SCALE), (clock_prog*VIEWPORT_W/SCALE, 0.05*VIEWPORT_H/SCALE)], color=(255,0,0), linewidth=5 ) + + return self.viewer.render(return_rgb_array = mode=='rgb_array') + + def reset(self): + return self._reset() + + def step(self, *args, **kwargs): + return self._step(*args, **kwargs) + + +class LunarLanderContinuous(LunarLander): + continuous = True + +def heuristic(env, s): + # Heuristic for: + # 1. Testing. + # 2. Demonstration rollout. 
+ angle_targ = s[0]*0.5 + s[2]*1.0 # angle should point towards center (s[0] is horizontal coordinate, s[2] hor speed) + if angle_targ > 0.4: angle_targ = 0.4 # more than 0.4 radians (22 degrees) is bad + if angle_targ < -0.4: angle_targ = -0.4 + hover_targ = 0.55*np.abs(s[0]) # target y should be proporional to horizontal offset + + # PID controller: s[4] angle, s[5] angularSpeed + angle_todo = (angle_targ - s[4])*0.5 - (s[5])*1.0 + #print("angle_targ=%0.2f, angle_todo=%0.2f" % (angle_targ, angle_todo)) + + # PID controller: s[1] vertical coordinate s[3] vertical speed + hover_todo = (hover_targ - s[1])*0.5 - (s[3])*0.5 + #print("hover_targ=%0.2f, hover_todo=%0.2f" % (hover_targ, hover_todo)) + + if s[6] or s[7]: # legs have contact + angle_todo = 0 + hover_todo = -(s[3])*0.5 # override to reduce fall speed, that's all we need after contact + + a = np.array( [hover_todo*20 - 1, -angle_todo*20] ) + a = np.clip(a, -1, +1) + return a + +if __name__=="__main__": + #env = LunarLander() + env = LunarLanderContinuous() + s = env.reset() + total_reward = 0 + steps = 0 + while True: + a = heuristic(env, s) + s, r, done, info = env.step(a) + env.render() + total_reward += r + if steps % 20 == 0 or done: + print(["{:+0.2f}".format(x) for x in s]) + print("step {} total_reward {:+0.2f}".format(steps, total_reward)) + steps += 1 + if done: break diff --git a/hw3/cs285/infrastructure/atari_wrappers.py b/hw3/cs285/infrastructure/atari_wrappers.py new file mode 100644 index 00000000..d8bb34f6 --- /dev/null +++ b/hw3/cs285/infrastructure/atari_wrappers.py @@ -0,0 +1,176 @@ +import numpy as np +import gym +from gym import spaces + + +class NoopResetEnv(gym.Wrapper): + def __init__(self, env, noop_max=30): + """Sample initial states by taking random number of no-ops on reset. + No-op is assumed to be action 0. + """ + gym.Wrapper.__init__(self, env) + self.noop_max = noop_max + self.override_num_noops = None + self.noop_action = 0 + assert env.unwrapped.get_action_meanings()[0] == 'NOOP' + + def reset(self, **kwargs): + """ Do no-op action for a number of steps in [1, noop_max].""" + self.env.reset(**kwargs) + if self.override_num_noops is not None: + noops = self.override_num_noops + else: + noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 + assert noops > 0 + obs = None + for _ in range(noops): + obs, _, done, _ = self.env.step(self.noop_action) + if done: + obs = self.env.reset(**kwargs) + return obs + + def step(self, ac): + return self.env.step(ac) + + +class FireResetEnv(gym.Wrapper): + def __init__(self, env): + """Take action on reset for environments that are fixed until firing.""" + gym.Wrapper.__init__(self, env) + assert env.unwrapped.get_action_meanings()[1] == 'FIRE' + assert len(env.unwrapped.get_action_meanings()) >= 3 + + def reset(self, **kwargs): + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(1) + if done: + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(2) + if done: + self.env.reset(**kwargs) + return obs + + def step(self, ac): + return self.env.step(ac) + + +class EpisodicLifeEnv(gym.Wrapper): + def __init__(self, env): + """Make end-of-life == end-of-episode, but only reset on true game over. + Done by DeepMind for the DQN and co. since it helps value estimation. 
+ """ + gym.Wrapper.__init__(self, env) + self.lives = 0 + self.was_real_done = True + + def step(self, action): + obs, reward, done, info = self.env.step(action) + self.was_real_done = done + # check current lives, make loss of life terminal, + # then update lives to handle bonus lives + lives = self.env.unwrapped.ale.lives() + if lives < self.lives and lives > 0: + # for Qbert sometimes we stay in lives == 0 condition for a few frames + # so it's important to keep lives > 0, so that we only reset once + # the environment advertises done. + done = True + self.lives = lives + return obs, reward, done, info + + def reset(self, **kwargs): + """Reset only when lives are exhausted. + This way all states are still reachable even though lives are episodic, + and the learner need not know about any of this behind-the-scenes. + """ + if self.was_real_done: + obs = self.env.reset(**kwargs) + else: + # no-op step to advance from terminal/lost life state + obs, _, _, _ = self.env.step(0) + self.lives = self.env.unwrapped.ale.lives() + return obs + + +class MaxAndSkipEnv(gym.Wrapper): + def __init__(self, env, skip=4): + """Return only every `skip`-th frame""" + gym.Wrapper.__init__(self, env) + # most recent raw observations (for max pooling across time steps) + self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) + self._skip = skip + + def step(self, action): + """Repeat action, sum reward, and max over last observations.""" + total_reward = 0.0 + done = None + for i in range(self._skip): + obs, reward, done, info = self.env.step(action) + if i == self._skip - 2: self._obs_buffer[0] = obs + if i == self._skip - 1: self._obs_buffer[1] = obs + total_reward += reward + if done: + break + # Note that the observation on the done=True frame + # doesn't matter + max_frame = self._obs_buffer.max(axis=0) + + return max_frame, total_reward, done, info + + def reset(self, **kwargs): + return self.env.reset(**kwargs) + + +def _process_frame84(frame): + import cv2 + img = np.reshape(frame, [210, 160, 3]).astype(np.float32) + img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 + resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_LINEAR) + x_t = resized_screen[18:102, :] + x_t = np.reshape(x_t, [84, 84, 1]) + return x_t.astype(np.uint8) + + +class ProcessFrame84(gym.Wrapper): + def __init__(self, env=None): + super(ProcessFrame84, self).__init__(env) + self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) + + def step(self, action): + obs, reward, done, info = self.env.step(action) + return _process_frame84(obs), reward, done, info + + def reset(self): + return _process_frame84(self.env.reset()) + + +class ClipRewardEnv(gym.RewardWrapper): + def __init__(self, env): + gym.RewardWrapper.__init__(self, env) + + def reward(self, reward): + """Bin reward to {+1, 0, -1} by its sign.""" + return np.sign(reward) + + +def wrap_deepmind_ram(env): + env = EpisodicLifeEnv(env) + env = NoopResetEnv(env, noop_max=30) + env = MaxAndSkipEnv(env, skip=4) + if 'FIRE' in env.unwrapped.get_action_meanings(): + env = FireResetEnv(env) + env = ClipRewardEnv(env) + return env + + +def wrap_deepmind(env): + """Configure environment for DeepMind-style Atari. 
+ """ + # assert 'NoFrameskip' in env.spec.id + env = EpisodicLifeEnv(env) + env = NoopResetEnv(env, noop_max=30) + env = MaxAndSkipEnv(env, skip=4) + if 'FIRE' in env.unwrapped.get_action_meanings(): + env = FireResetEnv(env) + env = ProcessFrame84(env) + env = ClipRewardEnv(env) + return env diff --git a/hw3/cs285/infrastructure/colab_utils.py b/hw3/cs285/infrastructure/colab_utils.py new file mode 100644 index 00000000..a896be97 --- /dev/null +++ b/hw3/cs285/infrastructure/colab_utils.py @@ -0,0 +1,26 @@ +from gym.wrappers import Monitor +import glob +import io +import base64 +from IPython.display import HTML +from IPython import display as ipythondisplay + +## modified from https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t#scrollTo=TCelFzWY9MBI + +def show_video(): + mp4list = glob.glob('/content/video/*.mp4') + if len(mp4list) > 0: + mp4 = mp4list[0] + video = io.open(mp4, 'r+b').read() + encoded = base64.b64encode(video) + ipythondisplay.display(HTML(data=''''''.format(encoded.decode('ascii')))) + else: + print("Could not find video") + + +def wrap_env(env): + env = Monitor(env, '/content/video', force=True) + return env diff --git a/hw3/cs285/infrastructure/dqn_utils.py b/hw3/cs285/infrastructure/dqn_utils.py new file mode 100644 index 00000000..a8ae1bb1 --- /dev/null +++ b/hw3/cs285/infrastructure/dqn_utils.py @@ -0,0 +1,517 @@ +"""This file includes a collection of utility functions that are useful for +implementing DQN.""" +import random +from collections import namedtuple + +import gym +import numpy as np +from torch import nn +import torch.optim as optim + +from cs285.infrastructure.atari_wrappers import wrap_deepmind +from gym.envs.registration import register + +import torch + + +class Flatten(torch.nn.Module): + def forward(self, x): + batch_size = x.shape[0] + return x.view(batch_size, -1) + +OptimizerSpec = namedtuple( + "OptimizerSpec", + ["constructor", "optim_kwargs", "learning_rate_schedule"], +) + + +def register_custom_envs(): + from gym.envs.registration import registry + if 'LunarLander-v3' not in registry.env_specs: + register( + id='LunarLander-v3', + entry_point='cs285.envs.box2d.lunar_lander:LunarLander', + max_episode_steps=1000, + reward_threshold=200, + ) + + +def get_env_kwargs(env_name): + if env_name in ['MsPacman-v0', 'PongNoFrameskip-v4']: + kwargs = { + 'learning_starts': 50000, + 'target_update_freq': 10000, + 'replay_buffer_size': int(1e6), + 'num_timesteps': int(2e8), + 'q_func': create_atari_q_network, + 'learning_freq': 4, + 'grad_norm_clipping': 10, + 'input_shape': (84, 84, 4), + 'env_wrappers': wrap_deepmind, + 'frame_history_len': 4, + 'gamma': 0.99, + } + kwargs['optimizer_spec'] = atari_optimizer(kwargs['num_timesteps']) + kwargs['exploration_schedule'] = atari_exploration_schedule(kwargs['num_timesteps']) + + elif env_name == 'LunarLander-v3': + def lunar_empty_wrapper(env): + return env + kwargs = { + 'optimizer_spec': lander_optimizer(), + 'q_func': create_lander_q_network, + 'replay_buffer_size': 50000, + 'batch_size': 32, + 'gamma': 1.00, + 'learning_starts': 1000, + 'learning_freq': 1, + 'frame_history_len': 1, + 'target_update_freq': 3000, + 'grad_norm_clipping': 10, + 'lander': True, + 'num_timesteps': 500000, + 'env_wrappers': lunar_empty_wrapper + } + kwargs['exploration_schedule'] = lander_exploration_schedule(kwargs['num_timesteps']) + + else: + raise NotImplementedError + + return kwargs + + +def create_lander_q_network(ob_dim, num_actions): + return nn.Sequential( + nn.Linear(ob_dim, 64), + nn.ReLU(), + 
nn.Linear(64, 64), + nn.ReLU(), + nn.Linear(64, num_actions), + ) + +class Ipdb(nn.Module): + def __init__(self): + super().__init__() + def forward(self, x): + import ipdb; ipdb.set_trace() + return x + + +class PreprocessAtari(nn.Module): + def forward(self, x): + x = x.permute(0, 3, 1, 2).contiguous() + return x / 255. + + +def create_atari_q_network(ob_dim, num_actions): + return nn.Sequential( + PreprocessAtari(), + nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4), + nn.ReLU(), + nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2), + nn.ReLU(), + nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1), + nn.ReLU(), + Flatten(), + nn.Linear(3136, 512), # 3136 hard-coded based on img size + CNN layers + nn.ReLU(), + nn.Linear(512, num_actions), + ) + +def atari_exploration_schedule(num_timesteps): + return PiecewiseSchedule( + [ + (0, 1.0), + (1e6, 0.1), + (num_timesteps / 8, 0.01), + ], outside_value=0.01 + ) + + +def atari_ram_exploration_schedule(num_timesteps): + return PiecewiseSchedule( + [ + (0, 0.2), + (1e6, 0.1), + (num_timesteps / 8, 0.01), + ], outside_value=0.01 + ) + + +def atari_optimizer(num_timesteps): + lr_schedule = PiecewiseSchedule( + [ + (0, 1e-1), + (num_timesteps / 40, 1e-1), + (num_timesteps / 8, 5e-2), + ], + outside_value=5e-2, + ) + + return OptimizerSpec( + constructor=optim.Adam, + optim_kwargs=dict( + lr=1e-3, + eps=1e-4 + ), + learning_rate_schedule=lambda t: lr_schedule.value(t), + ) + + +def lander_optimizer(): + return OptimizerSpec( + constructor=optim.Adam, + optim_kwargs=dict( + lr=1, + ), + learning_rate_schedule=lambda epoch: 1e-3, # keep init learning rate + ) + + +def lander_exploration_schedule(num_timesteps): + return PiecewiseSchedule( + [ + (0, 1), + (num_timesteps * 0.1, 0.02), + ], outside_value=0.02 + ) + + +def sample_n_unique(sampling_f, n): + """Helper function. Given a function `sampling_f` that returns + comparable objects, sample n such unique objects. + """ + res = [] + while len(res) < n: + candidate = sampling_f() + if candidate not in res: + res.append(candidate) + return res + + +class Schedule(object): + def value(self, t): + """Value of the schedule at time t""" + raise NotImplementedError() + + +class ConstantSchedule(object): + def __init__(self, value): + """Value remains constant over time. + Parameters + ---------- + value: float + Constant value of the schedule + """ + self._v = value + + def value(self, t): + """See Schedule.value""" + return self._v + + +def linear_interpolation(l, r, alpha): + return l + alpha * (r - l) + + +class PiecewiseSchedule(object): + def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): + """Piecewise schedule. + endpoints: [(int, int)] + list of pairs `(time, value)` meanining that schedule should output + `value` when `t==time`. All the values for time must be sorted in + an increasing order. When t is between two times, e.g. `(time_a, value_a)` + and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs + `interpolation(value_a, value_b, alpha)` where alpha is a fraction of + time passed between `time_a` and `time_b` for time `t`. + interpolation: lambda float, float, float: float + a function that takes value to the left and to the right of t according + to the `endpoints`. Alpha is the fraction of distance from left endpoint to + right endpoint that t has covered. See linear_interpolation for example. 
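# Illustrative sketch, not part of the patch: one plausible way the helpers in
# this file fit together (the real wiring lives in the DQN agent/critic, which
# is not shown here). Assumes gym with Box2D for LunarLander-v3.
import gym
import torch
import torch.optim as optim
from cs285.infrastructure.dqn_utils import (
    get_env_kwargs, register_custom_envs, create_atari_q_network)

register_custom_envs()
env = gym.make('LunarLander-v3')
kwargs = get_env_kwargs('LunarLander-v3')
q_net = kwargs['q_func'](env.observation_space.shape[0], env.action_space.n)

spec = kwargs['optimizer_spec']                  # OptimizerSpec namedtuple
optimizer = spec.constructor(q_net.parameters(), **spec.optim_kwargs)
lr_sched = optim.lr_scheduler.LambdaLR(optimizer, spec.learning_rate_schedule)

eps = kwargs['exploration_schedule'].value(0)    # epsilon-greedy epsilon at t=0
assert 0.0 <= eps <= 1.0

# Shape check for the Atari network above: the 3136 in its Linear layer is
# 7 * 7 * 64 after the three conv layers applied to an 84x84x4 input.
atari_q = create_atari_q_network(ob_dim=(84, 84, 4), num_actions=6)
assert atari_q(torch.zeros(1, 84, 84, 4)).shape == (1, 6)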
+ outside_value: float + if the value is requested outside of all the intervals sepecified in + `endpoints` this value is returned. If None then AssertionError is + raised when outside value is requested. + """ + idxes = [e[0] for e in endpoints] + assert idxes == sorted(idxes) + self._interpolation = interpolation + self._outside_value = outside_value + self._endpoints = endpoints + + def value(self, t): + """See Schedule.value""" + for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): + if l_t <= t and t < r_t: + alpha = float(t - l_t) / (r_t - l_t) + return self._interpolation(l, r, alpha) + + # t does not belong to any of the pieces, so doom. + assert self._outside_value is not None + return self._outside_value + +class LinearSchedule(object): + def __init__(self, schedule_timesteps, final_p, initial_p=1.0): + """Linear interpolation between initial_p and final_p over + schedule_timesteps. After this many timesteps pass final_p is + returned. + Parameters + ---------- + schedule_timesteps: int + Number of timesteps for which to linearly anneal initial_p + to final_p + initial_p: float + initial output value + final_p: float + final output value + """ + self.schedule_timesteps = schedule_timesteps + self.final_p = final_p + self.initial_p = initial_p + + def value(self, t): + """See Schedule.value""" + fraction = min(float(t) / self.schedule_timesteps, 1.0) + return self.initial_p + fraction * (self.final_p - self.initial_p) + +def compute_exponential_averages(variables, decay): + """Given a list of tensorflow scalar variables + create ops corresponding to their exponential + averages + Parameters + ---------- + variables: [tf.Tensor] + List of scalar tensors. + Returns + ------- + averages: [tf.Tensor] + List of scalar tensors corresponding to averages + of al the `variables` (in order) + apply_op: tf.runnable + Op to be run to update the averages with current value + of variables. + """ + averager = tf.train.ExponentialMovingAverage(decay=decay) + apply_op = averager.apply(variables) + return [averager.average(v) for v in variables], apply_op + +def minimize_and_clip(optimizer, objective, var_list, clip_val=10): + """Minimized `objective` using `optimizer` w.r.t. variables in + `var_list` while ensure the norm of the gradients for each + variable is clipped to `clip_val` + """ + gradients = optimizer.compute_gradients(objective, var_list=var_list) + for i, (grad, var) in enumerate(gradients): + if grad is not None: + gradients[i] = (tf.clip_by_norm(grad, clip_val), var) + return optimizer.apply_gradients(gradients) + +def initialize_interdependent_variables(session, vars_list, feed_dict): + """Initialize a list of variables one at a time, which is useful if + initialization of some variables depends on initialization of the others. + """ + vars_left = vars_list + while len(vars_left) > 0: + new_vars_left = [] + for v in vars_left: + try: + session.run(tf.variables_initializer([v]), feed_dict) + except tf.errors.FailedPreconditionError: + new_vars_left.append(v) + if len(new_vars_left) >= len(vars_left): + # This can happen if the variables all depend on each other, or more likely if there's + # another variable outside of the list, that still needs to be initialized. This could be + # detected here, but life's finite. 
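# Illustrative sketch, not part of the patch: what the two schedules above
# return for a few values of t.
from cs285.infrastructure.dqn_utils import PiecewiseSchedule, LinearSchedule

ps = PiecewiseSchedule([(0, 1.0), (100, 0.1)], outside_value=0.1)
assert ps.value(0) == 1.0                     # left endpoint
assert abs(ps.value(50) - 0.55) < 1e-9        # halfway: 1.0 + 0.5 * (0.1 - 1.0)
assert ps.value(10**6) == 0.1                 # beyond the endpoints -> outside_value

ls = LinearSchedule(schedule_timesteps=100, final_p=0.1, initial_p=1.0)
assert abs(ls.value(50) - 0.55) < 1e-9        # same midpoint by construction
assert abs(ls.value(10**6) - 0.1) < 1e-9      # clamped at final_p afterwards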
+ raise Exception("Cycle in variable dependencies, or extenrnal precondition unsatisfied.") + else: + vars_left = new_vars_left + +def get_wrapper_by_name(env, classname): + currentenv = env + while True: + if classname in currentenv.__class__.__name__: + return currentenv + elif isinstance(env, gym.Wrapper): + currentenv = currentenv.env + else: + raise ValueError("Couldn't find wrapper named %s"%classname) + +class MemoryOptimizedReplayBuffer(object): + def __init__(self, size, frame_history_len, lander=False): + """This is a memory efficient implementation of the replay buffer. + + The sepecific memory optimizations use here are: + - only store each frame once rather than k times + even if every observation normally consists of k last frames + - store frames as np.uint8 (actually it is most time-performance + to cast them back to float32 on GPU to minimize memory transfer + time) + - store frame_t and frame_(t+1) in the same buffer. + + For the tipical use case in Atari Deep RL buffer with 1M frames the total + memory footprint of this buffer is 10^6 * 84 * 84 bytes ~= 7 gigabytes + + Warning! Assumes that returning frame of zeros at the beginning + of the episode, when there is less frames than `frame_history_len`, + is acceptable. + + Parameters + ---------- + size: int + Max number of transitions to store in the buffer. When the buffer + overflows the old memories are dropped. + frame_history_len: int + Number of memories to be retried for each observation. + """ + self.lander = lander + + self.size = size + self.frame_history_len = frame_history_len + + self.next_idx = 0 + self.num_in_buffer = 0 + + self.obs = None + self.action = None + self.reward = None + self.done = None + + def can_sample(self, batch_size): + """Returns true if `batch_size` different transitions can be sampled from the buffer.""" + return batch_size + 1 <= self.num_in_buffer + + def _encode_sample(self, idxes): + obs_batch = np.concatenate([self._encode_observation(idx)[None] for idx in idxes], 0) + act_batch = self.action[idxes] + rew_batch = self.reward[idxes] + next_obs_batch = np.concatenate([self._encode_observation(idx + 1)[None] for idx in idxes], 0) + done_mask = np.array([1.0 if self.done[idx] else 0.0 for idx in idxes], dtype=np.float32) + + return obs_batch, act_batch, rew_batch, next_obs_batch, done_mask + + + def sample(self, batch_size): + """Sample `batch_size` different transitions. + + i-th sample transition is the following: + + when observing `obs_batch[i]`, action `act_batch[i]` was taken, + after which reward `rew_batch[i]` was received and subsequent + observation next_obs_batch[i] was observed, unless the epsiode + was done which is represented by `done_mask[i]` which is equal + to 1 if episode has ended as a result of that action. + + Parameters + ---------- + batch_size: int + How many transitions to sample. 
+
+        Returns
+        -------
+        obs_batch: np.array
+            Array of shape
+            (batch_size, img_h, img_w, img_c * frame_history_len)
+            and dtype np.uint8
+        act_batch: np.array
+            Array of shape (batch_size,) and dtype np.int32
+        rew_batch: np.array
+            Array of shape (batch_size,) and dtype np.float32
+        next_obs_batch: np.array
+            Array of shape
+            (batch_size, img_h, img_w, img_c * frame_history_len)
+            and dtype np.uint8
+        done_mask: np.array
+            Array of shape (batch_size,) and dtype np.float32
+        """
+        assert self.can_sample(batch_size)
+        idxes = sample_n_unique(lambda: random.randint(0, self.num_in_buffer - 2), batch_size)
+        return self._encode_sample(idxes)
+
+    def encode_recent_observation(self):
+        """Return the most recent `frame_history_len` frames.
+
+        Returns
+        -------
+        observation: np.array
+            Array of shape (img_h, img_w, img_c * frame_history_len)
+            and dtype np.uint8, where observation[:, :, i*img_c:(i+1)*img_c]
+            encodes frame at time `t - frame_history_len + i`
+        """
+        assert self.num_in_buffer > 0
+        return self._encode_observation((self.next_idx - 1) % self.size)
+
+    def _encode_observation(self, idx):
+        end_idx = idx + 1  # make noninclusive
+        start_idx = end_idx - self.frame_history_len
+        # this checks if we are using low-dimensional observations, such as RAM
+        # state, in which case we just directly return the latest RAM.
+        if len(self.obs.shape) == 2:
+            return self.obs[end_idx-1]
+        # if there weren't enough frames ever in the buffer for context
+        if start_idx < 0 and self.num_in_buffer != self.size:
+            start_idx = 0
+        for idx in range(start_idx, end_idx - 1):
+            if self.done[idx % self.size]:
+                start_idx = idx + 1
+        missing_context = self.frame_history_len - (end_idx - start_idx)
+        # if zero padding is needed for missing context
+        # or we are on the boundary of the buffer
+        if start_idx < 0 or missing_context > 0:
+            frames = [np.zeros_like(self.obs[0]) for _ in range(missing_context)]
+            for idx in range(start_idx, end_idx):
+                frames.append(self.obs[idx % self.size])
+            return np.concatenate(frames, 2)
+        else:
+            # this optimization has the potential to save about 30% compute time \o/
+            img_h, img_w = self.obs.shape[1], self.obs.shape[2]
+            return self.obs[start_idx:end_idx].transpose(1, 2, 0, 3).reshape(img_h, img_w, -1)
+
+    def store_frame(self, frame):
+        """Store a single frame in the buffer at the next available index, overwriting
+        old frames if necessary.
+
+        Parameters
+        ----------
+        frame: np.array
+            Array of shape (img_h, img_w, img_c) and dtype np.uint8
+            the frame to be stored
+
+        Returns
+        -------
+        idx: int
+            Index at which the frame is stored. To be used for `store_effect` later.
+        """
+        if self.obs is None:
+            self.obs = np.empty([self.size] + list(frame.shape), dtype=np.float32 if self.lander else np.uint8)
+            self.action = np.empty([self.size], dtype=np.int32)
+            self.reward = np.empty([self.size], dtype=np.float32)
+            self.done = np.empty([self.size], dtype=np.bool_)  # np.bool_ rather than the deprecated np.bool alias
+        self.obs[self.next_idx] = frame
+
+        ret = self.next_idx
+        self.next_idx = (self.next_idx + 1) % self.size
+        self.num_in_buffer = min(self.size, self.num_in_buffer + 1)
+
+        return ret
+
+    def store_effect(self, idx, action, reward, done):
+        """Store the effects of the action taken after observing the frame stored
+        at index idx. The reason `store_frame` and `store_effect` are broken
+        up into two functions is so that one can call `encode_recent_observation`
+        in between.
+
+        Parameters
+        ----------
+        idx: int
+            Index in buffer of recently observed frame (returned by `store_frame`).
+        action: int
+            Action that was performed upon observing this frame.
+        reward: float
+            Reward that was received when the action was performed.
+        done: bool
+            True if the episode was finished after performing that action.
+        """
+        self.action[idx] = action
+        self.reward[idx] = reward
+        self.done[idx] = done
+
diff --git a/hw3/cs285/infrastructure/logger.py b/hw3/cs285/infrastructure/logger.py
new file mode 100644
index 00000000..a64931c0
--- /dev/null
+++ b/hw3/cs285/infrastructure/logger.py
@@ -0,0 +1,74 @@
+import os
+from tensorboardX import SummaryWriter
+import numpy as np
+
+class Logger:
+    def __init__(self, log_dir, n_logged_samples=10, summary_writer=None):
+        self._log_dir = log_dir
+        print('########################')
+        print('logging outputs to ', log_dir)
+        print('########################')
+        self._n_logged_samples = n_logged_samples
+        self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1)
+
+    def log_scalar(self, scalar, name, step_):
+        self._summ_writer.add_scalar('{}'.format(name), scalar, step_)
+
+    def log_scalars(self, scalar_dict, group_name, step, phase):
+        """Will log all scalars in the same plot."""
+        self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step)
+
+    def log_image(self, image, name, step):
+        assert(len(image.shape) == 3)  # [C, H, W]
+        self._summ_writer.add_image('{}'.format(name), image, step)
+
+    def log_video(self, video_frames, name, step, fps=10):
+        assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!"
+        self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps)
+
+    def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'):
+
+        # reshape the rollouts
+        videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths]
+
+        # max rollout length
+        max_videos_to_save = np.min([max_videos_to_save, len(videos)])
+        max_length = videos[0].shape[0]
+        for i in range(max_videos_to_save):
+            if videos[i].shape[0]>max_length:
+                max_length = videos[i].shape[0]
+
+        # pad rollouts to all be same length
+        for i in range(max_videos_to_save):
+            if videos[i].shape[0]<max_length:
+                padding = np.tile([videos[i][-1]], (max_length-videos[i].shape[0],1,1,1))
+                videos[i] = np.concatenate([videos[i], padding], 0)
+
+        # log videos to tensorboard event file
+        videos = np.stack(videos[:max_videos_to_save], 0)
+        self.log_video(videos, video_title, step, fps=fps)
+
+    def log_figures(self, figure, name, step, phase):
+        """figure: matplotlib.pyplot figure handle"""
+        assert figure.shape[0] > 0, "Figure logging requires input shape [batch x figures]!"
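# Illustrative usage sketch, not part of the patch: the call order that the
# MemoryOptimizedReplayBuffer docstrings in dqn_utils.py describe
# (store_frame -> encode_recent_observation -> act -> store_effect, then
# sample once can_sample passes). Assumes gym with Box2D and the package
# versions pinned by the homework; the settings mirror the LunarLander
# kwargs earlier in that file.
import gym
from cs285.infrastructure.dqn_utils import (
    MemoryOptimizedReplayBuffer, register_custom_envs)

register_custom_envs()
env = gym.make('LunarLander-v3')
buffer = MemoryOptimizedReplayBuffer(size=50000, frame_history_len=1, lander=True)

obs = env.reset()
for _ in range(1000):
    idx = buffer.store_frame(obs)
    enc_obs = buffer.encode_recent_observation()   # what a Q-network would see
    ac = env.action_space.sample()                 # stand-in for epsilon-greedy
    obs, rew, done, _ = env.step(ac)
    buffer.store_effect(idx, ac, rew, done)
    if done:
        obs = env.reset()

if buffer.can_sample(32):                          # batch size used for the lander
    ob_b, ac_b, rew_b, next_ob_b, done_mask = buffer.sample(32)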
+ self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) + + def log_figure(self, figure, name, step, phase): + """figure: matplotlib.pyplot figure handle""" + self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) + + def log_graph(self, array, name, step, phase): + """figure: matplotlib.pyplot figure handle""" + im = plot_graph(array) + self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) + + def dump_scalars(self, log_path=None): + log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path + self._summ_writer.export_scalars_to_json(log_path) + + def flush(self): + self._summ_writer.flush() + + + + diff --git a/hw3/cs285/infrastructure/pytorch_util.py b/hw3/cs285/infrastructure/pytorch_util.py new file mode 100644 index 00000000..530a6208 --- /dev/null +++ b/hw3/cs285/infrastructure/pytorch_util.py @@ -0,0 +1,79 @@ +from typing import Union + +import torch +from torch import nn + +Activation = Union[str, nn.Module] + + +_str_to_activation = { + 'relu': nn.ReLU(), + 'tanh': nn.Tanh(), + 'leaky_relu': nn.LeakyReLU(), + 'sigmoid': nn.Sigmoid(), + 'selu': nn.SELU(), + 'softplus': nn.Softplus(), + 'identity': nn.Identity(), +} + + +def build_mlp( + input_size: int, + output_size: int, + n_layers: int, + size: int, + activation: Activation = 'tanh', + output_activation: Activation = 'identity', +): + """ + Builds a feedforward neural network + arguments: + input_placeholder: placeholder variable for the state (batch_size, input_size) + scope: variable scope of the network + n_layers: number of hidden layers + size: dimension of each hidden layer + activation: activation of each hidden layer + input_size: size of the input layer + output_size: size of the output layer + output_activation: activation of the output layer + returns: + output_placeholder: the result of a forward pass through the hidden layers + the output layer + """ + if isinstance(activation, str): + activation = _str_to_activation[activation] + if isinstance(output_activation, str): + output_activation = _str_to_activation[output_activation] + layers = [] + in_size = input_size + for _ in range(n_layers): + layers.append(nn.Linear(in_size, size)) + layers.append(activation) + in_size = size + layers.append(nn.Linear(in_size, output_size)) + layers.append(output_activation) + return nn.Sequential(*layers) + + +device = None + + +def init_gpu(use_gpu=True, gpu_id=0): + global device + if torch.cuda.is_available() and use_gpu: + device = torch.device("cuda:" + str(gpu_id)) + print("Using GPU id {}".format(gpu_id)) + else: + device = torch.device("cpu") + print("GPU not detected. 
Defaulting to CPU.") + + +def set_device(gpu_id): + torch.cuda.set_device(gpu_id) + + +def from_numpy(*args, **kwargs): + return torch.from_numpy(*args, **kwargs).float().to(device) + + +def to_numpy(tensor): + return tensor.to('cpu').detach().numpy() diff --git a/hw3/cs285/infrastructure/replay_buffer.py b/hw3/cs285/infrastructure/replay_buffer.py new file mode 100644 index 00000000..df7648d4 --- /dev/null +++ b/hw3/cs285/infrastructure/replay_buffer.py @@ -0,0 +1,82 @@ +from cs285.infrastructure.utils import * + + +class ReplayBuffer(object): + + def __init__(self, max_size=1000000): + + self.max_size = max_size + self.paths = [] + self.obs = None + self.acs = None + self.concatenated_rews = None + self.next_obs = None + self.terminals = None + + def add_rollouts(self, paths, noised=False): + + # add new rollouts into our list of rollouts + for path in paths: + self.paths.append(path) + + # convert new rollouts into their component arrays, and append them onto our arrays + observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(paths) + + if noised: + observations = add_noise(observations) + next_observations = add_noise(next_observations) + + if self.obs is None: + self.obs = observations[-self.max_size:] + self.acs = actions[-self.max_size:] + self.next_obs = next_observations[-self.max_size:] + self.terminals = terminals[-self.max_size:] + self.concatenated_rews = concatenated_rews[-self.max_size:] + else: + self.obs = np.concatenate([self.obs, observations])[-self.max_size:] + self.acs = np.concatenate([self.acs, actions])[-self.max_size:] + self.next_obs = np.concatenate( + [self.next_obs, next_observations] + )[-self.max_size:] + self.terminals = np.concatenate( + [self.terminals, terminals] + )[-self.max_size:] + self.concatenated_rews = np.concatenate( + [self.concatenated_rews, concatenated_rews] + )[-self.max_size:] + + ######################################## + ######################################## + + def sample_random_rollouts(self, num_rollouts): + rand_indices = np.random.permutation(len(self.paths))[:num_rollouts] + return self.paths[rand_indices] + + def sample_recent_rollouts(self, num_rollouts=1): + return self.paths[-num_rollouts:] + + ######################################## + ######################################## + + def sample_random_data(self, batch_size): + + assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0] + rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size] + return self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], self.terminals[rand_indices] + + def sample_recent_data(self, batch_size=1, concat_rew=True): + + if concat_rew: + return self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:] + else: + num_recent_rollouts_to_return = 0 + num_datapoints_so_far = 0 + index = -1 + while num_datapoints_so_far < batch_size: + recent_rollout = self.paths[index] + index -=1 + num_recent_rollouts_to_return +=1 + num_datapoints_so_far += get_pathlength(recent_rollout) + rollouts_to_return = self.paths[-num_recent_rollouts_to_return:] + observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return) + return observations, actions, unconcatenated_rews, next_observations, terminals diff 
--git a/hw3/cs285/infrastructure/rl_trainer.py b/hw3/cs285/infrastructure/rl_trainer.py new file mode 100644 index 00000000..b0fa472e --- /dev/null +++ b/hw3/cs285/infrastructure/rl_trainer.py @@ -0,0 +1,320 @@ +from collections import OrderedDict +import pickle +import os +import sys +import time + +import gym +from gym import wrappers +import numpy as np +import torch +from cs285.infrastructure import pytorch_util as ptu + +from cs285.infrastructure import utils +from cs285.infrastructure.logger import Logger + +from cs285.agents.dqn_agent import DQNAgent +from cs285.infrastructure.dqn_utils import ( + get_wrapper_by_name, + register_custom_envs, +) + +# how many rollouts to save as videos to tensorboard +MAX_NVIDEO = 2 +MAX_VIDEO_LEN = 40 # we overwrite this in the code below + + +class RL_Trainer(object): + + def __init__(self, params): + + ############# + ## INIT + ############# + + # Get params, create logger + self.params = params + self.logger = Logger(self.params['logdir']) + + # Set random seeds + seed = self.params['seed'] + np.random.seed(seed) + torch.manual_seed(seed) + ptu.init_gpu( + use_gpu=not self.params['no_gpu'], + gpu_id=self.params['which_gpu'] + ) + + ############# + ## ENV + ############# + + # Make the gym environment + register_custom_envs() + self.env = gym.make(self.params['env_name']) + if 'env_wrappers' in self.params: + # These operations are currently only for Atari envs + self.env = wrappers.Monitor( + self.env, + os.path.join(self.params['logdir'], "gym"), + force=True, + video_callable=(None if self.params['video_log_freq'] > 0 else False), + ) + self.env = params['env_wrappers'](self.env) + self.mean_episode_reward = -float('nan') + self.best_mean_episode_reward = -float('inf') + if 'non_atari_colab_env' in self.params and self.params['video_log_freq'] > 0: + self.env = wrappers.Monitor( + self.env, + os.path.join(self.params['logdir'], "gym"), + force=True, + video_callable=(None if self.params['video_log_freq'] > 0 else False), + ) + self.mean_episode_reward = -float('nan') + self.best_mean_episode_reward = -float('inf') + + self.env.seed(seed) + + # import plotting (locally if 'obstacles' env) + if not(self.params['env_name']=='obstacles-cs285-v0'): + import matplotlib + matplotlib.use('Agg') + + # Maximum length for episodes + self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps + global MAX_VIDEO_LEN + MAX_VIDEO_LEN = self.params['ep_len'] + + # Is this env continuous, or self.discrete? + discrete = isinstance(self.env.action_space, gym.spaces.Discrete) + # Are the observations images? 
+ img = len(self.env.observation_space.shape) > 2 + + self.params['agent_params']['discrete'] = discrete + + # Observation and action sizes + + ob_dim = self.env.observation_space.shape if img else self.env.observation_space.shape[0] + ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0] + self.params['agent_params']['ac_dim'] = ac_dim + self.params['agent_params']['ob_dim'] = ob_dim + + # simulation timestep, will be used for video saving + if 'model' in dir(self.env): + self.fps = 1/self.env.model.opt.timestep + elif 'env_wrappers' in self.params: + self.fps = 30 # This is not actually used when using the Monitor wrapper + elif 'video.frames_per_second' in self.env.env.metadata.keys(): + self.fps = self.env.env.metadata['video.frames_per_second'] + else: + self.fps = 10 + + + ############# + ## AGENT + ############# + + agent_class = self.params['agent_class'] + self.agent = agent_class(self.env, self.params['agent_params']) + + def run_training_loop(self, n_iter, collect_policy, eval_policy, + initial_expertdata=None, relabel_with_expert=False, + start_relabel_with_expert=1, expert_policy=None): + """ + :param n_iter: number of (dagger) iterations + :param collect_policy: + :param eval_policy: + :param initial_expertdata: + :param relabel_with_expert: whether to perform dagger + :param start_relabel_with_expert: iteration at which to start relabel with expert + :param expert_policy: + """ + + # init vars at beginning of training + self.total_envsteps = 0 + self.start_time = time.time() + + print_period = 1000 if isinstance(self.agent, DQNAgent) else 1 + + for itr in range(n_iter): + if itr % print_period == 0: + print("\n\n********** Iteration %i ************"%itr) + + # decide if videos should be rendered/logged at this iteration + if itr % self.params['video_log_freq'] == 0 and self.params['video_log_freq'] != -1: + self.logvideo = True + else: + self.logvideo = False + + # decide if metrics should be logged + if self.params['scalar_log_freq'] == -1: + self.logmetrics = False + elif itr % self.params['scalar_log_freq'] == 0: + self.logmetrics = True + else: + self.logmetrics = False + + # collect trajectories, to be used for training + if isinstance(self.agent, DQNAgent): + # only perform an env step and add to replay buffer for DQN + self.agent.step_env() + envsteps_this_batch = 1 + train_video_paths = None + paths = None + else: + use_batchsize = self.params['batch_size'] + if itr==0: + use_batchsize = self.params['batch_size_initial'] + paths, envsteps_this_batch, train_video_paths = ( + self.collect_training_trajectories( + itr, initial_expertdata, collect_policy, use_batchsize) + ) + + self.total_envsteps += envsteps_this_batch + + # relabel the collected obs with actions from a provided expert policy + if relabel_with_expert and itr>=start_relabel_with_expert: + paths = self.do_relabel_with_expert(expert_policy, paths) + + # add collected data to replay buffer + self.agent.add_to_replay_buffer(paths) + + # train agent (using sampled data from replay buffer) + if itr % print_period == 0: + print("\nTraining agent...") + all_logs = self.train_agent() + + # log/save + if self.logvideo or self.logmetrics: + # perform logging + print('\nBeginning logging procedure...') + if isinstance(self.agent, DQNAgent): + self.perform_dqn_logging(all_logs) + else: + self.perform_logging(itr, paths, eval_policy, train_video_paths, all_logs) + + if self.params['save_params']: + self.agent.save('{}/agent_itr_{}.pt'.format(self.params['logdir'], itr)) + + 
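# Illustrative sketch, not part of the patch: the params keys that the
# RL_Trainer code above reads. In the assignment these are built from argparse
# by the run_hw3_* scripts (see later in this diff); the values below are
# placeholders only.
example_params = {
    'logdir': 'data/example_run',      # where Logger writes tensorboard events
    'env_name': 'CartPole-v0',
    'ep_len': 200,                     # falls back to env.spec.max_episode_steps if unset
    'seed': 1,
    'no_gpu': True,
    'which_gpu': 0,
    'video_log_freq': -1,              # -1 disables video logging
    'scalar_log_freq': 10,
    'batch_size': 1000,
    'batch_size_initial': 1000,
    'eval_batch_size': 400,
    'save_params': False,
    'agent_class': None,               # e.g. ACAgent or DQNAgent in the run scripts
    'agent_params': {},                # filled in per-agent by the run scripts
    # Atari/Colab-only extras read above: 'env_wrappers', 'non_atari_colab_env'
}
# RL_Trainer(example_params) would then be constructed by a run script once
# agent_class / agent_params are filled in.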
#################################### + #################################### + + def collect_training_trajectories(self, itr, initial_expertdata, collect_policy, num_transitions_to_sample, save_expert_data_to_disk=False): + """ + :param itr: + :param load_initial_expertdata: path to expert data pkl file + :param collect_policy: the current policy using which we collect data + :param num_transitions_to_sample: the number of transitions we collect + :return: + paths: a list trajectories + envsteps_this_batch: the sum over the numbers of environment steps in paths + train_video_paths: paths which also contain videos for visualization purposes + """ + # TODO: get this from Piazza + + return paths, envsteps_this_batch, train_video_paths + + def train_agent(self): + # TODO: get this from Piazza + + #################################### + #################################### + def perform_dqn_logging(self, all_logs): + last_log = all_logs[-1] + + episode_rewards = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards() + if len(episode_rewards) > 0: + self.mean_episode_reward = np.mean(episode_rewards[-100:]) + if len(episode_rewards) > 100: + self.best_mean_episode_reward = max(self.best_mean_episode_reward, self.mean_episode_reward) + + logs = OrderedDict() + + logs["Train_EnvstepsSoFar"] = self.agent.t + print("Timestep %d" % (self.agent.t,)) + if self.mean_episode_reward > -5000: + logs["Train_AverageReturn"] = np.mean(self.mean_episode_reward) + print("mean reward (100 episodes) %f" % self.mean_episode_reward) + if self.best_mean_episode_reward > -5000: + logs["Train_BestReturn"] = np.mean(self.best_mean_episode_reward) + print("best mean reward %f" % self.best_mean_episode_reward) + + if self.start_time is not None: + time_since_start = (time.time() - self.start_time) + print("running time %f" % time_since_start) + logs["TimeSinceStart"] = time_since_start + + logs.update(last_log) + + sys.stdout.flush() + + for key, value in logs.items(): + print('{} : {}'.format(key, value)) + self.logger.log_scalar(value, key, self.agent.t) + print('Done logging...\n\n') + + self.logger.flush() + + def perform_logging(self, itr, paths, eval_policy, train_video_paths, all_logs): + + last_log = all_logs[-1] + + ####################### + + # collect eval trajectories, for logging + print("\nCollecting data for eval...") + eval_paths, eval_envsteps_this_batch = utils.sample_trajectories(self.env, eval_policy, self.params['eval_batch_size'], self.params['ep_len']) + + # save eval rollouts as videos in tensorboard event file + if self.logvideo and train_video_paths != None: + print('\nCollecting video rollouts eval') + eval_video_paths = utils.sample_n_trajectories(self.env, eval_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True) + + #save train/eval videos + print('\nSaving train rollouts as videos...') + self.logger.log_paths_as_videos(train_video_paths, itr, fps=self.fps, max_videos_to_save=MAX_NVIDEO, + video_title='train_rollouts') + self.logger.log_paths_as_videos(eval_video_paths, itr, fps=self.fps,max_videos_to_save=MAX_NVIDEO, + video_title='eval_rollouts') + + ####################### + + # save eval metrics + if self.logmetrics: + # returns, for logging + train_returns = [path["reward"].sum() for path in paths] + eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths] + + # episode lengths, for logging + train_ep_lens = [len(path["reward"]) for path in paths] + eval_ep_lens = [len(eval_path["reward"]) for eval_path in eval_paths] + + # decide what to log + logs = OrderedDict() + 
logs["Eval_AverageReturn"] = np.mean(eval_returns) + logs["Eval_StdReturn"] = np.std(eval_returns) + logs["Eval_MaxReturn"] = np.max(eval_returns) + logs["Eval_MinReturn"] = np.min(eval_returns) + logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens) + + logs["Train_AverageReturn"] = np.mean(train_returns) + logs["Train_StdReturn"] = np.std(train_returns) + logs["Train_MaxReturn"] = np.max(train_returns) + logs["Train_MinReturn"] = np.min(train_returns) + logs["Train_AverageEpLen"] = np.mean(train_ep_lens) + + logs["Train_EnvstepsSoFar"] = self.total_envsteps + logs["TimeSinceStart"] = time.time() - self.start_time + logs.update(last_log) + + if itr == 0: + self.initial_return = np.mean(train_returns) + logs["Initial_DataCollection_AverageReturn"] = self.initial_return + + # perform the logging + for key, value in logs.items(): + print('{} : {}'.format(key, value)) + self.logger.log_scalar(value, key, itr) + print('Done logging...\n\n') + + self.logger.flush() diff --git a/hw3/cs285/infrastructure/utils.py b/hw3/cs285/infrastructure/utils.py new file mode 100644 index 00000000..eabdc393 --- /dev/null +++ b/hw3/cs285/infrastructure/utils.py @@ -0,0 +1,139 @@ +import numpy as np +import time +import copy + +############################################ +############################################ + +def calculate_mean_prediction_error(env, action_sequence, models, data_statistics): + + model = models[0] + + # true + true_states = perform_actions(env, action_sequence)['observation'] + + # predicted + ob = np.expand_dims(true_states[0],0) + pred_states = [] + for ac in action_sequence: + pred_states.append(ob) + action = np.expand_dims(ac,0) + ob = model.get_prediction(ob, action, data_statistics) + pred_states = np.squeeze(pred_states) + + # mpe + mpe = mean_squared_error(pred_states, true_states) + + return mpe, true_states, pred_states + +def perform_actions(env, actions): + ob = env.reset() + obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], [] + steps = 0 + for ac in actions: + obs.append(ob) + acs.append(ac) + ob, rew, done, _ = env.step(ac) + # add the observation after taking a step to next_obs + next_obs.append(ob) + rewards.append(rew) + steps += 1 + # If the episode ended, the corresponding terminal value is 1 + # otherwise, it is 0 + if done: + terminals.append(1) + break + else: + terminals.append(0) + + return Path(obs, image_obs, acs, rewards, next_obs, terminals) + +def mean_squared_error(a, b): + return np.mean((a-b)**2) + +############################################ +############################################ + +def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')): + # TODO: get this from Piazza + +def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')): + """ + Collect rollouts using policy + until we have collected min_timesteps_per_batch steps + """ + # TODO: get this from Piazza + + return paths, timesteps_this_batch + +def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')): + """ + Collect ntraj rollouts using policy + """ + # TODO: get this from Piazza + + return paths + +############################################ +############################################ + +def Path(obs, image_obs, acs, rewards, next_obs, terminals): + """ + Take info (separate arrays) from a single rollout + and return it in a single dictionary + """ + if image_obs != []: + image_obs = np.stack(image_obs, axis=0) + return 
{"observation" : np.array(obs, dtype=np.float32), + "image_obs" : np.array(image_obs, dtype=np.uint8), + "reward" : np.array(rewards, dtype=np.float32), + "action" : np.array(acs, dtype=np.float32), + "next_observation": np.array(next_obs, dtype=np.float32), + "terminal": np.array(terminals, dtype=np.float32)} + + +def convert_listofrollouts(paths): + """ + Take a list of rollout dictionaries + and return separate arrays, + where each array is a concatenation of that array from across the rollouts + """ + observations = np.concatenate([path["observation"] for path in paths]) + actions = np.concatenate([path["action"] for path in paths]) + next_observations = np.concatenate([path["next_observation"] for path in paths]) + terminals = np.concatenate([path["terminal"] for path in paths]) + concatenated_rewards = np.concatenate([path["reward"] for path in paths]) + unconcatenated_rewards = [path["reward"] for path in paths] + return observations, actions, next_observations, terminals, concatenated_rewards, unconcatenated_rewards + +############################################ +############################################ + +def get_pathlength(path): + return len(path["reward"]) + +def normalize(data, mean, std, eps=1e-8): + return (data-mean)/(std+eps) + +def unnormalize(data, mean, std): + return data*std+mean + +def add_noise(data_inp, noiseToSignal=0.01): + + data = copy.deepcopy(data_inp) #(num data points, dim) + + #mean of data + mean_data = np.mean(data, axis=0) + + #if mean is 0, + #make it 0.001 to avoid 0 issues later for dividing by std + mean_data[mean_data == 0] = 0.000001 + + #width of normal distribution to sample noise from + #larger magnitude number = could have larger magnitude noise + std_of_noise = mean_data * noiseToSignal + for j in range(mean_data.shape[0]): + data[:, j] = np.copy(data[:, j] + np.random.normal( + 0, np.absolute(std_of_noise[j]), (data.shape[0],))) + + return data diff --git a/hw3/cs285/policies/MLP_policy.py b/hw3/cs285/policies/MLP_policy.py new file mode 100644 index 00000000..4c2184a7 --- /dev/null +++ b/hw3/cs285/policies/MLP_policy.py @@ -0,0 +1,114 @@ +import abc +import itertools +from torch import nn +from torch.nn import functional as F +from torch import optim + +import numpy as np +import torch +from torch import distributions + +from cs285.infrastructure import pytorch_util as ptu +from cs285.policies.base_policy import BasePolicy + + +class MLPPolicy(BasePolicy, nn.Module, metaclass=abc.ABCMeta): + + def __init__(self, + ac_dim, + ob_dim, + n_layers, + size, + discrete=False, + learning_rate=1e-4, + training=True, + nn_baseline=False, + **kwargs + ): + super().__init__(**kwargs) + + # init vars + self.ac_dim = ac_dim + self.ob_dim = ob_dim + self.n_layers = n_layers + self.discrete = discrete + self.size = size + self.learning_rate = learning_rate + self.training = training + self.nn_baseline = nn_baseline + + if self.discrete: + self.logits_na = ptu.build_mlp(input_size=self.ob_dim, + output_size=self.ac_dim, + n_layers=self.n_layers, + size=self.size) + self.logits_na.to(ptu.device) + self.mean_net = None + self.logstd = None + self.optimizer = optim.Adam(self.logits_na.parameters(), + self.learning_rate) + else: + self.logits_na = None + self.mean_net = ptu.build_mlp(input_size=self.ob_dim, + output_size=self.ac_dim, + n_layers=self.n_layers, size=self.size) + self.logstd = nn.Parameter( + torch.zeros(self.ac_dim, dtype=torch.float32, device=ptu.device) + ) + self.mean_net.to(ptu.device) + self.logstd.to(ptu.device) + self.optimizer = 
optim.Adam( + itertools.chain([self.logstd], self.mean_net.parameters()), + self.learning_rate + ) + + if nn_baseline: + self.baseline = ptu.build_mlp( + input_size=self.ob_dim, + output_size=1, + n_layers=self.n_layers, + size=self.size, + ) + self.baseline.to(ptu.device) + self.baseline_optimizer = optim.Adam( + self.baseline.parameters(), + self.learning_rate, + ) + else: + self.baseline = None + + ################################## + + def save(self, filepath): + torch.save(self.state_dict(), filepath) + + ################################## + + # query the policy with observation(s) to get selected action(s) + def get_action(self, obs: np.ndarray) -> np.ndarray: + # TODO: get this from Piazza + return action + + # update/train this policy + def update(self, observations, actions, **kwargs): + raise NotImplementedError + + # This function defines the forward pass of the network. + # You can return anything you want, but you should be able to differentiate + # through it. For example, you can return a torch.FloatTensor. You can also + # return more flexible objects, such as a + # `torch.distributions.Distribution` object. It's up to you! + def forward(self, observation: torch.FloatTensor): + # TODO: get this from Piazza + return action_distribution + + +##################################################### +##################################################### + + +class MLPPolicyAC(MLPPolicy): + def update(self, observations, actions, adv_n=None): + # TODO: update the policy and return the loss + loss = TODO + return loss.item() diff --git a/hw3/cs285/policies/argmax_policy.py b/hw3/cs285/policies/argmax_policy.py new file mode 100644 index 00000000..a7e443c3 --- /dev/null +++ b/hw3/cs285/policies/argmax_policy.py @@ -0,0 +1,19 @@ +import numpy as np + + +class ArgMaxPolicy(object): + + def __init__(self, critic): + self.critic = critic + + def get_action(self, obs): + if len(obs.shape) > 3: + observation = obs + else: + observation = obs[None] + + ## TODO return the action that maxinmizes the Q-value + # at the current observation as the output + actions = TODO + + return action.squeeze() \ No newline at end of file diff --git a/hw3/cs285/policies/base_policy.py b/hw3/cs285/policies/base_policy.py new file mode 100644 index 00000000..e089540a --- /dev/null +++ b/hw3/cs285/policies/base_policy.py @@ -0,0 +1,14 @@ +import abc +import numpy as np + + +class BasePolicy(object, metaclass=abc.ABCMeta): + def get_action(self, obs: np.ndarray) -> np.ndarray: + raise NotImplementedError + + def update(self, obs: np.ndarray, acs: np.ndarray, **kwargs) -> dict: + """Return a dictionary of logging information.""" + raise NotImplementedError + + def save(self, filepath: str): + raise NotImplementedError diff --git a/hw3/cs285/scripts/read_results.py b/hw3/cs285/scripts/read_results.py new file mode 100644 index 00000000..3a5bc50f --- /dev/null +++ b/hw3/cs285/scripts/read_results.py @@ -0,0 +1,26 @@ +import glob +import tensorflow as tf + +def get_section_results(file): + """ + requires tensorflow==1.12.0 + """ + X = [] + Y = [] + for e in tf.train.summary_iterator(file): + for v in e.summary.value: + if v.tag == 'Train_EnvstepsSoFar': + X.append(v.simple_value) + elif v.tag == 'Eval_AverageReturn': + Y.append(v.simple_value) + return X, Y + +if __name__ == '__main__': + import glob + + logdir = 'data/q1_lb_rtg_na_CartPole-v0_13-09-2020_23-32-10/events*' + eventfile = glob.glob(logdir)[0] + + X, Y = get_section_results(eventfile) + for i, (x, y) in enumerate(zip(X, Y)): + print('Iteration {:d} | 
Train steps: {:d} | Return: {}'.format(i, int(x), y)) \ No newline at end of file diff --git a/hw3/cs285/scripts/run_hw3_actor_critic.ipynb b/hw3/cs285/scripts/run_hw3_actor_critic.ipynb new file mode 100644 index 00000000..669f666d --- /dev/null +++ b/hw3/cs285/scripts/run_hw3_actor_critic.ipynb @@ -0,0 +1,501 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "run_hw3_actor_critic.ipynb", + "provenance": [], + "collapsed_sections": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "gUl_qfOR8JV6" + }, + "source": [ + "##Setup\n", + "\n", + "You will need to make a copy of this notebook in your Google Drive before you can edit the homework files. You can do so with **File → Save a copy in Drive**." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "iizPcHAp8LnA", + "cellView": "form" + }, + "source": [ + "#@title mount your Google Drive\n", + "#@markdown Your work will be stored in a folder called `cs285_f2020` by default to prevent Colab instance timeouts from deleting your edits.\n", + "\n", + "import os\n", + "from google.colab import drive\n", + "drive.mount('/content/gdrive')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "nAb10wnb8N0m", + "cellView": "form" + }, + "source": [ + "#@title set up mount symlink\n", + "\n", + "DRIVE_PATH = '/content/gdrive/My\\ Drive/cs285_f2020'\n", + "DRIVE_PYTHON_PATH = DRIVE_PATH.replace('\\\\', '')\n", + "if not os.path.exists(DRIVE_PYTHON_PATH):\n", + " %mkdir $DRIVE_PATH\n", + "\n", + "## the space in `My Drive` causes some issues,\n", + "## make a symlink to avoid this\n", + "SYM_PATH = '/content/cs285_f2020'\n", + "if not os.path.exists(SYM_PATH):\n", + " !ln -s $DRIVE_PATH $SYM_PATH" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "gtS9-WsD8QVr", + "cellView": "form" + }, + "source": [ + "#@title apt install requirements\n", + "\n", + "#@markdown Run each section with Shift+Enter\n", + "\n", + "#@markdown Double-click on section headers to show code.\n", + "\n", + "!apt update \n", + "!apt install -y --no-install-recommends \\\n", + " build-essential \\\n", + " curl \\\n", + " git \\\n", + " gnupg2 \\\n", + " make \\\n", + " cmake \\\n", + " ffmpeg \\\n", + " swig \\\n", + " libz-dev \\\n", + " unzip \\\n", + " zlib1g-dev \\\n", + " libglfw3 \\\n", + " libglfw3-dev \\\n", + " libxrandr2 \\\n", + " libxinerama-dev \\\n", + " libxi6 \\\n", + " libxcursor-dev \\\n", + " libgl1-mesa-dev \\\n", + " libgl1-mesa-glx \\\n", + " libglew-dev \\\n", + " libosmesa6-dev \\\n", + " lsb-release \\\n", + " ack-grep \\\n", + " patchelf \\\n", + " wget \\\n", + " xpra \\\n", + " xserver-xorg-dev \\\n", + " xvfb \\\n", + " python-opengl \\\n", + " ffmpeg > /dev/null 2>&1\n", + "\n", + "!pip install opencv-python==3.4.0.12" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "VcKGekJN80NO", + "cellView": "form" + }, + "source": [ + "#@title download mujoco\n", + "\n", + "MJC_PATH = '{}/mujoco'.format(SYM_PATH)\n", + "if not os.path.exists(MJC_PATH):\n", + " %mkdir $MJC_PATH\n", + "%cd $MJC_PATH\n", + "if not os.path.exists(os.path.join(MJC_PATH, 'mujoco200')):\n", + " !wget -q https://www.roboti.us/download/mujoco200_linux.zip\n", + " !unzip -q mujoco200_linux.zip\n", + " %mv mujoco200_linux mujoco200\n", + " 
%rm mujoco200_linux.zip" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "NTiH9f9y82F_", + "cellView": "form" + }, + "source": [ + "#@title update mujoco paths\n", + "\n", + "import os\n", + "\n", + "os.environ['LD_LIBRARY_PATH'] += ':{}/mujoco200/bin'.format(MJC_PATH)\n", + "os.environ['MUJOCO_PY_MUJOCO_PATH'] = '{}/mujoco200'.format(MJC_PATH)\n", + "os.environ['MUJOCO_PY_MJKEY_PATH'] = '{}/mjkey.txt'.format(MJC_PATH)\n", + "\n", + "## installation on colab does not find *.so files\n", + "## in LD_LIBRARY_PATH, copy over manually instead\n", + "!cp $MJC_PATH/mujoco200/bin/*.so /usr/lib/x86_64-linux-gnu/" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "A0kPh99l87q0" + }, + "source": [ + "Ensure your `mjkey.txt` is in /content/cs285_f2020/mujoco before this step" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "X-LoOdZg84pI", + "cellView": "form" + }, + "source": [ + "#@title clone and install mujoco-py\n", + "\n", + "%cd $MJC_PATH\n", + "if not os.path.exists('mujoco-py'):\n", + " !git clone https://github.com/openai/mujoco-py.git\n", + "%cd mujoco-py\n", + "%pip install -e .\n", + "\n", + "## cythonize at the first import\n", + "import mujoco_py" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "-XcwBiBN8-Fg", + "cellView": "form" + }, + "source": [ + "#@title clone homework repo\n", + "#@markdown Note that this is the same codebase from homework 1,\n", + "#@markdown so you may need to move your old `homework_fall2020`\n", + "#@markdown folder in order to clone the repo again.\n", + "\n", + "#@markdown **Don't delete your old work though!**\n", + "#@markdown You will need it for this assignment.\n", + "\n", + "%cd $SYM_PATH\n", + "!git clone https://github.com/berkeleydeeprlcourse/homework_fall2020.git\n", + "\n", + "%cd homework_fall2020/hw3\n", + "%pip install -r requirements_colab.txt -f https://download.pytorch.org/whl/torch_stable.html\n", + "%pip install -e ." + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "g5xIOIpW8_jC", + "cellView": "both" + }, + "source": [ + "#@title set up virtual display\n", + "\n", + "from pyvirtualdisplay import Display\n", + "\n", + "display = Display(visible=0, size=(1400, 900))\n", + "display.start()\n", + "\n", + "# For later\n", + "from cs285.infrastructure.colab_utils import (\n", + " wrap_env,\n", + " show_video\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "2rsWAWaK9BVp", + "cellView": "both" + }, + "source": [ + "#@title test virtual display\n", + "\n", + "#@markdown If you see a video of a four-legged ant fumbling about, setup is complete!\n", + "\n", + "import gym\n", + "import matplotlib\n", + "matplotlib.use('Agg')\n", + "\n", + "env = wrap_env(gym.make(\"Ant-v2\"))\n", + "\n", + "observation = env.reset()\n", + "for i in range(10):\n", + " env.render(mode='rgb_array')\n", + " obs, rew, term, _ = env.step(env.action_space.sample() ) \n", + " if term:\n", + " break;\n", + " \n", + "env.close()\n", + "print('Loading video...')\n", + "show_video()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QizpiHDh9Fwk" + }, + "source": [ + "## Editing Code\n", + "\n", + "To edit code, click the folder icon on the left menu. Navigate to the corresponding file (`cs285_f2020/...`). 
Double click a file to open an editor. There is a timeout of about ~12 hours with Colab while it is active (and less if you close your browser window). We sync your edits to Google Drive so that you won't lose your work in the event of an instance timeout, but you will need to re-mount your Google Drive and re-install packages with every new instance." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "J_OxQ1AZSyXC" + }, + "source": [ + "## Run Actor Critic" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "IzuN647wT9iJ", + "cellView": "both" + }, + "source": [ + "#@title imports\n", + "import os\n", + "import time\n", + "\n", + "from cs285.agents.ac_agent import ACAgent\n", + "from cs285.infrastructure.rl_trainer import RL_Trainer\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 2" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "PQ9qWQu7TNb9", + "cellView": "both" + }, + "source": [ + "#@title runtime arguments\n", + "\n", + "class ACArgs:\n", + "\n", + " def __getitem__(self, key):\n", + " return getattr(self, key)\n", + "\n", + " def __setitem__(self, key, val):\n", + " setattr(self, key, val)\n", + "\n", + " def __contains__(self, key):\n", + " return hasattr(self, key)\n", + "\n", + " env_name = 'CartPole-v0' #@param ['CartPole-v0', 'InvertedPendulum-v2', 'HalfCheetah-v2']\n", + " exp_name = 'q4_ac' #@param\n", + "\n", + " ## PDF will tell you how to set ep_len\n", + " ## and discount for each environment\n", + " ep_len = 200 #@param {type: \"integer\"}\n", + "\n", + " #@markdown batches and steps\n", + " batch_size = 1000 #@param {type: \"integer\"}\n", + " eval_batch_size = 400#@param {type: \"integer\"}\n", + "\n", + " n_iter = 100 #@param {type: \"integer\"}\n", + " num_agent_train_steps_per_iter = 1 #@param {type: \"integer\"}\n", + " num_actor_updates_per_agent_update = 1 #@param {type: \"integer\"}\n", + " num_critic_updates_per_agent_update = 1 #@param {type: \"integer\"}\n", + " \n", + " #@markdown Actor-Critic parameters\n", + " discount = 0.9#@param {type: \"number\"}\n", + " learning_rate = 5e-3 #@param {type: \"number\"}\n", + " dont_standardize_advantages = False #@param {type: \"boolean\"}\n", + " num_target_updates = 10 #@param {type: \"integer\"}\n", + " num_grad_steps_per_target_update = 10 #@param {type: \"integer\"}\n", + " n_layers = 2 #@param {type: \"integer\"}\n", + " size = 64 #@param {type: \"integer\"}\n", + "\n", + " #@markdown system\n", + " save_params = False #@param {type: \"boolean\"}\n", + " no_gpu = False #@param {type: \"boolean\"}\n", + " which_gpu = 0 #@param {type: \"integer\"}\n", + " seed = 1 #@param {type: \"integer\"}\n", + "\n", + " #@markdown logging\n", + " ## default is to not log video so\n", + " ## that logs are small enough to be\n", + " ## uploaded to gradscope\n", + " video_log_freq = -1#@param {type: \"integer\"}\n", + " scalar_log_freq = 10 #@param {type: \"integer\"}\n", + "\n", + "\n", + "args = ACArgs()\n", + "\n", + "\n", + "if args['video_log_freq'] > 0:\n", + " import warnings\n", + " warnings.warn(\n", + " '''\\nLogging videos will make eventfiles too'''\n", + " '''\\nlarge for the autograder. 
Set video_log_freq = -1'''\n", + " '''\\nfor the runs you intend to submit.''')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "wqUVP5E5S1z8", + "cellView": "both" + }, + "source": [ + "#@title Define AC trainer\n", + "\n", + "class AC_Trainer(object):\n", + "\n", + " def __init__(self, params):\n", + "\n", + " #####################\n", + " ## SET AGENT PARAMS\n", + " #####################\n", + "\n", + " computation_graph_args = {\n", + " 'n_layers': params['n_layers'],\n", + " 'size': params['size'],\n", + " 'learning_rate': params['learning_rate'],\n", + " 'num_target_updates': params['num_target_updates'],\n", + " 'num_grad_steps_per_target_update': params['num_grad_steps_per_target_update'],\n", + " }\n", + "\n", + " estimate_advantage_args = {\n", + " 'gamma': params['discount'],\n", + " 'standardize_advantages': not(params['dont_standardize_advantages']),\n", + " }\n", + "\n", + " train_args = {\n", + " 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],\n", + " 'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'],\n", + " 'num_actor_updates_per_agent_update': params['num_actor_updates_per_agent_update'],\n", + " }\n", + "\n", + " agent_params = {**computation_graph_args, **estimate_advantage_args, **train_args}\n", + "\n", + " self.params = params\n", + " self.params['agent_class'] = ACAgent\n", + " self.params['agent_params'] = agent_params\n", + " self.params['train_batch_size'] = params['batch_size']\n", + " self.params['batch_size_initial'] = self.params['batch_size']\n", + " self.params['non_atari_colab_env'] = True\n", + "\n", + " ################\n", + " ## RL TRAINER\n", + " ################\n", + "\n", + " self.rl_trainer = RL_Trainer(self.params)\n", + "\n", + " def run_training_loop(self):\n", + "\n", + " self.rl_trainer.run_training_loop(\n", + " self.params['n_iter'],\n", + " collect_policy = self.rl_trainer.agent.actor,\n", + " eval_policy = self.rl_trainer.agent.actor,\n", + " )\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "xuNw8N1jTg1p" + }, + "source": [ + "#@title create directories for logging\n", + "\n", + "data_path = '''/content/cs285_f2020/''' \\\n", + " '''homework_fall2020/hw3/data'''\n", + "\n", + "if not (os.path.exists(data_path)):\n", + " os.makedirs(data_path)\n", + "\n", + "logdir = 'hw3_' + args.exp_name + '_' + args.env_name + '_' + time.strftime(\"%d-%m-%Y_%H-%M-%S\")\n", + "logdir = os.path.join(data_path, logdir)\n", + "args['logdir'] = logdir\n", + "if not(os.path.exists(logdir)):\n", + " os.makedirs(logdir)\n", + "\n", + "print(\"LOGGING TO: \", logdir)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "_IGogH9YTt1y" + }, + "source": [ + "#@title run training\n", + "trainer = AC_Trainer(args)\n", + "trainer.run_training_loop()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "LjhrgXnUTzyi" + }, + "source": [ + "#@markdown You can visualize your runs with tensorboard from within the notebook\n", + "\n", + "## requires tensorflow==2.3.0\n", + "# %load_ext tensorboard\n", + "%tensorboard --logdir /content/cs285_f2020/homework_fall2020/hw3/data/" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/hw3/cs285/scripts/run_hw3_actor_critic.py b/hw3/cs285/scripts/run_hw3_actor_critic.py new file mode 100644 index 
00000000..0b6f1f08 --- /dev/null +++ b/hw3/cs285/scripts/run_hw3_actor_critic.py @@ -0,0 +1,126 @@ +import os +import time + +from cs285.agents.ac_agent import ACAgent +from cs285.infrastructure.rl_trainer import RL_Trainer + + +class AC_Trainer(object): + + def __init__(self, params): + + ##################### + ## SET AGENT PARAMS + ##################### + + computation_graph_args = { + 'n_layers': params['n_layers'], + 'size': params['size'], + 'learning_rate': params['learning_rate'], + 'num_target_updates': params['num_target_updates'], + 'num_grad_steps_per_target_update': params['num_grad_steps_per_target_update'], + } + + estimate_advantage_args = { + 'gamma': params['discount'], + 'standardize_advantages': not(params['dont_standardize_advantages']), + } + + train_args = { + 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'], + 'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'], + 'num_actor_updates_per_agent_update': params['num_actor_updates_per_agent_update'], + } + + agent_params = {**computation_graph_args, **estimate_advantage_args, **train_args} + + self.params = params + self.params['agent_class'] = ACAgent + self.params['agent_params'] = agent_params + self.params['batch_size_initial'] = self.params['batch_size'] + + ################ + ## RL TRAINER + ################ + + self.rl_trainer = RL_Trainer(self.params) + + def run_training_loop(self): + + self.rl_trainer.run_training_loop( + self.params['n_iter'], + collect_policy = self.rl_trainer.agent.actor, + eval_policy = self.rl_trainer.agent.actor, + ) + + +def main(): + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--env_name', type=str, default='CartPole-v0') + parser.add_argument('--ep_len', type=int, default=200) + parser.add_argument('--exp_name', type=str, default='todo') + parser.add_argument('--n_iter', '-n', type=int, default=200) + + parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1) + parser.add_argument('--num_critic_updates_per_agent_update', type=int, default=1) + parser.add_argument('--num_actor_updates_per_agent_update', type=int, default=1) + + parser.add_argument('--batch_size', '-b', type=int, default=1000) #steps collected per train iteration + parser.add_argument('--eval_batch_size', '-eb', type=int, default=400) #steps collected per eval iteration + parser.add_argument('--train_batch_size', '-tb', type=int, default=1000) ##steps used per gradient step + + parser.add_argument('--discount', type=float, default=1.0) + parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3) + parser.add_argument('--dont_standardize_advantages', '-dsa', action='http://github.com/store_true') + parser.add_argument('--num_target_updates', '-ntu', type=int, default=10) + parser.add_argument('--num_grad_steps_per_target_update', '-ngsptu', type=int, default=10) + parser.add_argument('--n_layers', '-l', type=int, default=2) + parser.add_argument('--size', '-s', type=int, default=64) + + parser.add_argument('--seed', type=int, default=1) + parser.add_argument('--no_gpu', '-ngpu', action='http://github.com/store_true') + parser.add_argument('--which_gpu', '-gpu_id', default=0) + parser.add_argument('--video_log_freq', type=int, default=-1) + parser.add_argument('--scalar_log_freq', type=int, default=10) + + parser.add_argument('--save_params', action='http://github.com/store_true') + + args = parser.parse_args() + + # convert to dictionary + params = vars(args) + + # for 
policy gradient, we made a design decision + # to force batch_size = train_batch_size + # note that, to avoid confusion, you don't even have a train_batch_size argument anymore (above) + params['train_batch_size'] = params['batch_size'] + + ################################## + ### CREATE DIRECTORY FOR LOGGING + ################################## + + data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data') + + if not (os.path.exists(data_path)): + os.makedirs(data_path) + + logdir = 'hw3_ ' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") + logdir = os.path.join(data_path, logdir) + params['logdir'] = logdir + if not(os.path.exists(logdir)): + os.makedirs(logdir) + + print("\n\n\nLOGGING TO: ", logdir, "\n\n\n") + + ################### + ### RUN TRAINING + ################### + + trainer = AC_Trainer(params) + trainer.run_training_loop() + + +if __name__ == "__main__": + main() diff --git a/hw3/cs285/scripts/run_hw3_dqn.ipynb b/hw3/cs285/scripts/run_hw3_dqn.ipynb new file mode 100644 index 00000000..4371b847 --- /dev/null +++ b/hw3/cs285/scripts/run_hw3_dqn.ipynb @@ -0,0 +1,477 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "run_hw3_dqn.ipynb", + "provenance": [], + "collapsed_sections": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "gUl_qfOR8JV6" + }, + "source": [ + "##Setup\n", + "\n", + "You will need to make a copy of this notebook in your Google Drive before you can edit the homework files. You can do so with **File → Save a copy in Drive**." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "iizPcHAp8LnA", + "cellView": "form" + }, + "source": [ + "#@title mount your Google Drive\n", + "#@markdown Your work will be stored in a folder called `cs285_f2020` by default to prevent Colab instance timeouts from deleting your edits.\n", + "\n", + "import os\n", + "from google.colab import drive\n", + "drive.mount('/content/gdrive')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "nAb10wnb8N0m", + "cellView": "form" + }, + "source": [ + "#@title set up mount symlink\n", + "\n", + "DRIVE_PATH = '/content/gdrive/My\\ Drive/cs285_f2020'\n", + "DRIVE_PYTHON_PATH = DRIVE_PATH.replace('\\\\', '')\n", + "if not os.path.exists(DRIVE_PYTHON_PATH):\n", + " %mkdir $DRIVE_PATH\n", + "\n", + "## the space in `My Drive` causes some issues,\n", + "## make a symlink to avoid this\n", + "SYM_PATH = '/content/cs285_f2020'\n", + "if not os.path.exists(SYM_PATH):\n", + " !ln -s $DRIVE_PATH $SYM_PATH" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "gtS9-WsD8QVr", + "cellView": "form" + }, + "source": [ + "#@title apt install requirements\n", + "\n", + "#@markdown Run each section with Shift+Enter\n", + "\n", + "#@markdown Double-click on section headers to show code.\n", + "\n", + "!apt update \n", + "!apt install -y --no-install-recommends \\\n", + " build-essential \\\n", + " curl \\\n", + " git \\\n", + " gnupg2 \\\n", + " make \\\n", + " cmake \\\n", + " ffmpeg \\\n", + " swig \\\n", + " libz-dev \\\n", + " unzip \\\n", + " zlib1g-dev \\\n", + " libglfw3 \\\n", + " libglfw3-dev \\\n", + " libxrandr2 \\\n", + " libxinerama-dev \\\n", + " libxi6 \\\n", + " libxcursor-dev \\\n", + " libgl1-mesa-dev \\\n", + " libgl1-mesa-glx \\\n", + " libglew-dev 
\\\n", + " libosmesa6-dev \\\n", + " lsb-release \\\n", + " ack-grep \\\n", + " patchelf \\\n", + " wget \\\n", + " xpra \\\n", + " xserver-xorg-dev \\\n", + " xvfb \\\n", + " python-opengl \\\n", + " ffmpeg > /dev/null 2>&1\n", + "\n", + "!pip install opencv-python==3.4.0.12" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "VcKGekJN80NO", + "cellView": "form" + }, + "source": [ + "#@title download mujoco\n", + "\n", + "MJC_PATH = '{}/mujoco'.format(SYM_PATH)\n", + "if not os.path.exists(MJC_PATH):\n", + " %mkdir $MJC_PATH\n", + "%cd $MJC_PATH\n", + "if not os.path.exists(os.path.join(MJC_PATH, 'mujoco200')):\n", + " !wget -q https://www.roboti.us/download/mujoco200_linux.zip\n", + " !unzip -q mujoco200_linux.zip\n", + " %mv mujoco200_linux mujoco200\n", + " %rm mujoco200_linux.zip" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "NTiH9f9y82F_", + "cellView": "form" + }, + "source": [ + "#@title update mujoco paths\n", + "\n", + "import os\n", + "\n", + "os.environ['LD_LIBRARY_PATH'] += ':{}/mujoco200/bin'.format(MJC_PATH)\n", + "os.environ['MUJOCO_PY_MUJOCO_PATH'] = '{}/mujoco200'.format(MJC_PATH)\n", + "os.environ['MUJOCO_PY_MJKEY_PATH'] = '{}/mjkey.txt'.format(MJC_PATH)\n", + "\n", + "## installation on colab does not find *.so files\n", + "## in LD_LIBRARY_PATH, copy over manually instead\n", + "!cp $MJC_PATH/mujoco200/bin/*.so /usr/lib/x86_64-linux-gnu/" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "A0kPh99l87q0" + }, + "source": [ + "Ensure your `mjkey.txt` is in /content/cs285_f2020/mujoco before this step" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "X-LoOdZg84pI", + "cellView": "form" + }, + "source": [ + "#@title clone and install mujoco-py\n", + "\n", + "%cd $MJC_PATH\n", + "if not os.path.exists('mujoco-py'):\n", + " !git clone https://github.com/openai/mujoco-py.git\n", + "%cd mujoco-py\n", + "%pip install -e .\n", + "\n", + "## cythonize at the first import\n", + "import mujoco_py" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "-XcwBiBN8-Fg", + "cellView": "form" + }, + "source": [ + "#@title clone homework repo\n", + "#@markdown Note that this is the same codebase from homework 1,\n", + "#@markdown so you may need to move your old `homework_fall2020`\n", + "#@markdown folder in order to clone the repo again.\n", + "\n", + "#@markdown **Don't delete your old work though!**\n", + "#@markdown You will need it for this assignment.\n", + "\n", + "%cd $SYM_PATH\n", + "!git clone https://github.com/berkeleydeeprlcourse/homework_fall2020.git\n", + "\n", + "%cd homework_fall2020/hw3\n", + "%pip install -r requirements_colab.txt -f https://download.pytorch.org/whl/torch_stable.html\n", + "%pip install -e ." 
+ ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "g5xIOIpW8_jC", + "cellView": "form" + }, + "source": [ + "#@title set up virtual display\n", + "\n", + "from pyvirtualdisplay import Display\n", + "\n", + "display = Display(visible=0, size=(1400, 900))\n", + "display.start()\n", + "\n", + "# For later\n", + "from cs285.infrastructure.colab_utils import (\n", + " wrap_env,\n", + " show_video\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "2rsWAWaK9BVp", + "cellView": "form" + }, + "source": [ + "#@title test virtual display\n", + "\n", + "#@markdown If you see a video of a four-legged ant fumbling about, setup is complete!\n", + "\n", + "import gym\n", + "import matplotlib\n", + "matplotlib.use('Agg')\n", + "\n", + "env = wrap_env(gym.make(\"Ant-v2\"))\n", + "\n", + "observation = env.reset()\n", + "for i in range(10):\n", + " env.render(mode='rgb_array')\n", + " obs, rew, term, _ = env.step(env.action_space.sample() ) \n", + " if term:\n", + " break;\n", + " \n", + "env.close()\n", + "print('Loading video...')\n", + "show_video()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QizpiHDh9Fwk" + }, + "source": [ + "## Editing Code\n", + "\n", + "To edit code, click the folder icon on the left menu. Navigate to the corresponding file (`cs285_f2020/...`). Double click a file to open an editor. There is a timeout of about ~12 hours with Colab while it is active (and less if you close your browser window). We sync your edits to Google Drive so that you won't lose your work in the event of an instance timeout, but you will need to re-mount your Google Drive and re-install packages with every new instance." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Nii6qk2C9Ipk" + }, + "source": [ + "## Run DQN and Double DQN" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "4t7FUeEG9Dkf" + }, + "source": [ + "#@title imports\n", + "import os\n", + "import time\n", + "\n", + "from cs285.infrastructure.rl_trainer import RL_Trainer\n", + "from cs285.agents.dqn_agent import DQNAgent\n", + "from cs285.infrastructure.dqn_utils import get_env_kwargs\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 2" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "2fXlzARJ9i-t", + "cellView": "both" + }, + "source": [ + "#@title runtime arguments\n", + "\n", + "class Args:\n", + "\n", + " def __getitem__(self, key):\n", + " return getattr(self, key)\n", + "\n", + " def __setitem__(self, key, val):\n", + " setattr(self, key, val)\n", + "\n", + " def __contains__(self, key):\n", + " return hasattr(self, key)\n", + "\n", + " env_name = 'MsPacman-v0' #@param ['MsPacman-v0', 'LunarLander-v3', 'PongNoFrameSkip-v4']\n", + " exp_name = 'q3_dqn' #@param\n", + "\n", + " ## PDF will tell you how to set ep_len\n", + " ## and discount for each environment\n", + " ep_len = 200 #@param {type: \"integer\"}\n", + "\n", + " #@markdown batches and steps\n", + " batch_size = 32 #@param {type: \"integer\"}\n", + " eval_batch_size = 1000 #@param {type: \"integer\"}\n", + "\n", + " num_agent_train_steps_per_iter = 1 #@param {type: \"integer\"}\n", + "\n", + " num_critic_updates_per_agent_update = 1 #@param {type: \"integer\"}\n", + " \n", + " #@markdown Q-learning parameters\n", + " double_q = False #@param {type: \"boolean\"}\n", + "\n", + " #@markdown system\n", + " save_params = False #@param {type: \"boolean\"}\n", + " no_gpu = False #@param {type: \"boolean\"}\n", + " which_gpu = 0 #@param {type: \"integer\"}\n", + " seed = 1 #@param {type: \"integer\"}\n", + "\n", + " #@markdown logging\n", + " ## default is to not log video so\n", + " ## that logs are small enough to be\n", + " ## uploaded to gradscope\n", + " video_log_freq = -1 #@param {type: \"integer\"}\n", + " scalar_log_freq = 10000#@param {type: \"integer\"}\n", + "\n", + "\n", + "args = Args()\n", + "\n", + "## ensure compatibility with hw1 code\n", + "args['train_batch_size'] = args['batch_size']\n", + "\n", + "if args['video_log_freq'] > 0:\n", + " import warnings\n", + " warnings.warn(\n", + " '''\\nLogging videos will make eventfiles too'''\n", + " '''\\nlarge for the autograder. 
Set video_log_freq = -1'''\n", + " '''\\nfor the runs you intend to submit.''')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "T0cJlp6s-ogO" + }, + "source": [ + "#@title create directories for logging\n", + "\n", + "data_path = '''/content/cs285_f2020/''' \\\n", + " '''homework_fall2020/hw3/data'''\n", + "\n", + "if not (os.path.exists(data_path)):\n", + " os.makedirs(data_path)\n", + "\n", + "logdir = 'hw3_' + args.exp_name + '_' + args.env_name + '_' + time.strftime(\"%d-%m-%Y_%H-%M-%S\")\n", + "logdir = os.path.join(data_path, logdir)\n", + "args['logdir'] = logdir\n", + "if not(os.path.exists(logdir)):\n", + " os.makedirs(logdir)\n", + "\n", + "print(\"LOGGING TO: \", logdir)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "I525KFRN-42s" + }, + "source": [ + "#@title Define Q-function trainer\n", + "\n", + "class Q_Trainer(object):\n", + "\n", + " def __init__(self, params):\n", + " self.params = params\n", + "\n", + " train_args = {\n", + " 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],\n", + " 'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'],\n", + " 'train_batch_size': params['batch_size'],\n", + " 'double_q': params['double_q'],\n", + " }\n", + "\n", + " env_args = get_env_kwargs(params['env_name'])\n", + "\n", + " for k, v in env_args.items():\n", + " params[k] = v\n", + "\n", + " self.params['agent_class'] = DQNAgent\n", + " self.params['agent_params'] = params\n", + " self.params['train_batch_size'] = params['batch_size']\n", + " self.params['env_wrappers'] = env_args['env_wrappers']\n", + "\n", + " self.rl_trainer = RL_Trainer(self.params)\n", + "\n", + " def run_training_loop(self):\n", + " self.rl_trainer.run_training_loop(\n", + " self.params['num_timesteps'],\n", + " collect_policy = self.rl_trainer.agent.actor,\n", + " eval_policy = self.rl_trainer.agent.actor,\n", + " )" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "wF4LSRGn-_Cv" + }, + "source": [ + "#@title run training\n", + "\n", + "trainer = Q_Trainer(args)\n", + "trainer.run_training_loop()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "_kTH-tXkI-B-" + }, + "source": [ + "#@markdown You can visualize your runs with tensorboard from within the notebook\n", + "\n", + "## requires tensorflow==2.3.0\n", + "%load_ext tensorboard\n", + "%tensorboard --logdir /content/cs285_f2020/homework_fall2020/hw3/data/" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/hw3/cs285/scripts/run_hw3_dqn.py b/hw3/cs285/scripts/run_hw3_dqn.py new file mode 100644 index 00000000..c8ecab0e --- /dev/null +++ b/hw3/cs285/scripts/run_hw3_dqn.py @@ -0,0 +1,94 @@ +import os +import time + +from cs285.infrastructure.rl_trainer import RL_Trainer +from cs285.agents.dqn_agent import DQNAgent +from cs285.infrastructure.dqn_utils import get_env_kwargs + + +class Q_Trainer(object): + + def __init__(self, params): + self.params = params + + train_args = { + 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'], + 'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'], + 'train_batch_size': params['batch_size'], + 'double_q': params['double_q'], + } + + env_args = get_env_kwargs(params['env_name']) + + self.agent_params = {**train_args, **env_args, **params} + + 
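# NOTE: dict-unpacking order matters here; any key present in both env_args and params takes its value from params (the command-line / notebook arguments). + 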
self.params['agent_class'] = DQNAgent + self.params['agent_params'] = self.agent_params + self.params['train_batch_size'] = params['batch_size'] + self.params['env_wrappers'] = self.agent_params['env_wrappers'] + + self.rl_trainer = RL_Trainer(self.params) + + def run_training_loop(self): + self.rl_trainer.run_training_loop( + self.agent_params['num_timesteps'], + collect_policy = self.rl_trainer.agent.actor, + eval_policy = self.rl_trainer.agent.actor, + ) + +def main(): + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument( + '--env_name', + default='MsPacman-v0', + choices=('PongNoFrameskip-v4', 'LunarLander-v3', 'MsPacman-v0') + ) + + parser.add_argument('--ep_len', type=int, default=200) + parser.add_argument('--exp_name', type=str, default='todo') + + parser.add_argument('--eval_batch_size', type=int, default=1000) + + parser.add_argument('--batch_size', type=int, default=32) + parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1) + parser.add_argument('--num_critic_updates_per_agent_update', type=int, default=1) + parser.add_argument('--double_q', action='http://github.com/store_true') + + parser.add_argument('--seed', type=int, default=1) + parser.add_argument('--no_gpu', '-ngpu', action='http://github.com/store_true') + parser.add_argument('--which_gpu', '-gpu_id', default=0) + parser.add_argument('--scalar_log_freq', type=int, default=int(1e4)) + parser.add_argument('--video_log_freq', type=int, default=-1) + + parser.add_argument('--save_params', action='http://github.com/store_true') + + args = parser.parse_args() + + # convert to dictionary + params = vars(args) + params['video_log_freq'] = -1 # This param is not used for DQN + ################################## + ### CREATE DIRECTORY FOR LOGGING + ################################## + + data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data') + + if not (os.path.exists(data_path)): + os.makedirs(data_path) + + logdir = 'hw3_' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") + logdir = os.path.join(data_path, logdir) + params['logdir'] = logdir + if not(os.path.exists(logdir)): + os.makedirs(logdir) + + print("\n\n\nLOGGING TO: ", logdir, "\n\n\n") + + trainer = Q_Trainer(params) + trainer.run_training_loop() + + +if __name__ == "__main__": + main() diff --git a/hw3/cs285_hw3.pdf b/hw3/cs285_hw3.pdf new file mode 100644 index 00000000..f0073e77 Binary files /dev/null and b/hw3/cs285_hw3.pdf differ diff --git a/hw3/requirements.txt b/hw3/requirements.txt new file mode 100644 index 00000000..faec9bb9 --- /dev/null +++ b/hw3/requirements.txt @@ -0,0 +1,12 @@ +gym[atari]==0.17.2 +mujoco-py==2.0.2.2 +tensorboard==2.3.0 +tensorboardX==1.8 +matplotlib==2.2.2 +ipython==6.4.0 +moviepy==1.0.0 +pyvirtualdisplay==1.3.2 +torch==1.5.1 +opencv-python==4.4.0.42 +ipdb==0.13.3 +box2d-py diff --git a/hw3/requirements_colab.txt b/hw3/requirements_colab.txt new file mode 100644 index 00000000..4fafb598 --- /dev/null +++ b/hw3/requirements_colab.txt @@ -0,0 +1,11 @@ +gym[atari]==0.17.2 +tensorboard==2.3.0 +tensorboardX==1.8 +matplotlib==2.2.2 +ipython==6.4.0 +moviepy==1.0.0 +pyvirtualdisplay==1.3.2 +torch==1.5.1 +opencv-python==4.4.0.42 +ipdb==0.13.3 +box2d-py diff --git a/hw3/setup.py b/hw3/setup.py new file mode 100644 index 00000000..3cc1886e --- /dev/null +++ b/hw3/setup.py @@ -0,0 +1,8 @@ +# setup.py +from setuptools import setup + +setup( + name='cs285', + version='0.1.0', + packages=['cs285'], +) \ No newline at 
end of file diff --git a/hw4/README.md b/hw4/README.md new file mode 100644 index 00000000..eee08587 --- /dev/null +++ b/hw4/README.md @@ -0,0 +1,28 @@ +## Setup + +You can run this code on your own machine or on Google Colab. + +1. **Local option:** If you choose to run locally, you will need to install MuJoCo and some Python packages; see [installation.md](../hw1/installation.md) from homework 1 for instructions. There are two new package requirements (`opencv-python` and `gym[atari]`) beyond what was used in the previous assignments; make sure to install these with `pip install -r requirements.txt` if you are running the assignment locally. + +2. **Colab:** The first few sections of the notebook will install all required dependencies. You can try out the Colab option by clicking the badge below: + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw4/cs285/scripts/run_hw4_mb.ipynb) + +## Complete the code + +The following files have blanks to be filled with your solutions from homework 1. The relevant sections are marked with `TODO: get this from Piazza`. + +- [infrastructure/rl_trainer.py](cs285/infrastructure/rl_trainer.py) +- [infrastructure/utils.py](cs285/infrastructure/utils.py) + +You will then need to implement code in the following files: +- [agents/mb_agent.py](cs285/agents/mb_agent.py) +- [models/ff_model.py](cs285/models/ff_model.py) +- [policies/MPC_policy.py](cs285/policies/MPC_policy.py) + +The relevant sections are marked with `TODO`. + +You may also want to look through [scripts/run_hw4_mb.py](cs285/scripts/run_hw4_mb.py) (if running locally) or [scripts/run_hw4_mb.ipynb](cs285/scripts/run_hw4_mb.ipynb) (if running on Colab), though you will not need to edit these files beyond changing runtime arguments in the Colab notebook. + +See the [assignment PDF](cs285_hw4.pdf) for more details on what files to edit. 
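+ +For a quick sanity check of your setup, a local run looks roughly like the following (the flag names here are assumptions carried over from the earlier assignments and from the parameters used in `rl_trainer.py`; check the argument parser in `run_hw4_mb.py` for the authoritative list): + +``` +python cs285/scripts/run_hw4_mb.py \ + --exp_name mb_obstacles --env_name obstacles-cs285-v0 \ + --add_sl_noise --video_log_freq -1 +``` + +The environments registered for this assignment are `cheetah-cs285-v0`, `obstacles-cs285-v0`, and `reacher-cs285-v0` (see [envs/__init__.py](cs285/envs/__init__.py)).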
+ diff --git a/hw4/cs285/agents/base_agent.py b/hw4/cs285/agents/base_agent.py new file mode 100644 index 00000000..a32224b5 --- /dev/null +++ b/hw4/cs285/agents/base_agent.py @@ -0,0 +1,16 @@ +class BaseAgent(object): + def __init__(self, **kwargs): + super(BaseAgent, self).__init__(**kwargs) + + def train(self) -> dict: + """Return a dictionary of logging information.""" + raise NotImplementedError + + def add_to_replay_buffer(self, paths): + raise NotImplementedError + + def sample(self, batch_size): + raise NotImplementedError + + def save(self, path): + raise NotImplementedError \ No newline at end of file diff --git a/hw4/cs285/agents/mb_agent.py b/hw4/cs285/agents/mb_agent.py new file mode 100644 index 00000000..d88100f9 --- /dev/null +++ b/hw4/cs285/agents/mb_agent.py @@ -0,0 +1,90 @@ +from .base_agent import BaseAgent +from cs285.models.ff_model import FFModel +from cs285.policies.MPC_policy import MPCPolicy +from cs285.infrastructure.replay_buffer import ReplayBuffer +from cs285.infrastructure.utils import * + + +class MBAgent(BaseAgent): + def __init__(self, env, agent_params): + super(MBAgent, self).__init__() + + self.env = env.unwrapped + self.agent_params = agent_params + self.ensemble_size = self.agent_params['ensemble_size'] + + self.dyn_models = [] + for i in range(self.ensemble_size): + model = FFModel( + self.agent_params['ac_dim'], + self.agent_params['ob_dim'], + self.agent_params['n_layers'], + self.agent_params['size'], + self.agent_params['learning_rate'], + ) + self.dyn_models.append(model) + + self.actor = MPCPolicy( + self.env, + ac_dim=self.agent_params['ac_dim'], + dyn_models=self.dyn_models, + horizon=self.agent_params['mpc_horizon'], + N=self.agent_params['mpc_num_action_sequences'], + ) + + self.replay_buffer = ReplayBuffer() + + def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): + + # training a MB agent refers to updating the predictive model using observed state transitions + # NOTE: each model in the ensemble is trained on a different random batch of size batch_size + losses = [] + num_data = ob_no.shape[0] + num_data_per_ens = int(num_data / self.ensemble_size) + + for i in range(self.ensemble_size): + + # select which datapoints to use for this model of the ensemble + # you might find the num_data_per_env variable defined above useful + + observations = # TODO(Q1) + actions = # TODO(Q1) + next_observations = # TODO(Q1) + + # use datapoints to update one of the dyn_models + model = # TODO(Q1) + log = model.update(observations, actions, next_observations, + self.data_statistics) + loss = log['Training Loss'] + losses.append(loss) + + avg_loss = np.mean(losses) + return { + 'Training Loss': avg_loss, + } + + def add_to_replay_buffer(self, paths, add_sl_noise=False): + + # add data to replay buffer + self.replay_buffer.add_rollouts(paths, noised=add_sl_noise) + + # get updated mean/std of the data in our replay buffer + self.data_statistics = { + 'obs_mean': np.mean(self.replay_buffer.obs, axis=0), + 'obs_std': np.std(self.replay_buffer.obs, axis=0), + 'acs_mean': np.mean(self.replay_buffer.acs, axis=0), + 'acs_std': np.std(self.replay_buffer.acs, axis=0), + 'delta_mean': np.mean( + self.replay_buffer.next_obs - self.replay_buffer.obs, axis=0), + 'delta_std': np.std( + self.replay_buffer.next_obs - self.replay_buffer.obs, axis=0), + } + + # update the actor's data_statistics too, so actor.get_action can be calculated correctly + self.actor.data_statistics = self.data_statistics + + def sample(self, batch_size): + # NOTE: sampling batch_size * 
ensemble_size, + # so each model in our ensemble can get trained on batch_size data + return self.replay_buffer.sample_random_data( + batch_size * self.ensemble_size) diff --git a/hw4/cs285/envs/__init__.py b/hw4/cs285/envs/__init__.py new file mode 100644 index 00000000..18911594 --- /dev/null +++ b/hw4/cs285/envs/__init__.py @@ -0,0 +1,18 @@ +from gym.envs.registration import register + +def register_envs(): + register( + id='cheetah-cs285-v0', + entry_point='cs285.envs.cheetah:HalfCheetahEnv', + max_episode_steps=1000, + ) + register( + id='obstacles-cs285-v0', + entry_point='cs285.envs.obstacles:Obstacles', + max_episode_steps=500, + ) + register( + id='reacher-cs285-v0', + entry_point='cs285.envs.reacher:Reacher7DOFEnv', + max_episode_steps=500, + ) diff --git a/hw4/cs285/envs/cheetah/__init__.py b/hw4/cs285/envs/cheetah/__init__.py new file mode 100644 index 00000000..b681e92c --- /dev/null +++ b/hw4/cs285/envs/cheetah/__init__.py @@ -0,0 +1 @@ +from cs285.envs.cheetah.cheetah import HalfCheetahEnv diff --git a/hw4/cs285/envs/cheetah/cheetah.py b/hw4/cs285/envs/cheetah/cheetah.py new file mode 100644 index 00000000..4cd8a1e4 --- /dev/null +++ b/hw4/cs285/envs/cheetah/cheetah.py @@ -0,0 +1,133 @@ +import numpy as np +import mujoco_py +from gym import utils +from gym.envs.mujoco import mujoco_env + +class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): + + def __init__(self): + + mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 1) + utils.EzPickle.__init__(self) + + self.skip = self.frame_skip + + self.action_dim = self.ac_dim = self.action_space.shape[0] + self.observation_dim = self.obs_dim = self.observation_space.shape[0] + + def get_reward(self, observations, actions): + + """get reward/s of given (observations, actions) datapoint or datapoints + + Args: + observations: (batchsize, obs_dim) or (obs_dim,) + actions: (batchsize, ac_dim) or (ac_dim,) + + Return: + r_total: reward of this (o,a) pair, dimension is (batchsize,1) or (1,) + done: True if env reaches terminal state, dimension is (batchsize,1) or (1,) + """ + + #initialize and reshape as needed, for batch mode + self.reward_dict = {} + if(len(observations.shape)==1): + observations = np.expand_dims(observations, axis = 0) + actions = np.expand_dims(actions, axis = 0) + batch_mode = False + else: + batch_mode = True + + #get vars + xvel = observations[:, 9].copy() + body_angle = observations[:, 2].copy() + front_leg = observations[:, 6].copy() + front_shin = observations[:, 7].copy() + front_foot = observations[:, 8].copy() + zeros = np.zeros((observations.shape[0],)).copy() + + # ranges + leg_range = 0.2 + shin_range = 0 + foot_range = 0 + penalty_factor = 10 + + #calc rew + self.reward_dict['run'] = xvel + + front_leg_rew = zeros.copy() + front_leg_rew[front_leg>leg_range] = -penalty_factor + self.reward_dict['leg'] = front_leg_rew + + front_shin_rew = zeros.copy() + front_shin_rew[front_shin>shin_range] = -penalty_factor + self.reward_dict['shin'] = front_shin_rew + + front_foot_rew = zeros.copy() + front_foot_rew[front_foot>foot_range] = -penalty_factor + self.reward_dict['foot'] = front_foot_rew + + # total reward + self.reward_dict['r_total'] = self.reward_dict['run'] + self.reward_dict['leg'] + self.reward_dict['shin'] + self.reward_dict['foot'] + + #return + dones = zeros.copy() + if(not batch_mode): + return self.reward_dict['r_total'][0], dones[0] + return self.reward_dict['r_total'], dones + + + def get_score(self, obs): + xposafter = obs[0] + return xposafter + + 
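## NOTE: get_reward above accepts either a single (obs, action) pair or a batch of shape (batchsize, dim), and returns per-sample rewards and done flags, so a planner can score many candidate action sequences in one call. + 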
############################################## + + def step(self, action): + + #step + self.do_simulation(action, self.frame_skip) + + #obs/reward/done/score + ob = self._get_obs() + rew, done = self.get_reward(ob, action) + score = self.get_score(ob) + + #return + env_info = {'obs_dict': self.obs_dict, + 'rewards': self.reward_dict, + 'score': score} + return ob, rew, done, env_info + + def _get_obs(self): + + self.obs_dict = {} + self.obs_dict['joints_pos'] = self.sim.data.qpos.flat.copy() + self.obs_dict['joints_vel'] = self.sim.data.qvel.flat.copy() + self.obs_dict['com_torso'] = self.get_body_com("torso").flat.copy() + + return np.concatenate([ + self.obs_dict['joints_pos'], #9 + self.obs_dict['joints_vel'], #9 + self.obs_dict['com_torso'], #3 + ]) + + ############################################## + + def reset_model(self, seed=None): + + # set reset pose/vel + self.reset_pose = self.init_qpos + self.np_random.uniform( + low=-.1, high=.1, size=self.model.nq) + self.reset_vel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 + + #reset the env to that pose/vel + return self.do_reset(self.reset_pose.copy(), self.reset_vel.copy()) + + + def do_reset(self, reset_pose, reset_vel, reset_goal=None): + + #reset + self.set_state(reset_pose, reset_vel) + + #return + return self._get_obs() diff --git a/hw4/cs285/envs/obstacles/__init__.py b/hw4/cs285/envs/obstacles/__init__.py new file mode 100644 index 00000000..30a70022 --- /dev/null +++ b/hw4/cs285/envs/obstacles/__init__.py @@ -0,0 +1 @@ +from cs285.envs.obstacles.obstacles_env import Obstacles diff --git a/hw4/cs285/envs/obstacles/obstacles_env.py b/hw4/cs285/envs/obstacles/obstacles_env.py new file mode 100644 index 00000000..19f7f835 --- /dev/null +++ b/hw4/cs285/envs/obstacles/obstacles_env.py @@ -0,0 +1,227 @@ +import gym +import numpy as np +from gym import spaces + +class Obstacles(gym.Env): + def __init__(self, start=[-0.5, 0.75], end=[0.7, -0.8], random_starts=True): + + import matplotlib.pyplot as plt #inside, so doesnt get imported when not using this env + self.plt = plt + + self.action_dim = self.ac_dim = 2 + self.observation_dim = self.obs_dim = 4 + self.boundary_min = -0.99 + self.boundary_max = 0.99 + + low = self.boundary_min*np.ones((self.action_dim,)) + high = self.boundary_max*np.ones((self.action_dim,)) + self.action_space = spaces.Box(low, high, dtype=np.float32) + + high = np.inf*np.ones(self.obs_dim) + low = -high + self.observation_space = spaces.Box(low, high, dtype=np.float32) + + self.env_name = 'obstacles' + self.is_gym = True + + self.start = np.array(start) + self.end = np.array(end) + self.current = np.array(start) + self.random_starts = random_starts + + #obstacles are rectangles, specified by [x of top left, y of topleft, width x, height y] + self.obstacles = [] + self.obstacles.append([-0.4, 0.8, 0.4, 0.3]) + self.obstacles.append([-0.9, 0.3, 0.2, 0.6]) + self.obstacles.append([0.6, -0.1, 0.12, 0.4]) + self.obstacles.append([-0.1, 0.2, 0.15, 0.4]) + self.obstacles.append([0.1, -0.7, 0.3, 0.15]) + + self.eps = 0.1 + self.fig = self.plt.figure() + + def seed(self, seed): + np.random.seed(seed) + + ######################################### + + def pick_start_pos(self): + if self.random_starts: + temp = np.random.uniform([self.boundary_min, self.boundary_min+1.25], [self.boundary_max-0.4, self.boundary_max], (self.action_dim,)) + if not self.is_valid(temp[None, :]): + temp = self.pick_start_pos() + else: + temp = self.start + return temp + + ######################################### + + def 
reset(self, seed=None): + if seed: + self.seed(seed) + + self.reset_pose = self.pick_start_pos() + self.reset_vel = self.end + + return self.do_reset(self.reset_pose, self.reset_vel) + + def do_reset(self, reset_pose, reset_vel, reset_goal=None): + + self.current = reset_pose.copy() + self.end = reset_vel.copy() + + #clear + self.counter = 0 + self.plt.clf() + + #return + return self._get_obs() + + ######################################### + + def _get_obs(self): + return np.concatenate([self.current,self.end]) + + def get_score(self, obs): + curr_pos = obs[:2] + end_pos = obs[-2:] + score = -1*np.abs(curr_pos-end_pos) + return score + + def get_reward(self, observations, actions): + + """get reward/s of given (observations, actions) datapoint or datapoints + + Args: + observations: (batchsize, obs_dim) or (obs_dim,) + actions: (batchsize, ac_dim) or (ac_dim,) + + Return: + r_total: reward of this (o,a) pair, dimension is (batchsize,1) or (1,) + done: True if env reaches terminal state, dimension is (batchsize,1) or (1,) + """ + + #initialize and reshape as needed, for batch mode + self.reward_dict = {} + if(len(observations.shape)==1): + observations = np.expand_dims(observations, axis = 0) + actions = np.expand_dims(actions, axis = 0) + batch_mode = False + else: + batch_mode = True + + #get vars + curr_pos = observations[:, :2] + end_pos = observations[:, -2:] + + #calc rew + dist = np.linalg.norm(curr_pos - end_pos, axis=1) + self.reward_dict['dist'] = -dist + self.reward_dict['r_total'] = self.reward_dict['dist'] + + #done + dones = np.zeros((observations.shape[0],)) + dones[dist<self.eps] = 1 + + oob = np.zeros((observations.shape[0],)) + oob[curr_pos[:,0]<self.boundary_min] = 1 + oob[curr_pos[:,0]>self.boundary_max] = 1 + oob[curr_pos[:,1]<self.boundary_min] = 1 + oob[curr_pos[:,1]>self.boundary_max] = 1 + dones[oob==1] = 1 + + #return + if(not batch_mode): + return self.reward_dict['r_total'][0], dones[0] + return self.reward_dict['r_total'], dones + + def step(self, action): + self.counter += 1 + action = np.clip(action, -1, 1) #clip (-1, 1) + action = action / 10. 
#scale (-1,1) to (-0.1, 0.1) + + # move, only if its a valid move (else, keep it there because it cant move) + temp = self.current + action + if self.is_valid(temp[None, :]): + self.current = temp + + ob = self._get_obs() + reward, done = self.get_reward(ob, action) + score = self.get_score(ob) + env_info = {'ob': ob, + 'rewards': self.reward_dict, + 'score': score} + + return ob, reward, done, env_info + + ######################################## + # utility functions + ######################################## + + def render(self, mode=None): + # boundaries + self.plt.plot([self.boundary_min, self.boundary_min], + [self.boundary_min, self.boundary_max], 'k') + self.plt.plot([self.boundary_max, self.boundary_max], + [self.boundary_min, self.boundary_max], 'k') + self.plt.plot([self.boundary_min, self.boundary_max], + [self.boundary_min, self.boundary_min], 'k') + self.plt.plot([self.boundary_min, self.boundary_max], + [self.boundary_max, self.boundary_max], 'k') + # obstacles + for obstacle in self.obstacles: + tl_x = obstacle[0] + tl_y = obstacle[1] + tr_x = tl_x + obstacle[2] + tr_y = tl_y + bl_x = tl_x + bl_y = tl_y - obstacle[3] + br_x = tr_x + br_y = bl_y + self.plt.plot([bl_x, br_x], [bl_y, br_y], 'r') + self.plt.plot([tl_x, tr_x], [tl_y, tr_y], 'r') + self.plt.plot([bl_x, bl_x], [bl_y, tl_y], 'r') + self.plt.plot([br_x, br_x], [br_y, tr_y], 'r') + # current and end + self.plt.plot(self.end[0], self.end[1], 'go') + self.plt.plot(self.current[0], self.current[1], 'ko') + self.fig.canvas.draw() + img = np.fromstring(self.fig.canvas.tostring_rgb(), dtype=np.uint8) + img = img.reshape(self.fig.canvas.get_width_height()[::-1] + (3,)) + return img + + def is_valid(self, dat): + + oob_mask = np.any(self.oob(dat), axis=1) + + # old way + self.a = self.boundary_min + (self.boundary_max-self.boundary_min)/3.0 + self.b = self.boundary_min + 2*(self.boundary_max-self.boundary_min)/3.0 + data_mask = (dat[:, 0] < self.a) | (dat[:, 0] > self.b) | \ + (dat[:, 1] < self.a) | (dat[:, 1] > self.b) + + # + in_obstacle = False + for obstacle in self.obstacles: + tl_x = obstacle[0] + tl_y = obstacle[1] + tr_x = tl_x + obstacle[2] + tr_y = tl_y + bl_x = tl_x + bl_y = tl_y - obstacle[3] + br_x = tr_x + br_y = bl_y + + if dat[:, 0]>tl_x and dat[:, 0]<tr_x and dat[:, 1]>bl_y and dat[:, 1]<tl_y: + in_obstacle = True + + return (not in_obstacle) and (not np.any(oob_mask)) + + def oob(self, x): + return (x <= self.boundary_min) | (x >= self.boundary_max) + + diff --git a/hw4/cs285/envs/reacher/__init__.py b/hw4/cs285/envs/reacher/__init__.py new file mode 100644 index 00000000..d298c65d --- /dev/null +++ b/hw4/cs285/envs/reacher/__init__.py @@ -0,0 +1 @@ +from cs285.envs.reacher.reacher_env import Reacher7DOFEnv diff --git a/hw4/cs285/envs/reacher/assets/sawyer.xml b/hw4/cs285/envs/reacher/assets/sawyer.xml new file mode 100644 index 00000000..c27ccf59 --- /dev/null +++ b/hw4/cs285/envs/reacher/assets/sawyer.xml @@ -0,0 +1,110 @@ + + + diff --git a/hw4/cs285/envs/reacher/reacher_env.py b/hw4/cs285/envs/reacher/reacher_env.py new file mode 100644 index 00000000..61da6823 --- /dev/null +++ b/hw4/cs285/envs/reacher/reacher_env.py @@ -0,0 +1,126 @@ +import numpy as np +from gym import utils +from gym.envs.mujoco import mujoco_env +from mujoco_py import MjViewer +import os + +class Reacher7DOFEnv(mujoco_env.MujocoEnv, utils.EzPickle): + def __init__(self): + + # placeholder + self.hand_sid = -2 + self.target_sid = -1 + + curr_dir = os.path.dirname(os.path.abspath(__file__)) + mujoco_env.MujocoEnv.__init__(self, curr_dir+'/assets/sawyer.xml', 2) + utils.EzPickle.__init__(self) + self.observation_dim = 26 + self.action_dim = 7 + + self.hand_sid = self.model.site_name2id("finger") + 
self.target_sid = self.model.site_name2id("target") + self.skip = self.frame_skip + + + def _get_obs(self): + return np.concatenate([ + self.data.qpos.flat, #[7] + self.data.qvel.flatten() / 10., #[7] + self.data.site_xpos[self.hand_sid], #[3] + self.model.site_pos[self.target_sid], #[3] + ]) + + def step(self, a): + + self.do_simulation(a, self.frame_skip) + ob = self._get_obs() + reward, done = self.get_reward(ob, a) + + score = self.get_score(ob) + + # finalize step + env_info = {'ob': ob, + 'rewards': self.reward_dict, + 'score': score} + + return ob, reward, done, env_info + + def get_score(self, obs): + hand_pos = obs[-6:-3] + target_pos = obs[-3:] + score = -1*np.abs(hand_pos-target_pos) + return score + + def get_reward(self, observations, actions): + + """get reward/s of given (observations, actions) datapoint or datapoints + + Args: + observations: (batchsize, obs_dim) or (obs_dim,) + actions: (batchsize, ac_dim) or (ac_dim,) + + Return: + r_total: reward of this (o,a) pair, dimension is (batchsize,1) or (1,) + done: True if env reaches terminal state, dimension is (batchsize,1) or (1,) + """ + + #initialize and reshape as needed, for batch mode + self.reward_dict = {} + if(len(observations.shape)==1): + observations = np.expand_dims(observations, axis = 0) + actions = np.expand_dims(actions, axis = 0) + batch_mode = False + else: + batch_mode = True + + #get vars + hand_pos = observations[:, -6:-3] + target_pos = observations[:, -3:] + + #calc rew + dist = np.linalg.norm(hand_pos - target_pos, axis=1) + self.reward_dict['r_total'] = -10*dist + + #done is always false for this env + dones = np.zeros((observations.shape[0],)) + + #return + if(not batch_mode): + return self.reward_dict['r_total'][0], dones[0] + return self.reward_dict['r_total'], dones + + def reset(self): + _ = self.reset_model() + + self.model.site_pos[self.target_sid] = [0.1, 0.1, 0.1] + + observation, _reward, done, _info = self.step(np.zeros(7)) + ob = self._get_obs() + + return ob + + def reset_model(self, seed=None): + if seed is not None: + self.seed(seed) + + self.reset_pose = self.init_qpos.copy() + self.reset_vel = self.init_qvel.copy() + + self.reset_goal = np.zeros(3) + self.reset_goal[0] = self.np_random.uniform(low=-0.3, high=0.3) + self.reset_goal[1] = self.np_random.uniform(low=-0.2, high=0.2) + self.reset_goal[2] = self.np_random.uniform(low=-0.25, high=0.25) + + return self.do_reset(self.reset_pose, self.reset_vel, self.reset_goal) + + def do_reset(self, reset_pose, reset_vel, reset_goal): + + self.set_state(reset_pose, reset_vel) + + #reset target + self.reset_goal = reset_goal.copy() + self.model.site_pos[self.target_sid] = self.reset_goal + self.sim.forward() + + #return + return self._get_obs() \ No newline at end of file diff --git a/hw4/cs285/infrastructure/colab_utils.py b/hw4/cs285/infrastructure/colab_utils.py new file mode 100644 index 00000000..a896be97 --- /dev/null +++ b/hw4/cs285/infrastructure/colab_utils.py @@ -0,0 +1,26 @@ +from gym.wrappers import Monitor +import glob +import io +import base64 +from IPython.display import HTML +from IPython import display as ipythondisplay + +## modified from https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t#scrollTo=TCelFzWY9MBI + +def show_video(): + mp4list = glob.glob('/content/video/*.mp4') + if len(mp4list) > 0: + mp4 = mp4list[0] + video = io.open(mp4, 'r+b').read() + encoded = base64.b64encode(video) + ipythondisplay.display(HTML(data='''<video alt="test" autoplay loop controls style="height: 400px;"> <source src="data:video/mp4;base64,{0}" type="video/mp4" /> </video>'''.format(encoded.decode('ascii')))) + else: + print("Could not find video") 
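+ +## NOTE: show_video() embeds the first .mp4 that gym's Monitor wrapper (applied in wrap_env below) has saved under /content/video.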
+ + +def wrap_env(env): + env = Monitor(env, '/content/video', force=True) + return env diff --git a/hw4/cs285/infrastructure/logger.py b/hw4/cs285/infrastructure/logger.py new file mode 100644 index 00000000..a64931c0 --- /dev/null +++ b/hw4/cs285/infrastructure/logger.py @@ -0,0 +1,74 @@ +import os +from tensorboardX import SummaryWriter +import numpy as np + +class Logger: + def __init__(self, log_dir, n_logged_samples=10, summary_writer=None): + self._log_dir = log_dir + print('########################') + print('logging outputs to ', log_dir) + print('########################') + self._n_logged_samples = n_logged_samples + self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1) + + def log_scalar(self, scalar, name, step_): + self._summ_writer.add_scalar('{}'.format(name), scalar, step_) + + def log_scalars(self, scalar_dict, group_name, step, phase): + """Will log all scalars in the same plot.""" + self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step) + + def log_image(self, image, name, step): + assert(len(image.shape) == 3) # [C, H, W] + self._summ_writer.add_image('{}'.format(name), image, step) + + def log_video(self, video_frames, name, step, fps=10): + assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!" + self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps) + + def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'): + + # reshape the rollouts + videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] + + # max rollout length + max_videos_to_save = np.min([max_videos_to_save, len(videos)]) + max_length = videos[0].shape[0] + for i in range(max_videos_to_save): + if videos[i].shape[0]>max_length: + max_length = videos[i].shape[0] + + # pad rollouts to all be same length + for i in range(max_videos_to_save): + if videos[i].shape[0]<max_length: + padding = np.tile([videos[i][-1]], (max_length-videos[i].shape[0],1,1,1)) + videos[i] = np.concatenate([videos[i], padding], 0) + + # log videos to tensorboard event file + videos = np.stack(videos[:max_videos_to_save], 0) + self.log_video(videos, video_title, step, fps=fps) + + def log_figures(self, figure, name, step, phase): + """figure: matplotlib.pyplot figure handle""" + assert figure.shape[0] > 0, "Figure logging requires input shape [batch x figures]!" 
+ self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) + + def log_figure(self, figure, name, step, phase): + """figure: matplotlib.pyplot figure handle""" + self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) + + def log_graph(self, array, name, step, phase): + """figure: matplotlib.pyplot figure handle""" + im = plot_graph(array) + self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) + + def dump_scalars(self, log_path=None): + log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path + self._summ_writer.export_scalars_to_json(log_path) + + def flush(self): + self._summ_writer.flush() + + + + diff --git a/hw4/cs285/infrastructure/pytorch_util.py b/hw4/cs285/infrastructure/pytorch_util.py new file mode 100644 index 00000000..530a6208 --- /dev/null +++ b/hw4/cs285/infrastructure/pytorch_util.py @@ -0,0 +1,79 @@ +from typing import Union + +import torch +from torch import nn + +Activation = Union[str, nn.Module] + + +_str_to_activation = { + 'relu': nn.ReLU(), + 'tanh': nn.Tanh(), + 'leaky_relu': nn.LeakyReLU(), + 'sigmoid': nn.Sigmoid(), + 'selu': nn.SELU(), + 'softplus': nn.Softplus(), + 'identity': nn.Identity(), +} + + +def build_mlp( + input_size: int, + output_size: int, + n_layers: int, + size: int, + activation: Activation = 'tanh', + output_activation: Activation = 'identity', +): + """ + Builds a feedforward neural network + arguments: + input_placeholder: placeholder variable for the state (batch_size, input_size) + scope: variable scope of the network + n_layers: number of hidden layers + size: dimension of each hidden layer + activation: activation of each hidden layer + input_size: size of the input layer + output_size: size of the output layer + output_activation: activation of the output layer + returns: + output_placeholder: the result of a forward pass through the hidden layers + the output layer + """ + if isinstance(activation, str): + activation = _str_to_activation[activation] + if isinstance(output_activation, str): + output_activation = _str_to_activation[output_activation] + layers = [] + in_size = input_size + for _ in range(n_layers): + layers.append(nn.Linear(in_size, size)) + layers.append(activation) + in_size = size + layers.append(nn.Linear(in_size, output_size)) + layers.append(output_activation) + return nn.Sequential(*layers) + + +device = None + + +def init_gpu(use_gpu=True, gpu_id=0): + global device + if torch.cuda.is_available() and use_gpu: + device = torch.device("cuda:" + str(gpu_id)) + print("Using GPU id {}".format(gpu_id)) + else: + device = torch.device("cpu") + print("GPU not detected. 
Defaulting to CPU.") + + +def set_device(gpu_id): + torch.cuda.set_device(gpu_id) + + +def from_numpy(*args, **kwargs): + return torch.from_numpy(*args, **kwargs).float().to(device) + + +def to_numpy(tensor): + return tensor.to('cpu').detach().numpy() diff --git a/hw4/cs285/infrastructure/replay_buffer.py b/hw4/cs285/infrastructure/replay_buffer.py new file mode 100644 index 00000000..df7648d4 --- /dev/null +++ b/hw4/cs285/infrastructure/replay_buffer.py @@ -0,0 +1,82 @@ +from cs285.infrastructure.utils import * + + +class ReplayBuffer(object): + + def __init__(self, max_size=1000000): + + self.max_size = max_size + self.paths = [] + self.obs = None + self.acs = None + self.concatenated_rews = None + self.next_obs = None + self.terminals = None + + def add_rollouts(self, paths, noised=False): + + # add new rollouts into our list of rollouts + for path in paths: + self.paths.append(path) + + # convert new rollouts into their component arrays, and append them onto our arrays + observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(paths) + + if noised: + observations = add_noise(observations) + next_observations = add_noise(next_observations) + + if self.obs is None: + self.obs = observations[-self.max_size:] + self.acs = actions[-self.max_size:] + self.next_obs = next_observations[-self.max_size:] + self.terminals = terminals[-self.max_size:] + self.concatenated_rews = concatenated_rews[-self.max_size:] + else: + self.obs = np.concatenate([self.obs, observations])[-self.max_size:] + self.acs = np.concatenate([self.acs, actions])[-self.max_size:] + self.next_obs = np.concatenate( + [self.next_obs, next_observations] + )[-self.max_size:] + self.terminals = np.concatenate( + [self.terminals, terminals] + )[-self.max_size:] + self.concatenated_rews = np.concatenate( + [self.concatenated_rews, concatenated_rews] + )[-self.max_size:] + + ######################################## + ######################################## + + def sample_random_rollouts(self, num_rollouts): + rand_indices = np.random.permutation(len(self.paths))[:num_rollouts] + return self.paths[rand_indices] + + def sample_recent_rollouts(self, num_rollouts=1): + return self.paths[-num_rollouts:] + + ######################################## + ######################################## + + def sample_random_data(self, batch_size): + + assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0] + rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size] + return self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], self.terminals[rand_indices] + + def sample_recent_data(self, batch_size=1, concat_rew=True): + + if concat_rew: + return self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:] + else: + num_recent_rollouts_to_return = 0 + num_datapoints_so_far = 0 + index = -1 + while num_datapoints_so_far < batch_size: + recent_rollout = self.paths[index] + index -=1 + num_recent_rollouts_to_return +=1 + num_datapoints_so_far += get_pathlength(recent_rollout) + rollouts_to_return = self.paths[-num_recent_rollouts_to_return:] + observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return) + return observations, actions, unconcatenated_rews, next_observations, terminals diff 
--git a/hw4/cs285/infrastructure/rl_trainer.py b/hw4/cs285/infrastructure/rl_trainer.py new file mode 100644 index 00000000..c70d77f9 --- /dev/null +++ b/hw4/cs285/infrastructure/rl_trainer.py @@ -0,0 +1,295 @@ +from collections import OrderedDict +import pickle +import os +import sys +import time + +import gym +from gym import wrappers +import numpy as np +import torch + +from cs285.agents.mb_agent import MBAgent +from cs285.infrastructure import pytorch_util as ptu +from cs285.infrastructure import utils +from cs285.infrastructure.logger import Logger + +# register all of our envs +from cs285.envs import register_envs + +register_envs() + + +# how many rollouts to save as videos to tensorboard +MAX_NVIDEO = 2 +MAX_VIDEO_LEN = 40 # we overwrite this in the code below + + +class RL_Trainer(object): + + def __init__(self, params): + + ############# + ## INIT + ############# + + # Get params, create logger + self.params = params + self.logger = Logger(self.params['logdir']) + + # Set random seeds + seed = self.params['seed'] + np.random.seed(seed) + torch.manual_seed(seed) + ptu.init_gpu( + use_gpu=not self.params['no_gpu'], + gpu_id=self.params['which_gpu'] + ) + + ############# + ## ENV + ############# + + # Make the gym environment + self.env = gym.make(self.params['env_name']) + if 'env_wrappers' in self.params: + # These operations are currently only for Atari envs + self.env = wrappers.Monitor(self.env, os.path.join(self.params['logdir'], "gym"), force=True) + self.env = params['env_wrappers'](self.env) + self.mean_episode_reward = -float('nan') + self.best_mean_episode_reward = -float('inf') + if 'non_atari_colab_env' in self.params and self.params['video_log_freq'] > 0: + self.env = wrappers.Monitor(self.env, os.path.join(self.params['logdir'], "gym"), force=True) + self.mean_episode_reward = -float('nan') + self.best_mean_episode_reward = -float('inf') + + self.env.seed(seed) + + # import plotting (locally if 'obstacles' env) + if not(self.params['env_name']=='obstacles-cs285-v0'): + import matplotlib + matplotlib.use('Agg') + + # Maximum length for episodes + self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps + global MAX_VIDEO_LEN + MAX_VIDEO_LEN = self.params['ep_len'] + + # Is this env continuous, or self.discrete? + discrete = isinstance(self.env.action_space, gym.spaces.Discrete) + # Are the observations images? 
+ img = len(self.env.observation_space.shape) > 2 + + self.params['agent_params']['discrete'] = discrete + + # Observation and action sizes + + ob_dim = self.env.observation_space.shape if img else self.env.observation_space.shape[0] + ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0] + self.params['agent_params']['ac_dim'] = ac_dim + self.params['agent_params']['ob_dim'] = ob_dim + + # simulation timestep, will be used for video saving + if 'model' in dir(self.env): + self.fps = 1/self.env.model.opt.timestep + elif 'env_wrappers' in self.params: + self.fps = 30 # This is not actually used when using the Monitor wrapper + elif 'video.frames_per_second' in self.env.env.metadata.keys(): + self.fps = self.env.env.metadata['video.frames_per_second'] + else: + self.fps = 10 + + + ############# + ## AGENT + ############# + + agent_class = self.params['agent_class'] + self.agent = agent_class(self.env, self.params['agent_params']) + + def run_training_loop(self, n_iter, collect_policy, eval_policy, + initial_expertdata=None): + """ + :param n_iter: number of (dagger) iterations + :param collect_policy: + :param eval_policy: + :param initial_expertdata: + """ + + # init vars at beginning of training + self.total_envsteps = 0 + self.start_time = time.time() + + print_period = 1 + + for itr in range(n_iter): + if itr % print_period == 0: + print("\n\n********** Iteration %i ************"%itr) + + # decide if videos should be rendered/logged at this iteration + if itr % self.params['video_log_freq'] == 0 and self.params['video_log_freq'] != -1: + self.logvideo = True + else: + self.logvideo = False + + # decide if metrics should be logged + if self.params['scalar_log_freq'] == -1: + self.logmetrics = False + elif itr % self.params['scalar_log_freq'] == 0: + self.logmetrics = True + else: + self.logmetrics = False + + use_batchsize = self.params['batch_size'] + if itr == 0: + use_batchsize = self.params['batch_size_initial'] + paths, envsteps_this_batch, train_video_paths = ( + self.collect_training_trajectories( + itr, initial_expertdata, collect_policy, use_batchsize) + ) + + self.total_envsteps += envsteps_this_batch + + # add collected data to replay buffer + if isinstance(self.agent, MBAgent): + self.agent.add_to_replay_buffer(paths, self.params['add_sl_noise']) + else: + self.agent.add_to_replay_buffer(paths) + + # train agent (using sampled data from replay buffer) + if itr % print_period == 0: + print("\nTraining agent...") + all_logs = self.train_agent() + + # if there is a model, log model predictions + if isinstance(self.agent, MBAgent) and itr == 0: + self.log_model_predictions(itr, all_logs) + + # log/save + if self.logvideo or self.logmetrics: + # perform logging + print('\nBeginning logging procedure...') + self.perform_logging(itr, paths, eval_policy, train_video_paths, all_logs) + + if self.params['save_params']: + self.agent.save('{}/agent_itr_{}.pt'.format(self.params['logdir'], itr)) + + #################################### + #################################### + + def collect_training_trajectories(self, itr, initial_expertdata, collect_policy, num_transitions_to_sample, save_expert_data_to_disk=False): + """ + :param itr: + :param load_initial_expertdata: path to expert data pkl file + :param collect_policy: the current policy using which we collect data + :param num_transitions_to_sample: the number of transitions we collect + :return: + paths: a list trajectories + envsteps_this_batch: the sum over the numbers of environment steps in paths + 
train_video_paths: paths which also contain videos for visualization purposes + """ + # TODO: get this from Piazza + + return paths, envsteps_this_batch, train_video_paths + + def train_agent(self): + # TODO: get this from Piazza + + #################################### + #################################### + def perform_logging(self, itr, paths, eval_policy, train_video_paths, all_logs): + + last_log = all_logs[-1] + + ####################### + + # collect eval trajectories, for logging + print("\nCollecting data for eval...") + eval_paths, eval_envsteps_this_batch = utils.sample_trajectories(self.env, eval_policy, self.params['eval_batch_size'], self.params['ep_len']) + + # save eval rollouts as videos in tensorboard event file + if self.logvideo and train_video_paths is not None: + print('\nCollecting video rollouts for eval') + eval_video_paths = utils.sample_n_trajectories(self.env, eval_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True) + + # save train/eval videos + print('\nSaving train rollouts as videos...') + self.logger.log_paths_as_videos(train_video_paths, itr, fps=self.fps, max_videos_to_save=MAX_NVIDEO, + video_title='train_rollouts') + self.logger.log_paths_as_videos(eval_video_paths, itr, fps=self.fps, max_videos_to_save=MAX_NVIDEO, + video_title='eval_rollouts') + + ####################### + + # save eval metrics + if self.logmetrics: + # returns, for logging + train_returns = [path["reward"].sum() for path in paths] + eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths] + + # episode lengths, for logging + train_ep_lens = [len(path["reward"]) for path in paths] + eval_ep_lens = [len(eval_path["reward"]) for eval_path in eval_paths] + + # decide what to log + logs = OrderedDict() + logs["Eval_AverageReturn"] = np.mean(eval_returns) + logs["Eval_StdReturn"] = np.std(eval_returns) + logs["Eval_MaxReturn"] = np.max(eval_returns) + logs["Eval_MinReturn"] = np.min(eval_returns) + logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens) + + logs["Train_AverageReturn"] = np.mean(train_returns) + logs["Train_StdReturn"] = np.std(train_returns) + logs["Train_MaxReturn"] = np.max(train_returns) + logs["Train_MinReturn"] = np.min(train_returns) + logs["Train_AverageEpLen"] = np.mean(train_ep_lens) + + logs["Train_EnvstepsSoFar"] = self.total_envsteps + logs["TimeSinceStart"] = time.time() - self.start_time + logs.update(last_log) + + if itr == 0: + self.initial_return = np.mean(train_returns) + logs["Initial_DataCollection_AverageReturn"] = self.initial_return + + # perform the logging + for key, value in logs.items(): + print('{} : {}'.format(key, value)) + self.logger.log_scalar(value, key, itr) + print('Done logging...\n\n') + + self.logger.flush() + + def log_model_predictions(self, itr, all_logs): + # model predictions + + import matplotlib.pyplot as plt + self.fig = plt.figure() + + # sample actions + action_sequence = self.agent.actor.sample_action_sequences(num_sequences=1, horizon=10) #20 reacher + action_sequence = action_sequence[0] + + # calculate and log model prediction error + mpe, true_states, pred_states = utils.calculate_mean_prediction_error(self.env, action_sequence, self.agent.dyn_models, self.agent.actor.data_statistics) + assert self.params['agent_params']['ob_dim'] == true_states.shape[1] == pred_states.shape[1] + ob_dim = self.params['agent_params']['ob_dim'] + ob_dim = 2*int(ob_dim/2.0) ## skip last state for plotting when state dim is odd + + # plot the predictions + self.fig.clf() + for i in range(ob_dim): + plt.subplot(ob_dim//2, 2, i+1) + 
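# each subplot overlays the ground-truth value of state dimension i (green) against the model's open-loop prediction (red) +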
plt.plot(true_states[:,i], 'g') + plt.plot(pred_states[:,i], 'r') + self.fig.suptitle('MPE: ' + str(mpe)) + self.fig.savefig(self.params['logdir']+'/itr_'+str(itr)+'_predictions.png', dpi=200, bbox_inches='tight') + + # plot all intermediate losses during this iteration + all_losses = np.array([log['Training Loss'] for log in all_logs]) + np.save(self.params['logdir']+'/itr_'+str(itr)+'_losses.npy', all_losses) + self.fig.clf() + plt.plot(all_losses) + self.fig.savefig(self.params['logdir']+'/itr_'+str(itr)+'_losses.png', dpi=200, bbox_inches='tight') + diff --git a/hw4/cs285/infrastructure/utils.py b/hw4/cs285/infrastructure/utils.py new file mode 100644 index 00000000..6d7cf7d3 --- /dev/null +++ b/hw4/cs285/infrastructure/utils.py @@ -0,0 +1,139 @@ +import numpy as np +import time +import copy + +############################################ +############################################ + +def calculate_mean_prediction_error(env, action_sequence, models, data_statistics): + + model = models[0] + + # true + true_states = perform_actions(env, action_sequence)['observation'] + + # predicted + ob = np.expand_dims(true_states[0],0) + pred_states = [] + for ac in action_sequence: + pred_states.append(ob) + action = np.expand_dims(ac,0) + ob = model.get_prediction(ob, action, data_statistics) + pred_states = np.squeeze(pred_states) + + # mpe + mpe = mean_squared_error(pred_states, true_states) + + return mpe, true_states, pred_states + +def perform_actions(env, actions): + ob = env.reset() + obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], [] + steps = 0 + for ac in actions: + obs.append(ob) + acs.append(ac) + ob, rew, done, _ = env.step(ac) + # add the observation after taking a step to next_obs + next_obs.append(ob) + rewards.append(rew) + steps += 1 + # If the episode ended, the corresponding terminal value is 1 + # otherwise, it is 0 + if done: + terminals.append(1) + break + else: + terminals.append(0) + + return Path(obs, image_obs, acs, rewards, next_obs, terminals) + +def mean_squared_error(a, b): + return np.mean((a-b)**2) + +############################################ +############################################ + +def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')): +# TODO: get this from Piazza + +def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')): + """ + Collect rollouts using policy + until we have collected min_timesteps_per_batch steps + """ + # TODO: get this from Piazza + + return paths, timesteps_this_batch + +def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')): + """ + Collect ntraj rollouts using policy + """ + # TODO: get this from Piazza + + return paths + +############################################ +############################################ + +def Path(obs, image_obs, acs, rewards, next_obs, terminals): + """ + Take info (separate arrays) from a single rollout + and return it in a single dictionary + """ + if image_obs != []: + image_obs = np.stack(image_obs, axis=0) + return {"observation" : np.array(obs, dtype=np.float32), + "image_obs" : np.array(image_obs, dtype=np.uint8), + "reward" : np.array(rewards, dtype=np.float32), + "action" : np.array(acs, dtype=np.float32), + "next_observation": np.array(next_obs, dtype=np.float32), + "terminal": np.array(terminals, dtype=np.float32)} + + +def convert_listofrollouts(paths): + """ + Take a list of rollout dictionaries + and return 
separate arrays, + where each array is a concatenation of that array from across the rollouts + """ + observations = np.concatenate([path["observation"] for path in paths]) + actions = np.concatenate([path["action"] for path in paths]) + next_observations = np.concatenate([path["next_observation"] for path in paths]) + terminals = np.concatenate([path["terminal"] for path in paths]) + concatenated_rewards = np.concatenate([path["reward"] for path in paths]) + unconcatenated_rewards = [path["reward"] for path in paths] + return observations, actions, next_observations, terminals, concatenated_rewards, unconcatenated_rewards + +############################################ +############################################ + +def get_pathlength(path): + return len(path["reward"]) + +def normalize(data, mean, std, eps=1e-8): + return (data-mean)/(std+eps) + +def unnormalize(data, mean, std): + return data*std+mean + +def add_noise(data_inp, noiseToSignal=0.01): + + data = copy.deepcopy(data_inp) #(num data points, dim) + + #mean of data + mean_data = np.mean(data, axis=0) + + #if mean is 0, + #make it 0.001 to avoid 0 issues later for dividing by std + mean_data[mean_data == 0] = 0.000001 + + #width of normal distribution to sample noise from + #larger magnitude number = could have larger magnitude noise + std_of_noise = mean_data * noiseToSignal + for j in range(mean_data.shape[0]): + data[:, j] = np.copy(data[:, j] + np.random.normal( + 0, np.absolute(std_of_noise[j]), (data.shape[0],))) + + return data diff --git a/hw4/cs285/models/base_model.py b/hw4/cs285/models/base_model.py new file mode 100644 index 00000000..b9a30fec --- /dev/null +++ b/hw4/cs285/models/base_model.py @@ -0,0 +1,17 @@ +import numpy as np +from typing import Any + + +Prediction = Any + + +class BaseModel(object): + def update(self, ob_no, next_ob_no, re_n, terminal_n) -> dict: + raise NotImplementedError + + def get_prediction(self, ob_no, ac_na, data_statistics) -> Prediction: + raise NotImplementedError + + def convert_prediction_to_numpy(self, pred: Prediction) -> np.ndarray: + """Allow caller to be pytorch-agnostic.""" + raise NotImplementedError diff --git a/hw4/cs285/models/ff_model.py b/hw4/cs285/models/ff_model.py new file mode 100644 index 00000000..43d5d6ef --- /dev/null +++ b/hw4/cs285/models/ff_model.py @@ -0,0 +1,143 @@ +from torch import nn +import torch +from torch import optim +from cs285.models.base_model import BaseModel +from cs285.infrastructure.utils import normalize, unnormalize +from cs285.infrastructure import pytorch_util as ptu + + +class FFModel(nn.Module, BaseModel): + + def __init__(self, ac_dim, ob_dim, n_layers, size, learning_rate=0.001): + super(FFModel, self).__init__() + + self.ac_dim = ac_dim + self.ob_dim = ob_dim + self.n_layers = n_layers + self.size = size + self.learning_rate = learning_rate + self.delta_network = ptu.build_mlp( + input_size=self.ob_dim + self.ac_dim, + output_size=self.ob_dim, + n_layers=self.n_layers, + size=self.size, + ) + self.delta_network.to(ptu.device) + self.optimizer = optim.Adam( + self.delta_network.parameters(), + self.learning_rate, + ) + self.loss = nn.MSELoss() + self.obs_mean = None + self.obs_std = None + self.acs_mean = None + self.acs_std = None + self.delta_mean = None + self.delta_std = None + + def update_statistics( + self, + obs_mean, + obs_std, + acs_mean, + acs_std, + delta_mean, + delta_std, + ): + self.obs_mean = ptu.from_numpy(obs_mean) + self.obs_std = ptu.from_numpy(obs_std) + self.acs_mean = ptu.from_numpy(acs_mean) + self.acs_std = 
ptu.from_numpy(acs_std) + self.delta_mean = ptu.from_numpy(delta_mean) + self.delta_std = ptu.from_numpy(delta_std) + + def forward( + self, + obs_unnormalized, + acs_unnormalized, + obs_mean, + obs_std, + acs_mean, + acs_std, + delta_mean, + delta_std, + ): + """ + :param obs_unnormalized: Unnormalized observations + :param acs_unnormalized: Unnormalized actions + :param obs_mean: Mean of observations + :param obs_std: Standard deviation of observations + :param acs_mean: Mean of actions + :param acs_std: Standard deviation of actions + :param delta_mean: Mean of state difference `s_t+1 - s_t`. + :param delta_std: Standard deviation of state difference `s_t+1 - s_t`. + :return: tuple `(next_obs_pred, delta_pred_normalized)` + This forward function should return a tuple of two items + 1. `next_obs_pred` which is the predicted `s_t+1` + 2. `delta_pred_normalized` which is the normalized (i.e. not + unnormalized) output of the delta network. This is needed + """ + # normalize input data to mean 0, std 1 + obs_normalized = # TODO(Q1) + acs_normalized = # TODO(Q1) + + # predicted change in obs + concatenated_input = torch.cat([obs_normalized, acs_normalized], dim=1) + + # TODO(Q1) compute delta_pred_normalized and next_obs_pred + # Hint: as described in the PDF, the output of the network is the + # *normalized change* in state, i.e. normalized(s_t+1 - s_t). + delta_pred_normalized = # TODO(Q1) + next_obs_pred = # TODO(Q1) + return next_obs_pred, delta_pred_normalized + + def get_prediction(self, obs, acs, data_statistics): + """ + :param obs: numpy array of observations (s_t) + :param acs: numpy array of actions (a_t) + :param data_statistics: A dictionary with the following keys (each with + a numpy array as the value): + - 'obs_mean' + - 'obs_std' + - 'acs_mean' + - 'acs_std' + - 'delta_mean' + - 'delta_std' + :return: a numpy array of the predicted next-states (s_t+1) + """ + prediction = # TODO(Q1) get numpy array of the predicted next-states (s_t+1) + # Hint: `self(...)` returns a tuple, but you only need to use one of the + # outputs. + return prediction + + def update(self, observations, actions, next_observations, data_statistics): + """ + :param observations: numpy array of observations + :param actions: numpy array of actions + :param next_observations: numpy array of next observations + :param data_statistics: A dictionary with the following keys (each with + a numpy array as the value): + - 'obs_mean' + - 'obs_std' + - 'acs_mean' + - 'acs_std' + - 'delta_mean' + - 'delta_std' + :return: + """ + target = # TODO(Q1) compute the normalized target for the model. + # Hint: you should use `data_statistics['delta_mean']` and + # `data_statistics['delta_std']`, which keep track of the mean + # and standard deviation of the model. + + loss = # TODO(Q1) compute the loss + # Hint: `self(...)` returns a tuple, but you only need to use one of the + # outputs. 
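+ # [A hedged sketch, not the official solution] One way the TODOs above could be
+ # filled in, assuming the `normalize`/`unnormalize` helpers imported at the top of
+ # this file and that the keys of `data_statistics` match the keyword arguments of `forward`:
+ #
+ #     target = ptu.from_numpy(normalize(
+ #         next_observations - observations,
+ #         data_statistics['delta_mean'], data_statistics['delta_std']))
+ #     _, delta_pred_normalized = self(
+ #         ptu.from_numpy(observations), ptu.from_numpy(actions),
+ #         **{k: ptu.from_numpy(v) for k, v in data_statistics.items()})
+ #     loss = self.loss(delta_pred_normalized, target)
+ #
+ # with `forward` normalizing its inputs via normalize(obs_unnormalized, obs_mean, obs_std)
+ # and returning obs_unnormalized + unnormalize(delta_pred_normalized, delta_mean, delta_std).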
+ + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + + return { + 'Training Loss': ptu.to_numpy(loss), + } diff --git a/hw4/cs285/policies/MPC_policy.py b/hw4/cs285/policies/MPC_policy.py new file mode 100644 index 00000000..86eb2637 --- /dev/null +++ b/hw4/cs285/policies/MPC_policy.py @@ -0,0 +1,90 @@ +import numpy as np + +from .base_policy import BasePolicy + + +class MPCPolicy(BasePolicy): + + def __init__(self, + env, + ac_dim, + dyn_models, + horizon, + N, + **kwargs + ): + super().__init__(**kwargs) + + # init vars + self.env = env + self.dyn_models = dyn_models + self.horizon = horizon + self.N = N + self.data_statistics = None # NOTE must be updated from elsewhere + + self.ob_dim = self.env.observation_space.shape[0] + + # action space + self.ac_space = self.env.action_space + self.ac_dim = ac_dim + self.low = self.ac_space.low + self.high = self.ac_space.high + + def sample_action_sequences(self, num_sequences, horizon): + # TODO(Q1) uniformly sample trajectories and return an array of + # dimensions (num_sequences, horizon, self.ac_dim) in the range + # [self.low, self.high] + return random_action_sequences + + def get_action(self, obs): + + if self.data_statistics is None: + # print("WARNING: performing random actions.") + return self.sample_action_sequences(num_sequences=1, horizon=1)[0] + + # sample random actions (N x horizon) + candidate_action_sequences = self.sample_action_sequences( + num_sequences=self.N, horizon=self.horizon) + + # for each model in ensemble: + predicted_sum_of_rewards_per_model = [] + for model in self.dyn_models: + sum_of_rewards = self.calculate_sum_of_rewards( + obs, candidate_action_sequences, model) + predicted_sum_of_rewards_per_model.append(sum_of_rewards) + + # calculate mean_across_ensembles(predicted rewards) + predicted_rewards = np.mean( + predicted_sum_of_rewards_per_model, axis=0) # [ens, N] --> N + + # pick the action sequence and return the 1st element of that sequence + best_action_sequence = None # TODO (Q2) + action_to_take = None # TODO (Q2) + return action_to_take[None] # Unsqueeze the first index + + def calculate_sum_of_rewards(self, obs, candidate_action_sequences, model): + """ + + :param obs: numpy array with the current observation. Shape [D_obs] + :param candidate_action_sequences: numpy array with the candidate action + sequences. Shape [N, H, D_action] where + - N is the number of action sequences considered + - H is the horizon + - D_action is the action of the dimension + :param model: The current dynamics model. + :return: numpy array with the sum of rewards for each action sequence. + The array should have shape [N]. + """ + sum_of_rewards = None # TODO (Q2) + # For each candidate action sequence, predict a sequence of + # states for each dynamics model in your ensemble. + # Once you have a sequence of predicted states from each model in + # your ensemble, calculate the sum of rewards for each sequence + # using `self.env.get_reward(predicted_obs)` + # You should sum across `self.horizon` time step. + # Hint: you should use model.get_prediction and you shouldn't need + # to import pytorch in this file. + # Hint: Remember that the model can process observations and actions + # in batch, which can be much faster than looping through each + # action sequence. 
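+ # [A hedged sketch, not the official solution] A batched implementation consistent
+ # with the hints above; it assumes `self.env.get_reward(observations, actions)`
+ # returns a (rewards, dones) pair when given a batch of observations and actions:
+ #
+ #     N, H, _ = candidate_action_sequences.shape
+ #     sum_of_rewards = np.zeros(N)
+ #     pred_obs = np.tile(obs, (N, 1))
+ #     for t in range(H):
+ #         acs = candidate_action_sequences[:, t, :]
+ #         rewards, _ = self.env.get_reward(pred_obs, acs)
+ #         sum_of_rewards += rewards
+ #         pred_obs = model.get_prediction(pred_obs, acs, self.data_statistics)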
+ return sum_of_rewards diff --git a/hw4/cs285/policies/base_policy.py b/hw4/cs285/policies/base_policy.py new file mode 100644 index 00000000..e089540a --- /dev/null +++ b/hw4/cs285/policies/base_policy.py @@ -0,0 +1,14 @@ +import abc +import numpy as np + + +class BasePolicy(object, metaclass=abc.ABCMeta): + def get_action(self, obs: np.ndarray) -> np.ndarray: + raise NotImplementedError + + def update(self, obs: np.ndarray, acs: np.ndarray, **kwargs) -> dict: + """Return a dictionary of logging information.""" + raise NotImplementedError + + def save(self, filepath: str): + raise NotImplementedError diff --git a/hw4/cs285/scripts/filter_events.py b/hw4/cs285/scripts/filter_events.py new file mode 100644 index 00000000..c621ef2d --- /dev/null +++ b/hw4/cs285/scripts/filter_events.py @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- +""" +Usage: + +Run the command +``` +python filter_events.py --events SOME_DIRECTORY +``` + +and it will generate a directory named `SOME_DIRECTORY_filtered` with the video +events removed. +""" +from __future__ import print_function +import os +import sys +import argparse +import tqdm + +# Adapted from +# https://gist.github.com/serycjon/c9ad58ecc3176d87c49b69b598f4d6c6 + +import tensorflow as tf + + +def parse_arguments(): + parser = argparse.ArgumentParser(description='') + parser.add_argument('--event', help='event file', required=True) + + return parser.parse_args() + + +def main(args): + out_path = os.path.dirname(args.event) + '_filtered' + writer = tf.summary.FileWriter(out_path) + + total = None + for event in tqdm.tqdm(tf.train.summary_iterator(args.event), total=total): + event_type = event.WhichOneof('what') + if event_type != 'summary': + writer.add_event(event) + else: + wall_time = event.wall_time + step = event.step + filtered_values = [value for value in event.summary.value if + 'rollouts' not in value.tag] + summary = tf.Summary(value=filtered_values) + + filtered_event = tf.summary.Event(summary=summary, + wall_time=wall_time, + step=step) + writer.add_event(filtered_event) + writer.close() + return 0 + + +if __name__ == '__main__': + args = parse_arguments() + sys.exit(main(args)) diff --git a/hw4/cs285/scripts/read_results.py b/hw4/cs285/scripts/read_results.py new file mode 100644 index 00000000..3a5bc50f --- /dev/null +++ b/hw4/cs285/scripts/read_results.py @@ -0,0 +1,26 @@ +import glob +import tensorflow as tf + +def get_section_results(file): + """ + requires tensorflow==1.12.0 + """ + X = [] + Y = [] + for e in tf.train.summary_iterator(file): + for v in e.summary.value: + if v.tag == 'Train_EnvstepsSoFar': + X.append(v.simple_value) + elif v.tag == 'Eval_AverageReturn': + Y.append(v.simple_value) + return X, Y + +if __name__ == '__main__': + import glob + + logdir = 'data/q1_lb_rtg_na_CartPole-v0_13-09-2020_23-32-10/events*' + eventfile = glob.glob(logdir)[0] + + X, Y = get_section_results(eventfile) + for i, (x, y) in enumerate(zip(X, Y)): + print('Iteration {:d} | Train steps: {:d} | Return: {}'.format(i, int(x), y)) \ No newline at end of file diff --git a/hw4/cs285/scripts/run_hw4_mb.ipynb b/hw4/cs285/scripts/run_hw4_mb.ipynb new file mode 100644 index 00000000..1162ee08 --- /dev/null +++ b/hw4/cs285/scripts/run_hw4_mb.ipynb @@ -0,0 +1,497 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "run_hw4_mb.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", 
+ "metadata": { + "id": "gUl_qfOR8JV6" + }, + "source": [ + "##Setup\n", + "\n", + "You will need to make a copy of this notebook in your Google Drive before you can edit the homework files. You can do so with **File → Save a copy in Drive**." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "iizPcHAp8LnA" + }, + "source": [ + "#@title mount your Google Drive\n", + "#@markdown Your work will be stored in a folder called `cs285_f2020` by default to prevent Colab instance timeouts from deleting your edits.\n", + "\n", + "import os\n", + "from google.colab import drive\n", + "drive.mount('/content/gdrive')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "nAb10wnb8N0m", + "cellView": "form" + }, + "source": [ + "#@title set up mount symlink\n", + "\n", + "DRIVE_PATH = '/content/gdrive/My\\ Drive/cs285_f2020'\n", + "DRIVE_PYTHON_PATH = DRIVE_PATH.replace('\\\\', '')\n", + "if not os.path.exists(DRIVE_PYTHON_PATH):\n", + " %mkdir $DRIVE_PATH\n", + "\n", + "## the space in `My Drive` causes some issues,\n", + "## make a symlink to avoid this\n", + "SYM_PATH = '/content/cs285_f2020'\n", + "if not os.path.exists(SYM_PATH):\n", + " !ln -s $DRIVE_PATH $SYM_PATH" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "gtS9-WsD8QVr", + "cellView": "form" + }, + "source": [ + "#@title apt install requirements\n", + "\n", + "#@markdown Run each section with Shift+Enter\n", + "\n", + "#@markdown Double-click on section headers to show code.\n", + "\n", + "!apt update \n", + "!apt install -y --no-install-recommends \\\n", + " build-essential \\\n", + " curl \\\n", + " git \\\n", + " gnupg2 \\\n", + " make \\\n", + " cmake \\\n", + " ffmpeg \\\n", + " swig \\\n", + " libz-dev \\\n", + " unzip \\\n", + " zlib1g-dev \\\n", + " libglfw3 \\\n", + " libglfw3-dev \\\n", + " libxrandr2 \\\n", + " libxinerama-dev \\\n", + " libxi6 \\\n", + " libxcursor-dev \\\n", + " libgl1-mesa-dev \\\n", + " libgl1-mesa-glx \\\n", + " libglew-dev \\\n", + " libosmesa6-dev \\\n", + " lsb-release \\\n", + " ack-grep \\\n", + " patchelf \\\n", + " wget \\\n", + " xpra \\\n", + " xserver-xorg-dev \\\n", + " xvfb \\\n", + " python-opengl \\\n", + " ffmpeg > /dev/null 2>&1\n", + "\n", + "!pip install opencv-python==3.4.0.12" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "VcKGekJN80NO", + "cellView": "form" + }, + "source": [ + "#@title download mujoco\n", + "\n", + "MJC_PATH = '{}/mujoco'.format(SYM_PATH)\n", + "if not os.path.exists(MJC_PATH):\n", + " %mkdir $MJC_PATH\n", + "%cd $MJC_PATH\n", + "if not os.path.exists(os.path.join(MJC_PATH, 'mujoco200')):\n", + " !wget -q https://www.roboti.us/download/mujoco200_linux.zip\n", + " !unzip -q mujoco200_linux.zip\n", + " %mv mujoco200_linux mujoco200\n", + " %rm mujoco200_linux.zip" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "NTiH9f9y82F_", + "cellView": "form" + }, + "source": [ + "#@title update mujoco paths\n", + "\n", + "import os\n", + "\n", + "os.environ['LD_LIBRARY_PATH'] += ':{}/mujoco200/bin'.format(MJC_PATH)\n", + "os.environ['MUJOCO_PY_MUJOCO_PATH'] = '{}/mujoco200'.format(MJC_PATH)\n", + "os.environ['MUJOCO_PY_MJKEY_PATH'] = '{}/mjkey.txt'.format(MJC_PATH)\n", + "\n", + "## installation on colab does not find *.so files\n", + "## in LD_LIBRARY_PATH, copy over manually instead\n", + "!cp $MJC_PATH/mujoco200/bin/*.so /usr/lib/x86_64-linux-gnu/" + 
], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "A0kPh99l87q0" + }, + "source": [ + "Ensure your `mjkey.txt` is in /content/cs285_f2020/mujoco before this step" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "X-LoOdZg84pI", + "cellView": "form" + }, + "source": [ + "#@title clone and install mujoco-py\n", + "\n", + "%cd $MJC_PATH\n", + "if not os.path.exists('mujoco-py'):\n", + " !git clone https://github.com/openai/mujoco-py.git\n", + "%cd mujoco-py\n", + "%pip install -e .\n", + "\n", + "## cythonize at the first import\n", + "import mujoco_py" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "-XcwBiBN8-Fg" + }, + "source": [ + "#@title clone homework repo\n", + "#@markdown Note that this is the same codebase from homework 1,\n", + "#@markdown so you may need to move your old `homework_fall2020`\n", + "#@markdown folder in order to clone the repo again.\n", + "\n", + "#@markdown **Don't delete your old work though!**\n", + "#@markdown You will need it for this assignment.\n", + "\n", + "%cd $SYM_PATH\n", + "!git clone https://github.com/berkeleydeeprlcourse/homework_fall2020.git\n", + "%cd homework_fall2020/hw4\n", + "%pip install -r requirements_colab.txt -f https://download.pytorch.org/whl/torch_stable.html\n", + "%pip install -e ." + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "g5xIOIpW8_jC" + }, + "source": [ + "#@title set up virtual display\n", + "\n", + "from pyvirtualdisplay import Display\n", + "\n", + "display = Display(visible=0, size=(1400, 900))\n", + "display.start()\n", + "\n", + "# For later\n", + "from cs285.infrastructure.colab_utils import (\n", + " wrap_env,\n", + " show_video\n", + ")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "2rsWAWaK9BVp" + }, + "source": [ + "#@title test virtual display\n", + "\n", + "#@markdown If you see a video of a four-legged ant fumbling about, setup is complete!\n", + "\n", + "import gym\n", + "import matplotlib\n", + "matplotlib.use('Agg')\n", + "\n", + "env = wrap_env(gym.make(\"Ant-v2\"))\n", + "\n", + "observation = env.reset()\n", + "for i in range(10):\n", + " env.render(mode='rgb_array')\n", + " obs, rew, term, _ = env.step(env.action_space.sample() ) \n", + " if term:\n", + " break;\n", + " \n", + "env.close()\n", + "print('Loading video...')\n", + "show_video()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QizpiHDh9Fwk" + }, + "source": [ + "## Editing Code\n", + "\n", + "To edit code, click the folder icon on the left menu. Navigate to the corresponding file (`cs285_f2020/...`). Double click a file to open an editor. There is a timeout of about ~12 hours with Colab while it is active (and less if you close your browser window). We sync your edits to Google Drive so that you won't lose your work in the event of an instance timeout, but you will need to re-mount your Google Drive and re-install packages with every new instance." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Nii6qk2C9Ipk" + }, + "source": [ + "## Run MBRL" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "4t7FUeEG9Dkf" + }, + "source": [ + "#@title imports\n", + "import os\n", + "import time\n", + "\n", + "from cs285.infrastructure.rl_trainer import RL_Trainer\n", + "from cs285.agents.mb_agent import MBAgent\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 2" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "2fXlzARJ9i-t" + }, + "source": [ + "#@title runtime arguments\n", + "\n", + "class Args:\n", + "\n", + " def __getitem__(self, key):\n", + " return getattr(self, key)\n", + "\n", + " def __setitem__(self, key, val):\n", + " setattr(self, key, val)\n", + "\n", + " def __contains__(self, key):\n", + " return hasattr(self, key)\n", + "\n", + " env_name = \"cheetah-cs285-v0\" #@param [\"cheetah-cs285-v0\", \"obstacles-cs285-v0\", \"reacher-cs285-v0\"]\n", + " exp_name = TODO#@param\n", + " n_iter = 20 #@param {type:\"integer\"}\n", + "\n", + " if env_name == 'reacher-cs285-v0':\n", + " ep_len = 200\n", + " if env_name == 'cheetah-cs285-v0':\n", + " ep_len = 500\n", + " if env_name == 'obstacles-cs285-v0':\n", + " ep_len = 100\n", + "\n", + " #@markdown batches and steps\n", + " batch_size = 8000 #@param {type: \"integer\"}\n", + " eval_batch_size = 400 #@param {type: \"integer\"}\n", + " train_batch_size = 512 #@param {type: \"integer\"}\n", + " batch_size_initial = 20000 #@param {type: \"integer\"}\n", + "\n", + " num_agent_train_steps_per_iter = 1000 #@param {type: \"integer\"}\n", + "\n", + " #@markdown MBRL parameters\n", + " ensemble_size = 3 #@param {type:\"integer\"}\n", + " mpc_horizon = 10 #@param {type:\"integer\"}\n", + " mpc_num_action_sequences = 1000 #@param {type:\"integer\"}\n", + "\n", + " #@markdown Learning parameters\n", + " learning_rate = 0.001 #@param {type:\"raw\"}\n", + " n_layers = 2 #@param {type:\"integer\"}\n", + " size = 250 #@param {type:\"integer\"}\n", + " add_sl_noise = True #@param {type:\"boolean\"}\n", + "\n", + " #@markdown system\n", + " save_params = False #@param {type: \"boolean\"}\n", + " no_gpu = False #@param {type: \"boolean\"}\n", + " which_gpu = 0 #@param {type: \"integer\"}\n", + " seed = 1 #@param {type: \"integer\"}\n", + "\n", + " #@markdown logging\n", + " ## default is to not log video so\n", + " ## that logs are small enough to be\n", + " ## uploaded to gradscope\n", + " video_log_freq = -1 #@param {type: \"integer\"}\n", + " scalar_log_freq = 1#@param {type: \"integer\"}\n", + "\n", + "\n", + "args = Args()\n", + "\n", + "## ensure compatibility with hw1 code\n", + "args['train_batch_size'] = args['batch_size']\n", + "\n", + "if args['video_log_freq'] > 0:\n", + " import warnings\n", + " warnings.warn(\n", + " '''\\nLogging videos will make eventfiles too'''\n", + " '''\\nlarge for the autograder. 
Set video_log_freq = -1'''\n", + " '''\\nfor the runs you intend to submit.''')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "T0cJlp6s-ogO" + }, + "source": [ + "#@title create directories for logging\n", + "\n", + "data_path = '''/content/cs285_f2020/''' \\\n", + " '''homework_fall2020/hw4/data'''\n", + "\n", + "if not (os.path.exists(data_path)):\n", + " os.makedirs(data_path)\n", + "\n", + "logdir = 'hw4_' + args.exp_name + '_' + args.env_name + '_' + time.strftime(\"%d-%m-%Y_%H-%M-%S\")\n", + "logdir = os.path.join(data_path, logdir)\n", + "args['logdir'] = logdir\n", + "if not(os.path.exists(logdir)):\n", + " os.makedirs(logdir)\n", + "\n", + "print(\"LOGGING TO: \", logdir)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "I525KFRN-42s" + }, + "source": [ + "#@title Define Model Based trainer\n", + "\n", + "class MB_Trainer(object):\n", + "\n", + " def __init__(self, params):\n", + "\n", + " computation_graph_args = {\n", + " 'ensemble_size': params['ensemble_size'],\n", + " 'n_layers': params['n_layers'],\n", + " 'size': params['size'],\n", + " 'learning_rate': params['learning_rate'],\n", + " }\n", + "\n", + " train_args = {\n", + " 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],\n", + " }\n", + "\n", + " controller_args = {\n", + " 'mpc_horizon': params['mpc_horizon'],\n", + " 'mpc_num_action_sequences': params['mpc_num_action_sequences'],\n", + " }\n", + "\n", + " agent_params = {**computation_graph_args, **train_args, **controller_args}\n", + "\n", + " self.params = params\n", + " self.params['agent_class'] = MBAgent\n", + " self.params['agent_params'] = agent_params\n", + "\n", + " self.rl_trainer = RL_Trainer(self.params)\n", + "\n", + " def run_training_loop(self):\n", + "\n", + " self.rl_trainer.run_training_loop(\n", + " self.params['n_iter'],\n", + " collect_policy = self.rl_trainer.agent.actor,\n", + " eval_policy = self.rl_trainer.agent.actor,\n", + " )\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "wF4LSRGn-_Cv" + }, + "source": [ + "#@title run training\n", + "\n", + "trainer = MB_Trainer(args)\n", + "trainer.run_training_loop()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "_kTH-tXkI-B-" + }, + "source": [ + "#@markdown You can visualize your runs with tensorboard from within the notebook\n", + "\n", + "## requires tensorflow==2.3.0\n", + "%load_ext tensorboard\n", + "%tensorboard --logdir /content/cs285_f2020/homework_fall2020/hw4/data/" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "BwF7tQPQ66hB" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/hw4/cs285/scripts/run_hw4_mb.py b/hw4/cs285/scripts/run_hw4_mb.py new file mode 100644 index 00000000..67eb4fd0 --- /dev/null +++ b/hw4/cs285/scripts/run_hw4_mb.py @@ -0,0 +1,124 @@ +import os +import time + +from cs285.infrastructure.rl_trainer import RL_Trainer +from cs285.agents.mb_agent import MBAgent + + +class MB_Trainer(object): + + def __init__(self, params): + + ##################### + ## SET AGENT PARAMS + ##################### + + computation_graph_args = { + 'ensemble_size': params['ensemble_size'], + 'n_layers': params['n_layers'], + 'size': params['size'], + 'learning_rate': params['learning_rate'], + } + + train_args = 
{ + 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'], + } + + controller_args = { + 'mpc_horizon': params['mpc_horizon'], + 'mpc_num_action_sequences': params['mpc_num_action_sequences'], + } + + agent_params = {**computation_graph_args, **train_args, **controller_args} + + self.params = params + self.params['agent_class'] = MBAgent + self.params['agent_params'] = agent_params + + ################ + ## RL TRAINER + ################ + + self.rl_trainer = RL_Trainer(self.params) + + def run_training_loop(self): + + self.rl_trainer.run_training_loop( + self.params['n_iter'], + collect_policy = self.rl_trainer.agent.actor, + eval_policy = self.rl_trainer.agent.actor, + ) + + +def main(): + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--env_name', type=str) #reacher-cs285-v0, ant-cs285-v0, cheetah-cs285-v0, obstacles-cs285-v0 + parser.add_argument('--ep_len', type=int, default=200) + parser.add_argument('--exp_name', type=str, default='todo') + parser.add_argument('--n_iter', '-n', type=int, default=20) + + parser.add_argument('--ensemble_size', '-e', type=int, default=3) + parser.add_argument('--mpc_horizon', type=int, default=10) + parser.add_argument('--mpc_num_action_sequences', type=int, default=1000) + + parser.add_argument('--add_sl_noise', '-noise', action='http://github.com/store_true') + parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1000) + parser.add_argument('--batch_size_initial', type=int, default=20000) #(random) steps collected on 1st iteration (put into replay buffer) + parser.add_argument('--batch_size', '-b', type=int, default=8000) #steps collected per train iteration (put into replay buffer) + parser.add_argument('--train_batch_size', '-tb', type=int, default=512) ##steps used per gradient step (used for training) + parser.add_argument('--eval_batch_size', '-eb', type=int, default=400) #steps collected per eval iteration + + parser.add_argument('--learning_rate', '-lr', type=float, default=0.001) + parser.add_argument('--n_layers', '-l', type=int, default=2) + parser.add_argument('--size', '-s', type=int, default=250) + + parser.add_argument('--seed', type=int, default=1) + parser.add_argument('--no_gpu', '-ngpu', action='http://github.com/store_true') + parser.add_argument('--which_gpu', '-gpu_id', default=0) + parser.add_argument('--video_log_freq', type=int, default=1) #-1 to disable + parser.add_argument('--scalar_log_freq', type=int, default=1) #-1 to disable + parser.add_argument('--save_params', action='http://github.com/store_true') + args = parser.parse_args() + + # convert to dictionary + params = vars(args) + + # HARDCODE EPISODE LENGTHS FOR THE ENVS USED IN THIS MB ASSIGNMENT + if params['env_name']=='reacher-cs285-v0': + params['ep_len']=200 + if params['env_name']=='cheetah-cs285-v0': + params['ep_len']=500 + if params['env_name']=='obstacles-cs285-v0': + params['ep_len']=100 + + ################################## + ### CREATE DIRECTORY FOR LOGGING + ################################## + + logdir_prefix = 'hw4_' # keep for autograder + + data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../data') + + if not (os.path.exists(data_path)): + os.makedirs(data_path) + + logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") + logdir = os.path.join(data_path, logdir) + params['logdir'] = logdir + if not(os.path.exists(logdir)): + os.makedirs(logdir) + + print("\n\n\nLOGGING TO: ", 
logdir, "\n\n\n") + + ################### + ### RUN TRAINING + ################### + + trainer = MB_Trainer(params) + trainer.run_training_loop() + + +if __name__ == "__main__": + main() diff --git a/hw4/cs285_hw4.pdf b/hw4/cs285_hw4.pdf new file mode 100644 index 00000000..17ec4107 Binary files /dev/null and b/hw4/cs285_hw4.pdf differ diff --git a/hw4/requirements.txt b/hw4/requirements.txt new file mode 100644 index 00000000..faec9bb9 --- /dev/null +++ b/hw4/requirements.txt @@ -0,0 +1,12 @@ +gym[atari]==0.17.2 +mujoco-py==2.0.2.2 +tensorboard==2.3.0 +tensorboardX==1.8 +matplotlib==2.2.2 +ipython==6.4.0 +moviepy==1.0.0 +pyvirtualdisplay==1.3.2 +torch==1.5.1 +opencv-python==4.4.0.42 +ipdb==0.13.3 +box2d-py diff --git a/hw4/requirements_colab.txt b/hw4/requirements_colab.txt new file mode 100644 index 00000000..4fafb598 --- /dev/null +++ b/hw4/requirements_colab.txt @@ -0,0 +1,11 @@ +gym[atari]==0.17.2 +tensorboard==2.3.0 +tensorboardX==1.8 +matplotlib==2.2.2 +ipython==6.4.0 +moviepy==1.0.0 +pyvirtualdisplay==1.3.2 +torch==1.5.1 +opencv-python==4.4.0.42 +ipdb==0.13.3 +box2d-py diff --git a/hw4/setup.py b/hw4/setup.py new file mode 100644 index 00000000..3cc1886e --- /dev/null +++ b/hw4/setup.py @@ -0,0 +1,8 @@ +# setup.py +from setuptools import setup + +setup( + name='cs285', + version='0.1.0', + packages=['cs285'], +) \ No newline at end of file diff --git a/hw5/README.md b/hw5/README.md new file mode 100644 index 00000000..b818e7c1 --- /dev/null +++ b/hw5/README.md @@ -0,0 +1,32 @@ +## Setup + +You can run this code on your own machine or on Google Colab. + +1. **Local option:** If you choose to run locally, you will need to install MuJoCo and some Python packages; see [installation.md](../hw1/installation.md) from homework 1 for instructions. There are two new package requirements (`opencv-python` and `gym[atari]`) beyond what was used in the previous assignments; make sure to install these with `pip install -r requirements.txt` if you are running the assignment locally. + +2. **Colab:** The first few sections of the notebook will install all required dependencies. You can try out the Colab option by clicking the badges below: + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw5/run_hw5_expl.ipynb) + +## Complete the code + +The following files have blanks to be filled with your solutions from homework 1 and 3. The relevant sections are marked with `TODO: get this from Piazza'. + +- [infrastructure/utils.py](cs285/infrastructure/utils.py) +- [infrastructure/rl_trainer.py](cs285/infrastructure/rl_trainer.py) +- [policies/MLP_policy.py](cs285/policies/MLP_policy.py) +- [policies/argmax_policy.py](cs285/policies/argmax_policy.py) +- [critics/dqn_critic.py](cs285/critics/dqn_critic.py) + +You will then need to implement code in the following files: +- [exploration/rnd_model.py](cs285/exploration/rnd_model.py) +- [agents/explore_or_exploit_agent.py](cs285/agents/explore_or_exploit_agent.py) +- [critics/cql_critic.py](cs285/critics/cql_critic.py) + +The relevant sections are marked with `TODO`. + +You may also want to look through [scripts/run_hw5_expl.py](cs285/scripts/run_hw5_expl.py) (if running locally) or [run_hw5_expl.ipynb](run_hw5_expl.ipynb) (if running on Colab), though you will not need to edit this files beyond changing runtime arguments in the Colab notebook. + +See the [assignment PDF](hw5.pdf) for more details on what files to edit. 
+ +For this particular assignment, you will need to install networkx==2.5 diff --git a/hw5/cs285/agents/ac_agent.py b/hw5/cs285/agents/ac_agent.py new file mode 100644 index 00000000..cd05d4d6 --- /dev/null +++ b/hw5/cs285/agents/ac_agent.py @@ -0,0 +1,45 @@ +from collections import OrderedDict + +from cs285.critics.bootstrapped_continuous_critic import \ + BootstrappedContinuousCritic +from cs285.infrastructure.replay_buffer import ReplayBuffer +from cs285.infrastructure.utils import * +from cs285.policies.MLP_policy import MLPPolicyAC +from .base_agent import BaseAgent + + +class ACAgent(BaseAgent): + def __init__(self, env, agent_params): + super(ACAgent, self).__init__() + + self.env = env + self.agent_params = agent_params + + self.gamma = self.agent_params['gamma'] + self.standardize_advantages = self.agent_params['standardize_advantages'] + + self.actor = MLPPolicyAC( + self.agent_params['ac_dim'], + self.agent_params['ob_dim'], + self.agent_params['n_layers'], + self.agent_params['size'], + self.agent_params['discrete'], + self.agent_params['learning_rate'], + ) + self.critic = BootstrappedContinuousCritic(self.agent_params) + + self.replay_buffer = ReplayBuffer() + + def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): + raise NotImplementedError + # Not needed for this homework + + #################################### + #################################### + + def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): + raise NotImplementedError + # Not needed for this homework + + #################################### + #################################### diff --git a/hw5/cs285/agents/base_agent.py b/hw5/cs285/agents/base_agent.py new file mode 100644 index 00000000..a32224b5 --- /dev/null +++ b/hw5/cs285/agents/base_agent.py @@ -0,0 +1,16 @@ +class BaseAgent(object): + def __init__(self, **kwargs): + super(BaseAgent, self).__init__(**kwargs) + + def train(self) -> dict: + """Return a dictionary of logging information.""" + raise NotImplementedError + + def add_to_replay_buffer(self, paths): + raise NotImplementedError + + def sample(self, batch_size): + raise NotImplementedError + + def save(self, path): + raise NotImplementedError \ No newline at end of file diff --git a/hw5/cs285/agents/dqn_agent.py b/hw5/cs285/agents/dqn_agent.py new file mode 100644 index 00000000..10fcddde --- /dev/null +++ b/hw5/cs285/agents/dqn_agent.py @@ -0,0 +1,63 @@ +import numpy as np +import pdb + +from cs285.infrastructure.dqn_utils import MemoryOptimizedReplayBuffer, PiecewiseSchedule +from cs285.policies.argmax_policy import ArgMaxPolicy +from cs285.critics.dqn_critic import DQNCritic + + +class DQNAgent(object): + def __init__(self, env, agent_params): + + self.env = env + self.agent_params = agent_params + self.batch_size = agent_params['batch_size'] + # import ipdb; ipdb.set_trace() + self.last_obs = self.env.reset() + + self.num_actions = agent_params['ac_dim'] + self.learning_starts = agent_params['learning_starts'] + self.learning_freq = agent_params['learning_freq'] + self.target_update_freq = agent_params['target_update_freq'] + + self.replay_buffer_idx = None + self.exploration = agent_params['exploration_schedule'] + self.optimizer_spec = agent_params['optimizer_spec'] + + self.critic = DQNCritic(agent_params, self.optimizer_spec) + self.actor = ArgMaxPolicy(self.critic) + + lander = agent_params['env_name'].startswith('LunarLander') + self.replay_buffer = MemoryOptimizedReplayBuffer( + agent_params['replay_buffer_size'], agent_params['frame_history_len'], 
lander=lander) + self.t = 0 + self.num_param_updates = 0 + + def add_to_replay_buffer(self, paths): + pass + + def step_env(self): + """ + Step the env and store the transition + At the end of this block of code, the simulator should have been + advanced one step, and the replay buffer should contain one more transition. + Note that self.last_obs must always point to the new latest observation. + """ + raise NotImplementedError + # Not needed for this homework + + #################################### + #################################### + + def sample(self, batch_size): + if self.replay_buffer.can_sample(self.batch_size): + return self.replay_buffer.sample(batch_size) + else: + return [],[],[],[],[] + + def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): + raise NotImplementedError + # Not needed for this homework + + #################################### + #################################### \ No newline at end of file diff --git a/hw5/cs285/agents/explore_or_exploit_agent.py b/hw5/cs285/agents/explore_or_exploit_agent.py new file mode 100644 index 00000000..2f3e6572 --- /dev/null +++ b/hw5/cs285/agents/explore_or_exploit_agent.py @@ -0,0 +1,124 @@ +from collections import OrderedDict + +from cs285.critics.dqn_critic import DQNCritic +from cs285.critics.cql_critic import CQLCritic +from cs285.infrastructure.replay_buffer import ReplayBuffer +from cs285.infrastructure.utils import * +from cs285.policies.argmax_policy import ArgMaxPolicy +from cs285.infrastructure.dqn_utils import MemoryOptimizedReplayBuffer +from cs285.exploration.rnd_model import RNDModel +from .dqn_agent import DQNAgent +import numpy as np + + +class ExplorationOrExploitationAgent(DQNAgent): + def __init__(self, env, agent_params): + super(ExplorationOrExploitationAgent, self).__init__(env, agent_params) + + self.replay_buffer = MemoryOptimizedReplayBuffer(100000, 1, float_obs=True) + self.num_exploration_steps = agent_params['num_exploration_steps'] + self.offline_exploitation = agent_params['offline_exploitation'] + + self.exploitation_critic = CQLCritic(agent_params, self.optimizer_spec) + self.exploration_critic = DQNCritic(agent_params, self.optimizer_spec) + + self.exploration_model = RNDModel(agent_params, self.optimizer_spec) + self.explore_weight_schedule = agent_params['explore_weight_schedule'] + self.exploit_weight_schedule = agent_params['exploit_weight_schedule'] + + self.actor = ArgMaxPolicy(self.exploration_critic) + self.eval_policy = ArgMaxPolicy(self.exploitation_critic) + self.exploit_rew_shift = agent_params['exploit_rew_shift'] + self.exploit_rew_scale = agent_params['exploit_rew_scale'] + self.eps = agent_params['eps'] + + def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): + log = {} + + if self.t > self.num_exploration_steps: + # TODO: After exploration is over, set the actor to optimize the extrinsic critic + #HINT: Look at method ArgMaxPolicy.set_critic + + if (self.t > self.learning_starts + and self.t % self.learning_freq == 0 + and self.replay_buffer.can_sample(self.batch_size) + ): + + # Get Reward Weights + # TODO: Get the current explore reward weight and exploit reward weight + # using the schedule's passed in (see __init__) + # COMMENT: Until part 3, explore_weight = 1, and exploit_weight = 0 + explore_weight = None + exploit_weight = None + + # Run Exploration Model # + # TODO: Evaluate the exploration model on s' to get the exploration bonus + # HINT: Normalize the exploration bonus, as RND values vary highly in magnitude + expl_bonus = None + + # Reward 
Calculations # + # TODO: Calculate mixed rewards, which will be passed into the exploration critic + # HINT: See doc for definition of mixed_reward + mixed_reward = None + + # TODO: Calculate the environment reward + # HINT: For part 1, env_reward is just 're_n' + # After this, env_reward is 're_n' shifted by self.exploit_rew_shift, + # and scaled by self.exploit_rew_scale + env_reward = None + + # Update Critics And Exploration Model # + + # TODO 1): Update the exploration model (based off s') + # TODO 2): Update the exploration critic (based off mixed_reward) + # TODO 3): Update the exploitation critic (based off env_reward) + expl_model_loss = None + exploration_critic_loss = None + exploitation_critic_loss = None + + # Target Networks # + if self.num_param_updates % self.target_update_freq == 0: + # TODO: Update the exploitation and exploration target networks + + # Logging # + log['Exploration Critic Loss'] = exploration_critic_loss['Training Loss'] + log['Exploitation Critic Loss'] = exploitation_critic_loss['Training Loss'] + log['Exploration Model Loss'] = expl_model_loss + + # TODO: Uncomment these lines after completing cql_critic.py + # log['Exploitation Data q-values'] = exploitation_critic_loss['Data q-values'] + # log['Exploitation OOD q-values'] = exploitation_critic_loss['OOD q-values'] + # log['Exploitation CQL Loss'] = exploitation_critic_loss['CQL Loss'] + + self.num_param_updates += 1 + + self.t += 1 + return log + + + def step_env(self): + """ + Step the env and store the transition + At the end of this block of code, the simulator should have been + advanced one step, and the replay buffer should contain one more transition. + Note that self.last_obs must always point to the new latest observation. + """ + if (not self.offline_exploitation) or (self.t <= self.num_exploration_steps): + self.replay_buffer_idx = self.replay_buffer.store_frame(self.last_obs) + + perform_random_action = np.random.random() < self.eps or self.t < self.learning_starts + + if perform_random_action: + action = self.env.action_space.sample() + else: + processed = self.replay_buffer.encode_recent_observation() + action = self.actor.get_action(processed) + + next_obs, reward, done, info = self.env.step(action) + self.last_obs = next_obs.copy() + + if (not self.offline_exploitation) or (self.t <= self.num_exploration_steps): + self.replay_buffer.store_effect(self.replay_buffer_idx, action, reward, done) + + if done: + self.last_obs = self.env.reset() diff --git a/hw5/cs285/critics/__init__.py b/hw5/cs285/critics/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/hw5/cs285/critics/__init__.py @@ -0,0 +1 @@ + diff --git a/hw5/cs285/critics/base_critic.py b/hw5/cs285/critics/base_critic.py new file mode 100644 index 00000000..36308dba --- /dev/null +++ b/hw5/cs285/critics/base_critic.py @@ -0,0 +1,3 @@ +class BaseCritic(object): + def update(self, ob_no, ac_na, next_ob_no, re_n, terminal_n): + raise NotImplementedError diff --git a/hw5/cs285/critics/bootstrapped_continuous_critic.py b/hw5/cs285/critics/bootstrapped_continuous_critic.py new file mode 100644 index 00000000..623c645d --- /dev/null +++ b/hw5/cs285/critics/bootstrapped_continuous_critic.py @@ -0,0 +1,80 @@ +from .base_critic import BaseCritic +from torch import nn +from torch import optim +import pdb + +from cs285.infrastructure import pytorch_util as ptu + + +class BootstrappedContinuousCritic(nn.Module, BaseCritic): + """ + Notes on notation: + + Prefixes and suffixes: + ob - observation + ac - action + _no - 
this tensor should have shape (batch size /n/, observation dim) + _na - this tensor should have shape (batch size /n/, action dim) + _n - this tensor should have shape (batch size /n/) + + Note: batch size /n/ is defined at runtime. + """ + def __init__(self, hparams): + super().__init__() + self.ob_dim = hparams['ob_dim'] + self.ac_dim = hparams['ac_dim'] + self.discrete = hparams['discrete'] + self.size = hparams['size'] + self.n_layers = hparams['n_layers'] + self.learning_rate = hparams['learning_rate'] + + # critic parameters + self.num_target_updates = hparams['num_target_updates'] + self.num_grad_steps_per_target_update = hparams['num_grad_steps_per_target_update'] + self.gamma = hparams['gamma'] + self.critic_network = ptu.build_mlp( + self.ob_dim, + 1, + n_layers=self.n_layers, + size=self.size, + ) + self.critic_network.to(ptu.device) + self.loss = nn.MSELoss() + self.optimizer = optim.Adam( + self.critic_network.parameters(), + self.learning_rate, + ) + + def forward(self, obs): + return self.critic_network(obs).squeeze(1) + + def forward_np(self, obs): + obs = ptu.from_numpy(obs) + predictions = self(obs) + return ptu.to_numpy(predictions) + + def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n): + """ + Update the parameters of the critic. + + let sum_of_path_lengths be the sum of the lengths of the paths sampled from + Agent.sample_trajectories + let num_paths be the number of paths sampled from Agent.sample_trajectories + + arguments: + ob_no: shape: (sum_of_path_lengths, ob_dim) + next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward + reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing + the reward for each timestep + terminal_n: length: sum_of_path_lengths. 
Each element in terminal_n is either 1 if the episode ended + at that timestep of 0 if the episode did not end + + returns: + nothing + """ + raise NotImplementedError + # Not needed for this homework + + #################################### + #################################### diff --git a/hw5/cs285/critics/cql_critic.py b/hw5/cs285/critics/cql_critic.py new file mode 100644 index 00000000..6809e4d0 --- /dev/null +++ b/hw5/cs285/critics/cql_critic.py @@ -0,0 +1,114 @@ +from .base_critic import BaseCritic +import torch +import torch.optim as optim +from torch.nn import utils +from torch import nn +import pdb + +from cs285.infrastructure import pytorch_util as ptu + + +class CQLCritic(BaseCritic): + + def __init__(self, hparams, optimizer_spec, **kwargs): + super().__init__(**kwargs) + self.env_name = hparams['env_name'] + self.ob_dim = hparams['ob_dim'] + + if isinstance(self.ob_dim, int): + self.input_shape = (self.ob_dim,) + else: + self.input_shape = hparams['input_shape'] + + self.ac_dim = hparams['ac_dim'] + self.double_q = hparams['double_q'] + self.grad_norm_clipping = hparams['grad_norm_clipping'] + self.gamma = hparams['gamma'] + + self.optimizer_spec = optimizer_spec + network_initializer = hparams['q_func'] + self.q_net = network_initializer(self.ob_dim, self.ac_dim) + self.q_net_target = network_initializer(self.ob_dim, self.ac_dim) + self.optimizer = self.optimizer_spec.constructor( + self.q_net.parameters(), + **self.optimizer_spec.optim_kwargs + ) + self.learning_rate_scheduler = optim.lr_scheduler.LambdaLR( + self.optimizer, + self.optimizer_spec.learning_rate_schedule, + ) + self.loss = nn.MSELoss() + self.q_net.to(ptu.device) + self.q_net_target.to(ptu.device) + self.cql_alpha = hparams['cql_alpha'] + + def dqn_loss(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n): + qa_t_values = self.q_net(ob_no) + q_t_values = torch.gather(qa_t_values, 1, ac_na.unsqueeze(1)).squeeze(1) + qa_tp1_values = self.q_net_target(next_ob_no) + + next_actions = self.q_net(next_ob_no).argmax(dim=1) + q_tp1 = torch.gather(qa_tp1_values, 1, next_actions.unsqueeze(1)).squeeze(1) + + target = reward_n + self.gamma * q_tp1 * (1 - terminal_n) + target = target.detach() + loss = self.loss(q_t_values, target) + + return loss, qa_t_values, q_t_values + + + def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n): + """ + Update the parameters of the critic. + let sum_of_path_lengths be the sum of the lengths of the paths sampled from + Agent.sample_trajectories + let num_paths be the number of paths sampled from Agent.sample_trajectories + arguments: + ob_no: shape: (sum_of_path_lengths, ob_dim) + next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward + reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing + the reward for each timestep + terminal_n: length: sum_of_path_lengths. 
Each element in terminal_n is either 1 if the episode ended + at that timestep of 0 if the episode did not end + returns: + nothing + """ + ob_no = ptu.from_numpy(ob_no) + ac_na = ptu.from_numpy(ac_na).to(torch.long) + next_ob_no = ptu.from_numpy(next_ob_no) + reward_n = ptu.from_numpy(reward_n) + terminal_n = ptu.from_numpy(terminal_n) + + loss, qa_t_values, q_t_values = self.dqn_loss( + ob_no, ac_na, next_ob_no, reward_n, terminal_n + ) + + # CQL Implementation + # TODO: Implement CQL as described in the pdf and paper + # Hint: After calculating cql_loss, augment the loss appropriately + cql_loss = None + + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + + info = {'Training Loss': ptu.to_numpy(loss)} + + # TODO: Uncomment these lines after implementing CQL + # info['CQL Loss'] = ptu.to_numpy(cql_loss) + # info['Data q-values'] = ptu.to_numpy(q_t_values).mean() + # info['OOD q-values'] = ptu.to_numpy(q_t_logsumexp).mean() + + return info + + + def update_target_network(self): + for target_param, param in zip( + self.q_net_target.parameters(), self.q_net.parameters() + ): + target_param.data.copy_(param.data) + + def qa_values(self, obs): + obs = ptu.from_numpy(obs) + qa_values = self.q_net(obs) + return ptu.to_numpy(qa_values) diff --git a/hw5/cs285/critics/dqn_critic.py b/hw5/cs285/critics/dqn_critic.py new file mode 100644 index 00000000..fecee5f9 --- /dev/null +++ b/hw5/cs285/critics/dqn_critic.py @@ -0,0 +1,75 @@ +from .base_critic import BaseCritic +import torch +import torch.optim as optim +from torch.nn import utils +from torch import nn +import pdb + +from cs285.infrastructure import pytorch_util as ptu + + +class DQNCritic(BaseCritic): + + def __init__(self, hparams, optimizer_spec, **kwargs): + super().__init__(**kwargs) + self.env_name = hparams['env_name'] + self.ob_dim = hparams['ob_dim'] + + if isinstance(self.ob_dim, int): + self.input_shape = (self.ob_dim,) + else: + self.input_shape = hparams['input_shape'] + + self.ac_dim = hparams['ac_dim'] + self.double_q = hparams['double_q'] + self.grad_norm_clipping = hparams['grad_norm_clipping'] + self.gamma = hparams['gamma'] + + self.optimizer_spec = optimizer_spec + network_initializer = hparams['q_func'] + self.q_net = network_initializer(self.ob_dim, self.ac_dim) + self.q_net_target = network_initializer(self.ob_dim, self.ac_dim) + self.optimizer = self.optimizer_spec.constructor( + self.q_net.parameters(), + **self.optimizer_spec.optim_kwargs + ) + self.learning_rate_scheduler = optim.lr_scheduler.LambdaLR( + self.optimizer, + self.optimizer_spec.learning_rate_schedule, + ) + self.loss = nn.SmoothL1Loss() # AKA Huber loss + self.q_net.to(ptu.device) + self.q_net_target.to(ptu.device) + + def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n): + """ + Update the parameters of the critic. + let sum_of_path_lengths be the sum of the lengths of the paths sampled from + Agent.sample_trajectories + let num_paths be the number of paths sampled from Agent.sample_trajectories + arguments: + ob_no: shape: (sum_of_path_lengths, ob_dim) + next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward + reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing + the reward for each timestep + terminal_n: length: sum_of_path_lengths. 
Each element in terminal_n is either 1 if the episode ended + at that timestep of 0 if the episode did not end + returns: + nothing + """ + raise NotImplementedError + # TODO: Get this from homework 3 + + #################################### + #################################### + + def update_target_network(self): + for target_param, param in zip( + self.q_net_target.parameters(), self.q_net.parameters() + ): + target_param.data.copy_(param.data) + + def qa_values(self, obs): + obs = ptu.from_numpy(obs) + qa_values = self.q_net(obs) + return ptu.to_numpy(qa_values) diff --git a/hw5/cs285/envs/__init__.py b/hw5/cs285/envs/__init__.py new file mode 100644 index 00000000..f57b83a5 --- /dev/null +++ b/hw5/cs285/envs/__init__.py @@ -0,0 +1,4 @@ +from cs285.envs import ant +from cs285.envs import cheetah +from cs285.envs import obstacles +from cs285.envs import reacher \ No newline at end of file diff --git a/hw5/cs285/envs/ant/__init__.py b/hw5/cs285/envs/ant/__init__.py new file mode 100644 index 00000000..91829b06 --- /dev/null +++ b/hw5/cs285/envs/ant/__init__.py @@ -0,0 +1,8 @@ +from gym.envs.registration import register + +register( + id='ant-cs285-v0', + entry_point='cs285.envs.ant:AntEnv', + max_episode_steps=1000, +) +from cs285.envs.ant.ant import AntEnv diff --git a/hw5/cs285/envs/ant/ant.py b/hw5/cs285/envs/ant/ant.py new file mode 100644 index 00000000..9e08cc29 --- /dev/null +++ b/hw5/cs285/envs/ant/ant.py @@ -0,0 +1,270 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
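Note: the `update` method of `CQLCritic` above leaves the conservative penalty as a `TODO`, and the commented-out logging lines suggest tracking `cql_loss`, the data Q-values, and `q_t_logsumexp` ("OOD q-values"). The sketch below shows one way the penalty could be combined with the TD loss returned by `dqn_loss`; it is an illustrative assumption, not the released solution, and the helper name `cql_augmented_loss` is hypothetical.

```python
import torch

def cql_augmented_loss(dqn_loss, qa_t_values, q_t_values, cql_alpha):
    """Sketch of a CQL-style penalty (assumed form, not the official solution).

    qa_t_values: (batch, num_actions) Q-values from q_net
    q_t_values:  (batch,) Q-values of the actions actually taken in the dataset
    """
    q_t_logsumexp = torch.logsumexp(qa_t_values, dim=1)   # soft-maximum over all (possibly OOD) actions
    cql_loss = (q_t_logsumexp - q_t_values).mean()        # push down OOD Q-values, push up in-data ones
    total_loss = dqn_loss + cql_alpha * cql_loss          # augment the usual TD loss
    return total_loss, cql_loss, q_t_logsumexp

# usage with dummy tensors
qa = torch.randn(8, 4)
q_data = qa[torch.arange(8), torch.randint(0, 4, (8,))]
total, cql, lse = cql_augmented_loss(torch.tensor(0.5), qa, q_data, cql_alpha=0.1)
```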
+ +import numpy as np +import mujoco_py +from gym import utils +from gym.envs.mujoco import mujoco_env + +_FLOAT_EPS = np.finfo(np.float64).eps +_EPS4 = _FLOAT_EPS * 4.0 + +def quat_to_euler(quat): + return mat2euler(quat2mat(quat)) + +def quat2mat(quat): + w, x, y, z = quat[..., 0], quat[..., 1], quat[..., 2], quat[..., 3] + Nq = np.sum(quat * quat, axis=-1) + s = 2.0 / Nq + X, Y, Z = x * s, y * s, z * s + wX, wY, wZ = w * X, w * Y, w * Z + xX, xY, xZ = x * X, x * Y, x * Z + yY, yZ, zZ = y * Y, y * Z, z * Z + + mat = np.empty(quat.shape[:-1] + (3, 3), dtype=np.float64) + mat[..., 0, 0] = 1.0 - (yY + zZ) + mat[..., 0, 1] = xY - wZ + mat[..., 0, 2] = xZ + wY + mat[..., 1, 0] = xY + wZ + mat[..., 1, 1] = 1.0 - (xX + zZ) + mat[..., 1, 2] = yZ - wX + mat[..., 2, 0] = xZ - wY + mat[..., 2, 1] = yZ + wX + mat[..., 2, 2] = 1.0 - (xX + yY) + return np.where((Nq > _FLOAT_EPS)[..., np.newaxis, np.newaxis], mat, np.eye(3)) + +def mat2euler(mat): + cy = np.sqrt(mat[..., 2, 2] * mat[..., 2, 2] + mat[..., 1, 2] * mat[..., 1, 2]) + condition = cy > _EPS4 + euler = np.empty(mat.shape[:-1], dtype=np.float64) + euler[..., 2] = np.where(condition, + -np.arctan2(mat[..., 0, 1], mat[..., 0, 0]), + -np.arctan2(-mat[..., 1, 0], mat[..., 1, 1])) + euler[..., 1] = np.where(condition, + -np.arctan2(-mat[..., 0, 2], cy), + -np.arctan2(-mat[..., 0, 2], cy)) + euler[..., 0] = np.where(condition, + -np.arctan2(mat[..., 1, 2], mat[..., 2, 2]), + 0.0) + return euler + +class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle): + + + def __init__(self, + xml_file='ant.xml', + ctrl_cost_weight=0.5, + healthy_reward=1.0, + terminate_when_unhealthy=True, + healthy_z_range=(0.2, 1.0), + reset_noise_scale=0.1, + contact_force_range=0, + exclude_current_positions_from_observation=True): + + utils.EzPickle.__init__(**locals()) + self.startup = True + self.time = 0 + + self._ctrl_cost_weight = ctrl_cost_weight + + self._healthy_reward = healthy_reward + self._terminate_when_unhealthy = terminate_when_unhealthy + self._healthy_z_range = healthy_z_range + self.min_z, self.max_z = self._healthy_z_range + + self._reset_noise_scale = reset_noise_scale + + self._exclude_current_positions_from_observation = ( + exclude_current_positions_from_observation) + + mujoco_env.MujocoEnv.__init__(self, xml_file, 5) + + self.startup = False + self.skip = self.frame_skip + + self.action_dim = self.ac_dim = self.action_space.shape[0] + self.observation_dim = self.obs_dim = self.observation_space.shape[0] + self.do_explicit_render = True + + #self.action_space.high 1, self.action_space.low -1 + #actions are control limited right now (not force limited) + #self.model.actuator_gear starts at [150,0,0,0,0,0] for each of the 8 actuators + for i in range(len(self.model.actuator_gear)): + self.model.actuator_gear[i][0]/=5 + + def get_reward(self, observations, actions): + + """get rewards of a given (observations, actions) pair + + Args: + observations: (batchsize, obs_dim) or (obs_dim,) + actions: (batchsize, ac_dim) or (ac_dim,) + + Return: + r_total: + done: True if env reaches terminal state (batchsize,1) or (1,) + """ + + #initialize and reshape as needed, for batch mode + self.reward_dict = {} + if(len(observations.shape)==1): + observations = np.expand_dims(observations, axis = 0) + actions = np.expand_dims(actions, axis = 0) + batch_mode = False + else: + batch_mode = True + + #get vars + xvel = observations[:, -1] + height = observations[:, -2] + roll_angle = observations[:, 0] + pitch_angle = observations[:, 1] + + #is flipped + is_flipping = 
np.zeros((observations.shape[0],)) + is_flipping[np.abs(roll_angle) > 0.7] = 1 + is_flipping[np.abs(pitch_angle) > 0.6] = 1 + + #check health + all_finite = np.isfinite(observations).all(axis=1) + is_healthy = np.ones((observations.shape[0],)) + is_healthy[all_finite==False] = 0 + is_healthy[height < self.min_z] = 0 + is_healthy[height > self.max_z] = 0 + is_healthy[is_flipping==True] = 0 + + #calc rew + self.reward_dict['actions'] = -self._ctrl_cost_weight * np.sum(np.square(actions), axis=1) + self.reward_dict['run'] = 10*xvel + self.reward_dict['health'] = is_healthy*self._healthy_reward + self.reward_dict['flipping'] = -500*is_flipping + self.reward_dict['r_total'] = self.reward_dict['run'] + self.reward_dict['health'] + self.reward_dict['flipping'] ### + self.reward_dict['actions'] + + #check if done + dones = np.zeros((observations.shape[0],)) + if(self._terminate_when_unhealthy): + dones[is_healthy==False] = 1 + + #return + if(not batch_mode): + return self.reward_dict['r_total'][0], dones[0] + return self.reward_dict['r_total'], dones + + def get_score(self, obs): + xvel = obs[-1] + return xvel + + def step(self, action): + + self.prev_com_pos = self.get_body_com("torso").copy() + + #step + self.do_simulation(action, self.frame_skip) + + #obs/reward/done/score + ob = self._get_obs() + rew, done = self.get_reward(ob, action) + score = self.get_score(ob) + + #return + env_info = {'time': self.time, + 'obs_dict': self.obs_dict, + 'rewards': self.reward_dict, + 'score': score} + return ob, rew, done, env_info + + def _get_obs(self): + + #com vel + if(self.startup): + xvel = [0.0] + else: + curr_com_pos = self.get_body_com("torso").copy() + prev_com_pos = self.prev_com_pos + xvel = [(curr_com_pos-prev_com_pos)[0]/self.dt] + + #data.qpos is 15 + # 3 com pos + # 4 com quat + # 8 : 4 pairs of hip/ankle (start top R, go ccw) + + self.obs_dict = {} + self.obs_dict['com_angular_pose'] = quat_to_euler(self.sim.data.qpos[3:7]) # 3 + self.obs_dict['com_pos'] = self.sim.data.qpos[:3] # 3 + self.obs_dict['joints_pos'] = self.sim.data.qpos[7:].copy() # 15 --> 8 + self.obs_dict['joints_vel'] = self.sim.data.qvel[-8:].copy() # 14 --> 8 + self.obs_dict['com_vel_x'] = xvel.copy() #1 + + if self._exclude_current_positions_from_observation: + return np.concatenate([ + self.obs_dict['com_angular_pose'], + self.obs_dict['joints_pos'], + self.obs_dict['joints_vel'], + [self.obs_dict['com_pos'][2]], #only height + self.obs_dict['com_vel_x'], + ]) + else: + return np.concatenate([ + self.obs_dict['com_angular_pose'], + self.obs_dict['joints_pos'], + self.obs_dict['joints_vel'], + self.obs_dict['com_pos'], # x y and z + self.obs_dict['com_vel_x'], + ]) + + def reset_model(self, seed=None): + + noise_low = -self._reset_noise_scale + noise_high = self._reset_noise_scale + + # set reset pose/vel + self.reset_pose = self.init_qpos + self.np_random.uniform( + low=noise_low, high=noise_high, size=self.model.nq) + self.reset_pose[3:7] = np.array([1,0,0,0]) # this is a quaternion + self.reset_vel = self.init_qvel + self._reset_noise_scale * self.np_random.randn(self.model.nv) + + #reset the env to that pose/vel + return self.do_reset(self.reset_pose.copy(), self.reset_vel.copy()) + + + def do_reset(self, reset_pose, reset_vel, reset_goal=None): + + #reset + self.set_state(reset_pose, reset_vel) + + #return + return self._get_obs() + + def viewer_setup(self): + # self.viewer.cam.trackbodyid = 1 + self.viewer.cam.distance = 15 + + # -------------------------------- + # get and set states + # 
-------------------------------- + + def get_env_state(self): + return dict(qp=self.data.qpos.copy(), qv=self.data.qvel.copy()) + + def set_env_state(self, state): + qp = state['qp'].copy() + qv = state['qv'].copy() + self.do_reset(qp, qv) + + # -------------------------------- + # utility functions + # -------------------------------- + + def get_env_infos(self): + return dict(state=self.get_env_state()) \ No newline at end of file diff --git a/hw5/cs285/envs/box2d/__init__.py b/hw5/cs285/envs/box2d/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hw5/cs285/envs/box2d/lunar_lander.py b/hw5/cs285/envs/box2d/lunar_lander.py new file mode 100644 index 00000000..31770768 --- /dev/null +++ b/hw5/cs285/envs/box2d/lunar_lander.py @@ -0,0 +1,463 @@ +import sys, math +import numpy as np + +import Box2D +from Box2D.b2 import (edgeShape, circleShape, fixtureDef, polygonShape, revoluteJointDef, contactListener) + +import gym +from gym import spaces +from gym.utils import seeding + +import pyglet + +from copy import copy + +# Rocket trajectory optimization is a classic topic in Optimal Control. +# +# According to Pontryagin's maximum principle it's optimal to fire engine full throttle or +# turn it off. That's the reason this environment is OK to have discreet actions (engine on or off). +# +# Landing pad is always at coordinates (0,0). Coordinates are the first two numbers in state vector. +# Reward for moving from the top of the screen to landing pad and zero speed is about 100..140 points. +# If lander moves away from landing pad it loses reward back. Episode finishes if the lander crashes or +# comes to rest, receiving additional -100 or +100 points. Each leg ground contact is +10. Firing main +# engine is -0.3 points each frame. Solved is 200 points. +# +# Landing outside landing pad is possible. Fuel is infinite, so an agent can learn to fly and then land +# on its first attempt. Please see source code for details. +# +# Too see heuristic landing, run: +# +# python gym/envs/box2d/lunar_lander.py +# +# To play yourself, run: +# +# python examples/agents/keyboard_agent.py LunarLander-v0 +# +# Created by Oleg Klimov. Licensed on the same terms as the rest of OpenAI Gym. + +# Modified by Sid Reddy (sgr@berkeley.edu) on 8/14/18 +# +# Changelog: +# - different discretization scheme for actions +# - different terminal rewards +# - different observations +# - randomized landing site +# +# A good agent should be able to achieve >150 reward. 
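The `get_env_state`/`set_env_state` helpers in `ant.py` above make it possible to snapshot and rewind the MuJoCo simulator, which is convenient when debugging rollouts. A minimal usage sketch, assuming MuJoCo is installed and the `cs285` package is importable (the random-action loop is illustrative only):

```python
import gym
import cs285.envs  # importing the package registers ant-cs285-v0, cheetah-cs285-v0, etc.

env = gym.make('ant-cs285-v0')
env.reset()

snapshot = env.unwrapped.get_env_state()      # {'qp': qpos copy, 'qv': qvel copy}
for _ in range(5):                            # take a few random steps
    env.step(env.action_space.sample())
env.unwrapped.set_env_state(snapshot)         # rewind the simulator to the saved state
```

`gym.make` wraps the environment in a `TimeLimit`, so `env.unwrapped` is needed to reach the `AntEnv` methods directly.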
+ +MAX_NUM_STEPS = 1000 + +N_OBS_DIM = 9 +N_ACT_DIM = 6 # num discrete actions + +FPS = 50 +SCALE = 30.0 # affects how fast-paced the game is, forces should be adjusted as well + +MAIN_ENGINE_POWER = 13.0 +SIDE_ENGINE_POWER = 0.6 + +INITIAL_RANDOM = 1000.0 # Set 1500 to make game harder + +LANDER_POLY =[ + (-14,+17), (-17,0), (-17,-10), + (+17,-10), (+17,0), (+14,+17) + ] +LEG_AWAY = 20 +LEG_DOWN = 18 +LEG_W, LEG_H = 2, 8 +LEG_SPRING_TORQUE = 40 # 40 is too difficult for human players, 400 a bit easier + +SIDE_ENGINE_HEIGHT = 14.0 +SIDE_ENGINE_AWAY = 12.0 + +VIEWPORT_W = 600 +VIEWPORT_H = 400 + +THROTTLE_MAG = 0.75 # discretized 'on' value for thrusters +NOOP = 1 # don't fire main engine, don't steer +def disc_to_cont(action): # discrete action -> continuous action + if type(action) == np.ndarray: + return action + # main engine + if action < 3: + m = -THROTTLE_MAG + elif action < 6: + m = THROTTLE_MAG + else: + raise ValueError + # steering + if action % 3 == 0: + s = -THROTTLE_MAG + elif action % 3 == 1: + s = 0 + else: + s = THROTTLE_MAG + return np.array([m, s]) + +class ContactDetector(contactListener): + def __init__(self, env): + contactListener.__init__(self) + self.env = env + def BeginContact(self, contact): + if self.env.lander==contact.fixtureA.body or self.env.lander==contact.fixtureB.body: + self.env.game_over = True + for i in range(2): + if self.env.legs[i] in [contact.fixtureA.body, contact.fixtureB.body]: + self.env.legs[i].ground_contact = True + def EndContact(self, contact): + for i in range(2): + if self.env.legs[i] in [contact.fixtureA.body, contact.fixtureB.body]: + self.env.legs[i].ground_contact = False + +class LunarLander(gym.Env): + metadata = { + 'render.modes': ['human', 'rgb_array'], + 'video.frames_per_second' : FPS + } + + continuous = False + + def __init__(self): + self._seed() + self.viewer = None + + self.world = Box2D.b2World() + self.moon = None + self.lander = None + self.particles = [] + + self.prev_reward = None + + high = np.array([np.inf]*N_OBS_DIM) # useful range is -1 .. 
+1, but spikes can be higher + self.observation_space = spaces.Box(-high, high) + + self.action_space = spaces.Discrete(N_ACT_DIM) + + self.curr_step = None + + self._reset() + + def _seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def _destroy(self): + if not self.moon: return + self.world.contactListener = None + self._clean_particles(True) + self.world.DestroyBody(self.moon) + self.moon = None + self.world.DestroyBody(self.lander) + self.lander = None + self.world.DestroyBody(self.legs[0]) + self.world.DestroyBody(self.legs[1]) + + def _reset(self): + self.curr_step = 0 + + self._destroy() + self.world.contactListener_keepref = ContactDetector(self) + self.world.contactListener = self.world.contactListener_keepref + self.game_over = False + self.prev_shaping = None + + W = VIEWPORT_W/SCALE + H = VIEWPORT_H/SCALE + + # terrain + CHUNKS = 11 + height = self.np_random.uniform(0, H/2, size=(CHUNKS+1,) ) + chunk_x = [W/(CHUNKS-1)*i for i in range(CHUNKS)] + + # randomize helipad x-coord + helipad_chunk = np.random.choice(range(1, CHUNKS-1)) + + self.helipad_x1 = chunk_x[helipad_chunk-1] + self.helipad_x2 = chunk_x[helipad_chunk+1] + self.helipad_y = H/4 + height[helipad_chunk-2] = self.helipad_y + height[helipad_chunk-1] = self.helipad_y + height[helipad_chunk+0] = self.helipad_y + height[helipad_chunk+1] = self.helipad_y + height[helipad_chunk+2] = self.helipad_y + smooth_y = [0.33*(height[i-1] + height[i+0] + height[i+1]) for i in range(CHUNKS)] + + self.moon = self.world.CreateStaticBody( shapes=edgeShape(vertices=[(0, 0), (W, 0)]) ) + self.sky_polys = [] + for i in range(CHUNKS-1): + p1 = (chunk_x[i], smooth_y[i]) + p2 = (chunk_x[i+1], smooth_y[i+1]) + self.moon.CreateEdgeFixture( + vertices=[p1,p2], + density=0, + friction=0.1) + self.sky_polys.append( [p1, p2, (p2[0],H), (p1[0],H)] ) + + self.moon.color1 = (0.0,0.0,0.0) + self.moon.color2 = (0.0,0.0,0.0) + + initial_y = VIEWPORT_H/SCALE#*0.75 + self.lander = self.world.CreateDynamicBody( + position = (VIEWPORT_W/SCALE/2, initial_y), + angle=0.0, + fixtures = fixtureDef( + shape=polygonShape(vertices=[ (x/SCALE,y/SCALE) for x,y in LANDER_POLY ]), + density=5.0, + friction=0.1, + categoryBits=0x0010, + maskBits=0x001, # collide only with ground + restitution=0.0) # 0.99 bouncy + ) + self.lander.color1 = (0.5,0.4,0.9) + self.lander.color2 = (0.3,0.3,0.5) + self.lander.ApplyForceToCenter( ( + self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM), + self.np_random.uniform(-INITIAL_RANDOM, INITIAL_RANDOM) + ), True) + + self.legs = [] + for i in [-1,+1]: + leg = self.world.CreateDynamicBody( + position = (VIEWPORT_W/SCALE/2 - i*LEG_AWAY/SCALE, initial_y), + angle = (i*0.05), + fixtures = fixtureDef( + shape=polygonShape(box=(LEG_W/SCALE, LEG_H/SCALE)), + density=1.0, + restitution=0.0, + categoryBits=0x0020, + maskBits=0x001) + ) + leg.ground_contact = False + leg.color1 = (0.5,0.4,0.9) + leg.color2 = (0.3,0.3,0.5) + rjd = revoluteJointDef( + bodyA=self.lander, + bodyB=leg, + localAnchorA=(0, 0), + localAnchorB=(i*LEG_AWAY/SCALE, LEG_DOWN/SCALE), + enableMotor=True, + enableLimit=True, + maxMotorTorque=LEG_SPRING_TORQUE, + motorSpeed=+0.3*i # low enough not to jump back into the sky + ) + if i==-1: + rjd.lowerAngle = +0.9 - 0.5 # Yes, the most esoteric numbers here, angles legs have freedom to travel within + rjd.upperAngle = +0.9 + else: + rjd.lowerAngle = -0.9 + rjd.upperAngle = -0.9 + 0.5 + leg.joint = self.world.CreateJoint(rjd) + self.legs.append(leg) + + self.drawlist = [self.lander] + 
self.legs + + return self._step(NOOP)[0] + + def _create_particle(self, mass, x, y, ttl): + p = self.world.CreateDynamicBody( + position = (x,y), + angle=0.0, + fixtures = fixtureDef( + shape=circleShape(radius=2/SCALE, pos=(0,0)), + density=mass, + friction=0.1, + categoryBits=0x0100, + maskBits=0x001, # collide only with ground + restitution=0.3) + ) + p.ttl = ttl + self.particles.append(p) + self._clean_particles(False) + return p + + def _clean_particles(self, all): + while self.particles and (all or self.particles[0].ttl<0): + self.world.DestroyBody(self.particles.pop(0)) + + def _step(self, action): + assert self.action_space.contains(action), "%r (%s) invalid " % (action,type(action)) + action = disc_to_cont(action) + + # Engines + tip = (math.sin(self.lander.angle), math.cos(self.lander.angle)) + side = (-tip[1], tip[0]); + dispersion = [self.np_random.uniform(-1.0, +1.0) / SCALE for _ in range(2)] + + m_power = 0.0 + if action[0] > 0.0: + # Main engine + m_power = (np.clip(action[0], 0.0,1.0) + 1.0)*0.5 # 0.5..1.0 + assert m_power>=0.5 and m_power <= 1.0 + ox = tip[0]*(4/SCALE + 2*dispersion[0]) + side[0]*dispersion[1] # 4 is move a bit downwards, +-2 for randomness + oy = -tip[1]*(4/SCALE + 2*dispersion[0]) - side[1]*dispersion[1] + impulse_pos = (self.lander.position[0] + ox, self.lander.position[1] + oy) + p = self._create_particle(3.5, impulse_pos[0], impulse_pos[1], m_power) # particles are just a decoration, 3.5 is here to make particle speed adequate + p.ApplyLinearImpulse( ( ox*MAIN_ENGINE_POWER*m_power, oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True) + self.lander.ApplyLinearImpulse( (-ox*MAIN_ENGINE_POWER*m_power, -oy*MAIN_ENGINE_POWER*m_power), impulse_pos, True) + + s_power = 0.0 + if np.abs(action[1]) > 0.5: + # Orientation engines + direction = np.sign(action[1]) + s_power = np.clip(np.abs(action[1]), 0.5,1.0) + assert s_power>=0.5 and s_power <= 1.0 + ox = tip[0]*dispersion[0] + side[0]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE) + oy = -tip[1]*dispersion[0] - side[1]*(3*dispersion[1]+direction*SIDE_ENGINE_AWAY/SCALE) + impulse_pos = (self.lander.position[0] + ox - tip[0]*17/SCALE, self.lander.position[1] + oy + tip[1]*SIDE_ENGINE_HEIGHT/SCALE) + p = self._create_particle(0.7, impulse_pos[0], impulse_pos[1], s_power) + p.ApplyLinearImpulse( ( ox*SIDE_ENGINE_POWER*s_power, oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True) + self.lander.ApplyLinearImpulse( (-ox*SIDE_ENGINE_POWER*s_power, -oy*SIDE_ENGINE_POWER*s_power), impulse_pos, True) + + # perform normal update + self.world.Step(1.0/FPS, 6*30, 2*30) + + pos = self.lander.position + vel = self.lander.linearVelocity + helipad_x = (self.helipad_x1 + self.helipad_x2) / 2 + state = [ + (pos.x - VIEWPORT_W/SCALE/2) / (VIEWPORT_W/SCALE/2), + (pos.y - (self.helipad_y+LEG_DOWN/SCALE)) / (VIEWPORT_W/SCALE/2), + vel.x*(VIEWPORT_W/SCALE/2)/FPS, + vel.y*(VIEWPORT_H/SCALE/2)/FPS, + self.lander.angle, + 20.0*self.lander.angularVelocity/FPS, + 1.0 if self.legs[0].ground_contact else 0.0, + 1.0 if self.legs[1].ground_contact else 0.0, + (helipad_x - VIEWPORT_W/SCALE/2) / (VIEWPORT_W/SCALE/2) + ] + assert len(state)==N_OBS_DIM + + self.curr_step += 1 + + reward = 0 + shaping = 0 + dx = (pos.x - helipad_x) / (VIEWPORT_W/SCALE/2) + shaping += -100*np.sqrt(state[2]*state[2] + state[3]*state[3]) - 100*abs(state[4]) + shaping += -100*np.sqrt(dx*dx + state[1]*state[1]) + 10*state[6] + 10*state[7] + if self.prev_shaping is not None: + reward = shaping - self.prev_shaping + self.prev_shaping = shaping + + reward -= m_power*0.30 # 
less fuel spent is better, about -30 for heurisic landing + reward -= s_power*0.03 + + oob = abs(state[0]) >= 1.0 + timeout = self.curr_step >= MAX_NUM_STEPS + not_awake = not self.lander.awake + + at_site = pos.x >= self.helipad_x1 and pos.x <= self.helipad_x2 and state[1] <= 0 + grounded = self.legs[0].ground_contact and self.legs[1].ground_contact + landed = at_site and grounded + + done = self.game_over or oob or not_awake or timeout or landed + if done: + if self.game_over or oob: + reward = -100 + self.lander.color1 = (255,0,0) + elif at_site: + reward = +100 + self.lander.color1 = (0,255,0) + elif timeout: + self.lander.color1 = (255,0,0) + info = {} + + return np.array(state), reward, done, info + + def _render(self, mode='human', close=False): + if close: + if self.viewer is not None: + self.viewer.close() + self.viewer = None + return + + from gym.envs.classic_control import rendering + if self.viewer is None: + self.viewer = rendering.Viewer(VIEWPORT_W, VIEWPORT_H) + self.viewer.set_bounds(0, VIEWPORT_W/SCALE, 0, VIEWPORT_H/SCALE) + + for obj in self.particles: + obj.ttl -= 0.15 + obj.color1 = (max(0.2,0.2+obj.ttl), max(0.2,0.5*obj.ttl), max(0.2,0.5*obj.ttl)) + obj.color2 = (max(0.2,0.2+obj.ttl), max(0.2,0.5*obj.ttl), max(0.2,0.5*obj.ttl)) + + self._clean_particles(False) + + for p in self.sky_polys: + self.viewer.draw_polygon(p, color=(0,0,0)) + + for obj in self.particles + self.drawlist: + for f in obj.fixtures: + trans = f.body.transform + if type(f.shape) is circleShape: + t = rendering.Transform(translation=trans*f.shape.pos) + self.viewer.draw_circle(f.shape.radius, 20, color=obj.color1).add_attr(t) + self.viewer.draw_circle(f.shape.radius, 20, color=obj.color2, filled=False, linewidth=2).add_attr(t) + else: + path = [trans*v for v in f.shape.vertices] + self.viewer.draw_polygon(path, color=obj.color1) + path.append(path[0]) + self.viewer.draw_polyline(path, color=obj.color2, linewidth=2) + + for x in [self.helipad_x1, self.helipad_x2]: + flagy1 = self.helipad_y + flagy2 = flagy1 + 50/SCALE + self.viewer.draw_polyline( [(x, flagy1), (x, flagy2)], color=(1,1,1) ) + self.viewer.draw_polygon( [(x, flagy2), (x, flagy2-10/SCALE), (x+25/SCALE, flagy2-5/SCALE)], color=(0.8,0.8,0) ) + + clock_prog = self.curr_step / MAX_NUM_STEPS + self.viewer.draw_polyline( [(0, 0.05*VIEWPORT_H/SCALE), (clock_prog*VIEWPORT_W/SCALE, 0.05*VIEWPORT_H/SCALE)], color=(255,0,0), linewidth=5 ) + + return self.viewer.render(return_rgb_array = mode=='rgb_array') + + def reset(self): + return self._reset() + + def step(self, *args, **kwargs): + return self._step(*args, **kwargs) + + +class LunarLanderContinuous(LunarLander): + continuous = True + +def heuristic(env, s): + # Heuristic for: + # 1. Testing. + # 2. Demonstration rollout. 
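The `_step` method above rewards progress through a difference of potentials: a shaping score is recomputed every step and only its change from the previous step is added to the reward. A standalone sketch of that pattern follows; the potential used here is only similar in spirit to the lander's (the real one also depends on the randomized helipad position), and the class name is hypothetical.

```python
import numpy as np

class PotentialShaping:
    """Reward the change in a potential, as _step does with prev_shaping."""

    def __init__(self, potential_fn):
        self.potential_fn = potential_fn
        self.prev = None

    def __call__(self, state):
        phi = self.potential_fn(state)
        reward = 0.0 if self.prev is None else phi - self.prev
        self.prev = phi
        return reward

# penalize distance to the pad, speed, and tilt (illustrative coefficients)
shaper = PotentialShaping(lambda s: -100 * np.hypot(s[0], s[1])
                                    - 100 * np.hypot(s[2], s[3])
                                    - 100 * abs(s[4]))
print(shaper(np.zeros(9)))
print(shaper(np.array([0.1, 0.2, 0.0, 0.0, 0.05, 0.0, 0.0, 0.0, 0.0])))
```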
+ angle_targ = s[0]*0.5 + s[2]*1.0 # angle should point towards center (s[0] is horizontal coordinate, s[2] hor speed) + if angle_targ > 0.4: angle_targ = 0.4 # more than 0.4 radians (22 degrees) is bad + if angle_targ < -0.4: angle_targ = -0.4 + hover_targ = 0.55*np.abs(s[0]) # target y should be proporional to horizontal offset + + # PID controller: s[4] angle, s[5] angularSpeed + angle_todo = (angle_targ - s[4])*0.5 - (s[5])*1.0 + #print("angle_targ=%0.2f, angle_todo=%0.2f" % (angle_targ, angle_todo)) + + # PID controller: s[1] vertical coordinate s[3] vertical speed + hover_todo = (hover_targ - s[1])*0.5 - (s[3])*0.5 + #print("hover_targ=%0.2f, hover_todo=%0.2f" % (hover_targ, hover_todo)) + + if s[6] or s[7]: # legs have contact + angle_todo = 0 + hover_todo = -(s[3])*0.5 # override to reduce fall speed, that's all we need after contact + + a = np.array( [hover_todo*20 - 1, -angle_todo*20] ) + a = np.clip(a, -1, +1) + return a + +if __name__=="__main__": + #env = LunarLander() + env = LunarLanderContinuous() + s = env.reset() + total_reward = 0 + steps = 0 + while True: + a = heuristic(env, s) + s, r, done, info = env.step(a) + env.render() + total_reward += r + if steps % 20 == 0 or done: + print(["{:+0.2f}".format(x) for x in s]) + print("step {} total_reward {:+0.2f}".format(steps, total_reward)) + steps += 1 + if done: break diff --git a/hw5/cs285/envs/cheetah/__init__.py b/hw5/cs285/envs/cheetah/__init__.py new file mode 100644 index 00000000..01a108bd --- /dev/null +++ b/hw5/cs285/envs/cheetah/__init__.py @@ -0,0 +1,8 @@ +from gym.envs.registration import register + +register( + id='cheetah-cs285-v0', + entry_point='cs285.envs.cheetah:HalfCheetahEnv', + max_episode_steps=1000, +) +from cs285.envs.cheetah.cheetah import HalfCheetahEnv diff --git a/hw5/cs285/envs/cheetah/cheetah.py b/hw5/cs285/envs/cheetah/cheetah.py new file mode 100644 index 00000000..4cd8a1e4 --- /dev/null +++ b/hw5/cs285/envs/cheetah/cheetah.py @@ -0,0 +1,133 @@ +import numpy as np +import mujoco_py +from gym import utils +from gym.envs.mujoco import mujoco_env + +class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): + + def __init__(self): + + mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 1) + utils.EzPickle.__init__(self) + + self.skip = self.frame_skip + + self.action_dim = self.ac_dim = self.action_space.shape[0] + self.observation_dim = self.obs_dim = self.observation_space.shape[0] + + def get_reward(self, observations, actions): + + """get reward/s of given (observations, actions) datapoint or datapoints + + Args: + observations: (batchsize, obs_dim) or (obs_dim,) + actions: (batchsize, ac_dim) or (ac_dim,) + + Return: + r_total: reward of this (o,a) pair, dimension is (batchsize,1) or (1,) + done: True if env reaches terminal state, dimension is (batchsize,1) or (1,) + """ + + #initialize and reshape as needed, for batch mode + self.reward_dict = {} + if(len(observations.shape)==1): + observations = np.expand_dims(observations, axis = 0) + actions = np.expand_dims(actions, axis = 0) + batch_mode = False + else: + batch_mode = True + + #get vars + xvel = observations[:, 9].copy() + body_angle = observations[:, 2].copy() + front_leg = observations[:, 6].copy() + front_shin = observations[:, 7].copy() + front_foot = observations[:, 8].copy() + zeros = np.zeros((observations.shape[0],)).copy() + + # ranges + leg_range = 0.2 + shin_range = 0 + foot_range = 0 + penalty_factor = 10 + + #calc rew + self.reward_dict['run'] = xvel + + front_leg_rew = zeros.copy() + 
front_leg_rew[front_leg>leg_range] = -penalty_factor + self.reward_dict['leg'] = front_leg_rew + + front_shin_rew = zeros.copy() + front_shin_rew[front_shin>shin_range] = -penalty_factor + self.reward_dict['shin'] = front_shin_rew + + front_foot_rew = zeros.copy() + front_foot_rew[front_foot>foot_range] = -penalty_factor + self.reward_dict['foot'] = front_foot_rew + + # total reward + self.reward_dict['r_total'] = self.reward_dict['run'] + self.reward_dict['leg'] + self.reward_dict['shin'] + self.reward_dict['foot'] + + #return + dones = zeros.copy() + if(not batch_mode): + return self.reward_dict['r_total'][0], dones[0] + return self.reward_dict['r_total'], dones + + + def get_score(self, obs): + xposafter = obs[0] + return xposafter + + ############################################## + + def step(self, action): + + #step + self.do_simulation(action, self.frame_skip) + + #obs/reward/done/score + ob = self._get_obs() + rew, done = self.get_reward(ob, action) + score = self.get_score(ob) + + #return + env_info = {'obs_dict': self.obs_dict, + 'rewards': self.reward_dict, + 'score': score} + return ob, rew, done, env_info + + def _get_obs(self): + + self.obs_dict = {} + self.obs_dict['joints_pos'] = self.sim.data.qpos.flat.copy() + self.obs_dict['joints_vel'] = self.sim.data.qvel.flat.copy() + self.obs_dict['com_torso'] = self.get_body_com("torso").flat.copy() + + return np.concatenate([ + self.obs_dict['joints_pos'], #9 + self.obs_dict['joints_vel'], #9 + self.obs_dict['com_torso'], #3 + ]) + + ############################################## + + def reset_model(self, seed=None): + + # set reset pose/vel + self.reset_pose = self.init_qpos + self.np_random.uniform( + low=-.1, high=.1, size=self.model.nq) + self.reset_vel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 + + #reset the env to that pose/vel + return self.do_reset(self.reset_pose.copy(), self.reset_vel.copy()) + + + def do_reset(self, reset_pose, reset_vel, reset_goal=None): + + #reset + self.set_state(reset_pose, reset_vel) + + #return + return self._get_obs() diff --git a/hw5/cs285/envs/obstacles/__init__.py b/hw5/cs285/envs/obstacles/__init__.py new file mode 100644 index 00000000..ad7c8e18 --- /dev/null +++ b/hw5/cs285/envs/obstacles/__init__.py @@ -0,0 +1,8 @@ +from gym.envs.registration import register + +register( + id='obstacles-cs285-v0', + entry_point='cs285.envs.obstacles:Obstacles', + max_episode_steps=500, +) +from cs285.envs.obstacles.obstacles_env import Obstacles diff --git a/hw5/cs285/envs/obstacles/obstacles_env.py b/hw5/cs285/envs/obstacles/obstacles_env.py new file mode 100644 index 00000000..06d906e5 --- /dev/null +++ b/hw5/cs285/envs/obstacles/obstacles_env.py @@ -0,0 +1,227 @@ +import gym +import numpy as np +from gym import spaces + +class Obstacles(gym.Env): + def __init__(self, start=[-0.5, 0.75], end=[0.7, -0.8], random_starts=True): + + import matplotlib.pyplot as plt #inside, so doesnt get imported when not using this env + self.plt = plt + + self.action_dim = self.ac_dim = 2 + self.observation_dim = self.obs_dim = 4 + self.boundary_min = -0.99 + self.boundary_max = 0.99 + + low = self.boundary_min*np.ones((self.action_dim,)) + high = self.boundary_max*np.ones((self.action_dim,)) + self.action_space = spaces.Box(low, high, dtype=np.float32) + + high = np.inf*np.ones(self.obs_dim) + low = -high + self.observation_space = spaces.Box(low, high, dtype=np.float32) + + self.env_name = 'obstacles' + self.is_gym = True + + self.start = np.array(start) + self.end = np.array(end) + self.current = 
np.array(start) + self.random_starts = random_starts + + #obstacles are rectangles, specified by [x of top left, y of topleft, width x, height y] + self.obstacles = [] + self.obstacles.append([-0.4, 0.8, 0.4, 0.3]) + self.obstacles.append([-0.9, 0.3, 0.2, 0.6]) + self.obstacles.append([0.6, -0.1, 0.12, 0.4]) + self.obstacles.append([-0.1, 0.2, 0.15, 0.4]) + self.obstacles.append([0.1, -0.7, 0.3, 0.15]) + + self.eps = 0.1 + self.fig = self.plt.figure() + + def seed(self, seed): + np.random.seed(seed) + + ######################################### + + def pick_start_pos(self): + if self.random_starts: + temp = np.random.uniform([self.boundary_min, self.boundary_min+1.25], [self.boundary_max-0.4, self.boundary_max], (self.action_dim,)) + if not self.is_valid(temp[None, :]): + temp = self.pick_start_pos() + else: + temp = self.start + return temp + + ######################################### + + def reset(self, seed=None): + if seed: + self.seed(seed) + + self.reset_pose = self.pick_start_pos() + self.reset_vel = self.end + + return self.do_reset(self.reset_pose, self.reset_vel) + + def do_reset(self, reset_pose, reset_vel, reset_goal=None): + + self.current = reset_pose.copy() + self.end = reset_vel.copy() + + #clear + self.counter = 0 + self.plt.clf() + + #return + return self._get_obs() + + ######################################### + + def _get_obs(self): + return np.concatenate([self.current,self.end]) + + def get_score(self, obs): + curr_pos = obs[:2] + end_pos = obs[-2:] + score = -1*np.abs(curr_pos-end_pos) + return score + + def get_reward(self, observations, actions): + + """get reward/s of given (observations, actions) datapoint or datapoints + + Args: + observations: (batchsize, obs_dim) or (obs_dim,) + actions: (batchsize, ac_dim) or (ac_dim,) + + Return: + r_total: reward of this (o,a) pair, dimension is (batchsize,1) or (1,) + done: True if env reaches terminal state, dimension is (batchsize,1) or (1,) + """ + + #initialize and reshape as needed, for batch mode + self.reward_dict = {} + if(len(observations.shape)==1): + observations = np.expand_dims(observations, axis = 0) + actions = np.expand_dims(actions, axis = 0) + batch_mode = False + else: + batch_mode = True + + #get vars + curr_pos = observations[:, :2] + end_pos = observations[:, -2:] + + #calc rew + dist = np.linalg.norm(curr_pos - end_pos, axis=1) + self.reward_dict['dist'] = -dist + self.reward_dict['r_total'] = self.reward_dict['dist'] + + #done + dones = np.zeros((observations.shape[0],)) + dones[distself.boundary_max] = 1 + oob[curr_pos[:,1]>self.boundary_max] = 1 + dones[oob==1] = 1 + + #return + if(not batch_mode): + return self.reward_dict['r_total'][0], dones[0] + return self.reward_dict['r_total'], dones + + def step(self, action): + self.counter += 1 + action = np.clip(action, -1, 1) #clip (-1, 1) + action = action / 10. 
#scale (-1,1) to (-0.1, 0.1) + + # move, only if its a valid move (else, keep it there because it cant move) + temp = self.current + action + if self.is_valid(temp[None, :]): + self.current = temp + + ob = self._get_obs() + reward, done = self.get_reward(ob, action) + score = self.get_score(ob) + env_info = {'ob': ob, + 'rewards': self.reward_dict, + 'score': score} + + return ob, reward, done, env_info + + ######################################## + # utility functions + ######################################## + + def render(self, mode=None): + + # boundaries + self.plt.plot([self.boundary_min, self.boundary_min], [self.boundary_min, self.boundary_max], 'k') + self.plt.plot([self.boundary_max, self.boundary_max], [self.boundary_min, self.boundary_max], 'k') + self.plt.plot([self.boundary_min, self.boundary_max], [self.boundary_min, self.boundary_min], 'k') + self.plt.plot([self.boundary_min, self.boundary_max], [self.boundary_max, self.boundary_max], 'k') + + # obstacles + for obstacle in self.obstacles: + tl_x = obstacle[0] + tl_y = obstacle[1] + tr_x = tl_x + obstacle[2] + tr_y = tl_y + bl_x = tl_x + bl_y = tl_y - obstacle[3] + br_x = tr_x + br_y = bl_y + self.plt.plot([bl_x, br_x], [bl_y, br_y], 'r') + self.plt.plot([tl_x, tr_x], [tl_y, tr_y], 'r') + self.plt.plot([bl_x, bl_x], [bl_y, tl_y], 'r') + self.plt.plot([br_x, br_x], [br_y, tr_y], 'r') + + # current and end + self.plt.plot(self.end[0], self.end[1], 'go') + self.plt.plot(self.current[0], self.current[1], 'ko') + self.plt.pause(0.1) + + img = np.frombuffer(self.fig.canvas.tostring_rgb(), dtype=np.uint8) + img = img.reshape(self.fig.canvas.get_width_height()[::-1] + (3,)) + return img + + def is_valid(self, dat): + + oob_mask = np.any(self.oob(dat), axis=1) + + # old way + self.a = self.boundary_min + (self.boundary_max-self.boundary_min)/3.0 + self.b = self.boundary_min + 2*(self.boundary_max-self.boundary_min)/3.0 + data_mask = (dat[:, 0] < self.a) | (dat[:, 0] > self.b) | \ + (dat[:, 1] < self.a) | (dat[:, 1] > self.b) + + # + in_obstacle = False + for obstacle in self.obstacles: + tl_x = obstacle[0] + tl_y = obstacle[1] + tr_x = tl_x + obstacle[2] + tr_y = tl_y + bl_x = tl_x + bl_y = tl_y - obstacle[3] + br_x = tr_x + br_y = bl_y + + if dat[:, 0]>tl_x and dat[:, 0]bl_y and dat[:, 1]= self.boundary_max) + + diff --git a/hw5/cs285/envs/pointmass/pointmass.py b/hw5/cs285/envs/pointmass/pointmass.py new file mode 100644 index 00000000..ff7081fb --- /dev/null +++ b/hw5/cs285/envs/pointmass/pointmass.py @@ -0,0 +1,587 @@ +import networkx as nx +import scipy.sparse.csgraph +import numpy as np +import gym +import pickle + +WALLS = { + 'Small': + np.array([[0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 0]]), + 'Cross': + np.array([[0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 1, 0, 0, 0], + [0, 0, 0, 1, 0, 0, 0], + [0, 1, 1, 1, 1, 1, 0], + [0, 0, 0, 1, 0, 0, 0], + [0, 0, 0, 1, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0]]), + 'FourRooms': + np.array([[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], + [1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1], + [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]]), + 'Spiral5x5': + np.array([[0, 0, 0, 0, 0], + [0, 1, 1, 1, 1], + [0, 1, 0, 0, 1], + [0, 1, 1, 0, 1], + [0, 0, 0, 0, 1]]), + 'Spiral7x7': + np.array([[1, 1, 1, 1, 1, 1, 1], + [1, 0, 0, 0, 0, 0, 0], + 
[1, 0, 1, 1, 1, 1, 0], + [1, 0, 1, 0, 0, 1, 0], + [1, 0, 1, 1, 0, 1, 0], + [1, 0, 0, 0, 0, 1, 0], + [1, 1, 1, 1, 1, 1, 0]]), + 'Spiral9x9': + np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 1, 1, 1, 1, 1, 1, 1, 1], + [0, 1, 0, 0, 0, 0, 0, 0, 1], + [0, 1, 0, 1, 1, 1, 1, 0, 1], + [0, 1, 0, 1, 0, 0, 1, 0, 1], + [0, 1, 0, 1, 1, 0, 1, 0, 1], + [0, 1, 0, 0, 0, 0, 1, 0, 1], + [0, 1, 1, 1, 1, 1, 1, 0, 1], + [0, 0, 0, 0, 0, 0, 0, 0, 1]]), + 'Spiral11x11': + np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0], + [1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0], + [1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0], + [1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0], + [1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0], + [1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0], + [1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0], + [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]), + 'Maze5x5': + # np.array([[0, 0, 0], + # [1, 1, 0], + # [0, 0, 0]]), + np.array([[0, 0, 0, 0, 0], + [1, 1, 1, 1, 0], + [1, 1, 1, 1, 0], + [1, 1, 1, 1, 0], + [1, 1, 1, 1, 0]]), + 'Maze6x6': + np.array([[0, 0, 1, 0, 0, 0], + [1, 0, 1, 0, 1, 0], + [0, 0, 1, 0, 1, 1], + [0, 1, 1, 0, 0, 1], + [0, 0, 1, 1, 0, 1], + [1, 0, 0, 0, 0, 1]]), + 'Maze11x11': + np.array([[0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0], + [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], + [0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0], + [1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0], + [1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0], + [0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0], + [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), + 'Tunnel': + np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0], + [0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], + [0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0], + [0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0], + [0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0], + [0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0], + [0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0], + [0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0], + [0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], + [0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], + [0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], + [0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], + [0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0], + [0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0], + [0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]]), + 'U': + np.array([[0, 0, 0], + [0, 1, 0], + [0, 1, 0], + [0, 1, 0], + [1, 1, 0], + [1, 1, 0], + [0, 1, 0], + [0, 1, 0], + [0, 1, 0], + [0, 0, 0]]), + 'Tree': + np.array([ + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1], + [1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1], + [1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 
1, 0, 1, 1, 1, 1], + [1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1], + [0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0], + [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], + [0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0], + [0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0], + [0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0], + [0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0], + [0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0], + ]), + 'UMulti': + np.array([ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], + [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], + [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0], + [0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0], + [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + ]), + 'FlyTrapSmall': + np.array([ + [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1], + [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0], + [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0], + [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0], + [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0], + [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0], + [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0], + [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0], + [1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], + ]), + 'FlyTrapBig': + np.array([ + [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0], + [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], + ]), + 'Galton': + np.array([ + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0], + [0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0], + [0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0], + ]), +} + + +ACT_DICT = { + 0: [0.,0.], + 1: [0., -1.], + 2: [0., 1.], + 3: [-1., 0.], + 4: [1., 0.], +} + +def resize_walls(walls, factor): + """Increase the environment by rescaling. + + Args: + walls: 0/1 array indicating obstacle locations. + factor: (int) factor by which to rescale the environment.""" + (height, width) = walls.shape + row_indices = np.array([i for i in range(height) for _ in range(factor)]) + col_indices = np.array([i for i in range(width) for _ in range(factor)]) + walls = walls[row_indices] + walls = walls[:, col_indices] + assert walls.shape == (factor * height, factor * width) + return walls + + + +class Pointmass(gym.Env): + """Abstract class for 2D navigation environments.""" + + def __init__(self, + difficulty=0, + dense_reward=False, + ): + """Initialize the point environment. + + Args: + walls: (str) name of one of the maps defined above. + resize_factor: (int) Scale the map by this factor. + action_noise: (float) Standard deviation of noise to add to actions. Use 0 + to add no noise. 
+ """ + import matplotlib + matplotlib.use('Agg') + import matplotlib.pyplot as plt + self.plt = plt + self.fig = self.plt.figure() + + self.action_dim = self.ac_dim = 2 + self.observation_dim = self.obs_dim = 2 + self.env_name = 'pointmass' + self.is_gym = True + + if difficulty == 0: + walls = 'Maze5x5' + resize_factor = 2 + self.fixed_start = np.array([0.5, 0.5]) * resize_factor + self.fixed_goal = np.array([4.5, 4.5]) * resize_factor + self.max_episode_steps = 50 + elif difficulty == 1: + walls = 'Maze6x6' + resize_factor = 1 + self.fixed_start = np.array([0.5, 0.5]) * resize_factor + self.fixed_goal = np.array([1.5, 5.5]) * resize_factor + self.max_episode_steps = 150 + elif difficulty == 2: + walls = 'FourRooms' + resize_factor = 2 + self.fixed_start = np.array([1.0, 1.0]) * resize_factor + self.fixed_goal = np.array([10.0, 10.0]) * resize_factor + self.max_episode_steps = 100 + elif difficulty == 3: + #NOTE TO STUDENTS: FEEL FREE TO EDIT THESE PARAMS FOR THE EXTRA CREDIT PROBLEM! + walls = 'Maze11x11' + resize_factor = 1 + self.fixed_start = np.array([0.5, 0.5]) * resize_factor + self.fixed_goal = np.array([0.5, 10.5]) * resize_factor + self.max_episode_steps = 200 + else: + print('Invalid difficulty setting') + return 1/0 + + if resize_factor > 1: + self._walls = resize_walls(WALLS[walls], resize_factor) + else: + self._walls = WALLS[walls] + (height, width) = self._walls.shape + self._apsp = self._compute_apsp(self._walls) + + self._height = height + self._width = width + self.action_space = gym.spaces.Discrete(5) + self.observation_space = gym.spaces.Box( + low=np.array([0,0]), + high=np.array([self._height, self._width]), + dtype=np.float32) + + self.dense_reward = dense_reward + self.num_actions = 5 + self.epsilon = resize_factor + self.action_noise = 0.5 + + self.obs_vec = [] + self.last_trajectory = None + self.difficulty = difficulty + + self.num_runs = 0 + self.reset() + + def seed(self, seed): + np.random.seed(seed) + + def reset(self, seed=None): + if seed: self.seed(seed) + + if len(self.obs_vec) > 0: + self.last_trajectory = self.plot_trajectory() + + self.plt.clf() + self.timesteps_left = self.max_episode_steps + + self.obs_vec = [self._normalize_obs(self.fixed_start.copy())] + self.state = self.fixed_start.copy() + self.num_runs += 1 + return self._normalize_obs(self.state.copy()) + + def set_logdir(self, path): + self.traj_filepath = path + 'last_traj.png' + + def _get_distance(self, obs, goal): + """Compute the shortest path distance. 
+ + Note: This distance is *not* used for training.""" + (i1, j1) = self._discretize_state(obs.copy()) + (i2, j2) = self._discretize_state(goal.copy()) + return self._apsp[i1, j1, i2, j2] + + def simulate_step(self, state, action): + num_substeps = 10 + dt = 1.0 / num_substeps + num_axis = len(action) + for _ in np.linspace(0, 1, num_substeps): + for axis in range(num_axis): + new_state = state.copy() + new_state[axis] += dt * action[axis] + + if not self._is_blocked(new_state): + state = new_state + return state + + def get_optimal_action(self, state): + state = self._unnormalize_obs(state) + best_action = 0 + best_dist = np.inf + for i in range(self.num_actions): + action = np.array(ACT_DICT[i]) + s_prime = self.simulate_step(state, action) + dist = self._get_distance(s_prime, self.fixed_goal) + if dist < best_dist: + best_dist = dist + best_action = i + return best_action + + def _discretize_state(self, state, resolution=1.0): + (i, j) = np.floor(resolution * state).astype(np.int) + # Round down to the nearest cell if at the boundary. + if i == self._height: + i -= 1 + if j == self._width: + j -= 1 + return (i, j) + + def _normalize_obs(self, obs): + return np.array([ + obs[0] / float(self._height), + obs[1] / float(self._width) + ]) + + def _unnormalize_obs(self, obs): + return np.array([ + obs[0] * float(self._height), + obs[1] * float(self._width) + ]) + + def _is_blocked(self, state): + if not self.observation_space.contains(state): + return True + (i, j) = self._discretize_state(state) + return (self._walls[i, j] == 1) + + def step(self, action): + self.timesteps_left -= 1 + action = np.array(ACT_DICT[action]) + action = np.random.normal(action, self.action_noise) + self.state = self.simulate_step(self.state, action) + + dist = np.linalg.norm(self.state - self.fixed_goal) + done = (dist < self.epsilon) or (self.timesteps_left == 0) + ns = self._normalize_obs(self.state.copy()) + self.obs_vec.append(ns.copy()) + + if self.dense_reward: + reward = -dist + else: + reward = int(dist < self.epsilon) - 1 + + return ns, reward, done, {} + + @property + def walls(self): + return self._walls + + @property + def goal(self): + return self._normalize_obs(self.fixed_goal.copy()) + + def _compute_apsp(self, walls): + (height, width) = walls.shape + g = nx.Graph() + # Add all the nodes + for i in range(height): + for j in range(width): + if walls[i, j] == 0: + g.add_node((i, j)) + + # Add all the edges + for i in range(height): + for j in range(width): + for di in [-1, 0, 1]: + for dj in [-1, 0, 1]: + if di == dj == 0: continue # Don't add self loops + if i + di < 0 or i + di > height - 1: continue # No cell here + if j + dj < 0 or j + dj > width - 1: continue # No cell here + if walls[i, j] == 1: continue # Don't add edges to walls + if walls[i + di, j + dj] == 1: continue # Don't add edges to walls + g.add_edge((i, j), (i + di, j + dj)) + + # dist[i, j, k, l] is path from (i, j) -> (k, l) + dist = np.full((height, width, height, width), np.float('inf')) + for ((i1, j1), dist_dict) in nx.shortest_path_length(g): + for ((i2, j2), d) in dist_dict.items(): + dist[i1, j1, i2, j2] = d + + return dist + + def render(self, mode=None): + self.plot_walls() + + # current and end + self.plt.plot(self.fixed_goal[0], self.fixed_goal[1], 'go') + self.plt.plot(self.state[0], self.state[1], 'ko') + self.plt.pause(0.1) + + img = np.frombuffer(self.fig.canvas.tostring_rgb(), dtype=np.uint8) + img = img.reshape(self.fig.canvas.get_width_height()[::-1] + (3,)) + return img + + def plot_trajectory(self): + 
self.plt.clf() + self.plot_walls() + + obs_vec, goal = np.array(self.obs_vec), self.goal + self.plt.plot(obs_vec[:, 0], obs_vec[:, 1], 'b-o', alpha=0.3) + self.plt.scatter([obs_vec[0, 0]], [obs_vec[0, 1]], marker='+', + color='red', s=200, label='start') + self.plt.scatter([obs_vec[-1, 0]], [obs_vec[-1, 1]], marker='+', + color='green', s=200, label='end') + self.plt.scatter([goal[0]], [goal[1]], marker='*', + color='green', s=200, label='goal') + self.plt.legend(loc='upper left') + self.plt.savefig(self.traj_filepath) + + def get_last_trajectory(self): + return self.last_trajectory + + def plot_walls(self, walls=None): + if walls is None: + walls = self._walls.T + (height, width) = walls.shape + for (i, j) in zip(*np.where(walls)): + x = np.array([j, j+1]) / float(width) + y0 = np.array([i, i]) / float(height) + y1 = np.array([i+1, i+1]) / float(height) + self.plt.fill_between(x, y0, y1, color='grey') + self.plt.xlim([0, 1]) + self.plt.ylim([0, 1]) + self.plt.xticks([]) + self.plt.yticks([]) + + def _sample_normalized_empty_state(self): + s = self._sample_empty_state() + return self._normalize_obs(s) + + def _sample_empty_state(self): + candidate_states = np.where(self._walls == 0) + num_candidate_states = len(candidate_states[0]) + state_index = np.random.choice(num_candidate_states) + state = np.array([candidate_states[0][state_index], + candidate_states[1][state_index]], + dtype=np.float) + state += np.random.uniform(size=2) + assert not self._is_blocked(state) + return state + +def refresh_path(): + path = dict() + path['observations'] = [] + path['actions'] = [] + path['next_observations'] = [] + path['terminals'] = [] + path['rewards'] = [] + return path + +if __name__ == '__main__': + env = Pointmass(difficulty=0, dense_reward=False) + num_samples = 50000 + total_samples = 0 + path = refresh_path() + all_paths = [] + num_positive_rewards = 0 + + while total_samples < num_samples: + path = refresh_path() + start_state = env._sample_empty_state() + bern = (np.random.rand() > 0.5) + if bern: + goal_state = env._sample_empty_state() + else: + goal_state = env.fixed_goal + + print ('Start: ', start_state, ' Goal state: ', goal_state, total_samples) + # curr_state = start_state + curr_state = env.reset(start_state) + done = False + for i in range(env.max_episode_steps): + action = env.get_optimal_action(goal_state) + temp_bern = (np.random.rand() < 0.2) + if temp_bern: + action = np.random.randint(5) + + next_state, reward, done, _ = env.step(action) + if reward >= 0: + num_positive_rewards += 1 + path['observations'].append(curr_state) + path['actions'].append(action) + path['next_observations'].append(next_state) + path['terminals'].append(done) + path['rewards'].append(reward) + + if done == True: + total_samples += i + break + + all_paths.append(path) + print ('Num Positive Rewards: ', num_positive_rewards) + + with open('buffer_debug_final' + str(env.difficulty) +'.pkl', 'wb') as f: + pickle.dump(all_paths, f) diff --git a/hw5/cs285/envs/reacher/__init__.py b/hw5/cs285/envs/reacher/__init__.py new file mode 100644 index 00000000..af31eecd --- /dev/null +++ b/hw5/cs285/envs/reacher/__init__.py @@ -0,0 +1,8 @@ +from gym.envs.registration import register + +register( + id='reacher-cs285-v0', + entry_point='cs285.envs.reacher:Reacher7DOFEnv', + max_episode_steps=500, +) +from cs285.envs.reacher.reacher_env import Reacher7DOFEnv diff --git a/hw5/cs285/envs/reacher/assets/sawyer.xml b/hw5/cs285/envs/reacher/assets/sawyer.xml new file mode 100644 index 00000000..c27ccf59 --- /dev/null +++ 
b/hw5/cs285/envs/reacher/assets/sawyer.xml @@ -0,0 +1,110 @@ + + + diff --git a/hw5/cs285/envs/reacher/reacher_env.py b/hw5/cs285/envs/reacher/reacher_env.py new file mode 100644 index 00000000..61da6823 --- /dev/null +++ b/hw5/cs285/envs/reacher/reacher_env.py @@ -0,0 +1,126 @@ +import numpy as np +from gym import utils +from gym.envs.mujoco import mujoco_env +from mujoco_py import MjViewer +import os + +class Reacher7DOFEnv(mujoco_env.MujocoEnv, utils.EzPickle): + def __init__(self): + + # placeholder + self.hand_sid = -2 + self.target_sid = -1 + + curr_dir = os.path.dirname(os.path.abspath(__file__)) + mujoco_env.MujocoEnv.__init__(self, curr_dir+'/assets/sawyer.xml', 2) + utils.EzPickle.__init__(self) + self.observation_dim = 26 + self.action_dim = 7 + + self.hand_sid = self.model.site_name2id("finger") + self.target_sid = self.model.site_name2id("target") + self.skip = self.frame_skip + + + def _get_obs(self): + return np.concatenate([ + self.data.qpos.flat, #[7] + self.data.qvel.flatten() / 10., #[7] + self.data.site_xpos[self.hand_sid], #[3] + self.model.site_pos[self.target_sid], #[3] + ]) + + def step(self, a): + + self.do_simulation(a, self.frame_skip) + ob = self._get_obs() + reward, done = self.get_reward(ob, a) + + score = self.get_score(ob) + + # finalize step + env_info = {'ob': ob, + 'rewards': self.reward_dict, + 'score': score} + + return ob, reward, done, env_info + + def get_score(self, obs): + hand_pos = obs[-6:-3] + target_pos = obs[-3:] + score = -1*np.abs(hand_pos-target_pos) + return score + + def get_reward(self, observations, actions): + + """get reward/s of given (observations, actions) datapoint or datapoints + + Args: + observations: (batchsize, obs_dim) or (obs_dim,) + actions: (batchsize, ac_dim) or (ac_dim,) + + Return: + r_total: reward of this (o,a) pair, dimension is (batchsize,1) or (1,) + done: True if env reaches terminal state, dimension is (batchsize,1) or (1,) + """ + + #initialize and reshape as needed, for batch mode + self.reward_dict = {} + if(len(observations.shape)==1): + observations = np.expand_dims(observations, axis = 0) + actions = np.expand_dims(actions, axis = 0) + batch_mode = False + else: + batch_mode = True + + #get vars + hand_pos = observations[:, -6:-3] + target_pos = observations[:, -3:] + + #calc rew + dist = np.linalg.norm(hand_pos - target_pos, axis=1) + self.reward_dict['r_total'] = -10*dist + + #done is always false for this env + dones = np.zeros((observations.shape[0],)) + + #return + if(not batch_mode): + return self.reward_dict['r_total'][0], dones[0] + return self.reward_dict['r_total'], dones + + def reset(self): + _ = self.reset_model() + + self.model.site_pos[self.target_sid] = [0.1, 0.1, 0.1] + + observation, _reward, done, _info = self.step(np.zeros(7)) + ob = self._get_obs() + + return ob + + def reset_model(self, seed=None): + if seed is not None: + self.seed(seed) + + self.reset_pose = self.init_qpos.copy() + self.reset_vel = self.init_qvel.copy() + + self.reset_goal = np.zeros(3) + self.reset_goal[0] = self.np_random.uniform(low=-0.3, high=0.3) + self.reset_goal[1] = self.np_random.uniform(low=-0.2, high=0.2) + self.reset_goal[2] = self.np_random.uniform(low=-0.25, high=0.25) + + return self.do_reset(self.reset_pose, self.reset_vel, self.reset_goal) + + def do_reset(self, reset_pose, reset_vel, reset_goal): + + self.set_state(reset_pose, reset_vel) + + #reset target + self.reset_goal = reset_goal.copy() + self.model.site_pos[self.target_sid] = self.reset_goal + self.sim.forward() + + #return + return 
self._get_obs() \ No newline at end of file diff --git a/hw5/cs285/exploration/__init__.py b/hw5/cs285/exploration/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hw5/cs285/exploration/base_exploration_model.py b/hw5/cs285/exploration/base_exploration_model.py new file mode 100644 index 00000000..3b260887 --- /dev/null +++ b/hw5/cs285/exploration/base_exploration_model.py @@ -0,0 +1,3 @@ +class BaseExplorationModel(object): + def update(self, ob_no, ac_na, next_ob_no, re_n, terminal_n): + raise NotImplementedError \ No newline at end of file diff --git a/hw5/cs285/exploration/rnd_model.py b/hw5/cs285/exploration/rnd_model.py new file mode 100644 index 00000000..74f61830 --- /dev/null +++ b/hw5/cs285/exploration/rnd_model.py @@ -0,0 +1,64 @@ +from cs285.infrastructure import pytorch_util as ptu +from .base_exploration_model import BaseExplorationModel +import torch.optim as optim +from torch import nn +import torch + +def init_method_1(model): + model.weight.data.uniform_() + model.bias.data.uniform_() + +def init_method_2(model): + model.weight.data.normal_() + model.bias.data.normal_() + + +class RNDModel(nn.Module, BaseExplorationModel): + def __init__(self, hparams, optimizer_spec, **kwargs): + super().__init__(**kwargs) + self.ob_dim = hparams['ob_dim'] + self.output_size = hparams['rnd_output_size'] + self.n_layers = hparams['rnd_n_layers'] + self.size = hparams['rnd_size'] + self.optimizer_spec = optimizer_spec + + # TODO: Create two neural networks: + # 1) f, the random function we are trying to learn + # 2) f_hat, the function we are using to learn f + # WARNING: Make sure you use different types of weight + # initializations for these two functions + + # HINT 1) Check out the method ptu.build_mlp + # HINT 2) There are two weight init methods defined above + + self.f = None + self.f_hat = None + + self.optimizer = self.optimizer_spec.constructor( + self.f_hat.parameters(), + **self.optimizer_spec.optim_kwargs + ) + self.learning_rate_scheduler = optim.lr_scheduler.LambdaLR( + self.optimizer, + self.optimizer_spec.learning_rate_schedule, + ) + + self.f.to(ptu.device) + self.f_hat.to(ptu.device) + + def forward(self, ob_no): + # TODO: Get the prediction error for ob_no + # HINT: Remember to detach the output of self.f! + error = None + return error + + def forward_np(self, ob_no): + ob_no = ptu.from_numpy(ob_no) + error = self(ob_no) + return ptu.to_numpy(error) + + def update(self, ob_no): + # TODO: Update f_hat using ob_no + # Hint: Take the mean prediction error across the batch + loss = None + return loss.item() diff --git a/hw5/cs285/infrastructure/atari_wrappers.py b/hw5/cs285/infrastructure/atari_wrappers.py new file mode 100644 index 00000000..a2885c20 --- /dev/null +++ b/hw5/cs285/infrastructure/atari_wrappers.py @@ -0,0 +1,175 @@ +import numpy as np +import gym +from gym import spaces + + +class NoopResetEnv(gym.Wrapper): + def __init__(self, env, noop_max=30): + """Sample initial states by taking random number of no-ops on reset. + No-op is assumed to be action 0. 
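The TODOs in `RNDModel` above describe random network distillation: a frozen, randomly initialized target network `f` and a trained predictor `f_hat`, with the per-state prediction error used as an exploration bonus. A self-contained sketch of that idea (not the graded solution, and with hypothetical sizes) might look like:
```
# Random network distillation sketch: train f_hat to match a frozen random f;
# states the predictor has rarely seen produce large errors, i.e. large bonuses.
import torch
from torch import nn

ob_dim, out_dim = 2, 5                       # hypothetical sizes
f = nn.Sequential(nn.Linear(ob_dim, 64), nn.Tanh(), nn.Linear(64, out_dim))
f_hat = nn.Sequential(nn.Linear(ob_dim, 64), nn.Tanh(), nn.Linear(64, out_dim))
for p in f.parameters():                     # different init for the target, then freeze it
    nn.init.normal_(p)
    p.requires_grad_(False)

opt = torch.optim.Adam(f_hat.parameters(), lr=1e-3)
ob_no = torch.rand(32, ob_dim)               # a batch of (normalized) states
error = ((f_hat(ob_no) - f(ob_no).detach()) ** 2).sum(dim=1)  # per-state bonus
loss = error.mean()                          # update f_hat on the mean error
opt.zero_grad(); loss.backward(); opt.step()
```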
+ """ + gym.Wrapper.__init__(self, env) + self.noop_max = noop_max + self.override_num_noops = None + self.noop_action = 0 + assert env.unwrapped.get_action_meanings()[0] == 'NOOP' + + def reset(self, **kwargs): + """ Do no-op action for a number of steps in [1, noop_max].""" + self.env.reset(**kwargs) + if self.override_num_noops is not None: + noops = self.override_num_noops + else: + noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 + assert noops > 0 + obs = None + for _ in range(noops): + obs, _, done, _ = self.env.step(self.noop_action) + if done: + obs = self.env.reset(**kwargs) + return obs + + def step(self, ac): + return self.env.step(ac) + + +class FireResetEnv(gym.Wrapper): + def __init__(self, env): + """Take action on reset for environments that are fixed until firing.""" + gym.Wrapper.__init__(self, env) + assert env.unwrapped.get_action_meanings()[1] == 'FIRE' + assert len(env.unwrapped.get_action_meanings()) >= 3 + + def reset(self, **kwargs): + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(1) + if done: + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(2) + if done: + self.env.reset(**kwargs) + return obs + + def step(self, ac): + return self.env.step(ac) + + +class EpisodicLifeEnv(gym.Wrapper): + def __init__(self, env): + """Make end-of-life == end-of-episode, but only reset on true game over. + Done by DeepMind for the DQN and co. since it helps value estimation. + """ + gym.Wrapper.__init__(self, env) + self.lives = 0 + self.was_real_done = True + + def step(self, action): + obs, reward, done, info = self.env.step(action) + self.was_real_done = done + # check current lives, make loss of life terminal, + # then update lives to handle bonus lives + lives = self.env.unwrapped.ale.lives() + if lives < self.lives and lives > 0: + # for Qbert sometimes we stay in lives == 0 condition for a few frames + # so it's important to keep lives > 0, so that we only reset once + # the environment advertises done. + done = True + self.lives = lives + return obs, reward, done, info + + def reset(self, **kwargs): + """Reset only when lives are exhausted. + This way all states are still reachable even though lives are episodic, + and the learner need not know about any of this behind-the-scenes. 
+ """ + if self.was_real_done: + obs = self.env.reset(**kwargs) + else: + # no-op step to advance from terminal/lost life state + obs, _, _, _ = self.env.step(0) + self.lives = self.env.unwrapped.ale.lives() + return obs + + +class MaxAndSkipEnv(gym.Wrapper): + def __init__(self, env, skip=4): + """Return only every `skip`-th frame""" + gym.Wrapper.__init__(self, env) + # most recent raw observations (for max pooling across time steps) + self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8) + self._skip = skip + + def step(self, action): + """Repeat action, sum reward, and max over last observations.""" + total_reward = 0.0 + done = None + for i in range(self._skip): + obs, reward, done, info = self.env.step(action) + if i == self._skip - 2: self._obs_buffer[0] = obs + if i == self._skip - 1: self._obs_buffer[1] = obs + total_reward += reward + if done: + break + # Note that the observation on the done=True frame + # doesn't matter + max_frame = self._obs_buffer.max(axis=0) + + return max_frame, total_reward, done, info + + def reset(self, **kwargs): + return self.env.reset(**kwargs) + + +def _process_frame84(frame): + import cv2 + img = np.reshape(frame, [210, 160, 3]).astype(np.float32) + img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 + resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_LINEAR) + x_t = resized_screen[18:102, :] + x_t = np.reshape(x_t, [84, 84, 1]) + return x_t.astype(np.uint8) + + +class ProcessFrame84(gym.Wrapper): + def __init__(self, env=None): + super(ProcessFrame84, self).__init__(env) + self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) + + def step(self, action): + obs, reward, done, info = self.env.step(action) + return _process_frame84(obs), reward, done, info + + def reset(self): + return _process_frame84(self.env.reset()) + + +class ClipRewardEnv(gym.RewardWrapper): + def __init__(self, env): + gym.RewardWrapper.__init__(self, env) + + def reward(self, reward): + """Bin reward to {+1, 0, -1} by its sign.""" + return np.sign(reward) + + +def wrap_deepmind_ram(env): + env = EpisodicLifeEnv(env) + env = NoopResetEnv(env, noop_max=30) + env = MaxAndSkipEnv(env, skip=4) + if 'FIRE' in env.unwrapped.get_action_meanings(): + env = FireResetEnv(env) + env = ClipRewardEnv(env) + return env + + +def wrap_deepmind(env): + """Configure environment for DeepMind-style Atari. 
+ """ + env = EpisodicLifeEnv(env) + env = NoopResetEnv(env, noop_max=30) + env = MaxAndSkipEnv(env, skip=4) + if 'FIRE' in env.unwrapped.get_action_meanings(): + env = FireResetEnv(env) + env = ProcessFrame84(env) + env = ClipRewardEnv(env) + return env diff --git a/hw5/cs285/infrastructure/colab_utils.py b/hw5/cs285/infrastructure/colab_utils.py new file mode 100644 index 00000000..a896be97 --- /dev/null +++ b/hw5/cs285/infrastructure/colab_utils.py @@ -0,0 +1,26 @@ +from gym.wrappers import Monitor +import glob +import io +import base64 +from IPython.display import HTML +from IPython import display as ipythondisplay + +## modified from https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t#scrollTo=TCelFzWY9MBI + +def show_video(): + mp4list = glob.glob('/content/video/*.mp4') + if len(mp4list) > 0: + mp4 = mp4list[0] + video = io.open(mp4, 'r+b').read() + encoded = base64.b64encode(video) + ipythondisplay.display(HTML(data=''''''.format(encoded.decode('ascii')))) + else: + print("Could not find video") + + +def wrap_env(env): + env = Monitor(env, '/content/video', force=True) + return env diff --git a/hw5/cs285/infrastructure/dqn_utils.py b/hw5/cs285/infrastructure/dqn_utils.py new file mode 100644 index 00000000..966c42f4 --- /dev/null +++ b/hw5/cs285/infrastructure/dqn_utils.py @@ -0,0 +1,570 @@ +"""This file includes a collection of utility functions that are useful for +implementing DQN.""" +import random +from collections import namedtuple +import pdb + +import gym +import numpy as np +from torch import nn +import torch.optim as optim + +from cs285.infrastructure.atari_wrappers import wrap_deepmind +from gym.envs.registration import register + +import torch + + +class Flatten(torch.nn.Module): + def forward(self, x): + batch_size = x.shape[0] + return x.view(batch_size, -1) + +OptimizerSpec = namedtuple( + "OptimizerSpec", + ["constructor", "optim_kwargs", "learning_rate_schedule"], +) + + +def register_custom_envs(): + from gym.envs.registration import registry + if 'LunarLander-v3' not in registry.env_specs: + register( + id='LunarLander-v3', + entry_point='cs285.envs.box2d.lunar_lander:LunarLander', + max_episode_steps=1000, + reward_threshold=200, + ) + if 'PointmassEasy-v0' not in registry.env_specs: + register( + id='PointmassEasy-v0', + entry_point='cs285.envs.pointmass.pointmass:Pointmass', + kwargs={'difficulty': 0} + ) + if 'PointmassMedium-v0' not in registry.env_specs: + register( + id='PointmassMedium-v0', + entry_point='cs285.envs.pointmass.pointmass:Pointmass', + kwargs={'difficulty': 1} + ) + if 'PointmassHard-v0' not in registry.env_specs: + register( + id='PointmassHard-v0', + entry_point='cs285.envs.pointmass.pointmass:Pointmass', + kwargs={'difficulty': 2} + ) + if 'PointmassVeryHard-v0' not in registry.env_specs: + register( + id='PointmassVeryHard-v0', + entry_point='cs285.envs.pointmass.pointmass:Pointmass', + kwargs={'difficulty': 3} + ) + + +def get_env_kwargs(env_name): + if env_name in ['MsPacman-v0', 'PongNoFrameskip-v4']: + kwargs = { + 'learning_starts': 50000, + 'target_update_freq': 10000, + 'replay_buffer_size': int(1e6), + 'num_timesteps': int(2e8), + 'q_func': create_atari_q_network, + 'learning_freq': 4, + 'grad_norm_clipping': 10, + 'input_shape': (84, 84, 4), + 'env_wrappers': wrap_deepmind, + 'frame_history_len': 4, + 'gamma': 0.99, + } + kwargs['optimizer_spec'] = atari_optimizer(kwargs['num_timesteps']) + kwargs['exploration_schedule'] = atari_exploration_schedule(kwargs['num_timesteps']) + + elif env_name == 
'LunarLander-v3': + def lunar_empty_wrapper(env): + return env + kwargs = { + 'optimizer_spec': lander_optimizer(), + 'q_func': create_lander_q_network, + 'replay_buffer_size': 50000, + 'batch_size': 32, + 'gamma': 1.00, + 'learning_starts': 1000, + 'learning_freq': 1, + 'frame_history_len': 1, + 'target_update_freq': 3000, + 'grad_norm_clipping': 10, + 'lander': True, + 'num_timesteps': 500000, + 'env_wrappers': lunar_empty_wrapper + } + kwargs['exploration_schedule'] = lander_exploration_schedule(kwargs['num_timesteps']) + + # THIS NEEDS TO BE UPDATED + elif 'Pointmass' in env_name: + def pointmass_empty_wrapper(env): + return env + kwargs = { + 'optimizer_spec': pointmass_optimizer(), + 'q_func': create_lander_q_network, + 'replay_buffer_size': int(1e5), + 'gamma': 0.95, + 'learning_freq': 1, + 'frame_history_len': 1, + 'target_update_freq': 300, + 'grad_norm_clipping': 10, + 'lander': False, + 'num_timesteps': 50000, + 'env_wrappers': pointmass_empty_wrapper + } + kwargs['exploration_schedule'] = lander_exploration_schedule(kwargs['num_timesteps']) + + else: + raise NotImplementedError + + return kwargs + +def create_lander_q_network(ob_dim, num_actions): + return nn.Sequential( + nn.Linear(ob_dim, 64), + nn.ReLU(), + nn.Linear(64, 64), + nn.ReLU(), + nn.Linear(64, num_actions), + ) + +class Ipdb(nn.Module): + def __init__(self): + super().__init__() + def forward(self, x): + import ipdb; ipdb.set_trace() + return x + + +class PreprocessAtari(nn.Module): + def forward(self, x): + # MJ: I needed to add `contiguous` here; + # might want to just add this in for students? + x = x.permute(0, 3, 1, 2).contiguous() + return x / 255. + + +def create_atari_q_network(ob_dim, num_actions): + # TODO: diivde input by 255 + return nn.Sequential( + PreprocessAtari(), + nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4), + nn.ReLU(), + nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2), + nn.ReLU(), + nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1), + nn.ReLU(), + Flatten(), + nn.Linear(3136, 512), # 3136 hard-coded based on img size + CNN layers + nn.ReLU(), + nn.Linear(512, num_actions), + ) + +def atari_exploration_schedule(num_timesteps): + return PiecewiseSchedule( + [ + (0, 1.0), + (1e6, 0.1), + (num_timesteps / 8, 0.01), + ], outside_value=0.01 + ) + + +def atari_ram_exploration_schedule(num_timesteps): + return PiecewiseSchedule( + [ + (0, 0.2), + (1e6, 0.1), + (num_timesteps / 8, 0.01), + ], outside_value=0.01 + ) + + +def atari_optimizer(num_timesteps): + lr_schedule = PiecewiseSchedule( + [ + (0, 1e-1), + (num_timesteps / 40, 1e-1), + (num_timesteps / 8, 5e-2), + ], + outside_value=5e-2, + ) + + return OptimizerSpec( + constructor=optim.Adam, + optim_kwargs=dict( + lr=1e-3, + eps=1e-4 + ), + learning_rate_schedule=lambda t: lr_schedule.value(t), + ) + +def pointmass_optimizer(): + return OptimizerSpec( + constructor=optim.Adam, + optim_kwargs=dict( + lr=1, + ), + learning_rate_schedule=lambda epoch: 1e-3, # keep init learning rate + ) + +def lander_optimizer(): + return OptimizerSpec( + constructor=optim.Adam, + optim_kwargs=dict( + lr=1, + ), + learning_rate_schedule=lambda epoch: 1e-3, # keep init learning rate + ) + + +def lander_exploration_schedule(num_timesteps): + return PiecewiseSchedule( + [ + (0, 1), + (num_timesteps * 0.1, 0.02), + ], outside_value=0.02 + ) + + +def sample_n_unique(sampling_f, n): + """Helper function. Given a function `sampling_f` that returns + comparable objects, sample n such unique objects. 
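The `lr=1` in `lander_optimizer` and `pointmass_optimizer` above can look like a typo; the reason it works is that the `OptimizerSpec`'s `learning_rate_schedule` is later wrapped in `optim.lr_scheduler.LambdaLR` (as in `rnd_model.py` above), and `LambdaLR` multiplies the base learning rate by the lambda's value, so the effective rate is 1 * 1e-3. A small check of that interaction, under that assumption:
```
# Effective learning rate check: Adam is built with lr=1, and LambdaLR scales the
# base lr by the schedule's return value, giving 1 * 1e-3 = 1e-3.
import torch
from torch import nn, optim
from cs285.infrastructure.dqn_utils import lander_optimizer

spec = lander_optimizer()
params = [nn.Parameter(torch.zeros(1))]
optimizer = spec.constructor(params, **spec.optim_kwargs)
scheduler = optim.lr_scheduler.LambdaLR(optimizer, spec.learning_rate_schedule)
print(optimizer.param_groups[0]['lr'])   # 0.001
```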
+ """ + res = [] + while len(res) < n: + candidate = sampling_f() + if candidate not in res: + res.append(candidate) + return res + + +class Schedule(object): + def value(self, t): + """Value of the schedule at time t""" + raise NotImplementedError() + + +class ConstantSchedule(object): + def __init__(self, value): + """Value remains constant over time. + Parameters + ---------- + value: float + Constant value of the schedule + """ + self._v = value + + def value(self, t): + """See Schedule.value""" + return self._v + + +def linear_interpolation(l, r, alpha): + return l + alpha * (r - l) + + +class PiecewiseSchedule(object): + def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): + """Piecewise schedule. + endpoints: [(int, int)] + list of pairs `(time, value)` meanining that schedule should output + `value` when `t==time`. All the values for time must be sorted in + an increasing order. When t is between two times, e.g. `(time_a, value_a)` + and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs + `interpolation(value_a, value_b, alpha)` where alpha is a fraction of + time passed between `time_a` and `time_b` for time `t`. + interpolation: lambda float, float, float: float + a function that takes value to the left and to the right of t according + to the `endpoints`. Alpha is the fraction of distance from left endpoint to + right endpoint that t has covered. See linear_interpolation for example. + outside_value: float + if the value is requested outside of all the intervals sepecified in + `endpoints` this value is returned. If None then AssertionError is + raised when outside value is requested. + """ + idxes = [e[0] for e in endpoints] + assert idxes == sorted(idxes) + self._interpolation = interpolation + self._outside_value = outside_value + self._endpoints = endpoints + + def value(self, t): + """See Schedule.value""" + for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): + if l_t <= t and t < r_t: + alpha = float(t - l_t) / (r_t - l_t) + return self._interpolation(l, r, alpha) + + # t does not belong to any of the pieces, so doom. + assert self._outside_value is not None + return self._outside_value + +class LinearSchedule(object): + def __init__(self, schedule_timesteps, final_p, initial_p=1.0): + """Linear interpolation between initial_p and final_p over + schedule_timesteps. After this many timesteps pass final_p is + returned. + Parameters + ---------- + schedule_timesteps: int + Number of timesteps for which to linearly anneal initial_p + to final_p + initial_p: float + initial output value + final_p: float + final output value + """ + self.schedule_timesteps = schedule_timesteps + self.final_p = final_p + self.initial_p = initial_p + + def value(self, t): + """See Schedule.value""" + fraction = min(float(t) / self.schedule_timesteps, 1.0) + return self.initial_p + fraction * (self.final_p - self.initial_p) + +def compute_exponential_averages(variables, decay): + """Given a list of tensorflow scalar variables + create ops corresponding to their exponential + averages + Parameters + ---------- + variables: [tf.Tensor] + List of scalar tensors. + Returns + ------- + averages: [tf.Tensor] + List of scalar tensors corresponding to averages + of al the `variables` (in order) + apply_op: tf.runnable + Op to be run to update the averages with current value + of variables. 
+ """ + averager = tf.train.ExponentialMovingAverage(decay=decay) + apply_op = averager.apply(variables) + return [averager.average(v) for v in variables], apply_op + +def minimize_and_clip(optimizer, objective, var_list, clip_val=10): + """Minimized `objective` using `optimizer` w.r.t. variables in + `var_list` while ensure the norm of the gradients for each + variable is clipped to `clip_val` + """ + gradients = optimizer.compute_gradients(objective, var_list=var_list) + for i, (grad, var) in enumerate(gradients): + if grad is not None: + gradients[i] = (tf.clip_by_norm(grad, clip_val), var) + return optimizer.apply_gradients(gradients) + +def initialize_interdependent_variables(session, vars_list, feed_dict): + """Initialize a list of variables one at a time, which is useful if + initialization of some variables depends on initialization of the others. + """ + vars_left = vars_list + while len(vars_left) > 0: + new_vars_left = [] + for v in vars_left: + try: + session.run(tf.variables_initializer([v]), feed_dict) + except tf.errors.FailedPreconditionError: + new_vars_left.append(v) + if len(new_vars_left) >= len(vars_left): + # This can happen if the variables all depend on each other, or more likely if there's + # another variable outside of the list, that still needs to be initialized. This could be + # detected here, but life's finite. + raise Exception("Cycle in variable dependencies, or extenrnal precondition unsatisfied.") + else: + vars_left = new_vars_left + +def get_wrapper_by_name(env, classname): + currentenv = env + while True: + if classname in currentenv.__class__.__name__: + return currentenv + elif isinstance(env, gym.Wrapper): + currentenv = currentenv.env + else: + raise ValueError("Couldn't find wrapper named %s"%classname) + +class MemoryOptimizedReplayBuffer(object): + def __init__(self, size, frame_history_len, lander=False, float_obs=False): + """This is a memory efficient implementation of the replay buffer. + + The sepecific memory optimizations use here are: + - only store each frame once rather than k times + even if every observation normally consists of k last frames + - store frames as np.uint8 (actually it is most time-performance + to cast them back to float32 on GPU to minimize memory transfer + time) + - store frame_t and frame_(t+1) in the same buffer. + + For the tipical use case in Atari Deep RL buffer with 1M frames the total + memory footprint of this buffer is 10^6 * 84 * 84 bytes ~= 7 gigabytes + + Warning! Assumes that returning frame of zeros at the beginning + of the episode, when there is less frames than `frame_history_len`, + is acceptable. + + Parameters + ---------- + size: int + Max number of transitions to store in the buffer. When the buffer + overflows the old memories are dropped. + frame_history_len: int + Number of memories to be retried for each observation. 
+ """ + self.float_obs = lander or float_obs + + self.size = size + self.frame_history_len = frame_history_len + + self.next_idx = 0 + self.num_in_buffer = 0 + + self.obs = None + self.action = None + self.reward = None + self.done = None + + def can_sample(self, batch_size): + """Returns true if `batch_size` different transitions can be sampled from the buffer.""" + return batch_size + 1 <= self.num_in_buffer + + def _encode_sample(self, idxes): + obs_batch = np.concatenate([self._encode_observation(idx)[None] for idx in idxes], 0) + act_batch = self.action[idxes] + rew_batch = self.reward[idxes] + next_obs_batch = np.concatenate([self._encode_observation(idx + 1)[None] for idx in idxes], 0) + done_mask = np.array([1.0 if self.done[idx] else 0.0 for idx in idxes], dtype=np.float32) + + return obs_batch, act_batch, rew_batch, next_obs_batch, done_mask + + + def sample(self, batch_size): + """Sample `batch_size` different transitions. + + i-th sample transition is the following: + + when observing `obs_batch[i]`, action `act_batch[i]` was taken, + after which reward `rew_batch[i]` was received and subsequent + observation next_obs_batch[i] was observed, unless the epsiode + was done which is represented by `done_mask[i]` which is equal + to 1 if episode has ended as a result of that action. + + Parameters + ---------- + batch_size: int + How many transitions to sample. + + Returns + ------- + obs_batch: np.array + Array of shape + (batch_size, img_h, img_w, img_c * frame_history_len) + and dtype np.uint8 + act_batch: np.array + Array of shape (batch_size,) and dtype np.int32 + rew_batch: np.array + Array of shape (batch_size,) and dtype np.float32 + next_obs_batch: np.array + Array of shape + (batch_size, img_h, img_w, img_c * frame_history_len) + and dtype np.uint8 + done_mask: np.array + Array of shape (batch_size,) and dtype np.float32 + """ + assert self.can_sample(batch_size) + idxes = sample_n_unique(lambda: random.randint(0, self.num_in_buffer - 2), batch_size) + return self._encode_sample(idxes) + + def encode_recent_observation(self): + """Return the most recent `frame_history_len` frames. + + Returns + ------- + observation: np.array + Array of shape (img_h, img_w, img_c * frame_history_len) + and dtype np.uint8, where observation[:, :, i*img_c:(i+1)*img_c] + encodes frame at time `t - frame_history_len + i` + """ + assert self.num_in_buffer > 0 + return self._encode_observation((self.next_idx - 1) % self.size) + + def _encode_observation(self, idx): + end_idx = idx + 1 # make noninclusive + start_idx = end_idx - self.frame_history_len + # this checks if we are using low-dimensional observations, such as RAM + # state, in which case we just directly return the latest RAM. 
+ if len(self.obs.shape) == 2: + return self.obs[end_idx-1] + # if there weren't enough frames ever in the buffer for context + if start_idx < 0 and self.num_in_buffer != self.size: + start_idx = 0 + for idx in range(start_idx, end_idx - 1): + if self.done[idx % self.size]: + start_idx = idx + 1 + missing_context = self.frame_history_len - (end_idx - start_idx) + # if zero padding is needed for missing context + # or we are on the boundry of the buffer + if start_idx < 0 or missing_context > 0: + frames = [np.zeros_like(self.obs[0]) for _ in range(missing_context)] + for idx in range(start_idx, end_idx): + frames.append(self.obs[idx % self.size]) + return np.concatenate(frames, 2) + else: + # this optimization has potential to saves about 30% compute time \o/ + img_h, img_w = self.obs.shape[1], self.obs.shape[2] + return self.obs[start_idx:end_idx].transpose(1, 2, 0, 3).reshape(img_h, img_w, -1) + + def store_frame(self, frame): + """Store a single frame in the buffer at the next available index, overwriting + old frames if necessary. + + Parameters + ---------- + frame: np.array + Array of shape (img_h, img_w, img_c) and dtype np.uint8 + the frame to be stored + + Returns + ------- + idx: int + Index at which the frame is stored. To be used for `store_effect` later. + """ + if self.obs is None: + self.obs = np.empty([self.size] + list(frame.shape), dtype=np.float32 if self.float_obs else np.uint8) + self.action = np.empty([self.size], dtype=np.int32) + self.reward = np.empty([self.size], dtype=np.float32) + self.done = np.empty([self.size], dtype=np.bool) + self.obs[self.next_idx] = frame + + ret = self.next_idx + self.next_idx = (self.next_idx + 1) % self.size + self.num_in_buffer = min(self.size, self.num_in_buffer + 1) + + return ret + + def store_effect(self, idx, action, reward, done): + """Store effects of action taken after obeserving frame stored + at index idx. The reason `store_frame` and `store_effect` is broken + up into two functions is so that once can call `encode_recent_observation` + in between. + + Paramters + --------- + idx: int + Index in buffer of recently observed frame (returned by `store_frame`). + action: int + Action that was performed upon observing this frame. + reward: float + Reward that was received when the actions was performed. + done: bool + True if episode was finished after performing that action. 
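The docstrings above describe the intended call pattern for the buffer: `store_frame` first, `encode_recent_observation` to build the stacked input for the policy, then `store_effect` once the action's outcome is known, and `sample` for training batches. A minimal interaction sketch, assuming `env` is an Atari environment already wrapped by `wrap_deepmind` (observations of shape (84, 84, 1), uint8):
```
# Minimal interaction sketch for MemoryOptimizedReplayBuffer (env is assumed wrapped).
from cs285.infrastructure.dqn_utils import MemoryOptimizedReplayBuffer

buffer = MemoryOptimizedReplayBuffer(size=100000, frame_history_len=4)
obs = env.reset()
for _ in range(1000):
    idx = buffer.store_frame(obs)                    # 1) store the raw frame
    stacked = buffer.encode_recent_observation()     # 2) (84, 84, 4) history for the policy
    action = env.action_space.sample()               #    placeholder policy
    obs, reward, done, _ = env.step(action)
    buffer.store_effect(idx, action, reward, done)   # 3) record the transition's outcome
    if done:
        obs = env.reset()

if buffer.can_sample(32):
    obs_b, act_b, rew_b, next_obs_b, done_mask = buffer.sample(32)
```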
+ """ + self.action[idx] = action + self.reward[idx] = reward + self.done[idx] = done diff --git a/hw5/cs285/infrastructure/logger.py b/hw5/cs285/infrastructure/logger.py new file mode 100644 index 00000000..a64931c0 --- /dev/null +++ b/hw5/cs285/infrastructure/logger.py @@ -0,0 +1,74 @@ +import os +from tensorboardX import SummaryWriter +import numpy as np + +class Logger: + def __init__(self, log_dir, n_logged_samples=10, summary_writer=None): + self._log_dir = log_dir + print('########################') + print('logging outputs to ', log_dir) + print('########################') + self._n_logged_samples = n_logged_samples + self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1) + + def log_scalar(self, scalar, name, step_): + self._summ_writer.add_scalar('{}'.format(name), scalar, step_) + + def log_scalars(self, scalar_dict, group_name, step, phase): + """Will log all scalars in the same plot.""" + self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step) + + def log_image(self, image, name, step): + assert(len(image.shape) == 3) # [C, H, W] + self._summ_writer.add_image('{}'.format(name), image, step) + + def log_video(self, video_frames, name, step, fps=10): + assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!" + self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps) + + def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'): + + # reshape the rollouts + videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] + + # max rollout length + max_videos_to_save = np.min([max_videos_to_save, len(videos)]) + max_length = videos[0].shape[0] + for i in range(max_videos_to_save): + if videos[i].shape[0]>max_length: + max_length = videos[i].shape[0] + + # pad rollouts to all be same length + for i in range(max_videos_to_save): + if videos[i].shape[0] 0, "Figure logging requires input shape [batch x figures]!" 
+ self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) + + def log_figure(self, figure, name, step, phase): + """figure: matplotlib.pyplot figure handle""" + self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) + + def log_graph(self, array, name, step, phase): + """figure: matplotlib.pyplot figure handle""" + im = plot_graph(array) + self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) + + def dump_scalars(self, log_path=None): + log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path + self._summ_writer.export_scalars_to_json(log_path) + + def flush(self): + self._summ_writer.flush() + + + + diff --git a/hw5/cs285/infrastructure/pytorch_util.py b/hw5/cs285/infrastructure/pytorch_util.py new file mode 100644 index 00000000..d33f32e4 --- /dev/null +++ b/hw5/cs285/infrastructure/pytorch_util.py @@ -0,0 +1,89 @@ +from typing import Union + +import torch +from torch import nn + +Activation = Union[str, nn.Module] + + +_str_to_activation = { + 'relu': nn.ReLU(), + 'tanh': nn.Tanh(), + 'leaky_relu': nn.LeakyReLU(), + 'sigmoid': nn.Sigmoid(), + 'selu': nn.SELU(), + 'softplus': nn.Softplus(), + 'identity': nn.Identity(), +} + + +def build_mlp( + input_size: int, + output_size: int, + n_layers: int, + size: int, + activation: Activation = 'tanh', + output_activation: Activation = 'identity', + init_method=None, +): + """ + Builds a feedforward neural network + arguments: + input_placeholder: placeholder variable for the state (batch_size, input_size) + scope: variable scope of the network + n_layers: number of hidden layers + size: dimension of each hidden layer + activation: activation of each hidden layer + input_size: size of the input layer + output_size: size of the output layer + output_activation: activation of the output layer + returns: + output_placeholder: the result of a forward pass through the hidden layers + the output layer + """ + if isinstance(activation, str): + activation = _str_to_activation[activation] + if isinstance(output_activation, str): + output_activation = _str_to_activation[output_activation] + layers = [] + in_size = input_size + for _ in range(n_layers): + curr_layer = nn.Linear(in_size, size) + if init_method is not None: + curr_layer.apply(init_method) + layers.append(curr_layer) + layers.append(activation) + in_size = size + + last_layer = nn.Linear(in_size, output_size) + if init_method is not None: + last_layer.apply(init_method) + + layers.append(last_layer) + layers.append(output_activation) + + return nn.Sequential(*layers) + + +device = None + + +def init_gpu(use_gpu=True, gpu_id=0): + global device + if torch.cuda.is_available() and use_gpu: + device = torch.device("cuda:" + str(gpu_id)) + print("Using GPU id {}".format(gpu_id)) + else: + device = torch.device("cpu") + print("GPU not detected. 
Defaulting to CPU.") + + +def set_device(gpu_id): + torch.cuda.set_device(gpu_id) + + +def from_numpy(*args, **kwargs): + return torch.from_numpy(*args, **kwargs).float().to(device) + + +def to_numpy(tensor): + return tensor.to('cpu').detach().numpy() diff --git a/hw5/cs285/infrastructure/replay_buffer.py b/hw5/cs285/infrastructure/replay_buffer.py new file mode 100644 index 00000000..d3cf2afe --- /dev/null +++ b/hw5/cs285/infrastructure/replay_buffer.py @@ -0,0 +1,106 @@ +from cs285.infrastructure.utils import * + + +class ReplayBuffer(object): + + def __init__(self, max_size=1000000): + + self.max_size = max_size + self.paths = [] + self.obs = None + self.acs = None + self.concatenated_rews = None + self.unconcatenated_rews = None + self.next_obs = None + self.terminals = None + + def add_rollouts(self, paths, noised=False): + + # add new rollouts into our list of rollouts + for path in paths: + tpath = dict() + # print (path.keys()) + tpath['observation'] = path['observations'] + tpath['next_observation'] = path['next_observations'] + tpath['reward'] = path['rewards'] + tpath['action'] = path['actions'] + tpath['terminal'] = path['terminals'] + self.paths.append(tpath) + + # convert new rollouts into their component arrays, and append them onto our arrays + observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(self.paths) + + if noised: + observations = add_noise(observations) + next_observations = add_noise(next_observations) + + if self.obs is None: + self.obs = observations[-self.max_size:] + self.acs = actions[-self.max_size:] + self.next_obs = next_observations[-self.max_size:] + self.terminals = terminals[-self.max_size:] + self.concatenated_rews = concatenated_rews[-self.max_size:] + self.unconcatenated_rews = unconcatenated_rews[-self.max_size:] + else: + self.obs = np.concatenate([self.obs, observations])[-self.max_size:] + self.acs = np.concatenate([self.acs, actions])[-self.max_size:] + self.next_obs = np.concatenate( + [self.next_obs, next_observations] + )[-self.max_size:] + self.terminals = np.concatenate( + [self.terminals, terminals] + )[-self.max_size:] + self.concatenated_rews = np.concatenate( + [self.concatenated_rews, concatenated_rews] + )[-self.max_size:] + if isinstance(unconcatenated_rews, list): + self.unconcatenated_rews += unconcatenated_rews # TODO keep only latest max_size around + else: + self.unconcatenated_rews.append(unconcatenated_rews) # TODO keep only latest max_size around + + print (self.terminals.sum()) + ######################################## + ######################################## + + def sample_random_rollouts(self, num_rollouts): + rand_indices = np.random.permutation(len(self.paths))[:num_rollouts] + return self.paths[rand_indices] + + def sample_recent_rollouts(self, num_rollouts=1): + return self.paths[-num_rollouts:] + + def can_sample(self, batch_size): + # print (self.obs.shape[0]) + if self.obs.shape[0] > batch_size: + return True + else: + return False + + ######################################## + ######################################## + + def sample_random_data(self, batch_size): + + assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0] + rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size] + return self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], self.terminals[rand_indices] + + def sample(self, 
batch_size): + return self.sample_random_data(batch_size) + + def sample_recent_data(self, batch_size=1, concat_rew=True): + + if concat_rew: + return self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:] + else: + num_recent_rollouts_to_return = 0 + num_datapoints_so_far = 0 + index = -1 + while num_datapoints_so_far < batch_size: + recent_rollout = self.paths[index] + index -=1 + num_recent_rollouts_to_return +=1 + num_datapoints_so_far += get_pathlength(recent_rollout) + rollouts_to_return = self.paths[-num_recent_rollouts_to_return:] + observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return) + return observations, actions, unconcatenated_rews, next_observations, terminals diff --git a/hw5/cs285/infrastructure/rl_trainer.py b/hw5/cs285/infrastructure/rl_trainer.py new file mode 100644 index 00000000..64c5442e --- /dev/null +++ b/hw5/cs285/infrastructure/rl_trainer.py @@ -0,0 +1,401 @@ +from collections import OrderedDict +import pickle +import os +import sys +import time +import pdb + +import gym +from gym import wrappers +import numpy as np +import torch +from cs285.infrastructure import pytorch_util as ptu + +from cs285.infrastructure import utils +from cs285.infrastructure.logger import Logger + +from cs285.agents.explore_or_exploit_agent import ExplorationOrExploitationAgent +from cs285.infrastructure.dqn_utils import ( + get_wrapper_by_name, + register_custom_envs, +) + +#register all of our envs +import cs285.envs + +# how many rollouts to save as videos to tensorboard +MAX_NVIDEO = 2 +MAX_VIDEO_LEN = 40 # we overwrite this in the code below + + +class RL_Trainer(object): + + def __init__(self, params): + + ############# + ## INIT + ############# + + # Get params, create logger + self.params = params + self.logger = Logger(self.params['logdir']) + + # Set random seeds + seed = self.params['seed'] + np.random.seed(seed) + torch.manual_seed(seed) + ptu.init_gpu( + use_gpu=not self.params['no_gpu'], + gpu_id=self.params['which_gpu'] + ) + + ############# + ## ENV + ############# + + # Make the gym environment + register_custom_envs() + self.env = gym.make(self.params['env_name']) + self.eval_env = gym.make(self.params['env_name']) + if not ('pointmass' in self.params['env_name']): + import matplotlib + matplotlib.use('Agg') + self.env.set_logdir(self.params['logdir'] + '/expl_') + self.eval_env.set_logdir(self.params['logdir'] + '/eval_') + + if 'env_wrappers' in self.params: + # These operations are currently only for Atari envs + self.env = wrappers.Monitor(self.env, os.path.join(self.params['logdir'], "gym"), force=True) + self.eval_env = wrappers.Monitor(self.eval_env, os.path.join(self.params['logdir'], "gym"), force=True) + self.env = params['env_wrappers'](self.env) + self.eval_env = params['env_wrappers'](self.eval_env) + self.mean_episode_reward = -float('nan') + self.best_mean_episode_reward = -float('inf') + if 'non_atari_colab_env' in self.params and self.params['video_log_freq'] > 0: + self.env = wrappers.Monitor(self.env, os.path.join(self.params['logdir'], "gym"), write_upon_reset=True)#, force=True) + self.eval_env = wrappers.Monitor(self.eval_env, os.path.join(self.params['logdir'], "gym"), write_upon_reset=True) + self.mean_episode_reward = -float('nan') + self.best_mean_episode_reward = -float('inf') + self.env.seed(seed) + self.eval_env.seed(seed) + + # Maximum length for episodes + 
self.params['ep_len'] = self.params['ep_len'] or self.env.spec.max_episode_steps + global MAX_VIDEO_LEN + MAX_VIDEO_LEN = self.params['ep_len'] + + # Is this env continuous, or self.discrete? + discrete = isinstance(self.env.action_space, gym.spaces.Discrete) + # Are the observations images? + img = len(self.env.observation_space.shape) > 2 + + self.params['agent_params']['discrete'] = discrete + + # Observation and action sizes + + ob_dim = self.env.observation_space.shape if img else self.env.observation_space.shape[0] + ac_dim = self.env.action_space.n if discrete else self.env.action_space.shape[0] + self.params['agent_params']['ac_dim'] = ac_dim + self.params['agent_params']['ob_dim'] = ob_dim + + # simulation timestep, will be used for video saving + if 'model' in dir(self.env): + self.fps = 1/self.env.model.opt.timestep + elif 'env_wrappers' in self.params: + self.fps = 30 # This is not actually used when using the Monitor wrapper + elif 'video.frames_per_second' in self.env.env.metadata.keys(): + self.fps = self.env.env.metadata['video.frames_per_second'] + else: + self.fps = 10 + + + ############# + ## AGENT + ############# + + agent_class = self.params['agent_class'] + self.agent = agent_class(self.env, self.params['agent_params']) + + def run_training_loop(self, n_iter, collect_policy, eval_policy, + buffer_name=None, + initial_expertdata=None, relabel_with_expert=False, + start_relabel_with_expert=1, expert_policy=None): + """ + :param n_iter: number of (dagger) iterations + :param collect_policy: + :param eval_policy: + :param initial_expertdata: + :param relabel_with_expert: whether to perform dagger + :param start_relabel_with_expert: iteration at which to start relabel with expert + :param expert_policy: + """ + + # init vars at beginning of training + self.total_envsteps = 0 + self.start_time = time.time() + + print_period = 1000 if isinstance(self.agent, ExplorationOrExploitationAgent) else 1 + + for itr in range(n_iter): + if itr % print_period == 0: + print("\n\n********** Iteration %i ************"%itr) + + # decide if videos should be rendered/logged at this iteration + if itr % self.params['video_log_freq'] == 0 and self.params['video_log_freq'] != -1: + self.logvideo = True + else: + self.logvideo = False + + # decide if metrics should be logged + if self.params['scalar_log_freq'] == -1: + self.logmetrics = False + elif itr % self.params['scalar_log_freq'] == 0: + self.logmetrics = True + else: + self.logmetrics = False + + # collect trajectories, to be used for training + if isinstance(self.agent, ExplorationOrExploitationAgent): + self.agent.step_env() + envsteps_this_batch = 1 + train_video_paths = None + paths = None + else: + use_batchsize = self.params['batch_size'] + if itr==0: + use_batchsize = self.params['batch_size_initial'] + paths, envsteps_this_batch, train_video_paths = ( + self.collect_training_trajectories( + itr, initial_expertdata, collect_policy, use_batchsize) + ) + + + if (not self.agent.offline_exploitation) or (self.agent.t <= self.agent.num_exploration_steps): + self.total_envsteps += envsteps_this_batch + + # relabel the collected obs with actions from a provided expert policy + if relabel_with_expert and itr>=start_relabel_with_expert: + paths = self.do_relabel_with_expert(expert_policy, paths) + + # add collected data to replay buffer + if isinstance(self.agent, ExplorationOrExploitationAgent): + if (not self.agent.offline_exploitation) or (self.agent.t <= self.agent.num_exploration_steps): + self.agent.add_to_replay_buffer(paths) + + # 
train agent (using sampled data from replay buffer) + if itr % print_period == 0: + print("\nTraining agent...") + all_logs = self.train_agent() + + # Log densities and output trajectories + if isinstance(self.agent, ExplorationOrExploitationAgent) and (itr % print_period == 0): + self.dump_density_graphs(itr) + + # log/save + if self.logvideo or self.logmetrics: + # perform logging + print('\nBeginning logging procedure...') + if isinstance(self.agent, ExplorationOrExploitationAgent): + self.perform_dqn_logging(all_logs) + else: + self.perform_logging(itr, paths, eval_policy, train_video_paths, all_logs) + + if self.params['save_params']: + self.agent.save('{}/agent_itr_{}.pt'.format(self.params['logdir'], itr)) + + #################################### + #################################### + + def collect_training_trajectories(self, itr, initial_expertdata, collect_policy, num_transitions_to_sample, save_expert_data_to_disk=False): + """ + :param itr: + :param load_initial_expertdata: path to expert data pkl file + :param collect_policy: the current policy using which we collect data + :param num_transitions_to_sample: the number of transitions we collect + :return: + paths: a list trajectories + envsteps_this_batch: the sum over the numbers of environment steps in paths + train_video_paths: paths which also contain videos for visualization purposes + """ + raise NotImplementedError + # TODO: get this from hw1 or hw2 + + #################################### + #################################### + + def train_agent(self): + # TODO: get this from Piazza + all_logs = [] + for train_step in range(self.params['num_agent_train_steps_per_iter']): + ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch = self.agent.sample(self.params['train_batch_size']) + # import ipdb; ipdb.set_trace() + train_log = self.agent.train(ob_batch, ac_batch, re_batch, next_ob_batch, terminal_batch) + all_logs.append(train_log) + return all_logs + + #################################### + #################################### + + def do_relabel_with_expert(self, expert_policy, paths): + raise NotImplementedError + # get this from hw1 or hw2 or ignore it b/c it's not used for this hw + + #################################### + #################################### + + def perform_dqn_logging(self, all_logs): + last_log = all_logs[-1] + + episode_rewards = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards() + if len(episode_rewards) > 0: + self.mean_episode_reward = np.mean(episode_rewards[-100:]) + if len(episode_rewards) > 100: + self.best_mean_episode_reward = max(self.best_mean_episode_reward, self.mean_episode_reward) + + logs = OrderedDict() + + logs["Train_EnvstepsSoFar"] = self.agent.t + print("Timestep %d" % (self.agent.t,)) + if self.mean_episode_reward > -5000: + logs["Train_AverageReturn"] = np.mean(self.mean_episode_reward) + print("mean reward (100 episodes) %f" % self.mean_episode_reward) + if self.best_mean_episode_reward > -5000: + logs["Train_BestReturn"] = np.mean(self.best_mean_episode_reward) + print("best mean reward %f" % self.best_mean_episode_reward) + + if self.start_time is not None: + time_since_start = (time.time() - self.start_time) + print("running time %f" % time_since_start) + logs["TimeSinceStart"] = time_since_start + + logs.update(last_log) + + eval_paths, eval_envsteps_this_batch = utils.sample_trajectories(self.eval_env, self.agent.eval_policy, self.params['eval_batch_size'], self.params['ep_len']) + + eval_returns = [eval_path["reward"].sum() for eval_path in 
eval_paths] + eval_ep_lens = [len(eval_path["reward"]) for eval_path in eval_paths] + + logs["Eval_AverageReturn"] = np.mean(eval_returns) + logs["Eval_StdReturn"] = np.std(eval_returns) + logs["Eval_MaxReturn"] = np.max(eval_returns) + logs["Eval_MinReturn"] = np.min(eval_returns) + logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens) + + logs['Buffer size'] = self.agent.replay_buffer.num_in_buffer + + sys.stdout.flush() + + for key, value in logs.items(): + print('{} : {}'.format(key, value)) + self.logger.log_scalar(value, key, self.agent.t) + print('Done logging...\n\n') + + self.logger.flush() + + def perform_logging(self, itr, paths, eval_policy, train_video_paths, all_logs): + + last_log = all_logs[-1] + + ####################### + + # collect eval trajectories, for logging + print("\nCollecting data for eval...") + eval_paths, eval_envsteps_this_batch = utils.sample_trajectories(self.env, eval_policy, self.params['eval_batch_size'], self.params['ep_len']) + + # save eval rollouts as videos in tensorboard event file + if self.logvideo and train_video_paths != None: + print('\nCollecting video rollouts eval') + eval_video_paths = utils.sample_n_trajectories(self.env, eval_policy, MAX_NVIDEO, MAX_VIDEO_LEN, True) + + #save train/eval videos + print('\nSaving train rollouts as videos...') + self.logger.log_paths_as_videos(train_video_paths, itr, fps=self.fps, max_videos_to_save=MAX_NVIDEO, + video_title='train_rollouts') + self.logger.log_paths_as_videos(eval_video_paths, itr, fps=self.fps,max_videos_to_save=MAX_NVIDEO, + video_title='eval_rollouts') + + ####################### + + # save eval metrics + if self.logmetrics: + # returns, for logging + train_returns = [path["reward"].sum() for path in paths] + eval_returns = [eval_path["reward"].sum() for eval_path in eval_paths] + + # episode lengths, for logging + train_ep_lens = [len(path["reward"]) for path in paths] + eval_ep_lens = [len(eval_path["reward"]) for eval_path in eval_paths] + + # decide what to log + logs = OrderedDict() + logs["Eval_AverageReturn"] = np.mean(eval_returns) + logs["Eval_StdReturn"] = np.std(eval_returns) + logs["Eval_MaxReturn"] = np.max(eval_returns) + logs["Eval_MinReturn"] = np.min(eval_returns) + logs["Eval_AverageEpLen"] = np.mean(eval_ep_lens) + + logs["Train_AverageReturn"] = np.mean(train_returns) + logs["Train_StdReturn"] = np.std(train_returns) + logs["Train_MaxReturn"] = np.max(train_returns) + logs["Train_MinReturn"] = np.min(train_returns) + logs["Train_AverageEpLen"] = np.mean(train_ep_lens) + + logs["Train_EnvstepsSoFar"] = self.total_envsteps + logs["TimeSinceStart"] = time.time() - self.start_time + logs.update(last_log) + + if itr == 0: + self.initial_return = np.mean(train_returns) + logs["Initial_DataCollection_AverageReturn"] = self.initial_return + + # perform the logging + for key, value in logs.items(): + print('{} : {}'.format(key, value)) + try: + self.logger.log_scalar(value, key, itr) + except: + pdb.set_trace() + print('Done logging...\n\n') + + self.logger.flush() + + def dump_density_graphs(self, itr): + import matplotlib.pyplot as plt + self.fig = plt.figure() + filepath = lambda name: self.params['logdir']+'/curr_{}.png'.format(name) + + num_states = self.agent.replay_buffer.num_in_buffer - 2 + states = self.agent.replay_buffer.obs[:num_states] + if num_states <= 0: return + + H, xedges, yedges = np.histogram2d(states[:,0], states[:,1], range=[[0., 1.], [0., 1.]], density=True) + plt.imshow(np.rot90(H), interpolation='bicubic') + plt.colorbar() + plt.title('State Density') + 
self.fig.savefig(filepath('state_density'), bbox_inches='tight') + + plt.clf() + ii, jj = np.meshgrid(np.linspace(0, 1), np.linspace(0, 1)) + obs = np.stack([ii.flatten(), jj.flatten()], axis=1) + density = self.agent.exploration_model.forward_np(obs) + density = density.reshape(ii.shape) + plt.imshow(density[::-1]) + plt.colorbar() + plt.title('RND Value') + self.fig.savefig(filepath('rnd_value'), bbox_inches='tight') + + plt.clf() + exploitation_values = self.agent.exploitation_critic.qa_values(obs).mean(-1) + exploitation_values = exploitation_values.reshape(ii.shape) + plt.imshow(exploitation_values[::-1]) + plt.colorbar() + plt.title('Predicted Exploitation Value') + self.fig.savefig(filepath('exploitation_value'), bbox_inches='tight') + + plt.clf() + exploration_values = self.agent.exploration_critic.qa_values(obs).mean(-1) + exploration_values = exploration_values.reshape(ii.shape) + plt.imshow(exploration_values[::-1]) + plt.colorbar() + plt.title('Predicted Exploration Value') + self.fig.savefig(filepath('exploration_value'), bbox_inches='tight') diff --git a/hw5/cs285/infrastructure/utils.py b/hw5/cs285/infrastructure/utils.py new file mode 100644 index 00000000..d4a6adad --- /dev/null +++ b/hw5/cs285/infrastructure/utils.py @@ -0,0 +1,145 @@ +import numpy as np +import time +import copy + +############################################ +############################################ + +def calculate_mean_prediction_error(env, action_sequence, models, data_statistics): + + model = models[0] + + # true + true_states = perform_actions(env, action_sequence)['observation'] + + # predicted + ob = np.expand_dims(true_states[0],0) + pred_states = [] + for ac in action_sequence: + pred_states.append(ob) + action = np.expand_dims(ac,0) + ob = model.get_prediction(ob, action, data_statistics) + pred_states = np.squeeze(pred_states) + + # mpe + mpe = mean_squared_error(pred_states, true_states) + + return mpe, true_states, pred_states + +def perform_actions(env, actions): + ob = env.reset() + obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], [] + steps = 0 + for ac in actions: + obs.append(ob) + acs.append(ac) + ob, rew, done, _ = env.step(ac) + # add the observation after taking a step to next_obs + next_obs.append(ob) + rewards.append(rew) + steps += 1 + # If the episode ended, the corresponding terminal value is 1 + # otherwise, it is 0 + if done: + terminals.append(1) + break + else: + terminals.append(0) + + return Path(obs, image_obs, acs, rewards, next_obs, terminals) + +def mean_squared_error(a, b): + return np.mean((a-b)**2) + +############################################ +############################################ + +def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')): + raise NotImplementedError + # TODO: get this from hw1 or hw2 + # IMPORTANT CHANGE: Comment out the line: ac = ac[0], as Argmax Policy already returns a scalar + + #################################### + #################################### + +def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')): + """ + Collect rollouts using policy + until we have collected min_timesteps_per_batch steps + """ + raise NotImplementedError + # TODO: get this from hw1 or hw2 + + #################################### + #################################### + +def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')): + """ + Collect ntraj rollouts using policy + """ 
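The three `TODO: get this from hw1 or hw2` stubs above share the same rollout loop. A hedged sketch of what `sample_trajectory` typically looks like in this codebase is below; your own hw1/hw2 version is the source of truth, rendering and image logging are omitted, and note the comment about not indexing `ac[0]` when the policy is an `ArgMaxPolicy`:
```
# Sketch of the rollout helper the TODOs above refer to; Path is the helper defined below.
def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')):
    ob = env.reset()
    obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], []
    steps = 0
    while True:
        obs.append(ob)
        ac = policy.get_action(ob)   # note: no `ac = ac[0]` -- ArgMaxPolicy returns a scalar
        acs.append(ac)
        ob, rew, done, _ = env.step(ac)
        steps += 1
        next_obs.append(ob)
        rewards.append(rew)
        rollout_done = 1 if (done or steps >= max_path_length) else 0
        terminals.append(rollout_done)
        if rollout_done:
            break
    return Path(obs, image_obs, acs, rewards, next_obs, terminals)
```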
+ raise NotImplementedError + # TODO: get this from hw1 or hw2 + + #################################### + #################################### + +def Path(obs, image_obs, acs, rewards, next_obs, terminals): + """ + Take info (separate arrays) from a single rollout + and return it in a single dictionary + """ + if image_obs != []: + image_obs = np.stack(image_obs, axis=0) + return {"observation" : np.array(obs, dtype=np.float32), + "image_obs" : np.array(image_obs, dtype=np.uint8), + "reward" : np.array(rewards, dtype=np.float32), + "action" : np.array(acs, dtype=np.float32), + "next_observation": np.array(next_obs, dtype=np.float32), + "terminal": np.array(terminals, dtype=np.float32)} + + +def convert_listofrollouts(paths): + """ + Take a list of rollout dictionaries + and return separate arrays, + where each array is a concatenation of that array from across the rollouts + """ + observations = np.concatenate([path["observation"] for path in paths]) + actions = np.concatenate([path["action"] for path in paths]) + next_observations = np.concatenate([path["next_observation"] for path in paths]) + terminals = np.concatenate([path["terminal"] for path in paths]) + concatenated_rewards = np.concatenate([path["reward"] for path in paths]) + unconcatenated_rewards = [path["reward"] for path in paths] + return observations, actions, next_observations, terminals, concatenated_rewards, unconcatenated_rewards + +############################################ +############################################ + +def get_pathlength(path): + return len(path["reward"]) + +def normalize(data, mean, std, eps=1e-8): + return (data-mean)/(std+eps) + +def unnormalize(data, mean, std): + return data*std+mean + +def add_noise(data_inp, noiseToSignal=0.01): + + data = copy.deepcopy(data_inp) #(num data points, dim) + + #mean of data + mean_data = np.mean(data, axis=0) + + #if mean is 0, + #make it 0.001 to avoid 0 issues later for dividing by std + mean_data[mean_data == 0] = 0.000001 + + #width of normal distribution to sample noise from + #larger magnitude number = could have larger magnitude noise + std_of_noise = mean_data * noiseToSignal + for j in range(mean_data.shape[0]): + data[:, j] = np.copy(data[:, j] + np.random.normal( + 0, np.absolute(std_of_noise[j]), (data.shape[0],))) + + return data diff --git a/hw5/cs285/policies/MLP_policy.py b/hw5/cs285/policies/MLP_policy.py new file mode 100644 index 00000000..de02b856 --- /dev/null +++ b/hw5/cs285/policies/MLP_policy.py @@ -0,0 +1,126 @@ +import abc +import itertools +from torch import nn +from torch.nn import functional as F +from torch import optim + +import numpy as np +import torch +from torch import distributions + +from cs285.infrastructure import pytorch_util as ptu +from cs285.policies.base_policy import BasePolicy + + +class MLPPolicy(BasePolicy, nn.Module, metaclass=abc.ABCMeta): + + def __init__(self, + ac_dim, + ob_dim, + n_layers, + size, + discrete=False, + learning_rate=1e-4, + training=True, + nn_baseline=False, + **kwargs + ): + super().__init__(**kwargs) + + # init vars + self.ac_dim = ac_dim + self.ob_dim = ob_dim + self.n_layers = n_layers + self.discrete = discrete + self.size = size + self.learning_rate = learning_rate + self.training = training + self.nn_baseline = nn_baseline + + if self.discrete: + self.logits_na = ptu.build_mlp(input_size=self.ob_dim, + output_size=self.ac_dim, + n_layers=self.n_layers, + size=self.size) + self.logits_na.to(ptu.device) + self.mean_net = None + self.logstd = None + self.optimizer = 
optim.Adam(self.logits_na.parameters(), + self.learning_rate) + else: + self.logits_na = None + self.mean_net = ptu.build_mlp(input_size=self.ob_dim, + output_size=self.ac_dim, + n_layers=self.n_layers, size=self.size) + self.logstd = nn.Parameter( + torch.zeros(self.ac_dim, dtype=torch.float32, device=ptu.device) + ) + self.mean_net.to(ptu.device) + self.logstd.to(ptu.device) + self.optimizer = optim.Adam( + itertools.chain([self.logstd], self.mean_net.parameters()), + self.learning_rate + ) + + if nn_baseline: + self.baseline = ptu.build_mlp( + input_size=self.ob_dim, + output_size=1, + n_layers=self.n_layers, + size=self.size, + ) + self.baseline.to(ptu.device) + self.baseline_optimizer = optim.Adam( + self.baseline.parameters(), + self.learning_rate, + ) + else: + self.baseline = None + + ################################## + + def save(self, filepath): + torch.save(self.state_dict(), filepath) + + ################################## + + # query the policy with observation(s) to get selected action(s) + def get_action(self, obs: np.ndarray) -> np.ndarray: + raise NotImplementedError + # TODO: get this from hw1 + + #################################### + #################################### + + # update/train this policy + def update(self, observations, actions, **kwargs): + raise NotImplementedError + + # This function defines the forward pass of the network. + # You can return anything you want, but you should be able to differentiate + # through it. For example, you can return a torch.FloatTensor. You can also + # return more flexible objects, such as a + # `torch.distributions.Distribution` object. It's up to you! + def forward(self, observation: torch.FloatTensor): + raise NotImplementedError + # TODO: get this from hw1 + + #################################### + #################################### + + +##################################################### +##################################################### + + +class MLPPolicyAC(MLPPolicy): + # MJ: cut acs_labels_na and qvals from the signature if they are not used + def update( + self, observations, actions, + adv_n=None, acs_labels_na=None, qvals=None + ): + raise NotImplementedError + # Not needed for this homework + + #################################### + #################################### diff --git a/hw5/cs285/policies/argmax_policy.py b/hw5/cs285/policies/argmax_policy.py new file mode 100644 index 00000000..0525d1e6 --- /dev/null +++ b/hw5/cs285/policies/argmax_policy.py @@ -0,0 +1,24 @@ +import numpy as np +import pdb + + +class ArgMaxPolicy(object): + + def __init__(self, critic): + self.critic = critic + + def set_critic(self, critic): + self.critic = critic + + def get_action(self, obs): + # MJ: changed the dimension check to a 3 + if len(obs.shape) > 3: + observation = obs + else: + observation = obs[None] + + raise NotImplementedError + # TODO: get this from hw3 + + #################################### + #################################### \ No newline at end of file diff --git a/hw5/cs285/policies/base_policy.py b/hw5/cs285/policies/base_policy.py new file mode 100644 index 00000000..e089540a --- /dev/null +++ b/hw5/cs285/policies/base_policy.py @@ -0,0 +1,14 @@ +import abc +import numpy as np + + +class BasePolicy(object, metaclass=abc.ABCMeta): + def get_action(self, obs: np.ndarray) -> np.ndarray: + raise NotImplementedError + + def update(self, obs: np.ndarray, acs: np.ndarray, **kwargs) -> dict: + """Return a dictionary of logging information.""" + raise NotImplementedError + + def save(self, filepath: 
str): + raise NotImplementedError diff --git a/hw5/cs285/scripts/read_results.py b/hw5/cs285/scripts/read_results.py new file mode 100644 index 00000000..3a5bc50f --- /dev/null +++ b/hw5/cs285/scripts/read_results.py @@ -0,0 +1,26 @@ +import glob +import tensorflow as tf + +def get_section_results(file): + """ + requires tensorflow==1.12.0 + """ + X = [] + Y = [] + for e in tf.train.summary_iterator(file): + for v in e.summary.value: + if v.tag == 'Train_EnvstepsSoFar': + X.append(v.simple_value) + elif v.tag == 'Eval_AverageReturn': + Y.append(v.simple_value) + return X, Y + +if __name__ == '__main__': + import glob + + logdir = 'data/q1_lb_rtg_na_CartPole-v0_13-09-2020_23-32-10/events*' + eventfile = glob.glob(logdir)[0] + + X, Y = get_section_results(eventfile) + for i, (x, y) in enumerate(zip(X, Y)): + print('Iteration {:d} | Train steps: {:d} | Return: {}'.format(i, int(x), y)) \ No newline at end of file diff --git a/hw5/cs285/scripts/run_hw5_expl.py b/hw5/cs285/scripts/run_hw5_expl.py new file mode 100644 index 00000000..2ef448d4 --- /dev/null +++ b/hw5/cs285/scripts/run_hw5_expl.py @@ -0,0 +1,131 @@ +import os +import time + +from cs285.infrastructure.rl_trainer import RL_Trainer +from cs285.agents.explore_or_exploit_agent import ExplorationOrExploitationAgent +from cs285.infrastructure.dqn_utils import get_env_kwargs, PiecewiseSchedule, ConstantSchedule + + +class Q_Trainer(object): + + def __init__(self, params): + self.params = params + + train_args = { + 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'], + 'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'], + 'train_batch_size': params['batch_size'], + 'double_q': params['double_q'], + } + + env_args = get_env_kwargs(params['env_name']) + + self.agent_params = {**train_args, **env_args, **params} + + self.params['agent_class'] = ExplorationOrExploitationAgent + self.params['agent_params'] = self.agent_params + self.params['train_batch_size'] = params['batch_size'] + self.params['env_wrappers'] = self.agent_params['env_wrappers'] + + self.rl_trainer = RL_Trainer(self.params) + + def run_training_loop(self): + self.rl_trainer.run_training_loop( + self.agent_params['num_timesteps'], + collect_policy = self.rl_trainer.agent.actor, + eval_policy = self.rl_trainer.agent.actor, + ) + +def main(): + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument( + '--env_name', + default='PointmassHard-v0', + choices=('PointmassEasy-v0', 'PointmassMedium-v0', 'PointmassHard-v0', 'PointmassVeryHard-v0') + ) + + parser.add_argument('--exp_name', type=str, default='todo') + + parser.add_argument('--eval_batch_size', type=int, default=1000) + parser.add_argument('--batch_size', type=int, default=256) + + parser.add_argument('--use_rnd', action='http://github.com/store_true') + parser.add_argument('--num_exploration_steps', type=int, default=10000) + parser.add_argument('--unsupervised_exploration', action='http://github.com/store_true') + + parser.add_argument('--offline_exploitation', action='http://github.com/store_true') + parser.add_argument('--cql_alpha', type=float, default=0.0) + + parser.add_argument('--exploit_rew_shift', type=float, default=0.0) + parser.add_argument('--exploit_rew_scale', type=float, default=1.0) + + parser.add_argument('--rnd_output_size', type=int, default=5) + parser.add_argument('--rnd_n_layers', type=int, default=2) + parser.add_argument('--rnd_size', type=int, default=400) + + parser.add_argument('--seed', 
type=int, default=2) + parser.add_argument('--no_gpu', '-ngpu', action='http://github.com/store_true') + parser.add_argument('--which_gpu', '-gpu_id', default=0) + parser.add_argument('--scalar_log_freq', type=int, default=int(1e3)) + parser.add_argument('--save_params', action='http://github.com/store_true') + + args = parser.parse_args() + + # convert to dictionary + params = vars(args) + params['double_q'] = True + params['num_agent_train_steps_per_iter'] = 1 + params['num_critic_updates_per_agent_update'] = 1 + params['exploit_weight_schedule'] = ConstantSchedule(1.0) + params['video_log_freq'] = -1 # This param is not used for DQN + params['num_timesteps'] = 50000 + params['learning_starts'] = 2000 + params['eps'] = 0.2 + ################################## + ### CREATE DIRECTORY FOR LOGGING + ################################## + + if params['env_name']=='PointmassEasy-v0': + params['ep_len']=50 + if params['env_name']=='PointmassMedium-v0': + params['ep_len']=150 + if params['env_name']=='PointmassHard-v0': + params['ep_len']=100 + if params['env_name']=='PointmassVeryHard-v0': + params['ep_len']=200 + + if params['use_rnd']: + params['explore_weight_schedule'] = PiecewiseSchedule([(0,1), (params['num_exploration_steps'], 0)], outside_value=0.0) + else: + params['explore_weight_schedule'] = ConstantSchedule(0.0) + + if params['unsupervised_exploration']: + params['explore_weight_schedule'] = ConstantSchedule(1.0) + params['exploit_weight_schedule'] = ConstantSchedule(0.0) + + if not params['use_rnd']: + params['learning_starts'] = params['num_exploration_steps'] + + + logdir_prefix = 'hw5_expl_' # keep for autograder + data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../data') + + if not (os.path.exists(data_path)): + os.makedirs(data_path) + + logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") + logdir = os.path.join(data_path, logdir) + params['logdir'] = logdir + if not(os.path.exists(logdir)): + os.makedirs(logdir) + + print("\n\n\nLOGGING TO: ", logdir, "\n\n\n") + + trainer = Q_Trainer(params) + trainer.run_training_loop() + + +if __name__ == "__main__": + main() diff --git a/hw5/hw5.pdf b/hw5/hw5.pdf new file mode 100644 index 00000000..81a5abc8 Binary files /dev/null and b/hw5/hw5.pdf differ diff --git a/hw5/requirements.txt b/hw5/requirements.txt new file mode 100644 index 00000000..5982dcc8 --- /dev/null +++ b/hw5/requirements.txt @@ -0,0 +1,13 @@ +gym==0.17.2 +mujoco-py==2.0.2.2 +tensorboard==2.3.0 +tensorboardX==1.8 +matplotlib==2.2.2 +ipython==6.4.0 +moviepy==1.0.0 +pyvirtualdisplay==1.3.2 +torch==1.5.1 +opencv-python==4.4.0.42 +networkx==2.5 +ipdb==0.13.3 +box2d-py diff --git a/hw5/requirements_colab.txt b/hw5/requirements_colab.txt new file mode 100644 index 00000000..ec0873d0 --- /dev/null +++ b/hw5/requirements_colab.txt @@ -0,0 +1,12 @@ +gym==0.17.2 +tensorboard==2.3.0 +tensorboardX==1.8 +matplotlib==2.2.2 +ipython==6.4.0 +moviepy==1.0.0 +pyvirtualdisplay==1.3.2 +torch==1.5.1 +opencv-python==4.4.0.42 +networkx==2.5 +ipdb==0.13.3 +box2d-py diff --git a/hw5/run_hw5_expl.ipynb b/hw5/run_hw5_expl.ipynb new file mode 100644 index 00000000..88956a8c --- /dev/null +++ b/hw5/run_hw5_expl.ipynb @@ -0,0 +1,526 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "run_hw5_expl.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { 
+ "cell_type": "markdown", + "metadata": { + "id": "gUl_qfOR8JV6" + }, + "source": [ + "##Setup\n", + "\n", + "You will need to make a copy of this notebook in your Google Drive before you can edit the homework files. You can do so with **File → Save a copy in Drive**." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "iizPcHAp8LnA", + "cellView": "form" + }, + "source": [ + "#@title mount your Google Drive\n", + "#@markdown Your work will be stored in a folder called `cs285_f2020` by default to prevent Colab instance timeouts from deleting your edits.\n", + "\n", + "import os\n", + "from google.colab import drive\n", + "drive.mount('/content/gdrive')" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "nAb10wnb8N0m", + "cellView": "form" + }, + "source": [ + "#@title set up mount symlink\n", + "\n", + "DRIVE_PATH = '/content/gdrive/My\\ Drive/cs285_f2020'\n", + "DRIVE_PYTHON_PATH = DRIVE_PATH.replace('\\\\', '')\n", + "if not os.path.exists(DRIVE_PYTHON_PATH):\n", + " %mkdir $DRIVE_PATH\n", + "\n", + "## the space in `My Drive` causes some issues,\n", + "## make a symlink to avoid this\n", + "SYM_PATH = '/content/cs285_f2020'\n", + "if not os.path.exists(SYM_PATH):\n", + " !ln -s $DRIVE_PATH $SYM_PATH" + ], + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "gtS9-WsD8QVr", + "cellView": "form" + }, + "source": [ + "#@title apt install requirements\n", + "\n", + "#@markdown Run each section with Shift+Enter\n", + "\n", + "#@markdown Double-click on section headers to show code.\n", + "\n", + "!apt update \n", + "!apt install -y --no-install-recommends \\\n", + " build-essential \\\n", + " curl \\\n", + " git \\\n", + " gnupg2 \\\n", + " make \\\n", + " cmake \\\n", + " ffmpeg \\\n", + " swig \\\n", + " libz-dev \\\n", + " unzip \\\n", + " zlib1g-dev \\\n", + " libglfw3 \\\n", + " libglfw3-dev \\\n", + " libxrandr2 \\\n", + " libxinerama-dev \\\n", + " libxi6 \\\n", + " libxcursor-dev \\\n", + " libgl1-mesa-dev \\\n", + " libgl1-mesa-glx \\\n", + " libglew-dev \\\n", + " libosmesa6-dev \\\n", + " lsb-release \\\n", + " ack-grep \\\n", + " patchelf \\\n", + " wget \\\n", + " xpra \\\n", + " xserver-xorg-dev \\\n", + " xvfb \\\n", + " python-opengl \\\n", + " ffmpeg > /dev/null 2>&1\n", + "\n", + "!pip install opencv-python==3.4.0.12" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "VcKGekJN80NO", + "cellView": "form" + }, + "source": [ + "#@title download mujoco\n", + "\n", + "MJC_PATH = '{}/mujoco'.format(SYM_PATH)\n", + "if not os.path.exists(MJC_PATH):\n", + " %mkdir $MJC_PATH\n", + "%cd $MJC_PATH\n", + "if not os.path.exists(os.path.join(MJC_PATH, 'mujoco200')):\n", + " !wget -q https://www.roboti.us/download/mujoco200_linux.zip\n", + " !unzip -q mujoco200_linux.zip\n", + " %mv mujoco200_linux mujoco200\n", + " %rm mujoco200_linux.zip" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "NTiH9f9y82F_", + "cellView": "form" + }, + "source": [ + "#@title update mujoco paths\n", + "\n", + "import os\n", + "\n", + "os.environ['LD_LIBRARY_PATH'] += ':{}/mujoco200/bin'.format(MJC_PATH)\n", + "os.environ['MUJOCO_PY_MUJOCO_PATH'] = '{}/mujoco200'.format(MJC_PATH)\n", + "os.environ['MUJOCO_PY_MJKEY_PATH'] = '{}/mjkey.txt'.format(MJC_PATH)\n", + "\n", + "## installation on colab does not find *.so files\n", + "## in LD_LIBRARY_PATH, copy over manually instead\n", + "!cp 
$MJC_PATH/mujoco200/bin/*.so /usr/lib/x86_64-linux-gnu/" + ], + "execution_count": 5, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "A0kPh99l87q0" + }, + "source": [ + "Ensure your `mjkey.txt` is in /content/cs285_f2020/mujoco before this step" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "X-LoOdZg84pI", + "cellView": "form" + }, + "source": [ + "#@title clone and install mujoco-py\n", + "\n", + "%cd $MJC_PATH\n", + "if not os.path.exists('mujoco-py'):\n", + " !git clone https://github.com/openai/mujoco-py.git\n", + "%cd mujoco-py\n", + "%pip install -e .\n", + "\n", + "## cythonize at the first import\n", + "import mujoco_py" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "-XcwBiBN8-Fg", + "cellView": "both" + }, + "source": [ + "#@title clone homework repo\n", + "#@markdown Note that this is the same codebase from homework 1,\n", + "#@markdown so you may need to move your old `homework_fall2020`\n", + "#@markdown folder in order to clone the repo again.\n", + "\n", + "#@markdown **Don't delete your old work though!**\n", + "#@markdown You will need it for this assignment.\n", + "\n", + "%cd $SYM_PATH\n", + "!git clone https://github.com/berkeleydeeprlcourse/homework_fall2020.git\n", + "%cd homework_fall2020/hw5\n", + "%pip install -r requirements_colab.txt -f https://download.pytorch.org/whl/torch_stable.html\n", + "%pip install -e ." + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "g5xIOIpW8_jC", + "cellView": "both" + }, + "source": [ + "#@title set up virtual display\n", + "\n", + "from pyvirtualdisplay import Display\n", + "\n", + "display = Display(visible=0, size=(1400, 900))\n", + "display.start()\n", + "\n", + "# For later\n", + "from cs285.infrastructure.colab_utils import (\n", + " wrap_env,\n", + " show_video\n", + ")" + ], + "execution_count": 8, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "2rsWAWaK9BVp", + "cellView": "both" + }, + "source": [ + "#@title test virtual display\n", + "\n", + "#@markdown If you see a video of a four-legged ant fumbling about, setup is complete!\n", + "\n", + "import gym\n", + "import matplotlib\n", + "matplotlib.use('Agg')\n", + "\n", + "env = wrap_env(gym.make(\"Ant-v2\"))\n", + "\n", + "observation = env.reset()\n", + "for i in range(10):\n", + " env.render(mode='rgb_array')\n", + " obs, rew, term, _ = env.step(env.action_space.sample() ) \n", + " if term:\n", + " break;\n", + " \n", + "env.close()\n", + "print('Loading video...')\n", + "show_video()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QizpiHDh9Fwk" + }, + "source": [ + "## Editing Code\n", + "\n", + "To edit code, click the folder icon on the left menu. Navigate to the corresponding file (`cs285_f2020/...`). Double click a file to open an editor. There is a timeout of about ~12 hours with Colab while it is active (and less if you close your browser window). We sync your edits to Google Drive so that you won't lose your work in the event of an instance timeout, but you will need to re-mount your Google Drive and re-install packages with every new instance." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Nii6qk2C9Ipk" + }, + "source": [ + "## Run Exploration or Exploitation" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "4t7FUeEG9Dkf" + }, + "source": [ + "import os\n", + "import time\n", + "\n", + "from cs285.infrastructure.rl_trainer import RL_Trainer\n", + "from cs285.agents.explore_or_exploit_agent import ExplorationOrExploitationAgent\n", + "from cs285.infrastructure.dqn_utils import get_env_kwargs, PiecewiseSchedule, ConstantSchedule\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 2" + ], + "execution_count": 10, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "2fXlzARJ9i-t", + "cellView": "both" + }, + "source": [ + "#@title runtime arguments\n", + "\n", + "class Args:\n", + "\n", + " def __getitem__(self, key):\n", + " return getattr(self, key)\n", + "\n", + " def __setitem__(self, key, val):\n", + " setattr(self, key, val)\n", + "\n", + " def __contains__(self, key):\n", + " return hasattr(self, key)\n", + "\n", + " env_name = \"PointmassEasy-v0\" #@param [\"PointmassEasy-v0\", \"PointmassMedium-v0\", \"PointmassHard-v0\", \"PointmassVeryHard-v0\"]\n", + " exp_name = 'temp'#@param {type: \"string\"}\n", + "\n", + " #@markdown batches and steps\n", + " batch_size = 256 #@param {type: \"integer\"}\n", + " eval_batch_size = 1000 #@param {type: \"integer\"}\n", + "\n", + " #@exploration hyperparameters\n", + " use_rnd = False #@param {type: \"boolean\"}\n", + " unsupervised_exploration = False #@param {type: \"boolean\"}\n", + " num_exploration_steps = 10000 #@param {type: \"integer\"}\n", + "\n", + " #@offline training hyperparameters\n", + " offline_exploitation = False #@param {type: \"boolean\"}\n", + " cql_alpha = 0.0 #@param {type: \"raw\"}\n", + "\n", + " #@reward shifting hyperparameters\n", + " exploit_rew_shift = 0.0 #@param {type: \"raw\"}\n", + " exploit_rew_scale = 1.0 #@param {type: \"raw\"}\n", + "\n", + " #@exploration model hyperparameters\n", + " rnd_output_size = 5 #@param {type: \"integer\"}\n", + " rnd_n_layers = 2 #@param {type: \"integer\"}\n", + " rnd_size = 400 #@param {type: \"integer\"}\n", + "\n", + " #@experiment hyperparameters\n", + " seed = 1 #@param {type: \"integer\"}\n", + " no_gpu = False #@param {type: \"boolean\"}\n", + " which_gpu = 0 #@param {type: \"integer\"}\n", + " scalar_log_freq = 1000 #@param {type: \"integer\"}\n", + " save_params = False #@param {type: \"boolean\"}\n", + "\n", + "\n", + "args = Args()\n", + "\n", + "## ensure compatibility with hw1 code\n", + "args['train_batch_size'] = args['batch_size']" + ], + "execution_count": 11, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "8vi1S18pGi7o" + }, + "source": [ + "###### HARDCODED DETAILS, DO NOT CHANGE ######\n", + "\n", + "args['video_log_freq'] = -1 # Not used\n", + "args['exploit_weight_schedule'] = ConstantSchedule(1.0)\n", + "args['num_agent_train_steps_per_iter'] = 1\n", + "args['num_critic_updates_per_agent_update'] = 1\n", + "args['learning_starts'] = 2000\n", + "args['num_timesteps'] = 50000\n", + "args['double_q'] = True\n", + "args['eps'] = 0.2\n", + "\n", + "if args['env_name']=='PointmassEasy-v0':\n", + " args['ep_len']=50\n", + "if args['env_name']=='PointmassMedium-v0':\n", + " args['ep_len']=150\n", + "if args['env_name']=='PointmassHard-v0':\n", + " args['ep_len']=100\n", + "if args['env_name']=='PointmassVeryHard-v0':\n", + " args['ep_len']=200\n", + "\n", + "if 
args['use_rnd']:\n", + " args['explore_weight_schedule'] = PiecewiseSchedule([(0,1), (args['num_exploration_steps'], 0)], outside_value=0.0)\n", + "else:\n", + " args['explore_weight_schedule'] = ConstantSchedule(0.0)\n", + "\n", + "if args['unsupervised_exploration']:\n", + " args['explore_weight_schedule'] = ConstantSchedule(1.0)\n", + " args['exploit_weight_schedule'] = ConstantSchedule(0.0)\n", + " \n", + " if not args['use_rnd']:\n", + " args['learning_starts'] = args['num_exploration_steps']" + ], + "execution_count": 12, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "T0cJlp6s-ogO" + }, + "source": [ + "#@title create directories for logging\n", + "\n", + "data_path = '''/content/cs285_f2020/''' \\\n", + " '''homework_fall2020/hw5/data'''\n", + "\n", + "if not (os.path.exists(data_path)):\n", + " os.makedirs(data_path)\n", + "\n", + "logdir = 'hw5_' + args.exp_name + '_' + args.env_name + '_' + time.strftime(\"%d-%m-%Y_%H-%M-%S\")\n", + "logdir = os.path.join(data_path, logdir)\n", + "args['logdir'] = logdir\n", + "if not(os.path.exists(logdir)):\n", + " os.makedirs(logdir)\n", + "\n", + "print(\"LOGGING TO: \", logdir)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "I525KFRN-42s" + }, + "source": [ + "#@title Define Exploration Agent\n", + "\n", + "class Q_Trainer(object):\n", + "\n", + " def __init__(self, params):\n", + " self.params = params\n", + "\n", + " train_args = {\n", + " 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],\n", + " 'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'],\n", + " 'train_batch_size': params['batch_size'],\n", + " 'double_q': params['double_q'],\n", + " }\n", + "\n", + " env_args = get_env_kwargs(params['env_name'])\n", + "\n", + " for k, v in env_args.items():\n", + " params[k] = v\n", + " for k, v in train_args.items():\n", + " params[k] = v\n", + "\n", + " self.agent_params = params\n", + "\n", + " self.params['agent_class'] = ExplorationOrExploitationAgent\n", + " self.params['agent_params'] = self.agent_params\n", + " self.params['train_batch_size'] = params['batch_size']\n", + " self.params['env_wrappers'] = self.agent_params['env_wrappers']\n", + "\n", + " self.rl_trainer = RL_Trainer(self.params)\n", + "\n", + " def run_training_loop(self):\n", + " self.rl_trainer.run_training_loop(\n", + " self.agent_params['num_timesteps'],\n", + " collect_policy = self.rl_trainer.agent.actor,\n", + " eval_policy = self.rl_trainer.agent.actor,\n", + " )\n" + ], + "execution_count": 14, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "wF4LSRGn-_Cv", + "cellView": "both" + }, + "source": [ + "#@title run training\n", + "\n", + "trainer = Q_Trainer(args)\n", + "trainer.run_training_loop()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "_kTH-tXkI-B-" + }, + "source": [ + "#@markdown You can visualize your runs with tensorboard from within the notebook\n", + "\n", + "## requires tensorflow==2.3.0\n", + "%load_ext tensorboard\n", + "%tensorboard --logdir /content/cs285_f2020/homework_fall2020/hw5/data/" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "BwF7tQPQ66hB" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + } + ] +} diff --git a/hw5/setup.py b/hw5/setup.py new file mode 100644 index 00000000..3cc1886e --- /dev/null +++ b/hw5/setup.py @@ -0,0 +1,8 @@ +# 
setup.py +from setuptools import setup + +setup( + name='cs285', + version='0.1.0', + packages=['cs285'], +) \ No newline at end of file