Learning#

In this notebook we try to learn a policy to navigate in a corridor with one obstacle. We fix the model (an MLP with two hidden layers of 128 neurons each) and apply three learning algorithms, providing each with roughly the same number of simulation steps during training:

  • Behavioral cloning (IL)

  • DAgger (IL)

  • SAC (RL)

We can skip training and load previously trained policies by switching the flag in the next cell.

[1]:
from navground.learning.utils.jupyter import run_and_time_if, run_if, skip_if

training = True
[2]:
import warnings
warnings.filterwarnings('ignore')

%config InlineBackend.figure_formats = ['svg']

We start by recording the current time, which we will include in the names of the logging directories

[3]:
from datetime import datetime as dt

stamp = dt.now().strftime("%Y%m%d_%H%M%S")

and by initializing the same environment as in the previous notebook, but with a flat observation space, because it is the only one supported by all three learning algorithms we are going to apply.

[4]:
import gymnasium as gym
from navground import sim
from navground.learning import ControlActionConfig, DefaultObservationConfig
from navground.learning.rewards import SocialReward
import navground.learning.scenarios

duration = 40.0
time_step = 0.1

action_config = ControlActionConfig(max_acceleration=1.0, max_angular_acceleration=10.0,
                                    use_acceleration_action=True)

observation_config = DefaultObservationConfig(include_target_direction=True, include_velocity=True,
                                              include_angular_speed=True, flat=True)

reward = SocialReward(safety_margin=0.04)

with open('sensors.yaml') as f:
    sensors = f.read()

with open('scenario.yaml') as f:
    scenario = sim.load_scenario(f.read())

env = gym.make('navground',
    scenario=scenario,
    sensors=sensors,
    action=action_config,
    observation=observation_config,
    time_step=time_step,
    max_duration=duration,
    reward=reward)

together with an evaluation environment:

[5]:
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env

# uses random seed
test_env = make_vec_env('navground', n_envs=5, env_kwargs=env.spec.kwargs)
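As an optional sanity check (not part of the original notebook), we can inspect the flattened observation space and the action space through the standard Gymnasium attributes:

# with flat=True the observations form a single Box vector,
# which all three learning algorithms can consume directly
print(env.observation_space)
print(env.action_space)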

We also define a helper function to plot runs in which the agent follows a given policy

[6]:
from matplotlib import pyplot as plt
from navground.learning import GroupConfig
from navground.learning.evaluation import make_experiment_with_env
from navground.sim.pyplot_helpers import plot_runs
import numpy as np

def plot_test_runs(policy, color='blue', figsize=(12, 8), cols=4,
                   rows=5, step=30, seed=0):
    fig, _ = plt.subplots(rows, cols, figsize=figsize)
    exp = make_experiment_with_env(env=env.unwrapped,
                                   groups=[GroupConfig(policy=policy)])
    exp.number_of_runs = rows * cols
    exp.record_config.pose = True
    exp.run_index = seed
    exp.run()
    plot_runs(runs=exp.runs.values(), columns=cols, with_agent=True,
              color=lambda a: color, step=step,
              world_kwargs={'in_box': True},
              agent_kwargs={'dot_radius': 0, 'with_safety_margin': True},
              fig=fig)

All policies will share the same network architecture, with two hidden layers of 128 neurons each

[7]:
policy_kwargs = dict(net_arch=[128, 128])
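For reference, net_arch=[128, 128] asks Stable-Baselines3 (and the imitation trainers built on top of it) for MLPs with two hidden layers of 128 units. Ignoring activation functions and the algorithm-specific output heads, the actor is roughly equivalent to the following torch module (an illustrative sketch, not code from the notebook):

import torch.nn as nn

# sizes taken from the environment defined above
obs_dim = env.observation_space.shape[0]  # flat observation vector
act_dim = env.action_space.shape[0]       # acceleration command

# approximately what net_arch=[128, 128] expands to for the actor;
# the real policies add activations and algorithm-specific heads on top
actor = nn.Sequential(
    nn.Linear(obs_dim, 128), nn.ReLU(),
    nn.Linear(128, 128), nn.ReLU(),
    nn.Linear(128, act_dim),
)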

Imitation learning with Behavior Cloning#

If we skip training (training = False), the next cell loads a pre-trained policy instead

[8]:
%%skip_if $training

from navground.learning.il import BC

bc = BC.load("BC/model.zip")
[9]:
%%run_if $training

from navground.learning.il import BC, setup_tqdm
from imitation.util import logger

setup_tqdm()

bc = BC(env, policy_kwargs=policy_kwargs,
        bc_kwargs={'l2_weight': 0, 'ent_weight': 1e-2, 'batch_size': 32})
bc.logger = logger.configure(f"logs/BC/{stamp}", ['tensorboard', 'csv'])
[10]:
%%run_if $training

import time

start = time.time()
bc.collect_runs(500)
print(f'Collecting runs took {time.time() - start: .0f} seconds')

start = time.time()
bc.learn(
    log_rollouts_venv=test_env,
    log_rollouts_n_episodes=50,
    log_interval=200,
    n_epochs=4,
    progress_bar=True
)
print(f'Training took {time.time() - start: .0f} seconds')
bc.save("BC/model")
Collecting runs took  15 seconds
Training took  256 seconds

Even with quite a large number of training steps (about 200K per epoch), BC fails to learn a decent policy

[11]:
import pandas as pd

df = pd.read_csv(f'{bc.logger.get_dir()}/progress.csv')
df.plot(y='rollout/return_mean', x='bc/samples_so_far', figsize=(10, 3), color='orange');
../../_images/tutorials_corridor_with_obstacle_Learning_19_0.svg
[12]:
plot_test_runs(policy=bc.policy, color='orange', seed=10000);
../../_images/tutorials_corridor_with_obstacle_Learning_20_0.svg
[13]:
import pandas as pd

bc_rewards, steps = evaluate_policy(bc.policy, test_env, 1000, return_episode_rewards=True)

print(f"BC Rewards: min={np.min(bc_rewards):.2f}, mean={np.mean(bc_rewards):.2f}, "
      f"median={np.median(bc_rewards):.2f}, max={np.max(bc_rewards):.2f}")

eval_df = pd.DataFrame()
eval_df['BC'] = {'reward min': np.min(bc_rewards),
                 'reward mean': np.mean(bc_rewards),
                 'reward max': np.max(bc_rewards),
                 'reward median': np.median(bc_rewards)}
BC Rewards: min=-674.42, mean=-384.42, median=-565.84, max=-0.54
[14]:
plt.figure(figsize=(5, 2))
plt.hist(bc_rewards, bins=30, density=True, color="orange");
plt.xlabel('reward')
plt.ylabel('probability')
plt.title("Policy trained with BC");
../../_images/tutorials_corridor_with_obstacle_Learning_22_0.svg

Imitation learning with DAgger#

If we skip training (training = False), the next cell loads a pre-trained policy instead

[15]:
%%skip_if $training

from navground.learning.il import DAgger

dagger = DAgger.load("DAgger/model.zip")
[16]:
%%run_if $training

from navground.learning.il import DAgger
from imitation.util import logger

dagger = DAgger(env, policy_kwargs=policy_kwargs,
                bc_kwargs={'l2_weight': 0, 'ent_weight': 1e-2, 'batch_size': 128})
dagger.logger = logger.configure(f"logs/DAgger/{stamp}", ['tensorboard', 'csv'])
[17]:
%%run_if $training

import time

start = time.time()
dagger.learn(
    total_timesteps=150_000,
    rollout_round_min_episodes=25,
    bc_train_kwargs={
        'log_rollouts_venv': test_env,
        'log_rollouts_n_episodes': 50,
        'log_interval': 200,
        'n_epochs': 1,
        'progress_bar': False,
    },
    progress_bar=True
)
print(f'Training took {time.time() - start: .0f} seconds')
dagger.save("DAgger/model.zip")
Training took  157 seconds

About 60,000 steps are enough for DAgger to learn a decent policy

[18]:
import pandas as pd

df = pd.read_csv(f'{dagger.logger.get_dir()}/progress.csv')
df = df[df['dagger/total_timesteps'].notna()]
df.plot(y='rollout/return_mean', x='dagger/total_timesteps', figsize=(10, 3), color='green');
../../_images/tutorials_corridor_with_obstacle_Learning_28_0.svg
[19]:
plot_test_runs(policy=dagger.policy, seed=10000, color='green')
../../_images/tutorials_corridor_with_obstacle_Learning_29_0.svg
[20]:
dagger_rewards, steps = evaluate_policy(dagger.policy, test_env, 1000, return_episode_rewards=True)

print(f"DAgger Rewards: min={np.min(dagger_rewards):.2f}, mean={np.mean(dagger_rewards):.2f}, "
      f"median={np.median(dagger_rewards):.2f}, max={np.max(dagger_rewards):.2f}")

eval_df['DAgger'] = {'reward min': np.min(dagger_rewards),
                     'reward mean': np.mean(dagger_rewards),
                     'reward max': np.max(dagger_rewards),
                     'reward median': np.median(dagger_rewards)}
DAgger Rewards: min=-186.94, mean=-21.54, median=-8.10, max=-1.42
[21]:
plt.figure(figsize=(5, 2))
plt.hist(dagger_rewards, bins=30, density=True, color="green");
plt.xlabel('reward')
plt.ylabel('probability');
plt.title("Policy trained with DAgger");
../../_images/tutorials_corridor_with_obstacle_Learning_31_0.svg

Reinforcement learning with SAC#

If we skip training (training = False), the next cell loads a pre-trained policy instead

[22]:
%%skip_if $training

from stable_baselines3 import SAC

sac = SAC.load("SAC/model.zip")
[23]:
%%run_if $training

from stable_baselines3 import SAC
from stable_baselines3.common.logger import configure

sac = SAC("MlpPolicy", env, policy_kwargs=policy_kwargs)
sac.set_logger(configure(f"logs/SAC/{stamp}", ["csv", "tensorboard"]))
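Since Stable-Baselines3 policies are torch modules, we can also print the policy to double-check that the actor and critics use two hidden layers of 128 units (an optional check, not in the original notebook):

# shows the torch sub-modules of the SAC actor and critic networks
print(sac.policy)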
[24]:
%%run_if $training

import time

start = time.time()
sac.learn(total_timesteps=150_000, progress_bar=True, tb_log_name="SAC");
print(f'Training took {time.time() - start: .0f} seconds')
sac.save("SAC/model.zip")
Training took  642 seconds

SAC learns a well-performing policy in about 50,000 steps. Training takes about 5x longer than DAgger, but the resulting policy is also better, especially in the worst case.
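All three loggers also write TensorBoard event files under logs/, so we can compare the training curves of BC, DAgger and SAC interactively; a sketch assuming the tensorboard Jupyter extension is installed:

# load the TensorBoard notebook extension and point it at the log directory
%load_ext tensorboard
%tensorboard --logdir logs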

[25]:
import pandas as pd

df = pd.read_csv(f'{sac.logger.get_dir()}/progress.csv')
df.plot(y='rollout/ep_rew_mean', x='time/total_timesteps', figsize=(10, 3), color='blue');
../../_images/tutorials_corridor_with_obstacle_Learning_37_0.svg
[26]:
plot_test_runs(policy=sac.policy, seed=10_000, color='blue')
../../_images/tutorials_corridor_with_obstacle_Learning_38_0.svg
[27]:
sac_rewards, steps = evaluate_policy(sac.policy, test_env, 1000, return_episode_rewards=True)

print(f"SAC Rewards: min={np.min(sac_rewards):.2f}, mean={np.mean(sac_rewards):.2f}, "
      f"median={np.median(sac_rewards):.2f}, max={np.max(sac_rewards):.2f}")

eval_df['SAC'] = {'reward min': np.min(sac_rewards),
                  'reward mean': np.mean(sac_rewards),
                  'reward max': np.max(sac_rewards),
                  'reward median': np.median(sac_rewards)}
SAC Rewards: min=-37.77, mean=-18.39, median=-16.44, max=-5.20
[28]:
plt.figure(figsize=(5, 2))
plt.hist(sac_rewards, bins=30, density=True, color='blue');
plt.xlabel('reward')
plt.ylabel('probability');
plt.title("Policy trained with SAC");
../../_images/tutorials_corridor_with_obstacle_Learning_40_0.svg

Comparison with HL#

Let’s compare the learned policies with the expert we have been imitating: the HL navigation behavior.

[29]:
from navground.learning.evaluation import evaluate_with_experiment_and_env

hl_rewards, _ = evaluate_with_experiment_and_env(env, n_eval_episodes=1000, return_episode_rewards=True)

print(f"HL Rewards: min={np.min(hl_rewards):.2f}, mean={np.mean(hl_rewards):.2f}, "
      f"median={np.median(hl_rewards):.2f}, max={np.max(hl_rewards):.2f}")

eval_df['HL'] = {'reward min': np.min(hl_rewards),
                 'reward mean': np.mean(hl_rewards),
                 'reward max': np.max(hl_rewards),
                 'reward median': np.median(hl_rewards)}
HL Rewards: min=-71.02, mean=-5.36, median=-3.95, max=0.00
[30]:
eval_df.to_csv('eval.csv')
pd.set_option("display.precision", 2)

eval_df.T
[30]:
        reward min  reward mean  reward max  reward median
BC         -674.42      -384.42       -0.54        -565.84
DAgger     -186.94       -21.54       -1.42          -8.10
SAC         -37.77       -18.39       -5.20         -16.44
HL          -71.02        -5.36        0.00          -3.95
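Beyond these summary statistics, we can look at how often each policy completes an episode with an acceptable return; a minimal sketch using an arbitrary threshold of -20 on the reward arrays collected above:

threshold = -20
for label, rewards in [('BC', bc_rewards), ('DAgger', dagger_rewards),
                       ('SAC', sac_rewards), ('HL', hl_rewards)]:
    # fraction of evaluation episodes whose total reward exceeds the threshold
    frac = np.mean(np.asarray(rewards) > threshold)
    print(f'{label}: {frac:.0%} of episodes above {threshold}')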
[31]:
plt.figure(figsize=(7, 3))

plt.hist(hl_rewards, color='firebrick', density=True, label="HL", alpha=0.8,
         bins=np.linspace(np.quantile(hl_rewards, 0.02), np.quantile(hl_rewards, 0.98), 20))
plt.hist(sac_rewards, color='blue', density=True, label="SAC", alpha=0.8,
         bins=np.linspace(np.quantile(sac_rewards, 0.02), np.quantile(sac_rewards, 0.98), 30))

plt.hist(dagger_rewards, color='green', density=True, label="DAgger", alpha=0.8,
         bins=np.linspace(np.quantile(dagger_rewards, 0.02), np.quantile(dagger_rewards, 0.98), 50))
plt.hist(bc_rewards, color='orange', density=True, label="BC", alpha=0.8,
         bins=np.linspace(np.quantile(bc_rewards, 0.02), np.quantile(bc_rewards, 0.98), 50))

plt.xlabel('reward')
plt.ylabel('probability');
plt.xlim(-20, 0)
plt.legend();
../../_images/tutorials_corridor_with_obstacle_Learning_44_0.svg
[32]:
def plot_comparison_test_runs(policies, figsize=(12, 8), cols=4, rows=5, step=30, seed=0):
    fig, _ = plt.subplots(rows, cols, figsize=figsize)
    for i, (policy, color, label) in enumerate(policies):
        exp = make_experiment_with_env(env=env.unwrapped, groups=[GroupConfig(policy=policy)])
        exp.number_of_runs = rows * cols
        exp.record_config.pose = True
        exp.run_index = seed
        exp.run()
        plot_runs(runs=exp.runs.values(), columns=cols, with_agent=False,
                  color=lambda a: color, step=step,
                  world_kwargs={'in_box': True}, fig=fig,
                  with_world=(i == 0), label=label)
[33]:
plot_comparison_test_runs(policies=(
    (bc.policy, 'orange', 'BC'),
    (dagger.policy, 'green', 'DAgger'),
    (sac.policy, 'blue', 'SAC'),
    (None, 'firebrick', 'Expert')), seed=123, step=10, cols=4, rows=4)
plt.legend(bbox_to_anchor=(-.75, 0), ncols=4);
../../_images/tutorials_corridor_with_obstacle_Learning_46_0.svg