Learning#
In this notebook we try to learn a policy to navigate in a corridor with one obstacle. We fix the model (an MLP with two hidden layers of 128 neurons each) and apply three learning algorithms, giving each roughly the same number of simulation steps during training:
Behavioral cloning (IL)
DAgger (IL)
SAC (RL)
We can skip training and load previously trained policies by switching the flag in the next cell.
[1]:
from navground.learning.utils.jupyter import run_and_time_if, run_if, skip_if
training = True
[2]:
import warnings
warnings.filterwarnings('ignore')
%config InlineBackend.figure_formats = ['svg']
We start by recording the current time, which we will add to the logging directories
[3]:
from datetime import datetime as dt
stamp = dt.now().strftime("%Y%m%d_%H%M%S")
and by initializing the same environment as in the previous notebook, but with a flat observation space, since it is the only kind supported by all three learning algorithms we are going to apply.
[4]:
import gymnasium as gym
from navground import sim
from navground.learning import ControlActionConfig, DefaultObservationConfig
from navground.learning.rewards import SocialReward
import navground.learning.scenarios
duration = 40.0
time_step = 0.1
action_config = ControlActionConfig(max_acceleration=1.0, max_angular_acceleration=10.0,
use_acceleration_action=True)
observation_config = DefaultObservationConfig(include_target_direction=True, include_velocity=True,
include_angular_speed=True, flat=True)
reward = SocialReward(safety_margin=0.04)
with open('sensors.yaml') as f:
sensors = f.read()
with open('scenario.yaml') as f:
scenario = sim.load_scenario(f.read())
env = gym.make('navground',
scenario=scenario,
sensors=sensors,
action=action_config,
observation=observation_config,
time_step=time_step,
max_duration=duration,
reward=reward)
together with an evaluation environment:
[5]:
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
# uses random seed
test_env = make_vec_env('navground', n_envs=5, env_kwargs=env.spec.kwargs)
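As an optional sanity check (an addition, not part of the original notebook), we can inspect the flattened observation and action spaces and roll the training environment forward a few random steps using the standard Gymnasium API:
[ ]:
# Optional sanity check: print the flat spaces and take a few random steps.
print(env.observation_space)
print(env.action_space)
obs, info = env.reset(seed=0)
for _ in range(3):
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    print(f"reward={reward:.3f}, terminated={terminated}, truncated={truncated}")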
We also define a helper function to plot and display runs in which the agent follows different policies.
[6]:
from matplotlib import pyplot as plt
from navground.learning import GroupConfig
from navground.learning.evaluation import make_experiment_with_env
from navground.sim.pyplot_helpers import plot_runs
import numpy as np
def plot_test_runs(policy, color='blue', figsize=(12, 8), cols=4,
rows=5, step=30, seed=0):
fig, _ = plt.subplots(rows, cols, figsize=figsize)
exp = make_experiment_with_env(env=env.unwrapped,
groups=[GroupConfig(policy=policy)])
exp.number_of_runs = rows * cols
exp.record_config.pose = True
exp.run_index = seed
exp.run()
plot_runs(runs=exp.runs.values(), columns=cols, with_agent=True,
color=lambda a: color, step=step,
world_kwargs={'in_box': True},
agent_kwargs={'dot_radius': 0, 'with_safety_margin': True},
fig=fig)
All policies will share the same network architecture: two hidden layers of 128 neurons each.
[7]:
policy_kwargs = dict(net_arch=[128, 128])
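For reference, here is a minimal PyTorch sketch of the shape these networks take. This is only an illustration, not how Stable-Baselines3 or imitation build their models internally; the activation function and any extra heads depend on the algorithm.
[ ]:
from torch import nn

# Illustrative only: net_arch=[128, 128] corresponds to an MLP with two hidden
# layers of 128 units between the flat observation and the action output.
obs_dim = int(np.prod(env.observation_space.shape))
act_dim = int(np.prod(env.action_space.shape))
mlp = nn.Sequential(
    nn.Linear(obs_dim, 128), nn.ReLU(),
    nn.Linear(128, 128), nn.ReLU(),
    nn.Linear(128, act_dim),
)
print(mlp)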
Imitation Learning with Behavior Cloning#
The next cell loads a pre-trained policy; it runs only if the training flag is set to False.
[8]:
%%skip_if $training
from navground.learning.il import BC
bc = BC.load("BC/model.zip")
[9]:
%%run_if $training
from navground.learning.il import BC, setup_tqdm
from imitation.util import logger
setup_tqdm()
bc = BC(env, policy_kwargs=dict(net_arch=[128, 128]),
bc_kwargs={'l2_weight': 0, 'ent_weight': 1e-2, 'batch_size': 32})
bc.logger = logger.configure(f"logs/BC/{stamp}", ['tensorboard', 'csv'])
[10]:
%%run_if $training
import time
start = time.time()
bc.collect_runs(500)
print(f'Collecting runs took {time.time() - start: .0f} seconds')
start = time.time()
bc.learn(
log_rollouts_venv=test_env,
log_rollouts_n_episodes=50,
log_interval=200,
n_epochs=4,
progress_bar=True
)
print(f'Training took {time.time() - start: .0f} seconds')
bc.save("BC/model")
Collecting runs took 15 seconds
Training took 256 seconds
Even with quite a large number of samples (about 200K per epoch), Behavior Cloning fails to learn a decent policy, most likely because small prediction errors drive the agent into states that the expert demonstrations do not cover.
[11]:
import pandas as pd
df = pd.read_csv(f'{bc.logger.get_dir()}/progress.csv')
df.plot(y='rollout/return_mean', x='bc/samples_so_far', figsize=(10, 3), color='orange');
[12]:
plot_test_runs(policy=bc.policy, color='orange', seed=10000);
[13]:
import pandas as pd
bc_rewards, steps = evaluate_policy(bc.policy, test_env, 1000, return_episode_rewards=True)
print(f"BC Rewards: min={np.min(bc_rewards):.2f}, mean={np.mean(bc_rewards):.2f}, "
f"median={np.median(bc_rewards):.2f}, max={np.max(bc_rewards):.2f}")
eval_df = pd.DataFrame()
eval_df['BC'] = {'reward min': np.min(bc_rewards),
'reward mean': np.mean(bc_rewards),
'reward max': np.max(bc_rewards),
'reward median': np.median(bc_rewards)}
BC Rewards: min=-674.42, mean=-384.42, median=-565.84, max=-0.54
[14]:
plt.figure(figsize=(5, 2))
plt.hist(bc_rewards, bins=30, density=True, color="orange");
plt.xlabel('reward')
plt.ylabel('probability')
plt.title("Policy trained with BC");
Imitation learning with DAgger#
The next cell loads a pre-trained policy; it runs only if the training flag is set to False.
[15]:
%%skip_if $training
from navground.learning.il import DAgger
dagger = DAgger.load("DAgger/model.zip")
[16]:
%%run_if $training
from navground.learning.il import DAgger
dagger = DAgger(env, policy_kwargs=dict(net_arch=[128, 128]),
bc_kwargs={'l2_weight': 0, 'ent_weight': 1e-2, 'batch_size': 128})
dagger.logger = logger.configure(f"logs/DAgger/{stamp}", ['tensorboard', 'csv'])
[17]:
%%run_if $training
import time
start = time.time()
dagger.learn(
total_timesteps=150_000,
rollout_round_min_episodes=25,
bc_train_kwargs={
'log_rollouts_venv': test_env,
'log_rollouts_n_episodes': 50,
'log_interval': 200,
'n_epochs': 1,
'progress_bar': False,
},
progress_bar=True
)
print(f'Training took {time.time() - start: .0f} seconds')
dagger.save("DAgger/model.zip")
Training took 157 seconds
About 60000 steps are enough to learn a decent policy.
[18]:
import pandas as pd
df = pd.read_csv(f'{dagger.logger.get_dir()}/progress.csv')
df = df[df['dagger/total_timesteps'].notna()]
df.plot(y='rollout/return_mean', x='dagger/total_timesteps', figsize=(10, 3), color='green');
[19]:
plot_test_runs(policy=dagger.policy, seed=10000, color='green')
[20]:
dagger_rewards, steps = evaluate_policy(dagger.policy, test_env, 1000, return_episode_rewards=True)
print(f"DAgger Rewards: min={np.min(dagger_rewards):.2f}, mean={np.mean(dagger_rewards):.2f}, "
f"median={np.median(dagger_rewards):.2f}, max={np.max(dagger_rewards):.2f}")
eval_df['DAgger'] = {'reward min': np.min(dagger_rewards),
'reward mean': np.mean(dagger_rewards),
'reward max': np.max(dagger_rewards),
'reward median': np.median(dagger_rewards)}
DAgger Rewards: min=-186.94, mean=-21.54, median=-8.10, max=-1.42
[21]:
plt.figure(figsize=(5, 2))
plt.hist(dagger_rewards, bins=30, density=True, color="green");
plt.xlabel('reward')
plt.ylabel('probability');
plt.title("Policy trained with DAgger");
Reinforcement learning with SAC#
The next cell loads a pre-trained policy; it runs only if the training flag is set to False.
[22]:
%%skip_if $training
from stable_baselines3 import SAC
sac = SAC.load("SAC/model.zip")
[23]:
%%run_if $training
from stable_baselines3 import SAC
from stable_baselines3.common.logger import configure
sac = SAC("MlpPolicy", env, policy_kwargs=policy_kwargs)
sac.set_logger(configure(f"logs/SAC/{stamp}", ["csv", "tensorboard"]))
[24]:
%%run_if $training
import time
start = time.time()
sac.learn(total_timesteps=150_000, progress_bar=True, tb_log_name="SAC");
print(f'Training took {time.time() - start: .0f} seconds')
sac.save("SAC/model.zip")
Training took 642 seconds
It learns a well-performing policy in about 50000 steps. Training takes roughly four times as long as DAgger (642 vs 157 seconds), but the resulting policy is also better, especially in the worst case.
[25]:
import pandas as pd
df = pd.read_csv(f'{sac.logger.get_dir()}/progress.csv')
df.plot(y='rollout/ep_rew_mean', x='time/total_timesteps', figsize=(10, 3), color='blue');
[26]:
plot_test_runs(policy=sac.policy, seed=10_000, color='blue')
[27]:
sac_rewards, steps = evaluate_policy(sac.policy, test_env, 1000, return_episode_rewards=True)
print(f"SAC Rewards: min={np.min(sac_rewards):.2f}, mean={np.mean(sac_rewards):.2f}, "
f"median={np.median(sac_rewards):.2f}, max={np.max(sac_rewards):.2f}")
eval_df['SAC'] = {'reward min': np.min(sac_rewards),
'reward mean': np.mean(sac_rewards),
'reward max': np.max(sac_rewards),
'reward median': np.median(sac_rewards)}
SAC Rewards: min=-37.77, mean=-18.39, median=-16.44, max=-5.20
[28]:
plt.figure(figsize=(5, 2))
plt.hist(sac_rewards, bins=30, density=True, color='blue');
plt.xlabel('reward')
plt.ylabel('probability');
plt.title("Policy trained with SAC");
Comparison with HL#
Let’s compare the learned policies with the expert algorithm (HL) that the imitation learners were trained to imitate.
[29]:
from navground.learning.evaluation import evaluate_with_experiment_and_env
hl_rewards, _ = evaluate_with_experiment_and_env(env, n_eval_episodes=1000, return_episode_rewards=True)
print(f"HL Rewards: min={np.min(hl_rewards):.2f}, mean={np.mean(hl_rewards):.2f}, "
f"median={np.median(hl_rewards):.2f}, max={np.max(hl_rewards):.2f}")
eval_df['HL'] = {'reward min': np.min(hl_rewards),
'reward mean': np.mean(hl_rewards),
'reward max': np.max(hl_rewards),
'reward median': np.median(hl_rewards)}
HL Rewards: min=-71.02, mean=-5.36, median=-3.95, max=0.00
[30]:
eval_df.to_csv('eval.csv')
pd.set_option("display.precision", 2)
eval_df.T
[30]:
|        | reward min | reward mean | reward max | reward median |
|--------|-----------:|------------:|-----------:|--------------:|
| BC     |    -674.42 |     -384.42 |      -0.54 |       -565.84 |
| DAgger |    -186.94 |      -21.54 |      -1.42 |         -8.10 |
| SAC    |     -37.77 |      -18.39 |      -5.20 |        -16.44 |
| HL     |     -71.02 |       -5.36 |       0.00 |         -3.95 |
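To complement the table, we can also look at the lower quantiles of the episode rewards (an extra view computed from the arrays collected above), since the mean alone hides how the policies differ in their worst episodes:
[ ]:
# Lower-tail quantiles of the episode rewards collected above.
quantiles = pd.DataFrame(
    {name: np.quantile(rewards, [0.05, 0.25, 0.5])
     for name, rewards in [('BC', bc_rewards), ('DAgger', dagger_rewards),
                           ('SAC', sac_rewards), ('HL', hl_rewards)]},
    index=['5% quantile', '25% quantile', 'median'])
quantiles.T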
[31]:
plt.figure(figsize=(7, 3))
plt.hist(hl_rewards, color='firebrick', density=True, label="HL", alpha=0.8,
bins=np.linspace(np.quantile(hl_rewards, 0.02), np.quantile(hl_rewards, 0.98), 20))
plt.hist(sac_rewards, color='blue', density=True, label="SAC", alpha=0.8,
bins=np.linspace(np.quantile(sac_rewards, 0.02), np.quantile(sac_rewards, 0.98), 30))
plt.hist(dagger_rewards, color='green', density=True, label="DAgger", alpha=0.8,
bins=np.linspace(np.quantile(dagger_rewards, 0.02), np.quantile(dagger_rewards, 0.98), 50))
plt.hist(bc_rewards, color='orange', density=True, label="BC", alpha=0.8,
bins=np.linspace(np.quantile(bc_rewards, 0.02), np.quantile(bc_rewards, 0.98), 50))
plt.xlabel('reward')
plt.ylabel('probability');
plt.xlim(-20, 0)
plt.legend();
[32]:
def plot_comparison_test_runs(policies, figsize=(12, 8), cols=4, rows=5, step=30, seed=0):
fig, _ = plt.subplots(rows, cols, figsize=figsize)
for i, (policy, color, label) in enumerate(policies):
exp = make_experiment_with_env(env=env.unwrapped, groups=[GroupConfig(policy=policy)])
exp.number_of_runs = rows * cols
exp.record_config.pose = True
exp.run_index = seed
exp.run()
plot_runs(runs=exp.runs.values(), columns=cols, with_agent=False, color=lambda a: color, step=step,
world_kwargs={'in_box': True}, fig=fig, with_world=i==0, label=label)
[33]:
plot_comparison_test_runs(policies=(
(bc.policy, 'orange', 'BC'),
(dagger.policy, 'green', 'DAgger'),
(sac.policy, 'blue', 'SAC'),
(None, 'firebrick', 'Expert')), seed=123, step=10, cols=4, rows=4)
plt.legend(bbox_to_anchor=(-.75, 0), ncols=4);
[ ]: