Parallel SAC with MLP#

We will try combinations of:

  • exchanging 1 bit vs 1 float

  • adding or not the neighbor term to the reward (the trained configurations are sketched just below).
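
The configurations trained below differ only in two knobs: whether the comm sensor binarizes the exchanged message, and the neighbor weight of the reward. Purely as an illustrative sketch (not an executed cell):

[ ]:
# The two knobs varied in this notebook (illustrative, not executed):
# - binarize: whether the exchanged message is reduced to a single bit
# - neighbor_weight: weight of the neighbor term in the reward (0 = no sharing)
configs = [
    {'binarize': False, 'neighbor_weight': 0.0},  # float message, no reward sharing
    {'binarize': False, 'neighbor_weight': 0.5},  # float message, reward sharing
    {'binarize': True, 'neighbor_weight': 0.5},   # binary message, reward sharing
]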

[14]:
%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

You can skip training and instead load the last trained policy by changing the flag below.

[15]:
from navground.learning.utils.jupyter import skip_if, run_if

training = True
[16]:
from navground.learning import DefaultObservationConfig

from navground.learning.parallel_env import make_vec_from_penv
from navground.learning.config.control_action_with_comm import ControlActionWithCommConfig
from stable_baselines3.common.vec_env import VecMonitor
from navground.learning.examples.pad import get_env, comm, marker, PadReward

comm_size = 1
action = ControlActionWithCommConfig(use_acceleration_action=True, max_acceleration=1,
                                     fix_orientation=True, comm_size=comm_size)
observation = DefaultObservationConfig(flat=False, include_velocity=True, include_target_direction=False)

Floating-point message, without reward sharing#

[17]:
name = "DistributedCommFloatWithoutRewardSharing"
sensors = [comm(size=comm_size, binarize=False), marker()]
train_env = get_env(action=action, observation=observation,
                    sensors=sensors, start_in_opposite_sides=False,
                    reward=PadReward(neighbor_weight=0))
train_venv = VecMonitor(make_vec_from_penv(train_env, num_envs=4))
test_env = get_env(action=action, observation=observation,
                   sensors=sensors, start_in_opposite_sides=True)
test_venv = VecMonitor(make_vec_from_penv(test_env, num_envs=4))
[18]:
test_venv.observation_space, train_venv.observation_space
[18]:
(Dict('neighbor/comm': Box(-1.0, 1.0, (1,), float32), 'pad/x': Box(-1.0, 1.0, (1,), float32), 'ego_velocity': Box(-0.14, 0.14, (1,), float32)),
 Dict('neighbor/comm': Box(-1.0, 1.0, (1,), float32), 'pad/x': Box(-1.0, 1.0, (1,), float32), 'ego_velocity': Box(-0.14, 0.14, (1,), float32)))
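
As a quick additional check (not shown in the original run), the action space should pair the acceleration command with the 1-dimensional transmitted message; we can inspect it through the standard Gymnasium attribute:

[ ]:
# Each agent outputs an acceleration and a 1-dimensional message
train_venv.action_space, test_venv.action_space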
[19]:
%%skip_if $training

import pathlib, os
from stable_baselines3 import SAC

log_float = max(pathlib.Path(f'logs/{name}/SAC').glob('*'), key=os.path.getmtime)
[ ]:
%%run_if $training

from stable_baselines3 import SAC
from datetime import datetime as dt
from stable_baselines3.common.logger import configure
from navground.learning.utils.sb3 import callbacks
from navground.learning.scenarios.pad import render_kwargs

kwargs = render_kwargs(comm=True, low=-1, high=1, binarize=False,
                       color_low=(1, 0, 0), color_high=(0, 1, 0))
model_float = SAC("MultiInputPolicy", train_venv, verbose=0, use_sde=False)
stamp = dt.now().strftime("%Y%m%d_%H%M%S")
log_float = f"logs/{name}/SAC/{stamp}"
model_float.set_logger(configure(log_float, ["csv", "tensorboard"]))
cbs = callbacks(venv=test_venv, best_model_save_path=log_float,
                eval_freq=500, export_to_onnx=True, **kwargs)
log_float
'logs/DistributedCommFloatWithoutRewardSharing/SAC/20250521_140207'
[21]:
%%run_if $training

model_float.learn(total_timesteps=300_000, reset_num_timesteps=False, log_interval=10, callback=cbs)
model_float.num_timesteps
[21]:
300000
[22]:
from navground.learning.utils.sb3 import plot_eval_logs

plot_eval_logs(log_float, reward_low=-200, reward_high=0, success=True, figsize=(8, 3))
[figure: evaluation reward and success during training]
[23]:
from stable_baselines3.common.evaluation import evaluate_policy

best_model_float = SAC.load(f'{log_float}/best_model')
evaluate_policy(best_model_float.policy, test_venv, n_eval_episodes=100)
[23]:
(-26.62414, 40.722)
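
The standard deviation is fairly large. As a sketch (not part of the original run), evaluate_policy can also return the individual episode rewards, which makes the spread easier to inspect:

[ ]:
import numpy as np

# Per-episode rewards instead of just their mean and standard deviation
rewards, lengths = evaluate_policy(best_model_float.policy, test_venv,
                                   n_eval_episodes=100, return_episode_rewards=True)
np.mean(rewards), np.min(rewards), np.max(rewards)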
[27]:
from navground.learning.evaluation.video import display_episode_video

display_episode_video(test_env, policy=best_model_float.policy, factor=4, seed=1, **kwargs)
[27]:
[28]:
from navground.learning.utils.plot import plot_policy

plot_policy(best_model_float.policy,
            variable={'pad/x': (-1, 1), 'neighbor/comm': (-1, 1)},
            fix={'ego_velocity': 0.1},
            actions={0: 'acceleration', 1: 'message'},
            width=5, height=3)
[figure: policy outputs (acceleration and message) as a function of pad/x and neighbor/comm]

Let us record the communication during a run and plot it.

[31]:
from navground.learning.evaluation.experiment import make_experiment_with_env
from navground.learning.probes.comm import CommProbe
from matplotlib import pyplot as plt
import numpy as np

exp = make_experiment_with_env(test_env, policy=best_model_float.policy)
exp.add_record_probe('tx', CommProbe)
run = exp.run_once(seed=1)
plt.figure(figsize=(8, 3))
for i, color in enumerate(('gold', 'cyan')):
    plt.plot(np.asarray(run.records['tx'])[:, i, 0], color=color, label=f"TX by #{i}")
plt.legend()
plt.ylabel("TX")
plt.title("Broadcasted message during run");
[figure: message broadcast by each agent during the run]

Floating-point message, with reward sharing#

[33]:
name = "DistributedCommFloatWithRewardSharing"
train_env = get_env(action=action, observation=observation,
                    sensors=sensors, start_in_opposite_sides=False,
                    reward=PadReward(neighbor_weight=0.5))
train_venv = VecMonitor(make_vec_from_penv(train_env, num_envs=4))
[34]:
%%skip_if $training

log_float_sharing = max(pathlib.Path(f'logs/{name}/SAC').glob('*'), key=os.path.getmtime)
[35]:
%%run_if $training

model_float_sharing = SAC("MultiInputPolicy", train_venv, verbose=0, use_sde=False)
stamp = dt.now().strftime("%Y%m%d_%H%M%S")
log_float_sharing = f"logs/{name}/SAC/{stamp}"
model_float_sharing.set_logger(configure(log_float_sharing, ["csv", "tensorboard"]))
cbs = callbacks(venv=test_venv, best_model_save_path=log_float_sharing,
                eval_freq=500, export_to_onnx=True, **kwargs)
log_float_sharing
[35]:
'logs/DistributedCommFloatWithRewardSharing/SAC/20250521_141550'
[36]:
%%run_if $training

model_float_sharing.learn(total_timesteps=300_000, reset_num_timesteps=False,
                          log_interval=10, callback=cbs)
model_float_sharing.num_timesteps
[36]:
300000
[37]:
plot_eval_logs(log_float_sharing, reward_low=-200, reward_high=0,
               success=True, figsize=(8, 3))
[figure: evaluation reward and success during training]
[38]:
best_model_float_sharing = SAC.load(f'{log_float_sharing}/best_model')
evaluate_policy(best_model_float_sharing.policy, test_venv, n_eval_episodes=100)
[38]:
(-36.095158, 25.466118)
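
To compare the two float-message policies side by side on the same test environment, a small sketch reusing the models loaded above:

[ ]:
# Evaluate both float-message policies on the same test environment
for label, model in (('no sharing', best_model_float), ('reward sharing', best_model_float_sharing)):
    mean, std = evaluate_policy(model.policy, test_venv, n_eval_episodes=100)
    print(f'{label}: {mean:.1f} ± {std:.1f}')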
[39]:
display_episode_video(test_env, policy=best_model_float_sharing.policy, factor=4, seed=1, **kwargs)
[39]:
[40]:
plot_policy(best_model_float_sharing.policy,
            variable={'pad/x': (-1, 1), 'neighbor/comm': (-1, 1)},
            fix={'ego_velocity': 0.1},
            actions={0: 'acceleration', 1: 'message'},
            width=5, height=3)
[figure: policy outputs (acceleration and message) as a function of pad/x and neighbor/comm]

Binary message, with reward sharing#

[68]:
name = "DistributedCommBinaryWithRewardSharing"
sensors_bin = [comm(size=comm_size, binarize=True), marker()]
train_env_bin = get_env(action=action, observation=observation,
                        sensors=sensors_bin, start_in_opposite_sides=False,
                        reward=PadReward(neighbor_weight=0.5))
train_venv_bin = VecMonitor(make_vec_from_penv(train_env_bin, num_envs=4))
test_env_bin = get_env(action=action, observation=observation,
                       sensors=sensors_bin, start_in_opposite_sides=True)
test_venv_bin = VecMonitor(make_vec_from_penv(test_env_bin, num_envs=4))
[69]:
test_venv_bin.observation_space
[69]:
Dict('neighbor/comm': Box(0, 1, (1,), int8), 'pad/x': Box(-1.0, 1.0, (1,), float32), 'ego_velocity': Box(-0.14, 0.14, (1,), float32))
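
Compared to the float configuration, only the neighbor/comm field changes: the received message is now a single integer in {0, 1}. As a quick check (assuming the Gymnasium Dict space supports indexing by key):

[ ]:
# The received message is now a single bit instead of a float in [-1, 1]
test_venv.observation_space['neighbor/comm'], test_venv_bin.observation_space['neighbor/comm']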
[70]:
%%skip_if $training

log_bin_sharing = max(pathlib.Path(f'logs/{name}/SAC').glob('*'), key=os.path.getmtime)
[71]:
%%run_if $training

model_bin_sharing = SAC("MultiInputPolicy", train_venv_bin, verbose=0, use_sde=False)
stamp = dt.now().strftime("%Y%m%d_%H%M%S")
log_bin_sharing = f"logs/{name}/SAC/{stamp}"
model_bin_sharing.set_logger(configure(log_bin_sharing, ["csv", "tensorboard"]))
kwargs_bin = render_kwargs(comm=True, low=0, high=1,
                           color_low=(1, 0, 0), color_high=(0, 1, 0))
cbs = callbacks(venv=test_venv_bin, best_model_save_path=log_bin_sharing,
                eval_freq=500, export_to_onnx=True, **kwargs_bin)
log_bin_sharing
[71]:
'logs/DistributedCommBinaryWithRewardSharing/SAC/20250521_144105'
[72]:
%%run_if $training

model_bin_sharing.learn(total_timesteps=100_000, reset_num_timesteps=False,
                        log_interval=10, callback=cbs)
model_bin_sharing.num_timesteps
[72]:
100000
[73]:
plot_eval_logs(log_bin_sharing, reward_low=-200, reward_high=0,
               success=True, figsize=(8, 3))
[figure: evaluation reward and success during training]
[77]:
best_model_bin_sharing = SAC.load(f'{log_bin_sharing}/best_model')
evaluate_policy(best_model_bin_sharing.policy, test_venv_bin, n_eval_episodes=100)
[77]:
(-209.73132, 122.59714)
[81]:
display_episode_video(test_env_bin, policy=best_model_bin_sharing.policy, factor=4,
                      seed=1, **kwargs_bin)
[81]:
[82]:
plot_policy(best_model_bin_sharing.policy,
            variable={'pad/x': (-1, 1)},
            fix={'ego_velocity': 0.07,  'neighbor/comm': [0, 1]},
            colors=['red', 'green'],
            actions={0: 'acceleration', 1: 'message'},
            width=5, height=3)
[figure: policy outputs as a function of pad/x, for neighbor/comm = 0 and 1]
[83]:
exp = make_experiment_with_env(test_env_bin, policy=best_model_bin_sharing.policy)
exp.add_record_probe('tx', CommProbe)
run = exp.run_once(seed=1)
plt.figure(figsize=(8, 3))
for i, color in enumerate(('gold', 'cyan')):
    tx = np.asarray(run.records['tx'])[:, i, 0]
    rx = (tx > 0).astype(int)
    plt.plot(rx, color=color, label=f"TX by #{i}")
plt.legend()
plt.ylabel("TX")
plt.title("Binarized broadcasted message during run");
[figure: binarized message broadcast by each agent during the run]
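
As a small numeric complement to the plot, we can compute the fraction of steps in which each agent broadcasts a 1, reusing the recorded run:

[ ]:
# Fraction of steps in which each agent broadcasts a "1"
tx_all = np.asarray(run.records['tx'])[:, :, 0]
(tx_all > 0).mean(axis=0)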

Summary#

From this (very) limited experiment, we can tentatively draw some general observations:

  • without reward sharing, the agents do not learn to coordinate.

  • with reward sharing, they do learn to coordinate, but training is very unstable (which is not surprising).

  • the best models learned with reward sharing behave sensibly: an agent signals its presence near the pad.

  • there is no significant advantage in sharing a float rather than a bit (at least in this configuration, where binarization happens in the observation, while actions remain continuous).

[ ]: