Elnagdi, Murad: Predictive Safety in Reinforcement Learning : From MPC-Guidance to Learned Regulators. - Bonn, 2026. - Dissertation, Rheinische Friedrich-Wilhelms-Universität Bonn.
Online-Ausgabe in bonndoc: https://nbn-resolving.org/urn:nbn:de:hbz:5-90538
% PhD thesis record (repository export, cleaned up).
% Identifiers (doi, urn) are stored bare, without resolver URL prefixes.
% The summary text is kept in the abstract field rather than note, so
% bibliography styles do not print it as part of the reference.
@phdthesis{handle:20.500.11811/14201,
  author   = {Elnagdi, Murad},
  title    = {Predictive Safety in Reinforcement Learning: From {MPC}-Guidance to Learned Regulators},
  school   = {Rheinische Friedrich-Wilhelms-Universit{\"a}t Bonn},
  address  = {Bonn},
  year     = 2026,
  month    = jun,
  doi      = {10.48565/bonndoc-880},
  urn      = {urn:nbn:de:hbz:5-90538},
  url      = {https://hdl.handle.net/20.500.11811/14201},
  abstract = {Autonomous robots operating in unstructured environments require control policies that achieve their tasks reliably while respecting safety constraints. While Reinforcement Learning (RL) has emerged as a powerful data-driven paradigm for optimizing these policies through interaction, its application in robotics is hindered by three fundamental barriers during exploration: it is unstable under sparse rewards, since feedback signals are rare or delayed until the task is solved; it is unsafe, risking damage to the robot and its surroundings; and it is sample-inefficient, as it requires extensive interaction with the environment. This thesis addresses these challenges by establishing prediction as a framework for exploration. We combine model predictive control (MPC) and predicted safety signals to guide data collection, constrain risk, and preserve task performance during training and execution of RL agents. Our methodological progression begins by addressing the inefficiency of random exploration in sparse reward settings. We introduce a hybrid training framework that utilizes MPC as a synthetic expert to guide the agent through complex navigation tasks. This approach accelerates convergence by alternating planned trajectories with trial-and-error experience, yielding a lightweight policy for independent deployment. Transitioning to the safety-critical domain of multi-robot systems, we subsequently employ predictive control as a distributed safety filter. We develop a scalable behavior-based formation controller secured by distributed nonlinear MPC shields, which ensures collision-free training, faster convergence, and enables zero-shot transfer to larger teams and to physical hardware. However, relying on static safety shields often induces conservative behavior that hinders learning. To overcome this limitation, we propose a dynamic safety shield that utilizes a supervisor agent to adapt constraint parameters online. By tuning the shield's sensitivity to the environment, we reduce solver failures and prevent the conservative behaviors typical of static shields. Ultimately, to eliminate the computational bottleneck of running optimization solvers at runtime, we transfer these predictive principles into a modular action regulator. This learned mechanism uses cost critics to preemptively adjust actions based on predicted risk, decoupling safety enforcement from reward maximization. Collectively, these studies show that combining short-horizon planning with learned risk estimation makes RL safer and more sample-efficient without sacrificing task performance. The proposed methods are evaluated in extensive simulation, with the MPC-based frameworks further validated on physical robots to demonstrate their robustness under real-world uncertainties.},
}





