Our paper on constrained reinforcement learning via dissipative saddle flow dynamics [1] has been accepted to the 56th Asilomar Conference on Signals, Systems, and Computers. Congrats Tianqi!
[Bibtex] [Abstract] [Download PDF]
In constrained reinforcement learning (C-RL), an agent seeks to learn from the environment a policy that maximizes the expected cumulative reward while satisfying minimum requirements in secondary cumulative reward constraints. Several algorithms rooted in sampled-based primal-dual methods have been recently proposed to solve this problem in policy space. However, such methods are based on stochastic gradient descent-ascent algorithms whose trajectories are connected to the optimal policy only after a mixing output stage that depends on the algorithm’s history. As a result, there is a mismatch between the behavioral policy and the optimal one. In this work, we propose a novel algorithm for constrained RL that does not suffer from these limitations. Leveraging recent results on regularized saddle-flow dynamics, we develop a novel stochastic gradient descent-ascent algorithm whose trajectories converge to the optimal policy almost surely.
@inproceedings{zym2022asilomar,
  abstract   = {In constrained reinforcement learning (C-RL), an agent seeks to learn from the environment a policy that maximizes the expected cumulative reward while satisfying minimum requirements in secondary cumulative reward constraints. Several algorithms rooted in sampled-based primal-dual methods have been recently proposed to solve this problem in policy space. However, such methods are based on stochastic gradient descent-ascent algorithms whose trajectories are connected to the optimal policy only after a mixing output stage that depends on the algorithm's history. As a result, there is a mismatch between the behavioral policy and the optimal one. In this work, we propose a novel algorithm for constrained RL that does not suffer from these limitations. Leveraging recent results on regularized saddle-flow dynamics, we develop a novel stochastic gradient descent-ascent algorithm whose trajectories converge to the optimal policy almost surely.},
  author     = {Zheng, Tianqi and You, Pengcheng and Mallada, Enrique},
  bdsk-url-3 = {https://doi.org/10.1109/IEEECONF56349.2022.10052060},
  booktitle  = {56th Asilomar Conference on Signals, Systems, and Computers},
  doi        = {10.1109/IEEECONF56349.2022.10052060},
  grants     = {CAREER-1752362, TRIPODS-1934979, CPS-2136324},
  month      = dec,
  pages      = {1362--1366},
  record     = {presented Dec. 2022, accepted Sep. 2022, submitted Apr. 2022},
  title      = {Constrained Reinforcement Learning via Dissipative Saddle Flow Dynamics},
  url        = {https://mallada.ece.jhu.edu/pubs/2022-Asilomar-ZYM.pdf},
  year       = {2022}
}