Train PPO agents with Stable Baselines3 using the RLX Gymnasium environment. Custom exit rules, reward shaping, and automatic dashboard generation.
A complete RL training pipeline with custom exit rules (max hold time, drawdown limits, and profit targets). It uses the `RlxEnv` Gymnasium environment, which wraps the Rust backtesting engine.
```python
import sys
import os
import traceback

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

from rlxbt import rlx, load_data, RlxEnv


def main():
    # Load Data
    print("Loading data...")
    data_path = "data/BTCUSDT_1h_with_indicators.csv"
    data = load_data(data_path)

    # Split Data (Train/Test)
    train_size = int(len(data) * 0.8)
    train_data = data.iloc[:train_size].reset_index(drop=True)
    test_data = data.iloc[train_size:].reset_index(drop=True)
    print(f"Training on {len(train_data)} bars, Testing on {len(test_data)} bars")

    # Create Environment with Exit Rules
    # Force exit after 24 hours (24 bars) or if drawdown > 5% or profit > 0.9%
    exit_rules = {
        "hold_bars": 24,
        "max_drawdown_percent": 5.0,
        "min_profit_percent": 0.9,
    }

    # License key automatically read from RLX_LICENSE_KEY env var
    env = DummyVecEnv(
        [lambda: RlxEnv(train_data, window_size=128, exit_rules=exit_rules)]
    )

    # Initialize Agent
    print("Initializing PPO Agent with Exit Rules...")
    # Increase entropy coefficient to encourage exploration
    model = PPO("MlpPolicy", env, verbose=1, ent_coef=0.05)

    # Train
    print("Starting training (300,000 steps)...")
    model.learn(total_timesteps=300_000)
    print("Training complete.")

    # Test on unseen data
    print("Running test on unseen data...")
    test_env = RlxEnv(test_data, window_size=128, exit_rules=exit_rules)
    obs, _ = test_env.reset()
    done = False
    total_reward = 0
    info = {}
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = test_env.step(int(action))
        done = terminated or truncated
        total_reward += reward

    print(f"Test Total Reward (PnL): ${total_reward:.2f}")
    print(f"Final Portfolio Value: ${info.get('portfolio_value', 0.0):.2f}")
    # Print Full Backtest Metrics
    if "total_return" in info:
        print("\nRL AGENT BACKTEST REPORT")
        print("===========================")
        print(f"Total Return: {info['total_return']:.2%}")
        print(f"Sharpe Ratio: {info.get('sharpe_ratio', 0.0):.2f}")
        print(f"Max Drawdown: {info.get('max_drawdown', 0.0):.2%}")
        print(f"Trades: {int(info.get('total_trades', 0))}")
        print("===========================")

    # Generate Dashboard
    try:
        print("\nGenerating Dashboard...")
        # Get the underlying Rust environment to access backtest result
        backtest_result = test_env.get_backtest_result()
        generator = rlx.DashboardGenerator(
            initial_capital=100000.0,
            commission=0.0,
            slippage=0.0,
            use_intrabar_resolution=False,
            contract_size=1.0,
        )
        dashboard_data = generator.generate_dashboard(backtest_result, test_data)

        # Export to JSON
        json_path = "rl_dashboard.json"
        generator.export_to_json(dashboard_data, json_path)

        # Launch Web Dashboard
        print("Launching Dashboard Server...")
        generator.plot(dashboard_data, port=8000, auto_open=True)
    except Exception as e:
        print(f"Failed to generate dashboard: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    main()
```

The `exit_rules` dictionary supports the following keys:

| Key | Description |
|-----|-------------|
| `hold_bars` | Maximum number of bars to hold a position |
| `max_drawdown_percent` | Close the position if the loss exceeds this percentage |
| `min_profit_percent` | Take profit once the gain reaches this percentage |
| `trailing_stop_percent` | Trailing stop measured from the position's high |
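For example, a rule set that relies on a trailing stop rather than a fixed profit target could look like the following sketch (the percentage values are illustrative, not recommendations):

```python
from rlxbt import RlxEnv

# Illustrative exit rules: hold limit, hard drawdown stop, and a trailing stop
exit_rules = {
    "hold_bars": 48,               # force exit after 48 bars
    "max_drawdown_percent": 4.0,   # close if the loss exceeds 4%
    "trailing_stop_percent": 1.5,  # trail the stop 1.5% below the position high
}

# train_data is the training split created in the script above
env = RlxEnv(train_data, window_size=128, exit_rules=exit_rules)
```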
The observation is a 2D array of shape `(window_size, 6)` containing normalized OHLCV data plus position info.
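A quick way to confirm the shape through the standard Gymnasium API used above:

```python
from rlxbt import RlxEnv, load_data

data = load_data("data/BTCUSDT_1h_with_indicators.csv")
env = RlxEnv(data, window_size=128, exit_rules={"hold_bars": 24})

obs, info = env.reset()
print(obs.shape)              # expected: (128, 6)
print(env.observation_space)  # the corresponding Gymnasium space
```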
To shape rewards beyond raw PnL, you can subclass `RlxEnv` and override its reward computation:

```python
import numpy as np

from rlxbt import RlxEnv


class CustomRlxEnv(RlxEnv):
    """Custom RL environment with shaped rewards."""

    def __init__(self, data, **kwargs):
        super().__init__(data, **kwargs)
        self.sharpe_window = []
        self.max_sharpe_len = 100

    def _compute_reward(self, pnl: float, info: dict) -> float:
        """Override reward computation."""
        base_reward = pnl

        # Penalize excessive trading
        trade_penalty = -0.001 if info.get("trade_made") else 0

        # Bonus for positive Sharpe contribution
        self.sharpe_window.append(pnl)
        if len(self.sharpe_window) > self.max_sharpe_len:
            self.sharpe_window.pop(0)

        if len(self.sharpe_window) >= 20:
            returns = np.array(self.sharpe_window)
            sharpe = returns.mean() / (returns.std() + 1e-8)
            sharpe_bonus = max(0, sharpe * 0.01)
        else:
            sharpe_bonus = 0

        return base_reward + trade_penalty + sharpe_bonus


# Usage
env = CustomRlxEnv(
    train_data,
    window_size=64,
    exit_rules={
        "hold_bars": 48,
        "max_drawdown_percent": 3.0,
    }
)
```
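The shaped-reward environment drops into the same SB3 pipeline as before; a minimal sketch, reusing `train_data` and the hyperparameters from the script above:

```python
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

# Wrap the custom environment exactly like the stock RlxEnv
vec_env = DummyVecEnv(
    [lambda: CustomRlxEnv(train_data, window_size=64, exit_rules={"hold_bars": 48})]
)
model = PPO("MlpPolicy", vec_env, verbose=1, ent_coef=0.05)
model.learn(total_timesteps=100_000)
```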
```bash
# Install RL dependencies
pip install stable-baselines3 shimmy gymnasium

# Or with conda
conda install -c conda-forge stable-baselines3
```
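After installation, a quick import check (assuming `rlxbt` itself is already installed) confirms the stack is available:

```python
import gymnasium
import stable_baselines3

from rlxbt import RlxEnv  # the Gymnasium environment used throughout this page

print("gymnasium", gymnasium.__version__)
print("stable-baselines3", stable_baselines3.__version__)
```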