Reinforcement Learning

Train PPO agents with Stable Baselines3 using the RLX Gymnasium environment. Custom exit rules, reward shaping, and automatic dashboard generation.

Algorithm: PPO
Training Steps: 300K
Observation Window: 128
Exit Rules: 3

PPO Agent Training

Advanced · ~100 lines · Stable Baselines3 · Gymnasium

Complete RL training pipeline with custom exit rules (max hold time, drawdown limits, profit targets). It uses the RlxEnv Gymnasium environment, which wraps the Rust backtesting engine.

train_rl_agent.py (full production example)
import traceback

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from rlxbt import rlx, load_data, RlxEnv


def main():
    # Load Data
    print("Loading data...")
    data_path = "data/BTCUSDT_1h_with_indicators.csv"
    data = load_data(data_path)

    # Split Data (Train/Test)
    train_size = int(len(data) * 0.8)
    train_data = data.iloc[:train_size].reset_index(drop=True)
    test_data = data.iloc[train_size:].reset_index(drop=True)

    print(f"Training on {len(train_data)} bars, Testing on {len(test_data)} bars")

    # Create Environment with Exit Rules
    # Force exit after 24 hours (24 bars) or if drawdown > 5% or profit > 0.9%
    exit_rules = {
        "hold_bars": 24,
        "max_drawdown_percent": 5.0,
        "min_profit_percent": 0.9,
    }

    # License key automatically read from RLX_LICENSE_KEY env var
    env = DummyVecEnv(
        [lambda: RlxEnv(train_data, window_size=128, exit_rules=exit_rules)]
    )

    # Initialize Agent
    print("Initializing PPO Agent with Exit Rules...")
    # Increase entropy coefficient to encourage exploration
    model = PPO("MlpPolicy", env, verbose=1, ent_coef=0.05)

    # Train
    print("Starting training (300,000 steps)...")
    model.learn(total_timesteps=300_000)

    print("Training complete.")

    # Test on unseen data
    print("Running test on unseen data...")
    test_env = RlxEnv(test_data, window_size=128, exit_rules=exit_rules)
    obs, _ = test_env.reset()
    terminated = truncated = False
    total_reward = 0
    info = {}

    while not (terminated or truncated):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = test_env.step(int(action))
        total_reward += reward

    print(f"Test Total Reward (PnL): ${total_reward:.2f}")
    print(f"Final Portfolio Value: ${info.get('portfolio_value', 0.0):.2f}")

    # Print Full Backtest Metrics
    if "total_return" in info:
        print("\nšŸ“Š RL AGENT BACKTEST REPORT")
        print("===========================")
        print(f"Total Return:  {info['total_return']:.2%}")
        print(f"Sharpe Ratio:  {info.get('sharpe_ratio', 0.0):.2f}")
        print(f"Max Drawdown:  {info.get('max_drawdown', 0.0):.2%}")
        print(f"Trades:        {int(info.get('total_trades', 0))}")
        print("===========================")

Generate Dashboard from RL Results

Dashboard generation
    # Generate Dashboard
    try:
        print("\nGenerating Dashboard...")
        
        # Get the underlying Rust environment to access backtest result
        backtest_result = test_env.get_backtest_result()

        generator = rlx.DashboardGenerator(
            initial_capital=100000.0,
            commission=0.0,
            slippage=0.0,
            use_intrabar_resolution=False,
            contract_size=1.0,
        )

        dashboard_data = generator.generate_dashboard(backtest_result, test_data)

        # Export to JSON
        json_path = "rl_dashboard.json"
        generator.export_to_json(dashboard_data, json_path)

        # Launch Web Dashboard
        print("Launching Dashboard Server...")
        generator.plot(dashboard_data, port=8000, auto_open=True)

    except Exception as e:
        print(f"Failed to generate dashboard: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    main()

RlxEnv Configuration

Exit Rules

hold_bars: Maximum number of bars to hold a position
max_drawdown_percent: Close the position if the loss exceeds this percentage
min_profit_percent: Take profit at this percentage gain
trailing_stop_percent: Trailing stop measured from the position's high
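
The main example's comment treats these as OR conditions: whichever rule triggers first closes the position. A minimal sketch combining all four in one environment, reusing the CSV path from the example above (the 2% trailing stop is an arbitrary illustrative value):

Combining all exit rules
from rlxbt import load_data, RlxEnv

data = load_data("data/BTCUSDT_1h_with_indicators.csv")

env = RlxEnv(
    data,
    window_size=128,
    exit_rules={
        "hold_bars": 24,               # force exit after 24 bars
        "max_drawdown_percent": 5.0,   # cut the position if it loses more than 5%
        "min_profit_percent": 0.9,     # take profit at +0.9%
        "trailing_stop_percent": 2.0,  # illustrative value: trail 2% below the position's high
    },
)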

Action Space

0: Hold / Do nothing
1: Buy / Go Long
2: Sell / Go Short
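
Before training, a random policy is a useful sanity check and baseline: it exercises all three actions, and its total reward is the number a trained agent should beat. A sketch using the standard Gymnasium action_space.sample() call, with exit rules mirroring the main example:

Random-policy baseline
from rlxbt import load_data, RlxEnv

data = load_data("data/BTCUSDT_1h_with_indicators.csv")
env = RlxEnv(data, window_size=128, exit_rules={"hold_bars": 24})

obs, _ = env.reset()
terminated = truncated = False
total_reward = 0.0

while not (terminated or truncated):
    action = env.action_space.sample()  # uniformly random 0 / 1 / 2
    obs, reward, terminated, truncated, info = env.step(int(action))
    total_reward += reward

print(f"Random-policy total reward: {total_reward:.2f}")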

Observation Space (128 × 6)

The observation is a 2D array of shape (window_size, 6) containing normalized OHLCV data plus position info.

Channels 0-3: Normalized OHLC prices
Channel 4: Normalized volume
Channel 5: Current position (-1, 0, 1)
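
To verify what the agent actually sees, reset the environment and inspect the observation directly. A sketch that assumes rows are ordered oldest to newest, so the last row is the most recent bar (check this against your data before relying on it):

Inspecting the observation
from rlxbt import load_data, RlxEnv

data = load_data("data/BTCUSDT_1h_with_indicators.csv")
env = RlxEnv(data, window_size=128, exit_rules={"hold_bars": 24})

obs, _ = env.reset()
print(obs.shape)   # (128, 6): window_size rows x 6 channels
latest = obs[-1]   # assumption: last row is the most recent bar
print(latest[:4])  # channels 0-3: normalized OHLC
print(latest[4])   # channel 4: normalized volume
print(latest[5])   # channel 5: current position (-1 short, 0 flat, 1 long)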

Advanced: Custom Reward Shaping

Custom RlxEnv with reward shaping
from rlxbt import RlxEnv
import numpy as np


class CustomRlxEnv(RlxEnv):
    """Custom RL environment with shaped rewards."""
    
    def __init__(self, data, **kwargs):
        super().__init__(data, **kwargs)
        self.sharpe_window = []
        self.max_sharpe_len = 100
    
    def _compute_reward(self, pnl: float, info: dict) -> float:
        """Override reward computation."""
        base_reward = pnl
        
        # Penalize excessive trading
        trade_penalty = -0.001 if info.get("trade_made") else 0
        
        # Bonus for positive Sharpe contribution
        self.sharpe_window.append(pnl)
        if len(self.sharpe_window) > self.max_sharpe_len:
            self.sharpe_window.pop(0)
        
        if len(self.sharpe_window) >= 20:
            returns = np.array(self.sharpe_window)
            sharpe = returns.mean() / (returns.std() + 1e-8)
            sharpe_bonus = max(0, sharpe * 0.01)
        else:
            sharpe_bonus = 0
        
        return base_reward + trade_penalty + sharpe_bonus


# Usage
env = CustomRlxEnv(
    train_data,
    window_size=64,
    exit_rules={
        "hold_bars": 48,
        "max_drawdown_percent": 3.0,
    }
)
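
The shaped environment drops straight into the same Stable Baselines3 pipeline as the main example. A minimal sketch, reusing train_data from above (the timestep count and checkpoint name are arbitrary):

Training PPO on the custom environment
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

vec_env = DummyVecEnv([lambda: CustomRlxEnv(
    train_data,
    window_size=64,
    exit_rules={"hold_bars": 48, "max_drawdown_percent": 3.0},
)])

model = PPO("MlpPolicy", vec_env, verbose=1, ent_coef=0.05)
model.learn(total_timesteps=100_000)
model.save("ppo_custom_rlx")  # standard SB3 checkpoint for later evaluation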

Required Dependencies

Installation
# Install RL dependencies
pip install stable-baselines3 shimmy gymnasium

# Or with conda
conda install -c conda-forge stable-baselines3