How to get the same predictions in RL

I have implemented a basic trading model in RL, and every time I run it, it gives different values of profit and reward, even though I am setting a seed value.

How can we make sure it gives the same values every time? The following is the code:

from sklearn.preprocessing import MinMaxScaler,StandardScaler
# for autoformatting
# %load_ext jupyter_black
#!apt-get install ffmpeg freeglut3-dev xvfb  # For visualization
!pip install "stable-baselines3[extra]>=2.0.0a4" gym-anytrading gym

Imports

Stable-Baselines3 works on environments that follow the gym interface.
You can find a list of available environments here.

Not all algorithms work with all action spaces; you can find more in this recap table.
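
For reference, the gym interface only requires reset() and step() with the usual return values. A minimal interaction loop (just a sketch, using CartPole-v1 as a stand-in environment and assuming the older gym API that the rest of this notebook uses) looks like this:

import gym

env = gym.make("CartPole-v1")        # any registered environment id works here
obs = env.reset()                    # older gym API: reset returns only the observation
done = False
while not done:
    action = env.action_space.sample()            # random action, just to exercise the interface
    obs, reward, done, info = env.step(action)    # older gym API: four return values
env.close()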


import gym
import gym_anytrading

# Stable baselines - rl stuff
from stable_baselines3 import A2C,PPO
from stable_baselines3.common.vec_env import DummyVecEnv

# Processing libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

print(np.__version__)
!pip install yfinance

The first thing you need to import is the RL model; check the documentation to know which one you can use on which problem.

import yfinance as yf

ticker_symbol = "AAPL"  # Example: Apple Inc.

# Set the start and end dates for the data
start_date = "2010-01-01"
end_date = "2023-05-31"

# Retrieve the data using yfinance
yf_data = yf.download(ticker_symbol, start=start_date, end=end_date)
yf_data.index
# Convert the data to a Pandas dataframe
df = pd.DataFrame(yf_data)
df.index = pd.to_datetime(df.index)

df
env = gym.make('stocks-v0', df=df, frame_bound=(10,len(df)-int(len(df)/2)), window_size=10)

seed_value = 42
env.seed(seed_value)

#env.signal_features
!pip install finta

from gym_anytrading.envs import StocksEnv
from finta import TA

The next thing you need to import is the policy class that will be used to create the networks (for the policy/value functions).
This step is optional as you can directly use strings in the constructor:

PPO('MlpPolicy', env) instead of PPO(MlpPolicy, env)

Note that some algorithms like SAC have their own MlpPolicy; that's why using a string for the policy is the recommended option.
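
For example (a minimal sketch; env here is the environment created earlier in this notebook):

from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy

model_a = PPO("MlpPolicy", env)   # string form: resolves to PPO's own MlpPolicy
model_b = PPO(MlpPolicy, env)     # class form: equivalent for PPO, but easy to mix up across algorithms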

#df['SMA'] = TA.SMA(df, 12)
#df['RSI'] = TA.RSI(df)
#df['OBV'] = TA.OBV(df)
df['MACD'] = TA.MACD(df).MACD
df['SIGNAL'] = TA.MACD(df).SIGNAL
df.fillna(0, inplace=True)
TA.MACD(df)
df.head(15)
def add_signals(env):
    start = env.frame_bound[0] - env.window_size
    end = env.frame_bound[1]
    prices = env.df.loc[:, 'Close'].to_numpy()[start:end]
    sc = MinMaxScaler(feature_range=(0,1))
    #sc = StandardScaler()
    #signal_features = env.df.loc[:, ['Close', 'Volume','SMA', 'RSI', 'OBV','MACD','SIGNAL']].to_numpy()[start:end]
    signal_features = sc.fit_transform(env.df.loc[:, ['Volume','MACD','SIGNAL']])[start:end] #env.df.loc[:, ['Volume','MACD','SIGNAL']].to_numpy()[start:end]
    return prices, signal_features
class MyCustomEnv(StocksEnv):
    _process_data = add_signals

env2 = MyCustomEnv(df=df, frame_bound=(10,len(df)-int(len(df)/2)), window_size=10)
env_maker = lambda: env2
env = DummyVecEnv([env_maker])
!pip install tensorrt
%load_ext tensorboard

%tensorboard --logdir ./logs

import IPython

display(IPython.display.HTML('''
<button id='open_tb'>Open TensorBoard</button>
<button id='hide_tb'>Hide TensorBoard</button>
<script>document.querySelector('#open_tb').onclick = () => { window.open(document.querySelector('iframe').src, "__blank") }
        document.querySelector('#hide_tb').onclick = () => { document.querySelector('iframe').style.display = "none" }</script>'''))
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
import os

save_path = os.path.join('Training', 'Saved Models')
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=190, verbose=1)
eval_callback = EvalCallback(env,
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)

model = A2C('MlpPolicy', env, verbose=1,tensorboard_log="./logs")
model.learn(total_timesteps=100000,callback=eval_callback)
env = MyCustomEnv(df=df, frame_bound=(int(len(df)/2)+10,len(df)), window_size=10)
seed_value = 42
env.seed(seed_value)
obs = env.reset()
while True:
    obs = obs[np.newaxis, ...]
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    if done:
        print("info", info)
        break
plt.figure(figsize=(15,6))
plt.cla()
env.render_all()
plt.show()

I found that env.seed does not seed the agent's action space, which may be why you are not getting the same results.

This might help you:
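
Something along these lines should do it. This is only a sketch using the names from your code; set_random_seed and the deterministic flag are extra suggestions for pinning down the remaining randomness, not something your snippet already uses:

from stable_baselines3.common.utils import set_random_seed

seed_value = 42

# Seed Python's random, NumPy and PyTorch, which SB3 uses internally
set_random_seed(seed_value)

# Seed the environment and, separately, its action-space sampler
env.seed(seed_value)
env.action_space.seed(seed_value)

# At evaluation time, ask the policy for its deterministic action
# instead of sampling from the action distribution
action, _states = model.predict(obs, deterministic=True)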


Thanks a ton, that works. I am getting the same results 99% of the time; only once in a while do I get different results.

I am always getting Total Reward as 0, and Total Profit always comes out to 1. Is that because of the seed or some other reason?

Does changing the seed value vary the results?

I think it stops working again if I make it learn for more steps. The code below is the same as in my first post, with two changes:

env.action_space.seed(seed_value) is now called right after each env.seed(seed_value) call:

seed_value = 42
env.seed(seed_value)
env.action_space.seed(seed_value)

and the model is trained for ten times as many steps:

model.learn(total_timesteps=1000000, callback=eval_callback)

Everything else is the same as in the first post.

Sorry, I have no idea why that isn’t working.