Hello, I have tried to build a Deep Q-Network for Gym's MountainCar-v0 using the same ideas as in the course, but training is very slow and mostly does not converge even after 1000 episodes. What is the error?
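For completeness, these are the imports and environment setup the rest of the code assumes (reconstructed to match the snippets below, so they may differ slightly from my actual notebook; the experience namedtuple follows the course lab):

# Imports and MountainCar-v0 setup (roughly what is at the top of my notebook).
import random
import time
from collections import deque, namedtuple

import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.losses import MSE
from tensorflow.keras.optimizers import Adam

env = gym.make('MountainCar-v0')           # old gym API: reset() returns state, step() returns 4 values
state_size = env.observation_space.shape   # (2,) -> position, velocity
num_actions = env.action_space.n           # 3 -> push left, no push, push right

# One stored transition for the replay buffer.
experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])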
# Hyperparameters:
MEMORY_SIZE = 100_000          # replay buffer capacity
GAMMA = 0.995                  # discount factor
NUM_STEPS_FOR_UPDATE = 16      # do a learning step every 16 environment steps
max_num_timesteps = 200        # max steps per episode
num_p_av = 20                  # number of episodes used for the moving average
E_DECAY = 0.995                # epsilon decay per episode
E_MIN = 0.01                   # minimum epsilon
TAU = 1e-3                     # soft-update parameter
MINIBATCH_SIZE = 3             # transitions sampled per learning step
lrn_rate = 1e-4                # Adam learning rate
regulize = 0.77                # L2 factor (the regularizers below are currently commented out)
def build_model(state_size, num_actions):
    model = Sequential([
        Input(shape=state_size),
        Dense(units=64, activation='relu'),   # ,kernel_regularizer=tf.keras.regularizers.l2(regulize)
        Dense(units=64, activation='relu'),   # ,kernel_regularizer=tf.keras.regularizers.l2(regulize)
        Dense(units=64, activation='relu'),   # ,kernel_regularizer=tf.keras.regularizers.l2(regulize)
        Dense(units=num_actions, activation='linear')
    ])
    return model
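Just to confirm the architecture, a quick check I sometimes run (a sketch; state_size and num_actions come from the setup above):

# Hypothetical check: print the architecture of one network.
m = build_model(state_size, num_actions)
m.summary()   # three Dense(64, relu) hidden layers + a linear output of size num_actions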
q_network = build_model(state_size, num_actions)
target_q_network = build_model(state_size, num_actions)
optimizer = Adam(learning_rate=lrn_rate)
def compute_loss(experiences, gamma, q_network, target_q_network):
    states, actions, rewards, next_states, done_vals = experiences
    # Max predicted Q-value of the next states from the target network (output layer is linear, not sigmoid).
    max_qsa = tf.reduce_max(target_q_network(next_states), axis=-1)
    # Bellman target: y = r + gamma * max_a' Q_target(s', a'), with the bootstrap term zeroed on terminal states.
    y_targets = rewards + (gamma * max_qsa * (1 - done_vals))
    # Q-values of the actions actually taken, selected from the (batch, num_actions) output.
    q_values = q_network(states)
    q_values = tf.gather_nd(q_values, tf.stack([tf.range(q_values.shape[0]),
                                                tf.cast(actions, tf.int32)], axis=1))
    loss = MSE(y_targets, q_values)
    return loss
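To make sure the target/gather logic is right, I check compute_loss on a fake batch like this (a sketch; the random batch below is made up purely for a shape check):

# Hypothetical sanity check for compute_loss with a fake minibatch.
fake_states      = tf.random.uniform((MINIBATCH_SIZE, 2))                      # (position, velocity)
fake_actions     = tf.constant(np.random.randint(0, num_actions, size=MINIBATCH_SIZE).astype(np.float32))
fake_rewards     = tf.constant([-1.0] * MINIBATCH_SIZE)
fake_next_states = tf.random.uniform((MINIBATCH_SIZE, 2))
fake_dones       = tf.zeros((MINIBATCH_SIZE,))

fake_batch = (fake_states, fake_actions, fake_rewards, fake_next_states, fake_dones)
print(compute_loss(fake_batch, GAMMA, q_network, target_q_network))   # should print a scalar loss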
def sample_Unpack(memory_buffer):
    experiences = random.sample(memory_buffer, k=MINIBATCH_SIZE)
    states = tf.convert_to_tensor(
        np.array([e.state for e in experiences if e is not None]), dtype=tf.float32
    )
    actions = tf.convert_to_tensor(
        np.array([e.action for e in experiences if e is not None]), dtype=tf.float32
    )
    rewards = tf.convert_to_tensor(
        np.array([e.reward for e in experiences if e is not None]), dtype=tf.float32
    )
    next_states = tf.convert_to_tensor(
        np.array([e.next_state for e in experiences if e is not None]), dtype=tf.float32
    )
    done_vals = tf.convert_to_tensor(
        np.array([e.done for e in experiences if e is not None]).astype(np.uint8),
        dtype=tf.float32,
    )
    return (states, actions, rewards, next_states, done_vals)
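And a quick check of the sampler with a few random transitions (a sketch using the same old gym step API as the training loop below):

# Hypothetical test: fill a small buffer with random transitions and sample once.
buf = deque(maxlen=MEMORY_SIZE)
s = env.reset()
while len(buf) < MINIBATCH_SIZE:
    a = env.action_space.sample()
    s2, r, d, _ = env.step(a)
    buf.append(experience(s, a, r, s2, d))
    s = env.reset() if d else s2
batch = sample_Unpack(buf)
print([x.shape for x in batch])   # expect (MINIBATCH_SIZE, 2), (MINIBATCH_SIZE,), (MINIBATCH_SIZE,), (MINIBATCH_SIZE, 2), (MINIBATCH_SIZE,)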
def get_action(q_values, epsilon=0.0):
    if random.random() > epsilon:
        return np.argmax(q_values.numpy()[0])
    else:
        return random.choice(np.arange(num_actions))
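For completeness, this is how I checked get_action on a single state (a sketch; note q_values must be the batched (1, num_actions) network output):

# Hypothetical single-state check of the epsilon-greedy policy.
s0 = env.reset()
q_vals = q_network(np.expand_dims(s0, axis=0))   # shape (1, num_actions)
print(get_action(q_vals, epsilon=1.0))           # always random
print(get_action(q_vals, epsilon=0.0))           # greedy (argmax)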
start = time.time()
def getModel(num_episodes):
    epsilon = 1.0
    start = time.time()
    total_point_history = []
    memory_buffer = deque(maxlen=MEMORY_SIZE)
    target_q_network.set_weights(q_network.get_weights())
    for i in range(num_episodes):
        state = env.reset()
        total_points = 0
        for t in range(max_num_timesteps):   # MountainCar-v0 is time-limited to 200 steps anyway
            # Epsilon-greedy action from the current Q-network.
            state_qn = np.expand_dims(state, axis=0)
            q_values = q_network(state_qn)
            action = get_action(q_values, epsilon)
            next_state, reward, done, _ = env.step(action)
            memory_buffer.append(experience(state, action, reward, next_state, done))
            # Learn every NUM_STEPS_FOR_UPDATE steps once the buffer holds enough samples.
            if (t % NUM_STEPS_FOR_UPDATE == 0 and len(memory_buffer) >= MINIBATCH_SIZE):
                experiences = sample_Unpack(memory_buffer)
                with tf.GradientTape() as tape:
                    loss = compute_loss(experiences, GAMMA, q_network, target_q_network)
                gradients = tape.gradient(loss, q_network.trainable_variables)
                optimizer.apply_gradients(zip(gradients, q_network.trainable_variables))
                # Soft update of the target network.
                for target_weights, q_net_weights in zip(target_q_network.weights, q_network.weights):
                    target_weights.assign(TAU * q_net_weights + (1.0 - TAU) * target_weights)
            state = next_state.copy()
            total_points += reward
            if done:   # (state[0] >= .5)
                print(f"Episode {i+1} ended at t={t+1}, final state {next_state}")
                break
        total_point_history.append(total_points)
        # Stays at -200 as long as no episode reaches the goal: every full episode scores -200, so the mean is -200.
        av_latest_points = np.mean(total_point_history[-num_p_av:])
        # Epsilon-greedy schedule: decay epsilon each episode down to E_MIN.
        epsilon = max(E_MIN, E_DECAY * epsilon)
        if (i+1) % 20 == 0:
            print(f"Episode {i+1} finished, {total_point_history[-num_p_av:]}, avg {av_latest_points}\n")
    return (q_network, total_point_history)
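Finally, this is roughly how I run it and look at the learning curve (a sketch; matplotlib is only used for the plot):

# Hypothetical driver: train for 1000 episodes and plot the per-episode return.
import matplotlib.pyplot as plt

t0 = time.time()
trained_q_network, total_point_history = getModel(num_episodes=1000)
print(f"Total run time: {(time.time() - t0) / 60:.2f} min")

plt.plot(total_point_history)
plt.xlabel("Episode")
plt.ylabel("Total reward per episode")
plt.show()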