Deep Q network

Hello, I have tried to use a deep Q-network on the Gym Mountain Car environment, following the same ideas as the course, but training is very slow and mostly does not converge even after 1000 episodes. What is the error?

# Hyperparameters for the DQN training loop.
MEMORY_SIZE = 100_000   # capacity of the replay buffer (deque maxlen)
GAMMA = 0.995           # discount factor used in the Bellman target
max_num_timesteps = 200 # intended per-episode step cap (NOTE(review): the training loop below uses 400, not this)
num_p_av = 20           # number of most recent episodes averaged for the progress report
E_DECAY = 0.995         # multiplicative epsilon decay applied once per episode
E_MIN = 0.01            # floor for epsilon (always keep some exploration)
TAU = 1e-3  #---> Soft update parameter.
def build_model(state_size, num_actions):
    """Return an MLP Q-network: three 64-unit ReLU hidden layers and a
    linear output head with one unit per action (Q-values, so no sigmoid).

    Args:
        state_size: dimensionality of the observation vector.
            NOTE(review): currently unused — the model infers its input
            shape on first call; consider an explicit Input(shape=(state_size,)).
        num_actions: number of discrete actions (output units).

    Returns:
        An uncompiled keras Sequential model.
    """
    # BUG FIX: the layer list and the Sequential(...) call were never
    # closed (missing "])"), which is a SyntaxError as pasted.
    model = Sequential([
        Dense(units=64, activation='relu'),
        Dense(units=64, activation='relu'),
        Dense(units=64, activation='relu'),
        Dense(units=num_actions, activation='linear'),
    ])
    return model

# Online network (trained every update) and target network (moved slowly
# toward the online net via TAU soft updates in the training loop).
# NOTE(review): the two networks start from *different* random weights and
# are never hard-synced; the usual DQN recipe copies q_network's weights
# into target_q_network once at startup — confirm this is intentional.
q_network = build_model(state_size, num_actions)
target_q_network = build_model(state_size, num_actions)
# NOTE(review): lrn_rate is not defined anywhere in this snippet — it must
# be set before this line runs.
optimizer = Adam(learning_rate=lrn_rate)
def compute_loss(experiences, gamma, q_network, target_q_network):
    """Return the MSE between predicted Q-values and Bellman targets.

    Args:
        experiences: tuple of (states, actions, rewards, next_states, done_vals)
            tensors, as produced by sample_Unpack.
        gamma: discount factor.
        q_network: online network, predicts Q(s, a).
        target_q_network: target network, used for the bootstrap term.

    Returns:
        Scalar MSE loss between Q(s, a) and the Bellman targets.
    """
    states, actions, rewards, next_states, done_vals = experiences

    # Bellman target: r + gamma * max_a' Q_target(s', a'); the bootstrap
    # term is zeroed at terminal transitions via (1 - done). The output
    # head is linear, so reduce_max reads raw Q-values.
    best_next_q = tf.reduce_max(target_q_network(next_states), axis=-1)
    y_targets = rewards + gamma * best_next_q * (1 - done_vals)

    # Gather Q(s, a) for the action actually taken in each transition.
    all_q = q_network(states)
    batch_indices = tf.range(all_q.shape[0])
    taken = tf.stack([batch_indices, tf.cast(actions, tf.int32)], axis=1)
    q_values = tf.gather_nd(all_q, taken)

    return MSE(q_values, y_targets)
def sample_Unpack(memory_buffer):
    """Sample a MINIBATCH_SIZE batch from the replay buffer and unpack it
    into float32 tensors.

    BUG FIX: as pasted, every tf.convert_to_tensor(...) call was missing
    its closing parenthesis (SyntaxError), and done_vals had lost its
    dtype argument — it must be float32 so (1 - done_vals) works in the
    Bellman target.

    Args:
        memory_buffer: deque of `experience` namedtuples
            (state, action, reward, next_state, done).

    Returns:
        Tuple (states, actions, rewards, next_states, done_vals) of
        float32 tensors, each with MINIBATCH_SIZE rows.
    """
    experiences = random.sample(memory_buffer, k=MINIBATCH_SIZE)
    states = tf.convert_to_tensor(
        np.array([e.state for e in experiences if e is not None]), dtype=tf.float32
    )
    actions = tf.convert_to_tensor(
        np.array([e.action for e in experiences if e is not None]), dtype=tf.float32
    )
    rewards = tf.convert_to_tensor(
        np.array([e.reward for e in experiences if e is not None]), dtype=tf.float32
    )
    next_states = tf.convert_to_tensor(
        np.array([e.next_state for e in experiences if e is not None]), dtype=tf.float32
    )
    done_vals = tf.convert_to_tensor(
        np.array([e.done for e in experiences if e is not None]).astype(np.uint8),
        dtype=tf.float32
    )
    return (states, actions, rewards, next_states, done_vals)

def get_action(q_values, epsilon=0.0):
    """Epsilon-greedy action selection.

    Args:
        q_values: Q-value tensor of shape (1, num_actions); anything with a
            .numpy() method works (assumes batch of 1 — TODO confirm).
        epsilon: probability of taking a uniformly random action.

    Returns:
        An action index.
    """
    if random.random() > epsilon:
        # Exploit: greedy action w.r.t. the current Q estimates.
        return np.argmax(q_values.numpy()[0])
    # Explore: uniform random action.
    # BUG FIX: this line was unreachable in the original — it sat directly
    # after the greedy `return` inside the same branch, so the agent NEVER
    # explored. That alone can keep Mountain Car from ever converging.
    return random.choice(np.arange(num_actions))
start = time.time()  # NOTE(review): unused — getModel() sets its own local `start`; presumably leftover
def getModel(num_episodes):
    """Train the global q_network with DQN + experience replay.

    Args:
        num_episodes: number of episodes to run.

    Returns:
        Tuple (q_network, total_point_history) — the trained online
        network and the per-episode total reward history.

    BUG FIXES vs. the original:
      * total_point_history was NEVER appended to, so the running average
        was np.mean of an empty slice (nan) and progress could not be read.
      * episodes never `break`-ed on `done`, so every episode ran the full
        step budget even after reaching the goal.
      * the inner loop hard-coded range(400) instead of the declared
        max_num_timesteps hyperparameter.
    """
    epsilon = 1.0
    start = time.time()
    total_point_history = []
    memory_buffer = deque(maxlen=MEMORY_SIZE)
    for i in range(num_episodes):
        state = env.reset()
        total_points = 0
        # BUG FIX: was range(400); use the declared hyperparameter.
        for t in range(max_num_timesteps):
            # Epsilon-greedy action from the current online network.
            state_qn = np.expand_dims(state, axis=0)
            q_values = q_network(state_qn)
            action = get_action(q_values, epsilon)
            next_state, reward, done, _ = env.step(action)
            memory_buffer.append(experience(state, action, reward, next_state, done))

            # Learn every NUM_STEPS_FOR_UPDATE steps once the buffer holds
            # at least one minibatch.
            if (t % NUM_STEPS_FOR_UPDATE == 0 and len(memory_buffer) >= MINIBATCH_SIZE):
                experiences = sample_Unpack(memory_buffer)
                with tf.GradientTape() as tape:
                    loss = compute_loss(experiences, GAMMA, q_network, target_q_network)
                gradients = tape.gradient(loss, q_network.trainable_variables)
                optimizer.apply_gradients(zip(gradients, q_network.trainable_variables))
                # Soft update: move the target network a TAU-sized step
                # toward the online network.
                for target_weights, q_net_weights in zip(target_q_network.weights, q_network.weights):
                    target_weights.assign(TAU * q_net_weights + (1.0 - TAU) * target_weights)

            state = next_state.copy()
            total_points += reward

            if done:
                print(f"at t+1={t+1},i+1={i+1} reached to {next_state}")
                # BUG FIX: end the episode at the terminal state.
                break

        # BUG FIX: record this episode's return so the average is real.
        total_point_history.append(total_points)
        av_latest_points = np.mean(total_point_history[-num_p_av:])
        # Epsilon-greedy decay: shrink exploration each episode, floored at E_MIN.
        epsilon = max(E_MIN, E_DECAY * epsilon)
        if (i+1) % 20 == 0:
            print(f"esposide {i+1} finished ,{total_point_history[-num_p_av:]} ,avg {av_latest_points}\n")
    return (q_network, total_point_history)

That’s a very big NN you’re trying to train. Do you have a GPU?

No, I use Google Colab. Even when I wait, I see the rewards are not getting better, or only change slightly.

When you move a project from Coursera to Colab, you’re probably going to have to change a lot of the code so it runs correctly on the tool versions used in Colab.

And unless you use a GPU, training is going to be very slow.

I have installed packages to make it work in Colab (xvfb, swig, pyvirtualdisplay, gym[box2d]) — have I missed something? Now it doesn't give me any syntax error, but it is slow, and when I wait for 2500 episodes the average doesn't get much better.

Sorry, I do not know what to suggest in this situation.

OK, thank you for trying.