Ex 7 decoder wrong output

decoder cell:

AssertionError

      1 # UNIT TEST
----> 2 Decoder_test(Decoder, create_look_ahead_mask, create_padding_mask)


    assert tf.is_tensor(outd), "Wrong type for outd. It must be a dict"
    assert np.allclose(tf.shape(outd), tf.shape(encoderq_output)), f"Wrong shape. We expected { tf.shape(encoderq_output)}"
--> 223     assert np.allclose(outd[1, 1], [-0.2715261, -0.5606001, -0.861783, 1.69390933]), "Wrong values in outd"
    keys = list(att_weights.keys())

AssertionError: Wrong values in outd

Please help. I have checked the Google TensorFlow tutorial and all the Discourse forum threads about this. I can see no mistake in my code, but there must be one.


Make sure you are using “Q1” instead of “mult_attn_out1” in layernorm2().

I have this (but in UNQ_C6, not UNQ_C7):

        # apply layer normalization (layernorm2) to the sum of the attention output and the output of the first block (~1 line)
        mult_attn_out2 = self.layernorm2(mult_attn_out2+Q1)  # (batch_size, target_seq_len, fully_connected_dim)

My problem seems to be somewhere here:

    def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):
        Forward  pass for the Decoder
            x -- Tensor of shape (batch_size, target_seq_len, fully_connected_dim)
            enc_output --  Tensor of shape(batch_size, input_seq_len, fully_connected_dim)
            training -- Boolean, set to true to activate
                        the training mode for dropout layers
            look_ahead_mask -- Boolean mask for the target_input
            padding_mask -- Boolean mask for the second multihead attention layer
            x -- Tensor of shape (batch_size, target_seq_len, fully_connected_dim)
            attention_weights - Dictionary of tensors containing all the attention weights
                                each of shape Tensor of shape (batch_size, num_heads, target_seq_len, input_seq_len)

        seq_len = tf.shape(x)[1]
        attention_weights = {}
        # create word embeddings 
        x = self.embedding(x) # (batch_size, target_seq_len, fully_connected_dim)
        # scale embeddings by multiplying by the square root of their dimension
        x *= tf.math.sqrt(tf.cast(self.embedding_dim, tf.float32))
        #x = x*tf.math.sqrt(tf.cast(self.embedding_dim, tf.float32))
        # calculate positional encodings and add to word embedding
        x += self.pos_encoding[:, :seq_len, :] 

        # apply a dropout layer to x
        x =self.dropout(x,training=training)

        # use a for loop to pass x through a stack of decoder layers and update attention_weights (~4 lines total)
        for i in range(self.num_layers):
            # pass x and the encoder output through a stack of decoder layers and save the attention weights
            # of block 1 and 2 (~1 line)
            x, block1, block2 = self.dec_layers[i](x,enc_output, training,look_ahead_mask, padding_mask)

            #update attention_weights dictionary with the attention weights of block 1 and block 2
            attention_weights['decoder_layer{}_block1_self_att'.format(i+1)] = block1
            attention_weights['decoder_layer{}_block2_decenc_att'.format(i+1)] = block2
        # END CODE HERE
        # x.shape == (batch_size, target_seq_len, fully_connected_dim)
        return x, attention_weights

Your code for the Decoder() call method seems to be OK.

Please post your code for DecoderLayer().

Also, avoid using the TensorFlow tutorials, as this exercise doesn’t seem to follow them exactly.

class DecoderLayer(tf.keras.layers.Layer):
    The decoder layer is composed by two multi-head attention blocks, 
    one that takes the new input and uses self-attention, and the other 
    one that combines it with the output of the encoder, followed by a
    fully connected block. 
    def __init__(self, embedding_dim, num_heads, fully_connected_dim, dropout_rate=0.1, layernorm_eps=1e-6):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(num_heads=num_heads,

        self.mha2 = MultiHeadAttention(num_heads=num_heads,

        self.ffn = FullyConnected(embedding_dim=embedding_dim,

        self.layernorm1 = LayerNormalization(epsilon=layernorm_eps)
        self.layernorm2 = LayerNormalization(epsilon=layernorm_eps)
        self.layernorm3 = LayerNormalization(epsilon=layernorm_eps)

        self.dropout_ffn = Dropout(dropout_rate)
    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        Forward pass for the Decoder Layer
            x -- Tensor of shape (batch_size, target_seq_len, fully_connected_dim)
            enc_output --  Tensor of shape(batch_size, input_seq_len, fully_connected_dim)
            training -- Boolean, set to true to activate
                        the training mode for dropout layers
            look_ahead_mask -- Boolean mask for the target_input
            padding_mask -- Boolean mask for the second multihead attention layer
            out3 -- Tensor of shape (batch_size, target_seq_len, fully_connected_dim)
            attn_weights_block1 -- Tensor of shape(batch_size, num_heads, target_seq_len, input_seq_len)
            attn_weights_block2 -- Tensor of shape(batch_size, num_heads, target_seq_len, input_seq_len)
        # enc_output.shape == (batch_size, input_seq_len, fully_connected_dim)
        # BLOCK 1
        # calculate self-attention and return attention scores as attn_weights_block1.
        # Dropout will be applied during training (~1 line).
        mult_attn_out1, attn_weights_block1 = self.mha1(x,x,x, look_ahead_mask, training=True, return_attention_scores=True)  # (batch_size, target_seq_len, d_model)
        # apply layer normalization (layernorm1) to the sum of the attention output and the input (~1 line)
        Q1 = self.layernorm1(mult_attn_out1+x)

        # BLOCK 2
        # calculate self-attention using the Q from the first block and K and V from the encoder output. 
        # Dropout will be applied during training
        # Return attention scores as attn_weights_block2 (~1 line) 
        mult_attn_out2, attn_weights_block2 = self.mha2(Q1,enc_output,enc_output, padding_mask, training=True, return_attention_scores=True)  # (batch_size, target_seq_len, d_model)
        # apply layer normalization (layernorm2) to the sum of the attention output and the output of the first block (~1 line)
        mult_attn_out2 = self.layernorm2(mult_attn_out2+Q1)  # (batch_size, target_seq_len, fully_connected_dim)
        #mult_attn_out2 = self.layernorm2(Q1)        
        #BLOCK 3
        # pass the output of the second block through a ffn
        ffn_output = self.ffn(mult_attn_out2)  # (batch_size, target_seq_len, fully_connected_dim)
        # apply a dropout layer to the ffn output
        ffn_output = self.dropout_ffn(ffn_output,training=training)
        # apply layer normalization (layernorm3) to the sum of the ffn output and the output of the second block
        out3 = self.layernorm3(ffn_output + mult_attn_out2)  # (batch_size, target_seq_len, fully_connected_dim)
        # END CODE HERE

        return out3, attn_weights_block1, attn_weights_block2

Why are you including “training = True” here,
in the calls that produce mult_attn_out1, attn_weights_block1 and mult_attn_out2, attn_weights_block2?
Remove “training = True” and try again. I hope it will work.

I excluded it like this:

        # BLOCK 1
        # calculate self-attention and return attention scores as attn_weights_block1.
        # Dropout will be applied during training (~1 line).
        mult_attn_out1, attn_weights_block1 = self.mha1(x,x,x, look_ahead_mask, training, return_attention_scores=True)  # (batch_size, target_seq_len, d_model)
        # apply layer normalization (layernorm1) to the sum of the attention output and the input (~1 line)
        Q1 = self.layernorm1(mult_attn_out1+x)

        # BLOCK 2
        # calculate self-attention using the Q from the first block and K and V from the encoder output. 
        # Dropout will be applied during training
        # Return attention scores as attn_weights_block2 (~1 line) 
        mult_attn_out2, attn_weights_block2 = self.mha2(Q1,enc_output,enc_output, padding_mask, training, return_attention_scores=True)  # (batch_size, target_seq_len, d_model)

But then I get this error:

TypeError                                 Traceback (most recent call last)

      1 # UNIT TEST
----> 2 DecoderLayer_test(DecoderLayer, create_look_ahead_mask)


    padding_mask = None
--> 169     out, attn_w_b1, attn_w_b2 = decoderLayerq(q, encoderq_output, True, look_ahead_mask, padding_mask)
    assert tf.is_tensor(attn_w_b1), "Wrong type for attn_w_b1. Output must be a tensor"


   with autocast_variable.enable_auto_cast_variables(
   self._compute_dtype_object):
-> 1012           outputs = call_fn(inputs, *args, **kwargs)
   if self._activity_regularizer:

<ipython-input-27-d90348f4e83f> in call(self, x, enc_output, training, look_ahead_mask, padding_mask)
     51         # calculate self-attention and return attention scores as attn_weights_block1.
     52         # Dropout will be applied during training (~1 line).
---> 53         mult_attn_out1, attn_weights_block1 = self.mha1(x,x,x, look_ahead_mask, training, return_attention_scores=True)  # (batch_size, target_seq_len, d_model)
     55         # apply layer normalization (layernorm1) to the sum of the attention output and the input (~1 line)


   with autocast_variable.enable_auto_cast_variables(
   self._compute_dtype_object):
-> 1012           outputs = call_fn(inputs, *args, **kwargs)
   if self._activity_regularizer:

TypeError: call() got multiple values for argument 'return_attention_scores'

EDIT: I had only removed the “=True”, not the whole “training=True” argument. Removing “training=True” entirely works. Now it works perfectly fine! Thanks.

I thought the training argument was needed in the decoder.

Thanks again.


Thank you so much! It takes ages to do this assignment, and you helped a lot. Just a single input can take so much time to spot.