# UNQ_C6 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
# GRADED FUNCTION DecoderLayer
class DecoderLayer(tf.keras.layers.Layer):
"""
The decoder layer is composed by two multi-head attention blocks,
one that takes the new input and uses self-attention, and the other
one that combines it with the output of the encoder, followed by a
fully connected block.
"""
    def __init__(self, embedding_dim, num_heads, fully_connected_dim, dropout_rate=0.1, layernorm_eps=1e-6):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(num_heads=num_heads,
                                       key_dim=embedding_dim,
                                       dropout=dropout_rate)

        self.mha2 = MultiHeadAttention(num_heads=num_heads,
                                       key_dim=embedding_dim,
                                       dropout=dropout_rate)

        self.ffn = FullyConnected(embedding_dim=embedding_dim,
                                  fully_connected_dim=fully_connected_dim)

        self.layernorm1 = LayerNormalization(epsilon=layernorm_eps)
        self.layernorm2 = LayerNormalization(epsilon=layernorm_eps)
        self.layernorm3 = LayerNormalization(epsilon=layernorm_eps)

        self.dropout_ffn = Dropout(dropout_rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
"""
Forward pass for the Decoder Layer
Arguments:
x -- Tensor of shape (batch_size, target_seq_len, fully_connected_dim)
enc_output -- Tensor of shape(batch_size, input_seq_len, fully_connected_dim)
training -- Boolean, set to true to activate
the training mode for dropout layers
look_ahead_mask -- Boolean mask for the target_input
padding_mask -- Boolean mask for the second multihead attention layer
Returns:
out3 -- Tensor of shape (batch_size, target_seq_len, fully_connected_dim)
attn_weights_block1 -- Tensor of shape(batch_size, num_heads, target_seq_len, input_seq_len)
attn_weights_block2 -- Tensor of shape(batch_size, num_heads, target_seq_len, input_seq_len)
"""
        # START CODE HERE
        # enc_output.shape == (batch_size, input_seq_len, embedding_dim)

        # BLOCK 1
        # calculate self-attention on the target input and return the attention
        # scores as attn_weights_block1; dropout is applied only in training mode (~1 line)
        mult_attn_out1, attn_weights_block1 = self.mha1(x, x, x,
                                                        attention_mask=look_ahead_mask,
                                                        training=training,
                                                        return_attention_scores=True)  # (batch_size, target_seq_len, embedding_dim)

        # apply layer normalization (layernorm1) to the sum of the attention output and the input (~1 line)
        Q1 = self.layernorm1(mult_attn_out1 + x)
        # BLOCK 2
        # calculate encoder-decoder (cross-)attention using Q from the first block
        # and K, V from the encoder output; dropout is applied only in training mode.
        # Return the attention scores as attn_weights_block2 (~1 line)
        mult_attn_out2, attn_weights_block2 = self.mha2(Q1, enc_output, enc_output,
                                                        attention_mask=padding_mask,
                                                        training=training,
                                                        return_attention_scores=True)  # (batch_size, target_seq_len, embedding_dim)

        # apply layer normalization (layernorm2) to the sum of the attention output and the output of the first block (~1 line)
        mult_attn_out2 = self.layernorm2(mult_attn_out2 + Q1)  # (batch_size, target_seq_len, embedding_dim)
        # BLOCK 3
        # pass the output of the second block through a feed-forward network
        ffn_output = self.ffn(mult_attn_out2)  # (batch_size, target_seq_len, embedding_dim)

        # apply a dropout layer to the ffn output
        ffn_output = self.dropout_ffn(ffn_output, training=training)

        # apply layer normalization (layernorm3) to the sum of the ffn output and the output of the second block
        out3 = self.layernorm3(ffn_output + mult_attn_out2)  # (batch_size, target_seq_len, embedding_dim)
        # END CODE HERE

        return out3, attn_weights_block1, attn_weights_block2
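
# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the graded function above): a minimal smoke
# test of DecoderLayer. It assumes, as in the earlier notebook cells, that
# tensorflow is imported as `tf`, that MultiHeadAttention, LayerNormalization
# and Dropout come from tensorflow.keras.layers, and that `FullyConnected` is
# the position-wise feed-forward helper (roughly Dense(relu) -> Dense) defined
# earlier. The hyperparameters and shapes below are illustrative only.
# ---------------------------------------------------------------------------
batch_size, target_seq_len, input_seq_len, embedding_dim = 2, 7, 9, 12

decoder_layer = DecoderLayer(embedding_dim=embedding_dim,
                             num_heads=3,
                             fully_connected_dim=32)

x = tf.random.uniform((batch_size, target_seq_len, embedding_dim))
enc_output = tf.random.uniform((batch_size, input_seq_len, embedding_dim))

# Lower-triangular look-ahead mask: 1 where attention is allowed, so target
# position i can only attend to positions <= i. Broadcasts over batch and heads.
look_ahead_mask = tf.linalg.band_part(
    tf.ones((1, target_seq_len, target_seq_len)), -1, 0)

out3, w1, w2 = decoder_layer(x, enc_output, training=False,
                             look_ahead_mask=look_ahead_mask,
                             padding_mask=None)  # no padding in this toy example

print(out3.shape)  # (2, 7, 12) -> (batch_size, target_seq_len, embedding_dim)
print(w1.shape)    # (2, 3, 7, 7) -> self-attention over the target sequence
print(w2.shape)    # (2, 3, 7, 9) -> cross-attention over the encoder output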