Hi,

I'm looking at encoder_layer and decoder_layer. Are the last dimension of enc_output and the last dimension of Q1 both d_model (the length of the embedding vector)? If so, are the dimension notes in this code section wrong? Shouldn't they all be (batch_size, input_seq_len, d_model)? Thanks!
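For context, here's a quick standalone check (my own sketch, not from the assignment) showing that tf.keras.layers.MultiHeadAttention keeps the query's last dimension, which is why I'd expect the attention output's last dimension to be d_model:

    import tensorflow as tf

    d_model = 512  # hypothetical embedding_dim, just for this check
    mha = tf.keras.layers.MultiHeadAttention(num_heads=8, key_dim=d_model)
    x = tf.random.uniform((2, 10, d_model))  # (batch_size, input_seq_len, d_model)
    attn = mha(query=x, value=x, key=x)      # self-attention
    print(attn.shape)                        # (2, 10, 512) -- last dim is still d_model

And here is the code section I'm asking about: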
class EncoderLayer(tf.keras.layers.Layer):
    """
    The encoder layer is composed of a multi-head self-attention mechanism,
    followed by a simple, positionwise fully connected feed-forward network.
    This architecture includes a residual connection around each of the two
    sub-layers, followed by layer normalization.
    """
    def __init__(self, embedding_dim, num_heads, fully_connected_dim,
                 dropout_rate=0.1, layernorm_eps=1e-6):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(num_heads=num_heads,
                                      key_dim=embedding_dim,
                                      dropout=dropout_rate)

        self.ffn = FullyConnected(embedding_dim=embedding_dim,
                                  fully_connected_dim=fully_connected_dim)

        self.layernorm1 = LayerNormalization(epsilon=layernorm_eps)
        self.layernorm2 = LayerNormalization(epsilon=layernorm_eps)

        self.dropout_ffn = Dropout(dropout_rate)
    def call(self, x, training, mask):
        """
        Forward pass for the Encoder Layer

        Arguments:
            x -- Tensor of shape (batch_size, input_seq_len, fully_connected_dim)
            training -- Boolean, set to true to activate
                        the training mode for dropout layers
            mask -- Boolean mask to ensure that the padding is not
                    treated as part of the input
        Returns:
            encoder_layer_out -- Tensor of shape (batch_size, input_seq_len, fully_connected_dim)
        """
        # START CODE HERE
        # calculate self-attention using mha(~1 line). Dropout will be applied during training
        attn_output = ...  # Self attention (batch_size, input_seq_len, fully_connected_dim)

        # apply layer normalization on sum of the input and the attention output to get the
        # output of the multi-head attention layer (~1 line)
        out1 = ...  # (batch_size, input_seq_len, fully_connected_dim)

        # pass the output of the multi-head attention layer through a ffn (~1 line)
        ffn_output = ...  # (batch_size, input_seq_len, fully_connected_dim)

        # apply dropout layer to ffn output during training (~1 line)
        ffn_output = ...

        # apply layer normalization on sum of the output from multi-head attention and ffn output to get the
        # output of the encoder layer (~1 line)
        encoder_layer_out = ...  # (batch_size, input_seq_len, fully_connected_dim)
        # END CODE HERE

        return encoder_layer_out
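For reference, this is how I currently read the body of call (my own sketch of the standard Transformer encoder-layer forward pass, not the graded solution). If this reading is right, every intermediate tensor keeps the shape (batch_size, input_seq_len, embedding_dim), i.e. d_model, and fully_connected_dim only appears inside the FFN's hidden layer:

    # Sketch only -- my assumption of the standard encoder-layer forward pass,
    # not the assignment's reference solution.
    def call(self, x, training, mask):
        # self-attention; Keras MultiHeadAttention projects back to the query's
        # last dimension, so this is (batch_size, input_seq_len, embedding_dim)
        attn_output = self.mha(query=x, value=x, key=x,
                               attention_mask=mask, training=training)

        # residual connection + layer norm -> (batch_size, input_seq_len, embedding_dim)
        out1 = self.layernorm1(x + attn_output)

        # position-wise FFN; presumably its final Dense layer maps back to
        # embedding_dim, so this is again (batch_size, input_seq_len, embedding_dim)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout_ffn(ffn_output, training=training)

        # second residual connection + layer norm -> (batch_size, input_seq_len, embedding_dim)
        encoder_layer_out = self.layernorm2(out1 + ffn_output)

        return encoder_layer_out

So, unless I'm misreading FullyConnected, the shape comments would be clearer as (batch_size, input_seq_len, embedding_dim).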