Hi @Juan_Olano,
This is the original implementation:
import math

import torch.nn as nn


class MultiHeadAttentionBlock(nn.Module):
    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model  # Embedding vector size
        self.h = h  # Number of heads
        # Make sure d_model is divisible by h
        assert d_model % h == 0, "d_model is not divisible by h"
        self.d_k = d_model // h  # Dimension of vector seen by each head
        self.w_q = nn.Linear(d_model, d_model, bias=False)  # Wq
        self.w_k = nn.Linear(d_model, d_model, bias=False)  # Wk
        self.w_v = nn.Linear(d_model, d_model, bias=False)  # Wv
        self.w_o = nn.Linear(d_model, d_model, bias=False)  # Wo
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        d_k = query.shape[-1]
        # Just apply the formula from the paper
        # (batch, h, seq_len, d_k) --> (batch, h, seq_len, seq_len)
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            # Write a very low value (indicating -inf) to the positions where mask == 0
            attention_scores.masked_fill_(mask == 0, -1e9)
        attention_scores = attention_scores.softmax(dim=-1)  # (batch, h, seq_len, seq_len)
        if dropout is not None:
            attention_scores = dropout(attention_scores)
        # (batch, h, seq_len, seq_len) --> (batch, h, seq_len, d_k)
        # Also return the attention scores, which can be used for visualization
        return (attention_scores @ value), attention_scores

    def forward(self, q, k, v, mask):
        query = self.w_q(q)  # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        key = self.w_k(k)    # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        value = self.w_v(v)  # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        # (batch, seq_len, d_model) --> (batch, seq_len, h, d_k) --> (batch, h, seq_len, d_k)
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)
        # Calculate attention
        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)
        # Combine all the heads together
        # (batch, h, seq_len, d_k) --> (batch, seq_len, h, d_k) --> (batch, seq_len, d_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)
        # Multiply by Wo
        # (batch, seq_len, d_model) --> (batch, seq_len, d_model)
        return self.w_o(x)
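For reference, a quick shape check of this block (just a sketch, with assumed sizes d_model=512, h=8, batch=2, seq_len=10):

import torch

d_model, h, batch, seq_len = 512, 8, 2, 10      # assumed sizes for the check
mha = MultiHeadAttentionBlock(d_model, h, dropout=0.1)
x = torch.randn(batch, seq_len, d_model)
mask = torch.ones(batch, 1, seq_len, seq_len)   # nothing masked

out = mha(x, x, x, mask)                        # self-attention
print(out.shape)                                # torch.Size([2, 10, 512])
print(mha.attention_scores.shape)               # torch.Size([2, 8, 10, 10]): one seq_len x seq_len map per head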
OK, I understand that each head sees the whole embedding, but it is projected down to the reduced size d_k. What do you think about not reducing the size? If we reduce the size, we compress the values and their meaning into smaller vectors/matrices, which may lose some information that could be important. I also know that gradient descent will deal with this problem in one way or another, but it will not solve it 100%. What do you think about stacking several full single-head attentions instead, so that each single-head attention keeps its own perspective, as in this image?

In that case we do not reduce the size: we compute scores for each head (a full single-head attention), and after concatenation the model has a large matrix from which it can pick the most meaningful part of each head.
Like this implementation:
class MultiHeadAttentionBlock(nn.Module):
    def __init__(self, d_model: int, heads: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.heads = heads
        self.w_q = nn.Linear(self.d_model, self.heads * self.d_model)
        self.w_k = nn.Linear(self.d_model, self.heads * self.d_model)
        self.w_v = nn.Linear(self.d_model, self.heads * self.d_model)
        self.w_o = nn.Linear(self.heads * self.d_model, self.d_model)
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        d_model = query.shape[-1]
        # Just apply the formula from the paper
        # (batch, heads, seq_len, d_model) --> (batch, heads, seq_len, seq_len)
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_model)
        if mask is not None:
            # Write a very low value (indicating -inf) to the positions where mask == 0
            attention_scores.masked_fill_(mask == 0, -1e9)
        attention_scores = attention_scores.softmax(dim=-1)  # (batch, heads, seq_len, seq_len)
        if dropout is not None:
            attention_scores = dropout(attention_scores)
        # (batch, heads, seq_len, seq_len) --> (batch, heads, seq_len, d_model)
        # Also return the attention scores, which can be used for visualization
        return (attention_scores @ value), attention_scores

    def forward(self, q, k, v, mask):
        query = self.w_q(q)  # (batch, seq_len, d_model) --> (batch, seq_len, heads * d_model)
        key = self.w_k(k)    # (batch, seq_len, d_model) --> (batch, seq_len, heads * d_model)
        value = self.w_v(v)  # (batch, seq_len, d_model) --> (batch, seq_len, heads * d_model)
        # (batch, seq_len, heads * d_model) --> (batch, seq_len, heads, d_model) --> (batch, heads, seq_len, d_model)
        query = query.view(query.shape[0], query.shape[1], self.heads, self.d_model).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.heads, self.d_model).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.heads, self.d_model).transpose(1, 2)
        # Calculate attention
        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)
        # Combine all the heads together
        # (batch, heads, seq_len, d_model) --> (batch, seq_len, heads, d_model) --> (batch, seq_len, heads * d_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.heads * self.d_model)
        # Multiply by Wo
        # (batch, seq_len, heads * d_model) --> (batch, seq_len, d_model)
        return self.w_o(x)
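To make concrete how much larger the projections become when each head keeps the full d_model, here is a rough parameter count for the two versions (just a sketch, assuming d_model=512 and heads=8, counting only the four projection layers):

d_model, heads = 512, 8                                  # assumed sizes

# Standard block: four d_model x d_model projections, no bias
standard = 4 * d_model * d_model                         # 1,048,576 parameters

# Proposed block: w_q/w_k/w_v map d_model -> heads*d_model (with the default nn.Linear bias),
# and w_o maps heads*d_model -> d_model
proposed = 3 * (d_model * heads * d_model + heads * d_model) \
           + (heads * d_model * d_model + d_model)       # 8,401,408 parameters

print(proposed / standard)                               # ~8x more parameters in the attention projections

The per-head attention maps stay seq_len x seq_len, but each head's Q/K/V are now d_model wide instead of d_k, so the memory and compute of the matrix products also grow by roughly a factor of heads.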