Calculates the scaled dot-product attention

2024-10-07 09:51:29 +08:00
parent 4500b2719e
commit 9af61e2ade
1 changed file with 13 additions and 8 deletions
@@ -1,5 +1,6 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, embed_size, num_heads):
@@ -19,16 +20,21 @@ class MultiHeadAttentionLayer(nn.Module):
        N = query.shape[0]  # batch_size
        # Linear transformations for Q, K, V
-        Q = self.q_linear(query)  # shape: (N, seq_len, embed_size)
+        Q = self.q_linear(query)
-        K = self.k_linear(keys)   # shape: (N, seq_len, embed_size)
+        K = self.k_linear(keys)
-        V = self.v_linear(values) # shape: (N, seq_len, embed_size)
+        V = self.v_linear(values)
-        # Reshape Q, K, V into multiple heads
+        # Reshape into multiple heads
        Q = Q.reshape(N, -1, self.num_heads, self.head_dim)
        K = K.reshape(N, -1, self.num_heads, self.head_dim)
        V = V.reshape(N, -1, self.num_heads, self.head_dim)
-        return Q, K, V
+        # Compute scaled dot-product attention scores
        attention_scores = torch.einsum("nqhd,nkhd->nhqk", [Q, K])
        attention_scores = attention_scores / (self.head_dim ** 0.5)
        attention = torch.softmax(attention_scores, dim=-1)  # Normalize
        return attention
 if __name__ == "__main__":
@@ -36,10 +42,9 @@ if __name__ == "__main__":
    num_heads = 8
    mha_layer = MultiHeadAttentionLayer(embed_size, num_heads)
    # Dummy data
    values = torch.randn(2, 10, embed_size)
    keys = torch.randn(2, 10, embed_size)
    query = torch.randn(2, 10, embed_size)
-    Q, K, V = mha_layer(values, keys, query)
+    attention = mha_layer(values, keys, query)
-    print(f"Q shape: {Q.shape}, K shape: {K.shape}, V shape: {V.shape}")
+    print(f"Attention shape: {attention.shape}")