Implementing GQA Checkpoint Conversion from MHA

Hello!

I’m trying to implement Grouped Query Attention (GQA) for a Vision Transformer, but I cannot get the checkpoint conversion from MHA to work. The GQA paper states that the key and value tensors are mean pooled along the head axis, and, more importantly, that performance right after conversion is already decent (little to no uptraining is required to get the model performing close to the MHA equivalent).
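
To make sure I’m reading the paper right, this is the operation I understand it to describe, sketched on a standalone key tensor (all shapes below are made up purely for illustration):

import torch

# Toy sketch of what I understand "mean pool along the head axis" to mean:
# average adjacent groups of MHA key/value heads down to the GQA head count.
B, num_heads, P, head_dim = 2, 8, 197, 64
num_kv_heads = 4
group_size = num_heads // num_kv_heads

k_mha = torch.randn(B, num_heads, P, head_dim)
k_gqa = k_mha.view(B, num_kv_heads, group_size, P, head_dim).mean(dim=2)
print(k_gqa.shape)  # torch.Size([2, 4, 197, 64])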

I have tried to get this working with a Vision Transformer, but right now the GQA variant reaches at most ~50% accuracy after the first epoch on my classification dataset, while the MHA baseline reaches closer to 90%. So unless I’m misreading the paper, I must be doing something wrong.

Here is the code so far:

import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional


class GQA(nn.Module):

    def __init__(
            self,
            dim: int,
            num_heads: int = 8,
            qkv_bias: bool = False,
            attn_drop: float = 0.,
            proj_drop: float = 0.,
            num_kv_heads: Optional[int] = None,
    ) -> None:
        super().__init__()
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim ** -0.5
        self.num_kv_heads = num_kv_heads if num_kv_heads is not None else (num_heads // 2) # default: each KV head is shared by two query heads

        self.q = nn.Linear(dim, dim, bias=qkv_bias)
        self.k = nn.Linear(dim, self.num_kv_heads*self.head_dim, bias=qkv_bias)
        self.v = nn.Linear(dim, self.num_kv_heads*self.head_dim, bias=qkv_bias)

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, P, C = x.shape
        H = self.num_heads
        q = self.q(x).view(B, P, H, -1).transpose(1, 2) # (B, H, P, head_size)
        k = self.k(x).view(B, P, self.num_kv_heads, -1).transpose(1, 2) # (B, num_kv_heads, P, head_size)
        v = self.v(x).view(B, P, self.num_kv_heads, -1).transpose(1, 2) # (B, num_kv_heads, P, head_size)
        
        q = q * self.scale

        group_size = self.num_heads // self.num_kv_heads
        q_grps = torch.split(q, group_size, dim=1)
        k_grps = torch.split(k, 1, dim=1) 
        v_grps = torch.split(v, 1, dim=1)

        outputs = [None] * len(k_grps)
        for i in range(len(k_grps)):
                
            # Collect items (note q has a larger head axis)
            curr_q = q_grps[i]  # (B, num_heads//num_kv_heads, num_patches, head_size)
            curr_k = k_grps[i]  # (B, 1, num_patches, head_size)
            curr_v = v_grps[i]  # (B, 1, num_patches, head_size)
            
            scores = (curr_q @ curr_k.transpose(-2, -1))
            weights = F.softmax(scores, dim=-1) # (B, num_heads//num_kv_heads, num_patches, num_patches)
            weights = self.attn_drop(weights)
            curr_att = weights @ curr_v # (B, num_heads//num_kv_heads, num_patches, head_size)
            outputs[i] = curr_att

        x = torch.cat(outputs, dim=1) # (B, num_heads, num_patches, head_size)
        x = x.transpose(1, 2).contiguous().view(B, P, C) # (B, num_patches, emb_dim)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
    
    def att_weight_conversion(self, qkv_params, is_bias=False):
        '''
        Split and convert the QKV parameters from ViT checkpoints for the GQA implementation
        '''
        q, k, v = torch.split(qkv_params, qkv_params.shape[0] // 3, dim=0)

        group_size = self.num_heads // self.num_kv_heads

        def convert_weight(param):
            x = param.clone()

            # TODO: check whether to bring the heads axis at the front or middle
            x = x.view(self.dim, self.num_heads, self.dim//self.num_heads)
            xs = torch.split(x, group_size, dim=1) # split across head axis
            xs = [xs[i].mean(dim=1) for i in range(len(xs))]
            x = torch.cat(xs, dim=1)

            expected_shape = (self.dim, self.num_kv_heads*self.dim//self.num_heads)
            assert x.shape == expected_shape, f'Expected {expected_shape}, got {x.shape}'
            return x
        
        def convert_bias(param):
            x = param.clone()
            x = x.view(self.num_heads, self.dim//self.num_heads)
            xs = torch.split(x, group_size, dim=0) # split across head axis
            xs = [xs[i].mean(dim=0) for i in range(len(xs))]
            x = torch.cat(xs, dim=0)

            expected_shape = (self.num_kv_heads*self.dim//self.num_heads,)
            assert x.shape == expected_shape, f'Expected {expected_shape}, got {x.shape}'
            return x
        
        return {
            "q": q,
            "k": convert_weight(k) if not is_bias else convert_bias(k),
            "v": convert_weight(v) if not is_bias else convert_bias(v)
        }
    
    def load_pretrained_weights(self, state_dict, block_idx):

        # Load in parameters for the Query Key Value layers
        qkv_weight = state_dict[f'blocks.{block_idx}.attn.qkv.weight']
        qkv_bias = state_dict[f'blocks.{block_idx}.attn.qkv.bias']

        wdict = self.att_weight_conversion(qkv_weight)
        bdict = self.att_weight_conversion(qkv_bias, is_bias=True)

        self.q.weight = assign_check(self.q.weight, wdict['q'])
        self.q.bias = assign_check(self.q.bias, bdict['q'])

        self.k.weight = assign_check(self.k.weight, wdict['k'].T)
        self.k.bias = assign_check(self.k.bias, bdict['k'])
        
        self.v.weight = assign_check(self.v.weight, wdict['v'].T)
        self.v.bias = assign_check(self.v.bias, bdict['v'])

        # Load in parameters for the output projection
        self.proj.weight = assign_check(self.proj.weight, state_dict[f'blocks.{block_idx}.attn.proj.weight'])
        self.proj.bias = assign_check(self.proj.bias, state_dict[f'blocks.{block_idx}.attn.proj.bias'])
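
In case it helps, this is roughly how the conversion gets called. It is only a sketch assuming a timm ViT-Base checkpoint (the state_dict keys above match that layout); the model name and head counts are illustrative, and it relies on the assign_check helper used above, which I’ve omitted from this post.

import timm

# Load a pretrained ViT checkpoint and convert one block's attention to GQA.
vit = timm.create_model('vit_base_patch16_224', pretrained=True)
state_dict = vit.state_dict()

gqa_attn = GQA(dim=768, num_heads=12, qkv_bias=True, num_kv_heads=6)
gqa_attn.load_pretrained_weights(state_dict, block_idx=0)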

Please ignore the bulk of the forward pass unless there’s a glaring issue with it.

Hoping someone can help shed some light on what could be the issue.
Thank you!
