# ----------------------------------------------------------------------
# Patch MPT: runtime fixes for the rotary cos/sin cache and attention
# mask expansion, applied by monkey-patching a loaded model
# ----------------------------------------------------------------------
import torch
import torch.nn as nn
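# ----------------------------------------------------------------------
# 1. Rotary embedding with a growable cos/sin cache
# ----------------------------------------------------------------------
# NOTE: the original definition of PatchedRotaryEmbedding is not included
# in this snippet. The class below is a minimal sketch reconstructed from
# how it is used further down (constructor takes dim/max_seq_len, forward
# takes (x, seq_len) and returns (cos, sin) with seq_len rows, and the
# cache is rebuilt lazily whenever seq_len outgrows it). Treat it as an
# assumption, not the original code.
class PatchedRotaryEmbedding(nn.Module):
    def __init__(self, dim: int, max_seq_len: int = 2048, base: float = 10000.0):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.max_seq_len = max_seq_len  # soft upper bound, kept for reference
        self.max_seq_len_cached = 0     # cache is built lazily in forward()

    def _build_cache(self, seq_len: int) -> None:
        # Recompute the cos/sin tables for positions [0, seq_len)
        t = torch.arange(seq_len, dtype=self.inv_freq.dtype)
        freqs = torch.outer(t, self.inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)  # (seq_len, dim)
        self.cos_cached = emb.cos()
        self.sin_cached = emb.sin()
        self.max_seq_len_cached = seq_len

    def forward(self, x: torch.Tensor, seq_len: int):
        # The actual fix: grow the cache instead of indexing past its end
        if seq_len > self.max_seq_len_cached:
            self._build_cache(seq_len)
        return (
            self.cos_cached[:seq_len].to(x.dtype),
            self.sin_cached[:seq_len].to(x.dtype),
        )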
# ----------------------------------------------------------------------
# 2. Attention mask expansion (replacement helper)
# ----------------------------------------------------------------------
# NOTE: the def line below is assumed; it is reconstructed from the
# variables the function body uses (attention_mask, query_length, dtype).
def patch_attention_mask(attention_mask: torch.Tensor, query_length: int, dtype: torch.dtype) -> torch.Tensor:
    batch = attention_mask.size(0)
    key_length = attention_mask.size(-1)
    # Case: (batch, key_len) -> expand to (batch, 1, 1, key_len)
    if attention_mask.dim() == 2:
        mask = attention_mask[:, None, None, :]
    # Case: (batch, 1, key_len)
    elif attention_mask.dim() == 3 and attention_mask.size(1) == 1:
        mask = attention_mask[:, :, None, :]
    else:
        raise ValueError(f"Unexpected mask shape: {attention_mask.shape}")
    # Broadcast to query_len
    mask = mask.expand(batch, 1, query_length, key_length)
    # Convert to additive mask (0 = keep, -inf = mask); note the input is
    # assumed to use 1 for masked positions and 0 for attended positions
    return mask.to(dtype).masked_fill(mask == 0, 0.0).masked_fill(mask == 1, float("-inf"))
# ----------------------------------------------------------------------
# 3. Monkey-patch into existing MPT model (example)
# ----------------------------------------------------------------------
def apply_mpt_patches(model: nn.Module):
    """Replace rotary and mask functions in an existing MPT model."""
    # Patch rotary class if found
    for name, module in model.named_modules():
        if "rotary" in name.lower() and hasattr(module, "cos_cached"):
            module.__class__ = PatchedRotaryEmbedding
            print(f"[PATCH] Replaced rotary in {name}")
    # Monkey-patch attention mask expansion function if model has it
    if hasattr(model, "_expand_attention_mask"):
        model._expand_attention_mask = patch_attention_mask
        print("[PATCH] Replaced _expand_attention_mask")
# ----------------------------------------------------------------------
# Usage example
# ----------------------------------------------------------------------
if __name__ == "__main__":
    # Assume you have an MPT model loaded:
    # from transformers import AutoModel
    # model = AutoModel.from_pretrained("mosaicml/mpt-7b", trust_remote_code=True)
    # apply_mpt_patches(model)
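    # Hypothetical demo (not part of the original snippet): run the patcher
    # against a tiny stand-in model instead of a real MPT checkpoint, just
    # to show the __class__ swap and the hasattr-guarded mask patch firing.
    class _FakeRotary(nn.Module):
        def __init__(self):
            super().__init__()
            self.cos_cached = torch.zeros(1)  # marker the patcher looks for

    class _FakeMPT(nn.Module):
        _expand_attention_mask = None  # attribute the patcher replaces

        def __init__(self):
            super().__init__()
            self.rotary_emb = _FakeRotary()

    fake = _FakeMPT()
    apply_mpt_patches(fake)
    assert isinstance(fake.rotary_emb, PatchedRotaryEmbedding)
    assert fake._expand_attention_mask is patch_attention_mask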
    # Test rotary cache fix
    rotary = PatchedRotaryEmbedding(dim=64, max_seq_len=512)
    x = torch.randn(1, 10, 64)
    cos1, sin1 = rotary(x, seq_len=10)
    cos2, sin2 = rotary(x, seq_len=20)  # seq_len changes → recalc cache
    assert cos1.shape[0] == 10
    assert cos2.shape[0] == 20
    print("Rotary cache patch: OK")
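    # Added smoke test (an assumed example, not from the original patch):
    # a (batch, key_len) padding mask, using 1 to mark masked positions as
    # the converter above expects, should expand to an additive
    # (batch, 1, query_len, key_len) mask with -inf at the padded keys.
    pad = torch.zeros(2, 6, dtype=torch.long)
    pad[:, -2:] = 1  # last two key positions are padding
    additive = patch_attention_mask(pad, query_length=4, dtype=torch.float32)
    assert additive.shape == (2, 1, 4, 6)
    assert additive[0, 0, 0, 0].item() == 0.0
    assert torch.isinf(additive[0, 0, 0, -1]).item()
    print("Attention mask patch: OK")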