mirror of
https://github.com/deepseek-ai/DeepSeek-V3.git
synced 2025-04-19 01:59:01 -04:00
Masking: avoid modifying tensor in-place to improve performance
This commit is contained in:
parent
b5d872ead0
commit
f3a55f92c2
@@ -782,7 +782,7 @@ class Transformer(nn.Module):
|
||||
freqs_cis = self.freqs_cis[start_pos:start_pos+seqlen]
|
||||
mask = None
|
||||
if seqlen > 1:
|
||||
mask = torch.full((seqlen, seqlen), float("-inf"), device=tokens.device).triu_(1)
|
||||
mask = torch.triu(torch.full((seqlen, seqlen), float("-inf"), device=tokens.device), diagonal=1)
|
||||
for layer in self.layers:
|
||||
h = layer(h, start_pos, freqs_cis, mask)
|
||||
h = self.norm(h)[:, -1]
|
||||
|
Loading…
Reference in New Issue
Block a user