# Mirror of https://github.com/deepseek-ai/DeepSeek-V3.git
# Synced 2025-02-23 06:08:58 -05:00 (43 lines, 1.5 KiB, Python)
import torch
import torch._dynamo

# Solution 1: Suppress errors (quick fix but not recommended for production).
# This is a process-wide switch set at import time. Per the PyTorch
# troubleshooting docs, it makes TorchDynamo fall back to eager execution when
# compilation fails instead of raising, so it hides the underlying problem
# rather than fixing it.
torch._dynamo.config.suppress_errors = True
# Solution 2: Example of a more robust way to handle MoE with dynamic shapes
class RobustMoE(torch.nn.Module):
    """Dense mixture-of-experts layer with no data-dependent control flow.

    Every expert is applied to every input and the results are combined with
    softmax routing weights. No Python branch depends on tensor values: the
    only loop runs over the fixed list of experts.

    Attributes:
        num_experts: Number of expert sub-networks.
        d_model: Size of the last dimension of the input and output.
        experts: ``ModuleList`` of ``num_experts`` ``Linear(d_model, d_model)``
            layers.
        router: ``Linear(d_model, num_experts)`` producing routing logits.
    """

    def __init__(self, num_experts: int, d_model: int) -> None:
        """Build the experts and the router.

        Args:
            num_experts: Number of experts; must be at least 1.
            d_model: Feature size consumed and produced by each expert; must
                be at least 1.

        Raises:
            ValueError: If ``num_experts`` or ``d_model`` is smaller than 1.
        """
        # Fail early: with zero experts the forward pass would silently
        # return an all-zero tensor.
        if num_experts < 1:
            raise ValueError(f"num_experts must be >= 1, got {num_experts}")
        if d_model < 1:
            raise ValueError(f"d_model must be >= 1, got {d_model}")

        super().__init__()
        self.num_experts = num_experts
        self.d_model = d_model
        self.experts = torch.nn.ModuleList(
            [torch.nn.Linear(d_model, d_model) for _ in range(num_experts)]
        )
        self.router = torch.nn.Linear(d_model, num_experts)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the routing-weighted sum of all expert outputs.

        Args:
            x: Tensor whose last dimension is ``d_model`` (required by the
                ``Linear`` layers); any leading dimensions are allowed.

        Returns:
            A tensor with the same shape as ``x``.
        """
        # Routing weights over the expert axis; they sum to 1 per input row.
        route_weights = torch.softmax(self.router(x), dim=-1)

        # Instead of branching on how many inputs each expert receives, run
        # every expert on all inputs and scale by its routing weight.
        outputs = torch.zeros_like(x)
        for i, expert in enumerate(self.experts):
            # Slicing with i:i+1 keeps the last axis so the weight broadcasts
            # across the feature dimension of the expert output.
            outputs += route_weights[..., i:i + 1] * expert(x)

        return outputs
"""
Usage example:
    model = RobustMoE(num_experts=4, d_model=256)
    x = torch.randn(32, 256)  # batch_size=32, d_model=256
    output = model(x)

This implementation avoids the GuardOnDataDependentSymNode error by:
1. Not using data-dependent control flow (if statements based on counts).
2. Using masked-style operations instead: every expert runs on every input
   and its output is scaled by that expert's routing weight.
3. If needed, you can still enable error suppression with:
       torch._dynamo.config.suppress_errors = True
"""