// Benchmark Suite for DeepZig V3 Implementation
// Tests performance of core operations across different backends

const std = @import("std");
const deepseek_core = @import("deepseek_core");
const cpu_backend = @import("cpu_backend");
const print = std.debug.print;

const BenchmarkResult = struct {
    name: []const u8,
    iterations: u32,
    total_time_ns: u64,
    avg_time_ns: u64,
    ops_per_second: f64,
    memory_used_mb: f64,

    pub fn format(
        self: BenchmarkResult,
        comptime fmt: []const u8,
        options: std.fmt.FormatOptions,
        writer: anytype,
    ) !void {
        _ = fmt;
        _ = options;
        try writer.print(
            "{s:30} | {d:6} iter | {d:8.2} ms | {d:10.0} ops/s | {d:6.1} MB",
            .{ self.name, self.iterations, @as(f64, @floatFromInt(self.avg_time_ns)) / 1_000_000.0, self.ops_per_second, self.memory_used_mb },
        );
    }
};
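
// With the column widths in `format` above, one result row renders roughly
// like this (illustrative numbers only, not measured output):
//
//   Tensor Creation (1024x1024)    |   1000 iter |     2.50 ms |        400 ops/s |    4.0 MB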

pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    print("🚀 DeepZig V3 Performance Benchmarks\n", .{});
    print("==========================================\n\n", .{});

    // Initialize backends
    const cpu_backend_instance = try cpu_backend.init(allocator);
    defer cpu_backend_instance.deinit();

    print("Backend: CPU (SIMD optimized)\n", .{});
    print("Architecture: {s}\n", .{@tagName(@import("builtin").cpu.arch)});
    print("Thread count: {d}\n\n", .{std.Thread.getCpuCount() catch 4});

    // Run benchmarks
    var results = std.ArrayList(BenchmarkResult).init(allocator);
    defer results.deinit();

    // Tensor operations
    try results.append(try benchmarkTensorCreation(allocator));
    try results.append(try benchmarkTensorAddition(allocator));
    try results.append(try benchmarkMatrixMultiplication(allocator));

    // Activation functions
    try results.append(try benchmarkSwiGLU(allocator));
    try results.append(try benchmarkRMSNorm(allocator));

    // Memory operations
    try results.append(try benchmarkMemoryBandwidth(allocator));

    // Print results
    print("Benchmark Results:\n", .{});
    print("------------------\n", .{});
    print("Operation                      | Iterations | Avg Time  | Operations/s | Memory\n", .{});
    print("-------------------------------|------------|-----------|--------------|-------\n", .{});

    for (results.items) |result| {
        print("{}\n", .{result});
    }

    print("\n🎯 Benchmark completed!\n", .{});
}

/// Benchmark tensor creation and memory allocation
fn benchmarkTensorCreation(allocator: std.mem.Allocator) !BenchmarkResult {
    const iterations = 1000;
    const shape = deepseek_core.Tensor.Shape.init(&[_]u32{ 1024, 1024 });

    const start_time = std.time.nanoTimestamp();

    for (0..iterations) |_| {
        var tensor = try deepseek_core.Tensor.zeros(allocator, shape, .f32);
        tensor.deinit();
    }

    const end_time = std.time.nanoTimestamp();
    const total_time = @as(u64, @intCast(end_time - start_time));
    const avg_time = total_time / iterations;

    return BenchmarkResult{
        .name = "Tensor Creation (1024x1024)",
        .iterations = iterations,
        .total_time_ns = total_time,
        .avg_time_ns = avg_time,
        .ops_per_second = @as(f64, @floatFromInt(iterations)) / (@as(f64, @floatFromInt(total_time)) / 1_000_000_000.0),
        .memory_used_mb = (1024.0 * 1024.0 * 4.0) / (1024.0 * 1024.0), // one 1024x1024 f32 tensor = 4 MiB
    };
}

/// Benchmark SIMD-optimized tensor addition
fn benchmarkTensorAddition(allocator: std.mem.Allocator) !BenchmarkResult {
    const iterations = 100;
    const shape = deepseek_core.Tensor.Shape.init(&[_]u32{ 4096, 1024 });

    var a = try deepseek_core.Tensor.ones(allocator, shape, .f32);
    defer a.deinit();

    var b = try deepseek_core.Tensor.ones(allocator, shape, .f32);
    defer b.deinit();

    var result = try deepseek_core.Tensor.zeros(allocator, shape, .f32);
    defer result.deinit();

    const start_time = std.time.nanoTimestamp();

    for (0..iterations) |_| {
        try a.add(&b, &result);
    }

    const end_time = std.time.nanoTimestamp();
    const total_time = @as(u64, @intCast(end_time - start_time));
    const avg_time = total_time / iterations;

    const elements_per_iter = shape.numel();
    const total_elements = elements_per_iter * iterations;
    const ops_per_second = @as(f64, @floatFromInt(total_elements)) / (@as(f64, @floatFromInt(total_time)) / 1_000_000_000.0);
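
    // Worked example: shape.numel() = 4096 * 1024 = 4,194,304 elements per pass,
    // so an average pass time of 1 ms would correspond to roughly 4.2e9
    // element-adds per second (illustrative numbers, not measured results).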

    return BenchmarkResult{
        .name = "Tensor Addition (SIMD)",
        .iterations = iterations,
        .total_time_ns = total_time,
        .avg_time_ns = avg_time,
        .ops_per_second = ops_per_second,
        .memory_used_mb = (4096.0 * 1024.0 * 4.0 * 3.0) / (1024.0 * 1024.0), // 3 tensors
    };
}

/// Benchmark matrix multiplication performance
fn benchmarkMatrixMultiplication(allocator: std.mem.Allocator) !BenchmarkResult {
    const iterations = 10;
    const m = 1024;
    const k = 1024;
    const n = 1024;

    const a_shape = deepseek_core.Tensor.Shape.init(&[_]u32{ m, k });
    const b_shape = deepseek_core.Tensor.Shape.init(&[_]u32{ k, n });
    const c_shape = deepseek_core.Tensor.Shape.init(&[_]u32{ m, n });

    var a = try deepseek_core.Tensor.ones(allocator, a_shape, .f32);
    defer a.deinit();

    var b = try deepseek_core.Tensor.ones(allocator, b_shape, .f32);
    defer b.deinit();

    var c = try deepseek_core.Tensor.zeros(allocator, c_shape, .f32);
    defer c.deinit();

    const start_time = std.time.nanoTimestamp();

    for (0..iterations) |_| {
        try a.matmul(&b, &c);
    }

    const end_time = std.time.nanoTimestamp();
    const total_time = @as(u64, @intCast(end_time - start_time));
    const avg_time = total_time / iterations;

    // FLOPS calculation: 2 * M * N * K operations per matrix multiplication
    const flops_per_iter = 2 * m * n * k;
    const total_flops = flops_per_iter * iterations;
    const gflops_per_second = (@as(f64, @floatFromInt(total_flops)) / (@as(f64, @floatFromInt(total_time)) / 1_000_000_000.0)) / 1_000_000_000.0;
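
    // Worked example: 2 * 1024^3 ≈ 2.15e9 FLOPs per matmul (one multiply and
    // one add for each of the M*N*K partial products), so 10 iterations
    // finishing in 1 second would be ≈ 21.5 GFLOPS (illustrative numbers only).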

    return BenchmarkResult{
        .name = "Matrix Multiplication",
        .iterations = iterations,
        .total_time_ns = total_time,
        .avg_time_ns = avg_time,
        .ops_per_second = gflops_per_second, // reported in GFLOPS, not ops/s
        .memory_used_mb = (@as(f64, @floatFromInt(m + k + n)) * 1024.0 * 4.0) / (1024.0 * 1024.0), // A, B, and C at 4 MiB each
    };
}

/// Benchmark SwiGLU activation function
fn benchmarkSwiGLU(allocator: std.mem.Allocator) !BenchmarkResult {
    const iterations = 1000;
    const size = 1024 * 1024; // 1M elements

    const input = try allocator.alloc(f32, size);
    defer allocator.free(input);

    const gate = try allocator.alloc(f32, size);
    defer allocator.free(gate);

    const output = try allocator.alloc(f32, size);
    defer allocator.free(output);

    // Fill with constant test data
    for (input, gate) |*i, *g| {
        i.* = 0.5;
        g.* = 0.3;
    }

    const start_time = std.time.nanoTimestamp();

    for (0..iterations) |_| {
        // SwiGLU: input * swish(gate)
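        // where swish(x) = x * sigmoid(x) = x / (1 + exp(-x)). The scalar loop
        // below is a straightforward reference implementation of that formula.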
        for (0..size) |i| {
            const g = gate[i];
            const swish_g = g / (1.0 + @exp(-g));
            output[i] = input[i] * swish_g;
        }
    }

    const end_time = std.time.nanoTimestamp();
    const total_time = @as(u64, @intCast(end_time - start_time));
    const avg_time = total_time / iterations;

    const total_elements = size * iterations;
    const ops_per_second = @as(f64, @floatFromInt(total_elements)) / (@as(f64, @floatFromInt(total_time)) / 1_000_000_000.0);

    return BenchmarkResult{
        .name = "SwiGLU Activation",
        .iterations = iterations,
        .total_time_ns = total_time,
        .avg_time_ns = avg_time,
        .ops_per_second = ops_per_second,
        .memory_used_mb = (@as(f64, @floatFromInt(size)) * 3.0 * 4.0) / (1024.0 * 1024.0), // input, gate, output
    };
}

/// Benchmark RMS normalization
fn benchmarkRMSNorm(allocator: std.mem.Allocator) !BenchmarkResult {
    const iterations = 1000;
    const size = 4096; // Typical hidden dimension

    const input = try allocator.alloc(f32, size);
    defer allocator.free(input);

    const weight = try allocator.alloc(f32, size);
    defer allocator.free(weight);

    const output = try allocator.alloc(f32, size);
    defer allocator.free(output);

    // Initialize data
    for (input, weight) |*i, *w| {
        i.* = 0.1;
        w.* = 1.0;
    }

    const start_time = std.time.nanoTimestamp();
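
    // Assuming rmsNormVec follows the standard RMSNorm formulation:
    //   output[i] = weight[i] * input[i] / sqrt(mean(input.^2) + eps)
    // with eps = 1e-6 passed as the last argument.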
    for (0..iterations) |_| {
        deepseek_core.math.rms_norm.rmsNormVec(input, weight, output, 1e-6);
    }

    const end_time = std.time.nanoTimestamp();
    const total_time = @as(u64, @intCast(end_time - start_time));
    const avg_time = total_time / iterations;

    const ops_per_second = @as(f64, @floatFromInt(iterations)) / (@as(f64, @floatFromInt(total_time)) / 1_000_000_000.0);

    return BenchmarkResult{
        .name = "RMS Normalization (SIMD)",
        .iterations = iterations,
        .total_time_ns = total_time,
        .avg_time_ns = avg_time,
        .ops_per_second = ops_per_second,
        .memory_used_mb = (@as(f64, @floatFromInt(size)) * 3.0 * 4.0) / (1024.0 * 1024.0), // input, weight, output
    };
}

/// Benchmark memory bandwidth
fn benchmarkMemoryBandwidth(allocator: std.mem.Allocator) !BenchmarkResult {
    const iterations = 100;
    const size = 64 * 1024 * 1024; // 64MB

    const source = try allocator.alloc(u8, size);
    defer allocator.free(source);

    const dest = try allocator.alloc(u8, size);
    defer allocator.free(dest);

    // Fill source with data
    @memset(source, 0x42);

    const start_time = std.time.nanoTimestamp();

    for (0..iterations) |_| {
        @memcpy(dest, source);
    }

    const end_time = std.time.nanoTimestamp();
    const total_time = @as(u64, @intCast(end_time - start_time));
    const avg_time = total_time / iterations;

    const total_bytes = size * iterations;
    const gb_per_second = (@as(f64, @floatFromInt(total_bytes)) / (@as(f64, @floatFromInt(total_time)) / 1_000_000_000.0)) / (1024.0 * 1024.0 * 1024.0);
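
    // Note: this counts bytes copied, not bus traffic. Each @memcpy reads `size`
    // bytes and writes `size` bytes, so actual memory traffic is roughly double
    // the figure reported here.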

    return BenchmarkResult{
        .name = "Memory Bandwidth",
        .iterations = iterations,
        .total_time_ns = total_time,
        .avg_time_ns = avg_time,
        .ops_per_second = gb_per_second, // reported in GB/s, not ops/s
        .memory_used_mb = (@as(f64, @floatFromInt(size)) * 2.0) / (1024.0 * 1024.0), // source + dest
    };
}