// DeepSeek-V3/experimental/bench/main.zig
// Benchmark Suite for DeepZig V3 Implementation
// Tests performance of core operations across different backends

const std = @import("std");
const deepseek_core = @import("deepseek_core");
const cpu_backend = @import("cpu_backend");

const print = std.debug.print;

const BenchmarkResult = struct {
    name: []const u8,
    iterations: u32,
    total_time_ns: u64,
    avg_time_ns: u64,
    ops_per_second: f64,
    memory_used_mb: f64,

    pub fn format(
        self: BenchmarkResult,
        comptime fmt: []const u8,
        options: std.fmt.FormatOptions,
        writer: anytype,
    ) !void {
        _ = fmt;
        _ = options;
        try writer.print(
            "{s:30} | {d:6} iter | {d:8.2} ms | {d:10.0} ops/s | {d:6.1} MB",
            .{ self.name, self.iterations, @as(f64, @floatFromInt(self.avg_time_ns)) / 1_000_000.0, self.ops_per_second, self.memory_used_mb },
        );
    }
};
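
// Example rendered row (illustrative numbers only, not measured results):
//   Tensor Addition (SIMD)         |    100 iter |     2.10 ms | 1998000000 ops/s |   48.0 MB
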
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    print("🚀 DeepZig V3 Performance Benchmarks\n", .{});
    print("==========================================\n\n", .{});

    // Initialize backends
    const cpu_backend_instance = try cpu_backend.init(allocator);
    defer cpu_backend_instance.deinit();

    print("Backend: CPU (SIMD optimized)\n", .{});
    print("Architecture: {s}\n", .{@tagName(@import("builtin").cpu.arch)});
    print("Thread count: {d}\n\n", .{std.Thread.getCpuCount() catch 4});

    // Run benchmarks
    var results = std.ArrayList(BenchmarkResult).init(allocator);
    defer results.deinit();

    // Tensor operations
    try results.append(try benchmarkTensorCreation(allocator));
    try results.append(try benchmarkTensorAddition(allocator));
    try results.append(try benchmarkMatrixMultiplication(allocator));

    // Activation functions
    try results.append(try benchmarkSwiGLU(allocator));
    try results.append(try benchmarkRMSNorm(allocator));

    // Memory operations
    try results.append(try benchmarkMemoryBandwidth(allocator));

    // Print results
    print("Benchmark Results:\n", .{});
    print("------------------\n", .{});
    print("Operation                      | Iterations | Avg Time  | Operations/s | Memory\n", .{});
    print("-------------------------------|------------|-----------|--------------|-------\n", .{});
    for (results.items) |result| {
        print("{}\n", .{result});
    }

    print("\n🎯 Benchmark completed!\n", .{});
}
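
// Every benchmark below follows the same timing pattern:
//
//   const start = std.time.nanoTimestamp();
//   ... repeat the operation `iterations` times ...
//   const total = @as(u64, @intCast(std.time.nanoTimestamp() - start));
//
// avg_time_ns is total / iterations, and throughput is work done divided by
// elapsed seconds (total / 1e9). nanoTimestamp() returns an i128, hence the
// @intCast to u64 after taking the difference.
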
/// Benchmark tensor creation and memory allocation
fn benchmarkTensorCreation(allocator: std.mem.Allocator) !BenchmarkResult {
    const iterations = 1000;
    const shape = deepseek_core.Tensor.Shape.init(&[_]u32{ 1024, 1024 });

    const start_time = std.time.nanoTimestamp();
    for (0..iterations) |_| {
        var tensor = try deepseek_core.Tensor.zeros(allocator, shape, .f32);
        tensor.deinit();
    }
    const end_time = std.time.nanoTimestamp();

    const total_time = @as(u64, @intCast(end_time - start_time));
    const avg_time = total_time / iterations;

    return BenchmarkResult{
        .name = "Tensor Creation (1024x1024)",
        .iterations = iterations,
        .total_time_ns = total_time,
        .avg_time_ns = avg_time,
        .ops_per_second = @as(f64, @floatFromInt(iterations)) / (@as(f64, @floatFromInt(total_time)) / 1_000_000_000.0),
        .memory_used_mb = (1024.0 * 1024.0 * 4.0) / (1024.0 * 1024.0), // 4MB tensor
    };
}
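
// Note: a 1024x1024 f32 tensor is exactly 4 MiB, so this benchmark is
// dominated by allocator throughput plus the zero-fill that `zeros`
// presumably performs; it says little about compute performance.
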
/// Benchmark SIMD-optimized tensor addition
fn benchmarkTensorAddition(allocator: std.mem.Allocator) !BenchmarkResult {
    const iterations = 100;
    const shape = deepseek_core.Tensor.Shape.init(&[_]u32{ 4096, 1024 });

    var a = try deepseek_core.Tensor.ones(allocator, shape, .f32);
    defer a.deinit();
    var b = try deepseek_core.Tensor.ones(allocator, shape, .f32);
    defer b.deinit();
    var result = try deepseek_core.Tensor.zeros(allocator, shape, .f32);
    defer result.deinit();

    const start_time = std.time.nanoTimestamp();
    for (0..iterations) |_| {
        try a.add(&b, &result);
    }
    const end_time = std.time.nanoTimestamp();

    const total_time = @as(u64, @intCast(end_time - start_time));
    const avg_time = total_time / iterations;

    const elements_per_iter = shape.numel();
    const total_elements = elements_per_iter * iterations;
    const ops_per_second = @as(f64, @floatFromInt(total_elements)) / (@as(f64, @floatFromInt(total_time)) / 1_000_000_000.0);

    return BenchmarkResult{
        .name = "Tensor Addition (SIMD)",
        .iterations = iterations,
        .total_time_ns = total_time,
        .avg_time_ns = avg_time,
        .ops_per_second = ops_per_second,
        .memory_used_mb = (4096.0 * 1024.0 * 4.0 * 3.0) / (1024.0 * 1024.0), // 3 tensors
    };
}
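
// Each added element touches 12 bytes (read a, read b, write result, all f32),
// so the effective memory bandwidth here is roughly ops_per_second * 12 bytes/s.
// Element-wise addition is memory-bound, not compute-bound, on virtually all CPUs.
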
/// Benchmark matrix multiplication performance
fn benchmarkMatrixMultiplication(allocator: std.mem.Allocator) !BenchmarkResult {
    const iterations = 10;
    const m = 1024;
    const k = 1024;
    const n = 1024;

    const a_shape = deepseek_core.Tensor.Shape.init(&[_]u32{ m, k });
    const b_shape = deepseek_core.Tensor.Shape.init(&[_]u32{ k, n });
    const c_shape = deepseek_core.Tensor.Shape.init(&[_]u32{ m, n });

    var a = try deepseek_core.Tensor.ones(allocator, a_shape, .f32);
    defer a.deinit();
    var b = try deepseek_core.Tensor.ones(allocator, b_shape, .f32);
    defer b.deinit();
    var c = try deepseek_core.Tensor.zeros(allocator, c_shape, .f32);
    defer c.deinit();

    const start_time = std.time.nanoTimestamp();
    for (0..iterations) |_| {
        try a.matmul(&b, &c);
    }
    const end_time = std.time.nanoTimestamp();

    const total_time = @as(u64, @intCast(end_time - start_time));
    const avg_time = total_time / iterations;

    // FLOPS calculation: 2 * M * N * K operations per matrix multiplication
    const flops_per_iter = 2 * m * n * k;
    const total_flops = flops_per_iter * iterations;
    const gflops_per_second = (@as(f64, @floatFromInt(total_flops)) / (@as(f64, @floatFromInt(total_time)) / 1_000_000_000.0)) / 1_000_000_000.0;
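
    // Worked example: for m = n = k = 1024, each iteration performs
    // 2 * 1024^3 = 2,147,483,648 FLOPs (~2.15 GFLOP), so an average of
    // 10 ms per matmul would report roughly 215 GFLOPS.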
    return BenchmarkResult{
        .name = "Matrix Multiplication",
        .iterations = iterations,
        .total_time_ns = total_time,
        .avg_time_ns = avg_time,
        .ops_per_second = gflops_per_second, // Actually GFLOPS
        .memory_used_mb = (@as(f64, @floatFromInt(m + k + n)) * 1024.0 * 4.0) / (1024.0 * 1024.0), // A + B + C = (m*k + k*n + m*n) * 4 bytes = 12 MB for square 1024 matrices
    };
}

/// Benchmark SwiGLU activation function
fn benchmarkSwiGLU(allocator: std.mem.Allocator) !BenchmarkResult {
    const iterations = 1000;
    const size = 1024 * 1024; // 1M elements

    const input = try allocator.alloc(f32, size);
    defer allocator.free(input);
    const gate = try allocator.alloc(f32, size);
    defer allocator.free(gate);
    const output = try allocator.alloc(f32, size);
    defer allocator.free(output);

    // Fill with constant test data (the values are arbitrary; only throughput matters)
    for (input, gate) |*i, *g| {
        i.* = 0.5;
        g.* = 0.3;
    }
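
    // SwiGLU(x, g) = x * swish(g), where swish(g) = g * sigmoid(g)
    //             = g / (1 + e^(-g)).
    // The scalar loop below is the reference formulation; a vectorized
    // variant would process @Vector(N, f32) lanes per step instead.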
    const start_time = std.time.nanoTimestamp();
    for (0..iterations) |_| {
        // SwiGLU: input * swish(gate)
        for (0..size) |i| {
            const g = gate[i];
            const swish_g = g / (1.0 + @exp(-g));
            output[i] = input[i] * swish_g;
        }
    }
    const end_time = std.time.nanoTimestamp();

    const total_time = @as(u64, @intCast(end_time - start_time));
    const avg_time = total_time / iterations;

    const total_elements = size * iterations;
    const ops_per_second = @as(f64, @floatFromInt(total_elements)) / (@as(f64, @floatFromInt(total_time)) / 1_000_000_000.0);

    return BenchmarkResult{
        .name = "SwiGLU Activation",
        .iterations = iterations,
        .total_time_ns = total_time,
        .avg_time_ns = avg_time,
        .ops_per_second = ops_per_second,
        .memory_used_mb = (@as(f64, @floatFromInt(size)) * 3.0 * 4.0) / (1024.0 * 1024.0),
    };
}

/// Benchmark RMS normalization
fn benchmarkRMSNorm(allocator: std.mem.Allocator) !BenchmarkResult {
    const iterations = 1000;
    const size = 4096; // Typical hidden dimension

    const input = try allocator.alloc(f32, size);
    defer allocator.free(input);
    const weight = try allocator.alloc(f32, size);
    defer allocator.free(weight);
    const output = try allocator.alloc(f32, size);
    defer allocator.free(output);

    // Initialize data
    for (input, weight) |*i, *w| {
        i.* = 0.1;
        w.* = 1.0;
    }
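
    // RMSNorm: output[i] = input[i] / sqrt(mean(input^2) + eps) * weight[i],
    // with eps = 1e-6 guarding against division by zero. rmsNormVec is the
    // project's SIMD implementation in deepseek_core.math.rms_norm.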
    const start_time = std.time.nanoTimestamp();
    for (0..iterations) |_| {
        deepseek_core.math.rms_norm.rmsNormVec(input, weight, output, 1e-6);
    }
    const end_time = std.time.nanoTimestamp();

    const total_time = @as(u64, @intCast(end_time - start_time));
    const avg_time = total_time / iterations;

    const ops_per_second = @as(f64, @floatFromInt(iterations)) / (@as(f64, @floatFromInt(total_time)) / 1_000_000_000.0);

    return BenchmarkResult{
        .name = "RMS Normalization (SIMD)",
        .iterations = iterations,
        .total_time_ns = total_time,
        .avg_time_ns = avg_time,
        .ops_per_second = ops_per_second,
        .memory_used_mb = (@as(f64, @floatFromInt(size)) * 3.0 * 4.0) / (1024.0 * 1024.0),
    };
}

/// Benchmark memory bandwidth
fn benchmarkMemoryBandwidth(allocator: std.mem.Allocator) !BenchmarkResult {
    const iterations = 100;
    const size = 64 * 1024 * 1024; // 64MB

    const source = try allocator.alloc(u8, size);
    defer allocator.free(source);
    const dest = try allocator.alloc(u8, size);
    defer allocator.free(dest);

    // Fill source with data
    @memset(source, 0x42);
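
    // Note: total_bytes below counts each copied byte once, so the reported
    // figure is copy throughput. The bus actually moves ~2x that (every byte
    // is read from source and written to dest), so raw DRAM traffic is about
    // double the printed GB/s.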
    const start_time = std.time.nanoTimestamp();
    for (0..iterations) |_| {
        @memcpy(dest, source);
    }
    const end_time = std.time.nanoTimestamp();

    const total_time = @as(u64, @intCast(end_time - start_time));
    const avg_time = total_time / iterations;

    const total_bytes = size * iterations;
    const gb_per_second = (@as(f64, @floatFromInt(total_bytes)) / (@as(f64, @floatFromInt(total_time)) / 1_000_000_000.0)) / (1024.0 * 1024.0 * 1024.0);

    return BenchmarkResult{
        .name = "Memory Bandwidth",
        .iterations = iterations,
        .total_time_ns = total_time,
        .avg_time_ns = avg_time,
        .ops_per_second = gb_per_second, // Actually GB/s
        .memory_used_mb = (@as(f64, @floatFromInt(size)) * 2.0) / (1024.0 * 1024.0),
    };
}