// Benchmark Suite for DeepZig V3 Implementation
// Tests performance of core operations across different backends

const std = @import("std");
const print = std.debug.print;

const cpu_backend = @import("cpu_backend");
const deepseek_core = @import("deepseek_core");
const Shape = deepseek_core.Shape; // Import Shape from deepseek_core

/// One row of benchmark output: timing, throughput and memory figures for a
/// single named benchmark.
const BenchmarkResult = struct {
    /// Human-readable benchmark name.
    name: []const u8,
    /// Number of timed iterations.
    iterations: u32,
    /// Wall-clock time for all iterations, in nanoseconds.
    total_time_ns: u64,
    /// Average time per iteration, in nanoseconds.
    avg_time_ns: u64,
    /// Throughput in operations per second.
    ops_per_second: f64,
    /// Memory used, in megabytes.
    memory_used_mb: f64,

    /// Writes the result as one table row:
    /// `name | iterations | avg ms | ops/s | MB`.
    /// `fmt` and `options` are ignored; the column layout is fixed.
    pub fn format(
        self: BenchmarkResult,
        comptime fmt: []const u8,
        options: std.fmt.FormatOptions,
        writer: anytype,
    ) !void {
        _ = fmt;
        _ = options;
        // avg_time_ns is stored in nanoseconds; the table shows milliseconds.
        const avg_time_ms = @as(f64, @floatFromInt(self.avg_time_ns)) / 1_000_000.0;
        try writer.print("{s:30} | {d:6} iter | {d:8.2} ms | {d:10.0} ops/s | {d:6.1} MB", .{
            self.name,
            self.iterations,
            avg_time_ms,
            self.ops_per_second,
            self.memory_used_mb,
        });
    }
};

/// Entry point: runs the tensor, BLAS and memory benchmark groups in order.
/// Any error from a benchmark group is propagated to the caller.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    // Print banner
    printBanner();

    // Run comprehensive benchmarks
    try runTensorBenchmarks(allocator);
    try runBlasBenchmarks(allocator);
    try runMemoryBenchmarks(allocator);

    // Print summary
    // NOTE(review): printBenchmarkSummary is not defined in the portion of the
    // file reviewed here — confirm it exists elsewhere.
    printBenchmarkSummary();

    std.log.info("🎉 Benchmark suite completed!", .{});
}

/// Logs the suite title banner.
fn printBanner() void {
    std.log.info("🚀 DeepZig V3 Performance Benchmarks", .{});
    std.log.info("==========================================", .{});
    std.log.info("", .{});
}

/// Runs the matrix-multiplication benchmark over a fixed set of square sizes,
/// followed by the tensor-addition benchmark.
fn runTensorBenchmarks(allocator: std.mem.Allocator) !void {
    std.log.info("📊 TENSOR OPERATIONS BENCHMARK", .{});
    std.log.info("-------------------------------", .{});

    // Test different matrix sizes; larger matrices get fewer iterations.
    const sizes = [_]u32{ 256, 512, 1024, 2048 };
    const iterations = [_]u32{ 50, 20, 10, 5 };

    for (sizes, iterations) |size, iters| {
        try benchmarkMatrixMultiplication(allocator, size, iters);
    }

    // Tensor addition benchmark
    try benchmarkTensorAddition(allocator);

    std.log.info("", .{});
}

/// Times `iterations` multiplications of two `size`x`size` f32 matrices and
/// logs the average time per iteration and the achieved GFLOPS.
///
/// Returns an error if matrix creation, timer start-up or `matmul` fails.
fn benchmarkMatrixMultiplication(allocator: std.mem.Allocator, size: u32, iterations: u32) !void {
    std.log.info("🔢 Matrix Multiplication {}x{} ({} iterations)", .{ size, size, iterations });

    // Create matrices. Each defer sits directly after its allocation so that
    // a failure creating a later matrix still releases the earlier ones.
    var a = try deepseek_core.createMatrix(.f32, allocator, size, size);
    defer a.deinit();
    var b = try deepseek_core.createMatrix(.f32, allocator, size, size);
    defer b.deinit();
    var c = try deepseek_core.createMatrix(.f32, allocator, size, size);
    defer c.deinit();

    // Fill with random data
    a.fillRandom(42);
    b.fillRandom(123);

    // Benchmark
    var timer = try std.time.Timer.start();
    for (0..iterations) |_| {
        try a.matmul(&b, &c);
    }
    const elapsed_ns = timer.read();

    // Calculate performance metrics: 2 * size^3 floating-point operations are
    // counted per multiplication.
    const size_f = @as(f64, @floatFromInt(size));
    const iterations_f = @as(f64, @floatFromInt(iterations));
    const ops = 2.0 * size_f * size_f * size_f * iterations_f;
    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / 1e9;
    const gflops = ops / elapsed_s / 1e9;
    const avg_time_ms = elapsed_s * 1000.0 / iterations_f;

    // Performance comparison
    if (a.blas_ctx) |blas_context| {
        const peak_gflops = blas_context.performance_info.peak_gflops;
        const efficiency = gflops / peak_gflops * 100.0;
        std.log.info(" ✅ BLAS-accelerated: {d:.1} ms/iter, {d:.1} GFLOPS ({d:.1}% efficiency)", .{ avg_time_ms, gflops, efficiency });
        std.log.info(" 🔧 Backend: {}, Peak: {d:.1} GFLOPS", .{ blas_context.backend, peak_gflops });
    } else {
        std.log.info(" ⚠️ Naive implementation: {d:.1} ms/iter, {d:.1} GFLOPS", .{ avg_time_ms, gflops });
    }
}

/// Times repeated element-wise addition of two 1M-element f32 vectors and logs
/// the element throughput and the implied memory bandwidth.
///
/// Returns an error if vector creation, timer start-up or `add` fails.
fn benchmarkTensorAddition(allocator: std.mem.Allocator) !void {
    const size = 1024 * 1024; // 1M elements
    const iterations = 1000;

    std.log.info("➕ Tensor Addition (SIMD) - {} elements, {} iterations", .{ size, iterations });

    // Each defer sits directly after its allocation so that a failure creating
    // a later vector still releases the earlier ones.
    var a = try deepseek_core.createVector(.f32, allocator, size);
    defer a.deinit();
    var b = try deepseek_core.createVector(.f32, allocator, size);
    defer b.deinit();
    var c = try deepseek_core.createVector(.f32, allocator, size);
    defer c.deinit();

    a.fillRandom(42);
    b.fillRandom(123);

    var timer = try std.time.Timer.start();
    for (0..iterations) |_| {
        try a.add(&b, &c);
    }
    const elapsed_ns = timer.read();

    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / 1e9;
    const operations_per_sec = @as(f64, @floatFromInt(size * iterations)) / elapsed_s;
    const bandwidth_gb_s = operations_per_sec * @sizeOf(f32) * 3 / (1024 * 1024 * 1024); // 3x for read a, read b, write c

    std.log.info(" ✅ {d:.1} GOp/s, {d:.1} GB/s bandwidth", .{ operations_per_sec / 1e9, bandwidth_gb_s });
}

/// Initializes BLAS, logs the detected backend and its reported performance
/// characteristics, then runs the dedicated BLAS benchmark.
///
/// A BLAS initialization failure is logged and the benchmark is skipped
/// without returning an error; errors from the dedicated benchmark propagate.
fn runBlasBenchmarks(allocator: std.mem.Allocator) !void {
    std.log.info("🧮 BLAS LIBRARY BENCHMARK", .{});
    std.log.info("-------------------------", .{});

    // Initialize BLAS and show detection results
    // NOTE(review): no deinit is called on the returned context here — confirm
    // whether Blas needs one.
    const blas_context = deepseek_core.blas.Blas.init(allocator) catch |err| {
        std.log.info("⚠️ BLAS initialization failed, using naive implementation", .{});
        // Report which error occurred instead of discarding it.
        std.log.info(" Reason: {s}", .{@errorName(err)});
        return;
    };

    const info = blas_context.performance_info;
    std.log.info("🔍 BLAS Detection Results:", .{});
    std.log.info(" Backend: {}", .{blas_context.backend});
    std.log.info(" Expected Peak Performance: {d:.1} GFLOPS", .{info.peak_gflops});
    std.log.info(" Memory Bandwidth: {d:.1} GB/s", .{info.memory_bandwidth_gb_s});
    std.log.info(" SIMD Width: {} bits", .{info.simd_width});
    std.log.info(" Mixed Precision: {}", .{info.supports_mixed_precision});

    // Run dedicated BLAS benchmark
    std.log.info("", .{});
    std.log.info("🚀 Running dedicated BLAS benchmark...", .{});
    try deepseek_core.blas.benchmarkBlas(allocator);

    std.log.info("", .{});
}

/// Runs the memory bandwidth benchmark followed by the memory latency benchmark.
fn runMemoryBenchmarks(allocator: std.mem.Allocator) !void {
    std.log.info("💾 MEMORY PERFORMANCE BENCHMARK", .{});
    std.log.info("--------------------------------", .{});

    try benchmarkMemoryBandwidth(allocator);
    try benchmarkMemoryLatency(allocator);

    std.log.info("", .{});
}

/// Measures sequential-read and copy throughput over a 128MB f32 buffer and
/// logs both figures in GB/s.
///
/// Returns an error if either buffer cannot be allocated or the timer cannot
/// be started.
fn benchmarkMemoryBandwidth(allocator: std.mem.Allocator) !void {
    const size = 128 * 1024 * 1024 / @sizeOf(f32); // 128MB of f32s
    const iterations = 100;

    std.log.info("📈 Memory Bandwidth Test - {} MB, {} iterations", .{ size * @sizeOf(f32) / (1024 * 1024), iterations });

    const data = try allocator.alloc(f32, size);
    defer allocator.free(data);
    const dest = try allocator.alloc(f32, size);
    defer allocator.free(dest);

    // Fill with data
    for (data, 0..) |*ptr, i| {
        ptr.* = @floatFromInt(i % 1000);
    }

    // Sequential read benchmark. The checksum is logged below, so the reads
    // have an observable result.
    var timer = try std.time.Timer.start();
    var checksum: f64 = 0;
    for (0..iterations) |_| {
        for (data) |value| {
            checksum += value;
        }
    }
    const elapsed_ns = timer.read();

    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / 1e9;
    const bytes_read = @as(f64, @floatFromInt(size * @sizeOf(f32) * iterations));
    const bandwidth_gb_s = bytes_read / elapsed_s / (1024 * 1024 * 1024);

    std.log.info(" ✅ Sequential Read: {d:.1} GB/s (checksum: {d:.1})", .{ bandwidth_gb_s, checksum });

    // Memory copy benchmark
    timer.reset();
    for (0..iterations) |_| {
        @memcpy(dest, data);
        // Nothing else reads `dest`; mark it as observed so an optimizing
        // build cannot drop the copies being timed.
        std.mem.doNotOptimizeAway(dest.ptr);
    }
    const copy_elapsed_ns = timer.read();

    const copy_elapsed_s = @as(f64, @floatFromInt(copy_elapsed_ns)) / 1e9;
    // The copy rate is reported in bytes copied per second, which is the same
    // byte count as the read test.
    const copy_bandwidth_gb_s = bytes_read / copy_elapsed_s / (1024 * 1024 * 1024);

    std.log.info(" ✅ Memory Copy: {d:.1} GB/s", .{copy_bandwidth_gb_s});
}

/// Measures the average time of dependent random accesses: each loaded value
/// is the index of the next load, over a 1M-entry u32 table.
///
/// Returns an error if the table cannot be allocated or the timer cannot be
/// started.
fn benchmarkMemoryLatency(allocator: std.mem.Allocator) !void {
    const size = 1024 * 1024; // 1M elements
    const iterations = 1000;

    std.log.info("⏱️ Memory Latency Test - Random Access Pattern", .{});

    const data = try allocator.alloc(u32, size);
    defer allocator.free(data);

    // Create random access pattern
    // NOTE(review): this is a random mapping, not a single-cycle permutation,
    // so the chase may settle into a short cycle that stays cache-resident —
    // consider Sattolo's algorithm if main-memory latency is the goal.
    var rng = std.Random.DefaultPrng.init(42);
    const random = rng.random();
    for (data) |*slot| {
        slot.* = @intCast(random.uintLessThan(usize, size));
    }

    var timer = try std.time.Timer.start();
    var index: u32 = 0;
    for (0..iterations) |_| {
        for (0..size) |_| {
            index = data[index];
        }
    }
    const elapsed_ns = timer.read();

    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / 1e9;
    const total_accesses = @as(f64, @floatFromInt(size * iterations));
    const accesses_per_sec = total_accesses / elapsed_s;
    const avg_latency_ns = elapsed_s * 1e9 / total_accesses;

    // The final index is logged so the chase has an observable result.
    std.log.info(" ✅ {d:.1} M accesses/s, {d:.1} ns avg latency (index: {})", .{ accesses_per_sec / 1e6, avg_latency_ns, index });
}