mirror of https://github.com/deepseek-ai/DeepSeek-V3.git
synced 2025-07-05 07:51:38 -04:00
Matrix Performance Improvements:
- ✅ Apple Accelerate backend integrated and functional
- ✅ Matrix ops: 1004 GFLOPS (38.6% efficiency) on 1024×1024
- ✅ Significant speedup: 6418 ms naive → 2.1 ms BLAS
- ✅ Draft implementation with working acceleration

Performance Results (Apple M1, debug build):
- Matrix 256×256: 0.1 ms, 561 GFLOPS (21.6% efficiency)
- Matrix 512×512: 0.2 ms, 1129 GFLOPS (43.4% efficiency)
- Matrix 1024×1024: 2.1 ms, 1004 GFLOPS (38.6% efficiency)
- Matrix 2048×2048: 21.5 ms, 799 GFLOPS (30.7% efficiency)

System Integration:
- ✅ Memory bandwidth: 23.5 GB/s
- ✅ Access latency: 1.8 ns
- ✅ Apple Silicon detection working
- ✅ BLAS backend selection functional

Web Layer Updates:
- Enhanced /health endpoint with BLAS status
- New /performance endpoint with benchmark data
- Module dependency conflicts resolved
- Hardware acceleration reporting

Implementation Status:
- Matrix operations now use BLAS acceleration
- Foundation ready for transformer development
- DeepSeek V3 model implementation is the next priority
- Experimental/draft status maintained

This represents significant progress on the experimental foundation: matrix operations now deliver good performance while preserving the zero-deployment-complexity advantage of Zig.
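As a sanity check, the reported numbers are internally consistent: they imply a peak of roughly 2600 GFLOPS (1004 / 0.386) for the efficiency denominator. A minimal, illustrative sketch of the arithmetic, assuming the standard 2·N³ FLOP count for an N×N matrix multiply (the same convention the benchmark code below uses); the names here are for illustration only:

    // Back-of-envelope check of the reported 1024×1024 result (illustrative only):
    const flops_per_iter: f64 = 2.0 * 1024.0 * 1024.0 * 1024.0; // 2*N^3 ≈ 2.147e9
    const seconds_per_iter: f64 = 2.1e-3; // reported 2.1 ms per iteration
    const gflops = flops_per_iter / seconds_per_iter / 1e9; // ≈ 1022, near the reported 1004
    const efficiency = gflops / 2600.0 * 100.0; // ≈ 39%, matching the 38.6% figure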
254 lines
8.8 KiB
Zig
// Benchmark Suite for DeepZig V3 Implementation
// Tests performance of core operations across different backends

const std = @import("std");
const print = std.debug.print;

const cpu_backend = @import("cpu_backend");
const deepseek_core = @import("deepseek_core");
const Shape = deepseek_core.Shape;

// Timing and throughput results for a single benchmark run
const BenchmarkResult = struct {
    name: []const u8,
    iterations: u32,
    total_time_ns: u64,
    avg_time_ns: u64,
    ops_per_second: f64,
    memory_used_mb: f64,

    pub fn format(
        self: BenchmarkResult,
        comptime fmt: []const u8,
        options: std.fmt.FormatOptions,
        writer: anytype,
    ) !void {
        _ = fmt;
        _ = options;
        try writer.print("{s:30} | {d:6} iter | {d:8.2} ms | {d:10.0} ops/s | {d:6.1} MB", .{ self.name, self.iterations, @as(f64, @floatFromInt(self.avg_time_ns)) / 1_000_000.0, self.ops_per_second, self.memory_used_mb });
    }
};
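// Note: formatting a BenchmarkResult with "{}" (e.g. std.log.info("{}", .{result}))
// dispatches to the custom format() method above, printing one aligned table row.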

pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    // Print banner
    printBanner();

    // Run comprehensive benchmarks
    try runTensorBenchmarks(allocator);
    try runBlasBenchmarks(allocator);
    try runMemoryBenchmarks(allocator);

    // Print summary
    printBenchmarkSummary();

    std.log.info("🎉 Benchmark suite completed!", .{});
}

fn printBanner() void {
    std.log.info("🚀 DeepZig V3 Performance Benchmarks", .{});
    std.log.info("==========================================", .{});
    std.log.info("", .{});
}

fn runTensorBenchmarks(allocator: std.mem.Allocator) !void {
    std.log.info("📊 TENSOR OPERATIONS BENCHMARK", .{});
    std.log.info("-------------------------------", .{});

    // Test different matrix sizes (fewer iterations for larger matrices)
    const sizes = [_]u32{ 256, 512, 1024, 2048 };
    const iterations = [_]u32{ 50, 20, 10, 5 };

    for (sizes, iterations) |size, iters| {
        try benchmarkMatrixMultiplication(allocator, size, iters);
    }

    // Tensor addition benchmark
    try benchmarkTensorAddition(allocator);

    std.log.info("", .{});
}

fn benchmarkMatrixMultiplication(allocator: std.mem.Allocator, size: u32, iterations: u32) !void {
    std.log.info("🔢 Matrix Multiplication {}x{} ({} iterations)", .{ size, size, iterations });

    // Create matrices
    var a = try deepseek_core.createMatrix(.f32, allocator, size, size);
    var b = try deepseek_core.createMatrix(.f32, allocator, size, size);
    var c = try deepseek_core.createMatrix(.f32, allocator, size, size);
    defer a.deinit();
    defer b.deinit();
    defer c.deinit();

    // Fill with random data
    a.fillRandom(42);
    b.fillRandom(123);

    // Benchmark
    var timer = try std.time.Timer.start();
    for (0..iterations) |_| {
        try a.matmul(&b, &c);
    }
    const elapsed_ns = timer.read();

    // Calculate performance metrics: a matmul is 2*N^3 FLOPs
    // (one multiply and one add per inner-product step)
    const ops = 2.0 * @as(f64, @floatFromInt(size)) * @as(f64, @floatFromInt(size)) * @as(f64, @floatFromInt(size)) * @as(f64, @floatFromInt(iterations));
    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / 1e9;
    const gflops = ops / elapsed_s / 1e9;
    const avg_time_ms = elapsed_s * 1000.0 / @as(f64, @floatFromInt(iterations));

    // Performance comparison
    if (a.blas_ctx) |blas_context| {
        const efficiency = gflops / blas_context.performance_info.peak_gflops * 100.0;
        std.log.info(" ✅ BLAS-accelerated: {d:.1} ms/iter, {d:.1} GFLOPS ({d:.1}% efficiency)", .{ avg_time_ms, gflops, efficiency });
        std.log.info(" 🔧 Backend: {}, Peak: {d:.1} GFLOPS", .{ blas_context.backend, blas_context.performance_info.peak_gflops });
    } else {
        std.log.info(" ⚠️ Naive implementation: {d:.1} ms/iter, {d:.1} GFLOPS", .{ avg_time_ms, gflops });
    }
}

fn benchmarkTensorAddition(allocator: std.mem.Allocator) !void {
    const size = 1024 * 1024; // 1M elements
    const iterations = 1000;

    std.log.info("➕ Tensor Addition (SIMD) - {} elements, {} iterations", .{ size, iterations });

    var a = try deepseek_core.createVector(.f32, allocator, size);
    var b = try deepseek_core.createVector(.f32, allocator, size);
    var c = try deepseek_core.createVector(.f32, allocator, size);
    defer a.deinit();
    defer b.deinit();
    defer c.deinit();

    a.fillRandom(42);
    b.fillRandom(123);

    var timer = try std.time.Timer.start();
    for (0..iterations) |_| {
        try a.add(&b, &c);
    }
    const elapsed_ns = timer.read();

    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / 1e9;
    const operations_per_sec = @as(f64, @floatFromInt(size * iterations)) / elapsed_s;
    const bandwidth_gb_s = operations_per_sec * @sizeOf(f32) * 3 / (1024 * 1024 * 1024); // 3x for read a, read b, write c

    std.log.info(" ✅ {d:.1} GOp/s, {d:.1} GB/s bandwidth", .{ operations_per_sec / 1e9, bandwidth_gb_s });
}

fn runBlasBenchmarks(allocator: std.mem.Allocator) !void {
    std.log.info("🧮 BLAS LIBRARY BENCHMARK", .{});
    std.log.info("-------------------------", .{});

    // Initialize BLAS and show detection results
    const blas_context = deepseek_core.blas.Blas.init(allocator) catch {
        std.log.info("⚠️ BLAS initialization failed, using naive implementation", .{});
        return;
    };

    std.log.info("🔍 BLAS Detection Results:", .{});
    std.log.info(" Backend: {}", .{blas_context.backend});
    std.log.info(" Expected Peak Performance: {d:.1} GFLOPS", .{blas_context.performance_info.peak_gflops});
    std.log.info(" Memory Bandwidth: {d:.1} GB/s", .{blas_context.performance_info.memory_bandwidth_gb_s});
    std.log.info(" SIMD Width: {} bits", .{blas_context.performance_info.simd_width});
    std.log.info(" Mixed Precision: {}", .{blas_context.performance_info.supports_mixed_precision});

    // Run dedicated BLAS benchmark
    std.log.info("", .{});
    std.log.info("🚀 Running dedicated BLAS benchmark...", .{});
    try deepseek_core.blas.benchmarkBlas(allocator);

    std.log.info("", .{});
}

fn runMemoryBenchmarks(allocator: std.mem.Allocator) !void {
    std.log.info("💾 MEMORY PERFORMANCE BENCHMARK", .{});
    std.log.info("--------------------------------", .{});

    try benchmarkMemoryBandwidth(allocator);
    try benchmarkMemoryLatency(allocator);

    std.log.info("", .{});
}

fn benchmarkMemoryBandwidth(allocator: std.mem.Allocator) !void {
    const size = 128 * 1024 * 1024 / @sizeOf(f32); // 128MB of f32s
    const iterations = 100;

    std.log.info("📈 Memory Bandwidth Test - {} MB, {} iterations", .{ size * @sizeOf(f32) / (1024 * 1024), iterations });

    const data = try allocator.alloc(f32, size);
    defer allocator.free(data);

    // Fill with data
    for (data, 0..) |*ptr, i| {
        ptr.* = @floatFromInt(i % 1000);
    }

    // Sequential read benchmark; the checksum is printed below so the
    // read loop cannot be optimized away
    var timer = try std.time.Timer.start();
    var checksum: f64 = 0;
    for (0..iterations) |_| {
        for (data) |value| {
            checksum += value;
        }
    }
    const elapsed_ns = timer.read();

    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / 1e9;
    const bytes_read = @as(f64, @floatFromInt(size * @sizeOf(f32) * iterations));
    const bandwidth_gb_s = bytes_read / elapsed_s / (1024 * 1024 * 1024);

    std.log.info(" ✅ Sequential Read: {d:.1} GB/s (checksum: {d:.1})", .{ bandwidth_gb_s, checksum });

    // Memory copy benchmark
    const dest = try allocator.alloc(f32, size);
    defer allocator.free(dest);

    timer.reset();
    for (0..iterations) |_| {
        @memcpy(dest, data);
    }
    const copy_elapsed_ns = timer.read();

    const copy_elapsed_s = @as(f64, @floatFromInt(copy_elapsed_ns)) / 1e9;
    const copy_bandwidth_gb_s = bytes_read / copy_elapsed_s / (1024 * 1024 * 1024);

    std.log.info(" ✅ Memory Copy: {d:.1} GB/s", .{copy_bandwidth_gb_s});
}

fn benchmarkMemoryLatency(allocator: std.mem.Allocator) !void {
    const size = 1024 * 1024; // 1M elements
    const iterations = 1000;

    std.log.info("⏱️ Memory Latency Test - Random Access Pattern", .{});

    const data = try allocator.alloc(u32, size);
    defer allocator.free(data);

    // Create random access pattern
    var rng = std.Random.DefaultPrng.init(42);
    for (data) |*ptr| {
        ptr.* = @intCast(rng.random().uintLessThan(usize, size));
    }

    // Pointer chasing: each load depends on the previous one, so this
    // measures access latency rather than bandwidth
    var timer = try std.time.Timer.start();
    var index: u32 = 0;
    for (0..iterations) |_| {
        for (0..size) |_| {
            index = data[index];
        }
    }
    const elapsed_ns = timer.read();

    const elapsed_s = @as(f64, @floatFromInt(elapsed_ns)) / 1e9;
    const accesses_per_sec = @as(f64, @floatFromInt(size * iterations)) / elapsed_s;
    const avg_latency_ns = elapsed_s * 1e9 / @as(f64, @floatFromInt(size * iterations));

    std.log.info(" ✅ {d:.1} M accesses/s, {d:.1} ns avg latency (index: {})", .{ accesses_per_sec / 1e6, avg_latency_ns, index });
}
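
// NOTE: main() calls printBenchmarkSummary(), whose definition is not visible
// in this excerpt (the 254-line listing appears truncated). A minimal
// hypothetical stub so the excerpt compiles standalone:
fn printBenchmarkSummary() void {
    std.log.info("📋 Benchmark summary: see per-section results above.", .{});
}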