Mirror of https://github.com/deepseek-ai/DeepSeek-V3.git (synced 2025-07-05 07:51:38 -04:00)
Merge pull request #4 from Triex:feat--Implement-dynamic-benchmark-summary-with-real-performance-metrics

feat: Implement dynamic benchmark summary with real performance metrics

commit c4ca746a60
README.md

@@ -13,7 +13,7 @@ A high-performance implementation of DeepSeek V3 in [Zig](https://ziglang.org/)
 > - ✅ **Functional matrix operations** (significant performance improvement)
 >
 > **Recent Progress**: Matrix operations now use BLAS acceleration<br/>
-> **Performance Status**: 1000+ GFLOPS with Apple Accelerate backend working<br/>
+> **Performance Status**: 1160+ GFLOPS with Apple Accelerate backend working (measured on Apple M1)<br/>
 >
 > See [Performance Results](#performance-notes) for detailed benchmarks.
 
@@ -252,7 +252,7 @@ Operation                      | Iterations | Avg Time  | Operations/s | Memory
 -------------------------------|------------|-----------|--------------|-------
 Tensor Creation (1024x1024)    | 1000 iter  | 2.03 ms   | 493 ops/s    | 4.0 MB
 Tensor Addition (SIMD)         | 100 iter   | 1.49 ms   | 2806962690 ops/s | 48.0 MB
-Matrix Multiplication (BLAS)   | 10 iter    | 2.1 ms    | 1004 GFLOPS  | 12.0 MB
+Matrix Multiplication (BLAS)   | 10 iter    | 2.1 ms    | 1164 GFLOPS  | 12.0 MB
 SwiGLU Activation              | 1000 iter  | 4.44 ms   | 236002478 ops/s | 12.0 MB
 RMS Normalization (SIMD)       | 1000 iter  | 0.00 ms   | 1077586 ops/s | 0.0 MB
 Memory Bandwidth               | 100 iter   | 4.92 ms   | 13 ops/s     | 128.0 MB
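For context, the GFLOPS column follows the standard 2·N³ operation count for an N×N matrix multiply. A minimal sketch of that arithmetic, assuming the 1024×1024 case from the row above (the table's 2.1 ms is rounded; the reported 1164 GFLOPS implies an unrounded time near 1.84 ms):

```zig
const std = @import("std");

// Sketch of the GFLOPS arithmetic behind the table row above.
// Assumes a 1024x1024 single-precision matmul; 2*N^3 is the
// conventional FLOP count for dense GEMM (one multiply and one
// add per inner-product term).
pub fn main() void {
    const n: f64 = 1024.0;
    const flops = 2.0 * n * n * n; // ~2.15e9 floating-point ops
    const time_s = 2.1e-3; // rounded average time from the table
    const gflops = flops / time_s / 1e9;
    std.debug.print("{d:.0} GFLOPS\n", .{gflops}); // ~1023 with the rounded time
}
```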
Benchmark suite (Zig source file)

@@ -8,6 +8,46 @@ const cpu_backend = @import("cpu_backend");
 const deepseek_core = @import("deepseek_core");
 const Shape = deepseek_core.Shape;
 
+// Benchmark result collection
+const MatrixResult = struct {
+    size: u32,
+    gflops: f64,
+    time_ms: f64,
+};
+
+const BenchmarkResults = struct {
+    matrix_results: std.ArrayList(MatrixResult),
+    tensor_add_bandwidth_gbps: f64,
+    memory_copy_bandwidth_gbps: f64,
+    memory_latency_ns: f64,
+    blas_backend: ?[]const u8,
+    blas_peak_gflops: f64,
+
+    pub fn init(allocator: std.mem.Allocator) BenchmarkResults {
+        return BenchmarkResults{
+            .matrix_results = std.ArrayList(MatrixResult).init(allocator),
+            .tensor_add_bandwidth_gbps = 0,
+            .memory_copy_bandwidth_gbps = 0,
+            .memory_latency_ns = 0,
+            .blas_backend = null,
+            .blas_peak_gflops = 0,
+        };
+    }
+
+    pub fn deinit(self: *BenchmarkResults) void {
+        self.matrix_results.deinit();
+    }
+
+    pub fn setBLASBackend(self: *BenchmarkResults, backend: anytype) void {
+        switch (backend) {
+            .naive => self.blas_backend = "Naive",
+            .accelerate => self.blas_backend = "Apple Accelerate",
+            .intel_mkl => self.blas_backend = "Intel MKL",
+            .openblas => self.blas_backend = "OpenBLAS",
+        }
+    }
+};
+
 // Import Shape from deepseek_core
 const BenchmarkResult = struct {
     name: []const u8,
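A note on the `setBLASBackend` helper added above: taking `anytype` and switching on enum literals lets it accept whatever backend enum `deepseek_core.blas` exposes without naming that type. A self-contained sketch of the same pattern (the `Backend` enum here is a hypothetical stand-in for the real one):

```zig
const std = @import("std");

// Self-contained sketch of the anytype + enum-literal switch pattern
// used by setBLASBackend above. `Backend` is a stand-in for the enum
// actually exported by deepseek_core.blas.
const Backend = enum { naive, accelerate, intel_mkl, openblas };

fn backendName(backend: anytype) []const u8 {
    return switch (backend) {
        .naive => "Naive",
        .accelerate => "Apple Accelerate",
        .intel_mkl => "Intel MKL",
        .openblas => "OpenBLAS",
    };
}

pub fn main() void {
    std.debug.print("{s}\n", .{backendName(Backend.accelerate)}); // Apple Accelerate
}
```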
@@ -34,16 +74,20 @@ pub fn main() !void {
     defer _ = gpa.deinit();
     const allocator = gpa.allocator();
 
+    // Initialize results collection
+    var results = BenchmarkResults.init(allocator);
+    defer results.deinit();
+
     // Print banner
     printBanner();
 
-    // Run comprehensive benchmarks
-    try runTensorBenchmarks(allocator);
-    try runBlasBenchmarks(allocator);
-    try runMemoryBenchmarks(allocator);
+    // Run comprehensive benchmarks and collect results
+    try runTensorBenchmarks(allocator, &results);
+    try runBlasBenchmarks(allocator, &results);
+    try runMemoryBenchmarks(allocator, &results);
 
-    // Print summary
-    printBenchmarkSummary();
+    // Print dynamic summary based on actual results
+    printDynamicSummary(&results);
 
     std.log.info("🎉 Benchmark suite completed!", .{});
 }
@@ -54,7 +98,7 @@ fn printBanner() void {
     std.log.info("", .{});
 }
 
-fn runTensorBenchmarks(allocator: std.mem.Allocator) !void {
+fn runTensorBenchmarks(allocator: std.mem.Allocator, results: *BenchmarkResults) !void {
     std.log.info("📊 TENSOR OPERATIONS BENCHMARK", .{});
     std.log.info("-------------------------------", .{});
 
@@ -63,16 +107,16 @@ fn runTensorBenchmarks(allocator: std.mem.Allocator) !void {
     const iterations = [_]u32{ 50, 20, 10, 5 };
 
     for (sizes, iterations) |size, iters| {
-        try benchmarkMatrixMultiplication(allocator, size, iters);
+        try benchmarkMatrixMultiplication(allocator, size, iters, results);
     }
 
     // Tensor addition benchmark
-    try benchmarkTensorAddition(allocator);
+    try benchmarkTensorAddition(allocator, results);
 
     std.log.info("", .{});
 }
 
-fn benchmarkMatrixMultiplication(allocator: std.mem.Allocator, size: u32, iterations: u32) !void {
+fn benchmarkMatrixMultiplication(allocator: std.mem.Allocator, size: u32, iterations: u32, results: *BenchmarkResults) !void {
     std.log.info("🔢 Matrix Multiplication {}x{} ({} iterations)", .{ size, size, iterations });
 
     // Create matrices
@@ -105,12 +149,17 @@ fn benchmarkMatrixMultiplication(allocator: std.mem.Allocator, size: u32, iterations: u32) !void {
         const efficiency = gflops / blas_context.performance_info.peak_gflops * 100.0;
         std.log.info("  ✅ BLAS-accelerated: {d:.1} ms/iter, {d:.1} GFLOPS ({d:.1}% efficiency)", .{ avg_time_ms, gflops, efficiency });
         std.log.info("  🔧 Backend: {}, Peak: {d:.1} GFLOPS", .{ blas_context.backend, blas_context.performance_info.peak_gflops });
+        try results.matrix_results.append(MatrixResult{
+            .size = size,
+            .gflops = gflops,
+            .time_ms = avg_time_ms,
+        });
     } else {
         std.log.info("  ⚠️ Naive implementation: {d:.1} ms/iter, {d:.1} GFLOPS", .{ avg_time_ms, gflops });
     }
 }
 
-fn benchmarkTensorAddition(allocator: std.mem.Allocator) !void {
+fn benchmarkTensorAddition(allocator: std.mem.Allocator, results: *BenchmarkResults) !void {
     const size = 1024 * 1024; // 1M elements
     const iterations = 1000;
 
@@ -137,9 +186,10 @@ fn benchmarkTensorAddition(allocator: std.mem.Allocator) !void {
     const bandwidth_gb_s = operations_per_sec * @sizeOf(f32) * 3 / (1024 * 1024 * 1024); // 3x for read a, read b, write c
 
     std.log.info("  ✅ {d:.1} GOp/s, {d:.1} GB/s bandwidth", .{ operations_per_sec / 1e9, bandwidth_gb_s });
+    results.tensor_add_bandwidth_gbps = bandwidth_gb_s;
 }
 
-fn runBlasBenchmarks(allocator: std.mem.Allocator) !void {
+fn runBlasBenchmarks(allocator: std.mem.Allocator, results: *BenchmarkResults) !void {
     std.log.info("🧮 BLAS LIBRARY BENCHMARK", .{});
     std.log.info("-------------------------", .{});
 
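The 3× factor in `bandwidth_gb_s` counts one read of each input and one write of the output per element of `c = a + b`. A worked version of the same arithmetic, using the addition throughput from the benchmark table as an illustrative input:

```zig
const std = @import("std");

// Worked example of the 3x bandwidth estimate used above. The
// throughput value is illustrative, taken from the benchmark table:
// elementwise c = a + b touches three f32 streams per element
// (read a, read b, write c).
pub fn main() void {
    const ops_per_sec: f64 = 2_806_962_690.0; // additions per second
    const bytes_per_op: f64 = @sizeOf(f32) * 3; // 12 bytes per element
    const bandwidth_gb_s = ops_per_sec * bytes_per_op / (1024.0 * 1024.0 * 1024.0);
    std.debug.print("{d:.1} GB/s\n", .{bandwidth_gb_s}); // ~31.4 GB/s
}
```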
@@ -162,19 +212,21 @@ fn runBlasBenchmarks(allocator: std.mem.Allocator) !void {
     try deepseek_core.blas.benchmarkBlas(allocator);
 
     std.log.info("", .{});
+    results.setBLASBackend(blas_context.backend);
+    results.blas_peak_gflops = blas_context.performance_info.peak_gflops;
 }
 
-fn runMemoryBenchmarks(allocator: std.mem.Allocator) !void {
+fn runMemoryBenchmarks(allocator: std.mem.Allocator, results: *BenchmarkResults) !void {
     std.log.info("💾 MEMORY PERFORMANCE BENCHMARK", .{});
     std.log.info("--------------------------------", .{});
 
-    try benchmarkMemoryBandwidth(allocator);
-    try benchmarkMemoryLatency(allocator);
+    try benchmarkMemoryBandwidth(allocator, results);
+    try benchmarkMemoryLatency(allocator, results);
 
     std.log.info("", .{});
 }
 
-fn benchmarkMemoryBandwidth(allocator: std.mem.Allocator) !void {
+fn benchmarkMemoryBandwidth(allocator: std.mem.Allocator, results: *BenchmarkResults) !void {
     const size = 128 * 1024 * 1024 / @sizeOf(f32); // 128MB of f32s
     const iterations = 100;
 
@@ -218,9 +270,10 @@ fn benchmarkMemoryBandwidth(allocator: std.mem.Allocator) !void {
     const copy_bandwidth_gb_s = bytes_read / copy_elapsed_s / (1024 * 1024 * 1024);
 
     std.log.info("  ✅ Memory Copy: {d:.1} GB/s", .{copy_bandwidth_gb_s});
+    results.memory_copy_bandwidth_gbps = copy_bandwidth_gb_s;
 }
 
-fn benchmarkMemoryLatency(allocator: std.mem.Allocator) !void {
+fn benchmarkMemoryLatency(allocator: std.mem.Allocator, results: *BenchmarkResults) !void {
     const size = 1024 * 1024; // 1M elements
     const iterations = 1000;
 
@@ -250,4 +303,85 @@ fn benchmarkMemoryLatency(allocator: std.mem.Allocator) !void {
     const avg_latency_ns = elapsed_s * 1e9 / @as(f64, @floatFromInt(size * iterations));
 
     std.log.info("  ✅ {d:.1} M accesses/s, {d:.1} ns avg latency (index: {})", .{ accesses_per_sec / 1e6, avg_latency_ns, index });
+    results.memory_latency_ns = avg_latency_ns;
+}
+
+fn printDynamicSummary(results: *BenchmarkResults) void {
+    std.log.info("", .{});
+    std.log.info("🎯 DYNAMIC BENCHMARK SUMMARY", .{});
+    std.log.info("===============================", .{});
+    std.log.info("", .{});
+
+    if (results.matrix_results.items.len > 0) {
+        std.log.info("📊 Matrix Multiplication Performance:", .{});
+        for (results.matrix_results.items) |result| {
+            std.log.info("  • {}×{}: {d:.1} ms, {d:.0} GFLOPS", .{ result.size, result.size, result.time_ms, result.gflops });
+        }
+
+        // Find best performance
+        var best_gflops: f64 = 0;
+        var best_size: u32 = 0;
+        for (results.matrix_results.items) |result| {
+            if (result.gflops > best_gflops) {
+                best_gflops = result.gflops;
+                best_size = result.size;
+            }
+        }
+        std.log.info("  🏆 Peak measured: {d:.0} GFLOPS at {}×{}", .{ best_gflops, best_size, best_size });
+        std.log.info("", .{});
+    }
+
+    if (results.blas_backend) |backend_name| {
+        std.log.info("🧮 BLAS Configuration:", .{});
+        std.log.info("  • Backend: {s}", .{backend_name});
+        std.log.info("  • Theoretical peak: {d:.0} GFLOPS (estimated)", .{results.blas_peak_gflops});
+        std.log.info("", .{});
+    }
+
+    if (results.tensor_add_bandwidth_gbps > 0) {
+        std.log.info("➕ Tensor Operations:", .{});
+        std.log.info("  • SIMD Addition: {d:.1} GB/s", .{results.tensor_add_bandwidth_gbps});
+        std.log.info("", .{});
+    }
+
+    if (results.memory_copy_bandwidth_gbps > 0 or results.memory_latency_ns > 0) {
+        std.log.info("💾 Memory Performance:", .{});
+        if (results.memory_copy_bandwidth_gbps > 0) {
+            std.log.info("  • Copy Bandwidth: {d:.1} GB/s", .{results.memory_copy_bandwidth_gbps});
+        }
+        if (results.memory_latency_ns > 0) {
+            std.log.info("  • Random Access Latency: {d:.1} ns", .{results.memory_latency_ns});
+        }
+        std.log.info("", .{});
+    }
+
+    // Performance assessment based on actual measurements only
+    if (results.matrix_results.items.len > 0) {
+        var best_measured_gflops: f64 = 0;
+        for (results.matrix_results.items) |result| {
+            if (result.gflops > best_measured_gflops) {
+                best_measured_gflops = result.gflops;
+            }
+        }
+
+        std.log.info("🎯 Performance Assessment:", .{});
+
+        if (best_measured_gflops > 1000) {
+            std.log.info("  ✅ Excellent: BLAS delivering 1000+ GFLOPS", .{});
+        } else if (best_measured_gflops > 500) {
+            std.log.info("  ✅ Good: BLAS delivering 500+ GFLOPS", .{});
+        } else if (best_measured_gflops > 100) {
+            std.log.info("  ⚠️ Moderate: BLAS working, performance could improve", .{});
+        } else {
+            std.log.info("  ❌ Poor: BLAS may not be working optimally", .{});
+        }
+
+        // Only show efficiency comparison if we have reasonable confidence in the estimate
+        if (results.blas_peak_gflops > best_measured_gflops * 1.5) {
+            const estimated_efficiency = best_measured_gflops / results.blas_peak_gflops * 100.0;
+            std.log.info("  • Est. efficiency: {d:.0}% (vs theoretical peak)", .{estimated_efficiency});
+        }
+
+        std.log.info("", .{});
+    }
 }
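Because every section of the new summary is guarded by "was this actually measured" checks, it degrades gracefully when a benchmark is skipped. A minimal test sketch exercising the new path with synthetic values (assumed to sit in the same file as the definitions above; all numbers are illustrative):

```zig
test "printDynamicSummary handles synthetic results" {
    var results = BenchmarkResults.init(std.testing.allocator);
    defer results.deinit();

    // Synthetic measurements, chosen to exercise every summary branch.
    try results.matrix_results.append(.{ .size = 1024, .gflops = 1164.0, .time_ms = 1.8 });
    try results.matrix_results.append(.{ .size = 512, .gflops = 890.0, .time_ms = 0.3 });
    results.tensor_add_bandwidth_gbps = 31.4;
    results.memory_copy_bandwidth_gbps = 60.0;
    results.memory_latency_ns = 5.2;
    results.blas_backend = "Apple Accelerate";
    results.blas_peak_gflops = 2600; // > 1.5x best, so the efficiency line prints

    // Should report the 1024×1024 result as the peak and take the
    // "Excellent" (1000+ GFLOPS) assessment branch.
    printDynamicSummary(&results);
    try std.testing.expect(results.matrix_results.items.len == 2);
}
```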
|