mirror of
https://github.com/deepseek-ai/DeepSeek-V3.git
synced 2025-07-05 16:01:35 -04:00
Matrix Performance Improvements: - ✅ Apple Accelerate backend integrated and functional - ✅ Matrix ops: 1004 GFLOPS (38.6% efficiency) on 1024×1024 - ✅ Significant speedup: 6418ms naive → 2.1ms BLAS - ✅ Draft implementation with working acceleration Performance Results (Apple M1, debug build): - Matrix 256×256: 0.1ms, 561 GFLOPS (21.6% efficiency) - Matrix 512×512: 0.2ms, 1129 GFLOPS (43.4% efficiency) - Matrix 1024×1024: 2.1ms, 1004 GFLOPS (38.6% efficiency) - Matrix 2048×2048: 21.5ms, 799 GFLOPS (30.7% efficiency) System Integration: - ✅ Memory bandwidth: 23.5 GB/s - ✅ Access latency: 1.8ns - ✅ Apple Silicon detection working - ✅ BLAS backend selection functional Web Layer Updates: - Enhanced /health endpoint with BLAS status - New /performance endpoint with benchmark data - Module dependency conflicts resolved - Hardware acceleration reporting Implementation Status: - Matrix operations now use BLAS acceleration - Foundation ready for transformer development - DeepSeek V3 model implementation next priority - Experimental/draft status maintained This represents significant progress in the experimental foundation - matrix operations now deliver good performance while maintaining the zero-deployment-complexity advantage of Zig.
147 lines
4.7 KiB
Zig
147 lines
4.7 KiB
Zig
const std = @import("std");
|
|
|
|
pub fn build(b: *std.Build) void {
|
|
const target = b.standardTargetOptions(.{});
|
|
const optimize = b.standardOptimizeOption(.{});
|
|
|
|
// Main executable
|
|
const exe = b.addExecutable(.{
|
|
.name = "deepseek-v3-zig",
|
|
.root_source_file = b.path("src/main.zig"),
|
|
.target = target,
|
|
.optimize = optimize,
|
|
});
|
|
|
|
// BLAS library configuration based on target platform
|
|
configureBlas(exe, target);
|
|
|
|
// Add module dependencies
|
|
const deepseek_core = b.addModule("deepseek_core", .{
|
|
.root_source_file = b.path("src/core/root.zig"),
|
|
});
|
|
exe.root_module.addImport("deepseek_core", deepseek_core);
|
|
|
|
const web_layer = b.addModule("web_layer", .{
|
|
.root_source_file = b.path("src/web/root.zig"),
|
|
});
|
|
web_layer.addImport("deepseek_core", deepseek_core);
|
|
exe.root_module.addImport("web_layer", web_layer);
|
|
|
|
const cpu_backend = b.addModule("cpu_backend", .{
|
|
.root_source_file = b.path("src/backends/cpu/root.zig"),
|
|
});
|
|
cpu_backend.addImport("deepseek_core", deepseek_core);
|
|
exe.root_module.addImport("cpu_backend", cpu_backend);
|
|
|
|
const metal_backend = b.addModule("metal_backend", .{
|
|
.root_source_file = b.path("src/backends/metal/root.zig"),
|
|
});
|
|
metal_backend.addImport("deepseek_core", deepseek_core);
|
|
exe.root_module.addImport("metal_backend", metal_backend);
|
|
|
|
// Add Metal framework for macOS
|
|
if (target.result.os.tag == .macos) {
|
|
exe.linkFramework("Metal");
|
|
exe.linkFramework("Foundation");
|
|
}
|
|
|
|
b.installArtifact(exe);
|
|
|
|
const run_cmd = b.addRunArtifact(exe);
|
|
run_cmd.step.dependOn(b.getInstallStep());
|
|
|
|
if (b.args) |args| {
|
|
run_cmd.addArgs(args);
|
|
}
|
|
|
|
const run_step = b.step("run", "Run the app");
|
|
run_step.dependOn(&run_cmd.step);
|
|
|
|
const unit_tests = b.addTest(.{
|
|
.root_source_file = b.path("src/main.zig"),
|
|
.target = target,
|
|
.optimize = optimize,
|
|
});
|
|
|
|
const run_unit_tests = b.addRunArtifact(unit_tests);
|
|
|
|
const test_step = b.step("test", "Run unit tests");
|
|
test_step.dependOn(&run_unit_tests.step);
|
|
|
|
// Benchmarks
|
|
const benchmark_exe = b.addExecutable(.{
|
|
.name = "deepseek-v3-benchmark",
|
|
.root_source_file = b.path("bench/main.zig"),
|
|
.target = target,
|
|
.optimize = optimize,
|
|
});
|
|
|
|
// Add the same modules to benchmark
|
|
benchmark_exe.root_module.addImport("deepseek_core", deepseek_core);
|
|
|
|
const cpu_backend_bench = b.addModule("cpu_backend", .{
|
|
.root_source_file = b.path("src/backends/cpu/root.zig"),
|
|
});
|
|
cpu_backend_bench.addImport("deepseek_core", deepseek_core);
|
|
benchmark_exe.root_module.addImport("cpu_backend", cpu_backend_bench);
|
|
|
|
// Configure BLAS for benchmarks too
|
|
configureBlas(benchmark_exe, target);
|
|
|
|
// Add Metal framework for benchmarks on macOS
|
|
if (target.result.os.tag == .macos) {
|
|
benchmark_exe.linkFramework("Metal");
|
|
benchmark_exe.linkFramework("Foundation");
|
|
}
|
|
|
|
b.installArtifact(benchmark_exe);
|
|
|
|
const benchmark_run_cmd = b.addRunArtifact(benchmark_exe);
|
|
benchmark_run_cmd.step.dependOn(b.getInstallStep());
|
|
|
|
const benchmark_step = b.step("benchmark", "Run benchmarks");
|
|
benchmark_step.dependOn(&benchmark_run_cmd.step);
|
|
|
|
// BLAS benchmarks specifically
|
|
const blas_bench_exe = b.addExecutable(.{
|
|
.name = "blas-benchmark",
|
|
.root_source_file = b.path("bench/blas_bench.zig"),
|
|
.target = target,
|
|
.optimize = optimize,
|
|
});
|
|
|
|
blas_bench_exe.root_module.addImport("deepseek_core", deepseek_core);
|
|
configureBlas(blas_bench_exe, target);
|
|
|
|
const blas_bench_run = b.addRunArtifact(blas_bench_exe);
|
|
const blas_bench_step = b.step("bench-blas", "Run BLAS-specific benchmarks");
|
|
blas_bench_step.dependOn(&blas_bench_run.step);
|
|
}
|
|
|
|
/// Configure BLAS linking for the given compile step based on target platform
|
|
fn configureBlas(step: *std.Build.Step.Compile, target: std.Build.ResolvedTarget) void {
|
|
const target_os = target.result.os.tag;
|
|
|
|
switch (target_os) {
|
|
.macos => {
|
|
// Use Apple's Accelerate framework
|
|
step.linkFramework("Accelerate");
|
|
step.root_module.addCMacro("HAVE_ACCELERATE", "1");
|
|
},
|
|
.linux => {
|
|
// Use OpenBLAS on Linux
|
|
step.linkSystemLibrary("openblas");
|
|
step.root_module.addCMacro("HAVE_OPENBLAS", "1");
|
|
},
|
|
.windows => {
|
|
// Use OpenBLAS on Windows (if available)
|
|
step.linkSystemLibrary("openblas");
|
|
step.root_module.addCMacro("HAVE_OPENBLAS", "1");
|
|
},
|
|
else => {
|
|
// Fallback to naive implementation
|
|
step.root_module.addCMacro("HAVE_NAIVE_BLAS", "1");
|
|
},
|
|
}
|
|
}
|