From 65d8f5f1e99cba73e0f6a72bf0c871ac7873a023 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Thu, 26 Dec 2024 23:18:39 +0800 Subject: [PATCH] Add CUDA cache clearing in memory management Added torch.cuda.empty_cache() to free up unused memory on the GPU. --- inference/fp8_cast_bf16.py | 1 + 1 file changed, 1 insertion(+) diff --git a/inference/fp8_cast_bf16.py b/inference/fp8_cast_bf16.py index d6130ac..1b9735a 100644 --- a/inference/fp8_cast_bf16.py +++ b/inference/fp8_cast_bf16.py @@ -60,6 +60,7 @@ def main(fp8_path, bf16_path): if len(loaded_files) > 2: oldest_file = next(iter(loaded_files)) del loaded_files[oldest_file] + torch.cuda.empty_cache() # Update model index new_model_index_file = os.path.join(bf16_path, "model.safetensors.index.json")