|
35 | 35 | from diffusers.quantizers import PipelineQuantizationConfig |
36 | 36 |
|
37 | 37 | from ...testing_utils import ( |
| 38 | + Expectations, |
38 | 39 | backend_empty_cache, |
39 | 40 | backend_synchronize, |
40 | 41 | enable_full_determinism, |
@@ -497,8 +498,23 @@ def test_memory_footprint(self): |
497 | 498 |
|
498 | 499 | def test_model_memory_usage(self): |
499 | 500 | model_id = "hf-internal-testing/tiny-flux-pipe" |
500 | | - expected_memory_saving_ratio = 2.0 |
501 | | - |
| 501 | + expected_memory_saving_ratios = Expectations( |
| 502 | + { |
| 503 | + # XPU: For this tiny model, per-tensor overheads (alignment, fragmentation, metadata) become visible. |
| 504 | + # While XPU doesn't have the large fixed cuBLAS workspace of A100, these small overheads prevent reaching the ideal 2.0 ratio. |
| 505 | + # Observed ~1.27x (158k vs 124k) for model size. |
| 506 | + # The runtime memory overhead is ~88k for both bf16 and int8wo. Adding this to model size: (158k+88k)/(124k+88k) ≈ 1.16, hence the slightly conservative 1.15 here. |
| 507 | + ("xpu", None): 1.15, |
| 508 | + # On Ampere, the cuBLAS kernels used for matrix multiplication often allocate a fixed-size workspace. |
| 509 | + # Since the tiny-flux model weights are likely smaller than or comparable to this workspace, the total memory is dominated by the workspace. |
| 510 | + ("cuda", 8): 1.02, |
| 511 | + # On Hopper, TorchAO utilizes newer, highly optimized kernels (via Triton or CUTLASS 3.x) that are designed to be workspace-free or use negligible extra memory. |
| 512 | + # Additionally, Triton kernels often handle unaligned memory better, avoiding the padding overhead seen on other backends for tiny tensors. |
| 513 | + # This allows it to achieve the near-ideal 2.0x compression ratio. |
| 514 | + ("cuda", 9): 2.0, |
| 515 | + } |
| 516 | + ) |
| 517 | + expected_memory_saving_ratio = expected_memory_saving_ratios.get_expectation() |
502 | 518 | inputs = self.get_dummy_tensor_inputs(device=torch_device) |
503 | 519 |
|
504 | 520 | transformer_bf16 = self.get_dummy_components(None, model_id=model_id)["transformer"] |
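For context, here is a minimal sketch of how an `Expectations`-style lookup can resolve the right value for the current hardware. This is illustrative only (the actual `diffusers` testing utility may be implemented differently, and `resolve_expectation` is a hypothetical name): prefer an exact `(device_type, major_version)` match, then fall back to a versionless `(device_type, None)` key, then to a default.

    import torch

    def resolve_expectation(expectations, default=None):
        # Identify the current accelerator and, for CUDA, its compute-capability
        # major version (8 on Ampere, 9 on Hopper).
        if torch.cuda.is_available():
            device_type = "cuda"
            major = torch.cuda.get_device_capability()[0]
        elif hasattr(torch, "xpu") and torch.xpu.is_available():
            device_type, major = "xpu", None  # the XPU key in the diff omits a version
        else:
            device_type, major = "cpu", None
        # Exact match first, then the versionless fallback for this device type.
        if (device_type, major) in expectations:
            return expectations[(device_type, major)]
        if (device_type, None) in expectations:
            return expectations[(device_type, None)]
        return default

    ratios = {("xpu", None): 1.15, ("cuda", 8): 1.02, ("cuda", 9): 2.0}
    expected = resolve_expectation(ratios, default=2.0)

Keying on the compute-capability major version, rather than the GPU name, lets one entry cover a whole architecture generation (e.g. all Ampere parts share the cuBLAS-workspace behavior described above).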
|