Code to reproduce
from diffusers import DiffusionPipeline
from torchao.prototype.mx_formats.inference_workflow import (
NVFP4DynamicActivationNVFP4WeightConfig,
)
from torchao.quantization import quantize_
import torch
# Reproduction script: NVFP4-quantize the FLUX.1-dev transformer, enable
# leaf-level group offloading on it, then run two short generations.

# Load the pipeline in bfloat16.
pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
)

# Apply the NVFP4 dynamic-activation / NVFP4-weight config to the transformer.
nvfp4_config = NVFP4DynamicActivationNVFP4WeightConfig(
    use_dynamic_per_tensor_scale=True,
    use_triton_kernel=True,
)
quantize_(pipe.transformer, config=nvfp4_config, filter_fn=None)

# Group-offload the quantized transformer between CPU and CUDA, using a
# stream and non-blocking transfers.
offload_options = {
    "onload_device": "cuda",
    "offload_device": "cpu",
    "offload_type": "leaf_level",
    "use_stream": True,
    "non_blocking": True,
}
pipe.transformer.enable_group_offload(**offload_options)

# Move every nn.Module component of the pipeline to CUDA; non-module
# components are skipped.
for module in pipe.components.values():
    if isinstance(module, torch.nn.Module):
        module.to("cuda")

# Run the pipeline twice with a minimal step count; the outputs are discarded.
for _ in range(2):
    pipe("a dog", num_inference_steps=2)
Error:
NotImplementedError: NVFP4Tensor dispatch: attempting to run unimplemented operator/function: func=<OpOverload(op='aten.is_pinned', overload='default')>, types=(<class 'torchao.prototype.mx_formats.nvfp4_tensor.NVFP4Tensor'>,), arg_types=(<class 'torchao.prototype.mx_formats.nvfp4_tensor.NVFP4Tensor'>,), kwarg_types={}
Full trace:
https://pastebin.com/Dkh4shw1
Supporting group offloading for NVFP4-quantized models would let bigger models combine quantization and offloading, so that they can run on consumer hardware.
Cc: @asomoza
Cc: @vkuzo
Code to reproduce
Error:
Full trace:
https://pastebin.com/Dkh4shw1
Supporting group offloading for NVFP4-quantized models would let bigger models combine quantization and offloading, so that they can run on consumer hardware.
Cc: @asomoza
Cc: @vkuzo