from typing import Tuple
from hidet import nn
from hidet.apps.diffusion.modeling.stable_diffusion.downsample import Downsample2D
from hidet.apps.diffusion.modeling.stable_diffusion.resnet_blocks import ResnetBlock2D
from hidet.apps.diffusion.modeling.stable_diffusion.transformer_blocks import Transformer2DModel
from hidet.apps.diffusion.modeling.stable_diffusion.upsample import Upsample2D
from hidet.graph.tensor import Tensor
from hidet.graph.ops import concat


class CrossAttnDownBlock2D(nn.Module[Tensor]):
    """Down block that interleaves resnet blocks with cross-attention transformer blocks."""

    def __init__(self, **kwargs):
        super().__init__()
        self.has_cross_attention = True
        self.resnets = []
        self.attentions = []

        transformer_layers_per_block = kwargs["transformer_layers_per_block"]
        num_layers = kwargs["num_layers"]

        # Broadcast a scalar transformer depth to one entry per layer.
        if isinstance(transformer_layers_per_block, int):
            transformer_layers_per_block = [transformer_layers_per_block] * num_layers

        for i in range(num_layers):
            # Only the first resnet sees the incoming channel count; later ones operate on output_channels.
            input_channels = kwargs["input_channels"] if i == 0 else kwargs["output_channels"]
            self.resnets.append(ResnetBlock2D(**{**kwargs, "input_channels": input_channels}))
            self.attentions.append(
                Transformer2DModel(
                    **{
                        **kwargs,
                        "attention_head_dim": kwargs["output_channels"] // kwargs["num_attention_heads"],
                        "input_channels": kwargs["output_channels"],
                        "num_layers": transformer_layers_per_block[i],
                    }
                )
            )

        self.resnets = nn.ModuleList(self.resnets)
        self.attentions = nn.ModuleList(self.attentions)

        if kwargs["add_downsample"]:
            self.downsamplers = nn.ModuleList([Downsample2D(kwargs["output_channels"], **kwargs)])
        else:
            self.downsamplers = None

    def forward(
        self, hidden_states: Tensor, temb: Tensor, encoder_hidden_states: Tensor
    ) -> Tuple[Tensor, Tuple[Tensor, ...]]:
        # Record one residual per (resnet, attn) pair; the matching up block
        # consumes these skip connections in reverse order.
        output_states = ()

        for resnet, attn in zip(self.resnets, self.attentions):
            hidden_states = resnet(hidden_states, temb)
            hidden_states = attn(hidden_states, encoder_hidden_states)

            output_states += (hidden_states,)

        if self.downsamplers is not None:
            for downsampler in self.downsamplers:
                hidden_states = downsampler(hidden_states)

            output_states += (hidden_states,)

        return hidden_states, output_states
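

# A minimal usage sketch (hypothetical values): the real channel counts and
# layer counts come from the UNet config, and ResnetBlock2D / Transformer2DModel
# consume further keys forwarded through **kwargs that are elided here.
#
#   block = CrossAttnDownBlock2D(
#       input_channels=320,
#       output_channels=640,
#       num_layers=2,
#       transformer_layers_per_block=1,
#       num_attention_heads=10,
#       add_downsample=True,
#       # ... remaining resnet/transformer kwargs from the config
#   )
#   hidden_states, residuals = block(hidden_states, temb, encoder_hidden_states)
#   # `residuals` holds one tensor per layer (plus the downsampled output),
#   # later popped in reverse order by the matching up block.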


class DownBlock2D(nn.Module[Tensor]):
    """Down block made of resnet blocks only, with an optional downsampler."""

    def __init__(self, **kwargs):
        super().__init__()
        self.has_cross_attention = False
        self.resnets = []

        for i in range(kwargs["num_layers"]):
            input_channels = kwargs["input_channels"] if i == 0 else kwargs["output_channels"]
            self.resnets.append(ResnetBlock2D(**{**kwargs, "input_channels": input_channels}))

        self.resnets = nn.ModuleList(self.resnets)
        if kwargs["add_downsample"]:
            self.downsamplers = nn.ModuleList([Downsample2D(kwargs["output_channels"], **kwargs)])
        else:
            self.downsamplers = None

    def forward(self, hidden_states: Tensor, temb: Tensor) -> Tuple[Tensor, Tuple[Tensor, ...]]:
        output_states = ()

        for resnet in self.resnets:
            hidden_states = resnet(hidden_states, temb)
            output_states += (hidden_states,)

        if self.downsamplers is not None:
            for downsampler in self.downsamplers:
                hidden_states = downsampler(hidden_states)

            output_states += (hidden_states,)

        return hidden_states, output_states
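

# Sketch of the skip-connection contract (illustrative, not a test): a
# DownBlock2D built with num_layers=2 and add_downsample=True returns three
# residuals, e.g.
#
#   hidden_states, residuals = block(hidden_states, temb)
#   assert len(residuals) == 3  # two resnet outputs + one downsampled output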


class MidBlock2DCrossAttn(nn.Module[Tensor]):
    """Middle block: a leading resnet, then alternating cross-attention and resnet blocks."""

    def __init__(self, **kwargs):
        super().__init__()

        self.has_cross_attention = True

        transformer_layers_per_block = kwargs["transformer_layers_per_block"]
        if isinstance(transformer_layers_per_block, int):
            transformer_layers_per_block = [transformer_layers_per_block] * kwargs["num_layers"]

        # One more resnet than attention blocks: resnet, then (attn, resnet) per layer.
        self.resnets = [ResnetBlock2D(**{**kwargs, "input_channels": kwargs["input_channels"]})]
        self.attentions = []

        for i in range(kwargs["num_layers"]):
            self.attentions.append(
                Transformer2DModel(
                    **{
                        **kwargs,
                        "attention_head_dim": kwargs["input_channels"] // kwargs["num_attention_heads"],
                        "input_channels": kwargs["input_channels"],
                        "num_layers": transformer_layers_per_block[i],
                    }
                )
            )

            self.resnets.append(ResnetBlock2D(**{**kwargs, "input_channels": kwargs["input_channels"]}))

        self.resnets = nn.ModuleList(self.resnets)
        self.attentions = nn.ModuleList(self.attentions)

    def forward(self, hidden_states: Tensor, temb: Tensor, encoder_hidden_states: Tensor) -> Tensor:
        hidden_states = self.resnets[0](hidden_states, temb)

        for attn, resnet in zip(self.attentions, self.resnets[1:]):
            hidden_states = attn(hidden_states, encoder_hidden_states)
            hidden_states = resnet(hidden_states, temb)

        return hidden_states
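

# Illustrative call sequence (hypothetical config values): with num_layers=1
# the mid block computes resnet -> attn -> resnet at constant channel width,
# e.g.
#
#   mid = MidBlock2DCrossAttn(
#       input_channels=1280,
#       num_layers=1,
#       transformer_layers_per_block=1,
#       num_attention_heads=20,
#       # ... plus the resnet/transformer kwargs from the config
#   )
#   hidden_states = mid(hidden_states, temb, encoder_hidden_states)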


class CrossAttnUpBlock2D(nn.Module[Tensor]):
    """Up block that concatenates skip residuals, then applies resnet and cross-attention pairs."""

    def __init__(self, **kwargs):
        super().__init__()
        self.has_cross_attention = True
        num_layers = kwargs["num_layers"]

        transformer_layers_per_block = kwargs["transformer_layers_per_block"]
        if isinstance(transformer_layers_per_block, int):
            transformer_layers_per_block = [transformer_layers_per_block] * num_layers

        self.resnets = []
        self.attentions = []
        for i in range(num_layers):
            # Each resnet consumes the running activations concatenated with one
            # skip residual, so its input width is the sum of the two.
            res_skip_channels = kwargs["input_channels"] if (i == num_layers - 1) else kwargs["output_channels"]
            resnet_in_channels = kwargs["prev_output_channel"] if i == 0 else kwargs["output_channels"]
            input_channels = resnet_in_channels + res_skip_channels

            self.resnets.append(ResnetBlock2D(**{**kwargs, "input_channels": input_channels}))

            self.attentions.append(
                Transformer2DModel(
                    **{
                        **kwargs,
                        "attention_head_dim": kwargs["output_channels"] // kwargs["num_attention_heads"],
                        "input_channels": kwargs["output_channels"],
                        "num_layers": transformer_layers_per_block[i],
                    }
                )
            )

        self.resnets = nn.ModuleList(self.resnets)
        self.attentions = nn.ModuleList(self.attentions)

        if kwargs["add_upsample"]:
            self.upsamplers = nn.ModuleList([Upsample2D(kwargs["output_channels"], **kwargs)])
        else:
            self.upsamplers = None

    def forward(
        self,
        hidden_states: Tensor,
        res_hidden_states_tuple: Tuple[Tensor, ...],
        temb: Tensor,
        upsample_size: int,
        encoder_hidden_states: Tensor,
        is_final_block=False,
    ) -> Tensor:
        for i, (resnet, attn) in enumerate(zip(self.resnets, self.attentions)):
            # Pop skip residuals from the end: the down path recorded them in
            # forward order, so the up path consumes them in reverse.
            res_hidden_states = res_hidden_states_tuple[-1]
            res_hidden_states_tuple = res_hidden_states_tuple[:-1]

            hidden_states = concat([hidden_states, res_hidden_states], axis=1)

            hidden_states = resnet(hidden_states, temb)
            hidden_states = attn(
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                temperature_scaling=2 if is_final_block and i == 1 else 1,
            )

        if self.upsamplers is not None:
            for upsampler in self.upsamplers:
                hidden_states = upsampler(hidden_states, upsample_size)

        return hidden_states
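

# Channel bookkeeping sketch (hypothetical widths): with prev_output_channel=1280,
# output_channels=640, input_channels=320, and num_layers=3, the resnet input
# widths above come out to 1280+640, 640+640, and 640+320, matching the widths
# of the residuals popped off the down-path tuple in reverse.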


class UpBlock2D(nn.Module[Tensor]):
    """Up block made of resnet blocks over concatenated skip residuals, with an optional upsampler."""

    def __init__(self, **kwargs):
        super().__init__()
        self.has_cross_attention = False
        self.resnets = []

        for i in range(kwargs["num_layers"]):
            res_skip_channels = (
                kwargs["input_channels"] if (i == kwargs["num_layers"] - 1) else kwargs["output_channels"]
            )
            resnet_input_channels = kwargs["prev_output_channel"] if i == 0 else kwargs["output_channels"]
            input_channels = res_skip_channels + resnet_input_channels

            self.resnets.append(ResnetBlock2D(**{**kwargs, "input_channels": input_channels}))

        self.resnets = nn.ModuleList(self.resnets)
        if kwargs["add_upsample"]:
            self.upsamplers = nn.ModuleList([Upsample2D(kwargs["output_channels"], **kwargs)])
        else:
            self.upsamplers = None

    def forward(
        self, hidden_states: Tensor, res_hidden_states_tuple: Tuple[Tensor, ...], temb: Tensor, upsample_size: int
    ) -> Tensor:
        for resnet in self.resnets:
            # Consume skip residuals in reverse order, mirroring the down path.
            res_hidden_states = res_hidden_states_tuple[-1]
            res_hidden_states_tuple = res_hidden_states_tuple[:-1]

            hidden_states = concat([hidden_states, res_hidden_states], axis=1)
            hidden_states = resnet(hidden_states, temb)

        if self.upsamplers is not None:
            for upsampler in self.upsamplers:
                hidden_states = upsampler(hidden_states, upsample_size)

        return hidden_states
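

# End-to-end sketch of the skip plumbing (hypothetical, for intuition only):
#
#   _, residuals = down_block(x, temb)                        # one tensor per layer
#   x = up_block(x, residuals[-len(up_block.resnets):], temb, upsample_size)
#
# The UNet driver slices the residual tuple per up block and the block pops it
# from the end; the exact wiring lives in the surrounding UNet model, not here.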