Adding device-specific configs & more input image type options + a small change to building the model spec from args
- configs/config-dev-1-RTX6000ADA.json +57 -0
- configs/config-dev-offload-1-4080.json +58 -0
- configs/config-dev-offload-1-4090.json +58 -0
- flux_pipeline.py +123 -17
- image_encoder.py +7 -16
- util.py +2 -2
configs/config-dev-1-RTX6000ADA.json
ADDED
@@ -0,0 +1,57 @@
+{
+  "version": "flux-dev",
+  "params": {
+    "in_channels": 64,
+    "vec_in_dim": 768,
+    "context_in_dim": 4096,
+    "hidden_size": 3072,
+    "mlp_ratio": 4.0,
+    "num_heads": 24,
+    "depth": 19,
+    "depth_single_blocks": 38,
+    "axes_dim": [
+      16,
+      56,
+      56
+    ],
+    "theta": 10000,
+    "qkv_bias": true,
+    "guidance_embed": true
+  },
+  "ae_params": {
+    "resolution": 256,
+    "in_channels": 3,
+    "ch": 128,
+    "out_ch": 3,
+    "ch_mult": [
+      1,
+      2,
+      4,
+      4
+    ],
+    "num_res_blocks": 2,
+    "z_channels": 16,
+    "scale_factor": 0.3611,
+    "shift_factor": 0.1159
+  },
+  "ckpt_path": "/big/generator-ui/flux-testing/flux/model-dir/flux1-dev.sft",
+  "ae_path": "/big/generator-ui/flux-testing/flux/model-dir/ae.sft",
+  "repo_id": "black-forest-labs/FLUX.1-dev",
+  "repo_flow": "flux1-dev.sft",
+  "repo_ae": "ae.sft",
+  "text_enc_max_length": 512,
+  "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
+  "text_enc_device": "cuda:0",
+  "ae_device": "cuda:0",
+  "flux_device": "cuda:0",
+  "flow_dtype": "float16",
+  "ae_dtype": "bfloat16",
+  "text_enc_dtype": "bfloat16",
+  "flow_quantization_dtype": "qfloat8",
+  "text_enc_quantization_dtype": "qfloat8",
+  "compile_extras": true,
+  "compile_blocks": true,
+  "offload_text_encoder": false,
+  "offload_vae": false,
+  "offload_flow": false
+}
configs/config-dev-offload-1-4080.json
ADDED
@@ -0,0 +1,58 @@
+{
+  "version": "flux-dev",
+  "params": {
+    "in_channels": 64,
+    "vec_in_dim": 768,
+    "context_in_dim": 4096,
+    "hidden_size": 3072,
+    "mlp_ratio": 4.0,
+    "num_heads": 24,
+    "depth": 19,
+    "depth_single_blocks": 38,
+    "axes_dim": [
+      16,
+      56,
+      56
+    ],
+    "theta": 10000,
+    "qkv_bias": true,
+    "guidance_embed": true
+  },
+  "ae_params": {
+    "resolution": 256,
+    "in_channels": 3,
+    "ch": 128,
+    "out_ch": 3,
+    "ch_mult": [
+      1,
+      2,
+      4,
+      4
+    ],
+    "num_res_blocks": 2,
+    "z_channels": 16,
+    "scale_factor": 0.3611,
+    "shift_factor": 0.1159
+  },
+  "ckpt_path": "/big/generator-ui/flux-testing/flux/model-dir/flux1-dev.sft",
+  "ae_path": "/big/generator-ui/flux-testing/flux/model-dir/ae.sft",
+  "repo_id": "black-forest-labs/FLUX.1-dev",
+  "repo_flow": "flux1-dev.sft",
+  "repo_ae": "ae.sft",
+  "text_enc_max_length": 512,
+  "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
+  "text_enc_device": "cuda:0",
+  "ae_device": "cuda:0",
+  "flux_device": "cuda:0",
+  "flow_dtype": "float16",
+  "ae_dtype": "bfloat16",
+  "text_enc_dtype": "bfloat16",
+  "flow_quantization_dtype": "qfloat8",
+  "text_enc_quantization_dtype": "qint4",
+  "ae_quantization_dtype": "qfloat8",
+  "compile_extras": true,
+  "compile_blocks": true,
+  "offload_text_encoder": true,
+  "offload_vae": true,
+  "offload_flow": true
+}
configs/config-dev-offload-1-4090.json
ADDED
@@ -0,0 +1,58 @@
+{
+  "version": "flux-dev",
+  "params": {
+    "in_channels": 64,
+    "vec_in_dim": 768,
+    "context_in_dim": 4096,
+    "hidden_size": 3072,
+    "mlp_ratio": 4.0,
+    "num_heads": 24,
+    "depth": 19,
+    "depth_single_blocks": 38,
+    "axes_dim": [
+      16,
+      56,
+      56
+    ],
+    "theta": 10000,
+    "qkv_bias": true,
+    "guidance_embed": true
+  },
+  "ae_params": {
+    "resolution": 256,
+    "in_channels": 3,
+    "ch": 128,
+    "out_ch": 3,
+    "ch_mult": [
+      1,
+      2,
+      4,
+      4
+    ],
+    "num_res_blocks": 2,
+    "z_channels": 16,
+    "scale_factor": 0.3611,
+    "shift_factor": 0.1159
+  },
+  "ckpt_path": "/big/generator-ui/flux-testing/flux/model-dir/flux1-dev.sft",
+  "ae_path": "/big/generator-ui/flux-testing/flux/model-dir/ae.sft",
+  "repo_id": "black-forest-labs/FLUX.1-dev",
+  "repo_flow": "flux1-dev.sft",
+  "repo_ae": "ae.sft",
+  "text_enc_max_length": 512,
+  "text_enc_path": "city96/t5-v1_1-xxl-encoder-bf16",
+  "text_enc_device": "cuda:0",
+  "ae_device": "cuda:0",
+  "flux_device": "cuda:0",
+  "flow_dtype": "float16",
+  "ae_dtype": "bfloat16",
+  "text_enc_dtype": "bfloat16",
+  "flow_quantization_dtype": "qfloat8",
+  "text_enc_quantization_dtype": "qint4",
+  "ae_quantization_dtype": "qfloat8",
+  "compile_extras": true,
+  "compile_blocks": true,
+  "offload_text_encoder": true,
+  "offload_vae": true,
+  "offload_flow": false
+}
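The three new configs share identical model and autoencoder parameters; they differ only in quantization and offload settings (the RTX 6000 Ada keeps everything resident on one GPU, the 4080 offloads the text encoder, VAE and flow model, and the 4090 offloads only the text encoder and VAE). A minimal sketch for comparing them, assuming the config files above are present locally (plain json is used here rather than any project helper):

import json

paths = [
    "configs/config-dev-1-RTX6000ADA.json",
    "configs/config-dev-offload-1-4080.json",
    "configs/config-dev-offload-1-4090.json",
]
for path in paths:
    with open(path) as f:
        cfg = json.load(f)
    # print only the fields that actually vary between the device-specific configs
    print(
        path,
        cfg.get("text_enc_quantization_dtype"),
        cfg.get("ae_quantization_dtype"),
        cfg["offload_text_encoder"],
        cfg["offload_vae"],
        cfg["offload_flow"],
    )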
flux_pipeline.py
CHANGED
@@ -40,6 +40,12 @@ if TYPE_CHECKING:


class FluxPipeline:
+    """
+    FluxPipeline is a class that provides a pipeline for generating images using the Flux model.
+    It handles input preparation, timestep generation, noise generation, device management
+    and model compilation.
+    """
+
    def __init__(
        self,
        name: str,
@@ -56,7 +62,12 @@ class FluxPipeline:
        t5_device: torch.device | str = "cuda:1",
        config: ModelSpec = None,
    ):
+        """
+        Initialize the FluxPipeline class.
+
+        This class is responsible for preparing input tensors for the Flux model, generating
+        timesteps and noise, and handling device management for model offloading.
+        """
        self.name = name
        self.device_flux = (
            flux_device
@@ -104,10 +115,10 @@ class FluxPipeline:
        if not self.config.prequantized_flow:
            print("Warmups for compile...")
            warmup_dict = dict(
-                prompt="
-                height=
-                width=
-                num_steps=
+                prompt="A beautiful test image used to solidify the fp8 nn.Linear input scales prior to compilation 😉",
+                height=768,
+                width=768,
+                num_steps=25,
                guidance=3.5,
                seed=10,
            )
@@ -138,6 +149,32 @@ class FluxPipeline:
        target_device: torch.device = torch.device("cuda:0"),
        target_dtype: torch.dtype = torch.float16,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Prepare input tensors for the Flux model.
+
+        This function processes the input image and text prompt, converting them into
+        the appropriate format and embedding representations required by the model.
+
+        Args:
+            img (torch.Tensor): Input image tensor of shape (batch_size, channels, height, width).
+            prompt (str | list[str]): Text prompt or list of prompts guiding the image generation.
+            target_device (torch.device, optional): The target device for the output tensors.
+                Defaults to torch.device("cuda:0").
+            target_dtype (torch.dtype, optional): The target data type for the output tensors.
+                Defaults to torch.float16.
+
+        Returns:
+            tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: A tuple containing:
+                - img: Processed image tensor.
+                - img_ids: Image position IDs.
+                - vec: Clip text embedding vector.
+                - txt: T5 text embedding hidden states.
+                - txt_ids: Text position IDs.
+
+        Note:
+            This function handles the necessary device management for text encoder offloading
+            if enabled in the configuration.
+        """
        bs, c, h, w = img.shape
        if bs == 1 and not isinstance(prompt, str):
            bs = len(prompt)
@@ -165,8 +202,8 @@ class FluxPipeline:

        img_ids = img_ids[None].repeat(bs, 1, 1, 1).flatten(1, 2)
        if self.offload_text_encoder:
-            self.clip.
-            self.t5.
+            self.clip.to(device=self.device_clip)
+            self.t5.to(device=self.device_t5)
        vec, txt, txt_ids = get_weighted_text_embeddings_flux(
            self,
            prompt,
@@ -201,6 +238,7 @@ class FluxPipeline:
        max_shift: float = 1.15,
        shift: bool = True,
    ) -> list[float]:
+        """Generates a schedule of timesteps for the given number of steps and image sequence length."""
        # extra step for zero
        timesteps = torch.linspace(1, 0, num_steps + 1)

@@ -221,7 +259,8 @@ class FluxPipeline:
        generator: torch.Generator,
        dtype=None,
        device=None,
-    ):
+    ) -> torch.Tensor:
+        """Generates a latent noise tensor of the given shape and dtype on the given device."""
        if device is None:
            device = self.device_flux
        if dtype is None:
@@ -240,6 +279,7 @@ class FluxPipeline:

    @torch.inference_mode()
    def into_bytes(self, x: torch.Tensor) -> io.BytesIO:
+        """Converts the image tensor to bytes."""
        # bring into PIL format and save
        torch.cuda.synchronize()
        x = x.contiguous()
@@ -257,10 +297,34 @@ class FluxPipeline:
        torch.cuda.synchronize()
        im = self.img_encoder.encode_torch(im, quality=99)
        images.clear()
-        return
+        return im
+
+    @torch.inference_mode()
+    def load_init_image_if_needed(
+        self, init_image: torch.Tensor | str | Image.Image | np.ndarray
+    ) -> torch.Tensor:
+        """
+        Loads the initial image if it is a string, numpy array, or PIL.Image,
+        if torch.Tensor, expects it to be in the correct format and returns it as is.
+        """
+        if isinstance(init_image, str):
+            try:
+                init_image = Image.open(init_image)
+            except Exception as e:
+                init_image = Image.open(
+                    io.BytesIO(standard_b64decode(init_image.split(",")[-1]))
+                )
+            init_image = torch.from_numpy(np.array(init_image)).type(torch.uint8)
+        elif isinstance(init_image, np.ndarray):
+            init_image = torch.from_numpy(init_image).type(torch.uint8)
+        elif isinstance(init_image, Image.Image):
+            init_image = torch.from_numpy(np.array(init_image)).type(torch.uint8)
+
+        return init_image

    @torch.inference_mode()
    def vae_decode(self, x: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """Decodes the latent tensor to the pixel space."""
        if self.offload_vae:
            self.ae.to(self.device_ae)
            x = x.to(self.device_ae)
@@ -290,6 +354,7 @@ class FluxPipeline:
    def resize_center_crop(
        self, img: torch.Tensor, height: int, width: int
    ) -> torch.Tensor:
+        """Resizes and crops the image to the given height and width."""
        img = TF.resize(img, min(width, height))
        img = TF.center_crop(img, (height, width))
        return img
@@ -305,6 +370,11 @@ class FluxPipeline:
        generator: torch.Generator = None,
        num_images: int = 1,
    ) -> tuple[torch.Tensor, List[float]]:
+        """
+        Preprocesses the latent tensor for the given number of steps and image sequence length.
+        Also, if an initial image is provided, it is vae encoded and injected with the appropriate noise
+        given the strength and number of steps replacing the latent tensor.
+        """
        # prepare input

        if init_image is not None:
@@ -364,20 +434,55 @@ class FluxPipeline:
        num_steps: int = 24,
        guidance: float = 3.5,
        seed: int | None = None,
-        init_image: torch.Tensor | str | None = None,
+        init_image: torch.Tensor | str | Image.Image | np.ndarray | None = None,
        strength: float = 1.0,
        silent: bool = False,
        num_images: int = 1,
        return_seed: bool = False,
    ) -> io.BytesIO:
+        """
+        Generate images based on the given prompt and parameters.
+
+        Args:
+            prompt `(str)`: The text prompt to guide the image generation.
+
+            width `(int, optional)`: Width of the generated image. Defaults to 720.
+
+            height `(int, optional)`: Height of the generated image. Defaults to 1024.
+
+            num_steps `(int, optional)`: Number of denoising steps. Defaults to 24.
+
+            guidance `(float, optional)`: Guidance scale for text-to-image generation. Defaults to 3.5.
+
+            seed `(int | None, optional)`: Random seed for reproducibility. If None, a random seed is used. Defaults to None.
+
+            init_image `(torch.Tensor | str | Image.Image | np.ndarray | None, optional)`: Initial image for image-to-image generation. Defaults to None.
+
+            -- note: if the image's height/width do not match the height/width of the generated image, the image is resized and center cropped to match the height/width arguments.
+
+            -- If a string is provided, it is assumed to be either a path to an image file or a base64 encoded image.
+
+            -- If a numpy array is provided, it is assumed to be an RGB numpy array of shape (height, width, 3) and dtype uint8.
+
+            -- If a PIL.Image is provided, it is assumed to be an RGB PIL.Image.
+
+            -- If a torch.Tensor is provided, it is assumed to be a torch.Tensor of shape (height, width, 3) and dtype uint8 with range [0, 255].
+
+            strength `(float, optional)`: Strength of the init_image in image-to-image generation. Defaults to 1.0.
+
+            silent `(bool, optional)`: If True, suppresses progress bar. Defaults to False.
+
+            num_images `(int, optional)`: Number of images to generate. Defaults to 1.
+
+            return_seed `(bool, optional)`: If True, returns the seed along with the generated image. Defaults to False.
+
+        Returns:
+            io.BytesIO: Generated image(s) in bytes format.
+            int: Seed used for generation (only if return_seed is True).
+        """
        num_steps = 4 if self.name == "flux-schnell" else num_steps

-
-        try:
-            init_image = Image.open(init_image)
-        except Exception as e:
-            init_image = Image.open(io.BytesIO(standard_b64decode(init_image)))
-        init_image = torch.from_numpy(np.array(init_image)).type(torch.uint8)
+        init_image = self.load_init_image_if_needed(init_image)

        # allow for packing and conversion to latent space
        height = 16 * (height // 16)
@@ -465,8 +570,9 @@ class FluxPipeline:
        from float8_quantize import quantize_flow_transformer_and_dispatch_float8

        with torch.inference_mode():
-
-
+            logger.info(
+                f"Loading as prequantized flow transformer? {config.prequantized_flow}"
+            )

            models = load_models_from_config(config)
            config = models.config
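With load_init_image_if_needed in place, generate accepts an init image as a file path, a base64-encoded string, a numpy array, a PIL.Image, or an HxWx3 uint8 torch.Tensor. A small illustrative sketch of the new call patterns; "pipe" is assumed to be an already-constructed FluxPipeline and the file path and prompt are placeholders, not part of this commit:

import numpy as np
import torch
from PIL import Image

def img2img_examples(pipe):
    # "pipe" is assumed to be a constructed FluxPipeline; "inputs/cat.jpg" is a stand-in path.
    rgb = np.zeros((1024, 720, 3), dtype=np.uint8)  # HxWx3 uint8 RGB array
    results = [
        pipe.generate(prompt="a cat", init_image="inputs/cat.jpg", strength=0.6),       # file path
        pipe.generate(prompt="a cat", init_image=rgb, strength=0.6),                    # numpy array
        pipe.generate(prompt="a cat", init_image=Image.fromarray(rgb), strength=0.6),   # PIL image
        pipe.generate(prompt="a cat", init_image=torch.from_numpy(rgb), strength=0.6),  # uint8 tensor
    ]
    return results  # each entry is an io.BytesIO holding a JPEG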
image_encoder.py
CHANGED
@@ -7,38 +7,29 @@ import torch
class ImageEncoder:

    @torch.inference_mode()
-    def encode_torch(self, img: torch.Tensor, quality=
+    def encode_torch(self, img: torch.Tensor, quality=95):
        if img.ndim == 2:
            img = (
                img[None]
-                .contiguous()
                .repeat_interleave(3, dim=0)
+                .permute(1, 2, 0)
                .contiguous()
                .clamp(0, 255)
                .type(torch.uint8)
            )
-            print(img.shape)
        elif img.ndim == 3:
            if img.shape[0] == 3:
-                img = img.contiguous().clamp(0, 255).type(torch.uint8)
-
+                img = img.permute(1, 2, 0).contiguous().clamp(0, 255).type(torch.uint8)
            elif img.shape[2] == 3:
-                img = img.
+                img = img.contiguous().clamp(0, 255).type(torch.uint8)
            else:
                raise ValueError(f"Unsupported image shape: {img.shape}")
        else:
            raise ValueError(f"Unsupported image num dims: {img.ndim}")

-        img = (
-            img.permute(1, 2, 0)
-            .contiguous()
-            .to(torch.uint8)
-            .cpu()
-            .numpy()
-            .astype(np.uint8)
-        )
+        img = img.cpu().numpy().astype(np.uint8)
        im = Image.fromarray(img)
        iob = io.BytesIO()
-        im.save(iob, format="JPEG", quality=
+        im.save(iob, format="JPEG", quality=quality)
        iob.seek(0)
-        return iob
+        return iob
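After this change, encode_torch normalizes channels-first, channels-last, and 2D grayscale tensors to an HxWx3 uint8 layout before JPEG encoding, and the quality argument is now actually forwarded to PIL. A hedged sketch of the accepted layouts, assuming ImageEncoder takes no constructor arguments:

import torch
from image_encoder import ImageEncoder

enc = ImageEncoder()  # assumed to need no constructor arguments
chw = torch.rand(3, 256, 256) * 255                 # channels-first float tensor
hwc = torch.randint(0, 256, (256, 256, 3))          # channels-last integer tensor
gray = torch.randint(0, 256, (256, 256))            # 2D grayscale tensor
jpegs = [enc.encode_torch(t, quality=90) for t in (chw, hwc, gray)]  # io.BytesIO JPEGs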
util.py
CHANGED
@@ -141,7 +141,7 @@ def load_config(
            axes_dim=[16, 56, 56],
            theta=10_000,
            qkv_bias=True,
-            guidance_embed=
+            guidance_embed=name == ModelVersion.flux_dev,
        ),
        ae_path=ae_path,
        ae_params=AutoEncoderParams(
@@ -243,8 +243,8 @@ def load_autoencoder(config: ModelSpec) -> AutoEncoder:
    sd = load_sft(ckpt_path, device=str(config.ae_device))
    missing, unexpected = ae.load_state_dict(sd, strict=False, assign=True)
    print_load_warning(missing, unexpected)
+    ae.to(device=into_device(config.ae_device), dtype=into_dtype(config.ae_dtype))
    if config.ae_quantization_dtype is not None:
-        ae.to(into_device(config.ae_device))
        from float8_quantize import recursive_swap_linears

        recursive_swap_linears(ae)