Commit 8040e22 (parent: 3e3a156): update bloat16
Committed by Wenzheng Chang
Files changed:
- aether/pipelines/aetherv1_pipeline_cogvideox.py (+14, -8)
- app.py (+45, -16)
aether/pipelines/aetherv1_pipeline_cogvideox.py
CHANGED

```diff
@@ -294,6 +294,7 @@ class AetherV1PipelineCogVideoX(CogVideoXImageToVideoPipeline):
             num_videos_per_prompt=1,
             prompt_embeds=None,
         )
+        self.empty_prompt_embeds = self.empty_prompt_embeds.to(dtype=torch.bfloat16)
 
     def _prepare_rotary_positional_embeddings(
         self,
```
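The new cast keeps the cached empty prompt embeddings in the same dtype as the transformer weights, which the rest of this commit switches to bfloat16. A minimal sketch of the failure mode this avoids, using a hypothetical stand-in layer rather than the actual CogVideoX transformer:

```python
import torch

# Hypothetical stand-in for a module loaded with torch_dtype=torch.bfloat16.
layer = torch.nn.Linear(8, 8).to(dtype=torch.bfloat16)

x = torch.randn(1, 8)  # float32 by default
try:
    layer(x)
except RuntimeError as err:
    print(err)  # "mat1 and mat2 must have the same dtype ..."

# Casting once up front, as the commit does for empty_prompt_embeds,
# keeps every later forward pass dtype-consistent.
print(layer(x.to(dtype=torch.bfloat16)).dtype)  # torch.bfloat16
```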
```diff
@@ -472,19 +473,19 @@ class AetherV1PipelineCogVideoX(CogVideoXImageToVideoPipeline):
         if isinstance(image, PIL.Image.Image):
             image = self.video_processor.preprocess(
                 image, height, width, resize_mode="crop"
-            ).to(self._execution_device)
+            ).to(device=self._execution_device, dtype=torch.bfloat16)
         else:
             image = self._preprocess_image(image, height, width).to(
-                self._execution_device
+                device=self._execution_device, dtype=torch.bfloat16
             )
         if goal is not None:
             if isinstance(goal, PIL.Image.Image):
                 goal = self.video_processor.preprocess(
                     goal, height, width, resize_mode="crop"
-                ).to(self._execution_device)
+                ).to(device=self._execution_device, dtype=torch.bfloat16)
             else:
                 goal = self._preprocess_image(goal, height, width).to(
-                    self._execution_device
+                    device=self._execution_device, dtype=torch.bfloat16
                 )
         if video is not None:
             if isinstance(video, list) and all(
```
```diff
@@ -492,17 +493,21 @@ class AetherV1PipelineCogVideoX(CogVideoXImageToVideoPipeline):
             ):
                 video = self.video_processor.preprocess(
                     video, height, width, resize_mode="crop"
-                ).to(self._execution_device)
+                ).to(device=self._execution_device, dtype=torch.bfloat16)
             else:
                 video = self._preprocess_image(video, height, width).to(
-                    self._execution_device
+                    device=self._execution_device, dtype=torch.bfloat16
                 )
         # TODO: check raymap shape
         if raymap is not None:
             if isinstance(raymap, np.ndarray):
-                raymap = torch.from_numpy(raymap).to(self._execution_device)
+                raymap = torch.from_numpy(raymap).to(
+                    self._execution_device, dtype=torch.bfloat16
+                )
             if raymap.ndim == 4:
-                raymap = raymap.unsqueeze(0)
+                raymap = raymap.unsqueeze(0).to(
+                    self._execution_device, dtype=torch.bfloat16
+                )
 
         return image, goal, video, raymap
 
```
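Throughout these hunks, `Tensor.to` receives the device and the target dtype together, so the transfer and the cast happen in one call instead of two passes over the data. A small sketch of the raymap branch in isolation (the array shape here is made up for illustration):

```python
import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

raymap = np.zeros((6, 41, 60, 90), dtype=np.float32)  # hypothetical shape
t = torch.from_numpy(raymap).to(device, dtype=torch.bfloat16)
print(t.device, t.dtype)  # e.g. cuda:0 torch.bfloat16

# Mirror of the hunk above: a 4-D raymap gains a batch dimension.
if t.ndim == 4:
    t = t.unsqueeze(0)
print(tuple(t.shape))  # (1, 6, 41, 60, 90)
```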
```diff
@@ -938,6 +943,7 @@ class AetherV1PipelineCogVideoX(CogVideoXImageToVideoPipeline):
             rearrange(camera_latents, "b t (n c) h w -> b (n t) c h w", n=4)[
                 :, -rgb_video.shape[1] :, :, :
             ]
+            .float()
             .cpu()
             .numpy()
         )
```
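The added `.float()` is required, not stylistic: NumPy has no bfloat16 dtype, so calling `.numpy()` on a bfloat16 tensor raises a TypeError. A quick sketch:

```python
import torch

latents = torch.randn(2, 4, dtype=torch.bfloat16)
try:
    latents.cpu().numpy()  # NumPy cannot represent bfloat16
except TypeError as err:
    print(err)

out = latents.float().cpu().numpy()  # upcast to float32 first
print(out.dtype)  # float32
```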
app.py
CHANGED

```diff
@@ -17,7 +17,7 @@ from diffusers import (
     CogVideoXTransformer3DModel,
 )
 from transformers import AutoTokenizer, T5EncoderModel
-import
+import spaces
 
 
 rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
```
```diff
@@ -40,9 +40,6 @@ from aether.utils.postprocess_utils import (  # noqa: E402
 from aether.utils.visualize_utils import predictions_to_glb  # noqa: E402
 
 
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-
 def seed_all(seed: int = 0) -> None:
     """
     Set random seeds of all components.
```
```diff
@@ -53,7 +50,7 @@ def seed_all(seed: int = 0) -> None:
     torch.cuda.manual_seed_all(seed)
 
 
-# Global pipeline
+# # Global pipeline
 cogvideox_pretrained_model_name_or_path: str = "THUDM/CogVideoX-5b-I2V"
 aether_pretrained_model_name_or_path: str = "AetherWorldModel/AetherV1"
 pipeline = AetherV1PipelineCogVideoX(
```
```diff
@@ -65,22 +62,45 @@ pipeline = AetherV1PipelineCogVideoX(
         cogvideox_pretrained_model_name_or_path, subfolder="text_encoder"
     ),
     vae=AutoencoderKLCogVideoX.from_pretrained(
-        cogvideox_pretrained_model_name_or_path, subfolder="vae"
+        cogvideox_pretrained_model_name_or_path, subfolder="vae", torch_dtype=torch.bfloat16
     ),
     scheduler=CogVideoXDPMScheduler.from_pretrained(
         cogvideox_pretrained_model_name_or_path, subfolder="scheduler"
     ),
     transformer=CogVideoXTransformer3DModel.from_pretrained(
-        aether_pretrained_model_name_or_path, subfolder="transformer"
+        aether_pretrained_model_name_or_path, subfolder="transformer", torch_dtype=torch.bfloat16
     ),
 )
 pipeline.vae.enable_slicing()
 pipeline.vae.enable_tiling()
-pipeline.to(device)
+# pipeline.to(device)
 
 
-def build_pipeline() -> AetherV1PipelineCogVideoX:
+def build_pipeline(device: torch.device) -> AetherV1PipelineCogVideoX:
     """Initialize the model pipeline."""
+    # cogvideox_pretrained_model_name_or_path: str = "THUDM/CogVideoX-5b-I2V"
+    # aether_pretrained_model_name_or_path: str = "AetherWorldModel/AetherV1"
+    # pipeline = AetherV1PipelineCogVideoX(
+    #     tokenizer=AutoTokenizer.from_pretrained(
+    #         cogvideox_pretrained_model_name_or_path,
+    #         subfolder="tokenizer",
+    #     ),
+    #     text_encoder=T5EncoderModel.from_pretrained(
+    #         cogvideox_pretrained_model_name_or_path, subfolder="text_encoder"
+    #     ),
+    #     vae=AutoencoderKLCogVideoX.from_pretrained(
+    #         cogvideox_pretrained_model_name_or_path, subfolder="vae"
+    #     ),
+    #     scheduler=CogVideoXDPMScheduler.from_pretrained(
+    #         cogvideox_pretrained_model_name_or_path, subfolder="scheduler"
+    #     ),
+    #     transformer=CogVideoXTransformer3DModel.from_pretrained(
+    #         aether_pretrained_model_name_or_path, subfolder="transformer"
+    #     ),
+    # )
+    # pipeline.vae.enable_slicing()
+    # pipeline.vae.enable_tiling()
+    pipeline.to(device)
     return pipeline
 
 
```
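`torch_dtype=torch.bfloat16` tells `from_pretrained` to load the checkpoint weights directly in bfloat16 rather than materializing them in float32 and casting afterwards, which roughly halves peak memory at startup. The same pattern in isolation (model ID and subfolder as in the diff):

```python
import torch
from diffusers import AutoencoderKLCogVideoX

# Loads the VAE weights directly in bfloat16.
vae = AutoencoderKLCogVideoX.from_pretrained(
    "THUDM/CogVideoX-5b-I2V", subfolder="vae", torch_dtype=torch.bfloat16
)
print(next(vae.parameters()).dtype)  # torch.bfloat16
```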
```diff
@@ -424,7 +444,7 @@ def save_output_files(
     return paths
 
 
-@spaces.GPU(duration=
+@spaces.GPU(duration=300)
 def process_reconstruction(
     video_file,
     height,
```
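`@spaces.GPU(duration=300)` is the ZeroGPU idiom on Hugging Face Spaces: the process starts without a GPU, and one is attached only while a decorated function executes, for at most `duration` seconds. A stripped-down sketch of the structure (the handler body is a placeholder, not the app's real logic):

```python
import spaces  # provided on Hugging Face ZeroGPU Spaces
import torch

model = torch.nn.Linear(4, 4)  # built on CPU at import time

@spaces.GPU(duration=300)  # a GPU is attached only for this call
def infer(x: torch.Tensor) -> torch.Tensor:
    model.to("cuda")  # safe here: CUDA exists inside the decorated call
    return model(x.to("cuda")).cpu()
```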
```diff
@@ -449,11 +469,15 @@ def process_reconstruction(
     gc.collect()
     torch.cuda.empty_cache()
 
-    # Set random seed
+    # Set random seed
     seed_all(seed)
 
-    # Build the pipeline
-    pipeline = build_pipeline()
+    # Check if CUDA is available
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    if not torch.cuda.is_available():
+        raise ValueError("CUDA is not available. Check your environment.")
+
+    pipeline = build_pipeline(device)
 
     progress(0.1, "Loading video")
     # Check if video_file is a string or a file object
```
```diff
@@ -578,7 +602,7 @@ def process_prediction(
     seed_all(seed)
 
     # Build the pipeline
-    pipeline = build_pipeline()
+    pipeline = build_pipeline(device)
 
     progress(0.1, "Loading image")
     # Check if image_file is a string or a file object
```
```diff
@@ -704,8 +728,13 @@ def process_planning(
     # Set random seed
     seed_all(seed)
 
+    # Check if CUDA is available
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    if not torch.cuda.is_available():
+        raise ValueError("CUDA is not available. Check your environment.")
+
     # Build the pipeline
-    pipeline = build_pipeline()
+    pipeline = build_pipeline(device)
 
     progress(0.1, "Loading images")
     # Check if image_file and goal_file are strings or file objects
```
```diff
@@ -1467,7 +1496,7 @@ with gr.Blocks(
     )
 
     # Load the model at startup
-    demo.load(lambda: build_pipeline(), inputs=None, outputs=None)
+    demo.load(lambda: build_pipeline(torch.device("cpu")), inputs=None, outputs=None)
 
 if __name__ == "__main__":
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
```
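Taken together, the device strategy after this commit is: construct the pipeline once in bfloat16 on CPU, warm it up on CPU via `demo.load`, and move it to CUDA only inside GPU-decorated handlers. A condensed, runnable sketch of that lifecycle with a hypothetical stand-in module:

```python
import torch

# Hypothetical stand-in for the AetherV1PipelineCogVideoX components:
# constructed once, in bfloat16, on CPU.
pipeline = torch.nn.Linear(4, 4).to(dtype=torch.bfloat16)

def build_pipeline(device: torch.device) -> torch.nn.Module:
    """Move the already-constructed pipeline to the requested device."""
    pipeline.to(device)
    return pipeline

# Startup path (demo.load): no GPU is attached yet, so stay on CPU.
build_pipeline(torch.device("cpu"))

# Request path (inside a @spaces.GPU handler, where CUDA is available):
def handler() -> None:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if not torch.cuda.is_available():
        raise ValueError("CUDA is not available. Check your environment.")
    build_pipeline(device)
```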