Wenzheng Chang committed on
Commit
8040e22
·
1 Parent(s): 3e3a156

update bfloat16

Browse files
aether/pipelines/aetherv1_pipeline_cogvideox.py CHANGED
@@ -294,6 +294,7 @@ class AetherV1PipelineCogVideoX(CogVideoXImageToVideoPipeline):
294
  num_videos_per_prompt=1,
295
  prompt_embeds=None,
296
  )
 
297
 
298
  def _prepare_rotary_positional_embeddings(
299
  self,
@@ -472,19 +473,19 @@ class AetherV1PipelineCogVideoX(CogVideoXImageToVideoPipeline):
472
  if isinstance(image, PIL.Image.Image):
473
  image = self.video_processor.preprocess(
474
  image, height, width, resize_mode="crop"
475
- ).to(self._execution_device)
476
  else:
477
  image = self._preprocess_image(image, height, width).to(
478
- self._execution_device
479
  )
480
  if goal is not None:
481
  if isinstance(goal, PIL.Image.Image):
482
  goal = self.video_processor.preprocess(
483
  goal, height, width, resize_mode="crop"
484
- ).to(self._execution_device)
485
  else:
486
  goal = self._preprocess_image(goal, height, width).to(
487
- self._execution_device
488
  )
489
  if video is not None:
490
  if isinstance(video, list) and all(
@@ -492,17 +493,21 @@ class AetherV1PipelineCogVideoX(CogVideoXImageToVideoPipeline):
492
  ):
493
  video = self.video_processor.preprocess(
494
  video, height, width, resize_mode="crop"
495
- ).to(self._execution_device)
496
  else:
497
  video = self._preprocess_image(video, height, width).to(
498
- self._execution_device
499
  )
500
  # TODO: check raymap shape
501
  if raymap is not None:
502
  if isinstance(raymap, np.ndarray):
503
- raymap = torch.from_numpy(raymap).to(self._execution_device)
 
 
504
  if raymap.ndim == 4:
505
- raymap = raymap.unsqueeze(0)
 
 
506
 
507
  return image, goal, video, raymap
508
 
@@ -938,6 +943,7 @@ class AetherV1PipelineCogVideoX(CogVideoXImageToVideoPipeline):
938
  rearrange(camera_latents, "b t (n c) h w -> b (n t) c h w", n=4)[
939
  :, -rgb_video.shape[1] :, :, :
940
  ]
 
941
  .cpu()
942
  .numpy()
943
  )
 
294
  num_videos_per_prompt=1,
295
  prompt_embeds=None,
296
  )
297
+ self.empty_prompt_embeds = self.empty_prompt_embeds.to(dtype=torch.bfloat16)
298
 
299
  def _prepare_rotary_positional_embeddings(
300
  self,
 
473
  if isinstance(image, PIL.Image.Image):
474
  image = self.video_processor.preprocess(
475
  image, height, width, resize_mode="crop"
476
+ ).to(device=self._execution_device, dtype=torch.bfloat16)
477
  else:
478
  image = self._preprocess_image(image, height, width).to(
479
+ device=self._execution_device, dtype=torch.bfloat16
480
  )
481
  if goal is not None:
482
  if isinstance(goal, PIL.Image.Image):
483
  goal = self.video_processor.preprocess(
484
  goal, height, width, resize_mode="crop"
485
+ ).to(device=self._execution_device, dtype=torch.bfloat16)
486
  else:
487
  goal = self._preprocess_image(goal, height, width).to(
488
+ device=self._execution_device, dtype=torch.bfloat16
489
  )
490
  if video is not None:
491
  if isinstance(video, list) and all(
 
493
  ):
494
  video = self.video_processor.preprocess(
495
  video, height, width, resize_mode="crop"
496
+ ).to(device=self._execution_device, dtype=torch.bfloat16)
497
  else:
498
  video = self._preprocess_image(video, height, width).to(
499
+ device=self._execution_device, dtype=torch.bfloat16
500
  )
501
  # TODO: check raymap shape
502
  if raymap is not None:
503
  if isinstance(raymap, np.ndarray):
504
+ raymap = torch.from_numpy(raymap).to(
505
+ self._execution_device, dtype=torch.bfloat16
506
+ )
507
  if raymap.ndim == 4:
508
+ raymap = raymap.unsqueeze(0).to(
509
+ self._execution_device, dtype=torch.bfloat16
510
+ )
511
 
512
  return image, goal, video, raymap
513
 
 
943
  rearrange(camera_latents, "b t (n c) h w -> b (n t) c h w", n=4)[
944
  :, -rgb_video.shape[1] :, :, :
945
  ]
946
+ .float()
947
  .cpu()
948
  .numpy()
949
  )
app.py CHANGED
@@ -17,7 +17,7 @@ from diffusers import (
17
  CogVideoXTransformer3DModel,
18
  )
19
  from transformers import AutoTokenizer, T5EncoderModel
20
- import huggingface_hub.spaces as spaces
21
 
22
 
23
  rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
@@ -40,9 +40,6 @@ from aether.utils.postprocess_utils import ( # noqa: E402
40
  from aether.utils.visualize_utils import predictions_to_glb # noqa: E402
41
 
42
 
43
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
44
-
45
-
46
  def seed_all(seed: int = 0) -> None:
47
  """
48
  Set random seeds of all components.
@@ -53,7 +50,7 @@ def seed_all(seed: int = 0) -> None:
53
  torch.cuda.manual_seed_all(seed)
54
 
55
 
56
- # Global pipeline
57
  cogvideox_pretrained_model_name_or_path: str = "THUDM/CogVideoX-5b-I2V"
58
  aether_pretrained_model_name_or_path: str = "AetherWorldModel/AetherV1"
59
  pipeline = AetherV1PipelineCogVideoX(
@@ -65,22 +62,45 @@ pipeline = AetherV1PipelineCogVideoX(
65
  cogvideox_pretrained_model_name_or_path, subfolder="text_encoder"
66
  ),
67
  vae=AutoencoderKLCogVideoX.from_pretrained(
68
- cogvideox_pretrained_model_name_or_path, subfolder="vae"
69
  ),
70
  scheduler=CogVideoXDPMScheduler.from_pretrained(
71
  cogvideox_pretrained_model_name_or_path, subfolder="scheduler"
72
  ),
73
  transformer=CogVideoXTransformer3DModel.from_pretrained(
74
- aether_pretrained_model_name_or_path, subfolder="transformer"
75
  ),
76
  )
77
  pipeline.vae.enable_slicing()
78
  pipeline.vae.enable_tiling()
79
- pipeline.to(device)
80
 
81
 
82
- def build_pipeline() -> AetherV1PipelineCogVideoX:
83
  """Initialize the model pipeline."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  return pipeline
85
 
86
 
@@ -424,7 +444,7 @@ def save_output_files(
424
  return paths
425
 
426
 
427
- @spaces.GPU(duration=240)
428
  def process_reconstruction(
429
  video_file,
430
  height,
@@ -449,11 +469,15 @@ def process_reconstruction(
449
  gc.collect()
450
  torch.cuda.empty_cache()
451
 
452
- # Set random seed
453
  seed_all(seed)
454
 
455
- # Build the pipeline
456
- pipeline = build_pipeline()
 
 
 
 
457
 
458
  progress(0.1, "Loading video")
459
  # Check if video_file is a string or a file object
@@ -578,7 +602,7 @@ def process_prediction(
578
  seed_all(seed)
579
 
580
  # Build the pipeline
581
- pipeline = build_pipeline()
582
 
583
  progress(0.1, "Loading image")
584
  # Check if image_file is a string or a file object
@@ -704,8 +728,13 @@ def process_planning(
704
  # Set random seed
705
  seed_all(seed)
706
 
 
 
 
 
 
707
  # Build the pipeline
708
- pipeline = build_pipeline()
709
 
710
  progress(0.1, "Loading images")
711
  # Check if image_file and goal_file are strings or file objects
@@ -1467,7 +1496,7 @@ with gr.Blocks(
1467
  )
1468
 
1469
  # Load the model at startup
1470
- demo.load(lambda: build_pipeline(), inputs=None, outputs=None)
1471
 
1472
  if __name__ == "__main__":
1473
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
17
  CogVideoXTransformer3DModel,
18
  )
19
  from transformers import AutoTokenizer, T5EncoderModel
20
+ import spaces
21
 
22
 
23
  rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
 
40
  from aether.utils.visualize_utils import predictions_to_glb # noqa: E402
41
 
42
 
 
 
 
43
  def seed_all(seed: int = 0) -> None:
44
  """
45
  Set random seeds of all components.
 
50
  torch.cuda.manual_seed_all(seed)
51
 
52
 
53
+ # # Global pipeline
54
  cogvideox_pretrained_model_name_or_path: str = "THUDM/CogVideoX-5b-I2V"
55
  aether_pretrained_model_name_or_path: str = "AetherWorldModel/AetherV1"
56
  pipeline = AetherV1PipelineCogVideoX(
 
62
  cogvideox_pretrained_model_name_or_path, subfolder="text_encoder"
63
  ),
64
  vae=AutoencoderKLCogVideoX.from_pretrained(
65
+ cogvideox_pretrained_model_name_or_path, subfolder="vae", torch_dtype=torch.bfloat16
66
  ),
67
  scheduler=CogVideoXDPMScheduler.from_pretrained(
68
  cogvideox_pretrained_model_name_or_path, subfolder="scheduler"
69
  ),
70
  transformer=CogVideoXTransformer3DModel.from_pretrained(
71
+ aether_pretrained_model_name_or_path, subfolder="transformer", torch_dtype=torch.bfloat16
72
  ),
73
  )
74
  pipeline.vae.enable_slicing()
75
  pipeline.vae.enable_tiling()
76
+ # pipeline.to(device)
77
 
78
 
79
+ def build_pipeline(device: torch.device) -> AetherV1PipelineCogVideoX:
80
  """Initialize the model pipeline."""
81
+ # cogvideox_pretrained_model_name_or_path: str = "THUDM/CogVideoX-5b-I2V"
82
+ # aether_pretrained_model_name_or_path: str = "AetherWorldModel/AetherV1"
83
+ # pipeline = AetherV1PipelineCogVideoX(
84
+ # tokenizer=AutoTokenizer.from_pretrained(
85
+ # cogvideox_pretrained_model_name_or_path,
86
+ # subfolder="tokenizer",
87
+ # ),
88
+ # text_encoder=T5EncoderModel.from_pretrained(
89
+ # cogvideox_pretrained_model_name_or_path, subfolder="text_encoder"
90
+ # ),
91
+ # vae=AutoencoderKLCogVideoX.from_pretrained(
92
+ # cogvideox_pretrained_model_name_or_path, subfolder="vae"
93
+ # ),
94
+ # scheduler=CogVideoXDPMScheduler.from_pretrained(
95
+ # cogvideox_pretrained_model_name_or_path, subfolder="scheduler"
96
+ # ),
97
+ # transformer=CogVideoXTransformer3DModel.from_pretrained(
98
+ # aether_pretrained_model_name_or_path, subfolder="transformer"
99
+ # ),
100
+ # )
101
+ # pipeline.vae.enable_slicing()
102
+ # pipeline.vae.enable_tiling()
103
+ pipeline.to(device)
104
  return pipeline
105
 
106
 
 
444
  return paths
445
 
446
 
447
+ @spaces.GPU(duration=300)
448
  def process_reconstruction(
449
  video_file,
450
  height,
 
469
  gc.collect()
470
  torch.cuda.empty_cache()
471
 
472
+ # Set random seed
473
  seed_all(seed)
474
 
475
+ # Check if CUDA is available
476
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
477
+ if not torch.cuda.is_available():
478
+ raise ValueError("CUDA is not available. Check your environment.")
479
+
480
+ pipeline = build_pipeline(device)
481
 
482
  progress(0.1, "Loading video")
483
  # Check if video_file is a string or a file object
 
602
  seed_all(seed)
603
 
604
  # Build the pipeline
605
+ pipeline = build_pipeline(device)
606
 
607
  progress(0.1, "Loading image")
608
  # Check if image_file is a string or a file object
 
728
  # Set random seed
729
  seed_all(seed)
730
 
731
+ # Check if CUDA is available
732
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
733
+ if not torch.cuda.is_available():
734
+ raise ValueError("CUDA is not available. Check your environment.")
735
+
736
  # Build the pipeline
737
+ pipeline = build_pipeline(device)
738
 
739
  progress(0.1, "Loading images")
740
  # Check if image_file and goal_file are strings or file objects
 
1496
  )
1497
 
1498
  # Load the model at startup
1499
+ demo.load(lambda: build_pipeline(torch.device("cpu")), inputs=None, outputs=None)
1500
 
1501
  if __name__ == "__main__":
1502
  os.environ["TOKENIZERS_PARALLELISM"] = "false"