Commit 8040e22 (parent: 3e3a156): update bloat16
Committed by Wenzheng Chang
Files changed:
- aether/pipelines/aetherv1_pipeline_cogvideox.py (+14, -8)
- app.py (+45, -16)
aether/pipelines/aetherv1_pipeline_cogvideox.py
CHANGED

```diff
@@ -294,6 +294,7 @@ class AetherV1PipelineCogVideoX(CogVideoXImageToVideoPipeline):
             num_videos_per_prompt=1,
             prompt_embeds=None,
         )
+        self.empty_prompt_embeds = self.empty_prompt_embeds.to(dtype=torch.bfloat16)
 
     def _prepare_rotary_positional_embeddings(
         self,
```
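The new cast keeps the cached empty prompt embeddings in the same dtype as the transformer weights, which the rest of this commit switches to bfloat16. A minimal sketch of the failure mode this avoids, using a hypothetical stand-in layer rather than the actual CogVideoX transformer:

```python
import torch

# Hypothetical stand-in for a module loaded with torch_dtype=torch.bfloat16.
layer = torch.nn.Linear(8, 8).to(dtype=torch.bfloat16)

x = torch.randn(1, 8)  # float32 by default
try:
    layer(x)
except RuntimeError as err:
    print(err)  # "mat1 and mat2 must have the same dtype ..."

# Casting once up front, as the commit does for empty_prompt_embeds,
# keeps every later forward pass dtype-consistent.
print(layer(x.to(dtype=torch.bfloat16)).dtype)  # torch.bfloat16
```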
```diff
@@ -472,19 +473,19 @@ class AetherV1PipelineCogVideoX(CogVideoXImageToVideoPipeline):
         if isinstance(image, PIL.Image.Image):
             image = self.video_processor.preprocess(
                 image, height, width, resize_mode="crop"
-            ).to(self._execution_device)
+            ).to(device=self._execution_device, dtype=torch.bfloat16)
         else:
             image = self._preprocess_image(image, height, width).to(
-                self._execution_device
+                device=self._execution_device, dtype=torch.bfloat16
             )
         if goal is not None:
             if isinstance(goal, PIL.Image.Image):
                 goal = self.video_processor.preprocess(
                     goal, height, width, resize_mode="crop"
-                ).to(self._execution_device)
+                ).to(device=self._execution_device, dtype=torch.bfloat16)
             else:
                 goal = self._preprocess_image(goal, height, width).to(
-                    self._execution_device
+                    device=self._execution_device, dtype=torch.bfloat16
                 )
         if video is not None:
             if isinstance(video, list) and all(
```
```diff
@@ -492,17 +493,21 @@ class AetherV1PipelineCogVideoX(CogVideoXImageToVideoPipeline):
             ):
                 video = self.video_processor.preprocess(
                     video, height, width, resize_mode="crop"
-                ).to(self._execution_device)
+                ).to(device=self._execution_device, dtype=torch.bfloat16)
             else:
                 video = self._preprocess_image(video, height, width).to(
-                    self._execution_device
+                    device=self._execution_device, dtype=torch.bfloat16
                 )
         # TODO: check raymap shape
         if raymap is not None:
             if isinstance(raymap, np.ndarray):
-                raymap = torch.from_numpy(raymap).to(self._execution_device)
+                raymap = torch.from_numpy(raymap).to(
+                    self._execution_device, dtype=torch.bfloat16
+                )
             if raymap.ndim == 4:
-                raymap = raymap.unsqueeze(0)
+                raymap = raymap.unsqueeze(0).to(
+                    self._execution_device, dtype=torch.bfloat16
+                )
 
         return image, goal, video, raymap
 
```
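Throughout these hunks, `Tensor.to` receives the device and the target dtype together, so the transfer and the cast happen in one call instead of two passes over the data. A small sketch of the raymap branch in isolation (the array shape here is made up for illustration):

```python
import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

raymap = np.zeros((6, 41, 60, 90), dtype=np.float32)  # hypothetical shape
t = torch.from_numpy(raymap).to(device, dtype=torch.bfloat16)
print(t.device, t.dtype)  # e.g. cuda:0 torch.bfloat16

# Mirror of the hunk above: a 4-D raymap gains a batch dimension.
if t.ndim == 4:
    t = t.unsqueeze(0)
print(tuple(t.shape))  # (1, 6, 41, 60, 90)
```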
```diff
@@ -938,6 +943,7 @@ class AetherV1PipelineCogVideoX(CogVideoXImageToVideoPipeline):
             rearrange(camera_latents, "b t (n c) h w -> b (n t) c h w", n=4)[
                 :, -rgb_video.shape[1] :, :, :
             ]
+            .float()
             .cpu()
             .numpy()
         )
```
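The added `.float()` is required, not stylistic: NumPy has no bfloat16 dtype, so calling `.numpy()` on a bfloat16 tensor raises a TypeError. A quick sketch:

```python
import torch

latents = torch.randn(2, 4, dtype=torch.bfloat16)
try:
    latents.cpu().numpy()  # NumPy cannot represent bfloat16
except TypeError as err:
    print(err)

out = latents.float().cpu().numpy()  # upcast to float32 first
print(out.dtype)  # float32
```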
app.py
CHANGED

```diff
@@ -17,7 +17,7 @@ from diffusers import (
     CogVideoXTransformer3DModel,
 )
 from transformers import AutoTokenizer, T5EncoderModel
-import
+import spaces
 
 
 rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
```
```diff
@@ -40,9 +40,6 @@ from aether.utils.postprocess_utils import (  # noqa: E402
 from aether.utils.visualize_utils import predictions_to_glb  # noqa: E402
 
 
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-
 def seed_all(seed: int = 0) -> None:
     """
     Set random seeds of all components.
```
```diff
@@ -53,7 +50,7 @@ def seed_all(seed: int = 0) -> None:
     torch.cuda.manual_seed_all(seed)
 
 
-# Global pipeline
+# # Global pipeline
 cogvideox_pretrained_model_name_or_path: str = "THUDM/CogVideoX-5b-I2V"
 aether_pretrained_model_name_or_path: str = "AetherWorldModel/AetherV1"
 pipeline = AetherV1PipelineCogVideoX(
```
```diff
@@ -65,22 +62,45 @@ pipeline = AetherV1PipelineCogVideoX(
         cogvideox_pretrained_model_name_or_path, subfolder="text_encoder"
     ),
     vae=AutoencoderKLCogVideoX.from_pretrained(
-        cogvideox_pretrained_model_name_or_path, subfolder="vae"
+        cogvideox_pretrained_model_name_or_path, subfolder="vae", torch_dtype=torch.bfloat16
     ),
     scheduler=CogVideoXDPMScheduler.from_pretrained(
         cogvideox_pretrained_model_name_or_path, subfolder="scheduler"
     ),
     transformer=CogVideoXTransformer3DModel.from_pretrained(
-        aether_pretrained_model_name_or_path, subfolder="transformer"
+        aether_pretrained_model_name_or_path, subfolder="transformer", torch_dtype=torch.bfloat16
     ),
 )
 pipeline.vae.enable_slicing()
 pipeline.vae.enable_tiling()
-pipeline.to(device)
+# pipeline.to(device)
 
 
-def build_pipeline() -> AetherV1PipelineCogVideoX:
+def build_pipeline(device: torch.device) -> AetherV1PipelineCogVideoX:
     """Initialize the model pipeline."""
+    # cogvideox_pretrained_model_name_or_path: str = "THUDM/CogVideoX-5b-I2V"
+    # aether_pretrained_model_name_or_path: str = "AetherWorldModel/AetherV1"
+    # pipeline = AetherV1PipelineCogVideoX(
+    #     tokenizer=AutoTokenizer.from_pretrained(
+    #         cogvideox_pretrained_model_name_or_path,
+    #         subfolder="tokenizer",
+    #     ),
+    #     text_encoder=T5EncoderModel.from_pretrained(
+    #         cogvideox_pretrained_model_name_or_path, subfolder="text_encoder"
+    #     ),
+    #     vae=AutoencoderKLCogVideoX.from_pretrained(
+    #         cogvideox_pretrained_model_name_or_path, subfolder="vae"
+    #     ),
+    #     scheduler=CogVideoXDPMScheduler.from_pretrained(
+    #         cogvideox_pretrained_model_name_or_path, subfolder="scheduler"
+    #     ),
+    #     transformer=CogVideoXTransformer3DModel.from_pretrained(
+    #         aether_pretrained_model_name_or_path, subfolder="transformer"
+    #     ),
+    # )
+    # pipeline.vae.enable_slicing()
+    # pipeline.vae.enable_tiling()
+    pipeline.to(device)
     return pipeline
 
 
```
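`torch_dtype=torch.bfloat16` tells `from_pretrained` to load the checkpoint weights directly in bfloat16 rather than materializing them in float32 and casting afterwards, which roughly halves peak memory at startup. The same pattern in isolation (model ID and subfolder as in the diff):

```python
import torch
from diffusers import AutoencoderKLCogVideoX

# Loads the VAE weights directly in bfloat16.
vae = AutoencoderKLCogVideoX.from_pretrained(
    "THUDM/CogVideoX-5b-I2V", subfolder="vae", torch_dtype=torch.bfloat16
)
print(next(vae.parameters()).dtype)  # torch.bfloat16
```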
```diff
@@ -424,7 +444,7 @@ def save_output_files(
     return paths
 
 
-@spaces.GPU(duration=
+@spaces.GPU(duration=300)
 def process_reconstruction(
     video_file,
     height,
```
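`@spaces.GPU(duration=300)` is the ZeroGPU idiom on Hugging Face Spaces: the process starts without a GPU, and one is attached only while a decorated function executes, for at most `duration` seconds. A stripped-down sketch of the structure (the handler body is a placeholder, not the app's real logic):

```python
import spaces  # provided on Hugging Face ZeroGPU Spaces
import torch

model = torch.nn.Linear(4, 4)  # built on CPU at import time

@spaces.GPU(duration=300)  # a GPU is attached only for this call
def infer(x: torch.Tensor) -> torch.Tensor:
    model.to("cuda")  # safe here: CUDA exists inside the decorated call
    return model(x.to("cuda")).cpu()
```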
```diff
@@ -449,11 +469,15 @@ def process_reconstruction(
     gc.collect()
     torch.cuda.empty_cache()
 
-    # Set random seed
+    # Set random seed
     seed_all(seed)
 
-    # Build the pipeline
-    pipeline = build_pipeline()
+    # Check if CUDA is available
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    if not torch.cuda.is_available():
+        raise ValueError("CUDA is not available. Check your environment.")
+
+    pipeline = build_pipeline(device)
 
     progress(0.1, "Loading video")
     # Check if video_file is a string or a file object
```
```diff
@@ -578,7 +602,7 @@ def process_prediction(
     seed_all(seed)
 
     # Build the pipeline
-    pipeline = build_pipeline()
+    pipeline = build_pipeline(device)
 
     progress(0.1, "Loading image")
     # Check if image_file is a string or a file object
```
```diff
@@ -704,8 +728,13 @@ def process_planning(
     # Set random seed
     seed_all(seed)
 
+    # Check if CUDA is available
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    if not torch.cuda.is_available():
+        raise ValueError("CUDA is not available. Check your environment.")
+
     # Build the pipeline
-    pipeline = build_pipeline()
+    pipeline = build_pipeline(device)
 
     progress(0.1, "Loading images")
     # Check if image_file and goal_file are strings or file objects
```
```diff
@@ -1467,7 +1496,7 @@ with gr.Blocks(
     )
 
     # Load the model at startup
-    demo.load(lambda: build_pipeline(), inputs=None, outputs=None)
+    demo.load(lambda: build_pipeline(torch.device("cpu")), inputs=None, outputs=None)
 
 if __name__ == "__main__":
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
```
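Taken together, the device strategy after this commit is: construct the pipeline once in bfloat16 on CPU, warm it up on CPU via `demo.load`, and move it to CUDA only inside GPU-decorated handlers. A condensed, runnable sketch of that lifecycle with a hypothetical stand-in module:

```python
import torch

# Hypothetical stand-in for the AetherV1PipelineCogVideoX components:
# constructed once, in bfloat16, on CPU.
pipeline = torch.nn.Linear(4, 4).to(dtype=torch.bfloat16)

def build_pipeline(device: torch.device) -> torch.nn.Module:
    """Move the already-constructed pipeline to the requested device."""
    pipeline.to(device)
    return pipeline

# Startup path (demo.load): no GPU is attached yet, so stay on CPU.
build_pipeline(torch.device("cpu"))

# Request path (inside a @spaces.GPU handler, where CUDA is available):
def handler() -> None:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if not torch.cuda.is_available():
        raise ValueError("CUDA is not available. Check your environment.")
    build_pipeline(device)
```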