Wenzheng Chang committed on
Commit 19da45c · 1 Parent(s): ddee6ec

aetherv1 init

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/teaser.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,157 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ env/
106
+ .venv
107
+ venv/
108
+ ENV/
109
+ env.bak/
110
+ venv.bak/
111
+
112
+ # Spyder project settings
113
+ .spyderproject
114
+ .spyproject
115
+
116
+ # Rope project settings
117
+ .ropeproject
118
+
119
+ # mkdocs documentation
120
+ /site
121
+
122
+ # mypy
123
+ .mypy_cache/
124
+ .dmypy.json
125
+ dmypy.json
126
+
127
+ # Pyre type checker
128
+ .pyre/
129
+
130
+ ### VisualStudioCode
131
+ .vscode/*
132
+ !.vscode/settings.json
133
+ !.vscode/tasks.json
134
+ !.vscode/launch.json
135
+ !.vscode/extensions.json
136
+ *.code-workspace
137
+ **/.vscode
138
+
139
+ # JetBrains
140
+ .idea/
141
+
142
+ # Data & Models
143
+ *.h5
144
+ *.tar
145
+ *.tar.gz
146
+
147
+ # Lightning-Hydra-Template
148
+ configs/local/default.yaml
149
+ # data/
150
+ /logs/
151
+ .env
152
+
153
+ # Aim logging
154
+ .aim
155
+
156
+ # local files
157
+ logs
.pre-commit-config.yaml ADDED
@@ -0,0 +1,14 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.2.1
4
+ hooks:
5
+ - id: ruff
6
+ exclude: ^test/ # Skip the entire "test" directory
7
+ args:
8
+ - --fix
9
+ - id: ruff-format
10
+ - repo: https://github.com/pre-commit/pre-commit-hooks
11
+ rev: v4.5.0
12
+ hooks:
13
+ - id: check-merge-conflict
14
+ - id: check-yaml
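For local development, these hooks can typically be enabled with `pre-commit install` and run across the whole repository with `pre-commit run --all-files` (standard pre-commit commands, not part of this commit).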
.project-root ADDED
@@ -0,0 +1,2 @@
1
+ # this file is required for inferring the project root directory
2
+ # do not delete
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Aether Team
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,14 +1,71 @@
1
  ---
2
  title: AetherV1
3
- emoji: 😻
4
  colorFrom: purple
5
  colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 5.23.1
8
- app_file: app.py
9
  pinned: false
10
  license: mit
11
  short_description: 'Aether: Geometric-Aware Unified World Modeling'
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
  title: AetherV1
3
+ emoji: 🌏
4
  colorFrom: purple
5
  colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 5.23.1
8
+ app_file: scripts/demo_gradio.py
9
  pinned: false
10
  license: mit
11
  short_description: 'Aether: Geometric-Aware Unified World Modeling'
12
  ---
13
 
14
+ <div align="center">
15
+
16
+ # Aether: Geometric-Aware Unified World Modeling
17
+
18
+ </div>
19
+
20
+ <div align="center">
21
+ <img width="400" alt="image" src="assets/logo.png">
22
+ <!-- <br> -->
23
+ </div>
24
+
25
+ <div align="center">
26
+ <a href='https://arxiv.org/abs/2503.18945'><img src='https://img.shields.io/badge/arXiv-2503.18945-red'></a> &nbsp;
27
+ <a href='https://aether-world.github.io'><img src='https://img.shields.io/badge/Project-Page-Green'></a> &nbsp;
28
+ <a href=''><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Demo%20(Coming%20Soon)-blue'></a> &nbsp;
29
+ </div>
30
+
31
+ Aether addresses a fundamental challenge in AI: integrating geometric reconstruction with generative modeling
32
+ for human-like spatial reasoning. Our framework unifies three core capabilities: (1) **4D dynamic reconstruction**,
33
+ (2) **action-conditioned video prediction**, and (3) **goal-conditioned visual planning**. Trained entirely on
34
+ synthetic data, Aether achieves strong zero-shot generalization to real-world scenarios.
35
+
36
+ <div align="center">
37
+ <img src="assets/teaser.png" alt="Teaser" width="800"/>
38
+ </div>
39
+
40
+
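The pipeline added in this commit (`aether/pipelines/aetherv1_pipeline_cogvideox.py`) exposes the three capabilities above through a single `task` argument. Below is a minimal, hedged usage sketch; the Hub checkpoint id and the input image path are assumptions for illustration, not taken from this commit.

```python
# Minimal sketch of driving the AetherV1 pipeline defined in this commit.
import PIL.Image
import torch

from aether.pipelines.aetherv1_pipeline_cogvideox import AetherV1PipelineCogVideoX

pipeline = AetherV1PipelineCogVideoX.from_pretrained(
    "AetherWorldModel/AetherV1",  # hypothetical checkpoint id
    torch_dtype=torch.bfloat16,
).to("cuda")

image = PIL.Image.open("example_frame.png")  # hypothetical observation frame

# `task` is inferred automatically (video -> reconstruction, goal -> planning, otherwise
# prediction), but it can also be passed explicitly. `num_frames` must be one of
# [17, 25, 33, 41] and `fps` one of [8, 10, 12, 15, 24], as enforced by `check_inputs`.
output = pipeline(task="prediction", image=image, num_frames=41, fps=12)
rgb, disparity, raymap = output.rgb, output.disparity, output.raymap
```

The resulting disparity and raymap can then be turned into depth, camera poses, and point clouds with the helpers added in `aether/utils/postprocess_utils.py`.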
41
+ ## 📝 Citation
42
+ If you find this work useful in your research, please consider citing:
43
+
44
+ ```bibtex
45
+ @article{aether,
46
+ title = {Aether: Geometric-Aware Unified World Modeling},
47
+ author = {Aether Team and Haoyi Zhu and Yifan Wang and Jianjun Zhou and Wenzheng Chang and Yang Zhou and Zizun Li and Junyi Chen and Chunhua Shen and Jiangmiao Pang and Tong He},
48
+ journal = {arXiv preprint arXiv:2503.18945},
49
+ year = {2025}
50
+ }
51
+ ```
52
+
53
+ ## ⚖️ License
54
+ This repository is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
55
+
56
+ ## 🙏 Acknowledgements
57
+ Our work is primarily built upon
58
+ [Accelerate](https://github.com/huggingface/accelerate),
59
+ [Diffusers](https://github.com/huggingface/diffusers),
60
+ [CogVideoX](https://github.com/THUDM/CogVideo),
61
+ [Finetrainers](https://github.com/a-r-r-o-w/finetrainers),
62
+ [DepthAnyVideo](https://github.com/Nightmare-n/DepthAnyVideo),
63
+ [CUT3R](https://github.com/CUT3R/CUT3R),
64
+ [MonST3R](https://github.com/Junyi42/monst3r),
65
+ [VBench](https://github.com/Vchitect/VBench),
66
+ [GST](https://github.com/SOTAMak1r/GST),
67
+ [SPA](https://github.com/HaoyiZhu/SPA),
68
+ [DroidCalib](https://github.com/boschresearch/DroidCalib),
69
+ [Grounded-SAM-2](https://github.com/IDEA-Research/Grounded-SAM-2),
70
+ [ceres-solver](https://github.com/ceres-solver/ceres-solver), etc.
71
+ We are grateful to all these authors for generously open-sourcing their code and for their significant contributions to the community.
aether/__init__.py ADDED
File without changes
aether/pipelines/__init__.py ADDED
File without changes
aether/pipelines/aetherv1_pipeline_cogvideox.py ADDED
@@ -0,0 +1,959 @@
1
+ import inspect
2
+ import math
3
+ from dataclasses import dataclass
4
+ from typing import Dict, List, Optional, Tuple, Union
5
+
6
+ import numpy as np
7
+ import PIL
8
+ import torch
9
+ from diffusers import (
10
+ AutoencoderKLCogVideoX,
11
+ CogVideoXDPMScheduler,
12
+ CogVideoXImageToVideoPipeline,
13
+ CogVideoXTransformer3DModel,
14
+ )
15
+ from diffusers.image_processor import PipelineImageInput
16
+ from diffusers.models.embeddings import get_1d_rotary_pos_embed
17
+ from diffusers.utils import BaseOutput
18
+ from diffusers.utils.torch_utils import randn_tensor
19
+ from einops import rearrange
20
+ from transformers import AutoTokenizer, T5EncoderModel
21
+
22
+ from aether.utils.preprocess_utils import imcrop_center
23
+
24
+
25
+ def get_3d_rotary_pos_embed(
26
+ embed_dim,
27
+ crops_coords,
28
+ grid_size,
29
+ temporal_size,
30
+ theta: int = 10000,
31
+ use_real: bool = True,
32
+ grid_type: str = "linspace",
33
+ max_size: Optional[Tuple[int, int]] = None,
34
+ device: Optional[torch.device] = None,
35
+ fps_factor: Optional[float] = 1.0,
36
+ ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
37
+ """
38
+ RoPE for video tokens with 3D structure.
39
+
40
+ Args:
41
+ embed_dim (`int`):
42
+ The embedding dimension size, corresponding to hidden_size_head.
43
+ crops_coords (`Tuple[int]`):
44
+ The top-left and bottom-right coordinates of the crop.
45
+ grid_size (`Tuple[int]`):
46
+ The grid size of the spatial positional embedding (height, width).
47
+ temporal_size (`int`):
48
+ The size of the temporal dimension.
49
+ theta (`float`):
50
+ Scaling factor for frequency computation.
51
+ grid_type (`str`):
52
+ Whether to use "linspace" or "slice" to compute grids.
53
+ fps_factor (`float`):
54
+ The relative fps factor of the video, computed as base_fps / fps. Useful for variable-fps training.
55
+
56
+ Returns:
57
+ `Tuple[torch.Tensor, torch.Tensor]`: the cosine and sine rotary embeddings (since `use_real=True`), each with shape `(temporal_size * grid_size[0] * grid_size[1], embed_dim)`.
58
+ """
59
+ if use_real is not True:
60
+ raise ValueError(
61
+ " `use_real = False` is not currently supported for get_3d_rotary_pos_embed"
62
+ )
63
+
64
+ if grid_type == "linspace":
65
+ start, stop = crops_coords
66
+ grid_size_h, grid_size_w = grid_size
67
+ grid_h = torch.linspace(
68
+ start[0],
69
+ stop[0] * (grid_size_h - 1) / grid_size_h,
70
+ grid_size_h,
71
+ device=device,
72
+ dtype=torch.float32,
73
+ )
74
+ grid_w = torch.linspace(
75
+ start[1],
76
+ stop[1] * (grid_size_w - 1) / grid_size_w,
77
+ grid_size_w,
78
+ device=device,
79
+ dtype=torch.float32,
80
+ )
81
+ grid_t = (
82
+ torch.linspace(
83
+ 0,
84
+ temporal_size * (temporal_size - 1) / temporal_size,
85
+ temporal_size,
86
+ device=device,
87
+ dtype=torch.float32,
88
+ )
89
+ * fps_factor
90
+ )
91
+ elif grid_type == "slice":
92
+ max_h, max_w = max_size
93
+ grid_size_h, grid_size_w = grid_size
94
+ grid_h = torch.arange(max_h, device=device, dtype=torch.float32)
95
+ grid_w = torch.arange(max_w, device=device, dtype=torch.float32)
96
+ grid_t = (
97
+ torch.arange(temporal_size, device=device, dtype=torch.float32) * fps_factor
98
+ )
99
+ else:
100
+ raise ValueError("Invalid value passed for `grid_type`.")
101
+
102
+ # Compute dimensions for each axis
103
+ dim_t = embed_dim // 4
104
+ dim_h = embed_dim // 8 * 3
105
+ dim_w = embed_dim // 8 * 3
106
+
107
+ # Temporal frequencies
108
+ freqs_t = get_1d_rotary_pos_embed(dim_t, grid_t, theta=theta, use_real=True)
109
+ # Spatial frequencies for height and width
110
+ freqs_h = get_1d_rotary_pos_embed(dim_h, grid_h, theta=theta, use_real=True)
111
+ freqs_w = get_1d_rotary_pos_embed(dim_w, grid_w, theta=theta, use_real=True)
112
+
113
+ # Broadcast and concatenate temporal and spatial frequencies (height and width) into a 3D tensor
114
+ def combine_time_height_width(freqs_t, freqs_h, freqs_w):
115
+ freqs_t = freqs_t[:, None, None, :].expand(
116
+ -1, grid_size_h, grid_size_w, -1
117
+ ) # temporal_size, grid_size_h, grid_size_w, dim_t
118
+ freqs_h = freqs_h[None, :, None, :].expand(
119
+ temporal_size, -1, grid_size_w, -1
120
+ ) # temporal_size, grid_size_h, grid_size_w, dim_h
121
+ freqs_w = freqs_w[None, None, :, :].expand(
122
+ temporal_size, grid_size_h, -1, -1
123
+ ) # temporal_size, grid_size_h, grid_size_w, dim_w
124
+
125
+ freqs = torch.cat(
126
+ [freqs_t, freqs_h, freqs_w], dim=-1
127
+ ) # temporal_size, grid_size_h, grid_size_w, (dim_t + dim_h + dim_w)
128
+ freqs = freqs.view(
129
+ temporal_size * grid_size_h * grid_size_w, -1
130
+ ) # (temporal_size * grid_size_h * grid_size_w), (dim_t + dim_h + dim_w)
131
+ return freqs
132
+
133
+ t_cos, t_sin = freqs_t # both t_cos and t_sin have shape: temporal_size, dim_t
134
+ h_cos, h_sin = freqs_h # both h_cos and h_sin have shape: grid_size_h, dim_h
135
+ w_cos, w_sin = freqs_w # both w_cos and w_sin have shape: grid_size_w, dim_w
136
+
137
+ if grid_type == "slice":
138
+ t_cos, t_sin = t_cos[:temporal_size], t_sin[:temporal_size]
139
+ h_cos, h_sin = h_cos[:grid_size_h], h_sin[:grid_size_h]
140
+ w_cos, w_sin = w_cos[:grid_size_w], w_sin[:grid_size_w]
141
+
142
+ cos = combine_time_height_width(t_cos, h_cos, w_cos)
143
+ sin = combine_time_height_width(t_sin, h_sin, w_sin)
144
+ return cos, sin
145
+
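A hedged usage sketch of the helper above; all sizes are illustrative rather than read from a model config.

```python
# Illustrative call: an 11-frame latent grid of 30 x 45 patches, head dim 64 (assumed),
# for a 24 fps clip against the 12 fps training base used by the pipeline below.
cos, sin = get_3d_rotary_pos_embed(
    embed_dim=64,
    crops_coords=None,      # unused when grid_type="slice"
    grid_size=(30, 45),     # latent height x width in patches
    temporal_size=11,       # latent frames for a 41-frame clip, (41 - 1) // 4 + 1
    grid_type="slice",
    max_size=(30, 45),
    fps_factor=12 / 24,     # base_fps / fps: temporal positions advance half as fast
)
# cos and sin are later passed to the transformer as `image_rotary_emb`.
```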
146
+
147
+ # Similar to diffusers.pipelines.hunyuandit.pipeline_hunyuandit.get_resize_crop_region_for_grid
148
+ def get_resize_crop_region_for_grid(src, tgt_width, tgt_height):
149
+ tw = tgt_width
150
+ th = tgt_height
151
+ h, w = src
152
+ r = h / w
153
+ if r > (th / tw):
154
+ resize_height = th
155
+ resize_width = int(round(th / h * w))
156
+ else:
157
+ resize_width = tw
158
+ resize_height = int(round(tw / w * h))
159
+
160
+ crop_top = int(round((th - resize_height) / 2.0))
161
+ crop_left = int(round((tw - resize_width) / 2.0))
162
+
163
+ return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width)
164
+
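For intuition, a small worked example with illustrative numbers: a square source placed on a wider 48 x 72 grid keeps the full height and is centered horizontally.

```python
(top, left), (bottom, right) = get_resize_crop_region_for_grid((480, 480), tgt_width=72, tgt_height=48)
# (top, left) == (0, 12) and (bottom, right) == (48, 60)
```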
165
+
166
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
167
+ def retrieve_timesteps(
168
+ scheduler,
169
+ num_inference_steps: Optional[int] = None,
170
+ device: Optional[Union[str, torch.device]] = None,
171
+ timesteps: Optional[List[int]] = None,
172
+ sigmas: Optional[List[float]] = None,
173
+ **kwargs,
174
+ ):
175
+ r"""
176
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
177
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
178
+
179
+ Args:
180
+ scheduler (`SchedulerMixin`):
181
+ The scheduler to get timesteps from.
182
+ num_inference_steps (`int`):
183
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
184
+ must be `None`.
185
+ device (`str` or `torch.device`, *optional*):
186
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
187
+ timesteps (`List[int]`, *optional*):
188
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
189
+ `num_inference_steps` and `sigmas` must be `None`.
190
+ sigmas (`List[float]`, *optional*):
191
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
192
+ `num_inference_steps` and `timesteps` must be `None`.
193
+
194
+ Returns:
195
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
196
+ second element is the number of inference steps.
197
+ """
198
+ if timesteps is not None and sigmas is not None:
199
+ raise ValueError(
200
+ "Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values"
201
+ )
202
+ if timesteps is not None:
203
+ accepts_timesteps = "timesteps" in set(
204
+ inspect.signature(scheduler.set_timesteps).parameters.keys()
205
+ )
206
+ if not accepts_timesteps:
207
+ raise ValueError(
208
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
209
+ f" timestep schedules. Please check whether you are using the correct scheduler."
210
+ )
211
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
212
+ timesteps = scheduler.timesteps
213
+ num_inference_steps = len(timesteps)
214
+ elif sigmas is not None:
215
+ accept_sigmas = "sigmas" in set(
216
+ inspect.signature(scheduler.set_timesteps).parameters.keys()
217
+ )
218
+ if not accept_sigmas:
219
+ raise ValueError(
220
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
221
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
222
+ )
223
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
224
+ timesteps = scheduler.timesteps
225
+ num_inference_steps = len(timesteps)
226
+ else:
227
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
228
+ timesteps = scheduler.timesteps
229
+ return timesteps, num_inference_steps
230
+
231
+
232
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
233
+ def retrieve_latents(
234
+ encoder_output: torch.Tensor,
235
+ generator: Optional[torch.Generator] = None,
236
+ sample_mode: str = "sample",
237
+ ):
238
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
239
+ return encoder_output.latent_dist.sample(generator)
240
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
241
+ return encoder_output.latent_dist.mode()
242
+ elif hasattr(encoder_output, "latents"):
243
+ return encoder_output.latents
244
+ else:
245
+ raise AttributeError("Could not access latents of provided encoder_output")
246
+
247
+
248
+ @dataclass
249
+ class AetherV1PipelineOutput(BaseOutput):
250
+ rgb: np.ndarray  # decoded RGB frames, (t, h, w, 3), range [0, 1]
251
+ disparity: np.ndarray  # decoded disparity, (t, h, w), range [0, 1]
252
+ raymap: np.ndarray  # per-frame camera raymap, (t, 6, h // 8, w // 8)
253
+
254
+
255
+ class AetherV1PipelineCogVideoX(CogVideoXImageToVideoPipeline):
256
+ _supported_tasks = ["reconstruction", "prediction", "planning"]
257
+ _default_num_inference_steps = {
258
+ "reconstruction": 4,
259
+ "prediction": 50,
260
+ "planning": 50,
261
+ }
262
+ _default_guidance_scale = {
263
+ "reconstruction": 1.0,
264
+ "prediction": 3.0,
265
+ "planning": 3.0,
266
+ }
267
+ _default_use_dynamic_cfg = {
268
+ "reconstruction": False,
269
+ "prediction": True,
270
+ "planning": True,
271
+ }
272
+ _base_fps = 12
273
+
274
+ def __init__(
275
+ self,
276
+ tokenizer: AutoTokenizer,
277
+ text_encoder: T5EncoderModel,
278
+ vae: AutoencoderKLCogVideoX,
279
+ scheduler: CogVideoXDPMScheduler,
280
+ transformer: CogVideoXTransformer3DModel,
281
+ ):
282
+ super().__init__(
283
+ tokenizer=tokenizer,
284
+ text_encoder=text_encoder,
285
+ vae=vae,
286
+ scheduler=scheduler,
287
+ transformer=transformer,
288
+ )
289
+
290
+ self.empty_prompt_embeds, _ = self.encode_prompt(
291
+ prompt="",
292
+ negative_prompt=None,
293
+ do_classifier_free_guidance=False,
294
+ num_videos_per_prompt=1,
295
+ prompt_embeds=None,
296
+ )
297
+
298
+ def _prepare_rotary_positional_embeddings(
299
+ self,
300
+ height: int,
301
+ width: int,
302
+ num_frames: int,
303
+ device: torch.device,
304
+ fps: Optional[int] = None,
305
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
306
+ grid_height = height // (
307
+ self.vae_scale_factor_spatial * self.transformer.config.patch_size
308
+ )
309
+ grid_width = width // (
310
+ self.vae_scale_factor_spatial * self.transformer.config.patch_size
311
+ )
312
+
313
+ p = self.transformer.config.patch_size
314
+ p_t = self.transformer.config.patch_size_t
315
+
316
+ base_size_width = self.transformer.config.sample_width // p
317
+ base_size_height = self.transformer.config.sample_height // p
318
+
319
+ if p_t is None:
320
+ # CogVideoX 1.0
321
+ grid_crops_coords = get_resize_crop_region_for_grid(
322
+ (grid_height, grid_width), base_size_width, base_size_height
323
+ )
324
+ freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
325
+ embed_dim=self.transformer.config.attention_head_dim,
326
+ crops_coords=grid_crops_coords,
327
+ grid_size=(grid_height, grid_width),
328
+ temporal_size=num_frames,
329
+ device=device,
330
+ fps_factor=self._base_fps / fps,
331
+ )
332
+ else:
333
+ # CogVideoX 1.5
334
+ base_num_frames = (num_frames + p_t - 1) // p_t
335
+
336
+ freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
337
+ embed_dim=self.transformer.config.attention_head_dim,
338
+ crops_coords=None,
339
+ grid_size=(grid_height, grid_width),
340
+ temporal_size=base_num_frames,
341
+ grid_type="slice",
342
+ max_size=(base_size_height, base_size_width),
343
+ device=device,
344
+ fps_factor=self._base_fps / fps,
345
+ )
346
+
347
+ return freqs_cos, freqs_sin
348
+
349
+ def check_inputs(
350
+ self,
351
+ task,
352
+ image,
353
+ video,
354
+ goal,
355
+ raymap,
356
+ height,
357
+ width,
358
+ num_frames,
359
+ fps,
360
+ ):
361
+ if task not in self._supported_tasks:
362
+ raise ValueError(f"`task` has to be one of {self._supported_tasks}.")
363
+
364
+ if image is None and video is None:
365
+ raise ValueError("`image` or `video` has to be provided.")
366
+
367
+ if image is not None and video is not None:
368
+ raise ValueError("`image` and `video` cannot both be provided.")
369
+
370
+ if image is not None:
371
+ if task == "reconstruction":
372
+ raise ValueError("`image` is not supported for `reconstruction` task.")
373
+ if (
374
+ not isinstance(image, torch.Tensor)
375
+ and not isinstance(image, np.ndarray)
376
+ and not isinstance(image, PIL.Image.Image)
377
+ ):
378
+ raise ValueError(
379
+ "`image` has to be of type `torch.Tensor` or `np.ndarray` or `PIL.Image.Image` but is"
380
+ f" {type(image)}"
381
+ )
382
+
383
+ if goal is not None:
384
+ if task != "planning":
385
+ raise ValueError("`goal` is only supported for `planning` task.")
386
+
387
+ if (
388
+ not isinstance(goal, torch.Tensor)
389
+ and not isinstance(goal, np.ndarray)
390
+ and not isinstance(goal, PIL.Image.Image)
391
+ ):
392
+ raise ValueError(
393
+ "`goal` has to be of type `torch.Tensor` or `np.ndarray` or `PIL.Image.Image` but is"
394
+ f" {type(goal)}"
395
+ )
396
+
397
+ if video is not None:
398
+ if task != "reconstruction":
399
+ raise ValueError("`video` is only supported for `reconstruction` task.")
400
+
401
+ if (
402
+ not isinstance(video, torch.Tensor)
403
+ and not isinstance(video, np.ndarray)
404
+ and not (
405
+ isinstance(video, list)
406
+ and all(isinstance(v, PIL.Image.Image) for v in video)
407
+ )
408
+ ):
409
+ raise ValueError(
410
+ "`video` has to be of type `torch.Tensor` or `np.ndarray` or `List[PIL.Image.Image]` but is"
411
+ f" {type(video)}"
412
+ )
413
+
414
+ if height % 8 != 0 or width % 8 != 0:
415
+ raise ValueError(
416
+ f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
417
+ )
418
+
419
+ if num_frames is None:
420
+ raise ValueError("`num_frames` is required.")
421
+
422
+ if num_frames not in [17, 25, 33, 41]:
423
+ raise ValueError("`num_frames` has to be one of [17, 25, 33, 41].")
424
+
425
+ if fps not in [8, 10, 12, 15, 24]:
426
+ raise ValueError("`fps` has to be one of [8, 10, 12, 15, 24].")
427
+
428
+ if (
429
+ raymap is not None
430
+ and not isinstance(raymap, torch.Tensor)
431
+ and not isinstance(raymap, np.ndarray)
432
+ ):
433
+ raise ValueError(
434
+ "`raymap` has to be of type `torch.Tensor` or `np.ndarray`."
435
+ )
436
+
437
+ if raymap is not None:
438
+ if raymap.shape[-4:] != (
439
+ num_frames,
440
+ 6,
441
+ height // self.vae_scale_factor_spatial,
442
+ width // self.vae_scale_factor_spatial,
443
+ ):
444
+ raise ValueError(
445
+ f"`raymap` shape is not correct. "
446
+ f"Expected {num_frames, 6, height // self.vae_scale_factor_spatial, width // self.vae_scale_factor_spatial}, "
447
+ f"got {raymap.shape}."
448
+ )
449
+
450
+ def _preprocess_image(self, image, height, width):
451
+ if isinstance(image, torch.Tensor):
452
+ image = image.cpu().numpy()
453
+ if image.dtype == np.uint8:
454
+ image = image.astype(np.float32) / 255.0
455
+ if image.ndim == 3:
456
+ image = [image]
457
+ image = imcrop_center(image, height, width)
458
+ image = self.video_processor.preprocess(image, height, width)
459
+ return image
460
+
461
+ def preprocess_inputs(
462
+ self,
463
+ image,
464
+ goal,
465
+ video,
466
+ raymap,
467
+ height,
468
+ width,
469
+ num_frames,
470
+ ):
471
+ if image is not None:
472
+ if isinstance(image, PIL.Image.Image):
473
+ image = self.video_processor.preprocess(
474
+ image, height, width, resize_mode="crop"
475
+ ).to(self._execution_device)
476
+ else:
477
+ image = self._preprocess_image(image, height, width).to(
478
+ self._execution_device
479
+ )
480
+ if goal is not None:
481
+ if isinstance(goal, PIL.Image.Image):
482
+ goal = self.video_processor.preprocess(
483
+ goal, height, width, resize_mode="crop"
484
+ ).to(self._execution_device)
485
+ else:
486
+ goal = self._preprocess_image(goal, height, width).to(
487
+ self._execution_device
488
+ )
489
+ if video is not None:
490
+ if isinstance(video, list) and all(
491
+ isinstance(v, PIL.Image.Image) for v in video
492
+ ):
493
+ video = self.video_processor.preprocess(
494
+ video, height, width, resize_mode="crop"
495
+ ).to(self._execution_device)
496
+ else:
497
+ video = self._preprocess_image(video, height, width).to(
498
+ self._execution_device
499
+ )
500
+ # TODO: check raymap shape
501
+ if raymap is not None:
502
+ if isinstance(raymap, np.ndarray):
503
+ raymap = torch.from_numpy(raymap).to(self._execution_device)
504
+ if raymap.ndim == 4:
505
+ raymap = raymap.unsqueeze(0)
506
+
507
+ return image, goal, video, raymap
508
+
509
+ @torch.no_grad()
510
+ def prepare_latents(
511
+ self,
512
+ image: Optional[torch.Tensor] = None,
513
+ goal: Optional[torch.Tensor] = None,
514
+ video: Optional[torch.Tensor] = None,
515
+ raymap: Optional[torch.Tensor] = None,
516
+ batch_size: int = 1,
517
+ num_frames: int = 13,
518
+ height: int = 60,
519
+ width: int = 90,
520
+ dtype: Optional[torch.dtype] = None,
521
+ device: Optional[torch.device] = None,
522
+ generator: Optional[torch.Generator] = None,
523
+ ):
524
+ if isinstance(generator, list) and len(generator) != batch_size:
525
+ raise ValueError(
526
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
527
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
528
+ )
529
+
530
+ num_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
531
+ shape = (
532
+ batch_size,
533
+ num_frames,
534
+ 56,  # rgb latents + disparity latents + 24 raymap channels; split back out after denoising
535
+ height // self.vae_scale_factor_spatial,
536
+ width // self.vae_scale_factor_spatial,
537
+ )
538
+
539
+ # For CogVideoX 1.5, the latent should add 1 for padding (not used)
540
+ if self.transformer.config.patch_size_t is not None:
541
+ shape = (
542
+ shape[:1]
543
+ + (shape[1] + shape[1] % self.transformer.config.patch_size_t,)
544
+ + shape[2:]
545
+ )
546
+
547
+ if image is not None:
548
+ image = image.unsqueeze(2)
549
+ if isinstance(generator, list):
550
+ image_latents = [
551
+ retrieve_latents(
552
+ self.vae.encode(image[i].unsqueeze(0)), generator[i]
553
+ )
554
+ for i in range(batch_size)
555
+ ]
556
+ else:
557
+ image_latents = [
558
+ retrieve_latents(self.vae.encode(img.unsqueeze(0)), generator)
559
+ for img in image
560
+ ]
561
+
562
+ image_latents = (
563
+ torch.cat(image_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4)
564
+ ) # [B, F, C, H, W]
565
+
566
+ if not self.vae.config.invert_scale_latents:
567
+ image_latents = self.vae_scaling_factor_image * image_latents
568
+ else:
569
+ # This is awkward but required because the CogVideoX team forgot to multiply the
570
+ # scaling factor during training :)
571
+ image_latents = 1 / self.vae_scaling_factor_image * image_latents
572
+
573
+ if goal is not None:
574
+ goal = goal.unsqueeze(2)
575
+ if isinstance(generator, list):
576
+ goal_latents = [
577
+ retrieve_latents(
578
+ self.vae.encode(goal[i].unsqueeze(0)), generator[i]
579
+ )
580
+ for i in range(batch_size)
581
+ ]
582
+ else:
583
+ goal_latents = [
584
+ retrieve_latents(self.vae.encode(img.unsqueeze(0)), generator)
585
+ for img in goal
586
+ ]
587
+
588
+ goal_latents = (
589
+ torch.cat(goal_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4)
590
+ ) # [B, F, C, H, W]
591
+
592
+ if not self.vae.config.invert_scale_latents:
593
+ goal_latents = self.vae_scaling_factor_image * goal_latents
594
+ else:
595
+ # This is awkward but required because the CogVideoX team forgot to multiply the
596
+ # scaling factor during training :)
597
+ goal_latents = 1 / self.vae_scaling_factor_image * goal_latents
598
+
599
+ if video is not None:
600
+ if video.ndim == 4:
601
+ video = video.unsqueeze(0)
602
+
603
+ video = video.permute(0, 2, 1, 3, 4)
604
+ if isinstance(generator, list):
605
+ video_latents = [
606
+ retrieve_latents(
607
+ self.vae.encode(video[i].unsqueeze(0)), generator[i]
608
+ )
609
+ for i in range(batch_size)
610
+ ]
611
+ else:
612
+ video_latents = [
613
+ retrieve_latents(self.vae.encode(img.unsqueeze(0)), generator)
614
+ for img in video
615
+ ]
616
+
617
+ video_latents = (
618
+ torch.cat(video_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4)
619
+ ) # [B, F, C, H, W]
620
+
621
+ if not self.vae.config.invert_scale_latents:
622
+ video_latents = self.vae_scaling_factor_image * video_latents
623
+ else:
624
+ # This is awkward but required because the CogVideoX team forgot to multiply the
625
+ # scaling factor during training :)
626
+ video_latents = 1 / self.vae_scaling_factor_image * video_latents
627
+
628
+ if image is not None and goal is None:
629
+ padding_shape = (
630
+ batch_size,
631
+ num_frames - image_latents.shape[1],
632
+ *image_latents.shape[2:],
633
+ )
634
+ padding = torch.zeros(padding_shape, device=device, dtype=dtype)
635
+ condition_latents = torch.cat([image_latents, padding], dim=1)
636
+ elif goal is not None:
637
+ padding_shape = (
638
+ batch_size,
639
+ num_frames - goal_latents.shape[1] - image_latents.shape[1],
640
+ *image_latents.shape[2:],
641
+ )
642
+ padding = torch.zeros(padding_shape, device=device, dtype=dtype)
643
+ condition_latents = torch.cat([image_latents, padding, goal_latents], dim=1)
644
+ elif video is not None:
645
+ condition_latents = video_latents
646
+
647
+ if raymap is not None:
648
+ if raymap.shape[1] % self.vae_scale_factor_temporal != 0:
649
+ # repeat
650
+ raymap = torch.cat(
651
+ [
652
+ raymap[
653
+ :,
654
+ : self.vae_scale_factor_temporal
655
+ - raymap.shape[1] % self.vae_scale_factor_temporal,
656
+ ],
657
+ raymap,
658
+ ],
659
+ dim=1,
660
+ )
661
+ camera_conditions = rearrange(
662
+ raymap,
663
+ "b (n t) c h w -> b t (n c) h w",
664
+ n=self.vae_scale_factor_temporal,
665
+ )
666
+ else:
667
+ camera_conditions = torch.zeros(
668
+ batch_size,
669
+ num_frames,
670
+ 24,
671
+ height // self.vae_scale_factor_spatial,
672
+ width // self.vae_scale_factor_spatial,
673
+ device=device,
674
+ dtype=dtype,
675
+ )
676
+
677
+ condition_latents = torch.cat([condition_latents, camera_conditions], dim=2)
678
+ latents = randn_tensor(shape, device=device, generator=generator, dtype=dtype)
679
+
680
+ # scale the initial noise by the standard deviation required by the scheduler
681
+ latents = latents * self.scheduler.init_noise_sigma
682
+
683
+ return latents, condition_latents
684
+
685
+ @torch.no_grad()
686
+ def __call__(
687
+ self,
688
+ task: Optional[str] = None,
689
+ image: Optional[PipelineImageInput] = None,
690
+ video: Optional[PipelineImageInput] = None,
691
+ goal: Optional[PipelineImageInput] = None,
692
+ raymap: Optional[Union[torch.Tensor, np.ndarray]] = None,
693
+ height: Optional[int] = None,
694
+ width: Optional[int] = None,
695
+ num_frames: Optional[int] = None,
696
+ num_inference_steps: Optional[int] = None,
697
+ timesteps: Optional[List[int]] = None,
698
+ guidance_scale: Optional[float] = None,
699
+ use_dynamic_cfg: bool = False,
700
+ num_videos_per_prompt: int = 1,
701
+ eta: float = 0.0,
702
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
703
+ return_dict: bool = True,
704
+ attention_kwargs: Optional[Dict] = None,
705
+ fps: Optional[int] = None,
706
+ ) -> Union[AetherV1PipelineOutput, Tuple]:
707
+ if task is None:
708
+ if video is not None:
709
+ task = "reconstruction"
710
+ elif goal is not None:
711
+ task = "planning"
712
+ else:
713
+ task = "prediction"
714
+
715
+ height = (
716
+ height
717
+ or self.transformer.config.sample_height * self.vae_scale_factor_spatial
718
+ )
719
+ width = (
720
+ width
721
+ or self.transformer.config.sample_width * self.vae_scale_factor_spatial
722
+ )
723
+ num_frames = num_frames or self.transformer.config.sample_frames
724
+ fps = fps or self._base_fps
725
+
726
+ num_videos_per_prompt = 1
727
+
728
+ # 1. Check inputs. Raise error if not correct
729
+ self.check_inputs(
730
+ task=task,
731
+ image=image,
732
+ video=video,
733
+ goal=goal,
734
+ raymap=raymap,
735
+ height=height,
736
+ width=width,
737
+ num_frames=num_frames,
738
+ fps=fps,
739
+ )
740
+
741
+ # 2. Preprocess inputs
742
+ image, goal, video, raymap = self.preprocess_inputs(
743
+ image=image,
744
+ goal=goal,
745
+ video=video,
746
+ raymap=raymap,
747
+ height=height,
748
+ width=width,
749
+ num_frames=num_frames,
750
+ )
751
+ self._guidance_scale = guidance_scale
752
+ self._current_timestep = None
753
+ self._attention_kwargs = attention_kwargs
754
+ self._interrupt = False
755
+
756
+ batch_size = 1
757
+
758
+ device = self._execution_device
759
+
760
+ # 3. Encode input prompt
761
+ prompt_embeds = self.empty_prompt_embeds.to(device)
762
+
763
+ num_inference_steps = (
764
+ num_inference_steps or self._default_num_inference_steps[task]
765
+ )
766
+ guidance_scale = guidance_scale or self._default_guidance_scale[task]
767
+ use_dynamic_cfg = use_dynamic_cfg or self._default_use_dynamic_cfg[task]
768
+
769
+ # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
770
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
771
+ # corresponds to doing no classifier free guidance.
772
+ do_classifier_free_guidance = guidance_scale > 1.0
773
+
774
+ # 4. Prepare timesteps
775
+ timesteps, num_inference_steps = retrieve_timesteps(
776
+ self.scheduler, num_inference_steps, device, timesteps
777
+ )
778
+ self._num_timesteps = len(timesteps)
779
+
780
+ # 5. Prepare latents
781
+ latents, condition_latents = self.prepare_latents(
782
+ image,
783
+ goal,
784
+ video,
785
+ raymap,
786
+ batch_size * num_videos_per_prompt,
787
+ num_frames,
788
+ height,
789
+ width,
790
+ prompt_embeds.dtype,
791
+ device,
792
+ generator,
793
+ )
794
+
795
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
796
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
797
+
798
+ # 7. Create rotary embeds if required
799
+ image_rotary_emb = (
800
+ self._prepare_rotary_positional_embeddings(
801
+ height, width, latents.size(1), device, fps=fps
802
+ )
803
+ if self.transformer.config.use_rotary_positional_embeddings
804
+ else None
805
+ )
806
+
807
+ # 8. Create ofs embeds if required
808
+ ofs_emb = (
809
+ None
810
+ if self.transformer.config.ofs_embed_dim is None
811
+ else latents.new_full((1,), fill_value=2.0)
812
+ )
813
+
814
+ # 9. Denoising loop
815
+ num_warmup_steps = max(
816
+ len(timesteps) - num_inference_steps * self.scheduler.order, 0
817
+ )
818
+
819
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
820
+ # for DPM-solver++
821
+ old_pred_original_sample = None
822
+ for i, t in enumerate(timesteps):
823
+ if self.interrupt:
824
+ continue
825
+
826
+ self._current_timestep = t
827
+ latent_model_input = (
828
+ torch.cat([latents] * 2) if do_classifier_free_guidance else latents
829
+ )
830
+ latent_model_input = self.scheduler.scale_model_input(
831
+ latent_model_input, t
832
+ )
833
+
834
+ if do_classifier_free_guidance:
835
+ if task == "planning":
836
+ assert goal is not None
837
+ uncond = condition_latents.clone()
838
+ uncond[:, :, : self.vae.config.latent_channels] = 0
839
+ latent_condition = torch.cat([uncond, condition_latents])
840
+ elif task == "prediction":
841
+ uncond = condition_latents.clone()
842
+ uncond[:, :1, : self.vae.config.latent_channels] = 0
843
+ latent_condition = torch.cat([uncond, condition_latents])
844
+ else:
845
+ raise ValueError(
846
+ f"Task {task} not supported for classifier-free guidance."
847
+ )
848
+
849
+ else:
850
+ latent_condition = condition_latents
851
+
852
+ latent_model_input = torch.cat(
853
+ [latent_model_input, latent_condition], dim=2
854
+ )
855
+
856
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
857
+ timestep = t.expand(latent_model_input.shape[0])
858
+
859
+ # predict noise model_output
860
+ noise_pred = self.transformer(
861
+ hidden_states=latent_model_input,
862
+ encoder_hidden_states=prompt_embeds.repeat(
863
+ latent_model_input.shape[0], 1, 1
864
+ ),
865
+ timestep=timestep,
866
+ ofs=ofs_emb,
867
+ image_rotary_emb=image_rotary_emb,
868
+ attention_kwargs=attention_kwargs,
869
+ return_dict=False,
870
+ )[0]
871
+ noise_pred = noise_pred.float()
872
+
873
+ # perform guidance
874
+ if use_dynamic_cfg:
875
+ self._guidance_scale = 1 + guidance_scale * (
876
+ (
877
+ 1
878
+ - math.cos(
879
+ math.pi
880
+ * (
881
+ (num_inference_steps - t.item())
882
+ / num_inference_steps
883
+ )
884
+ ** 5.0
885
+ )
886
+ )
887
+ / 2
888
+ )
889
+
890
+ if do_classifier_free_guidance:
891
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
892
+ noise_pred = noise_pred_uncond + self.guidance_scale * (
893
+ noise_pred_text - noise_pred_uncond
894
+ )
895
+
896
+ # compute the previous noisy sample x_t -> x_t-1
897
+ if not isinstance(self.scheduler, CogVideoXDPMScheduler):
898
+ latents = self.scheduler.step(
899
+ noise_pred, t, latents, **extra_step_kwargs, return_dict=False
900
+ )[0]
901
+ else:
902
+ latents, old_pred_original_sample = self.scheduler.step(
903
+ noise_pred,
904
+ old_pred_original_sample,
905
+ t,
906
+ timesteps[i - 1] if i > 0 else None,
907
+ latents,
908
+ **extra_step_kwargs,
909
+ return_dict=False,
910
+ )
911
+ latents = latents.to(prompt_embeds.dtype)
912
+
913
+ if i == len(timesteps) - 1 or (
914
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
915
+ ):
916
+ progress_bar.update()
917
+
918
+ self._current_timestep = None
919
+
920
+ rgb_latents = latents[:, :, : self.vae.config.latent_channels]
921
+ disparity_latents = latents[
922
+ :, :, self.vae.config.latent_channels : self.vae.config.latent_channels * 2
923
+ ]
924
+ camera_latents = latents[:, :, self.vae.config.latent_channels * 2 :]
925
+
926
+ rgb_video = self.decode_latents(rgb_latents)
927
+ rgb_video = self.video_processor.postprocess_video(
928
+ video=rgb_video, output_type="np"
929
+ )
930
+
931
+ disparity_video = self.decode_latents(disparity_latents)
932
+ disparity_video = disparity_video.mean(dim=1, keepdim=False)
933
+ disparity_video = disparity_video * 0.5 + 0.5
934
+ disparity_video = torch.square(disparity_video)
935
+ disparity_video = disparity_video.float().cpu().numpy()
936
+
937
+ raymap = (
938
+ rearrange(camera_latents, "b t (n c) h w -> b (n t) c h w", n=4)[
939
+ :, -rgb_video.shape[1] :, :, :
940
+ ]
941
+ .cpu()
942
+ .numpy()
943
+ )
944
+
945
+ # Offload all models
946
+ self.maybe_free_model_hooks()
947
+
948
+ if not return_dict:
949
+ return (
950
+ rgb_video,
951
+ disparity_video,
952
+ raymap,
953
+ )
954
+
955
+ return AetherV1PipelineOutput(
956
+ rgb=rgb_video.squeeze(0),
957
+ disparity=disparity_video.squeeze(0),
958
+ raymap=raymap.squeeze(0),
959
+ )
aether/utils/__init__.py ADDED
File without changes
aether/utils/postprocess_utils.py ADDED
@@ -0,0 +1,842 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Optional
4
+
5
+ import matplotlib
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from einops import rearrange
10
+ from plyfile import PlyData, PlyElement
11
+
12
+
13
+ def signed_log1p_inverse(x):
14
+ """
15
+ Computes the inverse of signed_log1p: x = sign(x) * (exp(abs(x)) - 1).
16
+
17
+ Args:
18
+ y (torch.Tensor): Input tensor (output of signed_log1p).
19
+
20
+ Returns:
21
+ torch.Tensor: Original tensor x.
22
+ """
23
+ if isinstance(x, torch.Tensor):
24
+ return torch.sign(x) * (torch.exp(torch.abs(x)) - 1)
25
+ elif isinstance(x, np.ndarray):
26
+ return np.sign(x) * (np.exp(np.abs(x)) - 1)
27
+ else:
28
+ raise TypeError("Input must be a torch.Tensor or numpy.ndarray")
29
+
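The forward transform is not included in this excerpt; the sketch below is what it would look like, assumed from the inverse above rather than taken from the repository.

```python
def signed_log1p(x):
    """Assumed counterpart of `signed_log1p_inverse`: y = sign(x) * log(1 + |x|)."""
    if isinstance(x, torch.Tensor):
        return torch.sign(x) * torch.log1p(torch.abs(x))
    elif isinstance(x, np.ndarray):
        return np.sign(x) * np.log1p(np.abs(x))
    else:
        raise TypeError("Input must be a torch.Tensor or numpy.ndarray")
```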
30
+
31
+ def colorize_depth(depth, cmap="Spectral"):
32
+ min_d, max_d = (depth[depth > 0]).min(), (depth[depth > 0]).max()
33
+ depth = (max_d - depth) / (max_d - min_d)
34
+
35
+ cm = matplotlib.colormaps[cmap]
36
+ depth = depth.clip(0, 1)
37
+ depth = cm(depth, bytes=False)[..., 0:3]
38
+ return depth
39
+
40
+
41
+ def save_ply(pointmap, image, output_file, downsample=20, mask=None):
42
+ _, h, w, _ = pointmap.shape
43
+ image = image[:, :h, :w]
44
+ pointmap = pointmap[:, :h, :w]
45
+
46
+ points = pointmap.reshape(-1, 3) # (H*W, 3)
47
+ colors = image.reshape(-1, 3) # (H*W, 3)
48
+ if mask is not None:
49
+ points = points[mask.reshape(-1)]
50
+ colors = colors[mask.reshape(-1)]
51
+
52
+ indices = np.random.choice(
53
+ colors.shape[0], int(colors.shape[0] / downsample), replace=False
54
+ )
55
+ points = points[indices]
56
+ colors = colors[indices]
57
+
58
+ vertices = []
59
+ for p, c in zip(points, colors):
60
+ vertex = (p[0], p[1], p[2], int(c[0]), int(c[1]), int(c[2]))
61
+ vertices.append(vertex)
62
+
63
+ vertex_dtype = np.dtype(
64
+ [
65
+ ("x", "f4"),
66
+ ("y", "f4"),
67
+ ("z", "f4"),
68
+ ("red", "u1"),
69
+ ("green", "u1"),
70
+ ("blue", "u1"),
71
+ ]
72
+ )
73
+ vertex_array = np.array(vertices, dtype=vertex_dtype)
74
+
75
+ ply_element = PlyElement.describe(vertex_array, "vertex")
76
+ PlyData([ply_element], text=True).write(output_file)
77
+
78
+
79
+ def fov_to_focal(fovx, fovy, h, w):
80
+ focal_x = w * 0.5 / np.tan(fovx)
81
+ focal_y = h * 0.5 / np.tan(fovy)
82
+ focal = (focal_x + focal_y) / 2
83
+ return focal
84
+
85
+
86
+ def get_rays(pose, h, w, focal=None, fovx=None, fovy=None):
87
+ import torch.nn.functional as F
88
+
89
+ pose = torch.from_numpy(pose).float()
90
+ x, y = torch.meshgrid(
91
+ torch.arange(w),
92
+ torch.arange(h),
93
+ indexing="xy",
94
+ )
95
+ x = x.flatten().unsqueeze(0).repeat(pose.shape[0], 1)
96
+ y = y.flatten().unsqueeze(0).repeat(pose.shape[0], 1)
97
+
98
+ cx = w * 0.5
99
+ cy = h * 0.5
100
+ intrinsics, focal = get_intrinsics(pose.shape[0], h, w, fovx, fovy, focal)
101
+ focal = torch.from_numpy(focal).float()
102
+ camera_dirs = F.pad(
103
+ torch.stack(
104
+ [
105
+ (x - cx + 0.5) / focal.unsqueeze(-1),
106
+ (y - cy + 0.5) / focal.unsqueeze(-1),
107
+ ],
108
+ dim=-1,
109
+ ),
110
+ (0, 1),
111
+ value=1.0,
112
+ ) # [t, hw, 3]
113
+
114
+ pose = pose.to(dtype=camera_dirs.dtype)
115
+ rays_d = camera_dirs @ pose[:, :3, :3].transpose(1, 2) # [t, hw, 3]
116
+
117
+ rays_o = pose[:, :3, 3].unsqueeze(1).expand_as(rays_d) # [hw, 3]
118
+
119
+ rays_o = rays_o.view(pose.shape[0], h, w, 3)
120
+ rays_d = rays_d.view(pose.shape[0], h, w, 3)
121
+
122
+ return rays_o.float().numpy(), rays_d.float().numpy(), intrinsics
123
+
124
+
125
+ def get_intrinsics(batch_size, h, w, fovx=None, fovy=None, focal=None):
126
+ if focal is None:
127
+ focal_x = w * 0.5 / np.tan(fovx)
128
+ focal_y = h * 0.5 / np.tan(fovy)
129
+ focal = (focal_x + focal_y) / 2
130
+ cx = w * 0.5
131
+ cy = h * 0.5
132
+ intrinsics = np.zeros((batch_size, 3, 3))
133
+ intrinsics[:, 0, 0] = focal
134
+ intrinsics[:, 1, 1] = focal
135
+ intrinsics[:, 0, 2] = cx
136
+ intrinsics[:, 1, 2] = cy
137
+ intrinsics[:, 2, 2] = 1.0
138
+
139
+ return intrinsics, focal
140
+
141
+
142
+ def save_pointmap(
143
+ rgb,
144
+ disparity,
145
+ raymap,
146
+ save_file,
147
+ vae_downsample_scale=8,
148
+ camera_pose=None,
149
+ ray_o_scale_inv=1.0,
150
+ max_depth=1e2,
151
+ save_full_pcd_videos=False,
152
+ smooth_camera=False,
153
+ smooth_method="kalman", # or simple
154
+ **kwargs,
155
+ ):
156
+ """
157
+
158
+ Args:
159
+ rgb (numpy.ndarray): Shape of (t, h, w, 3), range [0, 1]
160
+ disparity (numpy.ndarray): Shape of (t, h, w), range [0, 1]
161
+ raymap (numpy.ndarray): Shape of (t, 6, h // 8, w // 8)
162
+ ray_o_scale_inv (float, optional): A `ray_o` scale constant. Defaults to 1.0.
163
+ """
164
+ rgb = np.clip(rgb, 0, 1) * 255
165
+
166
+ pointmap_dict = postprocess_pointmap(
167
+ disparity,
168
+ raymap,
169
+ vae_downsample_scale,
170
+ camera_pose,
171
+ ray_o_scale_inv=ray_o_scale_inv,
172
+ smooth_camera=smooth_camera,
173
+ smooth_method=smooth_method,
174
+ **kwargs,
175
+ )
176
+
177
+ save_ply(
178
+ pointmap_dict["pointmap"],
179
+ rgb,
180
+ save_file,
181
+ mask=(pointmap_dict["depth"] < max_depth),
182
+ )
183
+
184
+ if save_full_pcd_videos:
185
+ pcd_dict = {
186
+ "points": pointmap_dict["pointmap"],
187
+ "colors": rgb,
188
+ "intrinsics": pointmap_dict["intrinsics"],
189
+ "poses": pointmap_dict["camera_pose"],
190
+ "depths": pointmap_dict["depth"],
191
+ }
192
+ np.save(save_file.replace(".ply", "_pcd.npy"), pcd_dict)
193
+
194
+ return pointmap_dict
195
+
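A hedged end-to-end sketch tying this helper to the pipeline output; the `output` variable is assumed to come from `AetherV1PipelineCogVideoX`, and the file name is illustrative.

```python
# Assumed workflow: export an AetherV1PipelineOutput as a downsampled colored point cloud.
save_pointmap(
    rgb=output.rgb,              # (t, h, w, 3), range [0, 1]
    disparity=output.disparity,  # (t, h, w), range [0, 1]
    raymap=output.raymap,        # (t, 6, h // 8, w // 8)
    save_file="scene.ply",
    smooth_camera=True,          # optional pose smoothing, see below
)
```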
196
+
197
+ def raymap_to_poses(
198
+ raymap, camera_pose=None, ray_o_scale_inv=1.0, return_intrinsics=True
199
+ ):
200
+ ts = raymap.shape[0]
201
+ if (not return_intrinsics) and (camera_pose is not None):
202
+ return camera_pose, None, None
203
+
204
+ raymap[:, 3:] = signed_log1p_inverse(raymap[:, 3:])
205
+
206
+ # Extract ray origins and directions
207
+ ray_o = (
208
+ rearrange(raymap[:, 3:], "t c h w -> t h w c") * ray_o_scale_inv
209
+ ) # [T, H, W, C]
210
+ ray_d = rearrange(raymap[:, :3], "t c h w -> t h w c") # [T, H, W, C]
211
+
212
+ # Compute orientation and directions
213
+ orient = ray_o.reshape(ts, -1, 3).mean(axis=1) # T, 3
214
+ image_orient = (ray_o + ray_d).reshape(ts, -1, 3).mean(axis=1) # T, 3
215
+ Focal = np.linalg.norm(image_orient - orient, axis=-1) # T,
216
+ Z_Dir = image_orient - orient # T, 3
217
+
218
+ # Compute the width (W) and field of view (FoV_x)
219
+ W_Left = ray_d[:, :, :1, :].reshape(ts, -1, 3).mean(axis=1)
220
+ W_Right = ray_d[:, :, -1:, :].reshape(ts, -1, 3).mean(axis=1)
221
+ W = W_Right - W_Left
222
+ W_real = (
223
+ np.linalg.norm(np.cross(W, Z_Dir), axis=-1)
224
+ / (raymap.shape[-1] - 1)
225
+ * raymap.shape[-1]
226
+ )
227
+ Fov_x = np.arctan(W_real / (2 * Focal))
228
+
229
+ # Compute the height (H) and field of view (FoV_y)
230
+ H_Up = ray_d[:, :1, :, :].reshape(ts, -1, 3).mean(axis=1)
231
+ H_Down = ray_d[:, -1:, :, :].reshape(ts, -1, 3).mean(axis=1)
232
+ H = H_Up - H_Down
233
+ H_real = (
234
+ np.linalg.norm(np.cross(H, Z_Dir), axis=-1)
235
+ / (raymap.shape[-2] - 1)
236
+ * raymap.shape[-2]
237
+ )
238
+ Fov_y = np.arctan(H_real / (2 * Focal))
239
+
240
+ # Compute X, Y, and Z directions for the camera
241
+ X_Dir = W_Right - W_Left
242
+ Y_Dir = np.cross(Z_Dir, X_Dir)
243
+ X_Dir = np.cross(Y_Dir, Z_Dir)
244
+
245
+ X_Dir /= np.linalg.norm(X_Dir, axis=-1, keepdims=True)
246
+ Y_Dir /= np.linalg.norm(Y_Dir, axis=-1, keepdims=True)
247
+ Z_Dir /= np.linalg.norm(Z_Dir, axis=-1, keepdims=True)
248
+
249
+ # Create the camera-to-world (camera_pose) transformation matrix
250
+ if camera_pose is None:
251
+ camera_pose = np.zeros((ts, 4, 4))
252
+ camera_pose[:, :3, 0] = X_Dir
253
+ camera_pose[:, :3, 1] = Y_Dir
254
+ camera_pose[:, :3, 2] = Z_Dir
255
+ camera_pose[:, :3, 3] = orient
256
+ camera_pose[:, 3, 3] = 1.0
257
+
258
+ return camera_pose, Fov_x, Fov_y
259
+
260
+
261
+ def postprocess_pointmap(
262
+ disparity,
263
+ raymap,
264
+ vae_downsample_scale=8,
265
+ camera_pose=None,
266
+ focal=None,
267
+ ray_o_scale_inv=1.0,
268
+ smooth_camera=False,
269
+ smooth_method="simple",
270
+ **kwargs,
271
+ ):
272
+ """
273
+
274
+ Args:
275
+ disparity (numpy.ndarray): Shape of (t, h, w), range [0, 1]
276
+ raymap (numpy.ndarray): Shape of (t, 6, h // 8, w // 8)
277
+ ray_o_scale_inv (float, optional): A `ray_o` scale constant. Defaults to 1.0.
278
+ """
279
+ depth = np.clip(1.0 / np.clip(disparity, 1e-3, 1), 0, 1e8)
280
+
281
+ camera_pose, fov_x, fov_y = raymap_to_poses(
282
+ raymap,
283
+ camera_pose=camera_pose,
284
+ ray_o_scale_inv=ray_o_scale_inv,
285
+ return_intrinsics=(focal is not None),
286
+ )
287
+ if focal is None:
288
+ focal = fov_to_focal(
289
+ fov_x,
290
+ fov_y,
291
+ int(raymap.shape[2] * vae_downsample_scale),
292
+ int(raymap.shape[3] * vae_downsample_scale),
293
+ )
294
+
295
+ if smooth_camera:
296
+ # Check if sequence is static
297
+ is_static, trans_diff, rot_diff = detect_static_sequence(camera_pose)
298
+
299
+ if is_static:
300
+ print(
301
+ f"Detected static/near-static sequence (trans_diff={trans_diff:.6f}, rot_diff={rot_diff:.6f})"
302
+ )
303
+ # Apply stronger smoothing for static sequences
304
+ camera_pose = adaptive_pose_smoothing(camera_pose, trans_diff, rot_diff)
305
+ else:
306
+ if smooth_method == "simple":
307
+ camera_pose = smooth_poses(
308
+ camera_pose, window_size=5, method="gaussian"
309
+ )
310
+ elif smooth_method == "kalman":
311
+ camera_pose = smooth_trajectory(camera_pose, window_size=5)
312
+
313
+ ray_o, ray_d, intrinsics = get_rays(
314
+ camera_pose,
315
+ int(raymap.shape[2] * vae_downsample_scale),
316
+ int(raymap.shape[3] * vae_downsample_scale),
317
+ focal,
318
+ )
319
+
320
+ pointmap = depth[..., None] * ray_d + ray_o
321
+
322
+ return {
323
+ "pointmap": pointmap,
324
+ "camera_pose": camera_pose,
325
+ "intrinsics": intrinsics,
326
+ "ray_o": ray_o,
327
+ "ray_d": ray_d,
328
+ "depth": depth,
329
+ }
330
+
331
+
332
+ def detect_static_sequence(poses, threshold=0.01):
333
+ """Detect if the camera sequence is static based on pose differences."""
334
+ translations = poses[:, :3, 3]
335
+ rotations = poses[:, :3, :3]
336
+
337
+ # Compute translation differences
338
+ trans_diff = np.linalg.norm(translations[1:] - translations[:-1], axis=1).mean()
339
+
340
+ # Compute rotation differences (using matrix frobenius norm)
341
+ rot_diff = np.linalg.norm(rotations[1:] - rotations[:-1], axis=(1, 2)).mean()
342
+
343
+ return trans_diff < threshold and rot_diff < threshold, trans_diff, rot_diff
344
+
345
+
346
+ def adaptive_pose_smoothing(poses, trans_diff, rot_diff, base_window=5):
347
+ """Apply adaptive smoothing based on motion magnitude."""
348
+ # Increase window size for low motion sequences
349
+ motion_magnitude = trans_diff + rot_diff
350
+ adaptive_window = min(
351
+ 41, max(base_window, int(base_window * (0.1 / max(motion_magnitude, 1e-6))))
352
+ )
353
+
354
+ # Apply stronger smoothing for low motion
355
+ poses_smooth = smooth_poses(poses, window_size=adaptive_window, method="gaussian")
356
+ return poses_smooth
357
+
358
+
359
+ def get_pixel(H, W):
360
+ # get 2D pixels (u, v) for image_a in cam_a pixel space
361
+ u_a, v_a = np.meshgrid(np.arange(W), np.arange(H))
362
+ # u_a = np.flip(u_a, axis=1)
363
+ # v_a = np.flip(v_a, axis=0)
364
+ pixels_a = np.stack(
365
+ [u_a.flatten() + 0.5, v_a.flatten() + 0.5, np.ones_like(u_a.flatten())], axis=0
366
+ )
367
+
368
+ return pixels_a
369
+
370
+
371
+ def project(depth, intrinsic, pose):
372
+ H, W = depth.shape
373
+ pixel = get_pixel(H, W).astype(np.float32)
374
+ points = (np.linalg.inv(intrinsic) @ pixel) * depth.reshape(-1)
375
+ points = pose[:3, :4] @ np.concatenate(
376
+ [points, np.ones((1, points.shape[1]))], axis=0
377
+ )
378
+
379
+ points = points.T.reshape(H, W, 3)
380
+
381
+ return points
382
+
383
+
384
+ def depth_edge(
385
+ depth: torch.Tensor,
386
+ atol: float = None,
387
+ rtol: float = None,
388
+ kernel_size: int = 3,
389
+ mask: Optional[torch.Tensor] = None,
390
+ ) -> torch.BoolTensor:
391
+ """
392
+ Compute the edge mask of a depth map. The edge is defined as the pixels whose neighbors have a large difference in depth.
393
+
394
+ Args:
395
+ depth (torch.Tensor): shape (..., height, width), linear depth map
396
+ atol (float): absolute tolerance
397
+ rtol (float): relative tolerance
398
+
399
+ Returns:
400
+ edge (torch.Tensor): shape (..., height, width) of dtype torch.bool
401
+ """
402
+ is_numpy = isinstance(depth, np.ndarray)
403
+ if is_numpy:
404
+ depth = torch.from_numpy(depth)
405
+ if isinstance(mask, np.ndarray):
406
+ mask = torch.from_numpy(mask)
407
+
408
+ shape = depth.shape
409
+ depth = depth.reshape(-1, 1, *shape[-2:])
410
+ if mask is not None:
411
+ mask = mask.reshape(-1, 1, *shape[-2:])
412
+
413
+ if mask is None:
414
+ diff = F.max_pool2d(
415
+ depth, kernel_size, stride=1, padding=kernel_size // 2
416
+ ) + F.max_pool2d(-depth, kernel_size, stride=1, padding=kernel_size // 2)
417
+ else:
418
+ diff = F.max_pool2d(
419
+ torch.where(mask, depth, -torch.inf),
420
+ kernel_size,
421
+ stride=1,
422
+ padding=kernel_size // 2,
423
+ ) + F.max_pool2d(
424
+ torch.where(mask, -depth, -torch.inf),
425
+ kernel_size,
426
+ stride=1,
427
+ padding=kernel_size // 2,
428
+ )
429
+
430
+ edge = torch.zeros_like(depth, dtype=torch.bool)
431
+ if atol is not None:
432
+ edge |= diff > atol
433
+ if rtol is not None:
434
+ edge |= (diff / depth).nan_to_num_() > rtol
435
+ edge = edge.reshape(*shape)
436
+
437
+ if is_numpy:
438
+ return edge.numpy()
439
+ return edge
440
+
441
+
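A small self-contained check of `depth_edge` on a synthetic depth map with a single discontinuity (toy values; the absolute tolerance is chosen only for illustration):

```python
import torch

depth = torch.ones(1, 8, 8)
depth[..., :, 4:] = 5.0                      # sharp 1 m -> 5 m jump between halves

edge = depth_edge(depth, atol=1.0, kernel_size=3)
print(edge.shape)         # torch.Size([1, 8, 8])
print(edge.any().item())  # True: pixels bordering the jump are flagged
print(edge.all().item())  # False: flat regions stay unflagged
```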
442
+ @torch.jit.script
443
+ def align_rigid(
444
+ p,
445
+ q,
446
+ weights,
447
+ ):
448
+ """Compute a rigid transformation that, when applied to p, minimizes the weighted
449
+ squared distance between transformed points in p and points in q. See "Least-Squares
450
+ Rigid Motion Using SVD" by Olga Sorkine-Hornung and Michael Rabinovich for more
451
+ details (https://igl.ethz.ch/projects/ARAP/svd_rot.pdf).
452
+ """
453
+
454
+ device = p.device
455
+ dtype = p.dtype
456
+ batch, _, _ = p.shape
457
+
458
+ # 1. Compute the centroids of both point sets.
459
+ weights_normalized = weights / (weights.sum(dim=-1, keepdim=True) + 1e-8)
460
+ p_centroid = (weights_normalized[..., None] * p).sum(dim=-2)
461
+ q_centroid = (weights_normalized[..., None] * q).sum(dim=-2)
462
+
463
+ # 2. Compute the centered vectors.
464
+ p_centered = p - p_centroid[..., None, :]
465
+ q_centered = q - q_centroid[..., None, :]
466
+
467
+ # 3. Compute the 3x3 covariance matrix.
468
+ covariance = (q_centered * weights[..., None]).transpose(-1, -2) @ p_centered
469
+
470
+ # 4. Compute the singular value decomposition and then the rotation.
471
+ u, _, vt = torch.linalg.svd(covariance)
472
+ s = torch.eye(3, dtype=dtype, device=device)
473
+ s = s.expand((batch, 3, 3)).contiguous()
474
+ s[..., 2, 2] = (u.det() * vt.det()).sign()
475
+ rotation = u @ s @ vt
476
+
477
+ # 5. Compute the optimal scale
478
+ scale = (
479
+ (torch.einsum("b i j, b k j -> b k i", rotation, p_centered) * q_centered).sum(
480
+ -1
481
+ )
482
+ * weights
483
+ ).sum(-1) / ((p_centered**2).sum(-1) * weights).sum(-1)
484
+ # scale = (torch.einsum("b i j, b k j -> b k i", rotation, p_centered) * q_centered).sum([-1, -2]) / (p_centered**2).sum([-1, -2])
485
+
486
+ # 6. Compute the optimal translation.
487
+ translation = q_centroid - torch.einsum(
488
+ "b i j, b j -> b i", rotation, p_centroid * scale[:, None]
489
+ )
490
+
491
+ return rotation, translation, scale
492
+
493
+
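A sanity-check sketch for `align_rigid`: build the target cloud as a known similarity transform of a random source cloud and confirm that the rotation and scale are recovered (toy data, seeded for reproducibility):

```python
import math
import torch

torch.manual_seed(0)
p = torch.randn(1, 100, 3)                                 # source points
c, s = math.cos(0.3), math.sin(0.3)
R_gt = torch.tensor([[c, -s, 0.0], [s, c, 0.0], [0.0, 0.0, 1.0]])
t_gt = torch.tensor([0.5, -1.0, 0.25])
q = 2.0 * p @ R_gt.T + t_gt                                # scaled, rotated, shifted copy

rotation, translation, scale = align_rigid(p, q, torch.ones(1, 100))
print(torch.allclose(rotation[0], R_gt, atol=1e-4))             # expected: True
print(torch.allclose(scale[0], torch.tensor(2.0), atol=1e-3))   # expected: True
```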
494
+ def align_camera_extrinsics(
495
+ cameras_src: torch.Tensor, # Bx3x4 tensor representing [R | t]
496
+ cameras_tgt: torch.Tensor, # Bx3x4 tensor representing [R | t]
497
+ estimate_scale: bool = True,
498
+ eps: float = 1e-9,
499
+ ):
500
+ """
501
+ Align the source camera extrinsics to the target camera extrinsics.
502
+ NOTE Assume OPENCV convention
503
+
504
+ Args:
505
+ cameras_src (torch.Tensor): Bx3x4 tensor representing [R | t] for source cameras.
506
+ cameras_tgt (torch.Tensor): Bx3x4 tensor representing [R | t] for target cameras.
507
+ estimate_scale (bool, optional): Whether to estimate the scale factor. Default is True.
508
+ eps (float, optional): Small value to avoid division by zero. Default is 1e-9.
509
+
510
+ Returns:
511
+ align_t_R (torch.Tensor): 1x3x3 rotation matrix for alignment.
512
+ align_t_T (torch.Tensor): 1x3 translation vector for alignment.
513
+ align_t_s (float): Scaling factor for alignment.
514
+ """
515
+
516
+ R_src = cameras_src[:, :, :3] # Extracting the rotation matrices from [R | t]
517
+ R_tgt = cameras_tgt[:, :, :3] # Extracting the rotation matrices from [R | t]
518
+
519
+ RRcov = torch.bmm(R_tgt.transpose(2, 1), R_src).mean(0)
520
+ U, _, V = torch.svd(RRcov)
521
+ align_t_R = V @ U.t()
522
+
523
+ T_src = cameras_src[:, :, 3] # Extracting the translation vectors from [R | t]
524
+ T_tgt = cameras_tgt[:, :, 3] # Extracting the translation vectors from [R | t]
525
+
526
+ A = torch.bmm(T_src[:, None], R_src)[:, 0]
527
+ B = torch.bmm(T_tgt[:, None], R_src)[:, 0]
528
+
529
+ Amu = A.mean(0, keepdim=True)
530
+ Bmu = B.mean(0, keepdim=True)
531
+
532
+ if estimate_scale and A.shape[0] > 1:
533
+ # get the scaling component by matching covariances
534
+ # of centered A and centered B
535
+ Ac = A - Amu
536
+ Bc = B - Bmu
537
+ align_t_s = (Ac * Bc).mean() / (Ac**2).mean().clamp(eps)
538
+ else:
539
+ # set the scale to identity
540
+ align_t_s = 1.0
541
+
542
+ # get the translation as the difference between the means of A and B
543
+ align_t_T = Bmu - align_t_s * Amu
544
+
545
+ align_t_R = align_t_R[None]
546
+ return align_t_R, align_t_T, align_t_s
547
+
548
+
549
+ def apply_transformation(
550
+ cameras_src: torch.Tensor, # Bx3x4 tensor representing [R | t]
551
+ align_t_R: torch.Tensor, # 1x3x3 rotation matrix
552
+ align_t_T: torch.Tensor, # 1x3 translation vector
553
+ align_t_s: float, # Scaling factor
554
+ return_extri: bool = True,
555
+ ) -> torch.Tensor:
556
+ """
557
+ Align and transform the source cameras using the provided rotation, translation, and scaling factors.
558
+ NOTE Assume OPENCV convention
559
+
560
+ Args:
561
+ cameras_src (torch.Tensor): Bx3x4 tensor representing [R | t] for source cameras.
562
+ align_t_R (torch.Tensor): 1x3x3 rotation matrix for alignment.
563
+ align_t_T (torch.Tensor): 1x3 translation vector for alignment.
564
+ align_t_s (float): Scaling factor for alignment.
565
+
566
+ Returns:
567
+ aligned_R (torch.Tensor): Bx3x3 tensor representing the aligned rotation matrices.
568
+ aligned_T (torch.Tensor): Bx3 tensor representing the aligned translation vectors.
569
+ """
570
+
571
+ R_src = cameras_src[:, :, :3]
572
+ T_src = cameras_src[:, :, 3]
573
+
574
+ aligned_R = torch.bmm(R_src, align_t_R.expand(R_src.shape[0], 3, 3))
575
+
576
+ # Apply the translation alignment to the source translations
577
+ align_t_T_expanded = align_t_T[..., None].repeat(R_src.shape[0], 1, 1)
578
+ transformed_T = torch.bmm(R_src, align_t_T_expanded)[..., 0]
579
+ aligned_T = transformed_T + T_src * align_t_s
580
+
581
+ if return_extri:
582
+ extri = torch.cat([aligned_R, aligned_T.unsqueeze(-1)], dim=-1)
583
+ return extri
584
+
585
+ return aligned_R, aligned_T
586
+
587
+
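A minimal end-to-end sketch of `align_camera_extrinsics` followed by `apply_transformation`, using toy Bx3x4 extrinsics (identity rotations, a pure x-shift between source and target) so the aligned result can be compared against the target directly:

```python
import numpy as np
import torch

N = 8
src = np.tile(np.eye(4)[:3, :4], (N, 1, 1)).astype(np.float32)
src[:, 0, 3] = np.arange(N, dtype=np.float32) * 0.1            # move along x over time
tgt = src.copy()
tgt[:, :3, 3] += np.array([1.0, 0.0, 0.0], dtype=np.float32)   # shifted target frame

rel_r, rel_t, rel_s = align_camera_extrinsics(
    torch.from_numpy(src), torch.from_numpy(tgt)
)
aligned = apply_transformation(torch.from_numpy(src), rel_r, rel_t, rel_s)
print(aligned.shape)                                           # torch.Size([8, 3, 4])
print(torch.allclose(aligned, torch.from_numpy(tgt), atol=1e-5))  # expected: True
```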
588
+ def slerp(q1, q2, t):
589
+ """Spherical Linear Interpolation between quaternions.
590
+ Args:
591
+ q1: (4,) first quaternion
592
+ q2: (4,) second quaternion
593
+ t: float between 0 and 1
594
+ Returns:
595
+ (4,) interpolated quaternion
596
+ """
597
+ # Compute the cosine of the angle between the two quaternions
598
+ dot = np.sum(q1 * q2)
599
+
600
+ # If the dot product is negative, slerp won't take the shorter path
601
+ # Fix by negating one of the input quaternions
602
+ if dot < 0.0:
603
+ q2 = -q2
604
+ dot = -dot
605
+
606
+ # Threshold for using linear interpolation instead of spherical
607
+ DOT_THRESHOLD = 0.9995
608
+ if dot > DOT_THRESHOLD:
609
+ # If the inputs are too close for comfort, linearly interpolate
610
+ # and normalize the result
611
+ result = q1 + t * (q2 - q1)
612
+ return result / np.linalg.norm(result)
613
+
614
+ # Compute the angle between the quaternions
615
+ theta_0 = np.arccos(dot)
616
+ sin_theta_0 = np.sin(theta_0)
617
+
618
+ # Compute interpolation factors
619
+ theta = theta_0 * t
620
+ sin_theta = np.sin(theta)
621
+
622
+ s0 = np.cos(theta) - dot * sin_theta / sin_theta_0
623
+ s1 = sin_theta / sin_theta_0
624
+
625
+ return (s0 * q1) + (s1 * q2)
626
+
627
+
628
+ def interpolate_poses(pose1, pose2, weight):
629
+ """Interpolate between two camera poses with weight.
630
+ Args:
631
+ pose1: (4, 4) first camera pose
632
+ pose2: (4, 4) second camera pose
633
+ weight: float between 0 and 1, weight for pose1 (1-weight for pose2)
634
+ Returns:
635
+ (4, 4) interpolated pose
636
+ """
637
+ from scipy.spatial.transform import Rotation as R
638
+
639
+ # Extract rotations and translations
640
+ R1 = R.from_matrix(pose1[:3, :3])
641
+ R2 = R.from_matrix(pose2[:3, :3])
642
+ t1 = pose1[:3, 3]
643
+ t2 = pose2[:3, 3]
644
+
645
+ # Get quaternions
646
+ q1 = R1.as_quat()
647
+ q2 = R2.as_quat()
648
+
649
+ # Interpolate rotation using our slerp implementation
650
+ q_interp = slerp(q1, q2, 1 - weight) # 1-weight because weight is for pose1
651
+ R_interp = R.from_quat(q_interp)
652
+
653
+ # Linear interpolation for translation
654
+ t_interp = weight * t1 + (1 - weight) * t2
655
+
656
+ # Construct interpolated pose
657
+ pose_interp = np.eye(4)
658
+ pose_interp[:3, :3] = R_interp.as_matrix()
659
+ pose_interp[:3, 3] = t_interp
660
+
661
+ return pose_interp
662
+
663
+
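A short sketch of `interpolate_poses` halfway between two toy poses (a 90° yaw plus a 2 m x-shift), which should yield a 45° yaw and a 1 m shift:

```python
import numpy as np
from scipy.spatial.transform import Rotation as R

pose_a = np.eye(4)
pose_b = np.eye(4)
pose_b[:3, :3] = R.from_euler("z", 90, degrees=True).as_matrix()
pose_b[:3, 3] = [2.0, 0.0, 0.0]

mid = interpolate_poses(pose_a, pose_b, weight=0.5)
print(np.round(mid[:3, 3], 3))                                      # [1. 0. 0.]
print(R.from_matrix(mid[:3, :3]).as_euler("zyx", degrees=True)[0])  # ~45.0
```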
664
+ def smooth_poses(poses, window_size=5, method="gaussian"):
665
+ """Smooth camera poses temporally.
666
+ Args:
667
+ poses: (N, 4, 4) camera poses
668
+ window_size: int, must be odd number
669
+ method: str, 'gaussian' or 'savgol' or 'ma'
670
+ Returns:
671
+ (N, 4, 4) smoothed poses
672
+ """
673
+ from scipy.ndimage import gaussian_filter1d
674
+ from scipy.signal import savgol_filter
675
+ from scipy.spatial.transform import Rotation as R
676
+
677
+ assert window_size % 2 == 1, "window_size must be odd"
678
+ N = poses.shape[0]
679
+ smoothed = np.zeros_like(poses)
680
+
681
+ # Extract translations and quaternions
682
+ translations = poses[:, :3, 3]
683
+ rotations = R.from_matrix(poses[:, :3, :3])
684
+ quats = rotations.as_quat() # (N, 4)
685
+
686
+ # Ensure consistent quaternion signs to prevent interpolation artifacts
687
+ for i in range(1, N):
688
+ if np.dot(quats[i], quats[i - 1]) < 0:
689
+ quats[i] = -quats[i]
690
+
691
+ # Smooth translations
692
+ if method == "gaussian":
693
+ sigma = window_size / 6.0 # approximately 99.7% of the weight within the window
694
+ smoothed_trans = gaussian_filter1d(translations, sigma, axis=0, mode="nearest")
695
+ smoothed_quats = gaussian_filter1d(quats, sigma, axis=0, mode="nearest")
696
+ elif method == "savgol":
697
+ # Savitzky-Golay filter: polynomial fitting
698
+ poly_order = min(window_size - 1, 3)
699
+ smoothed_trans = savgol_filter(
700
+ translations, window_size, poly_order, axis=0, mode="nearest"
701
+ )
702
+ smoothed_quats = savgol_filter(
703
+ quats, window_size, poly_order, axis=0, mode="nearest"
704
+ )
705
+ elif method == "ma":
706
+ # Simple moving average
707
+ kernel = np.ones(window_size) / window_size
708
+ smoothed_trans = np.array(
709
+ [np.convolve(translations[:, i], kernel, mode="same") for i in range(3)]
710
+ ).T
711
+ smoothed_quats = np.array(
712
+ [np.convolve(quats[:, i], kernel, mode="same") for i in range(4)]
713
+ ).T
+ else:
+ raise ValueError(f"Unknown smoothing method: {method}")
714
+
715
+ # Normalize quaternions
716
+ smoothed_quats /= np.linalg.norm(smoothed_quats, axis=1, keepdims=True)
717
+
718
+ # Reconstruct poses
719
+ smoothed_rots = R.from_quat(smoothed_quats).as_matrix()
720
+
721
+ for i in range(N):
722
+ smoothed[i] = np.eye(4)
723
+ smoothed[i, :3, :3] = smoothed_rots[i]
724
+ smoothed[i, :3, 3] = smoothed_trans[i]
725
+
726
+ return smoothed
727
+
728
+
729
+ def smooth_trajectory(poses, window_size=5):
730
+ """Smooth camera trajectory using Kalman filter.
731
+ Args:
732
+ poses: (N, 4, 4) camera poses
733
+ window_size: int, window size for initial smoothing
734
+ Returns:
735
+ (N, 4, 4) smoothed poses
736
+ """
737
+ from filterpy.kalman import KalmanFilter
738
+ from scipy.spatial.transform import Rotation as R
739
+
740
+ N = poses.shape[0]
741
+
742
+ # Initialize Kalman filter for position and velocity
743
+ kf = KalmanFilter(dim_x=6, dim_z=3) # 3D position and velocity
744
+ dt = 1.0 # assume uniform time steps
745
+
746
+ # State transition matrix
747
+ kf.F = np.array(
748
+ [
749
+ [1, 0, 0, dt, 0, 0],
750
+ [0, 1, 0, 0, dt, 0],
751
+ [0, 0, 1, 0, 0, dt],
752
+ [0, 0, 0, 1, 0, 0],
753
+ [0, 0, 0, 0, 1, 0],
754
+ [0, 0, 0, 0, 0, 1],
755
+ ]
756
+ )
757
+
758
+ # Measurement matrix
759
+ kf.H = np.array([[1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0]])
760
+
761
+ # Measurement noise
762
+ kf.R *= 0.1
763
+
764
+ # Process noise
765
+ kf.Q *= 0.1
766
+
767
+ # Initial state uncertainty
768
+ kf.P *= 1.0
769
+
770
+ # Extract translations and rotations
771
+ translations = poses[:, :3, 3]
772
+ rotations = R.from_matrix(poses[:, :3, :3])
773
+ quats = rotations.as_quat()
774
+
775
+ # First pass: simple smoothing for initial estimates
776
+ smoothed = smooth_poses(poses, window_size, method="gaussian")
777
+ smooth_trans = smoothed[:, :3, 3]
778
+
779
+ # Second pass: Kalman filter for trajectory
780
+ filtered_trans = np.zeros_like(translations)
781
+ kf.x = np.zeros(6)
782
+ kf.x[:3] = smooth_trans[0]
783
+
784
+ filtered_trans[0] = smooth_trans[0]
785
+
786
+ # Forward pass
787
+ for i in range(1, N):
788
+ kf.predict()
789
+ kf.update(smooth_trans[i])
790
+ filtered_trans[i] = kf.x[:3]
791
+
792
+ # Backward smoothing for rotations using SLERP
793
+ window_half = window_size // 2
794
+ smoothed_quats = np.zeros_like(quats)
795
+
796
+ for i in range(N):
797
+ start_idx = max(0, i - window_half)
798
+ end_idx = min(N, i + window_half + 1)
799
+ weights = np.exp(
800
+ -0.5 * ((np.arange(start_idx, end_idx) - i) / (window_half / 2)) ** 2
801
+ )
802
+ weights /= weights.sum()
803
+
804
+ # Weighted average of nearby quaternions
805
+ avg_quat = np.zeros(4)
806
+ for j, w in zip(range(start_idx, end_idx), weights):
807
+ if np.dot(quats[j], quats[i]) < 0:
808
+ avg_quat += w * -quats[j]
809
+ else:
810
+ avg_quat += w * quats[j]
811
+ smoothed_quats[i] = avg_quat / np.linalg.norm(avg_quat)
812
+
813
+ # Reconstruct final smoothed poses
814
+ final_smoothed = np.zeros_like(poses)
815
+ smoothed_rots = R.from_quat(smoothed_quats).as_matrix()
816
+
817
+ for i in range(N):
818
+ final_smoothed[i] = np.eye(4)
819
+ final_smoothed[i, :3, :3] = smoothed_rots[i]
820
+ final_smoothed[i, :3, 3] = filtered_trans[i]
821
+
822
+ return final_smoothed
823
+
824
+
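A minimal usage sketch for the smoothing helpers on a jittery synthetic trajectory (identity rotations, noisy straight-line translation); note that the Kalman-based `smooth_trajectory` additionally requires the `filterpy` package listed in requirements.txt:

```python
import numpy as np

rng = np.random.default_rng(0)
poses = np.tile(np.eye(4), (60, 1, 1))
poses[:, 0, 3] = np.linspace(0.0, 1.0, 60) + 0.01 * rng.standard_normal(60)  # jittery x-translation

smoothed = smooth_poses(poses, window_size=5, method="gaussian")
print(smoothed.shape)                                  # (60, 4, 4)
smoothed_kf = smooth_trajectory(poses, window_size=5)  # Kalman variant, needs filterpy
```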
825
+ def compute_scale(prediction, target, mask):
826
+ if isinstance(prediction, np.ndarray):
827
+ prediction = torch.from_numpy(prediction).float()
828
+ if isinstance(target, np.ndarray):
829
+ target = torch.from_numpy(target).float()
830
+ if isinstance(mask, np.ndarray):
831
+ mask = torch.from_numpy(mask).bool()
832
+
833
+ numerator = torch.sum(mask * prediction * target, (1, 2))
834
+ denominator = torch.sum(mask * prediction * prediction, (1, 2))
835
+
836
+ scale = torch.zeros_like(numerator)
837
+
838
+ valid = (denominator != 0).nonzero()
839
+
840
+ scale[valid] = numerator[valid] / denominator[valid]
841
+
842
+ return scale.item()
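`compute_scale` solves the closed-form least-squares scale s = Σ(m·p·t) / Σ(m·p·p) over masked pixels, which is how overlapping sliding-window disparities are aligned in scripts/demo.py. A toy check (the (1, H, W) shapes mirror that usage):

```python
import numpy as np

prediction = np.random.rand(1, 32, 32).astype(np.float32) + 0.1
target = 2.5 * prediction                      # target is exactly 2.5x the prediction
mask = np.ones_like(prediction, dtype=bool)

print(compute_scale(prediction, target, mask))  # ~2.5
```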
aether/utils/preprocess_utils.py ADDED
@@ -0,0 +1,39 @@
1
+ import numpy as np
2
+
3
+
4
+ def imcrop_center(img_list, crop_p_h, crop_p_w):
5
+ new_img = []
6
+ for i, _img in enumerate(img_list):
7
+ if crop_p_h / crop_p_w > _img.shape[0] / _img.shape[1]: # crop left and right
8
+ start_h = int(0)
9
+ start_w = int((_img.shape[1] - _img.shape[0] / crop_p_h * crop_p_w) / 2)
10
+ crop_size = (_img.shape[0], int(_img.shape[0] / crop_p_h * crop_p_w))
11
+ else:
12
+ start_h = int((_img.shape[0] - _img.shape[1] / crop_p_w * crop_p_h) / 2)
13
+ start_w = int(0)
14
+ crop_size = (int(_img.shape[1] / crop_p_w * crop_p_h), _img.shape[1])
15
+
16
+ _img_src = crop(_img, start_h, start_w, crop_size[0], crop_size[1])
17
+ new_img.append(_img_src)
18
+
19
+ return new_img
20
+
21
+
22
+ def crop(img, start_h, start_w, crop_h, crop_w):
23
+ img_src = np.zeros((crop_h, crop_w, *img.shape[2:]), dtype=img.dtype)
24
+ hsize, wsize = crop_h, crop_w
25
+ dh, dw, sh, sw = start_h, start_w, 0, 0
26
+ if dh < 0:
27
+ sh = -dh
28
+ hsize += dh
29
+ dh = 0
30
+ if dh + hsize > img.shape[0]:
31
+ hsize = img.shape[0] - dh
32
+ if dw < 0:
33
+ sw = -dw
34
+ wsize += dw
35
+ dw = 0
36
+ if dw + wsize > img.shape[1]:
37
+ wsize = img.shape[1] - dw
38
+ img_src[sh : sh + hsize, sw : sw + wsize] = img[dh : dh + hsize, dw : dw + wsize]
39
+ return img_src
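A small sketch of `imcrop_center`, cropping a frame to a 3:4 (h:w) aspect ratio; the 480x720 frame size is an illustrative assumption:

```python
import numpy as np

frame = np.zeros((480, 720, 3), dtype=np.uint8)
cropped = imcrop_center([frame], crop_p_h=3, crop_p_w=4)[0]
print(cropped.shape)  # (480, 640, 3): the width is trimmed symmetrically
```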
aether/utils/visualize_utils.py ADDED
@@ -0,0 +1,255 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Adapted from https://github.com/facebookresearch/vggt/blob/main/visual_util.py
8
+
9
+
10
+ import matplotlib
11
+ import numpy as np
12
+ import trimesh
13
+ from scipy.spatial.transform import Rotation
14
+
15
+ from aether.utils.postprocess_utils import depth_edge
16
+
17
+
18
+ def predictions_to_glb(
19
+ predictions,
20
+ filter_by_frames="all",
21
+ show_cam=True,
22
+ max_depth=100.0,
23
+ rtol=0.03,
24
+ frame_rel_idx: float = 0.0,
25
+ ) -> trimesh.Scene:
26
+ """
27
+ Converts predictions to a 3D scene represented as a GLB file.
28
+
29
+ Args:
30
+ predictions (dict): Dictionary containing model predictions with keys:
31
+ - world_points: 3D point coordinates (S, H, W, 3)
32
+ - images: Input images (S, H, W, 3)
33
+ - depths: Depths (S, H, W)
34
+ - camera_poses: Camera poses (S, 4, 4)
35
+ filter_by_frames (str): Frame filter specification (default: "all")
36
+ show_cam (bool): Include camera visualization (default: True)
37
+ max_depth (float): Maximum depth value (default: 100.0)
38
+ rtol (float): Relative tolerance for depth edge detection (default: 0.03)
39
+ frame_rel_idx (float): Relative index of the frame to visualize (default: 0.0)
40
+ Returns:
41
+ trimesh.Scene: Processed 3D scene containing point cloud and cameras
42
+
43
+ Raises:
44
+ ValueError: If input predictions structure is invalid
45
+ """
46
+ if not isinstance(predictions, dict):
47
+ raise ValueError("predictions must be a dictionary")
48
+
49
+ selected_frame_idx = None
50
+ if filter_by_frames != "all" and filter_by_frames != "All":
51
+ try:
52
+ # Extract the index part before the colon
53
+ selected_frame_idx = int(filter_by_frames.split(":")[0])
54
+ except (ValueError, IndexError):
55
+ pass
56
+
57
+ pred_world_points = predictions["world_points"]
58
+
59
+ # Get images from predictions
60
+ images = predictions["images"]
61
+ # Use extrinsic matrices instead of pred_extrinsic_list
62
+ camera_poses = predictions["camera_poses"]
63
+
64
+ if selected_frame_idx is not None:
65
+ pred_world_points = pred_world_points[selected_frame_idx][None]
66
+ images = images[selected_frame_idx][None]
67
+ camera_poses = camera_poses[selected_frame_idx][None]
68
+
69
+ vertices_3d = pred_world_points.reshape(-1, 3)
70
+ # Handle different image formats - check if images need transposing
71
+ if images.ndim == 4 and images.shape[1] == 3: # NCHW format
72
+ colors_rgb = np.transpose(images, (0, 2, 3, 1))
73
+ else: # Assume already in NHWC format
74
+ colors_rgb = images
75
+ colors_rgb = (colors_rgb.reshape(-1, 3) * 255).astype(np.uint8)
76
+
77
+ depths = predictions["depths"]
78
+ masks = depths < max_depth
79
+ edge = ~depth_edge(depths, rtol=rtol, mask=masks)
80
+ masks = (masks & edge).reshape(-1)
81
+ vertices_3d = vertices_3d[masks]
82
+ colors_rgb = colors_rgb[masks]
83
+
84
+ if vertices_3d is None or np.asarray(vertices_3d).size == 0:
85
+ vertices_3d = np.array([[1, 0, 0]])
86
+ colors_rgb = np.array([[255, 255, 255]])
87
+ scene_scale = 1
88
+ else:
89
+ # Calculate the 5th and 95th percentiles along each axis
90
+ lower_percentile = np.percentile(vertices_3d, 5, axis=0)
91
+ upper_percentile = np.percentile(vertices_3d, 95, axis=0)
92
+
93
+ # Calculate the diagonal length of the percentile bounding box
94
+ scene_scale = np.linalg.norm(upper_percentile - lower_percentile)
95
+
96
+ colormap = matplotlib.colormaps.get_cmap("gist_rainbow")
97
+
98
+ # Initialize a 3D scene
99
+ scene_3d = trimesh.Scene()
100
+
101
+ # Add point cloud data to the scene
102
+ point_cloud_data = trimesh.PointCloud(vertices=vertices_3d, colors=colors_rgb)
103
+
104
+ scene_3d.add_geometry(point_cloud_data)
105
+
106
+ # Prepare 4x4 matrices for camera extrinsics
107
+ num_cameras = len(camera_poses)
108
+ extrinsics_matrices = np.zeros((num_cameras, 4, 4))
109
+ extrinsics_matrices[:, :3, :4] = camera_poses[:, :3, :4]
110
+ extrinsics_matrices[:, 3, 3] = 1
111
+
112
+ if show_cam:
113
+ # Add camera models to the scene
114
+ for i in range(num_cameras):
115
+ camera_to_world = camera_poses[i]
116
+ rgba_color = colormap(frame_rel_idx)
117
+ current_color = tuple(int(255 * x) for x in rgba_color[:3])
118
+
119
+ integrate_camera_into_scene(
120
+ scene_3d, camera_to_world, current_color, scene_scale
121
+ )
122
+
123
+ return scene_3d
124
+
125
+
126
+ def integrate_camera_into_scene(
127
+ scene: trimesh.Scene,
128
+ transform: np.ndarray,
129
+ face_colors: tuple,
130
+ scene_scale: float,
131
+ ):
132
+ """
133
+ Integrates a fake camera mesh into the 3D scene.
134
+
135
+ Args:
136
+ scene (trimesh.Scene): The 3D scene to add the camera model.
137
+ transform (np.ndarray): Transformation matrix for camera positioning.
138
+ face_colors (tuple): Color of the camera face.
139
+ scene_scale (float): Scale of the scene.
140
+ """
141
+
142
+ cam_width = scene_scale * 0.025
143
+ cam_height = scene_scale * 0.05
144
+
145
+ # Create cone shape for camera
146
+ rot_45_degree = np.eye(4)
147
+ rot_45_degree[:3, :3] = Rotation.from_euler("z", 45, degrees=True).as_matrix()
148
+ rot_45_degree[2, 3] = -cam_height
149
+
150
+ opengl_transform = get_opengl_conversion_matrix()
151
+ # Combine transformations
152
+ complete_transform = transform @ opengl_transform @ rot_45_degree
153
+ camera_cone_shape = trimesh.creation.cone(cam_width, cam_height, sections=4)
154
+
155
+ # Generate mesh for the camera
156
+ slight_rotation = np.eye(4)
157
+ slight_rotation[:3, :3] = Rotation.from_euler("z", 2, degrees=True).as_matrix()
158
+
159
+ vertices_combined = np.concatenate(
160
+ [
161
+ camera_cone_shape.vertices,
162
+ 0.95 * camera_cone_shape.vertices,
163
+ transform_points(slight_rotation, camera_cone_shape.vertices),
164
+ ]
165
+ )
166
+ vertices_transformed = transform_points(complete_transform, vertices_combined)
167
+
168
+ mesh_faces = compute_camera_faces(camera_cone_shape)
169
+
170
+ # Add the camera mesh to the scene
171
+ camera_mesh = trimesh.Trimesh(vertices=vertices_transformed, faces=mesh_faces)
172
+ camera_mesh.visual.face_colors[:, :3] = face_colors
173
+ scene.add_geometry(camera_mesh)
174
+
175
+
176
+ def get_opengl_conversion_matrix() -> np.ndarray:
177
+ """
178
+ Constructs and returns the OpenGL conversion matrix.
179
+
180
+ Returns:
181
+ numpy.ndarray: A 4x4 OpenGL conversion matrix.
182
+ """
183
+ # Create an identity matrix
184
+ matrix = np.identity(4)
185
+
186
+ # Flip the y and z axes
187
+ matrix[1, 1] = -1
188
+ matrix[2, 2] = -1
189
+
190
+ return matrix
191
+
192
+
193
+ def transform_points(
194
+ transformation: np.ndarray, points: np.ndarray, dim: int = None
195
+ ) -> np.ndarray:
196
+ """
197
+ Applies a 4x4 transformation to a set of points.
198
+
199
+ Args:
200
+ transformation (np.ndarray): Transformation matrix.
201
+ points (np.ndarray): Points to be transformed.
202
+ dim (int, optional): Dimension for reshaping the result.
203
+
204
+ Returns:
205
+ np.ndarray: Transformed points.
206
+ """
207
+ points = np.asarray(points)
208
+ initial_shape = points.shape[:-1]
209
+ dim = dim or points.shape[-1]
210
+
211
+ # Apply transformation
212
+ transformation = transformation.swapaxes(
213
+ -1, -2
214
+ ) # Transpose the transformation matrix
215
+ points = points @ transformation[..., :-1, :] + transformation[..., -1:, :]
216
+
217
+ # Reshape the result
218
+ result = points[..., :dim].reshape(*initial_shape, dim)
219
+ return result
220
+
221
+
222
+ def compute_camera_faces(cone_shape: trimesh.Trimesh) -> np.ndarray:
223
+ """
224
+ Computes the faces for the camera mesh.
225
+
226
+ Args:
227
+ cone_shape (trimesh.Trimesh): The shape of the camera cone.
228
+
229
+ Returns:
230
+ np.ndarray: Array of faces for the camera mesh.
231
+ """
232
+ # Create pseudo cameras
233
+ faces_list = []
234
+ num_vertices_cone = len(cone_shape.vertices)
235
+
236
+ for face in cone_shape.faces:
237
+ if 0 in face:
238
+ continue
239
+ v1, v2, v3 = face
240
+ v1_offset, v2_offset, v3_offset = face + num_vertices_cone
241
+ v1_offset_2, v2_offset_2, v3_offset_2 = face + 2 * num_vertices_cone
242
+
243
+ faces_list.extend(
244
+ [
245
+ (v1, v2, v2_offset),
246
+ (v1, v1_offset, v3),
247
+ (v3_offset, v2, v3),
248
+ (v1, v2, v2_offset_2),
249
+ (v1, v1_offset_2, v3),
250
+ (v3_offset_2, v2, v3),
251
+ ]
252
+ )
253
+
254
+ faces_list += [(v3, v2, v1) for v1, v2, v3 in faces_list]
255
+ return np.array(faces_list)
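A minimal sketch of exporting a GLB scene from dummy predictions; the array shapes follow the `predictions_to_glb` docstring and the values are placeholders:

```python
import numpy as np

S, H, W = 1, 4, 4
predictions = {
    "world_points": np.random.rand(S, H, W, 3).astype(np.float32),
    "images": np.random.rand(S, H, W, 3).astype(np.float32),   # NHWC in [0, 1]
    "depths": np.full((S, H, W), 2.0, dtype=np.float32),
    "camera_poses": np.tile(np.eye(4, dtype=np.float32)[None], (S, 1, 1)),
}
scene = predictions_to_glb(predictions, max_depth=10.0, rtol=0.03)
scene.export("toy_scene.glb")
```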
app.py DELETED
@@ -1,7 +0,0 @@
1
- import gradio as gr
2
-
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
-
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
assets/example_obs/car.png ADDED

Git LFS Details

  • SHA256: e69a7af1f0aeb161c76f0d6b251b3394ee09705c0d1823207cc280add07a3933
  • Pointer size: 131 Bytes
  • Size of remote file: 475 kB
assets/example_obs/cartoon.png ADDED

Git LFS Details

  • SHA256: 82df67cd9a1393cbaed97ce3052fbf04a60cbb0e2603b8871e6364a432cb086d
  • Pointer size: 131 Bytes
  • Size of remote file: 564 kB
assets/example_obs/garden.jpg ADDED
assets/example_obs/room.jpg ADDED
assets/example_obs_goal/01_goal.png ADDED

Git LFS Details

  • SHA256: 42cecbddab958627643db8651fe18886ab05db51faa1288549efbca3c85e2276
  • Pointer size: 131 Bytes
  • Size of remote file: 514 kB
assets/example_obs_goal/01_obs.png ADDED

Git LFS Details

  • SHA256: 705f540f79b897aa44bb7f09618015def3b71f7a5a251adbef30dca7e323d740
  • Pointer size: 131 Bytes
  • Size of remote file: 451 kB
assets/example_obs_goal/02_goal.png ADDED

Git LFS Details

  • SHA256: ed95356dd64f889a4516ae71196f4e26fdc33927918c35d692013224cfdee0c3
  • Pointer size: 131 Bytes
  • Size of remote file: 506 kB
assets/example_obs_goal/02_obs.png ADDED

Git LFS Details

  • SHA256: 9915c1c5f6d09c1d1486822e4f91aad0a3240bb750d8e2ba87a3383d843065b1
  • Pointer size: 131 Bytes
  • Size of remote file: 578 kB
assets/example_obs_goal/03_goal.png ADDED

Git LFS Details

  • SHA256: ff452a47a87bb8df41b7feb310d6dc430c18e1a4c8d662a186ebf706e37dd521
  • Pointer size: 131 Bytes
  • Size of remote file: 411 kB
assets/example_obs_goal/03_obs.png ADDED

Git LFS Details

  • SHA256: 7d05892ce404b3170c682bf82ea71b215d6549ba1cf4903ad368a9fc6cb8b363
  • Pointer size: 131 Bytes
  • Size of remote file: 365 kB
assets/example_obs_goal/04_goal.png ADDED

Git LFS Details

  • SHA256: cc896d573a888dbf2d77e95e4a901efa239f07ee5cd10787199c45e559ece166
  • Pointer size: 131 Bytes
  • Size of remote file: 588 kB
assets/example_obs_goal/04_obs.png ADDED

Git LFS Details

  • SHA256: 9e3d6ec20f188b5f5501933b75aae89c9c8e057e544a5d10bc562e69b5bfdf6d
  • Pointer size: 131 Bytes
  • Size of remote file: 599 kB
assets/example_raymaps/raymap_backward.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:396d0f398d8a59bc27a1d76b91222a58c8d751c6aff5a3ccc10a56a2beecd540
3
+ size 5313728
assets/example_raymaps/raymap_forward_right.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46f73167495d98d19e86cd47cd062f6d18f2c2882d73c1239effc1b5f039bd32
3
+ size 5313728
assets/example_raymaps/raymap_left_forward.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6e744a0adf4c5f0386ce8c1d586c85f1b17d392978fa34057c638aff5b84ac3
3
+ size 5313728
assets/example_raymaps/raymap_right.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2eab316eab3e94297b0d4d5cffae367484265873ce3c4c98e24bffb4361d4a8d
3
+ size 5313728
assets/example_videos/bridge.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6dcb3a3dff9d02adfa97534fb3cb96b0618c101c049cf37cb81ce375bf71f252
3
+ size 6714828
assets/example_videos/moviegen.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4204806ad057c53068710d9f3db1ad74f3e947be92d1e8c3f3e06343efd0c1c6
3
+ size 2164737
assets/example_videos/nuscenes.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1230424dfae7159140968649e06f0d8a874f0e52f5ad296ba0fa5fd9c1c2d467
3
+ size 6256650
assets/example_videos/veo2.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9206a10f368f4334338c0ca44d6c85aa2d260294d720309a855741d57116ed60
3
+ size 2909620
assets/logo.png ADDED

Git LFS Details

  • SHA256: 1fcc6a3c8e5fc8206ce96ca50f85b06aa337d38354b98b4faef986f06026550e
  • Pointer size: 132 Bytes
  • Size of remote file: 1.29 MB
assets/teaser.png ADDED

Git LFS Details

  • SHA256: 3b9cfe7dbabbb999ad75f78ef3a38ffb0ed9f56303cff3a0d9ebfa90bf29031c
  • Pointer size: 133 Bytes
  • Size of remote file: 11.8 MB
pyproject.toml ADDED
@@ -0,0 +1,30 @@
1
+ [tool.ruff]
2
+ line-length = 88
3
+
4
+ [tool.ruff.lint]
5
+ # Never enforce `E501` (line length violations).
6
+ ignore = ["C901", "E501", "E741", "F402", "F823"]
7
+ select = ["C", "E", "F", "I", "W"]
8
+
9
+ # Ignore import violations in all `__init__.py` files.
10
+ [tool.ruff.lint.per-file-ignores]
11
+ "__init__.py" = ["E402", "F401", "F403", "F811"]
12
+ "aether/*.py" = ["E402"]
13
+ "tests/*.py" = ["E402", "F841"]
14
+
15
+ [tool.ruff.lint.isort]
16
+ lines-after-imports = 2
17
+ known-first-party = ["aether"]
18
+
19
+ [tool.ruff.format]
20
+ # Like Black, use double quotes for strings.
21
+ quote-style = "double"
22
+
23
+ # Like Black, indent with spaces, rather than tabs.
24
+ indent-style = "space"
25
+
26
+ # Like Black, respect magic trailing commas.
27
+ skip-magic-trailing-comma = false
28
+
29
+ # Like Black, automatically detect the appropriate line ending.
30
+ line-ending = "auto"
requirements.txt ADDED
@@ -0,0 +1,41 @@
1
+ accelerate>=1.2.1
2
+ coloredlogs>=15.0.1
3
+ colorlog>=6.9.0
4
+ diffusers>=0.32.2
5
+ easydict>=1.13
6
+ einops>=0.8.0
7
+ hf_transfer>=0.1.8
8
+ huggingface-hub>=0.27.1
9
+ imageio>=2.33.1
10
+ imageio-ffmpeg>=0.5.1
11
+ iopath>=0.1.10
12
+ matplotlib>=3.10.0
13
+ numpy>=1.26.4
14
+ omegaconf>=2.3.0
15
+ opencv-python-headless>=4.10.0.84
16
+ pillow>=11.1.0
17
+ plotly>=5.24.1
18
+ plyfile>=1.1
19
+ pre_commit>=4.0.1
20
+ python-dotenv>=1.0.1
21
+ PyYAML>=6.0.2
22
+ rich>=13.9.4
23
+ rootutils>=1.0.7
24
+ safetensors>=0.5.2
25
+ scikit-image>=0.25.0
26
+ scipy>=1.15.0
27
+ sentencepiece>=0.2.0
28
+ six>=1.17.0
29
+ tokenizers>=0.21.0
30
+ torchaudio>=2.5.1
31
+ torchmetrics>=1.6.1
32
+ torchvision>=0.20.1
33
+ tqdm>=4.67.1
34
+ transformers>=4.48.0
35
+ triton>=3.1.0
36
+ typer>=0.15.1
37
+ typing_extensions>=4.12.2
38
+ viser>=0.2.23
39
+ filterpy
40
+ trimesh
41
+ gradio
scripts/demo.py ADDED
@@ -0,0 +1,614 @@
1
+ import argparse
2
+ import os
3
+ import random
4
+ from typing import List, Optional, Tuple
5
+
6
+ import imageio.v3 as iio
7
+ import numpy as np
8
+ import PIL.Image
9
+ import rootutils
10
+ import torch
11
+ from diffusers import (
12
+ AutoencoderKLCogVideoX,
13
+ CogVideoXDPMScheduler,
14
+ CogVideoXTransformer3DModel,
15
+ )
16
+ from transformers import AutoTokenizer, T5EncoderModel
17
+
18
+
19
+ rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
20
+
21
+ from aether.pipelines.aetherv1_pipeline_cogvideox import ( # noqa: E402
22
+ AetherV1PipelineCogVideoX,
23
+ AetherV1PipelineOutput,
24
+ )
25
+ from aether.utils.postprocess_utils import ( # noqa: E402
26
+ align_camera_extrinsics,
27
+ apply_transformation,
28
+ colorize_depth,
29
+ compute_scale,
30
+ get_intrinsics,
31
+ interpolate_poses,
32
+ postprocess_pointmap,
33
+ project,
34
+ raymap_to_poses,
35
+ )
36
+ from aether.utils.visualize_utils import predictions_to_glb # noqa: E402
37
+
38
+
39
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
40
+
41
+
42
+ def seed_all(seed: int = 0) -> None:
43
+ """
44
+ Set random seeds of all components.
45
+ """
46
+ random.seed(seed)
47
+ np.random.seed(seed)
48
+ torch.manual_seed(seed)
49
+ torch.cuda.manual_seed_all(seed)
50
+
51
+
52
+ def parse_args() -> argparse.Namespace:
53
+ """Parse command line arguments."""
54
+ parser = argparse.ArgumentParser(description="AetherV1-CogvideoX Inference Demo")
55
+
56
+ parser.add_argument(
57
+ "--task",
58
+ type=str,
59
+ required=True,
60
+ choices=["reconstruction", "prediction", "planning"],
61
+ help="Task to perform: 'reconstruction', 'prediction' or 'planning'.",
62
+ )
63
+ parser.add_argument(
64
+ "--video",
65
+ type=str,
66
+ default=None,
67
+ help="Path to a video file. Only used for 'reconstruction' task.",
68
+ )
69
+ parser.add_argument(
70
+ "--image",
71
+ type=str,
72
+ default=None,
73
+ help="Path to an image file. Only used for 'prediction' and 'planning' tasks.",
74
+ )
75
+ parser.add_argument(
76
+ "--goal",
77
+ type=str,
78
+ default=None,
79
+ help="Path to a goal image file. Only used for 'planning' task.",
80
+ )
81
+ parser.add_argument(
82
+ "--raymap_action",
83
+ type=str,
84
+ default=None,
85
+ help="Path to a raymap action file. Should be a numpy array of shape (num_frame, 6, latent_height, latent_width).",
86
+ )
87
+ parser.add_argument(
88
+ "--output_dir",
89
+ type=str,
90
+ default="outputs",
91
+ help="Path to save the outputs.",
92
+ )
93
+ parser.add_argument(
94
+ "--seed",
95
+ type=int,
96
+ default=42,
97
+ help="Random seed.",
98
+ )
99
+ parser.add_argument(
100
+ "--fps",
101
+ type=int,
102
+ default=12,
103
+ choices=[8, 10, 12, 15, 24],
104
+ help="Frames per second. Options: 8, 10, 12, 15, 24.",
105
+ )
106
+ parser.add_argument(
107
+ "--num_inference_steps",
108
+ type=int,
109
+ default=None,
110
+ help="Number of inference steps. If not specified, will use the default number of steps for the task.",
111
+ )
112
+ parser.add_argument(
113
+ "--guidance_scale",
114
+ type=float,
115
+ default=None,
116
+ help="Guidance scale. If not specified, will use the default guidance scale for the task.",
117
+ )
118
+ parser.add_argument(
119
+ "--use_dynamic_cfg",
120
+ action="store_true",
121
+ default=True,
122
+ help="Use dynamic cfg.",
123
+ )
124
+ parser.add_argument(
125
+ "--height",
126
+ type=int,
127
+ default=480,
128
+ help="Height of the output video.",
129
+ )
130
+ parser.add_argument(
131
+ "--width",
132
+ type=int,
133
+ default=720,
134
+ help="Width of the output video.",
135
+ )
136
+ parser.add_argument(
137
+ "--num_frames",
138
+ type=int,
139
+ default=41,
140
+ help="Number of frames to predict.",
141
+ )
142
+ parser.add_argument(
143
+ "--max_depth",
144
+ type=float,
145
+ default=100.0,
146
+ help="Maximum depth of the scene in meters.",
147
+ )
148
+ parser.add_argument(
149
+ "--rtol",
150
+ type=float,
151
+ default=0.03,
152
+ help="Relative tolerance for depth edge detection.",
153
+ )
154
+ parser.add_argument(
155
+ "--cogvideox_pretrained_model_name_or_path",
156
+ type=str,
157
+ default="THUDM/CogVideoX-5b-I2V",
158
+ help="Name or path of the CogVideoX model to use.",
159
+ )
160
+ parser.add_argument(
161
+ "--aether_pretrained_model_name_or_path",
162
+ type=str,
163
+ default="AetherWorldModel/AetherV1-CogVideoX",
164
+ help="Name or path of the Aether model to use.",
165
+ )
166
+ parser.add_argument(
167
+ "--smooth_camera",
168
+ action="store_true",
169
+ default=True,
170
+ help="Smooth the camera trajectory.",
171
+ )
172
+ parser.add_argument(
173
+ "--smooth_method",
174
+ type=str,
175
+ default="kalman",
176
+ choices=["kalman", "simple"],
177
+ help="Smooth method.",
178
+ )
179
+ parser.add_argument(
180
+ "--sliding_window_stride",
181
+ type=int,
182
+ default=24,
183
+ help="Sliding window stride (window size equals to num_frames). Only used for 'reconstruction' task.",
184
+ )
185
+ parser.add_argument(
186
+ "--post_reconstruction",
187
+ action="store_true",
188
+ default=True,
189
+ help="Run reconstruction after prediction for better quality. Only used for 'prediction' and 'planning' tasks.",
190
+ )
191
+ parser.add_argument(
192
+ "--pointcloud_save_frame_interval",
193
+ type=int,
194
+ default=10,
195
+ help="Pointcloud save frame interval.",
196
+ )
197
+ parser.add_argument(
198
+ "--align_pointmaps",
199
+ action="store_true",
200
+ default=False,
201
+ help="Align pointmaps.",
202
+ )
203
+ return parser.parse_args()
204
+
205
+
206
+ def build_pipeline(args: argparse.Namespace) -> AetherV1PipelineCogVideoX:
207
+ pipeline = AetherV1PipelineCogVideoX(
208
+ tokenizer=AutoTokenizer.from_pretrained(
209
+ args.cogvideox_pretrained_model_name_or_path,
210
+ subfolder="tokenizer",
211
+ ),
212
+ text_encoder=T5EncoderModel.from_pretrained(
213
+ args.cogvideox_pretrained_model_name_or_path, subfolder="text_encoder"
214
+ ),
215
+ vae=AutoencoderKLCogVideoX.from_pretrained(
216
+ args.cogvideox_pretrained_model_name_or_path, subfolder="vae"
217
+ ),
218
+ scheduler=CogVideoXDPMScheduler.from_pretrained(
219
+ args.cogvideox_pretrained_model_name_or_path, subfolder="scheduler"
220
+ ),
221
+ transformer=CogVideoXTransformer3DModel.from_pretrained(
222
+ args.aether_pretrained_model_name_or_path, subfolder="transformer"
223
+ ),
224
+ )
225
+ pipeline.vae.enable_slicing()
226
+ pipeline.vae.enable_tiling()
227
+ pipeline.to(device)
228
+ return pipeline
229
+
230
+
231
+ def get_window_starts(
232
+ total_frames: int, sliding_window_size: int, temporal_stride: int
233
+ ) -> List[int]:
234
+ """Calculate window start indices."""
235
+ starts = list(
236
+ range(
237
+ 0,
238
+ total_frames - sliding_window_size + 1,
239
+ temporal_stride,
240
+ )
241
+ )
242
+ if (
243
+ total_frames > sliding_window_size
244
+ and (total_frames - sliding_window_size) % temporal_stride != 0
245
+ ):
246
+ starts.append(total_frames - sliding_window_size)
247
+ return starts
248
+
249
+
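A worked example of the window layout (illustrative numbers): for a 100-frame clip with 41-frame windows and stride 24, a trailing window is appended so the final frames are still covered:

```python
print(get_window_starts(total_frames=100, sliding_window_size=41, temporal_stride=24))
# -> [0, 24, 48, 59]
```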
250
+ def blend_and_merge_window_results(
251
+ window_results: List[AetherV1PipelineOutput],
252
+ window_indices: List[int],
253
+ args: argparse.Namespace,
254
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
255
+ """Blend and merge window results."""
256
+ merged_rgb = None
257
+ merged_disparity = None
258
+ merged_poses = None
259
+ merged_focals = None
260
+ if args.align_pointmaps:
261
+ merged_pointmaps = None
262
+
263
+ w1 = window_results[0].disparity
264
+
265
+ for idx, (window_result, t_start) in enumerate(zip(window_results, window_indices)):
266
+ t_end = t_start + window_result.rgb.shape[0]
267
+ if idx == 0:
268
+ merged_rgb = window_result.rgb
269
+ merged_disparity = window_result.disparity
270
+ pointmap_dict = postprocess_pointmap(
271
+ window_result.disparity,
272
+ window_result.raymap,
273
+ vae_downsample_scale=8,
274
+ ray_o_scale_inv=0.1,
275
+ smooth_camera=args.smooth_camera,
276
+ smooth_method=args.smooth_method if args.smooth_camera else "none",
277
+ )
278
+ merged_poses = pointmap_dict["camera_pose"]
279
+ merged_focals = (
280
+ pointmap_dict["intrinsics"][:, 0, 0]
281
+ + pointmap_dict["intrinsics"][:, 1, 1]
282
+ ) / 2
283
+ if args.align_pointmaps:
284
+ merged_pointmaps = pointmap_dict["pointmap"]
285
+ else:
286
+ overlap_t = window_indices[idx - 1] + window_result.rgb.shape[0] - t_start
287
+
288
+ window_disparity = window_result.disparity
289
+
290
+ # Align disparity
291
+ disp_mask = window_disparity[:overlap_t].reshape(1, -1, w1.shape[-1]) > 0.1
292
+ scale = compute_scale(
293
+ window_disparity[:overlap_t].reshape(1, -1, w1.shape[-1]),
294
+ merged_disparity[-overlap_t:].reshape(1, -1, w1.shape[-1]),
295
+ disp_mask.reshape(1, -1, w1.shape[-1]),
296
+ )
297
+ window_disparity = scale * window_disparity
298
+
299
+ # Blend disparity
300
+ result_disparity = np.ones((t_end, *w1.shape[1:]))
301
+ result_disparity[:t_start] = merged_disparity[:t_start]
302
+ result_disparity[t_start + overlap_t :] = window_disparity[overlap_t:]
303
+ weight = np.linspace(1, 0, overlap_t)[:, None, None]
304
+ result_disparity[t_start : t_start + overlap_t] = merged_disparity[
305
+ t_start : t_start + overlap_t
306
+ ] * weight + window_disparity[:overlap_t] * (1 - weight)
307
+ merged_disparity = result_disparity
308
+
309
+ # Blend RGB
310
+ result_rgb = np.ones((t_end, *w1.shape[1:], 3))
311
+ result_rgb[:t_start] = merged_rgb[:t_start]
312
+ result_rgb[t_start + overlap_t :] = window_result.rgb[overlap_t:]
313
+ weight_rgb = np.linspace(1, 0, overlap_t)[:, None, None, None]
314
+ result_rgb[t_start : t_start + overlap_t] = merged_rgb[
315
+ t_start : t_start + overlap_t
316
+ ] * weight_rgb + window_result.rgb[:overlap_t] * (1 - weight_rgb)
317
+ merged_rgb = result_rgb
318
+
319
+ # Align poses
320
+ window_raymap = window_result.raymap
321
+ window_poses, window_Fov_x, window_Fov_y = raymap_to_poses(
322
+ window_raymap, ray_o_scale_inv=0.1
323
+ )
324
+ rel_r, rel_t, rel_s = align_camera_extrinsics(
325
+ torch.from_numpy(window_poses[:overlap_t]),
326
+ torch.from_numpy(merged_poses[-overlap_t:]),
327
+ )
328
+ aligned_window_poses = (
329
+ apply_transformation(
330
+ torch.from_numpy(window_poses),
331
+ rel_r,
332
+ rel_t,
333
+ rel_s,
334
+ return_extri=True,
335
+ )
336
+ .cpu()
337
+ .numpy()
338
+ )
339
+
340
+ result_poses = np.ones((t_end, 4, 4))
341
+ result_poses[:t_start] = merged_poses[:t_start]
342
+ result_poses[t_start + overlap_t :] = aligned_window_poses[overlap_t:]
343
+
344
+ # Interpolate poses in overlap region
345
+ weights = np.linspace(1, 0, overlap_t)
346
+ for t in range(overlap_t):
347
+ weight = weights[t]
348
+ pose1 = merged_poses[t_start + t]
349
+ pose2 = aligned_window_poses[t]
350
+ result_poses[t_start + t] = interpolate_poses(pose1, pose2, weight)
351
+
352
+ merged_poses = result_poses
353
+
354
+ # Align intrinsics
355
+ window_intrinsics, _ = get_intrinsics(
356
+ batch_size=window_poses.shape[0],
357
+ h=window_result.disparity.shape[1],
358
+ w=window_result.disparity.shape[2],
359
+ fovx=window_Fov_x,
360
+ fovy=window_Fov_y,
361
+ )
362
+ window_focals = (
363
+ window_intrinsics[:, 0, 0] + window_intrinsics[:, 1, 1]
364
+ ) / 2
365
+ scale = (merged_focals[-overlap_t:] / window_focals[:overlap_t]).mean()
366
+ window_focals = scale * window_focals
367
+ result_focals = np.ones((t_end,))
368
+ result_focals[:t_start] = merged_focals[:t_start]
369
+ result_focals[t_start + overlap_t :] = window_focals[overlap_t:]
370
+ weight = np.linspace(1, 0, overlap_t)
371
+ result_focals[t_start : t_start + overlap_t] = merged_focals[
372
+ t_start : t_start + overlap_t
373
+ ] * weight + window_focals[:overlap_t] * (1 - weight)
374
+ merged_focals = result_focals
375
+
376
+ if args.align_pointmaps:
377
+ # Align pointmaps
378
+ window_pointmaps = postprocess_pointmap(
379
+ result_disparity[t_start:],
380
+ window_raymap,
381
+ vae_downsample_scale=8,
382
+ camera_pose=aligned_window_poses,
383
+ focal=window_focals,
384
+ ray_o_scale_inv=0.1,
385
+ smooth_camera=args.smooth_camera,
386
+ smooth_method=args.smooth_method if args.smooth_camera else "none",
387
+ )
388
+ result_pointmaps = np.ones((t_end, *w1.shape[1:], 3))
389
+ result_pointmaps[:t_start] = merged_pointmaps[:t_start]
390
+ result_pointmaps[t_start + overlap_t :] = window_pointmaps["pointmap"][
391
+ overlap_t:
392
+ ]
393
+ weight = np.linspace(1, 0, overlap_t)[:, None, None, None]
394
+ result_pointmaps[t_start : t_start + overlap_t] = merged_pointmaps[
395
+ t_start : t_start + overlap_t
396
+ ] * weight + window_pointmaps["pointmap"][:overlap_t] * (1 - weight)
397
+ merged_pointmaps = result_pointmaps
398
+
399
+ # project to pointmaps
400
+ intrinsics = [
401
+ np.array([[f, 0, 0.5 * args.width], [0, f, 0.5 * args.height], [0, 0, 1]])
402
+ for f in merged_focals
403
+ ]
404
+ if args.align_pointmaps:
405
+ pointmaps = merged_pointmaps
406
+ else:
407
+ pointmaps = np.stack(
408
+ [
409
+ project(
410
+ 1 / np.clip(merged_disparity[i], 1e-8, 1e8),
411
+ intrinsics[i],
412
+ merged_poses[i],
413
+ )
414
+ for i in range(merged_poses.shape[0])
415
+ ]
416
+ )
417
+
418
+ return merged_rgb, merged_disparity, merged_poses, pointmaps
419
+
420
+
421
+ def save_output(
422
+ rgb: np.ndarray,
423
+ disparity: np.ndarray,
424
+ poses: Optional[np.ndarray] = None,
425
+ raymap: Optional[np.ndarray] = None,
426
+ pointmap: Optional[np.ndarray] = None,
427
+ args: argparse.Namespace = None,
428
+ ) -> None:
429
+ output_dir = args.output_dir
430
+ os.makedirs(output_dir, exist_ok=True)
431
+
432
+ if pointmap is None:
433
+ assert raymap is not None, "Raymap is required for saving pointmap."
434
+ pointmap_dict = postprocess_pointmap(
435
+ disparity,
436
+ raymap,
437
+ vae_downsample_scale=8,
438
+ ray_o_scale_inv=0.1,
439
+ smooth_camera=args.smooth_camera,
440
+ smooth_method=args.smooth_method,
441
+ )
442
+ pointmap = pointmap_dict["pointmap"]
443
+
444
+ if poses is None:
445
+ assert raymap is not None, "Raymap is required for saving poses."
446
+ poses, _, _ = raymap_to_poses(raymap, ray_o_scale_inv=0.1)
447
+
448
+ if args.task == "reconstruction":
449
+ filename = f"reconstruction_{args.video.split('/')[-1].split('.')[0]}"
450
+ elif args.task == "prediction":
451
+ filename = f"prediction_{args.image.split('/')[-1].split('.')[0]}"
452
+ elif args.task == "planning":
453
+ filename = f"planning_{args.image.split('/')[-1].split('.')[0]}_{args.goal.split('/')[-1].split('.')[0]}"
454
+
455
+ filename = os.path.join(output_dir, filename)
456
+
457
+ iio.imwrite(
458
+ f"{filename}_rgb.mp4",
459
+ (np.clip(rgb, 0, 1) * 255).astype(np.uint8),
460
+ fps=12,
461
+ )
462
+ iio.imwrite(
463
+ f"{filename}_disparity.mp4",
464
+ (colorize_depth(disparity) * 255).astype(np.uint8),
465
+ fps=12,
466
+ )
467
+
468
+ print("Building GLB scene")
469
+ for frame_idx in range(pointmap.shape[0])[:: args.pointcloud_save_frame_interval]:
470
+ predictions = {
471
+ "world_points": pointmap[frame_idx : frame_idx + 1],
472
+ "images": rgb[frame_idx : frame_idx + 1],
473
+ "depths": 1 / np.clip(disparity[frame_idx : frame_idx + 1], 1e-8, 1e8),
474
+ "camera_poses": poses[frame_idx : frame_idx + 1],
475
+ }
476
+ scene_3d = predictions_to_glb(
477
+ predictions,
478
+ filter_by_frames="all",
479
+ show_cam=True,
480
+ max_depth=args.max_depth,
481
+ rtol=args.rtol,
482
+ frame_rel_idx=float(frame_idx) / pointmap.shape[0],
483
+ )
484
+ scene_3d.export(f"{filename}_pointcloud_frame_{frame_idx}.glb")
485
+ print("GLB Scene built")
486
+
487
+
488
+ def main() -> None:
489
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
490
+ args = parse_args()
491
+ seed_all(args.seed)
492
+
493
+ if args.num_inference_steps is None:
494
+ args.num_inference_steps = 4 if args.task == "reconstruction" else 50
495
+
496
+ if args.guidance_scale is None:
497
+ args.guidance_scale = 1.0 if args.task == "reconstruction" else 3.0
498
+
499
+ pipeline = build_pipeline(args)
500
+
501
+ if args.task == "reconstruction":
502
+ assert args.video is not None, "Video is required for reconstruction task."
503
+ assert args.image is None, "Image is not required for reconstruction task."
504
+ assert args.goal is None, "Goal is not required for reconstruction task."
505
+
506
+ video = iio.imread(args.video).astype(np.float32) / 255.0
507
+ image, goal = None, None
508
+ elif args.task == "prediction":
509
+ assert args.image is not None, "Image is required for prediction task."
510
+ assert args.goal is None, "Goal is not required for prediction task."
511
+
512
+ image = PIL.Image.open(args.image)
513
+ video, goal = None, None
514
+ elif args.task == "planning":
515
+ assert args.image is not None, "Image is required for planning task."
516
+ assert args.goal is not None, "Goal is required for planning task."
517
+
518
+ image = PIL.Image.open(args.image)
519
+ goal = PIL.Image.open(args.goal)
520
+
521
+ video = None
522
+
523
+ if args.raymap_action is not None:
524
+ raymap = np.load(args.raymap_action)
525
+ else:
526
+ raymap = None
527
+
528
+ if args.task != "reconstruction":
529
+ output = pipeline(
530
+ task=args.task,
531
+ image=image,
532
+ video=video,
533
+ goal=goal,
534
+ raymap=raymap,
535
+ height=args.height,
536
+ width=args.width,
537
+ num_frames=args.num_frames,
538
+ fps=args.fps,
539
+ num_inference_steps=args.num_inference_steps,
540
+ guidance_scale=args.guidance_scale,
541
+ use_dynamic_cfg=args.use_dynamic_cfg,
542
+ generator=torch.Generator(device=device).manual_seed(args.seed),
543
+ return_dict=True,
544
+ )
545
+ if not args.post_reconstruction:
546
+ save_output(
547
+ rgb=output.rgb,
548
+ disparity=output.disparity,
549
+ raymap=output.raymap,
550
+ args=args,
551
+ )
552
+ else:
553
+ recon_output = pipeline(
554
+ task="reconstruction",
555
+ video=output.rgb,
556
+ height=args.height,
557
+ width=args.width,
558
+ num_frames=args.num_frames,
559
+ fps=args.fps,
560
+ num_inference_steps=4,
561
+ guidance_scale=1.0, # guidance is not needed for the reconstruction task
562
+ use_dynamic_cfg=False,
563
+ generator=torch.Generator(device=device).manual_seed(args.seed),
564
+ )
565
+ save_output(
566
+ rgb=output.rgb,
567
+ disparity=recon_output.disparity,
568
+ raymap=recon_output.raymap,
569
+ args=args,
570
+ )
571
+ else:
572
+ # for the reconstruction task, we run a sliding window over long videos
573
+ window_results = []
574
+ window_indices = get_window_starts(
575
+ len(video), args.num_frames, args.sliding_window_stride
576
+ )
577
+ for start_idx in window_indices:
578
+ output = pipeline(
579
+ task=args.task,
580
+ image=None,
581
+ goal=None,
582
+ video=video[start_idx : start_idx + args.num_frames],
583
+ raymap=raymap[start_idx : start_idx + args.num_frames]
584
+ if raymap is not None
585
+ else None,
586
+ height=args.height,
587
+ width=args.width,
588
+ num_frames=args.num_frames,
589
+ fps=args.fps,
590
+ num_inference_steps=args.num_inference_steps,
591
+ guidance_scale=1.0, # guidance is not needed for the reconstruction task
592
+ use_dynamic_cfg=False,
593
+ generator=torch.Generator(device=device).manual_seed(args.seed),
594
+ )
595
+ window_results.append(output)
596
+
597
+ # merge window results
598
+ (
599
+ merged_rgb,
600
+ merged_disparity,
601
+ merged_poses,
602
+ pointmaps,
603
+ ) = blend_and_merge_window_results(window_results, window_indices, args)
604
+ save_output(
605
+ rgb=merged_rgb,
606
+ disparity=merged_disparity,
607
+ poses=merged_poses,
608
+ pointmap=pointmaps,
609
+ args=args,
610
+ )
611
+
612
+
613
+ if __name__ == "__main__":
614
+ main()
scripts/demo_gradio.py ADDED
@@ -0,0 +1,1470 @@
1
+ import gc
2
+ import os
3
+ import random
4
+ import re
5
+ from datetime import datetime
6
+ from typing import Dict, List, Optional, Tuple
7
+
8
+ import gradio as gr
9
+ import imageio.v3 as iio
10
+ import numpy as np
11
+ import PIL.Image
12
+ import rootutils
13
+ import torch
14
+ from diffusers import (
15
+ AutoencoderKLCogVideoX,
16
+ CogVideoXDPMScheduler,
17
+ CogVideoXTransformer3DModel,
18
+ )
19
+ from transformers import AutoTokenizer, T5EncoderModel
20
+
21
+
22
+ rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
23
+
24
+ from aether.pipelines.aetherv1_pipeline_cogvideox import ( # noqa: E402
25
+ AetherV1PipelineCogVideoX,
26
+ AetherV1PipelineOutput,
27
+ )
28
+ from aether.utils.postprocess_utils import ( # noqa: E402
29
+ align_camera_extrinsics,
30
+ apply_transformation,
31
+ colorize_depth,
32
+ compute_scale,
33
+ get_intrinsics,
34
+ interpolate_poses,
35
+ postprocess_pointmap,
36
+ project,
37
+ raymap_to_poses,
38
+ )
39
+ from aether.utils.visualize_utils import predictions_to_glb # noqa: E402
40
+
41
+
42
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
43
+
44
+
45
+ def seed_all(seed: int = 0) -> None:
46
+ """
47
+ Set random seeds of all components.
48
+ """
49
+ random.seed(seed)
50
+ np.random.seed(seed)
51
+ torch.manual_seed(seed)
52
+ torch.cuda.manual_seed_all(seed)
53
+
54
+
55
+ # Global pipeline
56
+ cogvideox_pretrained_model_name_or_path: str = "THUDM/CogVideoX-5b-I2V"
57
+ aether_pretrained_model_name_or_path: str = "AetherWorldModel/AetherV1"
58
+ pipeline = AetherV1PipelineCogVideoX(
59
+ tokenizer=AutoTokenizer.from_pretrained(
60
+ cogvideox_pretrained_model_name_or_path,
61
+ subfolder="tokenizer",
62
+ ),
63
+ text_encoder=T5EncoderModel.from_pretrained(
64
+ cogvideox_pretrained_model_name_or_path, subfolder="text_encoder"
65
+ ),
66
+ vae=AutoencoderKLCogVideoX.from_pretrained(
67
+ cogvideox_pretrained_model_name_or_path, subfolder="vae"
68
+ ),
69
+ scheduler=CogVideoXDPMScheduler.from_pretrained(
70
+ cogvideox_pretrained_model_name_or_path, subfolder="scheduler"
71
+ ),
72
+ transformer=CogVideoXTransformer3DModel.from_pretrained(
73
+ aether_pretrained_model_name_or_path, subfolder="transformer"
74
+ ),
75
+ )
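+ # VAE slicing and tiling keep decoder memory bounded when decoding long,
+ # high-resolution latent videos; the assembled pipeline is then moved to the
+ # selected device below.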
76
+ pipeline.vae.enable_slicing()
77
+ pipeline.vae.enable_tiling()
78
+ pipeline.to(device)
79
+
80
+
81
+ def build_pipeline() -> AetherV1PipelineCogVideoX:
82
+ """Initialize the model pipeline."""
83
+ return pipeline
84
+
85
+
86
+ def get_window_starts(
87
+ total_frames: int, sliding_window_size: int, temporal_stride: int
88
+ ) -> List[int]:
89
+ """Calculate window start indices."""
90
+ starts = list(
91
+ range(
92
+ 0,
93
+ total_frames - sliding_window_size + 1,
94
+ temporal_stride,
95
+ )
96
+ )
97
+ if (
98
+ total_frames > sliding_window_size
99
+ and (total_frames - sliding_window_size) % temporal_stride != 0
100
+ ):
101
+ starts.append(total_frames - sliding_window_size)
102
+ return starts
103
+
104
+
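+ # Worked example (illustrative values, not used by the app): with
+ # total_frames=100, sliding_window_size=41 and temporal_stride=24,
+ # get_window_starts(100, 41, 24) returns [0, 24, 48, 59]; the trailing 59 is
+ # appended so the final window still ends exactly at the last frame.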
105
+ def blend_and_merge_window_results(
106
+ window_results: List[AetherV1PipelineOutput], window_indices: List[int], args: Dict
107
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
108
+ """Blend and merge window results."""
109
+ merged_rgb = None
110
+ merged_disparity = None
111
+ merged_poses = None
112
+ merged_focals = None
113
+ align_pointmaps = args.get("align_pointmaps", True)
114
+ smooth_camera = args.get("smooth_camera", True)
115
+ smooth_method = args.get("smooth_method", "kalman") if smooth_camera else "none"
116
+
117
+ if align_pointmaps:
118
+ merged_pointmaps = None
119
+
120
+ w1 = window_results[0].disparity
121
+
122
+ for idx, (window_result, t_start) in enumerate(zip(window_results, window_indices)):
123
+ t_end = t_start + window_result.rgb.shape[0]
124
+ if idx == 0:
125
+ merged_rgb = window_result.rgb
126
+ merged_disparity = window_result.disparity
127
+ pointmap_dict = postprocess_pointmap(
128
+ window_result.disparity,
129
+ window_result.raymap,
130
+ vae_downsample_scale=8,
131
+ ray_o_scale_inv=0.1,
132
+ smooth_camera=smooth_camera,
133
+ smooth_method=smooth_method if smooth_camera else "none",
134
+ )
135
+ merged_poses = pointmap_dict["camera_pose"]
136
+ merged_focals = (
137
+ pointmap_dict["intrinsics"][:, 0, 0]
138
+ + pointmap_dict["intrinsics"][:, 1, 1]
139
+ ) / 2
140
+ if align_pointmaps:
141
+ merged_pointmaps = pointmap_dict["pointmap"]
142
+ else:
143
+ overlap_t = window_indices[idx - 1] + window_result.rgb.shape[0] - t_start
144
+
145
+ window_disparity = window_result.disparity
146
+
147
+ # Align disparity
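+ # Rescale the new window's disparity so it agrees with the already-merged
+ # disparity over the overlapping frames; pixels with disparity <= 0.1 are
+ # masked out of the scale estimate.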
148
+ disp_mask = window_disparity[:overlap_t].reshape(1, -1, w1.shape[-1]) > 0.1
149
+ scale = compute_scale(
150
+ window_disparity[:overlap_t].reshape(1, -1, w1.shape[-1]),
151
+ merged_disparity[-overlap_t:].reshape(1, -1, w1.shape[-1]),
152
+ disp_mask.reshape(1, -1, w1.shape[-1]),
153
+ )
154
+ window_disparity = scale * window_disparity
155
+
156
+ # Blend disparity
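+ # The overlapping frames are cross-faded with a linear weight running from 1
+ # to 0, so the already-merged disparity dominates at the start of the overlap
+ # and the newly aligned window takes over towards its end.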
157
+ result_disparity = np.ones((t_end, *w1.shape[1:]))
158
+ result_disparity[:t_start] = merged_disparity[:t_start]
159
+ result_disparity[t_start + overlap_t :] = window_disparity[overlap_t:]
160
+ weight = np.linspace(1, 0, overlap_t)[:, None, None]
161
+ result_disparity[t_start : t_start + overlap_t] = merged_disparity[
162
+ t_start : t_start + overlap_t
163
+ ] * weight + window_disparity[:overlap_t] * (1 - weight)
164
+ merged_disparity = result_disparity
165
+
166
+ # Blend RGB
167
+ result_rgb = np.ones((t_end, *w1.shape[1:], 3))
168
+ result_rgb[:t_start] = merged_rgb[:t_start]
169
+ result_rgb[t_start + overlap_t :] = window_result.rgb[overlap_t:]
170
+ weight_rgb = np.linspace(1, 0, overlap_t)[:, None, None, None]
171
+ result_rgb[t_start : t_start + overlap_t] = merged_rgb[
172
+ t_start : t_start + overlap_t
173
+ ] * weight_rgb + window_result.rgb[:overlap_t] * (1 - weight_rgb)
174
+ merged_rgb = result_rgb
175
+
176
+ # Align poses
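+ # Estimate a rotation/translation/scale that maps the new window's first
+ # overlap_t poses onto the corresponding merged poses, then apply it to the
+ # whole window so both camera trajectories share one coordinate frame.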
177
+ window_raymap = window_result.raymap
178
+ window_poses, window_Fov_x, window_Fov_y = raymap_to_poses(
179
+ window_raymap, ray_o_scale_inv=0.1
180
+ )
181
+ rel_r, rel_t, rel_s = align_camera_extrinsics(
182
+ torch.from_numpy(window_poses[:overlap_t]),
183
+ torch.from_numpy(merged_poses[-overlap_t:]),
184
+ )
185
+ aligned_window_poses = (
186
+ apply_transformation(
187
+ torch.from_numpy(window_poses),
188
+ rel_r,
189
+ rel_t,
190
+ rel_s,
191
+ return_extri=True,
192
+ )
193
+ .cpu()
194
+ .numpy()
195
+ )
196
+
197
+ result_poses = np.ones((t_end, 4, 4))
198
+ result_poses[:t_start] = merged_poses[:t_start]
199
+ result_poses[t_start + overlap_t :] = aligned_window_poses[overlap_t:]
200
+
201
+ # Interpolate poses in overlap region
202
+ weights = np.linspace(1, 0, overlap_t)
203
+ for t in range(overlap_t):
204
+ weight = weights[t]
205
+ pose1 = merged_poses[t_start + t]
206
+ pose2 = aligned_window_poses[t]
207
+ result_poses[t_start + t] = interpolate_poses(pose1, pose2, weight)
208
+
209
+ merged_poses = result_poses
210
+
211
+ # Align intrinsics
212
+ window_intrinsics, _ = get_intrinsics(
213
+ batch_size=window_poses.shape[0],
214
+ h=window_result.disparity.shape[1],
215
+ w=window_result.disparity.shape[2],
216
+ fovx=window_Fov_x,
217
+ fovy=window_Fov_y,
218
+ )
219
+ window_focals = (
220
+ window_intrinsics[:, 0, 0] + window_intrinsics[:, 1, 1]
221
+ ) / 2
222
+ scale = (merged_focals[-overlap_t:] / window_focals[:overlap_t]).mean()
223
+ window_focals = scale * window_focals
224
+ result_focals = np.ones((t_end,))
225
+ result_focals[:t_start] = merged_focals[:t_start]
226
+ result_focals[t_start + overlap_t :] = window_focals[overlap_t:]
227
+ weight = np.linspace(1, 0, overlap_t)
228
+ result_focals[t_start : t_start + overlap_t] = merged_focals[
229
+ t_start : t_start + overlap_t
230
+ ] * weight + window_focals[:overlap_t] * (1 - weight)
231
+ merged_focals = result_focals
232
+
233
+ if align_pointmaps:
234
+ # Align pointmaps
235
+ window_pointmaps = postprocess_pointmap(
236
+ result_disparity[t_start:],
237
+ window_raymap,
238
+ vae_downsample_scale=8,
239
+ camera_pose=aligned_window_poses,
240
+ focal=window_focals,
241
+ ray_o_scale_inv=0.1,
242
+ smooth_camera=smooth_camera,
243
+ smooth_method=smooth_method if smooth_camera else "none",
244
+ )
245
+ result_pointmaps = np.ones((t_end, *w1.shape[1:], 3))
246
+ result_pointmaps[:t_start] = merged_pointmaps[:t_start]
247
+ result_pointmaps[t_start + overlap_t :] = window_pointmaps["pointmap"][
248
+ overlap_t:
249
+ ]
250
+ weight = np.linspace(1, 0, overlap_t)[:, None, None, None]
251
+ result_pointmaps[t_start : t_start + overlap_t] = merged_pointmaps[
252
+ t_start : t_start + overlap_t
253
+ ] * weight + window_pointmaps["pointmap"][:overlap_t] * (1 - weight)
254
+ merged_pointmaps = result_pointmaps
255
+
256
+ # project to pointmaps
257
+ height = args.get("height", 480)
258
+ width = args.get("width", 720)
259
+
260
+ intrinsics = [
261
+ np.array([[f, 0, 0.5 * width], [0, f, 0.5 * height], [0, 0, 1]])
262
+ for f in merged_focals
263
+ ]
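+ # Each K above is a pinhole intrinsic matrix [[f, 0, cx], [0, f, cy], [0, 0, 1]]
+ # with the principal point assumed to sit at the image center.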
264
+ if align_pointmaps:
265
+ pointmaps = merged_pointmaps
266
+ else:
267
+ pointmaps = np.stack(
268
+ [
269
+ project(
270
+ 1 / np.clip(merged_disparity[i], 1e-8, 1e8),
271
+ intrinsics[i],
272
+ merged_poses[i],
273
+ )
274
+ for i in range(merged_poses.shape[0])
275
+ ]
276
+ )
277
+
278
+ return merged_rgb, merged_disparity, merged_poses, pointmaps
279
+
280
+
281
+ def process_video_to_frames(video_path: str, fps_sample: int = 12) -> Tuple[List[str], str]:
282
+ """Split a video into frames, save them locally, and return (frame_paths, output_dir)."""
283
+ # Create a unique output directory
284
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
285
+ output_dir = f"temp_frames_{timestamp}"
286
+ os.makedirs(output_dir, exist_ok=True)
287
+
288
+ # Read video
289
+ video = iio.imread(video_path)
290
+
291
+ # Calculate frame interval; this assumes a ~30 fps source, so the expression reduces to round(30 / fps_sample)
292
+ if isinstance(video, np.ndarray):
293
+ # For captured videos
294
+ total_frames = len(video)
295
+ frame_interval = max(
296
+ 1, round(total_frames / (fps_sample * (total_frames / 30)))
297
+ )
298
+ else:
299
+ # Default if can't determine
300
+ frame_interval = 2
301
+
302
+ frame_paths = []
303
+ for i, frame in enumerate(video[::frame_interval]):
304
+ frame_path = os.path.join(output_dir, f"frame_{i:04d}.jpg")
305
+ if isinstance(frame, np.ndarray):
306
+ iio.imwrite(frame_path, frame)
307
+ frame_paths.append(frame_path)
308
+
309
+ return frame_paths, output_dir
310
+
311
+
312
+ def save_output_files(
313
+ rgb: np.ndarray,
314
+ disparity: np.ndarray,
315
+ poses: Optional[np.ndarray] = None,
316
+ raymap: Optional[np.ndarray] = None,
317
+ pointmap: Optional[np.ndarray] = None,
318
+ task: str = "reconstruction",
319
+ output_dir: str = "outputs",
320
+ **kwargs,
321
+ ) -> Dict[str, str]:
322
+ """
323
+ Save outputs and return paths to saved files.
324
+ """
325
+ os.makedirs(output_dir, exist_ok=True)
326
+
327
+ if pointmap is None and raymap is not None:
328
+ # Generate pointmap from raymap and disparity
329
+ smooth_camera = kwargs.get("smooth_camera", True)
330
+ smooth_method = (
331
+ kwargs.get("smooth_method", "kalman") if smooth_camera else "none"
332
+ )
333
+
334
+ pointmap_dict = postprocess_pointmap(
335
+ disparity,
336
+ raymap,
337
+ vae_downsample_scale=8,
338
+ ray_o_scale_inv=0.1,
339
+ smooth_camera=smooth_camera,
340
+ smooth_method=smooth_method,
341
+ )
342
+ pointmap = pointmap_dict["pointmap"]
343
+
344
+ if poses is None and raymap is not None:
345
+ poses, _, _ = raymap_to_poses(raymap, ray_o_scale_inv=0.1)
346
+
347
+ # Create a unique filename
348
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
349
+ base_filename = f"{task}_{timestamp}"
350
+
351
+ # Paths for saved files
352
+ paths = {}
353
+
354
+ # Save RGB video
355
+ rgb_path = os.path.join(output_dir, f"{base_filename}_rgb.mp4")
356
+ iio.imwrite(
357
+ rgb_path,
358
+ (np.clip(rgb, 0, 1) * 255).astype(np.uint8),
359
+ fps=kwargs.get("fps", 12),
360
+ )
361
+ paths["rgb"] = rgb_path
362
+
363
+ # Save depth/disparity video
364
+ depth_path = os.path.join(output_dir, f"{base_filename}_disparity.mp4")
365
+ iio.imwrite(
366
+ depth_path,
367
+ (colorize_depth(disparity) * 255).astype(np.uint8),
368
+ fps=kwargs.get("fps", 12),
369
+ )
370
+ paths["disparity"] = depth_path
371
+
372
+ # Save point cloud GLB files
373
+ if pointmap is not None and poses is not None:
374
+ pointcloud_save_frame_interval = kwargs.get(
375
+ "pointcloud_save_frame_interval", 10
376
+ )
377
+ max_depth = kwargs.get("max_depth", 100.0)
378
+ rtol = kwargs.get("rtol", 0.03)
379
+
380
+ glb_paths = []
381
+ # Determine which frames to save based on the interval
382
+ frames_to_save = list(
383
+ range(0, pointmap.shape[0], pointcloud_save_frame_interval)
384
+ )
385
+
386
+ # Always include the first and last frame
387
+ if 0 not in frames_to_save:
388
+ frames_to_save.insert(0, 0)
389
+ if pointmap.shape[0] - 1 not in frames_to_save:
390
+ frames_to_save.append(pointmap.shape[0] - 1)
391
+
392
+ # Sort the frames to ensure they're in order
393
+ frames_to_save = sorted(set(frames_to_save))
394
+
395
+ for frame_idx in frames_to_save:
396
+ if frame_idx >= pointmap.shape[0]:
397
+ continue
398
+
399
+ predictions = {
400
+ "world_points": pointmap[frame_idx : frame_idx + 1],
401
+ "images": rgb[frame_idx : frame_idx + 1],
402
+ "depths": 1 / np.clip(disparity[frame_idx : frame_idx + 1], 1e-8, 1e8),
403
+ "camera_poses": poses[frame_idx : frame_idx + 1],
404
+ }
405
+
406
+ glb_path = os.path.join(
407
+ output_dir, f"{base_filename}_pointcloud_frame_{frame_idx}.glb"
408
+ )
409
+
410
+ scene_3d = predictions_to_glb(
411
+ predictions,
412
+ filter_by_frames="all",
413
+ show_cam=True,
414
+ max_depth=max_depth,
415
+ rtol=rtol,
416
+ frame_rel_idx=float(frame_idx) / pointmap.shape[0],
417
+ )
418
+ scene_3d.export(glb_path)
419
+ glb_paths.append(glb_path)
420
+
421
+ paths["pointcloud_glbs"] = glb_paths
422
+
423
+ return paths
424
+
425
+
426
+ def process_reconstruction(
427
+ video_file,
428
+ height,
429
+ width,
430
+ num_frames,
431
+ num_inference_steps,
432
+ guidance_scale,
433
+ sliding_window_stride,
434
+ fps,
435
+ smooth_camera,
436
+ align_pointmaps,
437
+ max_depth,
438
+ rtol,
439
+ pointcloud_save_frame_interval,
440
+ seed,
441
+ progress=gr.Progress(),
442
+ ):
443
+ """
444
+ Process reconstruction task.
445
+ """
446
+ try:
447
+ gc.collect()
448
+ torch.cuda.empty_cache()
449
+
450
+ # Set random seed
451
+ seed_all(seed)
452
+
453
+ # Build the pipeline
454
+ pipeline = build_pipeline()
455
+
456
+ progress(0.1, "Loading video")
457
+ # Check if video_file is a string or a file object
458
+ if isinstance(video_file, str):
459
+ video_path = video_file
460
+ else:
461
+ video_path = video_file.name
462
+
463
+ video = iio.imread(video_path).astype(np.float32) / 255.0
464
+
465
+ # Setup arguments
466
+ args = {
467
+ "height": height,
468
+ "width": width,
469
+ "num_frames": num_frames,
470
+ "sliding_window_stride": sliding_window_stride,
471
+ "smooth_camera": smooth_camera,
472
+ "smooth_method": "kalman" if smooth_camera else "none",
473
+ "align_pointmaps": align_pointmaps,
474
+ "max_depth": max_depth,
475
+ "rtol": rtol,
476
+ "pointcloud_save_frame_interval": pointcloud_save_frame_interval,
477
+ }
478
+
479
+ # Process in sliding windows
480
+ window_results = []
481
+ window_indices = get_window_starts(
482
+ len(video), num_frames, sliding_window_stride
483
+ )
484
+
485
+ progress(0.2, f"Processing video in {len(window_indices)} windows")
486
+
487
+ for i, start_idx in enumerate(window_indices):
488
+ progress_val = 0.2 + (0.6 * (i / len(window_indices)))
489
+ progress(progress_val, f"Processing window {i+1}/{len(window_indices)}")
490
+
491
+ output = pipeline(
492
+ task="reconstruction",
493
+ image=None,
494
+ goal=None,
495
+ video=video[start_idx : start_idx + num_frames],
496
+ raymap=None,
497
+ height=height,
498
+ width=width,
499
+ num_frames=num_frames,
500
+ fps=fps,
501
+ num_inference_steps=num_inference_steps,
502
+ guidance_scale=guidance_scale,
503
+ use_dynamic_cfg=False,
504
+ generator=torch.Generator(device=device).manual_seed(seed),
505
+ )
506
+ window_results.append(output)
507
+
508
+ progress(0.8, "Merging results from all windows")
509
+ # Merge window results
510
+ (
511
+ merged_rgb,
512
+ merged_disparity,
513
+ merged_poses,
514
+ pointmaps,
515
+ ) = blend_and_merge_window_results(window_results, window_indices, args)
516
+
517
+ progress(0.9, "Saving output files")
518
+ # Save output files
519
+ output_dir = "outputs"
520
+ os.makedirs(output_dir, exist_ok=True)
521
+ output_paths = save_output_files(
522
+ rgb=merged_rgb,
523
+ disparity=merged_disparity,
524
+ poses=merged_poses,
525
+ pointmap=pointmaps,
526
+ task="reconstruction",
527
+ output_dir=output_dir,
528
+ fps=12,
529
+ **args,
530
+ )
531
+
532
+ progress(1.0, "Done!")
533
+
534
+ # Return paths for displaying
535
+ return (
536
+ output_paths["rgb"],
537
+ output_paths["disparity"],
538
+ output_paths.get("pointcloud_glbs", []),
539
+ )
540
+
541
+ except Exception:
542
+ import traceback
543
+
544
+ traceback.print_exc()
545
+ return None, None, []
546
+
547
+
548
+ def process_prediction(
549
+ image_file,
550
+ height,
551
+ width,
552
+ num_frames,
553
+ num_inference_steps,
554
+ guidance_scale,
555
+ use_dynamic_cfg,
556
+ raymap_option,
557
+ post_reconstruction,
558
+ fps,
559
+ smooth_camera,
560
+ align_pointmaps,
561
+ max_depth,
562
+ rtol,
563
+ pointcloud_save_frame_interval,
564
+ seed,
565
+ progress=gr.Progress(),
566
+ ):
567
+ """
568
+ Process prediction task.
569
+ """
570
+ try:
571
+ gc.collect()
572
+ torch.cuda.empty_cache()
573
+
574
+ # Set random seed
575
+ seed_all(seed)
576
+
577
+ # Build the pipeline
578
+ pipeline = build_pipeline()
579
+
580
+ progress(0.1, "Loading image")
581
+ # Check if image_file is a string or a file object
582
+ if isinstance(image_file, str):
583
+ image_path = image_file
584
+ else:
585
+ image_path = image_file.name
586
+
587
+ image = PIL.Image.open(image_path)
588
+
589
+ progress(0.2, "Running prediction")
590
+ # Run prediction
591
+ output = pipeline(
592
+ task="prediction",
593
+ image=image,
594
+ video=None,
595
+ goal=None,
596
+ raymap=np.load(f"assets/example_raymaps/raymap_{raymap_option}.npy"),
597
+ height=height,
598
+ width=width,
599
+ num_frames=num_frames,
600
+ fps=fps,
601
+ num_inference_steps=num_inference_steps,
602
+ guidance_scale=guidance_scale,
603
+ use_dynamic_cfg=use_dynamic_cfg,
604
+ generator=torch.Generator(device=device).manual_seed(seed),
605
+ return_dict=True,
606
+ )
607
+
608
+ # Show RGB output immediately
609
+ rgb_output = output.rgb
610
+
611
+ # Setup arguments for saving
612
+ args = {
613
+ "height": height,
614
+ "width": width,
615
+ "smooth_camera": smooth_camera,
616
+ "smooth_method": "kalman" if smooth_camera else "none",
617
+ "align_pointmaps": align_pointmaps,
618
+ "max_depth": max_depth,
619
+ "rtol": rtol,
620
+ "pointcloud_save_frame_interval": pointcloud_save_frame_interval,
621
+ }
622
+
623
+ if post_reconstruction:
624
+ progress(0.5, "Running post-reconstruction for better quality")
625
+ recon_output = pipeline(
626
+ task="reconstruction",
627
+ video=output.rgb,
628
+ height=height,
629
+ width=width,
630
+ num_frames=num_frames,
631
+ fps=fps,
632
+ num_inference_steps=4,
633
+ guidance_scale=1.0,
634
+ use_dynamic_cfg=False,
635
+ generator=torch.Generator(device=device).manual_seed(seed),
636
+ )
637
+
638
+ disparity = recon_output.disparity
639
+ raymap = recon_output.raymap
640
+ else:
641
+ disparity = output.disparity
642
+ raymap = output.raymap
643
+
644
+ progress(0.8, "Saving output files")
645
+ # Save output files
646
+ output_dir = "outputs"
647
+ os.makedirs(output_dir, exist_ok=True)
648
+ output_paths = save_output_files(
649
+ rgb=rgb_output,
650
+ disparity=disparity,
651
+ raymap=raymap,
652
+ task="prediction",
653
+ output_dir=output_dir,
654
+ fps=12,
655
+ **args,
656
+ )
657
+
658
+ progress(1.0, "Done!")
659
+
660
+ # Return paths for displaying
661
+ return (
662
+ output_paths["rgb"],
663
+ output_paths["disparity"],
664
+ output_paths.get("pointcloud_glbs", []),
665
+ )
666
+
667
+ except Exception:
668
+ import traceback
669
+
670
+ traceback.print_exc()
671
+ return None, None, []
672
+
673
+
674
+ def process_planning(
675
+ image_file,
676
+ goal_file,
677
+ height,
678
+ width,
679
+ num_frames,
680
+ num_inference_steps,
681
+ guidance_scale,
682
+ use_dynamic_cfg,
683
+ post_reconstruction,
684
+ fps,
685
+ smooth_camera,
686
+ align_pointmaps,
687
+ max_depth,
688
+ rtol,
689
+ pointcloud_save_frame_interval,
690
+ seed,
691
+ progress=gr.Progress(),
692
+ ):
693
+ """
694
+ Process planning task.
695
+ """
696
+ try:
697
+ gc.collect()
698
+ torch.cuda.empty_cache()
699
+
700
+ # Set random seed
701
+ seed_all(seed)
702
+
703
+ # Build the pipeline
704
+ pipeline = build_pipeline()
705
+
706
+ progress(0.1, "Loading images")
707
+ # Check if image_file and goal_file are strings or file objects
708
+ if isinstance(image_file, str):
709
+ image_path = image_file
710
+ else:
711
+ image_path = image_file.name
712
+
713
+ if isinstance(goal_file, str):
714
+ goal_path = goal_file
715
+ else:
716
+ goal_path = goal_file.name
717
+
718
+ image = PIL.Image.open(image_path)
719
+ goal = PIL.Image.open(goal_path)
720
+
721
+ progress(0.2, "Running planning")
722
+ # Run planning
723
+ output = pipeline(
724
+ task="planning",
725
+ image=image,
726
+ video=None,
727
+ goal=goal,
728
+ raymap=None,
729
+ height=height,
730
+ width=width,
731
+ num_frames=num_frames,
732
+ fps=fps,
733
+ num_inference_steps=num_inference_steps,
734
+ guidance_scale=guidance_scale,
735
+ use_dynamic_cfg=use_dynamic_cfg,
736
+ generator=torch.Generator(device=device).manual_seed(seed),
737
+ return_dict=True,
738
+ )
739
+
740
+ # Show RGB output immediately
741
+ rgb_output = output.rgb
742
+
743
+ # Setup arguments for saving
744
+ args = {
745
+ "height": height,
746
+ "width": width,
747
+ "smooth_camera": smooth_camera,
748
+ "smooth_method": "kalman" if smooth_camera else "none",
749
+ "align_pointmaps": align_pointmaps,
750
+ "max_depth": max_depth,
751
+ "rtol": rtol,
752
+ "pointcloud_save_frame_interval": pointcloud_save_frame_interval,
753
+ }
754
+
755
+ if post_reconstruction:
756
+ progress(0.5, "Running post-reconstruction for better quality")
757
+ recon_output = pipeline(
758
+ task="reconstruction",
759
+ video=output.rgb,
760
+ height=height,
761
+ width=width,
762
+ num_frames=num_frames,
763
+ fps=12,
764
+ num_inference_steps=4,
765
+ guidance_scale=1.0,
766
+ use_dynamic_cfg=False,
767
+ generator=torch.Generator(device=device).manual_seed(seed),
768
+ )
769
+
770
+ disparity = recon_output.disparity
771
+ raymap = recon_output.raymap
772
+ else:
773
+ disparity = output.disparity
774
+ raymap = output.raymap
775
+
776
+ progress(0.8, "Saving output files")
777
+ # Save output files
778
+ output_dir = "outputs"
779
+ os.makedirs(output_dir, exist_ok=True)
780
+ output_paths = save_output_files(
781
+ rgb=rgb_output,
782
+ disparity=disparity,
783
+ raymap=raymap,
784
+ task="planning",
785
+ output_dir=output_dir,
786
+ fps=fps,
787
+ **args,
788
+ )
789
+
790
+ progress(1.0, "Done!")
791
+
792
+ # Return paths for displaying
793
+ return (
794
+ output_paths["rgb"],
795
+ output_paths["disparity"],
796
+ output_paths.get("pointcloud_glbs", []),
797
+ )
798
+
799
+ except Exception:
800
+ import traceback
801
+
802
+ traceback.print_exc()
803
+ return None, None, []
804
+
805
+
806
+ def update_task_ui(task):
807
+ """Update UI elements based on selected task."""
808
+ if task == "reconstruction":
809
+ return (
810
+ gr.update(visible=True), # video_input
811
+ gr.update(visible=False), # image_input
812
+ gr.update(visible=False), # goal_input
813
+ gr.update(visible=False), # image_preview
814
+ gr.update(visible=False), # goal_preview
815
+ gr.update(value=4), # num_inference_steps
816
+ gr.update(visible=True), # sliding_window_stride
817
+ gr.update(visible=False), # use_dynamic_cfg
818
+ gr.update(visible=False), # raymap_option
819
+ gr.update(visible=False), # post_reconstruction
820
+ gr.update(value=1.0), # guidance_scale
821
+ )
822
+ elif task == "prediction":
823
+ return (
824
+ gr.update(visible=False), # video_input
825
+ gr.update(visible=True), # image_input
826
+ gr.update(visible=False), # goal_input
827
+ gr.update(visible=True), # image_preview
828
+ gr.update(visible=False), # goal_preview
829
+ gr.update(value=50), # num_inference_steps
830
+ gr.update(visible=False), # sliding_window_stride
831
+ gr.update(visible=True), # use_dynamic_cfg
832
+ gr.update(visible=True), # raymap_option
833
+ gr.update(visible=True), # post_reconstruction
834
+ gr.update(value=3.0), # guidance_scale
835
+ )
836
+ elif task == "planning":
837
+ return (
838
+ gr.update(visible=False), # video_input
839
+ gr.update(visible=True), # image_input
840
+ gr.update(visible=True), # goal_input
841
+ gr.update(visible=True), # image_preview
842
+ gr.update(visible=True), # goal_preview
843
+ gr.update(value=50), # num_inference_steps
844
+ gr.update(visible=False), # sliding_window_stride
845
+ gr.update(visible=True), # use_dynamic_cfg
846
+ gr.update(visible=False), # raymap_option
847
+ gr.update(visible=True), # post_reconstruction
848
+ gr.update(value=3.0), # guidance_scale
849
+ )
850
+
851
+
852
+ def update_image_preview(image_file):
853
+ """Update the image preview."""
854
+ if image_file:
855
+ return image_file.name
856
+ return None
857
+
858
+
859
+ def update_goal_preview(goal_file):
860
+ """Update the goal preview."""
861
+ if goal_file:
862
+ return goal_file.name
863
+ return None
864
+
865
+
866
+ def get_download_link(selected_frame, all_paths):
867
+ """Update the download button with the selected file path."""
868
+ if not selected_frame or not all_paths:
869
+ return gr.update(visible=False, value=None)
870
+
871
+ frame_num = int(re.search(r"Frame (\d+)", selected_frame).group(1))
872
+
873
+ for path in all_paths:
874
+ if f"frame_{frame_num}" in path:
875
+ # Make sure the file exists before setting it
876
+ if os.path.exists(path):
877
+ return gr.update(visible=True, value=path, interactive=True)
878
+
879
+ return gr.update(visible=False, value=None)
880
+
881
+
882
+ # Theme setup
883
+ theme = gr.themes.Default(
884
+ primary_hue="blue",
885
+ secondary_hue="cyan",
886
+ )
887
+
888
+ with gr.Blocks(
889
+ theme=theme,
890
+ css="""
891
+ .output-column {
892
+ min-height: 400px;
893
+ }
894
+ .warning {
895
+ color: #ff9800;
896
+ font-weight: bold;
897
+ }
898
+ .highlight {
899
+ background-color: rgba(0, 123, 255, 0.1);
900
+ padding: 10px;
901
+ border-radius: 8px;
902
+ border-left: 5px solid #007bff;
903
+ margin: 10px 0;
904
+ }
905
+ .task-header {
906
+ margin-top: 10px;
907
+ margin-bottom: 15px;
908
+ font-size: 1.2em;
909
+ font-weight: bold;
910
+ color: #007bff;
911
+ }
912
+ .flex-display {
913
+ display: flex;
914
+ flex-wrap: wrap;
915
+ gap: 10px;
916
+ }
917
+ .output-subtitle {
918
+ font-size: 1.1em;
919
+ margin-top: 5px;
920
+ margin-bottom: 5px;
921
+ color: #505050;
922
+ }
923
+ .input-section, .params-section, .advanced-section {
924
+ border: 1px solid #ddd;
925
+ padding: 15px;
926
+ border-radius: 8px;
927
+ margin-bottom: 15px;
928
+ }
929
+ .logo-container {
930
+ display: flex;
931
+ justify-content: center;
932
+ margin-bottom: 20px;
933
+ }
934
+ .logo-image {
935
+ max-width: 300px;
936
+ height: auto;
937
+ }
938
+ """,
939
+ ) as demo:
940
+ with gr.Row(elem_classes=["logo-container"]):
941
+ gr.Image("assets/logo.png", show_label=False, elem_classes=["logo-image"])
942
+
943
+ gr.Markdown(
944
+ """
945
+ # Aether: Geometric-Aware Unified World Modeling
946
+
947
+ Aether addresses a fundamental challenge in AI: integrating geometric reconstruction with
948
+ generative modeling for human-like spatial reasoning. Our framework unifies three core capabilities:
949
+
950
+ 1. **4D Dynamic Reconstruction** - Reconstruct dynamic point clouds from videos by estimating depths and camera poses.
951
+ 2. **Action-Conditioned Video Prediction** - Predict future frames from initial observation images, optionally conditioned on camera trajectory actions.
952
+ 3. **Goal-Conditioned Visual Planning** - Generate planning paths from pairs of observation and goal images.
953
+
954
+ Trained entirely on synthetic data, Aether achieves strong zero-shot generalization to real-world scenarios.
955
+ """
956
+ )
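+ # How the three tasks map onto pipeline calls (see the process_* functions
+ # above for the full argument lists):
+ #   reconstruction: pipeline(task="reconstruction", video=frames, ...)
+ #   prediction:     pipeline(task="prediction", image=image, raymap=raymap, ...)
+ #   planning:       pipeline(task="planning", image=image, goal=goal, ...)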
957
+
958
+ with gr.Row():
959
+ with gr.Column(scale=1):
960
+ task = gr.Radio(
961
+ ["reconstruction", "prediction", "planning"],
962
+ label="Select Task",
963
+ value="reconstruction",
964
+ info="Choose the task you want to perform",
965
+ )
966
+
967
+ with gr.Group(elem_classes=["input-section"]):
968
+ # Input section - changes based on task
969
+ gr.Markdown("## 📥 Input", elem_classes=["task-header"])
970
+
971
+ # Task-specific inputs
972
+ video_input = gr.Video(
973
+ label="Upload Input Video",
974
+ sources=["upload"],
975
+ visible=True,
976
+ interactive=True,
977
+ elem_id="video_input",
978
+ )
979
+
980
+ image_input = gr.File(
981
+ label="Upload Start Image",
982
+ file_count="single",
983
+ file_types=["image"],
984
+ visible=False,
985
+ interactive=True,
986
+ elem_id="image_input",
987
+ )
988
+
989
+ goal_input = gr.File(
990
+ label="Upload Goal Image",
991
+ file_count="single",
992
+ file_types=["image"],
993
+ visible=False,
994
+ interactive=True,
995
+ elem_id="goal_input",
996
+ )
997
+
998
+ with gr.Row(visible=False) as preview_row:
999
+ image_preview = gr.Image(
1000
+ label="Start Image Preview",
1001
+ elem_id="image_preview",
1002
+ visible=False,
1003
+ )
1004
+ goal_preview = gr.Image(
1005
+ label="Goal Image Preview",
1006
+ elem_id="goal_preview",
1007
+ visible=False,
1008
+ )
1009
+
1010
+ with gr.Group(elem_classes=["params-section"]):
1011
+ gr.Markdown("## ⚙️ Parameters", elem_classes=["task-header"])
1012
+
1013
+ with gr.Row():
1014
+ with gr.Column(scale=1):
1015
+ height = gr.Dropdown(
1016
+ choices=[480],
1017
+ value=480,
1018
+ label="Height",
1019
+ info="Height of the output video",
1020
+ )
1021
+
1022
+ with gr.Column(scale=1):
1023
+ width = gr.Dropdown(
1024
+ choices=[720],
1025
+ value=720,
1026
+ label="Width",
1027
+ info="Width of the output video",
1028
+ )
1029
+
1030
+ with gr.Row():
1031
+ with gr.Column(scale=1):
1032
+ num_frames = gr.Dropdown(
1033
+ choices=[17, 25, 33, 41],
1034
+ value=41,
1035
+ label="Number of Frames",
1036
+ info="Number of frames to predict",
1037
+ )
1038
+
1039
+ with gr.Column(scale=1):
1040
+ fps = gr.Dropdown(
1041
+ choices=[8, 10, 12, 15, 24],
1042
+ value=12,
1043
+ label="FPS",
1044
+ info="Frames per second",
1045
+ )
1046
+
1047
+ with gr.Row():
1048
+ with gr.Column(scale=1):
1049
+ num_inference_steps = gr.Slider(
1050
+ minimum=1,
1051
+ maximum=60,
1052
+ value=4,
1053
+ step=1,
1054
+ label="Inference Steps",
1055
+ info="Number of inference step",
1056
+ )
1057
+
1058
+ sliding_window_stride = gr.Slider(
1059
+ minimum=1,
1060
+ maximum=40,
1061
+ value=24,
1062
+ step=1,
1063
+ label="Sliding Window Stride",
1064
+ info="Sliding window stride (window size equals to num_frames). Only used for 'reconstruction' task",
1065
+ visible=True,
1066
+ )
1067
+
1068
+ use_dynamic_cfg = gr.Checkbox(
1069
+ label="Use Dynamic CFG",
1070
+ value=True,
1071
+ info="Use dynamic CFG",
1072
+ visible=False,
1073
+ )
1074
+
1075
+ raymap_option = gr.Radio(
1076
+ choices=["backward", "forward_right", "left_forward", "right"],
1077
+ label="Camera Movement Direction",
1078
+ value="forward_right",
1079
+ info="Direction of camera action. We offer 4 pre-defined actions for you to choose from.",
1080
+ visible=False,
1081
+ )
1082
+
1083
+ post_reconstruction = gr.Checkbox(
1084
+ label="Post-Reconstruction",
1085
+ value=True,
1086
+ info="Run reconstruction after prediction for better quality",
1087
+ visible=False,
1088
+ )
1089
+
1090
+ with gr.Accordion(
1091
+ "Advanced Options", open=False, visible=True
1092
+ ) as advanced_options:
1093
+ with gr.Group(elem_classes=["advanced-section"]):
1094
+ with gr.Row():
1095
+ with gr.Column(scale=1):
1096
+ guidance_scale = gr.Slider(
1097
+ minimum=1.0,
1098
+ maximum=10.0,
1099
+ value=1.0,
1100
+ step=0.1,
1101
+ label="Guidance Scale",
1102
+ info="Guidance scale (only for prediction / planning)",
1103
+ )
1104
+
1105
+ with gr.Row():
1106
+ with gr.Column(scale=1):
1107
+ seed = gr.Number(
1108
+ value=42,
1109
+ label="Random Seed",
1110
+ info="Set a seed for reproducible results",
1111
+ precision=0,
1112
+ minimum=0,
1113
+ maximum=2147483647,
1114
+ )
1115
+
1116
+ with gr.Row():
1117
+ with gr.Column(scale=1):
1118
+ smooth_camera = gr.Checkbox(
1119
+ label="Smooth Camera",
1120
+ value=True,
1121
+ info="Apply smoothing to camera trajectory",
1122
+ )
1123
+
1124
+ with gr.Column(scale=1):
1125
+ align_pointmaps = gr.Checkbox(
1126
+ label="Align Point Maps",
1127
+ value=False,
1128
+ info="Align point maps across frames",
1129
+ )
1130
+
1131
+ with gr.Row():
1132
+ with gr.Column(scale=1):
1133
+ max_depth = gr.Slider(
1134
+ minimum=10,
1135
+ maximum=200,
1136
+ value=60,
1137
+ step=10,
1138
+ label="Max Depth",
1139
+ info="Maximum depth for point cloud (higher = more distant points)",
1140
+ )
1141
+
1142
+ with gr.Column(scale=1):
1143
+ rtol = gr.Slider(
1144
+ minimum=0.01,
1145
+ maximum=2.0,
1146
+ value=0.03,
1147
+ step=0.01,
1148
+ label="Relative Tolerance",
1149
+ info="Used for depth edge detection. Lower = remove more edges",
1150
+ )
1151
+
1152
+ pointcloud_save_frame_interval = gr.Slider(
1153
+ minimum=1,
1154
+ maximum=20,
1155
+ value=10,
1156
+ step=1,
1157
+ label="Point Cloud Frame Interval",
1158
+ info="Save point cloud every N frames (higher = fewer files but less complete representation)",
1159
+ )
1160
+
1161
+ run_button = gr.Button("Run Aether", variant="primary")
1162
+
1163
+ with gr.Column(scale=1, elem_classes=["output-column"]):
1164
+ with gr.Group():
1165
+ gr.Markdown("## 📤 Output", elem_classes=["task-header"])
1166
+
1167
+ gr.Markdown("### RGB Video", elem_classes=["output-subtitle"])
1168
+ rgb_output = gr.Video(
1169
+ label="RGB Output", interactive=False, elem_id="rgb_output"
1170
+ )
1171
+
1172
+ gr.Markdown("### Depth Video", elem_classes=["output-subtitle"])
1173
+ depth_output = gr.Video(
1174
+ label="Depth Output", interactive=False, elem_id="depth_output"
1175
+ )
1176
+
1177
+ gr.Markdown("### Point Clouds", elem_classes=["output-subtitle"])
1178
+ with gr.Row(elem_classes=["flex-display"]):
1179
+ pointcloud_frames = gr.Dropdown(
1180
+ label="Select Frame",
1181
+ choices=[],
1182
+ value=None,
1183
+ interactive=True,
1184
+ elem_id="pointcloud_frames",
1185
+ )
1186
+ pointcloud_download = gr.DownloadButton(
1187
+ label="Download Point Cloud",
1188
+ visible=False,
1189
+ elem_id="pointcloud_download",
1190
+ )
1191
+
1192
+ model_output = gr.Model3D(
1193
+ label="Point Cloud Viewer", interactive=True, elem_id="model_output"
1194
+ )
1195
+
1196
+ with gr.Tab("About Results"):
1197
+ gr.Markdown(
1198
+ """
1199
+ ### Understanding the Outputs
1200
+
1201
+ - **RGB Video**: Shows the predicted or reconstructed RGB frames
1202
+ - **Depth Video**: Visualizes the disparity maps in color (closer = red, farther = blue)
1203
+ - **Point Clouds**: Interactive 3D point cloud with camera positions shown as colored pyramids
1204
+
1205
+ <p class="warning">Note: 3D point clouds take a long time to visualize, and we show the keyframes only.
1206
+ You can control the keyframe interval by modifying the `pointcloud_save_frame_interval`.</p>
1207
+ """
1208
+ )
1209
+
1210
+ # Event handlers
1211
+ task.change(
1212
+ fn=update_task_ui,
1213
+ inputs=[task],
1214
+ outputs=[
1215
+ video_input,
1216
+ image_input,
1217
+ goal_input,
1218
+ image_preview,
1219
+ goal_preview,
1220
+ num_inference_steps,
1221
+ sliding_window_stride,
1222
+ use_dynamic_cfg,
1223
+ raymap_option,
1224
+ post_reconstruction,
1225
+ guidance_scale,
1226
+ ],
1227
+ )
1228
+
1229
+ image_input.change(
1230
+ fn=update_image_preview, inputs=[image_input], outputs=[image_preview]
1231
+ ).then(fn=lambda: gr.update(visible=True), inputs=[], outputs=[preview_row])
1232
+
1233
+ goal_input.change(
1234
+ fn=update_goal_preview, inputs=[goal_input], outputs=[goal_preview]
1235
+ ).then(fn=lambda: gr.update(visible=True), inputs=[], outputs=[preview_row])
1236
+
1237
+ def update_pointcloud_frames(pointcloud_paths):
1238
+ """Update the pointcloud frames dropdown with available frames."""
1239
+ if not pointcloud_paths:
1240
+ return gr.update(choices=[], value=None), None, gr.update(visible=False)
1241
+
1242
+ # Extract frame numbers from filenames
1243
+ frame_info = []
1244
+ for path in pointcloud_paths:
1245
+ filename = os.path.basename(path)
1246
+ match = re.search(r"frame_(\d+)", filename)
1247
+ if match:
1248
+ frame_num = int(match.group(1))
1249
+ frame_info.append((f"Frame {frame_num}", path))
1250
+
1251
+ # Sort by frame number
1252
+ frame_info.sort(key=lambda x: int(re.search(r"Frame (\d+)", x[0]).group(1)))
1253
+
1254
+ choices = [label for label, _ in frame_info]
1255
+ paths = [path for _, path in frame_info]
1256
+
1257
+ if not choices:
1258
+ return gr.update(choices=[], value=None), None, gr.update(visible=False)
1259
+
1260
+ # Make download button visible when we have point cloud files
1261
+ return (
1262
+ gr.update(choices=choices, value=choices[0]),
1263
+ paths[0],
1264
+ gr.update(visible=True),
1265
+ )
1266
+
1267
+ def select_pointcloud_frame(frame_label, all_paths):
1268
+ """Select a specific pointcloud frame."""
1269
+ if not frame_label or not all_paths:
1270
+ return None
1271
+
1272
+ frame_num = int(re.search(r"Frame (\d+)", frame_label).group(1))
1273
+
1274
+ for path in all_paths:
1275
+ if f"frame_{frame_num}" in path:
1276
+ return path
1277
+
1278
+ return None
1279
+
1280
+ # Then in the run button click handler:
1281
+ def process_task(task_type, *args):
1282
+ """Process selected task with appropriate function."""
1283
+ if task_type == "reconstruction":
1284
+ rgb_path, depth_path, pointcloud_paths = process_reconstruction(*args)
1285
+ # Update the pointcloud frames dropdown
1286
+ frame_dropdown, initial_path, download_visible = update_pointcloud_frames(
1287
+ pointcloud_paths
1288
+ )
1289
+ return (
1290
+ rgb_path,
1291
+ depth_path,
1292
+ initial_path,
1293
+ frame_dropdown,
1294
+ pointcloud_paths,
1295
+ download_visible,
1296
+ )
1297
+ elif task_type == "prediction":
1298
+ rgb_path, depth_path, pointcloud_paths = process_prediction(*args)
1299
+ frame_dropdown, initial_path, download_visible = update_pointcloud_frames(
1300
+ pointcloud_paths
1301
+ )
1302
+ return (
1303
+ rgb_path,
1304
+ depth_path,
1305
+ initial_path,
1306
+ frame_dropdown,
1307
+ pointcloud_paths,
1308
+ download_visible,
1309
+ )
1310
+ elif task_type == "planning":
1311
+ rgb_path, depth_path, pointcloud_paths = process_planning(*args)
1312
+ frame_dropdown, initial_path, download_visible = update_pointcloud_frames(
1313
+ pointcloud_paths
1314
+ )
1315
+ return (
1316
+ rgb_path,
1317
+ depth_path,
1318
+ initial_path,
1319
+ frame_dropdown,
1320
+ pointcloud_paths,
1321
+ download_visible,
1322
+ )
1323
+ return (
1324
+ None,
1325
+ None,
1326
+ None,
1327
+ gr.update(choices=[], value=None),
1328
+ [],
1329
+ gr.update(visible=False),
1330
+ )
1331
+
1332
+ # Store all pointcloud paths for later use
1333
+ all_pointcloud_paths = gr.State([])
1334
+
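+ # The lambda below only re-orders the shared UI inputs into the argument list
+ # expected by the selected task before dispatching through process_task.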
1335
+ run_button.click(
1336
+ fn=lambda task_type,
1337
+ video_file,
1338
+ image_file,
1339
+ goal_file,
1340
+ height,
1341
+ width,
1342
+ num_frames,
1343
+ num_inference_steps,
1344
+ guidance_scale,
1345
+ sliding_window_stride,
1346
+ use_dynamic_cfg,
1347
+ raymap_option,
1348
+ post_reconstruction,
1349
+ fps,
1350
+ smooth_camera,
1351
+ align_pointmaps,
1352
+ max_depth,
1353
+ rtol,
1354
+ pointcloud_save_frame_interval,
1355
+ seed: process_task(
1356
+ task_type,
1357
+ *(
1358
+ [
1359
+ video_file,
1360
+ height,
1361
+ width,
1362
+ num_frames,
1363
+ num_inference_steps,
1364
+ guidance_scale,
1365
+ sliding_window_stride,
1366
+ fps,
1367
+ smooth_camera,
1368
+ align_pointmaps,
1369
+ max_depth,
1370
+ rtol,
1371
+ pointcloud_save_frame_interval,
1372
+ seed,
1373
+ ]
1374
+ if task_type == "reconstruction"
1375
+ else [
1376
+ image_file,
1377
+ height,
1378
+ width,
1379
+ num_frames,
1380
+ num_inference_steps,
1381
+ guidance_scale,
1382
+ use_dynamic_cfg,
1383
+ raymap_option,
1384
+ post_reconstruction,
1385
+ fps,
1386
+ smooth_camera,
1387
+ align_pointmaps,
1388
+ max_depth,
1389
+ rtol,
1390
+ pointcloud_save_frame_interval,
1391
+ seed,
1392
+ ]
1393
+ if task_type == "prediction"
1394
+ else [
1395
+ image_file,
1396
+ goal_file,
1397
+ height,
1398
+ width,
1399
+ num_frames,
1400
+ num_inference_steps,
1401
+ guidance_scale,
1402
+ use_dynamic_cfg,
1403
+ post_reconstruction,
1404
+ fps,
1405
+ smooth_camera,
1406
+ align_pointmaps,
1407
+ max_depth,
1408
+ rtol,
1409
+ pointcloud_save_frame_interval,
1410
+ seed,
1411
+ ]
1412
+ ),
1413
+ ),
1414
+ inputs=[
1415
+ task,
1416
+ video_input,
1417
+ image_input,
1418
+ goal_input,
1419
+ height,
1420
+ width,
1421
+ num_frames,
1422
+ num_inference_steps,
1423
+ guidance_scale,
1424
+ sliding_window_stride,
1425
+ use_dynamic_cfg,
1426
+ raymap_option,
1427
+ post_reconstruction,
1428
+ fps,
1429
+ smooth_camera,
1430
+ align_pointmaps,
1431
+ max_depth,
1432
+ rtol,
1433
+ pointcloud_save_frame_interval,
1434
+ seed,
1435
+ ],
1436
+ outputs=[
1437
+ rgb_output,
1438
+ depth_output,
1439
+ model_output,
1440
+ pointcloud_frames,
1441
+ all_pointcloud_paths,
1442
+ pointcloud_download,
1443
+ ],
1444
+ )
1445
+
1446
+ pointcloud_frames.change(
1447
+ fn=select_pointcloud_frame,
1448
+ inputs=[pointcloud_frames, all_pointcloud_paths],
1449
+ outputs=[model_output],
1450
+ ).then(
1451
+ fn=get_download_link,
1452
+ inputs=[pointcloud_frames, all_pointcloud_paths],
1453
+ outputs=[pointcloud_download],
1454
+ )
1455
+
1456
+ # Example Accordion
1457
+ with gr.Accordion("Examples"):
1458
+ gr.Markdown(
1459
+ """
1460
+ ### Examples will be added soon
1461
+ Check back for example inputs for each task type.
1462
+ """
1463
+ )
1464
+
1465
+ # The pipeline is already initialized at import time; this load hook simply touches it when the demo starts
1466
+ demo.load(lambda: build_pipeline(), inputs=None, outputs=None)
1467
+
1468
+ if __name__ == "__main__":
1469
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
1470
+ demo.queue(max_size=20).launch(show_error=True, share=True)
setup.py ADDED
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env python
2
+ import pathlib
3
+ import sys
4
+
5
+ import pkg_resources
6
+ from setuptools import find_packages, setup
7
+
8
+
9
+ PKG_NAME = "aether"
10
+ VERSION = "0.1"
11
+ EXTRAS = {}
12
+
13
+
14
+ def _read_file(fname):
15
+ with pathlib.Path(fname).open() as fp:
16
+ return fp.read()
17
+
18
+
19
+ def _read_install_requires():
20
+ with pathlib.Path("requirements.txt").open() as fp:
21
+ return [
22
+ str(requirement) for requirement in pkg_resources.parse_requirements(fp)
23
+ ]
24
+
25
+
26
+ def _fill_extras(extras):
27
+ if extras:
28
+ extras["all"] = list({item for group in extras.values() for item in group})
29
+ return extras
30
+
31
+
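+ # version_range_max below advertises classifiers from Python 3.8 up to the
+ # running interpreter's minor version (and at least up to 3.10).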
32
+ version_range_max = max(sys.version_info[1], 10) + 1
33
+ setup(
34
+ name=PKG_NAME,
35
+ version=VERSION,
36
+ author="Aether Team",
37
+ author_email="[email protected]",
38
+ url="https://github.com/OpenRobotLab/Aether",
39
+ description="",
40
+ long_description=_read_file("README.md"),
41
+ long_description_content_type="text/markdown",
42
+ keywords=[
43
+ "Deep Learning",
44
+ "Machine Learning",
45
+ "World Model",
46
+ "3D Vision",
47
+ "Reconstruction",
48
+ "Sythetic Data",
49
+ "Embodied AI",
50
+ ],
51
+ license="MIT License",
52
+ packages=find_packages(include=[PKG_NAME, f"{PKG_NAME}.*"]),
53
+ include_package_data=True,
54
+ zip_safe=False,
55
+ install_requires=_read_install_requires(),
56
+ extras_require=_fill_extras(EXTRAS),
57
+ python_requires=">=3.8",
58
+ classifiers=[
59
+ "Development Status :: 5 - Production/Stable",
60
+ "Intended Audience :: Developers",
61
+ "Intended Audience :: Education",
62
+ "Intended Audience :: Science/Research",
63
+ "License :: OSI Approved :: MIT License",
64
+ "Operating System :: OS Independent",
65
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
66
+ "Programming Language :: Python :: 3",
67
+ ]
68
+ + [f"Programming Language :: Python :: 3.{i}" for i in range(8, version_range_max)],
69
+ )