Spaces · Running on Zero

Commit e88ccc8
1 parent: 8040e22

space init version

Files changed:
- .gitignore  +1 -0
- app.py      +656 -247
.gitignore
CHANGED
@@ -16,6 +16,7 @@ eggs/
 .eggs/
 lib/
 lib64/
+outputs/
 parts/
 sdist/
 var/
app.py
CHANGED
@@ -416,12 +416,29 @@ def save_output_files(
     for frame_idx in frames_to_save:
         if frame_idx >= pointmap.shape[0]:
             continue
-
         predictions = {
-            "world_points":
             "images": rgb[frame_idx : frame_idx + 1],
             "depths": 1 / np.clip(disparity[frame_idx : frame_idx + 1], 1e-8, 1e8),
-            "camera_poses":
         }
 
         glb_path = os.path.join(
@@ -571,7 +588,7 @@ def process_reconstruction(
         return None, None, []
 
 
-@spaces.GPU(duration=
 def process_prediction(
     image_file,
     height,
@@ -600,6 +617,11 @@ def process_prediction(
 
     # Set random seed
     seed_all(seed)
 
     # Build the pipeline
     pipeline = build_pipeline(device)
@@ -698,7 +720,7 @@ def process_prediction(
         return None, None, []
 
 
-@spaces.GPU(duration=
 def process_planning(
     image_file,
     goal_file,
@@ -851,6 +873,7 @@ def update_task_ui(task):
             gr.update(visible=False),  # raymap_option
             gr.update(visible=False),  # post_reconstruction
             gr.update(value=1.0),  # guidance_scale
         )
     elif task == "prediction":
         return (
@@ -865,6 +888,7 @@ def update_task_ui(task):
             gr.update(visible=True),  # raymap_option
             gr.update(visible=True),  # post_reconstruction
             gr.update(value=3.0),  # guidance_scale
         )
     elif task == "planning":
         return (
@@ -879,6 +903,7 @@ def update_task_ui(task):
             gr.update(visible=False),  # raymap_option
             gr.update(visible=True),  # post_reconstruction
             gr.update(value=3.0),  # guidance_scale
         )
 
 
@@ -925,8 +950,17 @@ with gr.Blocks(
         min-height: 400px;
     }
     .warning {
-        color: #
-        font-weight: bold;
     }
     .highlight {
         background-color: rgba(0, 123, 255, 0.1);
@@ -936,9 +970,9 @@ with gr.Blocks(
         margin: 10px 0;
     }
     .task-header {
-        margin-top:
-        margin-bottom:
-        font-size: 1.
         font-weight: bold;
         color: #007bff;
     }
@@ -955,9 +989,9 @@ with gr.Blocks(
     }
     .input-section, .params-section, .advanced-section {
         border: 1px solid #ddd;
-        padding:
         border-radius: 8px;
-        margin-bottom:
     }
     .logo-container {
         display: flex;
@@ -968,277 +1002,660 @@ with gr.Blocks(
         max-width: 300px;
         height: auto;
     }
     """,
 ) as demo:
-    with gr.
-        gr.
-            generative modeling for human-like spatial reasoning. Our framework unifies three core capabilities:
-
-            1. **4D dynamic reconstruction** - Reconstruct dynamic point clouds from videos by estimating depths and camera poses.
-            2. **Action-Conditioned Video Prediction** - Predict future frames based on initial observation images, with optional conditions of camera trajectory actions.
-            3. **Goal-Conditioned Visual Planning** - Generate planning paths from pairs of observation and goal images.
-
-            Trained entirely on synthetic data, Aether achieves strong zero-shot generalization to real-world scenarios.
-            """
-        )
-
-    with gr.Row():
-        with gr.Column(scale=1):
-            task = gr.Radio(
-                ["reconstruction", "prediction", "planning"],
-                label="Select Task",
-                value="reconstruction",
-                info="Choose the task you want to perform",
-            )
-
-            with gr.Group(elem_classes=["input-section"]):
-                # Input section - changes based on task
-                gr.Markdown("## 📥 Input", elem_classes=["task-header"])
-
-                # Task-specific inputs
-                video_input = gr.Video(
-                    label="Upload Input Video",
-                    sources=["upload"],
-                    visible=True,
-                    interactive=True,
-                    elem_id="video_input",
                 )
 
-            with gr.
-                    info="Frames per second",
                 )
-                    minimum=1,
-                    maximum=60,
-                    value=4,
-                    step=1,
-                    label="Inference Steps",
-                    info="Number of inference step",
                 )
 
-                    maximum=40,
-                    value=24,
-                    step=1,
-                    label="Sliding Window Stride",
-                    info="Sliding window stride (window size equals to num_frames). Only used for 'reconstruction' task",
-                    visible=True,
-                )
-
-                use_dynamic_cfg = gr.Checkbox(
-                    label="Use Dynamic CFG",
-                    value=True,
-                    info="Use dynamic CFG",
-                    visible=False,
-                )
-
-                raymap_option = gr.Radio(
-                    choices=["backward", "forward_right", "left_forward", "right"],
-                    label="Camera Movement Direction",
-                    value="forward_right",
-                    info="Direction of camera action. We offer 4 pre-defined actions for you to choose from.",
-                    visible=False,
-                )
-
-                post_reconstruction = gr.Checkbox(
-                    label="Post-Reconstruction",
-                    value=True,
-                    info="Run reconstruction after prediction for better quality",
-                    visible=False,
-                )
 
-            with gr.Accordion(
-                "Advanced Options", open=False, visible=True
-            ) as advanced_options:
-                with gr.Group(elem_classes=["advanced-section"]):
                     with gr.Row():
                         with gr.Column(scale=1):
-                            label="Guidance Scale",
-                            info="Guidance scale (only for prediction / planning)",
                         )
 
-                    with gr.Row():
                         with gr.Column(scale=1):
-                            minimum=0,
-                            maximum=2147483647,
                         )
 
                     with gr.Row():
                         with gr.Column(scale=1):
-                            value=
                         )
 
                         with gr.Column(scale=1):
-                            value=
                         )
 
                     with gr.Row():
-                        )
-
-                        with gr.Column(scale=1):
-                            rtol = gr.Slider(
-                                minimum=0.01,
-                                maximum=2.0,
-                                value=0.03,
-                                step=0.01,
-                                label="Relative Tolerance",
-                                info="Used for depth edge detection. Lower = remove more edges",
-                            )
 
                             minimum=1,
-                            maximum=
-                            value=
                             step=1,
-                            label="
-                            info="
                         )
 
-            gr.Markdown("### RGB Video", elem_classes=["output-subtitle"])
-            rgb_output = gr.Video(
-                label="RGB Output", interactive=False, elem_id="rgb_output"
-            )
-
-            gr.Markdown("### Depth Video", elem_classes=["output-subtitle"])
-            depth_output = gr.Video(
-                label="Depth Output", interactive=False, elem_id="depth_output"
-            )
-
-            gr.Markdown("### Point Clouds", elem_classes=["output-subtitle"])
-            with gr.Row(elem_classes=["flex-display"]):
-                pointcloud_frames = gr.Dropdown(
-                    label="Select Frame",
-                    choices=[],
-                    value=None,
-                    interactive=True,
-                    elem_id="pointcloud_frames",
                 )
-                    visible=False,
-                elem_id="pointcloud_download",
                 )
 
 
     # Event handlers
     task.change(
@@ -1256,6 +1673,7 @@ with gr.Blocks(
             raymap_option,
             post_reconstruction,
             guidance_scale,
         ],
     )
 
@@ -1486,15 +1904,6 @@ with gr.Blocks(
         outputs=[pointcloud_download],
     )
 
-    # Example Accordion
-    with gr.Accordion("Examples"):
-        gr.Markdown(
-            """
-            ### Examples will be added soon
-            Check back for example inputs for each task type.
-            """
-        )
-
     # Load the model at startup
     demo.load(lambda: build_pipeline(torch.device("cpu")), inputs=None, outputs=None)
 
     for frame_idx in frames_to_save:
         if frame_idx >= pointmap.shape[0]:
             continue
+
+        # fix the problem of point cloud being upside down and left-right reversed: flip Y axis and X axis
+        flipped_pointmap = pointmap[frame_idx:frame_idx+1].copy()
+        flipped_pointmap[..., 1] = -flipped_pointmap[..., 1]  # flip Y axis (up and down)
+        flipped_pointmap[..., 0] = -flipped_pointmap[..., 0]  # flip X axis (left and right)
+
+        # flip camera poses
+        flipped_poses = poses[frame_idx:frame_idx+1].copy()
+        # flip Y axis and X axis of camera orientation
+        flipped_poses[..., 1, :3] = -flipped_poses[..., 1, :3]  # flip Y axis of camera orientation
+        flipped_poses[..., 0, :3] = -flipped_poses[..., 0, :3]  # flip X axis of camera orientation
+        flipped_poses[..., :3, 1] = -flipped_poses[..., :3, 1]  # flip Y axis of camera orientation
+        flipped_poses[..., :3, 0] = -flipped_poses[..., :3, 0]  # flip X axis of camera orientation
+        # flip Y axis and X axis of camera position
+        flipped_poses[..., 1, 3] = -flipped_poses[..., 1, 3]  # flip Y axis position
+        flipped_poses[..., 0, 3] = -flipped_poses[..., 0, 3]  # flip X axis position
+
+        # use flipped point cloud and camera poses
         predictions = {
+            "world_points": flipped_pointmap,
             "images": rgb[frame_idx : frame_idx + 1],
             "depths": 1 / np.clip(disparity[frame_idx : frame_idx + 1], 1e-8, 1e8),
+            "camera_poses": flipped_poses,
         }
 
         glb_path = os.path.join(
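The added block negates the X and Y components of both the point map and the camera-to-world poses before export. A minimal numpy sketch (not part of the commit; the random pose and point below are stand-ins for Aether's predictions) showing that this is one consistent reflection, i.e. conjugating the pose with S = diag(-1, -1, 1, 1) while reflecting world points with diag(-1, -1, 1):

```python
import numpy as np

rng = np.random.default_rng(0)
pose = np.eye(4)
pose[:3, :3] = np.linalg.qr(rng.normal(size=(3, 3)))[0]  # stand-in rotation
pose[:3, 3] = rng.normal(size=3)                         # stand-in translation
point_cam = rng.normal(size=3)                           # a point in camera coordinates

S = np.diag([-1.0, -1.0, 1.0, 1.0])  # reflect X and Y

# The row/column/translation flips done in app.py are equivalent to S @ pose @ S.
flipped = pose.copy()
flipped[[0, 1], :3] = -flipped[[0, 1], :3]   # flip X/Y rows of the rotation
flipped[:3, [0, 1]] = -flipped[:3, [0, 1]]   # flip X/Y columns of the rotation
flipped[[0, 1], 3] = -flipped[[0, 1], 3]     # flip X/Y of the camera position
assert np.allclose(flipped, S @ pose @ S)

# The flipped pose maps the reflected camera-space point to the reflected world
# point, so the exported point cloud and camera frustums stay consistent.
world = pose @ np.append(point_cam, 1.0)
world_flipped = flipped @ S @ np.append(point_cam, 1.0)
assert np.allclose(world_flipped[:3], np.diag([-1.0, -1.0, 1.0]) @ world[:3])
```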
         return None, None, []
 
 
+@spaces.GPU(duration=300)
 def process_prediction(
     image_file,
     height,

 
     # Set random seed
     seed_all(seed)
+
+    # Check if CUDA is available
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    if not torch.cuda.is_available():
+        raise ValueError("CUDA is not available. Check your environment.")
 
     # Build the pipeline
     pipeline = build_pipeline(device)

         return None, None, []
 
 
+@spaces.GPU(duration=300)
 def process_planning(
     image_file,
     goal_file,
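On ZeroGPU Spaces a GPU is attached only while a function decorated with `@spaces.GPU` runs, and `duration` requests the time budget for one call (300 s here). A minimal sketch of the pattern, assuming the `spaces` package that HF Spaces provides and the app's own `build_pipeline` helper:

```python
import spaces
import torch

@spaces.GPU(duration=300)  # ask for up to ~5 minutes of GPU time per call
def run_on_gpu(*args):
    if not torch.cuda.is_available():
        # Fail fast rather than silently running the diffusion model on CPU.
        raise ValueError("CUDA is not available. Check your environment.")
    device = torch.device("cuda")
    pipeline = build_pipeline(device)  # app.py helper (preloaded via demo.load below)
    return pipeline(*args)
```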
             gr.update(visible=False),  # raymap_option
             gr.update(visible=False),  # post_reconstruction
             gr.update(value=1.0),  # guidance_scale
+            gr.update(visible=False),  # gpu_time_warning
         )
     elif task == "prediction":
         return (

             gr.update(visible=True),  # raymap_option
             gr.update(visible=True),  # post_reconstruction
             gr.update(value=3.0),  # guidance_scale
+            gr.update(visible=True),  # gpu_time_warning
         )
     elif task == "planning":
         return (

             gr.update(visible=False),  # raymap_option
             gr.update(visible=True),  # post_reconstruction
             gr.update(value=3.0),  # guidance_scale
+            gr.update(visible=True),  # gpu_time_warning
         )
 
 
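Each branch of `update_task_ui` now returns one extra `gr.update(...)`, which only works because the `task.change(...)` wiring later in this diff appends `gpu_time_warning` to its `outputs` list in the same position. A small self-contained sketch of that pattern (component list abbreviated for illustration, not the app's full set):

```python
import gradio as gr

def update_task_ui(task):
    # One gr.update(...) per entry in outputs=[...], in the same order.
    show_warning = task in ("prediction", "planning")
    return (
        gr.update(value=1.0 if task == "reconstruction" else 3.0),  # guidance_scale
        gr.update(visible=show_warning),                            # gpu_time_warning
    )

with gr.Blocks() as demo:
    task = gr.Radio(["reconstruction", "prediction", "planning"], value="reconstruction")
    guidance_scale = gr.Slider(1.0, 10.0, value=1.0, label="Guidance Scale")
    gpu_time_warning = gr.Markdown("⚠️ May exceed the ZeroGPU quota.", visible=False)
    task.change(update_task_ui, inputs=[task], outputs=[guidance_scale, gpu_time_warning])
```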
         min-height: 400px;
     }
     .warning {
+        color: #856404 !important;
+        font-weight: bold !important;
+        padding: 10px !important;
+        background-color: #fff3cd !important;
+        border-left: 4px solid #ffc107 !important;
+        border-radius: 4px !important;
+        margin: 10px 0 !important;
+    }
+    .dark .warning {
+        background-color: rgba(255, 193, 7, 0.1) !important;
+        color: #fbd38d !important;
     }
     .highlight {
         background-color: rgba(0, 123, 255, 0.1);

         margin: 10px 0;
     }
     .task-header {
+        margin-top: 15px;
+        margin-bottom: 20px;
+        font-size: 1.4em;
         font-weight: bold;
         color: #007bff;
     }

     }
     .input-section, .params-section, .advanced-section {
         border: 1px solid #ddd;
+        padding: 20px;
         border-radius: 8px;
+        margin-bottom: 20px;
     }
     .logo-container {
         display: flex;
max-width: 300px;
|
1003 |
height: auto;
|
1004 |
}
|
1005 |
+
|
1006 |
+
/* Optimize layout and spacing */
|
1007 |
+
.container {
|
1008 |
+
margin: 0 auto;
|
1009 |
+
padding: 0 15px;
|
1010 |
+
max-width: 1800px;
|
1011 |
+
}
|
1012 |
+
|
1013 |
+
.header {
|
1014 |
+
text-align: center;
|
1015 |
+
margin-bottom: 20px;
|
1016 |
+
padding: 15px;
|
1017 |
+
background: linear-gradient(to right, #f8f9fa, #e9ecef);
|
1018 |
+
border-radius: 10px;
|
1019 |
+
}
|
1020 |
+
|
1021 |
+
.dark .header {
|
1022 |
+
background: linear-gradient(to right, #2d3748, #1a202c);
|
1023 |
+
}
|
1024 |
+
|
1025 |
+
.main-title {
|
1026 |
+
font-size: 2.2em;
|
1027 |
+
font-weight: bold;
|
1028 |
+
margin: 0 auto;
|
1029 |
+
color: #2c3e50;
|
1030 |
+
max-width: 800px;
|
1031 |
+
}
|
1032 |
+
|
1033 |
+
.dark .main-title {
|
1034 |
+
color: #e2e8f0;
|
1035 |
+
}
|
1036 |
+
|
1037 |
+
.links-bar {
|
1038 |
+
display: flex;
|
1039 |
+
justify-content: center;
|
1040 |
+
gap: 15px;
|
1041 |
+
margin: 12px 0;
|
1042 |
+
}
|
1043 |
+
|
1044 |
+
.link-button {
|
1045 |
+
display: inline-flex;
|
1046 |
+
align-items: center;
|
1047 |
+
padding: 6px 12px;
|
1048 |
+
background-color: #007bff;
|
1049 |
+
color: white !important;
|
1050 |
+
text-decoration: none;
|
1051 |
+
border-radius: 5px;
|
1052 |
+
transition: background-color 0.3s;
|
1053 |
+
font-size: 0.95em;
|
1054 |
+
}
|
1055 |
+
|
1056 |
+
.link-button:hover {
|
1057 |
+
background-color: #0056b3;
|
1058 |
+
text-decoration: none;
|
1059 |
+
}
|
1060 |
+
|
1061 |
+
.features-limitations-container {
|
1062 |
+
display: flex;
|
1063 |
+
gap: 15px;
|
1064 |
+
margin: 20px 0;
|
1065 |
+
}
|
1066 |
+
|
1067 |
+
.capabilities-box, .limitations-box {
|
1068 |
+
flex: 1;
|
1069 |
+
padding: 18px;
|
1070 |
+
border-radius: 8px;
|
1071 |
+
margin-bottom: 15px;
|
1072 |
+
}
|
1073 |
+
|
1074 |
+
.capabilities-box {
|
1075 |
+
background: #f0f9ff;
|
1076 |
+
border-left: 5px solid #3498db;
|
1077 |
+
}
|
1078 |
+
|
1079 |
+
.dark .capabilities-box {
|
1080 |
+
background: #172a3a;
|
1081 |
+
border-left: 5px solid #3498db;
|
1082 |
+
}
|
1083 |
+
|
1084 |
+
.limitations-box {
|
1085 |
+
background: #f8f9fa;
|
1086 |
+
border-left: 5px solid #ffc107;
|
1087 |
+
}
|
1088 |
+
|
1089 |
+
.dark .limitations-box {
|
1090 |
+
background: #2d2a20;
|
1091 |
+
border-left: 5px solid #ffc107;
|
1092 |
+
}
|
1093 |
+
|
1094 |
+
.capabilities-text, .limitations-text {
|
1095 |
+
color: #495057;
|
1096 |
+
line-height: 1.6;
|
1097 |
+
}
|
1098 |
+
|
1099 |
+
.dark .capabilities-text, .dark .limitations-text {
|
1100 |
+
color: #cbd5e0;
|
1101 |
+
}
|
1102 |
+
|
1103 |
+
.capabilities-text h3 {
|
1104 |
+
color: #2980b9;
|
1105 |
+
margin-top: 0;
|
1106 |
+
margin-bottom: 15px;
|
1107 |
+
}
|
1108 |
+
|
1109 |
+
.dark .capabilities-text h3 {
|
1110 |
+
color: #63b3ed;
|
1111 |
+
}
|
1112 |
+
|
1113 |
+
.limitations-text h3 {
|
1114 |
+
color: #d39e00;
|
1115 |
+
margin-top: 0;
|
1116 |
+
margin-bottom: 15px;
|
1117 |
+
}
|
1118 |
+
|
1119 |
+
.dark .limitations-text h3 {
|
1120 |
+
color: #fbd38d;
|
1121 |
+
}
|
1122 |
+
|
1123 |
+
.capabilities-text blockquote, .limitations-text blockquote {
|
1124 |
+
margin: 20px 0 0 0;
|
1125 |
+
padding: 10px 20px;
|
1126 |
+
font-style: italic;
|
1127 |
+
}
|
1128 |
+
|
1129 |
+
.capabilities-text blockquote {
|
1130 |
+
border-left: 3px solid #3498db;
|
1131 |
+
background: rgba(52, 152, 219, 0.1);
|
1132 |
+
}
|
1133 |
+
|
1134 |
+
.dark .capabilities-text blockquote {
|
1135 |
+
background: rgba(52, 152, 219, 0.2);
|
1136 |
+
}
|
1137 |
+
|
1138 |
+
.limitations-text blockquote {
|
1139 |
+
border-left: 3px solid #ffc107;
|
1140 |
+
background: rgba(255, 193, 7, 0.1);
|
1141 |
+
}
|
1142 |
+
|
1143 |
+
.dark .limitations-text blockquote {
|
1144 |
+
background: rgba(255, 193, 7, 0.2);
|
1145 |
+
}
|
1146 |
+
|
1147 |
+
/* Optimize layout and spacing */
|
1148 |
+
.main-interface {
|
1149 |
+
display: flex;
|
1150 |
+
gap: 30px;
|
1151 |
+
margin-top: 20px;
|
1152 |
+
}
|
1153 |
+
|
1154 |
+
.input-column, .output-column {
|
1155 |
+
flex: 1;
|
1156 |
+
min-width: 0;
|
1157 |
+
display: flex;
|
1158 |
+
flex-direction: column;
|
1159 |
+
}
|
1160 |
+
|
1161 |
+
.output-panel {
|
1162 |
+
border: 1px solid #ddd;
|
1163 |
+
border-radius: 8px;
|
1164 |
+
padding: 20px;
|
1165 |
+
height: 100%;
|
1166 |
+
display: flex;
|
1167 |
+
flex-direction: column;
|
1168 |
+
overflow-y: auto;
|
1169 |
+
}
|
1170 |
+
|
1171 |
+
.dark .output-panel {
|
1172 |
+
border-color: #4a5568;
|
1173 |
+
}
|
1174 |
+
|
1175 |
+
.run-button-container {
|
1176 |
+
display: flex;
|
1177 |
+
justify-content: center;
|
1178 |
+
margin: 15px 0;
|
1179 |
+
}
|
1180 |
+
|
1181 |
+
.run-button {
|
1182 |
+
padding: 10px 30px;
|
1183 |
+
font-size: 1.1em;
|
1184 |
+
font-weight: bold;
|
1185 |
+
background: linear-gradient(to right, #3498db, #2980b9);
|
1186 |
+
border: none;
|
1187 |
+
border-radius: 5px;
|
1188 |
+
color: white;
|
1189 |
+
cursor: pointer;
|
1190 |
+
transition: all 0.3s;
|
1191 |
+
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
|
1192 |
+
}
|
1193 |
+
|
1194 |
+
.run-button:hover {
|
1195 |
+
background: linear-gradient(to right, #2980b9, #1a5276);
|
1196 |
+
box-shadow: 0 6px 8px rgba(0, 0, 0, 0.15);
|
1197 |
+
transform: translateY(-2px);
|
1198 |
+
}
|
1199 |
+
|
1200 |
+
.task-selector {
|
1201 |
+
background-color: #f8f9fa;
|
1202 |
+
padding: 12px;
|
1203 |
+
border-radius: 8px;
|
1204 |
+
margin-bottom: 15px;
|
1205 |
+
border: 1px solid #e9ecef;
|
1206 |
+
}
|
1207 |
+
|
1208 |
+
.dark .task-selector {
|
1209 |
+
background-color: #2d3748;
|
1210 |
+
border-color: #4a5568;
|
1211 |
+
}
|
1212 |
+
|
1213 |
+
/* Compact parameter settings */
|
1214 |
+
.compact-params .row {
|
1215 |
+
margin-bottom: 8px;
|
1216 |
+
}
|
1217 |
+
|
1218 |
+
.compact-params label {
|
1219 |
+
margin-bottom: 4px;
|
1220 |
+
}
|
1221 |
+
|
1222 |
+
/* More obvious advanced options */
|
1223 |
+
.advanced-options-header {
|
1224 |
+
background-color: #e9ecef;
|
1225 |
+
padding: 10px 15px;
|
1226 |
+
border-radius: 6px;
|
1227 |
+
margin-top: 10px;
|
1228 |
+
font-weight: bold;
|
1229 |
+
color: #495057;
|
1230 |
+
border-left: 4px solid #6c757d;
|
1231 |
+
cursor: pointer;
|
1232 |
+
transition: all 0.2s;
|
1233 |
+
}
|
1234 |
+
|
1235 |
+
.advanced-options-header:hover {
|
1236 |
+
background-color: #dee2e6;
|
1237 |
+
}
|
1238 |
+
|
1239 |
+
.dark .advanced-options-header {
|
1240 |
+
background-color: #2d3748;
|
1241 |
+
color: #e2e8f0;
|
1242 |
+
border-left: 4px solid #a0aec0;
|
1243 |
+
}
|
1244 |
+
|
1245 |
+
.dark .advanced-options-header:hover {
|
1246 |
+
background-color: #4a5568;
|
1247 |
+
}
|
1248 |
+
|
1249 |
+
/* Vertical arrangement of output section */
|
1250 |
+
.output-section {
|
1251 |
+
margin-bottom: 30px;
|
1252 |
+
border: 1px solid #e9ecef;
|
1253 |
+
border-radius: 8px;
|
1254 |
+
padding: 20px;
|
1255 |
+
}
|
1256 |
+
|
1257 |
+
.output-section-title {
|
1258 |
+
font-weight: bold;
|
1259 |
+
color: #495057;
|
1260 |
+
margin-bottom: 15px;
|
1261 |
+
font-size: 1.2em;
|
1262 |
+
}
|
1263 |
+
|
1264 |
+
.dark .output-section-title {
|
1265 |
+
color: #e2e8f0;
|
1266 |
+
}
|
1267 |
+
|
1268 |
+
.pointcloud-controls {
|
1269 |
+
display: flex;
|
1270 |
+
gap: 10px;
|
1271 |
+
margin-bottom: 10px;
|
1272 |
+
align-items: center;
|
1273 |
+
}
|
1274 |
+
|
1275 |
+
.note-box {
|
1276 |
+
background-color: #fff8e1 !important;
|
1277 |
+
border-left: 4px solid #ffc107 !important;
|
1278 |
+
padding: 12px !important;
|
1279 |
+
margin: 15px 0 !important;
|
1280 |
+
border-radius: 4px !important;
|
1281 |
+
color: #333 !important;
|
1282 |
+
}
|
1283 |
+
|
1284 |
+
.dark .note-box {
|
1285 |
+
background-color: rgba(255, 193, 7, 0.1) !important;
|
1286 |
+
color: #e0e0e0 !important;
|
1287 |
+
}
|
1288 |
+
|
1289 |
+
.note-box p, .note-box strong {
|
1290 |
+
color: inherit !important;
|
1291 |
+
}
|
1292 |
+
|
1293 |
+
/* Ensure warning class styles are correctly applied */
|
1294 |
+
.warning {
|
1295 |
+
color: #856404 !important;
|
1296 |
+
font-weight: bold !important;
|
1297 |
+
padding: 10px !important;
|
1298 |
+
background-color: #fff3cd !important;
|
1299 |
+
border-left: 4px solid #ffc107 !important;
|
1300 |
+
border-radius: 4px !important;
|
1301 |
+
margin: 10px 0 !important;
|
1302 |
+
}
|
1303 |
+
|
1304 |
+
.dark .warning {
|
1305 |
+
background-color: rgba(255, 193, 7, 0.1) !important;
|
1306 |
+
color: #fbd38d !important;
|
1307 |
+
}
|
1308 |
+
|
1309 |
+
.warning-box {
|
1310 |
+
background-color: #fff3cd;
|
1311 |
+
border-left: 4px solid #ffc107;
|
1312 |
+
padding: 12px;
|
1313 |
+
margin: 15px 0;
|
1314 |
+
border-radius: 4px;
|
1315 |
+
color: #856404;
|
1316 |
+
}
|
1317 |
+
|
1318 |
+
.dark .warning-box {
|
1319 |
+
background-color: rgba(255, 193, 7, 0.1);
|
1320 |
+
color: #fbd38d;
|
1321 |
+
}
|
1322 |
""",
|
1323 |
) as demo:
|
+    with gr.Column(elem_classes=["container"]):
+        with gr.Row(elem_classes=["header"]):
+            with gr.Column():
+                gr.Markdown(
+                    """
+                    # Aether: Geometric-Aware Unified World Modeling
+                    """,
+                    elem_classes=["main-title"]
+                )
+
+                gr.Markdown(
+                    """
+                    <div class="links-bar">
+                        🌐<a href="https://aether-world.github.io/" class="link-button" target="_blank"> Project Page</a>
+                        📄<a href="https://arxiv.org/abs/2503.18945" class="link-button" target="_blank"> Paper</a>
+                        💻<a href="https://github.com/OpenRobotLab/Aether" class="link-button" target="_blank"> Code</a>
+                        🤗<a href="https://huggingface.co/AetherWorldModel/AetherV1" class="link-button" target="_blank"> Model</a>
+                    </div>
+                    """,
+                )
 
+        with gr.Row(elem_classes=["features-limitations-container"]):
+            with gr.Column(elem_classes=["capabilities-box"]):
+                gr.Markdown(
+                    """
+                    ### 🚀 Key Capabilities
+
+                    Aether addresses a fundamental challenge in AI: integrating geometric reconstruction with generative modeling for human-like spatial reasoning. Our framework unifies three core capabilities:
+
+                    - 🌏 **4D Dynamic Reconstruction**: Reconstruct dynamic point clouds from videos by estimating depths and camera poses.
+
+                    - 🎬 **Action-Conditioned Prediction**: Predict future frames based on initial observations, with optional camera trajectory actions.
+
+                    - 🎯 **Goal-Conditioned Planning**: Generate planning paths from pairs of observation and goal images.
+
+                    > *Trained entirely on synthetic data, Aether achieves strong zero-shot generalization to real-world scenarios.*
+                    """,
+                    elem_classes=["capabilities-text"]
+                )
+
+            with gr.Column(elem_classes=["limitations-box"]):
+                gr.Markdown(
+                    """
+                    ### 📝 Current Limitations
+
+                    Aether represents an initial step in our journey, trained entirely on synthetic data. While it demonstrates promising capabilities, it is important to be aware of its current limitations:
+
+                    - 🔄 **Dynamic Scenarios**: Struggles with highly dynamic scenarios involving significant motion or dense crowds.
+
+                    - 📸 **Camera Stability**: Camera pose estimation can be less stable in certain conditions.
+
+                    - 📐 **Planning Range**: For visual planning tasks, we recommend keeping the observations and goals relatively close to ensure optimal performance.
+
+                    > *We are actively working on the next generation of Aether and are committed to addressing these limitations in future releases.*
+                    """,
+                    elem_classes=["limitations-text"]
+                )
 
+        with gr.Row(elem_classes=["main-interface"]):
+            with gr.Column(elem_classes=["input-column"]):
+                with gr.Group(elem_classes=["task-selector"]):
+                    task = gr.Radio(
+                        ["reconstruction", "prediction", "planning"],
+                        label="Select Task",
+                        value="reconstruction",
+                        info="Choose the task you want to perform",
+                    )
+                    gpu_time_warning = gr.Markdown(
+                        """
+                        <div class="warning-box">
+                        <strong>⚠️ Note:</strong> Due to HuggingFace Spaces ZERO GPU quota limitations (5 minutes max),
+                        prediction and planning tasks may not complete in time. We strongly recommend deploying
+                        our model locally for the full Aether experience.
+                        </div>
+                        """,
+                        visible=False
+                    )
 
+                with gr.Group(elem_classes=["input-section"]):
+                    # Input section - changes based on task
+                    gr.Markdown("## 📥 Input", elem_classes=["task-header"])
+
+                    # Task-specific inputs
+                    video_input = gr.Video(
+                        label="Upload Input Video",
+                        sources=["upload"],
+                        visible=True,
+                        interactive=True,
+                        elem_id="video_input",
+                    )
+
+                    image_input = gr.File(
+                        label="Upload Start Image",
+                        file_count="single",
+                        file_types=["image"],
+                        visible=False,
+                        interactive=True,
+                        elem_id="image_input",
+                    )
+
+                    goal_input = gr.File(
+                        label="Upload Goal Image",
+                        file_count="single",
+                        file_types=["image"],
+                        visible=False,
+                        interactive=True,
+                        elem_id="goal_input",
+                    )
+
+                    with gr.Row(visible=False) as preview_row:
+                        image_preview = gr.Image(
+                            label="Start Image Preview",
+                            elem_id="image_preview",
+                            visible=False,
+                        )
+                        goal_preview = gr.Image(
+                            label="Goal Image Preview",
+                            elem_id="goal_preview",
+                            visible=False,
+                        )
 
+                with gr.Group(elem_classes=["params-section", "compact-params"]):
+                    gr.Markdown("## ⚙️ Parameters", elem_classes=["task-header"])
+
+                    with gr.Row():
+                        with gr.Column(scale=1):
+                            height = gr.Dropdown(
+                                choices=[480],
+                                value=480,
+                                label="Height",
+                                info="Height of the output video",
+                            )
+
+                        with gr.Column(scale=1):
+                            width = gr.Dropdown(
+                                choices=[720],
+                                value=720,
+                                label="Width",
+                                info="Width of the output video",
+                            )
+
+                    with gr.Row():
+                        with gr.Column(scale=1):
+                            num_frames = gr.Dropdown(
+                                choices=[17, 25, 33, 41],
+                                value=41,
+                                label="Number of Frames",
+                                info="Number of frames to predict",
+                            )
+
+                        with gr.Column(scale=1):
+                            fps = gr.Dropdown(
+                                choices=[8, 10, 12, 15, 24],
+                                value=12,
+                                label="FPS",
+                                info="Frames per second",
+                            )
+
+                    with gr.Row():
+                        num_inference_steps = gr.Slider(
+                            minimum=1,
+                            maximum=60,
+                            value=4,
+                            step=1,
+                            label="Inference Steps",
+                            info="Number of inference step",
+                        )
+
+                        sliding_window_stride = gr.Slider(
+                            minimum=1,
+                            maximum=40,
+                            value=24,
+                            step=1,
+                            label="Sliding Window Stride",
+                            info="Sliding window stride (window size equals to num_frames). Only used for 'reconstruction' task",
+                            visible=True,
+                        )
+
+                    use_dynamic_cfg = gr.Checkbox(
+                        label="Use Dynamic CFG",
+                        value=True,
+                        info="Use dynamic CFG",
+                        visible=False,
+                    )
+
+                    raymap_option = gr.Radio(
+                        choices=["backward", "forward_right", "left_forward", "right"],
+                        label="Camera Movement Direction",
+                        value="forward_right",
+                        info="Direction of camera action. We offer 4 pre-defined actions for you to choose from.",
+                        visible=False,
+                    )
+
+                    post_reconstruction = gr.Checkbox(
+                        label="Post-Reconstruction",
+                        value=True,
+                        info="Run reconstruction after prediction for better quality",
+                        visible=False,
+                    )
+
+                    with gr.Accordion(
+                        "Advanced Options", open=False, visible=True, elem_classes=["advanced-options-header"]
+                    ) as advanced_options:
+                        with gr.Group(elem_classes=["advanced-section"]):
+                            with gr.Row():
+                                guidance_scale = gr.Slider(
+                                    minimum=1.0,
+                                    maximum=10.0,
+                                    value=1.0,
+                                    step=0.1,
+                                    label="Guidance Scale",
+                                    info="Guidance scale (only for prediction / planning)",
+                                )
+
+                            with gr.Row():
+                                seed = gr.Number(
+                                    value=42,
+                                    label="Random Seed",
+                                    info="Set a seed for reproducible results",
+                                    precision=0,
+                                    minimum=0,
+                                    maximum=2147483647,
+                                )
+
+                            with gr.Row():
+                                with gr.Column(scale=1):
+                                    smooth_camera = gr.Checkbox(
+                                        label="Smooth Camera",
+                                        value=True,
+                                        info="Apply smoothing to camera trajectory",
+                                    )
+
+                                with gr.Column(scale=1):
+                                    align_pointmaps = gr.Checkbox(
+                                        label="Align Point Maps",
+                                        value=False,
+                                        info="Align point maps across frames",
+                                    )
+
+                            with gr.Row():
+                                with gr.Column(scale=1):
+                                    max_depth = gr.Slider(
+                                        minimum=10,
+                                        maximum=200,
+                                        value=60,
+                                        step=10,
+                                        label="Max Depth",
+                                        info="Maximum depth for point cloud (higher = more distant points)",
+                                    )
+
+                                with gr.Column(scale=1):
+                                    rtol = gr.Slider(
+                                        minimum=0.01,
+                                        maximum=2.0,
+                                        value=0.2,
+                                        step=0.01,
+                                        label="Relative Tolerance",
+                                        info="Used for depth edge detection. Lower = remove more edges",
+                                    )
+
+                            pointcloud_save_frame_interval = gr.Slider(
+                                minimum=1,
+                                maximum=20,
+                                value=10,
+                                step=1,
+                                label="Point Cloud Frame Interval",
+                                info="Save point cloud every N frames (higher = fewer files but less complete representation)",
+                            )
+
+                with gr.Group(elem_classes=["run-button-container"]):
+                    run_button = gr.Button("Run Aether", variant="primary", elem_classes=["run-button"])
+
+            with gr.Column(elem_classes=["output-column"]):
+                with gr.Group(elem_classes=["output-panel"]):
+                    gr.Markdown("## 📤 Output", elem_classes=["task-header"])
+
+                    with gr.Group(elem_classes=["output-section"]):
+                        gr.Markdown("### RGB Video", elem_classes=["output-section-title"])
+                        rgb_output = gr.Video(
+                            label="RGB Output", interactive=False, elem_id="rgb_output"
+                        )
+
+                    with gr.Group(elem_classes=["output-section"]):
+                        gr.Markdown("### Depth Video", elem_classes=["output-section-title"])
+                        depth_output = gr.Video(
+                            label="Depth Output", interactive=False, elem_id="depth_output"
+                        )
+
+                    with gr.Group(elem_classes=["output-section"]):
+                        gr.Markdown("### Point Clouds", elem_classes=["output-section-title"])
+                        with gr.Row(elem_classes=["pointcloud-controls"]):
+                            pointcloud_frames = gr.Dropdown(
+                                label="Select Frame",
+                                choices=[],
+                                value=None,
+                                interactive=True,
+                                elem_id="pointcloud_frames",
+                            )
+                            pointcloud_download = gr.DownloadButton(
+                                label="Download Point Cloud",
+                                visible=False,
+                                elem_id="pointcloud_download",
+                            )
+
+                        model_output = gr.Model3D(
+                            label="Point Cloud Viewer", interactive=True, elem_id="model_output"
+                        )
+
+                        gr.Markdown(
+                            """
+                            > **Note:** 3D point clouds take a long time to visualize, and we show the keyframes only.
+                            > You can control the keyframe interval by modifying the `pointcloud_save_frame_interval`.
+                            """
+                        )
+
+                    with gr.Group(elem_classes=["output-section"]):
+                        gr.Markdown("### About Results", elem_classes=["output-section-title"])
+                        gr.Markdown(
+                            """
+                            #### Understanding the Outputs
+
+                            - **RGB Video**: Shows the predicted or reconstructed RGB frames
+                            - **Depth Video**: Visualizes the disparity maps in color (closer = red, further = blue)
+                            - **Point Clouds**: Interactive 3D point cloud with camera positions shown as colored pyramids
+                            """
+                        )
 
+        # Example Accordion
+        with gr.Accordion("Examples", open=False):
+            gr.Markdown(
+                """
+                ### Examples will be added soon
+                Check back for example inputs for each task type.
+                """
+            )
 
     # Event handlers
     task.change(
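The frame dropdown, download button, and 3D viewer above are connected by change handlers that live outside the hunks shown in this diff. A rough sketch of how such wiring can look (the handler body and the `outputs/` file naming are illustrative assumptions, not app.py's actual code):

```python
import os
import gradio as gr  # plus the components defined above (pointcloud_frames, ...)

def on_frame_selected(frame_name):
    # Illustrative: map the selected frame label to a GLB file saved earlier.
    glb_path = os.path.join("outputs", f"{frame_name}.glb")
    return glb_path, gr.update(value=glb_path, visible=True)

pointcloud_frames.change(
    on_frame_selected,
    inputs=[pointcloud_frames],
    outputs=[model_output, pointcloud_download],
)
```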
             raymap_option,
             post_reconstruction,
             guidance_scale,
+            gpu_time_warning,
         ],
     )
 
         outputs=[pointcloud_download],
     )
 
     # Load the model at startup
     demo.load(lambda: build_pipeline(torch.device("cpu")), inputs=None, outputs=None)
 
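Preloading on CPU at `demo.load` time keeps the short ZeroGPU window free for actual inference: the decorated task functions only have to move an already-built pipeline to CUDA. A sketch of a cache-then-move `build_pipeline`, assuming a hypothetical `load_aether_pipeline` loader in place of the app's real weight-loading code:

```python
from functools import lru_cache

import torch

@lru_cache(maxsize=1)
def _get_pipeline():
    # Hypothetical loader: download and build the model once, on CPU, at startup.
    return load_aether_pipeline()  # placeholder, not a real API

def build_pipeline(device: torch.device):
    # Cheap after the first call: reuse the cached pipeline and move it to `device`.
    return _get_pipeline().to(device)
```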