AmberHeart committed
Commit e88ccc8 · 1 Parent(s): 8040e22

space init version

Files changed (2):
  1. .gitignore +1 -0
  2. app.py +656 -247
.gitignore CHANGED
@@ -16,6 +16,7 @@ eggs/
 .eggs/
 lib/
 lib64/
+outputs/
 parts/
 sdist/
 var/
app.py CHANGED
@@ -416,12 +416,29 @@ def save_output_files(
     for frame_idx in frames_to_save:
         if frame_idx >= pointmap.shape[0]:
             continue
-
+
+        # Fix the point cloud rendering upside down and mirrored left-right:
+        # flip the Y and X axes of both the points and the camera poses.
+        flipped_pointmap = pointmap[frame_idx : frame_idx + 1].copy()
+        flipped_pointmap[..., 1] = -flipped_pointmap[..., 1]  # flip Y (up-down)
+        flipped_pointmap[..., 0] = -flipped_pointmap[..., 0]  # flip X (left-right)
+
+        # Flip the camera poses to match
+        flipped_poses = poses[frame_idx : frame_idx + 1].copy()
+        # negate the Y and X rows of the rotation block
+        flipped_poses[..., 1, :3] = -flipped_poses[..., 1, :3]
+        flipped_poses[..., 0, :3] = -flipped_poses[..., 0, :3]
+        # negate the Y and X columns (camera axes in the world frame)
+        flipped_poses[..., :3, 1] = -flipped_poses[..., :3, 1]
+        flipped_poses[..., :3, 0] = -flipped_poses[..., :3, 0]
+        # negate the Y and X components of the camera position
+        flipped_poses[..., 1, 3] = -flipped_poses[..., 1, 3]
+        flipped_poses[..., 0, 3] = -flipped_poses[..., 0, 3]
+
+        # Use the flipped point cloud and camera poses
         predictions = {
-            "world_points": pointmap[frame_idx : frame_idx + 1],
+            "world_points": flipped_pointmap,
             "images": rgb[frame_idx : frame_idx + 1],
             "depths": 1 / np.clip(disparity[frame_idx : frame_idx + 1], 1e-8, 1e8),
-            "camera_poses": poses[frame_idx : frame_idx + 1],
+            "camera_poses": flipped_poses,
         }
 
         glb_path = os.path.join(
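
A compact way to read the flipping logic above (a sketch, not part of the commit): the per-element negations are exactly a conjugation of each 4x4 camera-to-world pose by the reflection `S = diag(-1, -1, 1, 1)`, which keeps the rotation block a proper rotation (det = +1).

```python
# Sketch only: equivalent one-step form of the flips in the hunk above.
import numpy as np

S = np.diag([-1.0, -1.0, 1.0, 1.0])  # reflect X and Y

def flip_pose(pose):
    # S @ pose @ S negates the X/Y rows and columns of the rotation and the
    # X/Y translation components -- the same edits the hunk performs in place.
    return S @ pose @ S

def flip_points(points):
    # Negate the X and Y coordinates of an (..., 3) array of world points.
    return points * np.array([-1.0, -1.0, 1.0])
```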
@@ -571,7 +588,7 @@ def process_reconstruction(
     return None, None, []
 
 
-@spaces.GPU(duration=240)
+@spaces.GPU(duration=300)
 def process_prediction(
     image_file,
     height,
@@ -600,6 +617,11 @@ def process_prediction(
 
     # Set random seed
     seed_all(seed)
+
+    # Check if CUDA is available
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    if not torch.cuda.is_available():
+        raise ValueError("CUDA is not available. Check your environment.")
 
     # Build the pipeline
     pipeline = build_pipeline(device)
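
Note that the guard added here raises whenever CUDA is missing, so the `"cpu"` branch of the ternary is never actually used; a more direct equivalent (a sketch, not what the commit ships) would be:

```python
import torch

# Fail fast, then commit to the GPU unconditionally.
if not torch.cuda.is_available():
    raise ValueError("CUDA is not available. Check your environment.")
device = torch.device("cuda")
```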
@@ -698,7 +720,7 @@ def process_prediction(
     return None, None, []
 
 
-@spaces.GPU(duration=240)
+@spaces.GPU(duration=300)
 def process_planning(
     image_file,
     goal_file,
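
Both hunks raise the ZeroGPU allocation window from 240 to 300 seconds. For context, this is the standard pattern from the Hugging Face `spaces` package; `run_inference` below is a placeholder name, not a function from this app:

```python
import spaces

@spaces.GPU(duration=300)  # request a GPU allocation of up to 300 s per call
def run_inference(batch):
    ...  # CUDA work is only valid inside the decorated function
```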
@@ -851,6 +873,7 @@ def update_task_ui(task):
             gr.update(visible=False),  # raymap_option
             gr.update(visible=False),  # post_reconstruction
             gr.update(value=1.0),  # guidance_scale
+            gr.update(visible=False),  # gpu_time_warning
         )
     elif task == "prediction":
         return (
@@ -865,6 +888,7 @@ def update_task_ui(task):
             gr.update(visible=True),  # raymap_option
             gr.update(visible=True),  # post_reconstruction
             gr.update(value=3.0),  # guidance_scale
+            gr.update(visible=True),  # gpu_time_warning
         )
     elif task == "planning":
         return (
@@ -879,6 +903,7 @@ def update_task_ui(task):
             gr.update(visible=False),  # raymap_option
             gr.update(visible=True),  # post_reconstruction
             gr.update(value=3.0),  # guidance_scale
+            gr.update(visible=True),  # gpu_time_warning
         )
 
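Each branch gains one `gr.update`, which pairs positionally with the `gpu_time_warning` entry added to the `task.change` outputs list (see the `@@ -1256,6 +1673,7` hunk below). A minimal sketch of that contract, with only two of the outputs shown:

```python
import gradio as gr

def update_task_ui(task):
    # Return one gr.update per component, in the same order as outputs=[...].
    show_warning = task in ("prediction", "planning")
    return (
        gr.update(value=1.0 if task == "reconstruction" else 3.0),  # guidance_scale
        gr.update(visible=show_warning),  # gpu_time_warning
    )
```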
 
@@ -925,8 +950,17 @@ with gr.Blocks(
     min-height: 400px;
 }
 .warning {
-    color: #ff9800;
-    font-weight: bold;
+    color: #856404 !important;
+    font-weight: bold !important;
+    padding: 10px !important;
+    background-color: #fff3cd !important;
+    border-left: 4px solid #ffc107 !important;
+    border-radius: 4px !important;
+    margin: 10px 0 !important;
+}
+.dark .warning {
+    background-color: rgba(255, 193, 7, 0.1) !important;
+    color: #fbd38d !important;
 }
 .highlight {
     background-color: rgba(0, 123, 255, 0.1);
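
For readers unfamiliar with Gradio theming, this is how the CSS in these hunks reaches the UI (a minimal sketch, not the app's code): `gr.Blocks(css=...)` injects the stylesheet, and `elem_classes` attaches a class to a component.

```python
import gradio as gr

css = ".warning { color: #856404; background-color: #fff3cd; }"

with gr.Blocks(css=css) as demo:
    # This markdown block picks up the .warning styling defined above.
    gr.Markdown("⚠️ Long-running task ahead.", elem_classes=["warning"])
```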
@@ -936,9 +970,9 @@ with gr.Blocks(
     margin: 10px 0;
 }
 .task-header {
-    margin-top: 10px;
-    margin-bottom: 15px;
-    font-size: 1.2em;
+    margin-top: 15px;
+    margin-bottom: 20px;
+    font-size: 1.4em;
     font-weight: bold;
     color: #007bff;
 }
@@ -955,9 +989,9 @@ with gr.Blocks(
 }
 .input-section, .params-section, .advanced-section {
     border: 1px solid #ddd;
-    padding: 15px;
+    padding: 20px;
     border-radius: 8px;
-    margin-bottom: 15px;
+    margin-bottom: 20px;
 }
 .logo-container {
     display: flex;
@@ -968,277 +1002,660 @@ with gr.Blocks(
     max-width: 300px;
     height: auto;
 }
+
+/* Optimize layout and spacing */
+.container {
+    margin: 0 auto;
+    padding: 0 15px;
+    max-width: 1800px;
+}
+
+.header {
+    text-align: center;
+    margin-bottom: 20px;
+    padding: 15px;
+    background: linear-gradient(to right, #f8f9fa, #e9ecef);
+    border-radius: 10px;
+}
+
+.dark .header {
+    background: linear-gradient(to right, #2d3748, #1a202c);
+}
+
+.main-title {
+    font-size: 2.2em;
+    font-weight: bold;
+    margin: 0 auto;
+    color: #2c3e50;
+    max-width: 800px;
+}
+
+.dark .main-title {
+    color: #e2e8f0;
+}
+
+.links-bar {
+    display: flex;
+    justify-content: center;
+    gap: 15px;
+    margin: 12px 0;
+}
+
+.link-button {
+    display: inline-flex;
+    align-items: center;
+    padding: 6px 12px;
+    background-color: #007bff;
+    color: white !important;
+    text-decoration: none;
+    border-radius: 5px;
+    transition: background-color 0.3s;
+    font-size: 0.95em;
+}
+
+.link-button:hover {
+    background-color: #0056b3;
+    text-decoration: none;
+}
+
+.features-limitations-container {
+    display: flex;
+    gap: 15px;
+    margin: 20px 0;
+}
+
+.capabilities-box, .limitations-box {
+    flex: 1;
+    padding: 18px;
+    border-radius: 8px;
+    margin-bottom: 15px;
+}
+
+.capabilities-box {
+    background: #f0f9ff;
+    border-left: 5px solid #3498db;
+}
+
+.dark .capabilities-box {
+    background: #172a3a;
+    border-left: 5px solid #3498db;
+}
+
+.limitations-box {
+    background: #f8f9fa;
+    border-left: 5px solid #ffc107;
+}
+
+.dark .limitations-box {
+    background: #2d2a20;
+    border-left: 5px solid #ffc107;
+}
+
+.capabilities-text, .limitations-text {
+    color: #495057;
+    line-height: 1.6;
+}
+
+.dark .capabilities-text, .dark .limitations-text {
+    color: #cbd5e0;
+}
+
+.capabilities-text h3 {
+    color: #2980b9;
+    margin-top: 0;
+    margin-bottom: 15px;
+}
+
+.dark .capabilities-text h3 {
+    color: #63b3ed;
+}
+
+.limitations-text h3 {
+    color: #d39e00;
+    margin-top: 0;
+    margin-bottom: 15px;
+}
+
+.dark .limitations-text h3 {
+    color: #fbd38d;
+}
+
+.capabilities-text blockquote, .limitations-text blockquote {
+    margin: 20px 0 0 0;
+    padding: 10px 20px;
+    font-style: italic;
+}
+
+.capabilities-text blockquote {
+    border-left: 3px solid #3498db;
+    background: rgba(52, 152, 219, 0.1);
+}
+
+.dark .capabilities-text blockquote {
+    background: rgba(52, 152, 219, 0.2);
+}
+
+.limitations-text blockquote {
+    border-left: 3px solid #ffc107;
+    background: rgba(255, 193, 7, 0.1);
+}
+
+.dark .limitations-text blockquote {
+    background: rgba(255, 193, 7, 0.2);
+}
+
+/* Optimize layout and spacing */
+.main-interface {
+    display: flex;
+    gap: 30px;
+    margin-top: 20px;
+}
+
+.input-column, .output-column {
+    flex: 1;
+    min-width: 0;
+    display: flex;
+    flex-direction: column;
+}
+
+.output-panel {
+    border: 1px solid #ddd;
+    border-radius: 8px;
+    padding: 20px;
+    height: 100%;
+    display: flex;
+    flex-direction: column;
+    overflow-y: auto;
+}
+
+.dark .output-panel {
+    border-color: #4a5568;
+}
+
+.run-button-container {
+    display: flex;
+    justify-content: center;
+    margin: 15px 0;
+}
+
+.run-button {
+    padding: 10px 30px;
+    font-size: 1.1em;
+    font-weight: bold;
+    background: linear-gradient(to right, #3498db, #2980b9);
+    border: none;
+    border-radius: 5px;
+    color: white;
+    cursor: pointer;
+    transition: all 0.3s;
+    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+}
+
+.run-button:hover {
+    background: linear-gradient(to right, #2980b9, #1a5276);
+    box-shadow: 0 6px 8px rgba(0, 0, 0, 0.15);
+    transform: translateY(-2px);
+}
+
+.task-selector {
+    background-color: #f8f9fa;
+    padding: 12px;
+    border-radius: 8px;
+    margin-bottom: 15px;
+    border: 1px solid #e9ecef;
+}
+
+.dark .task-selector {
+    background-color: #2d3748;
+    border-color: #4a5568;
+}
+
+/* Compact parameter settings */
+.compact-params .row {
+    margin-bottom: 8px;
+}
+
+.compact-params label {
+    margin-bottom: 4px;
+}
+
+/* More obvious advanced options */
+.advanced-options-header {
+    background-color: #e9ecef;
+    padding: 10px 15px;
+    border-radius: 6px;
+    margin-top: 10px;
+    font-weight: bold;
+    color: #495057;
+    border-left: 4px solid #6c757d;
+    cursor: pointer;
+    transition: all 0.2s;
+}
+
+.advanced-options-header:hover {
+    background-color: #dee2e6;
+}
+
+.dark .advanced-options-header {
+    background-color: #2d3748;
+    color: #e2e8f0;
+    border-left: 4px solid #a0aec0;
+}
+
+.dark .advanced-options-header:hover {
+    background-color: #4a5568;
+}
+
+/* Vertical arrangement of output section */
+.output-section {
+    margin-bottom: 30px;
+    border: 1px solid #e9ecef;
+    border-radius: 8px;
+    padding: 20px;
+}
+
+.output-section-title {
+    font-weight: bold;
+    color: #495057;
+    margin-bottom: 15px;
+    font-size: 1.2em;
+}
+
+.dark .output-section-title {
+    color: #e2e8f0;
+}
+
+.pointcloud-controls {
+    display: flex;
+    gap: 10px;
+    margin-bottom: 10px;
+    align-items: center;
+}
+
+.note-box {
+    background-color: #fff8e1 !important;
+    border-left: 4px solid #ffc107 !important;
+    padding: 12px !important;
+    margin: 15px 0 !important;
+    border-radius: 4px !important;
+    color: #333 !important;
+}
+
+.dark .note-box {
+    background-color: rgba(255, 193, 7, 0.1) !important;
+    color: #e0e0e0 !important;
+}
+
+.note-box p, .note-box strong {
+    color: inherit !important;
+}
+
+/* Ensure warning class styles are correctly applied */
+.warning {
+    color: #856404 !important;
+    font-weight: bold !important;
+    padding: 10px !important;
+    background-color: #fff3cd !important;
+    border-left: 4px solid #ffc107 !important;
+    border-radius: 4px !important;
+    margin: 10px 0 !important;
+}
+
+.dark .warning {
+    background-color: rgba(255, 193, 7, 0.1) !important;
+    color: #fbd38d !important;
+}
+
+.warning-box {
+    background-color: #fff3cd;
+    border-left: 4px solid #ffc107;
+    padding: 12px;
+    margin: 15px 0;
+    border-radius: 4px;
+    color: #856404;
+}
+
+.dark .warning-box {
+    background-color: rgba(255, 193, 7, 0.1);
+    color: #fbd38d;
+}
 """,
 ) as demo:
-    with gr.Row(elem_classes=["logo-container"]):
-        gr.Image("assets/logo.png", show_label=False, elem_classes=["logo-image"])
-
-    gr.Markdown(
-        """
-        # Aether: Geometric-Aware Unified World Modeling
-
-        Aether addresses a fundamental challenge in AI: integrating geometric reconstruction with
-        generative modeling for human-like spatial reasoning. Our framework unifies three core capabilities:
-
-        1. **4D dynamic reconstruction** - Reconstruct dynamic point clouds from videos by estimating depths and camera poses.
-        2. **Action-Conditioned Video Prediction** - Predict future frames based on initial observation images, with optional conditions of camera trajectory actions.
-        3. **Goal-Conditioned Visual Planning** - Generate planning paths from pairs of observation and goal images.
-
-        Trained entirely on synthetic data, Aether achieves strong zero-shot generalization to real-world scenarios.
-        """
-    )
-
-    with gr.Row():
-        with gr.Column(scale=1):
-            task = gr.Radio(
-                ["reconstruction", "prediction", "planning"],
-                label="Select Task",
-                value="reconstruction",
-                info="Choose the task you want to perform",
-            )
-
-            with gr.Group(elem_classes=["input-section"]):
-                # Input section - changes based on task
-                gr.Markdown("## 📥 Input", elem_classes=["task-header"])
-
-                # Task-specific inputs
-                video_input = gr.Video(
-                    label="Upload Input Video",
-                    sources=["upload"],
-                    visible=True,
-                    interactive=True,
-                    elem_id="video_input",
-                )
-
-                image_input = gr.File(
-                    label="Upload Start Image",
-                    file_count="single",
-                    file_types=["image"],
-                    visible=False,
-                    interactive=True,
-                    elem_id="image_input",
-                )
-
-                goal_input = gr.File(
-                    label="Upload Goal Image",
-                    file_count="single",
-                    file_types=["image"],
-                    visible=False,
-                    interactive=True,
-                    elem_id="goal_input",
-                )
-
-                with gr.Row(visible=False) as preview_row:
-                    image_preview = gr.Image(
-                        label="Start Image Preview",
-                        elem_id="image_preview",
-                        visible=False,
-                    )
-                    goal_preview = gr.Image(
-                        label="Goal Image Preview",
-                        elem_id="goal_preview",
-                        visible=False,
-                    )
-
-            with gr.Group(elem_classes=["params-section"]):
-                gr.Markdown("## ⚙️ Parameters", elem_classes=["task-header"])
-
-                with gr.Row():
-                    with gr.Column(scale=1):
-                        height = gr.Dropdown(
-                            choices=[480],
-                            value=480,
-                            label="Height",
-                            info="Height of the output video",
-                        )
-
-                    with gr.Column(scale=1):
-                        width = gr.Dropdown(
-                            choices=[720],
-                            value=720,
-                            label="Width",
-                            info="Width of the output video",
-                        )
-
-                with gr.Row():
-                    with gr.Column(scale=1):
-                        num_frames = gr.Dropdown(
-                            choices=[17, 25, 33, 41],
-                            value=41,
-                            label="Number of Frames",
-                            info="Number of frames to predict",
-                        )
-
-                    with gr.Column(scale=1):
-                        fps = gr.Dropdown(
-                            choices=[8, 10, 12, 15, 24],
-                            value=12,
-                            label="FPS",
-                            info="Frames per second",
-                        )
-
-                with gr.Row():
-                    with gr.Column(scale=1):
-                        num_inference_steps = gr.Slider(
-                            minimum=1,
-                            maximum=60,
-                            value=4,
-                            step=1,
-                            label="Inference Steps",
-                            info="Number of inference step",
-                        )
-
-                sliding_window_stride = gr.Slider(
-                    minimum=1,
-                    maximum=40,
-                    value=24,
-                    step=1,
-                    label="Sliding Window Stride",
-                    info="Sliding window stride (window size equals to num_frames). Only used for 'reconstruction' task",
-                    visible=True,
-                )
-
-                use_dynamic_cfg = gr.Checkbox(
-                    label="Use Dynamic CFG",
-                    value=True,
-                    info="Use dynamic CFG",
-                    visible=False,
-                )
-
-                raymap_option = gr.Radio(
-                    choices=["backward", "forward_right", "left_forward", "right"],
-                    label="Camera Movement Direction",
-                    value="forward_right",
-                    info="Direction of camera action. We offer 4 pre-defined actions for you to choose from.",
-                    visible=False,
-                )
-
-                post_reconstruction = gr.Checkbox(
-                    label="Post-Reconstruction",
-                    value=True,
-                    info="Run reconstruction after prediction for better quality",
-                    visible=False,
-                )
-
-            with gr.Accordion(
-                "Advanced Options", open=False, visible=True
-            ) as advanced_options:
-                with gr.Group(elem_classes=["advanced-section"]):
-                    with gr.Row():
-                        with gr.Column(scale=1):
-                            guidance_scale = gr.Slider(
-                                minimum=1.0,
-                                maximum=10.0,
-                                value=1.0,
-                                step=0.1,
-                                label="Guidance Scale",
-                                info="Guidance scale (only for prediction / planning)",
-                            )
-
-                    with gr.Row():
-                        with gr.Column(scale=1):
-                            seed = gr.Number(
-                                value=42,
-                                label="Random Seed",
-                                info="Set a seed for reproducible results",
-                                precision=0,
-                                minimum=0,
-                                maximum=2147483647,
-                            )
-
-                    with gr.Row():
-                        with gr.Column(scale=1):
-                            smooth_camera = gr.Checkbox(
-                                label="Smooth Camera",
-                                value=True,
-                                info="Apply smoothing to camera trajectory",
-                            )
-
-                        with gr.Column(scale=1):
-                            align_pointmaps = gr.Checkbox(
-                                label="Align Point Maps",
-                                value=False,
-                                info="Align point maps across frames",
-                            )
-
-                    with gr.Row():
-                        with gr.Column(scale=1):
-                            max_depth = gr.Slider(
-                                minimum=10,
-                                maximum=200,
-                                value=60,
-                                step=10,
-                                label="Max Depth",
-                                info="Maximum depth for point cloud (higher = more distant points)",
-                            )
-
-                        with gr.Column(scale=1):
-                            rtol = gr.Slider(
-                                minimum=0.01,
-                                maximum=2.0,
-                                value=0.03,
-                                step=0.01,
-                                label="Relative Tolerance",
-                                info="Used for depth edge detection. Lower = remove more edges",
-                            )
-
-                    pointcloud_save_frame_interval = gr.Slider(
-                        minimum=1,
-                        maximum=20,
-                        value=10,
-                        step=1,
-                        label="Point Cloud Frame Interval",
-                        info="Save point cloud every N frames (higher = fewer files but less complete representation)",
-                    )
-
-            run_button = gr.Button("Run Aether", variant="primary")
-
-        with gr.Column(scale=1, elem_classes=["output-column"]):
-            with gr.Group():
-                gr.Markdown("## 📤 Output", elem_classes=["task-header"])
-
-                gr.Markdown("### RGB Video", elem_classes=["output-subtitle"])
-                rgb_output = gr.Video(
-                    label="RGB Output", interactive=False, elem_id="rgb_output"
-                )
-
-                gr.Markdown("### Depth Video", elem_classes=["output-subtitle"])
-                depth_output = gr.Video(
-                    label="Depth Output", interactive=False, elem_id="depth_output"
-                )
-
-                gr.Markdown("### Point Clouds", elem_classes=["output-subtitle"])
-                with gr.Row(elem_classes=["flex-display"]):
-                    pointcloud_frames = gr.Dropdown(
-                        label="Select Frame",
-                        choices=[],
-                        value=None,
-                        interactive=True,
-                        elem_id="pointcloud_frames",
-                    )
-                    pointcloud_download = gr.DownloadButton(
-                        label="Download Point Cloud",
-                        visible=False,
-                        elem_id="pointcloud_download",
-                    )
-
-                model_output = gr.Model3D(
-                    label="Point Cloud Viewer", interactive=True, elem_id="model_output"
-                )
-
-    with gr.Tab("About Results"):
-        gr.Markdown(
-            """
-            ### Understanding the Outputs
-
-            - **RGB Video**: Shows the predicted or reconstructed RGB frames
-            - **Depth Video**: Visualizes the disparity maps in color (closer = red, further = blue)
-            - **Point Clouds**: Interactive 3D point cloud with camera positions shown as colored pyramids
-
-            <p class="warning">Note: 3D point clouds take a long time to visualize, and we show the keyframes only.
-            You can control the keyframe interval by modifying the `pointcloud_save_frame_interval`.</p>
-            """
-        )
+    with gr.Column(elem_classes=["container"]):
+        with gr.Row(elem_classes=["header"]):
+            with gr.Column():
+                gr.Markdown(
+                    """
+                    # Aether: Geometric-Aware Unified World Modeling
+                    """,
+                    elem_classes=["main-title"]
+                )
+
+                gr.Markdown(
+                    """
+                    <div class="links-bar">
+                        🌐<a href="https://aether-world.github.io/" class="link-button" target="_blank"> Project Page</a>
+                        📄<a href="https://arxiv.org/abs/2503.18945" class="link-button" target="_blank"> Paper</a>
+                        💻<a href="https://github.com/OpenRobotLab/Aether" class="link-button" target="_blank"> Code</a>
+                        🤗<a href="https://huggingface.co/AetherWorldModel/AetherV1" class="link-button" target="_blank"> Model</a>
+                    </div>
+                    """,
+                )
+
+        with gr.Row(elem_classes=["features-limitations-container"]):
+            with gr.Column(elem_classes=["capabilities-box"]):
+                gr.Markdown(
+                    """
+                    ### 🚀 Key Capabilities
+
+                    Aether addresses a fundamental challenge in AI: integrating geometric reconstruction with generative modeling for human-like spatial reasoning. Our framework unifies three core capabilities:
+
+                    - 🌏 **4D Dynamic Reconstruction**: Reconstruct dynamic point clouds from videos by estimating depths and camera poses.
+
+                    - 🎬 **Action-Conditioned Prediction**: Predict future frames based on initial observations, with optional camera trajectory actions.
+
+                    - 🎯 **Goal-Conditioned Planning**: Generate planning paths from pairs of observation and goal images.
+
+                    > *Trained entirely on synthetic data, Aether achieves strong zero-shot generalization to real-world scenarios.*
+                    """,
+                    elem_classes=["capabilities-text"]
+                )
+
+            with gr.Column(elem_classes=["limitations-box"]):
+                gr.Markdown(
+                    """
+                    ### 📝 Current Limitations
+
+                    Aether represents an initial step in our journey, trained entirely on synthetic data. While it demonstrates promising capabilities, it is important to be aware of its current limitations:
+
+                    - 🔄 **Dynamic Scenarios**: Struggles with highly dynamic scenarios involving significant motion or dense crowds.
+
+                    - 📸 **Camera Stability**: Camera pose estimation can be less stable in certain conditions.
+
+                    - 📐 **Planning Range**: For visual planning tasks, we recommend keeping the observations and goals relatively close to ensure optimal performance.
+
+                    > *We are actively working on the next generation of Aether and are committed to addressing these limitations in future releases.*
+                    """,
+                    elem_classes=["limitations-text"]
+                )
+
+        with gr.Row(elem_classes=["main-interface"]):
+            with gr.Column(elem_classes=["input-column"]):
+                with gr.Group(elem_classes=["task-selector"]):
+                    task = gr.Radio(
+                        ["reconstruction", "prediction", "planning"],
+                        label="Select Task",
+                        value="reconstruction",
+                        info="Choose the task you want to perform",
+                    )
+                    gpu_time_warning = gr.Markdown(
+                        """
+                        <div class="warning-box">
+                        <strong>⚠️ Note:</strong> Due to HuggingFace Spaces ZERO GPU quota limitations (5 minutes max),
+                        prediction and planning tasks may not complete in time. We strongly recommend deploying
+                        our model locally for the full Aether experience.
+                        </div>
+                        """,
+                        visible=False
+                    )
+
+                with gr.Group(elem_classes=["input-section"]):
+                    # Input section - changes based on task
+                    gr.Markdown("## 📥 Input", elem_classes=["task-header"])
+
+                    # Task-specific inputs
+                    video_input = gr.Video(
+                        label="Upload Input Video",
+                        sources=["upload"],
+                        visible=True,
+                        interactive=True,
+                        elem_id="video_input",
+                    )
+
+                    image_input = gr.File(
+                        label="Upload Start Image",
+                        file_count="single",
+                        file_types=["image"],
+                        visible=False,
+                        interactive=True,
+                        elem_id="image_input",
+                    )
+
+                    goal_input = gr.File(
+                        label="Upload Goal Image",
+                        file_count="single",
+                        file_types=["image"],
+                        visible=False,
+                        interactive=True,
+                        elem_id="goal_input",
+                    )
+
+                    with gr.Row(visible=False) as preview_row:
+                        image_preview = gr.Image(
+                            label="Start Image Preview",
+                            elem_id="image_preview",
+                            visible=False,
+                        )
+                        goal_preview = gr.Image(
+                            label="Goal Image Preview",
+                            elem_id="goal_preview",
+                            visible=False,
+                        )
+
+                with gr.Group(elem_classes=["params-section", "compact-params"]):
+                    gr.Markdown("## ⚙️ Parameters", elem_classes=["task-header"])
+
+                    with gr.Row():
+                        with gr.Column(scale=1):
+                            height = gr.Dropdown(
+                                choices=[480],
+                                value=480,
+                                label="Height",
+                                info="Height of the output video",
+                            )
+
+                        with gr.Column(scale=1):
+                            width = gr.Dropdown(
+                                choices=[720],
+                                value=720,
+                                label="Width",
+                                info="Width of the output video",
+                            )
+
+                    with gr.Row():
+                        with gr.Column(scale=1):
+                            num_frames = gr.Dropdown(
+                                choices=[17, 25, 33, 41],
+                                value=41,
+                                label="Number of Frames",
+                                info="Number of frames to predict",
+                            )
+
+                        with gr.Column(scale=1):
+                            fps = gr.Dropdown(
+                                choices=[8, 10, 12, 15, 24],
+                                value=12,
+                                label="FPS",
+                                info="Frames per second",
+                            )
+
+                    with gr.Row():
+                        num_inference_steps = gr.Slider(
+                            minimum=1,
+                            maximum=60,
+                            value=4,
+                            step=1,
+                            label="Inference Steps",
+                            info="Number of inference step",
+                        )
+
+                    sliding_window_stride = gr.Slider(
+                        minimum=1,
+                        maximum=40,
+                        value=24,
+                        step=1,
+                        label="Sliding Window Stride",
+                        info="Sliding window stride (window size equals to num_frames). Only used for 'reconstruction' task",
+                        visible=True,
+                    )
+
+                    use_dynamic_cfg = gr.Checkbox(
+                        label="Use Dynamic CFG",
+                        value=True,
+                        info="Use dynamic CFG",
+                        visible=False,
+                    )
+
+                    raymap_option = gr.Radio(
+                        choices=["backward", "forward_right", "left_forward", "right"],
+                        label="Camera Movement Direction",
+                        value="forward_right",
+                        info="Direction of camera action. We offer 4 pre-defined actions for you to choose from.",
+                        visible=False,
+                    )
+
+                    post_reconstruction = gr.Checkbox(
+                        label="Post-Reconstruction",
+                        value=True,
+                        info="Run reconstruction after prediction for better quality",
+                        visible=False,
+                    )
+
+                with gr.Accordion(
+                    "Advanced Options", open=False, visible=True, elem_classes=["advanced-options-header"]
+                ) as advanced_options:
+                    with gr.Group(elem_classes=["advanced-section"]):
+                        with gr.Row():
+                            guidance_scale = gr.Slider(
+                                minimum=1.0,
+                                maximum=10.0,
+                                value=1.0,
+                                step=0.1,
+                                label="Guidance Scale",
+                                info="Guidance scale (only for prediction / planning)",
+                            )
+
+                        with gr.Row():
+                            seed = gr.Number(
+                                value=42,
+                                label="Random Seed",
+                                info="Set a seed for reproducible results",
+                                precision=0,
+                                minimum=0,
+                                maximum=2147483647,
+                            )
+
+                        with gr.Row():
+                            with gr.Column(scale=1):
+                                smooth_camera = gr.Checkbox(
+                                    label="Smooth Camera",
+                                    value=True,
+                                    info="Apply smoothing to camera trajectory",
+                                )
+
+                            with gr.Column(scale=1):
+                                align_pointmaps = gr.Checkbox(
+                                    label="Align Point Maps",
+                                    value=False,
+                                    info="Align point maps across frames",
+                                )
+
+                        with gr.Row():
+                            with gr.Column(scale=1):
+                                max_depth = gr.Slider(
+                                    minimum=10,
+                                    maximum=200,
+                                    value=60,
+                                    step=10,
+                                    label="Max Depth",
+                                    info="Maximum depth for point cloud (higher = more distant points)",
+                                )
+
+                            with gr.Column(scale=1):
+                                rtol = gr.Slider(
+                                    minimum=0.01,
+                                    maximum=2.0,
+                                    value=0.2,
+                                    step=0.01,
+                                    label="Relative Tolerance",
+                                    info="Used for depth edge detection. Lower = remove more edges",
+                                )
+
+                        pointcloud_save_frame_interval = gr.Slider(
+                            minimum=1,
+                            maximum=20,
+                            value=10,
+                            step=1,
+                            label="Point Cloud Frame Interval",
+                            info="Save point cloud every N frames (higher = fewer files but less complete representation)",
+                        )
+
+                with gr.Group(elem_classes=["run-button-container"]):
+                    run_button = gr.Button("Run Aether", variant="primary", elem_classes=["run-button"])
+
+            with gr.Column(elem_classes=["output-column"]):
+                with gr.Group(elem_classes=["output-panel"]):
+                    gr.Markdown("## 📤 Output", elem_classes=["task-header"])
+
+                    with gr.Group(elem_classes=["output-section"]):
+                        gr.Markdown("### RGB Video", elem_classes=["output-section-title"])
+                        rgb_output = gr.Video(
+                            label="RGB Output", interactive=False, elem_id="rgb_output"
+                        )
+
+                    with gr.Group(elem_classes=["output-section"]):
+                        gr.Markdown("### Depth Video", elem_classes=["output-section-title"])
+                        depth_output = gr.Video(
+                            label="Depth Output", interactive=False, elem_id="depth_output"
+                        )
+
+                    with gr.Group(elem_classes=["output-section"]):
+                        gr.Markdown("### Point Clouds", elem_classes=["output-section-title"])
+                        with gr.Row(elem_classes=["pointcloud-controls"]):
+                            pointcloud_frames = gr.Dropdown(
+                                label="Select Frame",
+                                choices=[],
+                                value=None,
+                                interactive=True,
+                                elem_id="pointcloud_frames",
+                            )
+                            pointcloud_download = gr.DownloadButton(
+                                label="Download Point Cloud",
+                                visible=False,
+                                elem_id="pointcloud_download",
+                            )
+
+                        model_output = gr.Model3D(
+                            label="Point Cloud Viewer", interactive=True, elem_id="model_output"
+                        )
+
+                        gr.Markdown(
+                            """
+                            > **Note:** 3D point clouds take a long time to visualize, and we show the keyframes only.
+                            > You can control the keyframe interval by modifying the `pointcloud_save_frame_interval`.
+                            """
+                        )
+
+                    with gr.Group(elem_classes=["output-section"]):
+                        gr.Markdown("### About Results", elem_classes=["output-section-title"])
+                        gr.Markdown(
+                            """
+                            #### Understanding the Outputs
+
+                            - **RGB Video**: Shows the predicted or reconstructed RGB frames
+                            - **Depth Video**: Visualizes the disparity maps in color (closer = red, further = blue)
+                            - **Point Clouds**: Interactive 3D point cloud with camera positions shown as colored pyramids
+                            """
+                        )
+
+    # Example Accordion
+    with gr.Accordion("Examples", open=False):
+        gr.Markdown(
+            """
+            ### Examples will be added soon
+            Check back for example inputs for each task type.
+            """
+        )
 
     # Event handlers
     task.change(
@@ -1256,6 +1673,7 @@ with gr.Blocks(
             raymap_option,
             post_reconstruction,
             guidance_scale,
+            gpu_time_warning,
         ],
     )
 
@@ -1486,15 +1904,6 @@ with gr.Blocks(
         outputs=[pointcloud_download],
     )
 
-    # Example Accordion
-    with gr.Accordion("Examples"):
-        gr.Markdown(
-            """
-            ### Examples will be added soon
-            Check back for example inputs for each task type.
-            """
-        )
-
     # Load the model at startup
     demo.load(lambda: build_pipeline(torch.device("cpu")), inputs=None, outputs=None)
 
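On the final context lines: `demo.load` runs its function on every page load, so the pipeline is pre-built on CPU before any `@spaces.GPU` handler fires. A minimal sketch of the pattern, assuming a cached `build_pipeline` factory like the app's:

```python
import gradio as gr
import torch

def build_pipeline(device):  # stand-in for the app's (cached) factory
    return ...

with gr.Blocks() as demo:
    gr.Markdown("Aether demo")

# Warm the pipeline on CPU at page load; GPU handlers later reuse the cache.
demo.load(lambda: build_pipeline(torch.device("cpu")), inputs=None, outputs=None)
```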