Spaces:

Mayuri
/

S2O_DPM

Sleeping

App Files Files Community

Mayuri committed on Jan 24

Commit

2a5630b

verified ·

1 Parent(s): c566753

Upload 10 files

Browse files

Files changed (10) hide show

lcm.py +117 -0
main_v3.py +140 -0
models.py +402 -0
models/model.safetensors +3 -0
models/model_org.safetensors +3 -0
sar_1.png +0 -0
sar_2.png +0 -0
sar_3.png +0 -0
sar_4.png +0 -0
utils.py +347 -0

lcm.py ADDED Viewed

	@@ -0,0 +1,117 @@

+# 首先，确保安装了必要的库
+# 你可以使用以下命令安装：
+# pip install gradio diffusers transformers torch
+import gradio as gr
+from diffusers import StableDiffusionPipeline
+import torch
+from PIL import Image
+import requests
+from io import BytesIO
+# Diffusion models selectable in the UI: display name -> Hugging Face model id.
+AVAILABLE_MODELS = {
+    "Stable Diffusion v1.4": "CompVis/stable-diffusion-v1-4",
+    "Stable Diffusion v1.5": "runwayml/stable-diffusion-v1-5",
+    "Stable Diffusion 2.1": "stabilityai/stable-diffusion-2-1",
+    # Add more models here as needed.
+}
+# URLs of the sample images, keyed by the label shown in the UI
+# (the labels are runtime strings and are also used as radio choices).
+SAMPLE_IMAGES = {
+    "风景": "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/samples/landscape.jpg",
+    "人像": "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/samples/portrait.jpg",
+    "动物": "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/samples/animal.jpg",
+}
+# Cache of already-loaded pipelines (display name -> pipeline) so that a model
+# is not loaded more than once.
+model_cache = {}
+def load_model(model_name):
+    if model_name in model_cache:
+        return model_cache[model_name]
+    else:
+        model_id = AVAILABLE_MODELS[model_name]
+        pipe = StableDiffusionPipeline.from_pretrained(
+            model_id,
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+        )
+        pipe = pipe.to("cuda") if torch.cuda.is_available() else pipe.to("cpu")
+        model_cache[model_name] = pipe
+        return pipe
+def process_image(model_name, input_image, sample_choice):
+    # 如果用户选择使用示例图片，则下载示例图片
+    if sample_choice != "上传图片":
+        url = SAMPLE_IMAGES.get(sample_choice, SAMPLE_IMAGES["风景"])
+        response = requests.get(url)
+        input_image = Image.open(BytesIO(response.content)).convert("RGB")
+    # 加载所选模型
+    pipe = load_model(model_name)
+    # 生成图像（这里以文本提示为例，可以根据实际模型功能调整）
+    prompt = "A transformed version of the input image."
+    with torch.autocast("cuda" if torch.cuda.is_available() else "cpu"):
+        generated_image = pipe(prompt=prompt, init_image=input_image, strength=0.8).images[0]
+    return input_image, generated_image
+# 定义 Gradio 接口
+def main():
+    with gr.Blocks() as demo:
+        gr.Markdown("# Diffusers 扩散模型展示页面")
+        gr.Markdown("选择一个模型，上传一张图片或选择一个示例图片，然后点击转换按钮查看结果。")
+        with gr.Row():
+            model_dropdown = gr.Dropdown(
+                choices=list(AVAILABLE_MODELS.keys()),
+                value=list(AVAILABLE_MODELS.keys())[0],
+                label="选择模型"
+            )
+        with gr.Row():
+            sample_radio = gr.Radio(
+                choices=["上传图片"] + list(SAMPLE_IMAGES.keys()),
+                value="上传图片",
+                label="选择图片来源"
+            )
+        with gr.Row():
+            input_image = gr.Image(
+                type="pil",
+                label="上传图片",
+                visible=False
+            )
+            sample_image = gr.Image(
+                type="pil",
+                label="示例图片",
+                visible=False
+            )
+        # 根据用户选择显示上传或示例图片
+        def toggle_image(choice):
+            return {
+                "input_image": gr.update(visible=(choice == "上传图片")),
+                "sample_image": gr.update(visible=(choice != "上传图片"))
+            }
+        sample_radio.change(toggle_image, inputs=sample_radio, outputs=[input_image, sample_image])
+        convert_button = gr.Button("转换")
+        with gr.Row():
+            original_output = gr.Image(label="原图")
+            generated_output = gr.Image(label="生成图")
+        convert_button.click(
+            process_image,
+            inputs=[model_dropdown, input_image, sample_radio],
+            outputs=[original_output, generated_output]
+        )
+    demo.launch(server_port=16006)
+if __name__ == "__main__":
+    main()

main_v3.py ADDED Viewed

	@@ -0,0 +1,140 @@

+import gradio as gr
+import argparse
+import os
+import pandas as pd
+from PIL import Image
+import numpy as np
+import torch as th
+from torchvision import transforms
+import diffusers
+from diffusers import AutoencoderKL, DDPMScheduler, DDIMScheduler, LCMScheduler
+import gc
+from safetensors import safe_open
+from models import SAR2OptUNetv3
+from utils import update_args_from_yaml, safe_load
+# Preprocessing applied to the SAR condition image: convert to a tensor,
+# resize to 256x256, then normalize with mean 0.5 and std 0.5.
+transform_sar = transforms.Compose([
+    transforms.ToTensor(),
+    transforms.Resize((256, 256)),
+    transforms.Normalize((0.5), (0.5)),
+])
+# Display name shown in the UI -> path of the UNet checkpoint (safetensors).
+AVAILABLE_MODELS = {
+    "Sen12:LCM-Model": "models/model.safetensors",
+    "Sen12:Org-Model": "models/model_org.safetensors",
+}
+# Use the first CUDA device when one is available, otherwise the CPU.
+device = th.device('cuda:0' if th.cuda.is_available() else 'cpu')
+def safe_load(model_path):
+    assert "safetensors" in model_path
+    state_dict = {}
+    with safe_open(model_path, framework="pt", device="cpu") as f:
+        for k in f.keys():
+            state_dict[k] = f.get_tensor(k)
+    return state_dict
+# Build the UNet once at import time. It takes 4 input channels and predicts
+# 3; `predict` feeds it the 3-channel latent concatenated with the 1-channel
+# SAR condition.
+unet_model = SAR2OptUNetv3(
+            sample_size=256,
+            in_channels=4,
+            out_channels=3,
+            layers_per_block=2,
+            block_out_channels=(128, 128, 256, 256, 512, 512),
+            down_block_types=(
+                "DownBlock2D",
+                "DownBlock2D",
+                "DownBlock2D",
+                "DownBlock2D",
+                "AttnDownBlock2D",
+                "DownBlock2D",
+            ),
+            up_block_types=(
+                "UpBlock2D",
+                "AttnUpBlock2D",
+                "UpBlock2D",
+                "UpBlock2D",
+                "UpBlock2D",
+                "UpBlock2D",
+            ),
+)
+# NOTE(review): no checkpoint has been loaded at this point (weights are loaded
+# inside `predict` on every call), so this message is misleading.
+print('load unet safetensos done!')
+# Scheduler shared by every request; its timesteps are reset in `predict`.
+lcm_scheduler = LCMScheduler(num_train_timesteps=1000)
+unet_model.to(device)
+unet_model.eval()
+# Not referenced anywhere in the visible part of this file.
+model_kwargs = {}
+def predict(condition, nums_step, model_name):
+    unet_checkpoint = AVAILABLE_MODELS[model_name]
+    unet_model.load_state_dict(safe_load(unet_checkpoint), strict=True)
+    unet_model.eval().to(device)
+    with th.no_grad():
+        lcm_scheduler.set_timesteps(nums_step, device=device)
+        timesteps = lcm_scheduler.timesteps
+        pred_latent = th.randn(size=[1, 3, 256, 256], device=device)
+        condition = condition.convert("L")
+        condition = transform_sar(condition)
+        condition = th.unsqueeze(condition, 0)
+        condition = condition.to(device)
+        for timestep in timesteps:
+            latent_to_pred = th.cat((pred_latent, condition), dim=1)
+            model_pred = unet_model(latent_to_pred, timestep)
+            pred_latent, denoised = lcm_scheduler.step(
+                                                    model_output=model_pred,
+                                                    timestep=timestep,
+                                                    sample=pred_latent,
+                                                    return_dict=False)
+        sample = denoised.cpu()
+    sample = ((sample + 1) * 127.5).clamp(0, 255).to(th.uint8)
+    sample = sample.permute(0, 2, 3, 1)
+    sample = sample.contiguous()
+    sample = sample.cpu().numpy()
+    sample = sample.squeeze(0)
+    sample = Image.fromarray(sample)
+    return sample
+# Gradio UI: the three inputs map positionally onto
+# predict(condition, nums_step, model_name).
+demo = gr.Interface(
+    fn=predict,
+    inputs=[gr.Image(type="pil"),
+            gr.Slider(1, 1000),
+            gr.Dropdown(
+                choices=list(AVAILABLE_MODELS.keys()),
+                value=list(AVAILABLE_MODELS.keys())[0],
+                label="Choose the Model"),],
+            # gr.Radio(["Sent", "GF3"], label="Model", info="Which model to you want to use?"), ],
+    outputs=gr.Image(type="pil"),
+    # Each example row is [image path, number of steps, model name]; the images
+    # are resolved relative to this file.
+    examples=[
+        [os.path.join(os.path.dirname(__file__), "sar_1.png"), 8, "Sen12:LCM-Model"],
+        [os.path.join(os.path.dirname(__file__), "sar_2.png"), 16, "Sen12:LCM-Model"],
+        [os.path.join(os.path.dirname(__file__), "sar_3.png"), 500, "Sen12:Org-Model"],
+        [os.path.join(os.path.dirname(__file__), "sar_4.png"), 1000, "Sen12:Org-Model"],
+    ],
+    title="SAR to Optical Image🚀",
+    # NOTE(review): the description names the step input `timestep_respacing`,
+    # while the `predict` parameter is `nums_step`.
+    description="""
+        # 🎯 Instruction
+        This is a project that converts SAR images into optical images, based on conditional diffusion.
+        Input a SAR image, and its corresponding optical image will be obtained.
+        ## 📢 Inputs
+        - `condition`: the SAR image that you want to transfer.
+        - `timestep_respacing`: the number of iteration steps when inference.
+        ## 🎉 Outputs
+        - The corresponding optical image.
+        **Paper** : [Guided Diffusion for Image Generation](https://arxiv.org/abs/2105.05233)
+        **Github** : https://github.com/Coordi777/Conditional_SAR2OPT
+    """
+)
+# Serve the demo on port 16006 when the file is run as a script.
+if __name__ == "__main__":
+    demo.launch(server_port=16006)

models.py ADDED Viewed

	@@ -0,0 +1,402 @@

+from diffusers import StableDiffusionPipeline
+from diffusers import AutoencoderKL, UNet2DConditionModel, UNet2DModel
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import os
+import json
+class SAR2OptUNet(UNet2DConditionModel):
+    """UNet2DConditionModel subclass with a custom forward pass.
+
+    The forward pass returns the output tensor directly and always passes
+    ``attention_mask=None`` / ``encoder_attention_mask=None`` to the blocks.
+    """
+    # All arguments are required; none of them has a default value.
+    def forward(self, sample, timestep, encoder_hidden_states, timestep_cond, cross_attention_kwargs,
+                added_cond_kwargs):
+        # When a spatial size is not a multiple of the overall upsampling
+        # factor, the target size is forwarded explicitly to the up blocks.
+        default_overall_up_factor = 2 ** self.num_upsamplers
+        forward_upsample_size = False
+        upsample_size = None
+        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
+            forward_upsample_size = True
+        # 1. time: turn `timestep` into a 1-D tensor with one entry per sample
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+        t_emb = self.time_proj(timesteps)
+        t_emb = t_emb.to(dtype=sample.dtype)
+        emb = self.time_embedding(t_emb, timestep_cond)
+        aug_emb = None
+        if added_cond_kwargs is not None:
+            # NOTE(review): the branch is selected by the presence of a 'sar'
+            # key, but the value that is read is `image_embeds`; when 'sar' is
+            # present without `image_embeds`, `add_embedding` receives None.
+            if 'sar' in added_cond_kwargs:
+                image_embs = added_cond_kwargs.get("image_embeds")
+                aug_emb = self.add_embedding(image_embs)
+            else:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
+        emb = emb + aug_emb if aug_emb is not None else emb
+        if self.time_embed_act is not None:
+            emb = self.time_embed_act(emb)
+        # 2. pre-process
+        sample = self.conv_in(sample)
+        # 3. down
+        # Residuals are collected in order and consumed from the end by the up blocks.
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=None,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=None,
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+            down_block_res_samples += res_samples
+        # 4. mid
+        if self.mid_block is not None:
+            sample = self.mid_block(
+                sample,
+                emb,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=None,
+                cross_attention_kwargs=cross_attention_kwargs,
+                encoder_attention_mask=None,
+            )
+        # 5. up
+        for i, upsample_block in enumerate(self.up_blocks):
+            is_final_block = i == len(self.up_blocks) - 1
+            # Pop as many residuals as this block has resnets.
+            res_samples = down_block_res_samples[-len(upsample_block.resnets):]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block and forward_upsample_size:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    upsample_size=upsample_size,
+                    attention_mask=None,
+                    encoder_attention_mask=None,
+                )
+            else:
+                sample = upsample_block(
+                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
+                )
+        # 6. post-process
+        if self.conv_norm_out:
+            sample = self.conv_norm_out(sample)
+            sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+        # The raw tensor is returned (no output wrapper object).
+        return sample
+class SAREncoder(nn.Module):
+    def __init__(self,in_channels,ngf=50):
+        super(SAREncoder, self).__init__()
+        self.ngf = ngf
+        self.encoder = nn.Sequential(
+            # Encoder 1
+            nn.Conv2d(in_channels=in_channels, out_channels=self.ngf, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(self.ngf),
+            nn.LeakyReLU(0.2, inplace=True),
+            # Encoder 2
+            nn.Conv2d(in_channels=self.ngf, out_channels=self.ngf * 2, kernel_size=3, stride=2, padding=1),# half
+            nn.BatchNorm2d(self.ngf * 2),
+            nn.LeakyReLU(0.2, inplace=True),
+            # Encoder 3
+            nn.Conv2d(in_channels=self.ngf * 2, out_channels=self.ngf * 4, kernel_size=3, stride=2, padding=1),# half
+            nn.BatchNorm2d(self.ngf * 4),
+            nn.LeakyReLU(0.2, inplace=True),
+            # Encoder 4
+            nn.Conv2d(in_channels=self.ngf * 4, out_channels=self.ngf * 5, kernel_size=3, stride=2, padding=1),# half
+            nn.BatchNorm2d(self.ngf * 5),
+            nn.LeakyReLU(0.2, inplace=True),
+        )
+    def forward(self, x):
+        bz = x.shape[0]
+        out = self.encoder(x).reshape(bz, -1, 1280)
+        return out
+class SAR2OptUNetv2(UNet2DConditionModel):
+    """UNet2DConditionModel with a built-in convolutional SAR encoder.
+
+    When ``encoder_hidden_states`` is not supplied, the SAR image is encoded by
+    ``self.sar_encoder`` and reshaped to ``(batch, -1, 1280)`` to serve as the
+    cross-attention context.
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args,**kwargs)
+        # The SAR encoder takes a single-channel image and uses a base width
+        # of 2 channels (so its last stage outputs 10 channels).
+        in_channels = 1
+        self.ngf = 2
+        self.sar_encoder = nn.Sequential(
+            # Encoder 1
+            nn.Conv2d(in_channels=in_channels, out_channels=self.ngf, kernel_size=3, stride=1, padding=1),
+            nn.BatchNorm2d(self.ngf),
+            nn.LeakyReLU(0.2, inplace=True),
+            # Encoder 2
+            nn.Conv2d(in_channels=self.ngf, out_channels=self.ngf * 2, kernel_size=3, stride=2, padding=1),# half
+            nn.BatchNorm2d(self.ngf * 2),
+            nn.LeakyReLU(0.2, inplace=True),
+            # Encoder 3
+            nn.Conv2d(in_channels=self.ngf * 2, out_channels=self.ngf * 4, kernel_size=3, stride=2, padding=1),# half
+            nn.BatchNorm2d(self.ngf * 4),
+            nn.LeakyReLU(0.2, inplace=True),
+            # Encoder 4
+            nn.Conv2d(in_channels=self.ngf * 4, out_channels=self.ngf * 5, kernel_size=3, stride=2, padding=1),# half
+            nn.BatchNorm2d(self.ngf * 5),
+            nn.LeakyReLU(0.2, inplace=True),
+        )
+    def forward(self, sample, timestep, sar_image=None,
+                encoder_hidden_states=None,
+                timestep_cond=None, cross_attention_kwargs=None,
+                added_cond_kwargs=None):
+        """Run the UNet; returns the output tensor directly.
+
+        Either ``encoder_hidden_states`` or ``sar_image`` must be given; the
+        latter is only used when the former is None.
+        """
+        if encoder_hidden_states is None:
+            assert sar_image is not None
+            bz = sample.shape[0]
+            # The flattened encoder output per sample must be a multiple of 1280.
+            encoder_hidden_states = self.sar_encoder(sar_image).reshape(bz, -1, 1280)
+        # When a spatial size is not a multiple of the overall upsampling
+        # factor, the target size is forwarded explicitly to the up blocks.
+        default_overall_up_factor = 2 ** self.num_upsamplers
+        forward_upsample_size = False
+        upsample_size = None
+        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
+            forward_upsample_size = True
+        # 1. time: turn `timestep` into a 1-D tensor with one entry per sample
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+        timesteps = timesteps.expand(sample.shape[0])
+        t_emb = self.time_proj(timesteps)
+        t_emb = t_emb.to(dtype=sample.dtype)
+        emb = self.time_embedding(t_emb, timestep_cond)
+        aug_emb = None
+        if added_cond_kwargs is not None:
+            # NOTE(review): the branch is selected by the presence of a 'sar'
+            # key, but the value that is read is `image_embeds`; when 'sar' is
+            # present without `image_embeds`, `add_embedding` receives None.
+            if 'sar' in added_cond_kwargs:
+                image_embs = added_cond_kwargs.get("image_embeds")
+                aug_emb = self.add_embedding(image_embs)
+            else:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
+        emb = emb + aug_emb if aug_emb is not None else emb
+        if self.time_embed_act is not None:
+            emb = self.time_embed_act(emb)
+        # 2. pre-process
+        sample = self.conv_in(sample)
+        # 3. down
+        # Residuals are collected in order and consumed from the end by the up blocks.
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=None,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=None,
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+            down_block_res_samples += res_samples
+        # 4. mid
+        if self.mid_block is not None:
+            sample = self.mid_block(
+                sample,
+                emb,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=None,
+                cross_attention_kwargs=cross_attention_kwargs,
+                encoder_attention_mask=None,
+            )
+        # 5. up
+        for i, upsample_block in enumerate(self.up_blocks):
+            is_final_block = i == len(self.up_blocks) - 1
+            # Pop as many residuals as this block has resnets.
+            res_samples = down_block_res_samples[-len(upsample_block.resnets):]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block and forward_upsample_size:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    upsample_size=upsample_size,
+                    attention_mask=None,
+                    encoder_attention_mask=None,
+                )
+            else:
+                sample = upsample_block(
+                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
+                )
+        # 6. post-process
+        if self.conv_norm_out:
+            sample = self.conv_norm_out(sample)
+            sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+        return sample
+class SAR2OptUNetv3(UNet2DModel):
+    """Unconditional UNet2DModel whose forward pass returns the raw tensor.
+
+    In this repository the caller concatenates the SAR condition to the input
+    channels before calling the model (see `predict` in main_v3.py).
+    """
+    def __init__(self, *args, **kwargs):
+        # No extra state; all arguments are forwarded to UNet2DModel.
+        super().__init__(*args,**kwargs)
+    def forward(self, sample, timestep):
+        """Run the UNet on ``sample`` at ``timestep`` and return the output tensor."""
+        if self.config.center_input_sample:
+            sample = 2 * sample - 1.0
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device)
+        elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps * torch.ones(sample.shape[0], dtype=timesteps.dtype, device=timesteps.device)
+        t_emb = self.time_proj(timesteps)
+        t_emb = t_emb.to(dtype=self.dtype)
+        emb = self.time_embedding(t_emb)
+        # 2. pre-process
+        skip_sample = sample
+        sample = self.conv_in(sample)
+        # 3. down
+        # Residuals are collected in order and consumed from the end by the up blocks.
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            # Blocks that expose `skip_conv` additionally take and return a skip sample.
+            if hasattr(downsample_block, "skip_conv"):
+                sample, res_samples, skip_sample = downsample_block(
+                    hidden_states=sample, temb=emb, skip_sample=skip_sample
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+            down_block_res_samples += res_samples
+        # 4. mid
+        sample = self.mid_block(sample, emb)
+        # 5. up
+        skip_sample = None
+        for upsample_block in self.up_blocks:
+            # Pop as many residuals as this block has resnets.
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+            if hasattr(upsample_block, "skip_conv"):
+                sample, skip_sample = upsample_block(sample, res_samples, emb, skip_sample)
+            else:
+                sample = upsample_block(sample, res_samples, emb)
+        # 6. post-process
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+        if skip_sample is not None:
+            sample += skip_sample
+        # With fourier time embeddings the output is divided by the timestep,
+        # broadcast over every non-batch dimension.
+        if self.config.time_embedding_type == "fourier":
+            timesteps = timesteps.reshape((sample.shape[0], *([1] * len(sample.shape[1:]))))
+            sample = sample / timesteps
+        return sample
+# 3*64*64
+# Smoke test: build a SAR2OptUNetv2, run one forward pass on random tensors
+# and print the output shape. Requires a CUDA device ("cuda" is hard-coded).
+if __name__ == '__main__':
+    model = SAR2OptUNetv2(
+            sample_size=256,
+            in_channels=3,
+            out_channels=3,
+            layers_per_block=2,
+            block_out_channels=(128, 128, 256, 256, 512, 512),
+            down_block_types=(
+                "DownBlock2D",
+                "DownBlock2D",
+                "DownBlock2D",
+                "DownBlock2D",
+                "AttnDownBlock2D",
+                "DownBlock2D",
+            ),
+            up_block_types=(
+                "UpBlock2D",
+                "AttnUpBlock2D",
+                "UpBlock2D",
+                "UpBlock2D",
+                "UpBlock2D",
+                "UpBlock2D",
+            ),
+    )
+    model.to("cuda")
+    # Batch of 8 random optical (3-channel) and SAR (1-channel) images.
+    opt_image = torch.randn(8, 3, 256, 256).to("cuda")
+    sar_image = torch.randn(8, 1, 256, 256).to("cuda")
+    timestep = torch.tensor(1.0)
+    # Positional arguments after `sar_image` are encoder_hidden_states,
+    # timestep_cond and cross_attention_kwargs (all None here).
+    re = model(opt_image, timestep, sar_image , None, None, None)
+    print(re.shape)

models/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:34833bcdbebf7767daa0015ca6bc0a0c444c68d84fad6f7aa96a10f1653cf1d7
+size 454745716

models/model_org.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:788ed3e1601923a5245e430b89ff3522c3ab8c46b928d8a1275778a27cf2f8cf
+size 454745716

sar_1.png ADDED Viewed

sar_2.png ADDED Viewed

sar_3.png ADDED Viewed

sar_4.png ADDED Viewed

utils.py ADDED Viewed

	@@ -0,0 +1,347 @@

+import ast
+from safetensors import safe_open
+import torch
+from dataclasses import dataclass
+from typing import Optional, Union, List
+def update_args_from_yaml(group, args, parser):
+    for key, value in group.items():
+        if isinstance(value, dict):
+            update_args_from_yaml(value, args, parser)
+        else:
+            if value == 'None' or value == 'null':
+                value = None
+            else:
+                arg_type = next((action.type for action in parser._actions if action.dest == key), str)
+                if arg_type is ast.literal_eval:
+                    pass
+                elif arg_type is not None and not isinstance(value, arg_type):
+                    try:
+                        value = arg_type(value)
+                    except ValueError as e:
+                        raise ValueError(f"Cannot convert {key} to {arg_type}: {e}")
+            setattr(args, key, value)
+def safe_load(model_path):
+    assert "safetensors" in model_path
+    state_dict = {}
+    with safe_open(model_path, framework="pt", device="cpu") as f:
+        for k in f.keys():
+            state_dict[k] = f.get_tensor(k)
+    return state_dict
+@dataclass
+class DDIMSchedulerStepOutput:
+    """Result of one scheduler step."""
+    prev_sample: torch.Tensor  # x_{t-1}
+    pred_original_sample: Optional[torch.Tensor] = None  # x0
+@dataclass
+class DDIMSchedulerConversionOutput:
+    """Bundle of the three prediction forms: epsilon, original sample and velocity."""
+    pred_epsilon: torch.Tensor
+    pred_original_sample: torch.Tensor
+    pred_velocity: torch.Tensor
+class DDIMScheduler:
+    prediction_types = ["epsilon", "sample", "v_prediction"]
+    def __init__(
+        self,
+        num_train_timesteps: int,
+        num_inference_timesteps: int,
+        betas: torch.Tensor,
+        set_alpha_to_one: bool = True,
+        set_inference_timesteps_from_pure_noise: bool = True,
+        inference_timesteps: Union[str, List[int]] = "trailing",
+        device: Optional[Union[str, torch.device]] = None,
+        dtype: torch.dtype = torch.float32,
+        skip_step:bool = False,
+        original_inference_step: int=20,
+        steps_offset: int=0,
+    ):
+        assert num_train_timesteps > 0
+        assert num_train_timesteps >= num_inference_timesteps
+        assert num_train_timesteps == betas.size(0)
+        assert betas.ndim == 1
+        # self.user_name = user_name
+        # self.run_time = Recorder.format_time()
+        # self.task_name = 'AutoAIGC_%s' % str(self.run_time)
+        self.module_name = 'AutoAIGC'
+        self.config_list = {"num_train_timesteps": num_train_timesteps,
+                            "num_inference_timesteps": num_inference_timesteps,
+                            "betas": betas,
+                            "set_alpha_to_one": set_alpha_to_one,
+                            "set_inference_timesteps_from_pure_noise": set_inference_timesteps_from_pure_noise,
+                            "inference_timesteps": inference_timesteps}
+        self.module_info = str(self.config_list)
+        # self.upload_logger(user_name=user_name)
+        device = device or betas.device
+        self.num_train_timesteps = num_train_timesteps
+        self.num_inference_steps = num_inference_timesteps
+        self.steps_offset = steps_offset
+        self.betas = betas # .to(device=device, dtype=dtype)
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+        self.final_alpha_cumprod = torch.tensor(1.0, device=device, dtype=dtype) if set_alpha_to_one else self.alphas_cumprod[0]
+        if isinstance(inference_timesteps, torch.Tensor):
+            assert len(inference_timesteps) == num_inference_timesteps
+            self.timesteps = inference_timesteps.cpu().numpy().tolist()
+        elif set_inference_timesteps_from_pure_noise:
+            if inference_timesteps == "trailing":
+                # [999, 949, 899, 849, 799, 749, 699, 649, 599, 549, 499, 449, 399, 349, 299, 249, 199, 149,  99,  49]
+                if skip_step:  #  ?
+                    original_timesteps = torch.arange(num_train_timesteps - 1, -1, -num_train_timesteps / original_inference_step, device=device).round().int().tolist()
+                    skipping_step = len(original_timesteps) // num_inference_timesteps
+                    self.timesteps = original_timesteps[::skipping_step][:num_inference_timesteps]
+                else:  # [999, 899, 799, 699, 599, 499, 399, 299, 199, 99]
+                    self.timesteps = torch.arange(num_train_timesteps - 1, -1, -num_train_timesteps / num_inference_timesteps, device=device).round().int().tolist()
+            elif inference_timesteps == "linspace":
+                # Fixed DDIM timestep. Make sure the timestep starts from 999.
+                # Example 20 steps:
+                # [999, 946, 894, 841, 789, 736, 684, 631, 578, 526, 473, 421, 368, 315, 263, 210, 158, 105,  53,   0]
+                # [999,      888,      777,      666,      555,      444,      333,      222,      111,       0]
+                self.timesteps = torch.linspace(0, num_train_timesteps - 1, num_inference_timesteps, device=device).round().int().flip(0).tolist()
+            elif inference_timesteps == "leading":
+                step_ratio = num_train_timesteps // num_inference_timesteps
+                # # creates integer timesteps by multiplying by ratio
+                # # casting to int to avoid issues when num_inference_step is power of 3
+                self.timesteps = torch.arange(0, num_inference_timesteps).mul(step_ratio).round().flip(dims=[0]) #.clone().long()
+                # self.timesteps += self.steps_offset
+                # Original SD and DDIM paper may have a bug: <https://github.com/huggingface/diffusers/issues/2585>
+                # The inference timestep does not start from 999.
+                # Example 20 steps:
+                # [950, 900, 850, 800, 750, 700, 650, 600, 550, 500, 450, 400, 350, 300, 250, 200, 150, 100,  50,   0]
+                # [     900,      800,      700,      600,      500,      400,      300,      200,      100,        0]
+                # self.timesteps = torch.arange(0, num_train_timesteps, num_train_timesteps // num_inference_timesteps, device=self.device, dtype=torch.int).flip(0)
+                # self.timesteps = list(reversed(range(0, num_train_timesteps, num_train_timesteps // num_inference_timesteps)))
+            else:
+                raise NotImplementedError
+        elif inference_timesteps == "leading":
+            # Original SD and DDIM paper may have a bug: <https://github.com/huggingface/diffusers/issues/2585>
+            # The inference timestep does not start from 999.
+            # Example 20 steps:
+            # [950, 900, 850, 800, 750, 700, 650, 600, 550, 500, 450, 400, 350, 300, 250, 200, 150, 100,  50,   0]
+            # [     900,      800,      700,      600,      500,      400,      300,      200,      100,        0]
+            # self.timesteps = torch.arange(0, num_train_timesteps, num_train_timesteps // num_inference_timesteps, device=self.device, dtype=torch.int).flip(0)
+            self.timesteps = list(reversed(range(0, num_train_timesteps, num_train_timesteps // num_inference_timesteps)))
+        else:
+            self.timesteps = list(reversed(range(0, num_train_timesteps, num_train_timesteps // num_inference_timesteps)))
+            # raise NotImplementedError
+        self.to(device=device)
+    def to(self, device):
+        self.betas = self.betas.to(device)
+        self.alphas_cumprod = self.alphas_cumprod.to(device)
+        self.final_alpha_cumprod = self.final_alpha_cumprod.to(device)
+        # self.timesteps = self.timesteps.to(device)
+        return self
+    def step(
+        self,
+        model_output: torch.Tensor,
+        model_output_type: str,
+        timestep: Union[torch.Tensor, int],
+        sample: torch.Tensor,
+        eta: float = 0.0,
+        clip_sample: bool = False,
+        dynamic_threshold: Optional[float] = None,
+        variance_noise: Optional[torch.Tensor] = None,
+    ) -> DDIMSchedulerStepOutput:
+        # 1. get previous step value (t-1)
+        if isinstance(timestep, int):
+            # 1. get previous step value (t-1)
+            idx = self.timesteps.index(timestep)
+            prev_timestep = self.timesteps[idx + 1] if idx < self.num_inference_steps - 1 else None
+            # 2. compute alphas, betas
+            alpha_prod_t = self.alphas_cumprod[timestep]
+            alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep is not None else self.final_alpha_cumprod
+            beta_prod_t = 1 - alpha_prod_t
+            beta_prod_t_prev = 1 - alpha_prod_t_prev
+        else:
+            timesteps = torch.tensor(self.timesteps).to(timestep.device)
+            idx = timestep.reshape(-1, 1).eq(timesteps.reshape(1, -1)).nonzero()[:, 1] # 找到 timestep 在 timesteps 中的索引 idx
+            # 根据idx找到idx+1对应的timesteps元素，也就是下一个时间步。如果idx+1超出了timesteps的长度，它会被限制在self.num_inference_steps - 1
+            prev_timestep = timesteps[idx.add(1).clamp_max(self.num_inference_steps - 1)]
+            assert (prev_timestep is not None)
+            # 2. compute alphas, betas
+            alpha_prod_t = self.alphas_cumprod[timestep]
+            alpha_prod_t_prev = self.alphas_cumprod[prev_timestep]
+            alpha_prod_t_prev = torch.where(prev_timestep < 0, self.final_alpha_cumprod, alpha_prod_t_prev)
+            beta_prod_t = 1 - alpha_prod_t
+            beta_prod_t_prev = 1 - alpha_prod_t_prev
+            bs = timestep.size(0)
+            alpha_prod_t = alpha_prod_t.view(bs, 1, 1, 1)
+            alpha_prod_t_prev = alpha_prod_t_prev.view(bs, 1, 1, 1)
+            beta_prod_t = beta_prod_t.view(bs, 1, 1, 1)
+            beta_prod_t_prev = beta_prod_t_prev.view(bs, 1, 1, 1)
+        # # 2. compute alphas, betas
+        # alpha_prod_t = self.alphas_cumprod[timestep]
+        # alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep is not None else self.final_alpha_cumprod
+        # beta_prod_t = 1 - alpha_prod_t
+        # beta_prod_t_prev = 1 - alpha_prod_t_prev
+        # rcfg
+        self.stock_alpha_prod_t_prev = alpha_prod_t_prev
+        self.stock_beta_prod_t_prev = beta_prod_t_prev
+        # rcfg
+        self.stock_alpha_prod_t_prev = alpha_prod_t_prev
+        self.stock_beta_prod_t_prev = beta_prod_t_prev
+        # 3. compute predicted original sample from predicted noise also called
+        model_output_conversion = self.convert_output(model_output, model_output_type, sample, timestep)
+        pred_original_sample = model_output_conversion.pred_original_sample
+        pred_epsilon = model_output_conversion.pred_epsilon
+        # 4. Clip or threshold "predicted x_0"
+        if clip_sample:
+            pred_original_sample = torch.clamp(pred_original_sample, -1, 1)
+            pred_epsilon = self.convert_output(pred_original_sample, "sample", sample, timestep).pred_epsilon
+        if dynamic_threshold is not None:
+            # Dynamic thresholding in https://arxiv.org/abs/2205.11487
+            dynamic_max_val = pred_original_sample \
+                .flatten(1) \
+                .abs() \
+                .float() \
+                .quantile(dynamic_threshold, dim=1) \
+                .type_as(pred_original_sample) \
+                .clamp_min(1) \
+                .view(-1, *([1] * (pred_original_sample.ndim - 1)))
+            pred_original_sample = pred_original_sample.clamp(-dynamic_max_val, dynamic_max_val) / dynamic_max_val
+            pred_epsilon = self.convert_output(pred_original_sample, "sample", sample, timestep).pred_epsilon
+        # 5. compute variance: "sigma_t(η)" -> see formula (16) from https://arxiv.org/pdf/2010.02502.pdf
+        # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1)
+        variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
+        std_dev_t = eta * variance ** (0.5)
+        # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * pred_epsilon
+        # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
+        prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction
+        # 8. add "random noise" if needed.
+        if eta > 0:
+            if variance_noise is None:
+                variance_noise = torch.randn_like(model_output)
+            prev_sample = prev_sample + std_dev_t * variance_noise
+        return DDIMSchedulerStepOutput(
+            prev_sample=prev_sample, # x_{t-1}
+            pred_original_sample=pred_original_sample # x0
+            )
+    def add_noise(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: Union[torch.Tensor, int],
+        replace_noise=True
+    ) -> torch.Tensor:
+        alpha_prod_t = self.alphas_cumprod[timesteps].reshape(-1, *([1] * (original_samples.ndim - 1)))
+        if replace_noise:
+            indices = (timesteps == 999).nonzero()
+            if indices.numel() > 0:
+                alpha_prod_t[indices] = 0
+        return alpha_prod_t ** (0.5) * original_samples + (1 - alpha_prod_t) ** (0.5) * noise
+    def add_noise_lcm(
+        self,
+        original_samples: torch.Tensor,
+        noise: torch.Tensor,
+        timestep: Union[torch.Tensor, int],
+    ) -> torch.Tensor:
+        if isinstance(timestep, int):
+            # 1. get previous step value (t-1)
+            idx = self.timesteps.index(timestep)
+            prev_timestep = self.timesteps[idx + 1] if idx < self.num_inference_steps - 1 else None
+            # 2. compute alphas, betas
+            alpha_prod_t = self.alphas_cumprod[timestep]
+            alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep is not None else self.final_alpha_cumprod
+            beta_prod_t = 1 - alpha_prod_t
+            beta_prod_t_prev = 1 - alpha_prod_t_prev
+        else:
+            timesteps = torch.tensor(self.timesteps).to(timestep.device)
+            idx = timestep.reshape(-1, 1).eq(timesteps.reshape(1, -1)).nonzero()[:, 1] # 找到 timestep 在 timesteps 中的索引 idx
+            prev_timestep = timesteps[idx.add(1).clamp_max(self.num_inference_steps - 1)]
+            assert (prev_timestep is not None)
+            # 2. compute alphas, betas
+            alpha_prod_t = self.alphas_cumprod[timestep]
+            alpha_prod_t_prev = self.alphas_cumprod[prev_timestep]
+            alpha_prod_t_prev = torch.where(prev_timestep < 0, self.final_alpha_cumprod, alpha_prod_t_prev)
+            beta_prod_t = 1 - alpha_prod_t
+            beta_prod_t_prev = 1 - alpha_prod_t_prev
+            bs = timestep.size(0)
+            alpha_prod_t = alpha_prod_t.view(bs, 1, 1, 1)
+            alpha_prod_t_prev = alpha_prod_t_prev.view(bs, 1, 1, 1)
+            beta_prod_t = beta_prod_t.view(bs, 1, 1, 1)
+            beta_prod_t_prev = beta_prod_t_prev.view(bs, 1, 1, 1)
+        alpha_prod_t_prev = alpha_prod_t_prev.reshape(-1, *([1] * (original_samples.ndim - 1)))
+        return alpha_prod_t_prev ** (0.5) * original_samples + (1 - alpha_prod_t_prev) ** (0.5) * noise
+    def convert_output(
+        self,
+        model_output: torch.Tensor,
+        model_output_type: str,
+        sample: torch.Tensor,
+        timesteps: Union[torch.Tensor, int]
+    ) -> DDIMSchedulerConversionOutput:
+        assert model_output_type in self.prediction_types
+        alpha_prod_t = self.alphas_cumprod[timesteps].reshape(-1, *([1] * (sample.ndim - 1)))
+        beta_prod_t = 1 - alpha_prod_t
+        if model_output_type == "epsilon":
+            pred_epsilon = model_output
+            pred_original_sample = (sample - beta_prod_t ** (0.5) * pred_epsilon) / alpha_prod_t ** (0.5)
+            pred_velocity = alpha_prod_t ** (0.5) * pred_epsilon - (1 - alpha_prod_t) ** (0.5) * pred_original_sample
+        elif model_output_type == "sample":
+            pred_original_sample = model_output
+            pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5)
+            pred_velocity = alpha_prod_t ** (0.5) * pred_epsilon - (1 - alpha_prod_t) ** (0.5) * pred_original_sample
+        elif model_output_type == "v_prediction":
+            pred_velocity = model_output
+            pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output
+            pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample
+        else:
+            raise ValueError("Unknown prediction type")
+        return DDIMSchedulerConversionOutput(
+            pred_epsilon=pred_epsilon,
+            pred_original_sample=pred_original_sample,
+            pred_velocity=pred_velocity)
+    def get_velocity(
+        self,
+        sample: torch.Tensor,
+        noise: torch.Tensor,
+        timesteps: torch.Tensor
+    ) -> torch.FloatTensor:
+        alpha_prod_t = self.alphas_cumprod[timesteps].reshape(-1, *([1] * (sample.ndim - 1)))
+        return alpha_prod_t ** (0.5) * noise - (1 - alpha_prod_t) ** (0.5) * sample