Diffstat (limited to 'modules/processing.py')
 modules/processing.py | 80 ++++++++++++++-
 1 file changed, 71 insertions(+), 9 deletions(-)
diff --git a/modules/processing.py b/modules/processing.py
index 9351e3fb..f0656882 100644
--- a/modules/processing.py
+++ b/modules/processing.py
@@ -16,7 +16,7 @@ from skimage import exposure
from typing import Any
import modules.sd_hijack
-from modules import devices, prompt_parser, masking, sd_samplers, lowvram, generation_parameters_copypaste, extra_networks, sd_vae_approx, scripts, sd_samplers_common, sd_unet, errors, rng
+from modules import devices, prompt_parser, masking, sd_samplers, lowvram, infotext, extra_networks, sd_vae_approx, scripts, sd_samplers_common, sd_unet, errors, rng
from modules.rng import slerp # noqa: F401
from modules.sd_hijack import model_hijack
from modules.sd_samplers_common import images_tensor_to_samples, decode_first_stage, approximation_indexes
@@ -113,6 +113,21 @@ def txt2img_image_conditioning(sd_model, x, width, height):
return x.new_zeros(x.shape[0], 2*sd_model.noise_augmentor.time_embed.dim, dtype=x.dtype, device=x.device)
else:
+ sd = sd_model.model.state_dict()
+ diffusion_model_input = sd.get('diffusion_model.input_blocks.0.0.weight', None)
+ if diffusion_model_input is not None:
+ if diffusion_model_input.shape[1] == 9:
+ # The "masked-image" in this case will just be all 0.5 since the entire image is masked.
+ image_conditioning = torch.ones(x.shape[0], 3, height, width, device=x.device) * 0.5
+ image_conditioning = images_tensor_to_samples(image_conditioning,
+ approximation_indexes.get(opts.sd_vae_encode_method))
+
+ # Add the fake full 1s mask to the first dimension.
+ image_conditioning = torch.nn.functional.pad(image_conditioning, (0, 0, 0, 0, 1, 0), value=1.0)
+ image_conditioning = image_conditioning.to(x.dtype)
+
+ return image_conditioning
+
# Dummy zero conditioning if we're not using inpainting or unclip models.
# Still takes up a bit of memory, but no encoder call.
# Pretty sure we can just make this a 1x1 image since it's not going to be used besides its batch size.
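
Note: the pad at the end of this hunk does the channel bookkeeping for 9-channel inpainting UNets, which expect their conditioning as 1 mask channel followed by 4 masked-image latent channels. A minimal sketch of the same call with hypothetical shapes (torch.nn.functional.pad consumes the pad tuple from the last dimension backwards, so (0, 0, 0, 0, 1, 0) prepends exactly one channel):

    import torch

    # Hypothetical latent batch: 2 images, 4 latent channels, 64x64.
    latents = torch.randn(2, 4, 64, 64)

    # Pad tuple is (W_left, W_right, H_top, H_bottom, C_front, C_back):
    # one channel of 1.0 is prepended -- the "everything is masked" mask.
    cond = torch.nn.functional.pad(latents, (0, 0, 0, 0, 1, 0), value=1.0)
    print(cond.shape)        # torch.Size([2, 5, 64, 64])
    print(cond[:, 0].min())  # tensor(1.)
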
@@ -371,6 +386,12 @@ class StableDiffusionProcessing:
if self.sampler.conditioning_key == "crossattn-adm":
return self.unclip_image_conditioning(source_image)
+ sd = self.sampler.model_wrap.inner_model.model.state_dict()
+ diffusion_model_input = sd.get('diffusion_model.input_blocks.0.0.weight', None)
+ if diffusion_model_input is not None:
+ if diffusion_model_input.shape[1] == 9:
+ return self.inpainting_image_conditioning(source_image, latent_image, image_mask=image_mask)
+
# Dummy zero conditioning if we're not using inpainting or depth model.
return latent_image.new_zeros(latent_image.shape[0], 5, 1, 1)
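
Note: both this hunk and the one above identify inpainting checkpoints by inspecting the UNet's first convolution rather than by config. A hedged sketch of that probe, assuming the SD1/SD2-style key layout used here ('diffusion_model.input_blocks.0.0.weight' is a Conv2d weight of shape (out_ch, in_ch, kH, kW)):

    def unet_input_channels(model):
        # Returns the in_ch of the UNet's first conv, or None if the key
        # is absent (e.g. a differently structured model).
        sd = model.state_dict()
        weight = sd.get('diffusion_model.input_blocks.0.0.weight', None)
        return None if weight is None else weight.shape[1]

    # 4 channels -> standard checkpoint; 9 -> inpainting checkpoint
    # (4 noisy latents + 1 mask + 4 masked-image latents).
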
@@ -607,20 +628,33 @@ def decode_latent_batch(model, batch, target_device=None, check_for_nans=False):
sample = decode_first_stage(model, batch[i:i + 1])[0]
if check_for_nans:
+
try:
devices.test_for_nans(sample, "vae")
except devices.NansException as e:
- if devices.dtype_vae == torch.float32 or not shared.opts.auto_vae_precision:
+ if shared.opts.auto_vae_precision_bfloat16:
+ autofix_dtype = torch.bfloat16
+ autofix_dtype_text = "bfloat16"
+ autofix_dtype_setting = "Automatically convert VAE to bfloat16"
+ autofix_dtype_comment = ""
+ elif shared.opts.auto_vae_precision:
+ autofix_dtype = torch.float32
+ autofix_dtype_text = "32-bit float"
+ autofix_dtype_setting = "Automatically revert VAE to 32-bit floats"
+ autofix_dtype_comment = "\nTo always start with 32-bit VAE, use --no-half-vae commandline flag."
+ else:
+ raise e
+
+ if devices.dtype_vae == autofix_dtype:
raise e
errors.print_error_explanation(
"A tensor with all NaNs was produced in VAE.\n"
- "Web UI will now convert VAE into 32-bit float and retry.\n"
- "To disable this behavior, disable the 'Automatically revert VAE to 32-bit floats' setting.\n"
- "To always start with 32-bit VAE, use --no-half-vae commandline flag."
+ f"Web UI will now convert VAE into {autofix_dtype_text} and retry.\n"
+ f"To disable this behavior, disable the '{autofix_dtype_setting}' setting.{autofix_dtype_comment}"
)
- devices.dtype_vae = torch.float32
+ devices.dtype_vae = autofix_dtype
model.first_stage_model.to(devices.dtype_vae)
batch = batch.to(devices.dtype_vae)
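
Note: this hunk turns the old float32-only fallback into a two-option ladder; bfloat16 keeps float32's exponent range at half the memory, which matters because NaNs from half-precision VAEs are typically overflow. The control flow, isolated as a sketch (decode_fn and vae are stand-ins for illustration, not the webui API):

    import torch

    def decode_with_fallback(decode_fn, latents, vae, dtype, prefer_bf16=True):
        # Decode once; on NaNs, move the VAE to the wider dtype and retry,
        # unless we are already running in the fallback dtype.
        sample = decode_fn(latents.to(dtype))
        if not torch.isnan(sample).any():
            return sample
        fallback = torch.bfloat16 if prefer_bf16 else torch.float32
        if dtype == fallback:
            raise RuntimeError("VAE produced NaNs even in the fallback dtype")
        vae.to(fallback)
        return decode_fn(latents.to(fallback))
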
@@ -712,7 +746,7 @@ def create_infotext(p, all_prompts, all_seeds, all_subseeds, comments=None, iter
"User": p.user if opts.add_user_name_to_info else None,
}
- generation_params_text = ", ".join([k if k == v else f'{k}: {generation_parameters_copypaste.quote(v)}' for k, v in generation_params.items() if v is not None])
+ generation_params_text = ", ".join([k if k == v else f'{k}: {infotext.quote(v)}' for k, v in generation_params.items() if v is not None])
prompt_text = p.main_prompt if use_main_prompt else all_prompts[index]
negative_prompt_text = f"\nNegative prompt: {p.main_negative_prompt if use_main_prompt else all_negative_prompts[index]}" if all_negative_prompts[index] else ""
@@ -877,6 +911,34 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
if p.n_iter > 1:
shared.state.job = f"Batch {n+1} out of {p.n_iter}"
+ def rescale_zero_terminal_snr_abar(alphas_cumprod):
+ alphas_bar_sqrt = alphas_cumprod.sqrt()
+
+ # Store old values.
+ alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+ alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+
+ # Shift so the last timestep is zero.
+ alphas_bar_sqrt -= (alphas_bar_sqrt_T)
+
+ # Scale so the first timestep is back to the old value.
+ alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+ # Convert alphas_bar_sqrt to betas
+ alphas_bar = alphas_bar_sqrt**2 # Revert sqrt
+ alphas_bar[-1] = 4.8973451890853435e-08
+ return alphas_bar
+
+ if hasattr(p.sd_model, 'alphas_cumprod') and hasattr(p.sd_model, 'alphas_cumprod_original'):
+ p.sd_model.alphas_cumprod = p.sd_model.alphas_cumprod_original.to(shared.device)
+
+ if opts.use_downcasted_alpha_bar:
+ p.extra_generation_params['Downcast alphas_cumprod'] = opts.use_downcasted_alpha_bar
+ p.sd_model.alphas_cumprod = p.sd_model.alphas_cumprod.half().to(shared.device)
+ if opts.sd_noise_schedule == "Zero Terminal SNR":
+ p.extra_generation_params['Noise Schedule'] = opts.sd_noise_schedule
+ p.sd_model.alphas_cumprod = rescale_zero_terminal_snr_abar(p.sd_model.alphas_cumprod).to(shared.device)
+
with devices.without_autocast() if devices.unet_needs_upcast else devices.autocast():
samples_ddim = p.sample(conditioning=p.c, unconditional_conditioning=p.uc, seeds=p.seeds, subseeds=p.subseeds, subseed_strength=p.subseed_strength, prompts=p.prompts)
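
Note: rescale_zero_terminal_snr_abar implements zero-terminal-SNR rescaling: shift and scale sqrt(alpha_bar) so the final timestep carries (almost) no signal while the first timestep is unchanged, with the last value clamped to ~4.9e-8 instead of an exact zero. A toy check on a linear-beta schedule (the helper is copied out of the hunk above so the snippet is self-contained):

    import torch

    def rescale_zero_terminal_snr_abar(alphas_cumprod):
        alphas_bar_sqrt = alphas_cumprod.sqrt()
        first, last = alphas_bar_sqrt[0].clone(), alphas_bar_sqrt[-1].clone()
        alphas_bar_sqrt -= last                    # last timestep -> 0
        alphas_bar_sqrt *= first / (first - last)  # first timestep -> old value
        alphas_bar = alphas_bar_sqrt ** 2
        alphas_bar[-1] = 4.8973451890853435e-08    # avoid an exact zero at T
        return alphas_bar

    betas = torch.linspace(0.00085, 0.012, 1000)
    abar = torch.cumprod(1.0 - betas, dim=0)
    rescaled = rescale_zero_terminal_snr_abar(abar)
    print(abar[0].item(), rescaled[0].item())    # first value preserved
    print(abar[-1].item(), rescaled[-1].item())  # last value ~4.9e-08
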
@@ -1135,7 +1197,7 @@ class StableDiffusionProcessingTxt2Img(StableDiffusionProcessing):
def init(self, all_prompts, all_seeds, all_subseeds):
if self.enable_hr:
- if self.hr_checkpoint_name:
+ if self.hr_checkpoint_name and self.hr_checkpoint_name != 'Use same checkpoint':
self.hr_checkpoint_info = sd_models.get_closet_checkpoint_match(self.hr_checkpoint_name)
if self.hr_checkpoint_info is None:
@@ -1482,7 +1544,7 @@ class StableDiffusionProcessingImg2Img(StableDiffusionProcessing):
# Save init image
if opts.save_init_img:
self.init_img_hash = hashlib.md5(img.tobytes()).hexdigest()
- images.save_image(img, path=opts.outdir_init_images, basename=None, forced_filename=self.init_img_hash, save_to_dirs=False)
+ images.save_image(img, path=opts.outdir_init_images, basename=None, forced_filename=self.init_img_hash, save_to_dirs=False, existing_info=img.info)
image = images.flatten(img, opts.img2img_background_color)
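
Note: passing existing_info=img.info forwards the source image's metadata (PNG text chunks such as generation parameters) so the saved init-image copy keeps it. In plain PIL terms this is roughly the following (hypothetical file names); without re-attaching the chunks, they are silently dropped on save:

    from PIL import Image
    from PIL.PngImagePlugin import PngInfo

    img = Image.open("init.png")
    pnginfo = PngInfo()
    for key, value in img.info.items():
        if isinstance(value, str):  # PNG text chunks arrive as strings
            pnginfo.add_text(key, value)
    img.save("copy.png", pnginfo=pnginfo)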