about | summary | refs | log | tree | commit | diff
path: root/configs
diff options
context:
space:
mode:
author: AUTOMATIC <16777216c@gmail.com> 2023-01-27 11:28:12 +0300
committer: AUTOMATIC <16777216c@gmail.com> 2023-01-27 11:28:12 +0300
commit: d2ac95fa7b2a8d0bcc5361ee16dba9cbb81ff8b2 (patch)
tree: 056355bca8b5ff3071f4aec4a0c4d725f026413a /configs
parent: 7a14c8ab45da8a681792a6331d48a88dd684a0a9 (diff)
remove the need to place configs near models
Diffstat (limited to 'configs')
-rw-r--r-- configs/instruct-pix2pix.yaml 99
-rw-r--r-- configs/v1-inpainting-inference.yaml 70
2 files changed, 169 insertions, 0 deletions
diff --git a/configs/instruct-pix2pix.yaml b/configs/instruct-pix2pix.yaml
new file mode 100644
index 00000000..437ddcef
--- /dev/null
+++ b/configs/instruct-pix2pix.yaml
@@ -0,0 +1,99 @@
+# File modified by authors of InstructPix2Pix from original (https://github.com/CompVis/stable-diffusion).
+# See more details in LICENSE.
+
+model: # InstructPix2Pix latent-diffusion model definition
+ base_learning_rate: 1.0e-04
+ target: modules.models.diffusion.ddpm_edit.LatentDiffusion # `modules.*` path; the nested targets below use `ldm.*` / `torch.*`
+ params:
+ linear_start: 0.00085
+ linear_end: 0.0120
+ num_timesteps_cond: 1
+ log_every_t: 200
+ timesteps: 1000
+ first_stage_key: edited
+ cond_stage_key: edit
+ # image_size: 64
+ # image_size: 32
+ image_size: 16 # active value; the 64 and 32 alternatives above are commented out
+ channels: 4
+ cond_stage_trainable: false # Note: different from the one we trained before
+ conditioning_key: hybrid
+ monitor: val/loss_simple_ema
+ scale_factor: 0.18215
+ use_ema: true
+ load_ema: true
+
+ scheduler_config: # NOTE(review): earlier note said "10000 warmup steps", but warm_up_steps below is 0
+ target: ldm.lr_scheduler.LambdaLinearScheduler
+ params:
+ warm_up_steps: [ 0 ]
+ cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+ f_start: [ 1.e-6 ]
+ f_max: [ 1. ]
+ f_min: [ 1. ]
+
+ unet_config:
+ target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+ params:
+ image_size: 32 # unused
+ in_channels: 8 # presumably 4 latent + 4 image-conditioning channels (cf. conditioning_key: hybrid) - TODO confirm
+ out_channels: 4 # same value as `channels` above
+ model_channels: 320
+ attention_resolutions: [ 4, 2, 1 ]
+ num_res_blocks: 2
+ channel_mult: [ 1, 2, 4, 4 ]
+ num_heads: 8
+ use_spatial_transformer: True
+ transformer_depth: 1
+ context_dim: 768 # presumably the width of the cond_stage (CLIP) embeddings - TODO confirm
+ use_checkpoint: True
+ legacy: False
+
+ first_stage_config:
+ target: ldm.models.autoencoder.AutoencoderKL
+ params:
+ embed_dim: 4
+ monitor: val/rec_loss
+ ddconfig:
+ double_z: true
+ z_channels: 4
+ resolution: 256
+ in_channels: 3
+ out_ch: 3
+ ch: 128
+ ch_mult:
+ - 1
+ - 2
+ - 4
+ - 4
+ num_res_blocks: 2
+ attn_resolutions: []
+ dropout: 0.0
+ lossconfig:
+ target: torch.nn.Identity # presumably a no-op placeholder where a training loss would go - verify
+
+ cond_stage_config:
+ target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
+
+data: # NOTE(review): dataset settings; only a `validation` split is defined here - presumably not read at inference, confirm
+ target: main.DataModuleFromConfig
+ params:
+ batch_size: 128
+ num_workers: 1
+ wrap: false
+ validation:
+ target: edit_dataset.EditDataset
+ params:
+ path: data/clip-filtered-dataset # relative path - presumably resolved against the working directory; verify
+ cache_dir: data/
+ cache_name: data_10k
+ split: val
+ min_text_sim: 0.2
+ min_image_sim: 0.75
+ min_direction_sim: 0.2
+ max_samples_per_prompt: 1
+ min_resize_res: 512
+ max_resize_res: 512 # equal to min_resize_res
+ crop_res: 512 # same value as min_resize_res / max_resize_res
+ output_as_edit: False
+ real_input: True
diff --git a/configs/v1-inpainting-inference.yaml b/configs/v1-inpainting-inference.yaml
new file mode 100644
index 00000000..f9eec37d
--- /dev/null
+++ b/configs/v1-inpainting-inference.yaml
@@ -0,0 +1,70 @@
+model: # v1 inpainting inference model (per the file name); target class is LatentInpaintDiffusion
+ base_learning_rate: 7.5e-05
+ target: ldm.models.diffusion.ddpm.LatentInpaintDiffusion
+ params:
+ linear_start: 0.00085
+ linear_end: 0.0120
+ num_timesteps_cond: 1
+ log_every_t: 200
+ timesteps: 1000
+ first_stage_key: "jpg"
+ cond_stage_key: "txt"
+ image_size: 64
+ channels: 4
+ cond_stage_trainable: false # Note: different from the one we trained before
+ conditioning_key: hybrid # important
+ monitor: val/loss_simple_ema
+ scale_factor: 0.18215
+ finetune_keys: null
+
+ scheduler_config: # warmup length is set by warm_up_steps below (2500 here; 10000 if starting from scratch, per its note)
+ target: ldm.lr_scheduler.LambdaLinearScheduler
+ params:
+ warm_up_steps: [ 2500 ] # NOTE for resuming. use 10000 if starting from scratch
+ cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+ f_start: [ 1.e-6 ]
+ f_max: [ 1. ]
+ f_min: [ 1. ]
+
+ unet_config:
+ target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+ params:
+ image_size: 32 # unused
+ in_channels: 9 # 4 data + 4 downscaled image + 1 mask
+ out_channels: 4 # same value as `channels` above
+ model_channels: 320
+ attention_resolutions: [ 4, 2, 1 ]
+ num_res_blocks: 2
+ channel_mult: [ 1, 2, 4, 4 ]
+ num_heads: 8
+ use_spatial_transformer: True
+ transformer_depth: 1
+ context_dim: 768 # presumably the width of the cond_stage (CLIP) embeddings - TODO confirm
+ use_checkpoint: True
+ legacy: False
+
+ first_stage_config:
+ target: ldm.models.autoencoder.AutoencoderKL
+ params:
+ embed_dim: 4
+ monitor: val/rec_loss
+ ddconfig:
+ double_z: true
+ z_channels: 4
+ resolution: 256
+ in_channels: 3
+ out_ch: 3
+ ch: 128
+ ch_mult:
+ - 1
+ - 2
+ - 4
+ - 4
+ num_res_blocks: 2
+ attn_resolutions: []
+ dropout: 0.0
+ lossconfig:
+ target: torch.nn.Identity # presumably a no-op placeholder where a training loss would go - verify
+
+ cond_stage_config:
+ target: ldm.modules.encoders.modules.FrozenCLIPEmbedder