Diffstat (limited to 'modules/sd_hijack_optimizations.py')
-rw-r--r--  modules/sd_hijack_optimizations.py  16
1 files changed, 11 insertions, 5 deletions
diff --git a/modules/sd_hijack_optimizations.py b/modules/sd_hijack_optimizations.py
index 79405525..98123fbf 100644
--- a/modules/sd_hijack_optimizations.py
+++ b/modules/sd_hijack_optimizations.py
@@ -181,7 +181,7 @@ def einsum_op_cuda(q, k, v):
    mem_free_torch = mem_reserved - mem_active
    mem_free_total = mem_free_cuda + mem_free_torch
    # Divide factor of safety as there's copying and fragmentation
-    return self.einsum_op_tensor_mem(q, k, v, mem_free_total / 3.3 / (1 << 20))
+    return einsum_op_tensor_mem(q, k, v, mem_free_total / 3.3 / (1 << 20))

def einsum_op(q, k, v):
    if q.device.type == 'cuda':
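
The first hunk fixes a copy-paste error: einsum_op_tensor_mem is a module-level function in this file, not a method, and einsum_op_cuda takes no self parameter, so the old call raised a NameError whenever the CUDA branch was taken. Below is a minimal, self-contained sketch of the corrected dispatch; the lines above the hunk and the einsum_op_tensor_mem stub are paraphrased stand-ins, not quoted from the patch.

import torch

def einsum_op_tensor_mem(q, k, v, max_tensor_mb):
    # Stand-in for the real helper, which picks a slice size from the MiB
    # budget and computes attention in chunks; here it runs in one shot.
    sim = torch.einsum('b i d, b j d -> b i j', q, k).softmax(dim=-1)
    return torch.einsum('b i j, b j d -> b i d', sim, v)

def einsum_op_cuda(q, k, v):
    stats = torch.cuda.memory_stats(q.device)
    mem_active = stats['active_bytes.all.current']
    mem_reserved = stats['reserved_bytes.all.current']
    mem_free_cuda, _ = torch.cuda.mem_get_info(torch.cuda.current_device())
    mem_free_torch = mem_reserved - mem_active
    mem_free_total = mem_free_cuda + mem_free_torch
    # Divide by a factor of safety as there's copying and fragmentation;
    # the budget is passed in MiB, hence the 1 << 20 divisor. Note the call
    # is to the module-level function, with no "self." prefix.
    return einsum_op_tensor_mem(q, k, v, mem_free_total / 3.3 / (1 << 20))
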
@@ -296,10 +296,16 @@ def xformers_attnblock_forward(self, x):
    try:
        h_ = x
        h_ = self.norm(h_)
-        q1 = self.q(h_).contiguous()
-        k1 = self.k(h_).contiguous()
-        v = self.v(h_).contiguous()
-        out = xformers.ops.memory_efficient_attention(q1, k1, v)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+        b, c, h, w = q.shape
+        q, k, v = map(lambda t: rearrange(t, 'b c h w -> b (h w) c'), (q, k, v))
+        q = q.contiguous()
+        k = k.contiguous()
+        v = v.contiguous()
+        out = xformers.ops.memory_efficient_attention(q, k, v)
+        out = rearrange(out, 'b (h w) c -> b c h w', h=h)
        out = self.proj_out(out)
        return x + out
    except NotImplementedError:
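
The second hunk fixes the tensor layout fed to xformers. The attention block's q, k and v come out of 1x1 convolutions as (b, c, h, w) feature maps, while xformers.ops.memory_efficient_attention expects (batch, seq_len, dim) inputs, so the patch flattens the spatial grid into a sequence of h*w tokens with einops.rearrange, makes the tensors contiguous, runs the attention, and restores the spatial layout before proj_out. Below is a minimal sketch of just that reshaping, assuming einops and xformers are installed; the helper name is ours, not the repository's, and this is not the full xformers_attnblock_forward.

import torch
import xformers.ops
from einops import rearrange

def spatial_memory_efficient_attention(q, k, v):
    # q, k, v: (b, c, h, w) feature maps from the attention block's 1x1 convs.
    b, c, h, w = q.shape
    # Flatten the h*w spatial positions into a token dimension and make the
    # tensors contiguous, as the patch does before calling into xformers.
    q, k, v = map(lambda t: rearrange(t, 'b c h w -> b (h w) c').contiguous(), (q, k, v))
    out = xformers.ops.memory_efficient_attention(q, k, v)
    # Restore the spatial layout expected by the following projection conv.
    return rearrange(out, 'b (h w) c -> b c h w', h=h)

# Usage sketch (random data; half precision on CUDA is a common setup for this op):
# q = k = v = torch.randn(1, 512, 64, 64, device='cuda', dtype=torch.float16)
# out = spatial_memory_efficient_attention(q, k, v)  # -> (1, 512, 64, 64)
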