|
2 | 2 | import torch
|
3 | 3 | import comfy.model_management
|
4 | 4 | import comfy.utils
|
| 5 | +import comfy.latent_formats |
5 | 6 |
|
6 | 7 |
|
7 | 8 | class EmptyCosmosLatentVideo:
|
@@ -75,8 +76,53 @@ def encode(self, vae, width, height, length, batch_size, start_image=None, end_i
|
75 | 76 | out_latent["noise_mask"] = mask.repeat((batch_size, ) + (1,) * (mask.ndim - 1))
|
76 | 77 | return (out_latent,)
|
77 | 78 |
|
class CosmosPredict2ImageToVideoLatent:
    """Node that prepares the starting video latent for Cosmos Predict2 image-to-video.

    The node produces a zero-initialised latent with 16 channels,
    ``((length - 1) // 4) + 1`` latent frames and a spatial size of
    ``height // 8`` by ``width // 8``. When a start and/or end image is
    supplied, the image is encoded with the VAE, written into the leading or
    trailing latent frames, and a ``noise_mask`` is returned alongside the
    samples with those frames set to 0.0 and every other frame set to 1.0.
    """

    @classmethod
    def INPUT_TYPES(s):
        """Return the declaration of the node's required and optional inputs."""
        return {
            "required": {
                "vae": ("VAE", ),
                "width": ("INT", {"default": 848, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                "height": ("INT", {"default": 480, "min": 16, "max": nodes.MAX_RESOLUTION, "step": 16}),
                "length": ("INT", {"default": 93, "min": 1, "max": nodes.MAX_RESOLUTION, "step": 4}),
                "batch_size": ("INT", {"default": 1, "min": 1, "max": 4096}),
            },
            "optional": {
                "start_image": ("IMAGE", ),
                "end_image": ("IMAGE", ),
            },
        }

    RETURN_TYPES = ("LATENT",)
    FUNCTION = "encode"

    CATEGORY = "conditioning/inpaint"

    def encode(self, vae, width, height, length, batch_size, start_image=None, end_image=None):
        """Build the latent dict, optionally seeded with encoded start/end images.

        Args:
            vae: VAE object forwarded to ``vae_encode_with_padding``.
            width: Output width in pixels; the latent width is ``width // 8``.
            height: Output height in pixels; the latent height is ``height // 8``.
            length: Number of video frames; the latent holds
                ``((length - 1) // 4) + 1`` frames.
            batch_size: Number of times the result is repeated along dim 0.
            start_image: Optional image(s) encoded into the first latent frames.
            end_image: Optional image(s) encoded into the last latent frames.

        Returns:
            A 1-tuple holding a dict with ``"samples"`` and, when at least one
            image was given, ``"noise_mask"``.
        """
        device = comfy.model_management.intermediate_device()
        latent_frames = ((length - 1) // 4) + 1
        latent = torch.zeros([1, 16, latent_frames, height // 8, width // 8], device=device)

        if start_image is None and end_image is None:
            # Honour batch_size here as well; previously the empty latent was
            # always returned with a batch dimension of 1 regardless of the
            # requested batch size.
            samples = latent.repeat((batch_size, ) + (1,) * (latent.ndim - 1))
            return ({"samples": samples},)

        # 1.0 marks frames left for the sampler; frames filled from an image
        # are zeroed below.
        mask = torch.ones([latent.shape[0], 1, latent_frames, latent.shape[-2], latent.shape[-1]], device=device)

        if start_image is not None:
            # NOTE(review): vae_encode_with_padding is defined elsewhere in this
            # file; the meaning of `padding` (1 for start, 0 for end) should be
            # confirmed there.
            latent_temp = vae_encode_with_padding(vae, start_image, width, height, length, padding=1)
            latent[:, :, :latent_temp.shape[-3]] = latent_temp
            mask[:, :, :latent_temp.shape[-3]] *= 0.0

        if end_image is not None:
            latent_temp = vae_encode_with_padding(vae, end_image, width, height, length, padding=0)
            latent[:, :, -latent_temp.shape[-3]:] = latent_temp
            mask[:, :, -latent_temp.shape[-3]:] *= 0.0

        out_latent = {}
        latent_format = comfy.latent_formats.Wan21()
        # Frames still masked (mask == 1) take the Wan21 `process_out` of the
        # latent; image-conditioned frames (mask == 0) keep the encoded values.
        # NOTE(review): presumably this makes the empty frames neutral after the
        # model's latent pre-processing — confirm against comfy.latent_formats.
        latent = latent_format.process_out(latent) * mask + latent * (1.0 - mask)
        out_latent["samples"] = latent.repeat((batch_size, ) + (1,) * (latent.ndim - 1))
        out_latent["noise_mask"] = mask.repeat((batch_size, ) + (1,) * (mask.ndim - 1))
        return (out_latent,)
78 | 123 |
|
# Maps each node identifier string to the class that implements it.
# NOTE(review): presumably read by the host application's node loader to
# register these nodes — confirm against the loader.
NODE_CLASS_MAPPINGS = {
    "EmptyCosmosLatentVideo": EmptyCosmosLatentVideo,
    "CosmosImageToVideoLatent": CosmosImageToVideoLatent,
    "CosmosPredict2ImageToVideoLatent": CosmosPredict2ImageToVideoLatent,
}
|
0 commit comments