From 52cc83d36b7663a77b79fd2258d2ca871af73e55 Mon Sep 17 00:00:00 2001
From: zhaohu xing <920232796@qq.com>
Date: Wed, 30 Nov 2022 14:56:12 +0800
Subject: fix bugs

Signed-off-by: zhaohu xing <920232796@qq.com>
---
 ldm/modules/encoders/__init__.py |   0
 ldm/modules/encoders/modules.py  | 234 ---------------------------------------
 ldm/modules/encoders/xlmr.py     | 137 -----------------------
 3 files changed, 371 deletions(-)
 delete mode 100644 ldm/modules/encoders/__init__.py
 delete mode 100644 ldm/modules/encoders/modules.py
 delete mode 100644 ldm/modules/encoders/xlmr.py

diff --git a/ldm/modules/encoders/__init__.py b/ldm/modules/encoders/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/ldm/modules/encoders/modules.py b/ldm/modules/encoders/modules.py
deleted file mode 100644
index ededbe43..00000000
--- a/ldm/modules/encoders/modules.py
+++ /dev/null
@@ -1,234 +0,0 @@
-import torch
-import torch.nn as nn
-from functools import partial
-import clip
-from einops import rearrange, repeat
-from transformers import CLIPTokenizer, CLIPTextModel
-import kornia
-
-from ldm.modules.x_transformer import Encoder, TransformerWrapper  # TODO: can we directly rely on lucidrains code and simply add this as a reuirement? --> test
-
-
-class AbstractEncoder(nn.Module):
-    def __init__(self):
-        super().__init__()
-
-    def encode(self, *args, **kwargs):
-        raise NotImplementedError
-
-
-
-class ClassEmbedder(nn.Module):
-    def __init__(self, embed_dim, n_classes=1000, key='class'):
-        super().__init__()
-        self.key = key
-        self.embedding = nn.Embedding(n_classes, embed_dim)
-
-    def forward(self, batch, key=None):
-        if key is None:
-            key = self.key
-        # this is for use in crossattn
-        c = batch[key][:, None]
-        c = self.embedding(c)
-        return c
-
-
-class TransformerEmbedder(AbstractEncoder):
-    """Some transformer encoder layers"""
-    def __init__(self, n_embed, n_layer, vocab_size, max_seq_len=77, device="cuda"):
-        super().__init__()
-        self.device = device
-        self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len,
-                                              attn_layers=Encoder(dim=n_embed, depth=n_layer))
-
-    def forward(self, tokens):
-        tokens = tokens.to(self.device)  # meh
-        z = self.transformer(tokens, return_embeddings=True)
-        return z
-
-    def encode(self, x):
-        return self(x)
-
-
-class BERTTokenizer(AbstractEncoder):
-    """ Uses a pretrained BERT tokenizer by huggingface. Vocab size: 30522 (?)"""
-    def __init__(self, device="cuda", vq_interface=True, max_length=77):
-        super().__init__()
-        from transformers import BertTokenizerFast  # TODO: add to reuquirements
-        self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
-        self.device = device
-        self.vq_interface = vq_interface
-        self.max_length = max_length
-
-    def forward(self, text):
-        batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
-                                        return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
-        tokens = batch_encoding["input_ids"].to(self.device)
-        return tokens
-
-    @torch.no_grad()
-    def encode(self, text):
-        tokens = self(text)
-        if not self.vq_interface:
-            return tokens
-        return None, None, [None, None, tokens]
-
-    def decode(self, text):
-        return text
-
-
-class BERTEmbedder(AbstractEncoder):
-    """Uses the BERT tokenizr model and add some transformer encoder layers"""
-    def __init__(self, n_embed, n_layer, vocab_size=30522, max_seq_len=77,
-                 device="cuda",use_tokenizer=True, embedding_dropout=0.0):
-        super().__init__()
-        self.use_tknz_fn = use_tokenizer
-        if self.use_tknz_fn:
-            self.tknz_fn = BERTTokenizer(vq_interface=False, max_length=max_seq_len)
-        self.device = device
-        self.transformer = TransformerWrapper(num_tokens=vocab_size, max_seq_len=max_seq_len,
-                                              attn_layers=Encoder(dim=n_embed, depth=n_layer),
-                                              emb_dropout=embedding_dropout)
-
-    def forward(self, text):
-        if self.use_tknz_fn:
-            tokens = self.tknz_fn(text)#.to(self.device)
-        else:
-            tokens = text
-        z = self.transformer(tokens, return_embeddings=True)
-        return z
-
-    def encode(self, text):
-        # output of length 77
-        return self(text)
-
-
-class SpatialRescaler(nn.Module):
-    def __init__(self,
-                 n_stages=1,
-                 method='bilinear',
-                 multiplier=0.5,
-                 in_channels=3,
-                 out_channels=None,
-                 bias=False):
-        super().__init__()
-        self.n_stages = n_stages
-        assert self.n_stages >= 0
-        assert method in ['nearest','linear','bilinear','trilinear','bicubic','area']
-        self.multiplier = multiplier
-        self.interpolator = partial(torch.nn.functional.interpolate, mode=method)
-        self.remap_output = out_channels is not None
-        if self.remap_output:
-            print(f'Spatial Rescaler mapping from {in_channels} to {out_channels} channels after resizing.')
-            self.channel_mapper = nn.Conv2d(in_channels,out_channels,1,bias=bias)
-
-    def forward(self,x):
-        for stage in range(self.n_stages):
-            x = self.interpolator(x, scale_factor=self.multiplier)
-
-
-        if self.remap_output:
-            x = self.channel_mapper(x)
-        return x
-
-    def encode(self, x):
-        return self(x)
-
-class FrozenCLIPEmbedder(AbstractEncoder):
-    """Uses the CLIP transformer encoder for text (from Hugging Face)"""
-    def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77):
-        super().__init__()
-        self.tokenizer = CLIPTokenizer.from_pretrained(version)
-        self.transformer = CLIPTextModel.from_pretrained(version)
-        self.device = device
-        self.max_length = max_length
-        self.freeze()
-
-    def freeze(self):
-        self.transformer = self.transformer.eval()
-        for param in self.parameters():
-            param.requires_grad = False
-
-    def forward(self, text):
-        batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
-                                        return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
-        tokens = batch_encoding["input_ids"].to(self.device)
-        outputs = self.transformer(input_ids=tokens)
-
-        z = outputs.last_hidden_state
-        return z
-
-    def encode(self, text):
-        return self(text)
-
-
-class FrozenCLIPTextEmbedder(nn.Module):
-    """
-    Uses the CLIP transformer encoder for text.
-    """
-    def __init__(self, version='ViT-L/14', device="cuda", max_length=77, n_repeat=1, normalize=True):
-        super().__init__()
-        self.model, _ = clip.load(version, jit=False, device="cpu")
-        self.device = device
-        self.max_length = max_length
-        self.n_repeat = n_repeat
-        self.normalize = normalize
-
-    def freeze(self):
-        self.model = self.model.eval()
-        for param in self.parameters():
-            param.requires_grad = False
-
-    def forward(self, text):
-        tokens = clip.tokenize(text).to(self.device)
-        z = self.model.encode_text(tokens)
-        if self.normalize:
-            z = z / torch.linalg.norm(z, dim=1, keepdim=True)
-        return z
-
-    def encode(self, text):
-        z = self(text)
-        if z.ndim==2:
-            z = z[:, None, :]
-        z = repeat(z, 'b 1 d -> b k d', k=self.n_repeat)
-        return z
-
-
-class FrozenClipImageEmbedder(nn.Module):
-    """
-        Uses the CLIP image encoder.
-        """
-    def __init__(
-            self,
-            model,
-            jit=False,
-            device='cuda' if torch.cuda.is_available() else 'cpu',
-            antialias=False,
-        ):
-        super().__init__()
-        self.model, _ = clip.load(name=model, device=device, jit=jit)
-
-        self.antialias = antialias
-
-        self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False)
-        self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False)
-
-    def preprocess(self, x):
-        # normalize to [0,1]
-        x = kornia.geometry.resize(x, (224, 224),
-                                   interpolation='bicubic',align_corners=True,
-                                   antialias=self.antialias)
-        x = (x + 1.) / 2.
-        # renormalize according to clip
-        x = kornia.enhance.normalize(x, self.mean, self.std)
-        return x
-
-    def forward(self, x):
-        # x is assumed to be in range [-1,1]
-        return self.model.encode_image(self.preprocess(x))
-
-
-if __name__ == "__main__":
-    from ldm.util import count_params
-    model = FrozenCLIPEmbedder()
-    count_params(model, verbose=True)
\ No newline at end of file
diff --git a/ldm/modules/encoders/xlmr.py b/ldm/modules/encoders/xlmr.py
deleted file mode 100644
index beab3fdf..00000000
--- a/ldm/modules/encoders/xlmr.py
+++ /dev/null
@@ -1,137 +0,0 @@
-from transformers import BertPreTrainedModel,BertModel,BertConfig
-import torch.nn as nn
-import torch
-from transformers.models.xlm_roberta.configuration_xlm_roberta import XLMRobertaConfig
-from transformers import XLMRobertaModel,XLMRobertaTokenizer
-from typing import Optional
-
-class BertSeriesConfig(BertConfig):
-    def __init__(self, vocab_size=30522, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, pad_token_id=0, position_embedding_type="absolute", use_cache=True, classifier_dropout=None,project_dim=512, pooler_fn="average",learn_encoder=False,model_type='bert',**kwargs):
-
-        super().__init__(vocab_size, hidden_size, num_hidden_layers, num_attention_heads, intermediate_size, hidden_act, hidden_dropout_prob, attention_probs_dropout_prob, max_position_embeddings, type_vocab_size, initializer_range, layer_norm_eps, pad_token_id, position_embedding_type, use_cache, classifier_dropout, **kwargs)
-        self.project_dim = project_dim
-        self.pooler_fn = pooler_fn
-        self.learn_encoder = learn_encoder
-
-class RobertaSeriesConfig(XLMRobertaConfig):
-    def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2,project_dim=512,pooler_fn='cls',learn_encoder=False, **kwargs):
-        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
-        self.project_dim = project_dim
-        self.pooler_fn = pooler_fn
-        self.learn_encoder = learn_encoder
-
-
-class BertSeriesModelWithTransformation(BertPreTrainedModel):
-
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
-    config_class = BertSeriesConfig
-
-    def __init__(self, config=None, **kargs):
-        # modify initialization for autoloading
-        if config is None:
-            config = XLMRobertaConfig()
-            config.attention_probs_dropout_prob= 0.1
-            config.bos_token_id=0
-            config.eos_token_id=2
-            config.hidden_act='gelu'
-            config.hidden_dropout_prob=0.1
-            config.hidden_size=1024
-            config.initializer_range=0.02
-            config.intermediate_size=4096
-            config.layer_norm_eps=1e-05
-            config.max_position_embeddings=514
-
-            config.num_attention_heads=16
-            config.num_hidden_layers=24
-            config.output_past=True
-            config.pad_token_id=1
-            config.position_embedding_type= "absolute"
-
-            config.type_vocab_size= 1
-            config.use_cache=True
-            config.vocab_size= 250002
-            config.project_dim = 768
-            config.learn_encoder = False
-        super().__init__(config)
-        self.roberta = XLMRobertaModel(config)
-        self.transformation = nn.Linear(config.hidden_size,config.project_dim)
-        self.pre_LN=nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
-        self.pooler = lambda x: x[:,0]
-        self.post_init()
-
-    def encode(self,c):
-        device = next(self.parameters()).device
-        text = self.tokenizer(c,
-                        truncation=True,
-                        max_length=77,
-                        return_length=False,
-                        return_overflowing_tokens=False,
-                        padding="max_length",
-                        return_tensors="pt")
-        text["input_ids"] = torch.tensor(text["input_ids"]).to(device)
-        text["attention_mask"] = torch.tensor(
-            text['attention_mask']).to(device)
-        features = self(**text)
-        return features['projection_state']
-
-    def forward(
-        self,
-        input_ids: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        token_type_ids: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.Tensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        encoder_attention_mask: Optional[torch.Tensor] = None,
-        output_attentions: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-    ) :
-        r"""
-        """
-
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-
-        outputs = self.roberta(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            output_attentions=output_attentions,
-            output_hidden_states=True,
-            return_dict=return_dict,
-        )
-
-        # last module outputs
-        sequence_output = outputs[0]
-
-
-        # project every module
-        sequence_output_ln = self.pre_LN(sequence_output)
-
-        # pooler
-        pooler_output = self.pooler(sequence_output_ln)
-        pooler_output = self.transformation(pooler_output)
-        projection_state = self.transformation(outputs.last_hidden_state)
-
-        return {
-            'pooler_output':pooler_output,
-            'last_hidden_state':outputs.last_hidden_state,
-            'hidden_states':outputs.hidden_states,
-            'attentions':outputs.attentions,
-            'projection_state':projection_state,
-            'sequence_out': sequence_output
-        }
-
-
-class RobertaSeriesModelWithTransformation(BertSeriesModelWithTransformation):
-    base_model_prefix = 'roberta'
-    config_class= RobertaSeriesConfig
\ No newline at end of file
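
For reference, below is a minimal usage sketch of the deleted FrozenCLIPEmbedder as it existed before this commit, importing from the removed ldm/modules/encoders/modules.py. It is not part of the patch itself; the prompt string, device selection, and the expected output shape for the default openai/clip-vit-large-patch14 checkpoint are illustrative assumptions.

    import torch
    from ldm.modules.encoders.modules import FrozenCLIPEmbedder  # pre-removal module path

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # __init__ loads the CLIP tokenizer and text model and calls freeze(), so no
    # gradients flow; .to(device) moves the frozen text transformer to the chosen device.
    encoder = FrozenCLIPEmbedder(device=device).to(device)

    with torch.no_grad():
        # encode() tokenizes, pads/truncates to max_length=77, and returns the
        # text transformer's last hidden state.
        z = encoder.encode(["a photograph of an astronaut riding a horse"])

    print(z.shape)  # torch.Size([1, 77, 768]) for the default ViT-L/14 text encoder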