Resize T5 Vocab by dirkgr · Pull Request #5497 · allenai/allennlp · GitHub
This repository was archived by the owner on Dec 16, 2022. It is now read-only.

Resize T5 Vocab #5497

Merged
merged 7 commits on Dec 8, 2021
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased

### Added

- Added a way to resize the vocabulary in the T5 module

### Fixed

- Fixed the docstring information for the `FBetaMultiLabelMeasure` metric.
80 changes: 75 additions & 5 deletions allennlp/modules/transformer/t5.py
@@ -4,8 +4,7 @@
""" # noqa: E401

import logging
from typing import Optional, Tuple, List, Union, Dict, TYPE_CHECKING, NamedTuple

from typing import Optional, Tuple, List, Union, Dict, TYPE_CHECKING, NamedTuple, Callable

import torch
from torch import nn
@@ -428,6 +427,36 @@ def get_head_mask(head_mask: Optional[torch.BoolTensor], num_hidden_layers: int)
        head_mask = [None] * num_hidden_layers
        return head_mask

    def resize_token_embeddings(
        self, new_size: int, *, init_fn: Callable = torch.nn.init.normal_
    ) -> None:
        old_size, embedding_dim = tuple(self.token_embeddings.weight.shape)
        if old_size == new_size:
            return
        if old_size > new_size:
            logger.warning(
                "Shrinking vocabulary from size %d to size %d. This is probably not what you want?",
                old_size,
                new_size,
            )

        result = torch.nn.Embedding(
            new_size,
            embedding_dim,
            self.token_embeddings.padding_idx,
            self.token_embeddings.max_norm,
            self.token_embeddings.norm_type,
            self.token_embeddings.scale_grad_by_freq,
            self.token_embeddings.sparse,
            device=self.token_embeddings.weight.device,
            dtype=self.token_embeddings.weight.dtype,
        )
        copy_size = min(old_size, new_size)
        result.weight.data[:copy_size, ...] = self.token_embeddings.weight.data[:copy_size, ...]
        if new_size > old_size:
            init_fn(result.weight.data[copy_size:, ...])
        self.token_embeddings = result
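
Not part of the diff: a minimal sketch of how a caller could supply a custom init_fn, relying only on the contract shown above (the callable receives the slice of newly created rows and fills it in place). The name small_normal_init_ is made up for illustration.

import functools

import torch

# New rows get a tighter spread than torch.nn.init.normal_'s default std of 1.0.
small_normal_init_ = functools.partial(torch.nn.init.normal_, mean=0.0, std=0.05)

# The contract in action: the callable fills a tensor in place.
rows = torch.empty(4, 8)
small_normal_init_(rows)

# With an encoder or decoder stack from this module, it would be passed as:
# stack.resize_token_embeddings(32_200, init_fn=small_normal_init_)
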

    def forward(
        self,
        input_ids: Optional[torch.IntTensor] = None,
@@ -759,8 +788,8 @@ class T5(TransformerModule, Registrable):
    def __init__(
        self,
        token_embeddings: Optional[nn.Embedding] = None,
        encoder: Lazy[T5EncoderStack] = Lazy(T5EncoderStack),
        decoder: Lazy[T5DecoderStack] = Lazy(T5DecoderStack),
        encoder: Lazy[T5EncoderStack] = Lazy(T5EncoderStack.basic_encoder),
        decoder: Lazy[T5DecoderStack] = Lazy(T5DecoderStack.basic_decoder),
        decoder_start_token_id: int = 0,
        pad_token_id: int = 0,  # These are both 0 in t5-(small|base|large). Go figure.
        eos_token_id: int = 1,
@@ -806,6 +835,47 @@ def __init__(

        self.beam_search = beam_search.construct(end_index=self.eos_token_id)

    def resize_token_embeddings(
        self, new_size: int, *, init_fn: Callable = torch.nn.init.normal_
    ) -> None:
        """
        Resizes the token embeddings in the model.

        This takes care of the token embeddings for the encoder, the decoder, and the LM head.

        new_size : `int`
            The new size of the token embeddings
        init_fn : `Callable`
            The function to use to initialize new embeddings. This function will be called with a
            single argument, the tensor to initialize, and it is expected to initialize the tensor
            in place. Many of the functions in `torch.nn.init` fit this contract.
        """
        self.encoder.resize_token_embeddings(new_size, init_fn=init_fn)
        # If encoder and decoder share embeddings, this is a no-op the second time.
        self.decoder.resize_token_embeddings(new_size, init_fn=init_fn)

        # resize lm head
        old_size = self.lm_head.out_features
        if old_size == new_size:
Contributor
Minor thing: Maybe we can do this check first thing? It'll avoid the 2 empty calls.

Contributor
Or perhaps not? We want to resize the embedding dim and the output dim to the same new_size. Will the old embedding_dim and the old lm_head.out_features be different?

Member Author
It's a bit of defensive programming. It should never happen, but if the three places where the size of the token embeddings matters get out of sync, this call will sync them all back up.

            return
        new_lm_head = torch.nn.Linear(
            self.lm_head.in_features,
            new_size,
            bias=self.lm_head.bias is not None,
            device=self.lm_head.weight.device,
            dtype=self.lm_head.weight.dtype,
        )
        copy_size = min(old_size, new_size)
        new_lm_head.weight.data[:copy_size, ...] = self.lm_head.weight.data[:copy_size, ...]
        if self.lm_head.bias is not None and new_lm_head.bias is not None:
            new_lm_head.bias.data[:copy_size, ...] = self.lm_head.bias.data[:copy_size, ...]
        if new_size > old_size:
            init_fn(new_lm_head.weight.data[copy_size:, ...])
            if new_lm_head.bias is not None:
                init_fn(new_lm_head.bias.data[copy_size:, ...])

        self.lm_head = new_lm_head
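
A usage sketch, not part of the PR: growing a loaded model's vocabulary by a couple of tokens. T5.from_pretrained_module("t5-small") is assumed here as the loading path; the new size is derived from the LM head, which the method above keeps in sync with the encoder and decoder embeddings.

from allennlp.modules.transformer.t5 import T5

model = T5.from_pretrained_module("t5-small")  # assumed loading helper

# Make room for two newly added task-specific tokens. The fresh embedding and
# LM-head rows are filled with torch.nn.init.normal_ unless init_fn is given.
new_size = model.lm_head.out_features + 2
model.resize_token_embeddings(new_size)
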

    def _post_load_state_dict(
        self, missing_keys: List[str], unexpected_keys: List[str]
    ) -> Tuple[List[str], List[str]]:
@@ -954,7 +1024,7 @@ def forward(
            logits = self._get_lm_logits(decoder_outputs.last_hidden_state)  # type: ignore[union-attr]

            # Shape: (1,)
            loss = self.loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
            loss = self.loss_fct(logits.view(-1, logits.size(-1)), labels.to(torch.long).view(-1))
        elif self.training:
            raise ValueError("'labels' required during training")

14 changes: 14 additions & 0 deletions tests/modules/transformer/t5_test.py
@@ -1,4 +1,5 @@
import pytest
import torch
from transformers.models import t5 as hf_t5

from allennlp.modules.transformer.t5 import T5
@@ -135,3 +136,16 @@ def _test_distributed_load_state_dict(global_rank, world_size, gpu_id):
@requires_multi_gpu
def test_distributed_load_state_dict():
run_distributed_test([0, 1], func=_test_distributed_load_state_dict)


@pytest.mark.parametrize("tie_word_embeddings", [True, False])
def test_t5_resize_token_embeddings(tie_word_embeddings: bool):
    module = T5(tie_word_embeddings=tie_word_embeddings)

    labels = torch.IntTensor([[1, 2, 3]])
    module(torch.IntTensor([[129, 130, 131]]), labels=labels)
    module.resize_token_embeddings(128)
    with pytest.raises(IndexError):
        module(torch.IntTensor([[129, 130, 131]]), labels=labels)
    module.resize_token_embeddings(1024)
    module(torch.IntTensor([[129, 130, 131]]), labels=labels)
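
An aside on the IndexError the test expects (illustration only, not part of the PR): after shrinking the vocabulary to 128 entries, ids 129-131 fall outside the embedding table, and a bare nn.Embedding fails the same way.

import torch

emb = torch.nn.Embedding(128, 16)  # same situation as the shrunken vocabulary above
try:
    emb(torch.LongTensor([[129, 130, 131]]))
except IndexError as err:
    print(f"Lookup failed as expected: {err}")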