From 3e9b5ae1d6d74dd666afc9570d4e6c52dd9287f0 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Wed, 2 May 2018 17:06:06 -0700 Subject: [PATCH 01/24] WIP: ESIM model --- allennlp/models/esim.py | 248 ++++++++++++++++++++++++++++++++++++++ training_config/esim.json | 79 ++++++++++++ 2 files changed, 327 insertions(+) create mode 100644 allennlp/models/esim.py create mode 100644 training_config/esim.json diff --git a/allennlp/models/esim.py b/allennlp/models/esim.py new file mode 100644 index 00000000000..92b113d4304 --- /dev/null +++ b/allennlp/models/esim.py @@ -0,0 +1,248 @@ +# TODO: projection dropout with ELMO +# l2 reg with ELMO +# multiple ELMO layers +# doc + +# init: +# for LSTM, use xavier_uniform for kernel, orthogonal for recurrent + +from typing import Dict, Optional + +import torch + +from allennlp.common import Params +from allennlp.common.checks import check_dimensions_match +from allennlp.data import Vocabulary +from allennlp.models.model import Model +from allennlp.modules import FeedForward, MatrixAttention +from allennlp.modules import Seq2SeqEncoder, SimilarityFunction, TimeDistributed, TextFieldEmbedder +from allennlp.nn import InitializerApplicator, RegularizerApplicator +from allennlp.nn.util import get_text_field_mask, last_dim_softmax, weighted_sum, replace_masked_values +from allennlp.training.metrics import CategoricalAccuracy + + +@Model.register("esim") +class ESIM(Model): + """ + This ``Model`` implements the ESIM sequence model described in `"Enhanced LSTM for Natural Language Inference" + `_ + by Chen et al., 2017. + + Parameters + ---------- + vocab : ``Vocabulary`` + text_field_embedder : ``TextFieldEmbedder`` + Used to embed the ``premise`` and ``hypothesis`` ``TextFields`` we get as input to the + model. + attend_feedforward : ``FeedForward`` + This feedforward network is applied to the encoded sentence representations before the + similarity matrix is computed between words in the premise and words in the hypothesis. + similarity_function : ``SimilarityFunction`` + This is the similarity function used when computing the similarity matrix between words in + the premise and words in the hypothesis. + compare_feedforward : ``FeedForward`` + This feedforward network is applied to the aligned premise and hypothesis representations, + individually. + aggregate_feedforward : ``FeedForward`` + This final feedforward network is applied to the concatenated, summed result of the + ``compare_feedforward`` network, and its output is used as the entailment class logits. + premise_encoder : ``Seq2SeqEncoder``, optional (default=``None``) + After embedding the premise, we can optionally apply an encoder. If this is ``None``, we + will do nothing. + hypothesis_encoder : ``Seq2SeqEncoder``, optional (default=``None``) + After embedding the hypothesis, we can optionally apply an encoder. If this is ``None``, + we will use the ``premise_encoder`` for the encoding (doing nothing if ``premise_encoder`` + is also ``None``). + initializer : ``InitializerApplicator``, optional (default=``InitializerApplicator()``) + Used to initialize the model parameters. + regularizer : ``RegularizerApplicator``, optional (default=``None``) + If provided, will be used to calculate the regularization penalty during training. 
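The constructor below wires the encoder, projection, inference encoder and output layers together. As a rough guide, the dimension bookkeeping for the GloVe-based configuration in training_config/esim.json works out as in this sketch (the concrete sizes come from that config, not from the model itself):

    embedding_dim = 300                  # text_field_embedder output (GloVe 840B.300d)
    encoder_out = 2 * 300                # 600: bidirectional encoder with hidden_size 300
    enhanced_dim = 4 * encoder_out       # 2400: [a; b; a - b; a * b] concatenation
    projection_out = 300                 # projection_feedforward maps 2400 -> 300
    inference_out = 2 * 300              # 600: bidirectional inference LSTM
    pooled_dim = 4 * inference_out       # 2400: [avg_a; max_a; avg_b; max_b]
    # the final feedforward consumes pooled_dim and emits the 3 entailment logits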
+ """ + def __init__(self, vocab: Vocabulary, + text_field_embedder: TextFieldEmbedder, + encoder: Seq2SeqEncoder, + similarity_function: SimilarityFunction, + projection_feedforward: FeedForward, + inference_encoder: Seq2SeqEncoder, + output_feedforward: FeedForward, + initializer: InitializerApplicator = InitializerApplicator(), + dropout: float = 0.5, + regularizer: Optional[RegularizerApplicator] = None) -> None: + super().__init__(vocab, regularizer) + + self._text_field_embedder = text_field_embedder + self._encoder = encoder + + self._matrix_attention = MatrixAttention(similarity_function) + self._projection_feedforward = projection_feedforward + + self._inference_encoder = inference_encoder + + if dropout: + self.dropout = torch.nn.Dropout(dropout) + else: + self.dropout = None + + self._output_feedforward = output_feedforward + + self._num_labels = vocab.get_vocab_size(namespace="labels") + + check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(), + "text field embedding dim", "encoder input dim") + check_dimensions_match(encoder.get_output_dim() * 4, projection_feedforward.get_input_dim(), + "encoder output dim", "projection feedforward input") + check_dimensions_match(projection_feedforward.get_output_dim(), inference_encoder.get_input_dim(), + "proj feedforward output dim", "inference lstm input dim") + check_dimensions_match(output_feedforward.get_output_dim(), self._num_labels, + "final output dimension", "number of labels") + + self._accuracy = CategoricalAccuracy() + self._loss = torch.nn.CrossEntropyLoss() + + initializer(self) + + def forward(self, # type: ignore + premise: Dict[str, torch.LongTensor], + hypothesis: Dict[str, torch.LongTensor], + label: torch.IntTensor = None) -> Dict[str, torch.Tensor]: + # pylint: disable=arguments-differ + """ + Parameters + ---------- + premise : Dict[str, torch.LongTensor] + From a ``TextField`` + hypothesis : Dict[str, torch.LongTensor] + From a ``TextField`` + label : torch.IntTensor, optional (default = None) + From a ``LabelField`` + + Returns + ------- + An output dictionary consisting of: + + label_logits : torch.FloatTensor + A tensor of shape ``(batch_size, num_labels)`` representing unnormalised log + probabilities of the entailment label. + label_probs : torch.FloatTensor + A tensor of shape ``(batch_size, num_labels)`` representing probabilities of the + entailment label. + loss : torch.FloatTensor, optional + A scalar loss to be optimised. 
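The method body that follows implements ESIM's soft alignment ("local inference") between the two sentences, plus the [a; b; a - b; a * b] enhancement. A minimal standalone sketch of the same computation in plain PyTorch, assuming a simple dot-product similarity and ignoring padding masks for brevity:

    import torch

    batch, p_len, h_len, dim = 2, 7, 5, 600
    encoded_premise = torch.randn(batch, p_len, dim)
    encoded_hypothesis = torch.randn(batch, h_len, dim)

    # (batch, p_len, h_len): similarity between every premise/hypothesis word pair
    similarity = torch.bmm(encoded_premise, encoded_hypothesis.transpose(1, 2))

    # For each premise word, a distribution over hypothesis words, and vice versa
    p2h = torch.nn.functional.softmax(similarity, dim=-1)
    h2p = torch.nn.functional.softmax(similarity.transpose(1, 2), dim=-1)

    attended_hypothesis = torch.bmm(p2h, encoded_hypothesis)  # (batch, p_len, dim)
    attended_premise = torch.bmm(h2p, encoded_premise)        # (batch, h_len, dim)

    # The "enhancement": concatenation gives 4 * dim features per premise position
    premise_enhanced = torch.cat(
        [encoded_premise, attended_hypothesis,
         encoded_premise - attended_hypothesis,
         encoded_premise * attended_hypothesis], dim=-1)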
+ """ + embedded_premise = self._text_field_embedder(premise) + embedded_hypothesis = self._text_field_embedder(hypothesis) + premise_mask = get_text_field_mask(premise).float() + hypothesis_mask = get_text_field_mask(hypothesis).float() + + # apply dropout + if self.dropout: + embedded_premise = self.dropout(embedded_premise) + embedded_hypothesis = self.dropout(embedded_hypothesis) + + # encode premise and hypothesis + encoded_premise = self._encoder(embedded_premise, premise_mask) + encoded_hypothesis = self._encoder(embedded_hypothesis, hypothesis_mask) + + # Shape: (batch_size, premise_length, hypothesis_length) + similarity_matrix = self._matrix_attention(encoded_premise, encoded_hypothesis) + + # Shape: (batch_size, premise_length, hypothesis_length) + p2h_attention = last_dim_softmax(similarity_matrix, hypothesis_mask) + # Shape: (batch_size, premise_length, embedding_dim) + attended_hypothesis = weighted_sum(embedded_hypothesis, p2h_attention) + + # Shape: (batch_size, hypothesis_length, premise_length) + h2p_attention = last_dim_softmax(similarity_matrix.transpose(1, 2).contiguous(), premise_mask) + # Shape: (batch_size, hypothesis_length, embedding_dim) + attended_premise = weighted_sum(embedded_premise, h2p_attention) + + # the "enhancement" layer + premise_enhanced = torch.cat( + [encoded_premise, attended_hypothesis, + encoded_premise - attended_hypothesis, + encoded_premise * attended_hypothesis], + dim=-1 + ) + hypothesis_enhanced = torch.cat( + [encoded_hypothesis, attended_premise, + encoded_hypothesis - attended_premise, + encoded_hypothesis * attended_premise], + dim=-1 + ) + + projected_enhanced_premise = self._projection_feedforward(premise_enhanced) + + # the projection layer down to the model dimension + projected_enhanced_premise = self._projection_feedforward(premise_enhanced) + projected_enhanced_hypothesis = self._projection_feedforward(hypothesis_enhanced) + + # Run the inference layer + if self.dropout: + projected_enhanced_premise = self.dropout(projected_enhanced_premise) + projected_enhanced_hypothesis = self.dropout(projected_enhanced_hypothesis) + v_ai = self._inference_encoder(projected_enhanced_premise) + v_bi = self._inference_encoder(projected_enhanced_hypothesis) + + # The pooling layer -- max and avg pooling. 
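Masked pooling is the subtle step here: padding positions must not win the max or inflate the average. A small standalone sketch of the idea, assuming a float mask with 1 for real tokens and 0 for padding:

    import torch

    v = torch.randn(2, 6, 600)                        # (batch, timesteps, model_dim)
    mask = torch.tensor([[1., 1., 1., 0., 0., 0.],
                         [1., 1., 1., 1., 1., 0.]])   # (batch, timesteps)

    # Max pooling: first push padded positions to a very negative value
    very_negative = (1 - mask).unsqueeze(-1) * -1e7
    v_max, _ = (v * mask.unsqueeze(-1) + very_negative).max(dim=1)  # (batch, model_dim)

    # Average pooling: zero out padding and divide by the true lengths
    v_avg = (v * mask.unsqueeze(-1)).sum(dim=1) / mask.sum(dim=1, keepdim=True)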
+ # (batch_size, model_dim) + v_a_max = replace_masked_values( + v_ai, premise_mask.unsqueeze(-1), -1e7 + ).max(dim=1) + v_b_max = replace_masked_values( + v_bi, hypothesis_mask.unsqueeze(-1), -1e7 + ).max(dim=1) + + v_a_avg = torch.sum(v_ai * premise_mask.unsqueeze(-1), dim=1) / torch.sum(premise_mask, 1, keepdim=True) + v_b_avg = torch.sum(v_bi * hypothesis_mask.unsqueeze(-1), dim=1) / torch.sum(hypothesis_mask, 1, keepdim=True) + + # Now concat + # (batch_size, model_dim * 2 * 4) + v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max], dim=1) + + # the final MLP -- apply dropout to input, and MLP applies to output & hidden + if self.dropout: + v = self.dropout(v) + + label_logits = self._output_feedforward(v) + label_probs = torch.nn.functional.softmax(label_logits, dim=-1) + + output_dict = {"label_logits": label_logits, "label_probs": label_probs} + + if label is not None: + loss = self._loss(label_logits, label.long().view(-1)) + self._accuracy(label_logits, label.squeeze(-1)) + output_dict["loss"] = loss + + return output_dict + + def get_metrics(self, reset: bool = False) -> Dict[str, float]: + return { + 'accuracy': self._accuracy.get_metric(reset), + } + + @classmethod + def from_params(cls, vocab: Vocabulary, params: Params) -> 'DecomposableAttention': + embedder_params = params.pop("text_field_embedder") + text_field_embedder = TextFieldEmbedder.from_params(vocab, embedder_params) + + encoder = Seq2SeqEncoder.from_params(params.pop("encoder")) + similarity_function = SimilarityFunction.from_params(params.pop("similarity_function")) + projection_feedforward = FeedForward.from_params(params.pop('projection_feedforward')) + inference_encoder = Seq2SeqEncoder.from_params(params.pop("inference_encoder")) + output_feedforward = FeedForward.from_params(params.pop('output_feedforward')) + initializer = InitializerApplicator.from_params(params.pop('initializer', [])) + regularizer = RegularizerApplicator.from_params(params.pop('regularizer', [])) + + dropout = params.pop("dropout", 0) + + params.assert_empty(cls.__name__) + return cls(vocab=vocab, + text_field_embedder=text_field_embedder, + encoder=encoder, + similarity_function=similarity_function, + projection_feedforward=projection_feedforward, + inference_encoder=inference_encoder, + output_feedforward=output_feedforward, + initializer=initializer, + dropout=dropout, + regularizer=regularizer) diff --git a/training_config/esim.json b/training_config/esim.json new file mode 100644 index 00000000000..9ce1b03f13c --- /dev/null +++ b/training_config/esim.json @@ -0,0 +1,79 @@ +{ + "dataset_reader": { + "type": "snli", + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": false, + } + }, + }, + "train_data_path": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/snli/snli_1.0_train.jsonl", + "validation_data_path": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/snli/snli_1.0_dev.jsonl", + "model": { + "type": "esim", + "dropout": 0.5, + "text_field_embedder": { + "tokens": { + "type": "embedding", + "pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.840B.300d.txt.gz", + "embedding_dim": 300, + "trainable": false + } + }, + "encoder": { + "type": "lstm", + "input_size": 300, + "hidden_size": 300, + "num_layers": 1, + "bidirectional": true + }, + "similarity_function": {"type": "dot_product"}, + "projection_feedforward": { + "input_dim": 2400, + "hiddem_dims": 300, + "num_layers": 1, + "activations": "relu", + }, + "inference_encoder": { + "type": "lstm", + "input_size": 
300, + "hidden_size": 300, + "num_layers": 1, + "bidirectional": true + }, + "output_feedforward": { + "input_dim": 2400, + "num_layers": 2, + "hidden_dims": [300, 3], + "activations": "relu", + "dropout": 0.5 + }, + "initializer": [ + [".*linear_layers.*weight", {"type": "xavier_uniform"}] + ] + }, + "iterator": { + "type": "bucket", + "sorting_keys": [["premise", "num_tokens"], ["hypothesis", "num_tokens"]], + "batch_size": 32 + }, + "trainer": { + "optimizer": { + "type": "adam", + "lr": 0.0004, + }, + "validation_metric": "+accuracy", + "num_serialized_models_to_keep": 2, + "num_epochs": 75, + "grad_norm": 10.0, + "patience": 3, + "cuda_device": 0, + "learning_rate_scheduler": { + "type": "reduce_on_plateau", + "factor": 0.5, + "mode": "max", + "patience": 0 + } + } +} From 8127486ed0c1a2b54723134a2d7e2a7fe095c2e4 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Wed, 2 May 2018 17:39:35 -0700 Subject: [PATCH 02/24] WIP: ESIM model for SNLI --- allennlp/models/__init__.py | 1 + allennlp/models/esim.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/allennlp/models/__init__.py b/allennlp/models/__init__.py index 5c7f7bc4eed..59a23409c85 100644 --- a/allennlp/models/__init__.py +++ b/allennlp/models/__init__.py @@ -17,3 +17,4 @@ from allennlp.models.semantic_parsing.wikitables.wikitables_semantic_parser import WikiTablesSemanticParser from allennlp.models.semantic_role_labeler import SemanticRoleLabeler from allennlp.models.simple_tagger import SimpleTagger +from allennlp.models.esim import ESIM diff --git a/allennlp/models/esim.py b/allennlp/models/esim.py index 92b113d4304..fec04b6171b 100644 --- a/allennlp/models/esim.py +++ b/allennlp/models/esim.py @@ -149,12 +149,12 @@ def forward(self, # type: ignore # Shape: (batch_size, premise_length, hypothesis_length) p2h_attention = last_dim_softmax(similarity_matrix, hypothesis_mask) # Shape: (batch_size, premise_length, embedding_dim) - attended_hypothesis = weighted_sum(embedded_hypothesis, p2h_attention) + attended_hypothesis = weighted_sum(encoded_hypothesis, p2h_attention) # Shape: (batch_size, hypothesis_length, premise_length) h2p_attention = last_dim_softmax(similarity_matrix.transpose(1, 2).contiguous(), premise_mask) # Shape: (batch_size, hypothesis_length, embedding_dim) - attended_premise = weighted_sum(embedded_premise, h2p_attention) + attended_premise = weighted_sum(encoded_premise, h2p_attention) # the "enhancement" layer premise_enhanced = torch.cat( @@ -180,8 +180,8 @@ def forward(self, # type: ignore if self.dropout: projected_enhanced_premise = self.dropout(projected_enhanced_premise) projected_enhanced_hypothesis = self.dropout(projected_enhanced_hypothesis) - v_ai = self._inference_encoder(projected_enhanced_premise) - v_bi = self._inference_encoder(projected_enhanced_hypothesis) + v_ai = self._inference_encoder(projected_enhanced_premise, premise_mask) + v_bi = self._inference_encoder(projected_enhanced_hypothesis, hypothesis_mask) # The pooling layer -- max and avg pooling. 
# (batch_size, model_dim) From 5d775f72ccba4ad4a8a20f1b591b5977d7c986cc Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Wed, 2 May 2018 17:44:57 -0700 Subject: [PATCH 03/24] WIP: ESIM --- allennlp/models/esim.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/allennlp/models/esim.py b/allennlp/models/esim.py index fec04b6171b..8c9f47b8af2 100644 --- a/allennlp/models/esim.py +++ b/allennlp/models/esim.py @@ -185,10 +185,10 @@ def forward(self, # type: ignore # The pooling layer -- max and avg pooling. # (batch_size, model_dim) - v_a_max = replace_masked_values( + v_a_max, _ = replace_masked_values( v_ai, premise_mask.unsqueeze(-1), -1e7 ).max(dim=1) - v_b_max = replace_masked_values( + v_b_max, _ = replace_masked_values( v_bi, hypothesis_mask.unsqueeze(-1), -1e7 ).max(dim=1) From 98291602ade292b021ac1ca82ff98f5efe3ae929 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Wed, 2 May 2018 19:51:29 -0700 Subject: [PATCH 04/24] WIP: ESIM --- allennlp/models/esim.py | 5 ----- allennlp/nn/initializers.py | 17 ++++++++++++++++- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/allennlp/models/esim.py b/allennlp/models/esim.py index 8c9f47b8af2..d1732637f86 100644 --- a/allennlp/models/esim.py +++ b/allennlp/models/esim.py @@ -3,9 +3,6 @@ # multiple ELMO layers # doc -# init: -# for LSTM, use xavier_uniform for kernel, orthogonal for recurrent - from typing import Dict, Optional import torch @@ -170,8 +167,6 @@ def forward(self, # type: ignore dim=-1 ) - projected_enhanced_premise = self._projection_feedforward(premise_enhanced) - # the projection layer down to the model dimension projected_enhanced_premise = self._projection_feedforward(premise_enhanced) projected_enhanced_hypothesis = self._projection_feedforward(hypothesis_enhanced) diff --git a/allennlp/nn/initializers.py b/allennlp/nn/initializers.py index 5c7ccba355c..045f7067f2d 100644 --- a/allennlp/nn/initializers.py +++ b/allennlp/nn/initializers.py @@ -153,6 +153,19 @@ def block_orthogonal(tensor: torch.Tensor, tensor[block_slice] = torch.nn.init.orthogonal(tensor[block_slice].contiguous(), gain=gain) +def zero(tensor: torch.Tensor) -> None: + return tensor.data.zero_() + +def lstm_hidden_bias(tensor: torch.Tensor) -> None: + """ + Initialize the biases of the forget gate to 1, and all other gates to 0, + following Jozefowicz et al., An Empirical Exploration of Recurrent Network Architectures + """ + # gates are (b_hi|b_hf|b_hg|b_ho) of shape (4*hidden_size) + tensor.data.zero_() + hidden_size = tensor.shape[0] // 4 + tensor.data[hidden_size:(2 * hidden_size)] = 1.0 + def _initializer_wrapper(init_function: Callable[..., None]) -> Type[Initializer]: class Init(Initializer): def __init__(self, **kwargs): @@ -182,7 +195,9 @@ def from_params(cls, params: Params): "sparse": _initializer_wrapper(torch.nn.init.sparse), "eye": _initializer_wrapper(torch.nn.init.eye), "block_orthogonal": _initializer_wrapper(block_orthogonal), - "uniform_unit_scaling": _initializer_wrapper(uniform_unit_scaling) + "uniform_unit_scaling": _initializer_wrapper(uniform_unit_scaling), + "zero": _initializer_wrapper(zero), + "lstm_hidden_bias": _initializer_wrapper(lstm_hidden_bias), } From 8c511114abc48ec3ac3e403de2262114378b6b7b Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Wed, 2 May 2018 21:51:05 -0700 Subject: [PATCH 05/24] WIP: ESIM --- allennlp/models/esim.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/allennlp/models/esim.py b/allennlp/models/esim.py index d1732637f86..0d501d19ab2 
100644 --- a/allennlp/models/esim.py +++ b/allennlp/models/esim.py @@ -62,6 +62,7 @@ def __init__(self, vocab: Vocabulary, projection_feedforward: FeedForward, inference_encoder: Seq2SeqEncoder, output_feedforward: FeedForward, + output_logit: FeedForward, initializer: InitializerApplicator = InitializerApplicator(), dropout: float = 0.5, regularizer: Optional[RegularizerApplicator] = None) -> None: @@ -81,6 +82,7 @@ def __init__(self, vocab: Vocabulary, self.dropout = None self._output_feedforward = output_feedforward + self._output_logit = output_logit self._num_labels = vocab.get_vocab_size(namespace="labels") @@ -90,8 +92,6 @@ def __init__(self, vocab: Vocabulary, "encoder output dim", "projection feedforward input") check_dimensions_match(projection_feedforward.get_output_dim(), inference_encoder.get_input_dim(), "proj feedforward output dim", "inference lstm input dim") - check_dimensions_match(output_feedforward.get_output_dim(), self._num_labels, - "final output dimension", "number of labels") self._accuracy = CategoricalAccuracy() self._loss = torch.nn.CrossEntropyLoss() @@ -198,7 +198,8 @@ def forward(self, # type: ignore if self.dropout: v = self.dropout(v) - label_logits = self._output_feedforward(v) + output_hidden = self._output_feedforward(v) + label_logits = self._output_logit(output_hidden) label_probs = torch.nn.functional.softmax(label_logits, dim=-1) output_dict = {"label_logits": label_logits, "label_probs": label_probs} @@ -225,6 +226,7 @@ def from_params(cls, vocab: Vocabulary, params: Params) -> 'DecomposableAttentio projection_feedforward = FeedForward.from_params(params.pop('projection_feedforward')) inference_encoder = Seq2SeqEncoder.from_params(params.pop("inference_encoder")) output_feedforward = FeedForward.from_params(params.pop('output_feedforward')) + output_logit = FeedForward.from_params(params.pop('output_logit')) initializer = InitializerApplicator.from_params(params.pop('initializer', [])) regularizer = RegularizerApplicator.from_params(params.pop('regularizer', [])) @@ -238,6 +240,7 @@ def from_params(cls, vocab: Vocabulary, params: Params) -> 'DecomposableAttentio projection_feedforward=projection_feedforward, inference_encoder=inference_encoder, output_feedforward=output_feedforward, + output_logit=output_logit, initializer=initializer, dropout=dropout, regularizer=regularizer) From 3e1faac57c96ad28aa57358c7b6c06513781bbba Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Fri, 4 May 2018 09:33:52 -0700 Subject: [PATCH 06/24] WIP: ESIM --- allennlp/models/esim.py | 37 ++++++++++++++++++++++++++++++------- training_config/esim.json | 29 ++++++++++++++++++++--------- 2 files changed, 50 insertions(+), 16 deletions(-) diff --git a/allennlp/models/esim.py b/allennlp/models/esim.py index 0d501d19ab2..1e63b16ff44 100644 --- a/allennlp/models/esim.py +++ b/allennlp/models/esim.py @@ -6,6 +6,7 @@ from typing import Dict, Optional import torch +from torch.autograd import Variable from allennlp.common import Params from allennlp.common.checks import check_dimensions_match @@ -17,6 +18,21 @@ from allennlp.nn.util import get_text_field_mask, last_dim_softmax, weighted_sum, replace_masked_values from allennlp.training.metrics import CategoricalAccuracy +class VariationalDropout(torch.nn.Dropout): + def forward(self, input): + """ + input is shape (batch_size, timesteps, embedding_dim) + Samples one mask of size (batch_size, embedding_dim) and applies it to every time step. 
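The point of sampling a single mask is that the same features are dropped at every time step of a sequence, rather than re-sampled per step as with ordinary dropout. A quick standalone sketch of the effect:

    import torch

    x = torch.ones(1, 4, 6)                        # (batch, timesteps, embedding_dim)
    ones = x.new_ones(1, 6)                        # one mask per (batch, feature)
    mask = torch.nn.functional.dropout(ones, p=0.5, training=True)
    y = x * mask.unsqueeze(1)                      # broadcast over the time dimension
    # Every row of y[0] is identical: each feature is either kept (scaled by 1 / 0.5)
    # or zeroed at all four time steps, whereas dropout applied directly to x would
    # re-sample the mask independently at every position.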
+ """ + #ones = Variable(torch.ones(input.shape[0], input.shape[-1])) + ones = Variable(input.data.new(input.shape[0], input.shape[-1]).fill_(1)) + dropout_mask = torch.nn.functional.dropout(ones, self.p, self.training, inplace=False) + if self.inplace: + input *= dropout_mask.unsqueeze(1) + return None + else: + return dropout_mask.unsqueeze(1) * input + @Model.register("esim") class ESIM(Model): @@ -78,8 +94,10 @@ def __init__(self, vocab: Vocabulary, if dropout: self.dropout = torch.nn.Dropout(dropout) + self.rnn_input_dropout = VariationalDropout(dropout) else: self.dropout = None + self.rnn_input_dropout = None self._output_feedforward = output_feedforward self._output_logit = output_logit @@ -131,10 +149,10 @@ def forward(self, # type: ignore premise_mask = get_text_field_mask(premise).float() hypothesis_mask = get_text_field_mask(hypothesis).float() - # apply dropout - if self.dropout: - embedded_premise = self.dropout(embedded_premise) - embedded_hypothesis = self.dropout(embedded_hypothesis) + # apply dropout for LSTM + if self.rnn_input_dropout: + embedded_premise = self.rnn_input_dropout(embedded_premise) + embedded_hypothesis = self.rnn_input_dropout(embedded_hypothesis) # encode premise and hypothesis encoded_premise = self._encoder(embedded_premise, premise_mask) @@ -167,14 +185,19 @@ def forward(self, # type: ignore dim=-1 ) + # embedding -> lstm w/ do -> enhanced attention -> dropout_proj, only if ELMO -> ff proj -> lstm w/ do -> dropout -> ff 300 -> dropout -> output + + # add dropout here with ELMO + # the projection layer down to the model dimension + # no dropout in projection projected_enhanced_premise = self._projection_feedforward(premise_enhanced) projected_enhanced_hypothesis = self._projection_feedforward(hypothesis_enhanced) # Run the inference layer - if self.dropout: - projected_enhanced_premise = self.dropout(projected_enhanced_premise) - projected_enhanced_hypothesis = self.dropout(projected_enhanced_hypothesis) + if self.rnn_input_dropout: + projected_enhanced_premise = self.rnn_input_dropout(projected_enhanced_premise) + projected_enhanced_hypothesis = self.rnn_input_dropout(projected_enhanced_hypothesis) v_ai = self._inference_encoder(projected_enhanced_premise, premise_mask) v_bi = self._inference_encoder(projected_enhanced_hypothesis, hypothesis_mask) diff --git a/training_config/esim.json b/training_config/esim.json index 9ce1b03f13c..269fc0d8644 100644 --- a/training_config/esim.json +++ b/training_config/esim.json @@ -4,9 +4,9 @@ "token_indexers": { "tokens": { "type": "single_id", - "lowercase_tokens": false, + "lowercase_tokens": false } - }, + } }, "train_data_path": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/snli/snli_1.0_train.jsonl", "validation_data_path": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/snli/snli_1.0_dev.jsonl", @@ -31,9 +31,9 @@ "similarity_function": {"type": "dot_product"}, "projection_feedforward": { "input_dim": 2400, - "hiddem_dims": 300, + "hidden_dims": 300, "num_layers": 1, - "activations": "relu", + "activations": "relu" }, "inference_encoder": { "type": "lstm", @@ -44,13 +44,24 @@ }, "output_feedforward": { "input_dim": 2400, - "num_layers": 2, - "hidden_dims": [300, 3], + "num_layers": 1, + "hidden_dims": 300, "activations": "relu", "dropout": 0.5 + }, + "output_logit": { + "input_dim": 300, + "num_layers": 1, + "hidden_dims": 3, + "activations": "linear" }, "initializer": [ - [".*linear_layers.*weight", {"type": "xavier_uniform"}] + [".*linear_layers.*weight", {"type": "xavier_uniform"}], + 
[".*linear_layers.*bias", {"type": "zero"}], + [".*weight_ih.*", {"type": "xavier_uniform"}], + [".*weight_hh.*", {"type": "orthogonal"}], + [".*bias_ih.*", {"type": "zero"}], + [".*bias_hh.*", {"type": "lstm_hidden_bias"}] ] }, "iterator": { @@ -61,13 +72,13 @@ "trainer": { "optimizer": { "type": "adam", - "lr": 0.0004, + "lr": 0.0004 }, "validation_metric": "+accuracy", "num_serialized_models_to_keep": 2, "num_epochs": 75, "grad_norm": 10.0, - "patience": 3, + "patience": 5, "cuda_device": 0, "learning_rate_scheduler": { "type": "reduce_on_plateau", From 9db872d2bc9eebd546ac6178ed84686c0abe4422 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 7 May 2018 11:28:06 -0700 Subject: [PATCH 07/24] ESLM model with ELMo --- training_config/esim.json | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/training_config/esim.json b/training_config/esim.json index 269fc0d8644..348b3bd299e 100644 --- a/training_config/esim.json +++ b/training_config/esim.json @@ -2,10 +2,9 @@ "dataset_reader": { "type": "snli", "token_indexers": { - "tokens": { - "type": "single_id", - "lowercase_tokens": false - } + "elmo": { + "type": "elmo_characters" + } } }, "train_data_path": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/snli/snli_1.0_train.jsonl", @@ -14,16 +13,17 @@ "type": "esim", "dropout": 0.5, "text_field_embedder": { - "tokens": { - "type": "embedding", - "pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.840B.300d.txt.gz", - "embedding_dim": 300, - "trainable": false + "elmo":{ + "type": "elmo_token_embedder", + "options_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json", + "weight_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5", + "do_layer_norm": false, + "dropout": 0.0 } }, "encoder": { "type": "lstm", - "input_size": 300, + "input_size": 1024, "hidden_size": 300, "num_layers": 1, "bidirectional": true From c79bd998feea53af66286fbd863fae39d8b47839 Mon Sep 17 00:00:00 2001 From: Matt Peters Date: Mon, 14 May 2018 13:26:05 -0700 Subject: [PATCH 08/24] Add a ESIM predictor that works with SNLI formatted files --- allennlp/service/predictors/__init__.py | 1 + allennlp/service/predictors/esim.py | 41 +++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 allennlp/service/predictors/esim.py diff --git a/allennlp/service/predictors/__init__.py b/allennlp/service/predictors/__init__.py index c11c497ed7b..6e746a6a6eb 100644 --- a/allennlp/service/predictors/__init__.py +++ b/allennlp/service/predictors/__init__.py @@ -16,3 +16,4 @@ from .simple_seq2seq import SimpleSeq2SeqPredictor from .wikitables_parser import WikiTablesParserPredictor from .nlvr_parser import NlvrParserPredictor +from .esim import ESIMPredictor diff --git a/allennlp/service/predictors/esim.py b/allennlp/service/predictors/esim.py new file mode 100644 index 00000000000..5ea63c7fda8 --- /dev/null +++ b/allennlp/service/predictors/esim.py @@ -0,0 +1,41 @@ +from typing import Tuple +from overrides import overrides + +from allennlp.common.util import JsonDict +from allennlp.data import Instance +from allennlp.service.predictors.predictor import Predictor + + +@Predictor.register('esim') +class ESIMPredictor(Predictor): + """ + Predictor for the :class:`~allennlp.models.esim.ESIM` model. 
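A hypothetical call, assuming ``predictor`` is an ESIMPredictor built from a trained ESIM archive:

    # `predictor` is assumed to already exist, e.g. built from a trained archive.
    result = predictor.predict(
        sentence1="Two dogs are running through a field.",
        sentence2="There are animals outdoors.",
    )
    # result["label_probs"] holds the probabilities of
    # [entailment, contradiction, neutral] for this pair.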
+ """ + + def predict(self, sentence1: str, sentence2: str) -> JsonDict: + """ + Predicts whether the sentence2 is entailed by the sentence1 text. + + Parameters + ---------- + sentence1 : ``str`` + A passage representing what is assumed to be true. + + sentence2 : ``str`` + A sentence that may be entailed by the sentence1. + + Returns + ------- + A dictionary where the key "label_probs" determines the probabilities of each of + [entailment, contradiction, neutral]. + """ + return self.predict_json({"sentence1" : sentence1, "sentence2": sentence2}) + + @overrides + def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]: + """ + Expects JSON that looks like ``{"sentence1": "...", "sentence2": "..."}``. + """ + sentence1_text = json_dict["sentence1"] + sentence2_text = json_dict["sentence2"] + return self._dataset_reader.text_to_instance(sentence1_text, sentence2_text), {} From 3f173beda59a33f8403cc75eada202f0391b4dc3 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Wed, 30 May 2018 14:23:47 -0700 Subject: [PATCH 09/24] Move ESIM predictor --- allennlp/service/predictors/esim.py | 41 ----------------------------- 1 file changed, 41 deletions(-) delete mode 100644 allennlp/service/predictors/esim.py diff --git a/allennlp/service/predictors/esim.py b/allennlp/service/predictors/esim.py deleted file mode 100644 index 5ea63c7fda8..00000000000 --- a/allennlp/service/predictors/esim.py +++ /dev/null @@ -1,41 +0,0 @@ -from typing import Tuple -from overrides import overrides - -from allennlp.common.util import JsonDict -from allennlp.data import Instance -from allennlp.service.predictors.predictor import Predictor - - -@Predictor.register('esim') -class ESIMPredictor(Predictor): - """ - Predictor for the :class:`~allennlp.models.esim.ESIM` model. - """ - - def predict(self, sentence1: str, sentence2: str) -> JsonDict: - """ - Predicts whether the sentence2 is entailed by the sentence1 text. - - Parameters - ---------- - sentence1 : ``str`` - A passage representing what is assumed to be true. - - sentence2 : ``str`` - A sentence that may be entailed by the sentence1. - - Returns - ------- - A dictionary where the key "label_probs" determines the probabilities of each of - [entailment, contradiction, neutral]. - """ - return self.predict_json({"sentence1" : sentence1, "sentence2": sentence2}) - - @overrides - def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]: - """ - Expects JSON that looks like ``{"sentence1": "...", "sentence2": "..."}``. 
- """ - sentence1_text = json_dict["sentence1"] - sentence2_text = json_dict["sentence2"] - return self._dataset_reader.text_to_instance(sentence1_text, sentence2_text), {} From 2b722cebe3ab91895c31d74e04058d632bad6cfb Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 13:29:49 -0700 Subject: [PATCH 10/24] Clean up --- allennlp/models/esim.py | 84 +++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/allennlp/models/esim.py b/allennlp/models/esim.py index 1e63b16ff44..3fc8d11a1e5 100644 --- a/allennlp/models/esim.py +++ b/allennlp/models/esim.py @@ -1,8 +1,3 @@ -# TODO: projection dropout with ELMO -# l2 reg with ELMO -# multiple ELMO layers -# doc - from typing import Dict, Optional import torch @@ -18,14 +13,31 @@ from allennlp.nn.util import get_text_field_mask, last_dim_softmax, weighted_sum, replace_masked_values from allennlp.training.metrics import CategoricalAccuracy -class VariationalDropout(torch.nn.Dropout): +class InputVariationalDropout(torch.nn.Dropout): + """ + Apply the dropout technique in Gal and Ghahramani, "Dropout as a Bayesian Approximation: + Representing Model Uncertainty in Deep Learning" (https://arxiv.org/abs/1506.02142) to a + 3D tensor. + + This module accepts a 3D tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` + and samples a single dropout mask of shape ``(batch_size, embedding_dim)`` and applies + it to every time step. + """ def forward(self, input): """ - input is shape (batch_size, timesteps, embedding_dim) - Samples one mask of size (batch_size, embedding_dim) and applies it to every time step. + Apply dropout to input tensor. + + Parameters + ---------- + input: torch.FloatTensor + A tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` + + Returns + ------- + output: torch.FloatTensor + A tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` with dropout applied. """ - #ones = Variable(torch.ones(input.shape[0], input.shape[-1])) - ones = Variable(input.data.new(input.shape[0], input.shape[-1]).fill_(1)) + ones = Variable(input.data.new_ones(input.shape[0], input.shape[-1])) dropout_mask = torch.nn.functional.dropout(ones, self.p, self.training, inplace=False) if self.inplace: input *= dropout_mask.unsqueeze(1) @@ -47,25 +59,21 @@ class ESIM(Model): text_field_embedder : ``TextFieldEmbedder`` Used to embed the ``premise`` and ``hypothesis`` ``TextFields`` we get as input to the model. - attend_feedforward : ``FeedForward`` - This feedforward network is applied to the encoded sentence representations before the - similarity matrix is computed between words in the premise and words in the hypothesis. + encoder : ``Seq2SeqEncoder`` + Used to encode the premise and hypothesis. similarity_function : ``SimilarityFunction`` - This is the similarity function used when computing the similarity matrix between words in - the premise and words in the hypothesis. - compare_feedforward : ``FeedForward`` - This feedforward network is applied to the aligned premise and hypothesis representations, - individually. - aggregate_feedforward : ``FeedForward`` - This final feedforward network is applied to the concatenated, summed result of the - ``compare_feedforward`` network, and its output is used as the entailment class logits. - premise_encoder : ``Seq2SeqEncoder``, optional (default=``None``) - After embedding the premise, we can optionally apply an encoder. If this is ``None``, we - will do nothing. 
- hypothesis_encoder : ``Seq2SeqEncoder``, optional (default=``None``) - After embedding the hypothesis, we can optionally apply an encoder. If this is ``None``, - we will use the ``premise_encoder`` for the encoding (doing nothing if ``premise_encoder`` - is also ``None``). + This is the similarity function used when computing the similarity matrix between encoded + words in the premise and words in the hypothesis. + projection_feedforward : ``FeedForward`` + The feedforward network used to project down the encoded and enhanced premise and hypothesis. + inference_encoder : ``Seq2SeqEncoder`` + Used to encode the projected premise and hypothesis for prediction. + output_feedforward : ``FeedForward`` + Used to prepare the concatenated premise and hypothesis for prediction. + output_logit : ``FeedForward`` + This feedforward network computes the output logits. + dropout : ``float``, optional (default=0.5) + Dropout percentage to use. initializer : ``InitializerApplicator``, optional (default=``InitializerApplicator()``) Used to initialize the model parameters. regularizer : ``RegularizerApplicator``, optional (default=``None``) @@ -79,8 +87,8 @@ def __init__(self, vocab: Vocabulary, inference_encoder: Seq2SeqEncoder, output_feedforward: FeedForward, output_logit: FeedForward, - initializer: InitializerApplicator = InitializerApplicator(), dropout: float = 0.5, + initializer: InitializerApplicator = InitializerApplicator(), regularizer: Optional[RegularizerApplicator] = None) -> None: super().__init__(vocab, regularizer) @@ -94,7 +102,7 @@ def __init__(self, vocab: Vocabulary, if dropout: self.dropout = torch.nn.Dropout(dropout) - self.rnn_input_dropout = VariationalDropout(dropout) + self.rnn_input_dropout = InputVariationalDropout(dropout) else: self.dropout = None self.rnn_input_dropout = None @@ -185,12 +193,8 @@ def forward(self, # type: ignore dim=-1 ) - # embedding -> lstm w/ do -> enhanced attention -> dropout_proj, only if ELMO -> ff proj -> lstm w/ do -> dropout -> ff 300 -> dropout -> output - - # add dropout here with ELMO - - # the projection layer down to the model dimension - # no dropout in projection + # The projection layer down to the model dimension. Dropout is not applied before + # projection. 
projected_enhanced_premise = self._projection_feedforward(premise_enhanced) projected_enhanced_hypothesis = self._projection_feedforward(hypothesis_enhanced) @@ -235,12 +239,10 @@ def forward(self, # type: ignore return output_dict def get_metrics(self, reset: bool = False) -> Dict[str, float]: - return { - 'accuracy': self._accuracy.get_metric(reset), - } + return {'accuracy': self._accuracy.get_metric(reset)} @classmethod - def from_params(cls, vocab: Vocabulary, params: Params) -> 'DecomposableAttention': + def from_params(cls, vocab: Vocabulary, params: Params) -> 'ESIM': embedder_params = params.pop("text_field_embedder") text_field_embedder = TextFieldEmbedder.from_params(vocab, embedder_params) @@ -264,6 +266,6 @@ def from_params(cls, vocab: Vocabulary, params: Params) -> 'DecomposableAttentio inference_encoder=inference_encoder, output_feedforward=output_feedforward, output_logit=output_logit, - initializer=initializer, dropout=dropout, + initializer=initializer, regularizer=regularizer) From fa5e670bef66d1a7707ea82793897988268fdbfe Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 14:18:51 -0700 Subject: [PATCH 11/24] Add test for ESIM --- allennlp/models/esim.py | 15 ++-- allennlp/tests/fixtures/esim/experiment.json | 92 ++++++++++++++++++++ allennlp/tests/models/esim_test.py | 27 ++++++ 3 files changed, 129 insertions(+), 5 deletions(-) create mode 100644 allennlp/tests/fixtures/esim/experiment.json create mode 100644 allennlp/tests/models/esim_test.py diff --git a/allennlp/models/esim.py b/allennlp/models/esim.py index 3fc8d11a1e5..446165d57e4 100644 --- a/allennlp/models/esim.py +++ b/allennlp/models/esim.py @@ -1,4 +1,4 @@ -from typing import Dict, Optional +from typing import Dict, Optional, List, Any import torch from torch.autograd import Variable @@ -7,7 +7,8 @@ from allennlp.common.checks import check_dimensions_match from allennlp.data import Vocabulary from allennlp.models.model import Model -from allennlp.modules import FeedForward, MatrixAttention +from allennlp.modules import FeedForward +from allennlp.modules.matrix_attention.legacy_matrix_attention import LegacyMatrixAttention from allennlp.modules import Seq2SeqEncoder, SimilarityFunction, TimeDistributed, TextFieldEmbedder from allennlp.nn import InitializerApplicator, RegularizerApplicator from allennlp.nn.util import get_text_field_mask, last_dim_softmax, weighted_sum, replace_masked_values @@ -95,7 +96,7 @@ def __init__(self, vocab: Vocabulary, self._text_field_embedder = text_field_embedder self._encoder = encoder - self._matrix_attention = MatrixAttention(similarity_function) + self._matrix_attention = LegacyMatrixAttention(similarity_function) self._projection_feedforward = projection_feedforward self._inference_encoder = inference_encoder @@ -127,7 +128,8 @@ def __init__(self, vocab: Vocabulary, def forward(self, # type: ignore premise: Dict[str, torch.LongTensor], hypothesis: Dict[str, torch.LongTensor], - label: torch.IntTensor = None) -> Dict[str, torch.Tensor]: + label: torch.IntTensor = None, + metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]: # pylint: disable=arguments-differ """ Parameters @@ -138,6 +140,9 @@ def forward(self, # type: ignore From a ``TextField`` label : torch.IntTensor, optional (default = None) From a ``LabelField`` + metadata : ``List[Dict[str, Any]]``, optional, (default = None) + Metadata containing the original tokenization of the premise and + hypothesis with 'premise_tokens' and 'hypothesis_tokens' keys respectively. 
Returns ------- @@ -233,7 +238,7 @@ def forward(self, # type: ignore if label is not None: loss = self._loss(label_logits, label.long().view(-1)) - self._accuracy(label_logits, label.squeeze(-1)) + self._accuracy(label_logits, label) output_dict["loss"] = loss return output_dict diff --git a/allennlp/tests/fixtures/esim/experiment.json b/allennlp/tests/fixtures/esim/experiment.json new file mode 100644 index 00000000000..772b261b789 --- /dev/null +++ b/allennlp/tests/fixtures/esim/experiment.json @@ -0,0 +1,92 @@ +{ + "dataset_reader": { + "type": "snli", + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + } + } + }, + "train_data_path": "allennlp/tests/fixtures/data/snli.jsonl", + "validation_data_path": "allennlp/tests/fixtures/data/snli.jsonl", + "model": { + "type": "esim", + "dropout": 0.5, + "text_field_embedder": { + "tokens": { + "type": "embedding", + "pretrained_file": "allennlp/tests/fixtures/embeddings/glove.6B.300d.sample.txt.gz", + "embedding_dim": 300, + "trainable": false, + "projection_dim": 10, + } + }, + "encoder": { + "type": "lstm", + "input_size": 10, + "hidden_size": 25, + "num_layers": 1, + "bidirectional": true + }, + "similarity_function": {"type": "dot_product"}, + "projection_feedforward": { + "input_dim": 200, + "hidden_dims": 25, + "num_layers": 1, + "activations": "relu" + }, + "inference_encoder": { + "type": "lstm", + "input_size": 25, + "hidden_size": 25, + "num_layers": 1, + "bidirectional": true + }, + "output_feedforward": { + "input_dim": 200, + "num_layers": 1, + "hidden_dims": 5, + "activations": "relu", + "dropout": 0.5 + }, + "output_logit": { + "input_dim": 5, + "num_layers": 1, + "hidden_dims": 3, + "activations": "linear" + }, + "initializer": [ + [".*linear_layers.*weight", {"type": "xavier_uniform"}], + [".*linear_layers.*bias", {"type": "zero"}], + [".*weight_ih.*", {"type": "xavier_uniform"}], + [".*weight_hh.*", {"type": "orthogonal"}], + [".*bias_ih.*", {"type": "zero"}], + [".*bias_hh.*", {"type": "lstm_hidden_bias"}] + ] + }, + "iterator": { + "type": "bucket", + "sorting_keys": [["premise", "num_tokens"], ["hypothesis", "num_tokens"]], + "batch_size": 32, + "padding_noise": 0.0, + }, + "trainer": { + "optimizer": { + "type": "adam", + "lr": 0.0004 + }, + "validation_metric": "+accuracy", + "num_serialized_models_to_keep": 2, + "num_epochs": 5, + "grad_norm": 10.0, + "patience": 2, + "cuda_device": -1, + "learning_rate_scheduler": { + "type": "reduce_on_plateau", + "factor": 0.5, + "mode": "max", + "patience": 0 + } + } +} diff --git a/allennlp/tests/models/esim_test.py b/allennlp/tests/models/esim_test.py new file mode 100644 index 00000000000..458e73f4527 --- /dev/null +++ b/allennlp/tests/models/esim_test.py @@ -0,0 +1,27 @@ +# pylint: disable=no-self-use,invalid-name +from flaky import flaky +import pytest +import numpy +from numpy.testing import assert_almost_equal + +from allennlp.common import Params +from allennlp.common.checks import ConfigurationError +from allennlp.common.testing import ModelTestCase + + +class TestESIM(ModelTestCase): + def setUp(self): + super(TestESIM, self).setUp() + self.set_up_model(self.FIXTURES_ROOT / 'esim' / 'experiment.json', + self.FIXTURES_ROOT / 'data' / 'snli.jsonl') + + def test_forward_pass_runs_correctly(self): + training_tensors = self.dataset.as_tensor_dict() + output_dict = self.model(**training_tensors) + assert_almost_equal(numpy.sum(output_dict["label_probs"][0].data.numpy(), -1), 1, decimal=6) + + def test_model_can_train_save_and_load(self): + 
self.ensure_model_can_train_save_and_load(self.param_file) + + def test_batch_predictions_are_consistent(self): + self.ensure_batch_predictions_are_consistent() From 3e336a5f07b59b40103fafbd4e6a8256b0044828 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 14:20:33 -0700 Subject: [PATCH 12/24] Add predictor for ESIM --- allennlp/predictors/esim.py | 41 +++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 allennlp/predictors/esim.py diff --git a/allennlp/predictors/esim.py b/allennlp/predictors/esim.py new file mode 100644 index 00000000000..5ea63c7fda8 --- /dev/null +++ b/allennlp/predictors/esim.py @@ -0,0 +1,41 @@ +from typing import Tuple +from overrides import overrides + +from allennlp.common.util import JsonDict +from allennlp.data import Instance +from allennlp.service.predictors.predictor import Predictor + + +@Predictor.register('esim') +class ESIMPredictor(Predictor): + """ + Predictor for the :class:`~allennlp.models.esim.ESIM` model. + """ + + def predict(self, sentence1: str, sentence2: str) -> JsonDict: + """ + Predicts whether the sentence2 is entailed by the sentence1 text. + + Parameters + ---------- + sentence1 : ``str`` + A passage representing what is assumed to be true. + + sentence2 : ``str`` + A sentence that may be entailed by the sentence1. + + Returns + ------- + A dictionary where the key "label_probs" determines the probabilities of each of + [entailment, contradiction, neutral]. + """ + return self.predict_json({"sentence1" : sentence1, "sentence2": sentence2}) + + @overrides + def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]: + """ + Expects JSON that looks like ``{"sentence1": "...", "sentence2": "..."}``. + """ + sentence1_text = json_dict["sentence1"] + sentence2_text = json_dict["sentence2"] + return self._dataset_reader.text_to_instance(sentence1_text, sentence2_text), {} From 23bfeca757f9f844b3ac46f8bdae05017af0b4db Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 14:37:59 -0700 Subject: [PATCH 13/24] pylint --- allennlp/models/esim.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/allennlp/models/esim.py b/allennlp/models/esim.py index 446165d57e4..63a4fedcc70 100644 --- a/allennlp/models/esim.py +++ b/allennlp/models/esim.py @@ -9,7 +9,7 @@ from allennlp.models.model import Model from allennlp.modules import FeedForward from allennlp.modules.matrix_attention.legacy_matrix_attention import LegacyMatrixAttention -from allennlp.modules import Seq2SeqEncoder, SimilarityFunction, TimeDistributed, TextFieldEmbedder +from allennlp.modules import Seq2SeqEncoder, SimilarityFunction, TextFieldEmbedder from allennlp.nn import InitializerApplicator, RegularizerApplicator from allennlp.nn.util import get_text_field_mask, last_dim_softmax, weighted_sum, replace_masked_values from allennlp.training.metrics import CategoricalAccuracy @@ -24,27 +24,28 @@ class InputVariationalDropout(torch.nn.Dropout): and samples a single dropout mask of shape ``(batch_size, embedding_dim)`` and applies it to every time step. """ - def forward(self, input): + def forward(self, input_tensor): + # pylint: disable=arguments-differ """ Apply dropout to input tensor. 
Parameters ---------- - input: torch.FloatTensor + input_tensor: torch.FloatTensor A tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` - + Returns ------- output: torch.FloatTensor A tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` with dropout applied. """ - ones = Variable(input.data.new_ones(input.shape[0], input.shape[-1])) + ones = Variable(input_tensor.data.new_ones(input_tensor.shape[0], input_tensor.shape[-1])) dropout_mask = torch.nn.functional.dropout(ones, self.p, self.training, inplace=False) if self.inplace: - input *= dropout_mask.unsqueeze(1) + input_tensor *= dropout_mask.unsqueeze(1) return None else: - return dropout_mask.unsqueeze(1) * input + return dropout_mask.unsqueeze(1) * input_tensor @Model.register("esim") @@ -129,7 +130,8 @@ def forward(self, # type: ignore premise: Dict[str, torch.LongTensor], hypothesis: Dict[str, torch.LongTensor], label: torch.IntTensor = None, - metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]: + metadata: List[Dict[str, Any]] = None # pylint:disable=unused-argument + ) -> Dict[str, torch.Tensor]: # pylint: disable=arguments-differ """ Parameters @@ -192,7 +194,7 @@ def forward(self, # type: ignore dim=-1 ) hypothesis_enhanced = torch.cat( - [encoded_hypothesis, attended_premise, + [encoded_hypothesis, attended_premise, encoded_hypothesis - attended_premise, encoded_hypothesis * attended_premise], dim=-1 @@ -219,18 +221,22 @@ def forward(self, # type: ignore v_bi, hypothesis_mask.unsqueeze(-1), -1e7 ).max(dim=1) - v_a_avg = torch.sum(v_ai * premise_mask.unsqueeze(-1), dim=1) / torch.sum(premise_mask, 1, keepdim=True) - v_b_avg = torch.sum(v_bi * hypothesis_mask.unsqueeze(-1), dim=1) / torch.sum(hypothesis_mask, 1, keepdim=True) + v_a_avg = torch.sum(v_ai * premise_mask.unsqueeze(-1), dim=1) / torch.sum( + premise_mask, 1, keepdim=True + ) + v_b_avg = torch.sum(v_bi * hypothesis_mask.unsqueeze(-1), dim=1) / torch.sum( + hypothesis_mask, 1, keepdim=True + ) # Now concat # (batch_size, model_dim * 2 * 4) - v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max], dim=1) + v_all = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max], dim=1) # the final MLP -- apply dropout to input, and MLP applies to output & hidden if self.dropout: - v = self.dropout(v) + v_all = self.dropout(v_all) - output_hidden = self._output_feedforward(v) + output_hidden = self._output_feedforward(v_all) label_logits = self._output_logit(output_hidden) label_probs = torch.nn.functional.softmax(label_logits, dim=-1) From 1d0c90586aa86786056e95aff0c1805e5f4ed9a5 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 14:39:07 -0700 Subject: [PATCH 14/24] pylint --- allennlp/tests/models/esim_test.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/allennlp/tests/models/esim_test.py b/allennlp/tests/models/esim_test.py index 458e73f4527..7e939755e09 100644 --- a/allennlp/tests/models/esim_test.py +++ b/allennlp/tests/models/esim_test.py @@ -1,11 +1,7 @@ # pylint: disable=no-self-use,invalid-name -from flaky import flaky -import pytest import numpy from numpy.testing import assert_almost_equal -from allennlp.common import Params -from allennlp.common.checks import ConfigurationError from allennlp.common.testing import ModelTestCase From 12325be3e83a89a683be0331faab3163b9026b40 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 14:42:18 -0700 Subject: [PATCH 15/24] mypy --- allennlp/predictors/esim.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/allennlp/predictors/esim.py 
b/allennlp/predictors/esim.py index 5ea63c7fda8..77bac349807 100644 --- a/allennlp/predictors/esim.py +++ b/allennlp/predictors/esim.py @@ -32,10 +32,10 @@ def predict(self, sentence1: str, sentence2: str) -> JsonDict: return self.predict_json({"sentence1" : sentence1, "sentence2": sentence2}) @overrides - def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]: + def _json_to_instance(self, json_dict: JsonDict) -> Instance: """ Expects JSON that looks like ``{"sentence1": "...", "sentence2": "..."}``. """ sentence1_text = json_dict["sentence1"] sentence2_text = json_dict["sentence2"] - return self._dataset_reader.text_to_instance(sentence1_text, sentence2_text), {} + return self._dataset_reader.text_to_instance(sentence1_text, sentence2_text) From d9730f47672b5608a7b0c19c94088b8a6a234219 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 14:45:40 -0700 Subject: [PATCH 16/24] fix the docs --- doc/api/allennlp.models.esim.rst | 7 +++++++ doc/api/allennlp.predictors.rst | 7 +++++++ 2 files changed, 14 insertions(+) create mode 100644 doc/api/allennlp.models.esim.rst diff --git a/doc/api/allennlp.models.esim.rst b/doc/api/allennlp.models.esim.rst new file mode 100644 index 00000000000..deaebd20782 --- /dev/null +++ b/doc/api/allennlp.models.esim.rst @@ -0,0 +1,7 @@ +allennlp.models.esim +==================== + +.. automodule:: allennlp.models.esim + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/api/allennlp.predictors.rst b/doc/api/allennlp.predictors.rst index 8712dfa6be1..f8cfd837274 100644 --- a/doc/api/allennlp.predictors.rst +++ b/doc/api/allennlp.predictors.rst @@ -9,6 +9,7 @@ allennlp.predictors * :ref:`Predictor` * :ref:`BidafPredictor` * :ref:`DecomposableAttentionPredictor` +* :ref:`ESIMPredictor` * :ref:`SemanticRoleLabelerPredictor` * :ref:`SentenceTaggerPredictor` * :ref:`CorefPredictor` @@ -35,6 +36,12 @@ allennlp.predictors :undoc-members: :show-inheritance: +.. _esim: +.. automodule:: allennlp.predictors.esim + :members: + :undoc-members: + :show-inheritance: + .. _semantic-role-labeler: .. 
automodule:: allennlp.predictors.semantic_role_labeler :members: From 4f6d37fc729a12d7a94351329ac6821605b072ef Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 14:52:37 -0700 Subject: [PATCH 17/24] ESIM predictor --- allennlp/predictors/__init__.py | 1 + allennlp/predictors/esim.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/allennlp/predictors/__init__.py b/allennlp/predictors/__init__.py index 0be6fc4ee6e..ca291ee1658 100644 --- a/allennlp/predictors/__init__.py +++ b/allennlp/predictors/__init__.py @@ -16,3 +16,4 @@ from allennlp.predictors.simple_seq2seq import SimpleSeq2SeqPredictor from allennlp.predictors.wikitables_parser import WikiTablesParserPredictor from allennlp.predictors.nlvr_parser import NlvrParserPredictor +from allennlp.predictors.esim import ESIMPredictor diff --git a/allennlp/predictors/esim.py b/allennlp/predictors/esim.py index 77bac349807..59542024212 100644 --- a/allennlp/predictors/esim.py +++ b/allennlp/predictors/esim.py @@ -1,9 +1,8 @@ -from typing import Tuple from overrides import overrides from allennlp.common.util import JsonDict from allennlp.data import Instance -from allennlp.service.predictors.predictor import Predictor +from allennlp.predictors.predictor import Predictor @Predictor.register('esim') From 7b57e4229ffaac6abc9c593001fff71fbc72c50d Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 15:04:24 -0700 Subject: [PATCH 18/24] Add comment to esim training config --- training_config/esim.json | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/training_config/esim.json b/training_config/esim.json index 348b3bd299e..65ef2467fc2 100644 --- a/training_config/esim.json +++ b/training_config/esim.json @@ -1,4 +1,14 @@ { + // Configuration for the ESIM model with ELMo, modified slightly from + // the version included in "Deep Contextualized Word Representations", + // (https://arxiv.org/abs/1802.05365). Compared to the version in this paper, + // this configuration only includes one layer of ELMo representations + // and removes GloVe embeddings. + // + // There is a trained model available at https://s3-us-west-2.amazonaws.com/allennlp/models/esim-elmo-2018.05.17.tar.gz + // with test set accuracy of 88.5%, compared to the single model reported + // result of 88.7 +/- 0.17. 
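The trainer block of this file (unchanged by this commit) pairs Adam at lr 4e-4 with a reduce-on-plateau schedule keyed to validation accuracy; in plain PyTorch terms that schedule behaves roughly like the sketch below, where the linear layer is only a stand-in:

    import torch

    model = torch.nn.Linear(10, 3)                 # stand-in, not the real ESIM model
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0004)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="max", factor=0.5, patience=0)

    # After each epoch the trainer passes the validation metric ("+accuracy") to the
    # scheduler; with patience 0, any epoch without improvement halves the learning rate.
    validation_accuracy = 0.85                     # hypothetical value
    scheduler.step(validation_accuracy)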
+ "dataset_reader": { "type": "snli", "token_indexers": { From 7ea3e47d3f588f705735ffe044881552bcd43fb1 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 15:50:30 -0700 Subject: [PATCH 19/24] Move InputVariationalDropout --- allennlp/models/esim.py | 35 +------------------ allennlp/modules/__init__.py | 1 + allennlp/modules/input_variational_dropout.py | 34 ++++++++++++++++++ 3 files changed, 36 insertions(+), 34 deletions(-) create mode 100644 allennlp/modules/input_variational_dropout.py diff --git a/allennlp/models/esim.py b/allennlp/models/esim.py index 63a4fedcc70..1ee46d6edc9 100644 --- a/allennlp/models/esim.py +++ b/allennlp/models/esim.py @@ -7,46 +7,13 @@ from allennlp.common.checks import check_dimensions_match from allennlp.data import Vocabulary from allennlp.models.model import Model -from allennlp.modules import FeedForward +from allennlp.modules import FeedForward, InputVariationalDropout from allennlp.modules.matrix_attention.legacy_matrix_attention import LegacyMatrixAttention from allennlp.modules import Seq2SeqEncoder, SimilarityFunction, TextFieldEmbedder from allennlp.nn import InitializerApplicator, RegularizerApplicator from allennlp.nn.util import get_text_field_mask, last_dim_softmax, weighted_sum, replace_masked_values from allennlp.training.metrics import CategoricalAccuracy -class InputVariationalDropout(torch.nn.Dropout): - """ - Apply the dropout technique in Gal and Ghahramani, "Dropout as a Bayesian Approximation: - Representing Model Uncertainty in Deep Learning" (https://arxiv.org/abs/1506.02142) to a - 3D tensor. - - This module accepts a 3D tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` - and samples a single dropout mask of shape ``(batch_size, embedding_dim)`` and applies - it to every time step. - """ - def forward(self, input_tensor): - # pylint: disable=arguments-differ - """ - Apply dropout to input tensor. - - Parameters - ---------- - input_tensor: torch.FloatTensor - A tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` - - Returns - ------- - output: torch.FloatTensor - A tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` with dropout applied. - """ - ones = Variable(input_tensor.data.new_ones(input_tensor.shape[0], input_tensor.shape[-1])) - dropout_mask = torch.nn.functional.dropout(ones, self.p, self.training, inplace=False) - if self.inplace: - input_tensor *= dropout_mask.unsqueeze(1) - return None - else: - return dropout_mask.unsqueeze(1) * input_tensor - @Model.register("esim") class ESIM(Model): diff --git a/allennlp/modules/__init__.py b/allennlp/modules/__init__.py index c4bc0f30a2c..4ecdf3c09dd 100644 --- a/allennlp/modules/__init__.py +++ b/allennlp/modules/__init__.py @@ -21,3 +21,4 @@ from allennlp.modules.token_embedders import TokenEmbedder, Embedding from allennlp.modules.matrix_attention import MatrixAttention from allennlp.modules.attention import Attention +from allennlp.modules.input_variational_dropout import InputVariationalDropout diff --git a/allennlp/modules/input_variational_dropout.py b/allennlp/modules/input_variational_dropout.py new file mode 100644 index 00000000000..ea441af3287 --- /dev/null +++ b/allennlp/modules/input_variational_dropout.py @@ -0,0 +1,34 @@ +import torch + +class InputVariationalDropout(torch.nn.Dropout): + """ + Apply the dropout technique in Gal and Ghahramani, "Dropout as a Bayesian Approximation: + Representing Model Uncertainty in Deep Learning" (https://arxiv.org/abs/1506.02142) to a + 3D tensor. 
+ + This module accepts a 3D tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` + and samples a single dropout mask of shape ``(batch_size, embedding_dim)`` and applies + it to every time step. + """ + def forward(self, input_tensor): + # pylint: disable=arguments-differ + """ + Apply dropout to input tensor. + + Parameters + ---------- + input_tensor: ``torch.FloatTensor`` + A tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` + + Returns + ------- + output: ``torch.FloatTensor`` + A tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` with dropout applied. + """ + ones = input_tensor.data.new_ones(input_tensor.shape[0], input_tensor.shape[-1]) + dropout_mask = torch.nn.functional.dropout(ones, self.p, self.training, inplace=False) + if self.inplace: + input_tensor *= dropout_mask.unsqueeze(1) + return None + else: + return dropout_mask.unsqueeze(1) * input_tensor From 54db6047bfd81b826f017a1dec22542c05f4e216 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 15:51:23 -0700 Subject: [PATCH 20/24] pylint --- allennlp/models/esim.py | 1 - 1 file changed, 1 deletion(-) diff --git a/allennlp/models/esim.py b/allennlp/models/esim.py index 1ee46d6edc9..eca68f889b6 100644 --- a/allennlp/models/esim.py +++ b/allennlp/models/esim.py @@ -1,7 +1,6 @@ from typing import Dict, Optional, List, Any import torch -from torch.autograd import Variable from allennlp.common import Params from allennlp.common.checks import check_dimensions_match From 9ae74aac40d9fae9018554722dfee208b63570aa Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 15:56:26 -0700 Subject: [PATCH 21/24] Fix the docs --- doc/api/allennlp.models.rst | 1 + doc/api/allennlp.modules.input_variational_dropout.rst | 7 +++++++ doc/api/allennlp.modules.rst | 1 + 3 files changed, 9 insertions(+) create mode 100644 doc/api/allennlp.modules.input_variational_dropout.rst diff --git a/doc/api/allennlp.models.rst b/doc/api/allennlp.models.rst index a0cce505a82..46a4cd42db8 100644 --- a/doc/api/allennlp.models.rst +++ b/doc/api/allennlp.models.rst @@ -21,3 +21,4 @@ allennlp.models allennlp.models.semantic_parsing allennlp.models.semantic_role_labeler allennlp.models.simple_tagger + allennlp.models.esim diff --git a/doc/api/allennlp.modules.input_variational_dropout.rst b/doc/api/allennlp.modules.input_variational_dropout.rst new file mode 100644 index 00000000000..c02b9ce6373 --- /dev/null +++ b/doc/api/allennlp.modules.input_variational_dropout.rst @@ -0,0 +1,7 @@ +allennlp.modules.input_variational_dropout +========================================= + +.. 
automodule:: allennlp.modules.input_variational_dropout + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/api/allennlp.modules.rst b/doc/api/allennlp.modules.rst index 9353200f9e6..81d04dc9f23 100644 --- a/doc/api/allennlp.modules.rst +++ b/doc/api/allennlp.modules.rst @@ -31,3 +31,4 @@ allennlp.modules allennlp.modules.layer_norm allennlp.modules.span_pruner allennlp.modules.maxout + allennlp.modules.input_variational_dropout From a3cf48daf222c80e4963d1906ac8b01123568272 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 15:57:46 -0700 Subject: [PATCH 22/24] fix the docs --- doc/api/allennlp.modules.input_variational_dropout.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/api/allennlp.modules.input_variational_dropout.rst b/doc/api/allennlp.modules.input_variational_dropout.rst index c02b9ce6373..ccb4a210341 100644 --- a/doc/api/allennlp.modules.input_variational_dropout.rst +++ b/doc/api/allennlp.modules.input_variational_dropout.rst @@ -1,5 +1,5 @@ allennlp.modules.input_variational_dropout -========================================= +========================================== .. automodule:: allennlp.modules.input_variational_dropout :members: From 101a71c360f21333527edab32adc031d5e0d4ebb Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 16:20:36 -0700 Subject: [PATCH 23/24] Remove ESIM predictor --- allennlp/predictors/__init__.py | 1 - allennlp/predictors/esim.py | 40 --------------------------------- doc/api/allennlp.predictors.rst | 7 ------ 3 files changed, 48 deletions(-) delete mode 100644 allennlp/predictors/esim.py diff --git a/allennlp/predictors/__init__.py b/allennlp/predictors/__init__.py index ca291ee1658..0be6fc4ee6e 100644 --- a/allennlp/predictors/__init__.py +++ b/allennlp/predictors/__init__.py @@ -16,4 +16,3 @@ from allennlp.predictors.simple_seq2seq import SimpleSeq2SeqPredictor from allennlp.predictors.wikitables_parser import WikiTablesParserPredictor from allennlp.predictors.nlvr_parser import NlvrParserPredictor -from allennlp.predictors.esim import ESIMPredictor diff --git a/allennlp/predictors/esim.py b/allennlp/predictors/esim.py deleted file mode 100644 index 59542024212..00000000000 --- a/allennlp/predictors/esim.py +++ /dev/null @@ -1,40 +0,0 @@ -from overrides import overrides - -from allennlp.common.util import JsonDict -from allennlp.data import Instance -from allennlp.predictors.predictor import Predictor - - -@Predictor.register('esim') -class ESIMPredictor(Predictor): - """ - Predictor for the :class:`~allennlp.models.esim.ESIM` model. - """ - - def predict(self, sentence1: str, sentence2: str) -> JsonDict: - """ - Predicts whether the sentence2 is entailed by the sentence1 text. - - Parameters - ---------- - sentence1 : ``str`` - A passage representing what is assumed to be true. - - sentence2 : ``str`` - A sentence that may be entailed by the sentence1. - - Returns - ------- - A dictionary where the key "label_probs" determines the probabilities of each of - [entailment, contradiction, neutral]. - """ - return self.predict_json({"sentence1" : sentence1, "sentence2": sentence2}) - - @overrides - def _json_to_instance(self, json_dict: JsonDict) -> Instance: - """ - Expects JSON that looks like ``{"sentence1": "...", "sentence2": "..."}``. 
- """ - sentence1_text = json_dict["sentence1"] - sentence2_text = json_dict["sentence2"] - return self._dataset_reader.text_to_instance(sentence1_text, sentence2_text) diff --git a/doc/api/allennlp.predictors.rst b/doc/api/allennlp.predictors.rst index f8cfd837274..8712dfa6be1 100644 --- a/doc/api/allennlp.predictors.rst +++ b/doc/api/allennlp.predictors.rst @@ -9,7 +9,6 @@ allennlp.predictors * :ref:`Predictor` * :ref:`BidafPredictor` * :ref:`DecomposableAttentionPredictor` -* :ref:`ESIMPredictor` * :ref:`SemanticRoleLabelerPredictor` * :ref:`SentenceTaggerPredictor` * :ref:`CorefPredictor` @@ -36,12 +35,6 @@ allennlp.predictors :undoc-members: :show-inheritance: -.. _esim: -.. automodule:: allennlp.predictors.esim - :members: - :undoc-members: - :show-inheritance: - .. _semantic-role-labeler: .. automodule:: allennlp.predictors.semantic_role_labeler :members: From 9b901f93fbbccad93dff606238c0114c0130fc98 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 16:52:44 -0700 Subject: [PATCH 24/24] Scrub all of ESIMPredictor --- allennlp/service/predictors/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/allennlp/service/predictors/__init__.py b/allennlp/service/predictors/__init__.py index b06b614cd8c..1e95ef2c393 100644 --- a/allennlp/service/predictors/__init__.py +++ b/allennlp/service/predictors/__init__.py @@ -18,7 +18,6 @@ from allennlp.predictors.simple_seq2seq import SimpleSeq2SeqPredictor from allennlp.predictors.wikitables_parser import WikiTablesParserPredictor from allennlp.predictors.nlvr_parser import NlvrParserPredictor -from allennlp.predictors.esim import ESIMPredictor warnings.warn("allennlp.service.predictors.* has been depreciated. " "Please use allennlp.predictors.*", FutureWarning)