From 3e9b5ae1d6d74dd666afc9570d4e6c52dd9287f0 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Wed, 2 May 2018 17:06:06 -0700 Subject: [PATCH 01/24] WIP: ESIM model --- allennlp/models/esim.py | 248 ++++++++++++++++++++++++++++++++++++++ training_config/esim.json | 79 ++++++++++++ 2 files changed, 327 insertions(+) create mode 100644 allennlp/models/esim.py create mode 100644 training_config/esim.json diff --git a/allennlp/models/esim.py b/allennlp/models/esim.py new file mode 100644 index 00000000000..92b113d4304 --- /dev/null +++ b/allennlp/models/esim.py @@ -0,0 +1,248 @@ +# TODO: projection dropout with ELMO +# l2 reg with ELMO +# multiple ELMO layers +# doc + +# init: +# for LSTM, use xavier_uniform for kernel, orthogonal for recurrent + +from typing import Dict, Optional + +import torch + +from allennlp.common import Params +from allennlp.common.checks import check_dimensions_match +from allennlp.data import Vocabulary +from allennlp.models.model import Model +from allennlp.modules import FeedForward, MatrixAttention +from allennlp.modules import Seq2SeqEncoder, SimilarityFunction, TimeDistributed, TextFieldEmbedder +from allennlp.nn import InitializerApplicator, RegularizerApplicator +from allennlp.nn.util import get_text_field_mask, last_dim_softmax, weighted_sum, replace_masked_values +from allennlp.training.metrics import CategoricalAccuracy + + +@Model.register("esim") +class ESIM(Model): + """ + This ``Model`` implements the ESIM sequence model described in `"Enhanced LSTM for Natural Language Inference" + `_ + by Chen et al., 2017. + + Parameters + ---------- + vocab : ``Vocabulary`` + text_field_embedder : ``TextFieldEmbedder`` + Used to embed the ``premise`` and ``hypothesis`` ``TextFields`` we get as input to the + model. + attend_feedforward : ``FeedForward`` + This feedforward network is applied to the encoded sentence representations before the + similarity matrix is computed between words in the premise and words in the hypothesis. + similarity_function : ``SimilarityFunction`` + This is the similarity function used when computing the similarity matrix between words in + the premise and words in the hypothesis. + compare_feedforward : ``FeedForward`` + This feedforward network is applied to the aligned premise and hypothesis representations, + individually. + aggregate_feedforward : ``FeedForward`` + This final feedforward network is applied to the concatenated, summed result of the + ``compare_feedforward`` network, and its output is used as the entailment class logits. + premise_encoder : ``Seq2SeqEncoder``, optional (default=``None``) + After embedding the premise, we can optionally apply an encoder. If this is ``None``, we + will do nothing. + hypothesis_encoder : ``Seq2SeqEncoder``, optional (default=``None``) + After embedding the hypothesis, we can optionally apply an encoder. If this is ``None``, + we will use the ``premise_encoder`` for the encoding (doing nothing if ``premise_encoder`` + is also ``None``). + initializer : ``InitializerApplicator``, optional (default=``InitializerApplicator()``) + Used to initialize the model parameters. + regularizer : ``RegularizerApplicator``, optional (default=``None``) + If provided, will be used to calculate the regularization penalty during training. 
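The constructor below wires the encoder, projection, inference encoder and output layers together. As a rough guide, the dimension bookkeeping for the GloVe-based configuration in training_config/esim.json works out as in this sketch (the concrete sizes come from that config, not from the model itself):

    embedding_dim = 300                  # text_field_embedder output (GloVe 840B.300d)
    encoder_out = 2 * 300                # 600: bidirectional encoder with hidden_size 300
    enhanced_dim = 4 * encoder_out       # 2400: [a; b; a - b; a * b] concatenation
    projection_out = 300                 # projection_feedforward maps 2400 -> 300
    inference_out = 2 * 300              # 600: bidirectional inference LSTM
    pooled_dim = 4 * inference_out       # 2400: [avg_a; max_a; avg_b; max_b]
    # the final feedforward consumes pooled_dim and emits the 3 entailment logits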
+ """ + def __init__(self, vocab: Vocabulary, + text_field_embedder: TextFieldEmbedder, + encoder: Seq2SeqEncoder, + similarity_function: SimilarityFunction, + projection_feedforward: FeedForward, + inference_encoder: Seq2SeqEncoder, + output_feedforward: FeedForward, + initializer: InitializerApplicator = InitializerApplicator(), + dropout: float = 0.5, + regularizer: Optional[RegularizerApplicator] = None) -> None: + super().__init__(vocab, regularizer) + + self._text_field_embedder = text_field_embedder + self._encoder = encoder + + self._matrix_attention = MatrixAttention(similarity_function) + self._projection_feedforward = projection_feedforward + + self._inference_encoder = inference_encoder + + if dropout: + self.dropout = torch.nn.Dropout(dropout) + else: + self.dropout = None + + self._output_feedforward = output_feedforward + + self._num_labels = vocab.get_vocab_size(namespace="labels") + + check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(), + "text field embedding dim", "encoder input dim") + check_dimensions_match(encoder.get_output_dim() * 4, projection_feedforward.get_input_dim(), + "encoder output dim", "projection feedforward input") + check_dimensions_match(projection_feedforward.get_output_dim(), inference_encoder.get_input_dim(), + "proj feedforward output dim", "inference lstm input dim") + check_dimensions_match(output_feedforward.get_output_dim(), self._num_labels, + "final output dimension", "number of labels") + + self._accuracy = CategoricalAccuracy() + self._loss = torch.nn.CrossEntropyLoss() + + initializer(self) + + def forward(self, # type: ignore + premise: Dict[str, torch.LongTensor], + hypothesis: Dict[str, torch.LongTensor], + label: torch.IntTensor = None) -> Dict[str, torch.Tensor]: + # pylint: disable=arguments-differ + """ + Parameters + ---------- + premise : Dict[str, torch.LongTensor] + From a ``TextField`` + hypothesis : Dict[str, torch.LongTensor] + From a ``TextField`` + label : torch.IntTensor, optional (default = None) + From a ``LabelField`` + + Returns + ------- + An output dictionary consisting of: + + label_logits : torch.FloatTensor + A tensor of shape ``(batch_size, num_labels)`` representing unnormalised log + probabilities of the entailment label. + label_probs : torch.FloatTensor + A tensor of shape ``(batch_size, num_labels)`` representing probabilities of the + entailment label. + loss : torch.FloatTensor, optional + A scalar loss to be optimised. 
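The method body that follows implements ESIM's soft alignment ("local inference") between the two sentences, plus the [a; b; a - b; a * b] enhancement. A minimal standalone sketch of the same computation in plain PyTorch, assuming a simple dot-product similarity and ignoring padding masks for brevity:

    import torch

    batch, p_len, h_len, dim = 2, 7, 5, 600
    encoded_premise = torch.randn(batch, p_len, dim)
    encoded_hypothesis = torch.randn(batch, h_len, dim)

    # (batch, p_len, h_len): similarity between every premise/hypothesis word pair
    similarity = torch.bmm(encoded_premise, encoded_hypothesis.transpose(1, 2))

    # For each premise word, a distribution over hypothesis words, and vice versa
    p2h = torch.nn.functional.softmax(similarity, dim=-1)
    h2p = torch.nn.functional.softmax(similarity.transpose(1, 2), dim=-1)

    attended_hypothesis = torch.bmm(p2h, encoded_hypothesis)  # (batch, p_len, dim)
    attended_premise = torch.bmm(h2p, encoded_premise)        # (batch, h_len, dim)

    # The "enhancement": concatenation gives 4 * dim features per premise position
    premise_enhanced = torch.cat(
        [encoded_premise, attended_hypothesis,
         encoded_premise - attended_hypothesis,
         encoded_premise * attended_hypothesis], dim=-1)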
+ """ + embedded_premise = self._text_field_embedder(premise) + embedded_hypothesis = self._text_field_embedder(hypothesis) + premise_mask = get_text_field_mask(premise).float() + hypothesis_mask = get_text_field_mask(hypothesis).float() + + # apply dropout + if self.dropout: + embedded_premise = self.dropout(embedded_premise) + embedded_hypothesis = self.dropout(embedded_hypothesis) + + # encode premise and hypothesis + encoded_premise = self._encoder(embedded_premise, premise_mask) + encoded_hypothesis = self._encoder(embedded_hypothesis, hypothesis_mask) + + # Shape: (batch_size, premise_length, hypothesis_length) + similarity_matrix = self._matrix_attention(encoded_premise, encoded_hypothesis) + + # Shape: (batch_size, premise_length, hypothesis_length) + p2h_attention = last_dim_softmax(similarity_matrix, hypothesis_mask) + # Shape: (batch_size, premise_length, embedding_dim) + attended_hypothesis = weighted_sum(embedded_hypothesis, p2h_attention) + + # Shape: (batch_size, hypothesis_length, premise_length) + h2p_attention = last_dim_softmax(similarity_matrix.transpose(1, 2).contiguous(), premise_mask) + # Shape: (batch_size, hypothesis_length, embedding_dim) + attended_premise = weighted_sum(embedded_premise, h2p_attention) + + # the "enhancement" layer + premise_enhanced = torch.cat( + [encoded_premise, attended_hypothesis, + encoded_premise - attended_hypothesis, + encoded_premise * attended_hypothesis], + dim=-1 + ) + hypothesis_enhanced = torch.cat( + [encoded_hypothesis, attended_premise, + encoded_hypothesis - attended_premise, + encoded_hypothesis * attended_premise], + dim=-1 + ) + + projected_enhanced_premise = self._projection_feedforward(premise_enhanced) + + # the projection layer down to the model dimension + projected_enhanced_premise = self._projection_feedforward(premise_enhanced) + projected_enhanced_hypothesis = self._projection_feedforward(hypothesis_enhanced) + + # Run the inference layer + if self.dropout: + projected_enhanced_premise = self.dropout(projected_enhanced_premise) + projected_enhanced_hypothesis = self.dropout(projected_enhanced_hypothesis) + v_ai = self._inference_encoder(projected_enhanced_premise) + v_bi = self._inference_encoder(projected_enhanced_hypothesis) + + # The pooling layer -- max and avg pooling. 
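Masked pooling is the subtle step here: padding positions must not win the max or inflate the average. A small standalone sketch of the idea, assuming a float mask with 1 for real tokens and 0 for padding:

    import torch

    v = torch.randn(2, 6, 600)                        # (batch, timesteps, model_dim)
    mask = torch.tensor([[1., 1., 1., 0., 0., 0.],
                         [1., 1., 1., 1., 1., 0.]])   # (batch, timesteps)

    # Max pooling: first push padded positions to a very negative value
    very_negative = (1 - mask).unsqueeze(-1) * -1e7
    v_max, _ = (v * mask.unsqueeze(-1) + very_negative).max(dim=1)  # (batch, model_dim)

    # Average pooling: zero out padding and divide by the true lengths
    v_avg = (v * mask.unsqueeze(-1)).sum(dim=1) / mask.sum(dim=1, keepdim=True)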
+ # (batch_size, model_dim) + v_a_max = replace_masked_values( + v_ai, premise_mask.unsqueeze(-1), -1e7 + ).max(dim=1) + v_b_max = replace_masked_values( + v_bi, hypothesis_mask.unsqueeze(-1), -1e7 + ).max(dim=1) + + v_a_avg = torch.sum(v_ai * premise_mask.unsqueeze(-1), dim=1) / torch.sum(premise_mask, 1, keepdim=True) + v_b_avg = torch.sum(v_bi * hypothesis_mask.unsqueeze(-1), dim=1) / torch.sum(hypothesis_mask, 1, keepdim=True) + + # Now concat + # (batch_size, model_dim * 2 * 4) + v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max], dim=1) + + # the final MLP -- apply dropout to input, and MLP applies to output & hidden + if self.dropout: + v = self.dropout(v) + + label_logits = self._output_feedforward(v) + label_probs = torch.nn.functional.softmax(label_logits, dim=-1) + + output_dict = {"label_logits": label_logits, "label_probs": label_probs} + + if label is not None: + loss = self._loss(label_logits, label.long().view(-1)) + self._accuracy(label_logits, label.squeeze(-1)) + output_dict["loss"] = loss + + return output_dict + + def get_metrics(self, reset: bool = False) -> Dict[str, float]: + return { + 'accuracy': self._accuracy.get_metric(reset), + } + + @classmethod + def from_params(cls, vocab: Vocabulary, params: Params) -> 'DecomposableAttention': + embedder_params = params.pop("text_field_embedder") + text_field_embedder = TextFieldEmbedder.from_params(vocab, embedder_params) + + encoder = Seq2SeqEncoder.from_params(params.pop("encoder")) + similarity_function = SimilarityFunction.from_params(params.pop("similarity_function")) + projection_feedforward = FeedForward.from_params(params.pop('projection_feedforward')) + inference_encoder = Seq2SeqEncoder.from_params(params.pop("inference_encoder")) + output_feedforward = FeedForward.from_params(params.pop('output_feedforward')) + initializer = InitializerApplicator.from_params(params.pop('initializer', [])) + regularizer = RegularizerApplicator.from_params(params.pop('regularizer', [])) + + dropout = params.pop("dropout", 0) + + params.assert_empty(cls.__name__) + return cls(vocab=vocab, + text_field_embedder=text_field_embedder, + encoder=encoder, + similarity_function=similarity_function, + projection_feedforward=projection_feedforward, + inference_encoder=inference_encoder, + output_feedforward=output_feedforward, + initializer=initializer, + dropout=dropout, + regularizer=regularizer) diff --git a/training_config/esim.json b/training_config/esim.json new file mode 100644 index 00000000000..9ce1b03f13c --- /dev/null +++ b/training_config/esim.json @@ -0,0 +1,79 @@ +{ + "dataset_reader": { + "type": "snli", + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": false, + } + }, + }, + "train_data_path": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/snli/snli_1.0_train.jsonl", + "validation_data_path": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/snli/snli_1.0_dev.jsonl", + "model": { + "type": "esim", + "dropout": 0.5, + "text_field_embedder": { + "tokens": { + "type": "embedding", + "pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.840B.300d.txt.gz", + "embedding_dim": 300, + "trainable": false + } + }, + "encoder": { + "type": "lstm", + "input_size": 300, + "hidden_size": 300, + "num_layers": 1, + "bidirectional": true + }, + "similarity_function": {"type": "dot_product"}, + "projection_feedforward": { + "input_dim": 2400, + "hiddem_dims": 300, + "num_layers": 1, + "activations": "relu", + }, + "inference_encoder": { + "type": "lstm", + "input_size": 
300, + "hidden_size": 300, + "num_layers": 1, + "bidirectional": true + }, + "output_feedforward": { + "input_dim": 2400, + "num_layers": 2, + "hidden_dims": [300, 3], + "activations": "relu", + "dropout": 0.5 + }, + "initializer": [ + [".*linear_layers.*weight", {"type": "xavier_uniform"}] + ] + }, + "iterator": { + "type": "bucket", + "sorting_keys": [["premise", "num_tokens"], ["hypothesis", "num_tokens"]], + "batch_size": 32 + }, + "trainer": { + "optimizer": { + "type": "adam", + "lr": 0.0004, + }, + "validation_metric": "+accuracy", + "num_serialized_models_to_keep": 2, + "num_epochs": 75, + "grad_norm": 10.0, + "patience": 3, + "cuda_device": 0, + "learning_rate_scheduler": { + "type": "reduce_on_plateau", + "factor": 0.5, + "mode": "max", + "patience": 0 + } + } +} From 8127486ed0c1a2b54723134a2d7e2a7fe095c2e4 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Wed, 2 May 2018 17:39:35 -0700 Subject: [PATCH 02/24] WIP: ESIM model for SNLI --- allennlp/models/__init__.py | 1 + allennlp/models/esim.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/allennlp/models/__init__.py b/allennlp/models/__init__.py index 5c7f7bc4eed..59a23409c85 100644 --- a/allennlp/models/__init__.py +++ b/allennlp/models/__init__.py @@ -17,3 +17,4 @@ from allennlp.models.semantic_parsing.wikitables.wikitables_semantic_parser import WikiTablesSemanticParser from allennlp.models.semantic_role_labeler import SemanticRoleLabeler from allennlp.models.simple_tagger import SimpleTagger +from allennlp.models.esim import ESIM diff --git a/allennlp/models/esim.py b/allennlp/models/esim.py index 92b113d4304..fec04b6171b 100644 --- a/allennlp/models/esim.py +++ b/allennlp/models/esim.py @@ -149,12 +149,12 @@ def forward(self, # type: ignore # Shape: (batch_size, premise_length, hypothesis_length) p2h_attention = last_dim_softmax(similarity_matrix, hypothesis_mask) # Shape: (batch_size, premise_length, embedding_dim) - attended_hypothesis = weighted_sum(embedded_hypothesis, p2h_attention) + attended_hypothesis = weighted_sum(encoded_hypothesis, p2h_attention) # Shape: (batch_size, hypothesis_length, premise_length) h2p_attention = last_dim_softmax(similarity_matrix.transpose(1, 2).contiguous(), premise_mask) # Shape: (batch_size, hypothesis_length, embedding_dim) - attended_premise = weighted_sum(embedded_premise, h2p_attention) + attended_premise = weighted_sum(encoded_premise, h2p_attention) # the "enhancement" layer premise_enhanced = torch.cat( @@ -180,8 +180,8 @@ def forward(self, # type: ignore if self.dropout: projected_enhanced_premise = self.dropout(projected_enhanced_premise) projected_enhanced_hypothesis = self.dropout(projected_enhanced_hypothesis) - v_ai = self._inference_encoder(projected_enhanced_premise) - v_bi = self._inference_encoder(projected_enhanced_hypothesis) + v_ai = self._inference_encoder(projected_enhanced_premise, premise_mask) + v_bi = self._inference_encoder(projected_enhanced_hypothesis, hypothesis_mask) # The pooling layer -- max and avg pooling. 
# (batch_size, model_dim) From 5d775f72ccba4ad4a8a20f1b591b5977d7c986cc Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Wed, 2 May 2018 17:44:57 -0700 Subject: [PATCH 03/24] WIP: ESIM --- allennlp/models/esim.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/allennlp/models/esim.py b/allennlp/models/esim.py index fec04b6171b..8c9f47b8af2 100644 --- a/allennlp/models/esim.py +++ b/allennlp/models/esim.py @@ -185,10 +185,10 @@ def forward(self, # type: ignore # The pooling layer -- max and avg pooling. # (batch_size, model_dim) - v_a_max = replace_masked_values( + v_a_max, _ = replace_masked_values( v_ai, premise_mask.unsqueeze(-1), -1e7 ).max(dim=1) - v_b_max = replace_masked_values( + v_b_max, _ = replace_masked_values( v_bi, hypothesis_mask.unsqueeze(-1), -1e7 ).max(dim=1) From 98291602ade292b021ac1ca82ff98f5efe3ae929 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Wed, 2 May 2018 19:51:29 -0700 Subject: [PATCH 04/24] WIP: ESIM --- allennlp/models/esim.py | 5 ----- allennlp/nn/initializers.py | 17 ++++++++++++++++- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/allennlp/models/esim.py b/allennlp/models/esim.py index 8c9f47b8af2..d1732637f86 100644 --- a/allennlp/models/esim.py +++ b/allennlp/models/esim.py @@ -3,9 +3,6 @@ # multiple ELMO layers # doc -# init: -# for LSTM, use xavier_uniform for kernel, orthogonal for recurrent - from typing import Dict, Optional import torch @@ -170,8 +167,6 @@ def forward(self, # type: ignore dim=-1 ) - projected_enhanced_premise = self._projection_feedforward(premise_enhanced) - # the projection layer down to the model dimension projected_enhanced_premise = self._projection_feedforward(premise_enhanced) projected_enhanced_hypothesis = self._projection_feedforward(hypothesis_enhanced) diff --git a/allennlp/nn/initializers.py b/allennlp/nn/initializers.py index 5c7ccba355c..045f7067f2d 100644 --- a/allennlp/nn/initializers.py +++ b/allennlp/nn/initializers.py @@ -153,6 +153,19 @@ def block_orthogonal(tensor: torch.Tensor, tensor[block_slice] = torch.nn.init.orthogonal(tensor[block_slice].contiguous(), gain=gain) +def zero(tensor: torch.Tensor) -> None: + return tensor.data.zero_() + +def lstm_hidden_bias(tensor: torch.Tensor) -> None: + """ + Initialize the biases of the forget gate to 1, and all other gates to 0, + following Jozefowicz et al., An Empirical Exploration of Recurrent Network Architectures + """ + # gates are (b_hi|b_hf|b_hg|b_ho) of shape (4*hidden_size) + tensor.data.zero_() + hidden_size = tensor.shape[0] // 4 + tensor.data[hidden_size:(2 * hidden_size)] = 1.0 + def _initializer_wrapper(init_function: Callable[..., None]) -> Type[Initializer]: class Init(Initializer): def __init__(self, **kwargs): @@ -182,7 +195,9 @@ def from_params(cls, params: Params): "sparse": _initializer_wrapper(torch.nn.init.sparse), "eye": _initializer_wrapper(torch.nn.init.eye), "block_orthogonal": _initializer_wrapper(block_orthogonal), - "uniform_unit_scaling": _initializer_wrapper(uniform_unit_scaling) + "uniform_unit_scaling": _initializer_wrapper(uniform_unit_scaling), + "zero": _initializer_wrapper(zero), + "lstm_hidden_bias": _initializer_wrapper(lstm_hidden_bias), } From 8c511114abc48ec3ac3e403de2262114378b6b7b Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Wed, 2 May 2018 21:51:05 -0700 Subject: [PATCH 05/24] WIP: ESIM --- allennlp/models/esim.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/allennlp/models/esim.py b/allennlp/models/esim.py index d1732637f86..0d501d19ab2 
100644 --- a/allennlp/models/esim.py +++ b/allennlp/models/esim.py @@ -62,6 +62,7 @@ def __init__(self, vocab: Vocabulary, projection_feedforward: FeedForward, inference_encoder: Seq2SeqEncoder, output_feedforward: FeedForward, + output_logit: FeedForward, initializer: InitializerApplicator = InitializerApplicator(), dropout: float = 0.5, regularizer: Optional[RegularizerApplicator] = None) -> None: @@ -81,6 +82,7 @@ def __init__(self, vocab: Vocabulary, self.dropout = None self._output_feedforward = output_feedforward + self._output_logit = output_logit self._num_labels = vocab.get_vocab_size(namespace="labels") @@ -90,8 +92,6 @@ def __init__(self, vocab: Vocabulary, "encoder output dim", "projection feedforward input") check_dimensions_match(projection_feedforward.get_output_dim(), inference_encoder.get_input_dim(), "proj feedforward output dim", "inference lstm input dim") - check_dimensions_match(output_feedforward.get_output_dim(), self._num_labels, - "final output dimension", "number of labels") self._accuracy = CategoricalAccuracy() self._loss = torch.nn.CrossEntropyLoss() @@ -198,7 +198,8 @@ def forward(self, # type: ignore if self.dropout: v = self.dropout(v) - label_logits = self._output_feedforward(v) + output_hidden = self._output_feedforward(v) + label_logits = self._output_logit(output_hidden) label_probs = torch.nn.functional.softmax(label_logits, dim=-1) output_dict = {"label_logits": label_logits, "label_probs": label_probs} @@ -225,6 +226,7 @@ def from_params(cls, vocab: Vocabulary, params: Params) -> 'DecomposableAttentio projection_feedforward = FeedForward.from_params(params.pop('projection_feedforward')) inference_encoder = Seq2SeqEncoder.from_params(params.pop("inference_encoder")) output_feedforward = FeedForward.from_params(params.pop('output_feedforward')) + output_logit = FeedForward.from_params(params.pop('output_logit')) initializer = InitializerApplicator.from_params(params.pop('initializer', [])) regularizer = RegularizerApplicator.from_params(params.pop('regularizer', [])) @@ -238,6 +240,7 @@ def from_params(cls, vocab: Vocabulary, params: Params) -> 'DecomposableAttentio projection_feedforward=projection_feedforward, inference_encoder=inference_encoder, output_feedforward=output_feedforward, + output_logit=output_logit, initializer=initializer, dropout=dropout, regularizer=regularizer) From 3e1faac57c96ad28aa57358c7b6c06513781bbba Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Fri, 4 May 2018 09:33:52 -0700 Subject: [PATCH 06/24] WIP: ESIM --- allennlp/models/esim.py | 37 ++++++++++++++++++++++++++++++------- training_config/esim.json | 29 ++++++++++++++++++++--------- 2 files changed, 50 insertions(+), 16 deletions(-) diff --git a/allennlp/models/esim.py b/allennlp/models/esim.py index 0d501d19ab2..1e63b16ff44 100644 --- a/allennlp/models/esim.py +++ b/allennlp/models/esim.py @@ -6,6 +6,7 @@ from typing import Dict, Optional import torch +from torch.autograd import Variable from allennlp.common import Params from allennlp.common.checks import check_dimensions_match @@ -17,6 +18,21 @@ from allennlp.nn.util import get_text_field_mask, last_dim_softmax, weighted_sum, replace_masked_values from allennlp.training.metrics import CategoricalAccuracy +class VariationalDropout(torch.nn.Dropout): + def forward(self, input): + """ + input is shape (batch_size, timesteps, embedding_dim) + Samples one mask of size (batch_size, embedding_dim) and applies it to every time step. 
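The point of sampling a single mask is that the same features are dropped at every time step of a sequence, rather than re-sampled per step as with ordinary dropout. A quick standalone sketch of the effect:

    import torch

    x = torch.ones(1, 4, 6)                        # (batch, timesteps, embedding_dim)
    ones = x.new_ones(1, 6)                        # one mask per (batch, feature)
    mask = torch.nn.functional.dropout(ones, p=0.5, training=True)
    y = x * mask.unsqueeze(1)                      # broadcast over the time dimension
    # Every row of y[0] is identical: each feature is either kept (scaled by 1 / 0.5)
    # or zeroed at all four time steps, whereas dropout applied directly to x would
    # re-sample the mask independently at every position.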
+ """ + #ones = Variable(torch.ones(input.shape[0], input.shape[-1])) + ones = Variable(input.data.new(input.shape[0], input.shape[-1]).fill_(1)) + dropout_mask = torch.nn.functional.dropout(ones, self.p, self.training, inplace=False) + if self.inplace: + input *= dropout_mask.unsqueeze(1) + return None + else: + return dropout_mask.unsqueeze(1) * input + @Model.register("esim") class ESIM(Model): @@ -78,8 +94,10 @@ def __init__(self, vocab: Vocabulary, if dropout: self.dropout = torch.nn.Dropout(dropout) + self.rnn_input_dropout = VariationalDropout(dropout) else: self.dropout = None + self.rnn_input_dropout = None self._output_feedforward = output_feedforward self._output_logit = output_logit @@ -131,10 +149,10 @@ def forward(self, # type: ignore premise_mask = get_text_field_mask(premise).float() hypothesis_mask = get_text_field_mask(hypothesis).float() - # apply dropout - if self.dropout: - embedded_premise = self.dropout(embedded_premise) - embedded_hypothesis = self.dropout(embedded_hypothesis) + # apply dropout for LSTM + if self.rnn_input_dropout: + embedded_premise = self.rnn_input_dropout(embedded_premise) + embedded_hypothesis = self.rnn_input_dropout(embedded_hypothesis) # encode premise and hypothesis encoded_premise = self._encoder(embedded_premise, premise_mask) @@ -167,14 +185,19 @@ def forward(self, # type: ignore dim=-1 ) + # embedding -> lstm w/ do -> enhanced attention -> dropout_proj, only if ELMO -> ff proj -> lstm w/ do -> dropout -> ff 300 -> dropout -> output + + # add dropout here with ELMO + # the projection layer down to the model dimension + # no dropout in projection projected_enhanced_premise = self._projection_feedforward(premise_enhanced) projected_enhanced_hypothesis = self._projection_feedforward(hypothesis_enhanced) # Run the inference layer - if self.dropout: - projected_enhanced_premise = self.dropout(projected_enhanced_premise) - projected_enhanced_hypothesis = self.dropout(projected_enhanced_hypothesis) + if self.rnn_input_dropout: + projected_enhanced_premise = self.rnn_input_dropout(projected_enhanced_premise) + projected_enhanced_hypothesis = self.rnn_input_dropout(projected_enhanced_hypothesis) v_ai = self._inference_encoder(projected_enhanced_premise, premise_mask) v_bi = self._inference_encoder(projected_enhanced_hypothesis, hypothesis_mask) diff --git a/training_config/esim.json b/training_config/esim.json index 9ce1b03f13c..269fc0d8644 100644 --- a/training_config/esim.json +++ b/training_config/esim.json @@ -4,9 +4,9 @@ "token_indexers": { "tokens": { "type": "single_id", - "lowercase_tokens": false, + "lowercase_tokens": false } - }, + } }, "train_data_path": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/snli/snli_1.0_train.jsonl", "validation_data_path": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/snli/snli_1.0_dev.jsonl", @@ -31,9 +31,9 @@ "similarity_function": {"type": "dot_product"}, "projection_feedforward": { "input_dim": 2400, - "hiddem_dims": 300, + "hidden_dims": 300, "num_layers": 1, - "activations": "relu", + "activations": "relu" }, "inference_encoder": { "type": "lstm", @@ -44,13 +44,24 @@ }, "output_feedforward": { "input_dim": 2400, - "num_layers": 2, - "hidden_dims": [300, 3], + "num_layers": 1, + "hidden_dims": 300, "activations": "relu", "dropout": 0.5 + }, + "output_logit": { + "input_dim": 300, + "num_layers": 1, + "hidden_dims": 3, + "activations": "linear" }, "initializer": [ - [".*linear_layers.*weight", {"type": "xavier_uniform"}] + [".*linear_layers.*weight", {"type": "xavier_uniform"}], + 
[".*linear_layers.*bias", {"type": "zero"}], + [".*weight_ih.*", {"type": "xavier_uniform"}], + [".*weight_hh.*", {"type": "orthogonal"}], + [".*bias_ih.*", {"type": "zero"}], + [".*bias_hh.*", {"type": "lstm_hidden_bias"}] ] }, "iterator": { @@ -61,13 +72,13 @@ "trainer": { "optimizer": { "type": "adam", - "lr": 0.0004, + "lr": 0.0004 }, "validation_metric": "+accuracy", "num_serialized_models_to_keep": 2, "num_epochs": 75, "grad_norm": 10.0, - "patience": 3, + "patience": 5, "cuda_device": 0, "learning_rate_scheduler": { "type": "reduce_on_plateau", From 9db872d2bc9eebd546ac6178ed84686c0abe4422 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 7 May 2018 11:28:06 -0700 Subject: [PATCH 07/24] ESLM model with ELMo --- training_config/esim.json | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/training_config/esim.json b/training_config/esim.json index 269fc0d8644..348b3bd299e 100644 --- a/training_config/esim.json +++ b/training_config/esim.json @@ -2,10 +2,9 @@ "dataset_reader": { "type": "snli", "token_indexers": { - "tokens": { - "type": "single_id", - "lowercase_tokens": false - } + "elmo": { + "type": "elmo_characters" + } } }, "train_data_path": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/snli/snli_1.0_train.jsonl", @@ -14,16 +13,17 @@ "type": "esim", "dropout": 0.5, "text_field_embedder": { - "tokens": { - "type": "embedding", - "pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.840B.300d.txt.gz", - "embedding_dim": 300, - "trainable": false + "elmo":{ + "type": "elmo_token_embedder", + "options_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json", + "weight_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5", + "do_layer_norm": false, + "dropout": 0.0 } }, "encoder": { "type": "lstm", - "input_size": 300, + "input_size": 1024, "hidden_size": 300, "num_layers": 1, "bidirectional": true From c79bd998feea53af66286fbd863fae39d8b47839 Mon Sep 17 00:00:00 2001 From: Matt Peters Date: Mon, 14 May 2018 13:26:05 -0700 Subject: [PATCH 08/24] Add a ESIM predictor that works with SNLI formatted files --- allennlp/service/predictors/__init__.py | 1 + allennlp/service/predictors/esim.py | 41 +++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 allennlp/service/predictors/esim.py diff --git a/allennlp/service/predictors/__init__.py b/allennlp/service/predictors/__init__.py index c11c497ed7b..6e746a6a6eb 100644 --- a/allennlp/service/predictors/__init__.py +++ b/allennlp/service/predictors/__init__.py @@ -16,3 +16,4 @@ from .simple_seq2seq import SimpleSeq2SeqPredictor from .wikitables_parser import WikiTablesParserPredictor from .nlvr_parser import NlvrParserPredictor +from .esim import ESIMPredictor diff --git a/allennlp/service/predictors/esim.py b/allennlp/service/predictors/esim.py new file mode 100644 index 00000000000..5ea63c7fda8 --- /dev/null +++ b/allennlp/service/predictors/esim.py @@ -0,0 +1,41 @@ +from typing import Tuple +from overrides import overrides + +from allennlp.common.util import JsonDict +from allennlp.data import Instance +from allennlp.service.predictors.predictor import Predictor + + +@Predictor.register('esim') +class ESIMPredictor(Predictor): + """ + Predictor for the :class:`~allennlp.models.esim.ESIM` model. 
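A hypothetical call, assuming ``predictor`` is an ESIMPredictor built from a trained ESIM archive:

    # `predictor` is assumed to already exist, e.g. built from a trained archive.
    result = predictor.predict(
        sentence1="Two dogs are running through a field.",
        sentence2="There are animals outdoors.",
    )
    # result["label_probs"] holds the probabilities of
    # [entailment, contradiction, neutral] for this pair.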
+ """ + + def predict(self, sentence1: str, sentence2: str) -> JsonDict: + """ + Predicts whether the sentence2 is entailed by the sentence1 text. + + Parameters + ---------- + sentence1 : ``str`` + A passage representing what is assumed to be true. + + sentence2 : ``str`` + A sentence that may be entailed by the sentence1. + + Returns + ------- + A dictionary where the key "label_probs" determines the probabilities of each of + [entailment, contradiction, neutral]. + """ + return self.predict_json({"sentence1" : sentence1, "sentence2": sentence2}) + + @overrides + def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]: + """ + Expects JSON that looks like ``{"sentence1": "...", "sentence2": "..."}``. + """ + sentence1_text = json_dict["sentence1"] + sentence2_text = json_dict["sentence2"] + return self._dataset_reader.text_to_instance(sentence1_text, sentence2_text), {} From 3f173beda59a33f8403cc75eada202f0391b4dc3 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Wed, 30 May 2018 14:23:47 -0700 Subject: [PATCH 09/24] Move ESIM predictor --- allennlp/service/predictors/esim.py | 41 ----------------------------- 1 file changed, 41 deletions(-) delete mode 100644 allennlp/service/predictors/esim.py diff --git a/allennlp/service/predictors/esim.py b/allennlp/service/predictors/esim.py deleted file mode 100644 index 5ea63c7fda8..00000000000 --- a/allennlp/service/predictors/esim.py +++ /dev/null @@ -1,41 +0,0 @@ -from typing import Tuple -from overrides import overrides - -from allennlp.common.util import JsonDict -from allennlp.data import Instance -from allennlp.service.predictors.predictor import Predictor - - -@Predictor.register('esim') -class ESIMPredictor(Predictor): - """ - Predictor for the :class:`~allennlp.models.esim.ESIM` model. - """ - - def predict(self, sentence1: str, sentence2: str) -> JsonDict: - """ - Predicts whether the sentence2 is entailed by the sentence1 text. - - Parameters - ---------- - sentence1 : ``str`` - A passage representing what is assumed to be true. - - sentence2 : ``str`` - A sentence that may be entailed by the sentence1. - - Returns - ------- - A dictionary where the key "label_probs" determines the probabilities of each of - [entailment, contradiction, neutral]. - """ - return self.predict_json({"sentence1" : sentence1, "sentence2": sentence2}) - - @overrides - def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]: - """ - Expects JSON that looks like ``{"sentence1": "...", "sentence2": "..."}``. 
- """ - sentence1_text = json_dict["sentence1"] - sentence2_text = json_dict["sentence2"] - return self._dataset_reader.text_to_instance(sentence1_text, sentence2_text), {} From 2b722cebe3ab91895c31d74e04058d632bad6cfb Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 13:29:49 -0700 Subject: [PATCH 10/24] Clean up --- allennlp/models/esim.py | 84 +++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/allennlp/models/esim.py b/allennlp/models/esim.py index 1e63b16ff44..3fc8d11a1e5 100644 --- a/allennlp/models/esim.py +++ b/allennlp/models/esim.py @@ -1,8 +1,3 @@ -# TODO: projection dropout with ELMO -# l2 reg with ELMO -# multiple ELMO layers -# doc - from typing import Dict, Optional import torch @@ -18,14 +13,31 @@ from allennlp.nn.util import get_text_field_mask, last_dim_softmax, weighted_sum, replace_masked_values from allennlp.training.metrics import CategoricalAccuracy -class VariationalDropout(torch.nn.Dropout): +class InputVariationalDropout(torch.nn.Dropout): + """ + Apply the dropout technique in Gal and Ghahramani, "Dropout as a Bayesian Approximation: + Representing Model Uncertainty in Deep Learning" (https://arxiv.org/abs/1506.02142) to a + 3D tensor. + + This module accepts a 3D tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` + and samples a single dropout mask of shape ``(batch_size, embedding_dim)`` and applies + it to every time step. + """ def forward(self, input): """ - input is shape (batch_size, timesteps, embedding_dim) - Samples one mask of size (batch_size, embedding_dim) and applies it to every time step. + Apply dropout to input tensor. + + Parameters + ---------- + input: torch.FloatTensor + A tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` + + Returns + ------- + output: torch.FloatTensor + A tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` with dropout applied. """ - #ones = Variable(torch.ones(input.shape[0], input.shape[-1])) - ones = Variable(input.data.new(input.shape[0], input.shape[-1]).fill_(1)) + ones = Variable(input.data.new_ones(input.shape[0], input.shape[-1])) dropout_mask = torch.nn.functional.dropout(ones, self.p, self.training, inplace=False) if self.inplace: input *= dropout_mask.unsqueeze(1) @@ -47,25 +59,21 @@ class ESIM(Model): text_field_embedder : ``TextFieldEmbedder`` Used to embed the ``premise`` and ``hypothesis`` ``TextFields`` we get as input to the model. - attend_feedforward : ``FeedForward`` - This feedforward network is applied to the encoded sentence representations before the - similarity matrix is computed between words in the premise and words in the hypothesis. + encoder : ``Seq2SeqEncoder`` + Used to encode the premise and hypothesis. similarity_function : ``SimilarityFunction`` - This is the similarity function used when computing the similarity matrix between words in - the premise and words in the hypothesis. - compare_feedforward : ``FeedForward`` - This feedforward network is applied to the aligned premise and hypothesis representations, - individually. - aggregate_feedforward : ``FeedForward`` - This final feedforward network is applied to the concatenated, summed result of the - ``compare_feedforward`` network, and its output is used as the entailment class logits. - premise_encoder : ``Seq2SeqEncoder``, optional (default=``None``) - After embedding the premise, we can optionally apply an encoder. If this is ``None``, we - will do nothing. 
- hypothesis_encoder : ``Seq2SeqEncoder``, optional (default=``None``) - After embedding the hypothesis, we can optionally apply an encoder. If this is ``None``, - we will use the ``premise_encoder`` for the encoding (doing nothing if ``premise_encoder`` - is also ``None``). + This is the similarity function used when computing the similarity matrix between encoded + words in the premise and words in the hypothesis. + projection_feedforward : ``FeedForward`` + The feedforward network used to project down the encoded and enhanced premise and hypothesis. + inference_encoder : ``Seq2SeqEncoder`` + Used to encode the projected premise and hypothesis for prediction. + output_feedforward : ``FeedForward`` + Used to prepare the concatenated premise and hypothesis for prediction. + output_logit : ``FeedForward`` + This feedforward network computes the output logits. + dropout : ``float``, optional (default=0.5) + Dropout percentage to use. initializer : ``InitializerApplicator``, optional (default=``InitializerApplicator()``) Used to initialize the model parameters. regularizer : ``RegularizerApplicator``, optional (default=``None``) @@ -79,8 +87,8 @@ def __init__(self, vocab: Vocabulary, inference_encoder: Seq2SeqEncoder, output_feedforward: FeedForward, output_logit: FeedForward, - initializer: InitializerApplicator = InitializerApplicator(), dropout: float = 0.5, + initializer: InitializerApplicator = InitializerApplicator(), regularizer: Optional[RegularizerApplicator] = None) -> None: super().__init__(vocab, regularizer) @@ -94,7 +102,7 @@ def __init__(self, vocab: Vocabulary, if dropout: self.dropout = torch.nn.Dropout(dropout) - self.rnn_input_dropout = VariationalDropout(dropout) + self.rnn_input_dropout = InputVariationalDropout(dropout) else: self.dropout = None self.rnn_input_dropout = None @@ -185,12 +193,8 @@ def forward(self, # type: ignore dim=-1 ) - # embedding -> lstm w/ do -> enhanced attention -> dropout_proj, only if ELMO -> ff proj -> lstm w/ do -> dropout -> ff 300 -> dropout -> output - - # add dropout here with ELMO - - # the projection layer down to the model dimension - # no dropout in projection + # The projection layer down to the model dimension. Dropout is not applied before + # projection. 
projected_enhanced_premise = self._projection_feedforward(premise_enhanced) projected_enhanced_hypothesis = self._projection_feedforward(hypothesis_enhanced) @@ -235,12 +239,10 @@ def forward(self, # type: ignore return output_dict def get_metrics(self, reset: bool = False) -> Dict[str, float]: - return { - 'accuracy': self._accuracy.get_metric(reset), - } + return {'accuracy': self._accuracy.get_metric(reset)} @classmethod - def from_params(cls, vocab: Vocabulary, params: Params) -> 'DecomposableAttention': + def from_params(cls, vocab: Vocabulary, params: Params) -> 'ESIM': embedder_params = params.pop("text_field_embedder") text_field_embedder = TextFieldEmbedder.from_params(vocab, embedder_params) @@ -264,6 +266,6 @@ def from_params(cls, vocab: Vocabulary, params: Params) -> 'DecomposableAttentio inference_encoder=inference_encoder, output_feedforward=output_feedforward, output_logit=output_logit, - initializer=initializer, dropout=dropout, + initializer=initializer, regularizer=regularizer) From fa5e670bef66d1a7707ea82793897988268fdbfe Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 14:18:51 -0700 Subject: [PATCH 11/24] Add test for ESIM --- allennlp/models/esim.py | 15 ++-- allennlp/tests/fixtures/esim/experiment.json | 92 ++++++++++++++++++++ allennlp/tests/models/esim_test.py | 27 ++++++ 3 files changed, 129 insertions(+), 5 deletions(-) create mode 100644 allennlp/tests/fixtures/esim/experiment.json create mode 100644 allennlp/tests/models/esim_test.py diff --git a/allennlp/models/esim.py b/allennlp/models/esim.py index 3fc8d11a1e5..446165d57e4 100644 --- a/allennlp/models/esim.py +++ b/allennlp/models/esim.py @@ -1,4 +1,4 @@ -from typing import Dict, Optional +from typing import Dict, Optional, List, Any import torch from torch.autograd import Variable @@ -7,7 +7,8 @@ from allennlp.common.checks import check_dimensions_match from allennlp.data import Vocabulary from allennlp.models.model import Model -from allennlp.modules import FeedForward, MatrixAttention +from allennlp.modules import FeedForward +from allennlp.modules.matrix_attention.legacy_matrix_attention import LegacyMatrixAttention from allennlp.modules import Seq2SeqEncoder, SimilarityFunction, TimeDistributed, TextFieldEmbedder from allennlp.nn import InitializerApplicator, RegularizerApplicator from allennlp.nn.util import get_text_field_mask, last_dim_softmax, weighted_sum, replace_masked_values @@ -95,7 +96,7 @@ def __init__(self, vocab: Vocabulary, self._text_field_embedder = text_field_embedder self._encoder = encoder - self._matrix_attention = MatrixAttention(similarity_function) + self._matrix_attention = LegacyMatrixAttention(similarity_function) self._projection_feedforward = projection_feedforward self._inference_encoder = inference_encoder @@ -127,7 +128,8 @@ def __init__(self, vocab: Vocabulary, def forward(self, # type: ignore premise: Dict[str, torch.LongTensor], hypothesis: Dict[str, torch.LongTensor], - label: torch.IntTensor = None) -> Dict[str, torch.Tensor]: + label: torch.IntTensor = None, + metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]: # pylint: disable=arguments-differ """ Parameters @@ -138,6 +140,9 @@ def forward(self, # type: ignore From a ``TextField`` label : torch.IntTensor, optional (default = None) From a ``LabelField`` + metadata : ``List[Dict[str, Any]]``, optional, (default = None) + Metadata containing the original tokenization of the premise and + hypothesis with 'premise_tokens' and 'hypothesis_tokens' keys respectively. 
Returns ------- @@ -233,7 +238,7 @@ def forward(self, # type: ignore if label is not None: loss = self._loss(label_logits, label.long().view(-1)) - self._accuracy(label_logits, label.squeeze(-1)) + self._accuracy(label_logits, label) output_dict["loss"] = loss return output_dict diff --git a/allennlp/tests/fixtures/esim/experiment.json b/allennlp/tests/fixtures/esim/experiment.json new file mode 100644 index 00000000000..772b261b789 --- /dev/null +++ b/allennlp/tests/fixtures/esim/experiment.json @@ -0,0 +1,92 @@ +{ + "dataset_reader": { + "type": "snli", + "token_indexers": { + "tokens": { + "type": "single_id", + "lowercase_tokens": true + } + } + }, + "train_data_path": "allennlp/tests/fixtures/data/snli.jsonl", + "validation_data_path": "allennlp/tests/fixtures/data/snli.jsonl", + "model": { + "type": "esim", + "dropout": 0.5, + "text_field_embedder": { + "tokens": { + "type": "embedding", + "pretrained_file": "allennlp/tests/fixtures/embeddings/glove.6B.300d.sample.txt.gz", + "embedding_dim": 300, + "trainable": false, + "projection_dim": 10, + } + }, + "encoder": { + "type": "lstm", + "input_size": 10, + "hidden_size": 25, + "num_layers": 1, + "bidirectional": true + }, + "similarity_function": {"type": "dot_product"}, + "projection_feedforward": { + "input_dim": 200, + "hidden_dims": 25, + "num_layers": 1, + "activations": "relu" + }, + "inference_encoder": { + "type": "lstm", + "input_size": 25, + "hidden_size": 25, + "num_layers": 1, + "bidirectional": true + }, + "output_feedforward": { + "input_dim": 200, + "num_layers": 1, + "hidden_dims": 5, + "activations": "relu", + "dropout": 0.5 + }, + "output_logit": { + "input_dim": 5, + "num_layers": 1, + "hidden_dims": 3, + "activations": "linear" + }, + "initializer": [ + [".*linear_layers.*weight", {"type": "xavier_uniform"}], + [".*linear_layers.*bias", {"type": "zero"}], + [".*weight_ih.*", {"type": "xavier_uniform"}], + [".*weight_hh.*", {"type": "orthogonal"}], + [".*bias_ih.*", {"type": "zero"}], + [".*bias_hh.*", {"type": "lstm_hidden_bias"}] + ] + }, + "iterator": { + "type": "bucket", + "sorting_keys": [["premise", "num_tokens"], ["hypothesis", "num_tokens"]], + "batch_size": 32, + "padding_noise": 0.0, + }, + "trainer": { + "optimizer": { + "type": "adam", + "lr": 0.0004 + }, + "validation_metric": "+accuracy", + "num_serialized_models_to_keep": 2, + "num_epochs": 5, + "grad_norm": 10.0, + "patience": 2, + "cuda_device": -1, + "learning_rate_scheduler": { + "type": "reduce_on_plateau", + "factor": 0.5, + "mode": "max", + "patience": 0 + } + } +} diff --git a/allennlp/tests/models/esim_test.py b/allennlp/tests/models/esim_test.py new file mode 100644 index 00000000000..458e73f4527 --- /dev/null +++ b/allennlp/tests/models/esim_test.py @@ -0,0 +1,27 @@ +# pylint: disable=no-self-use,invalid-name +from flaky import flaky +import pytest +import numpy +from numpy.testing import assert_almost_equal + +from allennlp.common import Params +from allennlp.common.checks import ConfigurationError +from allennlp.common.testing import ModelTestCase + + +class TestESIM(ModelTestCase): + def setUp(self): + super(TestESIM, self).setUp() + self.set_up_model(self.FIXTURES_ROOT / 'esim' / 'experiment.json', + self.FIXTURES_ROOT / 'data' / 'snli.jsonl') + + def test_forward_pass_runs_correctly(self): + training_tensors = self.dataset.as_tensor_dict() + output_dict = self.model(**training_tensors) + assert_almost_equal(numpy.sum(output_dict["label_probs"][0].data.numpy(), -1), 1, decimal=6) + + def test_model_can_train_save_and_load(self): + 
self.ensure_model_can_train_save_and_load(self.param_file) + + def test_batch_predictions_are_consistent(self): + self.ensure_batch_predictions_are_consistent() From 3e336a5f07b59b40103fafbd4e6a8256b0044828 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 14:20:33 -0700 Subject: [PATCH 12/24] Add predictor for ESIM --- allennlp/predictors/esim.py | 41 +++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 allennlp/predictors/esim.py diff --git a/allennlp/predictors/esim.py b/allennlp/predictors/esim.py new file mode 100644 index 00000000000..5ea63c7fda8 --- /dev/null +++ b/allennlp/predictors/esim.py @@ -0,0 +1,41 @@ +from typing import Tuple +from overrides import overrides + +from allennlp.common.util import JsonDict +from allennlp.data import Instance +from allennlp.service.predictors.predictor import Predictor + + +@Predictor.register('esim') +class ESIMPredictor(Predictor): + """ + Predictor for the :class:`~allennlp.models.esim.ESIM` model. + """ + + def predict(self, sentence1: str, sentence2: str) -> JsonDict: + """ + Predicts whether the sentence2 is entailed by the sentence1 text. + + Parameters + ---------- + sentence1 : ``str`` + A passage representing what is assumed to be true. + + sentence2 : ``str`` + A sentence that may be entailed by the sentence1. + + Returns + ------- + A dictionary where the key "label_probs" determines the probabilities of each of + [entailment, contradiction, neutral]. + """ + return self.predict_json({"sentence1" : sentence1, "sentence2": sentence2}) + + @overrides + def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]: + """ + Expects JSON that looks like ``{"sentence1": "...", "sentence2": "..."}``. + """ + sentence1_text = json_dict["sentence1"] + sentence2_text = json_dict["sentence2"] + return self._dataset_reader.text_to_instance(sentence1_text, sentence2_text), {} From 23bfeca757f9f844b3ac46f8bdae05017af0b4db Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 14:37:59 -0700 Subject: [PATCH 13/24] pylint --- allennlp/models/esim.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/allennlp/models/esim.py b/allennlp/models/esim.py index 446165d57e4..63a4fedcc70 100644 --- a/allennlp/models/esim.py +++ b/allennlp/models/esim.py @@ -9,7 +9,7 @@ from allennlp.models.model import Model from allennlp.modules import FeedForward from allennlp.modules.matrix_attention.legacy_matrix_attention import LegacyMatrixAttention -from allennlp.modules import Seq2SeqEncoder, SimilarityFunction, TimeDistributed, TextFieldEmbedder +from allennlp.modules import Seq2SeqEncoder, SimilarityFunction, TextFieldEmbedder from allennlp.nn import InitializerApplicator, RegularizerApplicator from allennlp.nn.util import get_text_field_mask, last_dim_softmax, weighted_sum, replace_masked_values from allennlp.training.metrics import CategoricalAccuracy @@ -24,27 +24,28 @@ class InputVariationalDropout(torch.nn.Dropout): and samples a single dropout mask of shape ``(batch_size, embedding_dim)`` and applies it to every time step. """ - def forward(self, input): + def forward(self, input_tensor): + # pylint: disable=arguments-differ """ Apply dropout to input tensor. 
Parameters ---------- - input: torch.FloatTensor + input_tensor: torch.FloatTensor A tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` - + Returns ------- output: torch.FloatTensor A tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` with dropout applied. """ - ones = Variable(input.data.new_ones(input.shape[0], input.shape[-1])) + ones = Variable(input_tensor.data.new_ones(input_tensor.shape[0], input_tensor.shape[-1])) dropout_mask = torch.nn.functional.dropout(ones, self.p, self.training, inplace=False) if self.inplace: - input *= dropout_mask.unsqueeze(1) + input_tensor *= dropout_mask.unsqueeze(1) return None else: - return dropout_mask.unsqueeze(1) * input + return dropout_mask.unsqueeze(1) * input_tensor @Model.register("esim") @@ -129,7 +130,8 @@ def forward(self, # type: ignore premise: Dict[str, torch.LongTensor], hypothesis: Dict[str, torch.LongTensor], label: torch.IntTensor = None, - metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]: + metadata: List[Dict[str, Any]] = None # pylint:disable=unused-argument + ) -> Dict[str, torch.Tensor]: # pylint: disable=arguments-differ """ Parameters @@ -192,7 +194,7 @@ def forward(self, # type: ignore dim=-1 ) hypothesis_enhanced = torch.cat( - [encoded_hypothesis, attended_premise, + [encoded_hypothesis, attended_premise, encoded_hypothesis - attended_premise, encoded_hypothesis * attended_premise], dim=-1 @@ -219,18 +221,22 @@ def forward(self, # type: ignore v_bi, hypothesis_mask.unsqueeze(-1), -1e7 ).max(dim=1) - v_a_avg = torch.sum(v_ai * premise_mask.unsqueeze(-1), dim=1) / torch.sum(premise_mask, 1, keepdim=True) - v_b_avg = torch.sum(v_bi * hypothesis_mask.unsqueeze(-1), dim=1) / torch.sum(hypothesis_mask, 1, keepdim=True) + v_a_avg = torch.sum(v_ai * premise_mask.unsqueeze(-1), dim=1) / torch.sum( + premise_mask, 1, keepdim=True + ) + v_b_avg = torch.sum(v_bi * hypothesis_mask.unsqueeze(-1), dim=1) / torch.sum( + hypothesis_mask, 1, keepdim=True + ) # Now concat # (batch_size, model_dim * 2 * 4) - v = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max], dim=1) + v_all = torch.cat([v_a_avg, v_a_max, v_b_avg, v_b_max], dim=1) # the final MLP -- apply dropout to input, and MLP applies to output & hidden if self.dropout: - v = self.dropout(v) + v_all = self.dropout(v_all) - output_hidden = self._output_feedforward(v) + output_hidden = self._output_feedforward(v_all) label_logits = self._output_logit(output_hidden) label_probs = torch.nn.functional.softmax(label_logits, dim=-1) From 1d0c90586aa86786056e95aff0c1805e5f4ed9a5 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 14:39:07 -0700 Subject: [PATCH 14/24] pylint --- allennlp/tests/models/esim_test.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/allennlp/tests/models/esim_test.py b/allennlp/tests/models/esim_test.py index 458e73f4527..7e939755e09 100644 --- a/allennlp/tests/models/esim_test.py +++ b/allennlp/tests/models/esim_test.py @@ -1,11 +1,7 @@ # pylint: disable=no-self-use,invalid-name -from flaky import flaky -import pytest import numpy from numpy.testing import assert_almost_equal -from allennlp.common import Params -from allennlp.common.checks import ConfigurationError from allennlp.common.testing import ModelTestCase From 12325be3e83a89a683be0331faab3163b9026b40 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 14:42:18 -0700 Subject: [PATCH 15/24] mypy --- allennlp/predictors/esim.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/allennlp/predictors/esim.py 
b/allennlp/predictors/esim.py index 5ea63c7fda8..77bac349807 100644 --- a/allennlp/predictors/esim.py +++ b/allennlp/predictors/esim.py @@ -32,10 +32,10 @@ def predict(self, sentence1: str, sentence2: str) -> JsonDict: return self.predict_json({"sentence1" : sentence1, "sentence2": sentence2}) @overrides - def _json_to_instance(self, json_dict: JsonDict) -> Tuple[Instance, JsonDict]: + def _json_to_instance(self, json_dict: JsonDict) -> Instance: """ Expects JSON that looks like ``{"sentence1": "...", "sentence2": "..."}``. """ sentence1_text = json_dict["sentence1"] sentence2_text = json_dict["sentence2"] - return self._dataset_reader.text_to_instance(sentence1_text, sentence2_text), {} + return self._dataset_reader.text_to_instance(sentence1_text, sentence2_text) From d9730f47672b5608a7b0c19c94088b8a6a234219 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 14:45:40 -0700 Subject: [PATCH 16/24] fix the docs --- doc/api/allennlp.models.esim.rst | 7 +++++++ doc/api/allennlp.predictors.rst | 7 +++++++ 2 files changed, 14 insertions(+) create mode 100644 doc/api/allennlp.models.esim.rst diff --git a/doc/api/allennlp.models.esim.rst b/doc/api/allennlp.models.esim.rst new file mode 100644 index 00000000000..deaebd20782 --- /dev/null +++ b/doc/api/allennlp.models.esim.rst @@ -0,0 +1,7 @@ +allennlp.models.esim +==================== + +.. automodule:: allennlp.models.esim + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/api/allennlp.predictors.rst b/doc/api/allennlp.predictors.rst index 8712dfa6be1..f8cfd837274 100644 --- a/doc/api/allennlp.predictors.rst +++ b/doc/api/allennlp.predictors.rst @@ -9,6 +9,7 @@ allennlp.predictors * :ref:`Predictor` * :ref:`BidafPredictor` * :ref:`DecomposableAttentionPredictor` +* :ref:`ESIMPredictor` * :ref:`SemanticRoleLabelerPredictor` * :ref:`SentenceTaggerPredictor` * :ref:`CorefPredictor` @@ -35,6 +36,12 @@ allennlp.predictors :undoc-members: :show-inheritance: +.. _esim: +.. automodule:: allennlp.predictors.esim + :members: + :undoc-members: + :show-inheritance: + .. _semantic-role-labeler: .. 
automodule:: allennlp.predictors.semantic_role_labeler :members: From 4f6d37fc729a12d7a94351329ac6821605b072ef Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 14:52:37 -0700 Subject: [PATCH 17/24] ESIM predictor --- allennlp/predictors/__init__.py | 1 + allennlp/predictors/esim.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/allennlp/predictors/__init__.py b/allennlp/predictors/__init__.py index 0be6fc4ee6e..ca291ee1658 100644 --- a/allennlp/predictors/__init__.py +++ b/allennlp/predictors/__init__.py @@ -16,3 +16,4 @@ from allennlp.predictors.simple_seq2seq import SimpleSeq2SeqPredictor from allennlp.predictors.wikitables_parser import WikiTablesParserPredictor from allennlp.predictors.nlvr_parser import NlvrParserPredictor +from allennlp.predictors.esim import ESIMPredictor diff --git a/allennlp/predictors/esim.py b/allennlp/predictors/esim.py index 77bac349807..59542024212 100644 --- a/allennlp/predictors/esim.py +++ b/allennlp/predictors/esim.py @@ -1,9 +1,8 @@ -from typing import Tuple from overrides import overrides from allennlp.common.util import JsonDict from allennlp.data import Instance -from allennlp.service.predictors.predictor import Predictor +from allennlp.predictors.predictor import Predictor @Predictor.register('esim') From 7b57e4229ffaac6abc9c593001fff71fbc72c50d Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 15:04:24 -0700 Subject: [PATCH 18/24] Add comment to esim training config --- training_config/esim.json | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/training_config/esim.json b/training_config/esim.json index 348b3bd299e..65ef2467fc2 100644 --- a/training_config/esim.json +++ b/training_config/esim.json @@ -1,4 +1,14 @@ { + // Configuration for the ESIM model with ELMo, modified slightly from + // the version included in "Deep Contextualized Word Representations", + // (https://arxiv.org/abs/1802.05365). Compared to the version in this paper, + // this configuration only includes one layer of ELMo representations + // and removes GloVe embeddings. + // + // There is a trained model available at https://s3-us-west-2.amazonaws.com/allennlp/models/esim-elmo-2018.05.17.tar.gz + // with test set accuracy of 88.5%, compared to the single model reported + // result of 88.7 +/- 0.17. 
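The trainer block of this file (unchanged by this commit) pairs Adam at lr 4e-4 with a reduce-on-plateau schedule keyed to validation accuracy; in plain PyTorch terms that schedule behaves roughly like the sketch below, where the linear layer is only a stand-in:

    import torch

    model = torch.nn.Linear(10, 3)                 # stand-in, not the real ESIM model
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0004)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode="max", factor=0.5, patience=0)

    # After each epoch the trainer passes the validation metric ("+accuracy") to the
    # scheduler; with patience 0, any epoch without improvement halves the learning rate.
    validation_accuracy = 0.85                     # hypothetical value
    scheduler.step(validation_accuracy)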
+ "dataset_reader": { "type": "snli", "token_indexers": { From 7ea3e47d3f588f705735ffe044881552bcd43fb1 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 15:50:30 -0700 Subject: [PATCH 19/24] Move InputVariationalDropout --- allennlp/models/esim.py | 35 +------------------ allennlp/modules/__init__.py | 1 + allennlp/modules/input_variational_dropout.py | 34 ++++++++++++++++++ 3 files changed, 36 insertions(+), 34 deletions(-) create mode 100644 allennlp/modules/input_variational_dropout.py diff --git a/allennlp/models/esim.py b/allennlp/models/esim.py index 63a4fedcc70..1ee46d6edc9 100644 --- a/allennlp/models/esim.py +++ b/allennlp/models/esim.py @@ -7,46 +7,13 @@ from allennlp.common.checks import check_dimensions_match from allennlp.data import Vocabulary from allennlp.models.model import Model -from allennlp.modules import FeedForward +from allennlp.modules import FeedForward, InputVariationalDropout from allennlp.modules.matrix_attention.legacy_matrix_attention import LegacyMatrixAttention from allennlp.modules import Seq2SeqEncoder, SimilarityFunction, TextFieldEmbedder from allennlp.nn import InitializerApplicator, RegularizerApplicator from allennlp.nn.util import get_text_field_mask, last_dim_softmax, weighted_sum, replace_masked_values from allennlp.training.metrics import CategoricalAccuracy -class InputVariationalDropout(torch.nn.Dropout): - """ - Apply the dropout technique in Gal and Ghahramani, "Dropout as a Bayesian Approximation: - Representing Model Uncertainty in Deep Learning" (https://arxiv.org/abs/1506.02142) to a - 3D tensor. - - This module accepts a 3D tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` - and samples a single dropout mask of shape ``(batch_size, embedding_dim)`` and applies - it to every time step. - """ - def forward(self, input_tensor): - # pylint: disable=arguments-differ - """ - Apply dropout to input tensor. - - Parameters - ---------- - input_tensor: torch.FloatTensor - A tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` - - Returns - ------- - output: torch.FloatTensor - A tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` with dropout applied. - """ - ones = Variable(input_tensor.data.new_ones(input_tensor.shape[0], input_tensor.shape[-1])) - dropout_mask = torch.nn.functional.dropout(ones, self.p, self.training, inplace=False) - if self.inplace: - input_tensor *= dropout_mask.unsqueeze(1) - return None - else: - return dropout_mask.unsqueeze(1) * input_tensor - @Model.register("esim") class ESIM(Model): diff --git a/allennlp/modules/__init__.py b/allennlp/modules/__init__.py index c4bc0f30a2c..4ecdf3c09dd 100644 --- a/allennlp/modules/__init__.py +++ b/allennlp/modules/__init__.py @@ -21,3 +21,4 @@ from allennlp.modules.token_embedders import TokenEmbedder, Embedding from allennlp.modules.matrix_attention import MatrixAttention from allennlp.modules.attention import Attention +from allennlp.modules.input_variational_dropout import InputVariationalDropout diff --git a/allennlp/modules/input_variational_dropout.py b/allennlp/modules/input_variational_dropout.py new file mode 100644 index 00000000000..ea441af3287 --- /dev/null +++ b/allennlp/modules/input_variational_dropout.py @@ -0,0 +1,34 @@ +import torch + +class InputVariationalDropout(torch.nn.Dropout): + """ + Apply the dropout technique in Gal and Ghahramani, "Dropout as a Bayesian Approximation: + Representing Model Uncertainty in Deep Learning" (https://arxiv.org/abs/1506.02142) to a + 3D tensor. 
+ + This module accepts a 3D tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` + and samples a single dropout mask of shape ``(batch_size, embedding_dim)`` and applies + it to every time step. + """ + def forward(self, input_tensor): + # pylint: disable=arguments-differ + """ + Apply dropout to input tensor. + + Parameters + ---------- + input_tensor: ``torch.FloatTensor`` + A tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` + + Returns + ------- + output: ``torch.FloatTensor`` + A tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` with dropout applied. + """ + ones = input_tensor.data.new_ones(input_tensor.shape[0], input_tensor.shape[-1]) + dropout_mask = torch.nn.functional.dropout(ones, self.p, self.training, inplace=False) + if self.inplace: + input_tensor *= dropout_mask.unsqueeze(1) + return None + else: + return dropout_mask.unsqueeze(1) * input_tensor From 54db6047bfd81b826f017a1dec22542c05f4e216 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 15:51:23 -0700 Subject: [PATCH 20/24] pylint --- allennlp/models/esim.py | 1 - 1 file changed, 1 deletion(-) diff --git a/allennlp/models/esim.py b/allennlp/models/esim.py index 1ee46d6edc9..eca68f889b6 100644 --- a/allennlp/models/esim.py +++ b/allennlp/models/esim.py @@ -1,7 +1,6 @@ from typing import Dict, Optional, List, Any import torch -from torch.autograd import Variable from allennlp.common import Params from allennlp.common.checks import check_dimensions_match From 9ae74aac40d9fae9018554722dfee208b63570aa Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 15:56:26 -0700 Subject: [PATCH 21/24] Fix the docs --- doc/api/allennlp.models.rst | 1 + doc/api/allennlp.modules.input_variational_dropout.rst | 7 +++++++ doc/api/allennlp.modules.rst | 1 + 3 files changed, 9 insertions(+) create mode 100644 doc/api/allennlp.modules.input_variational_dropout.rst diff --git a/doc/api/allennlp.models.rst b/doc/api/allennlp.models.rst index a0cce505a82..46a4cd42db8 100644 --- a/doc/api/allennlp.models.rst +++ b/doc/api/allennlp.models.rst @@ -21,3 +21,4 @@ allennlp.models allennlp.models.semantic_parsing allennlp.models.semantic_role_labeler allennlp.models.simple_tagger + allennlp.models.esim diff --git a/doc/api/allennlp.modules.input_variational_dropout.rst b/doc/api/allennlp.modules.input_variational_dropout.rst new file mode 100644 index 00000000000..c02b9ce6373 --- /dev/null +++ b/doc/api/allennlp.modules.input_variational_dropout.rst @@ -0,0 +1,7 @@ +allennlp.modules.input_variational_dropout +========================================= + +.. 
automodule:: allennlp.modules.input_variational_dropout + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/api/allennlp.modules.rst b/doc/api/allennlp.modules.rst index 9353200f9e6..81d04dc9f23 100644 --- a/doc/api/allennlp.modules.rst +++ b/doc/api/allennlp.modules.rst @@ -31,3 +31,4 @@ allennlp.modules allennlp.modules.layer_norm allennlp.modules.span_pruner allennlp.modules.maxout + allennlp.modules.input_variational_dropout From a3cf48daf222c80e4963d1906ac8b01123568272 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 15:57:46 -0700 Subject: [PATCH 22/24] fix the docs --- doc/api/allennlp.modules.input_variational_dropout.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/api/allennlp.modules.input_variational_dropout.rst b/doc/api/allennlp.modules.input_variational_dropout.rst index c02b9ce6373..ccb4a210341 100644 --- a/doc/api/allennlp.modules.input_variational_dropout.rst +++ b/doc/api/allennlp.modules.input_variational_dropout.rst @@ -1,5 +1,5 @@ allennlp.modules.input_variational_dropout -========================================= +========================================== .. automodule:: allennlp.modules.input_variational_dropout :members: From 101a71c360f21333527edab32adc031d5e0d4ebb Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 16:20:36 -0700 Subject: [PATCH 23/24] Remove ESIM predictor --- allennlp/predictors/__init__.py | 1 - allennlp/predictors/esim.py | 40 --------------------------------- doc/api/allennlp.predictors.rst | 7 ------ 3 files changed, 48 deletions(-) delete mode 100644 allennlp/predictors/esim.py diff --git a/allennlp/predictors/__init__.py b/allennlp/predictors/__init__.py index ca291ee1658..0be6fc4ee6e 100644 --- a/allennlp/predictors/__init__.py +++ b/allennlp/predictors/__init__.py @@ -16,4 +16,3 @@ from allennlp.predictors.simple_seq2seq import SimpleSeq2SeqPredictor from allennlp.predictors.wikitables_parser import WikiTablesParserPredictor from allennlp.predictors.nlvr_parser import NlvrParserPredictor -from allennlp.predictors.esim import ESIMPredictor diff --git a/allennlp/predictors/esim.py b/allennlp/predictors/esim.py deleted file mode 100644 index 59542024212..00000000000 --- a/allennlp/predictors/esim.py +++ /dev/null @@ -1,40 +0,0 @@ -from overrides import overrides - -from allennlp.common.util import JsonDict -from allennlp.data import Instance -from allennlp.predictors.predictor import Predictor - - -@Predictor.register('esim') -class ESIMPredictor(Predictor): - """ - Predictor for the :class:`~allennlp.models.esim.ESIM` model. - """ - - def predict(self, sentence1: str, sentence2: str) -> JsonDict: - """ - Predicts whether the sentence2 is entailed by the sentence1 text. - - Parameters - ---------- - sentence1 : ``str`` - A passage representing what is assumed to be true. - - sentence2 : ``str`` - A sentence that may be entailed by the sentence1. - - Returns - ------- - A dictionary where the key "label_probs" determines the probabilities of each of - [entailment, contradiction, neutral]. - """ - return self.predict_json({"sentence1" : sentence1, "sentence2": sentence2}) - - @overrides - def _json_to_instance(self, json_dict: JsonDict) -> Instance: - """ - Expects JSON that looks like ``{"sentence1": "...", "sentence2": "..."}``. 
- """ - sentence1_text = json_dict["sentence1"] - sentence2_text = json_dict["sentence2"] - return self._dataset_reader.text_to_instance(sentence1_text, sentence2_text) diff --git a/doc/api/allennlp.predictors.rst b/doc/api/allennlp.predictors.rst index f8cfd837274..8712dfa6be1 100644 --- a/doc/api/allennlp.predictors.rst +++ b/doc/api/allennlp.predictors.rst @@ -9,7 +9,6 @@ allennlp.predictors * :ref:`Predictor` * :ref:`BidafPredictor` * :ref:`DecomposableAttentionPredictor` -* :ref:`ESIMPredictor` * :ref:`SemanticRoleLabelerPredictor` * :ref:`SentenceTaggerPredictor` * :ref:`CorefPredictor` @@ -36,12 +35,6 @@ allennlp.predictors :undoc-members: :show-inheritance: -.. _esim: -.. automodule:: allennlp.predictors.esim - :members: - :undoc-members: - :show-inheritance: - .. _semantic-role-labeler: .. automodule:: allennlp.predictors.semantic_role_labeler :members: From 9b901f93fbbccad93dff606238c0114c0130fc98 Mon Sep 17 00:00:00 2001 From: Matthew Peters Date: Mon, 9 Jul 2018 16:52:44 -0700 Subject: [PATCH 24/24] Scrub all of ESIMPredictor --- allennlp/service/predictors/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/allennlp/service/predictors/__init__.py b/allennlp/service/predictors/__init__.py index b06b614cd8c..1e95ef2c393 100644 --- a/allennlp/service/predictors/__init__.py +++ b/allennlp/service/predictors/__init__.py @@ -18,7 +18,6 @@ from allennlp.predictors.simple_seq2seq import SimpleSeq2SeqPredictor from allennlp.predictors.wikitables_parser import WikiTablesParserPredictor from allennlp.predictors.nlvr_parser import NlvrParserPredictor -from allennlp.predictors.esim import ESIMPredictor warnings.warn("allennlp.service.predictors.* has been depreciated. " "Please use allennlp.predictors.*", FutureWarning)