patterns-ai-core · sergiobayona · Jun 11, 2025 · Jun 12, 2025
diff --git a/examples/store_and_query_with_elasticsearch_using_metadata.rb b/examples/store_and_query_with_elasticsearch_using_metadata.rb
@@ -0,0 +1,79 @@
+# frozen_string_literal: true
+
+require "langchain"
+require "dotenv/load"
+require "ruby/openai"
+
+# Example: store documents with metadata in Elasticsearch, then run
+# similarity searches restricted by a metadata filter.
+#
+# This example assumes you are running Elasticsearch in Docker:
+#
+#   docker run --name es8 -d \
+#     -p 9200:9200 -p 9300:9300 \
+#     -e "discovery.type=single-node" \
+#     -e "xpack.security.enabled=false" \
+#     docker.elastic.co/elasticsearch/elasticsearch:8.12.2
+#
+# The container exposes the REST API on http://localhost:9200 which
+# the script connects to below. If you use a different host/port, set
+# the ELASTICSEARCH_URL environment variable accordingly before running
+# the script:
+#   ELASTICSEARCH_URL=http://localhost:9201 ruby examples/...
+#
+# OPENAI_API_KEY must be set as well; ENV.fetch below raises a KeyError
+# right away when it is missing instead of handing nil to the LLM client.
+
+# Instantiate the Elasticsearch vector store
+es = Langchain::Vectorsearch::Elasticsearch.new(
+  url: ENV.fetch("ELASTICSEARCH_URL", "http://localhost:9200"),
+  index_name: "documents",
+  llm: Langchain::LLM::OpenAI.new(api_key: ENV.fetch("OPENAI_API_KEY"))
+)
+
+# Create the index & mapping. Any error is downgraded to a warning so that
+# re-running the example against an existing index does not abort the script.
+# You may need to delete an old index first if it was created without the metadata field.
+begin
+  es.create_default_schema
+rescue => e
+  warn "Index might already exist: #{e.message}"
+end
+
+# Prepare documents with metadata
+corpus = [
+  {
+    text: "Vector search lets you retrieve semantically similar documents.",
+    metadata: {lang: "en", author: "alice", topic: "vector-search"}
+  },
+  {
+    text: "Las bases de datos vectoriales permiten búsquedas semánticas.",
+    metadata: {lang: "es", author: "bob", topic: "vector-search"}
+  },
+  {
+    text: "Ruby makes metaprogramming accessible and fun.",
+    metadata: {lang: "en", author: "carol", topic: "ruby"}
+  }
+]
+
+puts "\nAdding documents with metadata …"
+
+# texts and metadatas are index-aligned: metadatas[i] is stored with texts[i]
+es.add_texts(
+  texts: corpus.map { |doc| doc[:text] },
+  metadatas: corpus.map { |doc| doc[:metadata] }
+)
+
+sleep 1 # give ES a moment to index
+
+puts "\nSimilarity search for 'vector' restricted to English docs:"
+english_only = {term: {"metadata.lang" => "en"}}
+pp es.similarity_search(text: "vector", k: 2, filter: english_only)
+
+puts "\nSimilarity search by embedding, Spanish docs only:"
+spanish_only = {term: {"metadata.lang" => "es"}}
+embedding = es.llm.embed(text: "vector query").embedding
+pp es.similarity_search_by_vector(embedding: embedding, k: 1, filter: spanish_only)
+
+# Cleanup (optional)
+# es.delete_default_schema
diff --git a/lib/langchain/vectorsearch/elasticsearch.rb b/lib/langchain/vectorsearch/elasticsearch.rb
@@ -48,12 +48,25 @@ def initialize(url:, index_name:, llm:, api_key: nil, es_options: {})
 
     # Add a list of texts to the index
     # @param texts [Array<String>] The list of texts to add
+    # @param metadatas [Array<Hash>] Optional list of metadata hashes to store alongside each text. Must be the same length as texts when provided.
     # @return [Elasticsearch::Response] from the Elasticsearch server
-    def add_texts(texts: [])
-      body = texts.map do |text|
+    def add_texts(texts: [], metadatas: [])
+      metadatas = Array(metadatas)
+
+      if !metadatas.empty? && (metadatas.length != texts.length)
+        raise ArgumentError, "`metadatas` must be the same length as `texts` when provided"
+      end
+
+      body = texts.map.with_index do |text, i|
+        document_body = {
+          input: text,
+          input_vector: llm.embed(text: text).embedding
+        }
+        document_body[:metadata] = metadatas[i] if metadatas[i]
+
         [
           {index: {_index: index_name}},
-          {input: text, input_vector: llm.embed(text: text).embedding}
+          document_body
         ]
       end.flatten
 
@@ -63,12 +76,25 @@ def add_texts(texts: [])
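-    # Add a list of texts to the index
+    # Update a list of texts in the index
     # @param texts [Array<String>] The list of texts to update
-    # @param texts [Array<Integer>] The list of texts to update
+    # @param ids [Array<Integer>] The ids of the documents to update, in the same order as texts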
+    # @param metadatas [Array<Hash>] Optional list of metadata hashes to update alongside each text. Must be the same length as texts when provided.
     # @return [Elasticsearch::Response] from the Elasticsearch server
-    def update_texts(texts: [], ids: [])
+    def update_texts(texts: [], ids: [], metadatas: [])
+      metadatas = Array(metadatas)
+
+      if !metadatas.empty? && (metadatas.length != texts.length)
+        raise ArgumentError, "`metadatas` must be the same length as `texts` when provided"
+      end
+
       body = texts.map.with_index do |text, i|
+        document_body = {
+          input: text,
+          input_vector: llm.embed(text: text).embedding
+        }
+        document_body[:metadata] = metadatas[i] if metadatas[i]
+
         [
           {index: {_index: index_name, _id: ids[i]}},
-          {input: text, input_vector: llm.embed(text: text).embedding}
+          document_body
         ]
       end.flatten
 
@@ -118,7 +144,11 @@ def default_schema
             input: {
               type: "text"
             },
-            input_vector: vector_settings
+            input_vector: vector_settings,
+            metadata: {
+              type: "object",
+              dynamic: true
+            }
           }
         }
       }
@@ -163,34 +193,45 @@ def ask(question:, k: 4, &block)
     # @param text [String] The text to search for
     # @param k [Integer] The number of results to return
     # @param query [Hash] Elasticsearch query that needs to be used while searching (Optional)
+    # @param filter [Hash] Elasticsearch filter that needs to be used while searching (Optional)
     # @return [Elasticsearch::Response] The response from the server
-    def similarity_search(text: "", k: 10, query: {})
+    def similarity_search(text: "", k: 10, query: {}, filter: {})
       if text.empty? && query.empty?
         raise "Either text or query should pass as an argument"
       end
 
+      # No custom query was given: embed the text and use the default vector query.
       if query.empty?
         query_vector = llm.embed(text: text).embedding
-
         query = default_query(query_vector)
       end
 
-      es_client.search(body: {query: query, size: k}).body
+      # A nil or empty filter means "no filtering", so the query is sent as-is.
+      # Otherwise wrap it in a bool query: the similarity query goes under
+      # `must` and the caller's filter under `filter`.
+      if filter && !filter.empty?
+        query = {bool: {must: query, filter: filter}}
+      end
+
+      es_client.search(body: {query: query, size: k}).body
     end
 
     # Search for similar texts by embedding
     # @param embedding [Array<Float>] The embedding to search for
     # @param k [Integer] The number of results to return
     # @param query [Hash] Elasticsearch query that needs to be used while searching (Optional)
+    # @param filter [Hash] Elasticsearch filter that needs to be used while searching (Optional)
     # @return [Elasticsearch::Response] The response from the server
-    def similarity_search_by_vector(embedding: [], k: 10, query: {})
+    def similarity_search_by_vector(embedding: [], k: 10, query: {}, filter: {})
       if embedding.empty? && query.empty?
         raise "Either embedding or query should pass as an argument"
       end
 
       query = default_query(embedding) if query.empty?
 
-      es_client.search(body: {query: query, size: k}).body
+      # Wrap in a bool query only when a filter was supplied (nil or empty means none).
+      query = {bool: {must: query, filter: filter}} if filter && !filter.empty?
+      es_client.search(body: {query: query, size: k}).body
     end
   end
 end
diff --git a/spec/lib/langchain/vectorsearch/elasticsearch_spec.rb b/spec/lib/langchain/vectorsearch/elasticsearch_spec.rb
@@ -17,16 +17,36 @@
   end
 
   describe "#add_texts" do
+    # Backward-compatible path: without metadatas the document carries no metadata key.
     it "indexes data into elasticsearch" do
       es_body = [
         {index: {_index: "langchain"}},
         {input: "simple text", input_vector: [0.1, 0.2, 0.3]}
       ]
 
       allow_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body)
       expect_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body).once
 
       subject.add_texts(texts: ["simple text"])
     end
+
+    it "indexes data into elasticsearch with metadata" do
+      metadata = {lang: "en"}
+      es_body = [
+        {index: {_index: "langchain"}},
+        {input: "simple text", input_vector: [0.1, 0.2, 0.3], metadata: metadata}
+      ]
+
+      allow_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body)
+      expect_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body).once
+
+      subject.add_texts(texts: ["simple text"], metadatas: [metadata])
+    end
+
+    it "raises error when metadatas length mismatch" do
+      expect {
+        subject.add_texts(texts: ["t1", "t2"], metadatas: [{foo: 1}])
+      }.to raise_error(ArgumentError)
+    end
   end
 
@@ -38,16 +45,30 @@
         .and_return([0.1, 0.2, 0.3, 0.4])
     end
 
+    # Backward-compatible path: no metadatas argument, so no metadata key is sent.
     it "updates respective document" do
       es_body = [
         {index: {_index: "langchain", _id: 1}},
         {input: "updated text", input_vector: [0.1, 0.2, 0.3, 0.4]}
       ]
 
       allow_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body)
       expect_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body).once
 
       subject.update_texts(texts: ["updated text"], ids: [1])
     end
+
+    it "updates respective document with metadata" do
+      metadata = {version: 2}
+      es_body = [
+        {index: {_index: "langchain", _id: 1}},
+        {input: "updated text", input_vector: [0.1, 0.2, 0.3, 0.4], metadata: metadata}
+      ]
+
+      allow_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body)
+      expect_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body).once
+
+      subject.update_texts(texts: ["updated text"], ids: [1], metadatas: [metadata])
+    end
   end
 
@@ -100,7 +108,8 @@
             input: {
               type: "text"
             },
-            input_vector: {type: "dense_vector", dims: 384}
+            input_vector: {type: "dense_vector", dims: 384},
+            metadata: {type: "object", dynamic: true}
           }
         }
       }
@@ -117,7 +126,8 @@
             input: {
               type: "text"
             },
-            input_vector: {type: "dense_vector", dims: 500}
+            input_vector: {type: "dense_vector", dims: 500},
+            metadata: {type: "object", dynamic: true}
           }
         }
       }
@@ -145,7 +155,8 @@
   end
 
   describe "#similarity_search" do
-    it "should return similar documents" do
+    it "should return similar documents with metadata filter" do
+      filter = {term: {"metadata.lang": "en"}}
       response = [
         {_id: 1, input: "simple text", input_vector: [0.1, 0.5, 0.6]},
         {_id: 2, input: "update text", input_vector: [0.5, 0.3, 0.1]}
@@ -154,13 +165,13 @@
 
       allow(es_response).to receive(:body).and_return(response)
       allow_any_instance_of(::Elasticsearch::Client)
-        .to receive(:search).with(body: {query: subject.default_query([0.1, 0.2, 0.3]), size: 5}).and_return(es_response)
+        .to receive(:search).with(body: {query: {bool: {must: subject.default_query([0.1, 0.2, 0.3]), filter: filter}}, size: 5}).and_return(es_response)
 
       expect_any_instance_of(::Elasticsearch::Client)
-        .to receive(:search).with(body: {query: subject.default_query([0.1, 0.2, 0.3]), size: 5})
+        .to receive(:search).with(body: {query: {bool: {must: subject.default_query([0.1, 0.2, 0.3]), filter: filter}}, size: 5})
       expect(es_response).to receive(:body)
 
-      expect(subject.similarity_search(text: "simple", k: 5)).to eq(response)
+      expect(subject.similarity_search(text: "simple", k: 5, filter: filter)).to eq(response)
     end
 
     it "able to search with custom query" do
@@ -197,7 +208,8 @@
   end
 
   describe "#similarity_search_by_vector" do
-    it "should return similar documents" do
+    it "should return similar documents with metadata filter" do
+      filter = {term: {"metadata.lang": "en"}}
       response = [
         {_id: 1, input: "simple text", input_vector: [0.1, 0.5, 0.6]},
         {_id: 2, input: "update text", input_vector: [0.5, 0.3, 0.1]}
@@ -206,13 +218,13 @@
 
       allow(es_response).to receive(:body).and_return(response)
       allow_any_instance_of(::Elasticsearch::Client)
-        .to receive(:search).with(body: {query: subject.default_query([0.5, 0.6, 0.7]), size: 5}).and_return(es_response)
+        .to receive(:search).with(body: {query: {bool: {must: subject.default_query([0.5, 0.6, 0.7]), filter: filter}}, size: 5}).and_return(es_response)
 
       expect_any_instance_of(::Elasticsearch::Client)
-        .to receive(:search).with(body: {query: subject.default_query([0.5, 0.6, 0.7]), size: 5})
+        .to receive(:search).with(body: {query: {bool: {must: subject.default_query([0.5, 0.6, 0.7]), filter: filter}}, size: 5})
       expect(es_response).to receive(:body)
 
-      expect(subject.similarity_search_by_vector(embedding: [0.5, 0.6, 0.7], k: 5)).to eq(response)
+      expect(subject.similarity_search_by_vector(embedding: [0.5, 0.6, 0.7], k: 5, filter: filter)).to eq(response)
     end
 
     it "able to search with custom query" do