8000 feat(vectorsearch): add metadata support to Elasticsearch adapter by sergiobayona · Pull Request #1004 · patterns-ai-core/langchainrb · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

feat(vectorsearch): add metadata support to Elasticsearch adapter #1004

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions examples/store_and_query_with_elasticsearch_using_metadata.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# frozen_string_literal: true

require "langchain"
require "dotenv/load"
require "ruby/openai"

# This example assumes you are running Elasticsearch in Docker:
#
# docker run --name es8 -d \
# -p 9200:9200 -p 9300:9300 \
# -e "discovery.type=single-node" \
# -e "xpack.security.enabled=false" \
# docker.elastic.co/elasticsearch/elasticsearch:8.12.2
#
# The container exposes the REST API on http://localhost:9200 which
# the script connects to below. If you use a different host/port, set
# the ELASTICSEARCH_URL environment variable accordingly before running
# the script:
# ELASTICSEARCH_URL=http://localhost:9201 ruby examples/...

# Instantiate the Elasticsearch vector store.
# ENV.fetch without a default raises KeyError immediately when OPENAI_API_KEY
# is not set, instead of handing nil to the LLM client and failing later on
# the first embedding request.
es = Langchain::Vectorsearch::Elasticsearch.new(
  url: ENV.fetch("ELASTICSEARCH_URL", "http://localhost:9200"),
  index_name: "documents",
  llm: Langchain::LLM::OpenAI.new(api_key: ENV.fetch("OPENAI_API_KEY"))
)

# Create the index & mapping (safe to call if it already exists).
# You may need to delete an old index first if it was created without the metadata field.
begin
  es.create_default_schema
rescue => e
  # Only an "index already exists" failure is expected and harmless here.
  # Any other error (e.g. Elasticsearch not reachable) would make every step
  # below fail as well, so re-raise it rather than mislabel it.
  # NOTE(review): relies on the error message mentioning "already exists" /
  # "resource_already_exists_exception" -- confirm against your ES client version.
  raise unless e.message.match?(/already[_ ]exists/i)

  warn "Index might already exist: #{e.message}"
end

# Prepare documents with metadata. Each entry pairs a text with the metadata
# hash that is stored alongside it.
corpus = [
  {
    text: "Vector search lets you retrieve semantically similar documents.",
    metadata: {lang: "en", author: "alice", topic: "vector-search"}
  },
  {
    text: "Las bases de datos vectoriales permiten búsquedas semánticas.",
    metadata: {lang: "es", author: "bob", topic: "vector-search"}
  },
  {
    text: "Ruby makes metaprogramming accessible and fun.",
    metadata: {lang: "en", author: "carol", topic: "ruby"}
  }
]

puts "\nAdding documents with metadata …"

# texts and metadatas are parallel arrays: metadatas[i] belongs to texts[i].
es.add_texts(
  texts: corpus.map { |d| d[:text] },
  metadatas: corpus.map { |d| d[:metadata] }
)

sleep 1 # give ES a moment to index

# Text search: the adapter embeds the text and restricts hits with the filter.
puts "\nSimilarity search for 'vector' restricted to English docs:"
filter = {term: {"metadata.lang" => "en"}}
results = es.similarity_search(text: "vector", k: 2, filter: filter)
pp results

# Vector search: compute the embedding ourselves and pass it in directly.
puts "\nSimilarity search by embedding, Spanish docs only:"
embedding = es.llm.embed(text: "vector query").embedding
filter = {term: {"metadata.lang" => "es"}}
pp es.similarity_search_by_vector(embedding: embedding, k: 1, filter: filter)

# Cleanup (optional)
# es.delete_default_schema
63 changes: 52 additions & 11 deletions lib/langchain/vectorsearch/elasticsearch.rb
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,25 @@ def initialize(url:, index_name:, llm:, api_key: nil, es_options: {})

# Add a list of texts to the index
# @param texts [Array<String>] The list of texts to add
# @param metadatas [Array<Hash>] Optional list of metadata hashes to store alongside each text. Must be the same length as texts when provided.
# @return [Elasticsearch::Response] from the Elasticsearch server
def add_texts(texts: [])
body = texts.map do |text|
def add_texts(texts: [], metadatas: [])
metadatas = Array(metadatas)

if !metadatas.empty? && (metadatas.length != texts.length)
raise ArgumentError, "`metadatas` must be the same length as `texts` when provided"
end

body = texts.map.with_index do |text, i|
document_body = {
input: text,
input_vector: llm.embed(text: text).embedding
}
document_body[:metadata] = metadatas[i] if metadatas[i]

[
{index: {_index: index_name}},
{input: text, input_vector: llm.embed(text: text).embedding}
document_body
]
end.flatten

Expand All @@ -63,12 +76,25 @@ def add_texts(texts: [])
# Update a list of texts in the index
# @param texts [Array<String>] The list of texts to update
# @param ids [Array<Integer>] The list of document ids to update, in the same order as texts
# @param metadatas [Array<Hash>] Optional list of metadata hashes to update alongside each text. Must be the same length as texts when provided.
# @return [Elasticsearch::Response] from the Elasticsearch server
def update_texts(texts: [], ids: [])
def update_texts(texts: [], ids: [], metadatas: [])
metadatas = Array(metadatas)

if !metadatas.empty? && (metadatas.length != texts.length)
raise ArgumentError, "`metadatas` must be the same length as `texts` when provided"
end

body = texts.map.with_index do |text, i|
document_body = {
input: text,
input_vector: llm.embed(text: text).embedding
}
document_body[:metadata] = metadatas[i] if metadatas[i]

[
{index: {_index: index_name, _id: ids[i]}},
{input: text, input_vector: llm.embed(text: text).embedding}
document_body
]
end.flatten

Expand Down Expand Up @@ -118,7 +144,11 @@ def default_schema
input: {
type: "text"
},
input_vector: vector_settings
input_vector: vector_settings,
metadata: {
type: "object",
dynamic: true
}
}
}
}
Expand Down Expand Up @@ -163,34 +193,45 @@ def ask(question:, k: 4, &block)
# @param text [String] The text to search for
# @param k [Integer] The number of results to return
# @param query [Hash] Elasticsearch query that needs to be used while searching (Optional)
# @param filter [Hash] Elasticsearch filter that needs to be used while searching (Optional)
# @return [Elasticsearch::Response] The response from the server
def similarity_search(text: "", k: 10, query: {})
def similarity_search(text: "", k: 10, query: {}, filter: {})
if text.empty? && query.empty?
raise "Either text or query should pass as an argument"
end

# Build base similarity query (script_score by default)
if query.empty?
query_vector = llm.embed(text: text).embedding

query = default_query(query_vector)
end

es_client.search(body: {query: query, size: k}).body
# Apply filter if provided
final_query = if filter.empty?
query
else
{bool: {must: query, filter: filter}}
end

es_client.search(body: {query: final_query, size: k}).body
end

# Search for similar texts by embedding
# @param embedding [Array<Float>] The embedding to search for
# @param k [Integer] The number of results to return
# @param query [Hash] Elasticsearch query that needs to be used while searching (Optional)
# @param filter [Hash] Elasticsearch filter that needs to be used while searching (Optional)
# @return [Elasticsearch::Response] The response from the server
def similarity_search_by_vector(embedding: [], k: 10, query: {})
def similarity_search_by_vector(embedding: [], k: 10, query: {}, filter: {})
if embedding.empty? && query.empty?
raise "Either embedding or query should pass as an argument"
end

query = default_query(embedding) if query.empty?

es_client.search(body: {query: query, size: k}).body
final_query = filter.empty? ? query : {bool: {must: query, filter: filter}}

es_client.search(body: {query: final_query, size: k}).body
end
end
end
44 changes: 28 additions & 16 deletions spec/lib/langchain/vectorsearch/elasticsearch_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,23 @@
end

describe "#add_texts" do
it "indexes data into elasticsearch" do
it "indexes data into elasticsearch with metadata" do
metadata = {lang: "en"}
es_body = [
{index: {_index: "langchain"}},
{input: "simple text", input_vector: [0.1, 0.2, 0.3]}
{input: "simple text", input_vector: [0.1, 0.2, 0.3], metadata: metadata}
]

allow_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body)
expect_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body).once

subject.add_texts(texts: ["simple text"])
subject.add_texts(texts: ["simple text"], metadatas: [metadata])
end

it "raises error when metadatas length mismatch" do
expect {
subject.add_texts(texts: ["t1", "t2"], metadatas: [{foo: 1}])
}.to raise_error(ArgumentError)
end
end

Expand All @@ -38,16 +45,17 @@
.and_return([0.1, 0.2, 0.3, 0.4])
end

it "updates respective document" do
it "updates respective document with metadata" do
metadata = {version: 2}
es_body = [
{index: {_index: "langchain", _id: 1}},
{input: "updated text", input_vector: [0.1, 0.2, 0.3, 0.4]}
{input: "updated text", input_vector: [0.1, 0.2, 0.3, 0.4], metadata: metadata}
]

allow_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body)
expect_any_instance_of(::Elasticsearch::Client).to receive(:bulk).with(body: es_body).once

subject.update_texts(texts: ["updated text"], ids: [1])
subject.update_texts(texts: ["updated text"], ids: [1], metadatas: [metadata])
end
end

Expand Down Expand Up @@ -100,7 +108,8 @@
input: {
type: "text"
},
input_vector: {type: "dense_vector", dims: 384}
input_vector: {type: "dense_vector", dims: 384},
metadata: {type: "object", dynamic: true}
}
}
}
Expand All @@ -117,7 +126,8 @@
input: {
type: "text"
},
input_vector: {type: "dense_vector", dims: 500}
input_vector: {type: "dense_vector", dims: 500},
metadata: {type: "object", dynamic: true}
}
}
}
Expand Down Expand Up @@ -145,7 +155,8 @@
end

describe "#similarity_search" do
it "should return similar documents" do
it "should return similar documents with metadata filter" do
filter = {term: {"metadata.lang": "en"}}
response = [
{_id: 1, input: "simple text", input_vector: [0.1, 0.5, 0.6]},
{_id: 2, input: "update text", input_vector: [0.5, 0.3, 0.1]}
Expand All @@ -154,13 +165,13 @@

allow(es_response).to receive(:body).and_return(response)
allow_any_instance_of(::Elasticsearch::Client)
.to receive(:search).with(body: {query: subject.default_query([0.1, 0.2, 0.3]), size: 5}).and_return(es_response)
.to receive(:search).with(body: {query: {bool: {must: subject.default_query([0.1, 0.2, 0.3]), filter: filter}}, size: 5}).and_return(es_response)

expect_any_instance_of(::Elasticsearch::Client)
.to receive(:search).with(body: {query: subject.default_query([0.1, 0.2, 0.3]), size: 5})
.to receive(:search).with(body: {query: {bool: {must: subject.default_query([0.1, 0.2, 0.3]), filter: filter}}, size: 5})
expect(es_response).to receive(:body)

expect(subject.similarity_search(text: "simple", k: 5)).to eq(response)
expect(subject.similarity_search(text: "simple", k: 5, filter: filter)).to eq(response)
end

it "able to search with custom query" do
Expand Down Expand Up @@ -197,7 +208,8 @@
end

describe "#similarity_search_by_vector" do
it "should return similar documents" do
it "should return similar documents with metadata filter" do
filter = {term: {"metadata.lang": "en"}}
response = [
{_id: 1, input: "simple text", input_vector: [0.1, 0.5, 0.6]},
{_id: 2, input: "update text", input_vector: [0.5, 0.3, 0.1]}
Expand All @@ -206,13 +218,13 @@

allow(es_response).to receive(:body).and_return(response)
allow_any_instance_of(::Elasticsearch::Client)
.to receive(:search).with(body: {query: subject.default_query([0.5, 0.6, 0.7]), size: 5}).and_return(es_response)
.to receive(:search).with(body: {query: {bool: {must: subject.default_query([0.5, 0.6, 0.7]), filter: filter}}, size: 5}).and_return(es_response)

expect_any_instance_of(::Elasticsearch::Client)
.to receive(:search).with(body: {query: subject.default_query([0.5, 0.6, 0.7]), size: 5})
.to receive(:search).with(body: {query: {bool: {must: subject.default_query([0.5, 0.6, 0.7]), filter: filter}}, size: 5})
expect(es_response).to receive(:body)

expect(subject.similarity_search_by_vector(embedding: [0.5, 0.6, 0.7], k: 5)).to eq(response)
expect(subject.similarity_search_by_vector(embedding: [0.5, 0.6, 0.7], k: 5, filter: filter)).to eq(response)
end

it "able to search with custom query" do
Expand Down
0