patterns-ai-core · andreibondarev · Oct 21, 2023 · Oct 21, 2023 · Oct 21, 2023 · Oct 21, 2023
diff --git a/Gemfile.lock b/Gemfile.lock
@@ -35,6 +35,11 @@ GEM
       erubi (~> 1.4)
       rails-dom-testing (~> 2.0)
       rails-html-sanitizer (~> 1.1, >= 1.2.0)
+    activemodel (7.0.8)
+      activesupport (= 7.0.8)
+    activerecord (7.0.8)
+      activemodel (= 7.0.8)
+      activesupport (= 7.0.8)
     activesupport (7.0.8)
       concurrent-ruby (~> 1.0, >= 1.0.2)
       i18n (>= 1.6, < 2)
@@ -164,6 +169,8 @@ GEM
     multi_json (1.15.0)
     multi_xml (0.6.0)
     multipart-post (2.3.0)
+    neighbor (0.3.1)
+      activerecord (>= 6.1)
     nokogiri (1.14.3)
       mini_portile2 (~> 2.8.0)
       racc (~> 1.4)
@@ -342,6 +349,7 @@ DEPENDENCIES
   langchainrb!
   llama_cpp (~> 0.3.7)
   milvus (~> 0.9.2)
+  neighbor (~> 0.3.0)
   nokogiri (~> 1.13)
   open-weather-ruby-client (~> 0.4.0)
   pdf-reader (~> 1.4)

diff --git a/langchain.gemspec b/langchain.gemspec
@@ -53,6 +53,7 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "hnswlib", "~> 0.8.1"
   spec.add_development_dependency "hugging-face", "~> 0.3.4"
   spec.add_development_dependency "milvus", "~> 0.9.2"
+  spec.add_development_dependency "neighbor", "~> 0.3.0"
   spec.add_development_dependency "llama_cpp", "~> 0.3.7"
   spec.add_development_dependency "nokogiri", "~> 1.13"
   spec.add_development_dependency "open-weather-ruby-client", "~> 0.4.0"

diff --git a/lib/generators/langchain/pgvector_generator.rb b/lib/generators/langchain/pgvector_generator.rb
@@ -0,0 +1,91 @@
+require "rails/generators/active_record"
+
+module Langchain
+  module Generators
+    #
+    # Rails generator that wires Pgvector-backed vector search into an existing ActiveRecord model.
+    # It creates two migrations (enable the `vector` extension, add an `embedding` column),
+    # writes `config/initializers/langchain.rb` and injects the `vectorsearch` hooks into the model.
+    #
+    # Usage:
+    #     rails g langchain:pgvector --model=Product --llm=openai
+    #
+    class PgvectorGenerator < Rails::Generators::Base
+      desc "This generator adds Pgvector vectorsearch integration to your ActiveRecord model"
+
+      include ::ActiveRecord::Generators::Migration
+      source_root File.join(__dir__, "templates")
+
+      class_option :model, type: :string, required: true, desc: "ActiveRecord Model to add vectorsearch to", aliases: "-m"
+      class_option :llm, type: :string, required: true, desc: "LLM provider that will be used to generate embeddings and completions"
+
+      # Maps the value accepted by `--llm` to the name of the LLM class written into the initializer
+      LLMS = {
+        "cohere" => "Langchain::LLM::Cohere",
+        "google_palm" => "Langchain::LLM::GooglePalm",
+        "hugging_face" => "Langchain::LLM::HuggingFace",
+        "llama_cpp" => "Langchain::LLM::LlamaCpp",
+        "ollama" => "Langchain::LLM::Ollama",
+        "openai" => "Langchain::LLM::OpenAI",
+        "replicate" => "Langchain::LLM::Replicate"
+      }.freeze
+
+      # Creates the migration that enables the extension and the one that adds the vector column
+      def copy_migration
+        migration_template "enable_vector_extension_template.rb", "db/migrate/enable_vector_extension.rb", migration_version: migration_version
+        migration_template "add_vector_column_template.rb", "db/migrate/add_vector_column_to_#{table_name}.rb", migration_version: migration_version
+      end
+
+      # Renders the `initializer.rb` template to `config/initializers/langchain.rb`
+      def create_initializer_file
+        template "initializer.rb", "config/initializers/langchain.rb"
+      end
+
+      # @return [String] Version suffix for the generated migration superclass, e.g. "[7.0]"
+      def migration_version
+        "[#{::ActiveRecord::VERSION::MAJOR}.#{::ActiveRecord::VERSION::MINOR}]"
+      end
+
+      # Injects the `vectorsearch` macro and the `after_save` callback into the model class
+      def add_to_model
+        # `underscore` (not `downcase`) so multi-word models such as ProductItem resolve to product_item.rb
+        inject_into_class "app/models/#{model_name.underscore}.rb", model_name do
+          "  vectorsearch\n\n  after_save :upsert_to_vectorsearch\n\n"
+        end
+      end
+
+      private
+
+      # @return [String] Name of the model
+      def model_name
+        options["model"]
+      end
+
+      # @return [String] Table name of the model
+      def table_name
+        # Underscore first so ProductItem maps to product_items rather than productitems
+        model_name.underscore.pluralize
+      end
+
+      # @return [String] LLM provider to use
+      def llm
+        options["llm"]
+      end
+
+      # @return [Class] The `Langchain::LLM::*` class matching the `--llm` option
+      # @raise [ArgumentError] when the provider is not one of the LLMS keys
+      def llm_class
+        class_name = LLMS.fetch(llm) do
+          raise ArgumentError, "Unknown LLM provider: #{llm.inspect}. Supported providers: #{LLMS.keys.join(", ")}"
+        end
+
+        Langchain::LLM.const_get(class_name)
+      end
+
+      # @return [Integer] Dimension of the vector to be used
+      def vector_dimension
+        llm_class.default_dimension
+      end
+    end
+  end
+end
diff --git a/lib/generators/langchain/templates/add_vector_column_template.rb.tt b/lib/generators/langchain/templates/add_vector_column_template.rb.tt
@@ -0,0 +1,11 @@
+class <%= migration_class_name %> < ActiveRecord::Migration<%= migration_version %>
+  def change
+    # The column size is the embedding dimension of the LLM configured in the Langchain initializer
+    add_column :<%= table_name %>, :embedding, :vector,
+      limit: Langchain
+        .config
+        .vectorsearch
+        .llm
+        .default_dimension
+  end
+end
diff --git a/lib/generators/langchain/templates/enable_vector_extension_template.rb.tt b/lib/generators/langchain/templates/enable_vector_extension_template.rb.tt
@@ -0,0 +1,6 @@
+class <%= migration_class_name %> < ActiveRecord::Migration<%= migration_version %>
+  # Enables the PostgreSQL "vector" extension
+  def change
+    enable_extension("vector")
+  end
+end
diff --git a/lib/generators/langchain/templates/initializer.rb.tt b/lib/generators/langchain/templates/initializer.rb.tt
@@ -0,0 +1,8 @@
+# frozen_string_literal: true
+
+# Global Langchain configuration; the API key is read from the <%= llm.upcase %>_API_KEY environment variable.
+Langchain.configure do |config|
+  config.vectorsearch = Langchain::Vectorsearch::Pgvector.new(
+    llm: <%= llm_class %>.new(api_key: ENV["<%= llm.upcase %>_API_KEY"])
+  )
+end
diff --git a/lib/langchain.rb b/lib/langchain.rb
@@ -6,6 +6,7 @@
 require "zeitwerk"
 loader = Zeitwerk::Loader.for_gem
 loader.ignore("#{__dir__}/langchainrb.rb")
+loader.ignore("#{__dir__}/generators")
 loader.inflector.inflect(
   "ai21" => "AI21",
   "ai21_response" => "AI21Response",
@@ -71,14 +72,29 @@ class << self
     # @return [ContextualLogger]
     attr_reader :logger
 
+    # @return [Pathname]
+    attr_reader :root
+
     # @param logger [Logger]
     # @return [ContextualLogger]
     def logger=(logger)
       @logger = ContextualLogger.new(logger)
     end
 
-    # @return [Pathname]
-    attr_reader :root
+    # Yields the global configuration object so a block can assign settings to it:
+    #     Langchain.configure do |config|
+    #       config.vectorsearch = Langchain::Vectorsearch::Pgvector.new(
+    #         llm: Langchain::LLM::OpenAI.new(api_key: ENV["OPENAI_API_KEY"])
+    #       )
+    #     end
+    def configure
+      yield config
+    end
+
+    # @return [Config] The global configuration object, created on first access and then reused
+    def config
+      @_config ||= Config.new
+    end
   end
 
   self.logger ||= ::Logger.new($stdout, level: :warn)

diff --git a/lib/langchain/active_record/hooks.rb b/lib/langchain/active_record/hooks.rb
@@ -66,15 +66,21 @@ def upsert_to_vectorsearch
       #
       # @return [String] the text representation of the model
       def as_vector
-        to_json
+        to_json(except: :embedding)
       end
 
       module ClassMethods
         # Set the vector search provider
         #
         # @param provider [Object] The `Langchain::Vectorsearch::*` instance
-        def vectorsearch(provider:)
-          class_variable_set(:@@provider, provider)
+        def vectorsearch
+          provider = Langchain.config.vectorsearch
+          # Pgvector-specific configuration
+          has_neighbors(:embedding) if provider.is_a?(Langchain::Vectorsearch::Pgvector)
+
+          provider.model = self
+          # Store a per-model copy: the global provider's `model` is overwritten by every model calling this
+          class_variable_set(:@@provider, provider.dup)
         end
 
         # Search for similar texts
@@ -88,9 +94,13 @@ def similarity_search(query, k: 1)
             k: k
           )
 
-          # We use "__id" when Weaviate is the provider
-          ids = records.map { |record| record.dig("id") || record.dig("__id") }
-          where(id: ids)
+          if records.is_a?(::ActiveRecord::Relation)
+            records
+          else
+            # We use "__id" when Weaviate is the provider
+            ids = records.map { |record| record.dig("id") || record.dig("__id") }
+            where(id: ids)
+          end
         end
 
         # Ask a question and return the answer

diff --git a/lib/langchain/config.rb b/lib/langchain/config.rb
@@ -0,0 +1,20 @@
+# frozen_string_literal: true
+
+module Langchain
+  # Settings container for the Langchain.rb gem, meant to be filled in from a Rails app's
+  # `config/initializers/langchain.rb` file, for example:
+  #     Langchain.configure do |config|
+  #       config.vectorsearch = Langchain::Vectorsearch::Pgvector.new(
+  #         llm: Langchain::LLM::OpenAI.new(api_key: ENV["OPENAI_API_KEY"])
+  #       )
+  #     end
+  class Config
+    # The vectorsearch provider in use; an empty Hash until one is configured
+    attr_accessor :vectorsearch
+
+    def initialize
+      # Default values for every setting are assigned here
+      self.vectorsearch = {}
+    end
+  end
+end
diff --git a/lib/langchain/vectorsearch/pgvector.rb b/lib/langchain/vectorsearch/pgvector.rb
@@ -6,105 +6,63 @@ class Pgvector < Base
     # The PostgreSQL vector search adapter
     #
     # Gem requirements:
-    #     gem "sequel", "~> 5.68.0"
     #     gem "pgvector", "~> 0.2"
     #
     # Usage:
-    #     pgvector = Langchain::Vectorsearch::Pgvector.new(url:, index_name:, llm:, namespace: nil)
+    #     pgvector = Langchain::Vectorsearch::Pgvector.new(llm:)
     #
 
     # The operators supported by the PostgreSQL vector search adapter
-    OPERATORS = {
-      "cosine_distance" => "cosine",
-      "euclidean_distance" => "euclidean"
-    }
-    DEFAULT_OPERATOR = "cosine_distance"
+    OPERATORS = %w[
+      cosine
+      euclidean
+      inner_product
+    ].freeze
+    DEFAULT_OPERATOR = "cosine"
 
-    attr_reader :db, :operator, :table_name, :namespace_column, :namespace, :documents_table
+    attr_reader :db, :operator, :llm
+    attr_accessor :model
 
     # @param url [String] The URL of the PostgreSQL database
     # @param index_name [String] The name of the table to use for the index
     # @param llm [Object] The LLM client to use
     # @param namespace [String] The namespace to use for the index when inserting/querying
-    def initialize(url:, index_name:, llm:, namespace: nil)
-      depends_on "sequel"
+    def initialize(llm:)
       depends_on "pgvector"
+      depends_on "neighbor"
 
-      @db = Sequel.connect(url)
-
-      @table_name = index_name
-
-      @namespace_column = "namespace"
-      @namespace = namespace
-      @operator = OPERATORS[DEFAULT_OPERATOR]
+      @operator = DEFAULT_OPERATOR
 
       super(llm: llm)
     end
 
-    def documents_model
-      Class.new(Sequel::Model(table_name.to_sym)) do
-        plugin :pgvector, :vectors
-      end
-    end
-
-    # Upsert a list of texts to the index
-    # @param texts [Array<String>] The texts to add to the index
-    # @param ids [Array<Integer>] The ids of the objects to add to the index, in the same order as the texts
-    # @return [PG::Result] The response from the database including the ids of
-    # the added or updated texts.
-    def upsert_texts(texts:, ids:)
-      data = texts.zip(ids).flat_map do |(text, id)|
-        {id: id, content: text, vectors: llm.embed(text: text).embedding.to_s, namespace: namespace}
-      end
-      # @db[table_name.to_sym].multi_insert(data, return: :primary_key)
-      @db[table_name.to_sym]
-        .insert_conflict(
-          target: :id,
-          update: {content: Sequel[:excluded][:content], vectors: Sequel[:excluded][:vectors]}
-        )
-        .multi_insert(data, return: :primary_key)
-    end
-
     # Add a list of texts to the index
     # @param texts [Array<String>] The texts to add to the index
     # @param ids [Array<String>] The ids to add to the index, in the same order as the texts
     # @return [Array<Integer>] The the ids of the added texts.
-    def add_texts(texts:, ids: nil)
-      if ids.nil? || ids.empty?
-        data = texts.map do |text|
-          {content: text, vectors: llm.embed(text: text).embedding.to_s, namespace: namespace}
-        end
-
-        @db[table_name.to_sym].multi_insert(data, return: :primary_key)
-      else
-        upsert_texts(texts: texts, ids: ids)
+    def add_texts(texts:, ids:)
+      embeddings = texts.map do |text|
+        llm.embed(text: text).embedding
+      end
+      # Touch only the requested rows; `find(ids)` returns them in `ids` order, matching `texts`
+      model.find(ids).each_with_index do |record, i|
+        record.update_column(:embedding, embeddings[i])
       end
     end
 
-    # Update a list of ids and corresponding texts to the index
-    # @param texts [Array<String>] The texts to add to the index
-    # @param ids [Array<String>] The ids to add to the index, in the same order as the texts
-    # @return [Array<Integer>] The ids of the updated texts.
     def update_texts(texts:, ids:)
-      upsert_texts(texts: texts, ids: ids)
+      add_texts(texts: texts, ids: ids)
     end
 
-    # Create default schema
+    # Invoke a rake task that will create an initializer (`config/initializers/langchain.rb`) file
+    # and db/migrations/* files
     def create_default_schema
-      db.run "CREATE EXTENSION IF NOT EXISTS vector"
-      namespace_column = @namespace_column
-      vector_dimension = llm.default_dimension
-      db.create_table? table_name.to_sym do
-        primary_key :id
-        text :content
-        column :vectors, "vector(#{vector_dimension})"
-        text namespace_column.to_sym, default: nil
-      end
+      Rake::Task["pgvector"].invoke
     end
 
     # Destroy default schema
     def destroy_default_schema
-      db.drop_table? table_name.to_sym
+      # Tell the user to rollback the migration
     end
 
     # Search for similar texts in the index
@@ -126,11 +84,9 @@ def similarity_search(query:, k: 4)
     # @param k [Integer] The number of top results to return
     # @return [Array<Hash>] The results of the search
     def similarity_search_by_vector(embedding:, k: 4)
-      db.transaction do # BEGIN
-        documents_model
-          .nearest_neighbors(:vectors, embedding, distance: operator).limit(k)
-          .where(namespace_column.to_sym => namespace)
-      end
+      model
+        .nearest_neighbors(:embedding, embedding, distance: operator)
+        .limit(k)
     end
 
     # Ask a question and return the answer
@@ -142,7 +98,7 @@ def ask(question:, k: 4, &block)
       search_results = similarity_search(query: question, k: k)
 
       context = search_results.map do |result|
-        result.content.to_s
+        result.as_vector
       end
       context = context.join("\n---\n")