Qdrant vs implementation by AmoghTantradi · Pull Request #111 · lotus-data/lotus · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Qdrant vs implementation #111

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 65 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
65 commits
Select commit Hold shift + click to select a range
e3abd90
initial scaffolding for adding vector store / vector database integra…
AmoghTantradi Jan 12, 2025
bd1e8fd
fixed linting, ruff checks pass
AmoghTantradi Jan 12, 2025
880c31f
added changes to requirements.txt file and added additional abstract …
AmoghTantradi Jan 12, 2025
7b5dfd3
refactored
AmoghTantradi Jan 12, 2025
08dfaba
added tests for clustering and filtering
AmoghTantradi Jan 13, 2025
f3a82c1
made edits to test_filter
AmoghTantradi Jan 13, 2025
fc62846
added implementations for weaviate and pinecone vs
AmoghTantradi Jan 14, 2025
3e89b5f
fixed merge conflicts
AmoghTantradi Jan 14, 2025
f2937ad
added extra refactoring and added implementations for qdrant and chro…
AmoghTantradi Jan 14, 2025
a4c7418
fixed some type errors
AmoghTantradi Jan 14, 2025
1357fb3
made further corrections
AmoghTantradi Jan 15, 2025
c76b658
edit uuid type
AmoghTantradi Jan 15, 2025
9f257f7
changed uuid type
AmoghTantradi Jan 15, 2025
99cb535
made type changes to weaviate file
AmoghTantradi Jan 15, 2025
3c8a742
made another change
AmoghTantradi Jan 15, 2025
ccd9e48
typecheck passes for weaviate?
AmoghTantradi Jan 15, 2025
89bf974
type changes for weaviate and qdrant files
AmoghTantradi Jan 16, 2025
a76adb7
made changes to weaviate file
AmoghTantradi Jan 16, 2025
c3e0f0c
made changes to weaviate file
AmoghTantradi Jan 16, 2025
1782281
fixed pinecone type errors
AmoghTantradi Jan 16, 2025
0621b9b
fixed pinecone type errors
AmoghTantradi Jan 16, 2025
b568d1e
type checks all pass locally
AmoghTantradi Jan 16, 2025
9b33a1f
fixed linting errors
AmoghTantradi Jan 16, 2025
820f3be
made refactors to allow for testing
AmoghTantradi Jan 17, 2025
a0a70d2
made changes to tests
AmoghTantradi Jan 22, 2025
6dbd1db
fixed
AmoghTantradi Jan 22, 2025
bea1d19
changed setattr to getattr
AmoghTantradi Jan 22, 2025
f93f7ed
fixed a test
AmoghTantradi Jan 22, 2025
38ff87d
over
AmoghTantradi Jan 25, 2025
c885dbc
another change
AmoghTantradi Jan 25, 2025
8eefac0
fixed type check errors
AmoghTantradi Jan 25, 2025
23bafa5
second refactor (removed index_dir)
AmoghTantradi Jan 27, 2025
75d11ea
fixed type checks
AmoghTantradi Jan 27, 2025
0b0bf38
fixed retriever module errors
AmoghTantradi Jan 27, 2025
6bf7926
fixed key error
AmoghTantradi Jan 27, 2025
f7071a2
added fixes to failing rm tests
AmoghTantradi Jan 28, 2025
6ebe407
fixed chroma
AmoghTantradi Jan 28, 2025
e588bee
removed dynamic indexing for weaviatevs
AmoghTantradi Jan 28, 2025
d6a86e1
fixed type errors
AmoghTantradi Jan 28, 2025
ddfd549
changed weaviate index config
AmoghTantradi Jan 28, 2025
20206e1
changed rm tests index name to avoid pinecone failures
AmoghTantradi Jan 28, 2025
e7ea24f
fixed naming convention for index_dir and fixed serverless spec for p…
AmoghTantradi Jan 28, 2025
f152b54
changed serverless spec for pc index due to free plan
AmoghTantradi Jan 29, 2025
2e21a97
added debug statement
AmoghTantradi Jan 29, 2025
524b501
made changes to errors
AmoghTantradi Jan 29, 2025
87f57e1
Merge branch 'main' of github.com:guestrin-lab/lotus into at/vs_imple…
AmoghTantradi Jan 29, 2025
e995996
added some fixes to collection upload error handling
AmoghTantradi Jan 29, 2025
1a75486
made some other change
AmoghTantradi Jan 29, 2025
c5f50f6
fixed type errors for qdrant vs
AmoghTantradi Feb 9, 2025
85daf51
changed endpoint
AmoghTantradi Feb 9, 2025
6b80fd3
added changes
AmoghTantradi Feb 9, 2025
4bafdb7
Merge branch 'main' of github.com:guestrin-lab/lotus into at/vs_imple…
AmoghTantradi Feb 9, 2025
f90ff0f
added fixes
AmoghTantradi Feb 9, 2025
cccfa39
added some changes
AmoghTantradi Feb 9, 2025
6cf4f0a
added some change
AmoghTantradi Feb 9, 2025
0438b18
another set of changes
AmoghTantradi Feb 9, 2025
43e9bc3
added other logs
AmoghTantradi Feb 9, 2025
90d07d0
added logging
AmoghTantradi Feb 9, 2025
9c371a0
pr for qdrant
AmoghTantradi Feb 14, 2025
154c2be
removed redundant imports
AmoghTantradi Feb 14, 2025
bdd2463
removed pinecone reference
AmoghTantradi Feb 14, 2025
0998e24
resolved merge conflicts with qdrant
AmoghTantradi Feb 16, 2025
9b1928c
added type check and integration for qdrant vs
AmoghTantradi Feb 16, 2025
3cc4537
added another test to qdrant
AmoghTantradi Feb 16, 2025
08a309f
fixed imports
AmoghTantradi Feb 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 5 additions & 7 deletions .github/tests/rm_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import lotus
from lotus.models import CrossEncoderReranker, LiteLLMRM, SentenceTransformersRM
from lotus.vector_store import FaissVS
from lotus.vector_store import FaissVS, QdrantVS

################################################################################
# Setup
Expand Down Expand Up @@ -33,6 +33,7 @@

VECTOR_STORE_TO_CLS = {
'local': FaissVS,
'qdrant': QdrantVS
}


Expand Down Expand Up @@ -242,10 +243,6 @@ def test_vs_sim_join(setup_models, setup_vs, vs, model):
expected_pairs = {("History of the Atlantic World", "History"), ("Riemannian Geometry", "Math")}
assert joined_pairs == expected_pairs, joined_pairs






# TODO: threshold is hardcoded for intfloat/e5-small-v2
@pytest.mark.skipif(
Expand Down Expand Up @@ -320,8 +317,9 @@ def test_search(setup_models):
df = df.sem_search("Course Name", "Optimization", K=2, n_rerank=1)
assert df["Course Name"].tolist() == ["Optimization Methods in Engineering"]

@pytest.mark.parametrize("vs", VECTOR_STORE_TO_CLS.keys())
@pytest.mark.parametrize("model", get_enabled("intfloat/e5-small-v2", "text-embedding-3-small"))
def test_filtered_vector_search(setup_models, model):
def test_filtered_vector_search(setup_models, setup_vs, vs, model):
"""
Test filtered vector search.

Expand All @@ -336,7 +334,7 @@ def test_filtered_vector_search(setup_models, model):
expected to pick out the culinary course "Gourmet Cooking Advanced".
"""
rm = setup_models[model]
vs = FaissVS()
vs = setup_vs[vs]
lotus.settings.configure(rm=rm, vs=vs)

data = {
Expand Down
2 changes: 1 addition & 1 deletion lotus/models/rm.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,4 @@ def convert_query_to_query_vector(self, queries: Union[pd.Series, str, Image.Ima
queries = queries.tolist()
# Create embeddings for text queries
query_vectors = self._embed(queries)
return query_vectors
return query_vectors
1 change: 0 additions & 1 deletion lotus/sem_ops/sem_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ def __call__(

df_idxs = self._obj.index
cur_min = len(df_idxs)

K = min(K, cur_min)

search_K = K
Expand Down
3 changes: 2 additions & 1 deletion lotus/vector_store/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from lotus.vector_store.vs import VS
from lotus.vector_store.faiss_vs import FaissVS
from lotus.vector_store.qdrant_vs import QdrantVS

__all__ = ["VS", "FaissVS"]
__all__ = ["VS", "FaissVS", "QdrantVS"]
174 changes: 174 additions & 0 deletions lotus/vector_store/qdrant_vs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
from typing import Any, Optional

import numpy as np
import pandas as pd
from numpy.typing import NDArray
from tqdm import tqdm

from lotus.types import RMOutput
from lotus.vector_store.vs import VS

try:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, FieldCondition, Filter, MatchValue, PointStruct, VectorParams
except ImportError as err:
raise ImportError("Please install the qdrant client") from err

class QdrantVS(VS):
    """Vector store that keeps documents and their embeddings in a Qdrant collection.

    The collection name plays the role of ``index_dir`` used by the ``VS``
    interface. Every point is stored with its positional index as both the
    point id and the ``doc_id`` payload field, which is what filtered search
    matches against.
    """

    def __init__(
        self,
        max_batch_size: int = 64,
        url: Optional[str] = None,
        api_key: Optional[str] = None,
    ):
        """Connect to Qdrant and initialise the base vector-store state.

        Args:
            max_batch_size: Number of points sent per upsert request.
            url: Qdrant endpoint. Defaults to the ``QDRANT_URL`` environment
                variable; if that is unset too, ``None`` is handed to
                ``QdrantClient`` and its own default endpoint applies
                (presumably localhost -- confirm against the client docs).
            api_key: Qdrant API key. Defaults to the ``QDRANT_API_KEY``
                environment variable.

        Raises:
            ValueError: If ``max_batch_size`` is smaller than 1.
        """
        import os

        if max_batch_size < 1:
            raise ValueError(f"max_batch_size must be >= 1, got {max_batch_size}")

        # Review feedback ("remove hardcode and remove api key"): the endpoint
        # and the secret are supplied by the caller or the environment and are
        # never embedded in source.
        if url is None:
            url = os.environ.get("QDRANT_URL")
        if api_key is None:
            api_key = os.environ.get("QDRANT_API_KEY")

        # A bare ``super()`` only builds a proxy object; the base initialiser
        # has to be called explicitly so that ``self.index_dir`` exists.
        super().__init__()
        self.client: QdrantClient = QdrantClient(url=url, api_key=api_key)
        # Assigned after the base initialiser, which sets its own default.
        self.max_batch_size = max_batch_size

    def __del__(self):
        """Close the client connection, if one was ever opened."""
        # ``__init__`` may have raised before ``self.client`` was assigned.
        client = getattr(self, "client", None)
        if client is not None:
            client.close()

    def _ensure_collection(self, name: str, dimension: int) -> None:
        """Make sure collection ``name`` exists and stores ``dimension``-sized vectors."""
        if not self.client.collection_exists(name):
            self.client.create_collection(
                collection_name=name,
                vectors_config=VectorParams(size=dimension, distance=Distance.COSINE),
            )
            return

        info = self.client.get_collection(name)
        if info is None or info.config is None or not info.config.params:
            return
        vectors = info.config.params.vectors
        if not vectors:
            return

        if isinstance(vectors, dict):
            # Named-vector collection: compare against the first configured vector.
            size = next(iter(vectors.values())).size
        elif isinstance(vectors, VectorParams):
            size = vectors.size
        else:
            size = None

        if size != dimension:
            # The stored vectors cannot hold the new embeddings; rebuild the collection.
            self.client.delete_collection(name)
            self.client.create_collection(
                collection_name=name,
                vectors_config=VectorParams(size=dimension, distance=Distance.COSINE),
            )

    def index(self, docs: pd.Series, embeddings, index_dir: str, **kwargs: dict[str, Any]):
        """Create (or reuse) a collection and upload documents with their embeddings.

        Args:
            docs: Documents to store; a pandas Series is converted to a list.
            embeddings: One embedding per document, in the same order.
            index_dir: Name of the Qdrant collection.
            **kwargs: Accepted for interface compatibility; unused here.

        Raises:
            ValueError: If ``embeddings`` is empty.
        """
        if len(embeddings) == 0:
            raise ValueError("Cannot index an empty set of embeddings")

        self.index_dir = index_dir

        # Flatten each embedding to one row to read off the vector dimension.
        dimension = np.reshape(embeddings, (len(embeddings), -1)).shape[1]
        self._ensure_collection(index_dir, dimension)

        docs_list = docs.tolist() if isinstance(docs, pd.Series) else docs

        points = [
            PointStruct(
                id=idx,
                vector=np.asarray(embedding).tolist(),
                payload={"content": doc, "doc_id": idx},
            )
            for idx, (doc, embedding) in enumerate(zip(docs_list, embeddings))
        ]

        # Review feedback ("remove hard code"): the upload batch size follows
        # the configured ``max_batch_size`` rather than a literal.
        batch_size = max(1, self.max_batch_size)
        for start in tqdm(range(0, len(points), batch_size), desc="Uploading to Qdrant"):
            self.client.upsert(
                collection_name=index_dir,
                points=points[start:start + batch_size],
            )

    def load_index(self, index_dir: str):
        """Select an existing collection for subsequent searches.

        Raises:
            ValueError: If the collection does not exist.
        """
        if not self.client.collection_exists(index_dir):
            raise ValueError(f"Collection {index_dir} not found")
        self.index_dir = index_dir

    def __call__(
        self,
        query_vectors,
        K: int,
        ids: Optional[list[int]] = None,
        **kwargs: dict[str, Any]
    ) -> RMOutput:
        """Run a top-``K`` vector search for every query vector.

        Args:
            query_vectors: Iterable of query embeddings.
            K: Number of neighbours to return per query.
            ids: Optional ``doc_id`` values to restrict the search to.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            ``RMOutput`` whose ``distances`` and ``indices`` hold one list of
            length ``K`` per query. Rows with fewer than ``K`` hits are padded
            with index ``-1`` and score ``0.0``.

        Raises:
            ValueError: If no collection has been selected yet.
        """
        if self.index_dir is None:
            raise ValueError("No collection loaded. Call load_index first.")

        # The filter does not depend on the query, so build it once.
        query_filter = None
        if ids is not None:
            query_filter = Filter(
                should=[FieldCondition(key="doc_id", match=MatchValue(value=doc_id)) for doc_id in ids]
            )

        all_distances = []
        all_indices = []

        for query_vector in query_vectors:
            results = self.client.search(
                collection_name=self.index_dir,
                query_vector=np.asarray(query_vector).tolist(),
                limit=K,
                with_payload=True,
                query_filter=query_filter,
            )

            indices = [result.id for result in results]
            # The score Qdrant returns is stored as-is in the distance slot.
            distances = [result.score for result in results]

            # Pad so that every row has exactly K entries.
            shortfall = K - len(indices)
            if shortfall > 0:
                indices.extend([-1] * shortfall)
                distances.extend([0.0] * shortfall)

            all_distances.append(distances)
            all_indices.append(indices)

        return RMOutput(
            distances=np.array(all_distances, dtype=np.float32).tolist(),
            indices=np.array(all_indices, dtype=np.int64).tolist(),
        )

    def get_vectors_from_index(self, index_dir: str, ids: list[int]) -> NDArray[np.float64]:
        """Fetch the stored vectors for ``ids``, one row per requested id, in order.

        Raises:
            ValueError: If the collection does not exist, or if any requested
                id has no stored vector.
        """
        if self.index_dir != index_dir:
            self.load_index(index_dir)

        points = self.client.retrieve(
            collection_name=index_dir,
            ids=ids,
            with_vectors=True,
            with_payload=False,
        )

        # Key the results by id instead of relying on the response order, so
        # that row i of the output always belongs to ids[i].
        vectors_by_id = {}
        for point in points:
            if point.vector is None:
                raise ValueError(f"Vector not found for id {point.id}")
            vectors_by_id[point.id] = point.vector

        for requested_id in ids:
            if requested_id not in vectors_by_id:
                raise ValueError(f"Vector not found for id {requested_id}")

        return np.array([vectors_by_id[requested_id] for requested_id in ids], dtype=np.float64)
6 changes: 3 additions & 3 deletions lotus/vector_store/vs.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class VS(ABC):

def __init__(self) -> None:
self.index_dir: str | None = None
self.max_batch_size: int = 64
self.max_batch_size:int = 64

@abstractmethod
def index(self, docs, embeddings: Any, index_dir: str, **kwargs: dict[str, Any]):
Expand All @@ -33,7 +33,7 @@ def __call__(
self,
query_vectors: Any,
K: int,
ids: Optional[list[Any]] = None,
ids: Optional[list[int]] = None,
**kwargs: dict[str, Any],
) -> RMOutput:
"""
Expand All @@ -52,7 +52,7 @@ def __call__(
pass

@abstractmethod
def get_vectors_from_index(self, index_dir: str, ids: list[Any]) -> NDArray[np.float64]:
def get_vectors_from_index(self, index_dir: str, ids: list[int]) -> NDArray[np.float64]:
"""
Retrieve vectors from a stored index given specific ids.
"""
Expand Down
0