Add knn test cases for tensor and sparse vector by Ami11111 · Pull Request #1536 · infiniflow/infinity · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Add knn test cases for tensor and sparse vector #1536

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Jul 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 60 additions & 1 deletion python/test/cases/test_knn.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,4 +343,63 @@ def test_tensor_scan_with_invalid_method_type(self, check_data, method_type):
@pytest.mark.parametrize("check_data", [{"file_name": "tensor_maxsim.csv",
"data_dir": common_values.TEST_TMP_DIR}], indirect=True)
def test_tensor_scan_with_invalid_extra_option(self, check_data, extra_option):
self.test_infinity_obj._test_tensor_scan_with_invalid_extra_option(check_data, extra_option)
self.test_infinity_obj._test_tensor_scan_with_invalid_extra_option(check_data, extra_option)

@pytest.mark.skip(
    reason="UnrecoverableException The tensor column basic embedding dimension should be greater than 0"
)
def test_zero_dimension_tensor_scan(self):
    """Delegate to the shared zero-dimension tensor scan scenario (currently skipped)."""
    scenario = self.test_infinity_obj._test_zero_dimension_tensor_scan
    scenario()

@pytest.mark.parametrize("dim", [1, 10, 100])  # 1^3, 10^3, 100^3
def test_big_dimension_tensor_scan(self, dim):
    """Run the shared big-dimension tensor scan scenario once per ``dim`` value."""
    scenario = self.test_infinity_obj._test_big_dimension_tensor_scan
    scenario(dim)

@pytest.mark.parametrize(
    "table_params",
    [
        "vector,100,float,int8",
        "sparse,0,float,int8",
        "sparse,100,int,int8",
        "sparse,100,float,float",
        "int8,float,100,sparse",  # disorder
    ],
)
@pytest.mark.parametrize(
    "check_data",
    [{"file_name": "sparse_knn.csv", "data_dir": common_values.TEST_TMP_DIR}],
    indirect=True,
)
def test_sparse_with_invalid_table_params(self, check_data, table_params):
    """Run the shared invalid-sparse-column-type scenario for each ``table_params`` string."""
    scenario = self.test_infinity_obj._test_sparse_with_invalid_table_params
    scenario(check_data, table_params)

@pytest.mark.parametrize(
    "index_type",
    [
        index.IndexType.IVFFlat,
        index.IndexType.Hnsw,
        index.IndexType.EMVB,
        index.IndexType.FullText,
        index.IndexType.Secondary,
    ],
)
@pytest.mark.parametrize(
    "check_data",
    [{"file_name": "sparse_knn.csv", "data_dir": common_values.TEST_TMP_DIR}],
    indirect=True,
)
def test_sparse_knn_with_invalid_index_type(self, check_data, index_type):
    """Run the shared scenario that builds each listed index type on a sparse column."""
    scenario = self.test_infinity_obj._test_sparse_knn_with_invalid_index_type
    scenario(check_data, index_type)

@pytest.mark.parametrize(
    "index_params",
    [
        ["0", "compress"],
        ["257", "compress"],
        ["16", "invalid compress type"],
    ],
)
@pytest.mark.parametrize(
    "check_data",
    [{"file_name": "sparse_knn.csv", "data_dir": common_values.TEST_TMP_DIR}],
    indirect=True,
)
def test_sparse_knn_with_invalid_index_params(self, check_data, index_params):
    """Run the shared invalid-BMP-index-parameter scenario for each parameter pair."""
    scenario = self.test_infinity_obj._test_sparse_knn_with_invalid_index_params
    scenario(check_data, index_params)


@pytest.mark.skip(reason="invalid alpha and beta do not raise exception")
@pytest.mark.parametrize("alpha", ["-1.0", "2.0"])
@pytest.mark.parametrize("beta", ["-1.0", "2.0"])
@pytest.mark.parametrize(
    "check_data",
    [{"file_name": "sparse_knn.csv", "data_dir": common_values.TEST_TMP_DIR}],
    indirect=True,
)
def test_sparse_knn_with_invalid_alpha_beta(self, check_data, alpha, beta):
    """Run the shared invalid alpha/beta sparse-search scenario (currently skipped)."""
    scenario = self.test_infinity_obj._test_sparse_knn_with_invalid_alpha_beta
    scenario(check_data, alpha, beta)

@pytest.mark.skip(reason="UnrecoverableException Sparse data size mismatch")
@pytest.mark.parametrize(
    "check_data",
    [{"file_name": "sparse_knn.csv", "data_dir": common_values.TEST_TMP_DIR}],
    indirect=True,
)
def test_sparse_knn_with_indices_values_mismatch(self, check_data):
    """Run the shared indices/values length-mismatch scenario (currently skipped)."""
    scenario = self.test_infinity_obj._test_sparse_knn_with_indices_values_mismatch
    scenario(check_data)

@pytest.mark.parametrize("distance_type", ["l2", "cosine", "hamming"])
@pytest.mark.parametrize(
    "check_data",
    [{"file_name": "sparse_knn.csv", "data_dir": common_values.TEST_TMP_DIR}],
    indirect=True,
)
def test_sparse_knn_with_invalid_distance_type(self, check_data, distance_type):
    """Run the shared sparse-search scenario with each listed distance type."""
    scenario = self.test_infinity_obj._test_sparse_knn_with_invalid_distance_type
    scenario(check_data, distance_type)
237 changes: 237 additions & 0 deletions python/test/internal/test_knn.py
Original file line number Diff line number Diff line change
Expand Up @@ -1092,4 +1092,241 @@ def _test_tensor_scan_with_invalid_extra_option(self, check_data, extra_option):
.to_pl())

res = db_obj.drop_table("test_tensor_scan", ConflictType.Error)
assert res.error_code == ErrorCode.OK

def _test_zero_dimension_tensor_scan(self):
    """Exercise insert and tensor search on a table whose tensor column has dimension 0.

    The insert of empty sub-tensors is expected to raise ``IndexError`` and
    the ``match_tensor`` query is expected to raise some exception; the table
    is dropped afterwards and the drop must report ``ErrorCode.OK``.
    """
    table_name = "test_tensor_scan"
    database = self.infinity_obj.get_database("default_db")
    # Start from a clean slate in case an earlier run left the table behind.
    database.drop_table(table_name, ConflictType.Ignore)
    tensor_table = database.create_table(table_name, {"t": {"type": "tensor, 0, float"}})

    with pytest.raises(IndexError):
        tensor_table.insert([{"t": [[], []]}])

    with pytest.raises(Exception):
        (tensor_table
         .output(["*", "_row_id", "_score"])
         .match_tensor('t', [[], []], 'float', 'maxsim', 'topn=2')
         .to_pl())

    drop_result = database.drop_table(table_name, ConflictType.Error)
    assert drop_result.error_code == ErrorCode.OK

def _test_big_dimension_tensor_scan(self, dim):
    """Insert five ``dim`` x ``dim`` x ``dim`` tensor arrays and run a maxsim search.

    The column type is ``tensorarray, {dim}, float``. Rows are filled with
    the constants 1.0 through 5.0 and the query is filled with 0.0. The
    result is only printed; the sole assertion is that dropping the table
    reports ``ErrorCode.OK``.
    """
    table_name = "test_tensor_scan"

    def filled_cube(value):
        # Same nested-repeat construction for rows and query.
        return [[[value] * dim] * dim] * dim

    database = self.infinity_obj.get_database("default_db")
    database.drop_table(table_name, ConflictType.Ignore)
    tensor_table = database.create_table(
        table_name,
        {"t": {"type": f"tensorarray, {dim}, float"}},
    )

    rows = [{"t": filled_cube(value)} for value in (1.0, 2.0, 3.0, 4.0, 5.0)]
    tensor_table.insert(rows)

    res = (tensor_table
           .output(["*", "_row_id", "_score"])
           .match_tensor('t', filled_cube(0.0), 'float', 'maxsim', 'topn=5')
           .to_pl())
    print(res)

    drop_result = database.drop_table(table_name, ConflictType.Error)
    assert drop_result.error_code == ErrorCode.OK

def _test_sparse_with_invalid_table_params(self, check_data, table_params):
    """Check which stage rejects an invalid column type string for column ``c2``.

    ``table_params`` is a comma-separated type string used verbatim as the
    type of ``c2``. Depending on its fields the failure is expected at a
    different stage:

    * first field ``int8``: ``create_table`` raises ``INVALID_DATA_TYPE``;
    * first field ``vector``: table creation and import succeed,
      ``match_sparse`` raises ``SYNTAX_ERROR``;
    * second field ``0``: table creation and import succeed,
      ``match_sparse`` raises ``DATA_TYPE_MISMATCH``;
    * third field ``int``: table creation succeeds, ``import_data`` raises
      ``PARSER_ERROR``;
    * fourth field ``float``: ``create_table`` raises
      ``INVALID_EMBEDDING_DATA_TYPE``.

    Every branch that actually creates the table also drops it, so no table
    is left behind for later tests.

    :param check_data: fixture result; when falsy the CSV is copied first.
    :param table_params: type string for column ``c2``.
    """
    table_name = "test_sparse_scan"
    db_obj = self.infinity_obj.get_database("default_db")
    db_obj.drop_table(table_name, ConflictType.Ignore)
    params = table_params.split(",")
    if not check_data:
        copy_data("sparse_knn.csv")
    test_csv_dir = common_values.TEST_TMP_DIR + "sparse_knn.csv"
    columns = {"c1": {"type": "int"}, "c2": {"type": table_params}}

    if params[0] == "int8":
        with pytest.raises(InfinityException) as e:
            db_obj.create_table(table_name, columns, ConflictType.Error)
        assert e.value.args[0] == ErrorCode.INVALID_DATA_TYPE
    elif params[0] == "vector" or params[1] == "0":
        # Both cases get as far as the query; only the expected code differs.
        if params[0] == "vector":
            expected_code = ErrorCode.SYNTAX_ERROR
        else:
            expected_code = ErrorCode.DATA_TYPE_MISMATCH

        table_obj = db_obj.create_table(table_name, columns, ConflictType.Error)
        table_obj.import_data(test_csv_dir, import_options={"delimiter": ","})

        with pytest.raises(InfinityException) as e:
            (table_obj.output(["*", "_row_id", "_similarity"])
             .match_sparse("c2", {"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}, "ip", 3)
             .to_pl())
        assert e.value.args[0] == expected_code

        res = db_obj.drop_table(table_name, ConflictType.Error)
        assert res.error_code == ErrorCode.OK
    elif params[2] == "int":
        table_obj = db_obj.create_table(table_name, columns, ConflictType.Error)
        with pytest.raises(InfinityException) as e:
            table_obj.import_data(test_csv_dir, import_options={"delimiter": ","})
        assert e.value.args[0] == ErrorCode.PARSER_ERROR

        # The table was created successfully above; remove it so it does not
        # leak into other tests that reuse this table name.
        db_obj.drop_table(table_name, ConflictType.Ignore)
    elif params[3] == "float":
        with pytest.raises(InfinityException) as e:
            db_obj.create_table(table_name, columns, ConflictType.Error)
        assert e.value.args[0] == ErrorCode.INVALID_EMBEDDING_DATA_TYPE

def _test_sparse_knn_with_invalid_index_type(self, check_data, index_type):
    """Expect index creation on a sparse column to fail for the given index type.

    A table with sparse column ``c2`` is created and loaded from
    ``sparse_knn.csv``; creating ``my_index`` of ``index_type`` on ``c2`` must
    raise ``InfinityException`` carrying ``INVALID_INDEX_DEFINITION``.

    :param check_data: fixture result; when falsy the CSV is copied first.
    :param index_type: one of the ``index.IndexType`` members handled below.
    """
    table_name = "test_sparse_knn_with_index"
    database = self.infinity_obj.get_database("default_db")
    database.drop_table(table_name, ConflictType.Ignore)
    sparse_table = database.create_table(
        table_name,
        {"c1": {"type": "int"}, "c2": {"type": "sparse,100,float,int8"}},
        ConflictType.Error,
    )
    if not check_data:
        copy_data("sparse_knn.csv")
    csv_path = common_values.TEST_TMP_DIR + "sparse_knn.csv"
    sparse_table.import_data(csv_path, import_options={"delimiter": ","})

    # (name, value) pairs of the init parameters used for each index type.
    init_args_by_type = {
        index.IndexType.IVFFlat: (("centroids_count", "128"), ("metric", "L2")),
        index.IndexType.Hnsw: (
            ("M", "16"),
            ("ef_construction", "50"),
            ("ef", "50"),
            ("metric", "L2"),
        ),
        index.IndexType.EMVB: (("pq_subspace_num", "32"), ("pq_subspace_bits", "8")),
        index.IndexType.FullText: (('ANALYZER', 'STANDARD'),),
        index.IndexType.Secondary: (),
    }

    with pytest.raises(InfinityException) as e:
        init_args = init_args_by_type.get(index_type)
        # An index type without an entry issues no call, so the enclosing
        # ``raises`` block then fails the test.
        if init_args is not None:
            init_params = [index.InitParameter(name, value) for name, value in init_args]
            sparse_table.create_index(
                "my_index",
                [index.IndexInfo("c2", index_type, init_params)],
                ConflictType.Error,
            )
    assert e.value.args[0] == ErrorCode.INVALID_INDEX_DEFINITION

    drop_result = database.drop_table(table_name, ConflictType.Error)
    assert drop_result.error_code == ErrorCode.OK

def _test_sparse_knn_with_invalid_index_params(self, check_data, index_params):
    """Expect BMP index creation to reject invalid ``block_size`` / ``compress_type``.

    A table with sparse column ``c2`` is created and loaded from
    ``sparse_knn.csv``; creating a BMP index with the given parameters must
    raise ``InfinityException`` carrying ``INVALID_INDEX_PARAM``.

    :param check_data: fixture result; when falsy the CSV is copied first.
    :param index_params: two-item sequence ``[block_size, compress_type]``.
    """
    db_obj = self.infinity_obj.get_database("default_db")
    db_obj.drop_table("test_sparse_knn_with_index", ConflictType.Ignore)
    table_obj = db_obj.create_table(
        "test_sparse_knn_with_index",
        {"c1": {"type": "int"}, "c2": {"type": "sparse,100,float,int8"}},
        ConflictType.Error,
    )
    if not check_data:
        copy_data("sparse_knn.csv")
    # Same data file as the sibling sparse tests (the file copied just above).
    test_csv_dir = common_values.TEST_TMP_DIR + "sparse_knn.csv"
    table_obj.import_data(test_csv_dir, import_options={"delimiter": ","})

    with pytest.raises(InfinityException) as e:
        table_obj.create_index(
            "idx1",
            [index.IndexInfo(
                "c2",
                index.IndexType.BMP,
                [
                    index.InitParameter("block_size", index_params[0]),
                    index.InitParameter("compress_type", index_params[1]),
                ],
            )],
            ConflictType.Error,
        )
    assert e.value.args[0] == ErrorCode.INVALID_INDEX_PARAM

    res = db_obj.drop_table("test_sparse_knn_with_index", ConflictType.Error)
    assert res.error_code == ErrorCode.OK

def _test_sparse_knn_with_invalid_alpha_beta(self, check_data, alpha, beta):
    """Expect a sparse search with the given ``alpha``/``beta`` options to raise.

    Builds a BMP index on sparse column ``c2``, optimizes it with
    ``topk=3``, then runs ``match_sparse`` with the supplied options inside
    ``pytest.raises(InfinityException)``. The index and table are dropped
    afterwards and both drops must report ``ErrorCode.OK``.

    :param check_data: fixture result; when falsy the CSV is copied first.
    :param alpha: value passed as the ``alpha`` search option.
    :param beta: value passed as the ``beta`` search option.
    """
    table_name = "test_sparse_knn_with_index"
    database = self.infinity_obj.get_database("default_db")
    database.drop_table(table_name, ConflictType.Ignore)
    sparse_table = database.create_table(
        table_name,
        {"c1": {"type": "int"}, "c2": {"type": "sparse,100,float,int8"}},
        ConflictType.Error,
    )
    if not check_data:
        copy_data("sparse_knn.csv")
    csv_path = common_values.TEST_TMP_DIR + "sparse_knn.csv"
    sparse_table.import_data(csv_path, import_options={"delimiter": ","})

    bmp_params = [
        index.InitParameter("block_size", "8"),
        index.InitParameter("compress_type", "compress"),
    ]
    sparse_table.create_index(
        "idx1",
        [index.IndexInfo("c2", index.IndexType.BMP, bmp_params)],
        ConflictType.Error,
    )

    sparse_table.optimize("idx1", {"topk": "3"})

    with pytest.raises(InfinityException) as e:
        (sparse_table
         .output(["*", "_row_id", "_similarity"])
         .match_sparse("c2", {"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}, "ip", 3,
                       {"alpha": alpha, "beta": beta})
         .to_pl())

    drop_index_result = sparse_table.drop_index("idx1", ConflictType.Error)
    assert drop_index_result.error_code == ErrorCode.OK

    drop_table_result = database.drop_table(table_name, ConflictType.Error)
    assert drop_table_result.error_code == ErrorCode.OK

def _test_sparse_knn_with_indices_values_mismatch(self, check_data):
    """Run a sparse search whose query has two indices but three values.

    The result is only printed; no exception is expected by this code. The
    sole assertion is that dropping the table reports ``ErrorCode.OK``.

    :param check_data: fixture result; when falsy the CSV is copied first.
    """
    table_name = "test_sparse_knn_with_index"
    database = self.infinity_obj.get_database("default_db")
    database.drop_table(table_name, ConflictType.Ignore)
    sparse_table = database.create_table(
        table_name,
        {"c1": {"type": "int"}, "c2": {"type": "sparse,100,float,int8"}},
        ConflictType.Error,
    )
    if not check_data:
        copy_data("sparse_knn.csv")
    csv_path = common_values.TEST_TMP_DIR + "sparse_knn.csv"
    sparse_table.import_data(csv_path, import_options={"delimiter": ","})

    # Deliberately mismatched lengths: 2 indices versus 3 values.
    mismatched_query = {"indices": [0, 20], "values": [1.0, 2.0, 3.0]}
    res = (sparse_table
           .output(["*", "_row_id", "_similarity"])
           .match_sparse("c2", mismatched_query, "ip", 3,
                         {"alpha": "1.0", "beta": "1.0"})
           .to_pl())
    print(res)

    drop_result = database.drop_table(table_name, ConflictType.Error)
    assert drop_result.error_code == ErrorCode.OK

def _test_sparse_knn_with_invalid_distance_type(self, check_data, distance_type):
    """Expect a sparse search using ``distance_type`` to raise an exception.

    The table is created, loaded from ``sparse_knn.csv``, queried inside
    ``pytest.raises(Exception)``, and finally dropped; the drop must report
    ``ErrorCode.OK``.

    :param check_data: fixture result; when falsy the CSV is copied first.
    :param distance_type: metric name passed to ``match_sparse``.
    """
    table_name = "test_sparse_knn_with_index"
    database = self.infinity_obj.get_database("default_db")
    database.drop_table(table_name, ConflictType.Ignore)
    sparse_table = database.create_table(
        table_name,
        {"c1": {"type": "int"}, "c2": {"type": "sparse,100,float,int8"}},
        ConflictType.Error,
    )
    if not check_data:
        copy_data("sparse_knn.csv")
    csv_path = common_values.TEST_TMP_DIR + "sparse_knn.csv"
    sparse_table.import_data(csv_path, import_options={"delimiter": ","})

    with pytest.raises(Exception):
        (sparse_table
         .output(["*", "_row_id", "_similarity"])
         .match_sparse("c2", {"indices": [0, 20, 80], "values": [1.0, 2.0, 3.0]}, distance_type, 3,
                       {"alpha": "1.0", "beta": "1.0"})
         .to_pl())

    drop_result = database.drop_table(table_name, ConflictType.Error)
    assert drop_result.error_code == ErrorCode.OK
Loading
0