Replace json operation by simdjson by 19-hanhan · Pull Request #2755 · infiniflow/infinity · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Replace json operation by simdjson #2755

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jul 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions benchmark/local_infinity/fulltext/fulltext_benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,14 @@ void ReadJsonl(std::ifstream &input_file, SizeT lines_to_read, Vector<Tuple<char
else if (line.length() == 0)
continue;
else {
std::string_view json_sv(line);
nlohmann::json json = nlohmann::json::parse(json_sv);
simdjson::padded_string json_str(line);
simdjson::parser parser;
simdjson::document doc = parser.iterate(json_str);
char *elems[3];
for (SizeT i = 0; i < 3; i++) {
assert(json.contains(columns[i]));
String val_str = json[columns[i]];
String val_str;
[[maybe_unused]] auto error = doc[columns[i]].get<String>(val_str);
assert(error == simdjson::SUCCESS);
char *val_buf = (char *)malloc(val_str.length() + 1);
memcpy(val_buf, val_str.data(), val_str.length());
val_buf[val_str.length()] = '\0';
Expand Down
16 changes: 9 additions & 7 deletions src/common/analyzer/rank_features_analyzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,19 +25,21 @@ import third_party;
namespace infinity {

int RankFeaturesAnalyzer::AnalyzeImpl(const Term &input, void *data, HookType func) {
nlohmann::json line_json = nlohmann::json::parse(input.text_);
simdjson::padded_string json(input.text_);
simdjson::parser parser;
simdjson::document doc = parser.iterate(json);
u32 offset = 0;
for (const auto &element : line_json) {
if (element.is_object()) {
for (auto it = element.begin(); it != element.end(); ++it) {
std::string key = it.key();
float value = it.value();
for (auto element : doc.get_array()) {
auto item = element.value();
if (item.type() == simdjson::json_type::object) {
for (auto field : item.get_object()) {
std::string_view key = field.unescaped_key();
float value = field.value().get<float>();
u16 weight = SmallFloat::Float122ToUInt16(value);
func(data, key.data(), key.size(), offset++, 0, false, weight);
}
}
}

return 0;
}

Expand Down
4 changes: 4 additions & 0 deletions src/common/third_party.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,11 @@ export using simdjson::error_code;
export using ondemand::parser;
export using ondemand::document;
export using ondemand::object;
export using ondemand::array;
export using ondemand::value;
export using ondemand::number;
export using ondemand::json_type;
export using ondemand::number_type;
}

namespace magic_enum {
Expand Down
341 changes: 204 additions & 137 deletions src/executor/operator/physical_import.cpp

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions src/executor/operator/physical_import.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ private:

static void NewCSVRowHandler(void *);

void JSONLRowHandler(const nlohmann::json &line_json, Vector<SharedPtr<ColumnVector>> &column_vectors);
void JSONLRowHandler(std::string_view line_json, Vector<SharedPtr<ColumnVector>> &column_vectors);

void ParquetValueHandler(const SharedPtr<arrow::Array> &array, ColumnVector &column_vector, u64 value_idx);

Expand All @@ -109,7 +109,7 @@ private:
char delimiter_{','};
};

export SharedPtr<ConstantExpr> BuildConstantExprFromJson(const nlohmann::json &json_object);
export SharedPtr<ConstantExpr> BuildConstantSparseExprFromJson(const nlohmann::json &json_object, const SparseInfo *sparse_info);
export SharedPtr<ConstantExpr> BuildConstantExprFromJson(std::string_view json_object);
export SharedPtr<ConstantExpr> BuildConstantSparseExprFromJson(std::string_view json_object, const SparseInfo *sparse_info);

} // namespace infinity
2 changes: 1 addition & 1 deletion src/network/http/http_search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1158,7 +1158,7 @@ UniquePtr<MatchTensorExpr> HTTPSearch::ParseMatchTensor(const nlohmann::json &js
return nullptr;
}
try {
tensor_expr = BuildConstantExprFromJson(field_json_obj.value());
tensor_expr = BuildConstantExprFromJson(field_json_obj.value().dump());
} catch (std::exception &e) {
response["error_code"] = ErrorCode::kInvalidExpression;
response["error_message"] = fmt::format("Invalid query_tensor, error info: {}", e.what());
Expand Down
6 changes: 3 additions & 3 deletions src/network/http_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -208,12 +208,12 @@ infinity::Status ParseColumnDefs(const nlohmann::json &fields, Vector<ColumnDef
if (field_element.contains("default")) {
switch (column_type->type()) {
case LogicalType::kSparse: {
default_expr = BuildConstantSparseExprFromJson(field_element["default"],
default_expr = BuildConstantSparseExprFromJson(field_element["default"].dump(),
dynamic_cast<const SparseInfo *>(column_type->type_info().get()));
break;
}
default: {
default_expr = BuildConstantExprFromJson(field_element["default"]);
default_expr = BuildConstantExprFromJson(field_element["default"].dump());
break;
}
}
Expand Down Expand Up @@ -1188,7 +1188,7 @@ class InsertHandler final : public HttpRequestHandler {
if (value.size() == 1 && value.begin().key() == "array") {
SharedPtr<ConstantExpr> array_expr;
try {
auto array_result = BuildConstantExprFromJson(value);
auto array_result = BuildConstantExprFromJson(value.dump());
if (!array_result) {
throw std::runtime_error("Empty return value!");
}
Expand Down
42 changes: 6 additions & 36 deletions src/parser/definition/column_def.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,61 +239,31 @@ nlohmann::json ColumnDef::ToJson() const {
return column_def_json;
}

// Builds a ColumnDef from an already-parsed nlohmann::json object.
//
// Keys read unconditionally via at(): "column_type", "column_id", "column_name".
// Keys read only when contains() reports them: "constraints", "column_comment", "default".
//
// NOTE(review): this appears to be the nlohmann-based overload on the removed side of
// this diff hunk (the header hunk lists a single std::string_view overload) — confirm
// against the merged file.
std::shared_ptr<ColumnDef> ColumnDef::FromJson(const nlohmann::json &json) {
// These three keys are accessed without a prior contains() check.
auto column_type = DataType::Deserialize(json.at("column_type"));
auto column_id = json.at("column_id").get<int64_t>();
auto column_name = json.at("column_name").get<std::string>();

// Optional: each element of "constraints" is converted to ConstraintType and collected
// into a std::set, so repeated entries collapse to one.
std::set<ConstraintType> constraints;
if (json.contains("constraints")) {
for (const auto &constraint : json.at("constraints")) {
constraints.insert(constraint.get<ConstraintType>());
}
}

// Optional: stays an empty string when the key is absent.
std::string column_comment;
if (json.contains("column_comment")) {
column_comment = json.at("column_comment").get<std::string>();
}

// Optional: stays nullptr when the key is absent; otherwise the sub-object is handed to
// ConstantExpr::Deserialize.
std::shared_ptr<ParsedExpr> default_expr = nullptr;
if (json.contains("default")) {
auto default_expr_json = json.at("default");
default_expr = ConstantExpr::Deserialize(default_expr_json);
}

return std::make_shared<ColumnDef>(column_id, column_type, column_name, constraints, column_comment, default_expr);
}

std::shared_ptr<ColumnDef> ColumnDef::FromJson(const std::string &col_def_str) {
std::shared_ptr<ColumnDef> ColumnDef::FromJson(std::string_view col_def_str) {
simdjson::ondemand::parser parser;
simdjson::padded_string col_def_json(col_def_str);
simdjson::ondemand::document doc = parser.iterate(col_def_json);

auto column_type_json = doc["column_type"];
auto column_type = DataType::Deserialize(column_type_json);
auto column_type = DataType::Deserialize(doc["column_type"].raw_json());
int64_t column_id = doc["column_id"].get<int64_t>();
std::string column_name = doc["column_name"].get<std::string>();

std::set<ConstraintType> constraints;
simdjson::ondemand::array constraints_json;
if (!doc["constraints"].get(constraints_json)) {
if (simdjson::ondemand::array constraints_json; doc["constraints"].get(constraints_json) == simdjson::SUCCESS) {
for (auto item : constraints_json) {
ConstraintType constraint = static_cast<ConstraintType>(static_cast<char>(item.get<char>()));
constraints.insert(constraint);
}
}

std::string column_comment;
std::string column_comment_json;
if (!doc["column_comment"].get<std::string>(column_comment_json)) {
if (std::string column_comment_json; doc["column_comment"].get<std::string>(column_comment_json) == simdjson::SUCCESS) {
column_comment = column_comment_json;
}

std::shared_ptr<ParsedExpr> default_expr = nullptr;
auto default_expr_json = doc["default"];
if (default_expr_json.error() == simdjson::SUCCESS) {
default_expr = ConstantExpr::Deserialize(default_expr_json);
if (auto default_expr_json = doc["default"]; default_expr_json.error() == simdjson::SUCCESS) {
default_expr = ConstantExpr::Deserialize(default_expr_json.raw_json());
}

return std::make_shared<ColumnDef>(column_id, column_type, column_name, constraints, column_comment, default_expr);
Expand Down
3 changes: 1 addition & 2 deletions src/parser/definition/column_def.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,7 @@ class ColumnDef : public TableElement {
const std::shared_ptr<ConstantExpr> default_value() const { return std::dynamic_pointer_cast<ConstantExpr>(default_expr_); }

nlohmann::json ToJson() const;
static std::shared_ptr<ColumnDef> FromJson(const nlohmann::json &json);
static std::shared_ptr<ColumnDef> FromJson(const std::string &json);
static std::shared_ptr<ColumnDef> FromJson(std::string_view col_def_str);

public:
int64_t id_{-1};
Expand Down
107 changes: 20 additions & 87 deletions src/parser/expr/constant_expr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -497,24 +497,27 @@ nlohmann::json ConstantExpr::Serialize() const {
return j;
}

std::shared_ptr<ParsedExpr> ConstantExpr::Deserialize(const nlohmann::json &constant_expr) {
LiteralType literal_type = static_cast<LiteralType>(constant_expr["type"].get<int32_t>());
std::shared_ptr<ParsedExpr> ConstantExpr::Deserialize(std::string_view constant_expr_str) {
simdjson::ondemand::parser parser;
simdjson::padded_string constant_expr_json(constant_expr_str);
simdjson::ondemand::document doc = parser.iterate(constant_expr_json);
LiteralType literal_type = (LiteralType)(int32_t)doc["type"].get<int32_t>();
auto const_expr = new ConstantExpr(literal_type);
switch (literal_type) {
case LiteralType::kBoolean: {
const_expr->bool_value_ = constant_expr["value"].get<bool>();
const_expr->bool_value_ = doc["value"].get<bool>();
break;
}
case LiteralType::kDouble: {
const_expr->double_value_ = constant_expr["value"].get<double>();
const_expr->double_value_ = doc["value"].get<double>();
break;
}
case LiteralType::kString: {
const_expr->str_value_ = strdup(constant_expr["value"].get<std::string>().c_str());
const_expr->str_value_ = strdup(static_cast<std::string>(doc["value"].get<std::string>()).c_str());
break;
}
case LiteralType::kInteger: {
const_expr->integer_value_ = constant_expr["value"].get<int64_t>();
const_expr->integer_value_ = doc["value"].get<int64_t>();
break;
}
case LiteralType::kEmptyArray:
Expand All @@ -525,20 +528,20 @@ std::shared_ptr<ParsedExpr> ConstantExpr::Deserialize(const nlohmann::json &cons
case LiteralType::kTime:
case LiteralType::kDateTime:
case LiteralType::kTimestamp: {
const_expr->date_value_ = strdup(constant_expr["value"].get<std::string>().c_str());
const_expr->date_value_ = strdup(static_cast<std::string>(doc["value"].get<std::string>()).c_str());
break;
}
case LiteralType::kIntegerArray: {
const_expr->long_array_ = constant_expr["value"].get<std::vector<int64_t>>();
const_expr->long_array_ = doc["value"].get<std::vector<int64_t>>();
break;
}
case LiteralType::kDoubleArray: {
const_expr->double_array_ = constant_expr["value"].get<std::vector<double>>();
const_expr->double_array_ = doc["value"].get<std::vector<double>>();
break;
}
case LiteralType::kSubArrayArray: {
for (const nlohmann::json &array = constant_expr["value"]; const auto &val : array) {
auto sub_arr = std::static_pointer_cast<ConstantExpr>(Deserialize(val));
for (simdjson::ondemand::array array = doc["value"]; simdjson::simdjson_result<simdjson::ondemand::value> val : array) {
auto sub_arr = std::static_pointer_cast<ConstantExpr>(ConstantExpr::Deserialize(val.raw_json()));
const_expr->sub_array_array_.push_back(std::move(sub_arr));
}
break;
Expand All @@ -547,88 +550,18 @@ std::shared_ptr<ParsedExpr> ConstantExpr::Deserialize(const nlohmann::json &cons
ParserError("Interval type is not supported in JSON serialization");
}
case LiteralType::kLongSparseArray: {
const_expr->long_sparse_array_.first = constant_expr["value"]["indices"].get<std::vector<int64_t>>();
const_expr->long_sparse_array_.second = constant_expr["value"]["data"].get<std::vector<int64_t>>();
const_expr->long_sparse_array_.first = doc["value"]["indices"].get<std::vector<int64_t>>();
const_expr->long_sparse_array_.second = doc["value"]["data"].get<std::vector<int64_t>>();
break;
}
case LiteralType::kDoubleSparseArray: {
const_expr->double_sparse_array_.first = constant_expr["value"]["indices"].get<std::vector<int64_t>>();
const_expr->double_sparse_array_.second = constant_expr["value"]["data"].get<std::vector<double>>();
const_expr->double_sparse_array_.first = doc["value"]["indices"].get<std::vector<int64_t>>();
const_expr->double_sparse_array_.second = doc["value"]["data"].get<std::vector<double>>();
break;
}
case LiteralType::kCurlyBracketsArray: {
for (const nlohmann::json &array = constant_expr["value"]; const auto &val : array) {
auto sub_arr = std::static_pointer_cast<ConstantExpr>(Deserialize(val));
const_expr->curly_brackets_array_.push_back(std::move(sub_arr));
}
break;
}
}
return std::shared_ptr<ParsedExpr>(const_expr);
}

std::shared_ptr<ParsedExpr> ConstantExpr::Deserialize(simdjson::simdjson_result<simdjson::ondemand::value> &constant_expr) {
LiteralType literal_type = (LiteralType)(int32_t)constant_expr["type"].get<int32_t>();
auto const_expr = new ConstantExpr(literal_type);
switch (literal_type) {
case LiteralType::kBoolean: {
const_expr->bool_value_ = constant_expr["value"].get<bool>();
break;
}
case LiteralType::kDouble: {
const_expr->double_value_ = constant_expr["value"].get<double>();
break;
}
case LiteralType::kString: {
const_expr->str_value_ = strdup(static_cast<std::string>(constant_expr["value"].get<std::string>()).c_str());
break;
}
case LiteralType::kInteger: {
const_expr->integer_value_ = constant_expr["value"].get<int64_t>();
break;
}
case LiteralType::kEmptyArray:
case LiteralType::kNull: {
break;
}
case LiteralType::kDate:
case LiteralType::kTime:
case LiteralType::kDateTime:
case LiteralType::kTimestamp: {
const_expr->date_value_ = strdup(static_cast<std::string>(constant_expr["value"].get<std::string>()).c_str());
break;
}
case LiteralType::kIntegerArray: {
const_expr->long_array_ = constant_expr["value"].get<std::vector<int64_t>>();
break;
}
case LiteralType::kDoubleArray: {
const_expr->double_array_ = constant_expr["value"].get<std::vector<double>>();
break;
}
case LiteralType::kSubArrayArray: {
for (simdjson::ondemand::array array = constant_expr["value"]; simdjson::simdjson_result<simdjson::ondemand::value> val : array) {
auto sub_arr = std::static_pointer_cast<ConstantExpr>(ConstantExpr::Deserialize(val));
const_expr->sub_array_array_.push_back(std::move(sub_arr));
}
break;
}
case LiteralType::kInterval: {
ParserError("Interval type is not supported in JSON serialization");
}
case LiteralType::kLongSparseArray: {
const_expr->long_sparse_array_.first = constant_expr["value"]["indices"].get<std::vector<int64_t>>();
const_expr->long_sparse_array_.second = constant_expr["value"]["data"].get<std::vector<int64_t>>();
break;
}
case LiteralType::kDoubleSparseArray: {
const_expr->double_sparse_array_.first = constant_expr["value"]["indices"].get<std::vector<int64_t>>();
const_expr->double_sparse_array_.second = constant_expr["value"]["data"].get<std::vector<double>>();
break;
}
case LiteralType::kCurlyBracketsArray: {
for (simdjson::ondemand::array array = constant_expr["value"]; simdjson::simdjson_result<simdjson::ondemand::value> val : array) {
auto sub_arr = std::static_pointer_cast<ConstantExpr>(Deserialize(val));
for (simdjson::ondemand::array array = doc["value"]; simdjson::simdjson_result<simdjson::ondemand::value> val : array) {
auto sub_arr = std::static_pointer_cast<ConstantExpr>(Deserialize(val.raw_json()));
const_expr->curly_brackets_array_.push_back(std::move(sub_arr));
}
break;
Expand Down
4 changes: 1 addition & 3 deletions src/parser/expr/constant_expr.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,7 @@ class ConstantExpr : public ParsedExpr {

nlohmann::json Serialize() const;

static std::shared_ptr<ParsedExpr> Deserialize(const nlohmann::json &constant_expr);

static std::shared_ptr<ParsedExpr> Deserialize(simdjson::simdjson_result<simdjson::ondemand::value> &constant_expr);
static std::shared_ptr<ParsedExpr> Deserialize(std::string_view constant_expr_str);

void TrySortSparseVec(const ColumnDef *col_def);

Expand Down
Loading
Loading
0