From cb60059a0eedc120e1f346fc18ba0aafebdc448e Mon Sep 17 00:00:00 2001 From: Andrey Arutiunian Date: Sat, 8 Apr 2023 02:44:43 +0300 Subject: [PATCH 01/27] fix issue --- cmake/external-libraries.cmake | 2 ++ cmake/init-compilation-flags.cmake | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cmake/external-libraries.cmake b/cmake/external-libraries.cmake index 44a6734f28..4ec35b9cb3 100644 --- a/cmake/external-libraries.cmake +++ b/cmake/external-libraries.cmake @@ -54,6 +54,8 @@ else() add_link_options(-L${kphp-timelib_SOURCE_DIR}/objs) endif() +add_compile_options(-Wno-redundant-move) + if(APPLE) if (DEFINED ENV{EPOLL_SHIM_REPO}) FetchContent_Declare( diff --git a/cmake/init-compilation-flags.cmake b/cmake/init-compilation-flags.cmake index 5d62ceabe6..a3273c84fd 100644 --- a/cmake/init-compilation-flags.cmake +++ b/cmake/init-compilation-flags.cmake @@ -118,7 +118,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") endif() add_compile_options(-Werror -Wall -Wextra -Wunused-function -Wfloat-conversion -Wno-sign-compare - -Wuninitialized -Wno-redundant-move -Wno-missing-field-initializers) + -Wuninitialized -Wno-missing-field-initializers) if(NOT APPLE) check_cxx_compiler_flag(-gz=zlib DEBUG_COMPRESSION_IS_FOUND) From 4c156df3341e827e416ac216b59138d51bce6df0 Mon Sep 17 00:00:00 2001 From: Andrey Arutiunian Date: Mon, 10 Apr 2023 23:33:59 +0300 Subject: [PATCH 02/27] mbstring functions --- runtime/mbstring/mbstring.cpp | 33 +++++++++++++++++++++++++++++++++ runtime/mbstring/mbstring.h | 19 +++++++++++++++++++ server/server-stats.cpp | 8 -------- 3 files changed, 52 insertions(+), 8 deletions(-) create mode 100644 runtime/mbstring/mbstring.cpp create mode 100644 runtime/mbstring/mbstring.h diff --git a/runtime/mbstring/mbstring.cpp b/runtime/mbstring/mbstring.cpp new file mode 100644 index 0000000000..a73c42e83c --- /dev/null +++ b/runtime/mbstring/mbstring.cpp @@ -0,0 +1,33 @@ +#include "mbstring.h" + +string f$mb_convert_encoding(const string 
&str, const string &to, const string &from) { + + /* preparing */ + const char *c_str = str.c_str(); + const char *c_from = from.c_str(); + const char *c_to = to.c_str(); + enum mbfl_no_encoding from_encoding, to_encoding; + mbfl_buffer_converter *convd = NULL; + mbfl_string tmp, result, *ret; + + /* from internal to mbfl */ + from_encoding = mbfl_name2no_encoding(c_from); + to_encoding = mbfl_name2no_encoding(c_to); + + /* init buffer mbfl strings */ + long int len = strlen(c_str); + mbfl_string_init(&tmp); + mbfl_string_init(&result); + tmp.no_encoding = from_encoding; + tmp.len = len; + tmp.val = (unsigned char*)c_str; + + /* converting */ + convd = mbfl_buffer_converter_new(from_encoding, to_encoding, 0); + ret = mbfl_buffer_converter_feed_result(convd, &tmp, &result); + mbfl_buffer_converter_delete(convd); + + /* returning kphp's string */ + string res((const char*)ret->val, strlen((const char*)ret->val)); + return res; +} \ No newline at end of file diff --git a/runtime/mbstring/mbstring.h b/runtime/mbstring/mbstring.h new file mode 100644 index 0000000000..87f2d1c53a --- /dev/null +++ b/runtime/mbstring/mbstring.h @@ -0,0 +1,19 @@ +#pragma once + +extern "C" { + #include +} + +#include "runtime/kphp_core.h" + +/** + * Convert a string from one character encoding to another + * @param str The string to be converted + * @param from The desired encoding of the result + * @param to The current encoding used to interpret string + * @return The encoded string + * TODO!: mb_check_encoding(str, from) inside + * TODO!: own constants for encodings + * TODO: issue for timelib + */ +string f$mb_convert_encoding(const string &str, const string &to, const string &from); \ No newline at end of file diff --git a/server/server-stats.cpp b/server/server-stats.cpp index 6406e09ed7..a45fcf1f4f 100644 --- a/server/server-stats.cpp +++ b/server/server-stats.cpp @@ -143,8 +143,6 @@ struct EnumTable : std::array(E::Key::types_count)> { template struct Percentiles { T p50{}; - T 
p75{}; - T p90{}; T p95{}; T p99{}; T max{}; @@ -154,8 +152,6 @@ struct Percentiles { void update_percentiles(I first, I last, const Mapper &mapper = {}) noexcept { const auto size = last - first; set_percentile<50>(p50, first, size, mapper); - set_percentile<75>(p75, first, size, mapper); - set_percentile<90>(p90, first, size, mapper); set_percentile<95>(p95, first, size, mapper); set_percentile<99>(p99, first, size, mapper); set_percentile<100>(max, first, size, mapper); @@ -702,8 +698,6 @@ template void write_to(stats_t *stats, const char *prefix, const char *suffix, const AggregatedSamples &samples, const Mapper &mapper = {}) { if (stats->need_aggregated_stats()) { stats->add_gauge_stat(mapper(samples.percentiles.p50), prefix, suffix, ".p50"); - stats->add_gauge_stat(mapper(samples.percentiles.p75), prefix, suffix, ".p75"); - stats->add_gauge_stat(mapper(samples.percentiles.p90), prefix, suffix, ".p90"); stats->add_gauge_stat(mapper(samples.percentiles.p95), prefix, suffix, ".p95"); stats->add_gauge_stat(mapper(samples.percentiles.p99), prefix, suffix, ".p99"); stats->add_gauge_stat(mapper(samples.percentiles.max), prefix, suffix, ".max"); @@ -714,8 +708,6 @@ template void write_to(stats_t *stats, const char *prefix, const char *suffix, const WorkerSamples &samples, const Mapper &mapper = {}) { if (stats->need_aggregated_stats()) { stats->add_gauge_stat(mapper(samples.percentiles.p50), prefix, suffix, ".p50"); - stats->add_gauge_stat(mapper(samples.percentiles.p75), prefix, suffix, ".p75"); - stats->add_gauge_stat(mapper(samples.percentiles.p90), prefix, suffix, ".p90"); stats->add_gauge_stat(mapper(samples.percentiles.p95), prefix, suffix, ".p95"); stats->add_gauge_stat(mapper(samples.percentiles.p99), prefix, suffix, ".p99"); stats->add_gauge_stat(mapper(samples.percentiles.max), prefix, suffix, ".max"); From 8121ae947f1174013a0294e61cd65e71cb73c944 Mon Sep 17 00:00:00 2001 From: Andrey Arutiunian Date: Mon, 17 Apr 2023 14:57:23 +0300 Subject: [PATCH 03/27] 
build libmbfl from source --- builtin-functions/_functions.txt | 2 + cmake/external-libraries.cmake | 15 ++ compiler/compiler-settings.cpp | 11 +- runtime/interface.cpp | 1 - runtime/mbstring.cpp | 341 +------------------------------ runtime/mbstring.h | 20 +- runtime/mbstring/mbstring.cpp | 4 +- runtime/mbstring/mbstring.h | 7 +- runtime/runtime.cmake | 13 ++ 9 files changed, 49 insertions(+), 365 deletions(-) diff --git a/builtin-functions/_functions.txt b/builtin-functions/_functions.txt index 9e6e139f39..c1c2616ff2 100644 --- a/builtin-functions/_functions.txt +++ b/builtin-functions/_functions.txt @@ -1620,3 +1620,5 @@ class DateTimeImmutable implements DateTimeInterface { } function getenv(string $varname = '', bool $local_only = false): mixed; + +function mb_convert_encoding(string $str, string $to, string $from): string; diff --git a/cmake/external-libraries.cmake b/cmake/external-libraries.cmake index 44a6734f28..70a03bcd36 100644 --- a/cmake/external-libraries.cmake +++ b/cmake/external-libraries.cmake @@ -1,5 +1,7 @@ option(DOWNLOAD_MISSING_LIBRARIES "download and build missing libraries if needed" OFF) +option(MBFL, OFF) cmake_print_variables(DOWNLOAD_MISSING_LIBRARIES) +cmake_print_variables(MBFL) function(handle_missing_library LIB_NAME) message(STATUS "------${LIB_NAME}---------") if(DOWNLOAD_MISSING_LIBRARIES) @@ -54,6 +56,19 @@ else() add_link_options(-L${kphp-timelib_SOURCE_DIR}/objs) endif() +if(MBFL) +# add_library(libmbfl STATIC IMPORTED ${MBFL}) +# else() + add_compile_options(-Wno-unused-parameter -Wno-logical-op-parentheses -Wno-unused-variable -Wno-return-type -Wno-unused-function) + handle_missing_library("libmbfl") + FetchContent_Declare(libmbfl GIT_REPOSITORY https://github.com/andreylzmw/libmbfl) + message(STATUS "---------------------") + FetchContent_MakeAvailable(libmbfl) + include_directories(${libmbfl_SOURCE_DIR}/include) + add_definitions(-DLIBMBFL_LIB_DIR="${libmbfl_SOURCE_DIR}/objs") + 
add_link_options(-L${libmbfl_SOURCE_DIR}/objs) +endif() + if(APPLE) if (DEFINED ENV{EPOLL_SHIM_REPO}) FetchContent_Declare( diff --git a/compiler/compiler-settings.cpp b/compiler/compiler-settings.cpp index 3733401f06..b6cc784042 100644 --- a/compiler/compiler-settings.cpp +++ b/compiler/compiler-settings.cpp @@ -320,7 +320,7 @@ void CompilerSettings::init() { ld_flags.value_ = extra_ld_flags.get(); append_curl(cxx_default_flags, ld_flags.value_); append_apple_options(cxx_default_flags, ld_flags.value_); - std::vector external_static_libs{"pcre", "re2", "yaml-cpp", "h3", "z", "zstd", "nghttp2", "kphp-timelib"}; + std::vector external_static_libs{"pcre", "re2", "yaml-cpp", "h3", "z", "zstd", "nghttp2", "kphp-timelib", "libmbfl"}; #ifdef KPHP_TIMELIB_LIB_DIR ld_flags.value_ += " -L" KPHP_TIMELIB_LIB_DIR; @@ -331,6 +331,15 @@ void CompilerSettings::init() { ld_flags.value_ += " -L /usr/local/lib"; #endif +#ifdef LIBMBFL_LIB_DIR + ld_flags.value_ += " -L" LIBMBFL_LIB_DIR; +#else + // kphp-timelib is usually installed in /usr/local/lib; + // LDD may not find a library in /usr/local/lib if we don't add it here + // TODO: can we avoid this hardcoded library path? 
+ ld_flags.value_ += " -L /usr/local/lib"; +#endif + #if defined(__APPLE__) && defined(__arm64__) // for development under M1, manual installation of libucontext is needed // see the docs: https://vkcom.github.io/kphp/kphp-internals/developing-and-extending-kphp/compiling-kphp-from-sources.html diff --git a/runtime/interface.cpp b/runtime/interface.cpp index 2837919e53..c48e2e9a27 100644 --- a/runtime/interface.cpp +++ b/runtime/interface.cpp @@ -2371,7 +2371,6 @@ static void free_runtime_libs() { free_kphp_backtrace(); free_migration_php8(); - free_detect_incorrect_encoding_names(); vk::singleton::get().reset_buffers(); #ifdef PDO_DRIVER_MYSQL diff --git a/runtime/mbstring.cpp b/runtime/mbstring.cpp index 8fa5a03be6..bbf8231b47 100644 --- a/runtime/mbstring.cpp +++ b/runtime/mbstring.cpp @@ -7,86 +7,6 @@ #include "common/unicode/unicode-utils.h" #include "common/unicode/utf8-utils.h" -static bool is_detect_incorrect_encoding_names_warning{false}; - -void f$set_detect_incorrect_encoding_names_warning(bool show) { - is_detect_incorrect_encoding_names_warning = show; -} - -void free_detect_incorrect_encoding_names() { - is_detect_incorrect_encoding_names_warning = false; -} - -static int mb_detect_encoding_new(const string &encoding) { - const auto encoding_name = f$strtolower(encoding).c_str(); - - if (!strcmp(encoding_name, "cp1251") || !strcmp(encoding_name, "cp-1251") || !strcmp(encoding_name, "windows-1251")) { - return 1251; - } - - if (!strcmp(encoding_name, "utf8") || !strcmp(encoding_name, "utf-8")) { - return 8; - } - - return -1; -} - -static int mb_detect_encoding(const string &encoding) { - const int result_new = mb_detect_encoding_new(encoding); - - if (strstr(encoding.c_str(), "1251")) { - if (is_detect_incorrect_encoding_names_warning && 1251 != result_new) { - php_warning("mb_detect_encoding returns 1251, but new will return %d, encoding %s", result_new, encoding.c_str()); - } - return 1251; - } - if (strstr(encoding.c_str(), "-8")) { - if 
(is_detect_incorrect_encoding_names_warning && 8 != result_new) { - php_warning("mb_detect_encoding returns 8, but new will return %d, encoding %s", result_new, encoding.c_str()); - } - return 8; - } - - if (is_detect_incorrect_encoding_names_warning && -1 != result_new) { - php_warning("mb_detect_encoding returns -1, but new will return %d, encoding %s", result_new, encoding.c_str()); - } - return -1; -} - -static int64_t mb_UTF8_strlen(const char *s) { - int64_t res = 0; - for (int64_t i = 0; s[i]; i++) { - if ((((unsigned char)s[i]) & 0xc0) != 0x80) { - res++; - } - } - return res; -} - -static int64_t mb_UTF8_advance(const char *s, int64_t cnt) { - php_assert (cnt >= 0); - int64_t i; - for (i = 0; s[i] && cnt >= 0; i++) { - if ((((unsigned char)s[i]) & 0xc0) != 0x80) { - cnt--; - } - } - if (cnt < 0) { - i--; - } - return i; -} - -static int64_t mb_UTF8_get_offset(const char *s, int64_t pos) { - int64_t res = 0; - for (int64_t i = 0; i < pos && s[i]; i++) { - if ((((unsigned char)s[i]) & 0xc0) != 0x80) { - res++; - } - } - return res; -} - bool mb_UTF8_check(const char *s) { do { #define CHECK(condition) if (!(condition)) {return false;} @@ -128,263 +48,4 @@ bool mb_UTF8_check(const char *s) { } while (true); php_assert (0); -} - -bool f$mb_check_encoding(const string &str, const string &encoding) { - int encoding_num = mb_detect_encoding(encoding); - if (encoding_num < 0) { - php_critical_error ("encoding \"%s\" doesn't supported in mb_check_encoding", encoding.c_str()); - return !str.empty(); - } - - if (encoding_num == 1251) { - return true; - } - - return mb_UTF8_check(str.c_str()); -} - - -int64_t f$mb_strlen(const string &str, const string &encoding) { - int encoding_num = mb_detect_encoding(encoding); - if (encoding_num < 0) { - php_critical_error ("encoding \"%s\" doesn't supported in mb_strlen", encoding.c_str()); - return str.size(); - } - - if (encoding_num == 1251) { - return str.size(); - } - - return mb_UTF8_strlen(str.c_str()); -} - - -string 
f$mb_strtolower(const string &str, const string &encoding) { - int encoding_num = mb_detect_encoding(encoding); - if (encoding_num < 0) { - php_critical_error ("encoding \"%s\" doesn't supported in mb_strtolower", encoding.c_str()); - return str; - } - - int len = str.size(); - if (encoding_num == 1251) { - string res(len, false); - for (int i = 0; i < len; i++) { - switch ((unsigned char)str[i]) { - case 'A' ... 'Z': - res[i] = (char)(str[i] + 'a' - 'A'); - break; - case 0xC0 ... 0xDF: - res[i] = (char)(str[i] + 32); - break; - case 0x81: - res[i] = (char)0x83; - break; - case 0xA3: - res[i] = (char)0xBC; - break; - case 0xA5: - res[i] = (char)0xB4; - break; - case 0xA1: - case 0xB2: - case 0xBD: - res[i] = (char)(str[i] + 1); - break; - case 0x80: - case 0x8A: - case 0x8C ... 0x8F: - case 0xA8: - case 0xAA: - case 0xAF: - res[i] = (char)(str[i] + 16); - break; - default: - res[i] = str[i]; - } - } - - return res; - } else { - string res(len * 3, false); - const char *s = str.c_str(); - int res_len = 0; - int p; - int ch; - while ((p = get_char_utf8(&ch, s)) > 0) { - s += p; - res_len += put_char_utf8(unicode_tolower(ch), &res[res_len]); - } - if (p < 0) { - php_warning("Incorrect UTF-8 string \"%s\" in function mb_strtolower", str.c_str()); - } - res.shrink(res_len); - - return res; - } -} - -string f$mb_strtoupper(const string &str, const string &encoding) { - int encoding_num = mb_detect_encoding(encoding); - if (encoding_num < 0) { - php_critical_error ("encoding \"%s\" doesn't supported in mb_strtoupper", encoding.c_str()); - return str; - } - - int len = str.size(); - if (encoding_num == 1251) { - string res(len, false); - for (int i = 0; i < len; i++) { - switch ((unsigned char)str[i]) { - case 'a' ... 'z': - res[i] = (char)(str[i] + 'A' - 'a'); - break; - case 0xE0 ... 
0xFF: - res[i] = (char)(str[i] - 32); - break; - case 0x83: - res[i] = (char)(0x81); - break; - case 0xBC: - res[i] = (char)(0xA3); - break; - case 0xB4: - res[i] = (char)(0xA5); - break; - case 0xA2: - case 0xB3: - case 0xBE: - res[i] = (char)(str[i] - 1); - break; - case 0x98: - case 0xA0: - case 0xAD: - res[i] = ' '; - break; - case 0x90: - case 0x9A: - case 0x9C ... 0x9F: - case 0xB8: - case 0xBA: - case 0xBF: - res[i] = (char)(str[i] - 16); - break; - default: - res[i] = str[i]; - } - } - - return res; - } else { - string res(len * 3, false); - const char *s = str.c_str(); - int res_len = 0; - int p; - int ch; - while ((p = get_char_utf8(&ch, s)) > 0) { - s += p; - res_len += put_char_utf8(unicode_toupper(ch), &res[res_len]); - } - if (p < 0) { - php_warning("Incorrect UTF-8 string \"%s\" in function mb_strtoupper", str.c_str()); - } - res.shrink(res_len); - - return res; - } -} - -namespace { - -int check_strpos_agrs(const char *func_name, const string &needle, int64_t offset, const string &encoding) noexcept { - if (unlikely(offset < 0)) { - php_warning("Wrong offset = %" PRIi64 " in function %s()", offset, func_name); - return 0; - } - if (unlikely(needle.empty())) { - php_warning("Parameter needle is empty in function %s()", func_name); - return 0; - } - - const int encoding_num = mb_detect_encoding(encoding); - if (unlikely(encoding_num < 0)) { - php_critical_error ("encoding \"%s\" doesn't supported in %s()", encoding.c_str(), func_name); - return 0; - } - return encoding_num; -} - -Optional mp_strpos_impl(const string &haystack, const string &needle, int64_t offset, int encoding_num) noexcept { - if (encoding_num == 1251) { - return f$strpos(haystack, needle, offset); - } - - int64_t UTF8_offset = mb_UTF8_advance(haystack.c_str(), offset); - const char *s = static_cast(memmem(haystack.c_str() + UTF8_offset, haystack.size() - UTF8_offset, needle.c_str(), needle.size())); - if (unlikely(s == nullptr)) { - return false; - } - return 
mb_UTF8_get_offset(haystack.c_str() + UTF8_offset, s - (haystack.c_str() + UTF8_offset)) + offset; -} - -} // namespace - -Optional f$mb_strpos(const string &haystack, const string &needle, int64_t offset, const string &encoding) noexcept { - if (const int encoding_num = check_strpos_agrs("mb_strpos", needle, offset, encoding)) { - return mp_strpos_impl(haystack, needle, offset, encoding_num); - } - return false; -} - -Optional f$mb_stripos(const string &haystack, const string &needle, int64_t offset, const string &encoding) noexcept { - if (const int encoding_num = check_strpos_agrs("mb_stripos", needle, offset, encoding)) { - return mp_strpos_impl(f$mb_strtolower(haystack, encoding), f$mb_strtolower(needle, encoding), offset, encoding_num); - } - return false; -} - -string f$mb_substr(const string &str, int64_t start, const mixed &length_var, const string &encoding) { - int encoding_num = mb_detect_encoding(encoding); - if (encoding_num < 0) { - php_critical_error ("encoding \"%s\" doesn't supported in mb_substr", encoding.c_str()); - return str; - } - - int64_t length; - if (length_var.is_null()) { - length = std::numeric_limits::max(); - } else { - length = length_var.to_int(); - } - - if (encoding_num == 1251) { - Optional res = f$substr(str, start, length); - if (!res.has_value()) { - return {}; - } - return res.val(); - } - - int64_t len = mb_UTF8_strlen(str.c_str()); - if (start < 0) { - start += len; - } - if (start > len) { - start = len; - } - if (length < 0) { - length = len - start + length; - } - if (length <= 0 || start < 0) { - return {}; - } - if (len - start < length) { - length = len - start; - } - - int64_t UTF8_start = mb_UTF8_advance(str.c_str(), start); - int64_t UTF8_length = mb_UTF8_advance(str.c_str() + UTF8_start, length); - - return {str.c_str() + UTF8_start, static_cast(UTF8_length)}; -} +} \ No newline at end of file diff --git a/runtime/mbstring.h b/runtime/mbstring.h index 9685f4be76..be9aef5b0c 100644 --- a/runtime/mbstring.h +++ 
b/runtime/mbstring.h @@ -9,22 +9,4 @@ #include "runtime/kphp_core.h" #include "runtime/string_functions.h" -bool mb_UTF8_check(const char *s); - -bool f$mb_check_encoding(const string &str, const string &encoding = CP1251); - -int64_t f$mb_strlen(const string &str, const string &encoding = CP1251); - -string f$mb_strtolower(const string &str, const string &encoding = CP1251); - -string f$mb_strtoupper(const string &str, const string &encoding = CP1251); - -Optional f$mb_strpos(const string &haystack, const string &needle, int64_t offset = 0, const string &encoding = CP1251) noexcept; - -Optional f$mb_stripos(const string &haystack, const string &needle, int64_t offset = 0, const string &encoding = CP1251) noexcept; - -string f$mb_substr(const string &str, int64_t start, const mixed &length = std::numeric_limits::max(), const string &encoding = CP1251); - -void f$set_detect_incorrect_encoding_names_warning(bool show); - -void free_detect_incorrect_encoding_names(); +bool mb_UTF8_check(const char *s); \ No newline at end of file diff --git a/runtime/mbstring/mbstring.cpp b/runtime/mbstring/mbstring.cpp index a73c42e83c..52cff09775 100644 --- a/runtime/mbstring/mbstring.cpp +++ b/runtime/mbstring/mbstring.cpp @@ -6,6 +6,7 @@ string f$mb_convert_encoding(const string &str, const string &to, const string & const char *c_str = str.c_str(); const char *c_from = from.c_str(); const char *c_to = to.c_str(); + enum mbfl_no_encoding from_encoding, to_encoding; mbfl_buffer_converter *convd = NULL; mbfl_string tmp, result, *ret; @@ -28,6 +29,5 @@ string f$mb_convert_encoding(const string &str, const string &to, const string & mbfl_buffer_converter_delete(convd); /* returning kphp's string */ - string res((const char*)ret->val, strlen((const char*)ret->val)); - return res; + return string((const char*)ret->val, ret->len); } \ No newline at end of file diff --git a/runtime/mbstring/mbstring.h b/runtime/mbstring/mbstring.h index 87f2d1c53a..3668a6fda0 100644 --- 
a/runtime/mbstring/mbstring.h +++ b/runtime/mbstring/mbstring.h @@ -1,11 +1,14 @@ #pragma once +#include "runtime/kphp_core.h" + extern "C" { + // FIXME + // #include + // #include "../../build/_deps/libmbfl-src/include/kphp/libmbfl/mbfl/mbfilter.h" #include } -#include "runtime/kphp_core.h" - /** * Convert a string from one character encoding to another * @param str The string to be converted diff --git a/runtime/runtime.cmake b/runtime/runtime.cmake index dce7d62cb8..2572f6f2c8 100644 --- a/runtime/runtime.cmake +++ b/runtime/runtime.cmake @@ -49,7 +49,13 @@ prepend(KPHP_RUNTIME_PDO_PGSQL_SOURCES pdo/pgsql/ pgsql_pdo_emulated_statement.cpp) endif() +if (MBFL) +prepend(KPHP_RUNTIME_MBSTRING_SOURCES mbstring/ + mbstring.cpp) +endif() + prepend(KPHP_RUNTIME_SOURCES ${BASE_DIR}/runtime/ + ${KPHP_RUNTIME_MBSTRING_SOURCES} ${KPHP_RUNTIME_DATETIME_SOURCES} ${KPHP_RUNTIME_MEMORY_RESOURCE_SOURCES} ${KPHP_RUNTIME_MSGPACK_SOURCES} @@ -139,6 +145,9 @@ vk_add_library(kphp_runtime OBJECT ${KPHP_RUNTIME_ALL_SOURCES}) target_include_directories(kphp_runtime PUBLIC ${BASE_DIR} /opt/curl7600/include) add_dependencies(kphp_runtime kphp-timelib) +if (MBFL) + add_dependencies(kphp_runtime libmbfl) +endif() prepare_cross_platform_libs(RUNTIME_LIBS yaml-cpp re2 zstd h3) # todo: linking between static libs is no-op, is this redundant? do we need to add mysqlclient here? 
set(RUNTIME_LIBS vk::kphp_runtime vk::kphp_server vk::popular_common vk::unicode vk::common_src vk::binlog_src vk::net_src ${RUNTIME_LIBS} OpenSSL::Crypto m z pthread) @@ -157,6 +166,10 @@ if (PDO_DRIVER_PGSQL) list(APPEND RUNTIME_LINK_TEST_LIBS PostgreSQL::PostgreSQL) endif() +if (MBFL) + list(APPEND RUNTIME_LINK_TEST_LIBS libmbfl) +endif() + file(GLOB_RECURSE KPHP_RUNTIME_ALL_HEADERS RELATIVE ${BASE_DIR} CONFIGURE_DEPENDS From 406b0c9c332a6d33b87b01c617186bf41762d65b Mon Sep 17 00:00:00 2001 From: Andrey Arutiunian Date: Mon, 17 Apr 2023 15:19:33 +0300 Subject: [PATCH 04/27] fix for ubuntu from mac os --- cmake/external-libraries.cmake | 11 ++++++++--- cmake/init-compilation-flags.cmake | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/cmake/external-libraries.cmake b/cmake/external-libraries.cmake index 70a03bcd36..6d9295f370 100644 --- a/cmake/external-libraries.cmake +++ b/cmake/external-libraries.cmake @@ -57,9 +57,12 @@ else() endif() if(MBFL) -# add_library(libmbfl STATIC IMPORTED ${MBFL}) -# else() - add_compile_options(-Wno-unused-parameter -Wno-logical-op-parentheses -Wno-unused-variable -Wno-return-type -Wno-unused-function) + add_library(libmbfl STATIC IMPORTED ${MBFL}) +else() + if (APPLE) + add_compile_options(-Wno-logical-op-parentheses) + endif() + add_compile_options(-Wno-unused-parameter -Wno-unused-variable -Wno-return-type -Wno-unused-function) handle_missing_library("libmbfl") FetchContent_Declare(libmbfl GIT_REPOSITORY https://github.com/andreylzmw/libmbfl) message(STATUS "---------------------") @@ -69,6 +72,8 @@ if(MBFL) add_link_options(-L${libmbfl_SOURCE_DIR}/objs) endif() +add_compile_options(-Wno-redundant-move) + if(APPLE) if (DEFINED ENV{EPOLL_SHIM_REPO}) FetchContent_Declare( diff --git a/cmake/init-compilation-flags.cmake b/cmake/init-compilation-flags.cmake index 9ba0d676d3..c41f116a61 100644 --- a/cmake/init-compilation-flags.cmake +++ b/cmake/init-compilation-flags.cmake @@ -115,7 +115,7 @@ 
elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") endif() add_compile_options(-Werror -Wall -Wextra -Wunused-function -Wfloat-conversion -Wno-sign-compare - -Wuninitialized -Wno-redundant-move -Wno-missing-field-initializers) + -Wuninitialized -Wno-missing-field-initializers) if(NOT APPLE) check_cxx_compiler_flag(-gz=zlib DEBUG_COMPRESSION_IS_FOUND) From 35bcc8f7714c0ea09b3e568b0e33236029e03198 Mon Sep 17 00:00:00 2001 From: Andrey Arutiunian Date: Mon, 17 Apr 2023 16:02:40 +0300 Subject: [PATCH 05/27] remove trash --- cmake/external-libraries.cmake | 8 -------- runtime/mbstring/mbstring.h | 3 +-- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/cmake/external-libraries.cmake b/cmake/external-libraries.cmake index 6d9295f370..f32e04b0fd 100644 --- a/cmake/external-libraries.cmake +++ b/cmake/external-libraries.cmake @@ -57,15 +57,7 @@ else() endif() if(MBFL) - add_library(libmbfl STATIC IMPORTED ${MBFL}) -else() - if (APPLE) - add_compile_options(-Wno-logical-op-parentheses) - endif() - add_compile_options(-Wno-unused-parameter -Wno-unused-variable -Wno-return-type -Wno-unused-function) - handle_missing_library("libmbfl") FetchContent_Declare(libmbfl GIT_REPOSITORY https://github.com/andreylzmw/libmbfl) - message(STATUS "---------------------") FetchContent_MakeAvailable(libmbfl) include_directories(${libmbfl_SOURCE_DIR}/include) add_definitions(-DLIBMBFL_LIB_DIR="${libmbfl_SOURCE_DIR}/objs") diff --git a/runtime/mbstring/mbstring.h b/runtime/mbstring/mbstring.h index 3668a6fda0..7fc83931e7 100644 --- a/runtime/mbstring/mbstring.h +++ b/runtime/mbstring/mbstring.h @@ -4,9 +4,8 @@ extern "C" { // FIXME + #include "/../../build/_deps/libmbfl-src/include/kphp/libmbfl/mbfl/mbfilter.h" // #include - // #include "../../build/_deps/libmbfl-src/include/kphp/libmbfl/mbfl/mbfilter.h" - #include } /** From e951d867e51850cf21e19b73f2790f46ded469db Mon Sep 17 00:00:00 2001 From: Andrey Arutiunian Date: Mon, 17 Apr 2023 16:12:46 +0300 Subject: [PATCH 06/27] fix path 
--- runtime/mbstring/mbstring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/mbstring/mbstring.h b/runtime/mbstring/mbstring.h index 7fc83931e7..f4e8c3d527 100644 --- a/runtime/mbstring/mbstring.h +++ b/runtime/mbstring/mbstring.h @@ -4,7 +4,7 @@ extern "C" { // FIXME - #include "/../../build/_deps/libmbfl-src/include/kphp/libmbfl/mbfl/mbfilter.h" + #include "build/_deps/libmbfl-src/include/kphp/libmbfl/mbfl/mbfilter.h" // #include } From fac68a63182d5c1e84092bc564d8d99c8422d9fd Mon Sep 17 00:00:00 2001 From: Andrey Arutiunian Date: Mon, 17 Apr 2023 16:46:19 +0300 Subject: [PATCH 07/27] fix path --- cmake/external-libraries.cmake | 1 + runtime/mbstring/mbstring.h | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cmake/external-libraries.cmake b/cmake/external-libraries.cmake index f32e04b0fd..fc77046abe 100644 --- a/cmake/external-libraries.cmake +++ b/cmake/external-libraries.cmake @@ -57,6 +57,7 @@ else() endif() if(MBFL) + message(STATUS "MBFL=On, libmbfl will be downloaded and built") FetchContent_Declare(libmbfl GIT_REPOSITORY https://github.com/andreylzmw/libmbfl) FetchContent_MakeAvailable(libmbfl) include_directories(${libmbfl_SOURCE_DIR}/include) diff --git a/runtime/mbstring/mbstring.h b/runtime/mbstring/mbstring.h index f4e8c3d527..5a5a0732bf 100644 --- a/runtime/mbstring/mbstring.h +++ b/runtime/mbstring/mbstring.h @@ -3,9 +3,7 @@ #include "runtime/kphp_core.h" extern "C" { - // FIXME - #include "build/_deps/libmbfl-src/include/kphp/libmbfl/mbfl/mbfilter.h" - // #include + #include } /** From 3c0c54e20a24b7b5edcd391ba30bb08651e8e919 Mon Sep 17 00:00:00 2001 From: Andrey Arutiunian Date: Mon, 17 Apr 2023 18:15:50 +0300 Subject: [PATCH 08/27] fix including --- runtime/mbstring/mbstring.cpp | 4 ++++ runtime/mbstring/mbstring.h | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/runtime/mbstring/mbstring.cpp b/runtime/mbstring/mbstring.cpp index 52cff09775..bfb39529dc 100644 --- 
a/runtime/mbstring/mbstring.cpp +++ b/runtime/mbstring/mbstring.cpp @@ -1,5 +1,9 @@ #include "mbstring.h" +extern "C" { + #include +} + string f$mb_convert_encoding(const string &str, const string &to, const string &from) { /* preparing */ diff --git a/runtime/mbstring/mbstring.h b/runtime/mbstring/mbstring.h index 5a5a0732bf..c82873127a 100644 --- a/runtime/mbstring/mbstring.h +++ b/runtime/mbstring/mbstring.h @@ -2,10 +2,6 @@ #include "runtime/kphp_core.h" -extern "C" { - #include -} - /** * Convert a string from one character encoding to another * @param str The string to be converted From 9332f451516deb9296e45119d4e71809dbc84313 Mon Sep 17 00:00:00 2001 From: Andrey Arutiunian Date: Thu, 20 Apr 2023 02:18:43 +0300 Subject: [PATCH 09/27] finish basic mbstring functions and building libmbfl --- cmake/external-libraries.cmake | 3 +- runtime/mbstring/mbstring.cpp | 93 ++++++++++++++++++++++----- runtime/mbstring/mbstring.h | 11 +++- tests/cpp/runtime/mbstring-test.cpp | 18 ++++++ tests/cpp/runtime/runtime-tests.cmake | 45 ++++++------- 5 files changed, 127 insertions(+), 43 deletions(-) create mode 100644 tests/cpp/runtime/mbstring-test.cpp diff --git a/cmake/external-libraries.cmake b/cmake/external-libraries.cmake index fc77046abe..8266b44b7b 100644 --- a/cmake/external-libraries.cmake +++ b/cmake/external-libraries.cmake @@ -1,5 +1,5 @@ option(DOWNLOAD_MISSING_LIBRARIES "download and build missing libraries if needed" OFF) -option(MBFL, OFF) +option(MBFL "download and build libmbfl" OFF) cmake_print_variables(DOWNLOAD_MISSING_LIBRARIES) cmake_print_variables(MBFL) function(handle_missing_library LIB_NAME) @@ -58,6 +58,7 @@ endif() if(MBFL) message(STATUS "MBFL=On, libmbfl will be downloaded and built") + add_compile_options(-DMBFL) FetchContent_Declare(libmbfl GIT_REPOSITORY https://github.com/andreylzmw/libmbfl) FetchContent_MakeAvailable(libmbfl) include_directories(${libmbfl_SOURCE_DIR}/include) diff --git a/runtime/mbstring/mbstring.cpp 
b/runtime/mbstring/mbstring.cpp index bfb39529dc..4d1637d59a 100644 --- a/runtime/mbstring/mbstring.cpp +++ b/runtime/mbstring/mbstring.cpp @@ -4,34 +4,93 @@ extern "C" { #include } -string f$mb_convert_encoding(const string &str, const string &to, const string &from) { - - /* preparing */ - const char *c_str = str.c_str(); - const char *c_from = from.c_str(); - const char *c_to = to.c_str(); +mbfl_string *convert_encoding(const char *str, const char *to, const char *from) { + int len = strlen(str); enum mbfl_no_encoding from_encoding, to_encoding; mbfl_buffer_converter *convd = NULL; - mbfl_string tmp, result, *ret; + mbfl_string _string, result, *ret; /* from internal to mbfl */ - from_encoding = mbfl_name2no_encoding(c_from); - to_encoding = mbfl_name2no_encoding(c_to); + from_encoding = mbfl_name2no_encoding(from); + to_encoding = mbfl_name2no_encoding(to); /* init buffer mbfl strings */ - long int len = strlen(c_str); - mbfl_string_init(&tmp); + mbfl_string_init(&_string); mbfl_string_init(&result); - tmp.no_encoding = from_encoding; - tmp.len = len; - tmp.val = (unsigned char*)c_str; + _string.no_encoding = from_encoding; + _string.len = len; + _string.val = (unsigned char*)str; /* converting */ convd = mbfl_buffer_converter_new(from_encoding, to_encoding, 0); - ret = mbfl_buffer_converter_feed_result(convd, &tmp, &result); + ret = mbfl_buffer_converter_feed_result(convd, &_string, &result); mbfl_buffer_converter_delete(convd); - /* returning kphp's string */ - return string((const char*)ret->val, ret->len); + /* fix converting with multibyte encodings */ + if (len % 2 != 0 && ret->len % 2 == 0 && len < ret->len) { + ret->len++; + ret->val[ret->len-1] = 63; + } + + return ret; +} + +bool check_encoding(const char *value, const char *encoding) { + + /* init buffer mbfl strins */ + mbfl_string _string; + mbfl_string_init(&_string); + _string.val = (unsigned char*)value; + _string.len = strlen((char*)value); + + /* from internal to mbfl */ + const mbfl_encoding 
*enc = mbfl_name2encoding(encoding); + + /* get all supported encodings */ + const mbfl_encoding **encs = mbfl_get_supported_encodings(); + int len = sizeof(**encs); + + /* identify encoding of input string */ + /* Warning! String can be represented in different encodings, so check needed */ + const mbfl_encoding *i_enc = mbfl_identify_encoding2(&_string, encs, len, 1); + + /* perform convering */ + const char *i_enc_str = (const char*)convert_encoding(value, i_enc->name, enc->name)->val; + const char *enc_str = (const char*)convert_encoding(i_enc_str, enc->name, i_enc->name)->val; + + /* check equality */ + /* Warning! strcmp not working, because of different encodings */ + bool res = true; + for (int i = 0; i < strlen(enc_str); i++) + if (enc_str[i] != value[i]) { + res = false; + break; + } + + free((void*)i_enc_str); + free((void*)enc_str); + return res; +} + +bool f$mb_check_encoding(const string &value, const string &encoding) { + const char *c_value = value.c_str(); + const char *c_encoding = encoding.c_str(); + return check_encoding(c_value, c_encoding); +} + +string f$mb_convert_encoding(const string &str, const string &to_encoding, const string &from_encoding) { + + const char *c_string = str.c_str(); + const char *c_to_encoding = to_encoding.c_str(); + const char *c_from_encoding = from_encoding.c_str(); + + /* perform convertion */ + mbfl_string *ret = convert_encoding(c_string, c_to_encoding, c_from_encoding); + string res = string((const char*)ret->val, ret->len); + + /* check if string represents in from_encoding, magic number 63 - '?' 
in ASCII */ + if (!check_encoding(c_string, c_from_encoding)) res = string(strlen(c_string), (char)63); + + return res; } \ No newline at end of file diff --git a/runtime/mbstring/mbstring.h b/runtime/mbstring/mbstring.h index c82873127a..a2154176a5 100644 --- a/runtime/mbstring/mbstring.h +++ b/runtime/mbstring/mbstring.h @@ -2,14 +2,19 @@ #include "runtime/kphp_core.h" +/** + * Check if strings are valid for the specified encoding + * @param value The byte stream + * @param encoding The expected encoding + * @return Returns true on success or false on failure + */ +bool f$mb_check_encoding(const string &value, const string &encoding); + /** * Convert a string from one character encoding to another * @param str The string to be converted * @param from The desired encoding of the result * @param to The current encoding used to interpret string * @return The encoded string - * TODO!: mb_check_encoding(str, from) inside - * TODO!: own constants for encodings - * TODO: issue for timelib */ string f$mb_convert_encoding(const string &str, const string &to, const string &from); \ No newline at end of file diff --git a/tests/cpp/runtime/mbstring-test.cpp b/tests/cpp/runtime/mbstring-test.cpp new file mode 100644 index 0000000000..42ab014a48 --- /dev/null +++ b/tests/cpp/runtime/mbstring-test.cpp @@ -0,0 +1,18 @@ +#include +#include "runtime/mbstring/mbstring.h" + +#ifdef MBFL +/* TODO: make fun strings for tests */ +TEST(mbstring_test, test_mb_check_encoding) { + ASSERT_TRUE(f$mb_check_encoding(string("sdf"), string("Windows-1251"))); + ASSERT_TRUE(f$mb_check_encoding(string("ыва"), string("Windows-1251"))); + ASSERT_TRUE(f$mb_check_encoding(string("İnanç Esasları"), string("UTF-8"))); + ASSERT_TRUE(f$mb_check_encoding(string("İnanç Esasları"), string("Windows-1251"))); + ASSERT_FALSE(f$mb_check_encoding(string("İnanç Esasları"), string("ASCII"))); +} +TEST(mbstring_test, test_mb_convert_encoding) { + ASSERT_STREQ(f$mb_convert_encoding(string("Hello"), string("UTF-8"), 
string("EUC-KR")).c_str(), "Hello"); + ASSERT_STREQ(f$mb_convert_encoding(string("ыавыа"), string("UTF-8"), string("Windows-1251")).c_str(), "ыавыа"); + ASSERT_STREQ(f$mb_convert_encoding(string("ыва"), string("UTF-8"), string("ASCII")).c_str(), "??????"); +} +#endif \ No newline at end of file diff --git a/tests/cpp/runtime/runtime-tests.cmake b/tests/cpp/runtime/runtime-tests.cmake index 88d4255228..aea2f10c3d 100644 --- a/tests/cpp/runtime/runtime-tests.cmake +++ b/tests/cpp/runtime/runtime-tests.cmake @@ -1,26 +1,27 @@ prepend(RUNTIME_TESTS_SOURCES ${BASE_DIR}/tests/cpp/runtime/ - _runtime-tests-env.cpp - allocator-malloc-replacement-test.cpp - array-test.cpp - common-php-functions-test.cpp - confdata-functions-test.cpp - confdata-key-maker-test.cpp - confdata-predefined-wildcards-test.cpp - flex-test.cpp - inter-process-mutex-test.cpp - inter-process-resource-test.cpp - json-writer-test.cpp - number-string-comparison.cpp - kphp-type-traits-test.cpp - msgpack-test.cpp - memory_resource/details/memory_chunk_list-test.cpp - memory_resource/details/memory_chunk_tree-test.cpp - memory_resource/details/memory_ordered_chunk_list-test.cpp - memory_resource/extra-memory-pool-test.cpp - memory_resource/unsynchronized_pool_resource-test.cpp - string-list-test.cpp - string-test.cpp - zstd-test.cpp) + _runtime-tests-env.cpp + allocator-malloc-replacement-test.cpp + array-test.cpp + common-php-functions-test.cpp + confdata-functions-test.cpp + confdata-key-maker-test.cpp + confdata-predefined-wildcards-test.cpp + flex-test.cpp + inter-process-mutex-test.cpp + inter-process-resource-test.cpp + json-writer-test.cpp + number-string-comparison.cpp + kphp-type-traits-test.cpp + msgpack-test.cpp + memory_resource/details/memory_chunk_list-test.cpp + memory_resource/details/memory_chunk_tree-test.cpp + memory_resource/details/memory_ordered_chunk_list-test.cpp + memory_resource/extra-memory-pool-test.cpp + memory_resource/unsynchronized_pool_resource-test.cpp + 
string-list-test.cpp + string-test.cpp + zstd-test.cpp + mbstring-test.cpp) allow_deprecated_declarations_for_apple(${BASE_DIR}/tests/cpp/runtime/inter-process-mutex-test.cpp) vk_add_unittest(runtime "${RUNTIME_LIBS};${RUNTIME_LINK_TEST_LIBS}" ${RUNTIME_TESTS_SOURCES}) From 8b3d40e04d17e5fb8e4bf54ccc18c0626b7478a4 Mon Sep 17 00:00:00 2001 From: Andrey Arutiunian Date: Thu, 20 Apr 2023 03:51:32 +0300 Subject: [PATCH 10/27] fix external libs --- builtin-functions/_functions.txt | 9 +-------- compiler/compiler-settings.cpp | 8 ++------ 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/builtin-functions/_functions.txt b/builtin-functions/_functions.txt index c1c2616ff2..b789a477a7 100644 --- a/builtin-functions/_functions.txt +++ b/builtin-functions/_functions.txt @@ -730,14 +730,6 @@ function setlocale ($category ::: int, $locale ::: string) ::: string | false; function iconv ($input_encoding ::: string, $output_encoding ::: string, $input_str ::: string) ::: string | false; -function mb_check_encoding ($str ::: string, $encoding ::: string = "1251") ::: bool; -function mb_strlen ($str ::: string, $encoding ::: string = "1251") ::: int; -function mb_strpos ($haystack ::: string, $needle ::: string, $offset ::: int = 0, $encoding ::: string = "1251") ::: int | false; -function mb_stripos ($haystack ::: string, $needle ::: string, $offset ::: int = 0, $encoding ::: string = "1251") ::: int | false; -function mb_strtolower ($str ::: string, $encoding ::: string = "1251") ::: string; -function mb_strtoupper ($str ::: string, $encoding ::: string = "1251") ::: string; -function mb_substr ($str ::: string, $start ::: int, $length ::: mixed = PHP_INT_MAX, $encoding ::: string = "1251") ::: string; - define('PHP_ROUND_HALF_UP', 123423141); define('PHP_ROUND_HALF_DOWN', 123423144); define('PHP_ROUND_HALF_EVEN', 123423145); @@ -1622,3 +1614,4 @@ class DateTimeImmutable implements DateTimeInterface { function getenv(string $varname = '', bool $local_only = false): 
mixed; function mb_convert_encoding(string $str, string $to, string $from): string; +function mb_check_encoding(string $str, string $encoding): bool; \ No newline at end of file diff --git a/compiler/compiler-settings.cpp b/compiler/compiler-settings.cpp index b6cc784042..0368c3d946 100644 --- a/compiler/compiler-settings.cpp +++ b/compiler/compiler-settings.cpp @@ -320,7 +320,7 @@ void CompilerSettings::init() { ld_flags.value_ = extra_ld_flags.get(); append_curl(cxx_default_flags, ld_flags.value_); append_apple_options(cxx_default_flags, ld_flags.value_); - std::vector external_static_libs{"pcre", "re2", "yaml-cpp", "h3", "z", "zstd", "nghttp2", "kphp-timelib", "libmbfl"}; + std::vector external_static_libs{"pcre", "re2", "yaml-cpp", "h3", "z", "zstd", "nghttp2", "kphp-timelib"}; #ifdef KPHP_TIMELIB_LIB_DIR ld_flags.value_ += " -L" KPHP_TIMELIB_LIB_DIR; @@ -332,12 +332,8 @@ void CompilerSettings::init() { #endif #ifdef LIBMBFL_LIB_DIR + external_static_libs.emplace_back("libmbfl"); ld_flags.value_ += " -L" LIBMBFL_LIB_DIR; -#else - // kphp-timelib is usually installed in /usr/local/lib; - // LDD may not find a library in /usr/local/lib if we don't add it here - // TODO: can we avoid this hardcoded library path? 
- ld_flags.value_ += " -L /usr/local/lib"; #endif #if defined(__APPLE__) && defined(__arm64__) From 7e137cb9d8115e18b077dee873538f26e6b7e8bc Mon Sep 17 00:00:00 2001 From: Andrey Arutiunian Date: Thu, 20 Apr 2023 15:18:44 +0300 Subject: [PATCH 11/27] move kphp-timelib installation to the top and add explanatory comment --- cmake/external-libraries.cmake | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/cmake/external-libraries.cmake b/cmake/external-libraries.cmake index 4ec35b9cb3..715f4e6028 100644 --- a/cmake/external-libraries.cmake +++ b/cmake/external-libraries.cmake @@ -9,6 +9,23 @@ function(handle_missing_library LIB_NAME) endif() endfunction() +find_library(KPHP_TIMELIB kphp-timelib) +if(KPHP_TIMELIB) + add_library(kphp-timelib STATIC IMPORTED ${KPHP_TIMELIB}) +else() + handle_missing_library("kphp-timelib") + FetchContent_Declare(kphp-timelib GIT_REPOSITORY https://github.com/VKCOM/timelib) + message(STATUS "---------------------") + FetchContent_MakeAvailable(kphp-timelib) + include_directories(${kphp-timelib_SOURCE_DIR}/include) + add_definitions(-DKPHP_TIMELIB_LIB_DIR="${kphp-timelib_SOURCE_DIR}/objs") + add_link_options(-L${kphp-timelib_SOURCE_DIR}/objs) +endif() + +# '-Wno-redundant-move' flag works for C++/ObjC++ but not for C, +# so build C libraries above +add_compile_options(-Wno-redundant-move) + find_package(fmt QUIET) if(NOT fmt_FOUND) handle_missing_library("fmtlib") @@ -41,21 +58,6 @@ if(KPHP_TESTS) endif() endif() -find_library(KPHP_TIMELIB kphp-timelib) -if(KPHP_TIMELIB) - add_library(kphp-timelib STATIC IMPORTED ${KPHP_TIMELIB}) -else() - handle_missing_library("kphp-timelib") - FetchContent_Declare(kphp-timelib GIT_REPOSITORY https://github.com/VKCOM/timelib) - message(STATUS "---------------------") - FetchContent_MakeAvailable(kphp-timelib) - include_directories(${kphp-timelib_SOURCE_DIR}/include) - add_definitions(-DKPHP_TIMELIB_LIB_DIR="${kphp-timelib_SOURCE_DIR}/objs") - 
add_link_options(-L${kphp-timelib_SOURCE_DIR}/objs) -endif() - -add_compile_options(-Wno-redundant-move) - if(APPLE) if (DEFINED ENV{EPOLL_SHIM_REPO}) FetchContent_Declare( From 5aff222bd91751758bde852a0f15b843b38f00f3 Mon Sep 17 00:00:00 2001 From: Andrey Arutiunian Date: Thu, 20 Apr 2023 15:23:55 +0300 Subject: [PATCH 12/27] move kphp-timelib installation to the top and add explanatory comment --- cmake/external-libraries.cmake | 52 ++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/cmake/external-libraries.cmake b/cmake/external-libraries.cmake index 8266b44b7b..cac16d83af 100644 --- a/cmake/external-libraries.cmake +++ b/cmake/external-libraries.cmake @@ -11,6 +11,33 @@ function(handle_missing_library LIB_NAME) endif() endfunction() +find_library(KPHP_TIMELIB kphp-timelib) +if(KPHP_TIMELIB) + add_library(kphp-timelib STATIC IMPORTED ${KPHP_TIMELIB}) +else() + handle_missing_library("kphp-timelib") + FetchContent_Declare(kphp-timelib GIT_REPOSITORY https://github.com/VKCOM/timelib) + message(STATUS "---------------------") + FetchContent_MakeAvailable(kphp-timelib) + include_directories(${kphp-timelib_SOURCE_DIR}/include) + add_definitions(-DKPHP_TIMELIB_LIB_DIR="${kphp-timelib_SOURCE_DIR}/objs") + add_link_options(-L${kphp-timelib_SOURCE_DIR}/objs) +endif() + +if(MBFL) + message(STATUS "MBFL=On, libmbfl will be downloaded and built") + add_compile_options(-DMBFL) + FetchContent_Declare(libmbfl GIT_REPOSITORY https://github.com/andreylzmw/libmbfl) + FetchContent_MakeAvailable(libmbfl) + include_directories(${libmbfl_SOURCE_DIR}/include) + add_definitions(-DLIBMBFL_LIB_DIR="${libmbfl_SOURCE_DIR}/objs") + add_link_options(-L${libmbfl_SOURCE_DIR}/objs) +endif() + +# '-Wno-redundant-move' flag works for C++/ObjC++ but not for C, +# so build C libraries above +add_compile_options(-Wno-redundant-move) + find_package(fmt QUIET) if(NOT fmt_FOUND) handle_missing_library("fmtlib") @@ -43,31 +70,6 @@ if(KPHP_TESTS) endif() 
endif() -find_library(KPHP_TIMELIB kphp-timelib) -if(KPHP_TIMELIB) - add_library(kphp-timelib STATIC IMPORTED ${KPHP_TIMELIB}) -else() - handle_missing_library("kphp-timelib") - FetchContent_Declare(kphp-timelib GIT_REPOSITORY https://github.com/VKCOM/timelib) - message(STATUS "---------------------") - FetchContent_MakeAvailable(kphp-timelib) - include_directories(${kphp-timelib_SOURCE_DIR}/include) - add_definitions(-DKPHP_TIMELIB_LIB_DIR="${kphp-timelib_SOURCE_DIR}/objs") - add_link_options(-L${kphp-timelib_SOURCE_DIR}/objs) -endif() - -if(MBFL) - message(STATUS "MBFL=On, libmbfl will be downloaded and built") - add_compile_options(-DMBFL) - FetchContent_Declare(libmbfl GIT_REPOSITORY https://github.com/andreylzmw/libmbfl) - FetchContent_MakeAvailable(libmbfl) - include_directories(${libmbfl_SOURCE_DIR}/include) - add_definitions(-DLIBMBFL_LIB_DIR="${libmbfl_SOURCE_DIR}/objs") - add_link_options(-L${libmbfl_SOURCE_DIR}/objs) -endif() - -add_compile_options(-Wno-redundant-move) - if(APPLE) if (DEFINED ENV{EPOLL_SHIM_REPO}) FetchContent_Declare( From 07af7a09361cdc52ea82782243bc979bf50dc9d9 Mon Sep 17 00:00:00 2001 From: Andrey Arutiunian Date: Sun, 30 Apr 2023 23:24:48 +0300 Subject: [PATCH 13/27] add MBFL flag to runtime, restore mbstring functions for only UTF-8 and Windows-1251 encodings and add runtime declarations of all mbstring functions --- builtin-functions/_functions.txt | 61 ++- cmake/init-compilation-flags.cmake | 1 + compiler/compiler-settings.cpp | 3 + runtime/interface.cpp | 3 + runtime/mbstring.cpp | 51 -- runtime/mbstring.h | 12 - runtime/mbstring/mbstring.cpp | 473 +++++++++++++++++- runtime/mbstring/mbstring.h | 744 +++++++++++++++++++++++++++- runtime/regexp.h | 2 +- runtime/runtime.cmake | 4 +- tests/cpp/runtime/mbstring-test.cpp | 9 +- 11 files changed, 1266 insertions(+), 97 deletions(-) delete mode 100644 runtime/mbstring.cpp delete mode 100644 runtime/mbstring.h diff --git a/builtin-functions/_functions.txt 
b/builtin-functions/_functions.txt index febed462ab..34536b2e98 100644 --- a/builtin-functions/_functions.txt +++ b/builtin-functions/_functions.txt @@ -1617,5 +1617,62 @@ class DateTimeImmutable implements DateTimeInterface { function getenv(string $varname = '', bool $local_only = false): mixed; -function mb_convert_encoding(string $str, string $to, string $from): string; -function mb_check_encoding(string $str, string $encoding): bool; \ No newline at end of file +function mb_check_encoding(array|string $value, ?string $encoding = null): bool; +function mb_convert_encoding(array|string $string, string $to_encoding, array|string|null $from_encoding = null): array|string|false; +function mb_strlen(string $string, ?string $encoding = null): int; +function mb_strpos(string $haystack, string $needle, int $offset = 0, ?string $encoding = null): int|false; +function mb_stripos(string $haystack, string $needle, int $offset = 0, ?string $encoding = null): int|false; +function mb_strtolower(string $string, ?string $encoding = null): string; +function mb_strtoupper(string $string, ?string $encoding = null): string; +function mb_substr(string $string, int $start, ?int $length = null, ?string $encoding = null): string; +function mb_chr(int $codepoint, ?string $encoding = null): string|false; +function mb_convert_case(string $string, int $mode, ?string $encoding = null): string; +function mb_convert_kana(string $string, string $mode = "KV", ?string $encoding = null): string; +function mb_convert_variables(string $to_encoding, array|string $from_encoding, mixed &$vars): string|false; // ??? 
(change variable bytes + kwargs) +function mb_decode_mimeheader(string $string): string; +function mb_decode_numericentity(string $string, array $map, ?string $encoding = null): string; +function mb_detect_encoding(string $string, array|string|null $encodings = null, bool $strict = false): string|false; +function mb_detect_order(array|string|null $encoding = null): array|bool; +function mb_encode_mimeheader(string $string, ?string $charset = null, ?string $transfer_encoding = null, string $newline = "\r\n", int $indent = 0): string; +function mb_encode_numericentity(string $string, array $map, ?string $encoding = null, bool $hex = false): string; +function mb_encoding_aliases(string $encoding): array; +function mb_ereg_match(string $pattern, string $string, ?string $options = null): bool; +function mb_ereg_replace_callback(string $pattern, callable $callback, string $string, ?string $options = null): string|false|null; +function mb_ereg_replace(string $pattern, string $replacement, string $string, ?string $options = null): string|false|null; +function mb_ereg_search_getpos(): int; +function mb_ereg_search_getregs(): array|false; +function mb_ereg_search_init(string $string, ?string $pattern = null, ?string $options = null): bool; +function mb_ereg_search_pos(?string $pattern = null, ?string $options = null): array|false; +function mb_ereg_search_regs(?string $pattern = null, ?string $options = null): array|false; +function mb_ereg_search_setpos(int $offset): bool; +function mb_ereg_search(?string $pattern = null, ?string $options = null): bool; +function mb_ereg(string $pattern, string $string, array &$matches = null): bool; +function mb_eregi_replace(string $pattern, string $replacement, string $string, ?string $options = null): string|false|null; +function mb_eregi(string $pattern, string $string, array &$matches = null): bool; +function mb_get_info(string $type = "all"): array|string|int|false; +function mb_http_input(?string $type = null): array|string|false; 
+function mb_http_output(?string $encoding = null): string|false; +function mb_internal_encoding(?string $encoding = null): string|false; +function mb_language(?string $language = null): string|false; +function mb_list_encodings(): array; +function mb_ord(string $string, ?string $encoding = null): int|false; +function mb_output_handler(string $string, int $status): string; +function mb_parse_str(string $string, array &$result): bool; +function mb_preferred_mime_name(string $encoding): string|false; +function mb_regex_encoding(?string $encoding = null): string|false; +function mb_regex_set_options(?string $options = null): string; +function mb_scrub(string $string, ?string $encoding = null): string; +function mb_send_mail(string $to, string $subject, string $message, array|string $additional_headers = [], ?string $additional_params = null): bool; +function mb_split(string $pattern, string $string, int $limit = -1): array|false; +function mb_str_split(string $string, int $length = 1, ?string $encoding = null): array; +function mb_strcut(string $string, int $start, ?int $length = null, ?string $encoding = null): string; +function mb_strimwidth(string $string, int $start, int $width, string $trim_marker = "", ?string $encoding = null): string; +function mb_stristr(string $haystack, string $needle, bool $before_needle = false, ?string $encoding = null): string|false; +function mb_strrchr(string $haystack, string $needle, bool $before_needle = false, ?string $encoding = null): string|false; +function mb_strrichr(string $haystack, string $needle, bool $before_needle = false, ?string $encoding = null): string|false; +function mb_strripos(string $haystack, string $needle, int $offset = 0, ?string $encoding = null): int|false; +function mb_strrpos(string $haystack, string $needle, int $offset = 0, string $encoding = null): int|false; +function mb_strstr(string $haystack, string $needle, bool $before_needle = false, ?string $encoding = null): string|false; +function 
mb_strwidth(string $string, ?string $encoding = null): int; +function mb_substitute_character(string|int|null $substitute_character = null): string|int|false; +function mb_substr_count(string $haystack, string $needle, ?string $encoding = null): int; \ No newline at end of file diff --git a/cmake/init-compilation-flags.cmake b/cmake/init-compilation-flags.cmake index a3273c84fd..5ed14874bb 100644 --- a/cmake/init-compilation-flags.cmake +++ b/cmake/init-compilation-flags.cmake @@ -76,6 +76,7 @@ if (PDO_DRIVER_PGSQL) add_definitions(-DPDO_DRIVER_PGSQL) add_compile_definitions(PDO_DRIVER_PGSQL_VERSION=${PostgreSQL_VERSION}) endif() + cmake_print_variables(PDO_DRIVER_PGSQL) option(KPHP_TESTS "Build the tests" ON) diff --git a/compiler/compiler-settings.cpp b/compiler/compiler-settings.cpp index 0368c3d946..5a9ab2ede7 100644 --- a/compiler/compiler-settings.cpp +++ b/compiler/compiler-settings.cpp @@ -283,6 +283,9 @@ void CompilerSettings::init() { remove_extra_spaces(extra_cxx_flags.value_); std::stringstream ss; + #ifdef MBFL + ss << " -DMBFL "; + #endif ss << extra_cxx_flags.get(); ss << " -iquote" << kphp_src_path.get() << " -iquote " << kphp_src_path.get() << "objs/generated/auto/runtime"; diff --git a/runtime/interface.cpp b/runtime/interface.cpp index fea37c2618..93522425e8 100644 --- a/runtime/interface.cpp +++ b/runtime/interface.cpp @@ -2380,6 +2380,9 @@ static void free_runtime_libs() { free_migration_php8(); + #ifndef MBFL + free_detect_incorrect_encoding_names(); + #endif vk::singleton::get().reset_buffers(); #ifdef PDO_DRIVER_MYSQL database_drivers::free_mysql_lib(); diff --git a/runtime/mbstring.cpp b/runtime/mbstring.cpp deleted file mode 100644 index bbf8231b47..0000000000 --- a/runtime/mbstring.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// Compiler for PHP (aka KPHP) -// Copyright (c) 2020 LLC «V Kontakte» -// Distributed under the GPL v3 License, see LICENSE.notice.txt - -#include "runtime/mbstring.h" - -#include "common/unicode/unicode-utils.h" -#include 
"common/unicode/utf8-utils.h" - -bool mb_UTF8_check(const char *s) { - do { -#define CHECK(condition) if (!(condition)) {return false;} - unsigned int a = (unsigned char)(*s++); - if ((a & 0x80) == 0) { - if (a == 0) { - return true; - } - continue; - } - - CHECK ((a & 0x40) != 0); - - unsigned int b = (unsigned char)(*s++); - CHECK((b & 0xc0) == 0x80); - if ((a & 0x20) == 0) { - CHECK((a & 0x1e) > 0); - continue; - } - - unsigned int c = (unsigned char)(*s++); - CHECK((c & 0xc0) == 0x80); - if ((a & 0x10) == 0) { - int x = (((a & 0x0f) << 6) | (b & 0x20)); - CHECK(x != 0 && x != 0x360);//surrogates - continue; - } - - unsigned int d = (unsigned char)(*s++); - CHECK((d & 0xc0) == 0x80); - if ((a & 0x08) == 0) { - int t = (((a & 0x07) << 6) | (b & 0x30)); - CHECK(0 < t && t < 0x110);//end of unicode - continue; - } - - return false; -#undef CHECK - } while (true); - - php_assert (0); -} \ No newline at end of file diff --git a/runtime/mbstring.h b/runtime/mbstring.h deleted file mode 100644 index be9aef5b0c..0000000000 --- a/runtime/mbstring.h +++ /dev/null @@ -1,12 +0,0 @@ -// Compiler for PHP (aka KPHP) -// Copyright (c) 2020 LLC «V Kontakte» -// Distributed under the GPL v3 License, see LICENSE.notice.txt - -#pragma once - -#include - -#include "runtime/kphp_core.h" -#include "runtime/string_functions.h" - -bool mb_UTF8_check(const char *s); \ No newline at end of file diff --git a/runtime/mbstring/mbstring.cpp b/runtime/mbstring/mbstring.cpp index 4d1637d59a..82c966f0ad 100644 --- a/runtime/mbstring/mbstring.cpp +++ b/runtime/mbstring/mbstring.cpp @@ -1,5 +1,49 @@ #include "mbstring.h" +bool mb_UTF8_check(const char *s) { + do { +#define CHECK(condition) if (!(condition)) {return false;} + unsigned int a = (unsigned char)(*s++); + if ((a & 0x80) == 0) { + if (a == 0) { + return true; + } + continue; + } + + CHECK ((a & 0x40) != 0); + + unsigned int b = (unsigned char)(*s++); + CHECK((b & 0xc0) == 0x80); + if ((a & 0x20) == 0) { + CHECK((a & 0x1e) > 0); + 
continue; + } + + unsigned int c = (unsigned char)(*s++); + CHECK((c & 0xc0) == 0x80); + if ((a & 0x10) == 0) { + int x = (((a & 0x0f) << 6) | (b & 0x20)); + CHECK(x != 0 && x != 0x360);//surrogates + continue; + } + + unsigned int d = (unsigned char)(*s++); + CHECK((d & 0xc0) == 0x80); + if ((a & 0x08) == 0) { + int t = (((a & 0x07) << 6) | (b & 0x30)); + CHECK(0 < t && t < 0x110);//end of unicode + continue; + } + + return false; +#undef CHECK + } while (true); + + php_assert (0); +} + +#ifdef MBFL extern "C" { #include } @@ -73,24 +117,423 @@ bool check_encoding(const char *value, const char *encoding) { return res; } -bool f$mb_check_encoding(const string &value, const string &encoding) { - const char *c_value = value.c_str(); - const char *c_encoding = encoding.c_str(); +// TODO: check for array as value +mixed f$mb_convert_encoding(const mixed &str, const string &to_encoding, const mixed &from_encoding) { + + if (str.is_string() && from_encoding.is_string()) { + const string &s = str.to_string(); + const string &from = from_encoding.to_string(); + + const char *c_string = s.c_str(); + const char *c_to_encoding = to_encoding.c_str(); + const char *c_from_encoding = from.c_str(); + + /* perform convertion */ + mbfl_string *ret = convert_encoding(c_string, c_to_encoding, c_from_encoding); + string res = string((const char*)ret->val, ret->len); + + /* check if string represents in from_encoding, magic number 63 - '?' 
in ASCII */ + if (!check_encoding(c_string, c_from_encoding)) res = string(strlen(c_string), (char)63); + + return res; + } + return 0; +} + +// TODO: check for optional value +bool f$mb_check_encoding(const mixed &value, const Optional &encoding) { + const string &val = value.to_string(); + const string &enc = encoding.val(); + const char *c_value = val.c_str(); + const char *c_encoding = enc.c_str(); return check_encoding(c_value, c_encoding); } -string f$mb_convert_encoding(const string &str, const string &to_encoding, const string &from_encoding) { +#else - const char *c_string = str.c_str(); - const char *c_to_encoding = to_encoding.c_str(); - const char *c_from_encoding = from_encoding.c_str(); +#include "common/unicode/unicode-utils.h" +#include "common/unicode/utf8-utils.h" - /* perform convertion */ - mbfl_string *ret = convert_encoding(c_string, c_to_encoding, c_from_encoding); - string res = string((const char*)ret->val, ret->len); +static bool is_detect_incorrect_encoding_names_warning{false}; - /* check if string represents in from_encoding, magic number 63 - '?' 
in ASCII */ - if (!check_encoding(c_string, c_from_encoding)) res = string(strlen(c_string), (char)63); - - return res; -} \ No newline at end of file +void f$set_detect_incorrect_encoding_names_warning(bool show) { + is_detect_incorrect_encoding_names_warning = show; +} + +void free_detect_incorrect_encoding_names() { + is_detect_incorrect_encoding_names_warning = false; +} + +static int mb_detect_encoding_new(const string &encoding) { + const auto encoding_name = f$strtolower(encoding).c_str(); + + if (!strcmp(encoding_name, "cp1251") || !strcmp(encoding_name, "cp-1251") || !strcmp(encoding_name, "windows-1251")) { + return 1251; + } + + if (!strcmp(encoding_name, "utf8") || !strcmp(encoding_name, "utf-8")) { + return 8; + } + + return -1; +} + +static int mb_detect_encoding(const string &encoding) { + const int result_new = mb_detect_encoding_new(encoding); + + if (strstr(encoding.c_str(), "1251")) { + if (is_detect_incorrect_encoding_names_warning && 1251 != result_new) { + php_warning("mb_detect_encoding returns 1251, but new will return %d, encoding %s", result_new, encoding.c_str()); + } + return 1251; + } + if (strstr(encoding.c_str(), "-8")) { + if (is_detect_incorrect_encoding_names_warning && 8 != result_new) { + php_warning("mb_detect_encoding returns 8, but new will return %d, encoding %s", result_new, encoding.c_str()); + } + return 8; + } + + if (is_detect_incorrect_encoding_names_warning && -1 != result_new) { + php_warning("mb_detect_encoding returns -1, but new will return %d, encoding %s", result_new, encoding.c_str()); + } + return -1; +} + +static int64_t mb_UTF8_strlen(const char *s) { + int64_t res = 0; + for (int64_t i = 0; s[i]; i++) { + if ((((unsigned char)s[i]) & 0xc0) != 0x80) { + res++; + } + } + return res; +} + +static int64_t mb_UTF8_advance(const char *s, int64_t cnt) { + php_assert (cnt >= 0); + int64_t i; + for (i = 0; s[i] && cnt >= 0; i++) { + if ((((unsigned char)s[i]) & 0xc0) != 0x80) { + cnt--; + } + } + if (cnt < 0) { + 
i--; + } + return i; +} + +static int64_t mb_UTF8_get_offset(const char *s, int64_t pos) { + int64_t res = 0; + for (int64_t i = 0; i < pos && s[i]; i++) { + if ((((unsigned char)s[i]) & 0xc0) != 0x80) { + res++; + } + } + return res; +} + +bool mb_UTF8_check(const char *s) { + do { +#define CHECK(condition) if (!(condition)) {return false;} + unsigned int a = (unsigned char)(*s++); + if ((a & 0x80) == 0) { + if (a == 0) { + return true; + } + continue; + } + + CHECK ((a & 0x40) != 0); + + unsigned int b = (unsigned char)(*s++); + CHECK((b & 0xc0) == 0x80); + if ((a & 0x20) == 0) { + CHECK((a & 0x1e) > 0); + continue; + } + + unsigned int c = (unsigned char)(*s++); + CHECK((c & 0xc0) == 0x80); + if ((a & 0x10) == 0) { + int x = (((a & 0x0f) << 6) | (b & 0x20)); + CHECK(x != 0 && x != 0x360);//surrogates + continue; + } + + unsigned int d = (unsigned char)(*s++); + CHECK((d & 0xc0) == 0x80); + if ((a & 0x08) == 0) { + int t = (((a & 0x07) << 6) | (b & 0x30)); + CHECK(0 < t && t < 0x110);//end of unicode + continue; + } + + return false; +#undef CHECK + } while (true); + + php_assert (0); +} + +bool f$mb_check_encoding(const string &str, const string &encoding) { + int encoding_num = mb_detect_encoding(encoding); + if (encoding_num < 0) { + php_critical_error ("encoding \"%s\" doesn't supported in mb_check_encoding", encoding.c_str()); + return !str.empty(); + } + + if (encoding_num == 1251) { + return true; + } + + return mb_UTF8_check(str.c_str()); +} + + +int64_t f$mb_strlen(const string &str, const string &encoding) { + int encoding_num = mb_detect_encoding(encoding); + if (encoding_num < 0) { + php_critical_error ("encoding \"%s\" doesn't supported in mb_strlen", encoding.c_str()); + return str.size(); + } + + if (encoding_num == 1251) { + return str.size(); + } + + return mb_UTF8_strlen(str.c_str()); +} + + +string f$mb_strtolower(const string &str, const string &encoding) { + int encoding_num = mb_detect_encoding(encoding); + if (encoding_num < 0) { + 
php_critical_error ("encoding \"%s\" doesn't supported in mb_strtolower", encoding.c_str()); + return str; + } + + int len = str.size(); + if (encoding_num == 1251) { + string res(len, false); + for (int i = 0; i < len; i++) { + switch ((unsigned char)str[i]) { + case 'A' ... 'Z': + res[i] = (char)(str[i] + 'a' - 'A'); + break; + case 0xC0 ... 0xDF: + res[i] = (char)(str[i] + 32); + break; + case 0x81: + res[i] = (char)0x83; + break; + case 0xA3: + res[i] = (char)0xBC; + break; + case 0xA5: + res[i] = (char)0xB4; + break; + case 0xA1: + case 0xB2: + case 0xBD: + res[i] = (char)(str[i] + 1); + break; + case 0x80: + case 0x8A: + case 0x8C ... 0x8F: + case 0xA8: + case 0xAA: + case 0xAF: + res[i] = (char)(str[i] + 16); + break; + default: + res[i] = str[i]; + } + } + + return res; + } else { + string res(len * 3, false); + const char *s = str.c_str(); + int res_len = 0; + int p; + int ch; + while ((p = get_char_utf8(&ch, s)) > 0) { + s += p; + res_len += put_char_utf8(unicode_tolower(ch), &res[res_len]); + } + if (p < 0) { + php_warning("Incorrect UTF-8 string \"%s\" in function mb_strtolower", str.c_str()); + } + res.shrink(res_len); + + return res; + } +} + +string f$mb_strtoupper(const string &str, const string &encoding) { + int encoding_num = mb_detect_encoding(encoding); + if (encoding_num < 0) { + php_critical_error ("encoding \"%s\" doesn't supported in mb_strtoupper", encoding.c_str()); + return str; + } + + int len = str.size(); + if (encoding_num == 1251) { + string res(len, false); + for (int i = 0; i < len; i++) { + switch ((unsigned char)str[i]) { + case 'a' ... 'z': + res[i] = (char)(str[i] + 'A' - 'a'); + break; + case 0xE0 ... 
0xFF: + res[i] = (char)(str[i] - 32); + break; + case 0x83: + res[i] = (char)(0x81); + break; + case 0xBC: + res[i] = (char)(0xA3); + break; + case 0xB4: + res[i] = (char)(0xA5); + break; + case 0xA2: + case 0xB3: + case 0xBE: /* upper-case form is the previous byte value */ + res[i] = (char)(str[i] - 1); + break; + case 0x98: + case 0xA0: + case 0xAD: /* these three byte values are replaced by an ASCII space instead of being case-mapped */ + res[i] = ' '; + break; + case 0x90: + case 0x9A: + case 0x9C ... 0x9F: + case 0xB8: + case 0xBA: + case 0xBF: /* upper-case form is 16 positions lower */ + res[i] = (char)(str[i] - 16); + break; + default: + res[i] = str[i]; + } + } + + return res; + } else { + string res(len * 3, false); /* scratch buffer of three bytes per input byte; trimmed to res_len by shrink() below */ + const char *s = str.c_str(); + int res_len = 0; + int p; + int ch; + while ((p = get_char_utf8(&ch, s)) > 0) { + s += p; + res_len += put_char_utf8(unicode_toupper(ch), &res[res_len]); + } + if (p < 0) { /* loop ended on a negative get_char_utf8 result; only a warning is raised and the part converted so far is returned */ + php_warning("Incorrect UTF-8 string \"%s\" in function mb_strtoupper", str.c_str()); + } + res.shrink(res_len); + + return res; + } +} + +namespace { + +int check_strpos_agrs(const char *func_name, const string &needle, int64_t offset, const string &encoding) noexcept { /* shared argument validation for mb_strpos/mb_stripos: returns the encoding number, or 0 after reporting a negative offset, an empty needle or an unsupported encoding */ + if (unlikely(offset < 0)) { + php_warning("Wrong offset = %" PRIi64 " in function %s()", offset, func_name); + return 0; + } + if (unlikely(needle.empty())) { + php_warning("Parameter needle is empty in function %s()", func_name); + return 0; + } + + const int encoding_num = mb_detect_encoding(encoding); + if (unlikely(encoding_num < 0)) { + php_critical_error ("encoding \"%s\" doesn't supported in %s()", encoding.c_str(), func_name); + return 0; + } + return encoding_num; +} + +Optional mp_strpos_impl(const string &haystack, const string &needle, int64_t offset, int encoding_num) noexcept { /* encoding 1251 delegates to f$strpos; otherwise the search is a byte-wise memmem starting at the byte position that mb_UTF8_advance returns for offset (presumably a character-to-byte conversion; its body is outside this view) */ + if (encoding_num == 1251) { + return f$strpos(haystack, needle, offset); + } + + int64_t UTF8_offset = mb_UTF8_advance(haystack.c_str(), offset); + const char *s = static_cast(memmem(haystack.c_str() + UTF8_offset, haystack.size() - UTF8_offset, needle.c_str(), needle.size())); + if (unlikely(s == nullptr)) { /* needle not found */ + return false; + } + return
mb_UTF8_get_offset(haystack.c_str() + UTF8_offset, s - (haystack.c_str() + UTF8_offset)) + offset; /* characters between the search start and the match, plus the characters skipped by offset */ +} + +} // namespace + +Optional f$mb_strpos(const string &haystack, const string &needle, int64_t offset, const string &encoding) noexcept { /* a zero result from check_strpos_agrs means the arguments were rejected, so false is returned */ + if (const int encoding_num = check_strpos_agrs("mb_strpos", needle, offset, encoding)) { + return mp_strpos_impl(haystack, needle, offset, encoding_num); + } + return false; +} + +Optional f$mb_stripos(const string &haystack, const string &needle, int64_t offset, const string &encoding) noexcept { /* case-insensitive search: haystack and needle are both lower-cased with f$mb_strtolower before the lookup */ + if (const int encoding_num = check_strpos_agrs("mb_stripos", needle, offset, encoding)) { + return mp_strpos_impl(f$mb_strtolower(haystack, encoding), f$mb_strtolower(needle, encoding), offset, encoding_num); + } + return false; +} + +string f$mb_substr(const string &str, int64_t start, const mixed &length_var, const string &encoding) { /* substring by character positions: a null length means up to the end, a negative start counts from the end, a negative length leaves that many characters off the end */ + int encoding_num = mb_detect_encoding(encoding); + if (encoding_num < 0) { + php_critical_error ("encoding \"%s\" doesn't supported in mb_substr", encoding.c_str()); + return str; + } + + int64_t length; + if (length_var.is_null()) { + length = std::numeric_limits::max(); + } else { + length = length_var.to_int(); + } + + if (encoding_num == 1251) { /* single-byte encoding: plain f$substr; a result without a value becomes an empty string */ + Optional res = f$substr(str, start, length); + if (!res.has_value()) { + return {}; + } + return res.val(); + } + + int64_t len = mb_UTF8_strlen(str.c_str()); + if (start < 0) { + start += len; + } + if (start > len) { + start = len; + } + if (length < 0) { + length = len - start + length; + } + if (length <= 0 || start < 0) { /* nothing left to return, or a negative start that still points before the beginning */ + return {}; + } + if (len - start < length) { + length = len - start; + } + + int64_t UTF8_start = mb_UTF8_advance(str.c_str(), start); /* character counts are turned into byte offsets for the final slice */ + int64_t UTF8_length = mb_UTF8_advance(str.c_str() + UTF8_start, length); + + return {str.c_str() + UTF8_start, static_cast(UTF8_length)}; +} + +#endif \ No newline at end of file diff --git a/runtime/mbstring/mbstring.h b/runtime/mbstring/mbstring.h index a2154176a5..52f295e3d4 100644 ---
a/runtime/mbstring/mbstring.h +++ b/runtime/mbstring/mbstring.h @@ -1,20 +1,744 @@ #pragma once #include "runtime/kphp_core.h" +#include "common/type_traits/function_traits.h" +#include "common/vector-product.h" + +#include "runtime/kphp_core.h" +#include "runtime/math_functions.h" +#include "runtime/string_functions.h" + +bool mb_UTF8_check(const char *s); + +#ifdef MBFL /** * Check if strings are valid for the specified encoding - * @param value The byte stream - * @param encoding The expected encoding - * @return Returns true on success or false on failure + * Checks if the specified byte stream is valid for the specified encoding. If value is of type array, all keys and values are validated recursively. + * It is useful to prevent so-called "Invalid Encoding Attack". + * @param array|string value The byte stream + * @param ?string encoding (default = null) The expected encoding + * @return bool Returns true on success or false on failure + */ +bool f$mb_check_encoding(const mixed &value, const Optional &encoding); + +/** + * Returns a string containing the character specified by the Unicode code point value, encoded in the specified encoding + * @param int codepoint A Unicode codepoint value, e.g. 128024 for U+1F418 ELEPHANT + * @param ?string encoding (default = null) The encoding parameter is the character encoding. If it is omitted or null, + * the internal character encoding value will be used. + * @return string|false A string containing the requested character, if it can be represented in the specified encoding or false on failure. + */ +Optional f$mb_chr(const int64_t codepoint, const Optional &encoding); + +/** + * Perform case folding on a string + * @param string str The string being converted + * @param int mode The mode of the conversion. 
It can be one of MB_CASE_UPPER, MB_CASE_LOWER, MB_CASE_TITLE, MB_CASE_FOLD, + * MB_CASE_UPPER_SIMPLE, MB_CASE_LOWER_SIMPLE, MB_CASE_TITLE_SIMPLE, MB_CASE_FOLD_SIMPLE + * @param ?string encoding (default = null) The encoding parameter is the character encoding. If it is omitted or null, + * the internal character encoding value will be used. + * @return string A case folded version of string converted in the way specified by mode + */ +string f$mb_convert_case(const string &str, const int64_t mode, const Optional &encoding); + +/** + * Convert from one character encoding to another + * @param array|string str The string or array to be converted + * @param string to_encoding The desired encoding of the result + * @param array|string|null from_encoding (default = null) The current encoding used to interpret string. + * Multiple encodings may be specified as an array or comma separated list, + * in which case the correct encoding will be guessed using the same algorithm as mb_detect_encoding(). + * If from_encoding is null or not specified, the mbstring.internal_encoding setting will be used if set, otherwise the default_charset setting. 
+ * @return array|string|false The encoded string + */ +mixed f$mb_convert_encoding(const mixed &str, const string &to_encoding, const mixed &from_encoding); + +/** + * Convert "kana" from one form to another ("zen-kaku", "han-kaku" and more) + * @param string str The string being converted + * @param string mode The conversion option (default = "KV") + * r - Convert "zen-kaku" alphabets to "han-kaku" + * R - Convert "han-kaku" alphabets to "zen-kaku" + * n - Convert "zen-kaku" numbers to "han-kaku" + * N - Convert "han-kaku" numbers to "zen-kaku" + * a - Convert "zen-kaku" alphabets and numbers to "han-kaku" + * A - Convert "han-kaku" alphabets and numbers to "zen-kaku" + * (Characters included in "a", "A" options are U+0021 - U+007E excluding U+0022, U+0027, U+005C, U+007E) + * s - Convert "zen-kaku" space to "han-kaku" (U+3000 -> U+0020) + * S - Convert "han-kaku" space to "zen-kaku" (U+0020 -> U+3000) + * k - Convert "zen-kaku kata-kana" to "han-kaku kata-kana" + * K - Convert "han-kaku kata-kana" to "zen-kaku kata-kana" + * h - Convert "zen-kaku hira-gana" to "han-kaku kata-kana" + * H - Convert "han-kaku kata-kana" to "zen-kaku hira-gana" + * c - Convert "zen-kaku kata-kana" to "zen-kaku hira-gana" + * C - Convert "zen-kaku hira-gana" to "zen-kaku kata-kana" + * V - Collapse voiced sound notation and convert them into a character. Use with "K","H" + * @param ?string encoding (default = null) The encoding parameter is the character encoding. + * If it is omitted or null, the internal character encoding value will be used. + * @return string The converted string + */ +string f$mb_convert_kana(const string &str, const string &mode, const Optional &encoding); + +/** + * Convert character code in variable(s) + * @param string to_encoding The encoding that the string is being converted to + * @param array|string from_encoding The source encoding. If it is given as an array or comma separated string, the encoding is detected from those candidates.
+ * When from_encoding is omitted, detect_order is used. + * @param mixed &vars References to the variable being converted. String, Array are accepted. mb_convert_variables() assumes + * all parameters have the same encoding. + * @return string|false The character encoding before conversion for success, or false for failure + */ +Optional f$mb_convert_variables(const string &to_encoding, const mixed &from_encoding, const mixed &vars); // ??? + +/** + * Decode string in MIME header field + * @param string str The string being decoded + * @return string The decoded string in internal character encoding + */ +string f$mb_decode_mimeheader(const string &string); + +/** + * Decode HTML numeric string reference to character + * @param string str The string being decoded + * @param array map An array that specifies the code area to convert + * @param ?string encoding (default = null) The encoding parameter is the character encoding. + * If it is omitted or null, the internal character encoding value will be used. + * @return string The converted string + */ +string f$mb_decode_numericentity(const string &str, const array &map, const Optional &encoding); + +/** + * Detect character encoding + * Detects the most likely character encoding for string string from an ordered list of candidates. Automatic detection of the intended character encoding + * can never be entirely reliable; without some additional information, it is similar to decoding an encrypted string without the key. It is always preferable + * to use an indication of character encoding stored or transmitted with the data, such as a "Content-Type" HTTP header. This function is most useful with + * multibyte encodings, where not all sequences of bytes form a valid string. If the input string contains such a sequence, that encoding will be rejected, + * and the next encoding checked. 
+ * @param string str The string being inspected + * @param array|string|null encodings (default = null) A list of character encodings to try, in order. The list may be specified as an array of strings, + * or a single string separated by commas. If encodings is omitted or null, the current detect_order (set with the mbstring.detect_order configuration option, + * or mb_detect_order() function) will be used. + * @param bool strict (default = false) Controls the behaviour when string is not valid in any of the listed encodings. + * If strict is set to false, the closest matching encoding will be returned; if strict is set to true, false will be returned. + * @return string|false The detected character encoding, or false if the string is not valid in any of the listed encodings + * (when strict is false, the closest matching encoding is returned instead of false). The default value for strict can be set + * with the mbstring.strict_detection configuration option. + */ +Optional f$mb_detect_encoding(const string &str, const mixed &encodings, const bool strict = false); + +/** + * Set/Get character encoding detection order + * @param array|string|null encoding (default = null) encoding is an array or comma separated list of character encodings. See supported encodings. + * If encoding is omitted or null, it returns the current character encoding detection order as array. This setting affects + * mb_detect_encoding() and mb_send_mail(). + * @return array|bool When setting the encoding detection order, true is returned on success or false on failure. + * When getting the encoding detection order, an ordered array of the encodings is returned. + */ +mixed f$mb_detect_order(const mixed &encoding); + +/** + * Encode string for MIME header + * @param string str The string being encoded.
Its encoding should be the same as mb_internal_encoding() + * @param ?string charset (default = null) Specifies the name of the character set in which string is represented. + * The default value is determined by the current NLS setting (mbstring.language) + * @param ?string transfer_encoding (default = null) Specifies the scheme of MIME encoding. + * It should be either "B" (Base64) or "Q" (Quoted-Printable). Falls back to "B" if not given. + * @param string newline (default = "\r\n") Specifies the EOL (end-of-line) marker with which mb_encode_mimeheader() performs line-folding + * (an RFC term: the act of breaking a line longer than a certain length into multiple lines. The length is currently hard-coded to 74 characters). + * Falls back to "\r\n" (CRLF) if not given. + * @param int indent (default = 0) Indentation of the first line (number of characters in the header before string) + * @return string A converted version of the string represented in ASCII + */ +string f$mb_encode_mimeheader(const string &str, const Optional &charset, const Optional &transfer_encoding, const string &newline, const int64_t indent); + +/** + * Encode character to HTML numeric string reference + * Converts specified character codes in string string from character code to HTML numeric character reference + * @param string str The string being encoded + * @param array map An array that specifies the code area to convert + * @param ?string encoding (default = null) The encoding parameter is the character encoding.
If it is omitted or null, the internal character encoding value will be used + * @param bool hex (default = false) Whether the returned entity reference should be in hexadecimal notation (otherwise it is in decimal notation) + * @return string The converted string + */ +string f$mb_encode_numericentity(const string &str, const array &map, const Optional &encoding, const bool hex = false); + +/** + * Get aliases of a known encoding type + * @param string encoding The encoding type being checked, for aliases + * @return array Returns a numerically indexed array of encoding aliases + */ +array f$mb_encoding_aliases(const string &encoding); + +/** + * Regular expression match for multibyte string + * @param string pattern The regular expression pattern + * @param string str The string being evaluated + * @param ?string options (default = null) The search option. See mb_regex_set_options() for explanation + * @return bool Returns true if string matches the regular expression pattern, false if not + */ +bool f$mb_ereg_match(const string &pattern, const string &str, const Optional &options); + +/** + * Perform a regular expression search and replace with multibyte support using a callback + * Scans string for matches to pattern, then replaces the matched text with the output of callback function. + * The behavior of this function is almost identical to mb_ereg_replace(), except for the fact that instead of replacement parameter, + * one should specify a callback. + * @param string pattern The regular expression pattern. Multibyte characters may be used in pattern. + * @param callable callback A callback that will be called and passed an array of matched elements in the subject string. + * The callback should return the replacement string. You'll often need the callback function for a mb_ereg_replace_callback() in just one place. + * In this case you can use an anonymous function to declare the callback within the call to mb_ereg_replace_callback(). 
+ * By doing it this way you have all information for the call in one place and do not clutter the function namespace with a callback + * function's name not used anywhere else. + * @param string str The string being checked + * @param ?string options (default = null) The search option. See mb_regex_set_options() for explanation + * @return string|false|null The resultant string on success, or false on error. If string is not valid for the current encoding, null is returned */ -bool f$mb_check_encoding(const string &value, const string &encoding); +// Optional f$mb_ereg_replace_callback(const string &pattern, const CallableT &callback, const string &str, const Optional options); // callback /** - * Convert a string from one character encoding to another - * @param str The string to be converted - * @param from The desired encoding of the result - * @param to The current encoding used to interpret string - * @return The encoded string + * Replace regular expression with multibyte support + * Scans string for matches to pattern, then replaces the matched text with replacement + * @param string pattern The regular expression pattern. Multibyte characters may be used in pattern + * @param string replacement The replacement text + * @param string str The string being checked + * @param ?string options (default = null) The search option. See mb_regex_set_options() for explanation + * @return string|false|null The resultant string on success, or false on error. 
If string is not valid for the current encoding, null is returned */ -string f$mb_convert_encoding(const string &str, const string &to, const string &from); \ No newline at end of file +Optional f$mb_ereg_replace(const string &pattern, const string &replacement, const string &str, const Optional &options); + +/** + * Returns start point for next regular expression match + * @return int mb_ereg_search_getpos() returns the point to start regular expression match for mb_ereg_search(), mb_ereg_search_pos(), mb_ereg_search_regs(). + * The position is represented by bytes from the head of string. + */ +int64_t f$mb_ereg_search_getpos(void); + +/** + * Retrieve the result from the last multibyte regular expression match + * @return array|false An array including the sub-string of matched part by last mb_ereg_search(), mb_ereg_search_pos(), mb_ereg_search_regs(). + * If there are some matches, the first element will have the matched sub-string, the second element will have the first part grouped with brackets, + * the third element will have the second part grouped with brackets, and so on. It returns false on error. + */ +Optional f$mb_ereg_search_getregs(void); + +/** + * Setup string and regular expression for a multibyte regular expression match + * mb_ereg_search_init() sets string and pattern for a multibyte regular expression. + * These values are used for mb_ereg_search(), mb_ereg_search_pos(),and mb_ereg_search_regs(). + * @param string str The search string + * @param ?string pattern (default = null) The search pattern + * @param ?string options (default = null) The search option. 
See mb_regex_set_options() for explanation + * @return bool Returns true on success or false on failure + */ +bool f$mb_ereg_search_init(const string &str, const Optional &pattern, const Optional &options); + +/** + * Returns position and length of a matched part of the multibyte regular expression for a predefined multibyte string + * The string to match is specified by mb_ereg_search_init(). If it is not specified, the previous one will be used + * @param ?string pattern (default = null) The search pattern + * @param ?string options (default = null) The search option. See mb_regex_set_options() for explanation + * @return array|false An array containing two elements. The first element is the offset, in bytes, where the match begins relative to the start of + * the search string, and the second element is the length in bytes of the match. If an error occurs, false is returned. + */ +Optional f$mb_ereg_search_pos(const Optional &pattern, const Optional &options); + +/** + * Returns the matched part of a multibyte regular expression + * @param ?string pattern (default = null) The search pattern + * @param ?string options (default = null) The search option. See mb_regex_set_options() for explanation + * @return array|false mb_ereg_search_regs() executes the multibyte regular expression match, and if there is a matched part, + * it returns an array including substring of matched part as first element, the first grouped part with brackets as second element, + * the second grouped part as third element, and so on. It returns false on error. + */ +Optional f$mb_ereg_search_regs(const Optional &pattern, const Optional &options); + +/** + * Set start point of next regular expression match + * mb_ereg_search_setpos() sets the starting point of a match for mb_ereg_search(). + * @param int offset The position to set.
If it is negative, it counts from the end of the string + * @return bool Returns true on success or false on failure + */ +bool f$mb_ereg_search_setpos(const int64_t offset); + +/** + * Multibyte regular expression match for predefined multibyte string + * @param ?string pattern (default = null) The search pattern + * @param ?string options (default = null) The search option. See mb_regex_set_options() for explanation + * @return bool mb_ereg_search() returns true if the multibyte string matches with the regular expression, or false otherwise. The string for matching + * is set by mb_ereg_search_init(). If pattern is not specified, the previous one is used. + */ +bool f$mb_ereg_search(const Optional &pattern, const Optional &options); + +/** + * Regular expression match with multibyte support + * @param string pattern The search pattern + * @param string str The search string + * @param array matches (default = null) If matches are found for parenthesized substrings of pattern and the function is called with the + * third argument matches, the matches will be stored in the elements of the array matches. If no matches are found, matches is set to an empty array. + * matches[1] will contain the substring which starts at the first left parenthesis; $matches[2] will contain the substring starting at the second, + * and so on. $matches[0] will contain a copy of the complete string matched. + * @return bool Returns whether pattern matches string + */ +bool f$mb_ereg(const string &pattern, const string &str, const array &matches); + +/** + * Replace regular expression with multibyte support ignoring case + * Scans string for matches to pattern, then replaces the matched text with replacement + * @param string pattern The regular expression pattern. Multibyte characters may be used. The case will be ignored + * @param string replacement The replacement text + * @param string str The searched string + * @param ?string options (default = null) The search option. 
See mb_regex_set_options() for explanation + * @return string|false|null The resultant string or false on error. If string is not valid for the current encoding, null is returned + */ +Optional f$mb_eregi_replace(const string &pattern, const string &replacement, const string &str, const Optional &options); + +/** + * Regular expression match ignoring case with multibyte support + * @param string pattern The regular expression pattern + * @param string str The string being searched + * @param array matches (default = null) If matches are found for parenthesized substrings of pattern and the function is called with the third argument matches, + * the matches will be stored in the elements of the array matches. If no matches are found, matches is set to an empty array. + * matches[1] will contain the substring which starts at the first left parenthesis; $matches[2] will contain the substring starting at the second, + * and so on. $matches[0] will contain a copy of the complete string matched. + * @return bool Returns whether pattern matches string + */ +bool f$mb_eregi(const string &pattern, const string &str, const array &matches); + +/** + * Get internal settings of mbstring + * @param string type (default = "all") If type is not specified or is specified as "all", "internal_encoding", "http_input", "http_output", + * "http_output_conv_mimetypes", "mail_charset", "mail_header_encoding", "mail_body_encoding", "illegal_chars", "encoding_translation", "language", + * "detect_order", "substitute_character" and "strict_detection" will be returned. + * If type is specified as "internal_encoding", "http_input", "http_output", "http_output_conv_mimetypes", "mail_charset", "mail_header_encoding", + * "mail_body_encoding", "illegal_chars", "encoding_translation", "language", "detect_order", "substitute_character" or "strict_detection" + * the specified setting parameter will be returned. 
+ * @return array|string|int|false An array of type information if type is not specified, otherwise a specific type, or false on failure + */ +mixed f$mb_get_info(const string &type); + +/** + * Detect HTTP input character encoding + * @param ?string type (default = null) Input string specifies the input type. "G" for GET, "P" for POST, "C" for COOKIE, "S" for string, + * "L" for list, and "I" for the whole list (will return array). If type is omitted, it returns the last input type processed. + * @return array|string|false The character encoding name, as per the type, or an array of character encoding names, if type is "I". + * If mb_http_input() does not process specified HTTP input, it returns false. + */ +mixed f$mb_http_input(const Optional &type); + +/** + * Set/Get the HTTP output character encoding. Output after this function is called will be converted from the set internal encoding to encoding + * @param ?string encoding (default = null) If encoding is set, mb_http_output() sets the HTTP output character encoding to encoding. + * If encoding is omitted, mb_http_output() returns the current HTTP output character encoding. + * @return string|bool If encoding is omitted, mb_http_output() returns the current HTTP output character encoding. Otherwise, + * Returns true on success or false on failure. + */ +mixed f$mb_http_output(const Optional &encoding); + +/** + * Set/Get internal character encoding + * @param ?string encoding (default = null) encoding is the character encoding name used for the HTTP input character encoding conversion, + * HTTP output character encoding conversion, and the default character encoding for string functions defined by the mbstring module. + * You should notice that the internal encoding is totally different from the one for multibyte regex. + * @return string|bool If encoding is set, then Returns true on success or false on failure. + * In this case, the character encoding for multibyte regex is NOT changed. 
+ * If encoding is omitted, then the current character encoding name is returned. + */ +mixed f$mb_internal_encoding(const Optional &encoding); + +/** + * Set/Get the current language + * @param ?string language (default = null) Used for encoding e-mail messages. The valid languages are listed in the following table. + * mb_send_mail() uses this setting to encode e-mail. + * +---------------------------+-------------+------------------+-----------+ + * | Language | Charset | Encoding | Alias | + * +---------------------------+-------------+------------------+-----------+ + * | German/de | ISO-8859-15 | Quoted-Printable | Deutsch | + * | English/en | ISO-8859-1 | Quoted-Printable | | + * | Armenian/hy | ArmSCII-8 | Quoted-Printable | | + * | Japanese/ja | ISO-2022-JP | BASE64 | | + * | Korean/ko | ISO-2022-KR | BASE64 | | + * | neutral | UTF-8 | BASE64 | | + * | Russian/ru | KOI8-R | Quoted-Printable | | + * | Turkish/tr | ISO-8859-9 | Quoted-Printable | | + * | Ukrainian/ua | KOI8-U | Quoted-Printable | | + * | uni | UTF-8 | BASE64 | universal | + * | Simplified Chinese/zh-cn | HZ | BASE64 | | + * | Traditional Chinese/zh-tw | BIG-5 | BASE64 | | + * +---------------------------+-------------+------------------+-----------+ + * @return string|bool If language is set and language is valid, it returns true. Otherwise, it returns false. When language is omitted or null, + * it returns the language name as a string + */ +mixed f$mb_language(const Optional &language); + +/** + * Returns an array of all supported encodings + * @return array Returns a numerically indexed array + */ +array f$mb_list_encodings(void); + +/** + * Returns the Unicode code point value of the given character. This function complements mb_chr(). + * @param string str A string + * @param string? encoding (default = null) The encoding parameter is the character encoding. If it is omitted or null, + * the internal character encoding value will be used. 
+ * @return int|false The Unicode code point for the first character of string or false on failure. + */ +Optional f$mb_ord(const string &str, const Optional &encoding); + +/** + * mb_output_handler() is ob_start() callback function. mb_output_handler() converts characters in the output buffer from internal + * character encoding to HTTP output character encoding. + * @param string str The contents of the output buffer + * @param int status The status of the output buffer + * @return string The converted string + */ +string f$mb_output_handler(const string &str, const int64_t status); + +/** + * Parses GET/POST/COOKIE data and sets global variables. Since PHP does not provide raw POST/COOKIE data, it can only be used for GET data for now. + * It parses URL encoded data, detects encoding, converts coding to internal encoding and set values to the result array or global variables. + * @param string str The URL encoded data + * @param array result An array containing decoded and character encoded converted values + * @return bool Returns true on success or false on failure + */ +bool f$mb_parse_str(const string &str, const array &result); // result = map + +/** + * Get a MIME charset string for a specific encoding. + * @param string encoding The encoding being checked + * @return string|false The MIME charset string for character encoding encoding, or false if no charset is preferred for the given encoding + */ +Optional f$mb_preferred_mime_name(const string &encoding); + +/** + * Set/Get character encoding for a multibyte regex + * @param ?string encoding (default = null) The encoding parameter is the character encoding. If it is omitted or null, + * the internal character encoding value will be used + * @return string|bool If encoding is set, then Returns true on success or false on failure. In this case, the internal character encoding is NOT changed. 
+ * If encoding is omitted, then the current character encoding name for a multibyte regex is returned + */ +mixed f$mb_regex_encoding(const Optional &encoding); + +/** + * Sets the default options described by options for multibyte regex functions + * @param ?string options (default = null) The options to set. This is a string where each character is an option. + * To set a mode, the mode character must be the last one set, however there can only be set one mode but multiple options + * + * Regex options: + * +--------+----------------------------------+ + * | Option | Meaning | + * +--------+----------------------------------+ + * | i | Ambiguity match on | + * | x | Enables extended pattern form | + * | m | '.' matches with newlines | + * | s | '^' -> '\A', '$' -> '\Z' | + * | p | Same as both the m and s options | + * | l | Finds longest matches | + * | n | Ignores empty matches | + * | e | eval() resulting code | + * +--------+----------------------------------+ + * + * Regex syntax modes: + * +------+----------------------------+ + * | Mode | Meaning | + * +------+----------------------------+ + * | j | Java (Sun java.util.regex) | + * | u | GNU regex | + * | g | grep | + * | c | Emacs | + * | r | Ruby | + * | z | Perl | + * | b | POSIX Basic regex | + * | d | POSIX Extended regex | + * +------+----------------------------+ + * + * @return string The previous options. If options is omitted or null, it returns the string that describes the current options + */ +string f$mb_regex_set_options(const Optional &options); + +/** + * This function is currently not documented; only its argument list is available. + * @param string str + * @param ?string encoding (default = null) + * @return string + */ +string f$mb_scrub(const string &str, const Optional &encoding); + +/** + * Sends email. Headers and messages are converted and encoded according to the mb_language() setting. 
+ * It's a wrapper function for mail(), so see also mail() for detail + * @param string to The mail addresses being sent to. Multiple recipients may be specified by putting a comma between each address in to. + * This parameter is not automatically encoded + * @param string subject The subject of the mail + * @param string message The message of the mail + * @param array|string additional_headers (default = []) String or array to be inserted at the end of the email header. + * This is typically used to add extra headers (From, Cc, and Bcc). Multiple extra headers should be separated with a CRLF (\r\n). + * Validate parameter not to be injected unwanted headers by attackers. If an array is passed, its keys are the header names and its + * values are the respective header values + * Note: + * If messages are not received, try using a LF (\n) only. Some Unix mail transfer agents (most notably » qmail) replace LF by CRLF automatically + * (which leads to doubling CR if CRLF is used). This should be a last resort, as it does not comply with » RFC 2822. + * @param ?string additional_params (default = null) additional_params is a MTA command line parameter. It is useful when setting the correct Return-Path header + * when using sendmail. This parameter is escaped by escapeshellcmd() internally to prevent command execution. escapeshellcmd() prevents command execution, + * but allows to add additional parameters. For security reason, this parameter should be validated. Since escapeshellcmd() is applied automatically, + * some characters that are allowed as email addresses by internet RFCs cannot be used. Programs that are required to use these characters mail() cannot be used. + * The user that the webserver runs as should be added as a trusted user to the sendmail configuration to prevent a 'X-Warning' header from being added to + * the message when the envelope sender (-f) is set using this method. 
For sendmail users, this file is /etc/mail/trusted-users + * @return bool Returns true on success or false on failure + */ +bool f$mb_send_mail(const string &to, const string &subject, const string &message, const mixed &additional_headers, const Optional &additional_params); + +/** + * Split a multibyte string using regular expression pattern and returns the result as an array + * @param string pattern The regular expression pattern + * @param string str The string being split + * @param int limit (default = -1) If optional parameter limit is specified, it will be split in limit elements as maximum + * @return array|false The result as an array, or false on failure + */ +Optional f$mb_split(const string &pattern, const string &str, const int64_t limit = -1); + +/** + * This function will return an array of strings, it is a version of str_split() with support for encodings of variable character size as well + * as fixed-size encodings of 1,2 or 4 byte characters. If the length parameter is specified, the string is broken down into chunks of the specified + * length in characters (not bytes). The encoding parameter can be optionally specified and it is good practice to do so + * @param string str The string to split into characters or chunks + * @param int length (default = 1) If specified, each element of the returned array will be composed of multiple characters instead of a single character + * @param ?string encoding (default = null) The encoding parameter is the character encoding. If it is omitted or null, the internal character encoding value + * will be used. A string specifying one of the supported encodings + * @return array mb_str_split() returns an array of strings + */ +array f$mb_str_split(const string &str, const int64_t length, const Optional &encoding); + +/** + * mb_strcut() extracts a substring from a string similarly to mb_substr(), but operates on bytes instead of characters. 
+ * If the cut position happens to be between two bytes of a multi-byte character, the cut is performed starting from the first byte of that character. + * This is also the difference to the substr() function, which would simply cut the string between the bytes and thus result in a malformed byte sequence + * @param string str The string being cut + * @param int start If start is non-negative, the returned string will start at the start'th byte position in string, counting from zero. + * For instance, in the string 'abcdef', the byte at position 0 is 'a', the byte at position 2 is 'c', and so forth. + * If start is negative, the returned string will start at the start'th byte counting back from the end of string. + * However, if the magnitude of a negative start is greater than the length of the string, the returned portion will start from the beginning of string + * @param ?int length (default = null) Length in bytes. If omitted or NULL is passed, extract all bytes to the end of the string. + * If length is negative, the returned string will end at the length'th byte counting back from the end of string. + * However, if the magnitude of a negative length is greater than the number of characters after the start position, an empty string will be returned + * @param ?string encoding The encoding parameter is the character encoding. If it is omitted or null, the internal character encoding value will be used + * @return string mb_strcut() returns the portion of string specified by the start and length parameters + */ +string f$mb_strcut(const string &str, const int64_t start, const Optional &length, const Optional &encoding); + +/** + * Truncates string string to specified width, where halfwidth characters count as 1, and fullwidth characters count as 2. + * See » http://www.unicode.org/reports/tr11/ for details regarding East Asian character widths + * @param string str The string being decoded + * @param int start The start position offset. 
Number of characters from the beginning of string (first character is 0), + * or if start is negative, number of characters from the end of the string + * @param int width The width of the desired trim. Negative widths count from the end of the string + * @param string trim_marker (default = "") A string that is added to the end of string when string is truncated + * @param ?string encoding (default = null) The encoding parameter is the character encoding. If it is omitted or null, + * the internal character encoding value will be used + * @return string The truncated string. If trim_marker is set, trim_marker replaces the last chars to match the width + */ +string f$mb_strimwidth(const string &str, const int64_t start, const int64_t width, const string &trim_marker, const Optional &encoding); + +/** + * mb_stripos() returns the numeric position of the first occurrence of needle in the haystack string. Unlike mb_strpos(), + * mb_stripos() is case-insensitive. If needle is not found, it returns false + * @param string haystack The string from which to get the position of the first occurrence of needl + * @param string needle The string to find in haystack + * @param int offset (default = 0) The position in haystack to start searching. A negative offset counts from the end of the string + * @param ?string encoding (default = null) Character encoding name to use. If it is omitted, internal character encoding is used + * @return int|false Return the numeric position of the first occurrence of needle in the haystack string, or false if needle is not found + */ +Optional f$mb_stripos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding); + +/** + * mb_stristr() finds the first occurrence of needle in haystack and returns the portion of haystack. + * Unlike mb_strstr(), mb_stristr() is case-insensitive. 
If needle is not found, it returns false + * @param string haystack The string from which to get the first occurrence of needle + * @param string needle The string to find in haystack + * @param bool before_needle (default = false) Determines which portion of haystack this function returns. + * If set to true, it returns all of haystack from the beginning to the first occurrence of needle (excluding needle). + * If set to false, it returns all of haystack from the first occurrence of needle to the end (including needle) + * @param ?string encoding (default = null) Character encoding name to use. If it is omitted, internal character encoding is used + * @return string|false Returns the portion of haystack, or false if needle is not found + */ +Optional f$mb_stristr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding); + +/** + * Gets the length of a string + * @param string str The string being checked for length + * @param ?string encoding (default = null) The encoding parameter is the character encoding. + * If it is omitted or null, the internal character encoding value will be used + * @return int Returns the number of characters in string string having character encoding encoding. A multi-byte character is counted as 1 + */ +int64_t f$mb_strlen(const string &str, const Optional &encoding); + +/** + * Finds position of the first occurrence of a string in a string. Performs a multi-byte safe strpos() operation based on number of characters. + * The first character's position is 0, the second character position is 1, and so on + * @param string haystack The string being checked + * @param string needle The string to find in haystack. In contrast with strpos(), numeric values are not applied as the ordinal value of a character + * @param int offset (default = 0) The search offset. If it is not specified, 0 is used. 
A negative offset counts from the end of the string + * @param ?string encoding (default = null) The encoding parameter is the character encoding. If it is omitted or null, + * the internal character encoding value will be used + * @return int|false Returns the numeric position of the first occurrence of needle in the haystack string. If needle is not found, it returns false + */ +Optional f$mb_strpos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding); + +/** + * mb_strrchr() finds the last occurrence of needle in haystack and returns the portion of haystack. If needle is not found, it returns false + * @param string haystack The string from which to get the last occurrence of needle + * @param string needle The string to find in haystack + * @param bool before_needle Determines which portion of haystack this function returns. + * If set to true, it returns all of haystack from the beginning to the last occurrence of needle. + * If set to false, it returns all of haystack from the last occurrence of needle to the end + * @param ?string encoding (default = null) Character encoding name to use. If it is omitted, internal character encoding is used + * @return string|false Returns the portion of haystack. or false if needle is not found + */ +Optional f$mb_strrchr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding); + +/** + * mb_strrichr() finds the last occurrence of needle in haystack and returns the portion of haystack. Unlike mb_strrchr(), mb_strrichr() is case-insensitive. + * If needle is not found, it returns false + * @param string haystack The string from which to get the last occurrence of needle + * @param string needle The string to find in haystack + * @param bool before_needle Determines which portion of haystack this function returns. + * If set to true, it returns all of haystack from the beginning to the last occurrence of needle. 
+ * If set to false, it returns all of haystack from the last occurrence of needle to the end + * @param ?string encoding (default = null) + * @return string|false Character encoding name to use. If it is omitted, internal character encoding is used + */ +Optional f$mb_strrichr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding); + +/** + * mb_strripos() performs multi-byte safe strripos() operation based on number of characters. needle position is counted from the beginning of haystack. + * First character's position is 0. Second character position is 1. Unlike mb_strrpos(), mb_strripos() is case-insensitive + * @param string haystack The string from which to get the position of the last occurrence of needle + * @param string needle The string to find in haystack + * @param int offset The position in haystack to start searching + * @param ?string encoding (default = null) Character encoding name to use. If it is omitted, internal character encoding is used + * @return int|false Return the numeric position of the last occurrence of needle in the haystack string, or false if needle is not found + */ +Optional f$mb_strripos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding); + +/** + * Performs a multibyte safe strrpos() operation based on the number of characters. needle position is counted from the beginning of haystack. + * First character's position is 0. Second character position is 1 + * @param string haystack The string being checked, for the last occurrence of needle + * @param string needle The string to find in haystack + * @param int offset (default = 0) May be specified to begin searching an arbitrary number of characters into the string. Negative values will stop searching at an arbitrary point prior to the end of the string + * @param ?string encoding The encoding parameter is the character encoding. 
If it is omitted or null, the internal character encoding value will be used + * @return int|false Returns the numeric position of the last occurrence of needle in the haystack string. If needle is not found, it returns false + */ +Optional f$mb_strrpos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding); + +/** + * mb_strstr() finds the first occurrence of needle in haystack and returns the portion of haystack. If needle is not found, it returns false + * @param string haystack The string from which to get the first occurrence of needle + * @param string needle The string to find in haystack + * @param bool before_needle Determines which portion of haystack this function returns. + * If set to true, it returns all of haystack from the beginning to the first occurrence of needle (excluding needle). + * If set to false, it returns all of haystack from the first occurrence of needle to the end (including needle) + * @param ?string encoding (default = null) Character encoding name to use. If it is omitted, internal character encoding is used + * @return string|false Returns the portion of haystack, or false if needle is not found + */ +Optional f$mb_strstr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding); + +/** + * Returns string with all alphabetic characters converted to lowercase + * @param string str The string being lowercased + * @param ?string encoding (default = null) The encoding parameter is the character encoding. + * If it is omitted or null, the internal character encoding value will be used + * @return string string with all alphabetic characters converted to lowercase + */ +string f$mb_strtolower(const string &str, const Optional &encoding); + +/** + * Returns string with all alphabetic characters converted to uppercase. + * @param string str The string being uppercased + * @param ?string encoding (default = null) The encoding parameter is the character encoding. 
+ * If it is omitted or null, the internal character encoding value will be used + * @return string string with all alphabetic characters converted to uppercase + */ +string f$mb_strtoupper(const string &str, const Optional &encoding); + +/** + * Returns the width of string string, where halfwidth characters count as 1, and fullwidth characters count as 2. + * See » http://www.unicode.org/reports/tr11/ for details regarding East Asian character widths. The fullwidth characters are: + * U+1100-U+115F, U+11A3-U+11A7, U+11FA-U+11FF, U+2329-U+232A, U+2E80-U+2E99, U+2E9B-U+2EF3, U+2F00-U+2FD5, U+2FF0-U+2FFB, U+3000-U+303E, U+3041-U+3096, + * U+3099-U+30FF, U+3105-U+312D, U+3131-U+318E, U+3190-U+31BA, U+31C0-U+31E3, U+31F0-U+321E, U+3220-U+3247, U+3250-U+32FE, U+3300-U+4DBF, U+4E00-U+A48C, + * U+A490-U+A4C6, U+A960-U+A97C, U+AC00-U+D7A3, U+D7B0-U+D7C6, U+D7CB-U+D7FB, U+F900-U+FAFF, U+FE10-U+FE19, U+FE30-U+FE52, U+FE54-U+FE66, U+FE68-U+FE6B, + * U+FF01-U+FF60, U+FFE0-U+FFE6, U+1B000-U+1B001, U+1F200-U+1F202, U+1F210-U+1F23A, U+1F240-U+1F248, U+1F250-U+1F251, U+20000-U+2FFFD, U+30000-U+3FFFD. + * All other characters are halfwidth characters + * @param string str The string being decoded + * @param ?string encoding (default = null) The encoding parameter is the character encoding. + * If it is omitted or null, the internal character encoding value will be used + * @return int The width of string string + */ +int64_t f$mb_strwidth(const string &str, const Optional &encoding); + +/** + * Specifies a substitution character when input character encoding is invalid or character code does not exist in output character encoding. + * Invalid characters may be substituted "none" (no output), string or int value (Unicode character code value). 
+ * This setting affects mb_convert_encoding(), mb_convert_variables(), mb_output_handler(), and mb_send_mail() + * @param string|int|null substitute_character (default = null) Specify the Unicode value as an int, or as one of the following strings: + * "none": no output + * "long": Output character code value (Example: U+3000, JIS+7E7E) + * "entity": Output character entity (Example: Ȁ) + * @return string|int|bool If substitute_character is set, it returns true for success, otherwise returns false. + * If substitute_character is not set, it returns the current setting + */ +mixed f$mb_substitute_character(const mixed &substitute_character); + +/** + * Counts the number of times the needle substring occurs in the haystack string + * @param string haystack The string being checked + * @param string needle The string being found + * @param ?string encoding (default = null) The encoding parameter is the character encoding. + * If it is omitted or null, the internal character encoding value will be used + * @return int The number of times the needle substring occurs in the haystack string + */ +int64_t f$mb_substr_count(const string &haystack, const string &needle, const Optional &encoding); + +/** + * Performs a multi-byte safe substr() operation based on number of characters. Position is counted from the beginning of string. + * First character's position is 0. Second character position is 1, and so on + * @param string str The string to extract the substring from + * @param int start If start is non-negative, the returned string will start at the start'th position in string, counting from zero. + * For instance, in the string 'abcdef', the character at position 0 is 'a', the character at position 2 is 'c', and so forth. + * If start is negative, the returned string will start at the start'th character from the end of string + * @param ?int length (default = null) Maximum number of characters to use from string. 
+ * If omitted or NULL is passed, extract all characters to the end of the string + * @param ?string encoding (default = null) The encoding parameter is the character encoding. + * If it is omitted or null, the internal character encoding value will be used + * @return string mb_substr() returns the portion of string specified by the start and length parameters + */ +string f$mb_substr(const string &str, const int64_t start, const Optional &length, const Optional &encoding); + +#else + +#include + +#include "runtime/kphp_core.h" +#include "runtime/string_functions.h" + +bool f$mb_check_encoding(const string &str, const string &encoding = CP1251); + +int64_t f$mb_strlen(const string &str, const string &encoding = CP1251); + +string f$mb_strtolower(const string &str, const string &encoding = CP1251); + +string f$mb_strtoupper(const string &str, const string &encoding = CP1251); + +Optional f$mb_strpos(const string &haystack, const string &needle, int64_t offset = 0, const string &encoding = CP1251) noexcept; + +Optional f$mb_stripos(const string &haystack, const string &needle, int64_t offset = 0, const string &encoding = CP1251) noexcept; + +string f$mb_substr(const string &str, int64_t start, const mixed &length = std::numeric_limits::max(), const string &encoding = CP1251); + +void f$set_detect_incorrect_encoding_names_warning(bool show); + +void free_detect_incorrect_encoding_names(); + +#endif \ No newline at end of file diff --git a/runtime/regexp.h b/runtime/regexp.h index 8c20fe98ad..5eb579b447 100644 --- a/runtime/regexp.h +++ b/runtime/regexp.h @@ -9,7 +9,7 @@ #include "common/mixin/not_copyable.h" #include "runtime/kphp_core.h" -#include "runtime/mbstring.h" +#include "runtime/mbstring/mbstring.h" namespace re2 { class RE2; diff --git a/runtime/runtime.cmake b/runtime/runtime.cmake index 16f8a55283..3df11e3680 100644 --- a/runtime/runtime.cmake +++ b/runtime/runtime.cmake @@ -49,10 +49,8 @@ prepend(KPHP_RUNTIME_PDO_PGSQL_SOURCES pdo/pgsql/ 
pgsql_pdo_emulated_statement.cpp) endif() -if (MBFL) prepend(KPHP_RUNTIME_MBSTRING_SOURCES mbstring/ mbstring.cpp) -endif() prepend(KPHP_RUNTIME_SOURCES ${BASE_DIR}/runtime/ ${KPHP_RUNTIME_MBSTRING_SOURCES} @@ -88,7 +86,6 @@ prepend(KPHP_RUNTIME_SOURCES ${BASE_DIR}/runtime/ kphp-backtrace.cpp mail.cpp math_functions.cpp - mbstring.cpp memcache.cpp memory_usage.cpp migration_php8.cpp @@ -150,6 +147,7 @@ if (MBFL) add_dependencies(kphp_runtime libmbfl) endif() + prepare_cross_platform_libs(RUNTIME_LIBS yaml-cpp re2 zstd h3) # todo: linking between static libs is no-op, is this redundant? do we need to add mysqlclient here? set(RUNTIME_LIBS vk::kphp_runtime vk::kphp_server vk::popular_common vk::unicode vk::common_src vk::binlog_src vk::net_src ${RUNTIME_LIBS} OpenSSL::Crypto m z pthread) vk_add_library(kphp-full-runtime STATIC) diff --git a/tests/cpp/runtime/mbstring-test.cpp b/tests/cpp/runtime/mbstring-test.cpp index 42ab014a48..2a0a484302 100644 --- a/tests/cpp/runtime/mbstring-test.cpp +++ b/tests/cpp/runtime/mbstring-test.cpp @@ -3,6 +3,7 @@ #ifdef MBFL /* TODO: make fun strings for tests */ + TEST(mbstring_test, test_mb_check_encoding) { ASSERT_TRUE(f$mb_check_encoding(string("sdf"), string("Windows-1251"))); ASSERT_TRUE(f$mb_check_encoding(string("ыва"), string("Windows-1251"))); @@ -10,9 +11,11 @@ TEST(mbstring_test, test_mb_check_encoding) { ASSERT_TRUE(f$mb_check_encoding(string("İnanç Esasları"), string("Windows-1251"))); ASSERT_FALSE(f$mb_check_encoding(string("İnanç Esasları"), string("ASCII"))); } + TEST(mbstring_test, test_mb_convert_encoding) { - ASSERT_STREQ(f$mb_convert_encoding(string("Hello"), string("UTF-8"), string("EUC-KR")).c_str(), "Hello"); - ASSERT_STREQ(f$mb_convert_encoding(string("ыавыа"), string("UTF-8"), string("Windows-1251")).c_str(), "ыавыа"); - ASSERT_STREQ(f$mb_convert_encoding(string("ыва"), string("UTF-8"), string("ASCII")).c_str(), "??????"); + ASSERT_STREQ(f$mb_convert_encoding(string("Hello"), string("UTF-8"), 
string("EUC-KR")).to_string().c_str(), "Hello"); + ASSERT_STREQ(f$mb_convert_encoding(string("ыавыа"), string("UTF-8"), string("Windows-1251")).to_string().c_str(), "ыавыа"); + ASSERT_STREQ(f$mb_convert_encoding(string("ыва"), string("UTF-8"), string("ASCII")).to_string().c_str(), "??????"); } + #endif \ No newline at end of file From ac8f9975387c5a7d86827f289d7eb49e76b5851b Mon Sep 17 00:00:00 2001 From: Andrey Arutiunian Date: Mon, 1 May 2023 01:27:12 +0300 Subject: [PATCH 14/27] small fixes --- builtin-functions/_functions.txt | 2 +- runtime/mbstring/mbstring.cpp | 43 -------------------------------- runtime/mbstring/mbstring.h | 8 +++--- 3 files changed, 5 insertions(+), 48 deletions(-) diff --git a/builtin-functions/_functions.txt b/builtin-functions/_functions.txt index 34536b2e98..b7ccdecbfc 100644 --- a/builtin-functions/_functions.txt +++ b/builtin-functions/_functions.txt @@ -1632,7 +1632,7 @@ function mb_convert_variables(string $to_encoding, array|string $from_encoding, function mb_decode_mimeheader(string $string): string; function mb_decode_numericentity(string $string, array $map, ?string $encoding = null): string; function mb_detect_encoding(string $string, array|string|null $encodings = null, bool $strict = false): string|false; -function mb_detect_order(array|string|null $encoding = null): array|bool; +function mb_detect_order(array|string|null $encoding = null): mixed; // return array|bool function mb_encode_mimeheader(string $string, ?string $charset = null, ?string $transfer_encoding = null, string $newline = "\r\n", int $indent = 0): string; function mb_encode_numericentity(string $string, array $map, ?string $encoding = null, bool $hex = false): string; function mb_encoding_aliases(string $encoding): array; diff --git a/runtime/mbstring/mbstring.cpp b/runtime/mbstring/mbstring.cpp index 82c966f0ad..0e11898e04 100644 --- a/runtime/mbstring/mbstring.cpp +++ b/runtime/mbstring/mbstring.cpp @@ -234,49 +234,6 @@ static int64_t 
mb_UTF8_get_offset(const char *s, int64_t pos) { return res; } -bool mb_UTF8_check(const char *s) { - do { -#define CHECK(condition) if (!(condition)) {return false;} - unsigned int a = (unsigned char)(*s++); - if ((a & 0x80) == 0) { - if (a == 0) { - return true; - } - continue; - } - - CHECK ((a & 0x40) != 0); - - unsigned int b = (unsigned char)(*s++); - CHECK((b & 0xc0) == 0x80); - if ((a & 0x20) == 0) { - CHECK((a & 0x1e) > 0); - continue; - } - - unsigned int c = (unsigned char)(*s++); - CHECK((c & 0xc0) == 0x80); - if ((a & 0x10) == 0) { - int x = (((a & 0x0f) << 6) | (b & 0x20)); - CHECK(x != 0 && x != 0x360);//surrogates - continue; - } - - unsigned int d = (unsigned char)(*s++); - CHECK((d & 0xc0) == 0x80); - if ((a & 0x08) == 0) { - int t = (((a & 0x07) << 6) | (b & 0x30)); - CHECK(0 < t && t < 0x110);//end of unicode - continue; - } - - return false; -#undef CHECK - } while (true); - - php_assert (0); -} - bool f$mb_check_encoding(const string &str, const string &encoding) { int encoding_num = mb_detect_encoding(encoding); if (encoding_num < 0) { diff --git a/runtime/mbstring/mbstring.h b/runtime/mbstring/mbstring.h index 52f295e3d4..2647d32163 100644 --- a/runtime/mbstring/mbstring.h +++ b/runtime/mbstring/mbstring.h @@ -220,7 +220,7 @@ int64_t f$mb_ereg_search_getpos(void); * If there are some matches, the first element will have the matched sub-string, the second element will have the first part grouped with brackets, * the third element will have the second part grouped with brackets, and so on. It returns false on error. */ -Optional f$mb_ereg_search_getregs(void); +mixed f$mb_ereg_search_getregs(void); /** * Setup string and regular expression for a multibyte regular expression match @@ -241,7 +241,7 @@ bool f$mb_ereg_search_init(const string &str, const Optional &pattern, c * @return array|false An array containing two elements. 
The first element is the offset, in bytes, where the match begins relative to the start of * the search string, and the second element is the length in bytes of the match. If an error occurs, false is returned. */ -Optional f$mb_ereg_search_pos(const Optional &pattern, const Optional &options); +mixed f$mb_ereg_search_pos(const Optional &pattern, const Optional &options); /** * Returns the matched part of a multibyte regular expression @@ -251,7 +251,7 @@ Optional f$mb_ereg_search_pos(const Optional &pattern, const Opti * it returns an array including substring of matched part as first element, the first grouped part with brackets as second element, * the second grouped part as third element, and so on. It returns false on error. */ -Optional f$mb_ereg_search_regs(const Optional &pattern, const Optional &options); +mixed f$mb_ereg_search_regs(const Optional &pattern, const Optional &options); /** * Set start point of next regular expression match @@ -496,7 +496,7 @@ bool f$mb_send_mail(const string &to, const string &subject, const string &messa * @param int limit (default = -1) If optional parameter limit is specified, it will be split in limit elements as maximum * @return array|false The result as an array, or false on failure */ -Optional f$mb_split(const string &pattern, const string &str, const int64_t limit = -1); +mixed f$mb_split(const string &pattern, const string &str, const int64_t limit = -1); /** * This function will return an array of strings, it is a version of str_split() with support for encodings of variable character size as well From 0b967c41930a44b7ce60e257458ad9aa83ae2291 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=83=D1=82=D1=8E=D0=BD=D1=8F=D0=BD=20=D0=90?= =?UTF-8?q?=D0=BD=D0=B4=D1=80=D0=B5=D0=B9=20=D0=A0=D0=BE=D0=BC=D0=B0=D0=BD?= =?UTF-8?q?=D0=BE=D0=B2=D0=B8=D1=87?= Date: Tue, 26 Dec 2023 17:48:38 +0300 Subject: [PATCH 15/27] add test workflow --- .github/workflows/linux-install.yml | 31 +++++++++++++++++++++++++++++ 1 file 
changed, 31 insertions(+) create mode 100644 .github/workflows/linux-install.yml diff --git a/.github/workflows/linux-install.yml b/.github/workflows/linux-install.yml new file mode 100644 index 0000000000..39b54e0627 --- /dev/null +++ b/.github/workflows/linux-install.yml @@ -0,0 +1,31 @@ +name: linux-install + +on: + workflow_dispatch: + +env: + kphp_root_dir: /home/kitten/kphp + kphp_polyfills_dir: /home/kitten/kphp/kphp-polyfills + kphp_build_dir: /home/kitten/kphp/build + +jobs: + install-linux: + runs-on: ubuntu-latest + strategy: + matrix: + include: + - os: buster + - os: focal + # - os: jammy # TODO: enable after release to artifactory servers + + steps: + - uses: actions/checkout@v3 + + - name: Build and start Docker container + run: | + docker build -f $GITHUB_WORKSPACE/.github/workflows/Dockerfile.${{matrix.os}}.install $GITHUB_WORKSPACE -t kphp-build-img-${{matrix.os}}-install + docker run -dt --name kphp-build-container-${{matrix.os}}-install kphp-build-img-${{matrix.os}}-install + + - name: Run php dummy script + run: docker exec -u kitten kphp-build-container-${{matrix.os}}-install bash -c + "cd ${{env.demo_dir}} && echo 'hello world' > demo.php && kphp --mode=cli --cxx=g++ demo.php && ./kphp_out/cli -o --user kitten" From 2cfb9629237d84ce406c0d3a4e8fdbd1384c7b6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=83=D1=82=D1=8E=D0=BD=D1=8F=D0=BD=20=D0=90?= =?UTF-8?q?=D0=BD=D0=B4=D1=80=D0=B5=D0=B9=20=D0=A0=D0=BE=D0=BC=D0=B0=D0=BD?= =?UTF-8?q?=D0=BE=D0=B2=D0=B8=D1=87?= Date: Tue, 26 Dec 2023 17:48:38 +0300 Subject: [PATCH 16/27] add test workflow --- .github/workflows/Dockerfile.buster.install | 16 +++++++++++ .github/workflows/Dockerfile.focal.install | 13 +++++++++ .github/workflows/Dockerfile.jammy.install | 17 +++++++++++ .github/workflows/linux-install.yml | 31 +++++++++++++++++++++ 4 files changed, 77 insertions(+) create mode 100644 .github/workflows/Dockerfile.buster.install create mode 100644 .github/workflows/Dockerfile.focal.install 
create mode 100644 .github/workflows/Dockerfile.jammy.install create mode 100644 .github/workflows/linux-install.yml diff --git a/.github/workflows/Dockerfile.buster.install b/.github/workflows/Dockerfile.buster.install new file mode 100644 index 0000000000..598f191b55 --- /dev/null +++ b/.github/workflows/Dockerfile.buster.install @@ -0,0 +1,16 @@ +FROM debian:buster +ARG DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -y --no-install-recommends apt-utils ca-certificates gnupg wget lsb-release && \ + echo "deb https://deb.debian.org/debian buster-backports main" >> /etc/apt/sources.list && \ + wget -qO /etc/apt/trusted.gpg.d/vkpartner.asc https://artifactory-external.vkpartner.ru/artifactory/api/gpg/key/public && \ + echo "deb https://artifactory-external.vkpartner.ru/artifactory/kphp buster main" >> /etc/apt/sources.list && \ + wget -qO - https://packages.sury.org/php/apt.gpg | apt-key add - && \ + echo "deb https://packages.sury.org/php/ buster main" >> /etc/apt/sources.list.d/php.list + +RUN apt-get update && apt-get install -y git cmake make g++ lld gperf netcat php7.4-vkext kphp vk-tl-tools && \ + mkdir -p /var/www/vkontakte/data/www/vkontakte.com/tl/ && \ + tl-compiler -e /var/www/vkontakte/data/www/vkontakte.com/tl/scheme.tlo /usr/share/vkontakte/examples/tl-files/common.tl /usr/share/vkontakte/examples/tl-files/tl.tl + +RUN useradd -ms /bin/bash kitten \ No newline at end of file diff --git a/.github/workflows/Dockerfile.focal.install b/.github/workflows/Dockerfile.focal.install new file mode 100644 index 0000000000..2645446602 --- /dev/null +++ b/.github/workflows/Dockerfile.focal.install @@ -0,0 +1,13 @@ +FROM ubuntu:20.04 +ARG DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -y --no-install-recommends apt-utils ca-certificates gnupg wget software-properties-common pkg-config && \ + wget -qO /etc/apt/trusted.gpg.d/vkpartner.asc https://artifactory-external.vkpartner.ru/artifactory/api/gpg/key/public 
&& \ + echo "deb https://artifactory-external.vkpartner.ru/artifactory/kphp focal main" >> /etc/apt/sources.list + +RUN apt-get update && apt-get install -y git cmake make g++ lld gperf netcat php7.4-vkext kphp vk-tl-tools && \ + mkdir -p /var/www/vkontakte/data/www/vkontakte.com/tl/ && \ + tl-compiler -e /var/www/vkontakte/data/www/vkontakte.com/tl/scheme.tlo /usr/share/vkontakte/examples/tl-files/common.tl /usr/share/vkontakte/examples/tl-files/tl.tl + +RUN useradd -ms /bin/bash kitten \ No newline at end of file diff --git a/.github/workflows/Dockerfile.jammy.install b/.github/workflows/Dockerfile.jammy.install new file mode 100644 index 0000000000..4c7fe6a654 --- /dev/null +++ b/.github/workflows/Dockerfile.jammy.install @@ -0,0 +1,17 @@ +FROM ubuntu:22.04 +ARG DEBIAN_FRONTEND=noninteractive + +RUN apt update && \ + apt install -y --no-install-recommends apt-utils ca-certificates gnupg wget software-properties-common pkg-config && \ + wget -qO /etc/apt/trusted.gpg.d/vkpartner.asc https://artifactory-external.vkpartner.ru/artifactory/api/gpg/key/public && \ + echo "deb [arch=amd64] https://artifactory-external.vkpartner.ru/artifactory/kphp jammy main" | tee /etc/apt/sources.list.d/vkpartner.list + +RUN apt install -y software-properties-common && apt update && \ + add-apt-repository ppa:ondrej/php -y && \ + apt update + +RUN apt install -y git cmake make g++ lld gperf netcat php7.4-vkext kphp vk-tl-tools && \ + mkdir -p /var/www/vkontakte/data/www/vkontakte.com/tl/ && \ + tl-compiler -e /var/www/vkontakte/data/www/vkontakte.com/tl/scheme.tlo /usr/share/vkontakte/examples/tl-files/common.tl /usr/share/vkontakte/examples/tl-files/tl.tl + +RUN useradd -ms /bin/bash kitten \ No newline at end of file diff --git a/.github/workflows/linux-install.yml b/.github/workflows/linux-install.yml new file mode 100644 index 0000000000..39b54e0627 --- /dev/null +++ b/.github/workflows/linux-install.yml @@ -0,0 +1,31 @@ +name: linux-install + +on: + workflow_dispatch: + +env: + 
kphp_root_dir: /home/kitten/kphp + kphp_polyfills_dir: /home/kitten/kphp/kphp-polyfills + kphp_build_dir: /home/kitten/kphp/build + +jobs: + install-linux: + runs-on: ubuntu-latest + strategy: + matrix: + include: + - os: buster + - os: focal + # - os: jammy # TODO: enable after release to artifactory servers + + steps: + - uses: actions/checkout@v3 + + - name: Build and start Docker container + run: | + docker build -f $GITHUB_WORKSPACE/.github/workflows/Dockerfile.${{matrix.os}}.install $GITHUB_WORKSPACE -t kphp-build-img-${{matrix.os}}-install + docker run -dt --name kphp-build-container-${{matrix.os}}-install kphp-build-img-${{matrix.os}}-install + + - name: Run php dummy script + run: docker exec -u kitten kphp-build-container-${{matrix.os}}-install bash -c + "cd ${{env.demo_dir}} && echo 'hello world' > demo.php && kphp --mode=cli --cxx=g++ demo.php && ./kphp_out/cli -o --user kitten" From b6bec7e1815d1f0d73bf5da79f164417127446dc Mon Sep 17 00:00:00 2001 From: catnyan02 Date: Tue, 6 Feb 2024 18:42:20 +0000 Subject: [PATCH 17/27] Add 14 functions (mb_substr, mb_strlen, mb_substr_count, mb_strtolower, mb_strtoupper, mb_strwidth, mb_strpos, mb_stripos, mb_strripos, mb_strrpos, mb_stristr, mb_strrchr, mb_strrichr, mb_strstr) + php tests --- builtin-functions/_functions.txt | 59 +- runtime/mbstring/mbstring.cpp | 562 +++++++++++++++- runtime/mbstring/mbstring.h | 707 +------------------- tests/cpp/runtime/mbstring-test.cpp | 67 ++ tests/phpt/mbstring/001_mb_strlen.php | 64 ++ tests/phpt/mbstring/002_mb_substr.php | 57 ++ tests/phpt/mbstring/003_mb_substr_count.php | 57 ++ tests/phpt/mbstring/004_mb_strwidth.php | 63 ++ tests/phpt/mbstring/005_mb_strtoupper.php | 33 + tests/phpt/mbstring/006_mb_strtolower.php | 33 + tests/phpt/mbstring/007_mb_strpos.php | 51 ++ tests/phpt/mbstring/008_mb_stripos.php | 46 ++ tests/phpt/mbstring/009_mb_strrpos.php | 51 ++ tests/phpt/mbstring/010_mb_strripos.php | 51 ++ tests/phpt/mbstring/011_mb_strstr.php | 63 ++ 
tests/phpt/mbstring/012_mb_stristr.php | 63 ++ tests/phpt/mbstring/013_mb_strrchr.php | 63 ++ tests/phpt/mbstring/014_mb_strrichr.php | 63 ++ 18 files changed, 1411 insertions(+), 742 deletions(-) create mode 100644 tests/phpt/mbstring/001_mb_strlen.php create mode 100644 tests/phpt/mbstring/002_mb_substr.php create mode 100644 tests/phpt/mbstring/003_mb_substr_count.php create mode 100644 tests/phpt/mbstring/004_mb_strwidth.php create mode 100644 tests/phpt/mbstring/005_mb_strtoupper.php create mode 100644 tests/phpt/mbstring/006_mb_strtolower.php create mode 100644 tests/phpt/mbstring/007_mb_strpos.php create mode 100644 tests/phpt/mbstring/008_mb_stripos.php create mode 100644 tests/phpt/mbstring/009_mb_strrpos.php create mode 100644 tests/phpt/mbstring/010_mb_strripos.php create mode 100644 tests/phpt/mbstring/011_mb_strstr.php create mode 100644 tests/phpt/mbstring/012_mb_stristr.php create mode 100644 tests/phpt/mbstring/013_mb_strrchr.php create mode 100644 tests/phpt/mbstring/014_mb_strrichr.php diff --git a/builtin-functions/_functions.txt b/builtin-functions/_functions.txt index b7ccdecbfc..bb275e4751 100644 --- a/builtin-functions/_functions.txt +++ b/builtin-functions/_functions.txt @@ -1619,60 +1619,17 @@ function getenv(string $varname = '', bool $local_only = false): mixed; function mb_check_encoding(array|string $value, ?string $encoding = null): bool; function mb_convert_encoding(array|string $string, string $to_encoding, array|string|null $from_encoding = null): array|string|false; +function mb_substr(string $string, int $start, ?int $length = null, ?string $encoding = null): string; function mb_strlen(string $string, ?string $encoding = null): int; -function mb_strpos(string $haystack, string $needle, int $offset = 0, ?string $encoding = null): int|false; -function mb_stripos(string $haystack, string $needle, int $offset = 0, ?string $encoding = null): int|false; +function mb_substr_count(string $haystack, string $needle, ?string $encoding = 
null): int; function mb_strtolower(string $string, ?string $encoding = null): string; function mb_strtoupper(string $string, ?string $encoding = null): string; -function mb_substr(string $string, int $start, ?int $length = null, ?string $encoding = null): string; -function mb_chr(int $codepoint, ?string $encoding = null): string|false; -function mb_convert_case(string $string, int $mode, ?string $encoding = null): string; -function mb_convert_kana(string $string, string $mode = "KV", ?string $encoding = null): string; -function mb_convert_variables(string $to_encoding, array|string $from_encoding, mixed &$vars): string|false; // ??? (change variable bytes + kwargs) -function mb_decode_mimeheader(string $string): string; -function mb_decode_numericentity(string $string, array $map, ?string $encoding = null): string; -function mb_detect_encoding(string $string, array|string|null $encodings = null, bool $strict = false): string|false; -function mb_detect_order(array|string|null $encoding = null): mixed; // return array|bool -function mb_encode_mimeheader(string $string, ?string $charset = null, ?string $transfer_encoding = null, string $newline = "\r\n", int $indent = 0): string; -function mb_encode_numericentity(string $string, array $map, ?string $encoding = null, bool $hex = false): string; -function mb_encoding_aliases(string $encoding): array; -function mb_ereg_match(string $pattern, string $string, ?string $options = null): bool; -function mb_ereg_replace_callback(string $pattern, callable $callback, string $string, ?string $options = null): string|false|null; -function mb_ereg_replace(string $pattern, string $replacement, string $string, ?string $options = null): string|false|null; -function mb_ereg_search_getpos(): int; -function mb_ereg_search_getregs(): array|false; -function mb_ereg_search_init(string $string, ?string $pattern = null, ?string $options = null): bool; -function mb_ereg_search_pos(?string $pattern = null, ?string $options = null): array|false; 
-function mb_ereg_search_regs(?string $pattern = null, ?string $options = null): array|false; -function mb_ereg_search_setpos(int $offset): bool; -function mb_ereg_search(?string $pattern = null, ?string $options = null): bool; -function mb_ereg(string $pattern, string $string, array &$matches = null): bool; -function mb_eregi_replace(string $pattern, string $replacement, string $string, ?string $options = null): string|false|null; -function mb_eregi(string $pattern, string $string, array &$matches = null): bool; -function mb_get_info(string $type = "all"): array|string|int|false; -function mb_http_input(?string $type = null): array|string|false; -function mb_http_output(?string $encoding = null): string|false; -function mb_internal_encoding(?string $encoding = null): string|false; -function mb_language(?string $language = null): string|false; -function mb_list_encodings(): array; -function mb_ord(string $string, ?string $encoding = null): int|false; -function mb_output_handler(string $string, int $status): string; -function mb_parse_str(string $string, array &$result): bool; -function mb_preferred_mime_name(string $encoding): string|false; -function mb_regex_encoding(?string $encoding = null): string|false; -function mb_regex_set_options(?string $options = null): string; -function mb_scrub(string $string, ?string $encoding = null): string; -function mb_send_mail(string $to, string $subject, string $message, array|string $additional_headers = [], ?string $additional_params = null): bool; -function mb_split(string $pattern, string $string, int $limit = -1): array|false; -function mb_str_split(string $string, int $length = 1, ?string $encoding = null): array; -function mb_strcut(string $string, int $start, ?int $length = null, ?string $encoding = null): string; -function mb_strimwidth(string $string, int $start, int $width, string $trim_marker = "", ?string $encoding = null): string; +function mb_strwidth(string $string, ?string $encoding = null): int; +function 
mb_strpos(string $haystack, string $needle, int $offset = 0, ?string $encoding = null): int|false; +function mb_stripos(string $haystack, string $needle, int $offset = 0, ?string $encoding = null): int|false; +function mb_strripos(string $haystack, string $needle, int $offset = 0, ?string $encoding = null): int|false; +function mb_strrpos(string $haystack, string $needle, int $offset = 0, string $encoding = null): int|false; function mb_stristr(string $haystack, string $needle, bool $before_needle = false, ?string $encoding = null): string|false; function mb_strrchr(string $haystack, string $needle, bool $before_needle = false, ?string $encoding = null): string|false; function mb_strrichr(string $haystack, string $needle, bool $before_needle = false, ?string $encoding = null): string|false; -function mb_strripos(string $haystack, string $needle, int $offset = 0, ?string $encoding = null): int|false; -function mb_strrpos(string $haystack, string $needle, int $offset = 0, string $encoding = null): int|false; -function mb_strstr(string $haystack, string $needle, bool $before_needle = false, ?string $encoding = null): string|false; -function mb_strwidth(string $string, ?string $encoding = null): int; -function mb_substitute_character(string|int|null $substitute_character = null): string|int|false; -function mb_substr_count(string $haystack, string $needle, ?string $encoding = null): int; \ No newline at end of file +function mb_strstr(string $haystack, string $needle, bool $before_needle = false, ?string $encoding = null): string|false; \ No newline at end of file diff --git a/runtime/mbstring/mbstring.cpp b/runtime/mbstring/mbstring.cpp index 0e11898e04..e42d999c34 100644 --- a/runtime/mbstring/mbstring.cpp +++ b/runtime/mbstring/mbstring.cpp @@ -1,4 +1,11 @@ #include "mbstring.h" +#include "runtime/exception.h" + +#include "common/unicode/unicode-utils.h" +#include "common/unicode/utf8-utils.h" + +#define MIN(a, b) (((a)<(b))?(a):(b)) +#define MBFL_SUBSTR_UNTIL_END 
((size_t) -1) bool mb_UTF8_check(const char *s) { do { @@ -46,6 +53,23 @@ bool mb_UTF8_check(const char *s) { #ifdef MBFL extern "C" { #include +#include +} + +#define KPHP_UNICODE_CASE_UPPER 0 +#define KPHP_UNICODE_CASE_LOWER 1 +#define KPHP_UNICODE_CASE_TITLE 2 +#define KPHP_UNICODE_CASE_FOLD 3 +#define KPHP_UNICODE_CASE_UPPER_SIMPLE 4 +#define KPHP_UNICODE_CASE_LOWER_SIMPLE 5 +#define KPHP_UNICODE_CASE_TITLE_SIMPLE 6 +#define KPHP_UNICODE_CASE_FOLD_SIMPLE 7 +#define KPHP_UNICODE_CASE_MODE_MAX 7 + +static const char * DEFAULT_ENCODING = "UTF-8" ; + +static inline int mbfl_is_error(size_t len) { + return len >= (size_t) -16; } mbfl_string *convert_encoding(const char *str, const char *to, const char *from) { @@ -149,10 +173,544 @@ bool f$mb_check_encoding(const mixed &value, const Optional &encoding) { return check_encoding(c_value, c_encoding); } +static const mbfl_encoding *mb_get_encoding(const Optional &enc_name) { + if (enc_name.has_value()) { + // no caching unlike PHP version - can be changed if we're going to add mbstring config + const mbfl_encoding *encoding; + encoding = mbfl_name2encoding(enc_name.val().c_str()); + if (!encoding) { + return NULL; + } else { + return encoding; + } + } + return mbfl_name2encoding(DEFAULT_ENCODING); // change if we are going to use current encoding +} + +int64_t f$mb_strlen(const string &str, const Optional &enc_name){ + const mbfl_encoding *encoding = mb_get_encoding(enc_name); + if (!encoding) { + php_critical_error ("encoding \"%s\" isn't supported in mb_strlen", enc_name.val().c_str()); + } + mbfl_string _string; + mbfl_string_init(&_string); + _string.no_encoding = encoding->no_encoding; + _string.len = str.size(); + _string.val = (unsigned char*)str.c_str(); + + size_t n = mbfl_strlen(&_string); + + if (mbfl_is_error(n)) { + php_critical_error ("error working with \"%s\" string", str.c_str()); + } + + return (int64_t) n; + +} + + +string f$mb_substr(const string &str, const int64_t start, const Optional &length, 
const Optional &encoding){ + size_t real_start, real_len; + bool len_is_null = !length.has_value(); + + const mbfl_encoding *enc = mb_get_encoding(encoding); + + if (!enc) { + php_critical_error ("encoding \"%s\" isn't supported in mb_substr", encoding.val().c_str()); + } + + mbfl_string _string, result, *ret; + mbfl_string_init(&_string); + _string.no_encoding = enc->no_encoding; + _string.len = str.size(); + _string.val = (unsigned char*)str.c_str(); + + size_t mblen = 0; + if (start < 0 || (!len_is_null && val(length) < 0)) { + mblen = mbfl_strlen(&_string); + } + + if (start >= 0) { + real_start = (size_t) start; + } else if (-start < mblen) { + real_start = mblen + start; + } else { + real_start = 0; + } + + /* if "length" position is negative, set it to the length + * needed to stop that many chars from the end of the string */ + if (len_is_null) { + real_len = mbfl_strlen(&_string) + 1; + } else if (val(length) >= 0) { + real_len = (size_t) val(length); + } else if (real_start < mblen && - val(length) < mblen - real_start) { + real_len = (mblen - real_start) + val(length); + } else { + real_len = 0; + } + + ret = mbfl_substr(&_string, &result, real_start, real_len); + php_assert(ret != NULL); + return string((const char*) ret->val, ret->len); +} + +int64_t f$mb_substr_count(const string &haystack, const string &needle, const Optional &encoding){ + + size_t n; + mbfl_string _haystack, _needle; + + const mbfl_encoding *enc = mb_get_encoding(encoding); + + if (!enc) { + php_critical_error ("encoding \"%s\" isn't supported in mb_substr_count", encoding.val().c_str()); + } + + mbfl_string_init(&_haystack); + _haystack.no_encoding = enc->no_encoding; + _haystack.len = haystack.size(); + _haystack.val = (unsigned char*) haystack.c_str(); + + mbfl_string_init(&_needle); + _needle.no_encoding = enc->no_encoding; + _needle.len = needle.size(); + _needle.val = (unsigned char*) needle.c_str(); + + if (needle.size() <= 0) { + php_warning("empty substring"); + } + + n = 
mbfl_substr_count(&_haystack, &_needle); + + if (mbfl_is_error(n)) { + php_critical_error ("internal error"); + } + + return (int64_t) n; +} + +string mb_convert_case(const string &str, const int64_t mode, const Optional &encoding){ + + mixed unicode = f$mb_convert_encoding(str, string("UTF_8"), encoding.val()); + + if (unicode.is_string()) { + const string &unicode_str = unicode.to_string(); + + int len = str.size(); + string unicode_res(len * 3, false); + const char *s = str.c_str(); + int p = 0, ch = 0, res_len = 0; + + switch(mode) { + case KPHP_UNICODE_CASE_UPPER: + while ((p = get_char_utf8(&ch, s)) > 0) { + s += p; + res_len += put_char_utf8(unicode_toupper(ch), &unicode_res[res_len]); + } + break; + + case KPHP_UNICODE_CASE_LOWER: + while ((p = get_char_utf8(&ch, s)) > 0) { + s += p; + res_len += put_char_utf8(unicode_tolower(ch), &unicode_res[res_len]); + } + break; + } + + if (p < 0) { + php_warning("Incorrect UTF-8 string \"%s\" in function mb_convert_case", str.c_str()); + } + unicode_res.shrink(res_len); + + mixed res = f$mb_convert_encoding(unicode_res, encoding.val(), string("UTF-8")); + + if (res.is_string()) { + return res.to_string(); + } + else { + php_critical_error ("encoding \"%s\" isn't supported in mb_convert_case", encoding.val().c_str()); + } + } + else { + php_critical_error ("encoding \"%s\" isn't supported in mb_convert_case", encoding.val().c_str()); + } + +// if (mode < 0 || mode > PHP_UNICODE_CASE_MODE_MAX) { +// php_critical_error ("case mode isn't supported"); +// } + +// if (mode != PHP_UNICODE_CASE_UPPER || mode != PHP_UNICODE_CASE_LOWER) { +// php_critical_error ("case mode isn't supported"); +// } +// +// struct convert_case_data data; +// mbfl_convert_filter *from_wchar, *to_wchar; +// mbfl_string result, *result_ptr; +// +// mbfl_memory_device device; +// mbfl_memory_device_init(&device, str.size() + 1, 0); +// +// /* encoding -> wchar filter */ +// to_wchar = mbfl_convert_filter_new(enc->no_encoding, +// 
(&mbfl_encoding_wchar)->no_encoding, convert_case_filter, NULL, &data); +// if (to_wchar == NULL) { +// mbfl_memory_device_clear(&device); +// php_critical_error ("encoding isn't supported"); +// } +// +// /* wchar -> encoding filter */ +// from_wchar = mbfl_convert_filter_new((&mbfl_encoding_wchar)->no_encoding, enc->no_encoding, mbfl_memory_device_output, +// NULL, &device); +// if (from_wchar == NULL) { +// mbfl_convert_filter_delete(to_wchar); +// mbfl_memory_device_clear(&device); +// php_critical_error ("encoding isn't supported"); +// } +// +// data.next_filter = from_wchar; +// data.no_encoding = enc->no_encoding; +// data.case_mode = mode; +// data.title_mode = 0; +// +// { +// /* feed data */ +// const unsigned char *p = (const unsigned char *) str.c_str(); +// size_t n = str.size(); +// while (n > 0) { +// if ((*to_wchar->filter_function)(*p++, to_wchar) < 0) { +// break; +// } +// n--; +// } +// } +// +// mbfl_convert_filter_flush(to_wchar); +// mbfl_convert_filter_flush(from_wchar); +// result_ptr = mbfl_memory_device_result(&device, &result); +// mbfl_convert_filter_delete(to_wchar); +// mbfl_convert_filter_delete(from_wchar); +// +// if (!result_ptr) { +// THROW_EXCEPTION (new_Exception(string(__FILE__), __LINE__, string("mbfl error", 10))); +// } +// +// return string((const char*) result_ptr->val, result_ptr->len); +} + +string f$mb_strtoupper(const string &str, const Optional &encoding){ + return mb_convert_case(str, KPHP_UNICODE_CASE_UPPER, encoding); +} + +string f$mb_strtolower(const string &str, const Optional &encoding){ + return mb_convert_case(str, KPHP_UNICODE_CASE_LOWER, encoding); +} + +int64_t f$mb_strwidth(const string &str, const Optional &encoding){ + size_t n; + + const mbfl_encoding *enc = mb_get_encoding(encoding); + + if (!enc) { + php_critical_error ("encoding \"%s\" isn't supported in mb_strwidth", encoding.val().c_str()); + } + + mbfl_string _string; + mbfl_string_init(&_string); + _string.no_encoding = enc->no_encoding; + 
_string.len = str.size(); + _string.val = (unsigned char*)str.c_str(); + + n = mbfl_strwidth(&_string); + + return n; +} + +Optional f$mb_strpos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding){ + int reverse = 0; + size_t real_offset = offset; + mbfl_string _haystack, _needle; + size_t n; + + const mbfl_encoding *enc = mb_get_encoding(encoding); + + if (!enc) { + php_critical_error ("encoding \"%s\" isn't supported in mb_strpos", encoding.val().c_str()); + } + + mbfl_string_init(&_haystack); + _haystack.no_encoding = enc->no_encoding; + _haystack.len = haystack.size(); + _haystack.val = (unsigned char*) haystack.c_str(); + + mbfl_string_init(&_needle); + _needle.no_encoding = enc->no_encoding; + _needle.len = needle.size(); + _needle.val = (unsigned char*) needle.c_str(); + + if (real_offset != 0) { + size_t slen = mbfl_strlen(&_haystack); + if (offset < 0) { + real_offset += slen; + } + if (real_offset > slen) { + php_warning ("offset not contained in string"); + return false; + } + } + + if (needle.size() <= 0) { + php_warning ("empty delimiter"); + return false; + } + + n = mbfl_strpos(&_haystack, &_needle, real_offset, reverse); + if (!mbfl_is_error(n)){ + return n; + } else { + switch (-n) { + case 1: + break; + case 2: + php_warning ("Needle has not positive length"); + break; + case 4: + php_warning ("Unknown encoding or conversion error"); + break; + case 8: + php_warning ("Argument is empty"); + break; + default: + php_warning ("Unknown error in mb_strpos"); + break; + } + return false; + } +} + +Optional f$mb_strrpos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding){ + int reverse = 1; + mbfl_string _haystack, _needle; + size_t n; + + const mbfl_encoding *enc = mb_get_encoding(encoding); + + if (!enc) { + php_critical_error ("encoding \"%s\" isn't supported in mb_strrpos", encoding.val().c_str()); + } + + mbfl_string_init(&_haystack); + _haystack.no_encoding = 
enc->no_encoding; + _haystack.len = haystack.size(); + _haystack.val = (unsigned char*) haystack.c_str(); + + mbfl_string_init(&_needle); + _needle.no_encoding = enc->no_encoding; + _needle.len = needle.size(); + _needle.val = (unsigned char*) needle.c_str(); + + if (offset != 0) { + size_t haystack_char_len = mbfl_strlen(&_haystack); + if ((offset > 0 && offset > haystack_char_len) || + (offset < 0 && -offset > haystack_char_len)) { + php_warning ("Offset is greater than the length of haystack string"); + return false; + } + } + + n = mbfl_strpos(&_haystack, &_needle, offset, reverse); + if (!mbfl_is_error(n)) { return n; } else { return false; } + +} + +Optional f$mb_strripos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding){ + int reverse = 1; + int64_t real_offset = offset; + mbfl_string _haystack, _needle; + size_t n = (size_t) - 1; + + const mbfl_encoding *enc = mb_get_encoding(encoding); + + if (!enc) { + return n; + } + + if (needle.size() == 0) { + php_warning ("Empty delimiter"); + return false; + } + + mbfl_string_init(&_haystack); + _haystack.no_encoding = enc->no_encoding; + + mbfl_string_init(&_needle); + _needle.no_encoding = enc->no_encoding; + + do { + /* We're using simple case-folding here, because we'd have to deal with remapping of + * offsets otherwise. 
*/ + + string lower_haystack = f$mb_strtolower(haystack, encoding); + _haystack.len = lower_haystack.size(); + _haystack.val = (unsigned char*) lower_haystack.c_str(); + + if (!_haystack.val || _haystack.len == 0) { + break; + } + + string lower_needle = f$mb_strtolower(needle, encoding); + _needle.len = lower_needle.size(); + _needle.val = (unsigned char*) lower_needle.c_str(); + + if (!_needle.val || _needle.len == 0) { + break; + } + + if (offset != 0) { + size_t haystack_char_len = mbfl_strlen(&_haystack); + + if (reverse) { + if ((offset > 0 && (size_t)offset > haystack_char_len) || + (offset < 0 && (size_t)(-offset) > haystack_char_len)) { + php_warning("Offset is greater than the length of haystack string"); + break; + } + } else { + if (offset < 0) { + real_offset += (int64_t )haystack_char_len; + } + if (real_offset < 0 || (size_t)real_offset > haystack_char_len) { + php_warning("Offset not contained in string"); + break; + } + } + } + + n = mbfl_strpos(&_haystack, &_needle, real_offset, reverse); + } while(0); + + if (!mbfl_is_error(n)) { return n; } else { return false; } +} + +Optional f$mb_stripos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding){ + int reverse = 0; + int64_t real_offset = offset; + mbfl_string _haystack, _needle; + size_t n = (size_t) - 1; + + const mbfl_encoding *enc = mb_get_encoding(encoding); + + if (!enc) { + return n; + } + + if (needle.size() == 0) { + php_warning ("Empty delimiter"); + return false; + } + + mbfl_string_init(&_haystack); + _haystack.no_encoding = enc->no_encoding; + + mbfl_string_init(&_needle); + _needle.no_encoding = enc->no_encoding; + + do { + /* We're using simple case-folding here, because we'd have to deal with remapping of + * offsets otherwise. 
*/ + + string lower_haystack = f$mb_strtolower(haystack, encoding); + _haystack.len = lower_haystack.size(); + _haystack.val = (unsigned char*) lower_haystack.c_str(); + + if (!_haystack.val || _haystack.len == 0) { + break; + } + + string lower_needle = f$mb_strtolower(needle, encoding); + _needle.len = lower_needle.size(); + _needle.val = (unsigned char*) lower_needle.c_str(); + + if (!_needle.val || _needle.len == 0) { + break; + } + + if (offset != 0) { + size_t haystack_char_len = mbfl_strlen(&_haystack); + + if (reverse) { + if ((offset > 0 && (size_t)offset > haystack_char_len) || + (offset < 0 && (size_t)(-offset) > haystack_char_len)) { + php_warning("Offset is greater than the length of haystack string"); + break; + } + } else { + if (offset < 0) { + real_offset += (int64_t )haystack_char_len; + } + if (real_offset < 0 || (size_t)real_offset > haystack_char_len) { + php_warning("Offset not contained in string"); + break; + } + } + } + + n = mbfl_strpos(&_haystack, &_needle, real_offset, reverse); + } while(0); + + if (!mbfl_is_error(n)) { return n; } else { return false; } +} + +Optional f$mb_strstr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding) { + Optional start = f$mb_strpos(haystack, needle, 0, encoding); + if (start.has_value()) { + if (before_needle) { + return f$mb_substr(haystack, 0, val(start), encoding); + } else { + return f$mb_substr(haystack, val(start), false, encoding); + } + } + return false; +} + + +Optional f$mb_stristr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding){ + Optional start = f$mb_stripos(haystack, needle, 0, encoding); + if (start.has_value()) { + if (before_needle) { + return f$mb_substr(haystack, 0, val(start), encoding); + } else { + return f$mb_substr(haystack, val(start), false, encoding); + } + } + return false; +} + +Optional f$mb_strrchr(const string &haystack, const string &needle, const bool before_needle, const 
Optional &encoding){ + Optional start = f$mb_strrpos(haystack, needle, 0, encoding); + if (start.has_value()) { + if (before_needle) { + return f$mb_substr(haystack, 0, val(start), encoding); + } else { + return f$mb_substr(haystack, val(start), false, encoding); + } + } + return false; +} + +Optional f$mb_strrichr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding){ + Optional start = f$mb_strripos(haystack, needle, 0, encoding); + if (start.has_value()) { + if (before_needle) { + return f$mb_substr(haystack, 0, val(start), encoding); + } else { + return f$mb_substr(haystack, val(start), false, encoding); + } + } + return false; +} + #else -#include "common/unicode/unicode-utils.h" -#include "common/unicode/utf8-utils.h" static bool is_detect_incorrect_encoding_names_warning{false}; diff --git a/runtime/mbstring/mbstring.h b/runtime/mbstring/mbstring.h index 2647d32163..bcf0da9c0b 100644 --- a/runtime/mbstring/mbstring.h +++ b/runtime/mbstring/mbstring.h @@ -12,709 +12,38 @@ bool mb_UTF8_check(const char *s); #ifdef MBFL -/** - * Check if strings are valid for the specified encoding - * Checks if the specified byte stream is valid for the specified encoding. If value is of type array, all keys and values are validated recursively. - * It is useful to prevent so-called "Invalid Encoding Attack". - * @param array|string value The byte stream - * @param ?string encoding (default = null) The expected encoding - * @return bool Returns true on success or false on failure - */ + bool f$mb_check_encoding(const mixed &value, const Optional &encoding); -/** - * Returns a string containing the character specified by the Unicode code point value, encoded in the specified encoding - * @param int codepoint A Unicode codepoint value, e.g. 128024 for U+1F418 ELEPHANT - * @param ?string encoding (default = null) The encoding parameter is the character encoding. 
If it is omitted or null, - * the internal character encoding value will be used. - * @return string|false A string containing the requested character, if it can be represented in the specified encoding or false on failure. - */ -Optional f$mb_chr(const int64_t codepoint, const Optional &encoding); - -/** - * Perform case folding on a string - * @param string str The string being converted - * @param int mode The mode of the conversion. It can be one of MB_CASE_UPPER, MB_CASE_LOWER, MB_CASE_TITLE, MB_CASE_FOLD, - * MB_CASE_UPPER_SIMPLE, MB_CASE_LOWER_SIMPLE, MB_CASE_TITLE_SIMPLE, MB_CASE_FOLD_SIMPLE - * @param ?string encoding (default = null) The encoding parameter is the character encoding. If it is omitted or null, - * the internal character encoding value will be used. - * @return string A case folded version of string converted in the way specified by mode - */ -string f$mb_convert_case(const string &str, const int64_t mode, const Optional &encoding); - -/** - * Convert from one character encoding to another - * @param array|string str The string or array to be converted - * @param string to_encoding The desired encoding of the result - * @param array|string|null from_encoding (default = null) The current encoding used to interpret string. - * Multiple encodings may be specified as an array or comma separated list, - * in which case the correct encoding will be guessed using the same algorithm as mb_detect_encoding(). - * If from_encoding is null or not specified, the mbstring.internal_encoding setting will be used if set, otherwise the default_charset setting. 
- * @return array|string|false The encoded string - */ mixed f$mb_convert_encoding(const mixed &str, const string &to_encoding, const mixed &from_encoding); -/** - * Convert "kana" one from another ("zen-kaku", "han-kaku" and more) - * @param string str The string being converted - * @param string mode The conversion option (default = "KV") - * r - Convert "zen-kaku" alphabets to "han-kaku" - * R - Convert "han-kaku" alphabets to "zen-kaku" - * n - Convert "zen-kaku" numbers to "han-kaku" - * N - Convert "han-kaku" numbers to "zen-kaku" - * a - Convert "zen-kaku" alphabets and numbers to "han-kaku" - * A - Convert "han-kaku" alphabets and numbers to "zen-kaku" - * (Characters included in "a", "A" options are U+0021 - U+007E excluding U+0022, U+0027, U+005C, U+007E) - * s - Convert "zen-kaku" space to "han-kaku" (U+3000 -> U+0020) - * S - Convert "han-kaku" space to "zen-kaku" (U+0020 -> U+3000) - * k - Convert "zen-kaku kata-kana" to "han-kaku kata-kana" - * K - Convert "han-kaku kata-kana" to "zen-kaku kata-kana" - * h - Convert "zen-kaku hira-gana" to "han-kaku kata-kana" - * H - Convert "han-kaku kata-kana" to "zen-kaku hira-gana" - * c - Convert "zen-kaku kata-kana" to "zen-kaku hira-gana" - * C - Convert "zen-kaku hira-gana" to "zen-kaku kata-kana" - * V - Collapse voiced sound notation and convert them into a character. Use with "K","H" - * @param ?string encoding (default = null) The encoding parameter is the character encoding. - * If it is omitted or null, the internal character encoding value will be used. - * @return string The converted string - */ -string f$mb_convert_kana(const string &str, const string &mode, const Optional &encoding); - -/** - * Convert character code in variable(s) - * @param string to_encoding The encoding that the string is being converted to - * @param array|string from_encoding is specified as an array or comma separated string, it tries to detect encoding from from-coding. 
- * When from_encoding is omitted, detect_order is used. - * @param mixed &vars References to the variable being converted. String, Array are accepted. mb_convert_variables() assumes - * all parameters have the same encoding. - * @return string|false The character encoding before conversion for success, or false for failure - */ -Optional f$mb_convert_variables(const string &to_encoding, const mixed &from_encoding, const mixed &vars); // ??? - -/** - * Decode string in MIME header field - * @param string str The string being decoded - * @return string The decoded string in internal character encoding - */ -string f$mb_decode_mimeheader(const string &string); - -/** - * Decode HTML numeric string reference to character - * @param string str The string being decoded - * @param array map An array that specifies the code area to convert - * @param ?string encoding (default = null) The encoding parameter is the character encoding. - * If it is omitted or null, the internal character encoding value will be used. - * @return string The converted string - */ -string f$mb_decode_numericentity(const string &str, const array &map, const Optional &encoding); - -/** - * Detect character encoding - * Detects the most likely character encoding for string string from an ordered list of candidates. Automatic detection of the intended character encoding - * can never be entirely reliable; without some additional information, it is similar to decoding an encrypted string without the key. It is always preferable - * to use an indication of character encoding stored or transmitted with the data, such as a "Content-Type" HTTP header. This function is most useful with - * multibyte encodings, where not all sequences of bytes form a valid string. If the input string contains such a sequence, that encoding will be rejected, - * and the next encoding checked. 
- * @param string str The string being inspected - * @param array|string|null encodings (default = null) A list of character encodings to try, in order. The list may be specified as an array of strings, - * or a single string separated by commas. If encodings is omitted or null, the current detect_order (set with the mbstring.detect_order configuration option, - * or mb_detect_order() function) will be used. - * @param bool strict (default = false) Controls the behaviour when string is not valid in any of the listed encodings. - * If strict is set to false, the closest matching encoding will be returned; if strict is set to true, false will be returned. - * @return string|false Controls the behaviour when string is not valid in any of the listed encodings. If strict is set to false, - * the closest matching encoding will be returned; if strict is set to true, false will be returned. The default value for strict can be set - * with the mbstring.strict_detection configuration option. - */ -Optional f$mb_detect_encoding(const string &str, const mixed &encodings, const bool strict = false); - -/** - * Set/Get character encoding detection order - * @param array|string|null encoding (default = null) encoding is an array or comma separated list of character encoding. See supported encodings. - * If encoding is omitted or null, it returns the current character encoding detection order as array. This setting affects - * mb_detect_encoding() and mb_send_mail(). - * @return array|bool When setting the encoding detection order, true is returned on success or false on failure. - * When getting the encoding detection order, an ordered array of the encodings is returned. - */ -mixed f$mb_detect_order(const mixed &encoding); - -/** - * Encode string for MIME header - * @param string str The string being encoded. 
Its encoding should be same as mb_internal_encoding() - * @param ?string charset (default = null) Specifies the name of the character set in which string is represented in. - * The default value is determined by the current NLS setting (mbstring.language) - * @param ?string transfer_encoding (default = null) Specifies the scheme of MIME encoding. - * It should be either "B" (Base64) or "Q" (Quoted-Printable). Falls back to "B" if not given. - * @param string newline (default = "\r\n") Specifies the EOL (end-of-line) marker with which mb_encode_mimeheader() performs line-folding - * (a » RFC term, the act of breaking a line longer than a certain length into multiple lines. The length is currently hard-coded to 74 characters). - * Falls back to "\r\n" (CRLF) if not given. - * @param int indent (default = 0) Indentation of the first line (number of characters in the header before string) - * @return string A converted version of the string represented in ASCII - */ -string f$mb_encode_mimeheader(const string &str, const Optional &charset, const Optional &transfer_encoding, const string &newline, const int64_t indent); - -/** - * Encode character to HTML numeric string reference - * Converts specified character codes in string string from character code to HTML numeric character reference - * @param string str The string being encoded - * @param array map Aarray specifies code area to convert - * @param ?string encding (default = null) The encoding parameter is the character encoding. 
If it is omitted or null, the internal character encoding value will be used - * @param bool hex (default = false) Whether the returned entity reference should be in hexadecimal notation (otherwise it is in decimal notation) - * @return string The converted string - */ -string f$mb_encode_numericentity(const string &str, const array &map, const Optional &encoding, const bool hex = false); - -/** - * Get aliases of a known encoding type - * @param string encoding The encoding type being checked, for aliases - * @return array Returns a numerically indexed array of encoding aliases - */ -array f$mb_encoding_aliases(const string &encoding); - -/** - * Regular expression match for multibyte string - * @param string pattern The regular expression pattern - * @param string str The string being evaluated - * @param ?string options (default = null) The search option. See mb_regex_set_options() for explanation - * @return bool Returns true if string matches the regular expression pattern, false if not - */ -bool f$mb_ereg_match(const string &pattern, const string &str, const Optional &options); - -/** - * Perform a regular expression search and replace with multibyte support using a callback - * Scans string for matches to pattern, then replaces the matched text with the output of callback function. - * The behavior of this function is almost identical to mb_ereg_replace(), except for the fact that instead of replacement parameter, - * one should specify a callback. - * @param string pattern The regular expression pattern. Multibyte characters may be used in pattern. - * @param callable callback A callback that will be called and passed an array of matched elements in the subject string. - * The callback should return the replacement string. You'll often need the callback function for a mb_ereg_replace_callback() in just one place. - * In this case you can use an anonymous function to declare the callback within the call to mb_ereg_replace_callback(). 
- * By doing it this way you have all information for the call in one place and do not clutter the function namespace with a callback - * function's name not used anywhere else. - * @param string str The string being checked - * @param ?string options (default = null) The search option. See mb_regex_set_options() for explanation - * @return string|false|null The resultant string on success, or false on error. If string is not valid for the current encoding, null is returned - */ -// Optional f$mb_ereg_replace_callback(const string &pattern, const CallableT &callback, const string &str, const Optional options); // callback - -/** - * Replace regular expression with multibyte support - * Scans string for matches to pattern, then replaces the matched text with replacement - * @param string pattern The regular expression pattern. Multibyte characters may be used in pattern - * @param string replacement The replacement text - * @param string str The string being checked - * @param ?string options (default = null) The search option. See mb_regex_set_options() for explanation - * @return string|false|null The resultant string on success, or false on error. If string is not valid for the current encoding, null is returned - */ -Optional f$mb_ereg_replace(const string &pattern, const string &replacement, const string &str, const Optional &options); - -/** - * Returns start point for next regular expression match - * @return int mb_ereg_search_getpos() returns the point to start regular expression match for mb_ereg_search(), mb_ereg_search_pos(), mb_ereg_search_regs(). - * The position is represented by bytes from the head of string. - */ -int64_t f$mb_ereg_search_getpos(void); - -/** - * Retrieve the result from the last multibyte regular expression match - * @return array|false An array including the sub-string of matched part by last mb_ereg_search(), mb_ereg_search_pos(), mb_ereg_search_regs(). 
- * If there are some matches, the first element will have the matched sub-string, the second element will have the first part grouped with brackets, - * the third element will have the second part grouped with brackets, and so on. It returns false on error. - */ -mixed f$mb_ereg_search_getregs(void); - -/** - * Setup string and regular expression for a multibyte regular expression match - * mb_ereg_search_init() sets string and pattern for a multibyte regular expression. - * These values are used for mb_ereg_search(), mb_ereg_search_pos(),and mb_ereg_search_regs(). - * @param string str The search string - * @param ?string pattern (default = null) The search pattern - * @param ?string options (default = null) The search option. See mb_regex_set_options() for explanation - * @return bool Returns true on success or false on failure - */ -bool f$mb_ereg_search_init(const string &str, const Optional &pattern, const Optional &options); - -/** - * Returns position and length of a matched part of the multibyte regular expression for a predefined multibyte string - * The string for match is specified by mb_ereg_search_init(). If it is not specified, the previous one will be used - * @param ?string pattern (default = null) The search pattern - * @param ?string options (default = null) The search option. See mb_regex_set_options() for explanation - * @return array|false An array containing two elements. The first element is the offset, in bytes, where the match begins relative to the start of - * the search string, and the second element is the length in bytes of the match. If an error occurs, false is returned. - */ -mixed f$mb_ereg_search_pos(const Optional &pattern, const Optional &options); - -/** - * Returns the matched part of a multibyte regular expression - * @param ?string pattern (default = null) The search pattern - * @param ?string options (deafult = null) The search option. 
See mb_regex_set_options() for explanation - * @return array|false mb_ereg_search_regs() executes the multibyte regular expression match, and if there are some matched part, - * it returns an array including substring of matched part as first element, the first grouped part with brackets as second element, - * the second grouped part as third element, and so on. It returns false on error. - */ -mixed f$mb_ereg_search_regs(const Optional &pattern, const Optional &options); - -/** - * Set start point of next regular expression match - * mb_ereg_search_setpos() sets the starting point of a match for mb_ereg_search(). - * @param int offset The position to set. If it is negative, it counts from the end of the string - * @return bool Returns true on success or false on failure - */ -bool f$mb_ereg_search_setpos(const int64_t offset); - -/** - * Multibyte regular expression match for predefined multibyte string - * @param ?string pattern (default = null) The search pattern - * @param ?string options (default = null) The search option. See mb_regex_set_options() for explanation - * @return bool mb_ereg_search() returns true if the multibyte string matches with the regular expression, or false otherwise. The string for matching - * is set by mb_ereg_search_init(). If pattern is not specified, the previous one is used. - */ -bool f$mb_ereg_search(const Optional &pattern, const Optional &options); - -/** - * Regular expression match with multibyte support - * @param string pattern The search pattern - * @param string str The search string - * @param array matches (default = null) If matches are found for parenthesized substrings of pattern and the function is called with the - * third argument matches, the matches will be stored in the elements of the array matches. If no matches are found, matches is set to an empty array. 
- * matches[1] will contain the substring which starts at the first left parenthesis; $matches[2] will contain the substring starting at the second, - * and so on. $matches[0] will contain a copy of the complete string matched. - * @return bool Returns whether pattern matches string - */ -bool f$mb_ereg(const string &pattern, const string &str, const array &matches); - -/** - * Replace regular expression with multibyte support ignoring case - * Scans string for matches to pattern, then replaces the matched text with replacement - * @param string pattern The regular expression pattern. Multibyte characters may be used. The case will be ignored - * @param string replacement The replacement text - * @param string str The searched string - * @param ?string options (default = null) The search option. See mb_regex_set_options() for explanation - * @return string|false|null The resultant string or false on error. If string is not valid for the current encoding, null is returned - */ -Optional f$mb_eregi_replace(const string &pattern, const string &replacement, const string &str, const Optional &options); - -/** - * Regular expression match ignoring case with multibyte support - * @param string pattern The regular expression pattern - * @param string str The string being searched - * @param array matches (default = null) If matches are found for parenthesized substrings of pattern and the function is called with the third argument matches, - * the matches will be stored in the elements of the array matches. If no matches are found, matches is set to an empty array. - * matches[1] will contain the substring which starts at the first left parenthesis; $matches[2] will contain the substring starting at the second, - * and so on. $matches[0] will contain a copy of the complete string matched. 
- * @return bool Returns whether pattern matches string - */ -bool f$mb_eregi(const string &pattern, const string &str, const array &matches); - -/** - * Get internal settings of mbstring - * @param string type (default = "all") If type is not specified or is specified as "all", "internal_encoding", "http_input", "http_output", - * "http_output_conv_mimetypes", "mail_charset", "mail_header_encoding", "mail_body_encoding", "illegal_chars", "encoding_translation", "language", - * "detect_order", "substitute_character" and "strict_detection" will be returned. - * If type is specified as "internal_encoding", "http_input", "http_output", "http_output_conv_mimetypes", "mail_charset", "mail_header_encoding", - * "mail_body_encoding", "illegal_chars", "encoding_translation", "language", "detect_order", "substitute_character" or "strict_detection" - * the specified setting parameter will be returned. - * @return array|string|int|false An array of type information if type is not specified, otherwise a specific type, or false on failure - */ -mixed f$mb_get_info(const string &type); - -/** - * Detect HTTP input character encoding - * @param ?string type (default = null) Input string specifies the input type. "G" for GET, "P" for POST, "C" for COOKIE, "S" for string, - * "L" for list, and "I" for the whole list (will return array). If type is omitted, it returns the last input type processed. - * @return array|string|false The character encoding name, as per the type, or an array of character encoding names, if type is "I". - * If mb_http_input() does not process specified HTTP input, it returns false. - */ -mixed f$mb_http_input(const Optional &type); - -/** - * Set/Get the HTTP output character encoding. Output after this function is called will be converted from the set internal encoding to encoding - * @param ?string encoding (default = null) If encoding is set, mb_http_output() sets the HTTP output character encoding to encoding. 
- * If encoding is omitted, mb_http_output() returns the current HTTP output character encoding. - * @return string|bool If encoding is omitted, mb_http_output() returns the current HTTP output character encoding. Otherwise, - * Returns true on success or false on failure. - */ -mixed f$mb_http_output(const Optional &encoding); - -/** - * Set/Get internal character encoding - * @param ?string encoding (default = null) encoding is the character encoding name used for the HTTP input character encoding conversion, - * HTTP output character encoding conversion, and the default character encoding for string functions defined by the mbstring module. - * You should notice that the internal encoding is totally different from the one for multibyte regex. - * @return string|bool If encoding is set, then Returns true on success or false on failure. - * In this case, the character encoding for multibyte regex is NOT changed. - * If encoding is omitted, then the current character encoding name is returned. - */ -mixed f$mb_internal_encoding(const Optional &encoding); - -/** - * Set/Get the current language - * @param ?string language (default = null) Used for encoding e-mail messages. The valid languages are listed in the following table. - * mb_send_mail() uses this setting to encode e-mail. 
- * +---------------------------+-------------+------------------+-----------+ - * | Language | Charset | Encoding | Alias | - * +---------------------------+-------------+------------------+-----------+ - * | German/de | ISO-8859-15 | Quoted-Printable | Deutsch | - * | English/en | ISO-8859-1 | Quoted-Printable | | - * | Armenian/hy | ArmSCII-8 | Quoted-Printable | | - * | Japanese/ja | ISO-2022-JP | BASE64 | | - * | Korean/ko | ISO-2022-KR | BASE64 | | - * | neutral | UTF-8 | BASE64 | | - * | Russian/ru | KOI8-R | Quoted-Printable | | - * | Turkish/tr | ISO-8859-9 | Quoted-Printable | | - * | Ukrainian/ua | KOI8-U | Quoted-Printable | | - * | uni | UTF-8 | BASE64 | universal | - * | Simplified Chinese/zh-cn | HZ | BASE64 | | - * | Traditional Chinese/zh-tw | BIG-5 | BASE64 | | - * +---------------------------+-------------+------------------+-----------+ - * @return string|bool If language is set and language is valid, it returns true. Otherwise, it returns false. When language is omitted or null, - * it returns the language name as a string - */ -mixed f$mb_language(const Optional &language); - -/** - * Returns an array of all supported encodings - * @return array Returns a numerically indexed array - */ -array f$mb_list_encodings(void); - -/** - * Returns the Unicode code point value of the given character. This function complements mb_chr(). - * @param string str A string - * @param string? encoding (default = null) The encoding parameter is the character encoding. If it is omitted or null, - * the internal character encoding value will be used. - * @return int|false The Unicode code point for the first character of string or false on failure. - */ -Optional f$mb_ord(const string &str, const Optional &encoding); - -/** - * mb_output_handler() is ob_start() callback function. mb_output_handler() converts characters in the output buffer from internal - * character encoding to HTTP output character encoding. 
- * @param string str The contents of the output buffer - * @param int status The status of the output buffer - * @return string The converted string - */ -string f$mb_output_handler(const string &str, const int64_t status); - -/** - * Parses GET/POST/COOKIE data and sets global variables. Since PHP does not provide raw POST/COOKIE data, it can only be used for GET data for now. - * It parses URL encoded data, detects encoding, converts coding to internal encoding and set values to the result array or global variables. - * @param string str The URL encoded data - * @param array result An array containing decoded and character encoded converted values - * @return bool Returns true on success or false on failure - */ -bool f$mb_parse_str(const string &str, const array &result); // result = map - -/** - * Get a MIME charset string for a specific encoding. - * @param string encoding The encoding being checked - * @return string|false The MIME charset string for character encoding encoding, or false if no charset is preferred for the given encoding - */ -Optional f$mb_preferred_mime_name(const string &encoding); - -/** - * Set/Get character encoding for a multibyte regex - * @param ?string encoding (default = null) The encoding parameter is the character encoding. If it is omitted or null, - * the internal character encoding value will be used - * @return string|bool If encoding is set, then Returns true on success or false on failure. In this case, the internal character encoding is NOT changed. - * If encoding is omitted, then the current character encoding name for a multibyte regex is returned - */ -mixed f$mb_regex_encoding(const Optional &encoding); - -/** - * Sets the default options described by options for multibyte regex functions - * @param ?string options (default = null) The options to set. This is a string where each character is an option. 
- * To set a mode, the mode character must be the last one set, however there can only be set one mode but multiple options - * - * Regex options: - * +--------+----------------------------------+ - * | Option | Meaning | - * +--------+----------------------------------+ - * | i | Ambiguity match on | - * | x | Enables extended pattern form | - * | m | '.' matches with newlines | - * | s | '^' -> '\A', '$' -> '\Z' | - * | p | Same as both the m and s options | - * | l | Finds longest matches | - * | n | Ignores empty matches | - * | e | eval() resulting code | - * +--------+----------------------------------+ - * - * Regex syntax modes: - * +------+----------------------------+ - * | Mode | Meaning | - * +------+----------------------------+ - * | j | Java (Sun java.util.regex) | - * | u | GNU regex | - * | g | grep | - * | c | Emacs | - * | r | Ruby | - * | z | Perl | - * | b | POSIX Basic regex | - * | d | POSIX Extended regex | - * +------+----------------------------+ - * - * @return string The previous options. If options is omitted or null, it returns the string that describes the current options - */ -string f$mb_regex_set_options(const Optional &options); - -/** - * This function is currently not documented; only its argument list is available. - * @param string str - * @param ?string encoding (default = null) - * @return string - */ -string f$mb_scrub(const string &str, const Optional &encoding); - -/** - * Sends email. Headers and messages are converted and encoded according to the mb_language() setting. - * It's a wrapper function for mail(), so see also mail() for detail - * @param string to The mail addresses being sent to. Multiple recipients may be specified by putting a comma between each address in to. 
- * This parameter is not automatically encoded - * @param string subject The subject of the mail - * @param string message The message of the mail - * @param array|string additional_headers (default = []) String or array to be inserted at the end of the email header. - * This is typically used to add extra headers (From, Cc, and Bcc). Multiple extra headers should be separated with a CRLF (\r\n). - * Validate parameter not to be injected unwanted headers by attackers. If an array is passed, its keys are the header names and its - * values are the respective header values - * Note: - * If messages are not received, try using a LF (\n) only. Some Unix mail transfer agents (most notably » qmail) replace LF by CRLF automatically - * (which leads to doubling CR if CRLF is used). This should be a last resort, as it does not comply with » RFC 2822. - * @param ?string additional_params (default = null) additional_params is a MTA command line parameter. It is useful when setting the correct Return-Path header - * when using sendmail. This parameter is escaped by escapeshellcmd() internally to prevent command execution. escapeshellcmd() prevents command execution, - * but allows to add additional parameters. For security reason, this parameter should be validated. Since escapeshellcmd() is applied automatically, - * some characters that are allowed as email addresses by internet RFCs cannot be used. Programs that are required to use these characters mail() cannot be used. - * The user that the webserver runs as should be added as a trusted user to the sendmail configuration to prevent a 'X-Warning' header from being added to - * the message when the envelope sender (-f) is set using this method. 
For sendmail users, this file is /etc/mail/trusted-users - * @return bool Returns true on success or false on failure - */ -bool f$mb_send_mail(const string &to, const string &subject, const string &message, const mixed &additional_headers, const Optional &additional_params); - -/** - * Split a multibyte string using regular expression pattern and returns the result as an array - * @param string pattern The regular expression pattern - * @param string str The string being split - * @param int limit (default = -1) If optional parameter limit is specified, it will be split in limit elements as maximum - * @return array|false The result as an array, or false on failure - */ -mixed f$mb_split(const string &pattern, const string &str, const int64_t limit = -1); - -/** - * This function will return an array of strings, it is a version of str_split() with support for encodings of variable character size as well - * as fixed-size encodings of 1,2 or 4 byte characters. If the length parameter is specified, the string is broken down into chunks of the specified - * length in characters (not bytes). The encoding parameter can be optionally specified and it is good practice to do so - * @param string str The string to split into characters or chunks - * @param int length (default = 1) If specified, each element of the returned array will be composed of multiple characters instead of a single character - * @param ?string encoding (default = null) The encoding parameter is the character encoding. If it is omitted or null, the internal character encoding value - * will be used. A string specifying one of the supported encodings - * @return array mb_str_split() returns an array of strings - */ -array f$mb_str_split(const string &str, const int64_t length, const Optional &encoding); - -/** - * mb_strcut() extracts a substring from a string similarly to mb_substr(), but operates on bytes instead of characters. 
- * If the cut position happens to be between two bytes of a multi-byte character, the cut is performed starting from the first byte of that character. - * This is also the difference to the substr() function, which would simply cut the string between the bytes and thus result in a malformed byte sequence - * @param string str The string being cut - * @param int start If start is non-negative, the returned string will start at the start'th byte position in string, counting from zero. - * For instance, in the string 'abcdef', the byte at position 0 is 'a', the byte at position 2 is 'c', and so forth. - * If start is negative, the returned string will start at the start'th byte counting back from the end of string. - * However, if the magnitude of a negative start is greater than the length of the string, the returned portion will start from the beginning of string - * @param ?int length (default = null) Length in bytes. If omitted or NULL is passed, extract all bytes to the end of the string. - * If length is negative, the returned string will end at the length'th byte counting back from the end of string. - * However, if the magnitude of a negative length is greater than the number of characters after the start position, an empty string will be returned - * @param ?string encoding The encoding parameter is the character encoding. If it is omitted or null, the internal character encoding value will be used - * @return string mb_strcut() returns the portion of string specified by the start and length parameters - */ -string f$mb_strcut(const string &str, const int64_t start, const Optional &length, const Optional &encoding); - -/** - * Truncates string string to specified width, where halfwidth characters count as 1, and fullwidth characters count as 2. - * See » http://www.unicode.org/reports/tr11/ for details regarding East Asian character widths - * @param string str The string being decoded - * @param int start The start position offset. 
Number of characters from the beginning of string (first character is 0), - * or if start is negative, number of characters from the end of the string - * @param int width The width of the desired trim. Negative widths count from the end of the string - * @param string trim_marker (default = "") A string that is added to the end of string when string is truncated - * @param ?string encoding (default = null) The encoding parameter is the character encoding. If it is omitted or null, - * the internal character encoding value will be used - * @return string The truncated string. If trim_marker is set, trim_marker replaces the last chars to match the width - */ -string f$mb_strimwidth(const string &str, const int64_t start, const int64_t width, const string &trim_marker, const Optional &encoding); - -/** - * mb_stripos() returns the numeric position of the first occurrence of needle in the haystack string. Unlike mb_strpos(), - * mb_stripos() is case-insensitive. If needle is not found, it returns false - * @param string haystack The string from which to get the position of the first occurrence of needl - * @param string needle The string to find in haystack - * @param int offset (default = 0) The position in haystack to start searching. A negative offset counts from the end of the string - * @param ?string encoding (default = null) Character encoding name to use. If it is omitted, internal character encoding is used - * @return int|false Return the numeric position of the first occurrence of needle in the haystack string, or false if needle is not found - */ -Optional f$mb_stripos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding); - -/** - * mb_stristr() finds the first occurrence of needle in haystack and returns the portion of haystack. - * Unlike mb_strstr(), mb_stristr() is case-insensitive. 
If needle is not found, it returns false - * @param string haystack The string from which to get the first occurrence of needle - * @param string needle The string to find in haystack - * @param bool before_needle (default = false) Determines which portion of haystack this function returns. - * If set to true, it returns all of haystack from the beginning to the first occurrence of needle (excluding needle). - * If set to false, it returns all of haystack from the first occurrence of needle to the end (including needle) - * @param ?string encoding (default = null) Character encoding name to use. If it is omitted, internal character encoding is used - * @return string|false Returns the portion of haystack, or false if needle is not found - */ -Optional f$mb_stristr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding); - -/** - * Gets the length of a string - * @param string str The string being checked for length - * @param ?string encoding (default = null) The encoding parameter is the character encoding. - * If it is omitted or null, the internal character encoding value will be used - * @return int Returns the number of characters in string string having character encoding encoding. A multi-byte character is counted as 1 - */ int64_t f$mb_strlen(const string &str, const Optional &encoding); -/** - * Finds position of the first occurrence of a string in a string. Performs a multi-byte safe strpos() operation based on number of characters. - * The first character's position is 0, the second character position is 1, and so on - * @param string haystack The string being checked - * @param string needle The string to find in haystack. In contrast with strpos(), numeric values are not applied as the ordinal value of a character - * @param int offset (default = 0) The search offset. If it is not specified, 0 is used. 
A negative offset counts from the end of the string - * @param ?string encoding (default = null) The encoding parameter is the character encoding. If it is omitted or null, - * the internal character encoding value will be used - * @return int|false Returns the numeric position of the first occurrence of needle in the haystack string. If needle is not found, it returns false - */ -Optional f$mb_strpos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding); - -/** - * mb_strrchr() finds the last occurrence of needle in haystack and returns the portion of haystack. If needle is not found, it returns false - * @param string haystack The string from which to get the last occurrence of needle - * @param string needle The string to find in haystack - * @param bool before_needle Determines which portion of haystack this function returns. - * If set to true, it returns all of haystack from the beginning to the last occurrence of needle. - * If set to false, it returns all of haystack from the last occurrence of needle to the end - * @param ?string encoding (default = null) Character encoding name to use. If it is omitted, internal character encoding is used - * @return string|false Returns the portion of haystack. or false if needle is not found - */ -Optional f$mb_strrchr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding); +string f$mb_substr(const string &str, const int64_t start, const Optional &length, const Optional &encoding); -/** - * mb_strrichr() finds the last occurrence of needle in haystack and returns the portion of haystack. Unlike mb_strrchr(), mb_strrichr() is case-insensitive. - * If needle is not found, it returns false - * @param string haystack The string from which to get the last occurrence of needle - * @param string needle The string to find in haystack - * @param bool before_needle Determines which portion of haystack this function returns. 
- * If set to true, it returns all of haystack from the beginning to the last occurrence of needle. - * If set to false, it returns all of haystack from the last occurrence of needle to the end - * @param ?string encoding (default = null) - * @return string|false Character encoding name to use. If it is omitted, internal character encoding is used - */ -Optional f$mb_strrichr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding); +int64_t f$mb_substr_count(const string &haystack, const string &needle, const Optional &encoding); -/** - * mb_strripos() performs multi-byte safe strripos() operation based on number of characters. needle position is counted from the beginning of haystack. - * First character's position is 0. Second character position is 1. Unlike mb_strrpos(), mb_strripos() is case-insensitive - * @param string haystack The string from which to get the position of the last occurrence of needle - * @param string needle The string to find in haystack - * @param int offset The position in haystack to start searching - * @param ?string encoding (default = null) Character encoding name to use. If it is omitted, internal character encoding is used - * @return int|false Return the numeric position of the last occurrence of needle in the haystack string, or false if needle is not found - */ -Optional f$mb_strripos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding); - -/** - * Performs a multibyte safe strrpos() operation based on the number of characters. needle position is counted from the beginning of haystack. - * First character's position is 0. Second character position is 1 - * @param string haystack The string being checked, for the last occurrence of needle - * @param string needle The string to find in haystack - * @param int offset (default = 0) May be specified to begin searching an arbitrary number of characters into the string. 
Negative values will stop searching at an arbitrary point prior to the end of the string - * @param ?string encoding The encoding parameter is the character encoding. If it is omitted or null, the internal character encoding value will be used - * @return int|false Returns the numeric position of the last occurrence of needle in the haystack string. If needle is not found, it returns false - */ -Optional f$mb_strrpos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding); - -/** - * mb_strstr() finds the first occurrence of needle in haystack and returns the portion of haystack. If needle is not found, it returns false - * @param string haystack The string from which to get the first occurrence of needle - * @param string needle The string to find in haystack - * @param bool before_needle Determines which portion of haystack this function returns. - * If set to true, it returns all of haystack from the beginning to the first occurrence of needle (excluding needle). - * If set to false, it returns all of haystack from the first occurrence of needle to the end (including needle) - * @param ?string encoding (default = null) Character encoding name to use. If it is omitted, internal character encoding is used - * @return string|false Returns the portion of haystack, or false if needle is not found - */ -Optional f$mb_strstr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding); +string f$mb_strtoupper(const string &str, const Optional &encoding); -/** - * Returns string with all alphabetic characters converted to lowercase - * @param string str The string being lowercased - * @param ?string encoding (default = null) The encoding parameter is the character encoding. 
- * If it is omitted or null, the internal character encoding value will be used - * @return string string with all alphabetic characters converted to lowercase - */ string f$mb_strtolower(const string &str, const Optional &encoding); -/** - * Returns string with all alphabetic characters converted to uppercase. - * @param string str The string being uppercased - * @param ?string encoding (default = null) The encoding parameter is the character encoding. - * If it is omitted or null, the internal character encoding value will be used - * @return string string with all alphabetic characters converted to uppercase - */ -string f$mb_strtoupper(const string &str, const Optional &encoding); - -/** - * Returns the width of string string, where halfwidth characters count as 1, and fullwidth characters count as 2. - * See » http://www.unicode.org/reports/tr11/ for details regarding East Asian character widths. The fullwidth characters are: - * U+1100-U+115F, U+11A3-U+11A7, U+11FA-U+11FF, U+2329-U+232A, U+2E80-U+2E99, U+2E9B-U+2EF3, U+2F00-U+2FD5, U+2FF0-U+2FFB, U+3000-U+303E, U+3041-U+3096, - * U+3099-U+30FF, U+3105-U+312D, U+3131-U+318E, U+3190-U+31BA, U+31C0-U+31E3, U+31F0-U+321E, U+3220-U+3247, U+3250-U+32FE, U+3300-U+4DBF, U+4E00-U+A48C, - * U+A490-U+A4C6, U+A960-U+A97C, U+AC00-U+D7A3, U+D7B0-U+D7C6, U+D7CB-U+D7FB, U+F900-U+FAFF, U+FE10-U+FE19, U+FE30-U+FE52, U+FE54-U+FE66, U+FE68-U+FE6B, - * U+FF01-U+FF60, U+FFE0-U+FFE6, U+1B000-U+1B001, U+1F200-U+1F202, U+1F210-U+1F23A, U+1F240-U+1F248, U+1F250-U+1F251, U+20000-U+2FFFD, U+30000-U+3FFFD. - * All other characters are halfwidth characters - * @param string str The string being decoded - * @param ?string encoding (default = null) The encoding parameter is the character encoding. 
- * If it is omitted or null, the internal character encoding value will be used - * @return int The width of string string - */ int64_t f$mb_strwidth(const string &str, const Optional &encoding); -/** - * Specifies a substitution character when input character encoding is invalid or character code does not exist in output character encoding. - * Invalid characters may be substituted "none" (no output), string or int value (Unicode character code value). - * This setting affects mb_convert_encoding(), mb_convert_variables(), mb_output_handler(), and mb_send_mail() - * @param string|int|null substitute_character (default = null) Specify the Unicode value as an int, or as one of the following strings: - * "none": no output - * "long": Output character code value (Example: U+3000, JIS+7E7E) - * "entity": Output character entity (Example: Ȁ) - * @return string|int|bool If substitute_character is set, it returns true for success, otherwise returns false. - * If substitute_character is not set, it returns the current setting - */ -mixed f$mb_substitute_character(const mixed &substitute_character); - -/** - * Counts the number of times the needle substring occurs in the haystack string - * @param string haystack The string being checked - * @param string needle The string being found - * @param ?string encoding (default = null) The encoding parameter is the character encoding. - * If it is omitted or null, the internal character encoding value will be used - * @return int The number of times the needle substring occurs in the haystack string - */ -int64_t f$mb_substr_count(const string &haystack, const string &needle, const Optional &encoding); +Optional f$mb_strpos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding); -/** - * Performs a multi-byte safe substr() operation based on number of characters. Position is counted from the beginning of string. - * First character's position is 0. 
Second character position is 1, and so on - * @param string str The string to extract the substring from - * @param int start If start is non-negative, the returned string will start at the start'th position in string, counting from zero. - * For instance, in the string 'abcdef', the character at position 0 is 'a', the character at position 2 is 'c', and so forth. - * If start is negative, the returned string will start at the start'th character from the end of string - * @param ?int length (default = null) Maximum number of characters to use from string. - * If omitted or NULL is passed, extract all characters to the end of the string - * @param ?string encoding (default = null) The encoding parameter is the character encoding. - * If it is omitted or null, the internal character encoding value will be used - * @return string mb_substr() returns the portion of string specified by the start and length parameters - */ -string f$mb_substr(const string &str, const int64_t start, const Optional &length, const Optional &encoding); +Optional f$mb_strrpos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding); + +Optional f$mb_strripos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding); + +Optional f$mb_stripos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding); + +Optional f$mb_stristr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding); + +Optional f$mb_strstr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding); + +Optional f$mb_strrchr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding); + +Optional f$mb_strrichr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding); #else diff --git a/tests/cpp/runtime/mbstring-test.cpp b/tests/cpp/runtime/mbstring-test.cpp index 
2a0a484302..b2c663dbdc 100644 --- a/tests/cpp/runtime/mbstring-test.cpp +++ b/tests/cpp/runtime/mbstring-test.cpp @@ -18,4 +18,71 @@ TEST(mbstring_test, test_mb_convert_encoding) { ASSERT_STREQ(f$mb_convert_encoding(string("ыва"), string("UTF-8"), string("ASCII")).to_string().c_str(), "??????"); } +TEST(mbstring_test, test_mb_strlen) { + const int predicted = 12; + int real = 0; + real = f$mb_strlen(string("Hello world!"), string("UTF-8")); + ASSERT_TRUE(real == predicted); +} + +TEST(mbstring_test, test_mb_substr) { + ASSERT_STREQ(f$mb_substr(string("Hello world"), 2, 3, string("UTF-8")).c_str(), "llo"); +} + +TEST(mbstring_test, test_mb_substr_null_length) { + ASSERT_STREQ(f$mb_substr(string("Hello world"), 3, false, string("UTF-8")).c_str(), "lo world"); +} + +TEST(mbstring_test, test_mb_substr_count) { + ASSERT_TRUE(f$mb_substr_count(string("Hello world"), string("l"), string("UTF-8")) == 3); +} + +TEST(mbstring_test, test_mb_strpos) { + ASSERT_TRUE(val(f$mb_strpos(string("This is a test string"), string("test"), 0, string("UTF-8"))) == 10); +} + +TEST(mbstring_test, test_mb_strrpos) { + ASSERT_TRUE(val(f$mb_strrpos(string("españololol"), string("ol"), 0, string("UTF-8"))) == 9); +} + +TEST(mbstring_test, test_mb_strtoupper) { + ASSERT_STREQ(f$mb_strtoupper(string("españololol"), string("UTF-8")).c_str(), "ESPAÑOLOLOL"); +} + +TEST(mbstring_test, test_mb_strtolower) { + ASSERT_STREQ(f$mb_strtolower(string("ESPAÑOLOLOL"), string("UTF-8")).c_str(), "españololol"); +} + +TEST(mbstring_test, test_mb_stripos) { + ASSERT_TRUE(val(f$mb_stripos(string("This is a tEsT string"), string("TeSt"), 0, string("UTF-8"))) == 10); +} + +TEST(mbstring_test, test_mb_strripos) { + ASSERT_TRUE(val(f$mb_strripos(string("espaÑOLOlol"), string("oL"), 0, string("UTF-8"))) == 9); +} + +TEST(mbstring_test, test_mb_strwidth) { + ASSERT_TRUE(val(f$mb_strwidth(string("現現"), string("UTF-8"))) == 4); +} + +TEST(mbstring_test, test_mb_strstr) { + ASSERT_STREQ(f$mb_strstr(string("This is a test 
string"), string("test"), true, string("UTF-8")).val().c_str(), "This is a "); + ASSERT_STREQ(f$mb_strstr(string("This is a test string"), string("test"), false, string("UTF-8")).val().c_str(), "test string"); +} + +TEST(mbstring_test, test_mb_stristr_before_needle) { + ASSERT_STREQ(val(f$mb_stristr(string("This is a tEsT string"), string("TeSt"), true, string("UTF-8"))).c_str(), "This is a "); + ASSERT_STREQ(val(f$mb_stristr(string("This is a tEsT string"), string("TeSt"), false, string("UTF-8"))).c_str(), "tEsT string"); +} + +TEST(mbstring_test, test_mb_strrchr) { + ASSERT_STREQ(f$mb_strrchr(string("This is a test string"), string("test"), true, string("UTF-8")).val().c_str(), "This is a "); + ASSERT_STREQ(f$mb_strrchr(string("This is a test string"), string("test"), false, string("UTF-8")).val().c_str(), "test string"); +} + +TEST(mbstring_test, test_mb_strrichr) { + ASSERT_STREQ(f$mb_strrichr(string("This is a test string"), string("test"), true, string("UTF-8")).val().c_str(), "This is a "); + ASSERT_STREQ(f$mb_strrichr(string("This is a test string"), string("test"), false, string("UTF-8")).val().c_str(), "test string"); +} + #endif \ No newline at end of file diff --git a/tests/phpt/mbstring/001_mb_strlen.php b/tests/phpt/mbstring/001_mb_strlen.php new file mode 100644 index 0000000000..8d2f990cf0 --- /dev/null +++ b/tests/phpt/mbstring/001_mb_strlen.php @@ -0,0 +1,64 @@ +@ok + Date: Wed, 7 Feb 2024 19:29:25 +0000 Subject: [PATCH 18/27] Fix small compilation errors --- cmake/external-libraries.cmake | 10 ++++++++++ runtime/mbstring/mbstring.cpp | 12 +++++++++++- runtime/mbstring/mbstring.h | 4 +++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/cmake/external-libraries.cmake b/cmake/external-libraries.cmake index b040ca6f18..0fec2a0a56 100644 --- a/cmake/external-libraries.cmake +++ b/cmake/external-libraries.cmake @@ -11,6 +11,16 @@ function(handle_missing_library LIB_NAME) endif() endfunction() +if(MBFL) + message(STATUS "MBFL=On, libmbfl 
will be downloaded and built") + add_compile_options(-DMBFL) + FetchContent_Declare(libmbfl GIT_REPOSITORY https://github.com/andreylzmw/libmbfl) + FetchContent_MakeAvailable(libmbfl) + include_directories(${libmbfl_SOURCE_DIR}/include) + add_definitions(-DLIBMBFL_LIB_DIR="${libmbfl_SOURCE_DIR}/objs") + add_link_options(-L${libmbfl_SOURCE_DIR}/objs) +endif() + find_package(fmt QUIET) if(NOT fmt_FOUND) handle_missing_library("fmtlib") diff --git a/runtime/mbstring/mbstring.cpp b/runtime/mbstring/mbstring.cpp index e42d999c34..588f37a14b 100644 --- a/runtime/mbstring/mbstring.cpp +++ b/runtime/mbstring/mbstring.cpp @@ -50,6 +50,8 @@ bool mb_UTF8_check(const char *s) { php_assert (0); } +static bool is_detect_incorrect_encoding_names_warning{false}; + #ifdef MBFL extern "C" { #include @@ -1051,4 +1053,12 @@ string f$mb_substr(const string &str, int64_t start, const mixed &length_var, co return {str.c_str() + UTF8_start, static_cast(UTF8_length)}; } -#endif \ No newline at end of file +#endif + +void f$set_detect_incorrect_encoding_names_warning(bool show) { + is_detect_incorrect_encoding_names_warning = show; +} + +void free_detect_incorrect_encoding_names() { + is_detect_incorrect_encoding_names_warning = false; +} diff --git a/runtime/mbstring/mbstring.h b/runtime/mbstring/mbstring.h index bcf0da9c0b..3a3b3ec4b2 100644 --- a/runtime/mbstring/mbstring.h +++ b/runtime/mbstring/mbstring.h @@ -70,4 +70,6 @@ void f$set_detect_incorrect_encoding_names_warning(bool show); void free_detect_incorrect_encoding_names(); -#endif \ No newline at end of file +#endif + +void free_detect_incorrect_encoding_names(); From c6be4b33c2da8b5946d8203c7492c6251724b6fb Mon Sep 17 00:00:00 2001 From: catnyan02 Date: Sat, 24 Feb 2024 07:34:31 +0000 Subject: [PATCH 19/27] Fix null encoding input, small fix mb_convert_case --- runtime/mbstring/mbstring.cpp | 15 ++++++++------ runtime/mbstring/mbstring.h | 28 +++++++++++++-------------- tests/cpp/runtime/mbstring-test.cpp | 7 +++++++ 
tests/phpt/mbstring/001_mb_strlen.php | 2 +- 4 files changed, 31 insertions(+), 21 deletions(-) diff --git a/runtime/mbstring/mbstring.cpp b/runtime/mbstring/mbstring.cpp index 588f37a14b..c69020ca63 100644 --- a/runtime/mbstring/mbstring.cpp +++ b/runtime/mbstring/mbstring.cpp @@ -293,14 +293,17 @@ int64_t f$mb_substr_count(const string &haystack, const string &needle, const Op string mb_convert_case(const string &str, const int64_t mode, const Optional &encoding){ - mixed unicode = f$mb_convert_encoding(str, string("UTF_8"), encoding.val()); + mixed utf_8 = f$mb_convert_encoding(str, string("UTF_8"), encoding.val()); - if (unicode.is_string()) { - const string &unicode_str = unicode.to_string(); + if (utf_8.is_string()) { + string utf_8_str = utf_8.to_string(); + if (strcmp(encoding.val().c_str(), "UTF_8")){ + utf_8_str = str; + } - int len = str.size(); + int len = utf_8_str.size(); string unicode_res(len * 3, false); - const char *s = str.c_str(); + const char *s = utf_8_str.c_str(); int p = 0, ch = 0, res_len = 0; switch(mode) { @@ -320,7 +323,7 @@ string mb_convert_case(const string &str, const int64_t mode, const Optional &encoding); mixed f$mb_convert_encoding(const mixed &str, const string &to_encoding, const mixed &from_encoding); -int64_t f$mb_strlen(const string &str, const Optional &encoding); +int64_t f$mb_strlen(const string &str, const Optional &encoding=false); -string f$mb_substr(const string &str, const int64_t start, const Optional &length, const Optional &encoding); +string f$mb_substr(const string &str, const int64_t start, const Optional &length=0, const Optional &encoding=false); -int64_t f$mb_substr_count(const string &haystack, const string &needle, const Optional &encoding); +int64_t f$mb_substr_count(const string &haystack, const string &needle, const Optional &encoding=false); -string f$mb_strtoupper(const string &str, const Optional &encoding); +string f$mb_strtoupper(const string &str, const Optional &encoding=false); -string 
f$mb_strtolower(const string &str, const Optional &encoding); +string f$mb_strtolower(const string &str, const Optional &encoding=false); -int64_t f$mb_strwidth(const string &str, const Optional &encoding); +int64_t f$mb_strwidth(const string &str, const Optional &encoding=false); -Optional f$mb_strpos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding); +Optional f$mb_strpos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding=false); -Optional f$mb_strrpos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding); +Optional f$mb_strrpos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding=false); -Optional f$mb_strripos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding); +Optional f$mb_strripos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding=false); -Optional f$mb_stripos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding); +Optional f$mb_stripos(const string &haystack, const string &needle, const int64_t offset, const Optional &encoding=false); -Optional f$mb_stristr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding); +Optional f$mb_stristr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding=false); -Optional f$mb_strstr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding); +Optional f$mb_strstr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding=false); -Optional f$mb_strrchr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding); +Optional f$mb_strrchr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding=false); 
-Optional f$mb_strrichr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding); +Optional f$mb_strrichr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding=false); #else diff --git a/tests/cpp/runtime/mbstring-test.cpp b/tests/cpp/runtime/mbstring-test.cpp index b2c663dbdc..ec7506da61 100644 --- a/tests/cpp/runtime/mbstring-test.cpp +++ b/tests/cpp/runtime/mbstring-test.cpp @@ -25,6 +25,13 @@ TEST(mbstring_test, test_mb_strlen) { ASSERT_TRUE(real == predicted); } +TEST(mbstring_test, test_mb_strlen_no_encoding) { + const int predicted = 12; + int real = 0; + real = f$mb_strlen(string("Hello world!")); + ASSERT_TRUE(real == predicted); +} + TEST(mbstring_test, test_mb_substr) { ASSERT_STREQ(f$mb_substr(string("Hello world"), 2, 3, string("UTF-8")).c_str(), "llo"); } diff --git a/tests/phpt/mbstring/001_mb_strlen.php b/tests/phpt/mbstring/001_mb_strlen.php index 8d2f990cf0..2c395957d4 100644 --- a/tests/phpt/mbstring/001_mb_strlen.php +++ b/tests/phpt/mbstring/001_mb_strlen.php @@ -55,7 +55,7 @@ function test_mb_strlen_long_string() { test_mb_strlen_basic_ascii(); test_mb_strlen_basic_utf_8(); test_mb_strlen_empty_string(); -// test_mb_strlen_null_encoding(); // doesn't put null through for some reason +test_mb_strlen_null_encoding(); // doesn't put null through for some reason test_mb_strlen_utf_16_encoding(); test_mb_strlen_html_entities(); test_mb_strlen_whitespaces(); From ad2584146c378c0ec2a61e8d9909f0401c863c46 Mon Sep 17 00:00:00 2001 From: Marat Omarov Date: Fri, 8 Mar 2024 19:17:52 +0300 Subject: [PATCH 20/27] Add mb_strcut function with simple tests --- builtin-functions/_functions.txt | 1 + runtime/mbstring/mbstring.cpp | 45 +++++++++++++++++++++ runtime/mbstring/mbstring.h | 2 + tests/cpp/runtime/mbstring-test.cpp | 8 ++++ tests/phpt/mbstring/015_mb_strcut.php | 57 +++++++++++++++++++++++++++ 5 files changed, 113 insertions(+) create mode 100644 
tests/phpt/mbstring/015_mb_strcut.php diff --git a/builtin-functions/_functions.txt b/builtin-functions/_functions.txt index 9e3af0c621..145b1d7254 100644 --- a/builtin-functions/_functions.txt +++ b/builtin-functions/_functions.txt @@ -1649,6 +1649,7 @@ function getenv(string $varname = '', bool $local_only = false): mixed; function mb_check_encoding(array|string $value, ?string $encoding = null): bool; function mb_convert_encoding(array|string $string, string $to_encoding, array|string|null $from_encoding = null): array|string|false; +function mb_strcut(string $string, int $start, ?int $length = null, ?string $encoding = null): string; function mb_substr(string $string, int $start, ?int $length = null, ?string $encoding = null): string; function mb_strlen(string $string, ?string $encoding = null): int; function mb_substr_count(string $haystack, string $needle, ?string $encoding = null): int; diff --git a/runtime/mbstring/mbstring.cpp b/runtime/mbstring/mbstring.cpp index c69020ca63..a3e38a42ee 100644 --- a/runtime/mbstring/mbstring.cpp +++ b/runtime/mbstring/mbstring.cpp @@ -210,6 +210,51 @@ int64_t f$mb_strlen(const string &str, const Optional &enc_name){ } +string f$mb_strcut(const string &str, const int64_t start, const Optional &length, const Optional &encoding){ + int64_t _start, _length; + bool len_is_null = !length.has_value(); + const mbfl_encoding *enc = mb_get_encoding(encoding); + + if (!enc) { + php_critical_error ("encoding \"%s\" isn't supported in mb_strcut", encoding.val().c_str()); + } + + mbfl_string _string, result, *ret; + mbfl_string_init(&_string); + _string.no_encoding = enc->no_encoding; + _string.len = str.size(); + _string.val = (unsigned char*)str.c_str(); + + if (len_is_null) { + _length = _string.len; + } else { + _length = length.val(); + } + + _start = start; + if (start < 0) { + _start = _string.len + start; + if (_start < 0) { + _start = 0; + } + } + + if (_length < 0) { + _length = (_string.len - _start) + _length; + if (_length 
< 0) { + _length = 0; + } + } + + if (_start > _length) { + return string(); + } + + ret = mbfl_strcut(&_string, &result, _start, _length); + php_assert(ret != NULL); + return string((const char*) ret->val, ret->len); +} + string f$mb_substr(const string &str, const int64_t start, const Optional &length, const Optional &encoding){ size_t real_start, real_len; diff --git a/runtime/mbstring/mbstring.h b/runtime/mbstring/mbstring.h index b3c153de9e..5d8ce56849 100644 --- a/runtime/mbstring/mbstring.h +++ b/runtime/mbstring/mbstring.h @@ -19,6 +19,8 @@ mixed f$mb_convert_encoding(const mixed &str, const string &to_encoding, const m int64_t f$mb_strlen(const string &str, const Optional &encoding=false); +string f$mb_strcut(const string &str, const int64_t start, const Optional &length=0, const Optional &encoding=false); + string f$mb_substr(const string &str, const int64_t start, const Optional &length=0, const Optional &encoding=false); int64_t f$mb_substr_count(const string &haystack, const string &needle, const Optional &encoding=false); diff --git a/tests/cpp/runtime/mbstring-test.cpp b/tests/cpp/runtime/mbstring-test.cpp index ec7506da61..17ec3ef9dd 100644 --- a/tests/cpp/runtime/mbstring-test.cpp +++ b/tests/cpp/runtime/mbstring-test.cpp @@ -32,6 +32,14 @@ TEST(mbstring_test, test_mb_strlen_no_encoding) { ASSERT_TRUE(real == predicted); } +TEST(mbstring_test, test_mb_strcut) { + ASSERT_STREQ(f$mb_strcut(string("Hello world"), 2, 3, string("UTF-8")).c_str(), "llo"); +} + +TEST(mbstring_test, test_mb_strcut_null_length) { + ASSERT_STREQ(f$mb_strcut(string("Hello world"), 3, false, string("UTF-8")).c_str(), "lo world"); +} + TEST(mbstring_test, test_mb_substr) { ASSERT_STREQ(f$mb_substr(string("Hello world"), 2, 3, string("UTF-8")).c_str(), "llo"); } diff --git a/tests/phpt/mbstring/015_mb_strcut.php b/tests/phpt/mbstring/015_mb_strcut.php new file mode 100644 index 0000000000..93b42e8ecf --- /dev/null +++ b/tests/phpt/mbstring/015_mb_strcut.php @@ -0,0 +1,57 @@ +@ok 
+ Date: Sat, 9 Mar 2024 17:13:33 +0300 Subject: [PATCH 21/27] Add mb_str_split function --- builtin-functions/_functions.txt | 2 ++ runtime/mbstring/mbstring.cpp | 35 ++++++++++++++++++++++++++++++++ runtime/mbstring/mbstring.h | 2 ++ 3 files changed, 39 insertions(+) diff --git a/builtin-functions/_functions.txt b/builtin-functions/_functions.txt index 145b1d7254..e94b93fa7c 100644 --- a/builtin-functions/_functions.txt +++ b/builtin-functions/_functions.txt @@ -1649,6 +1649,8 @@ function getenv(string $varname = '', bool $local_only = false): mixed; function mb_check_encoding(array|string $value, ?string $encoding = null): bool; function mb_convert_encoding(array|string $string, string $to_encoding, array|string|null $from_encoding = null): array|string|false; + +function mb_str_split(string $string, int $length = 1, ?string $encoding = null): array; function mb_strcut(string $string, int $start, ?int $length = null, ?string $encoding = null): string; function mb_substr(string $string, int $start, ?int $length = null, ?string $encoding = null): string; function mb_strlen(string $string, ?string $encoding = null): int; diff --git a/runtime/mbstring/mbstring.cpp b/runtime/mbstring/mbstring.cpp index a3e38a42ee..c2c3c6446a 100644 --- a/runtime/mbstring/mbstring.cpp +++ b/runtime/mbstring/mbstring.cpp @@ -189,6 +189,41 @@ static const mbfl_encoding *mb_get_encoding(const Optional &enc_name) { return mbfl_name2encoding(DEFAULT_ENCODING); // change if we are going to use current encoding } +array f$mb_str_split(const string &str, const int64_t &length, const Optional &encoding){ + if (length <= 0) { + php_critical_error ("mb_str_split(): Argument #2 ($length) must be greater than 0"); + } else if (length > INT_MAX / 4) { + php_critical_error ("mb_str_split(): Argument #2 ($length) is too large"); + } + + const mbfl_encoding *enc = mb_get_encoding(encoding); + if (!enc) { + php_critical_error ("encoding \"%s\" isn't supported in mb_strlen", encoding.val().c_str()); + } 
+ + array result = array(); + + if (!str.size()) { + return result; + } + + mbfl_string _string; + mbfl_string_init(&_string); + _string.no_encoding = enc->no_encoding; + _string.len = str.size(); + _string.val = (unsigned char*)str.c_str(); + + size_t n = mbfl_strlen(&_string); // take into account the number of bytes in the encoding character + size_t char_length = _string.len / n; // get the number of bytes of a character + size_t chunk_length = char_length * (size_t)length; + + for (auto i = 0; i < _string.len; i += chunk_length) { + result.push_back(str.substr(i, chunk_length)); + // result.push_back(string(reinterpret_cast(_string.val) + i, chunk_length)); + } + return result; +} + int64_t f$mb_strlen(const string &str, const Optional &enc_name){ const mbfl_encoding *encoding = mb_get_encoding(enc_name); if (!encoding) { diff --git a/runtime/mbstring/mbstring.h b/runtime/mbstring/mbstring.h index 5d8ce56849..89e119cd15 100644 --- a/runtime/mbstring/mbstring.h +++ b/runtime/mbstring/mbstring.h @@ -17,6 +17,8 @@ bool f$mb_check_encoding(const mixed &value, const Optional &encoding); mixed f$mb_convert_encoding(const mixed &str, const string &to_encoding, const mixed &from_encoding); +array f$mb_str_split(const string &str, const int64_t &length=1, const Optional &encoding=false); + int64_t f$mb_strlen(const string &str, const Optional &encoding=false); string f$mb_strcut(const string &str, const int64_t start, const Optional &length=0, const Optional &encoding=false); From cfbe230f76aaf524487a79526c0a49226cde1cc6 Mon Sep 17 00:00:00 2001 From: Marat Omarov Date: Sat, 9 Mar 2024 21:24:27 +0300 Subject: [PATCH 22/27] Add mb_preferred_mime_name function and simple php-tests --- builtin-functions/_functions.txt | 2 +- runtime/mbstring/mbstring.cpp | 21 +++++++++++++++--- runtime/mbstring/mbstring.h | 2 ++ tests/phpt/mbstring/016_mb_str_split.php | 0 .../mbstring/017_mb_preferred_mime_name.php | 22 +++++++++++++++++++ 5 files changed, 43 insertions(+), 4 
deletions(-) create mode 100644 tests/phpt/mbstring/016_mb_str_split.php create mode 100644 tests/phpt/mbstring/017_mb_preferred_mime_name.php diff --git a/builtin-functions/_functions.txt b/builtin-functions/_functions.txt index e94b93fa7c..57a5e9957b 100644 --- a/builtin-functions/_functions.txt +++ b/builtin-functions/_functions.txt @@ -1649,7 +1649,7 @@ function getenv(string $varname = '', bool $local_only = false): mixed; function mb_check_encoding(array|string $value, ?string $encoding = null): bool; function mb_convert_encoding(array|string $string, string $to_encoding, array|string|null $from_encoding = null): array|string|false; - +function mb_preferred_mime_name(string $encoding): string|false; function mb_str_split(string $string, int $length = 1, ?string $encoding = null): array; function mb_strcut(string $string, int $start, ?int $length = null, ?string $encoding = null): string; function mb_substr(string $string, int $start, ?int $length = null, ?string $encoding = null): string; diff --git a/runtime/mbstring/mbstring.cpp b/runtime/mbstring/mbstring.cpp index c2c3c6446a..8821d0c72c 100644 --- a/runtime/mbstring/mbstring.cpp +++ b/runtime/mbstring/mbstring.cpp @@ -189,16 +189,31 @@ static const mbfl_encoding *mb_get_encoding(const Optional &enc_name) { return mbfl_name2encoding(DEFAULT_ENCODING); // change if we are going to use current encoding } +Optional f$mb_preferred_mime_name(const string &enc_name) { + const mbfl_encoding *encoding; + encoding = mbfl_name2encoding(enc_name.c_str()); + if (!encoding) { + php_critical_error("encoding must be a valid encoding, \"%s\" given", enc_name.c_str()); + } + const char *preferred_name = (encoding->mime_name && encoding->mime_name[0] != '\0') ? 
encoding->mime_name : NULL; + if (preferred_name == NULL || *preferred_name == '\0') { + php_warning("No MIME preferred name corresponding to \"%s\"", enc_name.c_str()); + return false; + } else { + return string(preferred_name); + } +} + array f$mb_str_split(const string &str, const int64_t &length, const Optional &encoding){ if (length <= 0) { - php_critical_error ("mb_str_split(): Argument #2 ($length) must be greater than 0"); + php_critical_error("length argument must be greater than 0"); } else if (length > INT_MAX / 4) { - php_critical_error ("mb_str_split(): Argument #2 ($length) is too large"); + php_critical_error("length argument is too large"); } const mbfl_encoding *enc = mb_get_encoding(encoding); if (!enc) { - php_critical_error ("encoding \"%s\" isn't supported in mb_strlen", encoding.val().c_str()); + php_critical_error("encoding \"%s\" isn't supported in mb_strlen", encoding.val().c_str()); } array result = array(); diff --git a/runtime/mbstring/mbstring.h b/runtime/mbstring/mbstring.h index 89e119cd15..59c2d5e4f4 100644 --- a/runtime/mbstring/mbstring.h +++ b/runtime/mbstring/mbstring.h @@ -17,6 +17,8 @@ bool f$mb_check_encoding(const mixed &value, const Optional &encoding); mixed f$mb_convert_encoding(const mixed &str, const string &to_encoding, const mixed &from_encoding); +Optional f$mb_preferred_mime_name(const string &$encoding); + array f$mb_str_split(const string &str, const int64_t &length=1, const Optional &encoding=false); int64_t f$mb_strlen(const string &str, const Optional &encoding=false); diff --git a/tests/phpt/mbstring/016_mb_str_split.php b/tests/phpt/mbstring/016_mb_str_split.php new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/phpt/mbstring/017_mb_preferred_mime_name.php b/tests/phpt/mbstring/017_mb_preferred_mime_name.php new file mode 100644 index 0000000000..7d7df3cc8b --- /dev/null +++ b/tests/phpt/mbstring/017_mb_preferred_mime_name.php @@ -0,0 +1,22 @@ +@ok +getMessage() . 
\PHP_EOL; +// } +// } \ No newline at end of file From 07588d3f8fa9aa00454c836b1b60874ee97170c8 Mon Sep 17 00:00:00 2001 From: Marat Omarov Date: Mon, 11 Mar 2024 22:32:26 +0300 Subject: [PATCH 23/27] Add mb_list_encodings function --- builtin-functions/_functions.txt | 1 + runtime/mbstring/mbstring.cpp | 19 +++++++++++++++++++ runtime/mbstring/mbstring.h | 2 ++ 3 files changed, 22 insertions(+) diff --git a/builtin-functions/_functions.txt b/builtin-functions/_functions.txt index 57a5e9957b..f3cb3b9c56 100644 --- a/builtin-functions/_functions.txt +++ b/builtin-functions/_functions.txt @@ -1649,6 +1649,7 @@ function getenv(string $varname = '', bool $local_only = false): mixed; function mb_check_encoding(array|string $value, ?string $encoding = null): bool; function mb_convert_encoding(array|string $string, string $to_encoding, array|string|null $from_encoding = null): array|string|false; +function mb_list_encodings(): array; function mb_preferred_mime_name(string $encoding): string|false; function mb_str_split(string $string, int $length = 1, ?string $encoding = null): array; function mb_strcut(string $string, int $start, ?int $length = null, ?string $encoding = null): string; diff --git a/runtime/mbstring/mbstring.cpp b/runtime/mbstring/mbstring.cpp index 8821d0c72c..acd199156d 100644 --- a/runtime/mbstring/mbstring.cpp +++ b/runtime/mbstring/mbstring.cpp @@ -68,8 +68,15 @@ extern "C" { #define KPHP_UNICODE_CASE_FOLD_SIMPLE 7 #define KPHP_UNICODE_CASE_MODE_MAX 7 +// TO-DO: +// #define MBFL_BAD_INPUT (-1) + static const char * DEFAULT_ENCODING = "UTF-8" ; +static inline array get_supported_encodings(); + +static const array supported_encodings_list = get_supported_encodings(); + static inline int mbfl_is_error(size_t len) { return len >= (size_t) -16; } @@ -189,6 +196,18 @@ static const mbfl_encoding *mb_get_encoding(const Optional &enc_name) { return mbfl_name2encoding(DEFAULT_ENCODING); // change if we are going to use current encoding } +static inline array 
get_supported_encodings() { + array result; + for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; encodings++) { + result.push_back(string((*encodings)->name)); + } + return result; +} + +array f$mb_list_encodings() { + return supported_encodings_list; +} + Optional f$mb_preferred_mime_name(const string &enc_name) { const mbfl_encoding *encoding; encoding = mbfl_name2encoding(enc_name.c_str()); diff --git a/runtime/mbstring/mbstring.h b/runtime/mbstring/mbstring.h index 59c2d5e4f4..ef126abc66 100644 --- a/runtime/mbstring/mbstring.h +++ b/runtime/mbstring/mbstring.h @@ -17,6 +17,8 @@ bool f$mb_check_encoding(const mixed &value, const Optional &encoding); mixed f$mb_convert_encoding(const mixed &str, const string &to_encoding, const mixed &from_encoding); +array f$mb_list_encodings(); + Optional f$mb_preferred_mime_name(const string &$encoding); array f$mb_str_split(const string &str, const int64_t &length=1, const Optional &encoding=false); From 2ecb94d1807086031b940a2f65448e48ec3bf509 Mon Sep 17 00:00:00 2001 From: Marat Omarov Date: Tue, 12 Mar 2024 15:24:04 +0300 Subject: [PATCH 24/27] Add mb_encoding_aliases function --- builtin-functions/_functions.txt | 1 + runtime/mbstring/mbstring.cpp | 18 +++++++++++++++++- runtime/mbstring/mbstring.h | 2 ++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/builtin-functions/_functions.txt b/builtin-functions/_functions.txt index f3cb3b9c56..9adb08de4c 100644 --- a/builtin-functions/_functions.txt +++ b/builtin-functions/_functions.txt @@ -1649,6 +1649,7 @@ function getenv(string $varname = '', bool $local_only = false): mixed; function mb_check_encoding(array|string $value, ?string $encoding = null): bool; function mb_convert_encoding(array|string $string, string $to_encoding, array|string|null $from_encoding = null): array|string|false; +function mb_encoding_aliases(string $encoding): array; function mb_list_encodings(): array; function mb_preferred_mime_name(string $encoding): 
string|false; function mb_str_split(string $string, int $length = 1, ?string $encoding = null): array; diff --git a/runtime/mbstring/mbstring.cpp b/runtime/mbstring/mbstring.cpp index acd199156d..f8ef898908 100644 --- a/runtime/mbstring/mbstring.cpp +++ b/runtime/mbstring/mbstring.cpp @@ -196,9 +196,25 @@ static const mbfl_encoding *mb_get_encoding(const Optional &enc_name) { return mbfl_name2encoding(DEFAULT_ENCODING); // change if we are going to use current encoding } +array f$mb_encoding_aliases(const string &encoding) { + const mbfl_encoding *enc = mb_get_encoding(encoding); + if (!enc) { + php_critical_error("encoding \"%s\" isn't supported in mb_encoding_aliases", encoding.c_str()); + } + + array result; + if (enc->aliases) { + const char *(*aliases)[] = enc->aliases; + for (int i = 0; (*aliases)[i]; ++i) { + result.push_back(string((*aliases)[i])); + } + } + return result; +} + static inline array get_supported_encodings() { array result; - for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; encodings++) { + for (const mbfl_encoding **encodings = mbfl_get_supported_encodings(); *encodings; ++encodings) { result.push_back(string((*encodings)->name)); } return result; diff --git a/runtime/mbstring/mbstring.h b/runtime/mbstring/mbstring.h index ef126abc66..ec8b2b5822 100644 --- a/runtime/mbstring/mbstring.h +++ b/runtime/mbstring/mbstring.h @@ -17,6 +17,8 @@ bool f$mb_check_encoding(const mixed &value, const Optional &encoding); mixed f$mb_convert_encoding(const mixed &str, const string &to_encoding, const mixed &from_encoding); +array f$mb_encoding_aliases(const string &encoding); + array f$mb_list_encodings(); Optional f$mb_preferred_mime_name(const string &$encoding); From 24400a0ff5d5cfe1d47e6803962269f432f6ab12 Mon Sep 17 00:00:00 2001 From: catnyan02 Date: Wed, 10 Jul 2024 21:23:41 +0100 Subject: [PATCH 25/27] Add oniguruma and regular expressions --- .idea/misc.xml | 3 + builtin-functions/_functions.txt | 8 +- 
cmake/external-libraries.cmake | 13 + compiler/compiler-settings.cpp | 5 + runtime/mbstring/mbstring.cpp | 672 +++++++++++++++++++++- runtime/mbstring/mbstring.h | 12 + runtime/runtime.cmake | 2 + tests/cpp/runtime/mbstring-test.cpp | 13 + tests/phpt/mbstring/001_mb_strlen.php | 2 +- tests/phpt/mbstring/015_mb_strimwidth.php | 58 ++ 10 files changed, 785 insertions(+), 3 deletions(-) create mode 100644 tests/phpt/mbstring/015_mb_strimwidth.php diff --git a/.idea/misc.xml b/.idea/misc.xml index bdd226825f..9f883b3c5b 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,5 +1,8 @@ + + diff --git a/builtin-functions/_functions.txt b/builtin-functions/_functions.txt index 9e3af0c621..5c700db035 100644 --- a/builtin-functions/_functions.txt +++ b/builtin-functions/_functions.txt @@ -1650,6 +1650,7 @@ function getenv(string $varname = '', bool $local_only = false): mixed; function mb_check_encoding(array|string $value, ?string $encoding = null): bool; function mb_convert_encoding(array|string $string, string $to_encoding, array|string|null $from_encoding = null): array|string|false; function mb_substr(string $string, int $start, ?int $length = null, ?string $encoding = null): string; +function mb_substitute_character(string|int|null $substitute_character = null): string|int|false; function mb_strlen(string $string, ?string $encoding = null): int; function mb_substr_count(string $haystack, string $needle, ?string $encoding = null): int; function mb_strtolower(string $string, ?string $encoding = null): string; @@ -1662,4 +1663,9 @@ function mb_strrpos(string $haystack, string $needle, int $offset = 0, string $e function mb_stristr(string $haystack, string $needle, bool $before_needle = false, ?string $encoding = null): string|false; function mb_strrchr(string $haystack, string $needle, bool $before_needle = false, ?string $encoding = null): string|false; function mb_strrichr(string $haystack, string $needle, bool $before_needle = false, ?string $encoding = null): 
string|false; -function mb_strstr(string $haystack, string $needle, bool $before_needle = false, ?string $encoding = null): string|false; \ No newline at end of file +function mb_strstr(string $haystack, string $needle, bool $before_needle = false, ?string $encoding = null): string|false; +function mb_strimwidth(string $string, int $start, int $width, string $trim_marker = "", ?string $encoding = null): string; +function mb_scrub(string $string, ?string $encoding = null): string; +function mb_regex_encoding(?string $encoding = null): string|false; +function mb_regex_set_options(?string $options = null): string; +function mb_ereg_match(string $pattern, string $string, ?string $options = null): bool; diff --git a/cmake/external-libraries.cmake b/cmake/external-libraries.cmake index 0fec2a0a56..169fb14ae4 100644 --- a/cmake/external-libraries.cmake +++ b/cmake/external-libraries.cmake @@ -19,6 +19,19 @@ if(MBFL) include_directories(${libmbfl_SOURCE_DIR}/include) add_definitions(-DLIBMBFL_LIB_DIR="${libmbfl_SOURCE_DIR}/objs") add_link_options(-L${libmbfl_SOURCE_DIR}/objs) + + find_package(onig QUIET) + if(NOT onig_FOUND) + handle_missing_library("onig") + FetchContent_Declare( + onig + GIT_REPOSITORY https://github.com/kkos/oniguruma/ + GIT_TAG v6.9.9 + ) + FetchContent_MakeAvailable(onig) + include_directories(${onig_SOURCE_DIR}/src) + message(STATUS "---------------------") + endif() endif() find_package(fmt QUIET) diff --git a/compiler/compiler-settings.cpp b/compiler/compiler-settings.cpp index 9e761f3fcf..e18ea72368 100644 --- a/compiler/compiler-settings.cpp +++ b/compiler/compiler-settings.cpp @@ -342,6 +342,11 @@ void CompilerSettings::init() { ld_flags.value_ += " -L" LIBMBFL_LIB_DIR; #endif +#ifdef ONIG_LIB_DIR + external_static_libs.emplace_back("onig"); + ld_flags.value_ += " -lonig"; +#endif + #ifdef KPHP_H3_LIB_DIR ld_flags.value_ += " -L" KPHP_H3_LIB_DIR; #else diff --git a/runtime/mbstring/mbstring.cpp b/runtime/mbstring/mbstring.cpp index 
// Policies for characters that cannot be converted to the target encoding.
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE 0
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR 1
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG 2
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY 3
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8 4

// Current substitution policy and substitute code point; both are applied to
// every converter created by convert_encoding().
int current_filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
int current_filter_illegal_substchar = '?';

/**
 * Check whether cp is usable as a substitute code point.
 *
 * @return 1 when cp lies in (0, 0x110000) and is not a UTF-16 surrogate
 *         (0xD800..0xDFFF), 0 otherwise. Note that 0 itself is rejected.
 */
static inline int php_mb_check_code_point(long cp) {
  const bool in_unicode_range = (cp > 0) && (cp < 0x110000);
  const bool is_surrogate = (cp >= 0xd800) && (cp <= 0xdfff);
  return (in_unicode_range && !is_surrogate) ? 1 : 0;
}
is_numeric_string(op->c_str(), op->size(), &lval, &dval)) { +// return 0; +// } else { +// return (long)lval; +// } +//} + +mixed f$mb_substitute_character(const mixed &substitute_character){ + if (!substitute_character) { + if (current_filter_illegal_mode == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) { + return string("none", 4); + } else if (current_filter_illegal_mode == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) { + return string("long", 4); + } else if (current_filter_illegal_mode == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) { + return string("entity", 6); + } else { + return current_filter_illegal_substchar; + } + } else { + if (substitute_character.is_string()) { + if (strncasecmp("none", substitute_character.to_string().c_str(), substitute_character.to_string().size()) == 0) { + current_filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE; + } else if (strncasecmp("long", substitute_character.to_string().c_str(), substitute_character.to_string().size()) == 0) { + current_filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG; + } else if (strncasecmp("entity", substitute_character.to_string().c_str(), substitute_character.to_string().size()) == 0) { + current_filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY; + } + } else { +// long substitute_char = convert_to_long(substitute_character); +// if (php_mb_check_code_point(substitute_char)) { +// current_filter_illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR; +// current_filter_illegal_substchar = substitute_char; +// } else { + php_warning("Unknown character"); + return false; +// } + } + return true; + } +} + int64_t f$mb_substr_count(const string &haystack, const string &needle, const Optional &encoding){ size_t n; @@ -714,6 +794,596 @@ Optional f$mb_strrichr(const string &haystack, const string &needle, con return false; } +string f$mb_strimwidth(const string &str, const int64_t start, const int64_t width, const string &trim_marker, const Optional &encoding){ + const mbfl_encoding *enc = 
mb_get_encoding(encoding); + + if (!enc) { + php_critical_error ("encoding \"%s\" isn't supported in mb_strrpos", encoding.val().c_str()); + } + + mbfl_string _string, result, _trim_marker, *ret; + int64_t from = start; + int64_t swidth = 0; + + mbfl_string_init(&_string); + _string.no_encoding = enc->no_encoding; + _string.len = str.size(); + _string.val = (unsigned char*) str.c_str(); + + mbfl_string_init(&_trim_marker); + _trim_marker.no_encoding = enc->no_encoding; + _trim_marker.len = 0; + _trim_marker.val = NULL; + + if ((from < 0) || (width < 0)) { + swidth = mbfl_strwidth(&_string); + } + + if (from < 0) { + from += swidth; + } + + if (from < 0 || (size_t)from > str.size()) { + php_critical_error ("Start position is out of range"); + } + + if (width < 0) { + swidth = swidth + width - from; + } + + if (swidth < 0) { + php_critical_error ("Width is out of range"); + } + + if (trim_marker.size() > 0) { + _trim_marker.len = trim_marker.size(); + _trim_marker.val = (unsigned char*) trim_marker.c_str(); + } + + ret = mbfl_strimwidth(&_string, &_trim_marker, &result, from, width); + + if (ret == NULL) { + php_critical_error ("Internal error"); + } + + return string((const char*) ret->val, ret->len); + +} + +Optional f$mb_scrub(const string &str, const Optional &encoding){ + + const mbfl_encoding *enc = mb_get_encoding(encoding); + + if (!enc) { + return false; + } + + mbfl_string *ret = convert_encoding(str.c_str(), val(encoding).c_str(), val(encoding).c_str()); + + if (ret == NULL) { + return false; + } + + return string((const char*)ret->val, ret->len); +} + +///* REGEXPS */ +// +///* +// * encoding name resolver +// */ +typedef struct _kphp_mb_regex_enc_name_map_t { + const char *names; + OnigEncoding code; +} kphp_mb_regex_enc_name_map_t; + +static const kphp_mb_regex_enc_name_map_t enc_name_map[] = { +#ifdef ONIG_ENCODING_EUC_JP + { + "EUC-JP\0EUCJP\0X-EUC-JP\0UJIS\0EUCJP\0EUCJP-WIN\0", + ONIG_ENCODING_EUC_JP + }, +#endif +#ifdef ONIG_ENCODING_UTF8 + { + 
"UTF-8\0UTF8\0", + ONIG_ENCODING_UTF8 + }, +#endif +#ifdef ONIG_ENCODING_UTF16_BE + { + "UTF-16\0UTF-16BE\0", + ONIG_ENCODING_UTF16_BE + }, +#endif +#ifdef ONIG_ENCODING_UTF16_LE + { + "UTF-16LE\0", + ONIG_ENCODING_UTF16_LE + }, +#endif +#ifdef ONIG_ENCODING_UTF32_BE + { + "UCS-4\0UTF-32\0UTF-32BE\0", + ONIG_ENCODING_UTF32_BE + }, +#endif +#ifdef ONIG_ENCODING_UTF32_LE + { + "UCS-4LE\0UTF-32LE\0", + ONIG_ENCODING_UTF32_LE + }, +#endif +#ifdef ONIG_ENCODING_SJIS + { + "SJIS\0CP932\0MS932\0SHIFT_JIS\0SJIS-WIN\0WINDOWS-31J\0", + ONIG_ENCODING_SJIS + }, +#endif +#ifdef ONIG_ENCODING_BIG5 + { + "BIG5\0BIG-5\0BIGFIVE\0CN-BIG5\0BIG-FIVE\0", + ONIG_ENCODING_BIG5 + }, +#endif +#ifdef ONIG_ENCODING_EUC_CN + { + "EUC-CN\0EUCCN\0EUC_CN\0GB-2312\0GB2312\0", + ONIG_ENCODING_EUC_CN + }, +#endif +#ifdef ONIG_ENCODING_EUC_TW + { + "EUC-TW\0EUCTW\0EUC_TW\0", + ONIG_ENCODING_EUC_TW + }, +#endif +#ifdef ONIG_ENCODING_EUC_KR + { + "EUC-KR\0EUCKR\0EUC_KR\0", + ONIG_ENCODING_EUC_KR + }, +#endif +#if defined(ONIG_ENCODING_KOI8) && !PHP_ONIG_BAD_KOI8_ENTRY + { + "KOI8\0KOI-8\0", + ONIG_ENCODING_KOI8 + }, +#endif +#ifdef ONIG_ENCODING_KOI8_R + { + "KOI8R\0KOI8-R\0KOI-8R\0", + ONIG_ENCODING_KOI8_R + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_1 + { + "ISO-8859-1\0ISO8859-1\0", + ONIG_ENCODING_ISO_8859_1 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_2 + { + "ISO-8859-2\0ISO8859-2\0", + ONIG_ENCODING_ISO_8859_2 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_3 + { + "ISO-8859-3\0ISO8859-3\0", + ONIG_ENCODING_ISO_8859_3 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_4 + { + "ISO-8859-4\0ISO8859-4\0", + ONIG_ENCODING_ISO_8859_4 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_5 + { + "ISO-8859-5\0ISO8859-5\0", + ONIG_ENCODING_ISO_8859_5 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_6 + { + "ISO-8859-6\0ISO8859-6\0", + ONIG_ENCODING_ISO_8859_6 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_7 + { + "ISO-8859-7\0ISO8859-7\0", + ONIG_ENCODING_ISO_8859_7 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_8 + { + 
"ISO-8859-8\0ISO8859-8\0", + ONIG_ENCODING_ISO_8859_8 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_9 + { + "ISO-8859-9\0ISO8859-9\0", + ONIG_ENCODING_ISO_8859_9 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_10 + { + "ISO-8859-10\0ISO8859-10\0", + ONIG_ENCODING_ISO_8859_10 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_11 + { + "ISO-8859-11\0ISO8859-11\0", + ONIG_ENCODING_ISO_8859_11 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_13 + { + "ISO-8859-13\0ISO8859-13\0", + ONIG_ENCODING_ISO_8859_13 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_14 + { + "ISO-8859-14\0ISO8859-14\0", + ONIG_ENCODING_ISO_8859_14 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_15 + { + "ISO-8859-15\0ISO8859-15\0", + ONIG_ENCODING_ISO_8859_15 + }, +#endif +#ifdef ONIG_ENCODING_ISO_8859_16 + { + "ISO-8859-16\0ISO8859-16\0", + ONIG_ENCODING_ISO_8859_16 + }, +#endif +#ifdef ONIG_ENCODING_ASCII + { + "ASCII\0US-ASCII\0US_ASCII\0ISO646\0", + ONIG_ENCODING_ASCII + }, +#endif + { NULL, ONIG_ENCODING_UNDEF } +}; + +OnigEncoding default_mbctype = ONIG_ENCODING_UTF8; +OnigEncoding current_mbctype = ONIG_ENCODING_UTF8; +mbfl_no_encoding current_mbctype_mbfl_encoding = mbfl_no_encoding_utf8; + +OnigOptionType regex_default_options = ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE; +OnigSyntaxType *regex_default_syntax = ONIG_SYNTAX_RUBY; + +long regex_stack_limit = 100000; +long regex_retry_limit = 1000000; + +//static OnigEncoding _kphp_mb_regex_name2mbctype(const char *pname) +//{ +// const char *p; +// const kphp_mb_regex_enc_name_map_t *mapping; +// +// if (pname == NULL || !*pname) { +// return ONIG_ENCODING_UNDEF; +// } +// +// for (mapping = enc_name_map; mapping->names != NULL; mapping++) { +// for (p = mapping->names; *p != '\0'; p += (strlen(p) + 1)) { +// if (strcasecmp(p, pname) == 0) { +// return mapping->code; +// } +// } +// } +// +// return ONIG_ENCODING_UNDEF; +//} +// +// +//static const char *_kphp_mb_regex_mbctype2name(OnigEncoding mbctype) +//{ +// const kphp_mb_regex_enc_name_map_t *mapping; +// 
+// for (mapping = enc_name_map; mapping->names != NULL; mapping++) { +// if (mapping->code == mbctype) { +// return mapping->names; +// } +// } +// +// return NULL; +//} + +//mixed f$mb_regex_encoding(const Optional &encoding){ +// if (!encoding.has_value()) { +// const char *retval = _kphp_mb_regex_mbctype2name(current_mbctype); +// if (retval != NULL){ +// return string(retval); +// } +// return NULL; +// } else { +// OnigEncoding mbctype = _kphp_mb_regex_name2mbctype(val(encoding).c_str()); +// if (mbctype == ONIG_ENCODING_UNDEF) { +// php_critical_error ("must be a valid encoding, \"%s\" given", encoding.val().c_str()); +// } +// current_mbctype = mbctype; +// current_mbctype_mbfl_encoding = mb_get_encoding(encoding)->no_encoding; +// return true; +// } +//} + +static bool _kphp_mb_regex_init_options(const char *parg, size_t narg, OnigOptionType *option, OnigSyntaxType **syntax) +{ + size_t n; + char c; + OnigOptionType optm = 0; + + *syntax = ONIG_SYNTAX_RUBY; + + if (parg != NULL) { + n = 0; + while(n < narg) { + c = parg[n++]; + switch (c) { + case 'i': + optm |= ONIG_OPTION_IGNORECASE; + break; + case 'x': + optm |= ONIG_OPTION_EXTEND; + break; + case 'm': + optm |= ONIG_OPTION_MULTILINE; + break; + case 's': + optm |= ONIG_OPTION_SINGLELINE; + break; + case 'p': + optm |= ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE; + break; + case 'l': + optm |= ONIG_OPTION_FIND_LONGEST; + break; + case 'n': + optm |= ONIG_OPTION_FIND_NOT_EMPTY; + break; + case 'j': + *syntax = ONIG_SYNTAX_JAVA; + break; + case 'u': + *syntax = ONIG_SYNTAX_GNU_REGEX; + break; + case 'g': + *syntax = ONIG_SYNTAX_GREP; + break; + case 'c': + *syntax = ONIG_SYNTAX_EMACS; + break; + case 'r': + *syntax = ONIG_SYNTAX_RUBY; + break; + case 'z': + *syntax = ONIG_SYNTAX_PERL; + break; + case 'b': + *syntax = ONIG_SYNTAX_POSIX_BASIC; + break; + case 'd': + *syntax = ONIG_SYNTAX_POSIX_EXTENDED; + break; + default: + return false; + } + } + if (option != NULL) *option|=optm; + } + return true; 
+} + +static void _kphp_mb_regex_set_options(OnigOptionType options, OnigSyntaxType *syntax, OnigOptionType *prev_options, OnigSyntaxType **prev_syntax) +{ + if (prev_options != NULL) { + *prev_options = regex_default_options; + } + if (prev_syntax != NULL) { + *prev_syntax = regex_default_syntax; + } + regex_default_options = options; + regex_default_syntax = syntax; +} + +static size_t _kphp_mb_regex_get_option_string(char *str, size_t len, OnigOptionType option, OnigSyntaxType *syntax) +{ + size_t len_left = len; + size_t len_req = 0; + char *p = str; + char c; + + if ((option & ONIG_OPTION_IGNORECASE) != 0) { + if (len_left > 0) { + --len_left; + *(p++) = 'i'; + } + ++len_req; + } + + if ((option & ONIG_OPTION_EXTEND) != 0) { + if (len_left > 0) { + --len_left; + *(p++) = 'x'; + } + ++len_req; + } + + if ((option & (ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE)) == + (ONIG_OPTION_MULTILINE | ONIG_OPTION_SINGLELINE)) { + if (len_left > 0) { + --len_left; + *(p++) = 'p'; + } + ++len_req; + } else { + if ((option & ONIG_OPTION_MULTILINE) != 0) { + if (len_left > 0) { + --len_left; + *(p++) = 'm'; + } + ++len_req; + } + + if ((option & ONIG_OPTION_SINGLELINE) != 0) { + if (len_left > 0) { + --len_left; + *(p++) = 's'; + } + ++len_req; + } + } + if ((option & ONIG_OPTION_FIND_LONGEST) != 0) { + if (len_left > 0) { + --len_left; + *(p++) = 'l'; + } + ++len_req; + } + if ((option & ONIG_OPTION_FIND_NOT_EMPTY) != 0) { + if (len_left > 0) { + --len_left; + *(p++) = 'n'; + } + ++len_req; + } + + c = 0; + + if (syntax == ONIG_SYNTAX_JAVA) { + c = 'j'; + } else if (syntax == ONIG_SYNTAX_GNU_REGEX) { + c = 'u'; + } else if (syntax == ONIG_SYNTAX_GREP) { + c = 'g'; + } else if (syntax == ONIG_SYNTAX_EMACS) { + c = 'c'; + } else if (syntax == ONIG_SYNTAX_RUBY) { + c = 'r'; + } else if (syntax == ONIG_SYNTAX_PERL) { + c = 'z'; + } else if (syntax == ONIG_SYNTAX_POSIX_BASIC) { + c = 'b'; + } else if (syntax == ONIG_SYNTAX_POSIX_EXTENDED) { + c = 'd'; + } + + if (c != 0) { + 
if (len_left > 0) { + --len_left; + *(p++) = c; + } + ++len_req; + } + + + if (len_left > 0) { + --len_left; + *(p++) = '\0'; + } + ++len_req; + if (len < len_req) { + return len_req; + } + + return 0; +} + +string f$mb_regex_set_options(const Optional &options){ + OnigOptionType opt, prev_opt; + OnigSyntaxType *syntax, *prev_syntax; + char buf[16]; + + if (options.has_value()) { + opt = 0; + syntax = NULL; + if (!_kphp_mb_regex_init_options(val(options).c_str(), val(options).size(), &opt, &syntax)) { + php_critical_error("Wrong regex options."); + } + _kphp_mb_regex_set_options(opt, syntax, &prev_opt, &prev_syntax); + opt = prev_opt; + syntax = prev_syntax; + } else { + opt = regex_default_options; + syntax = regex_default_syntax; + } + + _kphp_mb_regex_get_option_string(buf, sizeof(buf), opt, syntax); + return string((const char *) buf, sizeof(buf)); +} + +static regex_t *kphp_mbregex_compile_pattern(const char *pattern, size_t patlen, OnigOptionType options, OnigSyntaxType *syntax) +{ + // TODO: hashing + int err_code = 0; + regex_t *retval = NULL; + OnigErrorInfo err_info; + OnigUChar err_str[ONIG_MAX_ERROR_MESSAGE_LEN]; + OnigEncoding enc = current_mbctype; + + if ((err_code = onig_new(&retval, (OnigUChar *)pattern, (OnigUChar *)(pattern + patlen), options, enc, syntax, &err_info)) != ONIG_NORMAL) { + onig_error_code_to_str(err_str, err_code, &err_info); + php_warning("mbregex compile err: %s", err_str); + return NULL; + } + + return retval; +} + +bool f$mb_ereg_match(const string &pattern, const string &str, const Optional &options){ + regex_t *re; + OnigSyntaxType *syntax; + OnigOptionType option = 0; + int err; + OnigMatchParam *mp; + + if (options.has_value()) { + if(!_kphp_mb_regex_init_options(val(options).c_str(), val(options).size(), &option, &syntax)) { + php_critical_error("Wrong regex options."); + } + } else { + option |= regex_default_options; + syntax = regex_default_syntax; + } + + if (!f$mb_check_encoding(str, DEFAULT_ENCODING)) { + return 
false; + } + + if ((re = kphp_mbregex_compile_pattern(pattern.c_str(), pattern.size(), option, syntax)) == NULL) { + return false; + } + + mp = onig_new_match_param(); + onig_initialize_match_param(mp); + + if (regex_stack_limit > 0 && regex_stack_limit < UINT_MAX) { + onig_set_match_stack_limit_size_of_match_param(mp, (unsigned int) regex_stack_limit); + } + if (regex_retry_limit > 0 && regex_retry_limit < UINT_MAX) { + onig_set_retry_limit_in_match_of_match_param(mp, (unsigned int) regex_retry_limit); + } + /* match */ + err = onig_match_with_param(re, (OnigUChar *) str.c_str(), (OnigUChar *)(str.c_str() + str.size()), (OnigUChar *) str.c_str(), NULL, 0, mp); // error is here + onig_free_match_param(mp); + + if (err >= 0) { + return true; + } else { + return false; + } +} + #else diff --git a/runtime/mbstring/mbstring.h b/runtime/mbstring/mbstring.h index b3c153de9e..f90283be9b 100644 --- a/runtime/mbstring/mbstring.h +++ b/runtime/mbstring/mbstring.h @@ -21,6 +21,8 @@ int64_t f$mb_strlen(const string &str, const Optional &encoding=false); string f$mb_substr(const string &str, const int64_t start, const Optional &length=0, const Optional &encoding=false); +mixed f$mb_substitute_character(const mixed &substitute_character); + int64_t f$mb_substr_count(const string &haystack, const string &needle, const Optional &encoding=false); string f$mb_strtoupper(const string &str, const Optional &encoding=false); @@ -45,6 +47,16 @@ Optional f$mb_strrchr(const string &haystack, const string &needle, cons Optional f$mb_strrichr(const string &haystack, const string &needle, const bool before_needle, const Optional &encoding=false); +string f$mb_strimwidth(const string &str, const int64_t start, const int64_t width, const string &trim_marker, const Optional &encoding=false); + +Optional f$mb_scrub(const string &str, const Optional &encoding=false); + +//mixed f$mb_regex_encoding(const Optional &encoding=false); + +string f$mb_regex_set_options(const Optional &options=false); + 
+bool f$mb_ereg_match(const string &pattern, const string &str, const Optional &options=false); + #else #include diff --git a/runtime/runtime.cmake b/runtime/runtime.cmake index a24173eb8a..a5d07756a9 100644 --- a/runtime/runtime.cmake +++ b/runtime/runtime.cmake @@ -151,6 +151,7 @@ target_include_directories(kphp_runtime PUBLIC ${BASE_DIR} /opt/curl7600/include add_dependencies(kphp_runtime kphp-timelib) if (MBFL) add_dependencies(kphp_runtime libmbfl) + add_dependencies(kphp_runtime onig) endif() @@ -173,6 +174,7 @@ endif() if (MBFL) list(APPEND RUNTIME_LINK_TEST_LIBS libmbfl) + list(APPEND RUNTIME_LINK_TEST_LIBS onig) endif() file(GLOB_RECURSE KPHP_RUNTIME_ALL_HEADERS diff --git a/tests/cpp/runtime/mbstring-test.cpp b/tests/cpp/runtime/mbstring-test.cpp index ec7506da61..4c10fe9442 100644 --- a/tests/cpp/runtime/mbstring-test.cpp +++ b/tests/cpp/runtime/mbstring-test.cpp @@ -92,4 +92,17 @@ TEST(mbstring_test, test_mb_strrichr) { ASSERT_STREQ(f$mb_strrichr(string("This is a test string"), string("test"), false, string("UTF-8")).val().c_str(), "test string"); } +TEST(mbstring_test, test_mb_strimwidth) { + ASSERT_STREQ(f$mb_strimwidth(string("This is a very long string that needs to be trimmed"), 0, 20, string("...")).c_str(), "This is a very lo..."); +} + +TEST(mbstring_test, test_mb_regex_set_options) { + ASSERT_STREQ(f$mb_regex_set_options("xpu").c_str(), "pr"); + ASSERT_STREQ(f$mb_regex_set_options("npj").c_str(), "xpu"); +} + +TEST(mbstring_test, test_mb_ereg_match) { + ASSERT_TRUE(f$mb_ereg_match(string("^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"), string("dachman@gmail.com"))); +} + #endif \ No newline at end of file diff --git a/tests/phpt/mbstring/001_mb_strlen.php b/tests/phpt/mbstring/001_mb_strlen.php index 2c395957d4..55c9d2b761 100644 --- a/tests/phpt/mbstring/001_mb_strlen.php +++ b/tests/phpt/mbstring/001_mb_strlen.php @@ -55,7 +55,7 @@ function test_mb_strlen_long_string() { test_mb_strlen_basic_ascii(); test_mb_strlen_basic_utf_8(); 
test_mb_strlen_empty_string(); -test_mb_strlen_null_encoding(); // doesn't put null through for some reason +test_mb_strlen_null_encoding(); test_mb_strlen_utf_16_encoding(); test_mb_strlen_html_entities(); test_mb_strlen_whitespaces(); diff --git a/tests/phpt/mbstring/015_mb_strimwidth.php b/tests/phpt/mbstring/015_mb_strimwidth.php new file mode 100644 index 0000000000..2559eaa84e --- /dev/null +++ b/tests/phpt/mbstring/015_mb_strimwidth.php @@ -0,0 +1,58 @@ +@ok + Date: Thu, 18 Jul 2024 16:08:59 +0300 Subject: [PATCH 26/27] Fix linking problem --- cmake/external-libraries.cmake | 1 + compiler/compiler-settings.cpp | 3 +-- runtime/mbstring/mbstring.cpp | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/cmake/external-libraries.cmake b/cmake/external-libraries.cmake index 169fb14ae4..da79e2cbdf 100644 --- a/cmake/external-libraries.cmake +++ b/cmake/external-libraries.cmake @@ -30,6 +30,7 @@ if(MBFL) ) FetchContent_MakeAvailable(onig) include_directories(${onig_SOURCE_DIR}/src) + add_definitions(-DONIG_LIB_DIR="${onig_BINARY_DIR}") message(STATUS "---------------------") endif() endif() diff --git a/compiler/compiler-settings.cpp b/compiler/compiler-settings.cpp index e18ea72368..274ea633a5 100644 --- a/compiler/compiler-settings.cpp +++ b/compiler/compiler-settings.cpp @@ -343,8 +343,7 @@ void CompilerSettings::init() { #endif #ifdef ONIG_LIB_DIR - external_static_libs.emplace_back("onig"); - ld_flags.value_ += " -lonig"; + ld_flags.value_ += " -L " ONIG_LIB_DIR " -lonig"; #endif #ifdef KPHP_H3_LIB_DIR diff --git a/runtime/mbstring/mbstring.cpp b/runtime/mbstring/mbstring.cpp index fe62de73bd..411d361b63 100644 --- a/runtime/mbstring/mbstring.cpp +++ b/runtime/mbstring/mbstring.cpp @@ -55,11 +55,10 @@ static bool is_detect_incorrect_encoding_names_warning{false}; #ifdef MBFL extern "C" { #include -#include + #include + #include } -#include - #define KPHP_UNICODE_CASE_UPPER 0 #define KPHP_UNICODE_CASE_LOWER 1 #define KPHP_UNICODE_CASE_TITLE 2 
From cf7f205e03e9591c714cb1dd5a9ecfe8f8663830 Mon Sep 17 00:00:00 2001 From: catnyan02 Date: Tue, 23 Jul 2024 16:04:35 +0300 Subject: [PATCH 27/27] Add php tests --- .../mbstring/016_mb_regex_set_options.php | 67 +++++++++++++++++++ tests/phpt/mbstring/017_mb_ereg_match.php | 58 ++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 tests/phpt/mbstring/016_mb_regex_set_options.php create mode 100644 tests/phpt/mbstring/017_mb_ereg_match.php diff --git a/tests/phpt/mbstring/016_mb_regex_set_options.php b/tests/phpt/mbstring/016_mb_regex_set_options.php new file mode 100644 index 0000000000..cd1de1c144 --- /dev/null +++ b/tests/phpt/mbstring/016_mb_regex_set_options.php @@ -0,0 +1,67 @@ +@ok +